This commit is contained in:
jeremygan2021
2026-03-03 21:12:03 +08:00
parent 124b185b8a
commit 2470013ef3
6 changed files with 175 additions and 356 deletions

Binary file not shown.

Binary file not shown.

View File

@@ -4,9 +4,11 @@ import asyncio
import os
import subprocess
import struct
import base64
from dotenv import load_dotenv
import dashscope
from dashscope.audio.asr import Recognition, RecognitionCallback, RecognitionResult
from dashscope import ImageSynthesis
import json
# 加载环境变量
@@ -19,13 +21,16 @@ app = FastAPI()
audio_buffer = bytearray()
RECORDING_RAW_FILE = "received_audio.raw"
RECORDING_MP3_FILE = "received_audio.mp3"
VOLUME_GAIN = 10.0 # 放大倍数
VOLUME_GAIN = 10.0 # 放大倍数
FONT_FILE = "GB2312-16.bin"
GENERATED_IMAGE_FILE = "generated_image.png"
GENERATED_THUMB_FILE = "generated_thumb.bin"
class MyRecognitionCallback(RecognitionCallback):
def __init__(self, websocket: WebSocket, loop: asyncio.AbstractEventLoop):
    """Bind the ASR callback to a client websocket and the running event loop.

    The loop is kept so recognition results (delivered on a worker thread)
    can be marshalled back onto the asyncio loop for sending to the client.
    """
    self.websocket = websocket
    self.loop = loop
    self.final_text = ""  # last recognized sentence; read after STOP_RECORDING
def on_open(self) -> None:
print("ASR Session started")
@@ -37,6 +42,7 @@ class MyRecognitionCallback(RecognitionCallback):
if result.get_sentence():
text = result.get_sentence()['text']
print(f"ASR Result: {text}")
self.final_text = text # 保存识别结果
# 将识别结果发送回客户端
try:
asyncio.run_coroutine_threadsafe(
@@ -65,6 +71,74 @@ def process_chunk_32_to_16(chunk_bytes, gain=1.0):
processed_chunk.extend(struct.pack('<h', sample))
return processed_chunk
def generate_image(prompt, websocket=None):
    """Generate an image from *prompt* with the Wanx text-to-image API.

    On success the image is downloaded to GENERATED_IMAGE_FILE, resized to a
    120x120 thumbnail and re-encoded as raw little-endian RGB565 (2 bytes per
    pixel) suitable for a 240x240 SPI display on a memory-constrained ESP32.

    Args:
        prompt: Text description to render.
        websocket: Unused; kept for interface compatibility with callers.

    Returns:
        Path to the RGB565 thumbnail (GENERATED_THUMB_FILE) on success,
        the original PNG path if PIL is missing or conversion fails,
        or None if the API call itself fails.
    """
    print(f"Generating image for prompt: {prompt}")
    try:
        response = ImageSynthesis.call(
            model='wanx-v1.0-text-to-image',
            prompt=prompt,
            size='512x512',
            n=1
        )
        if response.status_code == 200:
            image_url = response.output['results'][0]['url']
            print(f"Image generated, downloading from: {image_url}")
            import urllib.request
            urllib.request.urlretrieve(image_url, GENERATED_IMAGE_FILE)
            print(f"Image saved to {GENERATED_IMAGE_FILE}")
            # Downscale and convert to RGB565 for the ESP32 display.
            try:
                from PIL import Image
                img = Image.open(GENERATED_IMAGE_FILE)
                # Normalize to 3-channel RGB first: getpixel() on palette or
                # grayscale images returns an int, which would crash the
                # tuple unpack below.
                img = img.convert('RGB')
                # 120x120, not full 240x240, to respect ESP32 RAM limits.
                thumb_size = 120
                img = img.resize((thumb_size, thumb_size), Image.LANCZOS)
                # Pack each pixel as R5 G6 B5, little-endian (low byte first).
                rgb565_data = bytearray()
                pack = struct.pack  # hoisted out of the 14k-iteration loop
                for y in range(thumb_size):
                    for x in range(thumb_size):
                        r, g, b = img.getpixel((x, y))
                        r5 = (r >> 3) & 0x1F
                        g6 = (g >> 2) & 0x3F
                        b5 = (b >> 3) & 0x1F
                        rgb565 = (r5 << 11) | (g6 << 5) | b5
                        rgb565_data.extend(pack('<H', rgb565))
                with open(GENERATED_THUMB_FILE, 'wb') as f:
                    f.write(rgb565_data)
                print(f"Thumbnail saved to {GENERATED_THUMB_FILE}, size: {len(rgb565_data)} bytes")
                return GENERATED_THUMB_FILE
            except ImportError:
                # Best-effort fallback: client gets the full-size PNG instead.
                print("PIL not available, sending original image")
                return GENERATED_IMAGE_FILE
            except Exception as e:
                print(f"Error processing image: {e}")
                return GENERATED_IMAGE_FILE
        else:
            print(f"Image generation failed: {response.code} - {response.message}")
            return None
    except Exception as e:
        print(f"Error generating image: {e}")
        return None
@app.websocket("/ws/audio")
async def websocket_endpoint(websocket: WebSocket):
global audio_buffer
@@ -72,6 +146,7 @@ async def websocket_endpoint(websocket: WebSocket):
print("Client connected")
recognition = None
callback = None # 保存callback对象
processed_buffer = bytearray()
loop = asyncio.get_running_loop()
@@ -109,6 +184,7 @@ async def websocket_endpoint(websocket: WebSocket):
except Exception as e:
print(f"Failed to start ASR: {e}")
recognition = None
callback = None
elif text == "STOP_RECORDING":
print(f"Stop recording. Total raw bytes: {len(audio_buffer)}")
@@ -127,6 +203,12 @@ async def websocket_endpoint(websocket: WebSocket):
print(f"Processed audio size: {len(processed_audio)} bytes (Gain: {VOLUME_GAIN}x)")
# 获取ASR识别结果
asr_text = ""
if callback:
asr_text = callback.final_text
print(f"Final ASR text: {asr_text}")
# 2. 保存原始 RAW 文件 (16-bit PCM)
with open(RECORDING_RAW_FILE, "wb") as f:
f.write(processed_audio)
@@ -167,22 +249,45 @@ async def websocket_endpoint(websocket: WebSocket):
except Exception as e:
print(f"Error converting to MP3: {e}")
# 4. 不再发送回客户端播放,提升性能
# print("Sending audio back...")
# await websocket.send_text("START_PLAYBACK")
# 分块发送
# chunk_size = 4096
# for i in range(0, len(processed_audio), chunk_size):
# chunk = processed_audio[i:i+chunk_size]
# await websocket.send_bytes(chunk)
# # 小延时,避免发送过快导致 ESP32 缓冲区溢出
# # 4096 bytes / 32000 bytes/s (16k*2) = ~0.128s
# # 0.04s 约为 3 倍速发送,既保证缓冲又不至于拥塞
# await asyncio.sleep(0.04)
# 4. 如果有识别结果调用文生图API生成图片
if asr_text:
print(f"Generating image for: {asr_text}")
# await websocket.send_text("STOP_PLAYBACK")
print("Server processing finished (No playback sent).")
# 先发送 ASR 文字到 ESP32 显示
await websocket.send_text(f"ASR:{asr_text}")
await websocket.send_text("GENERATING_IMAGE:正在生成图片,请稍候...")
# 等待一会让 ESP32 显示文字
await asyncio.sleep(0.5)
# 调用文生图API
image_path = await asyncio.to_thread(generate_image, asr_text)
if image_path and os.path.exists(image_path):
# 读取图片并发送回ESP32
with open(image_path, 'rb') as f:
image_data = f.read()
print(f"Sending image to ESP32, size: {len(image_data)} bytes")
# 将图片转换为base64发送
image_b64 = base64.b64encode(image_data).decode('utf-8')
await websocket.send_text(f"IMAGE_START:{len(image_data)}")
# 分片发送图片数据
chunk_size = 4096
for i in range(0, len(image_b64), chunk_size):
chunk = image_b64[i:i+chunk_size]
await websocket.send_text(f"IMAGE_DATA:{chunk}")
await websocket.send_text("IMAGE_END")
print("Image sent to ESP32")
else:
await websocket.send_text("IMAGE_ERROR:图片生成失败")
else:
print("No ASR text, skipping image generation")
print("Server processing finished.")
elif text.startswith("GET_FONTS_BATCH:"):
# Format: GET_FONTS_BATCH:code1,code2,code3 (decimal unicode)