1
This commit is contained in:
@@ -4,9 +4,11 @@ import asyncio
|
||||
import os
|
||||
import subprocess
|
||||
import struct
|
||||
import base64
|
||||
from dotenv import load_dotenv
|
||||
import dashscope
|
||||
from dashscope.audio.asr import Recognition, RecognitionCallback, RecognitionResult
|
||||
from dashscope import ImageSynthesis
|
||||
import json
|
||||
|
||||
# 加载环境变量
|
||||
@@ -19,13 +21,16 @@ app = FastAPI()
|
||||
# Buffer accumulating raw PCM audio received over the websocket.
audio_buffer = bytearray()

# Output files for the recorded audio.
RECORDING_RAW_FILE = "received_audio.raw"
RECORDING_MP3_FILE = "received_audio.mp3"

# Amplification factor applied to incoming samples.
# BUGFIX: was assigned twice with the same value; duplicate removed.
VOLUME_GAIN = 10.0

# Bitmap font used when rendering Chinese text for the ESP32 display.
FONT_FILE = "GB2312-16.bin"

# Text-to-image outputs: full PNG plus the RGB565 thumbnail sent to the ESP32.
GENERATED_IMAGE_FILE = "generated_image.png"
GENERATED_THUMB_FILE = "generated_thumb.bin"
||||
|
||||
class MyRecognitionCallback(RecognitionCallback):
|
||||
def __init__(self, websocket: WebSocket, loop: asyncio.AbstractEventLoop):
    """Bind the client websocket and event loop used to push ASR results.

    Args:
        websocket: Connection that recognition results are sent back on.
        loop: Event loop used to schedule sends from the ASR callback thread.
    """
    self.websocket, self.loop = websocket, loop
    # Last recognized sentence; read by the endpoint after recording stops.
    self.final_text = ""
|
||||
def on_open(self) -> None:
|
||||
print("ASR Session started")
|
||||
@@ -37,6 +42,7 @@ class MyRecognitionCallback(RecognitionCallback):
|
||||
if result.get_sentence():
|
||||
text = result.get_sentence()['text']
|
||||
print(f"ASR Result: {text}")
|
||||
self.final_text = text # 保存识别结果
|
||||
# 将识别结果发送回客户端
|
||||
try:
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
@@ -65,6 +71,74 @@ def process_chunk_32_to_16(chunk_bytes, gain=1.0):
|
||||
processed_chunk.extend(struct.pack('<h', sample))
|
||||
return processed_chunk
|
||||
|
||||
|
||||
def generate_image(prompt, websocket=None):
    """Generate an image from *prompt* via the Wanxiang text-to-image API.

    Downloads the generated 512x512 image, then (if Pillow is available)
    converts it to a 120x120 RGB565 thumbnail suitable for the ESP32 screen.

    Args:
        prompt: Text prompt passed to the image synthesis model.
        websocket: Unused here; kept for interface compatibility with callers.

    Returns:
        Path to the RGB565 thumbnail (.bin) on full success, the raw PNG
        path if thumbnail conversion is unavailable or fails, or None when
        image generation itself fails.
    """
    print(f"Generating image for prompt: {prompt}")

    try:
        response = ImageSynthesis.call(
            model='wanx-v1.0-text-to-image',
            prompt=prompt,
            size='512x512',
            n=1
        )

        if response.status_code == 200:
            image_url = response.output['results'][0]['url']
            print(f"Image generated, downloading from: {image_url}")

            import urllib.request
            urllib.request.urlretrieve(image_url, GENERATED_IMAGE_FILE)
            print(f"Image saved to {GENERATED_IMAGE_FILE}")

            # Scale down and convert to RGB565 raw data for the display.
            try:
                from PIL import Image
                img = Image.open(GENERATED_IMAGE_FILE)

                # BUGFIX: API PNGs may be RGBA or palette ('P') mode, where
                # pixel access does not yield a plain (r, g, b) tuple
                # (palette mode returns an int). Normalize to RGB first.
                img = img.convert("RGB")

                # 120x120: the screen is 240x240, but ESP32 RAM is limited.
                thumb_size = 120
                img = img.resize((thumb_size, thumb_size), Image.LANCZOS)

                # Pack each pixel as 2-byte RGB565 (R5 G6 B5), little-endian
                # (low byte first) as expected by the ESP32 side.
                pixels = img.load()  # faster than per-pixel getpixel()
                rgb565_data = bytearray()
                for y in range(thumb_size):
                    for x in range(thumb_size):
                        r, g, b = pixels[x, y]
                        rgb565 = ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3)
                        rgb565_data.extend(struct.pack('<H', rgb565))

                with open(GENERATED_THUMB_FILE, 'wb') as f:
                    f.write(rgb565_data)

                print(f"Thumbnail saved to {GENERATED_THUMB_FILE}, size: {len(rgb565_data)} bytes")
                return GENERATED_THUMB_FILE

            except ImportError:
                # Pillow missing: fall back to sending the original PNG.
                print("PIL not available, sending original image")
                return GENERATED_IMAGE_FILE
            except Exception as e:
                # Conversion failed: best-effort fallback to the original PNG.
                print(f"Error processing image: {e}")
                return GENERATED_IMAGE_FILE
        else:
            print(f"Image generation failed: {response.code} - {response.message}")
            return None

    except Exception as e:
        print(f"Error generating image: {e}")
        return None
|
||||
|
||||
@app.websocket("/ws/audio")
|
||||
async def websocket_endpoint(websocket: WebSocket):
|
||||
global audio_buffer
|
||||
@@ -72,6 +146,7 @@ async def websocket_endpoint(websocket: WebSocket):
|
||||
print("Client connected")
|
||||
|
||||
recognition = None
|
||||
callback = None # 保存callback对象
|
||||
processed_buffer = bytearray()
|
||||
loop = asyncio.get_running_loop()
|
||||
|
||||
@@ -109,6 +184,7 @@ async def websocket_endpoint(websocket: WebSocket):
|
||||
except Exception as e:
|
||||
print(f"Failed to start ASR: {e}")
|
||||
recognition = None
|
||||
callback = None
|
||||
|
||||
elif text == "STOP_RECORDING":
|
||||
print(f"Stop recording. Total raw bytes: {len(audio_buffer)}")
|
||||
@@ -127,6 +203,12 @@ async def websocket_endpoint(websocket: WebSocket):
|
||||
|
||||
print(f"Processed audio size: {len(processed_audio)} bytes (Gain: {VOLUME_GAIN}x)")
|
||||
|
||||
# 获取ASR识别结果
|
||||
asr_text = ""
|
||||
if callback:
|
||||
asr_text = callback.final_text
|
||||
print(f"Final ASR text: {asr_text}")
|
||||
|
||||
# 2. 保存原始 RAW 文件 (16-bit PCM)
|
||||
with open(RECORDING_RAW_FILE, "wb") as f:
|
||||
f.write(processed_audio)
|
||||
@@ -167,22 +249,45 @@ async def websocket_endpoint(websocket: WebSocket):
|
||||
except Exception as e:
|
||||
print(f"Error converting to MP3: {e}")
|
||||
|
||||
# 4. 不再发送回客户端播放,提升性能
|
||||
# print("Sending audio back...")
|
||||
# await websocket.send_text("START_PLAYBACK")
|
||||
|
||||
# 分块发送
|
||||
# chunk_size = 4096
|
||||
# for i in range(0, len(processed_audio), chunk_size):
|
||||
# chunk = processed_audio[i:i+chunk_size]
|
||||
# await websocket.send_bytes(chunk)
|
||||
# # 小延时,避免发送过快导致 ESP32 缓冲区溢出
|
||||
# # 4096 bytes / 32000 bytes/s (16k*2) = ~0.128s
|
||||
# # 0.04s 约为 3 倍速发送,既保证缓冲又不至于拥塞
|
||||
# await asyncio.sleep(0.04)
|
||||
# 4. 如果有识别结果,调用文生图API生成图片
|
||||
if asr_text:
|
||||
print(f"Generating image for: {asr_text}")
|
||||
|
||||
# await websocket.send_text("STOP_PLAYBACK")
|
||||
print("Server processing finished (No playback sent).")
|
||||
# 先发送 ASR 文字到 ESP32 显示
|
||||
await websocket.send_text(f"ASR:{asr_text}")
|
||||
await websocket.send_text("GENERATING_IMAGE:正在生成图片,请稍候...")
|
||||
|
||||
# 等待一会让 ESP32 显示文字
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# 调用文生图API
|
||||
image_path = await asyncio.to_thread(generate_image, asr_text)
|
||||
|
||||
if image_path and os.path.exists(image_path):
|
||||
# 读取图片并发送回ESP32
|
||||
with open(image_path, 'rb') as f:
|
||||
image_data = f.read()
|
||||
|
||||
print(f"Sending image to ESP32, size: {len(image_data)} bytes")
|
||||
|
||||
# 将图片转换为base64发送
|
||||
image_b64 = base64.b64encode(image_data).decode('utf-8')
|
||||
await websocket.send_text(f"IMAGE_START:{len(image_data)}")
|
||||
|
||||
# 分片发送图片数据
|
||||
chunk_size = 4096
|
||||
for i in range(0, len(image_b64), chunk_size):
|
||||
chunk = image_b64[i:i+chunk_size]
|
||||
await websocket.send_text(f"IMAGE_DATA:{chunk}")
|
||||
|
||||
await websocket.send_text("IMAGE_END")
|
||||
print("Image sent to ESP32")
|
||||
else:
|
||||
await websocket.send_text("IMAGE_ERROR:图片生成失败")
|
||||
else:
|
||||
print("No ASR text, skipping image generation")
|
||||
|
||||
print("Server processing finished.")
|
||||
|
||||
elif text.startswith("GET_FONTS_BATCH:"):
|
||||
# Format: GET_FONTS_BATCH:code1,code2,code3 (decimal unicode)
|
||||
|
||||
Reference in New Issue
Block a user