Files
V2_micropython/websocket_server/server.py
jeremygan2021 2470013ef3 1
2026-03-03 21:12:03 +08:00

464 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
import uvicorn
import asyncio
import os
import subprocess
import struct
import base64
from dotenv import load_dotenv
import dashscope
from dashscope.audio.asr import Recognition, RecognitionCallback, RecognitionResult
from dashscope import ImageSynthesis
import json
# Load environment variables from .env and configure the DashScope SDK key.
load_dotenv()
dashscope.api_key = os.getenv("DASHSCOPE_API_KEY")
app = FastAPI()
# Buffer accumulating the raw audio bytes received over the websocket.
audio_buffer = bytearray()
RECORDING_RAW_FILE = "received_audio.raw"
RECORDING_MP3_FILE = "received_audio.mp3"
VOLUME_GAIN = 10.0  # amplification factor applied to incoming audio samples
FONT_FILE = "GB2312-16.bin"
GENERATED_IMAGE_FILE = "generated_image.png"
GENERATED_THUMB_FILE = "generated_thumb.bin"
class MyRecognitionCallback(RecognitionCallback):
    """DashScope ASR callback that relays recognized text to a websocket client.

    The SDK invokes these hooks on its own worker thread, so the websocket
    send is scheduled onto the server's event loop with
    ``run_coroutine_threadsafe`` instead of being awaited directly.
    """

    def __init__(self, websocket: WebSocket, loop: asyncio.AbstractEventLoop):
        self.websocket = websocket
        self.loop = loop
        # Most recent recognized sentence; read by the server after stop.
        self.final_text = ""

    def on_open(self) -> None:
        print("ASR Session started")

    def on_close(self) -> None:
        print("ASR Session closed")

    def on_event(self, result: RecognitionResult) -> None:
        sentence = result.get_sentence()
        if not sentence:
            return
        recognized = sentence['text']
        print(f"ASR Result: {recognized}")
        self.final_text = recognized
        # Hand the transcript off to the event loop from this worker thread.
        try:
            asyncio.run_coroutine_threadsafe(
                self.websocket.send_text(f"ASR:{recognized}"),
                self.loop,
            )
        except Exception as e:
            print(f"Failed to send ASR result to client: {e}")
def process_chunk_32_to_16(chunk_bytes, gain=1.0):
    """Convert 32-bit I2S frames to amplified 16-bit little-endian PCM.

    Each complete 4-byte input frame carries its useful signal in the upper
    two bytes; those are read as a little-endian signed 16-bit sample,
    scaled by ``gain``, and clamped to the int16 range. Trailing bytes that
    do not form a complete frame are dropped.
    """
    out = bytearray()
    total = len(chunk_bytes)
    # Stop at total - 3 so only complete 4-byte frames are consumed.
    for offset in range(0, total - 3, 4):
        # Bytes [offset+2, offset+3] hold the 16-bit sample.
        raw = struct.unpack_from('<h', chunk_bytes, offset + 2)[0]
        amplified = int(raw * gain)
        # Clamp to the signed 16-bit range to prevent wrap-around clicks.
        clamped = max(-32768, min(32767, amplified))
        out += struct.pack('<h', clamped)
    return out
def generate_image(prompt, websocket=None):
    """Generate an image from *prompt* via the Wanx text-to-image API.

    Downloads the generated picture, then (when Pillow is available)
    rescales it to 120x120 and converts it to raw little-endian RGB565 for
    the ESP32's display.

    Returns:
        Path of the RGB565 thumbnail on success, the original PNG when
        Pillow is missing or the conversion fails, or ``None`` when the
        generation request itself fails.
    """
    print(f"Generating image for prompt: {prompt}")
    try:
        response = ImageSynthesis.call(
            model='wanx-v1.0-text-to-image',
            prompt=prompt,
            size='512x512',
            n=1
        )
        if response.status_code == 200:
            image_url = response.output['results'][0]['url']
            print(f"Image generated, downloading from: {image_url}")
            import urllib.request
            urllib.request.urlretrieve(image_url, GENERATED_IMAGE_FILE)
            print(f"Image saved to {GENERATED_IMAGE_FILE}")
            # Downscale and convert to raw RGB565.
            try:
                from PIL import Image
                img = Image.open(GENERATED_IMAGE_FILE)
                # Shrink to 120x120 (the screen is 240x240, but the MCU
                # cannot hold a full-resolution copy in RAM).
                thumb_size = 120
                img = img.resize((thumb_size, thumb_size), Image.LANCZOS)
                # BUGFIX: palette ('P') or grayscale ('L') images make
                # getpixel() return an int, which would crash the tuple
                # unpacking below — force a true RGB image first.
                img = img.convert('RGB')
                # Pack each pixel as RGB565 (2 bytes: R5 G6 B5, little-endian).
                rgb565_data = bytearray()
                for y in range(thumb_size):
                    for x in range(thumb_size):
                        r, g, b = img.getpixel((x, y))[:3]
                        r5 = (r >> 3) & 0x1F
                        g6 = (g >> 2) & 0x3F
                        b5 = (b >> 3) & 0x1F
                        rgb565 = (r5 << 11) | (g6 << 5) | b5
                        rgb565_data.extend(struct.pack('<H', rgb565))
                # Persist the thumbnail as a raw .bin file.
                with open(GENERATED_THUMB_FILE, 'wb') as f:
                    f.write(rgb565_data)
                print(f"Thumbnail saved to {GENERATED_THUMB_FILE}, size: {len(rgb565_data)} bytes")
                return GENERATED_THUMB_FILE
            except ImportError:
                print("PIL not available, sending original image")
                return GENERATED_IMAGE_FILE
            except Exception as e:
                print(f"Error processing image: {e}")
                return GENERATED_IMAGE_FILE
        else:
            print(f"Image generation failed: {response.code} - {response.message}")
            return None
    except Exception as e:
        print(f"Error generating image: {e}")
        return None
@app.websocket("/ws/audio")
async def websocket_endpoint(websocket: WebSocket):
    """Main websocket endpoint for the ESP32 client.

    Handles a small text protocol (START_RECORDING / STOP_RECORDING /
    GET_FONTS_BATCH / GET_FONT_UNICODE / GET_FONT) interleaved with binary
    audio frames. Audio is converted and streamed to DashScope ASR in real
    time; on STOP_RECORDING the transcript is saved, converted to MP3, and
    used as a text-to-image prompt whose result is sent back in base64
    chunks.
    """
    global audio_buffer
    await websocket.accept()
    print("Client connected")
    recognition = None
    callback = None  # kept so final_text can be read after recognition stops
    processed_buffer = bytearray()
    loop = asyncio.get_running_loop()
    try:
        while True:
            # Receive a message (may be a text command or binary audio data).
            try:
                message = await websocket.receive()
            except RuntimeError as e:
                if "Cannot call \"receive\" once a disconnect message has been received" in str(e):
                    print("Client disconnected (RuntimeError caught)")
                    break
                raise e
            if "text" in message:
                text = message["text"]
                print(f"Received text: {text}")
                if text == "START_RECORDING":
                    print("Start recording...")
                    audio_buffer = bytearray()  # reset the raw-audio buffer
                    processed_buffer = bytearray()
                    # Start realtime speech recognition.
                    try:
                        callback = MyRecognitionCallback(websocket, loop)
                        recognition = Recognition(
                            model='paraformer-realtime-v2',
                            format='pcm',
                            sample_rate=16000,
                            callback=callback
                        )
                        recognition.start()
                        print("DashScope ASR started")
                    except Exception as e:
                        print(f"Failed to start ASR: {e}")
                        recognition = None
                        callback = None
                elif text == "STOP_RECORDING":
                    print(f"Stop recording. Total raw bytes: {len(audio_buffer)}")
                    # Stop speech recognition.
                    if recognition:
                        try:
                            recognition.stop()
                            print("DashScope ASR stopped")
                        except Exception as e:
                            print(f"Error stopping ASR: {e}")
                        recognition = None
                    # Use the audio that was already processed in real time.
                    processed_audio = processed_buffer
                    print(f"Processed audio size: {len(processed_audio)} bytes (Gain: {VOLUME_GAIN}x)")
                    # Fetch the final ASR transcript.
                    asr_text = ""
                    if callback:
                        asr_text = callback.final_text
                        print(f"Final ASR text: {asr_text}")
                    # 2. Save the raw file (16-bit PCM).
                    with open(RECORDING_RAW_FILE, "wb") as f:
                        f.write(processed_audio)
                    # 3. Convert to MP3 via the ffmpeg CLI (avoids the Python 3.13 audioop removal).
                    try:
                        # ffmpeg -y -f s16le -ar 16000 -ac 1 -i received_audio.raw received_audio.mp3
                        cmd = [
                            "ffmpeg",
                            "-y",  # overwrite output file
                            "-f", "s16le",  # input format: signed 16-bit little endian
                            "-ar", "16000",  # input sample rate
                            "-ac", "1",  # input channel count
                            "-i", RECORDING_RAW_FILE,
                            RECORDING_MP3_FILE
                        ]
                        print(f"Running command: {' '.join(cmd)}")
                        # Use asyncio.create_subprocess_exec instead of subprocess.run to avoid blocking the event loop
                        process = await asyncio.create_subprocess_exec(
                            *cmd,
                            stdout=asyncio.subprocess.PIPE,
                            stderr=asyncio.subprocess.PIPE
                        )
                        stdout, stderr = await process.communicate()
                        if process.returncode != 0:
                            raise subprocess.CalledProcessError(process.returncode, cmd, output=stdout, stderr=stderr)
                        print(f"Saved MP3 to {RECORDING_MP3_FILE}")
                    except subprocess.CalledProcessError as e:
                        print(f"Error converting to MP3: {e}")
                        # stderr might be bytes
                        error_msg = e.stderr.decode() if isinstance(e.stderr, bytes) else str(e.stderr)
                        print(f"FFmpeg stderr: {error_msg}")
                    except FileNotFoundError:
                        print("Error: ffmpeg not found. Please install ffmpeg.")
                    except Exception as e:
                        print(f"Error converting to MP3: {e}")
                    # 4. If a transcript exists, call the text-to-image API.
                    if asr_text:
                        print(f"Generating image for: {asr_text}")
                        # Send the ASR text first so the ESP32 can display it.
                        await websocket.send_text(f"ASR:{asr_text}")
                        await websocket.send_text("GENERATING_IMAGE:正在生成图片,请稍候...")
                        # Give the ESP32 a moment to render the text.
                        await asyncio.sleep(0.5)
                        # Run the (blocking) text-to-image call off the event loop.
                        image_path = await asyncio.to_thread(generate_image, asr_text)
                        if image_path and os.path.exists(image_path):
                            # Read the image and send it back to the ESP32.
                            with open(image_path, 'rb') as f:
                                image_data = f.read()
                            print(f"Sending image to ESP32, size: {len(image_data)} bytes")
                            # Transfer the image as base64 text frames.
                            image_b64 = base64.b64encode(image_data).decode('utf-8')
                            await websocket.send_text(f"IMAGE_START:{len(image_data)}")
                            # Send the image data in fixed-size chunks.
                            chunk_size = 4096
                            for i in range(0, len(image_b64), chunk_size):
                                chunk = image_b64[i:i+chunk_size]
                                await websocket.send_text(f"IMAGE_DATA:{chunk}")
                            await websocket.send_text("IMAGE_END")
                            print("Image sent to ESP32")
                        else:
                            await websocket.send_text("IMAGE_ERROR:图片生成失败")
                    else:
                        print("No ASR text, skipping image generation")
                    print("Server processing finished.")
                elif text.startswith("GET_FONTS_BATCH:"):
                    # Format: GET_FONTS_BATCH:code1,code2,code3 (decimal unicode)
                    try:
                        codes_str = text.split(":")[1]
                        code_list = codes_str.split(",")
                        print(f"Batch Font Request for {len(code_list)} chars: {code_list}")
                        for code_str in code_list:
                            if not code_str: continue
                            try:
                                unicode_val = int(code_str)
                                char = chr(unicode_val)
                                gb_bytes = char.encode('gb2312')
                                if len(gb_bytes) == 2:
                                    code = struct.unpack('>H', gb_bytes)[0]
                                else:
                                    print(f"Character {char} is not a valid 2-byte GB2312 char")
                                    # Send empty/dummy? Or just skip.
                                    # Better to send something so client doesn't wait forever if it counts responses.
                                    # But client probably uses a set of missing chars.
                                    continue
                                # Calc offset
                                area = (code >> 8) - 0xA0
                                index = (code & 0xFF) - 0xA0
                                if area >= 1 and index >= 1:
                                    offset = ((area - 1) * 94 + (index - 1)) * 32
                                    # Read font file
                                    # Optimization: Open file once outside loop?
                                    # For simplicity, keep it here, OS caching helps.
                                    script_dir = os.path.dirname(os.path.abspath(__file__))
                                    font_path = os.path.join(script_dir, FONT_FILE)
                                    if not os.path.exists(font_path):
                                        font_path = os.path.join(script_dir, "..", FONT_FILE)
                                    if not os.path.exists(font_path):
                                        font_path = FONT_FILE
                                    if os.path.exists(font_path):
                                        with open(font_path, "rb") as f:
                                            f.seek(offset)
                                            font_data = f.read(32)
                                        if len(font_data) == 32:
                                            import binascii
                                            hex_data = binascii.hexlify(font_data).decode('utf-8')
                                            response = f"FONT_DATA:{code_str}:{hex_data}"
                                            await websocket.send_text(response)
                                            # Small yield to let network flush?
                                            # await asyncio.sleep(0.001)
                            except Exception as e:
                                print(f"Error processing batch item {code_str}: {e}")
                        # Send a completion marker
                        await websocket.send_text("FONT_BATCH_END")
                    except Exception as e:
                        print(f"Error handling BATCH FONT request: {e}")
                        await websocket.send_text("FONT_BATCH_END")  # Ensure we unblock client
                elif text.startswith("GET_FONT_UNICODE:") or text.startswith("GET_FONT:"):
                    # Format: GET_FONT_UNICODE:12345 (decimal) or GET_FONT:0xA1A1 (hex)
                    try:
                        is_unicode = text.startswith("GET_FONT_UNICODE:")
                        code_str = text.split(":")[1]
                        target_code_str = code_str  # Used for response
                        if is_unicode:
                            unicode_val = int(code_str)
                            char = chr(unicode_val)
                            try:
                                gb_bytes = char.encode('gb2312')
                                if len(gb_bytes) == 2:
                                    code = struct.unpack('>H', gb_bytes)[0]
                                else:
                                    print(f"Character {char} is not a valid 2-byte GB2312 char")
                                    continue
                            except Exception as e:
                                print(f"Failed to encode {char} to gb2312: {e}")
                                continue
                        else:
                            code = int(code_str, 16)
                        # Compute the glyph offset.
                        # GB2312 code range: 0xA1A1 - 0xFEFE
                        # Area code: high byte - 0xA0
                        # Index code: low byte - 0xA0
                        area = (code >> 8) - 0xA0
                        index = (code & 0xFF) - 0xA0
                        if area >= 1 and index >= 1:
                            offset = ((area - 1) * 94 + (index - 1)) * 32
                            # Read the font file.
                            # NOTE: opened per request for simplicity; cache the handle or content if concurrency grows.
                            # The font file is assumed to be in the current or parent directory.
                            # Prioritize finding the file in the script's directory
                            script_dir = os.path.dirname(os.path.abspath(__file__))
                            font_path = os.path.join(script_dir, FONT_FILE)
                            # Fallback: check one level up
                            if not os.path.exists(font_path):
                                font_path = os.path.join(script_dir, "..", FONT_FILE)
                            # Fallback: check current working directory
                            if not os.path.exists(font_path):
                                font_path = FONT_FILE
                            if os.path.exists(font_path):
                                print(f"Reading font from: {font_path} (Offset: {offset})")
                                with open(font_path, "rb") as f:
                                    f.seek(offset)
                                    font_data = f.read(32)
                                if len(font_data) == 32:
                                    import binascii
                                    hex_data = binascii.hexlify(font_data).decode('utf-8')
                                    # Return the original requested code (unicode or hex) so client can map it back
                                    response = f"FONT_DATA:{target_code_str}:{hex_data}"
                                    # print(f"Sending Font Response: {response[:30]}...")
                                    await websocket.send_text(response)
                                else:
                                    print(f"Error: Read {len(font_data)} bytes for font data (expected 32)")
                            else:
                                print(f"Font file not found: {font_path}")
                        else:
                            print(f"Invalid GB2312 code derived: {code:X} (Area: {area}, Index: {index})")
                    except Exception as e:
                        print(f"Error handling FONT request: {e}")
            elif "bytes" in message:
                # Append the received audio data to the raw buffer.
                data = message["bytes"]
                audio_buffer.extend(data)
                # Process in real time and feed the ASR stream.
                pcm_chunk = process_chunk_32_to_16(data, VOLUME_GAIN)
                processed_buffer.extend(pcm_chunk)
                if recognition:
                    try:
                        recognition.send_audio_frame(pcm_chunk)
                    except Exception as e:
                        print(f"Error sending audio frame to ASR: {e}")
    except WebSocketDisconnect:
        print("Client disconnected")
        if recognition:
            try:
                recognition.stop()
            except:
                pass
    except Exception as e:
        print(f"Error: {e}")
        if recognition:
            try:
                recognition.stop()
            except:
                pass
if __name__ == "__main__":
    # Print this machine's LAN IP so the ESP32 knows where to connect.
    import socket

    def _local_ip():
        """Best-effort LAN IP of this host.

        BUGFIX: gethostbyname(gethostname()) frequently resolves to
        127.0.0.1 (common on Linux), which an external device cannot use.
        Connecting a UDP socket to a public address selects the outbound
        interface without sending any packets; fall back to the old
        behavior if that fails (e.g. no network).
        """
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(("8.8.8.8", 80))
                return s.getsockname()[0]
        except OSError:
            return socket.gethostbyname(socket.gethostname())

    print(f"Server running on ws://{_local_ip()}:8000/ws/audio")
    uvicorn.run(app, host="0.0.0.0", port=8000)