Files
V2_micropython/websocket_server/server.py
jeremygan2021 2470013ef3 1
2026-03-03 21:12:03 +08:00

464 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
import uvicorn
import asyncio
import os
import subprocess
import struct
import base64
from dotenv import load_dotenv
import dashscope
from dashscope.audio.asr import Recognition, RecognitionCallback, RecognitionResult
from dashscope import ImageSynthesis
import json
# Load environment variables from .env and configure the DashScope SDK key.
load_dotenv()
dashscope.api_key = os.getenv("DASHSCOPE_API_KEY")
app = FastAPI()
# Buffer accumulating the raw audio bytes received over the websocket.
audio_buffer = bytearray()
RECORDING_RAW_FILE = "received_audio.raw"
RECORDING_MP3_FILE = "received_audio.mp3"
VOLUME_GAIN = 10.0  # amplification factor applied to incoming audio samples
FONT_FILE = "GB2312-16.bin"
GENERATED_IMAGE_FILE = "generated_image.png"
GENERATED_THUMB_FILE = "generated_thumb.bin"
class MyRecognitionCallback(RecognitionCallback):
    """DashScope ASR callback that relays recognized text to a websocket client.

    The SDK invokes these hooks on its own worker thread, so the websocket
    send is scheduled onto the server's event loop with
    ``run_coroutine_threadsafe`` instead of being awaited directly.
    """

    def __init__(self, websocket: WebSocket, loop: asyncio.AbstractEventLoop):
        self.websocket = websocket
        self.loop = loop
        # Most recent recognized sentence; read by the server after stop.
        self.final_text = ""

    def on_open(self) -> None:
        print("ASR Session started")

    def on_close(self) -> None:
        print("ASR Session closed")

    def on_event(self, result: RecognitionResult) -> None:
        sentence = result.get_sentence()
        if not sentence:
            return
        recognized = sentence['text']
        print(f"ASR Result: {recognized}")
        self.final_text = recognized
        # Hand the transcript off to the event loop from this worker thread.
        try:
            asyncio.run_coroutine_threadsafe(
                self.websocket.send_text(f"ASR:{recognized}"),
                self.loop,
            )
        except Exception as e:
            print(f"Failed to send ASR result to client: {e}")
def process_chunk_32_to_16(chunk_bytes, gain=1.0):
    """Convert 32-bit I2S frames to amplified 16-bit little-endian PCM.

    Each complete 4-byte input frame carries its useful signal in the upper
    two bytes; those are read as a little-endian signed 16-bit sample,
    scaled by ``gain``, and clamped to the int16 range. Trailing bytes that
    do not form a complete frame are dropped.
    """
    out = bytearray()
    total = len(chunk_bytes)
    # Stop at total - 3 so only complete 4-byte frames are consumed.
    for offset in range(0, total - 3, 4):
        # Bytes [offset+2, offset+3] hold the 16-bit sample.
        raw = struct.unpack_from('<h', chunk_bytes, offset + 2)[0]
        amplified = int(raw * gain)
        # Clamp to the signed 16-bit range to prevent wrap-around clicks.
        clamped = max(-32768, min(32767, amplified))
        out += struct.pack('<h', clamped)
    return out
def generate_image(prompt, websocket=None):
    """Generate an image from *prompt* via the Wanx text-to-image API.

    Downloads the generated picture, then (when Pillow is available)
    rescales it to 120x120 and converts it to raw little-endian RGB565 for
    the ESP32's display.

    Returns:
        Path of the RGB565 thumbnail on success, the original PNG when
        Pillow is missing or the conversion fails, or ``None`` when the
        generation request itself fails.
    """
    print(f"Generating image for prompt: {prompt}")
    try:
        response = ImageSynthesis.call(
            model='wanx-v1.0-text-to-image',
            prompt=prompt,
            size='512x512',
            n=1
        )
        if response.status_code == 200:
            image_url = response.output['results'][0]['url']
            print(f"Image generated, downloading from: {image_url}")
            import urllib.request
            urllib.request.urlretrieve(image_url, GENERATED_IMAGE_FILE)
            print(f"Image saved to {GENERATED_IMAGE_FILE}")
            # Downscale and convert to raw RGB565.
            try:
                from PIL import Image
                img = Image.open(GENERATED_IMAGE_FILE)
                # Shrink to 120x120 (the screen is 240x240, but the MCU
                # cannot hold a full-resolution copy in RAM).
                thumb_size = 120
                img = img.resize((thumb_size, thumb_size), Image.LANCZOS)
                # BUGFIX: palette ('P') or grayscale ('L') images make
                # getpixel() return an int, which would crash the tuple
                # unpacking below — force a true RGB image first.
                img = img.convert('RGB')
                # Pack each pixel as RGB565 (2 bytes: R5 G6 B5, little-endian).
                rgb565_data = bytearray()
                for y in range(thumb_size):
                    for x in range(thumb_size):
                        r, g, b = img.getpixel((x, y))[:3]
                        r5 = (r >> 3) & 0x1F
                        g6 = (g >> 2) & 0x3F
                        b5 = (b >> 3) & 0x1F
                        rgb565 = (r5 << 11) | (g6 << 5) | b5
                        rgb565_data.extend(struct.pack('<H', rgb565))
                # Persist the thumbnail as a raw .bin file.
                with open(GENERATED_THUMB_FILE, 'wb') as f:
                    f.write(rgb565_data)
                print(f"Thumbnail saved to {GENERATED_THUMB_FILE}, size: {len(rgb565_data)} bytes")
                return GENERATED_THUMB_FILE
            except ImportError:
                print("PIL not available, sending original image")
                return GENERATED_IMAGE_FILE
            except Exception as e:
                print(f"Error processing image: {e}")
                return GENERATED_IMAGE_FILE
        else:
            print(f"Image generation failed: {response.code} - {response.message}")
            return None
    except Exception as e:
        print(f"Error generating image: {e}")
        return None
@app.websocket("/ws/audio")
async def websocket_endpoint(websocket: WebSocket):
    """Main websocket endpoint for the ESP32 client.

    Handles a small text protocol (START_RECORDING / STOP_RECORDING /
    GET_FONTS_BATCH / GET_FONT_UNICODE / GET_FONT) interleaved with binary
    audio frames. Audio is converted and streamed to DashScope ASR in real
    time; on STOP_RECORDING the transcript is saved, converted to MP3, and
    used as a text-to-image prompt whose result is sent back in base64
    chunks.
    """
    global audio_buffer
    await websocket.accept()
    print("Client connected")
    recognition = None
    callback = None  # kept so final_text can be read after recognition stops
    processed_buffer = bytearray()
    loop = asyncio.get_running_loop()
    try:
        while True:
            # Receive a message (may be a text command or binary audio data).
            try:
                message = await websocket.receive()
            except RuntimeError as e:
                if "Cannot call \"receive\" once a disconnect message has been received" in str(e):
                    print("Client disconnected (RuntimeError caught)")
                    break
                raise e
            if "text" in message:
                text = message["text"]
                print(f"Received text: {text}")
                if text == "START_RECORDING":
                    print("Start recording...")
                    audio_buffer = bytearray()  # reset the raw-audio buffer
                    processed_buffer = bytearray()
                    # Start realtime speech recognition.
                    try:
                        callback = MyRecognitionCallback(websocket, loop)
                        recognition = Recognition(
                            model='paraformer-realtime-v2',
                            format='pcm',
                            sample_rate=16000,
                            callback=callback
                        )
                        recognition.start()
                        print("DashScope ASR started")
                    except Exception as e:
                        print(f"Failed to start ASR: {e}")
                        recognition = None
                        callback = None
                elif text == "STOP_RECORDING":
                    print(f"Stop recording. Total raw bytes: {len(audio_buffer)}")
                    # Stop speech recognition.
                    if recognition:
                        try:
                            recognition.stop()
                            print("DashScope ASR stopped")
                        except Exception as e:
                            print(f"Error stopping ASR: {e}")
                        recognition = None
                    # Use the audio that was already processed in real time.
                    processed_audio = processed_buffer
                    print(f"Processed audio size: {len(processed_audio)} bytes (Gain: {VOLUME_GAIN}x)")
                    # Fetch the final ASR transcript.
                    asr_text = ""
                    if callback:
                        asr_text = callback.final_text
                        print(f"Final ASR text: {asr_text}")
                    # 2. Save the raw file (16-bit PCM).
                    with open(RECORDING_RAW_FILE, "wb") as f:
                        f.write(processed_audio)
                    # 3. Convert to MP3 via the ffmpeg CLI (avoids the Python 3.13 audioop removal).
                    try:
                        # ffmpeg -y -f s16le -ar 16000 -ac 1 -i received_audio.raw received_audio.mp3
                        cmd = [
                            "ffmpeg",
                            "-y",  # overwrite output file
                            "-f", "s16le",  # input format: signed 16-bit little endian
                            "-ar", "16000",  # input sample rate
                            "-ac", "1",  # input channel count
                            "-i", RECORDING_RAW_FILE,
                            RECORDING_MP3_FILE
                        ]
                        print(f"Running command: {' '.join(cmd)}")
                        # Use asyncio.create_subprocess_exec instead of subprocess.run to avoid blocking the event loop
                        process = await asyncio.create_subprocess_exec(
                            *cmd,
                            stdout=asyncio.subprocess.PIPE,
                            stderr=asyncio.subprocess.PIPE
                        )
                        stdout, stderr = await process.communicate()
                        if process.returncode != 0:
                            raise subprocess.CalledProcessError(process.returncode, cmd, output=stdout, stderr=stderr)
                        print(f"Saved MP3 to {RECORDING_MP3_FILE}")
                    except subprocess.CalledProcessError as e:
                        print(f"Error converting to MP3: {e}")
                        # stderr might be bytes
                        error_msg = e.stderr.decode() if isinstance(e.stderr, bytes) else str(e.stderr)
                        print(f"FFmpeg stderr: {error_msg}")
                    except FileNotFoundError:
                        print("Error: ffmpeg not found. Please install ffmpeg.")
                    except Exception as e:
                        print(f"Error converting to MP3: {e}")
                    # 4. If a transcript exists, call the text-to-image API.
                    if asr_text:
                        print(f"Generating image for: {asr_text}")
                        # Send the ASR text first so the ESP32 can display it.
                        await websocket.send_text(f"ASR:{asr_text}")
                        await websocket.send_text("GENERATING_IMAGE:正在生成图片,请稍候...")
                        # Give the ESP32 a moment to render the text.
                        await asyncio.sleep(0.5)
                        # Run the (blocking) text-to-image call off the event loop.
                        image_path = await asyncio.to_thread(generate_image, asr_text)
                        if image_path and os.path.exists(image_path):
                            # Read the image and send it back to the ESP32.
                            with open(image_path, 'rb') as f:
                                image_data = f.read()
                            print(f"Sending image to ESP32, size: {len(image_data)} bytes")
                            # Transfer the image as base64 text frames.
                            image_b64 = base64.b64encode(image_data).decode('utf-8')
                            await websocket.send_text(f"IMAGE_START:{len(image_data)}")
                            # Send the image data in fixed-size chunks.
                            chunk_size = 4096
                            for i in range(0, len(image_b64), chunk_size):
                                chunk = image_b64[i:i+chunk_size]
                                await websocket.send_text(f"IMAGE_DATA:{chunk}")
                            await websocket.send_text("IMAGE_END")
                            print("Image sent to ESP32")
                        else:
                            await websocket.send_text("IMAGE_ERROR:图片生成失败")
                    else:
                        print("No ASR text, skipping image generation")
                    print("Server processing finished.")
                elif text.startswith("GET_FONTS_BATCH:"):
                    # Format: GET_FONTS_BATCH:code1,code2,code3 (decimal unicode)
                    try:
                        codes_str = text.split(":")[1]
                        code_list = codes_str.split(",")
                        print(f"Batch Font Request for {len(code_list)} chars: {code_list}")
                        for code_str in code_list:
                            if not code_str: continue
                            try:
                                unicode_val = int(code_str)
                                char = chr(unicode_val)
                                gb_bytes = char.encode('gb2312')
                                if len(gb_bytes) == 2:
                                    code = struct.unpack('>H', gb_bytes)[0]
                                else:
                                    print(f"Character {char} is not a valid 2-byte GB2312 char")
                                    # Send empty/dummy? Or just skip.
                                    # Better to send something so client doesn't wait forever if it counts responses.
                                    # But client probably uses a set of missing chars.
                                    continue
                                # Calc offset
                                area = (code >> 8) - 0xA0
                                index = (code & 0xFF) - 0xA0
                                if area >= 1 and index >= 1:
                                    offset = ((area - 1) * 94 + (index - 1)) * 32
                                    # Read font file
                                    # Optimization: Open file once outside loop?
                                    # For simplicity, keep it here, OS caching helps.
                                    script_dir = os.path.dirname(os.path.abspath(__file__))
                                    font_path = os.path.join(script_dir, FONT_FILE)
                                    if not os.path.exists(font_path):
                                        font_path = os.path.join(script_dir, "..", FONT_FILE)
                                    if not os.path.exists(font_path):
                                        font_path = FONT_FILE
                                    if os.path.exists(font_path):
                                        with open(font_path, "rb") as f:
                                            f.seek(offset)
                                            font_data = f.read(32)
                                        if len(font_data) == 32:
                                            import binascii
                                            hex_data = binascii.hexlify(font_data).decode('utf-8')
                                            response = f"FONT_DATA:{code_str}:{hex_data}"
                                            await websocket.send_text(response)
                                            # Small yield to let network flush?
                                            # await asyncio.sleep(0.001)
                            except Exception as e:
                                print(f"Error processing batch item {code_str}: {e}")
                        # Send a completion marker
                        await websocket.send_text("FONT_BATCH_END")
                    except Exception as e:
                        print(f"Error handling BATCH FONT request: {e}")
                        await websocket.send_text("FONT_BATCH_END")  # Ensure we unblock client
                elif text.startswith("GET_FONT_UNICODE:") or text.startswith("GET_FONT:"):
                    # Format: GET_FONT_UNICODE:12345 (decimal) or GET_FONT:0xA1A1 (hex)
                    try:
                        is_unicode = text.startswith("GET_FONT_UNICODE:")
                        code_str = text.split(":")[1]
                        target_code_str = code_str  # Used for response
                        if is_unicode:
                            unicode_val = int(code_str)
                            char = chr(unicode_val)
                            try:
                                gb_bytes = char.encode('gb2312')
                                if len(gb_bytes) == 2:
                                    code = struct.unpack('>H', gb_bytes)[0]
                                else:
                                    print(f"Character {char} is not a valid 2-byte GB2312 char")
                                    continue
                            except Exception as e:
                                print(f"Failed to encode {char} to gb2312: {e}")
                                continue
                        else:
                            code = int(code_str, 16)
                        # Compute the glyph offset.
                        # GB2312 code range: 0xA1A1 - 0xFEFE
                        # Area code: high byte - 0xA0
                        # Index code: low byte - 0xA0
                        area = (code >> 8) - 0xA0
                        index = (code & 0xFF) - 0xA0
                        if area >= 1 and index >= 1:
                            offset = ((area - 1) * 94 + (index - 1)) * 32
                            # Read the font file.
                            # NOTE: opened per request for simplicity; cache the handle or content if concurrency grows.
                            # The font file is assumed to be in the current or parent directory.
                            # Prioritize finding the file in the script's directory
                            script_dir = os.path.dirname(os.path.abspath(__file__))
                            font_path = os.path.join(script_dir, FONT_FILE)
                            # Fallback: check one level up
                            if not os.path.exists(font_path):
                                font_path = os.path.join(script_dir, "..", FONT_FILE)
                            # Fallback: check current working directory
                            if not os.path.exists(font_path):
                                font_path = FONT_FILE
                            if os.path.exists(font_path):
                                print(f"Reading font from: {font_path} (Offset: {offset})")
                                with open(font_path, "rb") as f:
                                    f.seek(offset)
                                    font_data = f.read(32)
                                if len(font_data) == 32:
                                    import binascii
                                    hex_data = binascii.hexlify(font_data).decode('utf-8')
                                    # Return the original requested code (unicode or hex) so client can map it back
                                    response = f"FONT_DATA:{target_code_str}:{hex_data}"
                                    # print(f"Sending Font Response: {response[:30]}...")
                                    await websocket.send_text(response)
                                else:
                                    print(f"Error: Read {len(font_data)} bytes for font data (expected 32)")
                            else:
                                print(f"Font file not found: {font_path}")
                        else:
                            print(f"Invalid GB2312 code derived: {code:X} (Area: {area}, Index: {index})")
                    except Exception as e:
                        print(f"Error handling FONT request: {e}")
            elif "bytes" in message:
                # Append the received audio data to the raw buffer.
                data = message["bytes"]
                audio_buffer.extend(data)
                # Process in real time and feed the ASR stream.
                pcm_chunk = process_chunk_32_to_16(data, VOLUME_GAIN)
                processed_buffer.extend(pcm_chunk)
                if recognition:
                    try:
                        recognition.send_audio_frame(pcm_chunk)
                    except Exception as e:
                        print(f"Error sending audio frame to ASR: {e}")
    except WebSocketDisconnect:
        print("Client disconnected")
        if recognition:
            try:
                recognition.stop()
            except:
                pass
    except Exception as e:
        print(f"Error: {e}")
        if recognition:
            try:
                recognition.stop()
            except:
                pass
if __name__ == "__main__":
    # Print this machine's LAN IP so the ESP32 knows where to connect.
    import socket

    def _local_ip():
        """Best-effort LAN IP of this host.

        BUGFIX: gethostbyname(gethostname()) frequently resolves to
        127.0.0.1 (common on Linux), which an external device cannot use.
        Connecting a UDP socket to a public address selects the outbound
        interface without sending any packets; fall back to the old
        behavior if that fails (e.g. no network).
        """
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(("8.8.8.8", 80))
                return s.getsockname()[0]
        except OSError:
            return socket.gethostbyname(socket.gethostname())

    print(f"Server running on ws://{_local_ip()}:8000/ws/audio")
    uvicorn.run(app, host="0.0.0.0", port=8000)