t

2026-03-02 22:43:04 +08:00
parent c0882a93a9
commit e0776a1839
18 changed files with 1331 additions and 82 deletions
--- a/websocket_server/.env
+++ b/websocket_server/.env
@@ -0,0 +1 @@
+DASHSCOPE_API_KEY=sk-a294f382488d46a1aa0d7cd8e750729b
--- a/websocket_server/GB2312-16.bin
+++ b/websocket_server/GB2312-16.bin
--- a/websocket_server/README.md
+++ b/websocket_server/README.md
@@ -0,0 +1,31 @@
+# WebSocket Audio Server
+
+This is a FastAPI server that receives audio from an ESP32 via WebSocket, saves it, processes it (converts 32-bit to 16-bit), and sends it back for playback.
+
+## Installation
+
+1. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+## Usage
+
+1. Start the server:
+   ```bash
+   python server.py
+   ```
+   Or:
+   ```bash
+   uvicorn server:app --host 0.0.0.0 --port 8000
+   ```
+
+2. Update the IP address in `main.py` on your ESP32 to match your computer's IP address.
+   Look for the `SERVER_IP` variable.
+
+## Features
+
+- Receives raw audio stream from ESP32.
+- Saves the converted 16-bit PCM audio to `received_audio.raw` and, when `ffmpeg` is installed, an MP3 copy to `received_audio.mp3`.
+- Converts 32-bit audio (from ICS-43434) to 16-bit audio (for MAX98357A).
+- Streams processed audio back to ESP32 for playback.
--- a/websocket_server/pycache/server.cpython-312.pyc
+++ b/websocket_server/pycache/server.cpython-312.pyc
--- a/websocket_server/pycache/server.cpython-313.pyc
+++ b/websocket_server/pycache/server.cpython-313.pyc
--- a/websocket_server/generate_font.py
+++ b/websocket_server/generate_font.py
@@ -0,0 +1,127 @@
+import struct
+import freetype
+import os
+
+# Font file and output file
+FONT_FILE = "/Users/jeremygan/Desktop/python_dev/epaper2/websocket_server/GB2312.ttf"
+OUTPUT_FILE = "/Users/jeremygan/Desktop/python_dev/epaper2/GB2312-16.bin"
+
+# Font size (16x16)
+FONT_SIZE = 16
+
+def create_gb2312_font():
+    # Load the face
+    try:
+        face = freetype.Face(FONT_FILE)
+    except Exception as e:
+        print(f"Error loading font: {e}")
+        return
+
+    # Set char size
+    face.set_pixel_sizes(FONT_SIZE, FONT_SIZE)
+
+    print(f"Generating GB2312 font file: {OUTPUT_FILE}")
+    
+    with open(OUTPUT_FILE, 'wb') as f:
+        # Iterate through GB2312 code points
+        # Area: 0xA1 - 0xFE (161 - 254) -> 94 areas
+        # Index: 0xA1 - 0xFE (161 - 254) -> 94 chars per area
+        
+        count = 0
+        total_chars = 94 * 94
+        
+        # Buffer for empty char (32 bytes of 0x00)
+        empty_char = b'\x00' * 32
+
+        for area in range(0xA1, 0xFF):
+            for index in range(0xA1, 0xFF):
+                # Construct GB2312 code
+                gb_code = bytes([area, index])
+                
+                try:
+                    # Decode to unicode character
+                    char = gb_code.decode('gb2312')
+                    
+                    # Load glyph
+                    face.load_char(char, freetype.FT_LOAD_RENDER | freetype.FT_LOAD_TARGET_MONO)
+                    bitmap = face.glyph.bitmap
+                    
+                    # Convert bitmap to 32 bytes (16x16 / 8)
+                    # The bitmap.buffer is a flat list of bytes.
+                    # For mono rendering, each byte is 0 or 255? No, it's packed?
+                    # FT_LOAD_TARGET_MONO packs 8 pixels into 1 byte.
+                    
+                    # We need to ensure it's 16x16.
+                    # Center the glyph in 16x16 box.
+                    
+                    glyph_width = bitmap.width
+                    glyph_rows = bitmap.rows
+                    glyph_pitch = bitmap.pitch
+                    
+                    # Create a 16x16 buffer (32 bytes)
+                    char_buffer = bytearray(32)
+                    
+                    # Calculate offsets to center
+                    x_off = (FONT_SIZE - glyph_width) // 2
+                    # Vertical alignment is tricky. Let's use bearing Y or just center based on rows.
+                    # A better way is using face.glyph.bitmap_top
+                    # But for fixed height font generation, usually we just center or align baseline.
+                    # Let's try simple centering for now.
+                    y_off = (FONT_SIZE - glyph_rows) // 2
+                    # Adjust y_off if it's too high/low? 
+                    # Let's align to baseline approximately. 
+                    # Usually baseline is at 12-13px for 16px font.
+                    # face.size.ascender might help but let's stick to bitmap center for simplicity first.
+                    
+                    # Copy bitmap to buffer
+                    src_buf = bitmap.buffer
+                    
+                    for row in range(glyph_rows):
+                        # Target row
+                        dst_row = row + y_off
+                        if dst_row < 0 or dst_row >= FONT_SIZE:
+                            continue
+                            
+                        # Source row bytes
+                        # pitch is bytes per row
+                        src_start = row * glyph_pitch
+                        
+                        # We need to copy bits.
+                        # This is getting complicated because FreeType mono bitmap format 
+                        # might not match our target format exactly (MSB/LSB).
+                        # Let's iterate pixels.
+                        
+                        for col in range(glyph_width):
+                            dst_col = col + x_off
+                            if dst_col < 0 or dst_col >= FONT_SIZE:
+                                continue
+                                
+                            # Get pixel from src
+                            byte_idx = src_start + (col >> 3)
+                            bit_idx = 7 - (col & 7)
+                            pixel = (src_buf[byte_idx] >> bit_idx) & 1
+                            
+                            if pixel:
+                                # Set pixel in dst
+                                # format: row by row, 2 bytes per row.
+                                # row 0: byte 0, byte 1
+                                # byte 0: bits 0-7 (left to right) -> wait, usually MSB is left.
+                                dst_byte_idx = dst_row * 2 + (dst_col >> 3)
+                                dst_bit_idx = 7 - (dst_col & 7)
+                                char_buffer[dst_byte_idx] |= (1 << dst_bit_idx)
+                    
+                    f.write(char_buffer)
+                    count += 1
+                    
+                except Exception:
+                    # Character not found or decode error
+                    f.write(empty_char)
+                
+                # Progress
+                if count % 1000 == 0:
+                    print(f"Processed {count} characters...")
+
+    print(f"Done! Generated {OUTPUT_FILE} with size {os.path.getsize(OUTPUT_FILE)} bytes.")
+
+if __name__ == "__main__":
+    create_gb2312_font()
--- a/websocket_server/received_audio.mp3
+++ b/websocket_server/received_audio.mp3
--- a/websocket_server/received_audio.raw
+++ b/websocket_server/received_audio.raw
--- a/websocket_server/requirements.txt
+++ b/websocket_server/requirements.txt
@@ -0,0 +1,6 @@
+fastapi
+uvicorn
+websockets
+pydub
+dashscope
+python-dotenv
--- a/websocket_server/server.py
+++ b/websocket_server/server.py
@@ -0,0 +1,277 @@
+from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+import uvicorn
+import asyncio
+import os
+import subprocess
+import struct
+from dotenv import load_dotenv
+import dashscope
+from dashscope.audio.asr import Recognition, RecognitionCallback, RecognitionResult
+import json
+
+# Load environment variables
+load_dotenv()
+dashscope.api_key = os.getenv("DASHSCOPE_API_KEY")
+
+app = FastAPI()
+
+# Stores the audio data received from the client
+audio_buffer = bytearray()
+RECORDING_RAW_FILE = "received_audio.raw"
+RECORDING_MP3_FILE = "received_audio.mp3"
+VOLUME_GAIN = 10.0 # Amplification factor
+FONT_FILE = "GB2312-16.bin"
+
+class MyRecognitionCallback(RecognitionCallback):
+    def __init__(self, websocket: WebSocket, loop: asyncio.AbstractEventLoop):
+        self.websocket = websocket
+        self.loop = loop
+
+    def on_open(self) -> None:
+        print("ASR Session started")
+
+    def on_close(self) -> None:
+        print("ASR Session closed")
+
+    def on_event(self, result: RecognitionResult) -> None:
+        if result.get_sentence():
+             text = result.get_sentence()['text']
+             print(f"ASR Result: {text}")
+             # 将识别结果发送回客户端
+             try:
+                 asyncio.run_coroutine_threadsafe(
+                     self.websocket.send_text(f"ASR:{text}"), 
+                     self.loop
+                 )
+             except Exception as e:
+                 print(f"Failed to send ASR result to client: {e}")
+
+def process_chunk_32_to_16(chunk_bytes, gain=1.0):
+    processed_chunk = bytearray()
+    # Iterate 4 bytes at a time
+    for i in range(0, len(chunk_bytes), 4):
+        if i+3 < len(chunk_bytes):
+             # 取 chunk[i+2] 和 chunk[i+3] 组成 16-bit signed int
+             sample = struct.unpack_from('<h', chunk_bytes, i+2)[0]
+             
+             # 放大音量
+             sample = int(sample * gain)
+             
+             # 限幅 (Clamping) 防止溢出爆音
+             if sample > 32767: sample = 32767
+             elif sample < -32768: sample = -32768
+             
+             # 重新打包为 16-bit little-endian
+             processed_chunk.extend(struct.pack('<h', sample))
+    return processed_chunk
+
+@app.websocket("/ws/audio")
+async def websocket_endpoint(websocket: WebSocket):
+    global audio_buffer
+    await websocket.accept()
+    print("Client connected")
+    
+    recognition = None
+    processed_buffer = bytearray()
+    loop = asyncio.get_running_loop()
+    
+    try:
+        while True:
+            # 接收消息 (可能是文本指令或二进制音频数据)
+            try:
+                message = await websocket.receive()
+            except RuntimeError as e:
+                if "Cannot call \"receive\" once a disconnect message has been received" in str(e):
+                    print("Client disconnected (RuntimeError caught)")
+                    break
+                raise e
+            
+            if "text" in message:
+                text = message["text"]
+                print(f"Received text: {text}")
+                
+                if text == "START_RECORDING":
+                    print("Start recording...")
+                    audio_buffer = bytearray() # 清空缓冲区
+                    processed_buffer = bytearray()
+                    
+                    # 启动实时语音识别
+                    try:
+                        callback = MyRecognitionCallback(websocket, loop)
+                        recognition = Recognition(
+                            model='paraformer-realtime-v2',
+                            format='pcm',
+                            sample_rate=16000,
+                            callback=callback
+                        )
+                        recognition.start()
+                        print("DashScope ASR started")
+                    except Exception as e:
+                        print(f"Failed to start ASR: {e}")
+                        recognition = None
+                    
+                elif text == "STOP_RECORDING":
+                    print(f"Stop recording. Total raw bytes: {len(audio_buffer)}")
+                    
+                    # 停止语音识别
+                    if recognition:
+                        try:
+                            recognition.stop()
+                            print("DashScope ASR stopped")
+                        except Exception as e:
+                            print(f"Error stopping ASR: {e}")
+                        recognition = None
+                    
+                    # 使用实时处理过的音频数据
+                    processed_audio = processed_buffer
+                    
+                    print(f"Processed audio size: {len(processed_audio)} bytes (Gain: {VOLUME_GAIN}x)")
+                    
+                    # 2. 保存原始 RAW 文件 (16-bit PCM)
+                    with open(RECORDING_RAW_FILE, "wb") as f:
+                        f.write(processed_audio)
+                        
+                    # 3. 转换为 MP3 并保存 (使用 ffmpeg 命令行，避免 Python 3.13 audioop 问题)
+                    try:
+                        # ffmpeg -y -f s16le -ar 16000 -ac 1 -i received_audio.raw received_audio.mp3
+                        cmd = [
+                            "ffmpeg",
+                            "-y", # 覆盖输出文件
+                            "-f", "s16le", # 输入格式: signed 16-bit little endian
+                            "-ar", "16000", # 输入采样率
+                            "-ac", "1", # 输入声道数
+                            "-i", RECORDING_RAW_FILE,
+                            RECORDING_MP3_FILE
+                        ]
+                        print(f"Running command: {' '.join(cmd)}")
+                        
+                        # Use asyncio.create_subprocess_exec instead of subprocess.run to avoid blocking the event loop
+                        process = await asyncio.create_subprocess_exec(
+                            *cmd,
+                            stdout=asyncio.subprocess.PIPE,
+                            stderr=asyncio.subprocess.PIPE
+                        )
+                        stdout, stderr = await process.communicate()
+                        
+                        if process.returncode != 0:
+                            raise subprocess.CalledProcessError(process.returncode, cmd, output=stdout, stderr=stderr)
+                            
+                        print(f"Saved MP3 to {RECORDING_MP3_FILE}")
+                    except subprocess.CalledProcessError as e:
+                        print(f"Error converting to MP3: {e}")
+                        # stderr might be bytes
+                        error_msg = e.stderr.decode() if isinstance(e.stderr, bytes) else str(e.stderr)
+                        print(f"FFmpeg stderr: {error_msg}")
+                    except FileNotFoundError:
+                        print("Error: ffmpeg not found. Please install ffmpeg.")
+                    except Exception as e:
+                         print(f"Error converting to MP3: {e}")
+
+                    # 4. 发送回客户端播放
+                    print("Sending audio back...")
+                    await websocket.send_text("START_PLAYBACK")
+                    
+                    # 分块发送
+                    chunk_size = 4096 
+                    for i in range(0, len(processed_audio), chunk_size):
+                        chunk = processed_audio[i:i+chunk_size]
+                        await websocket.send_bytes(chunk)
+                        # 小延时，避免发送过快导致 ESP32 缓冲区溢出
+                        # 4096 bytes / 32000 bytes/s (16k*2) = ~0.128s
+                        # 0.04s 约为 3 倍速发送，既保证缓冲又不至于拥塞
+                        await asyncio.sleep(0.04) 
+                        
+                    await websocket.send_text("STOP_PLAYBACK")
+                    print("Audio sent back finished.")
+                    
+                elif text.startswith("GET_FONT:"):
+                    # 格式: GET_FONT:0xA1A1
+                    try:
+                        print(f"Font Request Received: {text}")
+                        hex_code = text.split(":")[1]
+                        code = int(hex_code, 16)
+                        
+                        # 计算偏移量
+                        # GB2312 编码范围：0xA1A1 - 0xFEFE
+                        # 区码：高字节 - 0xA0
+                        # 位码：低字节 - 0xA0
+                        area = (code >> 8) - 0xA0
+                        index = (code & 0xFF) - 0xA0
+                        
+                        if area >= 1 and index >= 1:
+                            offset = ((area - 1) * 94 + (index - 1)) * 32
+                            
+                            # 读取字体文件
+                            # 注意：这里为了简单，每次都打开文件。如果并发高，应该缓存文件句柄或内容。
+                            # 假设字体文件在当前目录或上级目录
+                            # Prioritize finding the file in the script's directory
+                            script_dir = os.path.dirname(os.path.abspath(__file__))
+                            font_path = os.path.join(script_dir, FONT_FILE)
+                            
+                            # Fallback: check one level up
+                            if not os.path.exists(font_path):
+                                font_path = os.path.join(script_dir, "..", FONT_FILE)
+                                
+                            # Fallback: check current working directory
+                            if not os.path.exists(font_path):
+                                font_path = FONT_FILE
+                            
+                            if os.path.exists(font_path):
+                                print(f"Reading font from: {font_path} (Offset: {offset})")
+                                with open(font_path, "rb") as f:
+                                    f.seek(offset)
+                                    font_data = f.read(32)
+                                    
+                                    if len(font_data) == 32:
+                                        import binascii
+                                        hex_data = binascii.hexlify(font_data).decode('utf-8')
+                                        response = f"FONT_DATA:{hex_code}:{hex_data}"
+                                        print(f"Sending Font Response: {response[:30]}...")
+                                        await websocket.send_text(response)
+                                    else:
+                                        print(f"Error: Read {len(font_data)} bytes for font data (expected 32)")
+                            else:
+                                print(f"Font file not found: {font_path}")
+                        else:
+                             print(f"Invalid GB2312 code: {hex_code} (Area: {area}, Index: {index})")
+                    except Exception as e:
+                        print(f"Error handling GET_FONT: {e}")
+            
+            elif "bytes" in message:
+                # 接收音频数据并追加到缓冲区
+                data = message["bytes"]
+                audio_buffer.extend(data)
+                
+                # 实时处理并发送给 ASR
+                pcm_chunk = process_chunk_32_to_16(data, VOLUME_GAIN)
+                processed_buffer.extend(pcm_chunk)
+                
+                if recognition:
+                    try:
+                        recognition.send_audio_frame(pcm_chunk)
+                    except Exception as e:
+                        print(f"Error sending audio frame to ASR: {e}")
+                
+    except WebSocketDisconnect:
+        print("Client disconnected")
+        if recognition:
+            try:
+                recognition.stop()
+            except:
+                pass
+    except Exception as e:
+        print(f"Error: {e}")
+        if recognition:
+            try:
+                recognition.stop()
+            except:
+                pass
+
+if __name__ == "__main__":
+    # 获取本机IP，方便ESP32连接
+    import socket
+    hostname = socket.gethostname()
+    local_ip = socket.gethostbyname(hostname)
+    print(f"Server running on ws://{local_ip}:8000/ws/audio")
+    
+    uvicorn.run(app, host="0.0.0.0", port=8000)
--- a/websocket_server/test_font.py
+++ b/websocket_server/test_font.py
@@ -0,0 +1,55 @@
+import os
+
+FONT_FILE = "/Users/jeremygan/Desktop/python_dev/epaper2/websocket_server/GB2312-16.bin"
+
+def test_font():
+    if not os.path.exists(FONT_FILE):
+        print(f"Error: File not found at {FONT_FILE}")
+        return
+
+    file_size = os.path.getsize(FONT_FILE)
+    print(f"Font file size: {file_size} bytes")
+    
+    # Expected size for GB2312-16 (94x94 chars * 32 bytes)
+    expected_size = 94 * 94 * 32
+    print(f"Expected size: {expected_size} bytes")
+    
+    if file_size != expected_size:
+        print(f"Warning: File size mismatch! (Diff: {file_size - expected_size})")
+
+    # Try to render '中' (0xD6D0)
+    # Area: 0xD6 - 0xA0 = 54
+    # Index: 0xD0 - 0xA0 = 48
+    area = 0xD6 - 0xA0
+    index = 0xD0 - 0xA0
+    offset = ((area - 1) * 94 + (index - 1)) * 32
+    
+    print(f"Testing character '中' (0xD6D0)")
+    print(f"Area: {area}, Index: {index}, Offset: {offset}")
+    
+    with open(FONT_FILE, "rb") as f:
+        f.seek(offset)
+        data = f.read(32)
+        
+        if len(data) != 32:
+            print("Error: Could not read 32 bytes")
+            return
+            
+        print("Bitmap data:")
+        for i in range(16):
+            # Each row is 2 bytes (16 bits)
+            byte1 = data[i*2]
+            byte2 = data[i*2+1]
+            
+            # Print as bits
+            line = ""
+            for b in range(8):
+                if (byte1 >> (7-b)) & 1: line += "##"
+                else: line += ".."
+            for b in range(8):
+                if (byte2 >> (7-b)) & 1: line += "##"
+                else: line += ".."
+            print(line)
+
+if __name__ == "__main__":
+    test_font()
				`@@ -0,0 +1 @@`
				`DASHSCOPE_API_KEY=sk-a294f382488d46a1aa0d7cd8e750729b`