t
This commit is contained in:
1
websocket_server/.env
Normal file
1
websocket_server/.env
Normal file
@@ -0,0 +1 @@
|
||||
DASHSCOPE_API_KEY=sk-a294f382488d46a1aa0d7cd8e750729b
|
||||
BIN
websocket_server/GB2312-16.bin
Normal file
BIN
websocket_server/GB2312-16.bin
Normal file
Binary file not shown.
31
websocket_server/README.md
Normal file
31
websocket_server/README.md
Normal file
@@ -0,0 +1,31 @@
|
||||
# WebSocket Audio Server
|
||||
|
||||
This is a FastAPI server that receives audio from an ESP32 via WebSocket, saves it, processes it (converts 32-bit to 16-bit), and sends it back for playback.
|
||||
|
||||
## Installation
|
||||
|
||||
1. Install dependencies:
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
1. Start the server:
|
||||
```bash
|
||||
python server.py
|
||||
```
|
||||
Or:
|
||||
```bash
|
||||
uvicorn server:app --host 0.0.0.0 --port 8000
|
||||
```
|
||||
|
||||
2. Update the IP address in `main.py` on your ESP32 to match your computer's IP address.
|
||||
Look for `SERVER_IP` variable.
|
||||
|
||||
## Features
|
||||
|
||||
- Receives raw audio stream from ESP32.
|
||||
- Saves the processed 16-bit PCM audio to `received_audio.raw`, and an MP3 copy to `received_audio.mp3` when `ffmpeg` is installed.
|
||||
- Converts 32-bit audio (from ICS-43434) to 16-bit audio (for MAX98357A).
|
||||
- Streams processed audio back to ESP32 for playback.
|
||||
BIN
websocket_server/__pycache__/server.cpython-312.pyc
Normal file
BIN
websocket_server/__pycache__/server.cpython-312.pyc
Normal file
Binary file not shown.
BIN
websocket_server/__pycache__/server.cpython-313.pyc
Normal file
BIN
websocket_server/__pycache__/server.cpython-313.pyc
Normal file
Binary file not shown.
127
websocket_server/generate_font.py
Normal file
127
websocket_server/generate_font.py
Normal file
@@ -0,0 +1,127 @@
|
||||
import struct
|
||||
import freetype
|
||||
import os
|
||||
|
||||
# Font file and output file, resolved relative to this script so the generator
# works from any checkout location instead of one developer's home directory.
# The TTF is expected next to this script; the generated binary is written one
# directory above it (the same layout the previous absolute paths described).
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
FONT_FILE = os.path.join(_SCRIPT_DIR, "GB2312.ttf")
OUTPUT_FILE = os.path.join(os.path.dirname(_SCRIPT_DIR), "GB2312-16.bin")

# Font size (16x16)
FONT_SIZE = 16
|
||||
|
||||
def _pack_glyph(bitmap, size):
    """Return ``bitmap`` centred in a ``size`` x ``size`` 1-bit-per-pixel cell.

    ``bitmap`` is a FreeType mono bitmap: 8 pixels per byte, most significant
    bit is the leftmost pixel, ``pitch`` bytes per source row. The returned
    ``bytearray`` uses the same bit order, stored row by row with
    ``ceil(size / 8)`` bytes per row. Pixels falling outside the cell are
    clipped.
    """
    row_bytes = (size + 7) // 8
    cell = bytearray(row_bytes * size)

    # Simple centring on the bitmap's bounding box (no baseline alignment).
    x_off = (size - bitmap.width) // 2
    y_off = (size - bitmap.rows) // 2

    src = bitmap.buffer
    pitch = bitmap.pitch
    for row in range(bitmap.rows):
        dst_row = row + y_off
        if not 0 <= dst_row < size:
            continue
        src_start = row * pitch
        for col in range(bitmap.width):
            dst_col = col + x_off
            if not 0 <= dst_col < size:
                continue
            if (src[src_start + (col >> 3)] >> (7 - (col & 7))) & 1:
                cell[dst_row * row_bytes + (dst_col >> 3)] |= 1 << (7 - (dst_col & 7))
    return cell


def create_gb2312_font(font_file=None, output_file=None, font_size=None):
    """Render every GB2312 code point into a fixed-cell bitmap font file.

    The output holds 94 x 94 cells in area-major order (first byte 0xA1-0xFE,
    then second byte 0xA1-0xFE). Each cell is ``font_size`` rows of
    ``ceil(font_size / 8)`` bytes (32 bytes for the default 16x16), so the
    glyph for area ``a`` / index ``i`` (both 1-based) starts at
    ``((a - 1) * 94 + (i - 1)) * cell_size``.

    Codes that are not valid GB2312, that the font has no glyph for, or that
    fail to render are written as all-zero cells so offsets stay fixed.

    Args:
        font_file: Path of the TrueType font; defaults to ``FONT_FILE``.
        output_file: Path of the binary to write; defaults to ``OUTPUT_FILE``.
        font_size: Cell width/height in pixels; defaults to ``FONT_SIZE``.

    Returns:
        None. Prints an error and returns early if the font cannot be loaded.
    """
    font_file = FONT_FILE if font_file is None else font_file
    output_file = OUTPUT_FILE if output_file is None else output_file
    font_size = FONT_SIZE if font_size is None else font_size

    # Load the face
    try:
        face = freetype.Face(font_file)
    except Exception as e:
        print(f"Error loading font: {e}")
        return

    face.set_pixel_sizes(font_size, font_size)

    print(f"Generating GB2312 font file: {output_file}")

    # FT_LOAD_TARGET_MONO makes FreeType emit a packed 1-bpp bitmap.
    load_flags = freetype.FT_LOAD_RENDER | freetype.FT_LOAD_TARGET_MONO
    empty_char = bytes(((font_size + 7) // 8) * font_size)

    processed = 0  # every code visited, rendered or not (drives progress)
    rendered = 0   # codes that produced a real glyph

    with open(output_file, 'wb') as f:
        for area in range(0xA1, 0xFF):
            for index in range(0xA1, 0xFF):
                cell = empty_char
                try:
                    char = bytes([area, index]).decode('gb2312')
                    # Glyph index 0 is FreeType's "missing glyph"; load_char
                    # would render a placeholder box for it instead of
                    # failing, so skip such characters explicitly.
                    if face.get_char_index(ord(char)):
                        face.load_char(char, load_flags)
                        cell = _pack_glyph(face.glyph.bitmap, font_size)
                        rendered += 1
                except Exception:
                    # Unassigned GB2312 code or a glyph that failed to
                    # render: keep the slot so later offsets stay valid.
                    cell = empty_char

                f.write(cell)
                processed += 1

                # Progress
                if processed % 1000 == 0:
                    print(f"Processed {processed} characters...")

    print(f"Rendered {rendered} of {processed} code points.")
    print(f"Done! Generated {output_file} with size {os.path.getsize(output_file)} bytes.")
|
||||
|
||||
# Generate the font file only when this module is run as a script.
if __name__ == "__main__":
    create_gb2312_font()
|
||||
BIN
websocket_server/received_audio.mp3
Normal file
BIN
websocket_server/received_audio.mp3
Normal file
Binary file not shown.
BIN
websocket_server/received_audio.raw
Normal file
BIN
websocket_server/received_audio.raw
Normal file
Binary file not shown.
6
websocket_server/requirements.txt
Normal file
6
websocket_server/requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
fastapi
|
||||
uvicorn
|
||||
websockets
|
||||
pydub
|
||||
dashscope
|
||||
python-dotenv
|
||||
277
websocket_server/server.py
Normal file
277
websocket_server/server.py
Normal file
@@ -0,0 +1,277 @@
|
||||
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
||||
import uvicorn
|
||||
import asyncio
|
||||
import os
|
||||
import subprocess
|
||||
import struct
|
||||
from dotenv import load_dotenv
|
||||
import dashscope
|
||||
from dashscope.audio.asr import Recognition, RecognitionCallback, RecognitionResult
|
||||
import json
|
||||
|
||||
# Load environment variables, then pass the DashScope API key to the SDK.
load_dotenv()
dashscope.api_key = os.getenv("DASHSCOPE_API_KEY")

app = FastAPI()

# Buffer holding the audio bytes received from the client
audio_buffer = bytearray()
RECORDING_RAW_FILE = "received_audio.raw"
RECORDING_MP3_FILE = "received_audio.mp3"
VOLUME_GAIN = 10.0 # amplification factor
FONT_FILE = "GB2312-16.bin"
|
||||
|
||||
class MyRecognitionCallback(RecognitionCallback):
    """ASR callback that forwards recognized text to a WebSocket client.

    Results are sent with ``asyncio.run_coroutine_threadsafe`` on the event
    loop passed to the constructor, so the hooks may be invoked from a thread
    other than the one running that loop.
    """

    def __init__(self, websocket: WebSocket, loop: asyncio.AbstractEventLoop):
        """Remember the client socket and the event loop that owns it."""
        self.websocket = websocket
        self.loop = loop

    def on_open(self) -> None:
        """Log that the recognition session has started."""
        print("ASR Session started")

    def on_close(self) -> None:
        """Log that the recognition session has closed."""
        print("ASR Session closed")

    def on_event(self, result: RecognitionResult) -> None:
        """Send the recognized sentence to the client as ``ASR:<text>``."""
        sentence = result.get_sentence()
        if not sentence:
            return

        text = sentence['text']
        print(f"ASR Result: {text}")

        # Send the recognition result back to the client.
        try:
            future = asyncio.run_coroutine_threadsafe(
                self.websocket.send_text(f"ASR:{text}"),
                self.loop,
            )
        except Exception as e:
            # Scheduling itself failed (for example the loop is closed).
            print(f"Failed to send ASR result to client: {e}")
            return

        # The send runs later on the event loop; without this callback an
        # exception raised by send_text would be stored in the future and
        # never reported.
        future.add_done_callback(self._report_send_failure)

    @staticmethod
    def _report_send_failure(future) -> None:
        """Log the exception, if any, raised by a scheduled send."""
        if future.cancelled():
            return
        error = future.exception()
        if error is not None:
            print(f"Failed to send ASR result to client: {error}")
|
||||
|
||||
def process_chunk_32_to_16(chunk_bytes, gain=1.0):
    """Reduce 4-byte audio frames to amplified 16-bit little-endian samples.

    For every complete 4-byte frame, bytes 2 and 3 are read as a signed
    little-endian 16-bit value, multiplied by ``gain``, truncated to an int
    and clamped to the int16 range so loud input saturates instead of
    wrapping. Trailing bytes that do not form a complete frame are ignored.

    Args:
        chunk_bytes: Bytes-like buffer of 4-byte frames.
        gain: Multiplier applied to each sample before clamping.

    Returns:
        A ``bytearray`` with two bytes per complete input frame.
    """
    pcm = bytearray()
    # Only whole frames are converted; offset 2 is where the kept half starts.
    whole_frames_end = len(chunk_bytes) - len(chunk_bytes) % 4
    for offset in range(2, whole_frames_end, 4):
        scaled = int(struct.unpack_from('<h', chunk_bytes, offset)[0] * gain)
        clamped = max(-32768, min(32767, scaled))
        pcm += struct.pack('<h', clamped)
    return pcm
|
||||
|
||||
def _start_recognition(websocket, loop):
    """Start a realtime DashScope ASR session; return it, or None on failure."""
    try:
        recognition = Recognition(
            model='paraformer-realtime-v2',
            format='pcm',
            sample_rate=16000,
            callback=MyRecognitionCallback(websocket, loop),
        )
        recognition.start()
    except Exception as e:
        print(f"Failed to start ASR: {e}")
        return None
    print("DashScope ASR started")
    return recognition


def _stop_recognition(recognition):
    """Stop an ASR session if one is active, logging instead of raising."""
    if not recognition:
        return
    try:
        recognition.stop()
        print("DashScope ASR stopped")
    except Exception as e:
        print(f"Error stopping ASR: {e}")


async def _convert_raw_to_mp3(raw_path, mp3_path):
    """Encode 16 kHz mono s16le PCM in ``raw_path`` to ``mp3_path`` via ffmpeg.

    Runs ffmpeg as an asyncio subprocess so the event loop is not blocked.
    Failures are logged, never raised: the MP3 is a convenience copy only.
    """
    cmd = [
        "ffmpeg",
        "-y",            # overwrite the output file
        "-f", "s16le",   # input format: signed 16-bit little endian
        "-ar", "16000",  # input sample rate
        "-ac", "1",      # input channel count
        "-i", raw_path,
        mp3_path,
    ]
    print(f"Running command: {' '.join(cmd)}")

    try:
        process = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        _, stderr = await process.communicate()
    except FileNotFoundError:
        print("Error: ffmpeg not found. Please install ffmpeg.")
        return
    except Exception as e:
        print(f"Error converting to MP3: {e}")
        return

    if process.returncode != 0:
        print(f"Error converting to MP3: ffmpeg exited with status {process.returncode}")
        # errors="replace": ffmpeg output is not guaranteed to be valid UTF-8.
        print(f"FFmpeg stderr: {stderr.decode(errors='replace')}")
        return

    print(f"Saved MP3 to {mp3_path}")


async def _stream_playback(websocket, pcm_audio, chunk_size=4096, delay=0.04):
    """Send ``pcm_audio`` to the client in paced chunks between playback markers."""
    print("Sending audio back...")
    await websocket.send_text("START_PLAYBACK")

    for start in range(0, len(pcm_audio), chunk_size):
        await websocket.send_bytes(pcm_audio[start:start + chunk_size])
        # Small delay so the ESP32 receive buffer is not overrun.
        # 4096 bytes / 32000 bytes/s (16 kHz * 2 bytes) = ~0.128 s of audio,
        # so 0.04 s per chunk is roughly 3x real time.
        await asyncio.sleep(delay)

    await websocket.send_text("STOP_PLAYBACK")
    print("Audio sent back finished.")


def _locate_font_file():
    """Return the first existing font path (script dir, its parent, cwd) or None."""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    candidates = (
        os.path.join(script_dir, FONT_FILE),
        os.path.join(script_dir, "..", FONT_FILE),
        FONT_FILE,
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return None


async def _handle_get_font(websocket, text):
    """Answer a ``GET_FONT:<hex code>`` request with the glyph's 32 bytes.

    The reply is ``FONT_DATA:<hex code>:<64 hex digits>``. Invalid codes, a
    missing font file and short reads are logged and produce no reply.
    """
    try:
        print(f"Font Request Received: {text}")
        hex_code = text.split(":")[1]
        code = int(hex_code, 16)

        # GB2312 codes run from 0xA1A1 to 0xFEFE.
        # Area = high byte - 0xA0, index = low byte - 0xA0, both in 1..94.
        area = (code >> 8) - 0xA0
        index = (code & 0xFF) - 0xA0

        # The upper bounds matter: a low byte of 0xFF would otherwise map to
        # the first glyph of the next area, and oversized codes would seek
        # past the end of the file.
        if not (1 <= area <= 94 and 1 <= index <= 94):
            print(f"Invalid GB2312 code: {hex_code} (Area: {area}, Index: {index})")
            return

        offset = ((area - 1) * 94 + (index - 1)) * 32

        # The file is opened per request for simplicity; cache its contents
        # if request volume ever makes this a bottleneck.
        font_path = _locate_font_file()
        if font_path is None:
            print(f"Font file not found: {FONT_FILE}")
            return

        print(f"Reading font from: {font_path} (Offset: {offset})")
        with open(font_path, "rb") as f:
            f.seek(offset)
            font_data = f.read(32)

        if len(font_data) != 32:
            print(f"Error: Read {len(font_data)} bytes for font data (expected 32)")
            return

        response = f"FONT_DATA:{hex_code}:{font_data.hex()}"
        print(f"Sending Font Response: {response[:30]}...")
        await websocket.send_text(response)
    except Exception as e:
        print(f"Error handling GET_FONT: {e}")


@app.websocket("/ws/audio")
async def websocket_endpoint(websocket: WebSocket):
    """Serve one ESP32 client over the ``/ws/audio`` WebSocket.

    Text messages are commands:
      * ``START_RECORDING`` - clear the buffers and start realtime ASR.
      * ``STOP_RECORDING``  - stop ASR, save the processed PCM (and an MP3
        copy), then stream the processed audio back for playback.
      * ``GET_FONT:<hex>``  - return the 16x16 glyph for a GB2312 code.
    Binary messages are audio data: each chunk is appended to the raw buffer,
    converted with ``process_chunk_32_to_16`` and forwarded to ASR if a
    session is active.

    Any ASR session still running when the connection ends is stopped.
    """
    global audio_buffer
    await websocket.accept()
    print("Client connected")

    recognition = None
    processed_buffer = bytearray()
    loop = asyncio.get_running_loop()

    try:
        while True:
            # A message is a text command, binary audio, or a disconnect.
            message = await websocket.receive()

            # receive() hands back the disconnect message once; calling it
            # again raises RuntimeError, so leave the loop here.
            if message.get("type") == "websocket.disconnect":
                print("Client disconnected")
                break

            # Either key may be absent or present-but-None, so test the
            # value rather than key membership.
            text = message.get("text")
            data = message.get("bytes")

            if text is not None:
                print(f"Received text: {text}")

                if text == "START_RECORDING":
                    print("Start recording...")
                    audio_buffer = bytearray()  # clear the buffer
                    processed_buffer = bytearray()

                    # A repeated START must not leak the previous session.
                    _stop_recognition(recognition)
                    # Start realtime speech recognition.
                    recognition = _start_recognition(websocket, loop)

                elif text == "STOP_RECORDING":
                    print(f"Stop recording. Total raw bytes: {len(audio_buffer)}")

                    # Stop speech recognition.
                    _stop_recognition(recognition)
                    recognition = None

                    # Use the audio that was converted as it arrived.
                    processed_audio = processed_buffer
                    print(f"Processed audio size: {len(processed_audio)} bytes (Gain: {VOLUME_GAIN}x)")

                    # Save the processed audio (16-bit PCM).
                    with open(RECORDING_RAW_FILE, "wb") as f:
                        f.write(processed_audio)

                    # Convert to MP3 with the ffmpeg command line (avoids the
                    # Python 3.13 audioop problem).
                    await _convert_raw_to_mp3(RECORDING_RAW_FILE, RECORDING_MP3_FILE)

                    # Send the audio back to the client for playback.
                    await _stream_playback(websocket, processed_audio)

                elif text.startswith("GET_FONT:"):
                    # Format: GET_FONT:0xA1A1
                    await _handle_get_font(websocket, text)

            elif data is not None:
                # Append the received audio to the raw buffer.
                audio_buffer.extend(data)

                # Convert immediately and feed the result to ASR.
                pcm_chunk = process_chunk_32_to_16(data, VOLUME_GAIN)
                processed_buffer.extend(pcm_chunk)

                if recognition:
                    try:
                        recognition.send_audio_frame(pcm_chunk)
                    except Exception as e:
                        print(f"Error sending audio frame to ASR: {e}")

    except WebSocketDisconnect:
        print("Client disconnected")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        # Single cleanup point for every exit path (break, disconnect, error).
        _stop_recognition(recognition)
|
||||
|
||||
if __name__ == "__main__":
    # Print this machine's address so it can be entered on the ESP32.
    import socket

    try:
        # Connecting a UDP socket sends no packets; it only makes the OS pick
        # the outbound interface, whose address is the one the ESP32 needs.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as probe:
            probe.connect(("8.8.8.8", 80))
            local_ip = probe.getsockname()[0]
    except OSError:
        # No route (offline): fall back to hostname resolution, which can
        # itself fail when the hostname has no DNS or hosts entry.
        try:
            local_ip = socket.gethostbyname(socket.gethostname())
        except OSError:
            local_ip = "127.0.0.1"
    print(f"Server running on ws://{local_ip}:8000/ws/audio")

    uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
55
websocket_server/test_font.py
Normal file
55
websocket_server/test_font.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import os
|
||||
|
||||
# Font binary expected next to this script (resolved relative to the script so
# the check works from any checkout location).
FONT_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "GB2312-16.bin")
|
||||
|
||||
def test_font():
    """Check the font file's size and print the glyph stored for '中'.

    Reports the actual versus expected size (94 x 94 glyphs of 32 bytes),
    then reads the 32-byte cell for GB2312 code 0xD6D0 and prints it as 16
    rows, two characters per pixel ("##" set, ".." clear). Returns early
    with a message if the file is missing or the cell cannot be read.
    """
    if not os.path.exists(FONT_FILE):
        print(f"Error: File not found at {FONT_FILE}")
        return

    actual_size = os.path.getsize(FONT_FILE)
    print(f"Font file size: {actual_size} bytes")

    # A complete GB2312-16 file holds 94 areas x 94 glyphs x 32 bytes.
    expected_size = 94 * 94 * 32
    print(f"Expected size: {expected_size} bytes")

    size_diff = actual_size - expected_size
    if size_diff != 0:
        print(f"Warning: File size mismatch! (Diff: {size_diff})")

    # '中' is 0xD6D0: area 0xD6 - 0xA0 = 54, index 0xD0 - 0xA0 = 48.
    area, index = 0xD6 - 0xA0, 0xD0 - 0xA0
    offset = ((area - 1) * 94 + (index - 1)) * 32

    print(f"Testing character '中' (0xD6D0)")
    print(f"Area: {area}, Index: {index}, Offset: {offset}")

    with open(FONT_FILE, "rb") as font:
        font.seek(offset)
        data = font.read(32)

    if len(data) != 32:
        print("Error: Could not read 32 bytes")
        return

    print("Bitmap data:")
    # Each row is two bytes; the most significant bit is the leftmost pixel.
    for row_start in range(0, 32, 2):
        bits = f"{data[row_start]:08b}{data[row_start + 1]:08b}"
        print("".join("##" if bit == "1" else ".." for bit in bits))
|
||||
|
||||
# Run the font check only when this module is executed as a script.
if __name__ == "__main__":
    test_font()
|
||||
Reference in New Issue
Block a user