t

2026-03-05 21:43:39 +08:00
parent b79d45cf34
commit 64ff8ffbd4
4 changed files with 311 additions and 200 deletions
--- a/websocket_server/server.py
+++ b/websocket_server/server.py
@@ -11,18 +11,23 @@ import json
 from dotenv import load_dotenv
 import dashscope
 from dashscope.audio.asr import Recognition, RecognitionCallback, RecognitionResult
-from dashscope import ImageSynthesis
-from dashscope import Generation
+# from dashscope import ImageSynthesis
+# from dashscope import Generation

 import sys
 # import os
 # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import convert_img
+from image_generator import ImageGenerator

 # 加载环境变量
 load_dotenv()
 dashscope.api_key = os.getenv("DASHSCOPE_API_KEY")

+# Initialize image generator
+# provider="doubao" or "dashscope"
+image_generator = ImageGenerator(provider="doubao")
+
 app = FastAPI()

 # 字体文件配置
@@ -526,82 +531,11 @@ def process_chunk_32_to_16(chunk_bytes, gain=1.0):

 def optimize_prompt(asr_text, progress_callback=None):
    """使用大模型优化提示词"""
-    print(f"Optimizing prompt for: {asr_text}")
-    
-    if progress_callback:
-        progress_callback(0, "正在准备优化提示词...")
-    
-    system_prompt = """你是一个AI图像提示词优化专家。你的任务是将用户的语音识别结果转化为适合生成"黑白线稿"的提示词。
-关键要求：
-1. 风格必须是：简单的黑白线稿、简笔画、图标风格 (Line art, Sketch, Icon style)。
-2. 画面必须清晰、线条粗壮，适合低分辨率热敏打印机打印。
-3. 绝对不要有复杂的阴影、渐变、黑白线条描述。
-4. 背景必须是纯白 (White background)。
-5. 提示词内容请使用英文描述，因为绘图模型对英文理解更好，但在描述中强调 "black and white line art", "simple lines", "vector style"。
-6. 尺寸比例遵循宽48mm:高30mm (约 1.6:1)。
-7. 直接输出优化后的提示词，不要包含任何解释。
-如果用户要求输入文字，则用双引号把文字包裹起来，文字是中文"""
-
-    try:
-        if progress_callback:
-            progress_callback(10, "正在调用AI优化提示词...")
-            print(f"Calling AI with prompt: {system_prompt}\n\n用户语音识别结果：{asr_text}\n\n优化后的提示词：")
-        
-        response = Generation.call(
-            model='qwen-turbo',
-            prompt=f'{system_prompt}\n\n用户语音识别结果：{asr_text}\n\n优化后的提示词：',
-            max_tokens=200,
-            temperature=0.8
-        )
-        
-        if response.status_code == 200:
-            if hasattr(response, 'output') and response.output and \
-               hasattr(response.output, 'choices') and response.output.choices and \
-               len(response.output.choices) > 0:
-                
-                optimized = response.output.choices[0].message.content.strip()
-                print(f"Optimized prompt: {optimized}")
-                
-                if progress_callback:
-                    progress_callback(30, f"提示词优化完成: {optimized[:50]}...")
-                
-                return optimized
-            elif hasattr(response, 'output') and response.output and hasattr(response.output, 'text'):
-                # Handle case where API returns text directly instead of choices
-                optimized = response.output.text.strip()
-                print(f"Optimized prompt (direct text): {optimized}")
-                
-                if progress_callback:
-                    progress_callback(30, f"提示词优化完成: {optimized[:50]}...")
-                
-                return optimized
-            else:
-                print(f"Prompt optimization response format error: {response}")
-                if progress_callback:
-                    progress_callback(0, "提示词优化响应格式错误")
-                return asr_text
-        else:
-            print(f"Prompt optimization failed: {response.code} - {response.message}")
-            if progress_callback:
-                progress_callback(0, f"提示词优化失败: {response.message}")
-            return asr_text
-            
-    except Exception as e:
-        print(f"Error optimizing prompt: {e}")
-        if progress_callback:
-            progress_callback(0, f"提示词优化出错: {str(e)}")
-        return asr_text
+    return image_generator.optimize_prompt(asr_text, progress_callback)


 def generate_image(prompt, progress_callback=None, retry_count=0, max_retries=2):
-    """调用万相文生图API生成图片
-    
-    Args:
-        prompt: 图像生成提示词
-        progress_callback: 进度回调函数 (progress, message)
-        retry_count: 当前重试次数
-        max_retries: 最大重试次数
-    """
+    """Generate an image for the prompt via the AI image-generation API."""
    print(f"Generating image for prompt: {prompt}")
    
    if progress_callback:
@@ -614,139 +548,96 @@ def generate_image(prompt, progress_callback=None, retry_count=0, max_retries=2)
                progress_callback(0, "提示词为空")
            return None
            
-        response = ImageSynthesis.call(
-            model='wanx2.0-t2i-turbo',
-            prompt=prompt,
-            size='1280*720'
-        )
+        # Call the generator
+        image_url = image_generator.generate_image(prompt, progress_callback)
        
-        if response.status_code == 200:
-            if not response.output:
-                print("Error: response.output is None")
-                if progress_callback:
-                    progress_callback(0, "API响应无效")
-                return None
-                
-            task_status = response.output.get('task_status')
-            
-            if task_status == 'PENDING' or task_status == 'RUNNING':
-                print("Waiting for image generation to complete...")
-                if progress_callback:
-                    progress_callback(45, "AI正在生成图片中...")
-                
-                import time
-                task_id = response.output.get('task_id')
-                max_wait = 120
-                waited = 0
-                while waited < max_wait:
-                    time.sleep(2)
-                    waited += 2
-                    task_result = ImageSynthesis.fetch(task_id)
-                    if task_result.output.task_status == 'SUCCEEDED':
-                        response.output = task_result.output
-                        break
-                    elif task_result.output.task_status == 'FAILED':
-                        error_msg = task_result.output.message if hasattr(task_result.output, 'message') else 'Unknown error'
-                        print(f"Image generation failed: {error_msg}")
-                        if progress_callback:
-                            progress_callback(35, f"图片生成失败: {error_msg}")
-                        return None
-            
-            if response.output.get('task_status') == 'SUCCEEDED':
-                image_url = response.output['results'][0]['url']
-                print(f"Image generated, downloading from: {image_url}")
-                
-                if progress_callback:
-                    progress_callback(70, "正在下载生成的图片...")
-            
-            import urllib.request
-            urllib.request.urlretrieve(image_url, GENERATED_IMAGE_FILE)
-            print(f"Image saved to {GENERATED_IMAGE_FILE}")
-            
-            # 保存一份到 output_images 目录
-            output_path = get_output_path()
-            import shutil
-            shutil.copy(GENERATED_IMAGE_FILE, output_path)
-            print(f"Image also saved to {output_path}")
-            
-            if progress_callback:
-                progress_callback(80, "正在处理图片...")
-            
-            # 缩放图片并转换为RGB565格式
-            try:
-                from PIL import Image
-                img = Image.open(GENERATED_IMAGE_FILE)
-                
-                # 缩小到THUMB_SIZE x THUMB_SIZE
-                img = img.resize((THUMB_SIZE, THUMB_SIZE), Image.LANCZOS)
-                
-                # 转换为RGB565格式的原始数据
-                # 每个像素2字节 (R5 G6 B5)
-                rgb565_data = bytearray()
-                
-                for y in range(THUMB_SIZE):
-                    for x in range(THUMB_SIZE):
-                        r, g, b = img.getpixel((x, y))[:3]
-                        
-                        # 转换为RGB565
-                        r5 = (r >> 3) & 0x1F
-                        g6 = (g >> 2) & 0x3F
-                        b5 = (b >> 3) & 0x1F
-                        
-                        # Pack as Big Endian (>H) which is standard for SPI displays
-                        # RGB565: Red(5) Green(6) Blue(5)
-                        rgb565 = (r5 << 11) | (g6 << 5) | b5
-                        rgb565_data.extend(struct.pack('>H', rgb565))
-                
-                # 保存为.bin文件
-                with open(GENERATED_THUMB_FILE, 'wb') as f:
-                    f.write(rgb565_data)
-                
-                print(f"Thumbnail saved to {GENERATED_THUMB_FILE}, size: {len(rgb565_data)} bytes")
-                
-                if progress_callback:
-                    progress_callback(100, "图片生成完成!")
-                
-                return GENERATED_THUMB_FILE
-                
-            except ImportError:
-                print("PIL not available, sending original image")
-                if progress_callback:
-                    progress_callback(100, "图片生成完成!(原始格式)")
-                return GENERATED_IMAGE_FILE
-            except Exception as e:
-                print(f"Error processing image: {e}")
-                if progress_callback:
-                    progress_callback(80, f"图片处理出错: {str(e)}")
-                return GENERATED_IMAGE_FILE
-        else:
-            error_msg = f"{response.code} - {response.message}"
-            print(f"Image generation failed: {error_msg}")
-            
-            # 重试机制
+        if not image_url:
+            # Retry logic
            if retry_count < max_retries:
                print(f"Retrying... ({retry_count + 1}/{max_retries})")
                if progress_callback:
-                    progress_callback(35, f"图片生成失败，正在重试 ({retry_count + 1}/{max_retries})...")
+                    progress_callback(35, f"生成失败，正在重试 ({retry_count + 1}/{max_retries})...")
                return generate_image(prompt, progress_callback, retry_count + 1, max_retries)
            else:
-                if progress_callback:
-                    progress_callback(35, f"图片生成失败: {error_msg}")
                return None
+
+        # Download and process
+        print(f"Image generated, downloading from: {image_url}")
+        if progress_callback:
+            progress_callback(70, "正在下载生成的图片...")
+            
+        import urllib.request
+        try:
+            urllib.request.urlretrieve(image_url, GENERATED_IMAGE_FILE)
+            print(f"Image saved to {GENERATED_IMAGE_FILE}")
+        except Exception as e:
+            print(f"Download error: {e}")
+            if progress_callback:
+                progress_callback(35, f"下载失败: {e}")
+            return None
+        
+        # Save to output dir
+        output_path = get_output_path()
+        import shutil
+        shutil.copy(GENERATED_IMAGE_FILE, output_path)
+        print(f"Image also saved to {output_path}")
+        
+        if progress_callback:
+            progress_callback(80, "正在处理图片...")
+            
+        # Resize to THUMB_SIZE x THUMB_SIZE and convert to raw big-endian RGB565
+        try:
+            from PIL import Image
+            img = Image.open(GENERATED_IMAGE_FILE)
+            
+            # Resize to THUMB_SIZE x THUMB_SIZE
+            img = img.resize((THUMB_SIZE, THUMB_SIZE), Image.LANCZOS)
+            
+            # Convert to raw RGB565 data
+            # 2 bytes per pixel (R5 G6 B5)
+            rgb565_data = bytearray()
+            
+            for y in range(THUMB_SIZE):
+                for x in range(THUMB_SIZE):
+                    r, g, b = img.getpixel((x, y))[:3]
+                    
+                    # Convert to RGB565
+                    r5 = (r >> 3) & 0x1F
+                    g6 = (g >> 2) & 0x3F
+                    b5 = (b >> 3) & 0x1F
+                    
+                    # Pack as Big Endian (>H) which is standard for SPI displays
+                    # RGB565: Red(5) Green(6) Blue(5)
+                    rgb565 = (r5 << 11) | (g6 << 5) | b5
+                    rgb565_data.extend(struct.pack('>H', rgb565))
+            
+            # Save the raw data as a .bin file
+            with open(GENERATED_THUMB_FILE, 'wb') as f:
+                f.write(rgb565_data)
+            
+            print(f"Thumbnail saved to {GENERATED_THUMB_FILE}, size: {len(rgb565_data)} bytes")
+            
+            if progress_callback:
+                progress_callback(100, "图片生成完成!")
+            
+            return GENERATED_THUMB_FILE
+            
+        except ImportError:
+            print("PIL not available, sending original image")
+            if progress_callback:
+                progress_callback(100, "图片生成完成!(原始格式)")
+            return GENERATED_IMAGE_FILE
+        except Exception as e:
+            print(f"Error processing image: {e}")
+            if progress_callback:
+                progress_callback(80, f"图片处理出错: {str(e)}")
+            return GENERATED_IMAGE_FILE
            
    except Exception as e:
-        print(f"Error generating image: {e}")
-        
-        # 重试机制
+        print(f"Error in generate_image: {e}")
        if retry_count < max_retries:
-            print(f"Retrying after error... ({retry_count + 1}/{max_retries})")
-            if progress_callback:
-                progress_callback(35, f"生成出错，正在重试 ({retry_count + 1}/{max_retries})...")
-            return generate_image(prompt, progress_callback, retry_count + 1, max_retries)
-        else:
-            if progress_callback:
-                progress_callback(35, f"图片生成出错: {str(e)}")
-            return None
+             return generate_image(prompt, progress_callback, retry_count + 1, max_retries)
+        return None

@app.websocket("/ws/audio")
 async def websocket_endpoint(websocket: WebSocket):