support async

This commit is contained in:
2025-12-29 21:59:11 +08:00
parent 53ecbebb0a
commit ab5dda1f21
8 changed files with 429 additions and 26 deletions

View File

@@ -128,7 +128,7 @@ async def application_responses(
user_msg = last.get("content") if isinstance(last, dict) else str(last)
# Invoke pipeline (non-stream) then stream-chunk it to the client
result_text = pipeline.chat(inp=user_msg, as_stream=False, thread_id=thread_id)
result_text = await pipeline.achat(inp=user_msg, as_stream=False, thread_id=thread_id)
if not isinstance(result_text, str):
result_text = str(result_text)
@@ -206,7 +206,7 @@ async def application_completion(
last = messages[-1]
user_msg = last.get("content") if isinstance(last, dict) else str(last)
result_text = pipeline.chat(inp=user_msg, as_stream=False, thread_id=thread_id)
result_text = await pipeline.achat(inp=user_msg, as_stream=False, thread_id=thread_id)
if not isinstance(result_text, str):
result_text = str(result_text)

View File

@@ -89,6 +89,46 @@ def sse_chunks_from_stream(chunk_generator, response_id: str, model: str = "qwen
yield f"data: {json.dumps(final)}\n\n"
async def sse_chunks_from_astream(chunk_generator, response_id: str, model: str = "qwen-flash"):
    """
    Relay an async chunk stream as DashScope-style SSE events.

    Each event carries the full accumulated text so far — the DashScope SDK
    expects cumulative text per chunk rather than deltas — and a terminal
    event with ``is_end: True`` repeats the complete text.
    """
    started_at = int(time.time())

    def _envelope(text: str, done: bool) -> str:
        # Build one SSE "data:" line in the DashScope response shape.
        payload = {
            "request_id": response_id,
            "code": 200,
            "message": "OK",
            "output": {
                "text": text,
                "created": started_at,
                "model": model,
            },
            "is_end": done,
        }
        return f"data: {json.dumps(payload)}\n\n"

    buffer = ""
    async for piece in chunk_generator:
        if not piece:
            continue  # skip empty/None chunks, same as the sync variant
        buffer += piece
        yield _envelope(buffer, False)
    # Terminal event: complete accumulated text with the end-of-stream flag set.
    yield _envelope(buffer, True)
@app.post("/v1/apps/{app_id}/sessions/{session_id}/responses")
@app.post("/api/v1/apps/{app_id}/sessions/{session_id}/responses")
async def application_responses(
@@ -137,15 +177,15 @@ async def application_responses(
response_id = f"appcmpl-{os.urandom(12).hex()}"
if stream:
# Use actual streaming from pipeline
chunk_generator = pipeline.chat(inp=user_msg, as_stream=True, thread_id=thread_id)
# Use async streaming from pipeline
chunk_generator = await pipeline.achat(inp=user_msg, as_stream=True, thread_id=thread_id)
return StreamingResponse(
sse_chunks_from_stream(chunk_generator, response_id=response_id, model=pipeline_config.llm_name),
sse_chunks_from_astream(chunk_generator, response_id=response_id, model=pipeline_config.llm_name),
media_type="text/event-stream",
)
# Non-streaming: get full result
result_text = pipeline.chat(inp=user_msg, as_stream=False, thread_id=thread_id)
# Non-streaming: get full result using async
result_text = await pipeline.achat(inp=user_msg, as_stream=False, thread_id=thread_id)
if not isinstance(result_text, str):
result_text = str(result_text)
@@ -217,15 +257,15 @@ async def application_completion(
response_id = f"appcmpl-{os.urandom(12).hex()}"
if stream:
# Use actual streaming from pipeline
chunk_generator = pipeline.chat(inp=user_msg, as_stream=True, thread_id=thread_id)
# Use async streaming from pipeline
chunk_generator = await pipeline.achat(inp=user_msg, as_stream=True, thread_id=thread_id)
return StreamingResponse(
sse_chunks_from_stream(chunk_generator, response_id=response_id, model=pipeline_config.llm_name),
sse_chunks_from_astream(chunk_generator, response_id=response_id, model=pipeline_config.llm_name),
media_type="text/event-stream",
)
# Non-streaming: get full result
result_text = pipeline.chat(inp=user_msg, as_stream=False, thread_id=thread_id)
# Non-streaming: get full result using async
result_text = await pipeline.achat(inp=user_msg, as_stream=False, thread_id=thread_id)
if not isinstance(result_text, str):
result_text = str(result_text)

View File

@@ -91,6 +91,47 @@ def sse_chunks_from_stream(chunk_generator, response_id: str, model: str, create
yield "data: [DONE]\n\n"
async def sse_chunks_from_astream(chunk_generator, response_id: str, model: str, created_time: int):
    """
    Relay an async chunk stream as OpenAI-compatible chat-completion SSE.

    Emits one ``chat.completion.chunk`` event per non-empty chunk (delta
    content), then a terminal event with ``finish_reason: "stop"`` followed
    by the ``data: [DONE]`` sentinel.
    """
    def _event(delta: dict, finish_reason) -> str:
        # Build one SSE "data:" line in the OpenAI streaming-chunk shape.
        payload = {
            "id": response_id,
            "object": "chat.completion.chunk",
            "created": created_time,
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "delta": delta,
                    "finish_reason": finish_reason,
                }
            ],
        }
        return f"data: {json.dumps(payload)}\n\n"

    async for piece in chunk_generator:
        if piece:
            yield _event({"content": piece}, None)
    # Signal completion, then send the SSE termination sentinel.
    yield _event({}, "stop")
    yield "data: [DONE]\n\n"
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
try:
@@ -121,15 +162,15 @@ async def chat_completions(request: Request):
created_time = int(time.time())
if stream:
# Use actual streaming from pipeline
chunk_generator = pipeline.chat(inp=user_msg, as_stream=True, thread_id=thread_id)
# Use async streaming from pipeline
chunk_generator = await pipeline.achat(inp=user_msg, as_stream=True, thread_id=thread_id)
return StreamingResponse(
sse_chunks_from_stream(chunk_generator, response_id=response_id, model=model, created_time=created_time),
sse_chunks_from_astream(chunk_generator, response_id=response_id, model=model, created_time=created_time),
media_type="text/event-stream",
)
# Non-streaming: get full result
result_text = pipeline.chat(inp=user_msg, as_stream=False, thread_id=thread_id)
# Non-streaming: get full result using async
result_text = await pipeline.achat(inp=user_msg, as_stream=False, thread_id=thread_id)
if not isinstance(result_text, str):
result_text = str(result_text)

View File

@@ -113,13 +113,13 @@ def main():
print(f"\nUsing base_url = {BASE_URL}\n")
# Test both streaming and non-streaming
# streaming_result = test_streaming()
streaming_result = test_streaming()
non_streaming_result = test_non_streaming()
print("\n" + "="*60)
print("SUMMARY")
print("="*60)
# print(f"Streaming response length: {len(streaming_result)}")
print(f"Streaming response length: {len(streaming_result)}")
print(f"Non-streaming response length: {len(non_streaming_result)}")
print("\nBoth tests completed successfully!")