support async

This commit is contained in:
2025-12-29 21:59:11 +08:00
parent 53ecbebb0a
commit ab5dda1f21
8 changed files with 429 additions and 26 deletions

View File

@@ -128,7 +128,7 @@ async def application_responses(
user_msg = last.get("content") if isinstance(last, dict) else str(last)
# Invoke pipeline (non-stream) then stream-chunk it to the client
result_text = pipeline.chat(inp=user_msg, as_stream=False, thread_id=thread_id)
result_text = await pipeline.achat(inp=user_msg, as_stream=False, thread_id=thread_id)
if not isinstance(result_text, str):
result_text = str(result_text)
@@ -206,7 +206,7 @@ async def application_completion(
last = messages[-1]
user_msg = last.get("content") if isinstance(last, dict) else str(last)
result_text = pipeline.chat(inp=user_msg, as_stream=False, thread_id=thread_id)
result_text = await pipeline.achat(inp=user_msg, as_stream=False, thread_id=thread_id)
if not isinstance(result_text, str):
result_text = str(result_text)

View File

@@ -89,6 +89,46 @@ def sse_chunks_from_stream(chunk_generator, response_id: str, model: str = "qwen
yield f"data: {json.dumps(final)}\n\n"
async def sse_chunks_from_astream(chunk_generator, response_id: str, model: str = "qwen-flash"):
    """
    Relay an async chunk stream as DashScope-style SSE events.

    Each event carries the full accumulated text so far — the DashScope SDK
    expects cumulative text per chunk rather than deltas — and a terminal
    event with ``is_end: True`` repeats the complete text.
    """
    started_at = int(time.time())

    def _envelope(text: str, done: bool) -> str:
        # Build one SSE "data:" line in the DashScope response shape.
        payload = {
            "request_id": response_id,
            "code": 200,
            "message": "OK",
            "output": {
                "text": text,
                "created": started_at,
                "model": model,
            },
            "is_end": done,
        }
        return f"data: {json.dumps(payload)}\n\n"

    buffer = ""
    async for piece in chunk_generator:
        if not piece:
            continue  # skip empty/None chunks, same as the sync variant
        buffer += piece
        yield _envelope(buffer, False)
    # Terminal event: complete accumulated text with the end-of-stream flag set.
    yield _envelope(buffer, True)
@app.post("/v1/apps/{app_id}/sessions/{session_id}/responses")
@app.post("/api/v1/apps/{app_id}/sessions/{session_id}/responses")
async def application_responses(
@@ -137,15 +177,15 @@ async def application_responses(
response_id = f"appcmpl-{os.urandom(12).hex()}"
if stream:
# Use actual streaming from pipeline
chunk_generator = pipeline.chat(inp=user_msg, as_stream=True, thread_id=thread_id)
# Use async streaming from pipeline
chunk_generator = await pipeline.achat(inp=user_msg, as_stream=True, thread_id=thread_id)
return StreamingResponse(
sse_chunks_from_stream(chunk_generator, response_id=response_id, model=pipeline_config.llm_name),
sse_chunks_from_astream(chunk_generator, response_id=response_id, model=pipeline_config.llm_name),
media_type="text/event-stream",
)
# Non-streaming: get full result
result_text = pipeline.chat(inp=user_msg, as_stream=False, thread_id=thread_id)
# Non-streaming: get full result using async
result_text = await pipeline.achat(inp=user_msg, as_stream=False, thread_id=thread_id)
if not isinstance(result_text, str):
result_text = str(result_text)
@@ -217,15 +257,15 @@ async def application_completion(
response_id = f"appcmpl-{os.urandom(12).hex()}"
if stream:
# Use actual streaming from pipeline
chunk_generator = pipeline.chat(inp=user_msg, as_stream=True, thread_id=thread_id)
# Use async streaming from pipeline
chunk_generator = await pipeline.achat(inp=user_msg, as_stream=True, thread_id=thread_id)
return StreamingResponse(
sse_chunks_from_stream(chunk_generator, response_id=response_id, model=pipeline_config.llm_name),
sse_chunks_from_astream(chunk_generator, response_id=response_id, model=pipeline_config.llm_name),
media_type="text/event-stream",
)
# Non-streaming: get full result
result_text = pipeline.chat(inp=user_msg, as_stream=False, thread_id=thread_id)
# Non-streaming: get full result using async
result_text = await pipeline.achat(inp=user_msg, as_stream=False, thread_id=thread_id)
if not isinstance(result_text, str):
result_text = str(result_text)

View File

@@ -91,6 +91,47 @@ def sse_chunks_from_stream(chunk_generator, response_id: str, model: str, create
yield "data: [DONE]\n\n"
async def sse_chunks_from_astream(chunk_generator, response_id: str, model: str, created_time: int):
    """
    Relay an async chunk stream as OpenAI-compatible chat-completion SSE.

    Emits one ``chat.completion.chunk`` event per non-empty chunk (delta
    content), then a terminal event with ``finish_reason: "stop"`` followed
    by the ``data: [DONE]`` sentinel.
    """
    def _event(delta: dict, finish_reason) -> str:
        # Build one SSE "data:" line in the OpenAI streaming-chunk shape.
        payload = {
            "id": response_id,
            "object": "chat.completion.chunk",
            "created": created_time,
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "delta": delta,
                    "finish_reason": finish_reason,
                }
            ],
        }
        return f"data: {json.dumps(payload)}\n\n"

    async for piece in chunk_generator:
        if piece:
            yield _event({"content": piece}, None)
    # Signal completion, then send the SSE termination sentinel.
    yield _event({}, "stop")
    yield "data: [DONE]\n\n"
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
try:
@@ -121,15 +162,15 @@ async def chat_completions(request: Request):
created_time = int(time.time())
if stream:
# Use actual streaming from pipeline
chunk_generator = pipeline.chat(inp=user_msg, as_stream=True, thread_id=thread_id)
# Use async streaming from pipeline
chunk_generator = await pipeline.achat(inp=user_msg, as_stream=True, thread_id=thread_id)
return StreamingResponse(
sse_chunks_from_stream(chunk_generator, response_id=response_id, model=model, created_time=created_time),
sse_chunks_from_astream(chunk_generator, response_id=response_id, model=model, created_time=created_time),
media_type="text/event-stream",
)
# Non-streaming: get full result
result_text = pipeline.chat(inp=user_msg, as_stream=False, thread_id=thread_id)
# Non-streaming: get full result using async
result_text = await pipeline.achat(inp=user_msg, as_stream=False, thread_id=thread_id)
if not isinstance(result_text, str):
result_text = str(result_text)

View File

@@ -113,13 +113,13 @@ def main():
print(f"\nUsing base_url = {BASE_URL}\n")
# Test both streaming and non-streaming
# streaming_result = test_streaming()
streaming_result = test_streaming()
non_streaming_result = test_non_streaming()
print("\n" + "="*60)
print("SUMMARY")
print("="*60)
# print(f"Streaming response length: {len(streaming_result)}")
print(f"Streaming response length: {len(streaming_result)}")
print(f"Non-streaming response length: {len(non_streaming_result)}")
print("\nBoth tests completed successfully!")