openai api
This commit is contained in:
188
fastapi_server/server_openai.py
Normal file
188
fastapi_server/server_openai.py
Normal file
@@ -0,0 +1,188 @@
|
||||
from fastapi import FastAPI, HTTPException, Request
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import StreamingResponse, JSONResponse
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List, Optional, Union, Literal
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import uvicorn
|
||||
from loguru import logger
|
||||
import tyro
|
||||
|
||||
# Ensure we can import from project root
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from lang_agent.pipeline import Pipeline, PipelineConfig

# Initialize Pipeline once at module import time. The config is parsed from
# the command line via tyro, so importing this module consumes sys.argv —
# intentional for a standalone server entry point, but it means this module
# cannot be imported by other code without tyro seeing their argv.
pipeline_config = tyro.cli(PipelineConfig)
pipeline: Pipeline = pipeline_config.setup()
||||
class OpenAIMessage(BaseModel):
    """A single chat message in OpenAI wire format."""

    # Sender role, e.g. "system", "user", or "assistant".
    role: str
    # Message text content.
    content: str
|
||||
|
||||
|
||||
class OpenAIChatCompletionRequest(BaseModel):
    """Request schema mirroring OpenAI's /v1/chat/completions body.

    NOTE(review): the endpoint below parses the raw request JSON itself and
    does not use this model; it is kept as documentation of the accepted
    fields. Consider wiring it into the route signature for validation.
    """

    model: str = Field(default="gpt-3.5-turbo")
    messages: List[OpenAIMessage]
    # When True, the response is delivered as server-sent events.
    stream: bool = Field(default=False)
    # Accepted for OpenAI API compatibility; not forwarded to the pipeline.
    temperature: Optional[float] = Field(default=1.0)
    max_tokens: Optional[int] = Field(default=None)
    # Optional overrides for pipeline behavior
    thread_id: Optional[int] = Field(default=3)
|
||||
|
||||
|
||||
# FastAPI application exposing an OpenAI-compatible Chat Completions API
# backed by the project's Pipeline.
app = FastAPI(
    title="OpenAI-Compatible Chat API",
    description="OpenAI Chat Completions API compatible endpoint backed by pipeline.chat"
)

# Allow cross-origin requests from any origin so browser-based clients can
# call the API directly.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers for credentialed requests; pin concrete origins for
# production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
|
||||
def sse_chunks_from_stream(chunk_generator, response_id: str, model: str, created_time: int):
    """
    Stream chunks from pipeline and format as OpenAI SSE.

    Yields one ``chat.completion.chunk`` SSE event per non-empty chunk, then a
    terminal event with ``finish_reason="stop"``, then the ``[DONE]`` sentinel.
    """
    def event(delta: dict, finish_reason):
        # Package one delta in OpenAI's chat.completion.chunk wire format.
        payload = {
            "id": response_id,
            "object": "chat.completion.chunk",
            "created": created_time,
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "delta": delta,
                    "finish_reason": finish_reason,
                }
            ],
        }
        return f"data: {json.dumps(payload)}\n\n"

    for piece in chunk_generator:
        # Skip empty/falsy chunks so no empty content delta is emitted.
        if not piece:
            continue
        yield event({"content": piece}, None)

    # Terminal chunk, then the SSE end-of-stream sentinel.
    yield event({}, "stop")
    yield "data: [DONE]\n\n"
|
||||
|
||||
|
||||
def _extract_user_message(messages):
    """Return the most recent user-authored content from *messages*.

    Falls back to the last message's content (or its str() for non-dict
    entries) when no user message with content exists. Returns None when
    nothing usable is found.
    """
    for m in reversed(messages):
        if isinstance(m, dict) and m.get("role") == "user" and m.get("content"):
            return m["content"]
    last = messages[-1]
    if isinstance(last, dict):
        return last.get("content")
    return str(last)


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """OpenAI-compatible chat completions endpoint backed by pipeline.chat.

    Uses `messages`, `model`, `stream`, and the custom `thread_id` field from
    the request body; `temperature`/`max_tokens` are accepted but ignored.
    Returns either an SSE stream of chat.completion.chunk events or a single
    chat.completion JSON object.

    Raises:
        HTTPException: 400 for missing/malformed messages, 500 on pipeline errors.
    """
    try:
        body = await request.json()

        messages = body.get("messages")
        # Reject absent, empty, or non-list messages with a 400 up front
        # instead of letting a malformed payload surface as a 500 later.
        if not messages or not isinstance(messages, list):
            raise HTTPException(status_code=400, detail="messages is required")

        stream = body.get("stream", False)
        model = body.get("model", "gpt-3.5-turbo")
        thread_id = body.get("thread_id", 3)

        user_msg = _extract_user_message(messages)
        if not user_msg:
            # Never forward None/empty input to the pipeline.
            raise HTTPException(status_code=400, detail="no user message content found")

        response_id = f"chatcmpl-{os.urandom(12).hex()}"
        created_time = int(time.time())

        if stream:
            # Use actual streaming from the pipeline, re-packaged as OpenAI SSE.
            chunk_generator = pipeline.chat(inp=user_msg, as_stream=True, thread_id=thread_id)
            return StreamingResponse(
                sse_chunks_from_stream(chunk_generator, response_id=response_id, model=model, created_time=created_time),
                media_type="text/event-stream",
            )

        # Non-streaming: get the full result in one call.
        result_text = pipeline.chat(inp=user_msg, as_stream=False, thread_id=thread_id)
        if not isinstance(result_text, str):
            result_text = str(result_text)

        data = {
            "id": response_id,
            "object": "chat.completion",
            "created": created_time,
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": result_text,
                    },
                    "finish_reason": "stop",
                }
            ],
            # Token accounting is not provided by the pipeline; report zeros
            # to keep the response shape OpenAI-compatible.
            "usage": {
                "prompt_tokens": 0,
                "completion_tokens": 0,
                "total_tokens": 0,
            },
        }
        return JSONResponse(content=data)

    except HTTPException:
        # Propagate deliberate HTTP errors (e.g. the 400s above) untouched.
        raise
    except Exception as e:
        logger.error(f"OpenAI-compatible endpoint error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.get("/")
async def root():
    """Service index: name the API and advertise its endpoints."""
    endpoints = [
        "/v1/chat/completions",
        "/health",
    ]
    return {"message": "OpenAI-compatible Chat API", "endpoints": endpoints}
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    """Liveness probe; reports healthy whenever the process is serving."""
    return dict(status="healthy")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Development entry point. reload=True watches for source changes, which
    # is why the app is passed as an import string ("server_openai:app")
    # rather than the app object — uvicorn requires that for reload mode.
    uvicorn.run(
        "server_openai:app",
        host="0.0.0.0",
        port=8589,
        reload=True,
    )
|
||||
126
fastapi_server/test_openai_client.py
Normal file
126
fastapi_server/test_openai_client.py
Normal file
@@ -0,0 +1,126 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test for OpenAI-compatible API against server_openai.py
|
||||
|
||||
Instructions:
|
||||
- Start the OpenAI-compatible server first, e.g.:
|
||||
python fastapi_server/server_openai.py --llm_name qwen-plus --llm_provider openai --base_url https://dashscope.aliyuncs.com/compatible-mode/v1
|
||||
- Or with uvicorn:
|
||||
uvicorn fastapi_server.server_openai:app --host 0.0.0.0 --port 8589 --reload
|
||||
- Set BASE_URL below to the server base URL you started.
|
||||
"""
|
||||
import os
from dotenv import load_dotenv
from loguru import logger

# Module tag kept for log-filtering conventions (currently unused).
TAG = __name__

# Load environment variables (e.g. OPENAI_BASE_URL) from a local .env file.
load_dotenv()

# Fail fast with an actionable message if the OpenAI SDK is missing.
try:
    from openai import OpenAI
except Exception as e:
    print("openai package not found. Please install it: pip install openai")
    raise

# <<< Paste your running FastAPI base url here >>>
# Defaults to a locally running server_openai.py instance.
BASE_URL = os.getenv("OPENAI_BASE_URL", "http://127.0.0.1:8589/v1")

# Test configuration matching the server setup
# llm_name: "qwen-plus"
# llm_provider: "openai"
# base_url: "https://dashscope.aliyuncs.com/compatible-mode/v1"

# Test messages shared by both tests; the user turn is phrased to exercise
# the pipeline's tool use (calculator).
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "use calculator to calculate 1234*5641"},
]
|
||||
|
||||
|
||||
def test_streaming():
    """Test streaming chat completion.

    Streams a completion from the OpenAI-compatible server at BASE_URL,
    printing deltas as they arrive.

    Returns:
        str: the concatenated streamed response text.

    Raises:
        Exception: re-raised after logging if the request or stream fails.
    """
    print("\n" + "="*60)
    print("Testing STREAMING chat completion...")
    print("="*60 + "\n")

    client = OpenAI(
        base_url=BASE_URL,
        api_key="test-key"  # Dummy key for testing
    )

    try:
        stream = client.chat.completions.create(
            model="qwen-plus",  # Using qwen-plus as configured
            messages=messages,
            stream=True
        )

        # Collect parts and join once instead of quadratic string +=.
        parts = []
        for chunk in stream:
            # Some OpenAI-compatible servers emit chunks with an empty
            # `choices` list (e.g. keep-alives); skip them instead of
            # crashing with an IndexError.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content is not None:
                parts.append(content)
                print(content, end="", flush=True)
        full_response = "".join(parts)

        print("\n\n" + "-"*60)
        print(f"Full streaming response length: {len(full_response)}")
        print("-"*60)

        return full_response

    except Exception:
        # logger.exception records the full traceback, unlike logger.error.
        logger.exception("Streaming test error")
        raise
|
||||
|
||||
|
||||
def test_non_streaming():
    """Test non-streaming chat completion.

    Requests a single completion from the OpenAI-compatible server at
    BASE_URL and prints it.

    Returns:
        str: the assistant message content ("" when the server returned None).

    Raises:
        Exception: re-raised after logging if the request fails.
    """
    print("\n" + "="*60)
    print("Testing NON-STREAMING chat completion...")
    print("="*60 + "\n")

    client = OpenAI(
        base_url=BASE_URL,
        api_key="test-key"  # Dummy key for testing
    )

    try:
        response = client.chat.completions.create(
            model="qwen-plus",  # Using qwen-plus as configured
            messages=messages,
            stream=False
        )

        # message.content is Optional in the OpenAI SDK; coerce None to ""
        # so len(content) below cannot raise TypeError.
        content = response.choices[0].message.content or ""
        print(f"Response: {content}")
        print("\n" + "-"*60)
        print(f"Full non-streaming response length: {len(content)}")
        print(f"Finish reason: {response.choices[0].finish_reason}")
        print("-"*60)

        return content

    except Exception:
        # logger.exception records the full traceback, unlike logger.error.
        logger.exception("Non-streaming test error")
        raise
|
||||
|
||||
|
||||
def main():
    """Run the streaming and non-streaming smoke tests, then print a summary."""
    print(f"\nUsing base_url = {BASE_URL}\n")

    # Exercise both transport modes against the running server.
    stream_text = test_streaming()
    plain_text = test_non_streaming()

    banner = "=" * 60
    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    print(f"Streaming response length: {len(stream_text)}")
    print(f"Non-streaming response length: {len(plain_text)}")
    print("\nBoth tests completed successfully!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this file directly as a smoke test against a live server.
    main()
|
||||
Reference in New Issue
Block a user