admin update
This commit is contained in:
120
fastAPI_tarot.py
120
fastAPI_tarot.py
@@ -21,6 +21,7 @@ import traceback
|
||||
import re
|
||||
import asyncio
|
||||
import shutil
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any
|
||||
from contextlib import asynccontextmanager
|
||||
@@ -1287,12 +1288,38 @@ async def get_config(request: Request):
|
||||
"""
|
||||
Get system config info
|
||||
"""
|
||||
device = "Unknown"
|
||||
device_str = "Unknown"
|
||||
gpu_status = {}
|
||||
|
||||
if hasattr(request.app.state, "device"):
|
||||
device = str(request.app.state.device)
|
||||
device_str = str(request.app.state.device)
|
||||
|
||||
# 获取 GPU 详细信息
|
||||
if torch.cuda.is_available():
|
||||
try:
|
||||
device_id = torch.cuda.current_device()
|
||||
props = torch.cuda.get_device_properties(device_id)
|
||||
|
||||
total_mem = props.total_memory
|
||||
reserved_mem = torch.cuda.memory_reserved(device_id)
|
||||
allocated_mem = torch.cuda.memory_allocated(device_id)
|
||||
|
||||
gpu_status = {
|
||||
"available": True,
|
||||
"name": props.name,
|
||||
"total_memory": f"{total_mem / 1024**3:.2f} GB",
|
||||
"reserved_memory": f"{reserved_mem / 1024**3:.2f} GB",
|
||||
"allocated_memory": f"{allocated_mem / 1024**3:.2f} GB",
|
||||
"memory_usage_percent": round((reserved_mem / total_mem) * 100, 1)
|
||||
}
|
||||
except Exception as e:
|
||||
gpu_status = {"available": True, "error": str(e)}
|
||||
else:
|
||||
gpu_status = {"available": False, "reason": "No CUDA device detected"}
|
||||
|
||||
return {
|
||||
"device": device,
|
||||
"device": device_str,
|
||||
"gpu_status": gpu_status,
|
||||
"cleanup_config": CLEANUP_CONFIG,
|
||||
"current_qwen_model": QWEN_MODEL,
|
||||
"available_qwen_models": AVAILABLE_QWEN_MODELS
|
||||
@@ -1348,6 +1375,93 @@ async def update_prompts(
|
||||
PROMPTS[key] = content
|
||||
return {"status": "success", "message": f"Prompt '{key}' updated"}
|
||||
|
||||
# ------------------------------------------
|
||||
# GPU Status Helper & API
|
||||
# ------------------------------------------
|
||||
|
||||
def get_gpu_status_smi():
    """
    Get detailed GPU status using nvidia-smi.

    Queries the first GPU reported by nvidia-smi; falls back to torch's
    CUDA memory counters when nvidia-smi is unavailable or fails.

    Returns:
        dict: utilization, memory, temperature and power readings, device
        name, driver and CUDA versions, the data source ("nvidia-smi" or
        "torch") and a timestamp when a GPU is reachable; otherwise
        ``{"available": False, "error": <reason>}``.
    """
    def _to_float(raw):
        # nvidia-smi emits "[N/A]" for fields a GPU does not support
        # (e.g. power.draw on some mobile GPUs). Treat those as 0.0
        # instead of letting float() raise and discard the whole row.
        try:
            return float(raw)
        except (TypeError, ValueError):
            return 0.0

    cuda_version = "Unknown"
    try:
        import torch
        if torch.version.cuda:
            cuda_version = torch.version.cuda
    except Exception:
        # torch missing or built without CUDA metadata; keep "Unknown".
        pass

    try:
        # Fields queried, in order: utilization.gpu, utilization.memory,
        # temperature.gpu, power.draw, power.limit, memory.total,
        # memory.used, memory.free, name, driver_version
        result = subprocess.run(
            ['nvidia-smi',
             '--query-gpu=utilization.gpu,utilization.memory,temperature.gpu,power.draw,power.limit,memory.total,memory.used,memory.free,name,driver_version',
             '--format=csv,noheader,nounits'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding='utf-8',
            timeout=10,  # don't hang the request if the driver is wedged
        )

        if result.returncode != 0:
            raise Exception("nvidia-smi failed")

        # Parse the first line only (single-GPU assumption; multi-GPU
        # hosts report just their first device here).
        line = result.stdout.strip().split('\n')[0]
        vals = [x.strip() for x in line.split(',')]

        return {
            "available": True,
            "gpu_util": _to_float(vals[0]),     # %
            "mem_util": _to_float(vals[1]),     # % (memory controller utilization)
            "temperature": _to_float(vals[2]),  # deg C
            "power_draw": _to_float(vals[3]),   # W ("[N/A]" -> 0.0)
            "power_limit": _to_float(vals[4]),  # W
            "mem_total": _to_float(vals[5]),    # MB
            "mem_used": _to_float(vals[6]),     # MB
            "mem_free": _to_float(vals[7]),     # MB
            "name": vals[8],
            "driver_version": vals[9],
            "cuda_version": cuda_version,
            "source": "nvidia-smi",
            "timestamp": time.time()
        }
    except Exception as e:
        # Fallback: torch can at least report its own allocator's view of
        # device memory. NOTE(review): memory_reserved covers only this
        # process's torch allocator, not total GPU usage — the "mem_used"
        # reported here understates multi-process hosts.
        if torch.cuda.is_available():
            try:
                device_id = torch.cuda.current_device()
                props = torch.cuda.get_device_properties(device_id)
                mem_reserved = torch.cuda.memory_reserved(device_id) / 1024**2  # MB
                mem_total = props.total_memory / 1024**2  # MB

                return {
                    "available": True,
                    "gpu_util": 0,  # torch exposes no GPU-core utilization counter
                    "mem_util": (mem_reserved / mem_total) * 100,
                    "temperature": 0,
                    "power_draw": 0,
                    "power_limit": 0,
                    "mem_total": mem_total,
                    "mem_used": mem_reserved,
                    "mem_free": mem_total - mem_reserved,
                    "name": props.name,
                    "driver_version": "Unknown",
                    "cuda_version": cuda_version,
                    "source": "torch",
                    "timestamp": time.time()
                }
            except Exception:
                # Torch probe itself failed; fall through to the error dict.
                pass

        return {"available": False, "error": str(e)}
|
||||
|
||||
@app.get("/admin/api/gpu/status", dependencies=[Depends(verify_admin)])
async def get_gpu_status_api():
    """
    Get real-time GPU status.

    Admin-only endpoint (guarded by the verify_admin dependency). Returns
    the dict produced by get_gpu_status_smi(): nvidia-smi readings when
    available, otherwise a torch-based fallback, or
    {"available": False, ...} when no GPU info can be obtained.
    """
    # Synchronous call inside an async handler: get_gpu_status_smi shells
    # out to nvidia-smi, which blocks the event loop for its duration.
    # NOTE(review): consider run_in_executor if this endpoint is polled
    # frequently — confirm acceptable latency first.
    return get_gpu_status_smi()
|
||||
|
||||
# ==========================================
|
||||
# 10. Main Entry Point (启动入口)
|
||||
# ==========================================
|
||||
|
||||
Reference in New Issue
Block a user