admin update
This commit is contained in:
120
fastAPI_tarot.py
120
fastAPI_tarot.py
@@ -21,6 +21,7 @@ import traceback
|
||||
import re
|
||||
import asyncio
|
||||
import shutil
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any
|
||||
from contextlib import asynccontextmanager
|
||||
@@ -1287,12 +1288,38 @@ async def get_config(request: Request):
|
||||
"""
|
||||
Get system config info
|
||||
"""
|
||||
device = "Unknown"
|
||||
device_str = "Unknown"
|
||||
gpu_status = {}
|
||||
|
||||
if hasattr(request.app.state, "device"):
|
||||
device = str(request.app.state.device)
|
||||
device_str = str(request.app.state.device)
|
||||
|
||||
# 获取 GPU 详细信息
|
||||
if torch.cuda.is_available():
|
||||
try:
|
||||
device_id = torch.cuda.current_device()
|
||||
props = torch.cuda.get_device_properties(device_id)
|
||||
|
||||
total_mem = props.total_memory
|
||||
reserved_mem = torch.cuda.memory_reserved(device_id)
|
||||
allocated_mem = torch.cuda.memory_allocated(device_id)
|
||||
|
||||
gpu_status = {
|
||||
"available": True,
|
||||
"name": props.name,
|
||||
"total_memory": f"{total_mem / 1024**3:.2f} GB",
|
||||
"reserved_memory": f"{reserved_mem / 1024**3:.2f} GB",
|
||||
"allocated_memory": f"{allocated_mem / 1024**3:.2f} GB",
|
||||
"memory_usage_percent": round((reserved_mem / total_mem) * 100, 1)
|
||||
}
|
||||
except Exception as e:
|
||||
gpu_status = {"available": True, "error": str(e)}
|
||||
else:
|
||||
gpu_status = {"available": False, "reason": "No CUDA device detected"}
|
||||
|
||||
return {
|
||||
"device": device,
|
||||
"device": device_str,
|
||||
"gpu_status": gpu_status,
|
||||
"cleanup_config": CLEANUP_CONFIG,
|
||||
"current_qwen_model": QWEN_MODEL,
|
||||
"available_qwen_models": AVAILABLE_QWEN_MODELS
|
||||
@@ -1348,6 +1375,93 @@ async def update_prompts(
|
||||
PROMPTS[key] = content
|
||||
return {"status": "success", "message": f"Prompt '{key}' updated"}
|
||||
|
||||
# ------------------------------------------
|
||||
# GPU Status Helper & API
|
||||
# ------------------------------------------
|
||||
|
||||
def get_gpu_status_smi():
    """
    Get detailed GPU status using nvidia-smi.

    Queries the first GPU reported by nvidia-smi; falls back to torch's
    CUDA memory counters when nvidia-smi is unavailable or fails.

    Returns:
        dict: utilization, memory, temperature and power readings, device
        name, driver and CUDA versions, the data source ("nvidia-smi" or
        "torch") and a timestamp when a GPU is reachable; otherwise
        ``{"available": False, "error": <reason>}``.
    """
    def _to_float(raw):
        # nvidia-smi emits "[N/A]" for fields a GPU does not support
        # (e.g. power.draw on some mobile GPUs). Treat those as 0.0
        # instead of letting float() raise and discard the whole row.
        try:
            return float(raw)
        except (TypeError, ValueError):
            return 0.0

    cuda_version = "Unknown"
    try:
        import torch
        if torch.version.cuda:
            cuda_version = torch.version.cuda
    except Exception:
        # torch missing or built without CUDA metadata; keep "Unknown".
        pass

    try:
        # Fields queried, in order: utilization.gpu, utilization.memory,
        # temperature.gpu, power.draw, power.limit, memory.total,
        # memory.used, memory.free, name, driver_version
        result = subprocess.run(
            ['nvidia-smi',
             '--query-gpu=utilization.gpu,utilization.memory,temperature.gpu,power.draw,power.limit,memory.total,memory.used,memory.free,name,driver_version',
             '--format=csv,noheader,nounits'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding='utf-8',
            timeout=10,  # don't hang the request if the driver is wedged
        )

        if result.returncode != 0:
            raise Exception("nvidia-smi failed")

        # Parse the first line only (single-GPU assumption; multi-GPU
        # hosts report just their first device here).
        line = result.stdout.strip().split('\n')[0]
        vals = [x.strip() for x in line.split(',')]

        return {
            "available": True,
            "gpu_util": _to_float(vals[0]),     # %
            "mem_util": _to_float(vals[1]),     # % (memory controller utilization)
            "temperature": _to_float(vals[2]),  # deg C
            "power_draw": _to_float(vals[3]),   # W ("[N/A]" -> 0.0)
            "power_limit": _to_float(vals[4]),  # W
            "mem_total": _to_float(vals[5]),    # MB
            "mem_used": _to_float(vals[6]),     # MB
            "mem_free": _to_float(vals[7]),     # MB
            "name": vals[8],
            "driver_version": vals[9],
            "cuda_version": cuda_version,
            "source": "nvidia-smi",
            "timestamp": time.time()
        }
    except Exception as e:
        # Fallback: torch can at least report its own allocator's view of
        # device memory. NOTE(review): memory_reserved covers only this
        # process's torch allocator, not total GPU usage — the "mem_used"
        # reported here understates multi-process hosts.
        if torch.cuda.is_available():
            try:
                device_id = torch.cuda.current_device()
                props = torch.cuda.get_device_properties(device_id)
                mem_reserved = torch.cuda.memory_reserved(device_id) / 1024**2  # MB
                mem_total = props.total_memory / 1024**2  # MB

                return {
                    "available": True,
                    "gpu_util": 0,  # torch exposes no GPU-core utilization counter
                    "mem_util": (mem_reserved / mem_total) * 100,
                    "temperature": 0,
                    "power_draw": 0,
                    "power_limit": 0,
                    "mem_total": mem_total,
                    "mem_used": mem_reserved,
                    "mem_free": mem_total - mem_reserved,
                    "name": props.name,
                    "driver_version": "Unknown",
                    "cuda_version": cuda_version,
                    "source": "torch",
                    "timestamp": time.time()
                }
            except Exception:
                # Torch probe itself failed; fall through to the error dict.
                pass

        return {"available": False, "error": str(e)}
|
||||
|
||||
@app.get("/admin/api/gpu/status", dependencies=[Depends(verify_admin)])
async def get_gpu_status_api():
    """
    Get real-time GPU status.

    Admin-only endpoint (guarded by the verify_admin dependency). Returns
    the dict produced by get_gpu_status_smi(): nvidia-smi readings when
    available, otherwise a torch-based fallback, or
    {"available": False, ...} when no GPU info can be obtained.
    """
    # Synchronous call inside an async handler: get_gpu_status_smi shells
    # out to nvidia-smi, which blocks the event loop for its duration.
    # NOTE(review): consider run_in_executor if this endpoint is polled
    # frequently — confirm acceptable latency first.
    return get_gpu_status_smi()
|
||||
|
||||
# ==========================================
|
||||
# 10. Main Entry Point (启动入口)
|
||||
# ==========================================
|
||||
|
||||
Reference in New Issue
Block a user