#!/bin/bash

# ==============================================================================
# SAM3 project launch and monitoring script
# Purpose: start the Python FastAPI service and continuously monitor its health
# Author:  Trae AI
# Date:    2026-02-17
# ==============================================================================

# Configuration
PROJECT_DIR="/home/quant/data/dev/sam3"    # project root directory
SCRIPT_NAME="fastAPI_tarot.py"             # Python entry script
LOG_FILE="${PROJECT_DIR}/log/monitor.log"  # monitor log file
APP_LOG_FILE="${PROJECT_DIR}/log/app.log"  # application output log file
PORT=55600                                 # service port
CHECK_INTERVAL=5                           # seconds between health checks
MAX_FAILURES=3                             # consecutive failures before a restart
STARTUP_TIMEOUT=300                        # seconds to wait for startup (model loading)
PYTHON_CMD="python"                        # Python command; may be python3 depending on the environment
readonly PROJECT_DIR SCRIPT_NAME LOG_FILE APP_LOG_FILE PORT
readonly CHECK_INTERVAL MAX_FAILURES STARTUP_TIMEOUT PYTHON_CMD

# Switch to the project directory; SCRIPT_NAME is a relative path and is
# resolved from here when the application is launched.
cd "$PROJECT_DIR" || {
  printf '错误: 无法进入项目目录: %s\n' "$PROJECT_DIR" >&2
  exit 1
}

# Both log files live in a sub-directory that may not exist on a fresh
# checkout. Without it every `tee -a` and the application's output redirect
# would fail, so create it up front.
mkdir -p -- "${LOG_FILE%/*}" "${APP_LOG_FILE%/*}" || {
  printf '错误: 无法创建日志目录: %s\n' "${LOG_FILE%/*}" >&2
  exit 1
}

# Mutable monitor state
APP_PID=0      # PID of the running application; 0 means "not started yet"
FAIL_COUNT=0   # number of consecutive failed health checks

# ==============================================================================
# Function:  log_message
# Purpose:   write a timestamped line to the console and append it to LOG_FILE
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-mm-dd HH:MM:SS] <message>" on stdout and in LOG_FILE
# Returns:   exit status of the final `tee`
# ==============================================================================
log_message() {
  # Declared separately from the assignments so `local` does not mask the
  # exit status of the command substitutions.
  local timestamp log_dir
  timestamp=$(date "+%Y-%m-%d %H:%M:%S")

  # Create the log directory on demand so the first message is not lost when
  # the directory has not been created yet.
  log_dir=$(dirname -- "$LOG_FILE")
  [[ -d "$log_dir" ]] || mkdir -p -- "$log_dir" 2>/dev/null

  printf '[%s] %s\n' "$timestamp" "${1-}" | tee -a "$LOG_FILE"
}

# ==============================================================================
# Function: start_app
# Purpose:  launch the FastAPI service in the background, record its PID in
#           APP_PID, then poll http://127.0.0.1:$PORT/docs until it returns 200
# Globals:  PYTHON_CMD, SCRIPT_NAME, APP_LOG_FILE, PORT, STARTUP_TIMEOUT (read)
#           HEALTH_TIMEOUT (read, optional; per-probe curl limit, default 10 s)
#           APP_PID (written)
# Returns:  0 once the service answers with HTTP 200
#           1 if the process exits during startup or STARTUP_TIMEOUT elapses
# ==============================================================================
start_app() {
  local http_code started_at elapsed=0 next_report=30
  local -r poll_interval=5

  log_message "正在启动项目: $SCRIPT_NAME ..."
  log_message "应用日志将输出到: $APP_LOG_FILE"

  # Start the Python script in the background with stdout and stderr sent to
  # the application log. -u disables output buffering so the log is current.
  # The log is opened in append mode: this function also runs on automatic
  # restarts, and truncating here would erase the output of the crash that
  # triggered the restart.
  # PYTHON_CMD is deliberately unquoted so it may carry extra arguments.
  # shellcheck disable=SC2086
  nohup $PYTHON_CMD -u "$SCRIPT_NAME" >> "$APP_LOG_FILE" 2>&1 &

  APP_PID=$!
  log_message "项目已启动,PID: $APP_PID"

  log_message "正在等待服务初始化 (最多等待 ${STARTUP_TIMEOUT} 秒)..."

  # Measure elapsed time with the shell's SECONDS counter so the time spent
  # inside curl counts towards the timeout as well as the sleeps.
  started_at=$SECONDS
  while (( elapsed < STARTUP_TIMEOUT )); do
    # Give up early if the process has already died.
    if ! kill -0 "$APP_PID" 2>/dev/null; then
      log_message "错误: 进程在启动过程中退出。请检查应用日志。"
      return 1
    fi

    # Probe the HTTP endpoint. --max-time bounds a probe against a server that
    # accepts the connection but never answers.
    http_code=$(curl -s -o /dev/null -w "%{http_code}" \
      --max-time "${HEALTH_TIMEOUT:-10}" "http://127.0.0.1:${PORT}/docs")
    if [[ "$http_code" == "200" ]]; then
      log_message "服务启动成功!"
      return 0
    fi

    sleep "$poll_interval"
    elapsed=$(( SECONDS - started_at ))

    # Emit a progress line roughly every 30 seconds.
    if (( elapsed >= next_report )); then
      log_message "仍在等待服务启动... (已耗时 ${elapsed} 秒)"
      while (( next_report <= elapsed )); do
        next_report=$(( next_report + 30 ))
      done
    fi
  done

  log_message "错误: 服务启动超时 (${STARTUP_TIMEOUT} 秒)。正在终止进程..."
  # Reuse stop_app so the process gets a SIGTERM and a grace period before
  # SIGKILL, instead of being killed with -9 outright.
  stop_app
  return 1
}

# ==============================================================================
# Function: stop_app
# Purpose:  stop the application identified by APP_PID: send SIGTERM, wait up
#           to 5 seconds for it to exit, then fall back to SIGKILL
# Globals:  APP_PID (read)
# Returns:  0
# ==============================================================================
stop_app() {
  local attempt

  # Nothing to stop unless APP_PID is a positive integer. The pattern test
  # also covers an empty or non-numeric value, which would make a numeric
  # `[ ... -gt 0 ]` comparison print an error.
  [[ "$APP_PID" =~ ^[1-9][0-9]*$ ]] || return 0

  log_message "正在停止项目 (PID: $APP_PID)..."
  kill "$APP_PID" 2>/dev/null

  # Wait for the process to exit.
  for attempt in {1..5}; do
    if ! kill -0 "$APP_PID" 2>/dev/null; then
      log_message "项目已停止"
      return 0
    fi
    sleep 1
  done

  # Still running after the grace period: force-kill it.
  log_message "项目未响应,正在强制终止..."
  kill -9 "$APP_PID" 2>/dev/null
  return 0
}

# ==============================================================================
# Function: check_health
# Purpose:  check that the application process exists and that
#           http://127.0.0.1:$PORT/docs answers with HTTP 200
# Globals:  APP_PID, PORT (read)
#           HEALTH_TIMEOUT (read, optional; per-probe curl limit, default 10 s)
# Returns:  0 (healthy) / 1 (unhealthy)
# ==============================================================================
check_health() {
  local http_code

  # 1. The process must exist. APP_PID must be a positive integer first:
  #    `kill -0 0` addresses this script's own process group and therefore
  #    always succeeds, which would report a never-started app as alive.
  if [[ ! "$APP_PID" =~ ^[1-9][0-9]*$ ]] || ! kill -0 "$APP_PID" 2>/dev/null; then
    log_message "警告: 进程 $APP_PID 不存在"
    return 1
  fi

  # 2. The port must respond (request the /docs endpoint). curl prints only
  #    the HTTP status code. --max-time turns a hung server into a failed
  #    check instead of blocking the monitor indefinitely.
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --max-time "${HEALTH_TIMEOUT:-10}" "http://127.0.0.1:${PORT}/docs")

  if [[ "$http_code" != "200" ]]; then
    log_message "警告: 健康检查失败,HTTP 状态码: $http_code"
    return 1
  fi

  return 0
}

# ==============================================================================
# Main loop
# ==============================================================================

# Initial start. start_app logs its own failures, and a failed start leaves no
# live process behind, so the loop below detects it and retries. The return
# status is therefore deliberately not acted on here.
start_app

while true; do
  if check_health; then
    # Health check passed: reset the consecutive-failure counter.
    FAIL_COUNT=0
    # log_message "健康检查通过"  # optional; left disabled to keep the log small
  else
    # Health check failed. Plain assignment is used instead of ((FAIL_COUNT++)),
    # which returns a non-zero status when the counter was 0.
    FAIL_COUNT=$(( FAIL_COUNT + 1 ))
    log_message "健康检查失败 ($FAIL_COUNT/$MAX_FAILURES)"

    if (( FAIL_COUNT >= MAX_FAILURES )); then
      log_message "错误: 连续检测失败次数过多,准备重启项目..."
      stop_app
      start_app
      FAIL_COUNT=0
    elif ! kill -0 "$APP_PID" 2>/dev/null; then
      # The process is gone: restart now instead of waiting for more failures.
      log_message "错误: 进程意外退出,立即重启..."
      start_app
      FAIL_COUNT=0
    fi
  fi

  sleep "$CHECK_INTERVAL"
done