admin
This commit is contained in:
164
run_monitor.sh
Executable file
164
run_monitor.sh
Executable file
@@ -0,0 +1,164 @@
|
||||
#!/bin/bash

# ==============================================================================
# SAM3 project launch and monitoring script
# Purpose: start the Python FastAPI service and continuously monitor its health
# Author:  Trae AI
# Date:    2026-02-17
# ==============================================================================

# Configuration
PROJECT_DIR="/home/quant/data/dev/sam3"      # project root directory
SCRIPT_NAME="fastAPI_tarot.py"               # Python entry script
LOG_FILE="${PROJECT_DIR}/log/monitor.log"    # monitor log file
APP_LOG_FILE="${PROJECT_DIR}/log/app.log"    # application output log file
PORT=55600                                   # service port
CHECK_INTERVAL=5                             # seconds between health checks
MAX_FAILURES=3                               # consecutive failures before a restart
STARTUP_TIMEOUT=300                          # seconds to wait for startup (model loading)
PYTHON_CMD="python"                          # Python command; may be python3 depending on the environment

# Switch to the project directory; the entry script is launched by relative name.
cd "$PROJECT_DIR" || {
    printf 'cannot cd to %s\n' "$PROJECT_DIR" >&2
    exit 1
}

# Both log files live under log/; make sure it exists before anything writes there.
mkdir -p -- "${PROJECT_DIR}/log" || {
    printf 'cannot create log directory %s\n' "${PROJECT_DIR}/log" >&2
    exit 1
}

# Runtime state
APP_PID=0       # PID of the launched application (0 = not started yet)
FAIL_COUNT=0    # consecutive failed health checks

# ==============================================================================
# Function: log_message
# Purpose:  write a timestamped line to the console and append it to LOG_FILE
# Globals:  LOG_FILE (read)
# Args:     $1 - message text
# Returns:  exit status of tee
# ==============================================================================
log_message() {
    local timestamp log_dir

    # Declared separately from the assignment so a date failure is not masked.
    timestamp=$(date "+%Y-%m-%d %H:%M:%S")

    # tee -a fails if the parent directory is missing; create it on demand.
    log_dir=${LOG_FILE%/*}
    if [[ -n "$log_dir" && "$log_dir" != "$LOG_FILE" && ! -d "$log_dir" ]]; then
        mkdir -p -- "$log_dir"
    fi

    printf '[%s] %s\n' "$timestamp" "${1-}" | tee -a "$LOG_FILE"
}

# ==============================================================================
# Function: start_app
# Purpose:  launch the FastAPI service in the background, record its PID, and
#           wait until GET /docs answers 200 or STARTUP_TIMEOUT elapses
# Globals:  SCRIPT_NAME, APP_LOG_FILE, PYTHON_CMD, PORT, STARTUP_TIMEOUT (read)
#           APP_PID (written)
# Returns:  0 once the service answers 200;
#           1 if the process exits during startup or the timeout is reached
# ==============================================================================
start_app() {
    local -r poll_interval=5      # seconds between readiness probes
    local -r progress_every=30    # emit a "still waiting" line this often (seconds)
    local -a launcher
    local app_log_dir http_code
    local elapsed=0

    log_message "正在启动项目: $SCRIPT_NAME ..."
    log_message "应用日志将输出到: $APP_LOG_FILE"

    # The redirection below fails if the log directory is missing.
    app_log_dir=${APP_LOG_FILE%/*}
    if [[ -n "$app_log_dir" && "$app_log_dir" != "$APP_LOG_FILE" ]]; then
        mkdir -p -- "$app_log_dir"
    fi

    # PYTHON_CMD may contain several words, so split it
    # into an array instead of relying on unquoted expansion.
    read -r -a launcher <<< "$PYTHON_CMD"

    # Launch in the background with stdout and stderr sent to the app log.
    # -u gives unbuffered output so the log is updated in real time.
    # Append (>>) rather than truncate: a restart must not erase the output
    # of the crash that caused it.
    nohup "${launcher[@]}" -u "$SCRIPT_NAME" >> "$APP_LOG_FILE" 2>&1 &
    APP_PID=$!
    log_message "项目已启动,PID: $APP_PID"

    log_message "正在等待服务初始化 (最多等待 ${STARTUP_TIMEOUT} 秒)..."

    # Poll until the service is ready. elapsed counts sleep time only, so the
    # real wall-clock wait can be slightly longer than STARTUP_TIMEOUT.
    while (( elapsed < STARTUP_TIMEOUT )); do
        # Give up early if the process has already died.
        if ! kill -0 "$APP_PID" 2>/dev/null; then
            log_message "错误: 进程在启动过程中退出。请检查应用日志。"
            return 1
        fi

        # Probe the HTTP endpoint; --max-time keeps a hung server from
        # blocking the monitor (curl prints 000 when no response arrives).
        http_code=$(curl -s -o /dev/null --max-time "$poll_interval" \
            -w '%{http_code}' "http://127.0.0.1:${PORT}/docs")
        if [[ "$http_code" == "200" ]]; then
            log_message "服务启动成功!"
            return 0
        fi

        sleep "$poll_interval"
        elapsed=$(( elapsed + poll_interval ))

        # Print a progress line every 30 seconds.
        if (( elapsed % progress_every == 0 )); then
            log_message "仍在等待服务启动... (已耗时 ${elapsed} 秒)"
        fi
    done

    log_message "错误: 服务启动超时 (${STARTUP_TIMEOUT} 秒)。正在终止进程..."
    # Ask politely first, then force; finally reap the child.
    kill "$APP_PID" 2>/dev/null
    sleep 2
    if kill -0 "$APP_PID" 2>/dev/null; then
        kill -9 "$APP_PID" 2>/dev/null
    fi
    wait "$APP_PID" 2>/dev/null
    return 1
}

# ==============================================================================
# Function: stop_app
# Purpose:  stop the application identified by APP_PID; send SIGTERM, wait up
#           to 5 seconds, then fall back to SIGKILL
# Globals:  APP_PID (read)
# Returns:  0
# ==============================================================================
stop_app() {
    local pid=${APP_PID:-0}

    # Nothing to stop when no valid PID has been recorded. The explicit
    # numeric check avoids a test error on an empty or non-numeric APP_PID.
    if ! [[ "$pid" =~ ^[0-9]+$ ]] || (( pid <= 0 )); then
        return 0
    fi

    log_message "正在停止项目 (PID: $pid)..."
    kill "$pid" 2>/dev/null

    # Wait for the process to exit, checking once per second.
    for _ in {1..5}; do
        if ! kill -0 "$pid" 2>/dev/null; then
            log_message "项目已停止"
            return 0
        fi
        sleep 1
    done

    # Still running: force-kill, then reap the child.
    log_message "项目未响应,正在强制终止..."
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null
    return 0
}

# ==============================================================================
# Function: check_health
# Purpose:  verify that the application process exists and that GET /docs on
#           the service port answers HTTP 200
# Globals:  APP_PID, PORT (read)
# Returns:  0 (healthy) / 1 (unhealthy)
# ==============================================================================
check_health() {
    local -r probe_timeout=5    # seconds before an unanswered request is abandoned
    local pid=${APP_PID:-0}
    local http_code

    # 1. Check that the process exists.
    #    PID 0 must be rejected explicitly: `kill -0 0` addresses the caller's
    #    own process group and therefore always succeeds.
    if ! [[ "$pid" =~ ^[1-9][0-9]*$ ]] || ! kill -0 "$pid" 2>/dev/null; then
        log_message "警告: 进程 $APP_PID 不存在"
        return 1
    fi

    # 2. Check the HTTP endpoint by asking curl for the status code only.
    #    --max-time keeps a hung server from blocking the monitor loop
    #    (curl prints 000 when no response arrives).
    http_code=$(curl -s -o /dev/null --max-time "$probe_timeout" \
        -w '%{http_code}' "http://127.0.0.1:${PORT}/docs")

    if [[ "$http_code" == "200" ]]; then
        return 0
    fi

    log_message "警告: 健康检查失败,HTTP 状态码: $http_code"
    return 1
}

# ==============================================================================
# Main loop
# Launch the application once, then check its health every CHECK_INTERVAL
# seconds. Restart it after MAX_FAILURES consecutive failed checks, or
# immediately when the process itself has disappeared.
# ==============================================================================

# Initial launch. The return status is deliberately not checked: if the launch
# fails, the loop below detects the missing process and retries.
start_app

while true; do
    if check_health; then
        # Health check passed.
        FAIL_COUNT=0
        # log_message "健康检查通过"  # optional; left disabled to keep the log small
    else
        # Health check failed. Plain arithmetic assignment is used because
        # ((FAIL_COUNT++)) returns a non-zero status when the counter is 0.
        FAIL_COUNT=$(( FAIL_COUNT + 1 ))
        log_message "健康检查失败 ($FAIL_COUNT/$MAX_FAILURES)"

        if (( FAIL_COUNT >= MAX_FAILURES )); then
            log_message "错误: 连续检测失败次数过多,准备重启项目..."
            stop_app
            start_app
            FAIL_COUNT=0
        elif ! kill -0 "$APP_PID" 2>/dev/null; then
            # The process is gone altogether: restart right away.
            log_message "错误: 进程意外退出,立即重启..."
            start_app
            FAIL_COUNT=0
        fi
    fi

    sleep "$CHECK_INTERVAL"
done
Reference in New Issue
Block a user