test / scripts /hf-entrypoint.sh
GGSheng's picture
feat: deploy Gemma 4 to hf space
55ba4a3 verified
#!/usr/bin/env bash
# hf-entrypoint.sh - container entrypoint for HF Spaces
set -euo pipefail

echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting..."

# ============================================
# 0. Save environment variables under /etc/profile.d for later bash sessions
# ============================================
# The export itself is delegated to save-env.sh; when that helper is missing
# or not executable the step is skipped with a warning.
if [[ ! -x /usr/local/bin/save-env.sh ]]; then
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: warning: save-env.sh not found, skipping env export"
else
  /usr/local/bin/save-env.sh
fi

# Load the previously saved environment into this shell, if the file exists.
if [[ -f /etc/profile.d/openclaw-env.sh ]]; then
  # shellcheck source=/dev/null
  . /etc/profile.d/openclaw-env.sh
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: loaded environment from /etc/profile.d/openclaw-env.sh"
fi
# ============================================
# 1. Start supervisord (manages cron + openclaw-gateway)
# ============================================
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting supervisord..."
mkdir -p /var/run /var/log/supervisor /var/log/hf-entrypoint
/usr/bin/supervisord -c /etc/supervisor/supervisord.conf \
  >> /var/log/hf-entrypoint/supervisord-stdout.log \
  2>> /var/log/hf-entrypoint/supervisord-stderr.log &
SUPERVISORD_PID=$!
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: supervisord started (pid=$SUPERVISORD_PID)"

# Wait for supervisord to write its pid file, but never wait forever:
#   - if the launched process exits with a non-zero status first, startup failed;
#   - if it exits with status 0 it may simply have daemonized, so keep waiting;
#   - after SUPERVISORD_WAIT_TIMEOUT seconds (default 120) continue with a warning.
SUPERVISORD_WAIT_TIMEOUT="${SUPERVISORD_WAIT_TIMEOUT:-120}"
_supervisord_polls=0   # number of 0.5s polls performed so far
_supervisord_reaped=0  # 1 once the launched process has been waited on
while [[ ! -f /var/run/supervisord.pid ]]; do
  if [[ $_supervisord_reaped -eq 0 ]] && ! kill -0 "$SUPERVISORD_PID" 2>/dev/null; then
    _supervisord_rc=0
    wait "$SUPERVISORD_PID" || _supervisord_rc=$?
    _supervisord_reaped=1
    if [[ $_supervisord_rc -ne 0 ]]; then
      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ERROR: supervisord exited with status $_supervisord_rc before creating /var/run/supervisord.pid (see /var/log/hf-entrypoint/supervisord-stderr.log)" >&2
      exit 1
    fi
  fi
  if [[ $((_supervisord_polls / 2)) -ge $SUPERVISORD_WAIT_TIMEOUT ]]; then
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: /var/run/supervisord.pid did not appear within ${SUPERVISORD_WAIT_TIMEOUT}s, continuing anyway" >&2
    break
  fi
  sleep 0.5
  _supervisord_polls=$((_supervisord_polls + 1))
done
if [[ -f /var/run/supervisord.pid ]]; then
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: supervisord is ready"
fi
# 1.0 Start the SSH service and its watchdog (keeps SSH continuously available)
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting SSH service and watchdog..."

# 0. Set the root password (only when the ROOT_PASSWORD environment variable is set)
if [[ -n "${ROOT_PASSWORD:-}" ]]; then
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: setting root password..."
  # The pipeline is tested directly in the `if`: with `set -e -o pipefail` a
  # failing bare pipeline would abort the script before a later `$?` check,
  # so the warning branch could never run.
  if printf '%s\n' "root:${ROOT_PASSWORD}" | chpasswd 2>/dev/null; then
    # Make sure the root account is not locked
    passwd -u root 2>/dev/null || true
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: root password set successfully"
  else
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: failed to set root password"
  fi
fi

# 0.1 Create the SSH privilege-separation directories and clear stale PID files/sockets
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: preparing SSH runtime directories..."
# Best-effort: failures here are deliberately ignored.
mkdir -p /run/sshd /var/run/sshd 2>/dev/null || true
chmod 755 /run/sshd /var/run/sshd 2>/dev/null || true
rm -f /var/run/sshd.pid /var/run/sshd.init.pid /tmp/ssh-* 2>/dev/null || true

# 0.2 Generate SSH host keys (only the RSA key's presence is checked)
if [[ ! -f "/etc/ssh/ssh_host_rsa_key" ]]; then
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: generating SSH host keys..."
  ssh-keygen -A 2>/dev/null || true
fi
# 1. Make sure the SSH daemon is running
if ! pgrep -x "sshd" > /dev/null 2>&1; then
  _sshd_bin=""
  if [[ -x "/usr/sbin/sshd" ]]; then
    _sshd_bin="/usr/sbin/sshd"
  elif [[ -x "/usr/bin/sshd" ]]; then
    _sshd_bin="/usr/bin/sshd"
  fi
  if [[ -n "$_sshd_bin" ]]; then
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting sshd from $_sshd_bin..."
    # A failed launch must not abort the entrypoint (set -e); the pgrep check
    # below reports the failure and leaves recovery to the watchdog.
    "$_sshd_bin" || true
    sleep 2
    if pgrep -x "sshd" > /dev/null 2>&1; then
      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: sshd started successfully"
      # Make sshd a less likely OOM-killer victim by lowering its oom_score_adj.
      # The group redirect hides both open and write errors (best-effort).
      for _pid in $(pgrep -x "sshd" 2>/dev/null); do
        { echo -500 > "/proc/${_pid}/oom_score_adj"; } 2>/dev/null || true
      done
      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: sshd OOM protection applied (oom_score_adj=-500)"
      # Adjust the kernel overcommit policy to reduce the chance of the OOM
      # killer hitting critical services; report honestly whether it worked.
      if { echo "2" > /proc/sys/vm/overcommit_memory; } 2>/dev/null; then
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: vm.overcommit_memory=2 set (never overcommit)"
      else
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: could not set vm.overcommit_memory=2"
      fi
    else
      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: sshd failed to start, will be handled by watchdog"
    fi
  else
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: sshd executable not found"
  fi
fi
# Launch the SSH watchdog (keeps the SSH service available).
# SSH_WATCHDOG_DOCKER_LOG controls whether the watchdog's output is redirected
# to PID 1's stdout/stderr (i.e. the container log). The assignment below
# defaults it to "false" when the variable is unset or empty.
SSH_WATCHDOG_DOCKER_LOG="${SSH_WATCHDOG_DOCKER_LOG:-false}"
if [[ ! -x "/usr/local/bin/ssh_service_watchdog.sh" ]]; then
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: ssh_service_watchdog.sh not found"
else
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting SSH watchdog (docker_log=$SSH_WATCHDOG_DOCKER_LOG)..."
  case "$SSH_WATCHDOG_DOCKER_LOG" in
    true)
      # Send watchdog output to the container's standard streams so it can be
      # inspected with `docker logs` even when no terminal can connect.
      nohup /usr/local/bin/ssh_service_watchdog.sh >> /proc/1/fd/1 2>> /proc/1/fd/2 &
      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: SSH watchdog logs are available via: docker logs <container>"
      ;;
    *)
      # Discard the watchdog's stdout/stderr. NOTE(review): the original note
      # says the watchdog logs to /var/log/ssh_watchdog.log itself - confirm.
      nohup /usr/local/bin/ssh_service_watchdog.sh > /dev/null 2>&1 &
      ;;
  esac
  # $! still refers to the watchdog: no other background job was started since.
  SSH_WATCHDOG_PID=$!
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: SSH watchdog started (pid=$SSH_WATCHDOG_PID)"
fi
# 1.1 Start BT Panel (launched alongside the restore to save time)
# ============================================
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting BT Panel..."
# The presence of default.pl is used as the "panel is installed" marker.
if [[ ! -f "/www/server/panel/default.pl" ]]; then
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: BT Panel not installed, skipping"
else
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: BT Panel is installed, starting..."
  # Both commands are best-effort; their errors are intentionally ignored.
  bt start 2>/dev/null || true
  bt default 2>/dev/null || true
fi
# 1.2 Wait for openclaw-gateway to finish its restore
# ============================================
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: waiting for openclaw-gateway to complete restore..."

# Files involved in the wait
RESTORE_COMPLETED_FILE="/tmp/openclaw-restore-completed"
OPENCLAW_LOG_FILE="/var/log/hf-entrypoint/openclaw-gateway-stdout.log"
RESTORE_LOG_FILE="/var/log/openclaw/restore.log"

# Tunables (all in seconds)
PROGRESS_CHECK_INTERVAL=20 # interval between aggregated status lines
SLOW_WARN_THRESHOLD=900    # warn once the wait exceeds 15 minutes
MAX_WAIT_TIMEOUT=3600      # give up waiting after 1 hour and continue startup
IDLE_WARN_THRESHOLD=120    # warn when the log shows nothing new for 120 seconds

# Mutable state shared with show_restore_progress and the wait loop
WAITED=0
LAST_RESTORE_LINE=0
LAST_LOG_SIZE=0
TIME_NO_NEW_LOG=0

# Make sure the directory holding the restore log exists
mkdir -p "${RESTORE_LOG_FILE%/*}"
#######################################
# Print the restore-log lines that appeared since the previous call.
# Globals:
#   RESTORE_LOG_FILE   (read)    log file being followed
#   LAST_RESTORE_LINE  (updated) number of lines already printed
#   LAST_LOG_SIZE      (updated) last observed size of the log in bytes
#   TIME_NO_NEW_LOG    (reset to 0 whenever new lines are printed)
# Outputs:
#   A timestamped header plus every new line, each prefixed with a timestamp.
# Returns:
#   0; does nothing when the log file does not exist.
#######################################
show_restore_progress() {
  [[ -f "$RESTORE_LOG_FILE" ]] || return 0

  local current_lines
  current_lines=$(wc -l < "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
  # Normalize: strip padding some wc implementations add; fall back to 0.
  current_lines=${current_lines//[!0-9]/}
  current_lines=${current_lines:-0}

  # Fewer lines than already printed means the log was truncated or rotated;
  # restart from the top so the new content is not silently skipped.
  if [[ "$current_lines" -lt "$LAST_RESTORE_LINE" ]]; then
    LAST_RESTORE_LINE=0
  fi

  if [[ "$current_lines" -gt "$LAST_RESTORE_LINE" ]]; then
    local new_lines=$((current_lines - LAST_RESTORE_LINE))
    local first_line=$((LAST_RESTORE_LINE + 1))
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: --- Restore log ($new_lines new lines) ---"
    # Select the exact line range instead of `tail -n N`: if the file grows
    # between the count above and this read, tail would skip the oldest
    # unseen lines and print the newest ones twice on the next call.
    sed -n "${first_line},${current_lines}p;${current_lines}q" "$RESTORE_LOG_FILE" |
      while IFS= read -r line; do
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] $line"
      done
    LAST_RESTORE_LINE="$current_lines"
    TIME_NO_NEW_LOG=0
  fi

  # Track the log file size (bytes) as an activity indicator.
  local current_size
  current_size=$(stat -c%s "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
  LAST_LOG_SIZE="$current_size"
}
# Poll every 2 seconds until the restore-completed marker file appears or
# MAX_WAIT_TIMEOUT seconds have elapsed, echoing restore-log output meanwhile.
while true; do
  # Print diagnostics on the first pass through the loop
  if [[ $WAITED -eq 0 ]]; then
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: waiting for restore completion..."
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: RESTORE_LOG_FILE=$RESTORE_LOG_FILE"
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: RESTORE_COMPLETED_FILE=$RESTORE_COMPLETED_FILE"
  fi

  # Every iteration: show any new restore-log lines
  show_restore_progress

  # Safety net: stop waiting once MAX_WAIT_TIMEOUT is reached and carry on
  if [[ $WAITED -ge $MAX_WAIT_TIMEOUT ]]; then
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ⚠ WARNING: Restore timed out after ${WAITED}s ($((WAITED / 60))min), forcing proceed"
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ⚠ The restore process may still be running in the background"
    break
  fi

  # Completion is signalled solely by the marker file
  if [[ -f "$RESTORE_COMPLETED_FILE" ]]; then
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ✓ Restore completed"
    # Show the tail of the restore log
    if [[ -f "$RESTORE_LOG_FILE" ]]; then
      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: --- Final restore log (last 10 lines) ---"
      tail -n 10 "$RESTORE_LOG_FILE" | while IFS= read -r line; do
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] $line"
      done
    fi
    break
  fi

  sleep 2
  WAITED=$((WAITED + 2))

  # Accumulate how long the log has been idle (size unchanged across the sleep)
  if [[ -f "$RESTORE_LOG_FILE" ]]; then
    current_size=$(stat -c%s "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
    if [[ "$current_size" -eq "$LAST_LOG_SIZE" ]]; then
      TIME_NO_NEW_LOG=$((TIME_NO_NEW_LOG + 2))
      # Idle warning: fires when the idle time first reaches
      # IDLE_WARN_THRESHOLD and again once per further threshold period
      # (the "< 2" tolerates thresholds that are not a multiple of the 2s step).
      if [[ ${IDLE_WARN_THRESHOLD:-0} -gt 0 ]] \
        && [[ $TIME_NO_NEW_LOG -ge $IDLE_WARN_THRESHOLD ]] \
        && [[ $((TIME_NO_NEW_LOG % IDLE_WARN_THRESHOLD)) -lt 2 ]]; then
        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ⚠ WARNING: restore log has had no new output for ${TIME_NO_NEW_LOG}s"
      fi
    else
      TIME_NO_NEW_LOG=0
    fi
  fi

  # Emit an aggregated status line every PROGRESS_CHECK_INTERVAL seconds
  if [[ $((WAITED % PROGRESS_CHECK_INTERVAL)) -eq 0 ]]; then
    elapsed_min=$((WAITED / 60))
    log_size_str=""
    if [[ -f "$RESTORE_LOG_FILE" ]]; then
      file_size=$(stat -c%s "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
      # Human-readable size with one decimal digit, using integer arithmetic
      if [[ $file_size -ge 1048576 ]]; then
        log_size_str="$((file_size / 1048576)).$(( (file_size % 1048576) * 10 / 1048576 ))MB"
      elif [[ $file_size -ge 1024 ]]; then
        log_size_str="$((file_size / 1024)).$(( (file_size % 1024) * 10 / 1024 ))KB"
      else
        log_size_str="${file_size}B"
      fi
    fi
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: still waiting for restore... (${WAITED}s / ${elapsed_min}min, log: ${log_size_str:-N/A})"
    # Past SLOW_WARN_THRESHOLD, add a warning once per minute
    if [[ $WAITED -ge $SLOW_WARN_THRESHOLD ]] && [[ $((WAITED % 60)) -eq 0 ]]; then
      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ⚠ WARNING: Restore is taking longer than expected (>${elapsed_min}min). Large backup (>10GB) may require more time."
    fi
    show_restore_progress
  fi
done
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ✓ Restore wait completed (${WAITED}s / $((WAITED / 60))min), proceeding with PM2 startup"
# 1.3 Make sure the cron daemon is running; start it when no process named
# exactly "cron" is found.
pgrep -x cron >/dev/null 2>&1 || {
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting cron daemon..."
  /usr/sbin/cron
}
# ============================================
# 2. Start PM2 for any additional node processes (when needed)
# ============================================
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting PM2 for others..."
mkdir -p /root/.pm2 /var/log/hf-entrypoint
# PM2 is launched only when the ecosystem file contains a quoted "name" key;
# a missing or unreadable file is treated the same as "no applications".
if ! grep -qE '"name"\s*:' /app/pm2/ecosystem.config.js 2>/dev/null; then
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: PM2: no applications defined in ecosystem.config.js, skipping..."
  PM2_PID=""
else
  /usr/bin/pm2-runtime /app/pm2/ecosystem.config.js \
    >> /var/log/hf-entrypoint/pm2-stdout.log \
    2>> /var/log/hf-entrypoint/pm2-stderr.log &
  PM2_PID=$!
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: PM2 started (pid=$PM2_PID)"
fi
# ============================================
# 3. 信号转发(确保 PID 1 的 SIGTERM 能传到 supervisord)
# ============================================
#######################################
# Ask every service started by this script to terminate.
# Globals (read):
#   SSH_WATCHDOG_PID  optional; skipped when unset or empty
#   SUPERVISORD_PID   pid of the supervisord launched by this script
#   PM2_PID           pid of pm2-runtime, or empty when PM2 was skipped
# Outputs: progress messages on stdout
# All kill/killall failures are ignored on purpose (best-effort shutdown).
#######################################
signal_handler() {
  local watchdog_pid="${SSH_WATCHDOG_PID:-}"
  local target_pid

  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: received SIGTERM, forwarding to all processes..."

  # Stop the SSH watchdog first
  if [[ -n "$watchdog_pid" ]]; then
    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: stopping SSH watchdog (pid=$SSH_WATCHDOG_PID)..."
    kill -TERM "$watchdog_pid" 2>/dev/null || true
  fi

  # Then supervisord, then PM2 (an empty PM2_PID just makes kill fail quietly)
  for target_pid in "$SUPERVISORD_PID" "$PM2_PID"; do
    kill -TERM "$target_pid" 2>/dev/null || true
  done

  # Finally the SSH daemon itself
  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: stopping SSH service..."
  if pgrep -x "sshd" > /dev/null 2>&1; then
    killall sshd 2>/dev/null || true
  fi

  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: all services stopped"
}
# ============================================
# 5. Run node hf-server.js and forward termination signals
# ============================================
# `exec node ...` would replace this shell, and a replaced shell's traps are
# discarded, so signal_handler could never run. Node is therefore started as a
# child: this shell stays alive, keeps the trap, forwards the signal to the
# services and to node, and finally exits with node's exit status.
# Note: a background job in a non-interactive shell has /dev/null as stdin.
NODE_PID=""

# Trap target: stop the auxiliary services, then ask node to terminate.
_forward_termination() {
  signal_handler
  if [[ -n "$NODE_PID" ]]; then
    kill -TERM "$NODE_PID" 2>/dev/null || true
  fi
}
trap _forward_termination TERM INT QUIT

echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting node server.js..."
cd /app
node hf-server.js &
NODE_PID=$!

# `wait` returns early (status > 128) when a trapped signal arrives, so keep
# waiting until node has really exited and its status has been collected.
_node_status=0
wait "$NODE_PID" || _node_status=$?
while kill -0 "$NODE_PID" 2>/dev/null; do
  wait "$NODE_PID" || _node_status=$?
done
exit "$_node_status"