Spaces:

GGSheng
/

test

Paused

App Files Files Community

test / scripts /hf-entrypoint.sh

GGSheng

feat: deploy Gemma 4 to hf space

55ba4a3 verified 9 days ago

raw

history blame contribute delete

13 kB

	#!/usr/bin/env bash
	# hf-entrypoint.sh - container entrypoint for HF Spaces.
	#
	# Optional environment:
	#   SUPERVISORD_READY_TIMEOUT  Seconds to wait for /var/run/supervisord.pid
	#                              before continuing without it (default: 60).
	set -euo pipefail

	echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting..."

	# ============================================
	# 0. Save environment variables to /etc/profile.d for later bash sessions
	# ============================================
	if [[ -x /usr/local/bin/save-env.sh ]]; then
	  /usr/local/bin/save-env.sh
	else
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: warning: save-env.sh not found, skipping env export"
	fi

	# Load the previously saved environment variables.
	if [[ -f /etc/profile.d/openclaw-env.sh ]]; then
	  # shellcheck source=/dev/null
	  source /etc/profile.d/openclaw-env.sh
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: loaded environment from /etc/profile.d/openclaw-env.sh"
	fi

	# ============================================
	# 1. Start supervisord (manages cron + openclaw-gateway)
	# ============================================
	echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting supervisord..."
	mkdir -p /var/run /var/log/supervisor /var/log/hf-entrypoint
	/usr/bin/supervisord -c /etc/supervisor/supervisord.conf \
	  >> /var/log/hf-entrypoint/supervisord-stdout.log \
	  2>> /var/log/hf-entrypoint/supervisord-stderr.log &
	SUPERVISORD_PID=$!
	echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: supervisord started (pid=$SUPERVISORD_PID)"

	# Wait for supervisord's pid file, but never forever: an unbounded loop here
	# would hang the whole entrypoint if supervisord fails to come up.
	SUPERVISORD_READY_TIMEOUT="${SUPERVISORD_READY_TIMEOUT:-60}"
	if [[ ! "$SUPERVISORD_READY_TIMEOUT" =~ ^[0-9]+$ ]]; then
	  SUPERVISORD_READY_TIMEOUT=60
	fi
	# One poll every 0.5s; 10# forces base 10 so values like "08" are accepted.
	_supervisord_polls_left=$(( 10#$SUPERVISORD_READY_TIMEOUT * 2 ))
	while [[ ! -f /var/run/supervisord.pid ]] && (( _supervisord_polls_left > 0 )); do
	  sleep 0.5
	  _supervisord_polls_left=$(( _supervisord_polls_left - 1 ))
	done

	if [[ -f /var/run/supervisord.pid ]]; then
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: supervisord is ready"
	else
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: /var/run/supervisord.pid did not appear within ${SUPERVISORD_READY_TIMEOUT}s, continuing anyway"
	fi

	# 1.0 Start the SSH service and its watchdog (keeps SSH continuously available)
	echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting SSH service and watchdog..."

	# Set the root password when the ROOT_PASSWORD environment variable is provided.
	if [[ -n "${ROOT_PASSWORD:-}" ]]; then
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: setting root password..."
	  # Test the pipeline directly: under `set -e -o pipefail` a separate `$?`
	  # check is never reached on failure because the script has already exited.
	  if printf 'root:%s\n' "${ROOT_PASSWORD}" | chpasswd 2>/dev/null; then
	    # Make sure the root account is not locked (best effort).
	    passwd -u root 2>/dev/null || true
	    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: root password set successfully"
	  else
	    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: failed to set root password"
	  fi
	fi

	# Create the sshd privilege-separation directories and clear stale PID files
	# and sockets. All of this is best effort, hence the `|| true` guards.
	echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: preparing SSH runtime directories..."
	mkdir -p /run/sshd /var/run/sshd 2>/dev/null || true
	chmod 755 /run/sshd /var/run/sshd 2>/dev/null || true
	rm -f /var/run/sshd.pid /var/run/sshd.init.pid /tmp/ssh-* 2>/dev/null || true

	# Generate SSH host keys when the RSA host key is missing.
	if [[ ! -f "/etc/ssh/ssh_host_rsa_key" ]]; then
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: generating SSH host keys..."
	  ssh-keygen -A 2>/dev/null || true
	fi

	# Make sure sshd is running.
	if ! pgrep -x "sshd" > /dev/null 2>&1; then
	  _sshd_bin=""
	  if [[ -x "/usr/sbin/sshd" ]]; then
	    _sshd_bin="/usr/sbin/sshd"
	  elif [[ -x "/usr/bin/sshd" ]]; then
	    _sshd_bin="/usr/bin/sshd"
	  fi

	  if [[ -n "$_sshd_bin" ]]; then
	    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting sshd from $_sshd_bin..."
	    # A non-zero exit must not abort the entrypoint (set -e); the pgrep check
	    # below reports the outcome and the watchdog is expected to retry.
	    "$_sshd_bin" || true
	    sleep 2
	    if pgrep -x "sshd" > /dev/null 2>&1; then
	      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: sshd started successfully"
	      # Make sshd a less likely OOM-killer victim by lowering its oom_score_adj.
	      # The braces let 2>/dev/null also cover a failure to open the /proc file.
	      for _pid in $(pgrep -x "sshd" 2>/dev/null); do
	        { echo -500 > "/proc/${_pid}/oom_score_adj"; } 2>/dev/null || true
	      done
	      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: sshd OOM protection applied (oom_score_adj=-500)"
	      # Tune the kernel overcommit policy to reduce the chance of the OOM killer
	      # hitting critical services. Only report success when the write worked.
	      if { echo "2" > /proc/sys/vm/overcommit_memory; } 2>/dev/null; then
	        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: vm.overcommit_memory=2 set (never overcommit)"
	      else
	        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: could not set vm.overcommit_memory=2"
	      fi
	    else
	      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: sshd failed to start, will be handled by watchdog"
	    fi
	  else
	    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: sshd executable not found"
	  fi
	fi

	# Start the SSH watchdog (keeps the SSH service available).
	# SSH_WATCHDOG_DOCKER_LOG controls whether the watchdog output is redirected
	# to the container's stdout/stderr (default: false).
	SSH_WATCHDOG_DOCKER_LOG="${SSH_WATCHDOG_DOCKER_LOG:-false}"
	if [[ -x "/usr/local/bin/ssh_service_watchdog.sh" ]]; then
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting SSH watchdog (docker_log=$SSH_WATCHDOG_DOCKER_LOG)..."
	  if [[ "$SSH_WATCHDOG_DOCKER_LOG" == "true" ]]; then
	    # Send watchdog output to PID 1's stdout/stderr so it can be read with
	    # `docker logs` even when no terminal can connect.
	    nohup /usr/local/bin/ssh_service_watchdog.sh >> /proc/1/fd/1 2>> /proc/1/fd/2 &
	    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: SSH watchdog logs are available via: docker logs <container>"
	  else
	    # No redirection to Docker logs; the original note says the watchdog logs
	    # only to the local file /var/log/ssh_watchdog.log.
	    nohup /usr/local/bin/ssh_service_watchdog.sh > /dev/null 2>&1 &
	  fi
	  SSH_WATCHDOG_PID=$!
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: SSH watchdog started (pid=$SSH_WATCHDOG_PID)"
	else
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: WARNING: ssh_service_watchdog.sh not found"
	fi

	# 1.1 Start BT Panel (kicked off while the restore is still running, to save time)
	# ============================================
	echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting BT Panel..."
	if [[ -f "/www/server/panel/default.pl" ]]; then
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: BT Panel is installed, starting..."
	  # Best effort: a failing panel must not abort the entrypoint under set -e.
	  bt start 2>/dev/null || true
	  bt default 2>/dev/null || true
	else
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: BT Panel not installed, skipping"
	fi

	# 1.2 Wait for openclaw-gateway to finish its restore
	# ============================================
	echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: waiting for openclaw-gateway to complete restore..."

	RESTORE_COMPLETED_FILE="/tmp/openclaw-restore-completed"
	OPENCLAW_LOG_FILE="/var/log/hf-entrypoint/openclaw-gateway-stdout.log"
	RESTORE_LOG_FILE="/var/log/openclaw/restore.log"
	WAITED=0
	LAST_RESTORE_LINE=0
	LAST_LOG_SIZE=0
	PROGRESS_CHECK_INTERVAL=20
	SLOW_WARN_THRESHOLD=900 # warn once the restore has taken 15 minutes
	MAX_WAIT_TIMEOUT=3600 # wait at most 3600s (1 hour), then force startup to continue
	IDLE_WARN_THRESHOLD=120 # warn when the log has had no new content for 120s
	TIME_NO_NEW_LOG=0

	mkdir -p "$(dirname "$RESTORE_LOG_FILE")"

	#######################################
	# Print the restore-log lines that appeared since the previous call, each
	# prefixed with a UTC timestamp, and refresh the progress bookkeeping.
	# Does nothing when the log file does not exist.
	# Globals:
	#   RESTORE_LOG_FILE   (read)         path of the restore log
	#   LAST_RESTORE_LINE  (read/written) number of log lines already shown
	#   LAST_LOG_SIZE      (written)      last observed log size in bytes
	#   TIME_NO_NEW_LOG    (written)      reset to 0 whenever new lines are shown
	# Arguments: none
	# Outputs:   a header line plus the new log lines on stdout
	# Returns:   0
	#######################################
	show_restore_progress() {
	  if [[ ! -f "$RESTORE_LOG_FILE" ]]; then
	    return
	  fi

	  local current_lines
	  current_lines=$(wc -l < "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
	  # Normalise to a plain integer (some wc implementations pad with spaces).
	  current_lines=$(( current_lines + 0 ))

	  # The log shrank (truncated or rotated): start over from its first line
	  # instead of staying silent until it outgrows the old count.
	  if (( current_lines < LAST_RESTORE_LINE )); then
	    LAST_RESTORE_LINE=0
	  fi

	  if (( current_lines > LAST_RESTORE_LINE )); then
	    local new_lines=$(( current_lines - LAST_RESTORE_LINE ))
	    local first_new=$(( LAST_RESTORE_LINE + 1 ))
	    local line
	    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: --- Restore log ($new_lines new lines) ---"
	    # Print exactly the counted range; `tail -n` could skip or repeat lines
	    # when the log grows between the count above and the read here.
	    while IFS= read -r line; do
	      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] $line"
	    done < <(sed -n "${first_new},${current_lines}p;${current_lines}q" "$RESTORE_LOG_FILE" 2>/dev/null)
	    LAST_RESTORE_LINE="$current_lines"
	    TIME_NO_NEW_LOG=0
	  fi

	  # Track log file size (bytes) as an activity indicator
	  local current_size
	  current_size=$(stat -c%s "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
	  if [[ "$current_size" != "$LAST_LOG_SIZE" ]]; then
	    LAST_LOG_SIZE="$current_size"
	  fi
	}

	# Poll until the restore-completed marker file appears or MAX_WAIT_TIMEOUT
	# seconds have passed, echoing new restore-log lines along the way.
	while true; do
	  # Print diagnostics on the first pass through the loop.
	  if [[ $WAITED -eq 0 ]]; then
	    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: waiting for restore completion..."
	    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: RESTORE_LOG_FILE=$RESTORE_LOG_FILE"
	    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: RESTORE_COMPLETED_FILE=$RESTORE_COMPLETED_FILE"
	  fi

	  # Every 2 seconds: fetch the latest log lines, then check for completion.
	  show_restore_progress

	  # Safety net: once the maximum wait is exceeded, continue startup anyway.
	  if [[ $WAITED -ge $MAX_WAIT_TIMEOUT ]]; then
	    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ⚠ WARNING: Restore timed out after ${WAITED}s ($((WAITED / 60))min), forcing proceed"
	    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ⚠ The restore process may still be running in the background"
	    break
	  fi

	  # Completion is signalled solely by the marker file.
	  if [[ -f "$RESTORE_COMPLETED_FILE" ]]; then
	    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ✓ Restore completed"
	    # Show the last few lines of the restore log.
	    if [[ -f "$RESTORE_LOG_FILE" ]]; then
	      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: --- Final restore log (last 10 lines) ---"
	      while IFS= read -r line; do
	        echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] $line"
	      done < <(tail -n 10 "$RESTORE_LOG_FILE" 2>/dev/null)
	    fi
	    break
	  fi

	  sleep 2
	  WAITED=$((WAITED + 2))

	  # Accumulate idle time while the log size stays unchanged. LAST_LOG_SIZE is
	  # the size recorded by show_restore_progress before the sleep.
	  if [[ -f "$RESTORE_LOG_FILE" ]]; then
	    current_size=$(stat -c%s "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
	    if [[ "$current_size" -eq "$LAST_LOG_SIZE" ]]; then
	      TIME_NO_NEW_LOG=$((TIME_NO_NEW_LOG + 2))
	    else
	      TIME_NO_NEW_LOG=0
	    fi
	  fi

	  # Print an aggregated status line every PROGRESS_CHECK_INTERVAL seconds.
	  if [[ $((WAITED % PROGRESS_CHECK_INTERVAL)) -eq 0 ]]; then
	    elapsed_min=$((WAITED / 60))
	    log_size_str=""
	    if [[ -f "$RESTORE_LOG_FILE" ]]; then
	      file_size=$(stat -c%s "$RESTORE_LOG_FILE" 2>/dev/null || echo "0")
	      # Human-readable size with one decimal place, using integer arithmetic.
	      if [[ $file_size -ge 1048576 ]]; then
	        log_size_str="$((file_size / 1048576)).$(( (file_size % 1048576) * 10 / 1048576 ))MB"
	      elif [[ $file_size -ge 1024 ]]; then
	        log_size_str="$((file_size / 1024)).$(( (file_size % 1024) * 10 / 1024 ))KB"
	      else
	        log_size_str="${file_size}B"
	      fi
	    fi

	    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: still waiting for restore... (${WAITED}s / ${elapsed_min}min, log: ${log_size_str:-N/A})"

	    # Warn once per minute after the slow-restore threshold has passed.
	    if [[ $WAITED -ge $SLOW_WARN_THRESHOLD ]] && [[ $((WAITED % 60)) -eq 0 ]]; then
	      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ⚠ WARNING: Restore is taking longer than expected (>${elapsed_min}min). Large backup (>10GB) may require more time."
	    fi

	    # Idle warning: the idle time was tracked but never reported before.
	    if [[ $TIME_NO_NEW_LOG -ge ${IDLE_WARN_THRESHOLD:-120} ]]; then
	      echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ⚠ WARNING: restore log has had no new content for ${TIME_NO_NEW_LOG}s"
	    fi

	    show_restore_progress
	  fi
	done

	echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: ✓ Restore wait completed (${WAITED}s / $((WAITED / 60))min), proceeding with PM2 startup"

	# 1.2 Make sure a cron daemon is running
	if pgrep -x cron >/dev/null 2>&1; then
	  : # cron is already up; nothing to start
	else
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting cron daemon..."
	  /usr/sbin/cron
	fi

	# ============================================
	# 2. PM2-managed additional node processes (only when any are configured)
	# ============================================
	echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting PM2 for others..."
	mkdir -p /root/.pm2 /var/log/hf-entrypoint

	# The ecosystem file counts as non-empty when it contains at least one
	# `"name":` key; a missing or unreadable file is treated as empty.
	if ! grep -qE '"name"\s*:' /app/pm2/ecosystem.config.js 2>/dev/null; then
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: PM2: no applications defined in ecosystem.config.js, skipping..."
	  PM2_PID=""
	else
	  /usr/bin/pm2-runtime /app/pm2/ecosystem.config.js >> /var/log/hf-entrypoint/pm2-stdout.log 2>> /var/log/hf-entrypoint/pm2-stderr.log &
	  PM2_PID=$!
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: PM2 started (pid=$PM2_PID)"
	fi

	# ============================================
	# 3. Signal forwarding (make sure a SIGTERM sent to PID 1 reaches supervisord)
	# ============================================
	#######################################
	# Trap handler: send SIGTERM to the background services started by this
	# script and stop sshd. Every step is best effort so one failure cannot
	# prevent the remaining services from being signalled.
	# Globals:
	#   SSH_WATCHDOG_PID  (read) pid of the SSH watchdog, may be unset
	#   SUPERVISORD_PID   (read) pid recorded when supervisord was launched
	#   PM2_PID           (read) pid of pm2-runtime, empty when PM2 was skipped
	# Arguments: none
	# Outputs:   progress messages on stdout
	# Returns:   0
	#######################################
	signal_handler() {
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: received SIGTERM, forwarding to all processes..."
	  # Stop the SSH watchdog.
	  if [[ -n "${SSH_WATCHDOG_PID:-}" ]]; then
	    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: stopping SSH watchdog (pid=$SSH_WATCHDOG_PID)..."
	    kill -TERM "$SSH_WATCHDOG_PID" 2>/dev/null || true
	  fi
	  # Stop supervisord.
	  if [[ -n "${SUPERVISORD_PID:-}" ]]; then
	    kill -TERM "$SUPERVISORD_PID" 2>/dev/null || true
	  fi
	  # Stop PM2; skipped when no PM2 process was started (empty pid).
	  if [[ -n "${PM2_PID:-}" ]]; then
	    kill -TERM "$PM2_PID" 2>/dev/null || true
	  fi
	  # Stop the SSH service.
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: stopping SSH service..."
	  if pgrep -x "sshd" > /dev/null 2>&1; then
	    killall sshd 2>/dev/null || true
	  fi
	  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: all services stopped"
	}

	# Pid of the node server; empty until it has been started.
	NODE_PID=""

	#######################################
	# Trap target: stop the auxiliary services via signal_handler, then pass
	# SIGTERM on to the node server if it is already running.
	# Globals:   NODE_PID (read)
	# Arguments: none
	#######################################
	forward_signal_to_node() {
	  signal_handler
	  if [[ -n "${NODE_PID}" ]]; then
	    kill -TERM "${NODE_PID}" 2>/dev/null || true
	  fi
	}

	trap forward_signal_to_node TERM INT QUIT

	# ============================================
	# 5. Start node hf-server.js as a supervised child of this shell
	# ============================================
	# `exec node ...` would replace this shell and discard the trap installed
	# above, so signal_handler could never run once node was up. Run node in the
	# background and wait on it instead; this shell keeps its pid and its traps.
	# Note: as a background job in a non-interactive shell, node's stdin is
	# /dev/null.
	echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] hf-entrypoint: starting node server.js..."
	cd /app
	node hf-server.js &
	NODE_PID=$!

	# `wait` returns early (status > 128) when a trapped signal arrives, so keep
	# waiting until node has really exited, then propagate its exit status.
	node_status=0
	while true; do
	  if wait "${NODE_PID}"; then
	    node_status=0
	  else
	    node_status=$?
	  fi
	  kill -0 "${NODE_PID}" 2>/dev/null || break
	done
	exit "${node_status}"