OpenClaw 运维与故障修复
概述
OpenClaw 日常运维操作、备份恢复、故障修复和高可用管理指南。
日常运维
服务管理
# Option 1: systemd - control the openclaw-server unit.
systemctl start openclaw-server
systemctl stop openclaw-server
systemctl restart openclaw-server
systemctl status openclaw-server
# Option 2: the bundled control script, same four actions.
/opt/openclaw/bin/openclaw-server.sh start
/opt/openclaw/bin/openclaw-server.sh stop
/opt/openclaw/bin/openclaw-server.sh restart
/opt/openclaw/bin/openclaw-server.sh status
# Option 3: Docker Compose - run from the directory containing the compose file.
docker-compose start
docker-compose stop
docker-compose restart
docker-compose ps
# Option 4: Kubernetes - rolling restart of the server deployment, and scaling
# the worker deployment to 5 replicas (both in the "openclaw" namespace).
kubectl rollout restart deployment/openclaw-server -n openclaw
kubectl scale deployment/openclaw-worker --replicas=5 -n openclaw
健康检查
# Query the local server's health endpoint and pretty-print the JSON reply.
curl --silent http://localhost:8080/api/health | jq '.'
# Same query against the cluster-wide health endpoint.
curl --silent http://localhost:8080/api/cluster/health | jq '.'
# Automated health-check script (intended for unattended use, e.g. cron).
#!/bin/bash
# Probe the health endpoint and keep only the HTTP status code.
# --max-time bounds the probe so an unresponsive server cannot hang the check;
# when the connection fails or times out curl still prints "000" through -w,
# so `|| true` only discards curl's non-zero exit status.
HEALTH=$(curl -s -o /dev/null --max-time 10 -w "%{http_code}" \
  http://localhost:8080/api/health || true)
if [[ "$HEALTH" != "200" ]]; then
  echo "OpenClaw 服务异常,状态码: $HEALTH"
  # Send the alert. -f turns HTTP error responses into a failure, -sS keeps
  # output quiet but still shows errors, and --max-time bounds the request.
  # A failed alert is reported on stderr instead of being silently lost.
  curl -fsS --max-time 10 -X POST https://webhook.example.com/alert \
    -H "Content-Type: application/json" \
    -d '{"message": "OpenClaw 服务异常"}' \
    || echo "告警发送失败" >&2
fi
日志管理
# Install a logrotate policy for the OpenClaw logs. The quoted 'EOF' delimiter
# makes the shell write the heredoc body verbatim (no expansion).
# Policy as written below: rotate daily, keep 30 rotations, compress rotated
# files but delay compression of the newest one, tolerate missing logs, skip
# empty logs, recreate the log as mode 644 owned by openclaw:openclaw, and
# reload the service after rotation (a failing reload is ignored via `|| true`).
cat > /etc/logrotate.d/openclaw << 'EOF'
/opt/openclaw/logs/*.log {
daily
rotate 30
compress
delaycompress
missingok
notifempty
create 644 openclaw openclaw
postrotate
systemctl reload openclaw-server > /dev/null 2>&1 || true
endscript
}
EOF
# Manual cleanup: delete rotated logs (*.log.*) last modified more than 30 days
# ago, and per-task logs last modified more than 7 days ago.
find /opt/openclaw/logs -name "*.log.*" -mtime +30 -delete
find /opt/openclaw/logs/tasks -name "*.log" -mtime +7 -delete
# Archive the whole log directory into a date-stamped tarball under /backup.
tar -czf /backup/openclaw-logs-$(date +%Y%m%d).tar.gz /opt/openclaw/logs/
备份与恢复
数据库备份
# Full backup of the "openclaw" database into a timestamped SQL file.
# -p without a value makes mysqldump prompt for the password.
# --single-transaction dumps from one consistent snapshot (transactional
# engines such as InnoDB) instead of locking tables; --routines and --triggers
# include stored routines and triggers in the dump.
mysqldump -h localhost -u openclaw -p \
  --single-transaction \
  --routines \
  --triggers \
  openclaw > /backup/openclaw_$(date +%Y%m%d_%H%M%S).sql
# Compressed backup: same dump options as the full backup above so both
# variants produce equivalent content, streamed through gzip.
mysqldump -h localhost -u openclaw -p \
  --single-transaction \
  --routines \
  --triggers \
  openclaw | gzip > /backup/openclaw_$(date +%Y%m%d).sql.gz
# Scheduled backup script (e.g. run from cron).
#!/bin/bash
# Required environment:
#   MYSQL_PASSWORD - password of the "openclaw" MySQL user.
#
# Abort on any error, unset variable, or failure of any pipeline stage, so a
# failed mysqldump can never prune old backups or be reported as a success.
set -euo pipefail
: "${MYSQL_PASSWORD:?MYSQL_PASSWORD must be set}"

BACKUP_DIR="/backup/mysql"
DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="$BACKUP_DIR/openclaw_$DATE.sql.gz"
mkdir -p "$BACKUP_DIR"

# Hand the password to mysqldump through a private option file rather than
# -p<password>, which would expose it in the process list.
# NOTE: a password containing '"' or '\' needs escaping in the option file.
DEFAULTS_FILE=$(mktemp)
trap 'rm -f -- "$DEFAULTS_FILE" "$BACKUP_FILE.tmp"' EXIT
printf '[client]\npassword="%s"\n' "$MYSQL_PASSWORD" > "$DEFAULTS_FILE"

# Dump into a temporary name first; the final name only appears once the dump
# has completed, so a partial file is never mistaken for a valid backup.
# --defaults-extra-file must be the first option on the command line.
mysqldump --defaults-extra-file="$DEFAULTS_FILE" -h localhost -u openclaw \
  --single-transaction \
  openclaw | gzip > "$BACKUP_FILE.tmp"
mv -- "$BACKUP_FILE.tmp" "$BACKUP_FILE"

# Remove backups older than 7 days (only reached after a successful dump).
find "$BACKUP_DIR" -name "openclaw_*.sql.gz" -mtime +7 -delete
echo "备份完成: $BACKUP_FILE"
数据库恢复
# Stop the service before touching the database.
systemctl stop openclaw-server
# Restore from a gzip-compressed dump: decompress to stdout and pipe into mysql
# (-p prompts for the password). The file name is an example.
gunzip < /backup/openclaw_20240115.sql.gz | mysql -h localhost -u openclaw -p openclaw
# Alternative to the command above: restore directly from an uncompressed dump.
# Run one of the two restore commands, not both.
mysql -h localhost -u openclaw -p openclaw < /backup/openclaw_20240115.sql
# Start the service again.
systemctl start openclaw-server
配置备份
# Back up the configuration directory and the OpenClaw systemd unit files into
# a date-stamped tarball under /backup.
tar -czf /backup/openclaw-config-$(date +%Y%m%d).tar.gz \
/opt/openclaw/conf/ \
/etc/systemd/system/openclaw*.service
# Docker deployments: back up the compose file, the .env file and the config
# directory instead.
tar -czf /backup/openclaw-docker-config-$(date +%Y%m%d).tar.gz \
/opt/openclaw/docker-compose.yml \
/opt/openclaw/.env \
/opt/openclaw/config/
完整备份
#!/bin/bash
# full_backup.sh - full backup script
#
# Backs up the MySQL database, the configuration directory and the Redis RDB
# snapshot into /backup/openclaw/<YYYYMMDD>, then writes a manifest file.
# Required environment:
#   DB_PASSWORD - password of the "openclaw" MySQL user.

# Stop at the first failing step (including any stage of a pipeline) so that
# an incomplete backup is never followed by a "completed" message.
set -euo pipefail
: "${DB_PASSWORD:?DB_PASSWORD must be set}"

BACKUP_DIR="/backup/openclaw/$(date +%Y%m%d)"
mkdir -p "$BACKUP_DIR"
echo "开始备份..."

# 1. Database backup.
echo "备份数据库..."
# Pass the password through a private option file instead of -p<password>,
# which would expose it in the process list.
# NOTE: a password containing '"' or '\' needs escaping in the option file.
DEFAULTS_FILE=$(mktemp)
trap 'rm -f -- "$DEFAULTS_FILE"' EXIT
printf '[client]\npassword="%s"\n' "$DB_PASSWORD" > "$DEFAULTS_FILE"
mysqldump --defaults-extra-file="$DEFAULTS_FILE" -h localhost -u openclaw \
  --single-transaction openclaw | gzip > "$BACKUP_DIR/database.sql.gz"

# 2. Configuration backup.
echo "备份配置文件..."
tar -czf "$BACKUP_DIR/config.tar.gz" /opt/openclaw/conf/

# 3. Redis backup (if needed).
echo "备份 Redis..."
redis-cli -h localhost BGSAVE
# BGSAVE returns immediately; wait (up to 300 s) until the background save is
# no longer in progress instead of assuming it finishes within a fixed delay.
BGSAVE_DONE=0
PERSISTENCE=""
for ((i = 0; i < 300; i++)); do
  PERSISTENCE=$(redis-cli -h localhost INFO persistence)
  if [[ "$PERSISTENCE" == *"rdb_bgsave_in_progress:0"* ]]; then
    BGSAVE_DONE=1
    break
  fi
  sleep 1
done
if (( BGSAVE_DONE == 0 )) || [[ "$PERSISTENCE" != *"rdb_last_bgsave_status:ok"* ]]; then
  echo "Redis BGSAVE 未成功完成" >&2
  exit 1
fi
cp /var/lib/redis/dump.rdb "$BACKUP_DIR/redis.rdb"

# 4. Backup manifest.
echo "生成备份清单..."
# Look the version up first, with a timeout, so an unreachable server cannot
# stall the backup; fall back to "unknown" if the lookup fails.
VERSION=$(curl -s --max-time 5 http://localhost:8080/api/version | jq -r .version) \
  || VERSION="unknown"
cat > "$BACKUP_DIR/manifest.txt" << EOF
备份时间: $(date)
数据库: database.sql.gz
配置: config.tar.gz
Redis: redis.rdb
OpenClaw 版本: $VERSION
EOF
echo "备份完成: $BACKUP_DIR"
故障修复
服务无法启动
端口占用
# Check what is listening on the service ports. The pattern is anchored on
# ":<port>" followed by whitespace so that unrelated numbers (e.g. port 18080
# or a PID of 8080) do not match.
netstat -tlnp | grep -E ":(8080|9090)[[:space:]]"
lsof -i :8080
# Stop the process that is LISTENING on the port. -sTCP:LISTEN keeps lsof from
# also returning clients that merely have a connection to port 8080, and
# xargs -r skips the kill entirely when nothing is listening.
# Send SIGTERM first so the process can shut down cleanly.
lsof -t -i TCP:8080 -sTCP:LISTEN | xargs -r kill
# Give it a few seconds; force-kill only if something is still listening.
sleep 5
lsof -t -i TCP:8080 -sTCP:LISTEN | xargs -r kill -9
# Alternatively, change the configuration to use a different port.
vim /opt/openclaw/conf/application.yml
# Edit server.port
配置错误
# Validate the configuration using the control script's "validate" action.
/opt/openclaw/bin/openclaw-server.sh validate
# Show the last 100 lines of the server log to see the startup error.
tail -100 /opt/openclaw/logs/openclaw-server.log
# Common configuration problems:
# 1. Malformed YAML - check the indentation
# 2. Wrong database connection string
# 3. Environment variable not set
权限问题
# Fix ownership of the installation tree and the modes of scripts and configs.
# NOTE(review): 644 leaves the YAML configs world-readable; if they hold
# credentials a tighter mode may be preferable - confirm before applying.
chown -R openclaw:openclaw /opt/openclaw
chmod 755 /opt/openclaw/bin/*.sh
chmod 644 /opt/openclaw/conf/*.yml
# Fix ownership and mode of the log directory.
chown -R openclaw:openclaw /opt/openclaw/logs
chmod 755 /opt/openclaw/logs
数据库问题修复
连接失败
# Test the connection as the application user (-p prompts for the password).
mysql -h localhost -u openclaw -p -e "SELECT 1"
# Reset the application user's password as root. The quoted 'EOF' delimiter
# passes the SQL verbatim; 'new_password' looks like a placeholder - replace it.
mysql -u root -p << 'EOF'
ALTER USER 'openclaw'@'%' IDENTIFIED BY 'new_password';
FLUSH PRIVILEGES;
EOF
# Show the privileges currently granted to the application user.
mysql -u root -p -e "SHOW GRANTS FOR 'openclaw'@'%'"
# Re-grant all privileges on the openclaw schema to the application user.
mysql -u root -p << 'EOF'
GRANT ALL PRIVILEGES ON openclaw.* TO 'openclaw'@'%';
FLUSH PRIVILEGES;
EOF
表损坏修复
# Check all tables of the openclaw database.
mysqlcheck -u root -p --check openclaw
# Repair all tables of the openclaw database.
# NOTE(review): table repair is storage-engine dependent and is presumably not
# available for InnoDB tables - confirm the engine before relying on this.
mysqlcheck -u root -p --repair openclaw
# Repair individual tables (here: task and execution).
mysqlcheck -u root -p --repair openclaw task
mysqlcheck -u root -p --repair openclaw execution
数据不一致修复
# Clean up zombie tasks: mark as FAILED every task that is still RUNNING, whose
# worker is not among the ONLINE workers, and that has not been updated for
# more than one hour.
mysql -u root -p openclaw << 'EOF'
UPDATE task SET status = 'FAILED',
error_message = 'Worker offline - auto recovered'
WHERE status = 'RUNNING'
AND worker_id NOT IN (SELECT id FROM worker WHERE status = 'ONLINE')
AND update_time < DATE_SUB(NOW(), INTERVAL 1 HOUR);
EOF
# Reset stuck scheduling: put tasks that have been ASSIGNED without an update
# for more than 10 minutes back to PENDING and clear their worker assignment.
mysql -u root -p openclaw << 'EOF'
UPDATE task SET status = 'PENDING', worker_id = NULL
WHERE status = 'ASSIGNED'
AND update_time < DATE_SUB(NOW(), INTERVAL 10 MINUTE);
EOF
Redis 问题修复
连接问题
# Test connectivity to the local Redis instance with a PING.
redis-cli -h localhost ping
# Restart Redis via systemd.
systemctl restart redis
# Inspect the last 100 lines of the Redis server log.
tail -100 /var/log/redis/redis-server.log
内存问题
# Inspect memory usage.
redis-cli info memory
# Delete task-log keys to free memory. NOTE: this removes EVERY key matching
# the pattern, whether or not it has expired.
# xargs -r skips the delete when the scan finds no keys (otherwise `redis-cli
# del` would be run without arguments and fail); -n 500 deletes in batches so
# a very large key set does not become one oversized command line.
redis-cli --scan --pattern "openclaw:task:log:*" | xargs -r -n 500 redis-cli del
# Cap memory at 2gb and evict with the allkeys-lru policy once the cap is hit.
# NOTE(review): CONFIG SET changes the running instance only; mirror these
# values in the Redis configuration file if they must survive a restart.
redis-cli config set maxmemory 2gb
redis-cli config set maxmemory-policy allkeys-lru
数据恢复
# Restore from an RDB snapshot: stop Redis, put the backup in place of the
# data file, hand ownership to the redis user, then start Redis again.
# NOTE(review): if append-only persistence (AOF) is enabled, Redis presumably
# loads the AOF rather than dump.rdb on start - confirm the persistence mode.
systemctl stop redis
cp /backup/redis.rdb /var/lib/redis/dump.rdb
chown redis:redis /var/lib/redis/dump.rdb
systemctl start redis
# Verify the restore by listing the key counts per database.
redis-cli info keyspace
Worker 故障修复
Worker 无法注册
# Check that the worker host can reach the server on port 9090.
telnet openclaw-server 9090
nc -zv openclaw-server 9090
# Check the firewall: look for rules mentioning 9090 and list the open ports.
iptables -L -n | grep 9090
firewall-cmd --list-ports
# Open port 9090/tcp permanently and reload firewalld to apply it.
firewall-cmd --add-port=9090/tcp --permanent
firewall-cmd --reload
# Show the server/host/port related lines of the worker configuration.
grep -E "server|host|port" /opt/openclaw/conf/worker.yml
Worker 频繁离线
# Tune the heartbeat settings.
vim /opt/openclaw/conf/worker.yml
# Increase the heartbeat interval and timeout, for example
# (values presumably in milliseconds - confirm against the worker docs):
# heartbeat:
# interval: 10000
# timeout: 60000
# Check system resources of the worker processes.
# pgrep -d, joins the PIDs with commas, the list format top -p expects; with
# plain pgrep output several worker PIDs would be split into separate
# arguments and top would reject them.
top -p "$(pgrep -d, -f openclaw-worker)"
free -h
# Check network stability: send 100 pings and show the summary lines.
ping -c 100 openclaw-server | tail -n 5
Worker 任务堆积
# Show each worker's load: name, running task count and task limit, taken
# from the workers API response.
curl -s http://localhost:8080/api/workers | jq '.[] | {name, runningTasks, maxTasks}'
# Increase the worker thread count in the worker configuration, e.g.:
vim /opt/openclaw/conf/worker.yml
# threads: 16
# Or scale out: run 5 instances of the openclaw-worker compose service.
docker-compose up -d --scale openclaw-worker=5
任务故障修复
批量重试失败任务
# Retry through the API: request a batch retry of FAILED tasks in the given
# time window. TOKEN must hold a bearer token; the dates are examples.
curl -X POST http://localhost:8080/api/tasks/batch-retry \
-H "Content-Type: application/json" \
-H "Authorization: Bearer ${TOKEN}" \
-d '{"status": "FAILED", "startTime": "2024-01-15T00:00:00", "endTime": "2024-01-15T23:59:59"}'
# Reset directly in the database: put FAILED tasks created in the window back
# to PENDING, zero their retry counter and clear the worker assignment.
mysql -u root -p openclaw << 'EOF'
UPDATE task SET status = 'PENDING', retry_count = 0, worker_id = NULL
WHERE status = 'FAILED'
AND create_time BETWEEN '2024-01-15 00:00:00' AND '2024-01-15 23:59:59';
EOF
清理过期任务
# Purge execution history: delete execution rows created more than 30 days ago.
# NOTE(review): this is a single unbatched DELETE; on a large table it may run
# for a long time - consider a quiet period.
mysql -u root -p openclaw << 'EOF'
DELETE FROM execution
WHERE create_time < DATE_SUB(NOW(), INTERVAL 30 DAY);
EOF
# Delete task log files last modified more than 7 days ago.
find /opt/openclaw/logs/tasks -name "*.log" -mtime +7 -delete
终止卡住的任务
# Kill a single task through the API (12345 is an example task id). TOKEN must
# hold a bearer token.
curl -X POST http://localhost:8080/api/tasks/12345/kill \
-H "Authorization: Bearer ${TOKEN}"
# Kill several tasks in one request by listing their ids in the JSON body.
curl -X POST http://localhost:8080/api/tasks/batch-kill \
-H "Content-Type: application/json" \
-H "Authorization: Bearer ${TOKEN}" \
-d '{"taskIds": ["12345", "12346", "12347"]}'
高可用运维
主备切换
# Ask the cluster API which node is currently the leader and pretty-print it.
curl --silent http://localhost:8080/api/cluster/leader | jq '.'
# 手动切换主节点(计划维