
摘要: 本文分享 5 个生产环境验证的 Bash 脚本模板,涵盖系统巡检、日志分析、自动备份、监控告警、批量部署场景,包含完整代码和配置说明,可直接复用。
一、系统巡检脚本 #
1.1 功能说明 #
- 检查 CPU、内存、磁盘使用率
- 检查关键服务状态
- 检查系统负载
- 生成巡检报告
1.2 脚本代码 #
#!/bin/bash
# system-health-check.sh - System health check script.
# Checks CPU / memory / disk usage, critical services and load average,
# sends mail alerts to $ALERT_EMAIL and writes a daily log/report.
#
# Strict mode: exit on error AND on unset variables AND on any failure
# inside a pipeline (the original only used -e).
set -euo pipefail

# --- Configuration ----------------------------------------------------------
LOG_FILE="/var/log/system-health-$(date +%Y%m%d).log"
ALERT_EMAIL="admin@example.com"
readonly CPU_THRESHOLD=80    # percent
readonly MEM_THRESHOLD=85    # percent
readonly DISK_THRESHOLD=90   # percent

# --- ANSI colors for terminal output ---------------------------------------
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# 日志函数
# Print a timestamped message to stdout and append it to $LOG_FILE.
# $1 - message text
log() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '%s\n' "[${stamp}] $1" | tee -a "$LOG_FILE"
}
# 告警函数
# Send an alert email via mail(1) and record the alert in the log.
# $1 - short alert title (also used in the mail subject)
# $2 - full message body
# NOTE(review): assumes a working local MTA for mail(1); delivery
# failures are not checked here — confirm on the target host.
alert() {
local subject="[ALERT] $(hostname) - $1"
local message="$2"
echo "$message" | mail -s "$subject" "$ALERT_EMAIL"
log "${RED}⚠️ 告警:$1${NC}"
}
# 检查 CPU 使用率
# Check current CPU usage against $CPU_THRESHOLD.
# Returns 0 when within the threshold, 1 after sending an alert otherwise.
check_cpu() {
log "检查 CPU 使用率..."
# Parse one batch iteration of top.
# NOTE(review): the field layout of top's "Cpu(s)" line varies with procps
# version and locale — on some systems $2 is the user% number, on others it
# is a different token. Verify on the target distribution.
local cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
cpu_usage=${cpu_usage%.*} # truncate to the integer part
if [ "$cpu_usage" -gt "$CPU_THRESHOLD" ]; then
alert "CPU 使用率过高" "CPU 使用率:${cpu_usage}%(阈值:${CPU_THRESHOLD}%)"
return 1
else
log "${GREEN}✅ CPU 使用率:${cpu_usage}%${NC}"
return 0
fi
}
# 检查内存使用率
# Check memory utilisation (used/total from `free`) against $MEM_THRESHOLD.
# Returns 0 when within the threshold, 1 after sending an alert otherwise.
check_memory() {
log "检查内存使用率..."
local mem_info=$(free | grep Mem)
# Columns 2 and 3 of the "Mem:" row are total and used.
local total=$(echo $mem_info | awk '{print $2}')
local used=$(echo $mem_info | awk '{print $3}')
# Integer percentage; bash arithmetic truncates toward zero.
local mem_usage=$((used * 100 / total))
if [ "$mem_usage" -gt "$MEM_THRESHOLD" ]; then
alert "内存使用率过高" "内存使用率:${mem_usage}%(阈值:${MEM_THRESHOLD}%)"
return 1
else
log "${GREEN}✅ 内存使用率:${mem_usage}%${NC}"
return 0
fi
}
# 检查磁盘使用率
# Check root filesystem usage against $DISK_THRESHOLD; alert when exceeded.
# Returns 0 when within the threshold, 1 otherwise.
check_disk() {
    log "检查磁盘使用率..."
    local disk_usage
    # Row 2, column 5 of `df -h /` is "NN%"; strip the percent sign.
    disk_usage=$(df -h / | awk 'NR==2 {print $5}' | cut -d'%' -f1)
    if [ "$disk_usage" -le "$DISK_THRESHOLD" ]; then
        log "${GREEN}✅ 磁盘使用率:${disk_usage}%${NC}"
        return 0
    fi
    alert "磁盘使用率过高" "磁盘使用率:${disk_usage}%(阈值:${DISK_THRESHOLD}%)"
    return 1
}
# 检查关键服务
# Verify that each critical service is active via systemctl.
# Returns the number of services that are NOT running (0 = all healthy).
check_services() {
    log "检查关键服务..."
    local services=("sshd" "nginx" "docker" "kubelet")
    local failed=0
    local service
    for service in "${services[@]}"; do
        if systemctl is-active --quiet "$service"; then
            log "${GREEN}✅ $service 运行正常${NC}"
        else
            log "${RED}❌ $service 未运行${NC}"
            alert "服务异常" "服务 $service 未运行"
            # BUGFIX: `((failed++))` evaluates to 0 on the first increment,
            # so `(( ))` returns status 1 and `set -e` kills the script on
            # the very first failed service. Plain assignment is safe.
            failed=$((failed + 1))
        fi
    done
    return "$failed"
}
# 检查系统负载
# Check the 1-minute load average normalised per CPU core.
# Alerts when load/core exceeds 2. Requires `bc` for the float comparison.
check_load() {
log "检查系统负载..."
# First comma-separated value after "load average:" = 1-minute average.
local load=$(uptime | awk -F'load average:' '{print $2}' | cut -d',' -f1 | tr -d ' ')
local cpu_count=$(nproc)
local load_per_cpu=$(echo "$load $cpu_count" | awk '{printf "%.2f", $1/$2}')
# bc prints 1 (true) or 0 (false); (( )) turns that into an exit status.
if (( $(echo "$load_per_cpu > 2" | bc -l) )); then
alert "系统负载过高" "平均负载:$load(CPU 核心数:$cpu_count,负载/核心:$load_per_cpu)"
return 1
else
log "${GREEN}✅ 系统负载:$load(负载/核心:$load_per_cpu)${NC}"
return 0
fi
}
# 生成巡检报告
# Write a plain-text snapshot report (host info, resource usage, service
# states, recent alert lines) to a timestamped file under /var/log.
generate_report() {
local report_file="/var/log/system-health-report-$(date +%Y%m%d-%H%M%S).txt"
# All output of this command group is redirected into the report file.
{
echo "================================"
echo "系统健康巡检报告"
echo "================================"
echo "时间:$(date '+%Y-%m-%d %H:%M:%S')"
echo "主机:$(hostname)"
echo "内核:$(uname -r)"
echo ""
echo "=== 资源使用 ==="
echo "CPU 使用率:$(top -bn1 | grep 'Cpu(s)' | awk '{print $2}')"
echo "内存使用:$(free -h | grep Mem | awk '{print $3 "/" $2}')"
echo "磁盘使用:$(df -h / | awk 'NR==2 {print $3 "/" $2}')"
echo "系统负载:$(uptime)"
echo ""
echo "=== 服务状态 ==="
# `|| true` keeps `set -e` from aborting when a service is inactive.
systemctl is-active sshd nginx docker kubelet 2>/dev/null || true
echo ""
echo "=== 最近告警 ==="
# Collect warning lines from today's (and any retained) health logs.
tail -20 /var/log/system-health-*.log | grep "⚠️" || echo "无告警"
} > "$report_file"
log "📄 巡检报告:$report_file"
}
# 主函数
# Run every health check, emit the report, and return the number of
# checks that failed (becomes the script's exit code).
main() {
    log "================================"
    log "开始系统健康检查"
    log "================================"
    local failed=0
    # BUGFIX: the original used `check_xxx || ((failed++))`; when failed is 0
    # the post-increment expression evaluates to 0, `(( ))` exits non-zero,
    # and `set -e` kills the script on the very first failed check.
    check_cpu || failed=$((failed + 1))
    check_memory || failed=$((failed + 1))
    check_disk || failed=$((failed + 1))
    check_services || failed=$((failed + 1))
    check_load || failed=$((failed + 1))
    generate_report
    log "================================"
    if [ "$failed" -eq 0 ]; then
        log "${GREEN}✅ 系统健康检查通过${NC}"
    else
        log "${RED}❌ 发现 $failed 项异常${NC}"
    fi
    log "================================"
    return "$failed"
}
# 执行
main
exit $?
1.3 配置定时任务 #
# 每天 9AM 执行
crontab -e
0 9 * * * /usr/local/bin/system-health-check.sh
二、日志分析脚本 #
2.1 功能说明 #
- 分析 Nginx/Apache 访问日志
- 统计 PV、UV、错误率
- 识别异常 IP
- 生成日报
2.2 脚本代码 #
#!/bin/bash
# log-analyzer.sh - Nginx access-log analysis script.
# Produces PV/UV counts, status-code distribution, error rate, top IPs and
# URLs, a suspicious-IP list, and an HTML report under $REPORT_DIR.
set -e
# NOTE(review): the awk field positions used below ($1 = client IP,
# $7 = request path, $9 = status) assume the default "combined" log format —
# verify against the nginx log_format directive in use.
LOG_FILE="/var/log/nginx/access.log"
REPORT_DIR="/var/log/nginx-reports"
DATE=$(date +%Y-%m-%d)
mkdir -p "$REPORT_DIR"
echo "================================"
echo "Nginx 日志分析报告"
echo "日期:$DATE"
echo "================================"
echo ""
# 1. Total requests (PV)
pv=$(wc -l < "$LOG_FILE")
echo "📊 总访问量 (PV): $pv"
# 2. Unique client IPs (UV)
uv=$(awk '{print $1}' "$LOG_FILE" | sort -u | wc -l)
echo "👥 独立 IP 数 (UV): $uv"
# 3. Status-code distribution
echo ""
echo "=== 状态码分布 ==="
awk '{print $9}' "$LOG_FILE" | sort | uniq -c | sort -rn | head -10
# 4. Failed requests (4xx + 5xx)
errors=$(awk '$9 ~ /^[45]/' "$LOG_FILE" | wc -l)
# BUGFIX: guard against an empty log — the original always piped
# "… / $pv" into bc, which prints a divide-by-zero error and leaves
# error_rate empty when the log has no lines.
if [ "$pv" -gt 0 ]; then
    error_rate=$(echo "scale=2; $errors * 100 / $pv" | bc)
else
    error_rate=0
fi
echo ""
echo "❌ 错误请求数:$errors"
echo "⚠️ 错误率:${error_rate}%"
# 5. Top 10 client IPs
echo ""
echo "=== Top 10 访问 IP ==="
awk '{print $1}' "$LOG_FILE" | sort | uniq -c | sort -rn | head -10
# 6. Top 10 requested URLs
echo ""
echo "=== Top 10 访问 URL ==="
awk '{print $7}' "$LOG_FILE" | sort | uniq -c | sort -rn | head -10
# 7. Suspicious IPs (more than 1000 requests)
echo ""
echo "=== 可疑 IP(访问>1000 次) ==="
awk '{print $1}' "$LOG_FILE" | sort | uniq -c | sort -rn | awk '$1 > 1000 {print $0}'
# 8. Generate the HTML report.
# The heredoc delimiter is intentionally unquoted so that $DATE, $pv, $uv,
# ${error_rate} and the embedded $( … ) awk pipeline are expanded while the
# template is written.
report_file="$REPORT_DIR/report-$DATE.html"
cat > "$report_file" << EOF
<!DOCTYPE html>
<html>
<head>
<title>Nginx 日志分析报告 - $DATE</title>
<style>
body { font-family: Arial, sans-serif; margin: 20px; }
table { border-collapse: collapse; width: 100%; }
th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
th { background-color: #4CAF50; color: white; }
tr:nth-child(even) { background-color: #f2f2f2; }
</style>
</head>
<body>
<h1>Nginx 日志分析报告</h1>
<p>日期:$DATE</p>
<h2>概览</h2>
<ul>
<li>PV: $pv</li>
<li>UV: $uv</li>
<li>错误率:${error_rate}%</li>
</ul>
<h2>Top 10 访问 IP</h2>
<table>
<tr><th>IP</th><th>访问次数</th></tr>
$(awk '{print $1}' "$LOG_FILE" | sort | uniq -c | sort -rn | head -10 | awk '{printf "<tr><td>%s</td><td>%s</td></tr>\n", $2, $1}')
</table>
</body>
</html>
EOF
echo ""
echo "📄 HTML 报告:$report_file"
三、自动备份脚本 #
3.1 功能说明 #
- 备份数据库(MySQL/PostgreSQL)
- 备份重要文件
- 压缩加密
- 上传到远程存储
- 清理旧备份
3.2 脚本代码 #
#!/bin/bash
# auto-backup.sh - Automated backup script: MySQL dumps + file archives,
# integrity verification, optional GPG encryption, rsync off-site copy,
# and retention-based cleanup.
# Required environment variables: MYSQL_PASSWORD, BACKUP_PASSWORD.
set -e

# --- Configuration ----------------------------------------------------------
BACKUP_DIR="/backup"
REMOTE_HOST="backup-server@example.com"
REMOTE_DIR="/backups"
RETENTION_DAYS=30
DATE=$(date +%Y%m%d_%H%M%S)
LOG_FILE="/var/log/backup-$DATE.log"
# BUGFIX: send_notification() mails "$ALERT_EMAIL", but the variable was
# never defined anywhere in this script — notifications silently went to an
# empty recipient.
ALERT_EMAIL="admin@example.com"

# What to back up
DB_NAMES=("app_db" "user_db")
FILE_DIRS=("/var/www" "/etc/nginx" "/home")

# Timestamped logger: echoes to stdout and appends to $LOG_FILE.
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}
# 备份 MySQL 数据库
# Dump each database in DB_NAMES to a gzip-compressed SQL file in BACKUP_DIR.
# Requires MYSQL_PASSWORD in the environment.
backup_mysql() {
log "开始备份 MySQL 数据库..."
for db in "${DB_NAMES[@]}"; do
backup_file="$BACKUP_DIR/mysql-${db}-${DATE}.sql.gz"
# NOTE(review): -p"$MYSQL_PASSWORD" exposes the password in the process
# list — prefer --defaults-extra-file. Also, without `pipefail` a failed
# mysqldump is masked by gzip's exit status; confirm this is acceptable.
mysqldump -u root -p"${MYSQL_PASSWORD}" "$db" | gzip > "$backup_file"
log "✅ 备份完成:$backup_file"
done
}
# 备份文件
# Archive each directory in FILE_DIRS into a dated .tar.gz under BACKUP_DIR.
backup_files() {
    log "开始备份文件..."
    local src
    for src in "${FILE_DIRS[@]}"; do
        dir_name=$(basename "$src")
        backup_file="$BACKUP_DIR/files-${dir_name}-${DATE}.tar.gz"
        # Best-effort: tar warnings (e.g. unreadable or changing files)
        # are deliberately ignored.
        tar -czf "$backup_file" "$src" 2>/dev/null || true
        log "✅ 备份完成:$backup_file"
    done
}
# 加密备份(可选)
# Symmetrically encrypt this run's backup artifacts with GPG and remove the
# plaintext originals. Requires BACKUP_PASSWORD in the environment.
encrypt_backup() {
log "加密备份文件..."
for file in "$BACKUP_DIR"/*-${DATE}.*; do
# Skip files that are already encrypted (idempotent on re-run).
if [[ ! "$file" =~ \.gpg$ ]]; then
# gpg writes "$file.gpg" alongside; the plaintext is then deleted.
gpg --symmetric --batch --passphrase "${BACKUP_PASSWORD}" "$file"
rm "$file"
log "🔒 已加密:${file}.gpg"
fi
done
}
# 上传到远程服务器
# Copy this run's backup artifacts to the remote backup host over SSH.
upload_remote() {
log "上传到远程服务器..."
# The unquoted glob is intentional: it expands to every artifact of this
# run (including .gpg files when encryption ran first).
rsync -avz -e ssh "$BACKUP_DIR"/*-${DATE}.* \
"$REMOTE_HOST:$REMOTE_DIR/"
log "✅ 上传完成"
}
# 清理旧备份
# Delete local and remote backups older than RETENTION_DAYS days.
cleanup_old() {
    log "清理 ${RETENTION_DAYS} 天前的备份..."
    # Local cleanup: only files matching the backup naming pattern.
    # BUGFIX: restrict both finds to regular files (-type f) so directories
    # are never removed.
    find "$BACKUP_DIR" -type f -name "*-*-*" -mtime +$RETENTION_DAYS -delete
    # Remote cleanup: same age policy; -type f keeps $REMOTE_DIR itself and
    # any subdirectories out of the deletion set (the original could delete
    # arbitrary old directories under the remote path).
    ssh "$REMOTE_HOST" "find $REMOTE_DIR -type f -mtime +$RETENTION_DAYS -delete"
    log "✅ 清理完成"
}
# 验证备份
# Verify gzip integrity of this run's SQL dumps; abort the backup run
# (exit 1, which also fires the ERR trap) on any corrupt archive.
verify_backup() {
    log "验证备份完整性..."
    local file
    for file in "$BACKUP_DIR"/*-${DATE}.sql.gz; do
        # BUGFIX: when nothing matches, the unexpanded glob string itself is
        # passed to gzip and verification always "failed". Skip non-matches.
        [ -e "$file" ] || continue
        if gzip -t "$file" 2>/dev/null; then
            log "✅ 验证通过:$file"
        else
            log "❌ 验证失败:$file"
            exit 1
        fi
    done
}
# 发送通知
# Email a backup status notification.
# $1 - status text interpolated into subject and body (e.g. "成功"/"失败").
# Relies on ALERT_EMAIL being provided by the configuration section or the
# environment.
send_notification() {
local status=$1
local subject="[备份${status}] $(hostname) - $(date +%Y-%m-%d)"
# The literal \n\n in $body becomes a blank line via `echo -e` below.
local body="备份${status}完成\n\n日志:$LOG_FILE"
echo -e "$body" | mail -s "$subject" "$ALERT_EMAIL"
}
# 主函数
# Orchestrate a full backup run. Order matters: dumps are verified while
# still plain gzip, then encrypted, then shipped off-site, then old copies
# are pruned. Any failing step triggers the ERR trap defined below.
main() {
log "================================"
log "开始自动备份"
log "================================"
# Ensure the local staging directory exists
mkdir -p "$BACKUP_DIR"
# Run the backup pipeline
backup_mysql
backup_files
verify_backup
encrypt_backup
upload_remote
cleanup_old
log "================================"
log "✅ 备份完成"
log "================================"
send_notification "成功"
}
# 错误处理
trap 'log "❌ 备份失败"; send_notification "失败"; exit 1' ERR
# 执行
main
四、监控告警脚本 #
4.1 功能说明 #
- 监控端口可用性
- 监控 URL 响应
- 监控进程状态
- 多渠道告警(邮件、钉钉、企业微信)
4.2 脚本代码 #
#!/bin/bash
# monitor-alert.sh - Monitoring script with multi-channel alerting
# (email, DingTalk and WeCom webhooks).
set -e
# --- Configuration ----------------------------------------------------------
# NOTE(review): the webhook tokens below are hardcoded placeholders; load
# real secrets from the environment or a root-only config file instead of
# keeping them in the script body.
ALERT_EMAIL="admin@example.com"
DINGTALK_WEBHOOK="https://oapi.dingtalk.com/robot/send?access_token=xxx"
WECHAT_WEBHOOK="https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxx"
# Print a message prefixed with a timestamp to stdout.
# $1 - message text
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}
# 邮件告警
# Send an alert email via mail(1).
# $1 - subject, $2 - message body
# NOTE(review): assumes a configured local MTA; mail errors are not checked.
send_email() {
local subject=$1
local message=$2
echo "$message" | mail -s "$subject" "$ALERT_EMAIL"
log "📧 邮件告警已发送:$subject"
}
# 钉钉告警
# Post a text alert to the DingTalk robot webhook.
# $1 - message text, embedded into the JSON payload.
# NOTE(review): $message is interpolated directly into the JSON body —
# double quotes or backslashes in it will break (or inject into) the
# payload. The literal \n is intended to render as a newline client-side.
send_dingtalk() {
local message=$1
curl -s "$DINGTALK_WEBHOOK" \
-H 'Content-Type: application/json' \
-d "{
\"msgtype\": \"text\",
\"text\": {
\"content\": \"🚨 告警通知\n$message\n时间:$(date '+%Y-%m-%d %H:%M:%S')\"
}
}"
log "📱 钉钉告警已发送"
}
# 企业微信告警
# Post a text alert to the WeCom (企业微信) robot webhook.
# $1 - message text, embedded into the JSON payload.
# NOTE(review): same caveat as send_dingtalk — $message goes into the JSON
# body unescaped; quotes/backslashes in it will break the payload.
send_wechat() {
local message=$1
curl -s "$WECHAT_WEBHOOK" \
-H 'Content-Type: application/json' \
-d "{
\"msgtype\": \"text\",
\"text\": {
\"content\": \"🚨 告警通知\n$message\n时间:$(date '+%Y-%m-%d %H:%M:%S')\"
}
}"
log "💬 企业微信告警已发送"
}
# 发送所有渠道告警
# Fan an alert out to every channel (email, DingTalk, WeCom).
# $1 - alert title, $2 - alert body
send_alert() {
    local title=$1 message=$2
    # Literal "\n" here — the webhook senders embed it in a JSON string,
    # where the client renders it as a real newline.
    local full_message="${title}\n${message}"
    send_email "[$title] $(hostname)" "$message"
    send_dingtalk "$full_message"
    send_wechat "$full_message"
}
# 监控端口
# Probe a TCP port with netcat (5-second timeout); alert when unreachable.
# $1 - host, $2 - port, $3 - human-readable service name
# Requires the `nc` binary on the monitoring host.
check_port() {
local host=$1
local port=$2
local name=$3
if nc -z -w 5 "$host" "$port" 2>/dev/null; then
log "✅ $name ($host:$port) 正常"
return 0
else
send_alert "端口不可用" "$name ($host:$port) 无法连接"
return 1
fi
}
# 监控 URL
# Fetch a URL (10-second timeout) and compare the HTTP status code.
# $1 - url, $2 - display name, $3 - expected status code (default 200)
check_url() {
local url=$1
local name=$2
local expected_code=${3:-200}
# curl prints "000" when the connection fails; `local x=$(…)` also masks
# curl's exit status, so `set -e` does not trigger here and the mismatch
# is reported through the alert branch below.
local status_code=$(curl -s -o /dev/null -w "%{http_code}" -m 10 "$url")
if [ "$status_code" = "$expected_code" ]; then
log "✅ $name ($url) 正常 (HTTP $status_code)"
return 0
else
send_alert "URL 异常" "$name ($url) 返回 HTTP $status_code (期望 $expected_code)"
return 1
fi
}
# 监控进程
# Check that at least one process with the exact name $1 is running.
# Alerts and returns 1 when no matching process exists.
check_process() {
    local proc_name=$1
    if ! pgrep -x "$proc_name" > /dev/null; then
        send_alert "进程异常" "进程 $proc_name 未运行"
        return 1
    fi
    log "✅ 进程 $proc_name 运行中"
    return 0
}
# 监控磁盘
# Check usage of one mount point against a threshold (default 90%).
# $1 - mount point, $2 - alert threshold in percent
check_disk() {
    local mount_point=$1
    local threshold=${2:-90}
    local usage
    # Row 2, column 5 of `df` output is "NN%"; strip the percent sign.
    usage=$(df "$mount_point" | awk 'NR==2 {print $5}' | cut -d'%' -f1)
    if [ "$usage" -ge "$threshold" ]; then
        send_alert "磁盘空间不足" "$mount_point 使用率 ${usage}%(阈值 ${threshold}%)"
        return 1
    fi
    log "✅ 磁盘 $mount_point 使用率 ${usage}%"
    return 0
}
# 主函数
# Run every monitoring probe and log a summary.
# Individual alerts are sent by the check_* helpers themselves.
main() {
    log "================================"
    log "开始监控检查"
    log "================================"
    local failed=0
    # BUGFIX: the original used `|| ((failed++))`; when failed is 0 the
    # post-increment expression evaluates to 0, `(( ))` exits non-zero, and
    # `set -e` aborts the whole run on the first unhealthy check.
    # Port probes
    check_port "localhost" "22" "SSH" || failed=$((failed + 1))
    check_port "localhost" "80" "Nginx" || failed=$((failed + 1))
    check_port "localhost" "3306" "MySQL" || failed=$((failed + 1))
    # URL probes
    check_url "https://example.com" "官网" "200" || failed=$((failed + 1))
    check_url "https://api.example.com/health" "API 健康检查" "200" || failed=$((failed + 1))
    # Process probes
    check_process "nginx" || failed=$((failed + 1))
    check_process "mysqld" || failed=$((failed + 1))
    check_process "dockerd" || failed=$((failed + 1))
    # Disk probes
    check_disk "/" "90" || failed=$((failed + 1))
    check_disk "/var/log" "85" || failed=$((failed + 1))
    log "================================"
    if [ "$failed" -eq 0 ]; then
        log "✅ 所有监控项正常"
    else
        log "❌ $failed 项异常,已发送告警"
    fi
    log "================================"
}
# 执行
main
五、批量部署脚本 #
5.1 功能说明 #
- 批量执行命令
- 并行部署
- 错误处理
- 进度显示
5.2 脚本代码 #
#!/bin/bash
# batch-deploy.sh - Batch deployment script: on each server it backs up the
# current release, pulls code, installs dependencies, builds, restarts the
# service, and runs a post-deploy health check.
set -e
# Server inventory as "name:ip" pairs
SERVERS=(
"web01:192.168.1.10"
"web02:192.168.1.11"
"web03:192.168.1.12"
"api01:192.168.1.20"
"api02:192.168.1.21"
)
# Deployment settings
DEPLOY_USER="deploy"
DEPLOY_KEY="$HOME/.ssh/deploy_key"
APP_DIR="/var/www/myapp"
BACKUP_DIR="/backup/myapp"
# Timestamped logger (stdout only)
log() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}
# 单台服务器部署
# Deploy the application to a single server and verify its health.
# $1 - server name (for logging), $2 - server IP
# Uses globals: DEPLOY_KEY, DEPLOY_USER, APP_DIR, BACKUP_DIR.
# Returns 0 on success, 1 when the post-deploy health check never passes.
deploy_server() {
local name=$1
local ip=$2
log "🚀 开始部署:$name ($ip)"
# SSH options. Deliberately used unquoted at each call site so the words
# split into separate arguments; note this assigns a GLOBAL, not a local.
SSH_OPTS="-i $DEPLOY_KEY -o StrictHostKeyChecking=no -o ConnectTimeout=10"
# 1. Back up the current release. $APP_DIR/$BACKUP_DIR and the $(date …)
# timestamp all expand on the control host before the command is sent.
ssh $SSH_OPTS "$DEPLOY_USER@$ip" "
if [ -d '$APP_DIR' ]; then
cp -r '$APP_DIR' '$BACKUP_DIR/backup-$(date +%Y%m%d_%H%M%S)'
echo '✅ 备份完成'
fi
"
# 2. Pull the latest code
ssh $SSH_OPTS "$DEPLOY_USER@$ip" "
cd '$APP_DIR'
git pull origin main
echo '✅ 代码更新完成'
"
# 3. Install dependencies
ssh $SSH_OPTS "$DEPLOY_USER@$ip" "
cd '$APP_DIR'
npm ci --production
echo '✅ 依赖安装完成'
"
# 4. Build
ssh $SSH_OPTS "$DEPLOY_USER@$ip" "
cd '$APP_DIR'
npm run build
echo '✅ 构建完成'
"
# 5. Restart the service (the deploy user needs passwordless sudo here)
ssh $SSH_OPTS "$DEPLOY_USER@$ip" "
sudo systemctl restart myapp
sudo systemctl status myapp --no-pager
echo '✅ 服务重启完成'
"
# 6. Health check: poll the app on port 8080 up to 5 times, 5s apart,
# expecting a response body containing "ok".
local max_retry=5
local retry=0
while [ $retry -lt $max_retry ]; do
if curl -s "http://$ip:8080/health" | grep -q "ok"; then
log "✅ $name 健康检查通过"
return 0
fi
# NOTE(review): `((retry++))` returns status 1 when retry is 0; this only
# survives `set -e` if the function is invoked in a condition context or
# a child shell without -e — confirm all callers do so.
((retry++))
log "⏳ $name 健康检查失败,第 $retry 次重试..."
sleep 5
done
log "❌ $name 健康检查失败"
return 1
}
# 并行部署
# Deploy to all servers in $SERVERS with bounded concurrency.
# $1 - max parallel jobs (default 3)
parallel_deploy() {
    local max_parallel=${1:-3}
    log "================================"
    log "开始批量部署(并发数:$max_parallel)"
    log "================================"
    # BUGFIX: both parallel and the xargs fallback run each job in a child
    # shell, which cannot see un-exported shell functions — every job failed
    # with "deploy_server: command not found". Export the functions and the
    # globals they read.
    export -f deploy_server log
    export DEPLOY_USER DEPLOY_KEY APP_DIR BACKUP_DIR
    if command -v parallel &> /dev/null; then
        # BUGFIX: GNU parallel's column-split option is --colsep, not --col.
        printf '%s\n' "${SERVERS[@]}" | parallel -j "$max_parallel" --colsep ':' \
            deploy_server {1} {2}
    else
        # Fallback: xargs with one "name:ip" argument per job.
        printf '%s\n' "${SERVERS[@]}" | xargs -P "$max_parallel" -n 1 bash -c '
            IFS=":" read -r name ip <<< "$1"
            deploy_server "$name" "$ip"
        ' _
    fi
}
# 回滚
# Roll a server back to the most recent backup under $BACKUP_DIR.
# $1 - server host/IP reachable as $DEPLOY_USER@$1
# BUGFIX: the original copied '$BACKUP_DIR/\$latest_backup' inside single
# quotes, so the remote shell never expanded $latest_backup and cp was
# given a non-existent literal path. Quoting below: $BACKUP_DIR/$APP_DIR
# expand locally; \$latest_backup is evaluated on the remote host.
rollback() {
    local server=$1
    log "🔄 回滚 $server..."
    SSH_OPTS="-i $DEPLOY_KEY -o StrictHostKeyChecking=no"
    ssh $SSH_OPTS "$DEPLOY_USER@$server" "
        latest_backup=\$(ls -t '$BACKUP_DIR' | head -1)
        if [ -n \"\$latest_backup\" ]; then
            rm -rf '$APP_DIR'
            cp -r '$BACKUP_DIR'/\"\$latest_backup\" '$APP_DIR'
            sudo systemctl restart myapp
            echo '✅ 回滚完成'
        else
            echo '❌ 未找到备份'
            exit 1
        fi
    "
}
# 主函数
# Entry point: pre-check connectivity to every server, then deploy to all
# of them with a concurrency of 3. Any unreachable host aborts the whole
# run before anything is deployed.
main() {
log "================================"
log "批量部署脚本"
log "服务器数量:${#SERVERS[@]}"
log "================================"
# Pre-flight: one ping per server.
# NOTE(review): `ping -W 2` (timeout in seconds) is iputils/Linux syntax;
# BSD/macOS ping differs — confirm the control host platform.
for server in "${SERVERS[@]}"; do
IFS=':' read -r name ip <<< "$server"
if ! ping -c 1 -W 2 "$ip" &> /dev/null; then
log "❌ $name ($ip) 无法连接"
exit 1
fi
done
log "✅ 预检查通过"
# Deploy with up to 3 servers in flight at once
parallel_deploy 3
log "================================"
log "✅ 批量部署完成"
log "================================"
}
# 执行
main "$@"
六、脚本使用规范 #
6.1 安全建议 #
-
权限控制
chmod 700 /usr/local/bin/*.sh
chown root:root /usr/local/bin/*.sh
-
敏感信息
# 使用环境变量或配置文件
source /etc/backup/credentials
# 不要硬编码密码
-
日志审计
# 所有脚本输出到日志
exec > >(tee -a /var/log/script.log) 2>&1
6.2 最佳实践 #
| 实践 | 说明 |
|---|---|
set -e |
遇到错误立即退出 |
set -u |
使用未定义变量时报错 |
set -o pipefail |
管道中任何命令失败都报错 |
| 函数封装 | 功能模块化,便于复用 |
| 错误处理 | trap 捕获异常 |
| 日志记录 | 所有操作记录到日志 |
七、总结 #
脚本清单 #
| 脚本 | 用途 | 执行频率 |
|---|---|---|
| system-health-check.sh | 系统巡检 | 每日 |
| log-analyzer.sh | 日志分析 | 每日 |
| auto-backup.sh | 自动备份 | 每日 |
| monitor-alert.sh | 监控告警 | 每 5 分钟 |
| batch-deploy.sh | 批量部署 | 按需 |
核心价值 #
- ✅ 自动化重复工作,节省 80% 时间
- ✅ 标准化操作流程,减少人为错误
- ✅ 及时发现问题,快速响应
- ✅ 可复用、可维护、可扩展
参考资料: