Skip to main content

Linux 运维自动化:5 个必备 Bash 脚本模板

·4905 words·10 mins

摘要: 本文分享 5 个生产环境验证的 Bash 脚本模板,涵盖系统巡检、日志分析、自动备份、监控告警、批量部署场景,包含完整代码和配置说明,可直接复用。


一、系统巡检脚本
#

1.1 功能说明
#

  • 检查 CPU、内存、磁盘使用率
  • 检查关键服务状态
  • 检查系统负载
  • 生成巡检报告

1.2 脚本代码
#

#!/bin/bash
# system-health-check.sh - system health check script

set -e

# Configuration: log path, alert recipient, and alert thresholds (percent).
LOG_FILE="/var/log/system-health-$(date +%Y%m%d).log"
ALERT_EMAIL="admin@example.com"
CPU_THRESHOLD=80
MEM_THRESHOLD=85
DISK_THRESHOLD=90

# ANSI color codes used in log output.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Append a timestamped message to stdout and to $LOG_FILE.
log() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG_FILE"
}

# E-mail an alert to $ALERT_EMAIL and record it in the log.
# $1 - short alert title, $2 - message body.
alert() {
    local subject message
    subject="[ALERT] $(hostname) - $1"
    message="$2"
    mail -s "$subject" "$ALERT_EMAIL" <<<"$message"
    log "${RED}⚠️  告警:$1${NC}"
}

# Check CPU usage against CPU_THRESHOLD.
# Returns 0 when under the threshold, 1 after sending an alert.
check_cpu() {
    log "检查 CPU 使用率..."
    local cpu_usage
    # NOTE(review): this captures the second field of top's "Cpu(s)" line
    # (the user-CPU column); top's output format varies between versions
    # and locales — confirm on the target distro.
    cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
    cpu_usage=${cpu_usage%.*}   # truncate to an integer
    # Guard: an empty value would make [ -gt ] error out and, with set -e,
    # kill the whole script.
    cpu_usage=${cpu_usage:-0}

    if [ "$cpu_usage" -gt "$CPU_THRESHOLD" ]; then
        alert "CPU 使用率过高" "CPU 使用率:${cpu_usage}%(阈值:${CPU_THRESHOLD}%)"
        return 1
    else
        log "${GREEN}✅ CPU 使用率:${cpu_usage}%${NC}"
        return 0
    fi
}

# Check memory usage against MEM_THRESHOLD.
# Returns 0 when under the threshold, 1 after sending an alert.
check_memory() {
    log "检查内存使用率..."
    local mem_usage
    # Integer percentage of used/total, truncated exactly like $(( )) does.
    mem_usage=$(free | awk '/^Mem/ {printf "%d", $3 * 100 / $2}')

    if [ "$mem_usage" -gt "$MEM_THRESHOLD" ]; then
        alert "内存使用率过高" "内存使用率:${mem_usage}%(阈值:${MEM_THRESHOLD}%)"
        return 1
    else
        log "${GREEN}✅ 内存使用率:${mem_usage}%${NC}"
        return 0
    fi
}

# Check root filesystem usage against DISK_THRESHOLD.
# Returns 0 when under the threshold, 1 after sending an alert.
check_disk() {
    log "检查磁盘使用率..."
    local disk_usage
    disk_usage=$(df -h / | awk 'NR == 2 {sub(/%/, "", $5); print $5}')

    if [ "$disk_usage" -gt "$DISK_THRESHOLD" ]; then
        alert "磁盘使用率过高" "磁盘使用率:${disk_usage}%(阈值:${DISK_THRESHOLD}%)"
        return 1
    else
        log "${GREEN}✅ 磁盘使用率:${disk_usage}%${NC}"
        return 0
    fi
}

# Verify that each critical systemd unit is active; alert per failed unit.
# Returns the number of units that were not running.
check_services() {
    log "检查关键服务..."
    local services=("sshd" "nginx" "docker" "kubelet")
    local failed=0

    local service
    for service in "${services[@]}"; do
        if systemctl is-active --quiet "$service"; then
            log "${GREEN}$service 运行正常${NC}"
        else
            log "${RED}$service 未运行${NC}"
            alert "服务异常" "服务 $service 未运行"
            # Fixed: ((failed++)) evaluates to the OLD value, so it returns
            # status 1 when failed is 0 — a set -e hazard. Plain assignment
            # always succeeds.
            failed=$((failed + 1))
        fi
    done

    return "$failed"
}

# Check the 1-minute load average normalised by CPU count.
# $1 (optional): per-core load threshold, default 2 (backward compatible).
# Returns 0 when under the threshold, 1 after sending an alert.
check_load() {
    local threshold=${1:-2}
    log "检查系统负载..."
    local load cpu_count load_per_cpu
    load=$(uptime | awk -F'load average:' '{print $2}' | cut -d',' -f1 | tr -d ' ')
    cpu_count=$(nproc)
    load_per_cpu=$(echo "$load $cpu_count" | awk '{printf "%.2f", $1/$2}')

    if (( $(echo "$load_per_cpu > $threshold" | bc -l) )); then
        alert "系统负载过高" "平均负载:$load(CPU 核心数:$cpu_count,负载/核心:$load_per_cpu)"
        return 1
    else
        # Fixed: the original message was missing the closing ")".
        log "${GREEN}✅ 系统负载:$load(负载/核心:$load_per_cpu)${NC}"
        return 0
    fi
}

# Write a point-in-time health report (host info, resource usage, service
# states, recent alert lines) to a timestamped file under /var/log.
generate_report() {
    local report_file="/var/log/system-health-report-$(date +%Y%m%d-%H%M%S).txt"
    
    {
        echo "================================"
        echo "系统健康巡检报告"
        echo "================================"
        echo "时间:$(date '+%Y-%m-%d %H:%M:%S')"
        echo "主机:$(hostname)"
        echo "内核:$(uname -r)"
        echo ""
        echo "=== 资源使用 ==="
        echo "CPU 使用率:$(top -bn1 | grep 'Cpu(s)' | awk '{print $2}')"
        echo "内存使用:$(free -h | grep Mem | awk '{print $3 "/" $2}')"
        echo "磁盘使用:$(df -h / | awk 'NR==2 {print $3 "/" $2}')"
        echo "系统负载:$(uptime)"
        echo ""
        echo "=== 服务状态 ==="
        # Service list is duplicated from check_services; keep them in sync.
        systemctl is-active sshd nginx docker kubelet 2>/dev/null || true
        echo ""
        echo "=== 最近告警 ==="
        # Scans every daily health log, not just today's file.
        tail -20 /var/log/system-health-*.log | grep "⚠️" || echo "无告警"
    } > "$report_file"
    
    log "📄 巡检报告:$report_file"
}

# Run all checks, write the report, and return the number of failed checks
# (0 means everything passed).
main() {
    log "================================"
    log "开始系统健康检查"
    log "================================"

    local failed=0

    # Fixed: `check_x || ((failed++))` exits the script under set -e the
    # FIRST time a check fails — post-incrementing 0 yields expression
    # value 0, i.e. exit status 1, as the last command of the || list.
    # Plain assignment always succeeds.
    check_cpu || failed=$((failed + 1))
    check_memory || failed=$((failed + 1))
    check_disk || failed=$((failed + 1))
    check_services || failed=$((failed + 1))
    check_load || failed=$((failed + 1))

    generate_report

    log "================================"
    if [ "$failed" -eq 0 ]; then
        log "${GREEN}✅ 系统健康检查通过${NC}"
    else
        log "${RED}❌ 发现 $failed 项异常${NC}"
    fi
    log "================================"

    return "$failed"
}

# Entry point: run all checks and propagate the failure count as exit code.
main
exit $?

1.3 配置定时任务
#

# 每天 9AM 执行
crontab -e
0 9 * * * /usr/local/bin/system-health-check.sh

二、日志分析脚本
#

2.1 功能说明
#

  • 分析 Nginx/Apache 访问日志
  • 统计 PV、UV、错误率
  • 识别异常 IP
  • 生成日报

2.2 脚本代码
#

#!/bin/bash
# log-analyzer.sh - Nginx access-log analysis script
# Prints PV/UV, status-code and top-IP/URL statistics to stdout and writes
# an HTML summary under $REPORT_DIR. Field positions ($1 client IP, $7 URL,
# $9 status code) assume the default "combined" log format.

set -e

LOG_FILE="/var/log/nginx/access.log"
REPORT_DIR="/var/log/nginx-reports"
DATE=$(date +%Y-%m-%d)

mkdir -p "$REPORT_DIR"

echo "================================"
echo "Nginx 日志分析报告"
echo "日期:$DATE"
echo "================================"
echo ""

# 1. Page views: one request per log line
pv=$(wc -l < "$LOG_FILE")
echo "📊 总访问量 (PV): $pv"

# 2. Unique visitors, approximated by distinct client IPs
uv=$(awk '{print $1}' "$LOG_FILE" | sort -u | wc -l)
echo "👥 独立 IP 数 (UV): $uv"

# 3. Status-code distribution
echo ""
echo "=== 状态码分布 ==="
awk '{print $9}' "$LOG_FILE" | sort | uniq -c | sort -rn | head -10

# 4. Error requests (4xx + 5xx) and error rate
errors=$(awk '$9 ~ /^[45]/' "$LOG_FILE" | wc -l)
# Fixed: with an empty log (pv = 0) bc fails on division by zero and,
# under set -e, kills the script before any report is written.
if [ "$pv" -gt 0 ]; then
    error_rate=$(echo "scale=2; $errors * 100 / $pv" | bc)
else
    error_rate=0
fi
echo ""
echo "❌ 错误请求数:$errors"
echo "⚠️  错误率:${error_rate}%"

# 5. Top 10 client IPs by request count
echo ""
echo "=== Top 10 访问 IP ==="
awk '{print $1}' "$LOG_FILE" | sort | uniq -c | sort -rn | head -10

# 6. Top 10 requested URLs
echo ""
echo "=== Top 10 访问 URL ==="
awk '{print $7}' "$LOG_FILE" | sort | uniq -c | sort -rn | head -10

# 7. Suspicious IPs (more than 1000 requests)
echo ""
echo "=== 可疑 IP(访问>1000 次) ==="
awk '{print $1}' "$LOG_FILE" | sort | uniq -c | sort -rn | awk '$1 > 1000 {print $0}'

# 8. HTML report
# NOTE(review): log-derived values are interpolated into the HTML without
# escaping; attacker-controlled request URLs could inject markup into the
# report. Sanitize if the report is served to browsers.
report_file="$REPORT_DIR/report-$DATE.html"
cat > "$report_file" << EOF
<!DOCTYPE html>
<html>
<head>
    <title>Nginx 日志分析报告 - $DATE</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        table { border-collapse: collapse; width: 100%; }
        th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
        th { background-color: #4CAF50; color: white; }
        tr:nth-child(even) { background-color: #f2f2f2; }
    </style>
</head>
<body>
    <h1>Nginx 日志分析报告</h1>
    <p>日期:$DATE</p>
    <h2>概览</h2>
    <ul>
        <li>PV: $pv</li>
        <li>UV: $uv</li>
        <li>错误率:${error_rate}%</li>
    </ul>
    <h2>Top 10 访问 IP</h2>
    <table>
        <tr><th>IP</th><th>访问次数</th></tr>
        $(awk '{print $1}' "$LOG_FILE" | sort | uniq -c | sort -rn | head -10 | awk '{printf "<tr><td>%s</td><td>%s</td></tr>\n", $2, $1}')
    </table>
</body>
</html>
EOF

echo ""
echo "📄 HTML 报告:$report_file"

三、自动备份脚本
#

3.1 功能说明
#

  • 备份数据库(MySQL/PostgreSQL)
  • 备份重要文件
  • 压缩加密
  • 上传到远程存储
  • 清理旧备份

3.2 脚本代码
#

#!/bin/bash
# auto-backup.sh - automated backup script
# Dumps MySQL databases and archives key directories, verifies, encrypts,
# uploads to a remote host, and prunes old backups.
# Expects MYSQL_PASSWORD and BACKUP_PASSWORD to be set in the environment.

set -e

# Configuration
BACKUP_DIR="/backup"
REMOTE_HOST="backup-server@example.com"
REMOTE_DIR="/backups"
RETENTION_DAYS=30
DATE=$(date +%Y%m%d_%H%M%S)
LOG_FILE="/var/log/backup-$DATE.log"

# What to back up
DB_NAMES=("app_db" "user_db")
FILE_DIRS=("/var/www" "/etc/nginx" "/home")

# Write a timestamped message to stdout and append it to $LOG_FILE.
log() {
    local line
    line="[$(date '+%Y-%m-%d %H:%M:%S')] $1"
    tee -a "$LOG_FILE" <<<"$line"
}

# Dump each database in DB_NAMES to a gzip-compressed SQL file.
# Fixed: in `mysqldump | gzip`, set -e only sees gzip's exit status, so a
# failed dump silently produced a tiny corrupt backup; PIPESTATUS catches it.
backup_mysql() {
    log "开始备份 MySQL 数据库..."

    local db backup_file
    for db in "${DB_NAMES[@]}"; do
        backup_file="$BACKUP_DIR/mysql-${db}-${DATE}.sql.gz"
        # NOTE(review): the password appears on the command line and is
        # visible in `ps`; prefer --defaults-extra-file in production.
        mysqldump -u root -p"${MYSQL_PASSWORD}" "$db" | gzip > "$backup_file"
        if [ "${PIPESTATUS[0]}" -ne 0 ]; then
            log "❌ mysqldump 失败:$db"
            return 1
        fi
        log "✅ 备份完成:$backup_file"
    done
}

# Archive each directory in FILE_DIRS into a dated tar.gz under BACKUP_DIR.
# Fixed: loop variables were implicitly global; declared local so they do
# not leak into the rest of the script.
backup_files() {
    log "开始备份文件..."

    local dir dir_name backup_file
    for dir in "${FILE_DIRS[@]}"; do
        dir_name=$(basename "$dir")
        backup_file="$BACKUP_DIR/files-${dir_name}-${DATE}.tar.gz"
        # Best-effort: files may vanish or be unreadable mid-archive;
        # tolerate tar's non-zero status deliberately.
        tar -czf "$backup_file" "$dir" 2>/dev/null || true
        log "✅ 备份完成:$backup_file"
    done
}

# Encrypt every artifact from this run with symmetric GPG, then remove the
# plaintext original.
# Fixed: when the glob matches nothing it stays a literal pattern, so gpg
# was invoked on a nonexistent path and set -e aborted the script.
encrypt_backup() {
    log "加密备份文件..."

    local file
    for file in "$BACKUP_DIR"/*-"${DATE}".*; do
        [ -e "$file" ] || continue   # unmatched glob — nothing to encrypt
        if [[ ! "$file" =~ \.gpg$ ]]; then
            # NOTE(review): --passphrase on argv is visible in `ps`;
            # consider --passphrase-file or gpg-agent in production.
            gpg --symmetric --batch --passphrase "${BACKUP_PASSWORD}" "$file"
            rm "$file"
            log "🔒 已加密:${file}.gpg"
        fi
    done
}

# Push this run's backup artifacts to the remote host via rsync over ssh.
upload_remote() {
    log "上传到远程服务器..."

    rsync -avz -e ssh \
        "$BACKUP_DIR"/*-${DATE}.* \
        "$REMOTE_HOST:$REMOTE_DIR/"

    log "✅ 上传完成"
}

# Delete local and remote backups older than RETENTION_DAYS days.
cleanup_old() {
    log "清理 ${RETENTION_DAYS} 天前的备份..."

    # Local: only regular files matching the backup naming pattern.
    find "$BACKUP_DIR" -name "*-*-*" -type f -mtime +"$RETENTION_DAYS" -delete

    # Fixed: without -type f the remote find could also -delete aged
    # directories (including ones it cannot empty), not just backup files.
    ssh "$REMOTE_HOST" "find $REMOTE_DIR -type f -mtime +$RETENTION_DAYS -delete"

    log "✅ 清理完成"
}

# Verify gzip integrity of this run's SQL dumps; aborts the whole script
# (exit 1 → ERR trap) on the first corrupt file.
# Fixed: with no matching dumps the glob stayed literal and gzip -t failed,
# producing a false "verification failed" and aborting the backup.
verify_backup() {
    log "验证备份完整性..."

    local file
    for file in "$BACKUP_DIR"/*-"${DATE}".sql.gz; do
        [ -e "$file" ] || continue   # no dumps produced; nothing to verify
        if gzip -t "$file" 2>/dev/null; then
            log "✅ 验证通过:$file"
        else
            log "❌ 验证失败:$file"
            exit 1
        fi
    done
}

# E-mail a success/failure notice. $1 is the status text ("成功"/"失败").
send_notification() {
    local status=$1
    # Fixed: ALERT_EMAIL is not defined anywhere in this script, so mail
    # received an empty recipient; fall back to a default (and allow the
    # environment to override it).
    local recipient="${ALERT_EMAIL:-admin@example.com}"
    local subject="[备份${status}] $(hostname) - $(date +%Y-%m-%d)"
    local body="备份${status}完成\n\n日志:$LOG_FILE"

    echo -e "$body" | mail -s "$subject" "$recipient"
}

# Orchestrate the full backup pipeline in order:
# dump -> archive -> verify -> encrypt -> upload -> prune -> notify.
# Any failure triggers the ERR trap (failure notification + exit 1).
main() {
    log "================================"
    log "开始自动备份"
    log "================================"
    
    # Ensure the local backup directory exists
    mkdir -p "$BACKUP_DIR"
    
    # Run the pipeline; verification happens before encryption so the
    # plaintext dumps can still be gzip-tested.
    backup_mysql
    backup_files
    verify_backup
    encrypt_backup
    upload_remote
    cleanup_old
    
    log "================================"
    log "✅ 备份完成"
    log "================================"
    
    send_notification "成功"
}

# Error handling: on any unhandled failure, log it, send a failure
# notification, and exit non-zero.
trap 'log "❌ 备份失败"; send_notification "失败"; exit 1' ERR

# Entry point
main

四、监控告警脚本
#

4.1 功能说明
#

  • 监控端口可用性
  • 监控 URL 响应
  • 监控进程状态
  • 多渠道告警(邮件、钉钉、企业微信)

4.2 脚本代码
#

#!/bin/bash
# monitor-alert.sh - monitoring and alerting script
# Checks ports, URLs, processes and disks; alerts via e-mail, DingTalk and
# WeCom webhooks on failure.

set -e

# Configuration: replace the placeholder webhook tokens before use.
ALERT_EMAIL="admin@example.com"
DINGTALK_WEBHOOK="https://oapi.dingtalk.com/robot/send?access_token=xxx"
WECHAT_WEBHOOK="https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxx"

# Print a timestamped message to stdout.
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}

# E-mail an alert; $1 - subject, $2 - message body.
send_email() {
    mail -s "$1" "$ALERT_EMAIL" <<<"$2"
    log "📧 邮件告警已发送:$1"
}

# Post a text alert to the DingTalk robot webhook. $1 - message body.
send_dingtalk() {
    local message=$1
    # Fixed: the message is interpolated straight into a JSON payload, so
    # any double quote in it broke the request. Escape quotes; backslashes
    # are left alone on purpose — callers embed literal \n sequences that
    # JSON interprets as newlines.
    message=${message//\"/\\\"}
    curl -s "$DINGTALK_WEBHOOK" \
        -H 'Content-Type: application/json' \
        -d "{
            \"msgtype\": \"text\",
            \"text\": {
                \"content\": \"🚨 告警通知\n$message\n时间:$(date '+%Y-%m-%d %H:%M:%S')\"
            }
        }"
    log "📱 钉钉告警已发送"
}

# Post a text alert to the WeCom (企业微信) webhook. $1 - message body.
send_wechat() {
    local message=$1
    # Fixed: escape double quotes so the interpolated JSON stays valid;
    # backslashes are kept so callers' literal \n become JSON newlines.
    message=${message//\"/\\\"}
    curl -s "$WECHAT_WEBHOOK" \
        -H 'Content-Type: application/json' \
        -d "{
            \"msgtype\": \"text\",
            \"text\": {
                \"content\": \"🚨 告警通知\n$message\n时间:$(date '+%Y-%m-%d %H:%M:%S')\"
            }
        }"
    log "💬 企业微信告警已发送"
}

# Fan an alert out to every channel; $1 - title, $2 - detail message.
send_alert() {
    local title=$1 message=$2
    local combined="$title\n$message"

    send_email "[$title] $(hostname)" "$message"
    send_dingtalk "$combined"
    send_wechat "$combined"
}

# Probe a TCP port with a 5 s timeout; $1 - host, $2 - port, $3 - name.
# Returns 0 when the port accepts connections, 1 after alerting.
check_port() {
    local host=$1 port=$2 name=$3

    if ! nc -z -w 5 "$host" "$port" 2>/dev/null; then
        send_alert "端口不可用" "$name ($host:$port) 无法连接"
        return 1
    fi
    log "✅ $name ($host:$port) 正常"
    return 0
}

# Fetch a URL (10 s timeout) and compare the HTTP status code to the
# expected one; $1 - url, $2 - name, $3 - expected code (default 200).
check_url() {
    local url=$1 name=$2 expected_code=${3:-200}
    local status_code

    # curl prints "000" when the request itself fails, which then simply
    # mismatches the expected code; `|| true` keeps set -e out of it.
    status_code=$(curl -s -o /dev/null -w "%{http_code}" -m 10 "$url") || true

    if [ "$status_code" = "$expected_code" ]; then
        log "✅ $name ($url) 正常 (HTTP $status_code)"
        return 0
    fi
    send_alert "URL 异常" "$name ($url) 返回 HTTP $status_code (期望 $expected_code)"
    return 1
}

# Verify a process whose exact name is $1 is running (pgrep -x).
# Returns 0 when found, 1 after alerting.
check_process() {
    local process=$1

    if ! pgrep -x "$process" > /dev/null; then
        send_alert "进程异常" "进程 $process 未运行"
        return 1
    fi
    log "✅ 进程 $process 运行中"
    return 0
}

# Check filesystem usage of mount point $1 against threshold $2 (default 90).
# Returns 0 when under the threshold, 1 after alerting.
check_disk() {
    local mount_point=$1 threshold=${2:-90}
    local usage
    usage=$(df "$mount_point" | awk 'NR == 2 {sub(/%/, "", $5); print $5}')

    if [ "$usage" -ge "$threshold" ]; then
        send_alert "磁盘空间不足" "$mount_point 使用率 ${usage}%(阈值 ${threshold}%)"
        return 1
    fi
    log "✅ 磁盘 $mount_point 使用率 ${usage}%"
    return 0
}

# Run every monitoring check and report the number of failures.
main() {
    log "================================"
    log "开始监控检查"
    log "================================"

    local failed=0

    # Fixed: `check_x || ((failed++))` aborts under set -e on the first
    # failure, because post-incrementing 0 returns exit status 1 as the
    # last command of the || list. Plain assignment always succeeds.

    # Port checks
    check_port "localhost" "22" "SSH" || failed=$((failed + 1))
    check_port "localhost" "80" "Nginx" || failed=$((failed + 1))
    check_port "localhost" "3306" "MySQL" || failed=$((failed + 1))

    # URL checks
    check_url "https://example.com" "官网" "200" || failed=$((failed + 1))
    check_url "https://api.example.com/health" "API 健康检查" "200" || failed=$((failed + 1))

    # Process checks
    check_process "nginx" || failed=$((failed + 1))
    check_process "mysqld" || failed=$((failed + 1))
    check_process "dockerd" || failed=$((failed + 1))

    # Disk checks
    check_disk "/" "90" || failed=$((failed + 1))
    check_disk "/var/log" "85" || failed=$((failed + 1))

    log "================================"
    if [ "$failed" -eq 0 ]; then
        log "✅ 所有监控项正常"
    else
        log "❌ $failed 项异常,已发送告警"
    fi
    log "================================"
}

# Entry point
main

五、批量部署脚本
#

5.1 功能说明
#

  • 批量执行命令
  • 并行部署
  • 错误处理
  • 进度显示

5.2 脚本代码
#

#!/bin/bash
# batch-deploy.sh - batch deployment script
# Deploys the app to every host in SERVERS in parallel, with per-host
# backup, health checks, and a manual rollback helper.

set -e

# Target servers, "name:ip" per entry.
SERVERS=(
    "web01:192.168.1.10"
    "web02:192.168.1.11"
    "web03:192.168.1.12"
    "api01:192.168.1.20"
    "api02:192.168.1.21"
)

# Deployment configuration
DEPLOY_USER="deploy"
DEPLOY_KEY="$HOME/.ssh/deploy_key"
APP_DIR="/var/www/myapp"
BACKUP_DIR="/backup/myapp"

# Print a timestamped message to stdout.
log() {
    local ts
    ts=$(date '+%Y-%m-%d %H:%M:%S')
    echo "[$ts] $1"
}

# Deploy the application to one server.
# $1 - server display name, $2 - server IP.
# Steps: back up current release, pull latest code, install dependencies,
# build, restart the service, then poll the health endpoint (5 tries, 5 s
# apart). Returns 0 on a passing health check, 1 otherwise.
deploy_server() {
    local name=$1
    local ip=$2
    
    log "🚀 开始部署:$name ($ip)"
    
    # SSH options. NOTE(review): SSH_OPTS is intentionally used unquoted
    # below so the options word-split; this breaks if DEPLOY_KEY ever
    # contains spaces. It is also not declared local.
    SSH_OPTS="-i $DEPLOY_KEY -o StrictHostKeyChecking=no -o ConnectTimeout=10"
    
    # 1. Back up the current version.
    # NOTE(review): $(date ...) inside this double-quoted string expands on
    # the LOCAL machine when the command string is built — confirm that the
    # local timestamp is intended for the remote backup name.
    ssh $SSH_OPTS "$DEPLOY_USER@$ip" "
        if [ -d '$APP_DIR' ]; then
            cp -r '$APP_DIR' '$BACKUP_DIR/backup-$(date +%Y%m%d_%H%M%S)'
            echo '✅ 备份完成'
        fi
    "
    
    # 2. Pull the latest code
    ssh $SSH_OPTS "$DEPLOY_USER@$ip" "
        cd '$APP_DIR'
        git pull origin main
        echo '✅ 代码更新完成'
    "
    
    # 3. Install dependencies
    ssh $SSH_OPTS "$DEPLOY_USER@$ip" "
        cd '$APP_DIR'
        npm ci --production
        echo '✅ 依赖安装完成'
    "
    
    # 4. Build
    ssh $SSH_OPTS "$DEPLOY_USER@$ip" "
        cd '$APP_DIR'
        npm run build
        echo '✅ 构建完成'
    "
    
    # 5. Restart the service
    ssh $SSH_OPTS "$DEPLOY_USER@$ip" "
        sudo systemctl restart myapp
        sudo systemctl status myapp --no-pager
        echo '✅ 服务重启完成'
    "
    
    # 6. Health check with retries against the app's /health endpoint
    local max_retry=5
    local retry=0
    while [ $retry -lt $max_retry ]; do
        if curl -s "http://$ip:8080/health" | grep -q "ok"; then
            log "✅ $name 健康检查通过"
            return 0
        fi
        # NOTE(review): ((retry++)) returns status 1 when retry is 0 — a
        # hazard if this function ever runs with set -e in effect.
        ((retry++))
        log "⏳ $name 健康检查失败,第 $retry 次重试..."
        sleep 5
    done
    
    log "❌ $name 健康检查失败"
    return 1
}

# Deploy to all servers with bounded parallelism.
# $1 (optional): max concurrent deployments, default 3.
parallel_deploy() {
    local max_parallel=${1:-3}

    log "================================"
    log "开始批量部署(并发数:$max_parallel)"
    log "================================"

    # Fixed: both parallel and `xargs ... bash -c` spawn CHILD shells, which
    # cannot see shell functions or unexported variables — the original
    # failed with "deploy_server: command not found" in both branches.
    export -f deploy_server log
    export DEPLOY_USER DEPLOY_KEY APP_DIR BACKUP_DIR

    if command -v parallel &> /dev/null; then
        # Fixed: GNU Parallel's column-split option is --colsep, not --col.
        printf '%s\n' "${SERVERS[@]}" | parallel -j "$max_parallel" --colsep ':' \
            deploy_server {1} {2}
    else
        printf '%s\n' "${SERVERS[@]}" | xargs -P "$max_parallel" -n 1 bash -c '
            IFS=":" read -r name ip <<< "$1"
            deploy_server "$name" "$ip"
        ' _
    fi
}

# Roll the app on server $1 back to the newest backup under BACKUP_DIR.
rollback() {
    local server=$1

    log "🔄 回滚 $server..."

    SSH_OPTS="-i $DEPLOY_KEY -o StrictHostKeyChecking=no"

    # Quoting rules in this command string: $BACKUP_DIR/$APP_DIR expand
    # LOCALLY; \$latest_backup expands on the REMOTE side.
    # Fixed: the copy source was wrapped in single quotes, so the remote
    # shell never expanded \$latest_backup — the rollback tried to copy a
    # literal, nonexistent path AFTER rm -rf had already destroyed the app.
    # Escaped double quotes let the remote expansion happen.
    ssh $SSH_OPTS "$DEPLOY_USER@$server" "
        latest_backup=\$(ls -t '$BACKUP_DIR' | head -1)
        if [ -n \"\$latest_backup\" ]; then
            rm -rf '$APP_DIR'
            cp -r \"$BACKUP_DIR/\$latest_backup\" '$APP_DIR'
            sudo systemctl restart myapp
            echo '✅ 回滚完成'
        else
            echo '❌ 未找到备份'
            exit 1
        fi
    "
}

# Pre-flight connectivity check on every target, then parallel deployment.
main() {
    log "================================"
    log "批量部署脚本"
    log "服务器数量:${#SERVERS[@]}"
    log "================================"

    # Pre-check: every target must answer one ping before anything starts.
    local entry name ip
    for entry in "${SERVERS[@]}"; do
        name=${entry%%:*}
        ip=${entry#*:}
        if ! ping -c 1 -W 2 "$ip" &> /dev/null; then
            log "❌ $name ($ip) 无法连接"
            exit 1
        fi
    done

    log "✅ 预检查通过"

    # Deploy with a concurrency of 3
    parallel_deploy 3

    log "================================"
    log "✅ 批量部署完成"
    log "================================"
}

# Entry point
main "$@"

六、脚本使用规范
#

6.1 安全建议
#

  1. 权限控制

    chmod 700 /usr/local/bin/*.sh
    chown root:root /usr/local/bin/*.sh
  2. 敏感信息

    # 使用环境变量或配置文件
    source /etc/backup/credentials
    # 不要硬编码密码
  3. 日志审计

    # 所有脚本输出到日志
    exec > >(tee -a /var/log/script.log) 2>&1

6.2 最佳实践
#

实践 说明
set -e 遇到错误立即退出
set -u 使用未定义变量时报错
set -o pipefail 管道中任何命令失败都报错
函数封装 功能模块化,便于复用
错误处理 trap 捕获异常
日志记录 所有操作记录到日志

七、总结
#

脚本清单
#

脚本 用途 执行频率
system-health-check.sh 系统巡检 每日
log-analyzer.sh 日志分析 每日
auto-backup.sh 自动备份 每日
monitor-alert.sh 监控告警 每 5 分钟
batch-deploy.sh 批量部署 按需

核心价值
#

  • ✅ 自动化重复工作,节省 80% 时间
  • ✅ 标准化操作流程,减少人为错误
  • ✅ 及时发现问题,快速响应
  • ✅ 可复用、可维护、可扩展

参考资料: