I. OS Optimization

Note: upgrade the kernel to 4.11 or later; older kernels lack the ip_unprivileged_port_start sysctl file.

cat << \EOF >> /etc/sysctl.conf
net.ipv4.tcp_tw_reuse = 1
# tcp_tw_recycle was removed in kernel 4.12; drop this line on newer kernels or sysctl -p will complain
net.ipv4.tcp_tw_recycle = 0
net.ipv4.ip_local_port_range = 1024 65024
net.ipv4.tcp_max_syn_backlog = 100000
net.core.netdev_max_backlog = 100000
net.ipv4.tcp_fin_timeout = 10
fs.file-max = 500000
EOF
# Apply immediately
sysctl -p

echo '* - nofile 500000' >> /etc/security/limits.conf
# Enable IP forwarding (this echo lasts until reboot; add net.ipv4.ip_forward = 1 to /etc/sysctl.conf to persist it)
echo 1 > /proc/sys/net/ipv4/ip_forward
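
A quick way to verify the kernel and the new limits (a minimal sketch; net.ipv4.ip_unprivileged_port_start only exists on kernels 4.11+, which is why the upgrade matters):

# Present only on kernels >= 4.11
sysctl net.ipv4.ip_unprivileged_port_start
# Confirm the tuning took effect
sysctl net.ipv4.ip_local_port_range
# Re-login first; should print 500000
ulimit -n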

II. Installing HAProxy

1 Prepare the configuration file

Logs can use facilities local0 through local7 (facility codes 16 through 23, which syslog reserves for local use). The severity levels are:

Level    Code  Description
emerg    0     System is unusable
alert    1     Action must be taken immediately
crit     2     Critical conditions
err      3     Error conditions
warning  4     Warning conditions
notice   5     Normal but significant condition
info     6     Informational messages
debug    7     Debug-level messages

vi /etc/haproxy.cfg

global
    log /dev/log local0 warning
    maxconn 10000 # backends are sized off this value (roughly maxconn/10 each); in practice exceeding that per backend proved harmless as long as total connections stay under 10,000

defaults
    log global
    option httplog
    option dontlognull
    timeout http-request 10s
    timeout connect 10s
    timeout client 1m
    timeout server 1m
    timeout http-keep-alive 10s
    timeout check 10s

# TCP pass-through (TLS terminates on the GitLab nodes)
frontend gitlab-rails-https
    bind *:443
    log 127.0.0.1 local0 err
    mode tcp
    option tcplog
    default_backend gitlab-rails-https
backend gitlab-rails-https
    mode tcp
    balance roundrobin
    # send-proxy passes the client IP via the PROXY protocol (option forwardfor is HTTP-only and has no effect in tcp mode)
    server Gitlab-Ha-Prod-Rails-10-1-1-1 10.1.1.1:443 send-proxy check inter 5s rise 2 fall 3
    server Gitlab-Ha-Prod-Rails-10-1-1-2 10.1.1.2:443 send-proxy check inter 5s rise 2 fall 3
    server Gitlab-Ha-Prod-Rails-10-1-1-3 10.1.1.3:443 send-proxy check inter 5s rise 2 fall 3

# HTTP proxying
frontend gitlab-rails-http
    bind *:80
    log 127.0.0.1 local0 err
    mode http
    option httplog
    option forwardfor # pass the client IP in X-Forwarded-For
    default_backend gitlab-rails-http
backend gitlab-rails-http
    mode http # must match the frontend mode, or HAProxy refuses to start
    balance roundrobin
    server Gitlab-Ha-Prod-Rails-10-1-1-1 10.1.1.1:80 check inter 5s rise 2 fall 3
    server Gitlab-Ha-Prod-Rails-10-1-1-2 10.1.1.2:80 check inter 5s rise 2 fall 3
    server Gitlab-Ha-Prod-Rails-10-1-1-3 10.1.1.3:80 check inter 5s rise 2 fall 3

frontend gitlab-rails-ssh
    bind *:22
    log 127.0.0.1 local0 err
    mode tcp
    option tcplog
    default_backend gitlab-rails-ssh
backend gitlab-rails-ssh
    mode tcp
    balance roundrobin
    # client IP passed via the PROXY protocol; the sshd on port 2222 must expect it
    server Gitlab-Ha-Prod-Rails-10-1-1-1 10.1.1.1:2222 send-proxy check inter 5s rise 2 fall 3
    server Gitlab-Ha-Prod-Rails-10-1-1-2 10.1.1.2:2222 send-proxy check inter 5s rise 2 fall 3
    server Gitlab-Ha-Prod-Rails-10-1-1-3 10.1.1.3:2222 send-proxy check inter 5s rise 2 fall 3

listen stats # console at ip:1080/stats
    mode http
    log 127.0.0.1 local0 err
    http-request use-service prometheus-exporter if { path /metrics }
    stats realm "Haproxy Manager"
    bind 0.0.0.0:1080
    stats enable
    stats hide-version
    stats uri /stats
    stats refresh 30s
    stats auth admin:admin # username:password
    stats admin if TRUE
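
Before wiring the config into systemd, it can be syntax-checked with the same container image (a sketch; haproxy -c only validates the file and exits):

docker run --rm -v /etc/haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg haproxy:2.6.7 haproxy -c -f /usr/local/etc/haproxy/haproxy.cfg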

2 Configure the systemd unit

Running the container with --privileged=true did not work; switching to --user=root did.

vi /usr/lib/systemd/system/haproxy.service

[Unit]
Description=haproxy Container
Requires=docker.service
After=docker.service

[Service]
Restart=always
RestartSec=5
StartLimitInterval=0
ExecStartPre=-/bin/docker stop haproxy
ExecStartPre=-/bin/docker rm -f haproxy
ExecStart=/bin/docker run --rm \
--user=root \
--name haproxy \
--net=host \
-v /etc/haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg \
-v /etc/localtime:/etc/localtime \
haproxy:2.6.7
ExecReload=/bin/docker kill -s HUP haproxy
ExecStop=/bin/docker stop haproxy
ExecStopPost=/bin/docker rm -f haproxy

[Install]
WantedBy=multi-user.target

3 Configure logging

# Upgrade rsyslog first, otherwise it errors out
cd /etc/yum.repos.d/
wget http://rpms.adiscon.com/v8-stable/rsyslog.repo # for CentOS 7,8,9
wget http://rpms.adiscon.com/v8-stable-daily/rsyslog-daily.repo # for CentOS 7,8,9
yum install rsyslog

# Create the log directory
mkdir -p /data/log/haproxy_log

# Enable remote log reception over UDP
sed -i 's/\#$ModLoad imudp/$ModLoad imudp/' /etc/rsyslog.conf
sed -i 's/\*.emerg/\# *.emerg/' /etc/rsyslog.conf
sed -i 's/\#$UDPServerRun 514/$UDPServerRun 514/' /etc/rsyslog.conf
# The facility must match the one used in haproxy.cfg (local0 above)
echo "local0.* /data/log/haproxy_log/haproxy.log" >> /etc/rsyslog.conf
systemctl restart rsyslog
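
Before starting HAProxy, the rsyslog wiring can be tested by hand (a minimal sketch; logger ships with util-linux):

# Inject a test message at facility local0 and confirm it lands in the file
logger -p local0.err "haproxy rsyslog test"
tail -n 1 /data/log/haproxy_log/haproxy.log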

4 Run and verify

systemctl daemon-reload
systemctl restart haproxy
systemctl status haproxy
systemctl enable haproxy
# Check the log
ls /data/log/haproxy_log/
haproxy.log

5 Log collection

5.1 Log rotation

5.1.1 logrotate configuration

cat << \EOF > /data/log/haproxy_log/haproxy
/data/log/haproxy_log/*.log
{
    rotate 3
    missingok
    compress
    delaycompress
    notifempty
    sharedscripts
    postrotate
        kill -s HUP $(/sbin/pidof rsyslogd)
        systemctl reload haproxy.service
    endscript
}
EOF
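
The rotation config can be dry-run before handing it to cron (a sketch; logrotate -d is debug mode and changes nothing on disk):

/sbin/logrotate -d /data/log/haproxy_log/haproxy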

5.1.2 Add a crontab entry

crontab -e
0 0 * * * /sbin/logrotate -vf /data/log/haproxy_log/haproxy

5.2 Filebeat

5.2.1 Installation

I picked version 7 here; adjust as needed.

sudo rpm --import https://packages.elastic.co/GPG-KEY-elasticsearch

cat << EOF > /etc/yum.repos.d/elastic.repo
[elastic-7.x]
name=Elastic repository for 7.x packages
baseurl=https://artifacts.elastic.co/packages/7.x/yum
gpgcheck=1
gpgkey=https://artifacts.elastic.co/GPG-KEY-elasticsearch
enabled=1
autorefresh=1
type=rpm-md
EOF
yum install filebeat -y
systemctl enable filebeat

5.2.2 Configuration

Logs are shipped to Kafka.

vim /etc/filebeat/filebeat.yml

filebeat.inputs:
- type: log
  enabled: true
  paths:
    - /data/log/haproxy_log/haproxy.log
  ignore_older: 5m
  fields_under_root: true # declare these keys only once; duplicate keys are invalid YAML
  fields:
    x_module: gitlab_haproxy_logs
    x_type: haproxylogs
    log_topic: gitlab-haproxy-log
    document_type: gitlab-haproxy-logs
    x_project: gitlab
    x_env: prod
    x_host: ${HOSTNAME}
output.kafka:
  hosts: ["10.1.2.1:9091"]
  topic: '%{[log_topic]}'
  partition.round_robin:
    reachable_only: false
  required_acks: 1
  compression: gzip
  max_message_bytes: 1000000
  version: 0.10.2 # must match the Kafka broker version
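
Filebeat can check both the config syntax and the Kafka connection before restarting (a sketch using Filebeat's built-in test subcommands):

filebeat test config
filebeat test output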

5.2.3 Start and verify

systemctl restart filebeat

The topic now shows up on the Kafka side.
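
One way to confirm from the Kafka side (a sketch assuming the stock Kafka CLI tools on a broker host; very old 0.10.x tooling takes --zookeeper instead of --bootstrap-server):

kafka-topics.sh --bootstrap-server 10.1.2.1:9091 --list | grep gitlab-haproxy-log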

III. Installing Keepalived

1 Install gcc

yum -y install gcc gcc-c++ autoconf automake make
yum -y install zlib zlib-devel openssl openssl-devel pcre pcre-devel

2 Build and install

  • --prefix sets the install location
  • /usr/local/keepalived/ is the resulting install directory
  • --sysconf sets the configuration file directory
# Fetch the source
wget https://www.keepalived.org/software/keepalived-2.2.8.tar.gz
tar -xvf keepalived-2.2.8.tar.gz
cd keepalived-2.2.8
# Configure the build
./configure --prefix=/usr/local/keepalived/ --sysconf=/etc
# Build and install
make && make install
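
A quick sanity check that the build landed where expected (a sketch; with this --prefix the binary sits under /usr/local/keepalived/sbin, and make install also installs a keepalived.service unit when configure detects systemd, which is what the systemctl commands below rely on):

/usr/local/keepalived/sbin/keepalived --version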

3 Configure keepalived

Keepalived must be installed on both machines; the configuration differs only slightly between them.

  • router_id: use the hostname
  • interface: the NIC to bind VRRP to
  • unicast_src_ip: the local IP
  • unicast_peer: the peer IP
  • priority + weight must end up lower than the other node's priority

The same pattern works for failing over nginx.

vim /etc/keepalived/keepalived.conf

global_defs {
    #notification_email {
    #    test@qq.com # addresses keepalived emails on failover, one per line
    #    test@163.com
    #}
    # notification_email_from xxx@163.com # sender address
    # smtp_server smtp.163.com # SMTP server address
    # smtp_connect_timeout 30 # SMTP connection timeout
    router_id LVS_1 # string identifying this node, usually the hostname (not required to be); used in email notifications
    vrrp_skip_check_adv_addr
    vrrp_garp_interval 5
    vrrp_gna_interval 5
}

vrrp_script chk_haproxy {
    script "/usr/bin/killall -0 haproxy" # check the process
    # script "</dev/tcp/127.0.0.1/80" # or check the port
    interval 2
    weight -20
}

vrrp_instance haproxy-vip {
    state BACKUP # both nodes set to BACKUP, aiming for non-preemptive behavior so a recovered former master does not reclaim the VIP while data is still out of sync
    priority 100 # when the check fails, priority + weight must drop below the peer's priority for the VIP to move: here 100 + (-20) = 80, so set the peer's priority above 80 (but below 100)
    interface eth0 # NIC carrying the node's own (non-VIP) address, used for VRRP heartbeats
    virtual_router_id 60 # virtual router ID, 0-255; distinguishes VRRP groups on the same segment, must not collide there, and must match on both nodes
    advert_int 1 # advertisement interval, default 1s; effectively the master-election/health-check interval
    authentication {
        auth_type PASS # PASS authentication (the default)
        auth_pass qweasd # PASS password
    }
    unicast_src_ip 10.0.0.1 # local IP
    unicast_peer { # unicast mode, helps against split brain
        10.0.0.2 # peer IP
    }

    virtual_ipaddress {
        10.0.0.3/24 # the VIP; multiple addresses are allowed
    }

    track_script {
        chk_haproxy
    }
}

# Test the configuration file
keepalived -t -f /etc/keepalived/keepalived.conf

4 Start and verify

systemctl daemon-reload
systemctl restart keepalived
systemctl status keepalived
systemctl enable keepalived
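
Once both nodes are running, the VIP should sit on exactly one of them (a sketch; eth0 and 10.0.0.3 match the config above):

# Prints the VIP on the master; prints nothing on the backup
ip addr show eth0 | grep 10.0.0.3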

IV. Monitoring and Alerting

Add the following to haproxy.cfg; it is already included in the full config above.

1 HAProxy

listen stats
    mode http
    log 127.0.0.1 local0 err
    http-request use-service prometheus-exporter if { path /metrics }
    stats realm "Haproxy Manager"
    bind 0.0.0.0:1080
    stats enable
    stats hide-version
    stats uri /stats
    stats refresh 30s
    stats auth admin:admin
    stats admin if TRUE
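
A manual scrape confirms the exporter responds (a quick sketch; in this layout the /metrics path is handled by HAProxy's built-in prometheus-exporter service and is typically not gated by the stats auth):

curl -s http://10.0.0.1:1080/metrics | head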

2 node_exporter

2.1 Configure the systemd unit

vi /usr/lib/systemd/system/node_exporter.service

[Unit]
Description=node_exporter Container
Requires=docker.service
After=docker.service

[Service]
Restart=always
RestartSec=5
StartLimitInterval=0
ExecStartPre=-/bin/docker stop node_exporter
ExecStartPre=-/bin/docker rm -f node_exporter
ExecStart=/bin/docker run --rm \
--name node_exporter \
--net="host" \
--pid="host" \
-v "/:/host:ro,rslave" \
quay.io/prometheus/node-exporter:latest \
--path.rootfs=/host
ExecStop=/bin/docker stop node_exporter
ExecStopPost=/bin/docker rm -f node_exporter

[Install]
WantedBy=multi-user.target

2.2 Start

systemctl start node_exporter.service
systemctl enable node_exporter.service

3 keepalived-exporter

Download the latest release package from the project's releases page.

rpm -ivh keepalived-exporter-1.3.2.linux-amd64.rpm
vim /usr/lib/systemd/system/keepalived-exporter.service

[Unit]
Description=Keepalived Exporter
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/bin/keepalived-exporter -web.listen-address=:9713
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
TimeoutStopSec=20s
Restart=always

[Install]
WantedBy=default.target

systemctl enable keepalived-exporter.service
systemctl start keepalived-exporter.service
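
And a quick check that it serves metrics (a sketch; the port matches -web.listen-address above):

curl -s http://localhost:9713/metrics | head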

4 Prometheus

4.1 Scrape configuration

- job_name: prb-haproxy
  honor_timestamps: true
  scrape_interval: 2m
  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  follow_redirects: true
  static_configs:
  - targets:
    - 10.0.0.1:1080
    - 10.0.0.2:1080
    labels:
      cluster: gitlab
      haproxy: "yes" # quoted: a bare yes is a YAML boolean, and label values must be strings
- job_name: prb-keepalived
  honor_timestamps: true
  scrape_interval: 2m
  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  follow_redirects: true
  static_configs:
  - targets:
    - 10.0.0.1:9713
    - 10.0.0.2:9713
    labels:
      cluster: gitlab
      haproxy: "yes"
- job_name: prb-node
  honor_timestamps: true
  scrape_interval: 2m
  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  follow_redirects: true
  static_configs:
  - targets:
    - 10.0.0.1:9100
    - 10.0.0.2:9100
    labels:
      cluster: gitlab
      haproxynode: "yes"
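
Prometheus ships promtool, which can validate the configuration before a reload (a sketch; the prometheus.yml path is an assumption, adjust to your deployment):

promtool check config /etc/prometheus/prometheus.yml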

4.2 Alert rules

Adjust to your environment.

groups:
- name: HAProxy.rules
  rules:
  - alert: HAProxy down
    expr: up{haproxy="yes"}==0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: 'HAProxy down (instance {{ $labels.instance }})'
      summary: HAProxy down
  - alert: HAProxy server down
    expr: haproxy_server_status{state="UP"}==0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: 'HAProxy server is down\n SERVER = {{ $labels.server }}'
      summary: HAProxy server down (instance {{ $labels.instance }})
  - alert: HAProxy frontend down
    expr: haproxy_frontend_status{state="UP"}==0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: 'HAProxy frontend is down\n PROXY = {{ $labels.proxy }}'
      summary: HAProxy frontend down (instance {{ $labels.instance }})
  - alert: HAProxy backend down
    expr: haproxy_backend_status{state="UP"}==0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: HAProxy backend down (instance {{ $labels.instance }})
      description: 'HAProxy backend is down\n PROXY = {{ $labels.proxy }}'
  - alert: HAProxy backend max active session
    expr: ((sum by (proxy) (avg_over_time(haproxy_backend_max_sessions[5m]) * 100) / sum by (proxy) (avg_over_time(haproxy_backend_limit_sessions[5m])))) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: HAProxy backend max active session (instance {{ $labels.instance }})
      description: 'HAproxy backend {{ $labels.proxy }} is reaching session limit (> 80%).\n VALUE = {{ $value }}'
  - alert: HAProxy pending requests
    expr: sum by (proxy) (haproxy_backend_current_queue) > 0
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: HAProxy pending requests (instance {{ $labels.instance }})
      description: 'Some HAProxy requests are pending on {{ $labels.proxy }}\n VALUE = {{ $value }}'
  - alert: HAProxy server healthcheck failure
    expr: increase(haproxy_server_check_failures_total[5m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
      description: 'Some server healthchecks are failing on {{ $labels.server }}'
  - alert: HAProxy frontend security blocked requests
    expr: sum by (proxy) (rate(haproxy_frontend_requests_denied_total[5m])) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: HAProxy frontend security blocked requests (instance {{ $labels.instance }})
      description: 'HAProxy is blocking requests for security reasons\n PROXY = {{ $labels.proxy }}\n VALUE = {{ $value }}'
  - alert: HAProxy server connection errors
    expr: sum by (proxy,server) (rate(haproxy_server_connection_errors_total[5m])) > 100
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: HAProxy server connection errors (instance {{ $labels.instance }})
      description: 'Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}'
  - alert: HAProxy backend connection errors
    expr: sum by (proxy) (rate(haproxy_backend_connection_errors_total[5m])) > 100
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: HAProxy backend connection errors (instance {{ $labels.instance }})
      description: 'Too many connection errors to {{ $labels.proxy }} (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}'
  - alert: HAProxy retry high
    expr: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[5m])) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: HAProxy retry high (instance {{ $labels.instance }})
      description: 'High rate of retry on {{ $labels.proxy }}\n VALUE = {{ $value }}'
  - alert: HAProxy high HTTP 5xx error rate backend
    # ratio scaled to percent so the > 5 threshold matches the "> 5%" in the description
    expr: sum by (proxy) (rate(haproxy_backend_http_responses_total{code="5xx"}[5m])) / sum by (proxy) (rate(haproxy_backend_http_responses_total[5m])) * 100 > 5
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})
      description: 'Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }}\n VALUE = {{ $value }}'
  - alert: HAProxy high HTTP 4xx error rate backend
    expr: sum by (proxy) (rate(haproxy_backend_http_responses_total{code="4xx"}[5m])) / sum by (proxy) (rate(haproxy_backend_http_responses_total[5m])) * 100 > 5
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})
      description: 'Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }}\n VALUE = {{ $value }}'
  - alert: HAProxy HTTP slowing down
    expr: avg by (proxy) (haproxy_backend_total_time_average_seconds) > 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
      description: 'Average request time is increasing\n PROXY = {{ $labels.proxy }}\n VALUE = {{ $value }}'
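
The rules file can be validated with promtool as well (a sketch; the file path is an assumption):

promtool check rules /etc/prometheus/rules/haproxy.rules.yml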

V. Alertmanager

Alert notification configuration:

# Add under the route section
- receiver: 'gitlab-haproxy-webchat'
  continue: true
  match_re:
    alertname: HAProxy down|HAProxy server down|HAProxy frontend down|HAProxy backend down|HAProxy backend max active session|HAProxy pending requests|HAProxy server healthcheck failure|HAProxy frontend security blocked requests|HAProxy server connection errors|HAProxy backend connection errors|HAProxy retry high|HAProxy high HTTP 5xx error rate backend|HAProxy high HTTP 4xx error rate backend|HAProxy HTTP slowing down

# Add under receivers:
- name: 'gitlab-haproxy-webchat'
  wechat_configs:
  - corp_id: 'wwf948xxxxxxxxx'
    message: '{{ template "wechat.k8scorecomponentpod.message" . }}'
    api_secret: 'xxxxxxxxxxxxxxxxx'
    to_party: '2'
    agent_id: '1000000'
    send_resolved: true
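
Alertmanager ships amtool, which validates the merged configuration (a sketch; the path is an assumption):

amtool check-config /etc/alertmanager/alertmanager.yml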

1 Grafana

1.1 Import dashboard templates

Template ID: 12693

Keepalived template

1.2 Viewing the dashboards

1.2.1 HAProxy monitoring

1.2.2 Node monitoring

There are plenty of node_exporter dashboards online, so I won't share one here.

1.3 Dashboard shows no data

1.3.1 Query window too short

Prometheus scrapes infrequently here (every 2m), so a very short time range won't show anything.

1.3.2 Wrong metric

Change the query to haproxy_frontend_http_responses_total.