I. OS Optimization

Note: upgrade the kernel to 4.11 or later; older kernels lack the ip_unprivileged_port_start sysctl file.

cat << \EOF >> /etc/sysctl.conf
net.ipv4.tcp_tw_reuse = 1
# tcp_tw_recycle was removed in kernel 4.12; drop this line on newer kernels or sysctl -p will complain
net.ipv4.tcp_tw_recycle = 0
net.ipv4.ip_local_port_range = 1024 65024
net.ipv4.tcp_max_syn_backlog = 100000
net.core.netdev_max_backlog = 100000
net.ipv4.tcp_fin_timeout = 10
fs.file-max = 500000
EOF
# Apply immediately
sysctl -p

echo '* - nofile 500000' >> /etc/security/limits.conf
# Enable IP forwarding (this echo lasts until reboot; add net.ipv4.ip_forward = 1 to /etc/sysctl.conf to persist it)
echo 1 > /proc/sys/net/ipv4/ip_forward
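
A quick way to verify the kernel and the new limits (a minimal sketch; net.ipv4.ip_unprivileged_port_start only exists on kernels 4.11+, which is why the upgrade matters):

# Present only on kernels >= 4.11
sysctl net.ipv4.ip_unprivileged_port_start
# Confirm the tuning took effect
sysctl net.ipv4.ip_local_port_range
# Re-login first; should print 500000
ulimit -n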

II. Installing HAProxy

1 Prepare the configuration file

Logs can use facilities local0 through local7 (facility codes 16 through 23, which syslog reserves for local use). The severity levels are:

Level    Code  Description
emerg    0     System is unusable
alert    1     Action must be taken immediately
crit     2     Critical conditions
err      3     Error conditions
warning  4     Warning conditions
notice   5     Normal but significant condition
info     6     Informational messages
debug    7     Debug-level messages

vi /etc/haproxy.cfg

global
    log /dev/log local0 warning
    maxconn 10000 # backends are sized off this value (roughly maxconn/10 each); in practice exceeding that per backend proved harmless as long as total connections stay under 10,000

defaults
    log global
    option httplog
    option dontlognull
    timeout http-request 10s
    timeout connect 10s
    timeout client 1m
    timeout server 1m
    timeout http-keep-alive 10s
    timeout check 10s

# TCP pass-through (TLS terminates on the GitLab nodes)
frontend gitlab-rails-https
    bind *:443
    log 127.0.0.1 local0 err
    mode tcp
    option tcplog
    default_backend gitlab-rails-https
backend gitlab-rails-https
    mode tcp
    balance roundrobin
    # send-proxy passes the client IP via the PROXY protocol (option forwardfor is HTTP-only and has no effect in tcp mode)
    server Gitlab-Ha-Prod-Rails-10-1-1-1 10.1.1.1:443 send-proxy check inter 5s rise 2 fall 3
    server Gitlab-Ha-Prod-Rails-10-1-1-2 10.1.1.2:443 send-proxy check inter 5s rise 2 fall 3
    server Gitlab-Ha-Prod-Rails-10-1-1-3 10.1.1.3:443 send-proxy check inter 5s rise 2 fall 3

# HTTP proxying
frontend gitlab-rails-http
    bind *:80
    log 127.0.0.1 local0 err
    mode http
    option httplog
    option forwardfor # pass the client IP in X-Forwarded-For
    default_backend gitlab-rails-http
backend gitlab-rails-http
    mode http # must match the frontend mode, or HAProxy refuses to start
    balance roundrobin
    server Gitlab-Ha-Prod-Rails-10-1-1-1 10.1.1.1:80 check inter 5s rise 2 fall 3
    server Gitlab-Ha-Prod-Rails-10-1-1-2 10.1.1.2:80 check inter 5s rise 2 fall 3
    server Gitlab-Ha-Prod-Rails-10-1-1-3 10.1.1.3:80 check inter 5s rise 2 fall 3

frontend gitlab-rails-ssh
    bind *:22
    log 127.0.0.1 local0 err
    mode tcp
    option tcplog
    default_backend gitlab-rails-ssh
backend gitlab-rails-ssh
    mode tcp
    balance roundrobin
    # client IP passed via the PROXY protocol; the sshd on port 2222 must expect it
    server Gitlab-Ha-Prod-Rails-10-1-1-1 10.1.1.1:2222 send-proxy check inter 5s rise 2 fall 3
    server Gitlab-Ha-Prod-Rails-10-1-1-2 10.1.1.2:2222 send-proxy check inter 5s rise 2 fall 3
    server Gitlab-Ha-Prod-Rails-10-1-1-3 10.1.1.3:2222 send-proxy check inter 5s rise 2 fall 3

listen stats # console at ip:1080/stats
    mode http
    log 127.0.0.1 local0 err
    http-request use-service prometheus-exporter if { path /metrics }
    stats realm "Haproxy Manager"
    bind 0.0.0.0:1080
    stats enable
    stats hide-version
    stats uri /stats
    stats refresh 30s
    stats auth admin:admin # username:password
    stats admin if TRUE
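
Before wiring the config into systemd, it can be syntax-checked with the same container image (a sketch; haproxy -c only validates the file and exits):

docker run --rm -v /etc/haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg haproxy:2.6.7 haproxy -c -f /usr/local/etc/haproxy/haproxy.cfg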

2 Configure the systemd unit

Running the container with --privileged=true did not work; switching to --user=root did.

vi /usr/lib/systemd/system/haproxy.service

[Unit]
Description=haproxy Container
Requires=docker.service
After=docker.service

[Service]
Restart=always
RestartSec=5
StartLimitInterval=0
ExecStartPre=-/bin/docker stop haproxy
ExecStartPre=-/bin/docker rm -f haproxy
ExecStart=/bin/docker run --rm \
--user=root \
--name haproxy \
--net=host \
-v /etc/haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg \
-v /etc/localtime:/etc/localtime \
haproxy:2.6.7
ExecReload=/bin/docker kill -s HUP haproxy
ExecStop=/bin/docker stop haproxy
ExecStopPost=/bin/docker rm -f haproxy

[Install]
WantedBy=multi-user.target

3 Configure logging

# Upgrade rsyslog first, otherwise it errors out
cd /etc/yum.repos.d/
wget http://rpms.adiscon.com/v8-stable/rsyslog.repo # for CentOS 7,8,9
wget http://rpms.adiscon.com/v8-stable-daily/rsyslog-daily.repo # for CentOS 7,8,9
yum install rsyslog

# Create the log directory
mkdir -p /data/log/haproxy_log

# Enable remote log reception over UDP
sed -i 's/\#$ModLoad imudp/$ModLoad imudp/' /etc/rsyslog.conf
sed -i 's/\*.emerg/\# *.emerg/' /etc/rsyslog.conf
sed -i 's/\#$UDPServerRun 514/$UDPServerRun 514/' /etc/rsyslog.conf
# The facility must match the one used in haproxy.cfg (local0 above)
echo "local0.* /data/log/haproxy_log/haproxy.log" >> /etc/rsyslog.conf
systemctl restart rsyslog
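
Before starting HAProxy, the rsyslog wiring can be tested by hand (a minimal sketch; logger ships with util-linux):

# Inject a test message at facility local0 and confirm it lands in the file
logger -p local0.err "haproxy rsyslog test"
tail -n 1 /data/log/haproxy_log/haproxy.log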

4 Run and verify

systemctl daemon-reload
systemctl restart haproxy
systemctl status haproxy
systemctl enable haproxy
# Check the log
ls /data/log/haproxy_log/
haproxy.log

5 Log collection

5.1 Log rotation

5.1.1 logrotate configuration

cat << \EOF > /data/log/haproxy_log/haproxy
/data/log/haproxy_log/*.log
{
    rotate 3
    missingok
    compress
    delaycompress
    notifempty
    sharedscripts
    postrotate
        kill -s HUP $(/sbin/pidof rsyslogd)
        systemctl reload haproxy.service
    endscript
}
EOF
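
The rotation config can be dry-run before handing it to cron (a sketch; logrotate -d is debug mode and changes nothing on disk):

/sbin/logrotate -d /data/log/haproxy_log/haproxy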

5.1.2 Add a crontab entry

crontab -e
0 0 * * * /sbin/logrotate -vf /data/log/haproxy_log/haproxy

5.2 Filebeat

5.2.1 Installation

I picked version 7 here; adjust as needed.

sudo rpm --import https://packages.elastic.co/GPG-KEY-elasticsearch

cat << EOF > /etc/yum.repos.d/elastic.repo
[elastic-7.x]
name=Elastic repository for 7.x packages
baseurl=https://artifacts.elastic.co/packages/7.x/yum
gpgcheck=1
gpgkey=https://artifacts.elastic.co/GPG-KEY-elasticsearch
enabled=1
autorefresh=1
type=rpm-md
EOF
yum install filebeat -y
systemctl enable filebeat

5.2.2 Configuration

Logs are shipped to Kafka.

vim /etc/filebeat/filebeat.yml

filebeat.inputs:
- type: log
  enabled: true
  paths:
    - /data/log/haproxy_log/haproxy.log
  ignore_older: 5m
  fields_under_root: true # declare these keys only once; duplicate keys are invalid YAML
  fields:
    x_module: gitlab_haproxy_logs
    x_type: haproxylogs
    log_topic: gitlab-haproxy-log
    document_type: gitlab-haproxy-logs
    x_project: gitlab
    x_env: prod
    x_host: ${HOSTNAME}
output.kafka:
  hosts: ["10.1.2.1:9091"]
  topic: '%{[log_topic]}'
  partition.round_robin:
    reachable_only: false
  required_acks: 1
  compression: gzip
  max_message_bytes: 1000000
  version: 0.10.2 # must match the Kafka broker version
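
Filebeat can check both the config syntax and the Kafka connection before restarting (a sketch using Filebeat's built-in test subcommands):

filebeat test config
filebeat test output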

5.2.3 Start and verify

systemctl restart filebeat

The topic now shows up on the Kafka side.
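
One way to confirm from the Kafka side (a sketch assuming the stock Kafka CLI tools on a broker host; very old 0.10.x tooling takes --zookeeper instead of --bootstrap-server):

kafka-topics.sh --bootstrap-server 10.1.2.1:9091 --list | grep gitlab-haproxy-log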

III. Installing Keepalived

1 Install gcc

yum -y install gcc gcc-c++ autoconf automake make
yum -y install zlib zlib-devel openssl openssl-devel pcre pcre-devel

2 Build and install

  • --prefix sets the install location
  • /usr/local/keepalived/ is the resulting install directory
  • --sysconf sets the configuration file directory
# Fetch the source
wget https://www.keepalived.org/software/keepalived-2.2.8.tar.gz
tar -xvf keepalived-2.2.8.tar.gz
cd keepalived-2.2.8
# Configure the build
./configure --prefix=/usr/local/keepalived/ --sysconf=/etc
# Build and install
make && make install
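
A quick sanity check that the build landed where expected (a sketch; with this --prefix the binary sits under /usr/local/keepalived/sbin, and make install also installs a keepalived.service unit when configure detects systemd, which is what the systemctl commands below rely on):

/usr/local/keepalived/sbin/keepalived --version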

3 Configure keepalived

Keepalived must be installed on both machines; the configuration differs only slightly between them.

  • router_id: use the hostname
  • interface: the NIC to bind VRRP to
  • unicast_src_ip: the local IP
  • unicast_peer: the peer IP
  • priority + weight must end up lower than the other node's priority

The same pattern works for failing over nginx.

vim /etc/keepalived/keepalived.conf

global_defs {
    #notification_email {
    #    test@qq.com # addresses keepalived emails on failover, one per line
    #    test@163.com
    #}
    # notification_email_from xxx@163.com # sender address
    # smtp_server smtp.163.com # SMTP server address
    # smtp_connect_timeout 30 # SMTP connection timeout
    router_id LVS_1 # string identifying this node, usually the hostname (not required to be); used in email notifications
    vrrp_skip_check_adv_addr
    vrrp_garp_interval 5
    vrrp_gna_interval 5
}

vrrp_script chk_haproxy {
    script "/usr/bin/killall -0 haproxy" # check the process
    # script "</dev/tcp/127.0.0.1/80" # or check the port
    interval 2
    weight -20
}

vrrp_instance haproxy-vip {
    state BACKUP # both nodes set to BACKUP, aiming for non-preemptive behavior so a recovered former master does not reclaim the VIP while data is still out of sync
    priority 100 # when the check fails, priority + weight must drop below the peer's priority for the VIP to move: here 100 + (-20) = 80, so set the peer's priority above 80 (but below 100)
    interface eth0 # NIC carrying the node's own (non-VIP) address, used for VRRP heartbeats
    virtual_router_id 60 # virtual router ID, 0-255; distinguishes VRRP groups on the same segment, must not collide there, and must match on both nodes
    advert_int 1 # advertisement interval, default 1s; effectively the master-election/health-check interval
    authentication {
        auth_type PASS # PASS authentication (the default)
        auth_pass qweasd # PASS password
    }
    unicast_src_ip 10.0.0.1 # local IP
    unicast_peer { # unicast mode, helps against split brain
        10.0.0.2 # peer IP
    }

    virtual_ipaddress {
        10.0.0.3/24 # the VIP; multiple addresses are allowed
    }

    track_script {
        chk_haproxy
    }
}

# Test the configuration file
keepalived -t -f /etc/keepalived/keepalived.conf

4 Start and verify

systemctl daemon-reload
systemctl restart keepalived
systemctl status keepalived
systemctl enable keepalived
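
Once both nodes are running, the VIP should sit on exactly one of them (a sketch; eth0 and 10.0.0.3 match the config above):

# Prints the VIP on the master; prints nothing on the backup
ip addr show eth0 | grep 10.0.0.3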

IV. Monitoring and Alerting

Add the following to haproxy.cfg; it is already included in the full config above.

1 HAProxy

listen stats
    mode http
    log 127.0.0.1 local0 err
    http-request use-service prometheus-exporter if { path /metrics }
    stats realm "Haproxy Manager"
    bind 0.0.0.0:1080
    stats enable
    stats hide-version
    stats uri /stats
    stats refresh 30s
    stats auth admin:admin
    stats admin if TRUE
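
A manual scrape confirms the exporter responds (a quick sketch; in this layout the /metrics path is handled by HAProxy's built-in prometheus-exporter service and is typically not gated by the stats auth):

curl -s http://10.0.0.1:1080/metrics | head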

2 node_exporter

2.1 Configure the systemd unit

vi /usr/lib/systemd/system/node_exporter.service

[Unit]
Description=node_exporter Container
Requires=docker.service
After=docker.service

[Service]
Restart=always
RestartSec=5
StartLimitInterval=0
ExecStartPre=-/bin/docker stop node_exporter
ExecStartPre=-/bin/docker rm -f node_exporter
ExecStart=/bin/docker run --rm \
--name node_exporter \
--net="host" \
--pid="host" \
-v "/:/host:ro,rslave" \
quay.io/prometheus/node-exporter:latest \
--path.rootfs=/host
ExecStop=/bin/docker stop node_exporter
ExecStopPost=/bin/docker rm -f node_exporter

[Install]
WantedBy=multi-user.target

2.2 Start

systemctl start node_exporter.service
systemctl enable node_exporter.service

3 keepalived-exporter

Download the latest release package from the project's releases page.

rpm -ivh keepalived-exporter-1.3.2.linux-amd64.rpm
vim /usr/lib/systemd/system/keepalived-exporter.service

[Unit]
Description=Keepalived Exporter
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/bin/keepalived-exporter -web.listen-address=:9713
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
TimeoutStopSec=20s
Restart=always

[Install]
WantedBy=default.target

systemctl enable keepalived-exporter.service
systemctl start keepalived-exporter.service
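
And a quick check that it serves metrics (a sketch; the port matches -web.listen-address above):

curl -s http://localhost:9713/metrics | head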

4 Prometheus

4.1 Scrape configuration

- job_name: prb-haproxy
  honor_timestamps: true
  scrape_interval: 2m
  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  follow_redirects: true
  static_configs:
  - targets:
    - 10.0.0.1:1080
    - 10.0.0.2:1080
    labels:
      cluster: gitlab
      haproxy: "yes" # quoted: a bare yes is a YAML boolean, and label values must be strings
- job_name: prb-keepalived
  honor_timestamps: true
  scrape_interval: 2m
  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  follow_redirects: true
  static_configs:
  - targets:
    - 10.0.0.1:9713
    - 10.0.0.2:9713
    labels:
      cluster: gitlab
      haproxy: "yes"
- job_name: prb-node
  honor_timestamps: true
  scrape_interval: 2m
  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  follow_redirects: true
  static_configs:
  - targets:
    - 10.0.0.1:9100
    - 10.0.0.2:9100
    labels:
      cluster: gitlab
      haproxynode: "yes"
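
Prometheus ships promtool, which can validate the configuration before a reload (a sketch; the prometheus.yml path is an assumption, adjust to your deployment):

promtool check config /etc/prometheus/prometheus.yml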

4.2 Alert rules

Adjust to your environment.

groups:
- name: HAProxy.rules
  rules:
  - alert: HAProxy down
    expr: up{haproxy="yes"}==0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: 'HAProxy down (instance {{ $labels.instance }})'
      summary: HAProxy down
  - alert: HAProxy server down
    expr: haproxy_server_status{state="UP"}==0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: 'HAProxy server is down\n SERVER = {{ $labels.server }}'
      summary: HAProxy server down (instance {{ $labels.instance }})
  - alert: HAProxy frontend down
    expr: haproxy_frontend_status{state="UP"}==0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: 'HAProxy frontend is down\n PROXY = {{ $labels.proxy }}'
      summary: HAProxy frontend down (instance {{ $labels.instance }})
  - alert: HAProxy backend down
    expr: haproxy_backend_status{state="UP"}==0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: HAProxy backend down (instance {{ $labels.instance }})
      description: 'HAProxy backend is down\n PROXY = {{ $labels.proxy }}'
  - alert: HAProxy backend max active session
    expr: ((sum by (proxy) (avg_over_time(haproxy_backend_max_sessions[5m]) * 100) / sum by (proxy) (avg_over_time(haproxy_backend_limit_sessions[5m])))) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: HAProxy backend max active session (instance {{ $labels.instance }})
      description: 'HAproxy backend {{ $labels.proxy }} is reaching session limit (> 80%).\n VALUE = {{ $value }}'
  - alert: HAProxy pending requests
    expr: sum by (proxy) (haproxy_backend_current_queue) > 0
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: HAProxy pending requests (instance {{ $labels.instance }})
      description: 'Some HAProxy requests are pending on {{ $labels.proxy }}\n VALUE = {{ $value }}'
  - alert: HAProxy server healthcheck failure
    expr: increase(haproxy_server_check_failures_total[5m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
      description: 'Some server healthchecks are failing on {{ $labels.server }}'
  - alert: HAProxy frontend security blocked requests
    expr: sum by (proxy) (rate(haproxy_frontend_requests_denied_total[5m])) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: HAProxy frontend security blocked requests (instance {{ $labels.instance }})
      description: 'HAProxy is blocking requests for security reasons\n PROXY = {{ $labels.proxy }}\n VALUE = {{ $value }}'
  - alert: HAProxy server connection errors
    expr: sum by (proxy,server) (rate(haproxy_server_connection_errors_total[5m])) > 100
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: HAProxy server connection errors (instance {{ $labels.instance }})
      description: 'Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}'
  - alert: HAProxy backend connection errors
    expr: sum by (proxy) (rate(haproxy_backend_connection_errors_total[5m])) > 100
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: HAProxy backend connection errors (instance {{ $labels.instance }})
      description: 'Too many connection errors to {{ $labels.proxy }} (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}'
  - alert: HAProxy retry high
    expr: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[5m])) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: HAProxy retry high (instance {{ $labels.instance }})
      description: 'High rate of retry on {{ $labels.proxy }}\n VALUE = {{ $value }}'
  - alert: HAProxy high HTTP 5xx error rate backend
    # ratio scaled to percent so the > 5 threshold matches the "> 5%" in the description
    expr: sum by (proxy) (rate(haproxy_backend_http_responses_total{code="5xx"}[5m])) / sum by (proxy) (rate(haproxy_backend_http_responses_total[5m])) * 100 > 5
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})
      description: 'Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }}\n VALUE = {{ $value }}'
  - alert: HAProxy high HTTP 4xx error rate backend
    expr: sum by (proxy) (rate(haproxy_backend_http_responses_total{code="4xx"}[5m])) / sum by (proxy) (rate(haproxy_backend_http_responses_total[5m])) * 100 > 5
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})
      description: 'Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }}\n VALUE = {{ $value }}'
  - alert: HAProxy HTTP slowing down
    expr: avg by (proxy) (haproxy_backend_total_time_average_seconds) > 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
      description: 'Average request time is increasing\n PROXY = {{ $labels.proxy }}\n VALUE = {{ $value }}'
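
The rules file can be validated with promtool as well (a sketch; the file path is an assumption):

promtool check rules /etc/prometheus/rules/haproxy.rules.yml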

V. Alertmanager

Alert notification configuration:

# Add under the route section
- receiver: 'gitlab-haproxy-webchat'
  continue: true
  match_re:
    alertname: HAProxy down|HAProxy server down|HAProxy frontend down|HAProxy backend down|HAProxy backend max active session|HAProxy pending requests|HAProxy server healthcheck failure|HAProxy frontend security blocked requests|HAProxy server connection errors|HAProxy backend connection errors|HAProxy retry high|HAProxy high HTTP 5xx error rate backend|HAProxy high HTTP 4xx error rate backend|HAProxy HTTP slowing down

# Add under receivers:
- name: 'gitlab-haproxy-webchat'
  wechat_configs:
  - corp_id: 'wwf948xxxxxxxxx'
    message: '{{ template "wechat.k8scorecomponentpod.message" . }}'
    api_secret: 'xxxxxxxxxxxxxxxxx'
    to_party: '2'
    agent_id: '1000000'
    send_resolved: true
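
Alertmanager ships amtool, which validates the merged configuration (a sketch; the path is an assumption):

amtool check-config /etc/alertmanager/alertmanager.yml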

1 Grafana

1.1 Import dashboard templates

Template ID: 12693

Keepalived template

1.2 Viewing the dashboards

1.2.1 HAProxy monitoring

1.2.2 Node monitoring

There are plenty of node_exporter dashboards online, so I won't share one here.

1.3 Dashboard shows no data

1.3.1 Query window too short

Prometheus scrapes infrequently here (every 2m), so a very short time range won't show anything.

1.3.2 Wrong metric

Change the query to haproxy_frontend_http_responses_total.