1 架构设计

q

2 通过 helm 部署 skywalking

2.1 下载代码

git clone https://github.com/apache/skywalking-kubernetes.git

cd skywalking-kubernetes/chart/

skywalking 目录下提供了 values-es6.yaml、values-es7.yaml 和 values-my-es.yaml 三个配置文件,分别对应使用 es6、es7 和外部 es 存储的配置。这里使用的是外部自有的 es 集群,并且 es 的版本是 7.10.0

2.2 自定义配置

在 Chart.yaml 文件的 dependencies 字段里删除 postgresql 依赖

2.2.1 使用自搭建es

cat skywalking/values-my-es.yaml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Values override for running SkyWalking against an existing (self-managed)
# Elasticsearch cluster instead of the one bundled with the chart.
oap:
  image:
    tag: 8.5.0-es7  # SkyWalking version; the suffix selects which Elasticsearch version is targeted
  storageType: elasticsearch7

ui:
  image:
    tag: 8.5.0

elasticsearch:
  enabled: false
  config:  # Elasticsearch connection settings
    host: elasticsearch-master  # address
    port:
      http: 9200  # port
2.2.2 使用集成es

执行:
helm repo add elastic https://helm.elastic.co
helm repo update
helm dep up skywalking
如果下载超时(timeout),可以手动下载依赖的 chart 包:
# Download the Elasticsearch chart by hand when `helm dep up` cannot fetch it.
# Note: if your company is strict about licensing, use an Elasticsearch release of 7.10 or earlier.
wget https://helm.elastic.co/helm/elasticsearch/elasticsearch-7.17.1.tgz
# Dependencies of the `skywalking` chart live in that chart's own charts/ directory,
# which is where `helm dep up skywalking` would have placed the archive.
mkdir -p skywalking/charts/
mv elasticsearch-7.17.1.tgz skywalking/charts/

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# Chart-wide settings.
serviceAccounts:
  oap:
# NOTE(review): nameOverride is restored as a top-level key — confirm against the chart's values.yaml.
nameOverride: skywalking  # override the chart name
imagePullSecrets: []

initContainer:
  image: busybox
  tag: '1.30'
# OAP (backend) server settings.
oap:
  name: oap
  dynamicConfigEnabled: false
  image:
    repository: skywalking.docker.scarf.sh/apache/skywalking-oap-server
    tag: 9.4.0
    pullPolicy: IfNotPresent
  ports:
    grpc: 11800
    rest: 12800
  replicas: 2
  storageType: elasticsearch
  service:
    type: ClusterIP
  javaOpts: -Xmx2g -Xms2g
  # Only schedule OAP pods onto nodes labelled oap=yes.
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
        - matchExpressions:
            - key: oap
              operator: In
              values:
                - "yes"
  antiAffinity: "soft"
  resources:
    limits:
      cpu: 2
      memory: 4Gi
    requests:
      cpu: 1
      memory: 2Gi
  envoy:
    als:
      enabled: false
      # more envoy ALS ,please refer to https://github.com/apache/skywalking/blob/master/docs/en/setup/envoy/als_setting.md#observe-service-mesh-through-als
  env:
    # more env, please refer to https://hub.docker.com/r/apache/skywalking-oap-server
    # or https://github.com/apache/skywalking-docker/blob/master/6/6.4/oap/README.md#sw_telemetry

  # Allows you to add any config files in /skywalking/config
  config: {}
    # metadata-service-mapping.yaml: |
    #   serviceName: e2e::${LABELS."service.istio.io/canonical-name"}
    #   serviceInstanceName: ${NAME}
    # oal:
    #   core.oal: |
    #     service_resp_time = from(Service.latency).longAvg();
    #     service_sla = from(Service.*).percent(status == true);
    #     service_cpm = from(Service.*).cpm();
    # log4j2.xml: |
    #   <Configuration status="DEBUG">
    #     <!-- ... -->
    #   </Configuration>


# Web UI settings; the service is exposed as a NodePort on 30080.
ui:
  name: ui
  replicas: 1
  image:
    repository: skywalking.docker.scarf.sh/apache/skywalking-ui
    tag: 9.4.0
    pullPolicy: IfNotPresent
  nodeAffinity: {}
  nodeSelector: {}
  tolerations: []
  ingress:
    enabled: false
    annotations: {}
      # kubernetes.io/ingress.class: nginx
      # kubernetes.io/tls-acme: "true"
    path: /
    hosts: []
      # - skywalking.local
    tls: []
      # - secretName: skywalking-tls
      #   hosts:
      #     - skywalking.local
  service:
    type: NodePort
    # clusterIP: None
    externalPort: 80
    internalPort: 8080
    nodePort: 30080

    annotations: {}

# SkyWalking Satellite settings.
satellite:
  name: satellite
  replicas: 2
  enabled: true
  image:
    repository: skywalking.docker.scarf.sh/apache/skywalking-satellite
    tag: v1.1.0
    pullPolicy: IfNotPresent
  ports:
    grpc: 11800
    prometheus: 1234
  service:
    type: ClusterIP
  antiAffinity: "soft"
  nodeAffinity: {}
  nodeSelector: {}
  tolerations: []
  resources:
    limits:
      cpu: 2
      memory: 4Gi
    requests:
      cpu: 1
      memory: 2Gi
  podAnnotations:
  env:
  config: {}

# Scheduling settings for the oap-init job.
oapInit:
  nodeAffinity: {}
  nodeSelector: {}
  tolerations: []

# Bundled Elasticsearch (elastic/elasticsearch sub-chart) settings.
elasticsearch:
  enabled: true
  config:
    port:
      http: 9200
    host: elasticsearch
  image: docker.elastic.co/elasticsearch/elasticsearch
  imageTag: 7.17.1
  clusterName: "elasticsearch"
  nodeGroup: "master"
  roles:
    master: "true"
    ingest: "true"
    data: "true"
  replicas: 3
  minimumMasterNodes: 2
  esConfig: {}
  #  elasticsearch.yml: |
  #    key:
  #      nestedkey: value
  #  log4j2.properties: |
  #    key = value

  esJavaOpts: "-Xmx4g -Xms4g"  # half of the memory limit; must not exceed 31g

  resources:
    requests:
      cpu: "2"
      memory: "4Gi"
    limits:
      cpu: "4"
      memory: "8Gi"

  # Only schedule Elasticsearch pods onto nodes labelled elasticsearch=yes.
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
        - matchExpressions:
            - key: elasticsearch
              operator: In
              values:
                - "yes"
  networkHost: "0.0.0.0"
  persistence:
    enabled: true  # enable persistence
    annotations: {}
  volumeClaimTemplate:
    accessModes: ["ReadWriteOnce"]
    storageClassName: csi-cephfs-sc  # storage class to use
    resources:
      requests:
        storage: 30Gi


  priorityClassName: ""
  service:
    labels: {}
    labelsHeadless: {}
    type: NodePort
    nodePort: "30090"
    annotations: {}
    httpPortName: http
    transportPortName: transport

  ingress:
    enabled: false
    annotations: {}
    # kubernetes.io/ingress.class: nginx
    # kubernetes.io/tls-acme: "true"
    path: /
    hosts:
      - chart-example.local
    tls: []
    #  - secretName: chart-example-tls
    #    hosts:
    #      - chart-example.local

  initResources: {}
  sidecarResources: {}
  rbac:
    create: false
    serviceAccountName: ""
  podSecurityPolicy:
    create: false
  extraVolumes: ""
  extraVolumeMounts: ""
  extraInitContainers: ""
  antiAffinityTopologyKey: "kubernetes.io/hostname"
  antiAffinity: "hard"
  podManagementPolicy: "Parallel"
  protocol: http
  httpPort: 9200
  transportPort: 9300
  updateStrategy: RollingUpdate
  maxUnavailable: 1
  podSecurityContext:
    fsGroup: 1000
    runAsUser: 1000
  # NOTE(review): restored as a sibling of podSecurityContext (not a duplicate key inside it) — confirm against the sub-chart's values.yaml.
  fsGroup: ""
  securityContext:
    capabilities:
      drop:
        - ALL
    runAsNonRoot: true
    runAsUser: 1000
  terminationGracePeriod: 120
  sysctlVmMaxMapCount: 262144
  readinessProbe:
    failureThreshold: 3
    initialDelaySeconds: 10
    periodSeconds: 10
    successThreshold: 3
    timeoutSeconds: 5
  clusterHealthCheckParams: "wait_for_status=green&timeout=1s"
  schedulerName: ""
  imagePullSecrets: []
  nodeSelector: {}
  tolerations: []
  nameOverride: ""
  fullnameOverride: ""
  masterTerminationFix: false
  lifecycle: {}
  sysctlInitContainer:
    enabled: true
  keystore: []

postgresql:
  enabled: false  # Whether to start a demo postgresql deployment, don't use this for production.

2.3 安装 skywalking

  • helm install 名称 chart 名称 命名空间 配置文件

-n 指定命名空间

# helm install <release name> <chart path> -n <namespace> -f <values file>
# The chart path is required; it is the `skywalking` chart directory that also holds the values file.
helm install skywalking ./skywalking -n skywalking -f ./skywalking/values-my-es.yaml

2.3.1 使用集成es执行oap-init可能会卡住

卡住原因是因为索引没创建,查看es索引

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# List the Elasticsearch indices. The URL is quoted so the shell does not treat
# `&` as the background operator and drop the `pretty` parameter.
curl 'http://10.1.1.1:30090/_cat/indices?v&pretty'
green open .geoip_databases rIefG_beT3iM2FTLd4cKQg 1 1 42 0 80.1mb 40mb
green open sw_ui_template 8vcPxuCdR9ueTH3VwOzDlQ 1 1 68 0 722.3kb 361.1kb
green open sw_metrics-all-20230508 Ix5m0unFRzq5fzlHusNjXw 1 1 0 0 452b 226b
green open sw_log-20230508 PGpRis3PTDqRuyUqivYY-g 5 0 0 0 1.1kb 1.1kb
green open sw_zipkin_span-20230508 SM2c8nINT1KStO7fDoY8YQ 5 0 0 0 1.1kb 1.1kb
green open sw_segment-20230508 z0gHHJL0Sdq7ZcOp8exxYA 5 0 0 0 1.1kb 1.1kb
green open sw_browser_error_log-20230508 GVBmufyzSHGiQdoo9r0utQ 5 0 0 0 1.1kb 1.1kb
green open sw_records-all-20230508 0vLuFnj2Ti6sl9eFRZ9Oqw 1 1 0 0 452b 226b

# Delete the existing oap-init job, then run a helm upgrade against the chart in the
# current directory (presumably this re-creates the job — see the pod listing below).
kubectl delete job -n skywalking skywalking-oap-init
helm upgrade skywalking -n skywalking .
# Check the status of the init job
kubectl get pod -n skywalking |grep init
skywalking-oap-init-hghvm 0/1 Completed 0 3m15s
# If oap-init finished without any errors, run the commands below
# kubectl rollout restart deployment -n skywalking skywalking-oap
# Scale the OAP deployment down to 0 replicas and back up to 2.
kubectl scale deployment -n skywalking skywalking-oap --replicas=0
kubectl scale deployment -n skywalking skywalking-oap --replicas=2

2.4 查看

安装成功后可以使用下面的命令查看安装的 chart,安装后的 chart 叫做 skywalking:

1
2
helm list -n skywalking
skywalking ops-prod 1 2021-04-01 11:22:32.52065111 +0800 CST deployed skywalking-4.0.01

安装完成以后,可以通过下面的命令查看 pod 是否正常启动:

1
2
3
4
5
6
7
8
9
10
11
kubectl get pod -n skywalking
NAME READY STATUS RESTARTS AGE
elasticsearch-master-0 1/1 Running 0 39m
elasticsearch-master-1 1/1 Running 0 39m
elasticsearch-master-2 1/1 Running 0 39m
skywalking-oap-68d4f8cdf9-29mlr 1/1 Running 0 98s
skywalking-oap-68d4f8cdf9-fwdkt 1/1 Running 0 98s
skywalking-oap-init-mwwcd 0/1 Completed 0 4m58s # 这个是初始化任务
skywalking-satellite-7cf75c49f9-4jp9l 1/1 Running 0 39m
skywalking-satellite-7cf75c49f9-f775q 1/1 Running 0 39m
skywalking-ui-87c7f6f48-qkc69 1/1 Running 0 39m

3 应用服务接入 skywalking

  • 直接把从官网下载的 agent 安装包放到基础镜像的/opt/skywalking-agent 目录里(我目前在用的)
  • 使用 sidecar 将 pod 接入链路追踪(网上的方法,感觉不方便就没用)

3.1 配置 agent

1
2
3
4
5
6
7
# Agent settings. Comments are kept on their own lines: in Java properties format a
# `#` only starts a comment at the beginning of a line, so a trailing `#...` would
# become part of the value.

# Project name
agent.namespace=${SW_AGENT_NAMESPACE:default-namespace}
# Application name
agent.service_name=${APPLICATION_NAME}
# Instance (node) name, usually application name + version number @ hostname
# NOTE(review): the helm command later in this article sets SW_AGENT_NAME — confirm which variable name is intended.
agent.instance_name=${SW_AGENT_NAM}@${HOSTNAME}
# SkyWalking backend address
collector.backend_service=${SW_AGENT_COLLECTOR_BACKEND_SERVICES:127.0.0.1:11800}
logging.file_name=${SW_LOGGING_FILE_NAME:skywalking-api.log}
logging.level=${SW_LOGGING_LEVEL:INFO}
plugin.kafka.bootstrap_servers=${SW_KAFKA_BOOTSTRAP_SERVERS:localhost:9092}

3.2 cd 的时候 Jenkins 通过 helm 使用环境变量注入值

这里我做了个 chart 包,Jenkins 生成配置的时候通过变量注入配置,后面的文章会讲怎么写 helm chart
主要参数

变量名 变量值 解释
JAVA_AGENT -javaagent:/opt/skywalking-agent/skywalking-agent.jar agent 路径
APPLICATION_NAME account-test 服务名
SW_AGENT_NAM account-test-19@account-test-6b4df8b44d-h6rz9 实例名

helm ${action} ${APP_NAME}-${BRANCH_NAME} --version 0.0.1 rongxin-java8/rongxin-java8 --set SW_AGENT_NAME=${APP_NAME}-${BRANCH_NAME}${BUILD_NUMBER},SW_AGENT_NAMESPACE=${BRANCH_NAME} -n ${PROJECT}-${BRANCH_NAME}

部署的时候变成:

helm upgrade rongbao-gateway-dev --version 0.0.1 rongxin-java8/rongxin-java8 --set 'SW_AGENT_NAME=rongbao-gateway-dev251,SW_AGENT_NAMESPACE=finance' -n rongbao-dev

4 配置 Apollo 动态获取配置

4.1 修改 skywalking-configmap

1
2
3
4
5
6
7
8
# Dynamic-configuration settings: the selector picks the `apollo` provider,
# whose connection parameters are nested under the `apollo` key.
configuration:
  selector: ${SW_CONFIGURATION:apollo}
  apollo:
    apolloMeta: ${SW_CONFIG_APOLLO:http://apollo-prod-apollo-configservice:8080}
    apolloCluster: ${SW_CONFIG_APOLLO_CLUSTER:default}
    apolloEnv: ${SW_CONFIG_APOLLO_ENV:"PROD"}
    appId: ${SW_CONFIG_APOLLO_APP_ID:skywalking}
    period: ${SW_CONFIG_APOLLO_PERIOD:5}

4.2 通过 Apollo 配置告警

4.2.1 Apollo 里创建项目

注意 appid 要与 skywalking-configmap 配置一样

image-20220425140832736

4.2.2 配置 Apollo

key 填写 alarm.default.alarm-settings

如果不想用 apollo,那就用 configmap,配置与 Apollo 唯一区别是 key 是 alarm-settings.yml,并且挂载到/skywalking/config/alarm-settings.yml

image-20220425140912979

  • value:其中 secret 为钉钉机器人加密字段

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    # Alarm rules plus the DingTalk webhook that receives the alarm messages.
    rules:
      # Rule unique name, must be ended with `_rule`.
      service_resp_time_rule:
        metrics-name: service_resp_time
        op: ">"
        threshold: 8000
        period: 10
        count: 3
        silence-period: 5
        message: 最近3分钟内服务 {name} 的平均响应时间超过8秒
      service_sla_rule:
        # Metrics value need to be long, double or int
        metrics-name: service_sla
        op: "<"
        threshold: 8000
        # The length of time to evaluate the metrics
        period: 10
        # How many times after the metrics match the condition, will trigger alarm
        count: 2
        # How many times of checks, the alarm keeps silence after alarm triggered, default as same as period.
        silence-period: 3
        message: 最近2分钟内服务 {name} 的成功率低于80%
      service_resp_time_percentile_rule:
        # Metrics value need to be long, double or int
        metrics-name: service_percentile
        op: ">"
        threshold: 8000,8000,8000,8000,8000
        period: 10
        count: 3
        silence-period: 5
        message: 最近3分钟的服务 {name} 的响应时间百分比超过8秒
      service_instance_resp_time_rule:
        metrics-name: service_instance_resp_time
        op: ">"
        threshold: 8000
        period: 10
        count: 2
        silence-period: 5
        message: 最近2分钟内服务实例 {name} 的平均响应时间超过8秒
      database_access_resp_time_rule:
        metrics-name: database_access_resp_time
        threshold: 1000
        op: ">"
        period: 10
        count: 2
        # message: Response time of database access {name} is more than 1000ms in 2 minutes of last 10 minutes
        message: 最近2分钟内数据库访问 {name} 的平均响应时间超过1秒
      endpoint_relation_resp_time_rule:
        metrics-name: endpoint_relation_resp_time
        threshold: 5000
        op: ">"
        period: 10
        count: 2
        message: 最近2分钟内端点 {name} 的平均响应时间超过5秒
    dingtalkHooks:
      # JSON body sent to DingTalk; %s is the placeholder for the alarm message.
      textTemplate: |-
        {
          "msgtype": "text",
          "text": {
            "content": "生产环境 SkyWalking 链路追踪告警: \n %s."
          }
        }
      webhooks:
        - url: https://oapi.dingtalk.com/robot/send?access_token=cea09f09c3cfd3c7ed74d257413b73c0a15e5a69a56376786680e505XXX
          secret: SECab306b03fcdfa5e4c8643ed7712056fbb7c6006ee80ea1dcf1b6a0XXX

4.2.3 查看钉钉告警

image-20211008164251111

5 Kubernetes 监控

5.1 kube-state-metric

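先检查集群里是否已经安装 kube-state-metrics(例如执行 kubectl get pods -A | grep kube-state-metrics;注意 kubectl top node 依赖的是 metrics-server,不能用来判断 kube-state-metrics 是否安装)。如果没有安装,执行以下命令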

1
2
helm repo add bitnami https://charts.bitnami.com/bitnami
helm install kube-state-metrics bitnami/kube-state-metrics -n kube-system

6 OpenTelemetry Collector

1
2
https://github.com/apache/skywalking-showcase/tree/main/deploy/platform/kubernetes/feature-kubernetes-monitor
kubectl apply -f .

总结

这篇文章简单介绍了如何使用 helm 部署 skywalking、应用服务如何接入 skywalking,以及如何通过 Apollo 动态配置 skywalking 的告警规则。关于 helm 的使用以及如何自定义 chart,会在后面的文章中介绍。