2.prometheus與alertmanager報警實現、haproxy exporter使用

prometheus采集cadvisor數據:
添加cadvisor
root@master:~# vim /usr/local/prometheus/prometheus.yml
  - job_name: 'prometheus--cadvisor'
    static_configs:
    - targets: ['192.168.200.206:8080','192.168.200.207:8080']
重啟prometheus:
root@master:~# systemctl restart prometheus
導入鏡像
root@master:~# docker load -i cadvisor_v0.33.0.tar.gz 
打標簽
root@master:~# docker tag gcr.io/google-containers/cadvisor:v0.33.0 harbor.wyh.net/baseimages/cadvisor:v0.33.0

上傳鏡像
root@master:~# docker push harbor.wyh.net/baseimages/cadvisor:v0.33.0
啟動cadvisor容器:
docker run \
--volume=/:/rootfs:ro \
--volume=/var/run:/var/run:rw \
--volume=/sys:/sys:ro \
--volume=/var/lib/docker/:/var/lib/docker:ro \
--volume=/dev/disk/:/dev/disk:ro \
--publish=8080:8080 \
--detach=true \
--name=cadvisor \
harbor.wyh.net/baseimages/cadvisor:v0.33.0

驗證cadvisor web界面:
訪問node節點的cadvisor監聽端口


image.png

image.png

查看已經監控上了

grafana添加pod監控模板:
395 893 容器模板ID
395模板


image.png
查看pod信息

prometheus報警設置:
prometheus觸發一條告警的過程:
prometheus--->觸發閾值--->超出持續時間--->alertmanager--->分組|抑制|靜默--->媒體類型--->郵件|釘釘|微信等。

分組(group): 將類似性質的警報合并為單個通知。
靜默(silences): 是一種簡單的特定時間靜音的機制,例如:服務器要升級維護可以先設置這個時間段告警靜默。
抑制(inhibition): 當警報發出后,停止重復發送由此警報引發的其他警報,可以消除冗余告警。
解壓
root@master2:/usr/local/src# tar xf alertmanager-0.19.0.linux-amd64.tar.gz 
做個軟連接
root@master2:/usr/local/src# ln -sv /usr/local/src/alertmanager-0.19.0.linux-amd64 /usr/local/alertmanager
設置啟動腳本
root@master2:/usr/local/alertmanager# vim /etc/systemd/system/alertmanager.service
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target
[Service]
Restart=on-failure
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml
[Install]
WantedBy=multi-user.target

配置alertmanager:
root@master2:/usr/local/alertmanager# cat alertmanager.yml  | grep ^[^'#']
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_from: '50589143@qq.com'
  smtp_auth_username: '50589143@qq.com'
  smtp_auth_password: 'pzjypoauatdvcadh'
  smtp_hello: '@qq.com'
  smtp_require_tls: false
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 60s
  receiver: 'web.hook'
receivers:
- name: 'web.hook'
  email_configs:
    - to: '2973707860@qq.com'
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
重啟服務
root@master2:/usr/local/alertmanager# systemctl restart alertmanager
查看端口
root@master2:/usr/local/alertmanager# ss -tnl | grep 9093
LISTEN  0         128                         *:9093                   *:*    
驗證是否會報警
root@master2:/usr/local/alertmanager# ./amtool alert --alertmanager.url=http://192.168.200.197:9093
Alertname  Starts At  Summary  
配置prometheus報警規則:
root@master:/etc/ansible# vim /usr/local/prometheus/prometheus.yml
  8 alerting:
  9   alertmanagers:
 10   - static_configs:
 11     - targets:
 12       - 192.168.200.197:9093
 13       # - alertmanager:9093
 14 
 15 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 16 rule_files:
 17   - "/usr/local/prometheus/rule-linux37.yml"

root@master:/etc/ansible# vim /usr/local/prometheus/rule-linux37.yml

groups:
  - name: linux37_pod.rules
    rules:
    - alert: Pod_all_cpu_usage
      expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 10
      for: 5m
      labels:
        severity: critical
        service: pods
      annotations:
        description: 容器 {{ $labels.name }} CPU 資源利用率大于 75% , (current value is {{ $value }})
        summary: Dev CPU 負載告警
    - alert: Pod_all_memory_usage
      expr: sort_desc(avg by(name)(irate(container_memory_usage_bytes{name!=""}[5m]))*100) > 1024*10^3*2
      for: 10m
      labels:
        severity: critical
      annotations:
        description: 容器 {{ $labels.name }} Memory 資源利用率大于 2G , (current value is {{ $value }})
        summary: Dev Memory 負載告警
    - alert: Pod_all_network_receive_usage
      expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 1024*1024*50
      for: 10m
      labels:
        severity: critical
      annotations:
        description: 容器 {{ $labels.name }} network_receive 資源利用率大于 50M , (current value is {{ $value }})


root@master:/etc/ansible# systemctl restart prometheus
查看狀態

調整為5個
然后這個就變成紅色的

查看郵件已經發送了

image.png

修改為25%

root@master:/usr/local/prometheus# vim rule-linux37.yml 
 11         description: 容器 {{ $labels.name }} CPU 資源利用率大于 25% , (current value is {{ $value }})
root@master:/usr/local/prometheus# systemctl restart prometheus

停止服務

root@master:/usr/local/prometheus# systemctl stop prometheus
root@master2:~# systemctl stop alertmanager.service 
prometheus監控haproxy:
部署haproxy_exporter:
root@harbor:/usr/local/src# ln -sv /usr/local/src/haproxy_exporter-0.10.0.linux-amd64 /usr/local/haproxy_exporter
啟動服務
./haproxy_exporter --haproxy.scrape-uri=unix:/run/haproxy/admin.sock

root@master:~# vim /usr/local/prometheus/prometheus.yml
  - job_name: 'prometheus--haproxy'
    static_configs:
    - targets: ['192.168.200.200:9101']

root@master:~# systemctl restart prometheus.service 

查看haproxy的
grafana添加模板
image.png
©著作權歸作者所有,轉載或內容合作請聯系作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內容

友情鏈接更多精彩內容