# Prometheus alerting rules for host-level health, built on the `up` series
# and node_* host metrics (CPU, filesystem, memory, disk IO, network, TCP).
# Every rule attaches a label describing how serious it is and two
# annotations (summary / description) rendered with the alert's labels/value.
groups:
  - name: 主机状态-监控告警
    rules:
      # Target is not scrapeable: `up` has been 0 for 15s for the
      # kubernetes-nodes job.
      - alert: 主机状态
        expr: up{job="kubernetes-nodes"} == 0
        for: 15s
        labels:
          status: 非常严重
        annotations:
          # Alert templates expose labels through $labels; a bare `.instance`
          # is not a field of the template data and fails to render.
          summary: "{{ $labels.instance }}:服务器宕机"
          description: "{{ $labels.instance }}:服务器延时超过15s"

      # Per-instance CPU utilisation: 100 minus the average idle percentage
      # across all CPUs of the instance, above 60 for 1m.
      - alert: CPU使用情况
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 60
        for: 1m
        labels:
          status: warning
        annotations:
          summary: "{{ $labels.instance }}: High CPU Usage Detected"
          description: "{{ $labels.instance }}: CPU usage is {{ $value }}, above 60%"

      # Used percentage of each ext4/xfs filesystem above 80 for 1m.
      # NOTE(review): this is the only rule that uses the `severity` label key;
      # all sibling rules use `status`. Left unchanged because Alertmanager
      # routing may depend on either key - confirm and unify.
      - alert: NodeFilesystemUsage
        expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
          description: "{{ $labels.instance }}: {{ $labels.mountpoint }} 分区使用大于80% (当前值: {{ $value }})"

      # Memory in use, (MemTotal - MemAvailable) / MemTotal as a percentage,
      # above 80 for 1m.
      - alert: 内存使用
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 80
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{ $labels.instance }} 内存使用率过高!"
          description: "{{ $labels.instance }} 内存使用大于80%(目前使用:{{ $value }}%)"

      # Average disk busy time across the instance's devices, as a percentage,
      # above 60 for 1m.
      - alert: IO性能
        expr: (avg(irate(node_disk_io_time_seconds_total[1m])) by (instance) * 100) > 60
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{ $labels.instance }} 流入磁盘IO使用率过高!"
          description: "{{ $labels.instance }} 流入磁盘IO大于60%(目前使用:{{ $value }})"

      # Inbound traffic summed over non-virtual interfaces of the instance.
      # Regex matchers are fully anchored, so the virtual-bridge and loopback
      # alternatives must be written `virbr.*` and `lo`; the glob-style
      # `virbr*` never matched names such as virbr0.
      # NOTE(review): the comparison is equivalent to rate > 10240000 bytes/s
      # (~10 MB/s), while the description says "100M" - confirm intended unit.
      - alert: 网络
        expr: ((sum(rate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance)) / 100) > 102400
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{ $labels.instance }} 流入网络带宽过高!"
          # Duration text matches `for: 1m` above.
          description: "{{ $labels.instance }}流入网络带宽持续1分钟高于100M. RX带宽使用率{{ $value }}"

      # Number of established TCP connections above 1000 for 1m.
      - alert: TCP会话
        expr: node_netstat_Tcp_CurrEstab > 1000
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{ $labels.instance }} TCP_ESTABLISHED过高!"
          # The value is a connection count, not a percentage.
          description: "{{ $labels.instance }} TCP_ESTABLISHED大于1000(目前使用:{{ $value }})"
# File: node_rules.yml
# Source notice: reproduction without permission is prohibited (工具盒子 » node_rules.yml)