用户工具

站点工具


linux:prometheus

blog promQL grafana

wget https://github.com/prometheus/prometheus/releases/download/v2.28.1/prometheus-2.28.1.linux-amd64.tar.gz
./prometheus --version
./prometheus --config.file=prometheus.yml
curl http://localhost:9090/metrics
http://localhost:9090/graph  //PromQL
wget https://github.com/prometheus/node_exporter/releases/download/v1.2.0/node_exporter-1.2.0.linux-amd64.tar.gz
./node_exporter &> /dev/null &
curl http://localhost:9100/metrics
vi prometheus.yml //默认scrape_configs包含prometheus自己,其他exporter需要手动配置
scrape_configs:  
  - job_name: 'prometheus'  
    static_configs:  
      - targets: ['192.168.0.107:9090']  
  - job_name: 'server'  
    static_configs:  
      - targets: ['192.168.0.107:9100'] 
kill -HUP $(pidof prometheus)  //重新加载配置
http://localhost:9090/targets  //这里可以看见所有目标状态
  • 监控指标:alert.rules
服务宕机:up{job="server"} == 0
cpu利用率:ceil(100 - sum(increase(node_cpu_seconds_total{job="server",mode="idle"}[5m]))  by(instance) / sum(increase(node_cpu_seconds_total{job="server"}[5m]))  by(instance)*100) >= 80
磁盘利用率:round((1 - (node_filesystem_avail_bytes{fstype=~"ext3|ext4|xfs|nfs",job="server"} / node_filesystem_size_bytes{fstype=~"ext3|ext4|xfs|nfs",job="server"})) * 100)  >= 80
内存利用率:ceil((1 - (node_memory_MemAvailable_bytes{job="server"} / (node_memory_MemTotal_bytes{job="server"})))* 100 ) >= 80
负载:node_load1{job="server"} >=50
文件句柄数:node_filefd_allocated{job="server"} >=50000
TCP连接数(TIME_WAIT状态):node_sockstat_TCP_tw{job="server"} >=5000
入口流量:round((sum
  by(instance) (irate(node_network_receive_bytes_total{device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo",job="server"}[5m])))
  / 1024 / 1024) > 50
出口流量:round((sum
  by(instance) (irate(node_network_transmit_bytes_total{device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo",job="server"}[5m])))
  / 1024 / 1024) > 50
  • 报警规则
vi prometheus.yml  //编辑好之后重新加载配置即可,kill -HUP pid
rule_files:  
  - "alert.rules" 
vi alert.rules  //规则文件参考官方文档,/alerts查看报警,pending表示条件已满足但尚未持续满for时长,firing表示已触发告警
groups:  
- name: example  
  rules:  
   
  # Alert for any instance that is unreachable for >5 minutes.  
  - alert: InstanceDown  
    expr: up == 0  
    for: 5m  
    labels:  
      severity: page  
    annotations:  
      summary: "Instance {{ $labels.instance }} down"  
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."  
   
  # Alert for any instance that has a median request latency >1s.  
  - alert: APIHighRequestLatency  
    expr: api_http_request_latencies_second{quantile="0.5"} > 1  
    for: 10m  
    annotations:  
      summary: "High request latency on {{ $labels.instance }}"  
      description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"  
wget https://github.com/prometheus/alertmanager/releases/download/v0.22.2/alertmanager-0.22.2.linux-amd64.tar.gz
./alertmanager &> /dev/null &
vi prometheus.yml  //配置alertmanager
alerting:  
  alertmanagers:  
  - scheme: http  
    static_configs:  
    - targets:  
      - "192.168.0.107:9093" 
vi alertmanager.yml //配置报警通知方式,kill -HUP pid更新配置
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'web.hook'
receivers:
- name: 'web.hook'
  webhook_configs:
  #- url: 'https://log.xlongwei.com/lajax?email=邮件通知&openid=微信通知'
  - url: 'https://log.xlongwei.com/lajax' 
webhook处理报警请求正文:{code:0,msg}={commonAnnotations:{summary概要,description详情},alerts:{}}
{
    "receiver":"webhook",
    "status":"resolved",
    "alerts":[
        {
            "status":"resolved",
            "labels":{
                "alertname":"hostCpuUsageAlert",
                "instance":"192.168.199.24:9100",
                "severity":"page"
            },
            "annotations":{
                "description":"192.168.199.24:9100 CPU 使用率超过 85% (当前值为: 0.9973333333333395)",
                "summary":"机器 192.168.199.24:9100 CPU 使用率过高"
            },
            "startsAt":"2020-02-29T19:45:21.799548092+08:00",
            "endsAt":"2020-02-29T19:49:21.799548092+08:00",
            "generatorURL":"http://localhost.localdomain:9090/graph?g0.expr=sum+by%28instance%29+%28avg+without%28cpu%29+%28irate%28node_cpu_seconds_total%7Bmode%21%3D%22idle%22%7D%5B5m%5D%29%29%29+%3E+0.85&g0.tab=1",
            "fingerprint":"368e9616d542ab48"
        }
    ],
    "groupLabels":{
        "alertname":"hostCpuUsageAlert"
    },
    "commonLabels":{
        "alertname":"hostCpuUsageAlert",
        "instance":"192.168.199.24:9100",
        "severity":"page"
    },
    "commonAnnotations":{
        "description":"192.168.199.24:9100 CPU 使用率超过 85% (当前值为: 0.9973333333333395)",
        "summary":"机器 192.168.199.24:9100 CPU 使用率过高"
    },
    "externalURL":"http://localhost.localdomain:9093",
    "version":"4",
    "groupKey":"{}:{alertname=\"hostCpuUsageAlert\"}"
}
  • JMX指标:io.prometheus.jmx:jmx_prometheus_javaagent:0.16.1
-javaagent:/home/tomcat/ftpData/prometheus/jmx_prometheus_javaagent-0.16.1.jar=9404:/home/tomcat/ftpData/prometheus/config.yml
# 同一台服务器有多个微服务时,不方便都配置9404端口,更好的是应用直接依赖prometheus
        <dependency>
            <groupId>io.micrometer</groupId>
            <artifactId>micrometer-registry-prometheus</artifactId>
        </dependency>
# springboot2已集成prometheus,/actuator/prometheus查看指标,metrics_path配置指标路径
  - job_name: 'cmp'
    metrics_path: '/actuator/prometheus'
    static_configs:
    - targets: ['10.7.128.28:8030']
  • 服务发现
static_configs:
- targets: ['host:port','host:port']

file_sd_configs: #基于文件发现服务,文件由其他系统定时导出,不必kill -HUP prometheus
- files: # ['sd_config/*.yml','file']
  - targets/*.json # 手动更新:promtool check config prometheus.yml
  refresh_interval: 5m # - targets: ['host:port','host:port']

dns_sd_configs
consul_sd_configs
kubernetes_sd_configs
linux/prometheus.txt · 最后更改: 2021/07/28 11:44 由 admin