wget https://github.com/prometheus/prometheus/releases/download/v2.28.1/prometheus-2.28.1.linux-amd64.tar.gz ./prometheus --version ./prometheus --config.file=prometheus.yml curl http://localhost:9090/metrics http://localhost:9090/graph //PromQL
wget https://github.com/prometheus/node_exporter/releases/download/v1.2.0/node_exporter-1.2.0.linux-amd64.tar.gz ./node_exporter &> /dev/null & curl http://localhost:9100/metrics vi prometheus.yml //默认scrape_configs包含prometheus自己,其他exporter需要手动配置 scrape_configs: - job_name: 'prometheus' static_configs: - targets: ['192.168.0.107:9090'] - job_name: 'server' static_configs: - targets: ['192.168.0.107:9100'] kill -HUP $(pidof prometheus) //重新加载配置 http://localhost:9090/targets //这里可以看见所有目标状态
服务宕机:up{job="server"} == 0 cpu利用率:ceil(100 - sum(increase(node_cpu_seconds_total{job="server",mode="idle"}[5m])) by(instance) / sum(increase(node_cpu_seconds_total{job="server"}[5m])) by(instance)*100) >= 80 磁盘利用率:round((1 - (node_filesystem_avail_bytes{fstype=~"ext3|ext4|xfs|nfs",job="server"} / node_filesystem_size_bytes{fstype=~"ext3|ext4|xfs|nfs",job="server"})) * 100) >= 80 内存利用率:ceil((1 - (node_memory_MemAvailable_bytes{job="server"} / (node_memory_MemTotal_bytes{job="server"})))* 100 ) >= 80 负载:node_load1{job="server"} >=50 文件句柄数:node_filefd_allocated{job="server"} >=50000 TCP TIME_WAIT连接数:node_sockstat_TCP_tw{job="server"} >=5000 入口流量:round((sum by(instance) (irate(node_network_receive_bytes_total{device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo",job="server"}[5m]))) / 1024 / 1024) > 50 出口流量:round((sum by(instance) (irate(node_network_transmit_bytes_total{device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo",job="server"}[5m]))) / 1024 / 1024) > 50
vi prometheus.yml //编辑好之后重新加载配置即可,kill -HUP pid rule_files: - "alert.rules" vi alert.rules //规则文件参考官方文档,/alerts查看报警,pending表示激活规则 firing表示触发告警 groups: - name: example rules: # Alert for any instance that is unreachable for >5 minutes. - alert: InstanceDown expr: up == 0 for: 5m labels: severity: page annotations: summary: "Instance {{ $labels.instance }} down" description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes." # Alert for any instance that has a median request latency >1s. - alert: APIHighRequestLatency expr: api_http_request_latencies_second{quantile="0.5"} > 1 for: 10m annotations: summary: "High request latency on {{ $labels.instance }}" description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
wget https://github.com/prometheus/alertmanager/releases/download/v0.22.2/alertmanager-0.22.2.linux-amd64.tar.gz ./alertmanager &> /dev/null & vi prometheus.yml //配置alertmanager alerting: alertmanagers: - scheme: http static_configs: - targets: - "192.168.0.107:9093" vi alertmanager.yml //配置报警通知方式,kill -HUP pid更新配置 route: group_by: ['alertname'] group_wait: 30s group_interval: 5m repeat_interval: 1h receiver: 'web.hook' receivers: - name: 'web.hook' webhook_configs: #- url: 'https://log.xlongwei.com/lajax?email=邮件通知&openid=微信通知' - url: 'https://log.xlongwei.com/lajax' webhook处理报警请求正文:{code:0,msg}={commonAnnotations:{summary概要,description详情},alerts:[]} { "receiver":"webhook", "status":"resolved", "alerts":[ { "status":"resolved", "labels":{ "alertname":"hostCpuUsageAlert", "instance":"192.168.199.24:9100", "severity":"page" }, "annotations":{ "description":"192.168.199.24:9100 CPU 使用率超过 85% (当前值为: 0.9973333333333395)", "summary":"机器 192.168.199.24:9100 CPU 使用率过高" }, "startsAt":"2020-02-29T19:45:21.799548092+08:00", "endsAt":"2020-02-29T19:49:21.799548092+08:00", "generatorURL":"http://localhost.localdomain:9090/graph?g0.expr=sum+by%28instance%29+%28avg+without%28cpu%29+%28irate%28node_cpu_seconds_total%7Bmode%21%3D%22idle%22%7D%5B5m%5D%29%29%29+%3E+0.85&g0.tab=1", "fingerprint":"368e9616d542ab48" } ], "groupLabels":{ "alertname":"hostCpuUsageAlert" }, "commonLabels":{ "alertname":"hostCpuUsageAlert", "instance":"192.168.199.24:9100", "severity":"page" }, "commonAnnotations":{ "description":"192.168.199.24:9100 CPU 使用率超过 85% (当前值为: 0.9973333333333395)", "summary":"机器 192.168.199.24:9100 CPU 使用率过高" }, "externalURL":"http://localhost.localdomain:9093", "version":"4", "groupKey":"{}:{alertname=\"hostCpuUsageAlert\"}" }
-javaagent:/home/tomcat/ftpData/prometheus/jmx_prometheus_javaagent-0.16.1.jar=9404:/home/tomcat/ftpData/prometheus/config.yml # 同一台服务器有多个微服务时,不方便都配置9404端口,更好的是应用直接依赖prometheus <dependency> <groupId>io.micrometer</groupId> <artifactId>micrometer-registry-prometheus</artifactId> </dependency> # springboot2已集成prometheus,/actuator/prometheus查看指标,metrics_path配置指标路径 - job_name: 'cmp' metrics_path: '/actuator/prometheus' static_configs: - targets: ['10.7.128.28:8030']
static_configs: - targets: ['host:port','host:port'] file_sd_configs: #基于文件发现服务,文件由其他系统定时导出,不必kill -HUP prometheus - files: # ['sd_config/*.yml','file'] - targets/*.json # 手动修改后可先校验配置:promtool check config prometheus.yml refresh_interval: 5m # - targets: ['host:port','host:port'] dns_sd_configs consul_sd_configs kubernetes_sd_configs