influxdb + grafana 搭建监控系统

图片很大,4K显示器截图的!

2.1 grafana

2.1.1 安装

wget https://s3-us-west-2.amazonaws.com/grafana-releases/release/grafana-5.2.2-1.x86_64.rpm 
sudo yum localinstall grafana-5.2.2-1.x86_64.rpm 
systemctl enable grafana-server

2.1.2 可选plugin

grafana-cli plugins install raintank-worldping-app
grafana-cli plugins install grafana-clock-panel
grafana-cli plugins install grafana-piechart-panel
grafana-cli plugins install novalabs-annotations-panel
grafana-cli plugins install vonage-status-panel
grafana-cli plugins install mtanda-histogram-panel

2.2 influxdb

InfluxDB默认使用下面的网络端口:

  • 8086/tcp: 用作InfluxDB的客户端和服务端的http api通信;
  • 8088/tcp: 给备份和恢复数据的RPC服务使用;
  • 25826/udp: collectd插件监听端口,接收客户端的collectd发送的数据(见文末脚注 1)

2.2.1 安装

wget https://dl.influxdata.com/influxdb/releases/influxdb-1.6.1.x86_64.rpm
sudo yum localinstall influxdb-1.6.1.x86_64.rpm
systemctl enable influxdb

2.2.2 配置

[meta]
  dir = "/var/lib/influxdb/meta"
[data]
  query-log-enabled = false
  dir = "/var/lib/influxdb/data"
  wal-dir = "/var/lib/influxdb/wal"
  cache-max-memory-size = "4g"
  cache-snapshot-memory-size = "512m"
  cache-snapshot-write-cold-duration = "256m"
[coordinator]
[retention]
[shard-precreation]
[monitor]
[http]
  log-enabled = false
[ifql]
[logging]
  level = "error"
[subscriber]
[[graphite]]
[[collectd]]
# 配置 /etc/influxdb/influxdb.conf 以支持collectd
enabled = true
bind-address = ":25826"
database = "collectdb"
typesdb = "/usr/share/collectd/types.db"
batch-size = 5000
batch-pending = 10
batch-timeout = "10s"
read-buffer = 0
[[opentsdb]]
[[udp]]
[continuous_queries]

2.2.3 influxdb常用SQL操作

[root@monitor ~]# influx
> create database collectdb
> show databases
> create user monitor with password 'monitor'
> show users
> use collectdb
> grant all on collectdb to monitor
> show MEASUREMENTS
> SELECT * FROM ping_value

2.2.4 关于influxdb SQL语法

查询接口带宽:

# demo
SELECT 8 * derivative(mean("value"),1s) AS "value" FROM "interface_rx" WHERE ("host" = 'WWW-DMZ' AND "type" = 'if_octets' AND "instance" = 'eth0')  AND $timeFilter GROUP BY TIME($interval) fill(NONE)
# 说明: 将下面 [[ ]] 中的内容(连同中括号本身)替换为实际的数据源和过滤条件
SELECT 8 * derivative(mean("value"),1s) AS "value" FROM [["数据源" WHERE (过滤条件)]] AND $timeFilter GROUP BY TIME($interval) fill(NONE)

2.2.5 influxdb API

curl -G 'http://localhost:8086/query?pretty=true' --data-urlencode "db=collectdb" --data-urlencode "q=SELECT * FROM hddtemp_value ORDER BY time DESC LIMIT 3"

3.1 telegraf

telegraf和influxdb是同一家公司的产品。我之前使用的是collectd,功能上够用,但它的配置语法和文档都显得比较老旧,风格不够现代。于是考虑改用telegraf,其功能基本上 >= collectd。

3.1.1 安装

Warning: 一定要安装nightly版本。双路服务器上有2个CPU,例如DELL服务器的sensor输出中,两个CPU的温度名称都是Temp,无法区分。为此master branch为ipmi_sensor插件引入了 metric_version = 2 来解决此问题(本人参与了这个issue)。

# Ubuntu & Debian
wget https://dl.influxdata.com/telegraf/nightlies/telegraf_nightly_amd64.deb
sudo dpkg -i telegraf_nightly_amd64.deb
# RedHat & CentOS
wget https://dl.influxdata.com/telegraf/nightlies/telegraf-nightly.x86_64.rpm
sudo yum localinstall telegraf-nightly.x86_64.rpm
# 启动
systemctl enable telegraf
systemctl start telegraf
systemctl status telegraf

3.1.2 配置注意

配置权限,不然报错找不到ipmi设备:

要么修改 /etc/systemd/system/multi-user.target.wants/telegraf.service,将其中的 User 设置为 User=root,然后执行 systemctl daemon-reload;要么配置udev:

# make a udev rule
cat > /etc/udev/rules.d/52-telegraf-ipmi.rules <<'EOF'
KERNEL=="ipmi*", MODE="660", GROUP="telegraf"
EOF
# reload udev
udevadm control --reload-rules && udevadm trigger
# restart telegraf
systemctl restart telegraf.service

配置sudo:

Cmnd_Alias TELEGRAF = /usr/bin/ipmitool,/usr/sbin/smartctl
telegraf ALL=(root) NOPASSWD:TELEGRAF

3.1.3 telegraf.conf demo

[global_tags]
[agent]
  interval = "10s"
  round_interval = true
  metric_batch_size = 1000
  metric_buffer_limit = 10000
  collection_jitter = "0s"
  flush_interval = "10s"
  flush_jitter = "0s"
  precision = ""
  debug = false
  quiet = false
  logfile = ""
  hostname = ""
  omit_hostname = false
[[outputs.influxdb]]
  urls = ["http://172.17.1.120:8086"]
  database = "telegraf"
[[inputs.cpu]]
  percpu = true
  totalcpu = true
  collect_cpu_time = false
  report_active = false
[[inputs.disk]]
  ignore_fs = ["tmpfs", "devtmpfs", "devfs"]
[[inputs.diskio]]
[[inputs.kernel]]
[[inputs.mem]]
[[inputs.processes]]
[[inputs.swap]]
[[inputs.system]]
 [[inputs.net]]
 interfaces = ["eth*","em*","eno*","br*"]
 ignore_protocol_stats = true
 
 [[inputs.ping]]
   urls = ["bwg.xargs.cn","tx.xargs.cn"]
[[inputs.ipmi_sensor]]
   interval = "30s"
   timeout = "20s"
   metric_version = 2
 [[inputs.net]]
 interfaces = ["eth*","em*","eno*","br*"]
 ignore_protocol_stats = true
 [[inputs.smart]]
 interval = "30m"
 use_sudo = true
 nocheck = "standby"
 attributes = true
 [[inputs.zfs]]
 poolMetrics = true
 
[[inputs.exec]]
  interval = "10m"
  commands = ["sh /etc/telegraf/telegraf.d/wan-ip.sh"]
  timeout = "10s"
  data_format = "value"
  data_type = "string"
  name_suffix = "_wanip"

 [[inputs.snmp]]
 interval = "30s"
   agents = [ "10.1.1.75:161" ]
   timeout = "5s"
   retries = 3
   version = 1
   community = "public"
name = "ups"
[[inputs.snmp.field]]
  name = "InputVoltage"
  oid = ".1.3.6.1.4.1.318.1.1.1.3.2.1.0"
[[inputs.snmp.field]]
  name = "OutputVoltage"
  oid = ".1.3.6.1.4.1.318.1.1.1.4.2.1.0"
[[inputs.snmp.field]]
  name = "upsHighPrecOutputVoltage"
  oid = ".1.3.6.1.4.1.318.1.1.1.4.3.1.0"
[[inputs.snmp.field]]
  name = "upsHighPrecOutputCurrent"
  oid = ".1.3.6.1.4.1.318.1.1.1.4.3.4.0"
[[inputs.snmp.field]]
  name = "upsHighPrecOutputLoad"
  oid = ".1.3.6.1.4.1.318.1.1.1.4.3.3.0"
[[inputs.snmp.field]]
  name = "BatteryReplaceIndicator"
  oid = ".1.3.6.1.4.1.318.1.1.1.2.2.4.0"
[[inputs.snmp.field]]
  name = "InternalTemperature"
  oid = ".1.3.6.1.4.1.318.1.1.1.2.2.2.0"
[[inputs.snmp.field]]
  name = "BatteryActualVoltage"
  oid = ".1.3.6.1.4.1.318.1.1.1.2.2.8.0"
[[inputs.snmp.field]]
  name = "TimeRemaining"
  oid = ".1.3.6.1.4.1.318.1.1.1.2.2.3.0"
[[inputs.snmp.field]]
  name = "upsBasicBatteryTimeOnBattery"
  oid = ".1.3.6.1.4.1.318.1.1.1.2.1.2.0"
[[inputs.snmp.field]]
  name = "BatteryCapacity"
  oid = ".1.3.6.1.4.1.318.1.1.1.2.2.1.0"
[[inputs.snmp.field]]
  name = "uioTemperatureDegC"
  oid = "SNMPv2-SMI::enterprises.318.1.1.25.1.2.1.6.1.1"
[[inputs.snmp.field]]
  name = "uioTemperatureDegC-2"
  oid = "SNMPv2-SMI::enterprises.318.1.1.25.1.2.1.6.2.1"
[[inputs.snmp.field]]
  name = "TestDiagnosticsResults"
  oid = ".1.3.6.1.4.1.318.1.1.1.7.2.3.0"
[[inputs.snmp.field]]
  name = "TestLastDiagnosticsDate"
  oid = ".1.3.6.1.4.1.318.1.1.1.7.2.4.0"
[[inputs.snmp.field]]
  name = "upsAdvInputLineFailCause"
  oid = ".1.3.6.1.4.1.318.1.1.1.3.2.5.0"
[[inputs.snmp.field]]
 name = "upsCommStatus"
 oid = ".1.3.6.1.4.1.318.1.1.1.8.1.0"

3.1.4 测试

# 查看telegraf抓取的所有数据
telegraf -test
# 查看telegraf抓取的内存和磁盘IO数据(-input-filter的值为配置文件中inputs.xxx中的xxx,可以设置多个值,使用冒号分隔)
telegraf -test -input-filter mem:diskio

3.2 collectd

3.2.1 安装

# 在被监控端安装collectd及其额外插件的依赖
yum install collectd collectd-smart collectd-ping collectd-ipmi collectd-snmp

3.2.2 配置

# 测试配置文件并退出
collectd -t /etc/collectd.conf
# 测试plugin并退出
collectd -T

点击以显示 ⇲

点击以隐藏 ⇱

collectd.conf
Hostname    "r720xd"
FQDNLookup   false
#BaseDir     "/var/lib/collectd"
#PIDFile     "/var/run/collectd.pid"
#PluginDir   "/usr/lib64/collectd"
TypesDB     "/usr/share/collectd/types.db"
AutoLoadPlugin true
Interval 10
 
LoadPlugin users
LoadPlugin hddtemp
LoadPlugin uptime
LoadPlugin processes
LoadPlugin zfs_arc
 
LoadPlugin network
<plugin network>
	Server "172.17.1.125" "25826"
</plugin>
 
 
LoadPlugin cpu
<Plugin cpu>
	ReportByCpu true
	ReportByState true
	ValuesPercentage true
	# ReportNumCpu false
	# ReportGuestState false
	# SubtractGuestState true
</Plugin>
 
 
<LoadPlugin df>
	Interval 600
</LoadPlugin>
<Plugin df>
	# Device "/dev/hda1"
	# Device "192.168.0.2:/mnt/nfs"
	# MountPoint "/home"
	# FSType "ext3"
	# IgnoreSelected false
	# ReportByDevice false
	# ReportInodes false
	# ValuesAbsolute true
	ValuesPercentage true
</Plugin>
 
 
<LoadPlugin disk>
	Interval 600
</LoadPlugin>
<Plugin disk>
	# Disk "/^[hs]d[a-f][0-9]?$/"
	# IgnoreSelected false
	# UseBSDName false
	# UdevNameAttr "DEVNAME"
</Plugin>
 
 
<LoadPlugin smart>
	Interval 3600
</LoadPlugin>
<Plugin smart>
	Disk "/^[hs]d[a-z][0-9]?$/"
	# Disk "sda" 
	# Disk "sdb"
	IgnoreSelected false
</Plugin>
 
 
LoadPlugin load
<Plugin load>
	ReportRelative true
</Plugin>
 
 
LoadPlugin swap
<Plugin swap>
	# ReportByDevice false
	# ReportBytes true
	# ValuesAbsolute true
	# ValuesPercentage false
	# ReportIO true
</Plugin>
 
 
LoadPlugin memory
<Plugin memory>
	# ValuesAbsolute true
	# ValuesPercentage false
</Plugin>
 
 
#LoadPlugin ping
#<Plugin ping>
#    Host "dgc.xargs.cn"
#    Host "tx.xargs.cn"
#    Interval 3.0
#    Timeout 1.0
	# Device "eth0"
	# MaxMissed -1
#</Plugin>
 
 
LoadPlugin interface
<Plugin interface>
	# Interface "eth0"
	# IgnoreSelected false
	# ReportInactive true
	# UniqueName false
</Plugin>
 
 
LoadPlugin ipmi
<Plugin ipmi>
#  <Instance "local">
      IgnoreSelected true
      NotifySensorAdd false
      NotifySensorRemove false
      NotifySensorNotPresent false
      NotifyIPMIConnectionState false
      SELEnabled false
      SELClearEvent false
#  </Instance>
  # <Instance "remote">
      # Host "server.example.com"
      # Address  "1.2.3.4"
      # Username "user"
      # Password "secret"
      # AuthType "md5"
      # Sensor "some_sensor"
      # Sensor "another_one"
      # IgnoreSelected false
      # NotifySensorAdd false
      # NotifySensorRemove true
      # NotifySensorNotPresent false
      # NotifyIPMIConnectionState false
      # SELEnabled false
      # SELClearEvent false
  # </Instance>
</Plugin>
 
Include "/etc/collectd/collectd.conf.d/"


1)
InfluxDB也提供了多个可能需要自定义端口的插件,所有端口映射都可以通过配置文件修改,对于默认安装的InfluxDB,这个配置文件位于/etc/influxdb/influxdb.conf
  • monitoring/influxdb_grafana监控系统.txt
  • 最后更改: 2019/08/13 16:22
  • 由 mrco