refs #8776 PVE and Ceph monitoring
This commit is contained in:
parent
e969c33f65
commit
aef6cb5bc0
|
@ -1,4 +1,4 @@
|
|||
command[check_total_procs]=/usr/lib/nagios/plugins/check_procs -w 700 -c 1000
|
||||
command[check_total_procs]=/usr/lib/nagios/plugins/check_procs -w 1300 -c 1500
|
||||
command[check_chrony]=/usr/lib/nagios/plugins/check_chrony 1 2
|
||||
command[check_smartdisk]=/etc/nagios/plugins/check_smartdisk.sh /dev/sda /dev/sdb
|
||||
command[check_raid]=/usr/lib/nagios/plugins/check_raid
|
||||
|
|
|
@ -0,0 +1,163 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright (c) 2013 Catalyst IT http://www.catalyst.net.nz
|
||||
# Copyright (c) 2015 SWITCH http://www.switch.ch
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
import argparse
|
||||
import socket
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import json
|
||||
|
||||
__version__ = '1.5.0'
|
||||
|
||||
# default ceph values
|
||||
CEPH_EXEC = '/usr/bin/ceph'
|
||||
CEPH_COMMAND = 'quorum_status'
|
||||
|
||||
# nagios exit code
|
||||
STATUS_OK = 0
|
||||
STATUS_WARNING = 1
|
||||
STATUS_ERROR = 2
|
||||
STATUS_UNKNOWN = 3
|
||||
|
||||
##
|
||||
# ceph quorum_status output example
|
||||
##
|
||||
ceph_quorum_status_output_example = '''{
|
||||
"quorum_leader_name" : "s0001",
|
||||
"monmap" : {
|
||||
"mons" : [
|
||||
{
|
||||
"name" : "s0001",
|
||||
"addr" : "[2001:620:5ca1:8000::1001]:6789/0",
|
||||
"rank" : 0
|
||||
},
|
||||
{
|
||||
"name" : "s0003",
|
||||
"addr" : "[2001:620:5ca1:8000::1003]:6789/0",
|
||||
"rank" : 1
|
||||
}
|
||||
],
|
||||
"created" : "2014-12-15 08:28:35.153650",
|
||||
"epoch" : 2,
|
||||
"modified" : "2014-12-15 08:28:40.371878",
|
||||
"fsid" : "22348d2b-b69d-46cc-9a79-ca93cd6bae84"
|
||||
},
|
||||
"quorum_names" : [
|
||||
"s0001",
|
||||
"s0003"
|
||||
],
|
||||
"quorum" : [
|
||||
0,
|
||||
1
|
||||
],
|
||||
"election_epoch" : 24
|
||||
}'''
|
||||
|
||||
def main():
    """Check that a Ceph monitor is listed in the monmap and is in quorum.

    Parses the command line, runs ``ceph quorum_status`` and inspects the
    JSON document it prints.

    Returns:
        A Nagios exit status:
        STATUS_OK when the monitor given with -I/--monid is in quorum,
        STATUS_WARNING when it is missing from the monmap or the quorum,
        STATUS_ERROR when the ceph command fails or prints nothing,
        STATUS_UNKNOWN for invalid arguments, an un-executable ceph binary
        or output that cannot be interpreted.
    """

    # parse args
    parser = argparse.ArgumentParser(description="'ceph quorum_status' nagios plugin.")
    parser.add_argument('-e','--exe', help='ceph executable [%s]' % CEPH_EXEC)
    parser.add_argument('-c','--conf', help='alternative ceph conf file')
    parser.add_argument('-m','--monaddress', help='ceph monitor to use for queries (address[:port])')
    parser.add_argument('-i','--id', help='ceph client id')
    parser.add_argument('-k','--keyring', help='ceph client keyring file')
    parser.add_argument('-V','--version', help='show version and exit', action='store_true')
    parser.add_argument('-I','--monid', help='mon ID to be checked for availability')
    args = parser.parse_args()

    if args.version:
        print('version %s' % __version__)
        return STATUS_OK

    # validate args
    ceph_exec = args.exe if args.exe else CEPH_EXEC
    if not os.path.exists(ceph_exec):
        print("MON ERROR: ceph executable '%s' doesn't exist" % ceph_exec)
        return STATUS_UNKNOWN

    if args.conf and not os.path.exists(args.conf):
        print("MON ERROR: ceph conf file '%s' doesn't exist" % args.conf)
        return STATUS_UNKNOWN

    if args.keyring and not os.path.exists(args.keyring):
        print("MON ERROR: keyring file '%s' doesn't exist" % args.keyring)
        return STATUS_UNKNOWN

    if not args.monid:
        print("MON ERROR: no MON ID given, use -I/--monid parameter")
        return STATUS_UNKNOWN

    # build command
    ceph_cmd = [ceph_exec]
    if args.monaddress:
        ceph_cmd.extend(['-m', args.monaddress])
    if args.conf:
        ceph_cmd.extend(['-c', args.conf])
    if args.id:
        ceph_cmd.extend(['--id', args.id])
    if args.keyring:
        ceph_cmd.extend(['--keyring', args.keyring])
    ceph_cmd.append(CEPH_COMMAND)

    # exec command
    try:
        p = subprocess.Popen(ceph_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        raw_output, raw_err = p.communicate()
    except OSError as e:
        # The path exists but cannot be executed (permissions, bad format...).
        # Without this an uncaught traceback would exit 1, i.e. WARNING.
        print("MON ERROR: could not execute '%s': %s" % (ceph_exec, e))
        return STATUS_UNKNOWN

    # communicate() returns bytes; decode so messages do not show b'...'.
    output = raw_output.decode('utf-8', 'replace')
    err = raw_err.decode('utf-8', 'replace').strip()

    if p.returncode != 0 or not output:
        print("MON ERROR: %s" % err)
        return STATUS_ERROR

    # load json output and parse
    try:
        quorum_status = json.loads(output)
    except ValueError as e:
        print("MON ERROR: could not parse '%s' output: %s: %s" % (CEPH_COMMAND,output,e))
        return STATUS_UNKNOWN

    try:
        mons = quorum_status['monmap']['mons']
        quorum_names = quorum_status['quorum_names']
    except (KeyError, TypeError) as e:
        # Valid JSON, but not the structure this check relies on.
        print("MON ERROR: unexpected '%s' output, missing %s" % (CEPH_COMMAND, e))
        return STATUS_UNKNOWN

    # do our checks
    is_monitor = any(mon.get('name') == args.monid for mon in mons)
    if not is_monitor:
        print("MON WARN: mon '%s' is not in monmap: %s" % (args.monid, mons))
        return STATUS_WARNING

    if args.monid in quorum_names:
        print("MON OK")
        return STATUS_OK

    print("MON WARN: no MON '%s' found in quorum" % args.monid)
    return STATUS_WARNING
|
||||
|
||||
# main
|
||||
if __name__ == "__main__":
    # Propagate the Nagios status returned by main() as the process exit code.
    exit_status = main()
    raise SystemExit(exit_status)
|
|
@ -0,0 +1,154 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright (c) 2013 Catalyst IT http://www.catalyst.net.nz
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# 1.5.2 (2019-06-16) Martin Seener: fixed regex to work with Ceph Nautilus (14.2.x)
|
||||
|
||||
from __future__ import print_function
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import socket
|
||||
|
||||
__version__ = '1.5.2'
|
||||
|
||||
# default ceph values
|
||||
CEPH_COMMAND = '/usr/bin/ceph'
|
||||
|
||||
# nagios exit code
|
||||
STATUS_OK = 0
|
||||
STATUS_WARNING = 1
|
||||
STATUS_ERROR = 2
|
||||
STATUS_UNKNOWN = 3
|
||||
|
||||
def main():
    """Check the up/down state of the Ceph OSDs located on one host.

    Parses the command line, resolves the host name to an address, runs
    ``ceph osd dump`` and searches its text output for OSD lines that
    mention that address.

    Returns:
        A Nagios exit status:
        STATUS_OK when at least one matching OSD is up and none is counted
        as down,
        STATUS_WARNING when fewer than --crit OSDs are down, or when no
        matching OSD is found at all,
        STATUS_ERROR when --crit or more OSDs are down, or when the ceph
        command fails or prints nothing,
        STATUS_UNKNOWN for invalid arguments, an unresolvable host or an
        un-executable ceph binary.
    """

    # parse args
    parser = argparse.ArgumentParser(description="'ceph osd' nagios plugin.")
    parser.add_argument('-e','--exe', help='ceph executable [%s]' % CEPH_COMMAND)
    parser.add_argument('-c','--conf', help='alternative ceph conf file')
    parser.add_argument('-m','--monaddress', help='ceph monitor address[:port]')
    parser.add_argument('-i','--id', help='ceph client id')
    parser.add_argument('-k','--keyring', help='ceph client keyring file')
    parser.add_argument('-V','--version', help='show version and exit', action='store_true')
    # Not enforced by argparse so that -V works on its own; a missing host is
    # reported below with a proper UNKNOWN status.
    parser.add_argument('-H','--host', help='osd host', required=False)
    parser.add_argument('-I','--osdid', help='osd id', required=False)
    parser.add_argument('-C','--crit', help='Number of failed OSDs to trigger critical (default=2)',type=int,default=2, required=False)
    parser.add_argument('-o','--out', help='check osds that are set OUT', default=False, action='store_true', required=False)
    args = parser.parse_args()

    # Answer the version request before touching the filesystem.
    if args.version:
        print('version %s' % __version__)
        return STATUS_OK

    # validate args
    ceph_exec = args.exe if args.exe else CEPH_COMMAND
    if not os.path.exists(ceph_exec):
        print("OSD ERROR: ceph executable '%s' doesn't exist" % ceph_exec)
        return STATUS_UNKNOWN

    if args.conf and not os.path.exists(args.conf):
        print("OSD ERROR: ceph conf file '%s' doesn't exist" % args.conf)
        return STATUS_UNKNOWN

    if args.keyring and not os.path.exists(args.keyring):
        print("OSD ERROR: keyring file '%s' doesn't exist" % args.keyring)
        return STATUS_UNKNOWN

    # Without an explicit id every OSD on the host is matched. The value is
    # interpolated into the regular expressions below as a regex fragment.
    if not args.osdid:
        args.osdid = '[^ ]*'

    if not args.host:
        print("OSD ERROR: no OSD hostname given")
        return STATUS_UNKNOWN

    # The dump lists addresses, not names; IPv6 addresses appear in brackets.
    try:
        addrinfo = socket.getaddrinfo(args.host, None, 0, socket.SOCK_STREAM)
        resolved = addrinfo[0][-1][0]
        if addrinfo[0][0] == socket.AF_INET6:
            resolved = "[%s]" % resolved
    except (socket.error, IndexError, UnicodeError):
        print('OSD ERROR: could not resolve %s' % args.host)
        return STATUS_UNKNOWN
    args.host = resolved

    # build command
    ceph_cmd = [ceph_exec]
    if args.monaddress:
        ceph_cmd.extend(['-m', args.monaddress])
    if args.conf:
        ceph_cmd.extend(['-c', args.conf])
    if args.id:
        ceph_cmd.extend(['--id', args.id])
    if args.keyring:
        ceph_cmd.extend(['--keyring', args.keyring])
    ceph_cmd.extend(['osd', 'dump'])

    # exec command
    try:
        p = subprocess.Popen(ceph_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        raw_output, raw_err = p.communicate()
    except OSError as e:
        # The path exists but cannot be executed (permissions, bad format...).
        print("OSD ERROR: could not execute '%s': %s" % (ceph_exec, e))
        return STATUS_UNKNOWN

    # communicate() returns bytes; decode so messages do not show b'...'.
    output = raw_output.decode('utf8', 'replace')
    err = raw_err.decode('utf8', 'replace').strip()

    # Judge success by the exit status: text on stderr alone (e.g. a warning)
    # does not mean the dump failed.
    if p.returncode != 0 or not output:
        print("OSD ERROR: %s" % err)
        return STATUS_ERROR

    # Escape the address ('.' for IPv4, '[' and ']' for IPv6) for regex use.
    osd_host = re.escape(args.host)

    def find_osds(state):
        """Return the names of OSDs in *state* (a regex fragment) on the host."""
        pattern = r"^(osd\.%s) %s.*%s:" % (args.osdid, state, osd_host)
        return re.findall(pattern, output, re.MULTILINE)

    up = find_osds(r"up")
    down_in = find_osds(r"down[ ]+in")
    down_out = find_osds(r"down[ ]+out")
    # With -o/--out every down OSD counts, otherwise only those still "in".
    down = find_osds(r"down") if args.out else down_in

    def print_details():
        """Print the per-state OSD lists followed by the perfdata line."""
        print("Up OSDs: " + " ".join(up))
        print("Down+In OSDs: " + " ".join(down_in))
        print("Down+Out OSDs: " + " ".join(down_out))
        print("| 'osd_up'=%d 'osd_down_in'=%d;;%d 'osd_down_out'=%d;;%d" % (len(up), len(down_in), args.crit, len(down_out), args.crit))

    if down:
        is_critical = len(down) >= args.crit
        print("OSD %s: Down OSD%s on %s: %s" % ('CRITICAL' if is_critical else 'WARNING' ,'s' if len(down)>1 else '', args.host, " ".join(down)))
        print_details()
        return STATUS_ERROR if is_critical else STATUS_WARNING

    if up:
        print("OSD OK")
        print_details()
        return STATUS_OK

    print("OSD WARN: no OSD.%s found on host %s" % (args.osdid, args.host))
    return STATUS_WARNING
|
||||
|
||||
if __name__ == "__main__":
    # Propagate the Nagios status returned by main() as the process exit code.
    exit_status = main()
    raise SystemExit(exit_status)
|
|
@ -0,0 +1,128 @@
|
|||
#!/usr/bin/env perl
|
||||
#===============================================================================
|
||||
# DESCRIPTION: Icinga2 / Nagios Check for chrony time sync status and offset
|
||||
#
|
||||
# OPTIONS: -h : Help
|
||||
# -w [warning threshold in seconds]
|
||||
# -c [critical threshold in seconds]
|
||||
#
|
||||
# REQUIREMENTS: Chrony, perl version 5.10.1+
|
||||
#
|
||||
# AUTHOR: Dennis Ullrich (request@decstasy.de)
|
||||
#
|
||||
# BUGS ETC: https://github.com/Decstasy/check_chrony
|
||||
#
|
||||
# LICENSE: GPL v3 (GNU General Public License, Version 3)
|
||||
# see https://www.gnu.org/licenses/gpl-3.0.txt
|
||||
#===============================================================================
|
||||
|
||||
use 5.10.1;
|
||||
use strict;
|
||||
use warnings;
|
||||
use utf8;
|
||||
use Getopt::Std;
|
||||
|
||||
#
|
||||
# Variables
|
||||
#
|
||||
my $chronyDaemonName = "chronyd";
|
||||
my $leapOk = "Normal";
|
||||
|
||||
my $rc = 3;
|
||||
my $msg= "";
|
||||
my $perfdata = "";
|
||||
|
||||
#
|
||||
# Subroutines
|
||||
#
|
||||
|
||||
# Prints the usage text and terminates with exit code 3.
sub help {
    my @usage = (
        "check_chrony [options]",
        "-w [warning threshold in seconds]",
        "-c [critical threshold in seconds]",
        "e.g.: check_chrony -w 0.6 -c 2",
    );
    print join( "\n", @usage ), "\n";
    exit(3);
}
|
||||
|
||||
# Script exit with Nagios / Icinga typical output
|
||||
# Prints "<STATE>: <text>" and terminates the script.
#   $_[0]  exit code, also used as index into the state names
#          (0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN)
#   $_[1]  text printed after the state name
sub _exit {
    my $code = shift;
    my $text = shift;
    my @names = qw( OK WARNING CRITICAL UNKNOWN );
    print $names[$code] . ": " . $text . "\n";
    exit( $code );
}
|
||||
|
||||
# Checks if a process with $_[0] as name exists
|
||||
# Returns 1 when `ps -C <name>` exits with status 0, otherwise 0.
#   $_[0]  process name handed to ps
sub proc_exists {
    my ( $name ) = @_;
    # The captured PID list is not used; only the exit status of ps matters.
    my $pids = qx(ps -C $name -o pid=);
    return ${^CHILD_ERROR_NATIVE} == 0 ? 1 : 0;
}
|
||||
|
||||
#
|
||||
# Options
|
||||
#
|
||||
|
||||
my %options=();
getopts( "hw:c:", \%options );

# Check input
if ( keys %options == 0 || defined $options{h} ){
    &help;
}

# Both thresholds are compared against the offset below, so both are
# mandatory and must be plain non-negative decimal numbers.
for my $key ( qw( w c ) ){
    if ( not defined $options{$key} ){
        &_exit( 3, "Option -$key is required!" );
    }
    if ( $options{$key} !~ /^(?:\d+(?:\.\d*)?|\.\d+)$/ ){
        &_exit( 3, "Value of option -$key is not a valid number!" );
    }
}

#
# Check chrony process
#

&_exit( 2, "$chronyDaemonName is not running!" ) if not &proc_exists( $chronyDaemonName );

#
# Get tracking data
#

my $chronyOutput = `chronyc tracking`;
&_exit( 3, "Chronyc tracking command failed!" ) if ${^CHILD_ERROR_NATIVE} != 0;

# "System time" line: absolute offset in seconds and its direction
my ( $offset, $dir ) = $chronyOutput =~ /(?:System\stime)[^\d]+([\d\.]+)(?:.*?)(fast|slow)/;
# "Leap status" line: text after the colon
my ( $leap ) = $chronyOutput =~ /(?:Leap)[^\:]+(?::\s+)([\w\h]+)/;

# An unparsable report must not be evaluated: an undefined offset would
# compare as 0 and be reported as OK.
if ( not defined $offset or not defined $dir or not defined $leap ){
    &_exit( 3, "Could not parse chronyc tracking output!" );
}

#
# Check stuff
#

# Check offset
if ( $offset >= $options{"c"} ){
    $rc = 2; # Critical
}
elsif ( $offset >= $options{"w"} ){
    $rc = 1; # Warning
}
else {
    $rc = 0; # Ok
}

# Prepare offset performance data (the sign encodes slow/fast)
$offset = $dir eq "slow" ? "-$offset" : "+$offset";
$msg = sprintf( "Time offset of %+.9f seconds to reference.", $offset);
$perfdata = sprintf( "|offset=%.9fs;%.9f;%.9f", ${offset}, $options{'w'}, $options{'c'});

# Check leap
if( $leap !~ $leapOk ){
    &_exit( 2, "Chrony leap status \"$leap\" is not equal to \"$leapOk\"! $msg $perfdata" );
}

#
# Return stuff
#

&_exit($rc, "$msg $perfdata");
|
||||
|
|
@ -0,0 +1 @@
|
|||
nagios ALL=(root) NOPASSWD: /usr/sbin/smartctl,/sbin/dmsetup
|
|
@ -1,8 +1,2 @@
|
|||
- name: Set NRPE Ceph configuration
|
||||
copy:
|
||||
src: nrpe.cfg
|
||||
dest: /etc/nagios/nrpe.d/95-ceph.cfg
|
||||
owner: root
|
||||
group: root
|
||||
mode: u=rw,g=r,o=r
|
||||
notify: restart-nrpe
|
||||
- import_tasks: nrpe.yml
|
||||
tags: nrpe
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
- name: Set NRPE Ceph configuration
|
||||
copy:
|
||||
src: nrpe.cfg
|
||||
dest: /etc/nagios/nrpe.d/95-ceph.cfg
|
||||
owner: root
|
||||
group: root
|
||||
mode: u=rw,g=r,o=r
|
||||
notify: restart-nrpe
|
||||
- name: Copy Ceph NRPE plugins
|
||||
copy:
|
||||
src: nrpe/
|
||||
dest: /etc/nagios/plugins/
|
||||
owner: root
|
||||
group: root
|
||||
mode: u=rwx,g=rx,o=rx
|
||||
notify: restart-nrpe
|
||||
- name: Add nagios to sudoers
|
||||
copy:
|
||||
src: sudoers
|
||||
dest: /etc/sudoers.d/nagios
|
||||
mode: u=rw,g=r,o=
|
||||
owner: root
|
||||
group: root
|
||||
notify: restart-nrpe
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,22 @@
|
|||
#!/bin/bash
|
||||
# Checks status of disks SMART
|
||||
|
||||
# Line of the smartctl -H report that carries the health verdict
STATUS_LABEL="SMART Health Status:"
STATUS_OK="$STATUS_LABEL OK"

# Nothing to check without at least one disk. Exit 3 (UNKNOWN): a plain
# "exit" would return 0, which Nagios interprets as OK.
if [[ "$#" == "0" ]]; then
    echo "Usage: $0 <disk1> [<disk2> ... <diskX>]"
    exit 3
fi

# The first disk whose health line differs from the expected one is CRITICAL.
for DISK in "$@"
do
    STATUS=$(sudo /usr/sbin/smartctl -H -d scsi "$DISK" | grep "$STATUS_LABEL")

    if [ "$STATUS" != "$STATUS_OK" ]; then
        # smartctl or sudo may have failed without printing a health line
        echo "CRITICAL: $DISK: ${STATUS:-no SMART health status returned}"
        exit 2
    fi
done

echo "OK: $STATUS_OK"
|
|
@ -1,14 +1,20 @@
|
|||
- name: Install base packages
|
||||
apt:
|
||||
name:
|
||||
- libxml-simple-perl
|
||||
- libmonitoring-plugin-perl
|
||||
state: present
|
||||
- name: Set NRPE PVE configuration
|
||||
copy:
|
||||
src: nrpe.cfg
|
||||
dest: /etc/nagios/nrpe.d/95-pve.cfg
|
||||
src: nrpe.d/95-pve.cfg
|
||||
dest: /etc/nagios/nrpe.d/
|
||||
owner: root
|
||||
group: root
|
||||
mode: u=rw,g=r,o=r
|
||||
notify: restart-nrpe
|
||||
- name: Copy PVE NRPE plugins
|
||||
copy:
|
||||
src: nrpe/
|
||||
src: plugins/
|
||||
dest: /etc/nagios/plugins/
|
||||
owner: root
|
||||
group: root
|
||||
|
|
|
@ -7,8 +7,8 @@
|
|||
group: root
|
||||
register: copy_result
|
||||
|
||||
- name: Reboot the system if file was copied
|
||||
reboot:
|
||||
reboot_timeout: 600
|
||||
become: true
|
||||
when: copy_result.changed
|
||||
#- name: Reboot the system if file was copied
|
||||
# reboot:
|
||||
# reboot_timeout: 600
|
||||
# become: true
|
||||
# when: copy_result.changed
|
Loading…
Reference in New Issue