refs #8776 PVE and Ceph monitoring

This commit is contained in:
Juan Ferrer 2025-03-17 17:49:06 +01:00
parent e969c33f65
commit aef6cb5bc0
15 changed files with 1743 additions and 17 deletions

View File

@ -1,4 +1,4 @@
command[check_total_procs]=/usr/lib/nagios/plugins/check_procs -w 700 -c 1000
command[check_total_procs]=/usr/lib/nagios/plugins/check_procs -w 1300 -c 1500
command[check_chrony]=/usr/lib/nagios/plugins/check_chrony 1 2
command[check_smartdisk]=/etc/nagios/plugins/check_smartdisk.sh /dev/sda /dev/sdb
command[check_raid]=/usr/lib/nagios/plugins/check_raid

View File

@ -0,0 +1,163 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 Catalyst IT http://www.catalyst.net.nz
# Copyright (c) 2015 SWITCH http://www.switch.ch
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import print_function
import argparse
import socket
import os
import re
import subprocess
import sys
import json
# Plugin version, printed by main() when -V/--version is given.
__version__ = '1.5.0'
# default ceph values
# Path of the ceph CLI used when -e/--exe is not given.
CEPH_EXEC = '/usr/bin/ceph'
# ceph sub-command that main() appends to the command line and whose JSON output it parses.
CEPH_COMMAND = 'quorum_status'
# nagios exit code
# Values returned by main() and handed to sys.exit() as the process exit status.
STATUS_OK = 0
STATUS_WARNING = 1
STATUS_ERROR = 2
STATUS_UNKNOWN = 3
##
# ceph quorum_status output example
#
# Reference sample only: main() in this file does not read this variable.
# It shows the keys main() looks at in the real command output:
# 'monmap' -> 'mons' -> 'name', and 'quorum_names'.
##
ceph_quorum_status_output_example = '''{
"quorum_leader_name" : "s0001",
"monmap" : {
"mons" : [
{
"name" : "s0001",
"addr" : "[2001:620:5ca1:8000::1001]:6789/0",
"rank" : 0
},
{
"name" : "s0003",
"addr" : "[2001:620:5ca1:8000::1003]:6789/0",
"rank" : 1
}
],
"created" : "2014-12-15 08:28:35.153650",
"epoch" : 2,
"modified" : "2014-12-15 08:28:40.371878",
"fsid" : "22348d2b-b69d-46cc-9a79-ca93cd6bae84"
},
"quorum_names" : [
"s0001",
"s0003"
],
"quorum" : [
0,
1
],
"election_epoch" : 24
}'''
def main():
    """Check that a Ceph monitor is listed in the monmap and is in quorum.

    Parses the command line, runs ``ceph quorum_status`` with the requested
    connection options and inspects its JSON output for the monitor named by
    ``-I/--monid``. Exactly one status line is printed for Nagios.

    Returns:
        int: STATUS_OK when the monitor is in quorum (or ``--version`` was
        requested); STATUS_WARNING when it is missing from the monmap or from
        the quorum; STATUS_ERROR when the ceph command fails or prints
        nothing; STATUS_UNKNOWN for invalid arguments, a ceph binary that
        cannot be executed, or output that cannot be interpreted.
    """
    # parse args
    parser = argparse.ArgumentParser(description="'ceph quorum_status' nagios plugin.")
    parser.add_argument('-e', '--exe', help='ceph executable [%s]' % CEPH_EXEC)
    parser.add_argument('-c', '--conf', help='alternative ceph conf file')
    parser.add_argument('-m', '--monaddress', help='ceph monitor to use for queries (address[:port])')
    parser.add_argument('-i', '--id', help='ceph client id')
    parser.add_argument('-k', '--keyring', help='ceph client keyring file')
    parser.add_argument('-V', '--version', help='show version and exit', action='store_true')
    parser.add_argument('-I', '--monid', help='mon ID to be checked for availability')
    args = parser.parse_args()

    if args.version:
        print('version %s' % __version__)
        return STATUS_OK

    # validate args
    ceph_exec = args.exe if args.exe else CEPH_EXEC
    if not os.path.exists(ceph_exec):
        print("MON ERROR: ceph executable '%s' doesn't exist" % ceph_exec)
        return STATUS_UNKNOWN
    if args.conf and not os.path.exists(args.conf):
        print("MON ERROR: ceph conf file '%s' doesn't exist" % args.conf)
        return STATUS_UNKNOWN
    if args.keyring and not os.path.exists(args.keyring):
        print("MON ERROR: keyring file '%s' doesn't exist" % args.keyring)
        return STATUS_UNKNOWN
    if not args.monid:
        print("MON ERROR: no MON ID given, use -I/--monid parameter")
        return STATUS_UNKNOWN

    # build command: optional connection flags first, the sub-command last
    ceph_cmd = [ceph_exec]
    for flag, value in (('-m', args.monaddress),
                        ('-c', args.conf),
                        ('--id', args.id),
                        ('--keyring', args.keyring)):
        if value:
            ceph_cmd.extend([flag, value])
    ceph_cmd.append(CEPH_COMMAND)

    # exec command
    try:
        p = subprocess.Popen(ceph_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output, err = p.communicate()
    except OSError as e:
        # e.g. the path exists but is not an executable file
        print("MON ERROR: could not execute '%s': %s" % (ceph_exec, e))
        return STATUS_UNKNOWN

    # The pipes deliver bytes; decode once so messages do not show b'...'.
    output = output.decode('utf-8', 'replace')
    err = err.decode('utf-8', 'replace').strip()
    if p.returncode != 0 or not output:
        print("MON ERROR: %s" % err)
        return STATUS_ERROR

    # load json output and parse
    try:
        quorum_status = json.loads(output)
    except ValueError as e:
        print("MON ERROR: could not parse '%s' output: %s: %s" % (CEPH_COMMAND, output, e))
        return STATUS_UNKNOWN

    # do our checks
    try:
        mons = quorum_status['monmap']['mons']
        quorum_names = quorum_status['quorum_names']
        is_monitor = any(mon['name'] == args.monid for mon in mons)
    except (KeyError, TypeError) as e:
        # valid JSON, but not the structure this check relies on
        print("MON ERROR: unexpected '%s' output: %r" % (CEPH_COMMAND, e))
        return STATUS_UNKNOWN

    if not is_monitor:
        print("MON WARN: mon '%s' is not in monmap: %s" % (args.monid, mons))
        return STATUS_WARNING

    if args.monid in quorum_names:
        print("MON OK")
        return STATUS_OK

    print("MON WARN: no MON '%s' found in quorum" % args.monid)
    return STATUS_WARNING


# main
if __name__ == "__main__":
    sys.exit(main())

View File

@ -0,0 +1,154 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 Catalyst IT http://www.catalyst.net.nz
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# 1.5.2 (2019-06-16) Martin Seener: fixed regex to work with Ceph Nautilus (14.2.x)
from __future__ import print_function
import argparse
import os
import re
import subprocess
import sys
import socket
# Plugin version, printed by main() when -V/--version is given.
__version__ = '1.5.2'
# default ceph values
# Despite the name, this is the path of the ceph executable used when -e/--exe is not given.
CEPH_COMMAND = '/usr/bin/ceph'
# nagios exit code
# Values returned by main() and handed to sys.exit() as the process exit status.
STATUS_OK = 0
STATUS_WARNING = 1
STATUS_ERROR = 2
STATUS_UNKNOWN = 3
def _print_osd_details(up, down_in, down_out, crit):
    """Print the per-state OSD lists followed by the Nagios perfdata line."""
    print("Up OSDs: " + " ".join(up))
    print("Down+In OSDs: " + " ".join(down_in))
    print("Down+Out OSDs: " + " ".join(down_out))
    print("| 'osd_up'=%d 'osd_down_in'=%d;;%d 'osd_down_out'=%d;;%d"
          % (len(up), len(down_in), crit, len(down_out), crit))


def main():
    """Check the up/down state of the Ceph OSDs that live on one host.

    Resolves ``-H/--host`` to a numeric address, runs ``ceph osd dump`` and
    scans its text output for ``osd.<id>`` lines that mention that address.
    By default only OSDs that are down *and in* count as failed; with
    ``-o/--out`` every down OSD counts.

    Returns:
        int: STATUS_OK when at least one matching OSD is up and none counts
        as failed; STATUS_WARNING when fewer than ``--crit`` OSDs are failed
        or when no matching OSD was found at all; STATUS_ERROR when
        ``--crit`` or more OSDs are failed or the ceph command fails;
        STATUS_UNKNOWN for invalid arguments, an unresolvable host or a ceph
        binary that cannot be executed.

    Raises:
        SystemExit: raised by argparse for ``-V/--version``, ``-h`` and
        usage errors.
    """
    # parse args
    parser = argparse.ArgumentParser(description="'ceph osd' nagios plugin.")
    parser.add_argument('-e', '--exe', help='ceph executable [%s]' % CEPH_COMMAND)
    parser.add_argument('-c', '--conf', help='alternative ceph conf file')
    parser.add_argument('-m', '--monaddress', help='ceph monitor address[:port]')
    parser.add_argument('-i', '--id', help='ceph client id')
    parser.add_argument('-k', '--keyring', help='ceph client keyring file')
    # A 'version' action is handled while parsing, so -V works without the
    # otherwise mandatory -H and without a ceph binary being installed.
    parser.add_argument('-V', '--version', help='show version and exit',
                        action='version', version='version %s' % __version__)
    parser.add_argument('-H', '--host', help='osd host', required=True)
    parser.add_argument('-I', '--osdid', help='osd id', required=False)
    parser.add_argument('-C', '--crit', help='Number of failed OSDs to trigger critical (default=2)',
                        type=int, default=2, required=False)
    parser.add_argument('-o', '--out', help='check osds that are set OUT',
                        default=False, action='store_true', required=False)
    args = parser.parse_args()

    # validate args
    ceph_exec = args.exe if args.exe else CEPH_COMMAND
    if not os.path.exists(ceph_exec):
        print("OSD ERROR: ceph executable '%s' doesn't exist" % ceph_exec)
        return STATUS_UNKNOWN
    if args.conf and not os.path.exists(args.conf):
        print("OSD ERROR: ceph conf file '%s' doesn't exist" % args.conf)
        return STATUS_UNKNOWN
    if args.keyring and not os.path.exists(args.keyring):
        print("OSD ERROR: keyring file '%s' doesn't exist" % args.keyring)
        return STATUS_UNKNOWN

    # Without an explicit id, match any OSD id (this value is a regex fragment).
    osdid = args.osdid if args.osdid else '[^ ]*'

    if not args.host:
        print("OSD ERROR: no OSD hostname given")
        return STATUS_UNKNOWN

    # Replace the host name by the numeric address that is searched for in
    # the dump; IPv6 addresses are wrapped in brackets.
    try:
        addrinfo = socket.getaddrinfo(args.host, None, 0, socket.SOCK_STREAM)
        resolved = addrinfo[0][-1][0]
        if addrinfo[0][0] == socket.AF_INET6:
            resolved = "[%s]" % resolved
    except (socket.error, UnicodeError, IndexError):
        print('OSD ERROR: could not resolve %s' % args.host)
        return STATUS_UNKNOWN
    args.host = resolved

    # build command: optional connection flags first, then 'osd dump'
    ceph_cmd = [ceph_exec]
    for flag, value in (('-m', args.monaddress),
                        ('-c', args.conf),
                        ('--id', args.id),
                        ('--keyring', args.keyring)):
        if value:
            ceph_cmd.extend([flag, value])
    ceph_cmd.extend(['osd', 'dump'])

    # exec command
    try:
        p = subprocess.Popen(ceph_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output, err = p.communicate()
    except OSError as e:
        # e.g. the path exists but is not an executable file
        print("OSD ERROR: could not execute '%s': %s" % (ceph_exec, e))
        return STATUS_UNKNOWN

    # The pipes deliver bytes; decode once so messages do not show b'...'.
    output = output.decode('utf8', 'replace')
    err = err.decode('utf8', 'replace').strip()
    if p.returncode != 0 or err or not output:
        print("OSD ERROR: %s" % (err or "no output from 'ceph osd dump'"))
        return STATUS_ERROR

    # Escape the address for use inside a regex. The look-behind stops a
    # longer address from matching by its tail (10.0.0.1 vs 110.0.0.1).
    osd_host = r"(?<![0-9.])" + re.escape(args.host)

    def find(state):
        """Return the osd.<id> names whose dump line shows *state* and the host address."""
        return re.findall(r"^(osd\.%s) %s.*%s:" % (osdid, state, osd_host), output, re.MULTILINE)

    up = find(r"up")
    down_in = find(r"down[ ]+in")
    down_out = find(r"down[ ]+out")
    # With --out every down OSD counts as failed, otherwise only down+in ones.
    down = find(r"down") if args.out else down_in

    if down:
        critical = len(down) >= args.crit
        print("OSD %s: Down OSD%s on %s: %s" % ('CRITICAL' if critical else 'WARNING',
                                                 's' if len(down) > 1 else '',
                                                 args.host, " ".join(down)))
        _print_osd_details(up, down_in, down_out, args.crit)
        return STATUS_ERROR if critical else STATUS_WARNING

    if up:
        print("OSD OK")
        _print_osd_details(up, down_in, down_out, args.crit)
        return STATUS_OK

    print("OSD WARN: no OSD.%s found on host %s" % (osdid, args.host))
    return STATUS_WARNING


if __name__ == "__main__":
    sys.exit(main())

View File

@ -0,0 +1,128 @@
#!/usr/bin/env perl
#===============================================================================
# DESCRIPTION: Icinga2 / Nagios Check for chrony time sync status and offset
#
# OPTIONS: -h : Help
# -w [warning threshold in seconds]
# -c [critical threshold in seconds]
#
# REQUIREMENTS: Chrony, perl version 5.10.1+
#
# AUTHOR: Dennis Ullrich (request@decstasy.de)
#
# BUGS ETC: https://github.com/Decstasy/check_chrony
#
# LICENSE: GPL v3 (GNU General Public License, Version 3)
# see https://www.gnu.org/licenses/gpl-3.0.txt
#===============================================================================
use 5.10.1;
use strict;
use warnings;
use utf8;
use Getopt::Std;
#
# Variables
#
# Process name handed to proc_exists() to verify the daemon is running.
my $chronyDaemonName = "chronyd";
# Pattern the parsed leap status must match; anything else is reported as critical.
my $leapOk = "Normal";
# Exit code; 3 maps to "UNKNOWN" in _exit() and is overwritten by the offset check.
my $rc = 3;
# Human-readable status text, filled in after the offset has been evaluated.
my $msg= "";
# Performance data appended to the status text ("|offset=...").
my $perfdata = "";
#
# Subroutines
#
# Print the usage summary and terminate with exit code 3.
sub help {
    my @usage = (
        'check_chrony [options]',
        '-w [warning threshold in seconds]',
        '-c [critical threshold in seconds]',
        'e.g.: check_chrony -w 0.6 -c 2',
    );
    print join( "\n", @usage ) . "\n";
    exit(3);
}
# Print "<STATE>: <text>" in the usual Nagios / Icinga format and exit.
# The first argument is both the index of the state label and the exit code.
sub _exit {
    my $code = shift;
    my $text = shift;
    my @labels = qw( OK WARNING CRITICAL UNKNOWN );
    printf "%s: %s\n", $labels[$code], $text;
    exit($code);
}
# Returns 1 when `ps -C <name>` exits successfully for the given process
# name, 0 otherwise. The pid list printed by ps is not used.
sub proc_exists {
    my ($name) = @_;
    my $pid_list = qx(ps -C $name -o pid=);
    return ${^CHILD_ERROR_NATIVE} == 0 ? 1 : 0;
}
#
# Options
#
my %options = ();
# getopts returns false for an unknown switch or a switch missing its value;
# show the usage instead of continuing with a half-parsed command line.
getopts( "hw:c:", \%options ) or help();

# Check input
if ( keys %options == 0 || defined $options{h} ) {
    help();
}

# Both thresholds are compared against the offset later on, so each one must
# be present and must be a plain non-negative decimal number.
for my $key (qw( w c )) {
    if ( !defined $options{$key} ) {
        _exit( 3, "Option -$key is required!" );
    }
    if ( $options{$key} !~ /^(?:\d+(?:\.\d*)?|\.\d+)$/ ) {
        _exit( 3, "Value of option -$key is not a valid number!" );
    }
}
#
# Check chrony process
#
_exit( 2, "$chronyDaemonName is not running!" ) if not proc_exists($chronyDaemonName);

#
# Get tracking data
#

my $chronyOutput = `chronyc tracking`;
_exit( 3, "Chronyc tracking command failed!" ) if ${^CHILD_ERROR_NATIVE} != 0;

# "System time" line: absolute offset plus the word "fast" or "slow".
my ( $offset, $dir ) = $chronyOutput =~ /(?:System\stime)[^\d]+([\d\.]+)(?:.*?)(fast|slow)/;
# "Leap status" line: the text after the colon.
my ( $leap ) = $chronyOutput =~ /(?:Leap)[^\:]+(?::\s+)([\w\h]+)/;

# Without this guard an unexpected output format would lead to comparisons
# on undefined values and a meaningless result.
if ( !defined $offset || !defined $dir || !defined $leap ) {
    _exit( 3, "Could not parse chronyc tracking output!" );
}

#
# Check stuff
#

# Check offset (absolute value) against the thresholds
if ( $offset >= $options{"c"} ) {
    $rc = 2; # Critical
}
elsif ( $offset >= $options{"w"} ) {
    $rc = 1; # Warning
}
else {
    $rc = 0; # Ok
}

# Prepare offset performance data: give the offset a sign, negative for "slow"
$offset = $dir eq "slow" ? "-$offset" : "+$offset";
$msg = sprintf( "Time offset of %+.9f seconds to reference.", $offset );
$perfdata = sprintf( "|offset=%.9fs;%.9f;%.9f", $offset, $options{'w'}, $options{'c'} );

# Check leap: a status that does not match $leapOk is always critical
if ( $leap !~ /\Q$leapOk\E/ ) {
    _exit( 2, "Chrony leap status \"$leap\" is not equal to \"$leapOk\"! $msg $perfdata" );
}

#
# Return stuff
#

_exit( $rc, "$msg $perfdata" );

1
roles/ceph/files/sudoers Normal file
View File

@ -0,0 +1 @@
# Let the nagios user run smartctl and dmsetup as root without a password.
# smartctl is called through sudo by check_smartdisk.sh.
# NOTE(review): the plugin that needs dmsetup is not visible here - confirm it is still required.
nagios ALL=(root) NOPASSWD: /usr/sbin/smartctl,/sbin/dmsetup

View File

@ -1,8 +1,2 @@
- name: Set NRPE Ceph configuration
copy:
src: nrpe.cfg
dest: /etc/nagios/nrpe.d/95-ceph.cfg
owner: root
group: root
mode: u=rw,g=r,o=r
notify: restart-nrpe
- import_tasks: nrpe.yml
tags: nrpe

24
roles/ceph/tasks/nrpe.yml Normal file
View File

@ -0,0 +1,24 @@
# Install the role's nrpe.cfg as an NRPE drop-in file (95-ceph.cfg) and
# notify the restart-nrpe handler when it changes.
- name: Set NRPE Ceph configuration
copy:
src: nrpe.cfg
dest: /etc/nagios/nrpe.d/95-ceph.cfg
owner: root
group: root
mode: u=rw,g=r,o=r
notify: restart-nrpe
# Copy the contents of the role's nrpe/ directory (trailing slash: contents,
# not the directory itself) to the plugin directory with the execute bit set.
- name: Copy Ceph NRPE plugins
copy:
src: nrpe/
dest: /etc/nagios/plugins/
owner: root
group: root
mode: u=rwx,g=rx,o=rx
notify: restart-nrpe
# Install the sudo rule for the nagios user. The file is checked with visudo
# before it replaces the destination, because a syntactically broken file in
# /etc/sudoers.d can make sudo unusable on the host.
- name: Add nagios to sudoers
  copy:
    src: sudoers
    dest: /etc/sudoers.d/nagios
    mode: u=rw,g=r,o=
    owner: root
    group: root
    validate: /usr/sbin/visudo -csf %s
  notify: restart-nrpe

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,22 @@
#!/bin/bash
# Checks status of disks SMART
# Line prefix printed by "smartctl -H -d scsi" and the full line that is
# expected for a healthy disk.
STATUS_LABEL="SMART Health Status:"
STATUS_OK="$STATUS_LABEL OK"

# Called without disks: a configuration problem, so report UNKNOWN (3).
# A plain "exit" would return 0 here and be read as OK by Nagios.
if [[ "$#" -eq 0 ]]; then
    echo "Usage: $0 <disk1> [<disk2> ... <diskX>]"
    exit 3
fi

# Stop at the first disk whose health line differs from the expected one.
for DISK in "$@"
do
    STATUS=$(sudo /usr/sbin/smartctl -H -d scsi "$DISK" | grep -F -- "$STATUS_LABEL")
    if [ "$STATUS" != "$STATUS_OK" ]; then
        # STATUS is empty when smartctl failed or printed no health line.
        echo "CRITICAL: $DISK: ${STATUS:-no SMART health status reported}"
        exit 2
    fi
done

echo "OK: $STATUS_OK"
exit 0

View File

@ -1,14 +1,20 @@
- name: Install base packages
apt:
name:
- libxml-simple-perl
- libmonitoring-plugin-perl
state: present
- name: Set NRPE PVE configuration
copy:
src: nrpe.cfg
dest: /etc/nagios/nrpe.d/95-pve.cfg
src: nrpe.d/95-pve.cfg
dest: /etc/nagios/nrpe.d/
owner: root
group: root
mode: u=rw,g=r,o=r
notify: restart-nrpe
- name: Copy PVE NRPE plugins
copy:
src: nrpe/
src: plugins/
dest: /etc/nagios/plugins/
owner: root
group: root

View File

@ -7,8 +7,8 @@
group: root
register: copy_result
- name: Reboot the system if file was copied
reboot:
reboot_timeout: 600
become: true
when: copy_result.changed
#- name: Reboot the system if file was copied
# reboot:
# reboot_timeout: 600
# become: true
# when: copy_result.changed