diff --git a/roles/ceph/files/nrpe.cfg b/roles/ceph/files/nrpe.cfg index 76d252f..afc0035 100644 --- a/roles/ceph/files/nrpe.cfg +++ b/roles/ceph/files/nrpe.cfg @@ -1,4 +1,4 @@ -command[check_total_procs]=/usr/lib/nagios/plugins/check_procs -w 700 -c 1000 +command[check_total_procs]=/usr/lib/nagios/plugins/check_procs -w 1300 -c 1500 command[check_chrony]=/usr/lib/nagios/plugins/check_chrony 1 2 command[check_smartdisk]=/etc/nagios/plugins/check_smartdisk.sh /dev/sda /dev/sdb command[check_raid]=/usr/lib/nagios/plugins/check_raid diff --git a/roles/ceph/files/nrpe/check_ceph_mon b/roles/ceph/files/nrpe/check_ceph_mon new file mode 100755 index 0000000..f8decea --- /dev/null +++ b/roles/ceph/files/nrpe/check_ceph_mon @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 Catalyst IT http://www.catalyst.net.nz +# Copyright (c) 2015 SWITCH http://www.switch.ch +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from __future__ import print_function +import argparse +import socket +import os +import re +import subprocess +import sys +import json + +__version__ = '1.5.0' + +# default ceph values +CEPH_EXEC = '/usr/bin/ceph' +CEPH_COMMAND = 'quorum_status' + +# nagios exit code +STATUS_OK = 0 +STATUS_WARNING = 1 +STATUS_ERROR = 2 +STATUS_UNKNOWN = 3 + +## +# ceph quorum_status output example +## +ceph_quorum_status_output_example = '''{ + "quorum_leader_name" : "s0001", + "monmap" : { + "mons" : [ + { + "name" : "s0001", + "addr" : "[2001:620:5ca1:8000::1001]:6789/0", + "rank" : 0 + }, + { + "name" : "s0003", + "addr" : "[2001:620:5ca1:8000::1003]:6789/0", + "rank" : 1 + } + ], + "created" : "2014-12-15 08:28:35.153650", + "epoch" : 2, + "modified" : "2014-12-15 08:28:40.371878", + "fsid" : "22348d2b-b69d-46cc-9a79-ca93cd6bae84" + }, + "quorum_names" : [ + "s0001", + "s0003" + ], + "quorum" : [ + 0, + 1 + ], + "election_epoch" : 24 +}''' + +def main(): + + # parse args + parser = argparse.ArgumentParser(description="'ceph quorum_status' nagios plugin.") + parser.add_argument('-e','--exe', help='ceph executable [%s]' % CEPH_EXEC) + parser.add_argument('-c','--conf', help='alternative ceph conf file') + parser.add_argument('-m','--monaddress', help='ceph monitor to use for queries (address[:port])') + parser.add_argument('-i','--id', help='ceph client id') + parser.add_argument('-k','--keyring', help='ceph client keyring file') + parser.add_argument('-V','--version', help='show version and exit', action='store_true') + parser.add_argument('-I','--monid', help='mon ID to be checked for availability') + args = parser.parse_args() + + if args.version: + print('version %s' % __version__) + return STATUS_OK + + # validate args + ceph_exec = args.exe if args.exe else CEPH_EXEC + if not os.path.exists(ceph_exec): + print("MON ERROR: ceph executable '%s' doesn't exist" % ceph_exec) + return STATUS_UNKNOWN + + if args.conf and not os.path.exists(args.conf): + print("MON ERROR: 
ceph conf file '%s' doesn't exist" % args.conf) + return STATUS_UNKNOWN + + if args.keyring and not os.path.exists(args.keyring): + print("MON ERROR: keyring file '%s' doesn't exist" % args.keyring) + return STATUS_UNKNOWN + + if not args.monid: + print("MON ERROR: no MON ID given, use -I/--monid parameter") + return STATUS_UNKNOWN + + # build command + ceph_cmd = [ceph_exec] + if args.monaddress: + ceph_cmd.append('-m') + ceph_cmd.append(args.monaddress) + if args.conf: + ceph_cmd.append('-c') + ceph_cmd.append(args.conf) + if args.id: + ceph_cmd.append('--id') + ceph_cmd.append(args.id) + if args.keyring: + ceph_cmd.append('--keyring') + ceph_cmd.append(args.keyring) + ceph_cmd.append(CEPH_COMMAND) + + # exec command + p = subprocess.Popen(ceph_cmd,stdout=subprocess.PIPE,stderr=subprocess.PIPE) + output, err = p.communicate() + + if p.returncode != 0 or not output: + print("MON ERROR: %s" % err) + return STATUS_ERROR + + # load json output and parse + quorum_status = False + try: + quorum_status = json.loads(output) + except Exception as e: + print("MON ERROR: could not parse '%s' output: %s: %s" % (CEPH_COMMAND,output,e)) + return STATUS_UNKNOWN + + #print "XXX: quorum_status['quorum_names']:", quorum_status['quorum_names'] + + # do our checks + is_monitor = False + for mon in quorum_status['monmap']['mons']: + if mon['name'] == args.monid: + is_monitor = True + if not is_monitor: + print("MON WARN: mon '%s' is not in monmap: %s" % (args.monid,quorum_status['monmap']['mons'])) + return STATUS_WARNING + + in_quorum = args.monid in quorum_status['quorum_names'] + if in_quorum: + print("MON OK") + return STATUS_OK + else: + print("MON WARN: no MON '%s' found in quorum" % args.monid) + return STATUS_WARNING + +# main +if __name__ == "__main__": + sys.exit(main()) diff --git a/roles/ceph/files/nrpe/check_ceph_osd b/roles/ceph/files/nrpe/check_ceph_osd new file mode 100755 index 0000000..2ee9de6 --- /dev/null +++ b/roles/ceph/files/nrpe/check_ceph_osd @@ -0,0 +1,154 
@@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 Catalyst IT http://www.catalyst.net.nz +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# 1.5.2 (2019-06-16) Martin Seener: fixed regex to work with Ceph Nautilus (14.2.x) + +from __future__ import print_function +import argparse +import os +import re +import subprocess +import sys +import socket + +__version__ = '1.5.2' + +# default ceph values +CEPH_COMMAND = '/usr/bin/ceph' + +# nagios exit code +STATUS_OK = 0 +STATUS_WARNING = 1 +STATUS_ERROR = 2 +STATUS_UNKNOWN = 3 + +def main(): + + # parse args + parser = argparse.ArgumentParser(description="'ceph osd' nagios plugin.") + parser.add_argument('-e','--exe', help='ceph executable [%s]' % CEPH_COMMAND) + parser.add_argument('-c','--conf', help='alternative ceph conf file') + parser.add_argument('-m','--monaddress', help='ceph monitor address[:port]') + parser.add_argument('-i','--id', help='ceph client id') + parser.add_argument('-k','--keyring', help='ceph client keyring file') + parser.add_argument('-V','--version', help='show version and exit', action='store_true') + parser.add_argument('-H','--host', help='osd host', required=True) + parser.add_argument('-I','--osdid', help='osd id', required=False) + parser.add_argument('-C','--crit', help='Number of failed OSDs to trigger critical (default=2)',type=int,default=2, required=False) + parser.add_argument('-o','--out', help='check osds that are set OUT', default=False, 
action='store_true', required=False) + args = parser.parse_args() + + # validate args + ceph_exec = args.exe if args.exe else CEPH_COMMAND + if not os.path.exists(ceph_exec): + print("OSD ERROR: ceph executable '%s' doesn't exist" % ceph_exec) + return STATUS_UNKNOWN + + if args.version: + print('version %s' % __version__) + return STATUS_OK + + if args.conf and not os.path.exists(args.conf): + print("OSD ERROR: ceph conf file '%s' doesn't exist" % args.conf) + return STATUS_UNKNOWN + + if args.keyring and not os.path.exists(args.keyring): + print("OSD ERROR: keyring file '%s' doesn't exist" % args.keyring) + return STATUS_UNKNOWN + + if not args.osdid: + args.osdid = '[^ ]*' + + if not args.host: + print("OSD ERROR: no OSD hostname given") + return STATUS_UNKNOWN + + try: + addrinfo = socket.getaddrinfo(args.host, None, 0, socket.SOCK_STREAM) + args.host = addrinfo[0][-1][0] + if addrinfo[0][0] == socket.AF_INET6: + args.host = "[%s]" % args.host + except: + print('OSD ERROR: could not resolve %s' % args.host) + return STATUS_UNKNOWN + + + # build command + ceph_cmd = [ceph_exec] + if args.monaddress: + ceph_cmd.append('-m') + ceph_cmd.append(args.monaddress) + if args.conf: + ceph_cmd.append('-c') + ceph_cmd.append(args.conf) + if args.id: + ceph_cmd.append('--id') + ceph_cmd.append(args.id) + if args.keyring: + ceph_cmd.append('--keyring') + ceph_cmd.append(args.keyring) + ceph_cmd.append('osd') + ceph_cmd.append('dump') + + # exec command + p = subprocess.Popen(ceph_cmd,stdout=subprocess.PIPE,stderr=subprocess.PIPE) + output, err = p.communicate() + output = output.decode('utf8') + + if err or not output: + print("OSD ERROR: %s" % err) + return STATUS_ERROR + + # escape IPv4 host address + osd_host = args.host.replace('.', '\.') + # escape IPv6 host address + osd_host = osd_host.replace('[', '\[') + osd_host = osd_host.replace(']', '\]') + up = re.findall(r"^(osd\.%s) up.*%s:" % (args.osdid, osd_host), output, re.MULTILINE) + if args.out: + down = 
re.findall(r"^(osd\.%s) down.*%s:" % (args.osdid, osd_host), output, re.MULTILINE) + down_in = re.findall(r"^(osd\.%s) down[ ]+in.*%s:" % (args.osdid, osd_host), output, re.MULTILINE) + down_out = re.findall(r"^(osd\.%s) down[ ]+out.*%s:" % (args.osdid, osd_host), output, re.MULTILINE) + else: + down = re.findall(r"^(osd\.%s) down[ ]+in.*%s:" % (args.osdid, osd_host), output, re.MULTILINE) + down_in = down + down_out = re.findall(r"^(osd\.%s) down[ ]+out.*%s:" % (args.osdid, osd_host), output, re.MULTILINE) + + if down: + print("OSD %s: Down OSD%s on %s: %s" % ('CRITICAL' if len(down)>=args.crit else 'WARNING' ,'s' if len(down)>1 else '', args.host, " ".join(down))) + print("Up OSDs: " + " ".join(up)) + print("Down+In OSDs: " + " ".join(down_in)) + print("Down+Out OSDs: " + " ".join(down_out)) + print("| 'osd_up'=%d 'osd_down_in'=%d;;%d 'osd_down_out'=%d;;%d" % (len(up), len(down_in), args.crit, len(down_out), args.crit)) + if len(down)>=args.crit: + return STATUS_ERROR + else: + return STATUS_WARNING + + if up: + print("OSD OK") + print("Up OSDs: " + " ".join(up)) + print("Down+In OSDs: " + " ".join(down_in)) + print("Down+Out OSDs: " + " ".join(down_out)) + print("| 'osd_up'=%d 'osd_down_in'=%d;;%d 'osd_down_out'=%d;;%d" % (len(up), len(down_in), args.crit, len(down_out), args.crit)) + return STATUS_OK + + print("OSD WARN: no OSD.%s found on host %s" % (args.osdid, args.host)) + return STATUS_WARNING + +if __name__ == "__main__": + sys.exit(main()) diff --git a/roles/ceph/files/nrpe/check_chrony b/roles/ceph/files/nrpe/check_chrony new file mode 100755 index 0000000..1d220f4 --- /dev/null +++ b/roles/ceph/files/nrpe/check_chrony @@ -0,0 +1,128 @@ +#!/usr/bin/env perl +#=============================================================================== +# DESCRIPTION: Icinga2 / Nagios Check for chrony time sync status and offset +# +# OPTIONS: -h : Help +# -w [warning threshold in seconds] +# -c [critical threshold in seconds] +# +# REQUIREMENTS: Chrony, perl version 
#               5.10.1+
#
#       AUTHOR: Dennis Ullrich (request@decstasy.de)
#
#     BUGS ETC: https://github.com/Decstasy/check_chrony
#
#      LICENSE: GPL v3 (GNU General Public License, Version 3)
#               see https://www.gnu.org/licenses/gpl-3.0.txt
#===============================================================================

use 5.10.1;
use strict;
use warnings;
use utf8;
use Getopt::Std;

#
# Variables
#
my $chronyDaemonName = "chronyd";
my $leapOk = "Normal";

my $rc = 3;
my $msg= "";
my $perfdata = "";

#
# Subroutines
#

# Print usage and exit with UNKNOWN (3)
sub help {
    print "check_chrony [options]
    -w [warning threshold in seconds]
    -c [critical threshold in seconds]
    e.g.: check_chrony -w 0.6 -c 2\n";
    exit(3);
}

# Script exit with Nagios / Icinga typical output:
# prints "<STATE>: <line>" and exits with the numeric state $return (0..3)
sub _exit {
    my ( $return, $line ) = @_;
    my @state = ( "OK", "WARNING", "CRITICAL", "UNKNOWN" );
    print "$state[$return]: $line\n";
    exit( $return );
}

# Checks if a process with $_[0] as name exists.
# Returns 1 when "ps -C <name>" exits with status 0, otherwise 0.
sub proc_exists {
    my $PID = `ps -C $_[0] -o pid=`;
    if ( ${^CHILD_ERROR_NATIVE} == 0 ){
        return 1;
    }
    return 0;
}

#
# Options
#

my %options=();
getopts( "hw:c:", \%options );

# Check input
if ( keys %options == 0 || defined $options{h} ){
    &help;
}

# Both thresholds are mandatory: an undefined threshold would compare as 0
# below and turn every offset into WARNING/CRITICAL.
for my $key ( qw(w c) ){
    if ( not defined $options{$key} ){
        &_exit( 3, "Option -$key is required!" );
    }
    # Accept plain decimals only (e.g. 2, 0.6, .5); reject "1.2.3" or "."
    if ( $options{$key} !~ /^(?:\d+(?:\.\d*)?|\.\d+)$/ ){
        &_exit( 3, "Value of option -$key is not a valid number!" );
    }
}

#
# Check chrony process
#

&_exit( 2, "$chronyDaemonName is not running!" ) if not &proc_exists( $chronyDaemonName );

#
# Get tracking data
#

my $chronyOutput = `chronyc tracking`;
&_exit( 3, "Chronyc tracking command failed!" ) if ${^CHILD_ERROR_NATIVE} != 0;

my ( $offset, $dir ) = $chronyOutput =~ /(?:System\stime)[^\d]+([\d\.]+)(?:.*?)(fast|slow)/;
my ( $leap ) = $chronyOutput =~ /(?:Leap)[^\:]+(?::\s+)([\w\h]+)/;

# Without these fields no verdict is possible; report UNKNOWN instead of
# comparing undefined values.
if ( !defined $offset || !defined $dir ){
    &_exit( 3, "Could not parse system time offset from chronyc tracking output!" );
}
if ( !defined $leap ){
    &_exit( 3, "Could not parse leap status from chronyc tracking output!" );
}

#
# Check stuff
#

# Check offset
if ( $offset >= $options{"c"} ){
    $rc = 2; # Critical
}
elsif ( $offset >= $options{"w"} ){
    $rc = 1; # Warning
}
else {
    $rc = 0; # Ok
}

# Prepare offset performance data (negative when the clock is slow)
$offset = $dir =~ "slow" ? "-$offset" : "+$offset";
$msg = sprintf( "Time offset of %+.9f seconds to reference.", $offset);
$perfdata = sprintf( "|offset=%.9fs;%.9f;%.9f", ${offset}, $options{'w'}, $options{'c'});

# Check leap
if( $leap !~ $leapOk ){
    &_exit( 2, "Chrony leap status \"$leap\" is not equal to \"$leapOk\"! $msg $perfdata" );
}

#
# Return stuff
#

&_exit($rc, "$msg $perfdata");
root + mode: u=rw,g=r,o=r + notify: restart-nrpe +- name: Copy Ceph NRPE plugins + copy: + src: nrpe/ + dest: /etc/nagios/plugins/ + owner: root + group: root + mode: u=rwx,g=rx,o=rx + notify: restart-nrpe +- name: Add nagios to sudoers + copy: + src: sudoers + dest: /etc/sudoers.d/nagios + mode: u=rw,g=r,o= + owner: root + group: root + notify: restart-nrpe diff --git a/roles/pve/files/nrpe.cfg b/roles/pve/files/nrpe.d/95-pve.cfg similarity index 100% rename from roles/pve/files/nrpe.cfg rename to roles/pve/files/nrpe.d/95-pve.cfg diff --git a/roles/pve/files/nrpe/check_chrony b/roles/pve/files/plugins/check_chrony similarity index 100% rename from roles/pve/files/nrpe/check_chrony rename to roles/pve/files/plugins/check_chrony diff --git a/roles/pve/files/plugins/check_ilo2_health.pl b/roles/pve/files/plugins/check_ilo2_health.pl new file mode 100755 index 0000000..4f5dba4 --- /dev/null +++ b/roles/pve/files/plugins/check_ilo2_health.pl @@ -0,0 +1,1234 @@ +#!/usr/bin/perl +# icinga: -epn + +# check_ilo2_health.pl +# based on check_stuff.pl and locfg.pl +# +# Nagios plugin using the Nagios::Plugin or Monitoring::Plugin module and the +# HP Lights-Out XML PERL Scripting Sample from +# ftp://ftp.hp.com/pub/softlib2/software1/pubsw-linux/p391992567/v60711/linux-LOsamplescripts3.00.0-2.tgz +# checks if all sensors are ok, returns warning on high temperatures and +# fan failures and critical on overall health failure +# +# Alexander Greiner-Baer 2007 - 2021 +# Matthew Stier 2011 +# Claudio Kuenzler 2021 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# Changelog: +# 1.66 Wed, 21 Apr 2021 13:00:00 +0200 +# support both Nagios::Plugin and Monitoring::Plugin modules +# add option "-W" for checking server's current power usage (in Watt) +# 1.65 Sat, 06 Feb 2021 13:00:03 +0100 +# new option "--ignorecacheother|-O" +# ignores cache status "Other" +# 1.64 Tue, 02 Jun 2020 18:56:30 +0200 +# fix memory status (was Unknown on an otherwise healthy G8) +# fix drive check for ilo2, was broken for 5 years... +# new option "-L" +# retrieve event log from RIB_INFO section +# fix help message for "-l" +# new option "-S" +# check self_test value from https:///json/login_session +# 1.63 Tue, 13 Nov 2018 18:41:48 +0100 +# support iLO5 firmware infos +# applied patch from Rene Koch +# ignore link unknown (option "-U") +# 1.62 Mon, 14 May 2018 19:05:22 +0200 +# retrieve firmware infos only when using --getinfos +# 1.61 Thu, 01 Jun 2017 20:05:04 +0200 +# fix for iLO4 2.50 link state when using --ignorelinkdown +# 1.60 Wed, 12 Aug 2015 18:20:13 +0200 +# provide --sslopts to override defaults settings +# fix }; for GET_EVENT_LOG +# applied patch from Rene Koch : +# handle missing values when using "-g" +# CONTROLLER_STATUS not present on iLO4 anymore, use STATUS instead +# put SSL_VERIFY_NONE in '' +# 1.59 Wed, 28 Jan 2015 18:56:26 +0100 +# fix chunk size handling +# corrected HTTP/1.1 HOST Header +# applied patch from Max Winterstein : +# sslv3 support +# add retries option +# catch XMLin() errors +# applied patch from Rene Koch : +# ignore battery not installed status (option "-x") +# display server name (option "-g") +# added warning for logical drive status "Degraded (Recovering)" +# display system details (hardware model, serial number, SystemROM, iLO version) +# display memory size and part number in case of memory failure +# display hard disk model number 
in case of hard disk failure +# display power supply part number in case of power supply failure +# 1.58 Thu, 08 Aug 2013 18:17:02 +0200 +# ignore network link down status (option "-i") +# added ENCLOSURE_ADDR to drive bay label (bay numbering was inconsistent) +# ignore spare drives +# 1.57 Fri, 17 May 2013 19:30:48 +0200 +# SSL_verify_mode SSL_VERIFY_NONE (IO::Socket::SSL changed default) +# event log support for ilo2 +# disable embedded perl in icinga +# 1.56 Fri, 15 Mar 2013 20:47:13 +0100 +# applied patch from Niklas Edmundsson : +# check processor and memory details +# applied patches from Dragan Sekerovic : +# add location label to temperature (option "-b") +# support for checking event log (option "-l") +# add iLO version to output +# add 2 new values for power supply status +# -- +# 1.55 Sun, 05 Aug 2012 20:18:46 +0200 +# faulty drive (option "-c") exits now with CRITICAL instead of WARNING +# applied patches from Niklas Edmundsson : +# iLO4 RAID Controller Status +# nodriveexit +# add g6 drive status +# overall health probes every element now +# fixed bug with drive bay index +# supports iLO3 with multiple backplanes +# supports iLO4 disk check +# Note: overall health may show drive/storage status, even without "-c" +# -- +# 1.54 Thu, 14 Jun 2012 21:36:40 +0200 +# applied fix for iLO4 from Niklas Edmundsson +# -- +# 1.53 Tue, 14 Feb 2012 19:47:40 +0100 +# added new disk bay variant +# added power supply NOT APPLICABLE +# -- +# 1.52 Wed, 27 Jul 2011 20:46:14 +0200 +# fixed