librenms/scripts/agent-local/drbd

373 lines
7.8 KiB
Plaintext
Raw Normal View History

#!/usr/bin/perl
=head1 NAME
check_drbd - Nagios plugin for DRBD
=head1 SYNOPSIS
B<check_drbd> [B<--verbose> | B<-v>]
=head1 DESCRIPTION
B<check_drbd> is a Nagios plugin for DRBD. It checks the connection state,
resource roles and disk states for every configured DRBD resource, and
produces a WARNING or CRITICAL alert if anything is amiss. The states
of both the local and remote sides of each connection are monitored.
=head2 Nagios status information
The status information emitted by this plugin is similar to the information
in F</proc/drbd>:
drbd0: Connected Primary/Secondary UpToDate/UpToDate
| | | | | |
| | | | | Remote disk state
| | | | Local disk state
| | | Remote resource role
| | Local resource role
| Connection state
DRBD device
If more than one device is present, and all devices are OK, the output is
summarised:
drbd0: PriConUpT, drbd1: SecConUpT
If any devices are not OK, the output contains their statuses in full.
=head2 Nagios performance data
Complete performance data is emitted for all configured DRBD resources:
=over
=item drbdI<*>_ns
=item drbdI<*>_nr
The volume of network data sent to and received from the peer, in kiB.
=item drbdI<*>_dw
=item drbdI<*>_dr
The volume of data written to and read from the local disk, in kiB.
=item drbdI<*>_al
The number of updates of the activity log area of the metadata.
=item drbdI<*>_lo
The number of open requests to the local I/O subsystem issued by DRBD.
=item drbdI<*>_pe
The number of requests sent to the peer that have not yet been answered by the latter.
=item drbdI<*>_ua
The number of requests received by the peer that have not yet been answered by the latter.
=item drbdI<*>_ap
The number of block I/O requests forwarded by DRBD, but not yet answered by DRBD.
=item drbdI<*>_ep
The number of epoch objects.
=item drbdI<*>_oos
The amount of storage currently out-of-sync, in kiB.
=back
=head1 OPTIONS
=over
=item B<-v>, B<--verbose>
Increase the verbosity of the output messages. This disables the Nagios status
information summarisation described above: all resources' statuses are printed
in full.
=back
=head1 EXIT STATUS
=over
=item 0
All resources are OK.
=item 1
Some resources are not OK, but do not need immediate attention.
=item 2
Some resources are not OK and need immediate attention.
=item 3
An error occurred while collecting the resources' statuses.
=back
=head1 FILES
F</proc/drbd>
=head1 SEE ALSO
L<The DRBD Home Page|http://www.drbd.org/>
=cut
use strict;
use warnings;
# Name of the running script with any leading directory stripped from $0.
# When $0 contains no '/', the match yields nothing and 'check_drbd' is used.
use constant BASENAME => ($0 =~ m{.*/([^/]+)})[0] || 'check_drbd';
# File the DRBD state is read from (see get_state).
use constant STATE_FILE => '/proc/drbd';
# Status levels; they double as the process exit codes and are compared
# numerically, so a larger value means a more severe state.
use constant {
OK => 0,
WARNING => 1,
CRITICAL => 2,
UNKNOWN => 3,
};
use Getopt::Long;
use IO::File;
# Forward declarations: these subs are defined further down the file, and
# declaring them here lets the code above their definitions call them
# without parentheses (e.g. "help; exit 0").
sub help;
sub usage;
sub perfdata;
sub ok;
sub warning;
sub critical;
sub unknown;
sub get_state;
# Global die hook: outside of an eval, print the die message on standard
# output and exit with the UNKNOWN status instead of Perl's default exit code.
$SIG{__DIE__} = sub {
    my @reason = @_;

    # $^S is true while an eval is executing; re-throw so the eval can catch it.
    if ($^S) {
        die @reason;
    }

    print @reason;
    exit UNKNOWN;
};
# Command-line handling. --verbose/-v may be repeated (it is a counter);
# --help and --usage print their text and exit 0 straight from the handler.
my $verbose;
Getopt::Long::Configure('bundling', 'no_ignore_case');
my $options_ok = GetOptions(
    'verbose|v+' => \$verbose,
    'help|?'     => sub { help(); exit 0 },
    'usage'      => sub { usage(); exit 0 },
);

# Unknown options or stray positional arguments: show usage, exit UNKNOWN.
if (!$options_ok or @ARGV != 0) {
    usage();
    exit UNKNOWN;
}
# Main program: grade every parsed DRBD device, queue its counters as
# performance data, then print a single status line and exit with the
# overall (worst) level.
my @state = get_state;
my $status = OK;

# Section marker emitted ahead of the status line.
print "<<<drbd>>>\n";

# Severity lookup tables. Any value that is missing from the relevant table
# is graded CRITICAL (assume the worst for states we do not know).
my %cs_level_for = (
    Connected     => OK,
    Unconfigured  => OK,
    StandAlone    => WARNING,
    SyncingAll    => WARNING,
    SyncingQuick  => WARNING,
    SyncSource    => WARNING,
    SyncTarget    => WARNING,
    VerifyS       => WARNING,
    VerifyT       => WARNING,
    Disconnecting => WARNING,
    TearDown      => WARNING,
    StartingSyncS => WARNING,
    StartingSyncT => WARNING,
    WFSyncUUID    => WARNING,
);

# Connection states in which a non-zero out-of-sync count is acceptable.
my %oos_level_for = (
    StartingSyncS => OK,
    StartingSyncT => OK,
    SyncSource    => OK,
    SyncTarget    => OK,
    PausedSyncS   => OK,
    PausedSyncT   => OK,
);

# Resource roles (local "ro" and peer "pro").
my %role_level_for = (
    Primary   => OK,
    Secondary => OK,
);

# Disk states (local "ds" and peer "pds").
my %disk_level_for = (
    UpToDate    => OK,
    Consistent  => OK,
    Negotiating => WARNING,
    Attaching   => WARNING,
);

foreach my $id (0 .. $#state) {
    # @state is indexed by device minor number; skip slots with no device.
    my $device = $state[$id]
        or next;

    # Connection state.
    $device->{cs_level} = $cs_level_for{ $device->{cs} };
    $device->{cs_level} = CRITICAL unless defined $device->{cs_level};

    # Out-of-sync data is only graded when there is some.
    if ($device->{oos}) {
        $device->{oos_level} = $oos_level_for{ $device->{cs} };
        $device->{oos_level} = CRITICAL unless defined $device->{oos_level};
    }

    foreach my $key (qw( ro pro )) {
        $device->{"${key}_level"} = $role_level_for{ $device->{$key} };
        $device->{"${key}_level"} = CRITICAL unless defined $device->{"${key}_level"};
    }

    foreach my $key (qw( ds pds )) {
        $device->{"${key}_level"} = $disk_level_for{ $device->{$key} };
        $device->{"${key}_level"} = CRITICAL unless defined $device->{"${key}_level"};
    }

    # Free-form remarks appended to the device's status text.
    my @extra;
    if ($device->{oos}) {
        push @extra, sprintf '%d kiB out-of-sync', $device->{oos};
    }
    # Any I/O flag string other than the accepted pattern is CRITICAL.
    if ($device->{iof} !~ /^r.--(.(-)?)?$/) {
        $device->{iof_level} = CRITICAL;
        push @extra, sprintf 'I/O flags: %s', $device->{iof};
    }
    my $extra = @extra ? sprintf(' (%s)', join ', ', @extra) : '';

    # The device level is the worst of all its "*_level" grades.
    my $level = OK;
    foreach my $key (grep { /_level$/ } keys %$device) {
        $level = $device->{$key} if $level < $device->{$key};
    }
    $status = $level if $status < $level;
    $device->{level} = $level;

    $device->{info} = sprintf 'drbd%d:cs=%s|ro=%s|pro=%s|ds=%s|pds=%s|extra=%s',
        $id, $device->{cs}, $device->{ro}, $device->{pro},
        $device->{ds}, $device->{pds}, $extra;
    # Role and connstate reversed, like old check_drbd
    $device->{short} = sprintf 'drbd%d: %0.3s%0.3s%0.3s%s',
        $id, $device->{ro}, $device->{cs}, $device->{ds}, $extra;

    # Queue whichever counters were present for this device.
    foreach my $key (qw( ns nr dw dr al bm lo pe ua ap oos )) {
        my $value = $device->{$key};
        defined $value
            or next;
        perfdata("${key}=${value}");
    }
}

# Only the devices that were actually parsed. @state may contain undef slots
# (e.g. a lone device on minor 1 leaves slot 0 empty), so the number of
# devices must be taken from this list and not from the size of @state.
my @devices = grep { defined } @state;

@devices
    or critical('No DRBD volumes present');

if ($status) {
    # Report only the devices that are not OK, always in full.
    my $message = join ', ', map { $_->{info} } grep { $_->{level} } @devices;
    if ($status == WARNING) {
        warning($message);
    } else {
        critical($message);
    }
} else {
    # Everything is OK: print full statuses when verbose or when exactly one
    # device is present, otherwise the summarised form.
    my $full = $verbose || @devices == 1;
    my $message = join ', ', map { $full ? $_->{info} : $_->{short} } @devices;
    ok($message);
}

# Not reached: ok/warning/critical all exit. Kept as a safety net.
die;
###########################################################################
# Print the full help text (usage line plus option list) on standard output.
sub help {
    my @lines = (
        'Usage: ' . BASENAME . ' [OPTION...]',
        'Check DRBD resources.',
        'Plugin options:',
        '-v, --verbose Increase verbosity',
        'Help options:',
        '-?, --help Give this help list',
        '--usage Give a short usage message',
    );
    print map { "$_\n" } @lines;
}
# Print the one-line usage summary on standard output.
sub usage {
    print 'Usage: ' . BASENAME . " [-v?] [--verbose] [--help] [--usage]\n";
}
###########################################################################
{
    # Performance-data strings queued by perfdata() and flushed by _exit().
    my @perfdata;

    # Queue one or more performance-data strings for the final output line.
    sub perfdata { push @perfdata, @_ }

    # Print the final output line and terminate with $status as exit code.
    # The line is the message (or the status name when no message is given),
    # followed by "|first" and then "|" plus every remaining entry, each
    # terminated by "|".
    sub _exit {
        my ($status, $message) = @_;

        my @label  = qw( OK WARNING CRITICAL );
        my $output = defined $message
            ? $message
            : ($label[$status] || 'UNKNOWN');

        # The first queued entry is always consumed here, printed or not.
        if (my $first = shift @perfdata) {
            $output .= "|$first";
        }
        # print "\n";
        if (@perfdata) {
            $output .= '|' . join('', map { "$_|" } @perfdata);
        }

        print $output, "\n";
        exit $status;
    }
}
# Convenience wrappers: print the given message (if any) through _exit and
# terminate with the matching status code.
sub ok       { return _exit(OK,       @_) }
sub warning  { return _exit(WARNING,  @_) }
sub critical { return _exit(CRITICAL, @_) }
sub unknown  { return _exit(UNKNOWN,  @_) }
###########################################################################
# Parse STATE_FILE into a list of per-device hash references.
#
# Returns a list indexed by device minor number. Each element is either undef
# (no fully configured device seen for that minor) or a hash with the keys
# cs, ro, pro, ds, pds and iof taken from the device's header line, plus one
# key per "name:value" pair found on the lines that follow it.
#
# If STATE_FILE cannot be opened, critical() is called with the error.
sub get_state {
    my $io = IO::File->new(STATE_FILE, 'r')
        or critical("Could not open @{[STATE_FILE]} for reading: $!");

    # 0: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate C r----
    # ns:0 nr:20492 dw:20480 dr:124 al:5 bm:1296 lo:0 pe:0 ua:0 ap:0 ep:1 wo:d oos:0
    my @state;
    my $device;

    while (my $line = <$io>) {
        # Full header line of a configured device: start a new record.
        if ($line =~ m(^ \s* (\d+): \s* cs:(\w+) \s+ (?:ro|st):(\w+)/(\w+) \s+ ds:(\w+)/(\w+) \s+ \S+ \s+ (\S+))x) {
            $device = $state[$1] = {
                cs  => $2,
                ro  => $3,
                pro => $4,
                ds  => $5,
                pds => $6,
                iof => $7,
            };
            next;
        }

        # Header line of a device that lacks the role/disk fields, such as
        # " 1: cs:Unconfigured". It starts a different device, so stop
        # attributing fields to the previous one; otherwise its "cs:" pair
        # would overwrite the previous device's connection state.
        if ($line =~ m(^ \s* \d+: \s* cs:)x) {
            $device = undef;
            next;
        }

        # Lines seen before any device header carry no device data.
        $device or next;

        # Counter line(s): store every name:value pair on the current device.
        $device->{$1} = $2 while $line =~ /(\w+):(\S+)/g;
    }

    return @state;
}