system: move gateway monitor trigger to separate script #6231

1. The process runs forever to retain proper state, periodically
   syncing the configuration data in order to react correctly.
2. Missing gateways are not an issue.  They will not alert or stick
   to their last verified value.
3. We stop reacting unless a default gateway switch action will follow
   or the gateway is part of a gateway group.  Triggers are not refined
   for now so we just let it run in full processing if a candidate.
4. Emulate the strange monitor alarm output although I don't see the
   use for all of this cryptic goo.  The alarm state (0, 1) was changed
   to reflect the observed transition causing the alarm script to run.
5. Move the action for the script alarm to the script itself.  Requires
   a bit of backend shuffling as well.
6. Only create one script to watch all monitors.  Easier to manage and
   to present as service (which can be stopped and started if needed).
This commit is contained in:
Franco Fichtner 2023-03-23 11:58:06 +01:00
parent 21f1ccda94
commit a2bc2705ea
5 changed files with 185 additions and 29 deletions

1
plist
View File

@ -978,6 +978,7 @@
/usr/local/opnsense/scripts/proxy/setup.sh
/usr/local/opnsense/scripts/routes/del_route.py
/usr/local/opnsense/scripts/routes/gateway_status.php
/usr/local/opnsense/scripts/routes/gateway_watcher.php
/usr/local/opnsense/scripts/routes/gateways.php
/usr/local/opnsense/scripts/routes/show_routes.py
/usr/local/opnsense/scripts/shaper/dummynet_stats.py

View File

@ -35,7 +35,7 @@ function dpinger_services()
{
global $config;
$services = array();
$services = [];
if (!isset($config['gateways']['gateway_item'])) {
return $services;
@ -51,7 +51,7 @@ function dpinger_services()
}
$pconfig = [];
$pconfig['description'] = sprintf(gettext('Gateway Monitor (%s)'), $gateway['name']);
$pconfig['description'] = sprintf(gettext('Gateway monitor (%s)'), $gateway['name']);
$pconfig['php']['restart'] = ['dpinger_configure_do'];
$pconfig['php']['start'] = ['dpinger_configure_do'];
$pconfig['pidfile'] = "/var/run/dpinger_{$gateway['name']}.pid";
@ -62,6 +62,20 @@ function dpinger_services()
$services[] = $pconfig;
}
if (count($services)) {
$pconfig = [];
$pconfig['description'] = gettext('Gateway monitor watcher');
$pconfig['php']['restart'] = ['dpinger_configure_do'];
$pconfig['php']['start'] = ['dpinger_configure_do'];
$pconfig['pidfile'] = '/var/run/gateway_watcher.pid';
$pconfig['php']['args'] = ['verbose', 'id'];
$pconfig['name'] = 'dpinger';
$pconfig['verbose'] = false;
$pconfig['id'] = ':watcher:';
$pconfig['locked'] = true;
$services[] = $pconfig;
}
return $services;
}
@ -281,9 +295,6 @@ function dpinger_configure_do($verbose = false, $gwname = null, $bootup = false)
/* status socket */
$params .= exec_safe('-u %s ', "/var/run/dpinger_{$name}.sock");
/* command to run on alarm */
$params .= '-C "/usr/local/etc/rc.syshook monitor" ';
foreach (
[
'interval' => '-s %ss ',
@ -307,6 +318,13 @@ function dpinger_configure_do($verbose = false, $gwname = null, $bootup = false)
mwexec_bg("/usr/local/bin/dpinger -f {$params}");
}
/* use a separate script to produce the monitor alerts which runs forever */
mwexecf(
'/usr/sbin/daemon -f -p %s /usr/local/opnsense/scripts/routes/gateway_watcher.php %s',
['/var/run/gateway_watcher.pid', 'interface routes alarm'],
true
);
service_log("done.\n", $verbose);
}
@ -320,14 +338,13 @@ function dpinger_run()
function dpinger_status()
{
$status = array();
$gateways_arr = array();
$instances = dpinger_instances();
$status = [];
foreach (config_read_array('gateways', 'gateway_item') as $gwitem) {
foreach ($instances as $gwitem) {
if (isset($gwitem['disabled'])) {
continue;
}
$gateways_arr[$gwitem['name']] = $gwitem;
$gwstatus = isset($gwitem['monitor_disable']) ? 'none' : 'down';
@ -335,13 +352,14 @@ function dpinger_status()
$gwstatus = 'force_down';
}
$status[$gwitem['name']] = array(
$status[$gwitem['name']] = [
'monitor' => $gwitem['monitor'],
'name' => $gwitem['name'],
'status' => $gwstatus,
'stddev' => '~',
'delay' => '~',
'loss' => '~',
);
];
}
foreach (dpinger_processes() as $gwname => $proc) {
@ -368,13 +386,14 @@ function dpinger_status()
if ($r['latency_stddev'] == '0' && $r['loss'] == '0') {
/* not yet ready, act like nothing was returned, but don't consider the gateway to be down */
$status[$gwname] = array(
$status[$gwname] = [
'monitor' => $instances[$gwname]['monitor'],
'name' => $gwname,
'status' => 'none',
'stddev' => '~',
'delay' => '~',
'loss' => '~',
);
];
} else {
$r['latency_stddev'] = round($r['latency_stddev'] / 1000, 1);
$r['latency_avg'] = round($r['latency_avg'] / 1000, 1);
@ -383,12 +402,12 @@ function dpinger_status()
$settings = dpinger_defaults();
if ($r['status'] != 'force_down') {
$keys = array('latencylow', 'latencyhigh', 'losslow', 'losshigh');
$keys = ['latencylow', 'latencyhigh', 'losslow', 'losshigh'];
/* Replace default values by user-defined */
foreach ($keys as $key) {
if (isset($gateways_arr[$gwname][$key]) && is_numeric($gateways_arr[$gwname][$key])) {
$settings[$key] = $gateways_arr[$gwname][$key];
if (isset($instances[$gwname][$key]) && is_numeric($instances[$gwname][$key])) {
$settings[$key] = $instances[$gwname][$key];
}
}
@ -407,14 +426,15 @@ function dpinger_status()
}
}
$status[$gwname] = array(
$status[$gwname] = [
'delay' => sprintf('%0.1f ms', empty($r['latency_avg']) ? 0.0 : round($r['latency_avg'], 1)),
'stddev' => sprintf('%0.1f ms', empty($r['latency_stddev']) ? 0.0 : round($r['latency_stddev'], 1)),
'loss' => sprintf('%0.1f %%', empty($r['loss']) ? 0.0 : round($r['loss'], 1)),
'status' => $r['status'],
'monitor' => $instances[$gwname]['monitor'],
'details' => array_merge($r, $settings),
'status' => $r['status'],
'name' => $gwname,
);
];
}
}
@ -423,7 +443,7 @@ function dpinger_status()
function dpinger_processes()
{
$result = array();
$result = [];
$pidfiles = glob('/var/run/dpinger_*.pid');
if ($pidfiles === false) {
@ -437,7 +457,10 @@ function dpinger_processes()
}
if (preg_match('/^dpinger_(.+)\.pid$/', basename($pidfile), $matches)) {
$socket_file = preg_replace('/\.pid$/', '.sock', $pidfile);
$result[$matches[1]] = array('socket' => $socket_file, 'pidfile' => $pidfile);
$result[$matches[1]] = [
'socket' => $socket_file,
'pidfile' => $pidfile,
];
}
}

View File

@ -32,9 +32,4 @@ if [ -z "${GATEWAY}" ]; then
exit 1
fi
/usr/bin/logger -t dpinger "GATEWAY ALARM: ${GATEWAY} (Addr: ${2} Alarm: ${3} RTT: ${4}us RTTd: ${5}us Loss: ${6}%)"
echo -n "Reloading routes/filter: "
/usr/local/bin/flock -n -E 0 -o /tmp/filter_reload_gateway.lock configctl interface routes configure alarm
exit 0
/usr/bin/logger -t dpinger "GATEWAY ALARM: ${GATEWAY} (Addr: ${2} Alarm: ${3} RTT: ${4} RTTd: ${5} Loss: ${6})"

View File

@ -0,0 +1,131 @@
#!/usr/local/bin/php
<?php
/*
* Copyright (C) 2023 Franco Fichtner <franco@opnsense.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
require_once 'config.inc';
require_once 'util.inc';
require_once 'interfaces.inc';
/*
 * Gateway monitor watcher: runs forever, polls the gateway status and
 * raises an alarm through "rc.syshook monitor" whenever a gateway that
 * matters (default gateway switching enabled or member of a gateway
 * group) changes its status between two polls.  When at least one alarm
 * was raised in a poll cycle the optional backend action passed as the
 * first command line argument is executed afterwards.
 */

/* optional configd action to run after an alarm, null if not given */
$action = !empty($argv[1]) ? $argv[1] : null;

$poll = 1; /* live poll interval (seconds, passed to sleep()) */
$wait = 10; /* startup and alarm delay (seconds, passed to sleep()) */

/* last seen status per gateway name, carried over between poll cycles */
$mode = [];

sleep($wait);

while (1) {
    $alarm = false;

    /* the process never exits, so re-read the configuration every cycle */
    OPNsense\Core\Config::getInstance()->forceReload();
    $config = parse_config();

    $gw_switch_default = isset($config['system']['gw_switch_default']);

    /*
     * Names of all gateways referenced by any gateway group.  Built on
     * first use and then reused for the rest of this poll cycle instead
     * of rescanning the groups for every single gateway report.
     */
    $group_members = null;

    $status = return_gateways_status();

    foreach ($status as $report) {
        $name = $report['name'];

        if (empty($mode[$name])) {
            /* skip one round for baseline */
            continue;
        }

        if ($group_members === null) {
            $group_members = [];
            foreach (config_read_array('gateways', 'gateway_group') as $group) {
                /* tolerate a group without items (assumes 'item' is normally a list -- confirm) */
                foreach ((array)($group['item'] ?? []) as $item) {
                    /* XXX consider trigger conditions later on */
                    $group_members[explode('|', $item)[0]] = true;
                }
            }
        }

        /* wait for valid data before triggering an alarm */
        if ($report['loss'] == '~') {
            continue;
        }

        $previous = $mode[$name];
        $changed = $previous != $report['status'];

        /*
         * With default gateway switching only transitions from or to
         * "down" matter.  For gateway group members every transition is
         * considered as it depends on the individual trigger setting.
         */
        $down_transition = $previous == 'down' || $report['status'] == 'down';
        $ralarm = $changed && (
            isset($group_members[$name]) || ($gw_switch_default && $down_transition)
        );

        /* arguments shared by the test output and the real alarm call */
        $format = '/usr/local/etc/rc.syshook monitor %s %s %s %s %s %s';
        $args = [
            $name,
            $report['monitor'],
            $previous . ' -> ' . $report['status'],
            $report['delay'],
            $report['stddev'],
            $report['loss'],
        ];

        /* XXX for testing */
        echo vsprintf($format . "\n", $args);

        if ($ralarm) {
            /* raise an alarm via the rc.syshook monitor facility */
            shell_safe($format, $args);
            $alarm = true;
        }
    }

    /* react to alarm if backend action was given */
    if ($alarm) {
        if ($action != null) {
            configd_run($action);
        }
        /* XXX this blacks out all alarms for the grace period after alarm */
        sleep($wait);
    } else {
        sleep($poll);
    }

    /* remember what was seen in this cycle as the baseline for the next one */
    foreach ($status as $report) {
        $mode[$report['name']] = $report['status'];
    }
}

View File

@ -109,9 +109,15 @@ message:show system routing table
[routes.configure]
command:/usr/local/etc/rc.routing_configure
parameters:%s
message:reconfiguring routing %s
parameters:
type:script
message:reconfiguring routing
[routes.alarm]
command:/usr/local/bin/flock -n -E 0 -o /tmp/filter_reload_gateway.lock /usr/local/etc/rc.routing_configure alarm
parameters:
type:script
message:reconfiguring routing due to gateway alarm
[route.del]
command: /usr/local/opnsense/scripts/routes/del_route.py