Commit 3c3ca1cf authored by Jonathan Michalon's avatar Jonathan Michalon

Implement config reload (via HUP signal)

Configuration reloading is now implemented. Checks that were removed or changed
while in a failed state trigger an email saying that they will never come back.
New checks are added to the list. Unmodified checks are kept as they are,
preserving their context.
The systemd unit now supports reload and, incidentally, uses systemd itself as
the watchdog instead of the one provided alongside the program.
parent c2efddd5
...@@ -59,6 +59,17 @@ def __alarm_handler(signum, frame): ...@@ -59,6 +59,17 @@ def __alarm_handler(signum, frame):
mails.send_email_report(report) mails.send_email_report(report)
def __hangup_handler(signum, frame):
    """Handle SIGHUP by reloading the configuration module.

    The current checks are set aside and ``config.checks`` is emptied, then
    the configuration module is reloaded (it is expected to register its
    checks again) and the previous checks are merged back in via
    ``config.checks.merge``.

    If the reload raises, the previous checks are restored and the error is
    logged instead of propagating out of the signal handler, so a broken
    configuration file does not leave the daemon without checks.

    ``signum`` and ``frame`` are the standard signal handler arguments and
    are not used.
    """
    print ("Signal SIGHUP caught, reloading config. (%s)" %
           datetime.now())
    from . import config
    from importlib import reload
    oldchecks = list(config.checks)
    config.checks.clear()
    try:
        reload(config.configmodule)
    except Exception:
        # Drop whatever was registered before the failure and keep running
        # with the checks we had before the reload was requested.
        # NOTE(review): other settings touched by the config module may have
        # been partially updated before the failure -- confirm this is
        # acceptable.
        logging.exception("Config reload failed, keeping previous checks")
        config.checks[:] = oldchecks
        return
    config.checks.merge(oldchecks)
def parse_args(): def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("-1", "--one", parser.add_argument("-1", "--one",
...@@ -85,11 +96,12 @@ def import_config(configfile): ...@@ -85,11 +96,12 @@ def import_config(configfile):
filename = os.path.basename(configfile) filename = os.path.basename(configfile)
base, ext = os.path.splitext(filename) base, ext = os.path.splitext(filename)
try: try:
import_module(base) configmodule = import_module(base)
except ImportError as e: except ImportError as e:
logging.critical("Cannot load config from '%s': %s" % ( logging.critical("Cannot load config from '%s': %s" % (
configfile, str(e))) configfile, str(e)))
sys.exit(1) sys.exit(1)
config.install_attr('configmodule', configmodule)
def run(): def run():
...@@ -108,6 +120,7 @@ def run(): ...@@ -108,6 +120,7 @@ def run():
# register signal handling # register signal handling
signal.signal(signal.SIGUSR1, __usr1_handler) signal.signal(signal.SIGUSR1, __usr1_handler)
signal.signal(signal.SIGALRM, __alarm_handler) signal.signal(signal.SIGALRM, __alarm_handler)
signal.signal(signal.SIGHUP, __hangup_handler)
# register report signal interval # register report signal interval
if config.emails.report.every > 0: if config.emails.report.every > 0:
......
...@@ -22,6 +22,19 @@ class Checks(list): ...@@ -22,6 +22,19 @@ class Checks(list):
for check in checks: for check in checks:
self += [check(d, **options) for d in dests] self += [check(d, **options) for d in dests]
# This should (assuming "other" is the "older" list):
# - pick up checks defined in both lists (keeping the old one with its variables)
# - send an email for failing checks in the other list but not in ours (they were removed)
def merge(self, other):
    """Merge the checks of an older list into this one.

    Every entry of this list that compares equal to an entry of ``other``
    is replaced by that older entry. For an older entry that has no equal
    counterpart here and whose ``ok`` attribute is falsy,
    ``mails.send_email_for_check`` is called with ``True`` as its second
    argument.
    """
    for previous in other:
        # Collect the matching positions first, then swap the old object in.
        positions = [pos for pos, current in enumerate(self)
                     if previous == current]
        for pos in positions:
            self[pos] = previous
        if not positions and not previous.ok:
            mails.send_email_for_check(previous, True)
class Check(object): class Check(object):
def __init__(self, dest, **options): def __init__(self, dest, **options):
...@@ -51,6 +64,11 @@ class Check(object): ...@@ -51,6 +64,11 @@ class Check(object):
self.retry_count, self.retry_count,
self.retry) self.retry)
def __eq__(self, other):
    """Compare two checks by class name, destination and options.

    Classes are compared by name, not by identity; ``dest`` and
    ``_options`` must compare equal as well.
    """
    if self.__class__.__name__ != other.__class__.__name__:
        return False
    if not (self.dest == other.dest):
        return False
    return self._options == other._options
def setup(self): def setup(self):
pass pass
......
...@@ -105,12 +105,15 @@ def send_email(subject, body, extra_headers={}): ...@@ -105,12 +105,15 @@ def send_email(subject, body, extra_headers={}):
msg.as_string()) msg.as_string())
def send_email_for_check(check): def send_email_for_check(check, removed=False):
from . import config from . import config
# ensure we do not traceback with unknown substitutions # ensure we do not traceback with unknown substitutions
state = 'OK' if check.ok else 'Problem'
if removed:
state = 'Removed'
subject = config.emails.subject_tpl.format_map( subject = config.emails.subject_tpl.format_map(
defaultdict(lambda: "<no substitution>", defaultdict(lambda: "<no substitution>",
state='OK' if check.ok else 'Problem', state=state,
check=check.__class__.__name__, check=check.__class__.__name__,
dest=check.target_name)) dest=check.target_name))
...@@ -123,13 +126,16 @@ def send_email_for_check(check): ...@@ -123,13 +126,16 @@ def send_email_for_check(check):
msg_text += ("recovered after %s (%d %s)." % msg_text += ("recovered after %s (%d %s)." %
(delta, n, "retry" if n == 1 else "retries")) (delta, n, "retry" if n == 1 else "retries"))
else: else:
msg_text += ("failure:\n%s\n" % check.errmsg.strip()) if removed:
msg_text += ("\nFYI, last exec was:\n%s\n" % check.last_exec.strip()) msg_text += "The check was removed from configuration.\n"
else:
msg_text += ("failure:\n%s\n" % check.errmsg.strip())
msg_text += ("\nLast run was:\n%s\n" % check.last_exec.strip())
extra_headers = {} extra_headers = {}
extra_headers['Message-ID'] = make_msgid(type(check).__name__) extra_headers['Message-ID'] = make_msgid(type(check).__name__)
# if check is OK it's a follow up, so set In-Reply-To # if it's a follow up, set In-Reply-To
if check.ok and hasattr(check, 'mails_msgid'): if hasattr(check, 'mails_msgid'):
extra_headers['In-Reply-To'] = check.mails_msgid extra_headers['In-Reply-To'] = check.mails_msgid
extra_headers['References'] = check.mails_msgid extra_headers['References'] = check.mails_msgid
check.mails_msgid = extra_headers['Message-ID'] check.mails_msgid = extra_headers['Message-ID']
......
...@@ -3,8 +3,9 @@ Description=Picomon Monitoring Daemon for %i ...@@ -3,8 +3,9 @@ Description=Picomon Monitoring Daemon for %i
After=network.target After=network.target
[Service] [Service]
ExecStart=/usr/local/bin/picomon-watchdog -c /etc/picomon/%i.py ExecStart=/usr/local/bin/picomon -c /etc/picomon/%i.py
#ExecReload=/bin/kill -HUP $MAINPID ExecReload=/bin/kill -HUP $MAINPID
Restart=on-failure
User=nobody User=nobody
Group=nogroup Group=nogroup
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment