diff options
-rw-r--r-- | healthcheck.py | 54 | ||||
-rw-r--r-- | linaro-healthcheck.conf | 4 |
2 files changed, 40 insertions, 18 deletions
diff --git a/healthcheck.py b/healthcheck.py index 47b8d79..4cab2a4 100644 --- a/healthcheck.py +++ b/healthcheck.py @@ -15,6 +15,7 @@ class HealthCheck(object): self.last_state = prev_state self.last_response = prev_response self.last_address = prev_addr + self.state_process = None def silentremove(self, filename): try: @@ -30,6 +31,13 @@ class HealthCheck(object): def index(self): new_state = -1 + # If we had a state change process, poll it so that the process quits properly + # when it has finished, rather than leaving a defunct thing lying around. + if not (self.state_process is None): + if (self.state_process.poll() is None): + self.logmsg("State change script has finished") + self.state_process = None + # We can't abort the daemon starting if we fail to get the right # info so we only proceed if the various sanity checks work if (self.fqdn != "" and self.system_ip != ""): @@ -63,39 +71,52 @@ class HealthCheck(object): if (service_ip != self.last_address): # Active node has changed - if (self.last_state == States.Passive): - # We've become the new active node - switch to starting up - new_state = States.StartingUp - if (self.last_state == States.Active): + if (service_ip == self.system_ip): + if (self.last_state == States.Passive): + # We've become the new active node - switch to starting up + self.logmsg("We're the active node and we were passive, now starting up") + new_state = States.StartingUp + else: + self.logmsg("Now active ode with uncaught state of %s" % str(self.last_state)) + elif (self.last_state == States.Active): # We were the active node - see if we are still healthy, # in which case we switch to passive, or if we have failed. if (healthy): + self.logmsg("Active node has changed and we are healthy; switching to Passive") new_state = States.Passive else: self.logmsg("Active node has changed and we aren't healthy; switching to Failed") new_state = States.Failed + else: + self.logmsg("IP address has changed with uncaught state of %s" % str(self.last_state)) else: if (service_ip == self.system_ip): # We're the active node. if (self.last_state == States.Maintenance): + self.logmsg("Active node, last state was Maintenance, switching to Passive") new_state = States.Passive - if (self.last_state == States.Passive): + elif (self.last_state == States.Passive): + self.logmsg("Active node, last state was Passive, switching to StartingUp") new_state = States.StartingUp - if (self.last_state == States.StartingUp): + elif (self.last_state == States.StartingUp): if (healthy): # Finished starting up + self.logmsg("Healthy active node, switching from StartingUp to Active") new_state = States.Active else: # Still starting new_state = States.StartingUp - if (self.last_state == States.Active): + elif (self.last_state == States.Active): if (healthy): new_state = States.Active else: self.logmsg("Active node but we aren't healthy; switching to Failed") new_state = States.Failed + else: + self.logmsg("Active node with uncaught state of %s" % str(self.last_state)) else: # We're the passive node + self.logmsg("Passive node = passive state") new_state = States.Passive else: # Sanity checks failed = failed :-) @@ -114,7 +135,7 @@ class HealthCheck(object): elif (new_state == States.Frozen): new_response = last_response else: - log.error("Unmatched state of %s" % str(new_state)) + self.logmsg("Unmatched state of %s" % str(new_state)) # Clean up some of the trigger files if (new_state != States.Active): @@ -174,9 +195,10 @@ class HealthCheck(object): files = glob.glob("%s/%s" % (self.script_directory, filename)) if len(files) == 1: # os.system(files[0]) - subprocess.Popen([files[0]]) + self.logmsg("Firing state change script %s" % files[0]) + self.state_process = subprocess.Popen([files[0]]) elif (len(files) > 1): - self.logmsg("more than one matching script for stage change %s to %s" % (str(self.last_state), str(new_state))) + self.logmsg("More than one matching script for stage change %s to %s" % (str(self.last_state), str(new_state))) else: self.logmsg("Unexpected stage change from %s to %s" % (str(self.last_state), str(new_state))) @@ -205,6 +227,7 @@ def main(): script_dir = os.path.dirname(os.path.abspath(__file__)) syslog_logger.debug("Linaro Healthcheck running from %s" % script_dir) + service_fqdn = "" try: with open(script_dir + "/fqdn") as fp: @@ -242,9 +265,10 @@ def main(): if (prev_address == ""): prev_address = "error" - # If we were Active last time this script ran, switch to Passive so - # that things get started up properly, i.e. Passive -> StartingUp -> Active - if (prev_state == States.Active): + # If we were anything other then failed, maintenance or frozen last time + # this script ran, switch to Passive so that things get started up + # properly, i.e. Passive -> StartingUp -> Active + if (prev_state != States.Failed and prev_state != States.Maintenance and prev_state != States.Frozen): prev_state = States.Passive prev_response = 202 @@ -269,7 +293,7 @@ def main(): logscope = cherrypy.log # Make a new RotatingFileHandler for the error log. - fname = getattr(logscope, "rot_error_file", "error.log") + fname = getattr(logscope, "rot_error_file", "%s/error.log" % script_dir) h = handlers.TimedRotatingFileHandler(fname, when='midnight') h.setLevel(logging.DEBUG) h.setFormatter(_cplogging.logfmt) @@ -277,7 +301,7 @@ def main(): logscope.error_log.addHandler(h) # Make a new RotatingFileHandler for the access log. - fname = getattr(logscope, "rot_access_file", "access.log") + fname = getattr(logscope, "rot_access_file", "%s/access.log" % script_dir) h = handlers.TimedRotatingFileHandler(fname, when='midnight') h.setLevel(logging.DEBUG) h.setFormatter(_cplogging.logfmt) diff --git a/linaro-healthcheck.conf b/linaro-healthcheck.conf index 1991272..cf5e1e1 100644 --- a/linaro-healthcheck.conf +++ b/linaro-healthcheck.conf @@ -5,9 +5,7 @@ stop on runlevel [!2345] pre-start script # Stop job from continuing if no fqdn file - [ -f /opt/linaro-healthcheck/fqdn ] && exit 0 - - exit 1 + [ ! -f /opt/linaro-healthcheck/fqdn ] && exit 1 end script expect daemon |