summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Philip Colmer <philip.colmer@linaro.org> 2014-01-22 13:43:33 +0000
committer: Philip Colmer <philip.colmer@linaro.org> 2014-01-22 13:43:33 +0000
commit: e33d37f0f95d56eebda8fa57a5260cdf0997d5c8 (patch)
tree: ecda7b835bb52c585dd6f2e8f5e0ac01e9ff1667
parent: d64fc96ffde38c7c4121072101be67037ab5c97e (diff)
download: linaro-healthcheck-e33d37f0f95d56eebda8fa57a5260cdf0997d5c8.tar.gz
Fixed state change bugs and tweaked upstart script
-rw-r--r-- healthcheck.py 54
-rw-r--r-- linaro-healthcheck.conf 4
2 files changed, 40 insertions, 18 deletions
diff --git a/healthcheck.py b/healthcheck.py
index 47b8d79..4cab2a4 100644
--- a/healthcheck.py
+++ b/healthcheck.py
@@ -15,6 +15,7 @@ class HealthCheck(object):
self.last_state = prev_state
self.last_response = prev_response
self.last_address = prev_addr
+ self.state_process = None
def silentremove(self, filename):
try:
@@ -30,6 +31,13 @@ class HealthCheck(object):
def index(self):
new_state = -1
+ # If we previously started a state change script, poll it so that the child
+ # process is reaped once it has finished, rather than leaving a defunct (zombie) process behind.
+ if not (self.state_process is None):
+ if (self.state_process.poll() is None):
+ self.logmsg("State change script has finished")
+ self.state_process = None
+
# We can't abort the daemon starting if we fail to get the right
# info so we only proceed if the various sanity checks work
if (self.fqdn != "" and self.system_ip != ""):
@@ -63,39 +71,52 @@ class HealthCheck(object):
if (service_ip != self.last_address):
# Active node has changed
- if (self.last_state == States.Passive):
- # We've become the new active node - switch to starting up
- new_state = States.StartingUp
- if (self.last_state == States.Active):
+ if (service_ip == self.system_ip):
+ if (self.last_state == States.Passive):
+ # We've become the new active node - switch to starting up
+ self.logmsg("We're the active node and we were passive, now starting up")
+ new_state = States.StartingUp
+ else:
+ self.logmsg("Now active ode with uncaught state of %s" % str(self.last_state))
+ elif (self.last_state == States.Active):
# We were the active node - see if we are still healthy,
# in which case we switch to passive, or if we have failed.
if (healthy):
+ self.logmsg("Active node has changed and we are healthy; switching to Passive")
new_state = States.Passive
else:
self.logmsg("Active node has changed and we aren't healthy; switching to Failed")
new_state = States.Failed
+ else:
+ self.logmsg("IP address has changed with uncaught state of %s" % str(self.last_state))
else:
if (service_ip == self.system_ip):
# We're the active node.
if (self.last_state == States.Maintenance):
+ self.logmsg("Active node, last state was Maintenance, switching to Passive")
new_state = States.Passive
- if (self.last_state == States.Passive):
+ elif (self.last_state == States.Passive):
+ self.logmsg("Active node, last state was Passive, switching to StartingUp")
new_state = States.StartingUp
- if (self.last_state == States.StartingUp):
+ elif (self.last_state == States.StartingUp):
if (healthy):
# Finished starting up
+ self.logmsg("Healthy active node, switching from StartingUp to Active")
new_state = States.Active
else:
# Still starting
new_state = States.StartingUp
- if (self.last_state == States.Active):
+ elif (self.last_state == States.Active):
if (healthy):
new_state = States.Active
else:
self.logmsg("Active node but we aren't healthy; switching to Failed")
new_state = States.Failed
+ else:
+ self.logmsg("Active node with uncaught state of %s" % str(self.last_state))
else:
# We're the passive node
+ self.logmsg("Passive node = passive state")
new_state = States.Passive
else:
# Sanity checks failed = failed :-)
@@ -114,7 +135,7 @@ class HealthCheck(object):
elif (new_state == States.Frozen):
new_response = last_response
else:
- log.error("Unmatched state of %s" % str(new_state))
+ self.logmsg("Unmatched state of %s" % str(new_state))
# Clean up some of the trigger files
if (new_state != States.Active):
@@ -174,9 +195,10 @@ class HealthCheck(object):
files = glob.glob("%s/%s" % (self.script_directory, filename))
if len(files) == 1:
# os.system(files[0])
- subprocess.Popen([files[0]])
+ self.logmsg("Firing state change script %s" % files[0])
+ self.state_process = subprocess.Popen([files[0]])
elif (len(files) > 1):
- self.logmsg("more than one matching script for stage change %s to %s" % (str(self.last_state), str(new_state)))
+ self.logmsg("More than one matching script for stage change %s to %s" % (str(self.last_state), str(new_state)))
else:
self.logmsg("Unexpected stage change from %s to %s" % (str(self.last_state), str(new_state)))
@@ -205,6 +227,7 @@ def main():
script_dir = os.path.dirname(os.path.abspath(__file__))
syslog_logger.debug("Linaro Healthcheck running from %s" % script_dir)
+
service_fqdn = ""
try:
with open(script_dir + "/fqdn") as fp:
@@ -242,9 +265,10 @@ def main():
if (prev_address == ""):
prev_address = "error"
- # If we were Active last time this script ran, switch to Passive so
- # that things get started up properly, i.e. Passive -> StartingUp -> Active
- if (prev_state == States.Active):
+ # If we were anything other than failed, maintenance or frozen last time
+ # this script ran, switch to Passive so that things get started up
+ # properly, i.e. Passive -> StartingUp -> Active
+ if (prev_state != States.Failed and prev_state != States.Maintenance and prev_state != States.Frozen):
prev_state = States.Passive
prev_response = 202
@@ -269,7 +293,7 @@ def main():
logscope = cherrypy.log
# Make a new RotatingFileHandler for the error log.
- fname = getattr(logscope, "rot_error_file", "error.log")
+ fname = getattr(logscope, "rot_error_file", "%s/error.log" % script_dir)
h = handlers.TimedRotatingFileHandler(fname, when='midnight')
h.setLevel(logging.DEBUG)
h.setFormatter(_cplogging.logfmt)
@@ -277,7 +301,7 @@ def main():
logscope.error_log.addHandler(h)
# Make a new RotatingFileHandler for the access log.
- fname = getattr(logscope, "rot_access_file", "access.log")
+ fname = getattr(logscope, "rot_access_file", "%s/access.log" % script_dir)
h = handlers.TimedRotatingFileHandler(fname, when='midnight')
h.setLevel(logging.DEBUG)
h.setFormatter(_cplogging.logfmt)
diff --git a/linaro-healthcheck.conf b/linaro-healthcheck.conf
index 1991272..cf5e1e1 100644
--- a/linaro-healthcheck.conf
+++ b/linaro-healthcheck.conf
@@ -5,9 +5,7 @@ stop on runlevel [!2345]
pre-start script
# Stop job from continuing if no fqdn file
- [ -f /opt/linaro-healthcheck/fqdn ] && exit 0
-
- exit 1
+ [ ! -f /opt/linaro-healthcheck/fqdn ] && exit 1
end script
expect daemon