import socket, os, errno, logging, glob, subprocess

import cherrypy
from cherrypy import log, _cplogging
from cherrypy.process.plugins import Daemonizer, PIDFile
from logging import handlers


class States:
    Passive, Active, Failed, StartingUp, Maintenance, Frozen = range(6)


class HealthCheck(object):
    def __init__(self, script_dir, service_fqdn, my_ip, prev_state, prev_response, prev_addr):
        self.script_directory = script_dir
        self.fqdn = service_fqdn
        self.system_ip = my_ip
        self.last_state = prev_state
        self.last_response = prev_response
        self.last_address = prev_addr
        self.state_process = None
        self.startingup_countdown = 0

    def silentremove(self, filename):
        # Remove a trigger file if it exists; ignore the error if it doesn't.
        try:
            os.remove(self.script_directory + filename)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise

    def logmsg(self, message):
        log.error(message, context='HTTP', severity=20, traceback=True)

    @cherrypy.expose
    def index(self):
        new_state = -1
        # Default the service IP so we always have something to record, even if
        # the sanity checks below fail.
        service_ip = "unknown"
        # If we had a state change process, poll it so that the process gets
        # reaped properly when it has finished, rather than leaving a defunct
        # process lying around.
        if self.state_process is not None:
            if self.state_process.poll() is not None:
                self.logmsg("State change script has finished")
                self.state_process = None
        # We can't abort the daemon starting if we fail to get the right
        # info so we only proceed if the various sanity checks work
        if self.fqdn != "" and self.system_ip != "":
            # Get the IP address from Route 53
            try:
                service_ip = socket.gethostbyname(self.fqdn)
            except Exception as e:
                self.logmsg("Got exception trying to get IP address for '%s': %s" % (self.fqdn, str(e)))
                service_ip = "unknown"
            # Only log IP address info when a change happens
            if service_ip != self.last_address:
                self.logmsg("Service IP = %s, this IP = %s, last service IP = %s" %
                            (service_ip, self.system_ip, self.last_address))
            if os.path.isfile(self.script_directory + "/frozen"):
                if self.last_state != States.Frozen:
                    self.logmsg("Frozen file exists")
                new_state = States.Frozen
            elif os.path.isfile(self.script_directory + "/maintenance"):
                if self.last_state != States.Maintenance:
                    self.logmsg("Maintenance file exists")
                new_state = States.Maintenance
            elif self.last_state == States.Failed:
                new_state = States.Failed
            elif self.last_state == States.Frozen:
                # We were frozen but now we aren't - figure out what our state
                # should be from the response code we were issuing
                if self.last_response == 200:
                    self.last_state = new_state = States.Active
                elif self.last_response == 202:
                    self.last_state = new_state = States.Passive
                elif self.last_response == 203:
                    self.last_state = new_state = States.StartingUp
                elif self.last_response == 500:
                    self.last_state = new_state = States.Failed
                elif self.last_response == 503:
                    self.last_state = new_state = States.Maintenance
                else:
                    self.logmsg("Coming out of frozen, old response code was %s" % str(self.last_response))
                    self.last_state = new_state = States.Failed
            else:
                # The following logic ONLY works if:
                # a) there are two nodes
                # b) the IP addresses returned by Route 53 map onto those nodes
                # See if the external health checks think we're healthy
                healthy = os.path.isfile(self.script_directory + "/healthy")
                if service_ip != self.last_address:
                    # Active node has changed
                    if service_ip == self.system_ip:
                        if self.last_state == States.Passive:
                            # We've become the new active node - switch to starting up
                            self.logmsg("We're the active node and we were passive, now starting up")
                            new_state = States.StartingUp
                        else:
                            self.logmsg("Now active node with uncaught state of %s" % str(self.last_state))
                    elif self.last_state == States.Active:
                        # We were the active node - see if we are still healthy,
                        # in which case we switch to passive, or if we have failed.
                        if healthy:
                            self.logmsg("Active node has changed and we are healthy; switching to Passive")
                            new_state = States.Passive
                        else:
                            self.logmsg("Active node has changed and we aren't healthy; switching to Failed")
                            new_state = States.Failed
                    elif self.last_state == States.Passive:
                        # We're on the passive node, we were passive and we're still passive
                        new_state = States.Passive
                    elif self.last_state == States.StartingUp:
                        # We were starting up the services but the IP address has
                        # shifted, so we need to stop the services - switch back to Passive
                        self.logmsg("No longer the active node, switching from StartingUp back to Passive")
                        new_state = States.Passive
                    else:
                        self.logmsg("IP address has changed with uncaught state of %s" % str(self.last_state))
                else:
                    if service_ip == self.system_ip:
                        # We're the active node.
                        if self.last_state == States.Maintenance:
                            self.logmsg("Active node, last state was Maintenance, switching to Passive")
                            new_state = States.Passive
                        elif self.last_state == States.Passive:
                            self.logmsg("Active node, last state was Passive, switching to StartingUp")
                            new_state = States.StartingUp
                        elif self.last_state == States.StartingUp:
                            if healthy:
                                # Finished starting up
                                self.logmsg("Healthy active node, switching from StartingUp to Active")
                                new_state = States.Active
                            else:
                                # Still starting
                                new_state = States.StartingUp
                        elif self.last_state == States.Active:
                            if healthy:
                                new_state = States.Active
                            else:
                                self.logmsg("Active node but we aren't healthy; switching to Failed")
                                new_state = States.Failed
                        else:
                            self.logmsg("Active node with uncaught state of %s" % str(self.last_state))
                    else:
                        # We're the passive node
                        if self.last_state != States.Passive:
                            self.logmsg("Passive node = passive state")
                        new_state = States.Passive
        else:
            # Sanity checks failed = failed :-)
            new_state = States.Failed

        # Map the new state onto the HTTP response code we'll return
        if new_state == States.Active:
            new_response = 200
        elif new_state == States.Passive:
            new_response = 202
        elif new_state == States.StartingUp:
            new_response = 203
        elif new_state == States.Failed:
            new_response = 500
        elif new_state == States.Maintenance:
            new_response = 503
        elif new_state == States.Frozen:
            new_response = self.last_response
        else:
            self.logmsg("Unmatched state of %s" % str(new_state))
            new_response = 500

        # Clean up some of the trigger files
        if new_state != States.Active:
            self.silentremove("/healthy")
        if new_state != States.Maintenance:
            self.silentremove("/maintenance")
        if new_state != States.Frozen:
            self.silentremove("/frozen")

        cherrypy.response.status = new_response
        cherrypy.response.headers['Content-type'] = 'text/html'
        cherrypy.response.body = [
            "This is the Linaro health check service. "
            "State is %s and response code is %s" % (str(new_state), str(new_response))]

        # Save away the various bits of information
        try:
            with open(self.script_directory + "/last_state", "w") as fp:
                fp.write(str(new_state))
        except Exception as e:
            self.logmsg("Got exception trying to save reported state: %s" % str(e))
        try:
            with open(self.script_directory + "/last_response", "w") as fp:
                fp.write(str(new_response))
        except Exception as e:
            self.logmsg("Got exception trying to save reported response: %s" % str(e))
        try:
            with open(self.script_directory + "/last_address", "w") as fp:
                fp.write(str(service_ip))
        except Exception as e:
            self.logmsg("Got exception trying to save service IP: %s" % str(e))

        # Set the starting up count - we won't run the state change script until
        # the count reaches 0. Since the script hasn't run, we'll stay in StartingUp
        # because we can't switch to Active until monit (or whatever) detects the
        # service as actually running.
        if new_state == States.StartingUp and self.last_state != States.StartingUp:
            self.startingup_countdown = 10
            # Override last state in order to prevent the state change script
            # from running
            self.last_state = States.StartingUp
            self.logmsg("Holding start up for %s cycles" % str(self.startingup_countdown))
        elif new_state == States.StartingUp and self.startingup_countdown > 0:
            # Decrement the countdown - if we reach zero, switch last state to
            # Passive to trigger the state change script
            self.startingup_countdown -= 1
            if self.startingup_countdown == 0:
                self.last_state = States.Passive
            else:
                self.logmsg("Holding start up for %s cycles" % str(self.startingup_countdown))

        # See if a script exists for one of the supported state changes
        if new_state != self.last_state:
            valid_state = False
            if self.last_state == States.Passive:
                if new_state in (States.StartingUp, States.Maintenance, States.Frozen):
                    valid_state = True
            elif self.last_state == States.StartingUp:
                if new_state in (States.Passive, States.Active, States.Failed,
                                 States.Maintenance, States.Frozen):
                    valid_state = True
            elif self.last_state == States.Active:
                if new_state in (States.Failed, States.Maintenance, States.Frozen, States.Passive):
                    valid_state = True
            elif self.last_state == States.Failed:
                if new_state in (States.Maintenance, States.Frozen):
                    valid_state = True
            elif self.last_state == States.Maintenance or self.last_state == States.Frozen:
                if new_state == States.Passive:
                    valid_state = True
            if valid_state:
                filename = "from_%s_to_%s.*" % (str(self.last_state), str(new_state))
                files = glob.glob("%s/%s" % (self.script_directory, filename))
                if len(files) == 1:
                    # os.system(files[0])
                    self.logmsg("Firing state change script %s" % files[0])
                    self.state_process = subprocess.Popen([files[0]])
                elif len(files) > 1:
                    self.logmsg("More than one matching script for state change %s to %s" %
                                (str(self.last_state), str(new_state)))
            else:
                self.logmsg("Unexpected state change from %s to %s" %
                            (str(self.last_state), str(new_state)))

        self.last_state = new_state
        self.last_response = new_response
        self.last_address = service_ip


def safereadline(logger, script_directory, filename):
    # Read the first line of a state file, silently treating a missing file
    # as an empty result.
    line = ""
    try:
        with open(script_directory + filename) as fp:
            line = fp.readline()
    except IOError as e:
        if e.errno != errno.ENOENT:
            logger.error("Got exception trying to read %s: %s" % (filename, str(e)))
    return line


def main():
    # Set up a syslog logger so that we can report stuff before
    # the daemon starts up
logging.getLogger("linaro_healthcheck") syslog_logger.setLevel(logging.DEBUG) handler = logging.handlers.SysLogHandler( facility=logging.handlers.SysLogHandler.LOG_DAEMON, address="/dev/log") syslog_logger.addHandler(handler) script_dir = os.path.dirname(os.path.abspath(__file__)) syslog_logger.debug("Linaro Healthcheck running from %s" % script_dir) service_fqdn = "" try: with open(script_dir + "/fqdn") as fp: service_fqdn = str(fp.readline()).rstrip() except Exception,e: syslog_logger.error("Got exception trying to get fqdn: %s" % str(e)) # Try to get this system's IP address my_ip = "" try: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.connect(("8.8.8.8", 53)) my_ip = s.getsockname()[0] s.close except Exception,e: syslog_logger.error("Got exception trying to get system's IP address: %s" % str(e)) # See if we've got a recorded last state prev_state = States.Passive line = safereadline(syslog_logger, script_dir, "/last_state") if (line != ""): prev_state = int(line) if ((prev_state < States.Passive) or (prev_state > States.Frozen)): prev_state = States.Passive # and see if we've got a recorded last response code line = safereadline(syslog_logger, script_dir, "/last_response") if (line != ""): prev_response = int(line) else: prev_response = 202 # and a last IP address prev_address = safereadline(syslog_logger, script_dir, "/last_address") if (prev_address == ""): prev_address = "error" # If we were anything other then failed, maintenance or frozen last time # this script ran, switch to Passive so that things get started up # properly, i.e. Passive -> StartingUp -> Active if (prev_state != States.Failed and prev_state != States.Maintenance and prev_state != States.Frozen): prev_state = States.Passive prev_response = 202 cherrypy.config.update({'server.socket_host': '0.0.0.0', 'server.socket_port': 1234, 'server.thread_pool': 1, 'server.thread_pool_max': 1, 'tools.staticdir.on': True, 'tools.staticdir.dir': script_dir, 'log.screen': True, 'tools.sessions.on': True, }) config = {'/': { } } application = cherrypy.tree.mount(HealthCheck(script_dir, service_fqdn, my_ip, prev_state, prev_response, prev_address), "/", config) #log = application.log logscope = cherrypy.log # Make a new RotatingFileHandler for the error log. fname = getattr(logscope, "rot_error_file", "%s/error.log" % script_dir) h = handlers.TimedRotatingFileHandler(fname, when='midnight') h.setLevel(logging.DEBUG) h.setFormatter(_cplogging.logfmt) logscope.error_file = "" logscope.error_log.addHandler(h) # Make a new RotatingFileHandler for the access log. fname = getattr(logscope, "rot_access_file", "%s/access.log" % script_dir) h = handlers.TimedRotatingFileHandler(fname, when='midnight') h.setLevel(logging.DEBUG) h.setFormatter(_cplogging.logfmt) logscope.access_file = "" logscope.access_log.addHandler(h) # Add a CTRL+C handler if hasattr(cherrypy.engine, 'signal_handler'): cherrypy.engine.signal_handler.subscribe() if hasattr(cherrypy.engine, 'console_control_handler'): cherrypy.engine.console_control_handler.subscrive() # Set up the daemon d = Daemonizer(cherrypy.engine) d.subscribe() PIDFile(cherrypy.engine, '/var/run/linaro-healthcheck.pid').subscribe() cherrypy.engine.start() cherrypy.engine.block() if __name__ == '__main__': main()