summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author root <root@ip-10-138-52-91.ec2.internal> 2014-01-17 15:23:28 +0000
committer root <root@ip-10-138-52-91.ec2.internal> 2014-01-17 15:23:28 +0000
commit 2ae4d7edec9f4c29e334ecf9329ff7dadf9a61f7 (patch)
tree abe561c1d090fe7f7b28251b78aaa60249053063
parent cccc654008c88d3b63621c37593a93864db24499 (diff)
New healthcheck.py script
-rw-r--r-- healthcheck.py 191
1 files changed, 191 insertions, 0 deletions
diff --git a/healthcheck.py b/healthcheck.py
new file mode 100644
index 0000000..59d69be
--- /dev/null
+++ b/healthcheck.py
@@ -0,0 +1,191 @@
+import string, cgi, time, socket, os, errno
+import cherrypy
+from cherrypy import log
+
+class States:
+    """Node states tracked by the health check.
+
+    The integer values are what gets written to, and read back from, the
+    ``last_state`` file, so they are spelled out explicitly.
+    """
+    Passive = 0
+    Active = 1
+    Failed = 2
+    StartingUp = 3
+    Maintenance = 4
+    Frozen = 5
+
+class HealthCheck(object):
+    """CherryPy application that reports this node's state as an HTTP status.
+
+    Every request to ``/`` re-evaluates the node state from:
+
+    * the previous state, response code and service address, persisted in the
+      ``last_state``, ``last_response`` and ``last_address`` files;
+    * the ``frozen``, ``maintenance`` and ``healthy`` trigger files;
+    * the address the service FQDN currently resolves to.
+
+    All files live in the directory given as ``script_dir``.
+    """
+
+    # HTTP status reported for each state.  Frozen is deliberately absent:
+    # a frozen node repeats whatever status it reported last time.
+    _RESPONSE_CODES = {
+        States.Active: 200,
+        States.Passive: 202,
+        States.StartingUp: 203,
+        States.Failed: 500,
+        States.Maintenance: 503,
+    }
+
+    def __init__(self, script_dir, service_fqdn, my_ip):
+        """Remember where the state files live and which addresses to compare.
+
+        script_dir   -- directory holding the state and trigger files
+        service_fqdn -- host name that resolves to the currently active node
+        my_ip        -- this node's own IP address
+        """
+        self.script_directory = script_dir
+        self.fqdn = service_fqdn
+        self.system_ip = my_ip
+
+    def silentremove(self, filename):
+        """Delete ``filename`` (given with a leading "/") from the script directory.
+
+        A file that does not exist is not an error; any other OSError is
+        re-raised.
+        """
+        try:
+            os.remove(self.script_directory + filename)
+        except OSError as e:
+            if e.errno != errno.ENOENT:
+                raise
+
+    def _read_saved(self, name, description, convert=str):
+        """Return the converted first line of a state file, or None.
+
+        None is returned when the file is missing, unreadable or cannot be
+        converted.  Everything except "file does not exist" is logged.
+        """
+        try:
+            with open(self.script_directory + name) as fp:
+                return convert(fp.readline().strip())
+        except Exception as e:
+            # Not every exception carries errno (e.g. ValueError from int()),
+            # so it must not be read unconditionally.
+            if getattr(e, "errno", None) != errno.ENOENT:
+                log.error("Got exception trying to read %s: %s" % (description, str(e)))
+            return None
+
+    def _save(self, name, value, description):
+        """Write ``str(value)`` to a state file, logging (not raising) on failure."""
+        try:
+            # "with" guarantees the file is flushed and closed.
+            with open(self.script_directory + name, "w") as fp:
+                fp.write(str(value))
+        except Exception as e:
+            log.error("Got exception trying to save %s: %s" % (description, str(e)))
+
+    def _transition(self, last_state, service_ip, last_address):
+        """Return the new state once the service address has been resolved.
+
+        The logic ONLY works if:
+          a) there are two nodes
+          b) the IP addresses returned by Route 53 map onto those nodes
+
+        NOTE(review): indentation was lost in the copy of the source this was
+        written from; the nesting below was reconstructed from the original
+        inline comments - confirm against the real file.
+        """
+        if service_ip != last_address:
+            # Active node has changed
+            if last_state == States.Passive:
+                # We've become the new active node - switch to starting up
+                return States.StartingUp
+            if last_state == States.Active:
+                # We were the active node - we must have failed
+                return States.Failed
+            return States.Passive
+
+        if service_ip != self.system_ip:
+            # We're the passive node
+            return States.Passive
+
+        # We're the active node.
+        # See if the external health checks think we're healthy?
+        healthy = os.path.isfile(self.script_directory + "/healthy")
+        if last_state == States.Passive:
+            return States.StartingUp
+        if last_state == States.StartingUp:
+            # Finished starting up once the "healthy" file appears
+            return States.Active if healthy else States.StartingUp
+        if last_state == States.Active:
+            return States.Active if healthy else States.Failed
+        return States.Passive
+
+    def _next_state(self, last_state, last_address):
+        """Return ``(new_state, service_ip)`` for this request.
+
+        ``service_ip`` stays "" when the state is decided without a DNS
+        lookup (frozen, maintenance, already failed) or when the lookup fails.
+        """
+        service_ip = ""
+        if os.path.isfile(self.script_directory + "/frozen"):
+            log.error("Frozen file exists")
+            return States.Frozen, service_ip
+        if os.path.isfile(self.script_directory + "/maintenance"):
+            log.error("Maintenance file exists")
+            return States.Maintenance, service_ip
+        if last_state == States.Failed:
+            # Failed is sticky: only a trigger file takes the node out of it.
+            log.error("In a failed state")
+            return States.Failed, service_ip
+
+        try:
+            # Get the IP address from Route 53
+            service_ip = socket.gethostbyname(self.fqdn)
+            log.error("Service IP = %s, this IP = %s, last service IP = %s" % (service_ip, self.system_ip, last_address))
+            new_state = self._transition(last_state, service_ip, last_address)
+        except Exception as e:
+            log.error("Got exception trying to get IP address of '%s': %s" % (self.fqdn, str(e)))
+            new_state = States.Failed
+        return new_state, service_ip
+
+    @cherrypy.expose
+    def index(self):
+        """Work out the node's state, persist it and report it.
+
+        Sets the HTTP status to 200 (active), 202 (passive), 203 (starting
+        up), 500 (failed), 503 (maintenance) or the previously reported
+        status (frozen), and returns a short HTML description as the body.
+        """
+        # Load what was recorded on the previous request, falling back to
+        # "passive / 202 / no address" when nothing usable is on disk.
+        last_state = self._read_saved("/last_state", "last state", int)
+        if last_state is None or last_state < States.Passive or last_state > States.Frozen:
+            last_state = States.Passive
+        last_response = self._read_saved("/last_response", "last response", int)
+        if last_response is None:
+            last_response = 202
+        last_address = self._read_saved("/last_address", "last address")
+        if last_address is None:
+            last_address = ""
+
+        new_state, service_ip = self._next_state(last_state, last_address)
+
+        if new_state == States.Frozen:
+            new_response = last_response
+        elif new_state in self._RESPONSE_CODES:
+            new_response = self._RESPONSE_CODES[new_state]
+        else:
+            log.error("Unmatched state of %s" % str(new_state))
+            new_response = 202
+
+        # Clean up some of the trigger files
+        if new_state != States.Active:
+            self.silentremove("/healthy")
+        if new_state != States.Maintenance:
+            self.silentremove("/maintenance")
+        if new_state != States.Frozen:
+            self.silentremove("/frozen")
+
+        log.error("Returning new response of %s" % str(new_response))
+        cherrypy.response.status = new_response
+        cherrypy.response.headers['Content-type'] = 'text/html'
+        body = "<p>This is the health check service. State is %s and response code is %s</P>" % (str(new_state), str(new_response))
+
+        # Save away the various bits of information
+        self._save("/last_state", new_state, "reported state")
+        self._save("/last_response", new_response, "reported response")
+        self._save("/last_address", service_ip, "service IP")
+
+        # CherryPy takes the response body from the handler's return value.
+        return body
+
+def main():
+    """Start the health check web service.
+
+    Reads the service host name from the ``fqdn`` file next to this script,
+    determines this machine's own IP address, then serves HealthCheck on
+    0.0.0.0:1234.  Prints a message and returns without starting the server
+    if either piece of information cannot be obtained.
+    """
+    script_dir = os.path.dirname(os.path.realpath(__file__))
+    try:
+        with open(script_dir + "/fqdn") as fp:
+            service_fqdn = str(fp.readline()).strip()
+    except Exception as e:
+        print("Got exception trying to get fqdn: %s" % str(e))
+        return
+    if not service_fqdn:
+        # An empty name cannot be resolved to the active node, so refuse to start.
+        print("Got exception trying to get fqdn: %s/fqdn is empty" % script_dir)
+        return
+
+    # Try to get this system's IP address
+    try:
+        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+        try:
+            # Connecting a datagram socket lets the OS pick the local address
+            # it would use to reach the target; that address is read back.
+            s.connect(("8.8.8.8", 53))
+            my_ip = s.getsockname()[0]
+        finally:
+            s.close()
+    except Exception as e:
+        print("Got exception trying to get system's IP address: %s" % str(e))
+        return
+
+    # Set up the daemon
+    #cherrypy.process.plugins.Daemonizer(cherrypy.engine).subscribe()
+    cherrypy.config.update({
+        'server.socket_host': '0.0.0.0',
+        'server.socket_port': 1234,
+    })
+    cherrypy.quickstart(HealthCheck(script_dir, service_fqdn, my_ip), config={
+        '/': {
+            'log.access_file': os.path.join(script_dir, "access.log"),
+            'log.error_file': os.path.join(script_dir, "error.log"),
+            'log.screen': True,
+            'tools.sessions.on': True,
+        }
+    })
+
+# Only start the service when run as a script, not when imported.
+if __name__ == "__main__":
+    main()