summaryrefslogtreecommitdiff
path: root/healthcheck.py
blob: 574844c4269a572ac276602619bdcac597a33705 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
import string, cgi, time, socket, os, errno, logging, glob, subprocess
import cherrypy
from cherrypy import log, _cplogging
from cherrypy.process.plugins import Daemonizer, PIDFile
from logging import handlers

class States:
    """Integer constants for the health-check state machine.

    The numeric values are persisted to the "last_state" file between
    runs, so they must remain stable.
    """
    Passive = 0
    Active = 1
    Failed = 2
    StartingUp = 3
    Maintenance = 4
    Frozen = 5

class HealthCheck(object):
    """CherryPy handler implementing a two-node active/passive health check.

    The "active" node is the one whose IP address matches the DNS record
    for the service FQDN (resolved on every request).  State is tracked
    with the States machine and persisted to files in the script directory
    so a restart can resume where it left off.  Trigger files in the same
    directory ("healthy", "maintenance", "frozen") steer the transitions,
    and "from_<old>_to_<new>.*" scripts are fired on valid state changes.
    """

    # HTTP status returned for each state; Frozen is handled specially
    # (it repeats the previous response).
    _RESPONSE_CODES = {
        States.Active: 200,
        States.Passive: 202,
        States.StartingUp: 203,
        States.Failed: 500,
        States.Maintenance: 503,
    }

    # Allowed state transitions that may have an associated script.
    _VALID_TRANSITIONS = {
        States.Passive: (States.StartingUp, States.Maintenance, States.Frozen),
        States.StartingUp: (States.Active, States.Failed, States.Maintenance, States.Frozen),
        States.Active: (States.Failed, States.Maintenance, States.Frozen, States.Passive),
        States.Failed: (States.Maintenance, States.Frozen),
        States.Maintenance: (States.Passive,),
        States.Frozen: (States.Passive,),
    }

    def __init__(self, script_dir, service_fqdn, my_ip, prev_state, prev_response, prev_addr):
        # Directory holding trigger files, saved state and transition scripts.
        self.script_directory = script_dir
        # DNS name of the service; resolves to the active node's IP.
        self.fqdn = service_fqdn
        # This node's own IP address.
        self.system_ip = my_ip
        self.last_state = prev_state
        self.last_response = prev_response
        self.last_address = prev_addr
        # subprocess.Popen handle for an in-flight state change script, or None.
        self.state_process = None

    def silentremove(self, filename):
        """Remove script_directory+filename; a missing file is not an error.

        Any OSError other than ENOENT is re-raised.
        """
        try:
            os.remove(self.script_directory + filename)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise

    def logmsg(self, string):
        """Log a message through the CherryPy error log."""
        log.error(string, context='HTTP', severity=20, traceback=True)

    def _save(self, filename, value, description):
        """Persist str(value) to script_directory+filename, logging failures.

        Uses a context manager so the handle is always closed (the original
        code called fp.close without parentheses, never closing the file).
        """
        try:
            with open(self.script_directory + filename, "w") as fp:
                fp.write(str(value))
        except Exception as e:
            self.logmsg("Got exception trying to save %s: %s" % (description, str(e)))

    def _compute_state(self, service_ip):
        """Derive the next state from trigger files, the resolved service IP
        and the previous state.

        NOTE: this logic ONLY works if (a) there are two nodes, and (b) the
        IP addresses returned by Route 53 map onto those nodes.
        """
        # Frozen and maintenance trigger files override everything else.
        if os.path.isfile(self.script_directory + "/frozen"):
            if self.last_state != States.Frozen:
                self.logmsg("Frozen file exists")
            return States.Frozen
        if os.path.isfile(self.script_directory + "/maintenance"):
            if self.last_state != States.Maintenance:
                self.logmsg("Maintenance file exists")
            return States.Maintenance
        # Failure is sticky until maintenance/frozen intervenes.
        if self.last_state == States.Failed:
            return States.Failed

        # See if the external health checks think we're healthy.
        healthy = os.path.isfile(self.script_directory + "/healthy")
        new_state = -1

        if service_ip != self.last_address:
            # Active node has changed since the last check.
            if service_ip == self.system_ip:
                if self.last_state == States.Passive:
                    # We've become the new active node - switch to starting up
                    self.logmsg("We're the active node and we were passive, now starting up")
                    new_state = States.StartingUp
                else:
                    self.logmsg("Now active node with uncaught state of %s" % str(self.last_state))
            elif self.last_state == States.Active:
                # We were the active node - see if we are still healthy,
                # in which case we switch to passive, or if we have failed.
                if healthy:
                    self.logmsg("Active node has changed and we are healthy; switching to Passive")
                    new_state = States.Passive
                else:
                    self.logmsg("Active node has changed and we aren't healthy; switching to Failed")
                    new_state = States.Failed
            elif self.last_state == States.Passive:
                # We're on the passive node, we were passive and we're still passive
                new_state = States.Passive
            else:
                self.logmsg("IP address has changed with uncaught state of %s" % str(self.last_state))
        else:
            if service_ip == self.system_ip:
                # We're the active node.
                if self.last_state == States.Maintenance:
                    self.logmsg("Active node, last state was Maintenance, switching to Passive")
                    new_state = States.Passive
                elif self.last_state == States.Passive:
                    self.logmsg("Active node, last state was Passive, switching to StartingUp")
                    new_state = States.StartingUp
                elif self.last_state == States.StartingUp:
                    if healthy:
                        # Finished starting up
                        self.logmsg("Healthy active node, switching from StartingUp to Active")
                        new_state = States.Active
                    else:
                        # Still starting
                        new_state = States.StartingUp
                elif self.last_state == States.Active:
                    if healthy:
                        new_state = States.Active
                    else:
                        self.logmsg("Active node but we aren't healthy; switching to Failed")
                        new_state = States.Failed
                else:
                    self.logmsg("Active node with uncaught state of %s" % str(self.last_state))
            else:
                # We're the passive node
                if self.last_state != States.Passive:
                    self.logmsg("Passive node = passive state")
                new_state = States.Passive
        return new_state

    def _response_for(self, new_state):
        """Map a state to its HTTP status code.

        Frozen repeats the previous response (the original referenced an
        unbound name `last_response` here, raising NameError).
        """
        if new_state == States.Frozen:
            return self.last_response
        if new_state in self._RESPONSE_CODES:
            return self._RESPONSE_CODES[new_state]
        self.logmsg("Unmatched state of %s" % str(new_state))
        return 500

    def _fire_transition(self, new_state):
        """Launch the "from_<old>_to_<new>.*" script for a valid transition.

        The script runs asynchronously; the Popen handle is kept so index()
        can reap it on a later request.  Nothing is fired if the transition
        is unexpected, if no script matches, or if more than one matches.
        """
        if new_state not in self._VALID_TRANSITIONS.get(self.last_state, ()):
            self.logmsg("Unexpected state change from %s to %s" % (str(self.last_state), str(new_state)))
            return
        filename = "from_%s_to_%s.*" % (str(self.last_state), str(new_state))
        files = glob.glob("%s/%s" % (self.script_directory, filename))
        if len(files) == 1:
            self.logmsg("Firing state change script %s" % files[0])
            self.state_process = subprocess.Popen([files[0]])
        elif len(files) > 1:
            self.logmsg("More than one matching script for state change %s to %s" % (str(self.last_state), str(new_state)))

    @cherrypy.expose
    def index(self):
        """Handle a health check request.

        Resolves the service FQDN, runs the state machine, sets the HTTP
        response code accordingly, persists the results and fires any
        transition script.
        """
        # Default for the path where the sanity checks fail and no DNS
        # lookup happens (the original left service_ip unbound there,
        # raising NameError when it was saved below).
        service_ip = "unknown"
        new_state = -1

        # Reap a finished state change process so it doesn't linger as a
        # zombie.  poll() returns None while the child is STILL RUNNING
        # and its exit code once done - the original test was inverted,
        # dropping the handle while the script was still running.
        if self.state_process is not None:
            if self.state_process.poll() is not None:
                self.logmsg("State change script has finished")
                self.state_process = None

        # We can't abort the daemon starting if we fail to get the right
        # info so we only proceed if the various sanity checks work
        if self.fqdn != "" and self.system_ip != "":
            # Get the IP address from Route 53
            try:
                service_ip = socket.gethostbyname(self.fqdn)
            except Exception as e:
                # The original used a bare `fqdn` here (NameError).
                self.logmsg("Got exception trying to get IP address for '%s': %s" % (self.fqdn, str(e)))
                service_ip = "unknown"

            # Only log IP address info when a change happens
            if service_ip != self.last_address:
                self.logmsg("Service IP = %s, this IP = %s, last service IP = %s" % (service_ip, self.system_ip, self.last_address))

            new_state = self._compute_state(service_ip)
        else:
            # Sanity checks failed = failed :-)
            new_state = States.Failed

        new_response = self._response_for(new_state)

        # Clean up trigger files that no longer apply to the new state
        if new_state != States.Active:
            self.silentremove("/healthy")
        if new_state != States.Maintenance:
            self.silentremove("/maintenance")
        if new_state != States.Frozen:
            self.silentremove("/frozen")

        cherrypy.response.status = new_response
        cherrypy.response.headers['Content-type'] = 'text/html'
        cherrypy.response.body = ["<p>This is the Linaro health check service. State is %s and response code is %s</P>" % (str(new_state), str(new_response))]

        # Save away the various bits of information
        self._save("/last_state", new_state, "reported state")
        self._save("/last_response", new_response, "reported response")
        self._save("/last_address", service_ip, "service IP")

        # See if a script exists for one of the supported state changes
        if new_state != self.last_state:
            self._fire_transition(new_state)

        self.last_state = new_state
        self.last_response = new_response
        self.last_address = service_ip

def safereadline(logger, script_directory, filename):
    """Return the first line of script_directory+filename.

    A missing file silently yields ""; any other read error is logged
    through *logger* and "" is returned as well.
    """
    contents = ""
    full_path = script_directory + filename
    try:
        reader = open(full_path)
        try:
            contents = reader.readline()
        finally:
            reader.close()
    except IOError as err:
        # ENOENT is the normal "no saved state yet" case; stay quiet.
        if err.errno != errno.ENOENT:
            logger.error("Got exception trying to read %s: %s" % (filename, str(err)))
    return contents

def main():
    """Read saved state, configure CherryPy and run the health-check daemon.

    Listens on 0.0.0.0:1234, daemonizes, and writes a PID file to
    /var/run/linaro-healthcheck.pid.
    """
    # Set up a syslog logger so that we can report stuff before
    # the daemon starts up
    syslog_logger = logging.getLogger("linaro_healthcheck")
    syslog_logger.setLevel(logging.DEBUG)
    handler = logging.handlers.SysLogHandler(
        facility=logging.handlers.SysLogHandler.LOG_DAEMON, address="/dev/log")
    syslog_logger.addHandler(handler)

    script_dir = os.path.dirname(os.path.abspath(__file__))
    syslog_logger.debug("Linaro Healthcheck running from %s" % script_dir)

    # The service FQDN is read from a "fqdn" file next to this script.
    service_fqdn = ""
    try:
        with open(script_dir + "/fqdn") as fp:
            service_fqdn = str(fp.readline()).rstrip()
    except Exception as e:
        syslog_logger.error("Got exception trying to get fqdn: %s" % str(e))

    # Try to get this system's IP address by connecting a UDP socket
    # towards a public resolver (no packets are actually sent) and
    # reading the local end of the socket.
    my_ip = ""
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            s.connect(("8.8.8.8", 53))
            my_ip = s.getsockname()[0]
        finally:
            # The original called s.close without parentheses, leaking the socket.
            s.close()
    except Exception as e:
        syslog_logger.error("Got exception trying to get system's IP address: %s" % str(e))

    # See if we've got a recorded last state
    prev_state = States.Passive
    line = safereadline(syslog_logger, script_dir, "/last_state")
    if line != "":
        try:
            prev_state = int(line)
        except ValueError:
            # Corrupt state file - fall back to the safe default.
            syslog_logger.error("Ignoring corrupt last_state '%s'" % line.rstrip())
    if (prev_state < States.Passive) or (prev_state > States.Frozen):
        prev_state = States.Passive

    # and see if we've got a recorded last response code
    prev_response = 202
    line = safereadline(syslog_logger, script_dir, "/last_response")
    if line != "":
        try:
            prev_response = int(line)
        except ValueError:
            syslog_logger.error("Ignoring corrupt last_response '%s'" % line.rstrip())

    # and a last IP address
    prev_address = safereadline(syslog_logger, script_dir, "/last_address")
    if prev_address == "":
        prev_address = "error"

    # If we were anything other then failed, maintenance or frozen last time
    # this script ran, switch to Passive so that things get started up
    # properly, i.e. Passive -> StartingUp -> Active
    if prev_state != States.Failed and prev_state != States.Maintenance and prev_state != States.Frozen:
        prev_state = States.Passive
        prev_response = 202

    cherrypy.config.update({'server.socket_host': '0.0.0.0',
                            'server.socket_port': 1234,
                            'server.thread_pool': 1,
                            'server.thread_pool_max': 1,
                            'tools.staticdir.on': True,
                            'tools.staticdir.dir': script_dir,
                            'log.screen': True,
                            'tools.sessions.on': True,
                           })

    config = {'/':
                 {
                 }
             }

    application = cherrypy.tree.mount(HealthCheck(script_dir, service_fqdn, my_ip, prev_state, prev_response, prev_address), "/", config)

    logscope = cherrypy.log

    # Make a new TimedRotatingFileHandler for the error log, replacing
    # CherryPy's default file logging.
    fname = getattr(logscope, "rot_error_file", "%s/error.log" % script_dir)
    h = handlers.TimedRotatingFileHandler(fname, when='midnight')
    h.setLevel(logging.DEBUG)
    h.setFormatter(_cplogging.logfmt)
    logscope.error_file = ""
    logscope.error_log.addHandler(h)

    # Make a new TimedRotatingFileHandler for the access log.
    fname = getattr(logscope, "rot_access_file", "%s/access.log" % script_dir)
    h = handlers.TimedRotatingFileHandler(fname, when='midnight')
    h.setLevel(logging.DEBUG)
    h.setFormatter(_cplogging.logfmt)
    logscope.access_file = ""
    logscope.access_log.addHandler(h)

    # Add a CTRL+C handler
    if hasattr(cherrypy.engine, 'signal_handler'):
        cherrypy.engine.signal_handler.subscribe()
    if hasattr(cherrypy.engine, 'console_control_handler'):
        # The original called .subscrive() - an AttributeError whenever
        # this handler exists (i.e. on Windows consoles).
        cherrypy.engine.console_control_handler.subscribe()

    # Set up the daemon
    d = Daemonizer(cherrypy.engine)
    d.subscribe()
    PIDFile(cherrypy.engine, '/var/run/linaro-healthcheck.pid').subscribe()

    cherrypy.engine.start()
    cherrypy.engine.block()

# Start the health-check service when this file is run as a script.
if __name__ == '__main__':
    main()