# healthcheck.py

import string, cgi, time, socket, os, errno, logging, glob, subprocess
import cherrypy
from cherrypy import log, _cplogging
from cherrypy.process.plugins import Daemonizer, PIDFile
from logging import handlers

class States:
    """Integer codes for the states a node can report.

    The values are persisted to the ``last_state`` file as plain integers and
    are range-checked on start-up, so they must stay contiguous with
    ``Passive`` as the lowest value and ``Frozen`` as the highest.
    """

    Passive = 0
    Active = 1
    Failed = 2
    StartingUp = 3
    Maintenance = 4
    Frozen = 5

class HealthCheck(object):
    """CherryPy handler that reports this node's state as an HTTP status code.

    Every request to ``index`` re-evaluates the node's state from:

    * the address the service FQDN currently resolves to,
    * trigger files in the script directory (``healthy``, ``maintenance``,
      ``frozen`` and ``state-change-happening``), and
    * the state and response code reported on the previous request.

    On a supported transition it launches the single executable matching
    ``from_<old>_to_<new>.*`` in the script directory. A state of ``-1`` is
    used internally to mean "no state decided".
    """

    # Seconds to remain in StartingUp before the state change script is fired.
    STARTUP_HOLD_SECONDS = 30

    # Status code reported for each state. Frozen is deliberately absent: a
    # frozen node keeps reporting whatever it reported last.
    _RESPONSE_CODES = {
        States.Active: 200,
        States.Passive: 202,
        States.StartingUp: 203,
        States.Failed: 500,
        States.Maintenance: 503,
    }

    # Inverse of the table above, used to recover the state when the frozen
    # file disappears.
    _STATE_FOR_RESPONSE = {
        200: States.Active,
        202: States.Passive,
        203: States.StartingUp,
        500: States.Failed,
        503: States.Maintenance,
    }

    # old state -> new states for which a state change script may be run.
    _SCRIPTED_TRANSITIONS = {
        States.Passive: (States.StartingUp, States.Maintenance, States.Frozen),
        States.StartingUp: (States.Passive, States.Active, States.Failed,
                            States.Maintenance, States.Frozen),
        States.Active: (States.Failed, States.Maintenance, States.Frozen,
                        States.Passive),
        States.Failed: (States.Maintenance, States.Frozen),
        States.Maintenance: (States.Passive,),
        States.Frozen: (States.Passive,),
    }

    def __init__(self, script_dir, service_fqdn, my_ip, prev_state, prev_response, prev_addr):
        """Store the configuration and the values reported on the previous run.

        script_dir    -- directory holding the trigger files, the saved
                         ``last_*`` files and the state change scripts
        service_fqdn  -- host name resolved on every request; "" disables
                         the check and makes the node report Failed
        my_ip         -- this node's IP address; "" has the same effect
        prev_state    -- last reported state (a ``States`` value)
        prev_response -- last reported HTTP status code
        prev_addr     -- last address the FQDN resolved to
        """
        self.script_directory = script_dir
        self.fqdn = service_fqdn
        self.system_ip = my_ip
        self.last_state = prev_state
        self.last_response = prev_response
        self.last_address = prev_addr
        # Popen object of the state change script currently being tracked.
        self.state_process = None
        # time.time() at which StartingUp was entered, or -1 when not holding.
        self.startingup_time = -1
        # Most recent message passed to logmsg(); echoed in the response body.
        self.logged_message = ""

    def silentremove(self, filename):
        """Delete ``script_directory + filename``, ignoring a missing file.

        Any other OSError is re-raised.
        """
        try:
            os.remove(self.script_directory + filename)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise

    def logmsg(self, string):
        """Remember ``string`` as the latest message and write it to the CherryPy log."""
        self.logged_message = string
        # severity 20 is the numeric value of logging.INFO.
        log.error(string, context='HTTP', severity=20, traceback=True)

    def _poll_state_process(self):
        """Drain and reap the tracked state change script, if there is one.

        Returns ``States.Failed`` when the script has exited with a non-zero
        code, otherwise ``-1`` (no state forced).
        """
        if self.state_process is None:
            return -1

        # NOTE(review): readline() blocks until the script writes a line or
        # closes its output, so a request that arrives while the script is
        # running waits for it - confirm that is acceptable.
        while True:
            line = self.state_process.stdout.readline()
            # An empty read means end-of-file. Testing truthiness rather than
            # comparing with '' also terminates when the pipe yields bytes.
            if not line:
                break
            # Trim any whitespace off the end, including newlines
            self.logmsg("Script output: %s" % line.rstrip())

        forced_state = -1
        # poll() reaps the child so that no defunct process is left behind.
        if self.state_process.poll() is not None:
            returncode = self.state_process.returncode
            self.logmsg("State change script has finished with code %s" % str(returncode))
            if returncode != 0:
                forced_state = States.Failed
                self.logmsg("***** STATE CHANGE SCRIPT FAILED *****")
            self.state_process = None
        return forced_state

    def _thaw(self):
        """Leave Frozen: recover the state from the last response code.

        ``last_state`` is overwritten with the recovered state, so the caller
        sees no state change and no transition script is run.
        """
        thawed = self._STATE_FOR_RESPONSE.get(self.last_response)
        if thawed is None:
            self.logmsg("Coming out of frozen, old response code was %s" % str(self.last_response))
            thawed = States.Failed
        self.last_state = thawed
        return thawed

    def _two_node_state(self, service_ip):
        """Derive the state from who owns the service address.

        The logic ONLY works if there are two nodes and the addresses the
        FQDN resolves to map onto those nodes. Returns ``-1`` for any
        combination it does not handle.
        """
        new_state = -1

        # See if the external health checks think we're healthy?
        healthy = os.path.isfile(self.script_directory + "/healthy")

        if service_ip != self.last_address:
            # Active node has changed
            if service_ip == self.system_ip:
                if self.last_state == States.Passive:
                    # We've become the new active node - switch to starting up
                    self.logmsg("We're the active node and we were passive, now starting up")
                    new_state = States.StartingUp
                else:
                    self.logmsg("Now active node with uncaught state of %s" % str(self.last_state))
            elif self.last_state == States.Active:
                # We were the active node - see if we are still healthy,
                # in which case we switch to passive, or if we have failed.
                if healthy:
                    self.logmsg("Active node has changed and we are healthy; switching to Passive")
                    new_state = States.Passive
                else:
                    self.logmsg("Active node has changed and we aren't healthy; switching to Failed")
                    new_state = States.Failed
            elif self.last_state == States.Passive:
                # We're on the passive node, we were passive and we're still passive
                new_state = States.Passive
            elif self.last_state == States.StartingUp:
                # We were starting up the services but the IP address has
                # shifted so we need to stop the services - back to Passive
                self.logmsg("No longer the active node, switching from StartingUp back to Passive")
                new_state = States.Passive
            else:
                self.logmsg("IP address has changed with uncaught state of %s" % str(self.last_state))
        elif service_ip == self.system_ip:
            # We're the active node.
            if self.last_state == States.Maintenance:
                self.logmsg("Active node, last state was Maintenance, switching to Passive")
                new_state = States.Passive
            elif self.last_state == States.Passive:
                self.logmsg("Active node, last state was Passive, switching to StartingUp")
                new_state = States.StartingUp
            elif self.last_state == States.StartingUp:
                if healthy:
                    # Finished starting up
                    self.logmsg("Healthy active node, switching from StartingUp to Active")
                    new_state = States.Active
                else:
                    # Still starting
                    new_state = States.StartingUp
            elif self.last_state == States.Active:
                if healthy:
                    new_state = States.Active
                else:
                    self.logmsg("Active node but we aren't healthy; switching to Failed")
                    new_state = States.Failed
            else:
                self.logmsg("Active node with uncaught state of %s" % str(self.last_state))
        else:
            # We're the passive node
            if self.last_state != States.Passive:
                self.logmsg("Passive node = passive state")
            new_state = States.Passive

        return new_state

    def _decide_state(self, forced_state, service_ip):
        """Pick the new state; earlier checks take priority over later ones."""
        if forced_state != -1:
            # Already decided, e.g. because the state change script failed.
            return forced_state
        if os.path.isfile(self.script_directory + "/state-change-happening"):
            self.logmsg("State change script is still executing ...")
            return self.last_state
        if os.path.isfile(self.script_directory + "/frozen"):
            if self.last_state != States.Frozen:
                self.logmsg("Frozen file exists")
            return States.Frozen
        if os.path.isfile(self.script_directory + "/maintenance"):
            if self.last_state != States.Maintenance:
                self.logmsg("Maintenance file exists")
            return States.Maintenance
        if self.last_state == States.Failed:
            # Failed is only left through the frozen/maintenance files above.
            return States.Failed
        if self.last_state == States.Frozen:
            # We were frozen but now we aren't.
            return self._thaw()
        return self._two_node_state(service_ip)

    def _response_for(self, new_state):
        """Return the HTTP status code to report for ``new_state``."""
        if new_state == States.Frozen:
            return self.last_response
        code = self._RESPONSE_CODES.get(new_state)
        if code is None:
            self.logmsg("Unmatched state of %s" % str(new_state))
            code = 500
        return code

    def _save_value(self, filename, value, description):
        """Write ``str(value)`` to ``script_directory + filename``.

        Failures are logged rather than raised so that the health check
        still answers. ``description`` names the value in that log message.
        """
        try:
            with open(self.script_directory + filename, "w") as fp:
                fp.write(str(value))
        except Exception as e:
            self.logmsg("Got exception trying to save %s: %s" % (description, str(e)))

    def _apply_startup_hold(self, new_state):
        """Delay the Passive -> StartingUp script by STARTUP_HOLD_SECONDS.

        The delay works by rewriting ``last_state``: while holding it is set
        to StartingUp so no transition is seen, and once the hold has elapsed
        it is set back to Passive so the transition script fires. Since the
        script hasn't run, the node stays in StartingUp, because the switch
        to Active requires the ``healthy`` file.
        """
        if new_state != States.StartingUp:
            return

        if self.last_state != States.StartingUp:
            self.startingup_time = time.time()
            # Override last state in order to prevent the state change
            # script from running
            self.last_state = States.StartingUp
            self.logmsg("Holding start up for %s seconds" % str(self.STARTUP_HOLD_SECONDS))
        elif self.startingup_time != -1:
            duration = time.time() - self.startingup_time
            if duration > self.STARTUP_HOLD_SECONDS:
                # Pretend we were Passive to trigger the script, and reset
                # the timer so that this happens only once.
                self.last_state = States.Passive
                self.startingup_time = -1
            else:
                self.logmsg("Holding start up; duration is now %s" % str(duration))

    def _run_transition_script(self, new_state):
        """Start the script for ``last_state`` -> ``new_state`` if one exists.

        Nothing is run when no file matches; more than one match is logged
        and ignored, as is a transition that is not in the supported table.
        """
        allowed = self._SCRIPTED_TRANSITIONS.get(self.last_state, ())
        if new_state not in allowed:
            self.logmsg("Unexpected stage change from %s to %s" % (str(self.last_state), str(new_state)))
            return

        pattern = "%s/from_%s_to_%s.*" % (self.script_directory, str(self.last_state), str(new_state))
        files = glob.glob(pattern)
        if len(files) == 1:
            self.logmsg("Firing state change script %s" % files[0])
            # universal_newlines=True makes the pipe yield text lines, which
            # is what _poll_state_process() logs.
            self.state_process = subprocess.Popen(
                [files[0]], shell=False, stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT, universal_newlines=True)
        elif len(files) > 1:
            self.logmsg("More than one matching script for stage change %s to %s" % (str(self.last_state), str(new_state)))

    @cherrypy.expose
    def index(self):
        """Evaluate the node's state and report it.

        Sets the response status to the code for the new state, saves the
        state, response code and resolved address to the ``last_*`` files,
        fires a state change script when appropriate, and returns a small
        HTML body describing the result.
        """
        new_state = self._poll_state_process()

        # Bound up front so the bookkeeping below always has a value, even
        # when the lookup is skipped or fails.
        service_ip = "unknown"

        # We can't abort the daemon starting if we fail to get the right
        # info so we only proceed if the various sanity checks work
        if self.fqdn != "" and self.system_ip != "":
            try:
                service_ip = socket.gethostbyname(self.fqdn)
            except Exception as e:
                self.logmsg("Got exception trying to get IP address for '%s': %s" % (self.fqdn, str(e)))
                service_ip = "unknown"

            # Only log IP address info when a change happens
            if service_ip != self.last_address:
                self.logmsg("Service IP = %s, last service IP = %s, this IP = %s" % (service_ip, self.last_address, self.system_ip))

            new_state = self._decide_state(new_state, service_ip)
        else:
            # Sanity checks failed = failed :-)
            new_state = States.Failed

        new_response = self._response_for(new_state)

        # Clean up the trigger files that do not apply to the new state
        if new_state != States.Active and new_state != States.Frozen:
            self.silentremove("/healthy")
        if new_state != States.Maintenance:
            self.silentremove("/maintenance")
        if new_state != States.Frozen:
            self.silentremove("/frozen")

        cherrypy.response.status = new_response
        cherrypy.response.headers['Content-type'] = 'text/html'

        # Save away the various bits of information
        self._save_value("/last_state", new_state, "reported state")
        self._save_value("/last_response", new_response, "reported response")
        self._save_value("/last_address", service_ip, "service IP")

        self._apply_startup_hold(new_state)

        # See if a script exists for one of the supported state changes
        if new_state != self.last_state:
            self._run_transition_script(new_state)

        self.last_state = new_state
        self.last_response = new_response
        self.last_address = service_ip

        body1 = "<P>This is the Linaro health check service. State is %s and response code is %s</P>" % (str(new_state), str(new_response))
        if self.logged_message == "":
            body2 = ""
        else:
            body2 = "<P>Most recent log message: %s</P>" % self.logged_message

        return "%s%s" % (body1, body2)

def safereadline(logger, script_directory, filename):
    """Return the first line of ``script_directory + filename``.

    The line is returned as read, including any trailing newline. An empty
    string is returned when the file cannot be read; the failure is reported
    through ``logger.error`` unless the file simply does not exist.
    """
    try:
        with open(script_directory + filename) as source:
            return source.readline()
    except IOError as err:
        # A missing file just means nothing has been recorded yet; stay quiet.
        if err.errno != errno.ENOENT:
            logger.error("Got exception trying to read %s: %s" % (filename, str(err)))
    return ""

def _read_saved_int(logger, script_directory, filename, default):
    """Return the integer saved in a state file, or ``default``.

    ``default`` is used when the file is missing or empty, and also when its
    contents are not a valid integer (which is logged), so a corrupt state
    file cannot stop the daemon from starting.
    """
    line = safereadline(logger, script_directory, filename)
    if line == "":
        return default
    try:
        return int(line)
    except ValueError as e:
        logger.error("Ignoring invalid contents of %s: %s" % (filename, str(e)))
        return default


def main():
    """Load the saved state, configure CherryPy and run the health check daemon.

    Reads ``fqdn``, ``last_state``, ``last_response`` and ``last_address``
    from the directory containing this script, mounts ``HealthCheck`` at
    ``/``, sets up nightly-rotated error and access logs in that directory,
    daemonizes, and blocks until the CherryPy engine stops.
    """
    # Set up a syslog logger so that we can report stuff before
    # the daemon starts up
    syslog_logger = logging.getLogger("linaro_healthcheck")
    syslog_logger.setLevel(logging.DEBUG)
    handler = handlers.SysLogHandler(
        facility=handlers.SysLogHandler.LOG_DAEMON, address="/dev/log")
    syslog_logger.addHandler(handler)

    script_dir = os.path.dirname(os.path.abspath(__file__))
    syslog_logger.debug("Linaro Healthcheck running from %s" % script_dir)

    # A failure here is only logged: HealthCheck reports Failed when it is
    # given an empty FQDN or IP address.
    service_fqdn = ""
    try:
        with open(script_dir + "/fqdn") as fp:
            service_fqdn = str(fp.readline()).rstrip()
    except Exception as e:
        syslog_logger.error("Got exception trying to get fqdn: %s" % str(e))

    # Try to get this system's IP address: the local address of a UDP socket
    # connected to an external host is the address of the outbound interface.
    my_ip = ""
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            s.connect(("8.8.8.8", 53))
            my_ip = s.getsockname()[0]
        finally:
            s.close()
    except Exception as e:
        syslog_logger.error("Got exception trying to get system's IP address: %s" % str(e))

    # See if we've got a recorded last state
    prev_state = _read_saved_int(syslog_logger, script_dir, "/last_state", States.Passive)
    if (prev_state < States.Passive) or (prev_state > States.Frozen):
        prev_state = States.Passive

    # and see if we've got a recorded last response code
    prev_response = _read_saved_int(syslog_logger, script_dir, "/last_response", 202)

    # and a last IP address
    prev_address = safereadline(syslog_logger, script_dir, "/last_address").rstrip()
    if prev_address == "":
        # Placeholder that never equals a resolved address.
        prev_address = "error"

    # If we were anything other then failed, maintenance or frozen last time
    # this script ran, switch to Passive so that things get started up
    # properly, i.e. Passive -> StartingUp -> Active
    if prev_state not in (States.Failed, States.Maintenance, States.Frozen):
        prev_state = States.Passive
        prev_response = 202

    # NOTE(review): tools.staticdir is enabled globally with the script
    # directory as its root - confirm that serving those files is intended.
    cherrypy.config.update({'server.socket_host': '0.0.0.0',
                            'server.socket_port': 1234,
                            'server.thread_pool': 1,
                            'server.thread_pool_max': 1,
                            'tools.staticdir.on': True,
                            'tools.staticdir.dir': script_dir,
                            'log.screen': True,
                            'tools.sessions.on': True,
                           })

    config = {'/':
                 {
                 }
             }

    cherrypy.tree.mount(
        HealthCheck(script_dir, service_fqdn, my_ip, prev_state, prev_response, prev_address),
        "/", config)

    logscope = cherrypy.log

    # Make a new rotating file handler for the error log.
    fname = getattr(logscope, "rot_error_file", "%s/error.log" % script_dir)
    h = handlers.TimedRotatingFileHandler(fname, when='midnight', backupCount=7)
    h.setLevel(logging.DEBUG)
    h.setFormatter(_cplogging.logfmt)
    logscope.error_file = ""
    logscope.error_log.addHandler(h)

    # Make a new rotating file handler for the access log.
    fname = getattr(logscope, "rot_access_file", "%s/access.log" % script_dir)
    h = handlers.TimedRotatingFileHandler(fname, when='midnight', backupCount=7)
    h.setLevel(logging.DEBUG)
    h.setFormatter(_cplogging.logfmt)
    logscope.access_file = ""
    logscope.access_log.addHandler(h)

    # Add a CTRL+C handler
    if hasattr(cherrypy.engine, 'signal_handler'):
        cherrypy.engine.signal_handler.subscribe()
    if hasattr(cherrypy.engine, 'console_control_handler'):
        cherrypy.engine.console_control_handler.subscribe()

    # Set up the daemon
    Daemonizer(cherrypy.engine).subscribe()
    PIDFile(cherrypy.engine, '/var/run/linaro-healthcheck.pid').subscribe()

    cherrypy.engine.start()
    cherrypy.engine.block()

# Start the daemon only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()