summaryrefslogtreecommitdiff
path: root/healthcheck.py
blob: 0892db185f9943ba05ae6e20f4c981ef0174f13a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
import string, cgi, time, socket, os, errno, logging, glob
import cherrypy
from cherrypy import log, _cplogging
from logging import handlers

class States:
    """Integer codes for the node states handled by the health check.

    The numeric values are written to the ``last_state`` file and are used to
    build the ``from_<old>_to_<new>.*`` script names, so they must not be
    renumbered.
    """

    Passive = 0
    Active = 1
    Failed = 2
    StartingUp = 3
    Maintenance = 4
    Frozen = 5

class HealthCheck(object):
    """CherryPy application object that reports this node's fail-over state.

    Every request to ``index`` re-evaluates the node's state from three
    inputs: the address the service FQDN currently resolves to, the state
    remembered from the previous request, and the trigger files ``frozen``,
    ``maintenance`` and ``healthy`` in the script directory.  The resulting
    state is reported through the HTTP status code, saved to disk, and a
    ``from_<old>_to_<new>.*`` script is run when a recognised state change
    occurs.
    """

    def __init__(self, script_dir, service_fqdn, my_ip, prev_state, prev_response, prev_addr):
        """Remember the configuration and the values saved by a previous run.

        script_dir    -- directory holding the trigger, state and script files
        service_fqdn  -- host name whose resolved address marks the active node
        my_ip         -- this system's own IP address
        prev_state    -- last reported state (one of the ``States`` values)
        prev_response -- last reported HTTP status code
        prev_addr     -- address the service name resolved to last time
        """
        self.script_directory = script_dir
        self.fqdn = service_fqdn
        self.system_ip = my_ip
        self.last_state = prev_state
        self.last_response = prev_response
        self.last_address = prev_addr

    def silentremove(self, filename):
        """Delete ``filename`` (given with a leading '/') from the script directory.

        A file that does not exist is ignored; any other OSError is re-raised.
        """
        try:
            os.remove(self.script_directory + filename)
        except OSError as e:
            if (e.errno != errno.ENOENT):
                raise

    def logmsg(self, string):
        """Write ``string`` to the CherryPy error log (context 'HTTP', severity 20)."""
        log.error(string, context='HTTP', severity=20, traceback=True)

    def _decide_state(self, service_ip):
        """Return the new state when no trigger file or failed state overrides it.

        Returns -1 when the previous state has no rule for the current
        situation (for example a previous state of Frozen on the active node).
        NOTE(review): whether those gaps are intentional is not visible here.
        """
        # The following logic ONLY works if:
        # a) there are two nodes
        # b) the IP addresses returned by Route 53 map onto those nodes
        new_state = -1
        if service_ip != self.last_address:
            # Active node has changed
            if self.last_state == States.Passive:
                # We've become the new active node - switch to starting up
                new_state = States.StartingUp
            elif self.last_state == States.Active:
                # We were the active node - we must have failed
                new_state = States.Failed
        elif service_ip == self.system_ip:
            # We're the active node.
            # See if the external health checks think we're healthy?
            healthy = os.path.isfile(self.script_directory + "/healthy")
            if self.last_state == States.Maintenance:
                new_state = States.Passive
            elif self.last_state == States.Passive:
                new_state = States.StartingUp
            elif self.last_state == States.StartingUp:
                # Stay in StartingUp until the healthy file appears
                new_state = States.Active if healthy else States.StartingUp
            elif self.last_state == States.Active:
                new_state = States.Active if healthy else States.Failed
        else:
            # We're the passive node
            new_state = States.Passive
        return new_state

    def _response_for(self, new_state):
        """Return the HTTP status code to report for ``new_state``.

        Frozen repeats the previously reported code.  An unrecognised state is
        logged and reported as 202.
        """
        if new_state == States.Frozen:
            return self.last_response
        codes = {
            States.Active: 200,
            States.Passive: 202,
            States.StartingUp: 203,
            States.Failed: 500,
            States.Maintenance: 503,
        }
        if new_state not in codes:
            log.error("Unmatched state of %s" % str(new_state))
            return 202
        return codes[new_state]

    def _save(self, filename, value, description):
        """Write ``str(value)`` to ``filename`` in the script directory.

        This is best effort: a failure is logged, using ``description`` in the
        message, and otherwise ignored.
        """
        try:
            # The with-block closes the file even if the write fails.
            with open(self.script_directory + filename, "w") as fp:
                fp.write(str(value))
        except EnvironmentError as e:
            self.logmsg("Got exception trying to save %s: %s" % (description, str(e)))

    def _run_transition_script(self, new_state):
        """Run the script for the change from ``self.last_state`` to ``new_state``.

        Only the transitions listed below are supported.  The script is run
        only when exactly one file matches ``from_<old>_to_<new>.*``; every
        other outcome is logged.
        """
        allowed = {
            States.Passive: (States.StartingUp, States.Maintenance, States.Frozen),
            States.StartingUp: (States.Active, States.Failed, States.Maintenance, States.Frozen),
            States.Active: (States.Failed, States.Maintenance, States.Frozen),
            States.Failed: (States.Maintenance, States.Frozen),
            States.Maintenance: (States.StartingUp, States.Passive),
            States.Frozen: (States.StartingUp, States.Passive),
        }
        if new_state not in allowed.get(self.last_state, ()):
            self.logmsg("Unexpected stage change from %s to %s" % (str(self.last_state), str(new_state)))
            return

        filename = "from_%s_to_%s.*" % (str(self.last_state), str(new_state))
        files = glob.glob("%s/%s" % (self.script_directory, filename))
        if len(files) == 1:
            os.system(files[0])
        elif not files:
            self.logmsg("no matching script for stage change %s to %s" % (str(self.last_state), str(new_state)))
        else:
            self.logmsg("more than one matching script for stage change %s to %s" % (str(self.last_state), str(new_state)))

    @cherrypy.expose
    def index(self):
        """Evaluate the node state and report it as the HTTP response.

        Sets the response status, headers and body on ``cherrypy.response``,
        removes trigger files that no longer apply, saves the new state,
        response code and service address to the script directory, and runs
        the transition script when the state has changed.
        """
        # Get the IP address from Route 53
        try:
            service_ip = socket.gethostbyname(self.fqdn)
        except Exception as e:
            self.logmsg("Got exception trying to get IP address for '%s': %s" % (self.fqdn, str(e)))
            service_ip = "unknown"

        # Only log IP address info when a change happens
        if service_ip != self.last_address:
            self.logmsg("Service IP = %s, this IP = %s, last service IP = %s" % (service_ip, self.system_ip, self.last_address))

        # Trigger files and a previous failure take precedence over the
        # address-based rules.
        if os.path.isfile(self.script_directory + "/frozen"):
            self.logmsg("Frozen file exists")
            new_state = States.Frozen
        elif os.path.isfile(self.script_directory + "/maintenance"):
            self.logmsg("Maintenance file exists")
            new_state = States.Maintenance
        elif self.last_state == States.Failed:
            self.logmsg("In a failed state")
            new_state = States.Failed
        else:
            try:
                new_state = self._decide_state(service_ip)
            except Exception as e:
                log.error("Got exception evaluating state for '%s': %s" % (self.fqdn, str(e)))
                new_state = States.Failed

        new_response = self._response_for(new_state)

        # Clean up some of the trigger files
        if new_state != States.Active:
            self.silentremove("/healthy")
        if new_state != States.Maintenance:
            self.silentremove("/maintenance")
        if new_state != States.Frozen:
            self.silentremove("/frozen")

        cherrypy.response.status = new_response
        cherrypy.response.headers['Content-type'] = 'text/html'
        cherrypy.response.body = ["<p>This is the Linaro health check service. State is %s and response code is %s</P>" % (str(new_state), str(new_response))]

        # Save away the various bits of information
        self._save("/last_state", new_state, "reported state")
        self._save("/last_response", new_response, "reported response")
        self._save("/last_address", service_ip, "service IP")

        # See if a script exists for one of the supported state changes
        if new_state != self.last_state:
            self._run_transition_script(new_state)

        self.last_state = new_state
        self.last_response = new_response
        self.last_address = service_ip

def safereadline(script_directory, filename):
    """Return the first line of ``script_directory + filename``, or "" on failure.

    The line is returned as read, including any trailing newline.  A missing
    file is silently treated as empty; any other IOError is printed and also
    yields "".
    """
    try:
        with open(script_directory + filename) as handle:
            return handle.readline()
    except IOError as err:
        if err.errno != errno.ENOENT:
            print("Got exception trying to read %s: %s" % (filename, str(err)))
    return ""

def main():
    """Load the saved state, configure CherryPy and serve the health check.

    Reads the service name from the ``fqdn`` file beside this script, works
    out this system's IP address, restores the last state, response code and
    service address saved by a previous run, then mounts ``HealthCheck`` at
    "/" and blocks until the CherryPy engine stops.  Returns early, after
    printing a message, if the FQDN or the local address cannot be obtained.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    try:
        with open(script_dir + "/fqdn") as fp:
            service_fqdn = str(fp.readline()).rstrip()
    except Exception as e:
        print("Got exception trying to get fqdn: %s" % str(e))
        return

    # Try to get this system's IP address: connect a datagram socket towards
    # an external address and read back the local address it was bound to.
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            s.connect(("8.8.8.8", 53))
            my_ip = s.getsockname()[0]
        finally:
            s.close()
    except Exception as e:
        print("Got exception trying to get system's IP address: %s" % str(e))
        return

    # See if we've got a recorded last state
    prev_state = States.Passive
    line = safereadline(script_dir, "/last_state")
    if line != "":
        try:
            prev_state = int(line)
        except ValueError:
            # A corrupt state file must not stop the service from starting.
            print("Ignoring unreadable last state %r" % line)
    if (prev_state < States.Passive) or (prev_state > States.Frozen):
        prev_state = States.Passive

    # and see if we've got a recorded last response code
    prev_response = 202
    line = safereadline(script_dir, "/last_response")
    if line != "":
        try:
            prev_response = int(line)
        except ValueError:
            print("Ignoring unreadable last response %r" % line)

    # and a last IP address; strip it so a trailing newline in the file does
    # not make the first request look like an address change
    prev_address = safereadline(script_dir, "/last_address").strip()
    if prev_address == "":
        print("Failed to get last IP address")
        prev_address = "error"

    # Set up the daemon
    #cherrypy.process.plugins.Daemonizer(cherrypy.engine).subscribe()
    # NOTE(review): the staticdir tool is enabled globally with script_dir as
    # its root, which looks like it would serve the files in that directory
    # over HTTP - confirm this is intended.
    cherrypy.config.update({'server.socket_host': '0.0.0.0',
                            'server.socket_port': 1234,
                            'server.thread_pool': 1,
                            'server.thread_pool_max': 1,
                            'tools.staticdir.on': True,
                            'tools.staticdir.dir': script_dir,
                            'log.screen': True,
                            'tools.sessions.on': True,
                           })

    config = {'/':
                 {
                 }
             }

    cherrypy.tree.mount(HealthCheck(script_dir, service_fqdn, my_ip, prev_state, prev_response, prev_address), "/", config)

    logscope = cherrypy.log

    # Make a new RotatingFileHandler for the error log.
    fname = getattr(logscope, "rot_error_file", "error.log")
    h = handlers.TimedRotatingFileHandler(fname, when='midnight')
    h.setLevel(logging.DEBUG)
    h.setFormatter(_cplogging.logfmt)
    logscope.error_file = ""
    logscope.error_log.addHandler(h)

    # Make a new RotatingFileHandler for the access log.
    fname = getattr(logscope, "rot_access_file", "access.log")
    h = handlers.TimedRotatingFileHandler(fname, when='midnight')
    h.setLevel(logging.DEBUG)
    h.setFormatter(_cplogging.logfmt)
    logscope.access_file = ""
    logscope.access_log.addHandler(h)

    # Add a CTRL+C handler
    if hasattr(cherrypy.engine, 'signal_handler'):
        cherrypy.engine.signal_handler.subscribe()
    if hasattr(cherrypy.engine, 'console_control_handler'):
        cherrypy.engine.console_control_handler.subscribe()

    cherrypy.engine.start()
    cherrypy.engine.block()

# Start the service only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()