1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
|
import string, cgi, time, socket, os, errno, logging, glob, subprocess
import cherrypy
from cherrypy import log, _cplogging
from cherrypy.process.plugins import Daemonizer, PIDFile
from logging import handlers
class States:
    """Enumeration of health-check states.

    Values are plain integers (0..5) because they are persisted to the
    ``last_state`` file and compared against recorded values on restart.
    """
    Passive = 0
    Active = 1
    Failed = 2
    StartingUp = 3
    Maintenance = 4
    Frozen = 5
class HealthCheck(object):
    """CherryPy handler implementing a two-node active/passive health check.

    Each request to ``index`` resolves the service FQDN, compares the result
    against this system's IP address and the previously seen address, derives
    a new state from the previous state plus trigger files in the script
    directory, persists the outcome, and fires a matching state-change script
    when one exists.
    """

    def __init__(self, script_dir, service_fqdn, my_ip, prev_state, prev_response, prev_addr):
        # Directory holding trigger files, persisted state and change scripts.
        self.script_directory = script_dir
        # DNS name whose A record identifies the currently active node.
        self.fqdn = service_fqdn
        # This system's own IP address.
        self.system_ip = my_ip
        # State, HTTP response code and service address from the previous check.
        self.last_state = prev_state
        self.last_response = prev_response
        self.last_address = prev_addr
        # Popen handle for an in-flight state-change script, if any.
        self.state_process = None
        # Time at which we entered StartingUp; -1 means "not holding".
        self.startingup_time = -1
        # Most recent log message, echoed back in the response body.
        self.logged_message = ""

    def silentremove(self, filename):
        """Remove script_directory+filename, ignoring 'file not found'."""
        try:
            os.remove(self.script_directory + filename)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise

    def logmsg(self, string):
        """Record *string* as the latest message and write it to the CherryPy log.

        NOTE: the parameter name shadows the module-level ``string`` import;
        kept as-is to preserve the method's signature for keyword callers.
        """
        self.logged_message = string
        log.error(string, context='HTTP', severity=20, traceback=True)

    @cherrypy.expose
    def index(self):
        """Run one health-check cycle and return an HTML status body.

        Side effects: sets ``cherrypy.response.status``, rewrites the
        ``last_state`` / ``last_response`` / ``last_address`` files, removes
        stale trigger files and may launch a state-change script.
        """
        new_state = -1
        # Bug fix: if the sanity checks below fail, service_ip was previously
        # left unbound and the assignment near the end raised NameError.
        service_ip = "unknown"
        # If we had a state change process, poll it so that the process quits properly
        # when it has finished, rather than leaving a defunct thing lying around.
        if self.state_process is not None:
            # Get any output from the script.
            # NOTE(review): this reads until EOF, so it blocks until the script
            # closes its stdout - confirm the change scripts are short-lived.
            for line in iter(self.state_process.stdout.readline, ''):
                # Trim any whitespace off the end, including newlines
                self.logmsg("Script output: %s" % line.rstrip())
            if self.state_process.poll() is not None:
                self.logmsg("State change script has finished with code %s" % str(self.state_process.returncode))
                if self.state_process.returncode is not None:
                    if self.state_process.returncode != 0:
                        new_state = States.Failed
                        self.logmsg("***** STATE CHANGE SCRIPT FAILED *****")
                self.state_process = None
        # We can't abort the daemon starting if we fail to get the right
        # info so we only proceed if the various sanity checks work
        if self.fqdn != "" and self.system_ip != "":
            # Get the IP address from Route 53
            try:
                service_ip = socket.gethostbyname(self.fqdn)
            except Exception as e:
                # Bug fix: previously referenced the bare name ``fqdn``, which
                # raised NameError instead of logging the lookup failure.
                self.logmsg("Got exception trying to get IP address for '%s': %s" % (self.fqdn, str(e)))
                service_ip = "unknown"
            # Only log IP address info when a change happens
            if service_ip != self.last_address:
                self.logmsg("Service IP = %s, last service IP = %s, this IP = %s" % (service_ip, self.last_address, self.system_ip))
            if new_state != -1:
                # The state was already forced above because a state change
                # script failed - leave it alone.
                pass
            elif os.path.isfile(self.script_directory + "/state-change-happening"):
                self.logmsg("State change script is still executing ...")
                new_state = self.last_state
            elif os.path.isfile(self.script_directory + "/frozen"):
                if self.last_state != States.Frozen:
                    self.logmsg("Frozen file exists")
                new_state = States.Frozen
            elif os.path.isfile(self.script_directory + "/maintenance"):
                if self.last_state != States.Maintenance:
                    self.logmsg("Maintenance file exists")
                new_state = States.Maintenance
            elif self.last_state == States.Failed:
                new_state = States.Failed
            elif self.last_state == States.Frozen:
                # We were frozen but now we aren't - figure out what our state
                # should be from the response code we were issuing
                if self.last_response == 200:
                    self.last_state = new_state = States.Active
                elif self.last_response == 202:
                    self.last_state = new_state = States.Passive
                elif self.last_response == 203:
                    self.last_state = new_state = States.StartingUp
                elif self.last_response == 500:
                    self.last_state = new_state = States.Failed
                elif self.last_response == 503:
                    self.last_state = new_state = States.Maintenance
                else:
                    self.logmsg("Coming out of frozen, old response code was %s" % str(self.last_response))
                    self.last_state = new_state = States.Failed
            else:
                # The following logic ONLY works if:
                # a) there are two nodes
                # b) the IP addresses returned by Route 53 map onto those nodes
                # See if the external health checks think we're healthy?
                healthy = os.path.isfile(self.script_directory + "/healthy")
                if service_ip != self.last_address:
                    # Active node has changed
                    if service_ip == self.system_ip:
                        if self.last_state == States.Passive:
                            # We've become the new active node - switch to starting up
                            self.logmsg("We're the active node and we were passive, now starting up")
                            new_state = States.StartingUp
                        else:
                            self.logmsg("Now active node with uncaught state of %s" % str(self.last_state))
                    elif self.last_state == States.Active:
                        # We were the active node - see if we are still healthy,
                        # in which case we switch to passive, or if we have failed.
                        if healthy:
                            self.logmsg("Active node has changed and we are healthy; switching to Passive")
                            new_state = States.Passive
                        else:
                            self.logmsg("Active node has changed and we aren't healthy; switching to Failed")
                            new_state = States.Failed
                    elif self.last_state == States.Passive:
                        # We're on the passive node, we were passive and we're still passive
                        new_state = States.Passive
                    elif self.last_state == States.StartingUp:
                        # We were starting up the services but the IP address has shifted so we need to
                        # stop the services - switch back to Passive
                        self.logmsg("No longer the active node, switching from StartingUp back to Passive")
                        new_state = States.Passive
                    else:
                        self.logmsg("IP address has changed with uncaught state of %s" % str(self.last_state))
                else:
                    if service_ip == self.system_ip:
                        # We're the active node.
                        if self.last_state == States.Maintenance:
                            self.logmsg("Active node, last state was Maintenance, switching to Passive")
                            new_state = States.Passive
                        elif self.last_state == States.Passive:
                            self.logmsg("Active node, last state was Passive, switching to StartingUp")
                            new_state = States.StartingUp
                        elif self.last_state == States.StartingUp:
                            if healthy:
                                # Finished starting up
                                self.logmsg("Healthy active node, switching from StartingUp to Active")
                                new_state = States.Active
                            else:
                                # Still starting
                                new_state = States.StartingUp
                        elif self.last_state == States.Active:
                            if healthy:
                                new_state = States.Active
                            else:
                                self.logmsg("Active node but we aren't healthy; switching to Failed")
                                new_state = States.Failed
                        else:
                            self.logmsg("Active node with uncaught state of %s" % str(self.last_state))
                    else:
                        # We're the passive node
                        if self.last_state != States.Passive:
                            self.logmsg("Passive node = passive state")
                        new_state = States.Passive
        else:
            # Sanity checks failed = failed :-)
            new_state = States.Failed
        # Map the state onto the HTTP response code we report to the checker.
        if new_state == States.Active:
            new_response = 200
        elif new_state == States.Passive:
            new_response = 202
        elif new_state == States.StartingUp:
            new_response = 203
        elif new_state == States.Failed:
            new_response = 500
        elif new_state == States.Maintenance:
            new_response = 503
        elif new_state == States.Frozen:
            # Frozen keeps reporting whatever we reported before.
            new_response = self.last_response
        else:
            self.logmsg("Unmatched state of %s" % str(new_state))
            new_response = 500
        # Clean up some of the trigger files
        if new_state != States.Active and new_state != States.Frozen:
            self.silentremove("/healthy")
        if new_state != States.Maintenance:
            self.silentremove("/maintenance")
        if new_state != States.Frozen:
            self.silentremove("/frozen")
        cherrypy.response.status = new_response
        cherrypy.response.headers['Content-type'] = 'text/html'
        # Save away the various bits of information. Bug fix: the original
        # wrote ``fp.close`` without calling it, leaking a file handle per
        # request; ``with`` guarantees the files are closed.
        try:
            with open(self.script_directory + "/last_state", "w") as fp:
                fp.write(str(new_state))
        except Exception as e:
            self.logmsg("Got exception trying to save reported state: %s" % str(e))
        try:
            with open(self.script_directory + "/last_response", "w") as fp:
                fp.write(str(new_response))
        except Exception as e:
            self.logmsg("Got exception trying to save reported response: %s" % str(e))
        try:
            with open(self.script_directory + "/last_address", "w") as fp:
                fp.write(str(service_ip))
        except Exception as e:
            self.logmsg("Got exception trying to save service IP: %s" % str(e))
        # Set the starting up time - we won't run the change state script until
        # the elapsed time is more than 30. Since the script hasn't run, we'll stay
        # in StartingUp because we can't switch to Active until monit (or whatever)
        # detects the service as actually running.
        if new_state == States.StartingUp:
            if self.last_state != States.StartingUp:
                self.startingup_time = time.time()
                # Override last state in order to prevent the state change script from
                # running
                self.last_state = States.StartingUp
                self.logmsg("Holding start up for 30 seconds")
            elif self.startingup_time != -1:
                duration = time.time() - self.startingup_time
                # If we've reached the desired duration, switch last state to Passive
                # to trigger the state change script and reset the starting up time
                # so that we don't keep on switching to the passive state.
                if duration > 30:
                    self.last_state = States.Passive
                    self.startingup_time = -1
                else:
                    self.logmsg("Holding start up; duration is now %s" % str(duration))
        # See if a script exists for one of the supported state changes
        if new_state != self.last_state:
            valid_state = False
            if self.last_state == States.Passive:
                if new_state == States.StartingUp or new_state == States.Maintenance or new_state == States.Frozen:
                    valid_state = True
            elif self.last_state == States.StartingUp:
                if new_state == States.Passive or new_state == States.Active or new_state == States.Failed or new_state == States.Maintenance or new_state == States.Frozen:
                    valid_state = True
            elif self.last_state == States.Active:
                if new_state == States.Failed or new_state == States.Maintenance or new_state == States.Frozen or new_state == States.Passive:
                    valid_state = True
            elif self.last_state == States.Failed:
                if new_state == States.Maintenance or new_state == States.Frozen:
                    valid_state = True
            elif self.last_state == States.Maintenance or self.last_state == States.Frozen:
                if new_state == States.Passive:
                    valid_state = True
            if valid_state:
                filename = "from_%s_to_%s.*" % (str(self.last_state), str(new_state))
                files = glob.glob("%s/%s" % (self.script_directory, filename))
                if len(files) == 1:
                    self.logmsg("Firing state change script %s" % files[0])
                    self.state_process = subprocess.Popen([files[0]], shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                elif len(files) > 1:
                    self.logmsg("More than one matching script for stage change %s to %s" % (str(self.last_state), str(new_state)))
            else:
                self.logmsg("Unexpected stage change from %s to %s" % (str(self.last_state), str(new_state)))
        self.last_state = new_state
        self.last_response = new_response
        self.last_address = service_ip
        body1 = "<P>This is the Linaro health check service. State is %s and response code is %s</P>" % (str(new_state), str(new_response))
        if self.logged_message == "":
            body2 = ""
        else:
            body2 = "<P>Most recent log message: %s</P>" % self.logged_message
        return "%s%s" % (body1, body2)
def safereadline(logger, script_directory, filename):
    """Return the first line of ``script_directory + filename``.

    A missing file yields "" silently; any other IOError is logged via
    *logger* and also yields "".
    """
    try:
        with open(script_directory + filename) as fp:
            return fp.readline()
    except IOError as e:
        if e.errno != errno.ENOENT:
            logger.error("Got exception trying to read %s: %s" % (filename, str(e)))
        return ""
def main():
    """Configure logging, load persisted state and start the CherryPy daemon."""
    # Set up a syslog logger so that we can report stuff before
    # the daemon starts up
    syslog_logger = logging.getLogger("linaro_healthcheck")
    syslog_logger.setLevel(logging.DEBUG)
    handler = logging.handlers.SysLogHandler(
        facility=logging.handlers.SysLogHandler.LOG_DAEMON, address="/dev/log")
    syslog_logger.addHandler(handler)
    script_dir = os.path.dirname(os.path.abspath(__file__))
    syslog_logger.debug("Linaro Healthcheck running from %s" % script_dir)
    # The "fqdn" file names the DNS record that identifies the active node.
    service_fqdn = ""
    try:
        with open(script_dir + "/fqdn") as fp:
            service_fqdn = str(fp.readline()).rstrip()
    except Exception as e:
        syslog_logger.error("Got exception trying to get fqdn: %s" % str(e))
    # Try to get this system's IP address
    my_ip = ""
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(("8.8.8.8", 53))
        my_ip = s.getsockname()[0]
        # Bug fix: was ``s.close`` without the call, leaking the socket.
        s.close()
    except Exception as e:
        syslog_logger.error("Got exception trying to get system's IP address: %s" % str(e))
    # See if we've got a recorded last state
    prev_state = States.Passive
    line = safereadline(syslog_logger, script_dir, "/last_state")
    if line != "":
        # Robustness: a corrupt file previously crashed startup with
        # ValueError; fall back to Passive instead.
        try:
            prev_state = int(line)
        except ValueError:
            syslog_logger.error("Got invalid last state %r; defaulting to Passive" % line)
            prev_state = States.Passive
        if (prev_state < States.Passive) or (prev_state > States.Frozen):
            prev_state = States.Passive
    # and see if we've got a recorded last response code
    prev_response = 202
    line = safereadline(syslog_logger, script_dir, "/last_response")
    if line != "":
        try:
            prev_response = int(line)
        except ValueError:
            syslog_logger.error("Got invalid last response %r; defaulting to 202" % line)
            prev_response = 202
    # and a last IP address
    prev_address = safereadline(syslog_logger, script_dir, "/last_address")
    if prev_address == "":
        prev_address = "error"
    # If we were anything other then failed, maintenance or frozen last time
    # this script ran, switch to Passive so that things get started up
    # properly, i.e. Passive -> StartingUp -> Active
    if prev_state != States.Failed and prev_state != States.Maintenance and prev_state != States.Frozen:
        prev_state = States.Passive
        prev_response = 202
    cherrypy.config.update({'server.socket_host': '0.0.0.0',
                            'server.socket_port': 1234,
                            'server.thread_pool': 1,
                            'server.thread_pool_max': 1,
                            'tools.staticdir.on': True,
                            'tools.staticdir.dir': script_dir,
                            'log.screen': True,
                            'tools.sessions.on': True,
                            })
    config = {'/':
              {
              }
              }
    application = cherrypy.tree.mount(HealthCheck(script_dir, service_fqdn, my_ip, prev_state, prev_response, prev_addr=prev_address) if False else HealthCheck(script_dir, service_fqdn, my_ip, prev_state, prev_response, prev_address), "/", config)
    logscope = cherrypy.log
    # Make a new RotatingFileHandler for the error log.
    fname = getattr(logscope, "rot_error_file", "%s/error.log" % script_dir)
    h = handlers.TimedRotatingFileHandler(fname, when='midnight', backupCount=7)
    h.setLevel(logging.DEBUG)
    h.setFormatter(_cplogging.logfmt)
    logscope.error_file = ""
    logscope.error_log.addHandler(h)
    # Make a new RotatingFileHandler for the access log.
    fname = getattr(logscope, "rot_access_file", "%s/access.log" % script_dir)
    h = handlers.TimedRotatingFileHandler(fname, when='midnight', backupCount=7)
    h.setLevel(logging.DEBUG)
    h.setFormatter(_cplogging.logfmt)
    logscope.access_file = ""
    logscope.access_log.addHandler(h)
    # Add a CTRL+C handler
    if hasattr(cherrypy.engine, 'signal_handler'):
        cherrypy.engine.signal_handler.subscribe()
    if hasattr(cherrypy.engine, 'console_control_handler'):
        # Bug fix: was ``subscrive`` (typo), which raised AttributeError
        # whenever a console control handler was available.
        cherrypy.engine.console_control_handler.subscribe()
    # Set up the daemon
    d = Daemonizer(cherrypy.engine)
    d.subscribe()
    PIDFile(cherrypy.engine, '/var/run/linaro-healthcheck.pid').subscribe()
    cherrypy.engine.start()
    cherrypy.engine.block()
# Entry point: start the health-check daemon when run as a script.
if __name__ == '__main__':
    main()
|