1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
|
import string, cgi, time, socket, os, errno, logging, glob
import cherrypy
from cherrypy import log, _cplogging
from logging import handlers
class States(object):
    """Integer codes for the states a node can be in.

    The values are plain consecutive integers (0..5) because they are written
    to and read back from the ``last_state`` file as text and are compared
    with ``<`` / ``>`` for range validation at start-up.
    """

    # Order matters: the numeric values are persisted on disk.
    Passive, Active, Failed, StartingUp, Maintenance, Frozen = range(6)
class HealthCheck(object):
    """CherryPy handler that reports this node's active/passive health state.

    Every request to ``/`` resolves the service FQDN, derives the node's new
    state from the previous state and from trigger files in the script
    directory (``frozen``, ``maintenance``, ``healthy``), answers with an HTTP
    status code that encodes that state, persists the result to disk and, for
    a supported state change, runs a matching ``from_<old>_to_<new>.*`` script.
    """

    def __init__(self, script_dir, service_fqdn, my_ip, prev_state, prev_response, prev_addr):
        """Remember the configuration and the state left by the previous run.

        script_dir    -- directory holding trigger files, saved state and scripts
        service_fqdn  -- host name whose resolved address marks the active node
        my_ip         -- this system's own IP address
        prev_state    -- last reported state (a ``States`` value)
        prev_response -- last reported HTTP status code
        prev_addr     -- last address the service FQDN resolved to
        """
        self.script_directory = script_dir
        self.fqdn = service_fqdn
        self.system_ip = my_ip
        self.last_state = prev_state
        self.last_response = prev_response
        self.last_address = prev_addr

    def silentremove(self, filename):
        """Delete ``filename`` (given with a leading "/") from the script directory.

        A file that does not exist is ignored; any other OSError is re-raised.
        """
        try:
            os.remove(self.script_directory + filename)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise

    def logmsg(self, string):
        """Write ``string`` to the CherryPy error log at INFO level."""
        log.error(string, context='HTTP', severity=logging.INFO, traceback=True)

    def _resolve_service_ip(self):
        """Return the address the service FQDN resolves to, or "unknown" on failure."""
        try:
            return socket.gethostbyname(self.fqdn)
        except Exception as e:
            self.logmsg("Got exception trying to get IP address for '%s': %s" % (self.fqdn, str(e)))
            return "unknown"

    def _determine_state(self, service_ip):
        """Work out the new state from trigger files, last state and addresses.

        Returns a ``States`` value, or -1 when no rule matched.
        """
        # Operator overrides and a sticky failure take precedence over
        # anything derived from the addresses.
        if os.path.isfile(self.script_directory + "/frozen"):
            self.logmsg("Frozen file exists")
            return States.Frozen
        if os.path.isfile(self.script_directory + "/maintenance"):
            self.logmsg("Maintenance file exists")
            return States.Maintenance
        if self.last_state == States.Failed:
            self.logmsg("In a failed state")
            return States.Failed

        # NOTE(review): several combinations fall through every rule below and
        # leave the state at -1 (e.g. last state Frozen on the active node, or
        # an address change while StartingUp). -1 is then persisted and
        # matches no rule on later requests either. Confirm intended handling.
        new_state = -1
        try:
            # The following logic ONLY works if:
            # a) there are two nodes
            # b) the IP addresses returned by Route 53 map onto those nodes
            if service_ip != self.last_address:
                # Active node has changed
                if self.last_state == States.Passive:
                    # We've become the new active node - switch to starting up
                    new_state = States.StartingUp
                elif self.last_state == States.Active:
                    # We were the active node - we must have failed
                    new_state = States.Failed
            elif service_ip == self.system_ip:
                # We're the active node.
                # See if the external health checks think we're healthy?
                healthy = os.path.isfile(self.script_directory + "/healthy")
                if self.last_state == States.Maintenance:
                    new_state = States.Passive
                elif self.last_state == States.Passive:
                    new_state = States.StartingUp
                elif self.last_state == States.StartingUp:
                    # Stay in StartingUp until the "healthy" file appears.
                    new_state = States.Active if healthy else States.StartingUp
                elif self.last_state == States.Active:
                    new_state = States.Active if healthy else States.Failed
            else:
                # We're the passive node
                new_state = States.Passive
        except Exception as e:
            log.error("Got exception trying to get IP address of '%s': %s" % (self.fqdn, str(e)))
            new_state = States.Failed
        return new_state

    def _response_for_state(self, new_state):
        """Map a state onto the HTTP status code reported for it."""
        if new_state == States.Frozen:
            # Frozen repeats whatever was reported last time.
            return self.last_response
        responses = {
            States.Active: 200,
            States.Passive: 202,
            States.StartingUp: 203,
            States.Failed: 500,
            States.Maintenance: 503,
        }
        if new_state not in responses:
            log.error("Unmatched state of %s" % str(new_state))
            return 202
        return responses[new_state]

    def _save(self, filename, value, description):
        """Persist ``value`` as text in the script directory; log on failure."""
        try:
            # "with" guarantees the file is flushed and closed.
            with open(self.script_directory + filename, "w") as fp:
                fp.write(str(value))
        except Exception as e:
            self.logmsg("Got exception trying to save %s: %s" % (description, str(e)))

    def _run_transition_script(self, new_state):
        """Run the script for a supported change from the last state to ``new_state``."""
        allowed = {
            States.Passive: (States.StartingUp, States.Maintenance, States.Frozen),
            States.StartingUp: (States.Active, States.Failed, States.Maintenance, States.Frozen),
            States.Active: (States.Failed, States.Maintenance, States.Frozen),
            States.Failed: (States.Maintenance, States.Frozen),
            States.Maintenance: (States.StartingUp, States.Passive),
            States.Frozen: (States.StartingUp, States.Passive),
        }
        if new_state not in allowed.get(self.last_state, ()):
            self.logmsg("Unexpected stage change from %s to %s" % (str(self.last_state), str(new_state)))
            return

        filename = "from_%s_to_%s.*" % (str(self.last_state), str(new_state))
        files = glob.glob("%s/%s" % (self.script_directory, filename))
        if len(files) == 1:
            # The path comes from the script directory, not from the request.
            os.system(files[0])
        elif len(files) > 1:
            # No matching script is the normal case and is not reported.
            self.logmsg("more than one matching script for stage change %s to %s" % (str(self.last_state), str(new_state)))

    @cherrypy.expose
    def index(self):
        """Handle a health check request and return the HTML response body.

        The HTTP status code carries the state: 200 active, 202 passive,
        203 starting up, 500 failed, 503 maintenance; frozen repeats the
        previous code.
        """
        # Get the IP address from Route 53
        service_ip = self._resolve_service_ip()

        # Only log IP address info when a change happens
        if service_ip != self.last_address:
            self.logmsg("Service IP = %s, this IP = %s, last service IP = %s" % (service_ip, self.system_ip, self.last_address))

        new_state = self._determine_state(service_ip)
        new_response = self._response_for_state(new_state)

        # Clean up some of the trigger files
        if new_state != States.Active:
            self.silentremove("/healthy")
        if new_state != States.Maintenance:
            self.silentremove("/maintenance")
        if new_state != States.Frozen:
            self.silentremove("/frozen")

        cherrypy.response.status = new_response
        cherrypy.response.headers['Content-type'] = 'text/html'
        body = "<p>This is the Linaro health check service. State is %s and response code is %s</P>" % (str(new_state), str(new_response))

        # Save away the various bits of information
        self._save("/last_state", new_state, "reported state")
        self._save("/last_response", new_response, "reported response")
        self._save("/last_address", service_ip, "service IP")

        # See if a script exists for one of the supported state changes
        if new_state != self.last_state:
            self._run_transition_script(new_state)

        self.last_state = new_state
        self.last_response = new_response
        self.last_address = service_ip

        # CherryPy uses the handler's return value as the response body, so
        # the body has to be returned rather than assigned to the response.
        return body
def safereadline(script_directory, filename):
    """Return the first line of ``script_directory + filename``, or "".

    The line is returned exactly as read, including any trailing newline.
    A missing file yields "" silently; any other IOError is printed and also
    yields "".
    """
    target = script_directory + filename
    try:
        with open(target) as handle:
            return handle.readline()
    except IOError as err:
        if err.errno == errno.ENOENT:
            # A missing file is expected on first run - stay quiet.
            return ""
        print("Got exception trying to read %s: %s" % (filename, str(err)))
    return ""
def main():
    """Load the persisted state, configure CherryPy and serve the health check.

    Reads the service FQDN from the ``fqdn`` file next to this script, works
    out this system's IP address, restores the last state, response code and
    service address from disk (using defaults when missing or unreadable),
    then starts the CherryPy engine and blocks. Returns early, after printing
    a message, if the FQDN or the local IP address cannot be determined.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))

    try:
        with open(script_dir + "/fqdn") as fp:
            service_fqdn = str(fp.readline()).rstrip()
    except Exception as e:
        print("Got exception trying to get fqdn: %s" % str(e))
        return

    # Try to get this system's IP address. connect() on a datagram socket
    # only selects the outgoing interface, which getsockname() then reports.
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            s.connect(("8.8.8.8", 53))
            my_ip = s.getsockname()[0]
        finally:
            s.close()
    except Exception as e:
        print("Got exception trying to get system's IP address: %s" % str(e))
        return

    # See if we've got a recorded last state
    prev_state = States.Passive
    line = safereadline(script_dir, "/last_state")
    if line != "":
        try:
            prev_state = int(line)
        except ValueError:
            # A corrupt file must not stop the service from starting.
            print("Ignoring unreadable last state: %r" % line)
            prev_state = States.Passive
        if (prev_state < States.Passive) or (prev_state > States.Frozen):
            prev_state = States.Passive

    # and see if we've got a recorded last response code
    prev_response = 202
    line = safereadline(script_dir, "/last_response")
    if line != "":
        try:
            prev_response = int(line)
        except ValueError:
            print("Ignoring unreadable last response: %r" % line)
            prev_response = 202

    # and a last IP address. Strip trailing whitespace, as is done for the
    # fqdn, so a stray newline does not look like an address change.
    prev_address = safereadline(script_dir, "/last_address").rstrip()
    if prev_address == "":
        print("Failed to get last IP address")
        prev_address = "error"

    # Set up the daemon
    # (subscribe cherrypy.process.plugins.Daemonizer(cherrypy.engine) here to
    # run detached from the terminal)
    # NOTE(review): staticdir is enabled globally on script_dir, which looks
    # like it exposes the files in that directory over HTTP - confirm intended.
    cherrypy.config.update({'server.socket_host': '0.0.0.0',
                            'server.socket_port': 1234,
                            'server.thread_pool': 1,
                            'server.thread_pool_max': 1,
                            'tools.staticdir.on': True,
                            'tools.staticdir.dir': script_dir,
                            'log.screen': True,
                            'tools.sessions.on': True,
                            })
    config = {'/': {}}
    cherrypy.tree.mount(HealthCheck(script_dir, service_fqdn, my_ip, prev_state, prev_response, prev_address), "/", config)

    # Replace CherryPy's own error and access log files with handlers that
    # rotate at midnight.
    logscope = cherrypy.log
    for kind in ("error", "access"):
        fname = getattr(logscope, "rot_%s_file" % kind, "%s.log" % kind)
        h = handlers.TimedRotatingFileHandler(fname, when='midnight')
        h.setLevel(logging.DEBUG)
        h.setFormatter(_cplogging.logfmt)
        setattr(logscope, "%s_file" % kind, "")
        getattr(logscope, "%s_log" % kind).addHandler(h)

    # Add a CTRL+C handler
    if hasattr(cherrypy.engine, 'signal_handler'):
        cherrypy.engine.signal_handler.subscribe()
    if hasattr(cherrypy.engine, 'console_control_handler'):
        cherrypy.engine.console_control_handler.subscribe()

    cherrypy.engine.start()
    cherrypy.engine.block()
# Start the service only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|