1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
|
#!/usr/bin/env python2.7
"""
This script monitors runaway/stuck build slave EC2 instances as run by
Jenkins build services. There are known cases when Jenkins can't catch/
handle/stop them, so this cron job represents an independent external line
of defence against such instances.
"""
#
# Note: this requires a recent boto (tested with 2.0).
# Older Ubuntu versions don't have a recent enough boto, so we instead use a
# non-system python2.7 and install the needed packages using easy_install:
#
# easy_install-2.7 boto lxml pycrypto
#
import sys
import time
import datetime
import logging
import re
import urllib2
import json
from pprint import pprint
from boto import connect_ec2, connect_s3, ec2, utils
from lxml.etree import fromstring
# Every duration below is expressed in minutes.
HOUR = 60

# Slaves that have been running for less than this are not considered at all.
MINIMAL_INSTANCE_RUN_TIME = 0.5 * HOUR

# "Normal" instance run time: running for less than this without obvious
# breakage symptoms is fine.
INSTANCE_RUN_TIME = 3 * HOUR

# If a slave is idle (no current builds), warn once it has run for this long.
# NOTE: this will lead to false positives. Generally, it is a problem
# if a slave stays idle for more than 30 minutes.
INSTANCE_RUN_TIME_IDLE = 15 * HOUR

# Unconditionally warn about an instance that has been running this long.
INSTANCE_RUN_TIME_TOO_LONG = 17 * HOUR

# Warn about builds running longer than this, unless the job has its own
# limit in BUILD_RUN_TIMES.
DEFAULT_BUILD_RUN_TIME = 3.33 * HOUR

# Per-job limits as (job name regex, limit) pairs.
BUILD_RUN_TIMES = [
    ("linaro-android_toolchain-trunk", 3 * HOUR),
    ("precise-armhf-pre-built-images", 5 * HOUR),
    ("quantal-armhf-trigger", 7.1 * HOUR),
    ("openembedded-.+", 13.5 * HOUR),
]

ACTIVE_REGION = "us-east-1"
JENKINS_HOME = "/var/lib/jenkins/"

# Maps the EC2 key name used to start a slave to the owning master instance.
KEY_NAME_TO_MASTER = {
    "jenkins": "ci.linaro.org",
    "jenkins-slave": "android-build.linaro.org",
}

log = logging.getLogger("monitor")
def get_cleartext(s):
    """Decrypt a base64-encoded secret using the key in JENKINS_HOME/secret.key.

    The AES key is the first 128 bits of the SHA-256 digest of the contents
    of the "secret.key" file. The decrypted text has trailing whitespace
    stripped and must end with the "::::MAGIC::::" marker, which is removed
    before returning.

    Args:
        s: base64-encoded ciphertext.

    Returns:
        The cleartext without the marker, or None if the decrypted data
        does not end with the marker.
    """
    import base64
    import hashlib
    from Crypto.Cipher import AES

    magic = "::::MAGIC::::"
    # Read the key inside a context manager so the file handle is always
    # closed, even if reading fails.
    with open(JENKINS_HOME + "secret.key") as key_file:
        key = key_file.read()
    # Floor division keeps the slice bound an integer regardless of how the
    # interpreter treats "/".
    aes_key = hashlib.sha256(key).digest()[:128 // 8]
    # No cipher mode is passed, so the library's default mode is used.
    aes = AES.new(aes_key)
    clear = aes.decrypt(base64.b64decode(s)).rstrip()
    if not clear.endswith(magic):
        return None
    return clear[:-len(magic)]
def get_credentials():
    """Read the EC2 credentials from JENKINS_HOME/config.xml.

    The access id is taken verbatim from the first
    hudson.plugins.ec2.EC2Cloud/accessId element; the secret key is the
    text of the first hudson.plugins.ec2.EC2Cloud/secretKey element passed
    through get_cleartext().

    Returns:
        An (access_id, secret_key) tuple. secret_key is whatever
        get_cleartext() returns, which may be None.

    Raises:
        IndexError: if either element is missing from config.xml.
    """
    # Context manager so the config file handle is closed promptly.
    with open(JENKINS_HOME + "config.xml") as config_file:
        tree = fromstring(config_file.read())

    def first_text(tag):
        # Text of the first matching element, with a descriptive error
        # instead of a bare "list index out of range" when it is absent.
        nodes = tree.xpath("//hudson.plugins.ec2.EC2Cloud/" + tag)
        if not nodes:
            raise IndexError(
                "no hudson.plugins.ec2.EC2Cloud/%s element in %sconfig.xml"
                % (tag, JENKINS_HOME))
        return nodes[0].text

    access_id = first_text("accessId")
    secret_key = get_cleartext(first_text("secretKey"))
    return access_id, secret_key
def get_slave_info(instance_id, host):
    """Query a Jenkins master for the state of one build slave.

    Fetches https://<host>/jenkins/computer/<instance_id>/api/json?depth=2
    and summarises the response.

    Args:
        instance_id: slave identifier, used as the computer name in the URL.
        host: hostname of the Jenkins master to ask.

    Returns:
        A dict with two keys:
          "status": "unknown-owner" if the master answered 404,
                    "offline" if the slave is reported offline,
                    "" otherwise.
          "builds": list of (job_name, build_no, duration, url) tuples, one
                    per executor that currently runs something; duration is
                    a datetime.timedelta. Empty for the two non-""
                    statuses.

    Raises:
        urllib2.HTTPError: for any HTTP error other than 404.
    """
    slave_info = {"status": "", "builds": []}
    url = "https://%s/jenkins/computer/%s/api/json?depth=2" % (host, instance_id)
    log.debug("Fetching %s", url)
    try:
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as e:
        if e.code == 404:
            slave_info["status"] = "unknown-owner"
            return slave_info
        raise

    # Close the response explicitly (try/finally rather than "with", which
    # Python 2 urllib2 responses do not support) so the connection is
    # released even if JSON parsing fails.
    try:
        js = json.load(response)
    finally:
        response.close()

    if js["offline"]:
        slave_info["status"] = "offline"
        return slave_info

    # Take one reference time so all build durations are measured against
    # the same instant.
    now = time.time()
    builds = []
    for executor in js["executors"]:
        job = executor["currentExecutable"]
        if not job:
            # Idle executor.
            continue
        build_url = job["url"].replace("/.", "")
        # NOTE(review): assumes build URLs look like
        # https://<host>/jenkins/job/<name>/<number>/ -- the trailing slash
        # yields the empty last piece, and the job name is everything after
        # the fifth "/" of the remaining head. Confirm against the master.
        head, build_no, _ = build_url.rsplit("/", 2)
        job_name = head.split("/", 5)[-1]
        # The build timestamp is treated as milliseconds since the epoch.
        dur = datetime.timedelta(seconds=now - job["timestamp"] / 1000.0)
        builds.append((job_name, build_no, dur, build_url))
    slave_info["builds"] = builds
    return slave_info
def check_build_slave(instance_id, owner, run_time):
    """Decide if given instance should be reported as potential runaway.

    Checks are applied in this order:
      1. Running for less than MINIMAL_INSTANCE_RUN_TIME: not reported (the
         master is not even queried).
      2. Slave reported offline by the master: reported.
      3. Running for less than INSTANCE_RUN_TIME: not reported.
      4. Running for more than INSTANCE_RUN_TIME_TOO_LONG: reported.
      5. No builds and running for more than INSTANCE_RUN_TIME_IDLE:
         reported.
      6. Any build running longer than its limit: reported. The limit is
         taken from the first BUILD_RUN_TIMES entry whose pattern matches
         the job name, or DEFAULT_BUILD_RUN_TIME if none matches.

    Args:
        instance_id: slave identifier passed on to get_slave_info().
        owner: hostname of the Jenkins master owning the slave.
        run_time: datetime.timedelta the instance has been running for.

    Returns:
        None if the slave should not be reported. Otherwise, the slave_info
        structure from get_slave_info() with details about the slave which
        will be useful to report to a human.
    """
    log.debug("Considering %s by %s, run time %s", instance_id, owner, run_time)
    if run_time < datetime.timedelta(minutes=MINIMAL_INSTANCE_RUN_TIME):
        return None

    slave_info = get_slave_info(instance_id, owner)

    # Offline for that much time? Startup failure.
    if slave_info["status"] == "offline":
        return slave_info

    if run_time < datetime.timedelta(minutes=INSTANCE_RUN_TIME):
        return None
    if run_time > datetime.timedelta(minutes=INSTANCE_RUN_TIME_TOO_LONG):
        return slave_info
    if not slave_info["builds"] and run_time > datetime.timedelta(minutes=INSTANCE_RUN_TIME_IDLE):
        return slave_info

    for build in slave_info["builds"]:
        job_name, duration = build[0], build[2]
        # Pick the limit afresh for every build. A single flag shared across
        # builds would let one job with a special timeout suppress the
        # default check for every build listed after it.
        limit = DEFAULT_BUILD_RUN_TIME
        for pat, timeout in BUILD_RUN_TIMES:
            if re.match(pat, job_name):
                limit = timeout
                break
        if duration > datetime.timedelta(minutes=limit):
            return slave_info
    return None
def process_instance(now, i):
    """Check one EC2 instance and print a report if it looks like a runaway.

    Only instances in the "running" state whose key name appears in
    KEY_NAME_TO_MASTER are considered. Errors raised while checking the
    slave are printed and otherwise ignored, so one failing instance does
    not stop the caller from processing others. A slave reported as offline
    is terminated after being reported.

    Args:
        now: current time as a datetime; the instance's parsed launch time
            is subtracted from it to get the run time.
        i: instance object; reads i.id, i.key_name, i.state and
            i.launch_time, and may call i.terminate().

    Returns:
        None. All results are written to standard output.
    """
    # Each print below takes a single pre-formatted string so the output is
    # the same whether print is a statement or a function.
    if i.state != "running":
        return
    owner = KEY_NAME_TO_MASTER.get(i.key_name)
    if owner is None:
        # Not started with one of the keys used for build slaves.
        return

    run_time = now - utils.parse_ts(i.launch_time)
    try:
        slave_info = check_build_slave(i.id, owner, run_time)
    except Exception as e:
        print("Error checking %s (owner %s) running %s for stuck slaves" % (i.id, owner, run_time))
        print("Exception: %s" % e)
        return
    if not slave_info:
        return

    print("Build slave %s (%s) or build(s) on it are running for too long!" % (i.id, owner))
    print("Build slave run time: %s" % run_time)
    if slave_info["status"] == "unknown-owner":
        print("This slave was not recognized by %s (started by sandbox?)" % owner)
    elif slave_info["status"] == "offline":
        print("Offline (failed to start up or failed later by some reason)")
        print("Automatically terminating the instance")
        i.terminate()
    elif not slave_info["builds"]:
        print("Idle")
    else:
        print("Running builds:")
        for build in slave_info["builds"]:
            print("%s #%s" % (build[0], build[1]))
            print("Running for: %s" % build[2])
            print(build[3])
    # Blank line separating this report from the next one.
    print("")
if __name__ == "__main__":
    # Script entry point: configure logging, load the credentials and check
    # every instance in the active region.
    logging.basicConfig(level=logging.INFO)
    # "-v" as the first argument enables debug output from this script.
    if sys.argv[1:2] == ["-v"]:
        log.setLevel(logging.DEBUG)

    access_id, secret_key = get_credentials()

    # Only ACTIVE_REGION is scanned; the loop is kept so more regions can be
    # listed here.
    for region_name in (ACTIVE_REGION,):
        conn = ec2.connect_to_region(
            region_name,
            aws_access_key_id=access_id,
            aws_secret_access_key=secret_key)
        now = datetime.datetime.utcnow()
        for reservation in conn.get_all_instances():
            for instance in reservation.instances:
                process_instance(now, instance)
|