path: root/monitor_ec2_build_slaves.py
#!/usr/bin/env python2.7
"""
This script monitors runaway/stuck build slave EC2 instances as run by
Jenkins build services. There are known cases when Jenkins can't catch/
handle/stop them, so this cronject represents independent external line
of defence against such.
"""
#
# Note: this requires a recent boto (tested with 2.0).
# Older Ubuntu versions don't ship a recent enough boto, so we instead use a
# non-system python2.7 and install the needed packages using easy_install:
#
# easy_install-2.7 boto lxml pycrypto
#
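#
# Example crontab entry (the schedule and paths below are illustrative only,
# adjust to the actual deployment):
#
#   */30 * * * * python2.7 /path/to/monitor_ec2_build_slaves.py
#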

import sys
import time
import datetime
import logging
import re
import urllib2
import json
from pprint import pprint

from boto import ec2, utils
from lxml.etree import fromstring


# All timeouts are in minutes
HOUR = 60

# Don't consider a slave at all if it has been running for less than this
MINIMAL_INSTANCE_RUN_TIME = 0.5 * HOUR
# "Normal" instance run time; running for less than this without obvious
# breakage symptoms is ok
INSTANCE_RUN_TIME = 3 * HOUR
# If a slave is idle (no current builds), warn if it has run for this long
# NOTE: this will lead to false positives. Generally, it is a problem if a
# slave stays idle for more than 30 mins
INSTANCE_RUN_TIME_IDLE = 15 * HOUR
# Unconditionally warn about an instance running for this long
INSTANCE_RUN_TIME_TOO_LONG = 17 * HOUR
# Warn about builds running for more than this
DEFAULT_BUILD_RUN_TIME = 3.33 * HOUR
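# Per-job overrides: (job name regex, timeout in minutes); the patterns are
# matched against job names with re.match() in check_build_slave()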
BUILD_RUN_TIMES = [
    ("linaro-android_toolchain-trunk", 3 * HOUR),
    ("precise-armhf-pre-built-images", 5 * HOUR),
    ("quantal-armhf-trigger", 7.1 * HOUR),
    ("openembedded-.+", 13.5 * HOUR),
]

ACTIVE_REGION = "us-east-1"
JENKINS_HOME = "/var/lib/jenkins/"

# This maps the EC2 key name used to start a slave to the owning master instance
KEY_NAME_TO_MASTER = {
    "jenkins": "ci.linaro.org",
    "jenkins-slave": "android-build.linaro.org",
}


log = logging.getLogger("monitor")

def get_cleartext(s):
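    """Decrypt a base64-encoded secret from the Jenkins configuration.

    The AES key is the first 16 bytes of the SHA-256 digest of Jenkins'
    secret.key file; a valid plaintext carries a "::::MAGIC::::" suffix,
    which is verified and stripped. Returns None if the marker is missing.
    """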
    import base64
    import hashlib
    from Crypto.Cipher import AES

    key = open(JENKINS_HOME + "secret.key").read()
    digest = hashlib.sha256(key)
    aes = AES.new(digest.digest()[0:128 / 8])
    clear = aes.decrypt(base64.b64decode(s))
    clear = clear.rstrip()
    if not clear.endswith("::::MAGIC::::"):
        return None
    clear = clear[:-len("::::MAGIC::::")]
    return clear


def get_credentials():
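    """Extract the EC2 plugin's AWS access id and (decrypted) secret key
    from the Jenkins config.xml and return them as an (access_id,
    secret_key) tuple.
    """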
    tree = fromstring(open(JENKINS_HOME + "config.xml").read())
    nodes = tree.xpath("//hudson.plugins.ec2.EC2Cloud/accessId")
    access_id = nodes[0].text
    nodes = tree.xpath("//hudson.plugins.ec2.EC2Cloud/secretKey")
    secret_key = get_cleartext(nodes[0].text)
    return access_id, secret_key


def get_slave_info(instance_id, host):
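    """Query the Jenkins JSON API of `host` for the slave named after the
    EC2 instance id.

    Returns a dict with "status" ("", "offline" or "unknown-owner") and
    "builds", a list of (job_name, build_no, duration, url) tuples for the
    builds currently running on the slave's executors.
    """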
    slave_info = {"status": "", "builds": []}
    try:
        url = "https://%s/jenkins/computer/%s/api/json?depth=2" % (host, instance_id)
        log.debug("Fetching %s", url)
        f = urllib2.urlopen(url)
    except urllib2.HTTPError, e:
        if e.code == 404:
            slave_info["status"] = "unknown-owner"
            return slave_info
        raise
    js = json.load(f)
#    pprint(js)

    if js["offline"]:
        slave_info["status"] = "offline"
        return slave_info

    builds = []
    for executor in js["executors"]:
        job = executor["currentExecutable"]
        if not job:
            continue
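        # Build URLs are expected to look like .../jenkins/job/<job_name>/<build_no>/;
        # strip stray "/." sequences, then pull the build number and job name
        # out of the URL. The API reports timestamps in milliseconds.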
        url = job["url"].replace("/.", "")
        head, build_no, _ = url.rsplit("/", 2)
        job_name = head.split("/", 5)[-1]
        dur = datetime.timedelta(seconds=time.time() - job["timestamp"] / 1000.0)
        builds.append((job_name, build_no, dur, url))

    slave_info["builds"] = builds
    return slave_info


def check_build_slave(instance_id, owner, run_time):
    """Decide if given instance should be reported as potential runaway.
    Return None if not. Otherwise, return slave_info structure with details
    about a slave which will be useful to report to a human.
    """
    log.debug("Considering %s by %s, run time %s", instance_id, owner, run_time)
    if run_time < datetime.timedelta(minutes=MINIMAL_INSTANCE_RUN_TIME):
        return None

    slave_info = get_slave_info(instance_id, owner)

    # Still offline after running this long? Most likely a startup failure.
    if slave_info["status"] == "offline":
        return slave_info

    if run_time < datetime.timedelta(minutes=INSTANCE_RUN_TIME):
        return None

    if run_time > datetime.timedelta(minutes=INSTANCE_RUN_TIME_TOO_LONG):
        return slave_info
    if not slave_info["builds"] and run_time > datetime.timedelta(minutes=INSTANCE_RUN_TIME_IDLE):
        return slave_info
    for build in slave_info["builds"]:
        # Check if we have a special timeout for this build; reset the flag
        # for each build so that one matched build doesn't suppress the
        # default check for the remaining ones
        found = False
        for pat, timeout in BUILD_RUN_TIMES:
            if re.match(pat, build[0]):
                if build[2] > datetime.timedelta(minutes=timeout):
                    return slave_info
                else:
                    found = True
                    break
        # Else, use the default timeout
        if not found and build[2] > datetime.timedelta(minutes=DEFAULT_BUILD_RUN_TIME):
            return slave_info
    return None


def process_instance(now, i):
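    """Check a single EC2 instance and print a report if it (or a build on
    it) looks like a runaway; slaves which are offline in Jenkins are
    terminated automatically.
    """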
    # Useful properties: i.id, i.key_name, i.state, i.launch_time
    if i.state == "running":
        if i.key_name not in KEY_NAME_TO_MASTER:
            return
        owner = KEY_NAME_TO_MASTER[i.key_name]
        run_time = now - utils.parse_ts(i.launch_time)
        try:
            slave_info = check_build_slave(i.id, owner, run_time)
        except Exception, e:
            print "Error checking %s (owner %s) running %s for stuck slaves" % (i.id, owner, run_time)
            print "Exception: %s" % e
            return

        if slave_info:
            print "Build slave %s (%s) or build(s) on it are running for too long!" % (i.id, owner)
            print "Build slave run time: %s" % run_time
            if slave_info["status"] == "unknown-owner":
                print "This slave was not recognized by %s (started by sandbox?)" % owner
            elif slave_info["status"] == "offline":
                print "Offline (failed to start up or failed later by some reason)"
                print "Automatically terminating the instance"
                i.terminate()
            elif not slave_info["builds"]:
                print "Idle"
            else:
                print "Running builds:"
                for build in slave_info["builds"]:
                    print "%s #%s" % (build[0], build[1])
                    print "Running for:", build[2]
                    print build[3]
            print


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
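    # "-v" as the first command-line argument enables debug logging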
    if len(sys.argv) > 1 and sys.argv[1] == "-v":
        log.setLevel(logging.DEBUG)
    key, secret = get_credentials()

#    regions = [ec2.get_region(ACTIVE_REGION)]
#    regions = ec2.regions()
    for region_name in [ACTIVE_REGION]:
        conn = ec2.connect_to_region(region_name, aws_access_key_id=key, aws_secret_access_key=secret)
        now = datetime.datetime.utcnow()
        for reservation in conn.get_all_instances():
            for i in reservation.instances:
                process_instance(now, i)