llvmbot monitor: Add "First Failing" and "Failing Since" columns
- The "First Failing" column (titled "1st Failing" in the table) contains a link to the
first failed build of the current failure streak.
- The "Failing Since" column shows the time elapsed since that first
failed build finished. This can be useful for detecting bots which
fail for a long time.
This column will be colored in red if that time is >24h.
Change-Id: I19878c922c65f8a9f5548ab8e3ea942a57c5a1f6
diff --git a/monitor/README.txt b/monitor/README.txt
index 06f0cec..4917e1e 100644
--- a/monitor/README.txt
+++ b/monitor/README.txt
@@ -104,12 +104,16 @@
that have gotten disconnected. If this time is greater than 24 hours, it will be shown
in red.
* "Duration": The length of the last build.
- * "Build": The build number of the last finished build, which itself will be a link
+ * "Latest": The build number of the last finished build, which itself will be a link
to the results page for that build.
* "Failing steps": The failed build steps, if it was a failed build.
* "Build In Progress": This will be "Yes" if there is a build currently running or "No"
if there is not. If we cannot determine this, it will be left blank and you should
check the builder's status page instead.
+ * "1st Failing": The build number of the first failed build in the current streak of
+ failures, which itself will be a link to the results page for that build.
+ * "Failing Since": The time since the first failed build finished. This is useful for
+ spotting bots that fail for a long time. If it is greater than 24 hours, it is shown in red.
Note: "finished" here refers to the build ending be that by success, cancellation or
failure.
diff --git a/monitor/bot-status.py b/monitor/bot-status.py
index e86e7bc..f19ce39 100755
--- a/monitor/bot-status.py
+++ b/monitor/bot-status.py
@@ -102,6 +102,23 @@
buildid = build["buildid"]
status["steps"] = list(get_bot_failing_steps(session, base_url, buildid))
+ # find the start of the failure streak
+ first_fail = build
+ for build in reversed_builds:
+ if build["state_string"] == "build successful":
+ status["first_fail_number"] = first_fail["number"]
+ status["first_fail_url"] = "{}/builds/{}".format(
+ agent_url, first_fail["number"]
+ )
+ fail_since = int(datetime.now().timestamp()) - int(
+ first_fail["complete_at"]
+ )
+ status["fail_since"] = timedelta(seconds=fail_since)
+ break
+ first_fail = build
+ else:
+ pass # fails since forever?
+
return status
@@ -168,9 +185,11 @@
"Status",
"T Since",
"Duration",
- "Build",
+ "Latest",
"Failing steps",
"Build In Progress",
+ "1st Failing",
+ "Failing Since",
]
num_columns = len(column_titles)
@@ -228,24 +247,19 @@
row.AddCell(
"<a href='{}'>{}</a>".format(status["builder_url"], bot["name"])
)
- row.AddCell(
- "<font color='{}'>{}</font>".format(
- "red" if status["fail"] else "green",
- "FAIL" if status["fail"] else "PASS",
- )
- )
+
+ status_cell = row.AddCell()
+ if status["fail"]:
+ status_cell.Style("color:red").Content("FAIL")
+ else:
+ status_cell.Style("color:green").Content("PASS")
time_since_cell = row.AddCell()
if "time_since" in status:
time_since = status["time_since"]
# No build should be taking more than a day
if time_since > timedelta(hours=24):
- time_since = '<p style="color:red">{}</p>'.format(
- time_since
- )
- else:
- time_since = str(time_since)
-
+ time_since_cell.Style("color:red")
time_since_cell.Content(time_since)
duration_cell = row.AddCell()
@@ -280,6 +294,22 @@
"Yes" if status["next_in_progress"] else "No"
)
+ first_fail_cell = row.AddCell()
+ if "first_fail_number" in status:
+ first_fail_cell.Content(
+ "<a href='{}'>{}</a>".format(
+ status["first_fail_url"], status["first_fail_number"]
+ )
+ )
+
+ fail_since_cell = row.AddCell()
+ if "fail_since" in status:
+ fail_since = status["fail_since"]
+ # No build should fail for more than a day
+ if fail_since > timedelta(hours=24):
+ fail_since_cell.Style("color:red")
+ fail_since_cell.Content(fail_since)
+
table.EndBody()
# Move temp to main (atomic change)