llvmbot monitor: Add "First Failing" and "Failing Since" columns

- The "First Failing" column contains a link to the first failed build, if the bot has been failing for several builds.
- The "Failing Since" column shows the time elapsed since that first failed build finished. This can be useful for detecting bots that have been failing for a long time. This column is colored red if that time is >24h.

Change-Id: I19878c922c65f8a9f5548ab8e3ea942a57c5a1f6

commit: 0cbe02e84982406f11349808706dc07af7f6f10f [log] [tgz]
author: Antoine Moynault <antoine.moynault@linaro.org> Wed Jun 21 08:13:57 2023 +0000
committer: Antoine Moynault <antoine.moynault@linaro.org> Wed Jun 21 10:20:29 2023 +0000
tree: 7398c3eea51cef10dc1c85818963715983f6a963
parent: 966ee8aecd8c8e673a22bd121d8730cb6e5e24d4 [diff]
diff --git a/monitor/README.txt b/monitor/README.txt
index 06f0cec..4917e1e 100644
--- a/monitor/README.txt
+++ b/monitor/README.txt

@@ -104,12 +104,16 @@
     that have gotten disconnected. If this time is greater than 24 hours, it will be shown
     in red.
   * "Duration": The length of the last build.
-  * "Build": The build number of the last finished build, which itself will be a link
+  * "Latest": The build number of the last finished build, which itself will be a link
     to the results page for that build.
   * "Failing steps": The failed build steps, if it was a failed build.
   * "Build In Progress": This will be "Yes" if there is a build currently running or "No"
     if there is not. If we cannot determine this, it will be left blank and you should
     check the builder's status page instead.
+  * "1st Failing": The number of the first failed build, if the bot fails for several
+    builds.
+  * "Failing Since": The time since the first failed build finished. This is useful for
+    spotting bots that fail for a long time.
 
 Note: "finished" here refers to the build ending be that by success, cancellation or
 failure.

diff --git a/monitor/bot-status.py b/monitor/bot-status.py
index e86e7bc..f19ce39 100755
--- a/monitor/bot-status.py
+++ b/monitor/bot-status.py

@@ -102,6 +102,23 @@
             buildid = build["buildid"]
             status["steps"] = list(get_bot_failing_steps(session, base_url, buildid))
 
+            # find the start of the failure streak
+            first_fail = build
+            for build in reversed_builds:
+                if build["state_string"] == "build successful":
+                    status["first_fail_number"] = first_fail["number"]
+                    status["first_fail_url"] = "{}/builds/{}".format(
+                        agent_url, first_fail["number"]
+                    )
+                    fail_since = int(datetime.now().timestamp()) - int(
+                        first_fail["complete_at"]
+                    )
+                    status["fail_since"] = timedelta(seconds=fail_since)
+                    break
+                first_fail = build
+            else:
+                pass  # fails since forever?
+
         return status
 
 
@@ -168,9 +185,11 @@
         "Status",
         "T Since",
         "Duration",
-        "Build",
+        "Latest",
         "Failing steps",
         "Build In Progress",
+        "1st Failing",
+        "Failing Since",
     ]
     num_columns = len(column_titles)
 
@@ -228,24 +247,19 @@
                     row.AddCell(
                         "<a href='{}'>{}</a>".format(status["builder_url"], bot["name"])
                     )
-                    row.AddCell(
-                        "<font color='{}'>{}</font>".format(
-                            "red" if status["fail"] else "green",
-                            "FAIL" if status["fail"] else "PASS",
-                        )
-                    )
+
+                    status_cell = row.AddCell()
+                    if status["fail"]:
+                        status_cell.Style("color:red").Content("FAIL")
+                    else:
+                        status_cell.Style("color:green").Content("PASS")
 
                     time_since_cell = row.AddCell()
                     if "time_since" in status:
                         time_since = status["time_since"]
                         # No build should be taking more than a day
                         if time_since > timedelta(hours=24):
-                            time_since = '<p style="color:red">{}</p>'.format(
-                                time_since
-                            )
-                        else:
-                            time_since = str(time_since)
-
+                            time_since_cell.Style("color:red")
                         time_since_cell.Content(time_since)
 
                     duration_cell = row.AddCell()
@@ -280,6 +294,22 @@
                             "Yes" if status["next_in_progress"] else "No"
                         )
 
+                    first_fail_cell = row.AddCell()
+                    if "first_fail_number" in status:
+                        first_fail_cell.Content(
+                            "<a href='{}'>{}</a>".format(
+                                status["first_fail_url"], status["first_fail_number"]
+                            )
+                        )
+
+                    fail_since_cell = row.AddCell()
+                    if "fail_since" in status:
+                        fail_since = status["fail_since"]
+                        # No build should fail for more than a day
+                        if fail_since > timedelta(hours=24):
+                            fail_since_cell.Style("color:red")
+                        fail_since_cell.Content(fail_since)
+
                 table.EndBody()
 
     # Move temp to main (atomic change)
commit	0cbe02e84982406f11349808706dc07af7f6f10f	[log] [tgz]
author	Antoine Moynault <antoine.moynault@linaro.org>	Wed Jun 21 08:13:57 2023 +0000
committer	Antoine Moynault <antoine.moynault@linaro.org>	Wed Jun 21 10:20:29 2023 +0000
tree	7398c3eea51cef10dc1c85818963715983f6a963
parent	966ee8aecd8c8e673a22bd121d8730cb6e5e24d4 [diff]