diff options
author | Thiago Jung Bauermann <thiago.bauermann@linaro.org> | 2023-10-31 22:06:04 -0300 |
---|---|---|
committer | Thiago Jung Bauermann <thiago.bauermann@linaro.org> | 2023-12-27 12:18:24 -0300 |
commit | 0e230b26962aee97d8c53a79a78a64a6a9d6b2f3 (patch) | |
tree | 902bb1c8e37366edda032603ee1772676d2001c8 | |
parent | fc92e939942f8d495315075c64dae72328bcee56 (diff) |
lib/make.sh (make_check): Don't use baseline to build flaky tests list
We remove old results from the database, which also removes the flaky
entries they provided. This causes our CI to forget previously detected
flaky tests.
Fix this by making each job record all flaky tests that it detects. That
way, even if a job's results are later removed from the database, the
remaining results will still contain the same flaky entries. This has the
additional advantage of recording how often each flaky entry occurs.
We do this by not using the baseline flaky results when generating the
lists of flaky test failures and flaky test passes detected in each
testsuite run.
We still use the baseline flaky results to decide whether to run another
try of the testsuite, so this change won't impact runtime cost.
Change-Id: Idf2ec69af9d613511b743f871dc9abfe32e7b977
-rw-r--r-- | lib/make.sh | 111 |
1 file changed, 84 insertions, 27 deletions
diff --git a/lib/make.sh b/lib/make.sh index bf4facaf..fc052569 100644 --- a/lib/make.sh +++ b/lib/make.sh @@ -1171,11 +1171,13 @@ make_check() local validate_failures="$testsuite_mgmt/validate_failures.py" # Prepare temporary fail files - local new_fails new_passes baseline_flaky known_flaky_and_fails new_flaky + local new_fails new_passes baseline_flaky new_flaky + local known_flaky_and_fails_for_deciding flaky_and_fails_for_output new_fails=$(mktemp) new_passes=$(mktemp) baseline_flaky=$(mktemp) - known_flaky_and_fails=$(mktemp) + known_flaky_and_fails_for_deciding=$(mktemp) + flaky_and_fails_for_output=$(mktemp) if [ "$flaky_failures" = "" ]; then new_flaky=$(mktemp) @@ -1201,7 +1203,8 @@ make_check() expiry_date_opt+=(--expiry_date "$failures_expiration_date") fi - # Construct the initial $known_flaky_and_fails list. + # Construct the initial $known_flaky_and_fails_for_deciding and + # $flaky_and_fails_for_output lists. # # For the first iteration (try #0) we expect fails, passes and flaky tests # to be the same as in provided $expected_failures and $flaky_failures. @@ -1214,7 +1217,13 @@ make_check() # $try results. Each difference between $try-1 and $try will be recorded # in $new_flaky list, so with every try we will ignore more and more # tests as flaky. We collect failures of the current try in $new_try_fails, - # which then becomes $prev_try_fails on $try+1. + # which then becomes $prev_try_fails on $try+1. When generating the + # difference that will go into $new_flaky we don't compare against the + # provided $baseline_flaky, so that any detected flaky failure will appear + # in $new_flaky even if it's already in the baseline flaky list. This is + # why $flaky_and_fails_for_output doesn't include the baseline flaky results + # and is done so that the list of flaky tests produced at the end of + # make_check has some redundancy with it. # # Note that we generate $prev_try_fails and $new_try_fails without regard # for flaky tests. 
Therefore, $validate_failures that generate $new_fails @@ -1247,18 +1256,25 @@ make_check() # libstdc++:libstdc++-dg/conformance.exp testsuite. # # With the new approach when this test [rarely] passes, we will detect - # that in comparison with "$known_flaky_and_fails", and, if $try==0, trigger - # another iteration of testing to confirm stability of the new PASS. - # The test will fail on the next iteration, and we will add it to - # $new_flaky list. If the test passes during $try!=0, we will add it - # to the $new_flaky list immediately. + # that in comparison with "$known_flaky_and_fails_for_deciding", and, if + # $try==0, trigger another iteration of testing to confirm stability of + # the new PASS. The test will fail on the next iteration, and we will add + # it to $new_flaky list. If the test passes during $try!=0, we will add + # it to the $new_flaky list immediately. - cat > "$known_flaky_and_fails" <<EOF + cat > "$known_flaky_and_fails_for_deciding" <<EOF @include $new_flaky @include $baseline_flaky @include $prev_try_fails EOF + # This file doesn't contain $baseline_flaky and is used to find the new + # flaky tests to be added to $new_flaky in each try. + cat > "$flaky_and_fails_for_output" <<EOF +@include $new_flaky +@include $prev_try_fails +EOF + # Example iterations with binutils component: # try#0 runtestflags="" -> tools=(any) dirs=(/binutils /gas /ld) # detect failures in both gas and ld @@ -1414,32 +1430,68 @@ EOF local -a failed_exps_for_dir=() + # We do two sets of validate_failures.py runs: + # + # In the first one we compare with the previous try's + # failures plus all known flaky tests. We use the exit + # status to decide whether to do another try of the + # testsuite. + # + # In the second one we compare with the previous try's + # failures plus the flaky tests detected in this + # invocation of Abe. We use the output for $new_flaky. + # Check if we have any new FAILs or PASSes compared # to the previous iteration. 
# Detect PASS->FAIL flaky tests. - local res_new_fails + local res_new_fails_for_deciding + "$validate_failures" \ + --manifest="$known_flaky_and_fails_for_deciding" \ + --build_dir="${builddir}$dir" \ + --verbosity=0 "${expiry_date_opt[@]}" & + res_new_fails_for_deciding=0 && wait $! \ + || res_new_fails_for_deciding=$? + + # Detect FAIL->PASS flaky tests. + local res_new_passes_for_deciding + "$validate_failures" \ + --manifest="$known_flaky_and_fails_for_deciding" \ + --build_dir="${builddir}$dir" \ + --verbosity=0 "${expiry_date_opt[@]}" \ + --inverse_match & + res_new_passes_for_deciding=0 && wait $! \ + || res_new_passes_for_deciding=$? + + # Check again for new FAILs or PASSes compared to the + # previous iteration, but this time without considering + # the baseline flaky results. + + # Detect PASS->FAIL flaky tests. + local res_new_fails_for_output "$validate_failures" \ - --manifest="$known_flaky_and_fails" \ + --manifest="$flaky_and_fails_for_output" \ --build_dir="${builddir}$dir" \ --verbosity=1 "${expiry_date_opt[@]}" \ > "$new_fails" & - res_new_fails=0 && wait $! || res_new_fails=$? + res_new_fails_for_output=0 && wait $! \ + || res_new_fails_for_output=$? # Detect FAIL->PASS flaky tests. - local res_new_passes + local res_new_passes_for_output "$validate_failures" \ - --manifest="$known_flaky_and_fails" \ + --manifest="$flaky_and_fails_for_output" \ --build_dir="${builddir}$dir" \ --verbosity=1 "${expiry_date_opt[@]}" \ --inverse_match \ > "$new_passes" & - res_new_passes=0 && wait $! || res_new_passes=$? + res_new_passes_for_output=0 && wait $! \ + || res_new_passes_for_output=$? # If it was the first try and it didn't fail, we don't # need to save copies of the sum and log files. 
if [ $try = 0 ] \ - && [ $res_new_fails = 0 ] \ - && [ $res_new_passes = 0 ]; then + && [ $res_new_fails_for_deciding = 0 ] \ + && [ $res_new_passes_for_deciding = 0 ]; then break fi @@ -1463,17 +1515,20 @@ EOF sums["$sum"]+="${sum}.${try};" done < <(find "${builddir}$dir" -name '*.sum' -print0) - if [ $res_new_fails = 0 ] \ - && [ $res_new_passes = 0 ]; then + if [ $res_new_fails_for_deciding = 0 ] \ + && [ $res_new_passes_for_deciding = 0 ]; then # No failures. We can stop now. break - elif [ $res_new_fails = 0 ] && [ $res_new_passes = 2 ] \ + elif [ $res_new_fails_for_deciding = 0 ] \ + && [ $res_new_passes_for_deciding = 2 ] \ && [ $res_prev_fails = 0 ]; then : - elif [ $res_new_fails = 2 ] && [ $res_new_passes = 0 ] \ + elif [ $res_new_fails_for_deciding = 2 ] \ + && [ $res_new_passes_for_deciding = 0 ] \ && [ $res_prev_fails = 0 ]; then : - elif [ $res_new_fails = 2 ] && [ $res_new_passes = 2 ] \ + elif [ $res_new_fails_for_deciding = 2 ] \ + && [ $res_new_passes_for_deciding = 2 ] \ && [ $res_prev_fails = 0 ]; then : else @@ -1492,8 +1547,9 @@ EOF if [ $try != 0 ]; then # Incorporate this try's flaky tests into $new_flaky. # This will make these tests appear in - # $known_flaky_and_fails for the next iteration. - if [ $res_new_fails = 2 ]; then + # $known_flaky_and_fails_for_deciding and + # $flaky_and_fails_for_output for the next iteration. + if [ $res_new_fails_for_output = 2 ]; then # Prepend "flaky | " attribute to # the newly-detected flaky tests. sed -i -e "s#^\([A-Z]\+: \)#flaky | \1#" \ @@ -1503,7 +1559,7 @@ EOF notice "Detected new PASS->FAIL flaky tests:" cat "$new_fails" fi - if [ $res_new_passes = 2 ]; then + if [ $res_new_passes_for_output = 2 ]; then # Prepend "flaky | " attribute to # the newly-detected flaky tests. 
sed -i -e "s#^\([A-Z]\+: \)#flaky | \1#" \ @@ -1552,7 +1608,8 @@ EOF done fi - rm "$new_fails" "$new_passes" "$baseline_flaky" "$known_flaky_and_fails" + rm "$new_fails" "$new_passes" "$baseline_flaky" + rm "$known_flaky_and_fails_for_deciding" "$flaky_and_fails_for_output" if [ "$flaky_failures" = "" ]; then rm "$new_flaky" fi |