diff options
Diffstat (limited to 'tcwg-update-containers.yaml.in')
-rw-r--r-- | tcwg-update-containers.yaml.in | 443 |
1 files changed, 443 insertions, 0 deletions
#include tcwg/default.yaml.inc

- job:
    # Template for the "tcwg-update-<flavour>-containers" jobs.  The lines
    # with "#" in column 0 (#include, #if, #elif, #else, #endif) select the
    # variant for one container flavour: host, jenkins, llvmbot, buildkite
    # or bmk.  Keep ordinary comments indented so that they cannot be
    # mistaken for such directives.
    name: tcwg-update-#{CONTAINER}-containers
    project-type: freestyle
    defaults: global
    properties:
      - authorization:
          anonymous:
            - job-read
            - job-extended-read
          everyone-flat:
            - job-build
            - job-cancel
      - build-discarder:
          days-to-keep: 30
          num-to-keep: 100
      - throttle:
          max-per-node: 1
          option: project
    parameters:
      - label:
          name: nodes
#if CONTAINER_host
          # Run on all real machines with exception of benchmarking boards.
          # Host containers of benchmarking boards are handled by
          # CONTAINER_bmk job.
          default: tcwg && !tcwg-bmk-hw && !tcwg-secondary-node
#elif CONTAINER_jenkins
          # Run on all nodes with exception of benchmarking boards.
          # Jenkins containers of benchmarking boards are handled by
          # CONTAINER_bmk job.
          default: tcwg && !tcwg-bmk-hw
#elif CONTAINER_llvmbot || CONTAINER_buildkite
          default: tcwg-llvmbot
#elif CONTAINER_bmk
          default: tcwg-bmk-pool
#endif
          all-nodes: true
          matching-label: 'allCases'
          description: 'Machines to run on'
      - string:
          name: distro
#if CONTAINER_llvmbot || CONTAINER_buildkite
          default: 'lts_1'
#else
          default: 'default'
#endif
          description: 'Distro version to use.'
#if CONTAINER_llvmbot
      - string:
          name: master
          default: 'normal'
          description: 'LLVM buildmaster to use: silent or normal'
#endif
      - bool:
          name: force
          default: 'false'
          description: 'Whether to force update even with no changes in image'
      - bool:
          name: verbose
          default: 'true'
          description: 'Whether to be verbose'
      - string:
          name: scripts_branch
          default: master
          description: 'Scripts revision to use'
    disabled: false
    node: tcwg-coordinator
    concurrent: true
    display-name: 'TCWG CCC Update #{CONTAINER} containers'
    # We need to unshare workspace with $NODE_NAME in the path to
    # correctly run on tcwg-bmk-* nodes.
    workspace: workspace/tcwg-update-#{CONTAINER}-containers_$EXECUTOR_NUMBER/$NODE_NAME
    scm:
      - jenkins-scripts
#if !CONTAINER_llvmbot && !CONTAINER_buildkite
    triggers:
      - timed: '@daily'
#else
    # No timer trigger for llvmbot and buildkite because we want the bot
    # maintainer to be able to control the deployment time.
#endif
    wrappers:
      - timeout:
#if CONTAINER_jenkins
          # Wait at most 5 hours before giving up on updating jenkins
          # client container.
          timeout: 300
#else
          timeout: 600
#endif
      - timestamps
#if CONTAINER_llvmbot
      - credentials-binding:
          - text:
              credential-id: TCWG_LLVMBOT_PASSWORD
              variable: TCWG_LLVMBOT_PASSWORD
#elif CONTAINER_buildkite
      - credentials-binding:
          - text:
              credential-id: TCWG_BUILDKITE_TOKEN_LIBCXX
              variable: TCWG_BUILDKITE_TOKEN_LIBCXX
#elif CONTAINER_bmk
      - ssh-agent-credentials:
          users:
            - 'tcwg-benchmark'
            # tcwg-buildslave user id
            # ??? Do we need tcwg-buildslave's keys for BMK containers?
            - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
#elif CONTAINER_jenkins
      - ssh-agent-credentials:
          users:
            # tcwg-buildslave user id
            - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
#endif
      - build-name:
          name: '#${BUILD_NUMBER}-${NODE_NAME}'
    builders:
      - shell:
          command: |
            #!/bin/bash
            set -ex

#if CONTAINER_host
            # Nodes labelled tcwg-llvmbot get the tcwg-llvm group; all
            # other nodes get the "all" group.
            case " $NODE_LABELS " in
              *" tcwg-llvmbot "*) group=tcwg-llvm ;;
              *) group=all ;;
            esac
            ./jenkins-scripts/tcwg-update-host-containers.sh \
              --distro "$distro" \
              --group "$group" \
              --force "$force" \
              --verbose "$verbose"
#elif CONTAINER_jenkins
            # Check if we need to update the image -- run with --dryrun true.
            # The check runs in the background and its exit status is
            # collected into $res via "wait".
            ./jenkins-scripts/tcwg-update-host-containers.sh \
              --distro "$distro" \
              --dryrun true \
              --node "$NODE_NAME" \
              --force "$force" \
              --verbose "$verbose" &
            res=0 && wait $! || res=$?

            if [ "$res" = 0 ]; then
              # Fast-path exit to avoid bringing the node offline.
              echo "$NODE_NAME is up-to-date"
              # Skip the rest and mark the build UNSTABLE (aka skipped).
              exit 125
            elif [ "$res" = 125 ]; then
              echo "$NODE_NAME needs container update"
            else
              # Any status other than 0 and 125 is a failed check.
              echo "ERROR: container check failed"
              exit "$res"
            fi

            # We are about to update the container that is running this.
            # The plan is:
            # 1. Prevent new builds from starting by putting the node into
            #    offline mode.
            # 2. Wait for current builds to finish.  We detect this by
            #    polling the number of busy executors on the node until
            #    this build is the only one left.
            # 3. Trigger a job on the master node to bring this node back
            #    online.  Without this we would restart the jenkins container,
            #    but the node would still be marked as "offline" and no new
            #    builds will be scheduled to it.
            # 4. Restart the container.

            # Mark the node offline.
            # NOTE(review): if a command below fails under "set -e" before
            # this step ends, nothing visible in this file brings the node
            # back online -- confirm that this is acceptable.
            ssh -p2222 -l tcwg-buildslave@linaro.org ci.linaro.org \
              offline-node "$NODE_NAME" \
              -m "Updating_jenkins_container:$BUILD_URL"

            start_date=$(date +%s)
            # The "timeout" marker file tells the next build step that we
            # gave up waiting.
            rm -f timeout

            # Wait for current builds to finish.
            while true; do
              n_busy=$(source jenkins-scripts/jenkins-helpers.sh
                       print_number_of_busy_executors "$NODE_NAME")

              if [ "$n_busy" = "1" ]; then
                # We are the only build left.
                break
              fi

              # Minutes elapsed since we started waiting.
              elapsed=$(($(date +%s) - $start_date))
              elapsed=$(($elapsed / 60))
              if [ "$elapsed" -gt "270" ]; then
                # We'll timeout in 30 minutes; give up on the update and
                # bring the node back online.
                # We budget 30 minutes for tcwg-cleanup-stale-workspaces.sh
                # below.
                touch timeout
                break
              fi

              # Wait for other builds to complete.
              sleep 60
            done
          unstable-return: 125
      - conditional-step:
          condition-kind: current-status
          steps:
            # Start the online-node job without waiting for it; it follows
            # this build's console and brings the node back online.
            - trigger-builds:
                - project: tcwg-update-jenkins-containers-online-node
                  predefined-parameters: |
                    node=$NODE_NAME
                    build_num=$BUILD_NUMBER
                  block: false
            - shell:
                command: |
                  #!/bin/bash
                  set -ex

                  # The previous step gave up waiting for other builds.
                  if [ -f timeout ]; then
                    exit 125
                  fi

                  # Cleanup workspace directory while the node is idle.
                  # Best effort: "set +e" alone does not stop a failing
                  # cleanup from aborting this script, because the subshell
                  # would still return the failure to the outer "set -e"
                  # shell; hence "|| true".
                  (
                    set +e
                    "$WORKSPACE"/jenkins-scripts/tcwg-cleanup-stale-workspaces.sh \
                      --days 3 --workspace_top "$HOME"/workspace
                  ) || true

                  # A failed update marks the build UNSTABLE, not FAILED.
                  ./jenkins-scripts/tcwg-update-host-containers.sh \
                    --distro "$distro" \
                    --dryrun false \
                    --node "$NODE_NAME" \
                    --force "$force" \
                    --verbose "$verbose" || exit 125
                unstable-return: 125
#elif CONTAINER_llvmbot
            ./jenkins-scripts/tcwg-update-llvmbot-containers.sh \
              --NODE_NAME "$NODE_NAME" \
              --distro "$distro" \
              --master "$master" \
              --password "$TCWG_LLVMBOT_PASSWORD" \
              --force "$force" \
              --verbose "$verbose"
#elif CONTAINER_buildkite
            ./jenkins-scripts/tcwg-update-llvmbot-containers.sh \
              --NODE_NAME "$NODE_NAME" \
              --distro "$distro" \
              --master "buildkite" \
              --password "$TCWG_BUILDKITE_TOKEN_LIBCXX" \
              --force "$force" \
              --verbose "$verbose"
#elif CONTAINER_bmk
            # See: LABEL_SED below
            set -o pipefail

            rm -rf artifacts
            mkdir artifacts

            # These two files are read back by the email-ext publisher.
            echo "$BUILD_URL" > artifacts/mail-body.txt
            echo "maxim.kuvyrkov@linaro.org, laurent.alfonsi@linaro.org" > artifacts/mail-recipients.txt

            # One background worker per board lock file; pids maps the lock
            # file basename to the worker pid.  The output of "ls" is
            # word-split, so lock file names must not contain whitespace.
            declare -A pids
            for lock in $(set +f; ls -tr $HOME/boards/$NODE_NAME-*.lock \
                                         $HOME/boards/$NODE_NAME-*.bak*); do
              (
                # Hold the board lock (fd 9) exclusively for the whole
                # update; the lock file content is the board name.
                flock -e 9
                touch "$lock"
                board=$(cat <&9)

                # Start the jenkins container so that it can process
                # all the queued cleanup and maintenance tasks.
                node=$(basename "$board" .tcwglab)

                ./jenkins-scripts/tcwg-update-bmk-containers.sh \
                  --board "$board" \
                  --distro "$distro" \
                  --force "$force" \
                  --node "$node" \
                  --verbose "$verbose"

                # Wait for jenkins container to become idle
                while sleep 60; do
                  n_busy=$(source jenkins-scripts/jenkins-helpers.sh
                           print_number_of_busy_executors "$node")

                  if [ "$n_busy" = "0" ]; then
                    break
                  fi
                done

                # Now stop the jenkins container so that it is not terminated
                # in the middle of some other build by a starting
                # benchmarking job.
                ssh -Snone "$board" docker stop "$node"
              ) 9<"$lock" 2>&1 | sed -e "s/^/$(basename "$lock"): /" &

              # LABEL_SED: We need pipefail to get correct result of sub-shell
              # "( foo ) | sed" instead of always-succeeding "sed".
              pids[$(basename "$lock")]=$!
            done

            # Classify each board by worker exit status and lock file suffix.
            n_good_boards=0
            for lock in "${!pids[@]}"; do
              res=0 && wait "${pids[$lock]}" || res=$?
              case "$res:$lock" in
                "0":*".lock")
                  n_good_boards=$(($n_good_boards + 1))
                  echo "$lock: SUCCESS" >> artifacts/mail-body.txt
                  ;;
                "0":*".bak")
                  n_good_boards=$(($n_good_boards + 1))
                  echo "$lock: UNEXPECTED SUCCESS" >> artifacts/mail-body.txt
                  # Move boards with no STG ticket (e.g., no ".bak.STG-1234")
                  # back into service.
                  mv "$HOME/boards/$lock" "$HOME/boards/${lock%.bak}.lock"
                  ;;
                "0":*)
                  # The board appears fine, but it has STG ticket assigned
                  # to it (e.g., ".bak.STG-1234").
                  echo "$lock: UNEXPECTED SUCCESS" >> artifacts/mail-body.txt
                  ;;
                *:*".lock")
                  echo "$lock: UNEXPECTED FAILURE" >> artifacts/mail-body.txt
                  # Move offline boards out of service.
                  mv "$HOME/boards/$lock" "$HOME/boards/${lock%.lock}.bak"
                  ;;
                *:*)
                  echo "$lock: EXPECTED FAILURE" >> artifacts/mail-body.txt
                  ;;
              esac
            done

            # tcwg-benchmark doesn't have ssh access to ci.linaro.org,
            # so use tcwg-buildslave's credentials.
            n_executors=$(ssh -p2222 -l tcwg-buildslave@linaro.org \
                          ci.linaro.org get-node "$NODE_NAME" \
                          | grep "numExecutors")
            n_executors=$(echo "$n_executors" \
                          | sed -e "s#.*<numExecutors>\([0-9]\+\)</numExecutors>.*#\1#")

            # Setting executors to "0" will mightily confuse jenkins;
            # the node with 0 executors will be stuck in limbo.  Clamp
            # before comparing, so that a node which is already at the
            # minimum is not reported and rewritten on every run.
            n_wanted=$n_good_boards
            if [ "$n_wanted" = "0" ]; then
              n_wanted=1
            fi

            if [ x"$n_executors" != x"$n_wanted" ]; then
              echo "UNEXPECTED: Updating number of executors on $NODE_NAME from $n_executors to $n_wanted" \
                >> artifacts/mail-body.txt
              # Fetch the node config, rewrite numExecutors, push it back.
              ssh -p2222 -l tcwg-buildslave@linaro.org \
                ci.linaro.org get-node "$NODE_NAME" \
                | sed -e "s#<numExecutors>\([0-9]\+\)</numExecutors>#<numExecutors>$n_wanted</numExecutors>#" \
                | ssh -p2222 -l tcwg-buildslave@linaro.org \
                  ci.linaro.org update-node "$NODE_NAME"
            fi

            # Fail the build if anything UNEXPECTED was recorded; the
            # publisher below sends mail on failure.
            if ! grep -q UNEXPECTED artifacts/mail-body.txt; then
              exit 0
            fi

            exit 1
    publishers:
      - email-ext:
          recipients: |
            ${FILE,path="artifacts/mail-recipients.txt"}
          content-type: text
          body: |
            ${FILE,path="artifacts/mail-body.txt"}
          failure: true
          success: false
          aborted: true
          send-to:
            - recipients
#endif

#if CONTAINER_jenkins
- job:
    # Helper triggered by tcwg-update-jenkins-containers: follows the
    # console of build $build_num and brings node $node back online on exit.
    name: tcwg-update-jenkins-containers-online-node
    project-type: freestyle
    defaults: global
    properties:
      - authorization:
          anonymous:
            - job-read
            - job-extended-read
          everyone-flat:
            - job-build
            - job-cancel
      - build-discarder:
          days-to-keep: 30
          num-to-keep: 100
    parameters:
      - string:
          name: node
          default: ''
          description: 'NODE_NAME to bring online'
      - string:
          name: build_num
          default: ''
          description: 'BUILD_NUMBER to wait for to finish'
    disabled: false
    concurrent: true
    display-name: 'TCWG CCC Update jenkins containers online-node'
    wrappers:
      - timeout:
          timeout: 60
      - timestamps
      - ssh-agent-credentials:
          users:
            # tcwg-buildslave user id
            - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
      - build-name:
          name: '#${BUILD_NUMBER}-#${build_num}-${node}'
    builders:
      - shell:
          command: |
            #!/bin/bash
            set -ex

            # Below logic was adapted from jenkins-scripts/tcwg_bmk-build.sh:
            # benchmark().

            ssh_cmd=(ssh -p2222 -l tcwg-buildslave@linaro.org
                     -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null)

            # ??? Below loop can fail for reasons I can't understand.
            # "|| true" should make the subshell always exit with "0",
            # and I can't see how "| tee | sed" can fail.
            # Bring the node back online when we exit for any reason,
            # and ignore shell errors so that we exit only when we see the
            # "Finished: " line in the console.log.
            # The trap command is double-quoted, so ssh_cmd and node are
            # expanded here, when the trap is installed.
            trap "${ssh_cmd[*]} ci.linaro.org online-node $node" EXIT
            set +e

            while true; do
              sleep 60

              # Fetch the console of the update build (at most 1 minute per
              # attempt), keep a copy in console.log and echo it with a
              # "$node: " prefix.
              (timeout 1m \
                "${ssh_cmd[@]}" ci.linaro.org console \
                tcwg-update-jenkins-containers "$build_num" || true) \
                | tee console.log | sed -e "s/^/$node: /"

              build_status=$(tail -n 1 console.log)
              case "$build_status" in
                "Finished: "*) break ;;
              esac
            done
#endif