#include tcwg/default.yaml.inc

- job:
    # Template for the tcwg-update-*-containers jobs.  The #if/#elif
    # directives below select one variant per CONTAINER flavour (host,
    # jenkins, llvmbot, buildkite, bmk); presumably the file is run through
    # a C-style preprocessor before Jenkins Job Builder reads it -- confirm
    # against the generation scripts.
    name: tcwg-update-#{CONTAINER}-containers
    project-type: freestyle
    defaults: global
    properties:
      - authorization:
          anonymous:
            - job-read
            - job-extended-read
          everyone-flat:
            - job-build
            - job-cancel
      - build-discarder:
          days-to-keep: 30
          num-to-keep: 100
      - throttle:
          max-per-node: 1
          option: project
    parameters:
      - label:
          name: nodes
#if CONTAINER_host
          # Run on all real machines with exception of benchmarking boards.
          # Host containers of benchmarking boards are handled by
          # CONTAINER_bmk job.
          default: tcwg && !tcwg-bmk-hw && !tcwg-secondary-node
#elif CONTAINER_jenkins
          # Run on all nodes with exception of benchmarking boards.
          # Jenkins containers of benchmarking boards are handled by
          # CONTAINER_bmk job.
          default: tcwg && !tcwg-bmk-hw
#elif CONTAINER_llvmbot || CONTAINER_buildkite
          default: tcwg-llvmbot
#elif CONTAINER_bmk
          default: tcwg-bmk-pool
#endif
          all-nodes: true
          matching-label: 'allCases'
          description: 'Machines to run on'
      - string:
          name: distro
#if CONTAINER_llvmbot || CONTAINER_buildkite
          default: 'lts_1'
#else
          default: 'default'
#endif
          description: 'Distro version to use.'
#if CONTAINER_llvmbot
      - string:
          name: master
          default: 'normal'
          description: 'LLVM buildmaster to use: silent or normal'
#endif
      - bool:
          name: force
          default: 'false'
          description: 'Whether to force update even with no changes in image'
      - bool:
          name: verbose
          default: 'true'
          description: 'Whether to be verbose'
      - string:
          name: scripts_branch
          default: master
          description: 'Scripts revision to use'
    disabled: false
    node: tcwg-coordinator
    concurrent: true
    display-name: 'TCWG CCC Update #{CONTAINER} containers'
    # We need to unshare workspace with $NODE_NAME in the path to
    # correctly run on tcwg-bmk-* nodes.
    workspace: workspace/tcwg-update-#{CONTAINER}-containers_$EXECUTOR_NUMBER/$NODE_NAME
    scm:
      - jenkins-scripts
#if !CONTAINER_llvmbot && !CONTAINER_buildkite
    triggers:
      - timed: '@daily'
#else
    # No timer trigger for llvmbot and buildkite because we want the bot
    # maintainer to be able to control the deployment time.
#endif
    wrappers:
      - timeout:
#if CONTAINER_jenkins
          # Wait at most 5 hours before giving up on updating jenkins
          # client container.
          timeout: 300
#else
          timeout: 600
#endif
      - timestamps
#if CONTAINER_llvmbot
      - credentials-binding:
          - text:
              credential-id: TCWG_LLVMBOT_PASSWORD
              variable: TCWG_LLVMBOT_PASSWORD
#elif CONTAINER_buildkite
      - credentials-binding:
          - text:
              credential-id: TCWG_BUILDKITE_TOKEN_LIBCXX
              variable: TCWG_BUILDKITE_TOKEN_LIBCXX
#elif CONTAINER_bmk
      - ssh-agent-credentials:
          users:
            - 'tcwg-benchmark'
            # tcwg-buildslave user id
            # ??? Do we need tcwg-buildslave's keys for BMK containers?
            - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
#elif CONTAINER_jenkins
      - ssh-agent-credentials:
          users:
            # tcwg-buildslave user id
            - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
#endif
      - build-name:
          name: '#${BUILD_NUMBER}-${NODE_NAME}'
    builders:
      - shell:
          command: |
            #!/bin/bash
            set -ex
#if CONTAINER_host
            # Nodes labelled tcwg-llvmbot get the tcwg-llvm container group;
            # every other node gets the "all" group.
            case " $NODE_LABELS " in
              *" tcwg-llvmbot "*) group=tcwg-llvm ;;
              *) group=all ;;
            esac
            ./jenkins-scripts/tcwg-update-host-containers.sh \
              --distro "$distro" \
              --group "$group" \
              --force "$force" \
              --verbose "$verbose"
#elif CONTAINER_jenkins
            # Check if we need to update the image -- run with --dryrun true.
            ./jenkins-scripts/tcwg-update-host-containers.sh \
              --distro "$distro" \
              --dryrun true \
              --node "$NODE_NAME" \
              --force "$force" \
              --verbose "$verbose" &
            # Capture the exit status without tripping "set -e".
            res=0 && wait $! || res=$?
            if [ $res = 0 ]; then
              # Fast-path exit to avoid bringing the node offline.
              echo "$NODE_NAME is up-to-date"
              # Skip the rest and mark the build UNSTABLE (aka skipped).
              exit 125
            elif [ $res = 125 ]; then
              echo "$NODE_NAME needs container update"
            else
              echo "ERROR: container check failed"
              exit $res
            fi

            # We are about to update the container that is running this.
            # The plan is:
            # 1. Prevent new builds from starting by putting the node into
            #    offline mode.
            # 2. Wait for current builds to finish.  We detect this by
            #    polling the number of busy executors on the node until
            #    this build is the only one left.
            # 3. Trigger a job on the master node to bring this node back
            #    online.  Without this we would restart the jenkins container,
            #    but the node would still be marked as "offline" and no new
            #    builds will be scheduled to it.
            # 4. Restart the container.

            # Mark the node offline.
            ssh -p2222 -l tcwg-buildslave@linaro.org ci.linaro.org \
              offline-node "$NODE_NAME" \
              -m "Updating_jenkins_container:$BUILD_URL"

            start_date=$(date +%s)
            rm -f timeout

            # Wait for current builds to finish.
            while true; do
              # NOTE(review): assumes jenkins-helpers.sh is sourced first and
              # the helper is then called as a separate command -- confirm.
              n_busy=$(source jenkins-scripts/jenkins-helpers.sh
                       print_number_of_busy_executors "$NODE_NAME")
              if [ "$n_busy" = "1" ]; then
                # We are the only build left.
                break
              fi

              # Minutes elapsed since the node was taken offline.
              elapsed=$(($(date +%s) - $start_date))
              elapsed=$(($elapsed / 60))
              if [ "$elapsed" -gt "270" ]; then
                # We'll timeout in 30 minutes; give up on the update and
                # bring the node back online.
                # We budget 30 minutes for tcwg-cleanup-stale-workspaces.sh
                # below.
                touch timeout
                break
              fi

              # Wait for other builds to complete.
              sleep 60
            done
          unstable-return: 125
      # Runs only while the build status is still good, i.e. the node was
      # taken offline above and must be brought back online afterwards.
      - conditional-step:
          condition-kind: current-status
          steps:
            - trigger-builds:
                - project: tcwg-update-jenkins-containers-online-node
                  predefined-parameters: |
                    node=$NODE_NAME
                    build_num=$BUILD_NUMBER
                  block: false
      - shell:
          command: |
            #!/bin/bash
            set -ex

            # The wait loop of the previous step gave up; skip the update.
            if [ -f timeout ]; then
              exit 125
            fi

            # Cleanup workspace directory while the node is idle.
            # Failures of the cleanup are deliberately ignored.
            (
              set +e
              $WORKSPACE/jenkins-scripts/tcwg-cleanup-stale-workspaces.sh \
                --days 3 --workspace_top $HOME/workspace
            )

            ./jenkins-scripts/tcwg-update-host-containers.sh \
              --distro "$distro" \
              --dryrun false \
              --node "$NODE_NAME" \
              --force "$force" \
              --verbose "$verbose" || exit 125
          unstable-return: 125
#elif CONTAINER_llvmbot
            ./jenkins-scripts/tcwg-update-llvmbot-containers.sh \
              --NODE_NAME "$NODE_NAME" \
              --distro "$distro" \
              --master "$master" \
              --password "$TCWG_LLVMBOT_PASSWORD" \
              --force "$force" \
              --verbose "$verbose"
#elif CONTAINER_buildkite
            ./jenkins-scripts/tcwg-update-llvmbot-containers.sh \
              --NODE_NAME "$NODE_NAME" \
              --distro "$distro" \
              --master "buildkite" \
              --password "$TCWG_BUILDKITE_TOKEN_LIBCXX" \
              --force "$force" \
              --verbose "$verbose"
#elif CONTAINER_bmk
            # See: LABEL_SED below
            set -o pipefail

            rm -rf artifacts
            mkdir artifacts
            echo "$BUILD_URL" > artifacts/mail-body.txt
            echo "maxim.kuvyrkov@linaro.org, laurent.alfonsi@linaro.org" > artifacts/mail-recipients.txt

            # Maps lock-file basename to the PID of its background update.
            declare -A pids

            # Update all boards of this node in parallel, one background
            # pipeline per lock file.
            for lock in $(set +f; ls -tr $HOME/boards/$NODE_NAME-*.lock \
                                         $HOME/boards/$NODE_NAME-*.bak*); do
              (
                # Take the exclusive lock on the board; the lock file is
                # open on descriptor 9 and its contents name the board.
                flock -e 9
                touch $lock
                board=$(cat <&9)

                # Start the jenkins container so that it can process
                # all the queued cleanup and maintenance tasks.
                node=$(basename "$board" .tcwglab)
                ./jenkins-scripts/tcwg-update-bmk-containers.sh \
                  --board "$board" \
                  --distro "$distro" \
                  --force "$force" \
                  --node "$node" \
                  --verbose "$verbose"

                # Wait for jenkins container to become idle
                while sleep 60; do
                  # NOTE(review): assumes jenkins-helpers.sh is sourced first
                  # and the helper is then called separately -- confirm.
                  n_busy=$(source jenkins-scripts/jenkins-helpers.sh
                           print_number_of_busy_executors "$node")
                  if [ "$n_busy" = "0" ]; then
                    break
                  fi
                done

                # Now stop the jenkins container so that it's not terminated
                # midway through some other build by a starting benchmarking
                # job.
                ssh -Snone $board docker stop "$node"
              ) 9<$lock 2>&1 | sed -e "s/^/$(basename $lock): /" &
              # LABEL_SED: We need pipefail to get correct result of sub-shell
              # "( foo ) | sed" instead of always-succeeding "sed".
              pids[$(basename "$lock")]=$!
            done

            # Collect the results and move boards in or out of service.
            n_good_boards=0
            for lock in "${!pids[@]}"; do
              res=0 && wait "${pids[$lock]}" || res=$?

              case "$res:$lock" in
                "0":*".lock")
                  n_good_boards=$(($n_good_boards + 1))
                  echo "$lock: SUCCESS" >> artifacts/mail-body.txt
                  ;;
                "0":*".bak")
                  n_good_boards=$(($n_good_boards + 1))
                  echo "$lock: UNEXPECTED SUCCESS" >> artifacts/mail-body.txt
                  # Move boards with no STG ticket (e.g., no ".bak.STG-1234")
                  # back into service.
                  mv "$HOME/boards/$lock" "$HOME/boards/${lock%.bak}.lock"
                  ;;
                "0":*)
                  # The board appears fine, but it has STG ticket assigned
                  # to it (e.g., ".bak.STG-1234").
                  echo "$lock: UNEXPECTED SUCCESS" >> artifacts/mail-body.txt
                  ;;
                *:*".lock")
                  echo "$lock: UNEXPECTED FAILURE" >> artifacts/mail-body.txt
                  # Move offline boards out of service.
                  mv "$HOME/boards/$lock" "$HOME/boards/${lock%.lock}.bak"
                  ;;
                *:*)
                  echo "$lock: EXPECTED FAILURE" >> artifacts/mail-body.txt
                  ;;
              esac
            done

            # tcwg-benchmark doesn't have ssh access to ci.linaro.org,
            # so use tcwg-buildslave's credentials.
            n_executors=$(ssh -p2222 -l tcwg-buildslave@linaro.org \
                              ci.linaro.org get-node "$NODE_NAME" \
                            | grep "numExecutors")
            # Extract the whole number.  A greedy ".*" prefix would leave
            # only the last digit of a multi-digit executor count.
            n_executors=$(echo "$n_executors" \
                            | sed -e "s#[^0-9]*\([0-9]\+\).*#\1#")
            if [ x"$n_executors" != x"$n_good_boards" ]; then
              echo "UNEXPECTED: Updating number of executors on $NODE_NAME from $n_executors to $n_good_boards" \
                >> artifacts/mail-body.txt
              if [ "$n_good_boards" = "0" ]; then
                # Setting executors to "0" will mightily confuse jenkins;
                # the node with 0 executors will be stuck in limbo.
                n_good_boards=1
              fi
              # Rewrite only the numExecutors line of the node config; an
              # unanchored substitution would change the first number on
              # every line of the config.
              ssh -p2222 -l tcwg-buildslave@linaro.org \
                  ci.linaro.org get-node "$NODE_NAME" \
                | sed -e "/numExecutors/s#[0-9]\+#$n_good_boards#" \
                | ssh -p2222 -l tcwg-buildslave@linaro.org \
                      ci.linaro.org update-node "$NODE_NAME"
            fi

            # Fail the build (which sends the e-mail below) if anything
            # unexpected was recorded.
            if ! grep -q UNEXPECTED artifacts/mail-body.txt; then
              exit 0
            fi
            exit 1
    publishers:
      - email-ext:
          recipients: |
            ${FILE,path="artifacts/mail-recipients.txt"}
          content-type: text
          body: |
            ${FILE,path="artifacts/mail-body.txt"}
          failure: true
          success: false
          aborted: true
          send-to:
            - recipients
#endif

#if CONTAINER_jenkins
- job:
    # Companion job triggered by tcwg-update-jenkins-containers: follows the
    # console of the given build until it finishes, then brings the node
    # back online.
    name: tcwg-update-jenkins-containers-online-node
    project-type: freestyle
    defaults: global
    properties:
      - authorization:
          anonymous:
            - job-read
            - job-extended-read
          everyone-flat:
            - job-build
            - job-cancel
      - build-discarder:
          days-to-keep: 30
          num-to-keep: 100
    parameters:
      - string:
          name: node
          default: ''
          description: 'NODE_NAME to bring online'
      - string:
          name: build_num
          default: ''
          description: 'BUILD_NUMBER to wait for to finish'
    disabled: false
    concurrent: true
    display-name: 'TCWG CCC Update jenkins containers online-node'
    wrappers:
      - timeout:
          timeout: 60
      - timestamps
      - ssh-agent-credentials:
          users:
            # tcwg-buildslave user id
            - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
      - build-name:
          name: '#${BUILD_NUMBER}-#${build_num}-${node}'
    builders:
      - shell:
          command: |
            #!/bin/bash
            set -ex
            # Below logic was adapted from jenkins-scripts/tcwg_bmk-build.sh:
            # benchmark().

            ssh_cmd=(ssh -p2222 -l tcwg-buildslave@linaro.org
                     -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null)

            # ??? Below loop can fail for reasons I can't understand.
            # "|| true" should make the subshell always exit with "0",
            # and I can't see how "| tee | sed" can fail.
            # Bring the node back online as we exit due to any reason,
            # and ignore shell errors so that we exit only when we see
            # "Finished: " line in the console.log.
            trap "${ssh_cmd[*]} ci.linaro.org online-node $node" EXIT
            set +e
            while true; do
              sleep 60
              (timeout 1m \
                 "${ssh_cmd[@]}" ci.linaro.org console \
                 tcwg-update-jenkins-containers $build_num || true) \
                | tee console.log | sed -e "s/^/$node: /"
              # The last console line reads "Finished: ..." once the
              # watched build is done.
              build_status=$(tail -n 1 console.log)
              case "$build_status" in
                "Finished: "*) break ;;
              esac
            done
#endif