diff options
Diffstat (limited to 'tcwg-update-jenkins-containers.yaml')
-rw-r--r-- | tcwg-update-jenkins-containers.yaml | 195 |
1 files changed, 181 insertions, 14 deletions
diff --git a/tcwg-update-jenkins-containers.yaml b/tcwg-update-jenkins-containers.yaml index 25a7b6fd31..9abc9064bb 100644 --- a/tcwg-update-jenkins-containers.yaml +++ b/tcwg-update-jenkins-containers.yaml @@ -1,4 +1,4 @@ -# Auto generated by ./tcwg/generate-yamlfiles.sh from tcwg-update.yaml.in and tcwg-update/tcwg-update-jenkins-containers.def. Do not edit. +# Auto generated by ./tcwg/generate-yamlfiles.sh from tcwg-update-containers.yaml.in and tcwg-update-containers/tcwg-update-jenkins-containers.def. Do not edit. #BEGIN: tcwg/default.yaml.inc # -*- mode: Yaml -*- @@ -9,12 +9,11 @@ anonymous: - job-read - job-extended-read - linaro: + everyone-flat: - job-build - job-cancel - build-discarder: days-to-keep: 30 - num-to-keep: 30 - scm: name: jenkins-scripts @@ -43,7 +42,7 @@ anonymous: - job-read - job-extended-read - linaro: + everyone-flat: - job-build - job-cancel - build-discarder: @@ -55,18 +54,16 @@ parameters: - label: name: nodes - # We can't restart coordinator nodes because we can't block-out - # matrix master jobs. Similarly, we can't restart nodes with - # multiple executors because we risk killing builds on other - # executors. - default: tcwg && !tcwg-coordinator && !tcwg-bmk && !tcwg-short && !tcwg-x86_64-build && !tcwg-x86_64-build-09 && !tcwg-x86_64-build-10 + # Run on all nodes with exception of benchmarking boards. + # Jenkins containers of benchmarking boards are handled by + # CONTAINER_bmk job. + default: tcwg && !tcwg-bmk-hw all-nodes: true matching-label: 'allCases' - node-eligibility: 'ignore-offline' description: 'Machines to run on' - string: name: distro - default: 'bionic' + default: 'default' description: 'Distro version to use.' - bool: name: force @@ -93,8 +90,14 @@ - timed: '@daily' wrappers: - timeout: - timeout: 600 + # Wait at most 5 hours before giving up on updating jenkins + # client container. + timeout: 300 - timestamps + - ssh-agent-credentials: + users: + # tcwg-buildslave user id + - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a' - build-name: name: '#${BUILD_NUMBER}-${NODE_NAME}' builders: @@ -103,10 +106,174 @@ #!/bin/bash set -ex + # Check if we need to update the image -- run with --dryrun true. ./jenkins-scripts/tcwg-update-host-containers.sh \ --distro "$distro" \ + --dryrun true \ --node "$NODE_NAME" \ --force "$force" \ - --verbose "$verbose" || exit 125 + --verbose "$verbose" & + res=0 && wait $! || res=$? + + if [ $res = 0 ]; then + # Fast-path exit to avoid bringing the node offline. + echo "$NODE_NAME is up-to-date" + # Skip the rest and mark the build UNSTABLE (aka skipped). + exit 125 + elif [ $res = 125 ]; then + echo "$NODE_NAME needs container update" + elif [ $res != 0 ]; then + echo "ERROR: container check failed" + exit $res + fi + + # We are about to update the container that is running this. + # The plan is: + # 1. Prevent new builds from starting by putting the node into + # offline mode. + # 2. Wait for current builds to finish. We detect this by + # checking for children process of the jenkins client. + # 3. Trigger a job on the master node to bring this node back + # online. Without this we would restart the jenkins container, + # but the node would still be marked as "offline" and no new + # builds will be scheduled to it. + # 4. Restart the container. + + # Mark the node offline. + ssh -p2222 -l tcwg-buildslave@linaro.org ci.linaro.org \ + offline-node "$NODE_NAME" \ + -m "Updating_jenkins_container:$BUILD_URL" + + start_date=$(date +%s) + rm -f timeout + + # Wait for current builds to finish. + while true; do + n_busy=$(source jenkins-scripts/jenkins-helpers.sh + print_number_of_busy_executors "$NODE_NAME") + + if [ "$n_busy" = "1" ]; then + # We are the only build left. + break + fi + + elapsed=$(($(date +%s) - $start_date)) + elapsed=$(($elapsed / 60)) + if [ "$elapsed" -gt "270" ]; then + # We'll timeout in 30 minutes; give up on the update and + # bring the node back online. + # We budget 30 minutes for tcwg-cleanup-stale-workspaces.sh + # below. + touch timeout + break + fi + + # Wait for other builds to complete. + sleep 60 + done unstable-return: 125 -# checksum: 5361d1951bb9e0964cca304ad024243c + - conditional-step: + condition-kind: current-status + steps: + - trigger-builds: + - project: tcwg-update-jenkins-containers-online-node + predefined-parameters: | + node=$NODE_NAME + build_num=$BUILD_NUMBER + block: false + - shell: + command: | + #!/bin/bash + set -ex + + if [ -f timeout ]; then + exit 125 + fi + + # Cleanup workspace directory while the node is idle. + ( + set +e + $WORKSPACE/jenkins-scripts/tcwg-cleanup-stale-workspaces.sh \ + --days 3 --workspace_top $HOME/workspace + ) + + ./jenkins-scripts/tcwg-update-host-containers.sh \ + --distro "$distro" \ + --dryrun false \ + --node "$NODE_NAME" \ + --force "$force" \ + --verbose "$verbose" || exit 125 + unstable-return: 125 + +- job: + name: tcwg-update-jenkins-containers-online-node + project-type: freestyle + defaults: global + properties: + - authorization: + anonymous: + - job-read + - job-extended-read + everyone-flat: + - job-build + - job-cancel + - build-discarder: + days-to-keep: 30 + num-to-keep: 100 + parameters: + - string: + name: node + default: '' + description: 'NODE_NAME to bring online' + - string: + name: build_num + default: '' + description: 'BUILD_NUMBER to wait for to finish' + disabled: false + concurrent: true + display-name: 'TCWG CCC Update jenkins containers online-node' + wrappers: + - timeout: + timeout: 60 + - timestamps + - ssh-agent-credentials: + users: + # tcwg-buildslave user id + - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a' + - build-name: + name: '#${BUILD_NUMBER}-#${build_num}-${node}' + builders: + - shell: + command: | + #!/bin/bash + set -ex + + # Below logic was adapted from jenkins-scripts/tcwg_bmk-build.sh: + # benchmark(). + + ssh_cmd=(ssh -p2222 -l tcwg-buildslave@linaro.org + -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null) + + # ??? Below loop can fail for reasons I can't understand. + # "|| true" should the subshell always exit with "0", + # and I can't see how "| tee | sed" can fail. + # Bring the node back online as we exit due to any reason, + # and ignore shell errors so that we exit only when see + # "Finished: " line in the console.log. + trap "${ssh_cmd[*]} ci.linaro.org online-node $node" EXIT + set +e + + while true; do + sleep 60 + + (timeout 1m \ + "${ssh_cmd[@]}" ci.linaro.org console \ + tcwg-update-jenkins-containers $build_num || true) \ + | tee console.log | sed -e "s/^/$node: /" + + build_status=$(tail -n 1 console.log) + case "$build_status" in + "Finished: "*) break ;; + esac + done +# checksum: 6ffc6ff4f71c52329c02ca6e6f932e10 |