aboutsummaryrefslogtreecommitdiff
path: root/tcwg-update-jenkins-containers.yaml
diff options
context:
space:
mode:
Diffstat (limited to 'tcwg-update-jenkins-containers.yaml')
-rw-r--r--tcwg-update-jenkins-containers.yaml195
1 files changed, 181 insertions, 14 deletions
diff --git a/tcwg-update-jenkins-containers.yaml b/tcwg-update-jenkins-containers.yaml
index 25a7b6fd31..9abc9064bb 100644
--- a/tcwg-update-jenkins-containers.yaml
+++ b/tcwg-update-jenkins-containers.yaml
@@ -1,4 +1,4 @@
-# Auto generated by ./tcwg/generate-yamlfiles.sh from tcwg-update.yaml.in and tcwg-update/tcwg-update-jenkins-containers.def. Do not edit.
+# Auto generated by ./tcwg/generate-yamlfiles.sh from tcwg-update-containers.yaml.in and tcwg-update-containers/tcwg-update-jenkins-containers.def. Do not edit.
#BEGIN: tcwg/default.yaml.inc
# -*- mode: Yaml -*-
@@ -9,12 +9,11 @@
anonymous:
- job-read
- job-extended-read
- linaro:
+ everyone-flat:
- job-build
- job-cancel
- build-discarder:
days-to-keep: 30
- num-to-keep: 30
- scm:
name: jenkins-scripts
@@ -43,7 +42,7 @@
anonymous:
- job-read
- job-extended-read
- linaro:
+ everyone-flat:
- job-build
- job-cancel
- build-discarder:
@@ -55,18 +54,16 @@
parameters:
- label:
name: nodes
- # We can't restart coordinator nodes because we can't block-out
- # matrix master jobs. Similarly, we can't restart nodes with
- # multiple executors because we risk killing builds on other
- # executors.
- default: tcwg && !tcwg-coordinator && !tcwg-bmk && !tcwg-short && !tcwg-x86_64-build && !tcwg-x86_64-build-09 && !tcwg-x86_64-build-10
+ # Run on all nodes with the exception of benchmarking boards.
+ # Jenkins containers of benchmarking boards are handled by
+ # CONTAINER_bmk job.
+ default: tcwg && !tcwg-bmk-hw
all-nodes: true
matching-label: 'allCases'
- node-eligibility: 'ignore-offline'
description: 'Machines to run on'
- string:
name: distro
- default: 'bionic'
+ default: 'default'
description: 'Distro version to use.'
- bool:
name: force
@@ -93,8 +90,14 @@
- timed: '@daily'
wrappers:
- timeout:
- timeout: 600
+ # Wait at most 5 hours before giving up on updating jenkins
+ # client container.
+ timeout: 300
- timestamps
+ - ssh-agent-credentials:
+ users:
+ # tcwg-buildslave user id
+ - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
- build-name:
name: '#${BUILD_NUMBER}-${NODE_NAME}'
builders:
@@ -103,10 +106,174 @@
#!/bin/bash
set -ex
+ # Check if we need to update the image -- run with --dryrun true.
./jenkins-scripts/tcwg-update-host-containers.sh \
--distro "$distro" \
+ --dryrun true \
--node "$NODE_NAME" \
--force "$force" \
- --verbose "$verbose" || exit 125
+ --verbose "$verbose" &
+ res=0 && wait $! || res=$?
+
+ if [ $res = 0 ]; then
+ # Fast-path exit to avoid bringing the node offline.
+ echo "$NODE_NAME is up-to-date"
+ # Skip the rest and mark the build UNSTABLE (aka skipped).
+ exit 125
+ elif [ $res = 125 ]; then
+ echo "$NODE_NAME needs container update"
+ elif [ $res != 0 ]; then
+ echo "ERROR: container check failed"
+ exit $res
+ fi
+
+ # We are about to update the container that is running this.
+ # The plan is:
+ # 1. Prevent new builds from starting by putting the node into
+ # offline mode.
+ # 2. Wait for current builds to finish. We detect this by
+ # polling the number of busy executors on this node.
+ # 3. Trigger a job on the master node to bring this node back
+ # online. Without this we would restart the jenkins container,
+ # but the node would still be marked as "offline" and no new
+ # builds will be scheduled to it.
+ # 4. Restart the container.
+
+ # Mark the node offline.
+ ssh -p2222 -l tcwg-buildslave@linaro.org ci.linaro.org \
+ offline-node "$NODE_NAME" \
+ -m "Updating_jenkins_container:$BUILD_URL"
+
+ start_date=$(date +%s)
+ rm -f timeout
+
+ # Wait for current builds to finish.
+ while true; do
+ n_busy=$(source jenkins-scripts/jenkins-helpers.sh
+ print_number_of_busy_executors "$NODE_NAME")
+
+ if [ "$n_busy" = "1" ]; then
+ # We are the only build left.
+ break
+ fi
+
+ elapsed=$(($(date +%s) - $start_date))
+ elapsed=$(($elapsed / 60))
+ if [ "$elapsed" -gt "270" ]; then
+ # We'll time out in 30 minutes; give up on the update and
+ # bring the node back online.
+ # We budget 30 minutes for tcwg-cleanup-stale-workspaces.sh
+ # below.
+ touch timeout
+ break
+ fi
+
+ # Wait for other builds to complete.
+ sleep 60
+ done
unstable-return: 125
-# checksum: 5361d1951bb9e0964cca304ad024243c
+ - conditional-step:
+ condition-kind: current-status
+ steps:
+ - trigger-builds:
+ - project: tcwg-update-jenkins-containers-online-node
+ predefined-parameters: |
+ node=$NODE_NAME
+ build_num=$BUILD_NUMBER
+ block: false
+ - shell:
+ command: |
+ #!/bin/bash
+ set -ex
+
+ if [ -f timeout ]; then
+ exit 125
+ fi
+
+ # Cleanup workspace directory while the node is idle.
+ (
+ set +e
+ $WORKSPACE/jenkins-scripts/tcwg-cleanup-stale-workspaces.sh \
+ --days 3 --workspace_top $HOME/workspace
+ )
+
+ ./jenkins-scripts/tcwg-update-host-containers.sh \
+ --distro "$distro" \
+ --dryrun false \
+ --node "$NODE_NAME" \
+ --force "$force" \
+ --verbose "$verbose" || exit 125
+ unstable-return: 125
+
+- job:
+ name: tcwg-update-jenkins-containers-online-node
+ project-type: freestyle
+ defaults: global
+ properties:
+ - authorization:
+ anonymous:
+ - job-read
+ - job-extended-read
+ everyone-flat:
+ - job-build
+ - job-cancel
+ - build-discarder:
+ days-to-keep: 30
+ num-to-keep: 100
+ parameters:
+ - string:
+ name: node
+ default: ''
+ description: 'NODE_NAME to bring online'
+ - string:
+ name: build_num
+ default: ''
+ description: 'BUILD_NUMBER to wait for to finish'
+ disabled: false
+ concurrent: true
+ display-name: 'TCWG CCC Update jenkins containers online-node'
+ wrappers:
+ - timeout:
+ timeout: 60
+ - timestamps
+ - ssh-agent-credentials:
+ users:
+ # tcwg-buildslave user id
+ - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
+ - build-name:
+ name: '#${BUILD_NUMBER}-#${build_num}-${node}'
+ builders:
+ - shell:
+ command: |
+ #!/bin/bash
+ set -ex
+
+ # The logic below was adapted from jenkins-scripts/tcwg_bmk-build.sh:
+ # benchmark().
+
+ ssh_cmd=(ssh -p2222 -l tcwg-buildslave@linaro.org
+ -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null)
+
+ # ??? The loop below can fail for reasons I can't understand.
+ # "|| true" should make the subshell always exit with "0",
+ # and I can't see how "| tee | sed" can fail.
+ # Bring the node back online when we exit for any reason,
+ # and ignore shell errors so that we exit only when we see the
+ # "Finished: " line in console.log.
+ trap "${ssh_cmd[*]} ci.linaro.org online-node $node" EXIT
+ set +e
+
+ while true; do
+ sleep 60
+
+ (timeout 1m \
+ "${ssh_cmd[@]}" ci.linaro.org console \
+ tcwg-update-jenkins-containers $build_num || true) \
+ | tee console.log | sed -e "s/^/$node: /"
+
+ build_status=$(tail -n 1 console.log)
+ case "$build_status" in
+ "Finished: "*) break ;;
+ esac
+ done
+# checksum: 6ffc6ff4f71c52329c02ca6e6f932e10