aboutsummaryrefslogtreecommitdiff
path: root/tcwg-update-containers.yaml.in
diff options
context:
space:
mode:
Diffstat (limited to 'tcwg-update-containers.yaml.in')
-rw-r--r--tcwg-update-containers.yaml.in443
1 files changed, 443 insertions, 0 deletions
diff --git a/tcwg-update-containers.yaml.in b/tcwg-update-containers.yaml.in
new file mode 100644
index 0000000000..d748ce663c
--- /dev/null
+++ b/tcwg-update-containers.yaml.in
@@ -0,0 +1,443 @@
+#include tcwg/default.yaml.inc
+
+- job:
+ name: tcwg-update-#{CONTAINER}-containers
+ project-type: freestyle
+ defaults: global
+ properties:
+ - authorization:
+ anonymous:
+ - job-read
+ - job-extended-read
+ everyone-flat:
+ - job-build
+ - job-cancel
+ - build-discarder:
+ days-to-keep: 30
+ num-to-keep: 100
+ - throttle:
+ max-per-node: 1  # never run two builds of this job on the same node at once
+ option: project  # throttle this project on its own rather than by category
+ parameters:
+ - label:
+ name: nodes
+#if CONTAINER_host
+ # Run on all real machines, with the exception of benchmarking boards.
+ # Host containers of benchmarking boards are handled by the
+ # CONTAINER_bmk job.
+ default: tcwg && !tcwg-bmk-hw && !tcwg-secondary-node
+#elif CONTAINER_jenkins
+ # Run on all nodes, with the exception of benchmarking boards.
+ # Jenkins containers of benchmarking boards are handled by the
+ # CONTAINER_bmk job.
+ default: tcwg && !tcwg-bmk-hw
+#elif CONTAINER_llvmbot || CONTAINER_buildkite
+ default: tcwg-llvmbot
+#elif CONTAINER_bmk
+ default: tcwg-bmk-pool
+#endif
+ all-nodes: true  # run on every node matching the label, not just one of them
+ matching-label: 'allCases'
+ description: 'Machines to run on'
+ - string:
+ name: distro
+#if CONTAINER_llvmbot || CONTAINER_buildkite
+ default: 'lts_1'
+#else
+ default: 'default'
+#endif
+ description: 'Distro version to use.'
+#if CONTAINER_llvmbot
+ - string:
+ name: master
+ default: 'normal'
+ description: 'LLVM buildmaster to use: silent or normal'
+#endif
+ - bool:
+ name: force
+ default: 'false'
+ description: 'Whether to force update even with no changes in image'
+ - bool:
+ name: verbose
+ default: 'true'
+ description: 'Whether to be verbose'
+ - string:
+ name: scripts_branch
+ default: master
+ description: 'Scripts revision to use'
+ disabled: false
+ node: tcwg-coordinator
+ concurrent: true
+ display-name: 'TCWG CCC Update #{CONTAINER} containers'
+ # Put $NODE_NAME in the workspace path so that the workspace is not shared
+ # between nodes; this is needed to run correctly on tcwg-bmk-* nodes.
+ workspace: workspace/tcwg-update-#{CONTAINER}-containers_$EXECUTOR_NUMBER/$NODE_NAME
+ scm:
+ - jenkins-scripts
+#if !CONTAINER_llvmbot && !CONTAINER_buildkite
+ triggers:
+ - timed: '@daily'
+#else
+ # No timed trigger for llvmbot and buildkite, because we want the bot
+ # maintainer to be able to control the deployment time.
+#endif
+ wrappers:
+ - timeout:
+#if CONTAINER_jenkins
+ # Wait at most 5 hours (300 minutes) before giving up on updating the
+ # jenkins client container.
+ timeout: 300
+#else
+ timeout: 600  # 10 hours for every other container kind
+#endif
+ - timestamps
+#if CONTAINER_llvmbot
+ - credentials-binding:
+ - text:
+ credential-id: TCWG_LLVMBOT_PASSWORD
+ variable: TCWG_LLVMBOT_PASSWORD
+#elif CONTAINER_buildkite
+ - credentials-binding:
+ - text:
+ credential-id: TCWG_BUILDKITE_TOKEN_LIBCXX
+ variable: TCWG_BUILDKITE_TOKEN_LIBCXX
+#elif CONTAINER_bmk
+ - ssh-agent-credentials:
+ users:
+ - 'tcwg-benchmark'
+ # tcwg-buildslave user id
+ # NOTE(review): the bmk builder below logs into ci.linaro.org as tcwg-buildslave, so this key looks required -- confirm.
+ - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
+#elif CONTAINER_jenkins
+ - ssh-agent-credentials:
+ users:
+ # tcwg-buildslave user id
+ - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
+#endif
+ - build-name:
+ name: '#${BUILD_NUMBER}-${NODE_NAME}'
+ builders:
+ - shell:
+ command: |
+ #!/bin/bash
+ set -ex
+
+#if CONTAINER_host
+ case " $NODE_LABELS " in
+ *" tcwg-llvmbot "*) group=tcwg-llvm ;;  # node carries the tcwg-llvmbot label
+ *) group=all ;;  # every other node
+ esac
+ ./jenkins-scripts/tcwg-update-host-containers.sh \
+ --distro "$distro" \
+ --group "$group" \
+ --force "$force" \
+ --verbose "$verbose"
+#elif CONTAINER_jenkins
+ # Dry run first (--dryrun true) to check whether the image needs an update.
+ ./jenkins-scripts/tcwg-update-host-containers.sh \
+ --distro "$distro" \
+ --dryrun true \
+ --node "$NODE_NAME" \
+ --force "$force" \
+ --verbose "$verbose" &
+ res=0 && wait $! || res=$?
+
+ if [ $res = 0 ]; then
+ # Fast-path exit to avoid bringing the node offline.
+ echo "$NODE_NAME is up-to-date"
+ # Skip the rest; exit code 125 marks the build UNSTABLE (aka skipped).
+ exit 125
+ elif [ $res = 125 ]; then
+ echo "$NODE_NAME needs container update"
+ elif [ $res != 0 ]; then
+ echo "ERROR: container check failed"
+ exit $res
+ fi
+
+ # We are about to update the container that is running this.
+ # The plan is:
+ # 1. Prevent new builds from starting by putting the node into
+ # offline mode.
+ # 2. Wait for current builds to finish. We detect this by
+ # polling the number of busy executors on this node.
+ # 3. Trigger a job on the master node to bring this node back
+ # online. Without this we would restart the jenkins container,
+ # but the node would still be marked as "offline" and no new
+ # builds would be scheduled on it.
+ # 4. Restart the container.
+
+ # Mark the node offline.
+ ssh -p2222 -l tcwg-buildslave@linaro.org ci.linaro.org \
+ offline-node "$NODE_NAME" \
+ -m "Updating_jenkins_container:$BUILD_URL"
+
+ start_date=$(date +%s)
+ rm -f timeout
+
+ # Wait for current builds to finish.
+ while true; do
+ n_busy=$(source jenkins-scripts/jenkins-helpers.sh
+ print_number_of_busy_executors "$NODE_NAME")
+
+ if [ "$n_busy" = "1" ]; then
+ # This build is the only one left on the node.
+ break
+ fi
+
+ elapsed=$(($(date +%s) - $start_date))
+ elapsed=$(($elapsed / 60))
+ if [ "$elapsed" -gt "270" ]; then
+ # The job times out at 300 minutes, i.e. in 30 minutes; give up
+ # on the update and let the node be brought back online.
+ # The last 30 minutes are budgeted for
+ # tcwg-cleanup-stale-workspaces.sh below.
+ touch timeout
+ break
+ fi
+
+ # Other builds are still running; poll again in a minute.
+ sleep 60
+ done
+ unstable-return: 125
+ - conditional-step:
+ condition-kind: current-status
+ steps:
+ - trigger-builds:
+ - project: tcwg-update-jenkins-containers-online-node
+ predefined-parameters: |
+ node=$NODE_NAME
+ build_num=$BUILD_NUMBER
+ block: false  # do not wait for the triggered job to finish
+ - shell:
+ command: |
+ #!/bin/bash
+ set -ex
+
+ if [ -f timeout ]; then  # the wait loop of the previous shell step gave up
+ exit 125
+ fi
+
+ # Clean up stale workspaces while the node is idle (best effort).
+ (
+ set +e
+ $WORKSPACE/jenkins-scripts/tcwg-cleanup-stale-workspaces.sh \
+ --days 3 --workspace_top $HOME/workspace
+ ) || true  # "set +e" stays inside the subshell; also mask its status from the outer "set -e"
+
+ ./jenkins-scripts/tcwg-update-host-containers.sh \
+ --distro "$distro" \
+ --dryrun false \
+ --node "$NODE_NAME" \
+ --force "$force" \
+ --verbose "$verbose" || exit 125
+ unstable-return: 125
+#elif CONTAINER_llvmbot
+ ./jenkins-scripts/tcwg-update-llvmbot-containers.sh \
+ --NODE_NAME "$NODE_NAME" \
+ --distro "$distro" \
+ --master "$master" \
+ --password "$TCWG_LLVMBOT_PASSWORD" \
+ --force "$force" \
+ --verbose "$verbose"  # $master is the job's "master" parameter (silent or normal)
+#elif CONTAINER_buildkite
+ ./jenkins-scripts/tcwg-update-llvmbot-containers.sh \
+ --NODE_NAME "$NODE_NAME" \
+ --distro "$distro" \
+ --master "buildkite" \
+ --password "$TCWG_BUILDKITE_TOKEN_LIBCXX" \
+ --force "$force" \
+ --verbose "$verbose"  # same script as llvmbot, with master fixed to "buildkite" and the Buildkite token as --password
+#elif CONTAINER_bmk
+ # Needed for the "( ... ) | sed" pipelines below; see LABEL_SED.
+ set -o pipefail
+
+ rm -rf artifacts
+ mkdir artifacts
+
+ echo "$BUILD_URL" > artifacts/mail-body.txt
+ echo "maxim.kuvyrkov@linaro.org, laurent.alfonsi@linaro.org" > artifacts/mail-recipients.txt
+
+ declare -A pids
+ for lock in $(set +f; ls -tr $HOME/boards/$NODE_NAME-*.lock \
+ $HOME/boards/$NODE_NAME-*.bak*); do  # in-service (.lock) and out-of-service (.bak*) files, oldest first
+ (
+ flock -e 9  # exclusive lock on the lock file, opened as fd 9 at the end of this subshell
+ touch $lock  # refresh the lock file's modification time
+ board=$(cat <&9)  # the content of the lock file is used as $board
+
+ # Start the jenkins container so that it can process
+ # all the queued cleanup and maintenance tasks.
+ node=$(basename "$board" .tcwglab)
+
+ ./jenkins-scripts/tcwg-update-bmk-containers.sh \
+ --board "$board" \
+ --distro "$distro" \
+ --force "$force" \
+ --node "$node" \
+ --verbose "$verbose"
+
+ # Wait for the jenkins container to become idle.
+ while sleep 60; do
+ n_busy=$(source jenkins-scripts/jenkins-helpers.sh
+ print_number_of_busy_executors "$node")
+
+ if [ "$n_busy" = "0" ]; then
+ break
+ fi
+ done
+
+ # Now stop the jenkins container so that it's not terminated
+ # midway through some other build by a starting benchmarking job.
+ ssh -Snone $board docker stop "$node"
+ ) 9<$lock 2>&1 | sed -e "s/^/$(basename $lock): /" &
+
+ # LABEL_SED: We need pipefail to get the correct result of the sub-shell
+ # "( foo ) | sed" instead of that of the always-succeeding "sed".
+ pids[$(basename "$lock")]=$!
+ done
+
+ n_good_boards=0
+ for lock in "${!pids[@]}"; do
+ res=0 && wait "${pids[$lock]}" || res=$?  # exit status of that board's background pipeline
+ case "$res:$lock" in
+ "0":*".lock")
+ n_good_boards=$(($n_good_boards + 1))
+ echo "$lock: SUCCESS" >> artifacts/mail-body.txt
+ ;;
+ "0":*".bak")
+ n_good_boards=$(($n_good_boards + 1))
+ echo "$lock: UNEXPECTED SUCCESS" >> artifacts/mail-body.txt
+ # Move boards with no STG ticket (i.e., plain ".bak" without a
+ # ".STG-1234" suffix) back into service.
+ mv "$HOME/boards/$lock" "$HOME/boards/${lock%.bak}.lock"
+ ;;
+ "0":*)
+ # The board appears fine, but it has an STG ticket assigned
+ # to it (e.g., ".bak.STG-1234"), so it is left out of service.
+ echo "$lock: UNEXPECTED SUCCESS" >> artifacts/mail-body.txt
+ ;;
+ *:*".lock")
+ echo "$lock: UNEXPECTED FAILURE" >> artifacts/mail-body.txt
+ # Take the failed board out of service by renaming .lock to .bak.
+ mv "$HOME/boards/$lock" "$HOME/boards/${lock%.lock}.bak"
+ ;;
+ *:*)
+ echo "$lock: EXPECTED FAILURE" >> artifacts/mail-body.txt
+ ;;
+ esac
+ done
+
+ # tcwg-benchmark doesn't have ssh access to ci.linaro.org,
+ # so use tcwg-buildslave's credentials.
+ n_executors=$(ssh -p2222 -l tcwg-buildslave@linaro.org \
+ ci.linaro.org get-node "$NODE_NAME" \
+ | grep "numExecutors")
+ n_executors=$(echo "$n_executors" \
+ | sed -e "s#.*<numExecutors>\([0-9]\+\)</numExecutors>.*#\1#")
+ if [ x"$n_executors" != x"$n_good_boards" ]; then
+ echo "UNEXPECTED: Updating number of executors on $NODE_NAME from $n_executors to $n_good_boards" \
+ >> artifacts/mail-body.txt
+ if [ "$n_good_boards" = "0" ]; then  # NOTE(review): clamped after the comparison above, so a node at 1 with 0 good boards is re-reported every run -- confirm
+ # Setting executors to "0" will mightily confuse jenkins;
+ # the node with 0 executors will be stuck in limbo.
+ n_good_boards=1
+ fi
+ ssh -p2222 -l tcwg-buildslave@linaro.org \
+ ci.linaro.org get-node "$NODE_NAME" \
+ | sed -e "s#<numExecutors>\([0-9]\+\)</numExecutors>#<numExecutors>$n_good_boards</numExecutors>#" \
+ | ssh -p2222 -l tcwg-buildslave@linaro.org \
+ ci.linaro.org update-node "$NODE_NAME"
+ fi
+
+ if ! grep -q UNEXPECTED artifacts/mail-body.txt; then  # nothing unexpected was recorded: succeed
+ exit 0
+ fi
+
+ exit 1
+ publishers:
+ - email-ext:
+ recipients: |
+ ${FILE,path="artifacts/mail-recipients.txt"}
+ content-type: text
+ body: |
+ ${FILE,path="artifacts/mail-body.txt"}
+ failure: true  # the builder exits 1 when mail-body.txt contains UNEXPECTED
+ success: false  # no mail when nothing unexpected happened
+ aborted: true
+ send-to:
+ - recipients
+#endif
+
+#if CONTAINER_jenkins
+- job:
+ name: tcwg-update-jenkins-containers-online-node
+ project-type: freestyle
+ defaults: global
+ properties:
+ - authorization:
+ anonymous:
+ - job-read
+ - job-extended-read
+ everyone-flat:
+ - job-build
+ - job-cancel
+ - build-discarder:
+ days-to-keep: 30
+ num-to-keep: 100
+ parameters:
+ - string:
+ name: node
+ default: ''
+ description: 'NODE_NAME to bring online'
+ - string:
+ name: build_num
+ default: ''
+ description: 'BUILD_NUMBER to wait for to finish'
+ disabled: false
+ concurrent: true
+ display-name: 'TCWG CCC Update jenkins containers online-node'
+ wrappers:
+ - timeout:
+ timeout: 60  # minutes; the builder below polls the watched build once a minute
+ - timestamps
+ - ssh-agent-credentials:
+ users:
+ # tcwg-buildslave user id; the builder below uses it to ssh into ci.linaro.org
+ - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
+ - build-name:
+ name: '#${BUILD_NUMBER}-#${build_num}-${node}'
+ builders:
+ - shell:
+ command: |
+ #!/bin/bash
+ set -ex
+
+ # The logic below was adapted from jenkins-scripts/tcwg_bmk-build.sh:
+ # benchmark().
+
+ ssh_cmd=(ssh -p2222 -l tcwg-buildslave@linaro.org
+ -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null)
+
+ # ??? The loop below can fail for reasons I can't understand.
+ # "|| true" should make the subshell always exit with "0",
+ # and I can't see how "| tee | sed" can fail.
+ # Bring the node back online when we exit for any reason,
+ # and ignore shell errors so that we exit only when we see the
+ # "Finished: " line in the console.log.
+ trap "${ssh_cmd[*]} ci.linaro.org online-node $node" EXIT
+ set +e
+
+ while true; do
+ sleep 60
+
+ (timeout 1m \
+ "${ssh_cmd[@]}" ci.linaro.org console \
+ tcwg-update-jenkins-containers $build_num || true) \
+ | tee console.log | sed -e "s/^/$node: /"
+
+ build_status=$(tail -n 1 console.log)  # last line of the watched build's console output
+ case "$build_status" in
+ "Finished: "*) break ;;
+ esac
+ done
+#endif