#include tcwg/default.yaml.inc
# Freestyle job that updates the containers of the kind selected by the
# CONTAINER template parameter.
- job:
name: tcwg-update-#{CONTAINER}-containers
project-type: freestyle
defaults: global
properties:
# Anonymous users may only view the job; the everyone-flat group may
# also start and cancel builds.
- authorization:
anonymous:
- job-read
- job-extended-read
everyone-flat:
- job-build
- job-cancel
# Discard builds older than 30 days and keep at most 100 builds.
- build-discarder:
days-to-keep: 30
num-to-keep: 100
# Allow at most one build of this project at a time on any given node.
- throttle:
max-per-node: 1
option: project
parameters:
# Label expression selecting the nodes whose containers get updated.
# The default depends on the container kind.
- label:
name: nodes
#if CONTAINER_host
# Run on all real machines with exception of benchmarking boards.
# Host containers of benchmarking boards are handled by
# CONTAINER_bmk job.
default: tcwg && !tcwg-bmk-hw && !tcwg-secondary-node
#elif CONTAINER_jenkins
# Run on all nodes with exception of benchmarking boards.
# Jenkins containers of benchmarking boards are handled by
# CONTAINER_bmk job.
default: tcwg && !tcwg-bmk-hw
#elif CONTAINER_llvmbot || CONTAINER_buildkite
default: tcwg-llvmbot
#elif CONTAINER_bmk
default: tcwg-bmk-pool
#endif
# Run on every node that matches the label expression.
all-nodes: true
matching-label: 'allCases'
description: 'Machines to run on'
# Passed to the update scripts as --distro.
- string:
name: distro
#if CONTAINER_llvmbot || CONTAINER_buildkite
default: 'lts_1'
#else
default: 'default'
#endif
description: 'Distro version to use.'
#if CONTAINER_llvmbot
# Passed as --master to tcwg-update-llvmbot-containers.sh.  The buildkite
# variant has no such parameter and hard-codes "buildkite" instead.
- string:
name: master
default: 'normal'
description: 'LLVM buildmaster to use: silent or normal'
#endif
# Passed to the update scripts as --force.
- bool:
name: force
default: 'false'
description: 'Whether to force update even with no changes in image'
# Passed to the update scripts as --verbose.
- bool:
name: verbose
default: 'true'
description: 'Whether to be verbose'
# Not referenced by the builders in this file; presumably consumed by the
# jenkins-scripts scm definition -- TODO confirm.
- string:
name: scripts_branch
default: master
description: 'Scripts revision to use'
disabled: false
node: tcwg-coordinator
# Allow several builds of this job to run at the same time.
concurrent: true
display-name: 'TCWG CCC Update #{CONTAINER} containers'
# We need to unshare workspace with $NODE_NAME in the path to
# correctly run on tcwg-bmk-* nodes.
workspace: workspace/tcwg-update-#{CONTAINER}-containers_$EXECUTOR_NUMBER/$NODE_NAME
# SCM macro defined outside this file (presumably in the included
# tcwg/default.yaml.inc -- TODO confirm).  The builders run scripts from
# the ./jenkins-scripts directory of the workspace.
scm:
- jenkins-scripts
#if !CONTAINER_llvmbot && !CONTAINER_buildkite
# Host, jenkins and bmk containers are refreshed once a day.
triggers:
- timed: '@daily'
#else
# No timer trigger for llvmbot and buildkite because we want the bot maintainer to be
# able to control the deployment time.
#endif
wrappers:
# Abort builds that run for too long; the values are in minutes.
- timeout:
#if CONTAINER_jenkins
# Wait at most 5 hours before giving up on updating jenkins
# client container.
timeout: 300
#else
# All other container kinds get 10 hours.
timeout: 600
#endif
- timestamps
#if CONTAINER_llvmbot
# Expose the secret to the builder as $TCWG_LLVMBOT_PASSWORD.
- credentials-binding:
- text:
credential-id: TCWG_LLVMBOT_PASSWORD
variable: TCWG_LLVMBOT_PASSWORD
#elif CONTAINER_buildkite
# Expose the secret to the builder as $TCWG_BUILDKITE_TOKEN_LIBCXX.
- credentials-binding:
- text:
credential-id: TCWG_BUILDKITE_TOKEN_LIBCXX
variable: TCWG_BUILDKITE_TOKEN_LIBCXX
#elif CONTAINER_bmk
- ssh-agent-credentials:
users:
- 'tcwg-benchmark'
# tcwg-buildslave user id
# ??? Do we need tcwg-buildslave's keys for BMK containers?
# NOTE: the CONTAINER_bmk builder script in this file runs get-node and
# update-node on ci.linaro.org as tcwg-buildslave, so these keys are used.
- 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
#elif CONTAINER_jenkins
# Used by the builder to run offline-node on ci.linaro.org.
- ssh-agent-credentials:
users:
# tcwg-buildslave user id
- 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
#endif
# Name each build after its build number and the node it ran on.
- build-name:
name: '#${BUILD_NUMBER}-${NODE_NAME}'
# The first shell step opens here; its script body is selected by the
# container kind in the preprocessor branches that follow.
builders:
- shell:
command: |
#!/bin/bash
set -ex
#if CONTAINER_host
# Nodes labelled tcwg-llvmbot update the "tcwg-llvm" group of host
# containers; every other node updates the "all" group.
case " $NODE_LABELS " in
*" tcwg-llvmbot "*) group=tcwg-llvm ;;
*) group=all ;;
esac
./jenkins-scripts/tcwg-update-host-containers.sh \
--distro "$distro" \
--group "$group" \
--force "$force" \
--verbose "$verbose"
#elif CONTAINER_jenkins
# Check if we need to update the image -- run with --dryrun true.
./jenkins-scripts/tcwg-update-host-containers.sh \
--distro "$distro" \
--dryrun true \
--node "$NODE_NAME" \
--force "$force" \
--verbose "$verbose" &
# Collect the exit status of the background check in $res.  Being part of
# an "&&"/"||" list, a non-zero status does not trip "set -e".
res=0 && wait $! || res=$?
if [ $res = 0 ]; then
# Fast-path exit to avoid bringing the node offline.
echo "$NODE_NAME is up-to-date"
# Skip the rest and mark the build UNSTABLE (aka skipped).
exit 125
elif [ $res = 125 ]; then
echo "$NODE_NAME needs container update"
# Any other status is a genuine failure (this test is always true here).
elif [ $res != 0 ]; then
echo "ERROR: container check failed"
exit $res
fi
# We are about to update the container that is running this.
# The plan is:
# 1. Prevent new builds from starting by putting the node into
# offline mode.
# 2. Wait for current builds to finish. We detect this by
# checking for children process of the jenkins client.
# 3. Trigger a job on the master node to bring this node back
# online. Without this we would restart the jenkins container,
# but the node would still be marked as "offline" and no new
# builds will be scheduled to it.
# 4. Restart the container.
# Mark the node offline.
ssh -p2222 -l tcwg-buildslave@linaro.org ci.linaro.org \
offline-node "$NODE_NAME" \
-m "Updating_jenkins_container:$BUILD_URL"
start_date=$(date +%s)
# The "timeout" marker file tells the second shell step to skip the update.
rm -f timeout
# Wait for current builds to finish.
while true; do
n_busy=$(source jenkins-scripts/jenkins-helpers.sh
print_number_of_busy_executors "$NODE_NAME")
if [ "$n_busy" = "1" ]; then
# We are the only build left.
break
fi
# Minutes spent waiting so far.
elapsed=$(($(date +%s) - $start_date))
elapsed=$(($elapsed / 60))
if [ "$elapsed" -gt "270" ]; then
# We'll timeout in 30 minutes; give up on the update and
# bring the node back online.
# We budget 30 minutes for tcwg-cleanup-stale-workspaces.sh
# below.
touch timeout
break
fi
# Wait for other builds to complete.
sleep 60
done
unstable-return: 125
# Start the companion job that brings this node back online once this
# build has finished.  The trigger is non-blocking and depends on the
# current build status (no thresholds are given, so JJB defaults apply).
- conditional-step:
condition-kind: current-status
steps:
- trigger-builds:
- project: tcwg-update-jenkins-containers-online-node
predefined-parameters: |
node=$NODE_NAME
build_num=$BUILD_NUMBER
block: false
# Second shell step: clean up stale workspaces and perform the actual
# container update, unless the wait above timed out.
- shell:
command: |
#!/bin/bash
set -ex
# The first step gave up waiting for other builds; skip the update.
if [ -f timeout ]; then
exit 125
fi
# Cleanup workspace directory while the node is idle.
# Failures of the cleanup are deliberately ignored ("set +e" in a subshell).
(
set +e
$WORKSPACE/jenkins-scripts/tcwg-cleanup-stale-workspaces.sh \
--days 3 --workspace_top $HOME/workspace
)
# A failed update exits with 125, which unstable-return maps to UNSTABLE.
./jenkins-scripts/tcwg-update-host-containers.sh \
--distro "$distro" \
--dryrun false \
--node "$NODE_NAME" \
--force "$force" \
--verbose "$verbose" || exit 125
unstable-return: 125
#elif CONTAINER_llvmbot
# Run the llvmbot container update for this node against the buildmaster
# chosen by the "master" parameter, authenticating with the bound
# TCWG_LLVMBOT_PASSWORD credential.
./jenkins-scripts/tcwg-update-llvmbot-containers.sh \
--NODE_NAME "$NODE_NAME" \
--distro "$distro" \
--master "$master" \
--password "$TCWG_LLVMBOT_PASSWORD" \
--force "$force" \
--verbose "$verbose"
#elif CONTAINER_buildkite
# Buildkite reuses the llvmbot update script with the master fixed to
# "buildkite" and the bound buildkite token passed as the password.
./jenkins-scripts/tcwg-update-llvmbot-containers.sh \
--NODE_NAME "$NODE_NAME" \
--distro "$distro" \
--master "buildkite" \
--password "$TCWG_BUILDKITE_TOKEN_LIBCXX" \
--force "$force" \
--verbose "$verbose"
#elif CONTAINER_bmk
# Update the containers of every board attached to this node in parallel,
# record per-board results in artifacts/mail-body.txt, and then make the
# number of executors of the node match the number of healthy boards.
# The build fails when the report contains any UNEXPECTED entry.
# See: LABEL_SED below
set -o pipefail
rm -rf artifacts
mkdir artifacts
echo "$BUILD_URL" > artifacts/mail-body.txt
echo "maxim.kuvyrkov@linaro.org, laurent.alfonsi@linaro.org" > artifacts/mail-recipients.txt
# Background job PID of each board, keyed by the basename of its lock file.
declare -A pids
for lock in $(set +f; ls -tr $HOME/boards/$NODE_NAME-*.lock \
                $HOME/boards/$NODE_NAME-*.bak*); do
  (
    # Hold the board lock (opened as fd 9 below) for the whole update.
    flock -e 9
    touch $lock
    # The content of the lock file names the board.
    board=$(cat <&9)
    # Start the jenkins container so that it can process
    # all the queued cleanup and maintenance tasks.
    node=$(basename "$board" .tcwglab)
    ./jenkins-scripts/tcwg-update-bmk-containers.sh \
      --board "$board" \
      --distro "$distro" \
      --force "$force" \
      --node "$node" \
      --verbose "$verbose"
    # Wait for jenkins container to become idle
    while sleep 60; do
      n_busy=$(source jenkins-scripts/jenkins-helpers.sh
               print_number_of_busy_executors "$node")
      if [ "$n_busy" = "0" ]; then
        break
      fi
    done
    # Now stop the jenkins container so that it's not terminated
    # midway some other build by a starting benchmarking job.
    ssh -Snone $board docker stop "$node"
  ) 9<$lock 2>&1 | sed -e "s/^/$(basename $lock): /" &
  # LABEL_SED: We need pipefail to get correct result of sub-shell
  # "( foo ) | sed" instead of always-succeeding "sed".
  pids[$(basename "$lock")]=$!
done
# Classify every board by its update status and by the suffix of its lock
# file: ".lock" boards are in service, ".bak*" boards are out of service.
n_good_boards=0
for lock in "${!pids[@]}"; do
  res=0 && wait "${pids[$lock]}" || res=$?
  case "$res:$lock" in
    "0":*".lock")
      n_good_boards=$(($n_good_boards + 1))
      echo "$lock: SUCCESS" >> artifacts/mail-body.txt
      ;;
    "0":*".bak")
      n_good_boards=$(($n_good_boards + 1))
      echo "$lock: UNEXPECTED SUCCESS" >> artifacts/mail-body.txt
      # Move boards with no STG ticket (e.g., no ".bak.STG-1234")
      # back into service.
      mv "$HOME/boards/$lock" "$HOME/boards/${lock%.bak}.lock"
      ;;
    "0":*)
      # The board appears fine, but it has STG ticket assigned
      # to it (e.g., ".bak.STG-1234").
      echo "$lock: UNEXPECTED SUCCESS" >> artifacts/mail-body.txt
      ;;
    *:*".lock")
      echo "$lock: UNEXPECTED FAILURE" >> artifacts/mail-body.txt
      # Move offline boards out of service.
      mv "$HOME/boards/$lock" "$HOME/boards/${lock%.lock}.bak"
      ;;
    *:*)
      echo "$lock: EXPECTED FAILURE" >> artifacts/mail-body.txt
      ;;
  esac
done
# tcwg-benchmark doesn't have ssh access to ci.linaro.org,
# so use tcwg-buildslave's credentials.
n_executors=$(ssh -p2222 -l tcwg-buildslave@linaro.org \
                ci.linaro.org get-node "$NODE_NAME" \
                | grep "numExecutors")
# Extract the whole number.  A greedy leading ".*" would swallow all but
# the last digit (e.g., yield "2" for "12"), so skip non-digits instead.
n_executors=$(echo "$n_executors" \
                | sed -e "s#[^0-9]*\([0-9]\+\).*#\1#")
if [ x"$n_executors" != x"$n_good_boards" ]; then
  echo "UNEXPECTED: Updating number of executors on $NODE_NAME from $n_executors to $n_good_boards" \
       >> artifacts/mail-body.txt
  if [ "$n_good_boards" = "0" ]; then
    # Setting executors to "0" will mightily confuse jenkins;
    # the node with 0 executors will be stuck in limbo.
    n_good_boards=1
  fi
  # Rewrite the number on the numExecutors line only; every other number
  # in the node configuration must be passed through untouched.
  ssh -p2222 -l tcwg-buildslave@linaro.org \
      ci.linaro.org get-node "$NODE_NAME" \
    | sed -e "/numExecutors/s#[0-9]\+#$n_good_boards#" \
    | ssh -p2222 -l tcwg-buildslave@linaro.org \
          ci.linaro.org update-node "$NODE_NAME"
fi
# Succeed only when nothing unexpected was recorded in the report.
if ! grep -q UNEXPECTED artifacts/mail-body.txt; then
  exit 0
fi
exit 1
# Email the report prepared by the CONTAINER_bmk builder script.  Both the
# recipients and the body are read from files under artifacts/.  Mail is
# sent for failed and aborted builds, but not for successful ones.
publishers:
- email-ext:
recipients: |
${FILE,path="artifacts/mail-recipients.txt"}
content-type: text
body: |
${FILE,path="artifacts/mail-body.txt"}
failure: true
success: false
aborted: true
send-to:
- recipients
#endif
#if CONTAINER_jenkins
# Companion job triggered by tcwg-update-jenkins-containers.  It waits for
# the triggering build to finish and then brings its node back online.
- job:
name: tcwg-update-jenkins-containers-online-node
project-type: freestyle
defaults: global
properties:
# Anonymous users may only view the job; the everyone-flat group may
# also start and cancel builds.
- authorization:
anonymous:
- job-read
- job-extended-read
everyone-flat:
- job-build
- job-cancel
# Discard builds older than 30 days and keep at most 100 builds.
- build-discarder:
days-to-keep: 30
num-to-keep: 100
# Both parameters are filled in by the trigger-builds step of
# tcwg-update-jenkins-containers (node=$NODE_NAME, build_num=$BUILD_NUMBER).
parameters:
- string:
name: node
default: ''
description: 'NODE_NAME to bring online'
- string:
name: build_num
default: ''
description: 'BUILD_NUMBER to wait for to finish'
disabled: false
concurrent: true
display-name: 'TCWG CCC Update jenkins containers online-node'
wrappers:
# Give up after 60 minutes.
- timeout:
timeout: 60
- timestamps
# Keys used by the builder for ssh access to ci.linaro.org.
- ssh-agent-credentials:
users:
# tcwg-buildslave user id
- 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
- build-name:
name: '#${BUILD_NUMBER}-#${build_num}-${node}'
# Poll the console log of the triggering build once a minute until it
# reports "Finished: ", then bring the node back online via the EXIT trap.
builders:
- shell:
command: |
#!/bin/bash
set -ex
# Below logic was adapted from jenkins-scripts/tcwg_bmk-build.sh:
# benchmark().
ssh_cmd=(ssh -p2222 -l tcwg-buildslave@linaro.org
-oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null)
# ??? Below loop can fail for reasons I can't understand.
# "|| true" should make the subshell always exit with "0",
# and I can't see how "| tee | sed" can fail.
# Bring the node back online as we exit due to any reason,
# and ignore shell errors so that we exit only when we see
# "Finished: " line in the console.log.
trap "${ssh_cmd[*]} ci.linaro.org online-node $node" EXIT
set +e
while true; do
sleep 60
# Fetch the console log of the watched build, giving up after a minute;
# keep a copy in console.log and echo it prefixed with the node name.
(timeout 1m \
"${ssh_cmd[@]}" ci.linaro.org console \
tcwg-update-jenkins-containers $build_num || true) \
| tee console.log | sed -e "s/^/$node: /"
# The last line of the log reads "Finished: ..." once the build is done.
build_status=$(tail -n 1 console.log)
case "$build_status" in
"Finished: "*) break ;;
esac
done
#endif