| #include tcwg/default.yaml.inc |
| |
| - job: |
| name: tcwg-update-#{CONTAINER}-containers |
| project-type: freestyle |
| defaults: global |
| properties: |
| - authorization: |
| anonymous: |
| - job-read |
| - job-extended-read |
| everyone-flat: |
| - job-build |
| - job-cancel |
| - build-discarder: |
| days-to-keep: 30 |
| num-to-keep: 100 |
| - throttle: |
| max-per-node: 1 |
| option: project |
| parameters: |
| - label: |
| name: nodes |
| #if CONTAINER_host |
| # Run on all real machines with the exception of benchmarking boards. |
| # Host containers of benchmarking boards are handled by |
| # CONTAINER_bmk job. |
| default: tcwg && !tcwg-bmk-hw && !tcwg-secondary-node |
| #elif CONTAINER_jenkins |
| # Run on all nodes with the exception of benchmarking boards. |
| # Jenkins containers of benchmarking boards are handled by |
| # CONTAINER_bmk job. |
| default: tcwg && !tcwg-bmk-hw |
| #elif CONTAINER_llvmbot || CONTAINER_buildkite |
| default: tcwg-llvmbot |
| #elif CONTAINER_bmk |
| default: tcwg-bmk-pool |
| #endif |
| all-nodes: true |
| matching-label: 'allCases' |
| description: 'Machines to run on' |
| - string: |
| name: distro |
| default: 'default' |
| description: 'Distro version to use.' |
| #if CONTAINER_llvmbot |
| - string: |
| name: master |
| default: 'normal' |
| description: 'LLVM buildmaster to use: silent or normal' |
| #endif |
| - bool: |
| name: force |
| default: 'false' |
| description: 'Whether to force update even with no changes in image' |
| - bool: |
| name: verbose |
| default: 'true' |
| description: 'Whether to be verbose' |
| - string: |
| name: scripts_branch |
| default: master |
| description: 'Scripts revision to use' |
| disabled: false |
| node: tcwg-coordinator |
| concurrent: true |
| display-name: 'TCWG CCC Update #{CONTAINER} containers' |
| # We need to unshare workspace with $NODE_NAME in the path to |
| # correctly run on tcwg-bmk-* nodes. |
| workspace: workspace/tcwg-update-#{CONTAINER}-containers_$EXECUTOR_NUMBER/$NODE_NAME |
| scm: |
| - jenkins-scripts |
| #if !CONTAINER_llvmbot && !CONTAINER_buildkite |
| triggers: |
| - timed: '@daily' |
| #else |
| # No timer trigger for llvmbot and buildkite because we want the bot maintainer to be |
| # able to control the deployment time. |
| #endif |
| wrappers: |
| - timeout: |
| #if CONTAINER_jenkins |
| # Wait at most 5 hours before giving up on updating jenkins |
| # client container. |
| timeout: 300 |
| #else |
| timeout: 600 |
| #endif |
| - timestamps |
| #if CONTAINER_llvmbot |
| - credentials-binding: |
| - text: |
| credential-id: TCWG_LLVMBOT_PASSWORD |
| variable: TCWG_LLVMBOT_PASSWORD |
| #elif CONTAINER_buildkite |
| - credentials-binding: |
| - text: |
| credential-id: TCWG_BUILDKITE_TOKEN_LIBCXX |
| variable: TCWG_BUILDKITE_TOKEN_LIBCXX |
| #elif CONTAINER_bmk |
| - ssh-agent-credentials: |
| users: |
| - 'tcwg-benchmark' |
| # tcwg-buildslave user id |
| # ??? Do we need tcwg-buildslave's keys for BMK containers? |
| - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a' |
| #elif CONTAINER_jenkins |
| - ssh-agent-credentials: |
| users: |
| # tcwg-buildslave user id |
| - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a' |
| #endif |
| - build-name: |
| name: '#${BUILD_NUMBER}-${NODE_NAME}' |
| builders: |
| - shell: |
| command: | |
| #!/bin/bash |
| set -ex |
| |
| #if CONTAINER_host |
| # Pick the --group value: "tcwg-llvm" for nodes carrying the |
| # tcwg-llvmbot label, "all" for every other node. |
| if [[ " $NODE_LABELS " == *" tcwg-llvmbot "* ]]; then |
| group=tcwg-llvm |
| else |
| group=all |
| fi |
| |
| # Update the host containers of this node for the selected group. |
| ./jenkins-scripts/tcwg-update-host-containers.sh \ |
| --distro "$distro" \ |
| --group "$group" \ |
| --force "$force" \ |
| --verbose "$verbose" |
| #elif CONTAINER_jenkins |
| # Check if we need to update the image -- run with --dryrun true. |
| # The exit status is captured in $res instead of aborting under "set -e": |
| #   0   - container is up-to-date, |
| #   125 - container needs an update, |
| #   any other value - the check itself failed. |
| ./jenkins-scripts/tcwg-update-host-containers.sh \ |
| --distro "$distro" \ |
| --dryrun true \ |
| --node "$NODE_NAME" \ |
| --force "$force" \ |
| --verbose "$verbose" & |
| res=0 && wait $! || res=$? |
| |
| if [ "$res" = 0 ]; then |
| # Fast-path exit to avoid bringing the node offline. |
| echo "$NODE_NAME is up-to-date" |
| # Skip the rest and mark the build UNSTABLE (aka skipped). |
| exit 125 |
| elif [ "$res" = 125 ]; then |
| echo "$NODE_NAME needs container update" |
| else |
| echo "ERROR: container check failed" |
| exit $res |
| fi |
| |
| # We are about to update the container that is running this. |
| # The plan is: |
| # 1. Prevent new builds from starting by putting the node into |
| # offline mode. |
| # 2. Wait for current builds to finish. We detect this by polling |
| # the number of busy executors on the node until only this build |
| # remains. |
| # 3. Trigger a job on the master node to bring this node back |
| # online. Without this we would restart the jenkins container, |
| # but the node would still be marked as "offline" and no new |
| # builds will be scheduled to it. |
| # 4. Restart the container. |
| |
| # If anything below fails, this step exits non-zero and the follow-up |
| # step that triggers the online-node job does not run, which would |
| # leave the node offline.  Bring the node back online ourselves in |
| # that case.  A successful exit (0) leaves the node offline on purpose. |
| restore_node_on_failure() { |
| rc=$? |
| if [ "$rc" != 0 ]; then |
| ssh -p2222 -l tcwg-buildslave@linaro.org ci.linaro.org \ |
| online-node "$NODE_NAME" || true |
| fi |
| } |
| trap restore_node_on_failure EXIT |
| |
| # Mark the node offline. |
| ssh -p2222 -l tcwg-buildslave@linaro.org ci.linaro.org \ |
| offline-node "$NODE_NAME" \ |
| -m "Updating_jenkins_container:$BUILD_URL" |
| |
| start_date=$(date +%s) |
| rm -f timeout |
| |
| # Wait for current builds to finish. |
| while true; do |
| n_busy=$(source jenkins-scripts/jenkins-helpers.sh |
| print_number_of_busy_executors "$NODE_NAME") |
| |
| if [ "$n_busy" = "1" ]; then |
| # We are the only build left. |
| break |
| fi |
| |
| # Minutes spent waiting so far. |
| elapsed=$(($(date +%s) - $start_date)) |
| elapsed=$(($elapsed / 60)) |
| if [ "$elapsed" -gt "270" ]; then |
| # We'll timeout in 30 minutes; give up on the update and |
| # bring the node back online. |
| # We budget 30 minutes for tcwg-cleanup-stale-workspaces.sh |
| # below. |
| # The "timeout" marker file tells the next shell step to skip |
| # the update. |
| touch timeout |
| break |
| fi |
| |
| # Wait for other builds to complete. |
| sleep 60 |
| done |
| unstable-return: 125 |
| - conditional-step: |
| condition-kind: current-status |
| steps: |
| - trigger-builds: |
| - project: tcwg-update-jenkins-containers-online-node |
| predefined-parameters: | |
| node=$NODE_NAME |
| build_num=$BUILD_NUMBER |
| block: false |
| - shell: |
| command: | |
| #!/bin/bash |
| set -ex |
| |
| # A "timeout" marker file means the wait for other builds gave up; |
| # skip the update and mark the build UNSTABLE (exit 125). |
| if [ -f timeout ]; then |
| exit 125 |
| fi |
| |
| # Cleanup workspace directory while the node is idle. |
| # This is best-effort.  "|| true" is required: "set +e" only affects |
| # the inside of the subshell, and a non-zero subshell status would |
| # otherwise abort this script under "set -e" before the update runs. |
| ( |
| set +e |
| "$WORKSPACE"/jenkins-scripts/tcwg-cleanup-stale-workspaces.sh \ |
| --days 3 --workspace_top "$HOME"/workspace |
| ) || true |
| |
| # Perform the real update (--dryrun false).  Any failure is reported |
| # as UNSTABLE (exit 125) rather than as a build failure. |
| ./jenkins-scripts/tcwg-update-host-containers.sh \ |
| --distro "$distro" \ |
| --dryrun false \ |
| --node "$NODE_NAME" \ |
| --force "$force" \ |
| --verbose "$verbose" || exit 125 |
| unstable-return: 125 |
| #elif CONTAINER_llvmbot |
| ./jenkins-scripts/tcwg-update-llvmbot-containers.sh \ |
| --NODE_NAME "$NODE_NAME" \ |
| --distro "$distro" \ |
| --master "$master" \ |
| --password "$TCWG_LLVMBOT_PASSWORD" \ |
| --force "$force" \ |
| --verbose "$verbose" |
| #elif CONTAINER_buildkite |
| ./jenkins-scripts/tcwg-update-llvmbot-containers.sh \ |
| --NODE_NAME "$NODE_NAME" \ |
| --distro "$distro" \ |
| --master "buildkite" \ |
| --password "$TCWG_BUILDKITE_TOKEN_LIBCXX" \ |
| --force "$force" \ |
| --verbose "$verbose" |
| #elif CONTAINER_bmk |
| # Update the boards of this node in parallel, one background pipeline per lock file; pipefail is needed for them, see: LABEL_SED below |
| set -o pipefail |
| |
| rm -rf artifacts |
| mkdir artifacts |
| |
| echo "$BUILD_URL" > artifacts/mail-body.txt |
| echo "maxim.kuvyrkov@linaro.org, laurent.alfonsi@linaro.org" > artifacts/mail-recipients.txt |
| |
| declare -A pids |
| for lock in $(set +f; ls -tr $HOME/boards/$NODE_NAME-*.lock \ |
| $HOME/boards/$NODE_NAME-*.bak*); do |
| ( |
| flock -e 9 |
| touch $lock |
| board=$(cat <&9) |
| |
| # Update and start the jenkins container of the board so that it can |
| # process all the queued cleanup and maintenance tasks. |
| node=$(basename "$board" .tcwglab) |
| |
| ./jenkins-scripts/tcwg-update-bmk-containers.sh \ |
| --board "$board" \ |
| --distro "$distro" \ |
| --force "$force" \ |
| --node "$node" \ |
| --verbose "$verbose" |
| |
| # Wait for the jenkins container to become idle (no busy executors). |
| while sleep 60; do |
| n_busy=$(source jenkins-scripts/jenkins-helpers.sh |
| print_number_of_busy_executors "$node") |
| |
| if [ "$n_busy" = "0" ]; then |
| break |
| fi |
| done |
| |
| # Now stop the jenkins container so that it's not terminated |
| # midway through some other build by a starting benchmarking job. |
| ssh -Snone $board docker stop "$node" |
| ) 9<$lock 2>&1 | sed -e "s/^/$(basename $lock): /" & |
| |
| # LABEL_SED: We need pipefail so that "wait" reports the result of the |
| # sub-shell in "( foo ) | sed" instead of the always-succeeding "sed". |
| pids[$(basename "$lock")]=$! |
| done |
| |
| n_good_boards=0 |
| for lock in "${!pids[@]}"; do |
| res=0 && wait "${pids[$lock]}" || res=$? |
| case "$res:$lock" in |
| "0":*".lock") |
| n_good_boards=$(($n_good_boards + 1)) |
| echo "$lock: SUCCESS" >> artifacts/mail-body.txt |
| ;; |
| "0":*".bak") |
| n_good_boards=$(($n_good_boards + 1)) |
| echo "$lock: UNEXPECTED SUCCESS" >> artifacts/mail-body.txt |
| # Move boards with no STG ticket (e.g., no ".bak.STG-1234") |
| # back into service. |
| mv "$HOME/boards/$lock" "$HOME/boards/${lock%.bak}.lock" |
| ;; |
| "0":*) |
| # The board appears fine, but it has an STG ticket assigned |
| # to it (e.g., ".bak.STG-1234"). |
| echo "$lock: UNEXPECTED SUCCESS" >> artifacts/mail-body.txt |
| ;; |
| *:*".lock") |
| echo "$lock: UNEXPECTED FAILURE" >> artifacts/mail-body.txt |
| # Move offline boards out of service. |
| mv "$HOME/boards/$lock" "$HOME/boards/${lock%.lock}.bak" |
| ;; |
| *:*) |
| echo "$lock: EXPECTED FAILURE" >> artifacts/mail-body.txt |
| ;; |
| esac |
| done |
| |
| # tcwg-benchmark doesn't have ssh access to ci.linaro.org, |
| # so use tcwg-buildslave's credentials. |
| n_executors=$(ssh -p2222 -l tcwg-buildslave@linaro.org \ |
| ci.linaro.org get-node "$NODE_NAME" \ |
| | grep "numExecutors") |
| n_executors=$(echo "$n_executors" \ |
| | sed -e "s#.*<numExecutors>\([0-9]\+\)</numExecutors>.*#\1#") |
| if [ x"$n_executors" != x"$n_good_boards" ]; then |
| echo "UNEXPECTED: Updating number of executors on $NODE_NAME from $n_executors to $n_good_boards" \ |
| >> artifacts/mail-body.txt |
| if [ "$n_good_boards" = "0" ]; then |
| # Setting executors to "0" will mightily confuse jenkins; |
| # the node with 0 executors will be stuck in limbo, so use 1 instead. |
| n_good_boards=1 |
| fi |
| ssh -p2222 -l tcwg-buildslave@linaro.org \ |
| ci.linaro.org get-node "$NODE_NAME" \ |
| | sed -e "s#<numExecutors>\([0-9]\+\)</numExecutors>#<numExecutors>$n_good_boards</numExecutors>#" \ |
| | ssh -p2222 -l tcwg-buildslave@linaro.org \ |
| ci.linaro.org update-node "$NODE_NAME" |
| fi |
| |
| if ! grep -q UNEXPECTED artifacts/mail-body.txt; then |
| exit 0 |
| fi |
| |
| exit 1 |
| publishers: |
| - email-ext: |
| recipients: | |
| ${FILE,path="artifacts/mail-recipients.txt"} |
| content-type: text |
| body: | |
| ${FILE,path="artifacts/mail-body.txt"} |
| failure: true |
| success: false |
| aborted: true |
| send-to: |
| - recipients |
| #endif |
| |
| #if CONTAINER_jenkins |
| - job: |
| name: tcwg-update-jenkins-containers-online-node |
| project-type: freestyle |
| defaults: global |
| properties: |
| - authorization: |
| anonymous: |
| - job-read |
| - job-extended-read |
| everyone-flat: |
| - job-build |
| - job-cancel |
| - build-discarder: |
| days-to-keep: 30 |
| num-to-keep: 100 |
| parameters: |
| - string: |
| name: node |
| default: '' |
| description: 'NODE_NAME to bring online' |
| - string: |
| name: build_num |
| default: '' |
| description: 'BUILD_NUMBER to wait for to finish' |
| disabled: false |
| concurrent: true |
| display-name: 'TCWG CCC Update jenkins containers online-node' |
| wrappers: |
| - timeout: |
| timeout: 60 |
| - timestamps |
| - ssh-agent-credentials: |
| users: |
| # tcwg-buildslave user id |
| - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a' |
| - build-name: |
| name: '#${BUILD_NUMBER}-#${build_num}-${node}' |
| builders: |
| - shell: |
| command: | |
| #!/bin/bash |
| set -ex |
| |
| # Follow the console of build $build_num of |
| # tcwg-update-jenkins-containers until it reports "Finished: ", then |
| # bring node $node back online. |
| # The logic below was adapted from jenkins-scripts/tcwg_bmk-build.sh: |
| # benchmark(). |
| |
| ssh_cmd=(ssh -p2222 -l tcwg-buildslave@linaro.org) |
| ssh_cmd+=(-oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null) |
| |
| # NOTE: the polling loop below has been seen to fail for unknown reasons, |
| # even though "|| true" should make the subshell always exit with "0" |
| # and "| tee | sed" is not expected to fail. |
| # Therefore bring the node back online when we exit for any reason, |
| # and ignore shell errors so that we leave the loop only after seeing |
| # the "Finished: " line in console.log. |
| trap "${ssh_cmd[*]} ci.linaro.org online-node $node" EXIT |
| set +e |
| |
| # Last line of the most recently fetched console log. |
| last_line="" |
| until [[ "$last_line" == "Finished: "* ]]; do |
| sleep 60 |
| |
| # Fetch the console (at most 1 minute per attempt), keep a copy in |
| # console.log and echo it prefixed with the node name. |
| (timeout 1m \ |
| "${ssh_cmd[@]}" ci.linaro.org console \ |
| tcwg-update-jenkins-containers $build_num || true) \ |
| | tee console.log | sed -e "s/^/$node: /" |
| |
| last_line=$(tail -n 1 console.log) |
| done |
| #endif |