aboutsummaryrefslogtreecommitdiff
path: root/tcwg-update-containers.yaml.in
blob: d748ce663cf9ea437b869ce68670ff4f95c748f5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
#include tcwg/default.yaml.inc

- job:
    # The CONTAINER placeholder in the name is filled in when this template
    # is expanded; presumably with one of the flavours tested by the
    # conditionals in this file (host, jenkins, llvmbot, buildkite, bmk).
    name: tcwg-update-#{CONTAINER}-containers
    defaults: global
    project-type: freestyle
    properties:
      # Anonymous users get read-only permissions; the everyone-flat entry
      # grants permission to build and cancel.
      - authorization:
          everyone-flat:
            - job-build
            - job-cancel
          anonymous:
            - job-read
            - job-extended-read
      # Keep builds for 30 days, and no more than 100 of them.
      - build-discarder:
          num-to-keep: 100
          days-to-keep: 30
      # Throttle per project: at most one build of this job per node.
      - throttle:
          option: project
          max-per-node: 1
    parameters:
      # Label expression selecting the nodes whose containers get updated.
      - label:
          name: nodes
          description: 'Machines to run on'
#if CONTAINER_host
          # All real machines, benchmarking boards excluded: the host
          # containers of benchmarking boards are handled by the
          # CONTAINER_bmk job.
          default: tcwg && !tcwg-bmk-hw && !tcwg-secondary-node
#elif CONTAINER_jenkins
          # All nodes, benchmarking boards excluded: the jenkins
          # containers of benchmarking boards are handled by the
          # CONTAINER_bmk job.
          default: tcwg && !tcwg-bmk-hw
#elif CONTAINER_llvmbot || CONTAINER_buildkite
          default: tcwg-llvmbot
#elif CONTAINER_bmk
          default: tcwg-bmk-pool
#endif
          matching-label: 'allCases'
          all-nodes: true
      - string:
          name: distro
          description: 'Distro version to use.'
#if CONTAINER_llvmbot || CONTAINER_buildkite
          default: 'lts_1'
#else
          default: 'default'
#endif
#if CONTAINER_llvmbot
      # Only the llvmbot flavour lets the user pick the buildmaster.
      - string:
          name: master
          description: 'LLVM buildmaster to use: silent or normal'
          default: 'normal'
#endif
      - bool:
          name: force
          description: 'Whether to force update even with no changes in image'
          default: 'false'
      - bool:
          name: verbose
          description: 'Whether to be verbose'
          default: 'true'
      - string:
          name: scripts_branch
          description: 'Scripts revision to use'
          default: master
    display-name: 'TCWG CCC Update #{CONTAINER} containers'
    disabled: false
    concurrent: true
    node: tcwg-coordinator
    # Putting $NODE_NAME into the path gives every node its own, unshared
    # workspace; this is needed to run correctly on tcwg-bmk-* nodes.
    workspace: workspace/tcwg-update-#{CONTAINER}-containers_$EXECUTOR_NUMBER/$NODE_NAME
    scm:
      - jenkins-scripts
#if !CONTAINER_llvmbot && !CONTAINER_buildkite
    # Run once a day.
    triggers:
      - timed: '@daily'
#else
    # No timer trigger for llvmbot and buildkite, so that the bot maintainer
    # stays in control of the deployment time.
#endif
    wrappers:
      - timeout:
#if CONTAINER_jenkins
          # Give up on updating the jenkins client container after
          # 5 hours at most (the value is in minutes).
          timeout: 300
#else
          timeout: 600
#endif
      - timestamps
#if CONTAINER_llvmbot
      # Bind the credential to the environment variable that the build
      # script passes on as the password.
      - credentials-binding:
          - text:
              variable: TCWG_LLVMBOT_PASSWORD
              credential-id: TCWG_LLVMBOT_PASSWORD
#elif CONTAINER_buildkite
      # Bind the credential to the environment variable that the build
      # script passes on as the password.
      - credentials-binding:
          - text:
              variable: TCWG_BUILDKITE_TOKEN_LIBCXX
              credential-id: TCWG_BUILDKITE_TOKEN_LIBCXX
#elif CONTAINER_bmk
      - ssh-agent-credentials:
          users:
            - 'tcwg-benchmark'
            # tcwg-buildslave user id
            # ??? Do we need tcwg-buildslave's keys for BMK containers?
            - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
#elif CONTAINER_jenkins
      - ssh-agent-credentials:
          users:
            # tcwg-buildslave user id
            - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
#endif
      # Show build number and node name as the build name.
      - build-name:
          name: '#${BUILD_NUMBER}-${NODE_NAME}'
    builders:
      # The script prologue below is shared by all container flavours; the
      # body of the script is selected by the preprocessor conditionals.
      - shell:
          command: |
            #!/bin/bash
            set -ex

#if CONTAINER_host
            # Nodes carrying the tcwg-llvmbot label use the tcwg-llvm group,
            # every other node uses the "all" group.  $NODE_LABELS is padded
            # with spaces so that only whole labels can match.
            group=all
            if [[ " $NODE_LABELS " == *" tcwg-llvmbot "* ]]; then
              group=tcwg-llvm
            fi
            ./jenkins-scripts/tcwg-update-host-containers.sh \
            --distro "$distro" \
            --group "$group" \
            --force "$force" \
            --verbose "$verbose"
#elif CONTAINER_jenkins
            # Check if we need to update the image -- run with --dryrun true.
            # The check runs in the background and its status is collected
            # with "wait", so that a non-zero status is recorded in $res
            # instead of aborting the script via "set -e".
            ./jenkins-scripts/tcwg-update-host-containers.sh \
            --distro "$distro" \
            --dryrun true \
            --node "$NODE_NAME" \
            --force "$force" \
            --verbose "$verbose" &
            res=0 && wait $! || res=$?

            if [ "$res" = "0" ]; then
              # Fast-path exit to avoid bringing the node offline.
              echo "$NODE_NAME is up-to-date"
              # Skip the rest and mark the build UNSTABLE (aka skipped).
              exit 125
            elif [ "$res" = "125" ]; then
              echo "$NODE_NAME needs container update"
            else
              echo "ERROR: container check failed"
              exit $res
            fi

            # We are about to update the container that is running this.
            # The plan is:
            # 1. Prevent new builds from starting by putting the node into
            #    offline mode.
            # 2. Wait for current builds to finish.  We detect this by
            #    checking for children process of the jenkins client.
            # 3. Trigger a job on the master node to bring this node back
            #    online.  Without this we would restart the jenkins container,
            #    but the node would still be marked as "offline" and no new
            #    builds will be scheduled to it.
            # 4. Restart the container.

            # Mark the node offline.
            ssh -p2222 -l tcwg-buildslave@linaro.org ci.linaro.org \
              offline-node "$NODE_NAME" \
              -m "Updating_jenkins_container:$BUILD_URL"

            # From here on the node is offline.  The follow-up steps that
            # bring it back online run only when this script succeeds, so
            # if anything below fails, bring the node back online right here
            # instead of leaving it offline indefinitely.
            online_node_on_failure ()
            {
              local status=$?
              if [ "$status" != "0" ]; then
                ssh -p2222 -l tcwg-buildslave@linaro.org ci.linaro.org \
                  online-node "$NODE_NAME" || true
              fi
              exit "$status"
            }
            trap online_node_on_failure EXIT

            start_date=$(date +%s)
            rm -f timeout

            # Wait for current builds to finish.
            while true; do
              n_busy=$(source jenkins-scripts/jenkins-helpers.sh
                       print_number_of_busy_executors "$NODE_NAME")

              if [ "$n_busy" = "1" ]; then
                # We are the only build left.
                break
              fi

              elapsed=$(($(date +%s) - $start_date))
              elapsed=$(($elapsed / 60))
              if [ "$elapsed" -gt "270" ]; then
                # We'll timeout in 30 minutes; give up on the update and
                # bring the node back online.
                # We budget 30 minutes for tcwg-cleanup-stale-workspaces.sh
                # below.
                touch timeout
                break
              fi

              # Wait for other builds to complete.
              sleep 60
            done
          unstable-return: 125
      # The steps below run only while the build status is still SUCCESS,
      # i.e., not after the script above exited with 125 or failed.
      - conditional-step:
          condition-kind: current-status
          steps:
            # Start (without waiting for it) the job that brings this node
            # back online once this build has finished.
            - trigger-builds:
                - project: tcwg-update-jenkins-containers-online-node
                  predefined-parameters: |
                    node=$NODE_NAME
                    build_num=$BUILD_NUMBER
                  block: false
            - shell:
                command: |
                  #!/bin/bash
                  set -ex

                  # The wait loop of the previous step gave up; skip the
                  # update and mark the build UNSTABLE.
                  if [ -f timeout ]; then
                    exit 125
                  fi

                  # Cleanup workspace directory while the node is idle.
                  # Cleanup is best-effort: "set +e" alone does not stop a
                  # failing subshell from tripping the outer "set -e", so
                  # the status of the subshell is explicitly ignored.
                  (
                  set +e
                  $WORKSPACE/jenkins-scripts/tcwg-cleanup-stale-workspaces.sh \
                    --days 3 --workspace_top $HOME/workspace
                  ) || true

                  # Do the real update; a failure marks the build UNSTABLE.
                  ./jenkins-scripts/tcwg-update-host-containers.sh \
                    --distro "$distro" \
                    --dryrun false \
                    --node "$NODE_NAME" \
                    --force "$force" \
                    --verbose "$verbose" || exit 125
                unstable-return: 125
#elif CONTAINER_llvmbot
            # Run the llvmbot container update script for this node.
            # $master comes from the "master" job parameter and the password
            # from the TCWG_LLVMBOT_PASSWORD credential binding.
            ./jenkins-scripts/tcwg-update-llvmbot-containers.sh \
            --NODE_NAME "$NODE_NAME" \
            --distro "$distro" \
            --master "$master" \
            --password "$TCWG_LLVMBOT_PASSWORD" \
            --force "$force" \
            --verbose "$verbose"
#elif CONTAINER_buildkite
            # Run the llvmbot container update script with the master fixed
            # to "buildkite"; the value of the TCWG_BUILDKITE_TOKEN_LIBCXX
            # credential binding is passed as the password.
            ./jenkins-scripts/tcwg-update-llvmbot-containers.sh \
            --NODE_NAME "$NODE_NAME" \
            --distro "$distro" \
            --master "buildkite" \
            --password "$TCWG_BUILDKITE_TOKEN_LIBCXX" \
            --force "$force" \
            --verbose "$verbose"
#elif CONTAINER_bmk
            # Update the containers of all boards that have a lock file for
            # this node, in parallel, and record the outcome per board in
            # artifacts/mail-body.txt.  The build fails when anything
            # UNEXPECTED was recorded.

            # See: LABEL_SED below
            set -o pipefail

            rm -rf artifacts
            mkdir artifacts

            # Start the report with a link to this build.
            echo "$BUILD_URL" > artifacts/mail-body.txt
            echo "maxim.kuvyrkov@linaro.org, laurent.alfonsi@linaro.org" > artifacts/mail-recipients.txt

            # pids maps the basename of a lock file to the PID of the
            # background pipeline that handles that board.
            # The lock files are listed by modification time, oldest first;
            # "set +f" makes sure globbing is enabled for the patterns.
            declare -A pids
            for lock in $(set +f; ls -tr $HOME/boards/$NODE_NAME-*.lock \
                                         $HOME/boards/$NODE_NAME-*.bak*); do
              (
                # File descriptor 9 is the lock file itself (see the
                # redirection at the end of this subshell).  Take an
                # exclusive lock on it, then read the board name from it.
                flock -e 9
                # NOTE(review): touch updates the modification time, which
                # presumably feeds the "ls -tr" ordering of later runs.
                touch $lock
                board=$(cat <&9)

                # Start the jenkins container so that it can process
                # all the queued cleanup and maintenance tasks.
                # The node name is the board name without ".tcwglab" suffix.
                node=$(basename "$board" .tcwglab)

                ./jenkins-scripts/tcwg-update-bmk-containers.sh \
                  --board "$board" \
                  --distro "$distro" \
                  --force "$force" \
                  --node "$node" \
                  --verbose "$verbose"

                # Wait for jenkins container to become idle
                # (polled once per minute; the first check happens after
                # the first sleep).
                while sleep 60; do
                  n_busy=$(source jenkins-scripts/jenkins-helpers.sh
                           print_number_of_busy_executors "$node")

                  if [ "$n_busy" = "0" ]; then
                    break
                  fi
                done

                # Now stop the jenkins container so that it's not terminated
                # midway some other build by a starting benchmarking job.
                ssh -Snone $board docker stop "$node"
              ) 9<$lock 2>&1 | sed -e "s/^/$(basename $lock): /" &

              # LABEL_SED: We need pipefail to get correct result of sub-shell
              # "( foo ) | sed" instead of always-succeeding "sed".
              pids[$(basename "$lock")]=$!
            done

            # Collect the result of every board and classify it by the
            # suffix of its lock file: ".lock" boards are in service and
            # expected to succeed, ".bak" boards are out of service and
            # expected to fail.
            n_good_boards=0
            for lock in "${!pids[@]}"; do
              res=0 && wait "${pids[$lock]}" || res=$?
              case "$res:$lock" in
                "0":*".lock")
                  n_good_boards=$(($n_good_boards + 1))
                  echo "$lock: SUCCESS" >> artifacts/mail-body.txt
                  ;;
                "0":*".bak")
                  n_good_boards=$(($n_good_boards + 1))
                  echo "$lock: UNEXPECTED SUCCESS" >> artifacts/mail-body.txt
                  # Move boards with no STG ticket (e.g., no ".bak.STG-1234")
                  # back into service.
                  mv "$HOME/boards/$lock" "$HOME/boards/${lock%.bak}.lock"
                  ;;
                "0":*)
                  # The board appears fine, but it has STG ticket assigned
                  # to it (e.g., ".bak.STG-1234").
                  echo "$lock: UNEXPECTED SUCCESS" >> artifacts/mail-body.txt
                  ;;
                *:*".lock")
                  echo "$lock: UNEXPECTED FAILURE" >> artifacts/mail-body.txt
                  # Move offline boards out of service.
                  mv "$HOME/boards/$lock" "$HOME/boards/${lock%.lock}.bak"
                  ;;
                *:*)
                  echo "$lock: EXPECTED FAILURE" >> artifacts/mail-body.txt
                  ;;
              esac
            done

            # tcwg-benchmark doesn't have ssh access to ci.linaro.org,
            # so use tcwg-buildslave's credentials.
            # Extract the current number of executors from the node
            # configuration XML.
            n_executors=$(ssh -p2222 -l tcwg-buildslave@linaro.org \
                              ci.linaro.org get-node "$NODE_NAME" \
                              | grep "numExecutors")
            n_executors=$(echo "$n_executors" \
                          | sed -e "s#.*<numExecutors>\([0-9]\+\)</numExecutors>.*#\1#")
            # Make the number of executors match the number of good boards
            # by rewriting the node configuration.
            if [ x"$n_executors" != x"$n_good_boards" ]; then
              echo "UNEXPECTED: Updating number of executors on $NODE_NAME from $n_executors to $n_good_boards" \
                >> artifacts/mail-body.txt
              if [ "$n_good_boards" = "0" ]; then
                # Setting executors to "0" will mightily confuse jenkins;
                # the node with 0 executors will be stuck in limbo.
                n_good_boards=1
              fi
              ssh -p2222 -l tcwg-buildslave@linaro.org \
                  ci.linaro.org get-node "$NODE_NAME" \
                | sed -e "s#<numExecutors>\([0-9]\+\)</numExecutors>#<numExecutors>$n_good_boards</numExecutors>#" \
                | ssh -p2222 -l tcwg-buildslave@linaro.org \
                      ci.linaro.org update-node "$NODE_NAME"
            fi

            # Succeed only when nothing UNEXPECTED was recorded above.
            if ! grep -q UNEXPECTED artifacts/mail-body.txt; then
              exit 0
            fi

            exit 1
    publishers:
      # Mail the report assembled by the build step.  Both the recipient
      # list and the message body are read from files in the artifacts
      # directory.
      - email-ext:
          content-type: text
          recipients: |
            ${FILE,path="artifacts/mail-recipients.txt"}
          body: |
            ${FILE,path="artifacts/mail-body.txt"}
          send-to:
            - recipients
          # Notify on failed and aborted builds, not on successful ones.
          failure: true
          aborted: true
          success: false
#endif

#if CONTAINER_jenkins
- job:
    # Helper job, triggered by the jenkins flavour of the update job, that
    # brings a node back online once the triggering build has finished.
    name: tcwg-update-jenkins-containers-online-node
    defaults: global
    project-type: freestyle
    display-name: 'TCWG CCC Update jenkins containers online-node'
    disabled: false
    concurrent: true
    properties:
      # Anonymous users get read-only permissions; the everyone-flat entry
      # grants permission to build and cancel.
      - authorization:
          everyone-flat:
            - job-build
            - job-cancel
          anonymous:
            - job-read
            - job-extended-read
      # Keep builds for 30 days, and no more than 100 of them.
      - build-discarder:
          num-to-keep: 100
          days-to-keep: 30
    parameters:
      - string:
          name: node
          description: 'NODE_NAME to bring online'
          default: ''
      - string:
          name: build_num
          description: 'BUILD_NUMBER to wait for to finish'
          default: ''
    wrappers:
      - timeout:
          timeout: 60
      - timestamps
      - ssh-agent-credentials:
          users:
            # tcwg-buildslave user id
            - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
      # Show this build number, the awaited build number and the node.
      - build-name:
          name: '#${BUILD_NUMBER}-#${build_num}-${node}'
    builders:
      # Poll the console of the tcwg-update-jenkins-containers build given
      # by $build_num until it reports completion, then bring $node back
      # online.
      - shell:
          command: |
            #!/bin/bash
            set -ex

            # Below logic was adapted from jenkins-scripts/tcwg_bmk-build.sh:
            # benchmark().

            # ssh invocation shared by all calls to ci.linaro.org below;
            # host key checking is disabled and known hosts are not recorded.
            ssh_cmd=(ssh -p2222 -l tcwg-buildslave@linaro.org
              -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null)

            # ??? Below loop can fail for reasons I can't understand.
            #     "|| true" should make the subshell always exit with "0",
            #     and I can't see how "| tee | sed" can fail.
            # Bring the node back online as we exit due to any reason,
            # and ignore shell errors so that we exit only when we see
            # "Finished: " line in the console.log.
            # The trap command is in double quotes, so the ssh command and
            # $node are expanded now, when the trap is installed.
            trap "${ssh_cmd[*]} ci.linaro.org online-node $node" EXIT
            set +e

            # Check once per minute.
            while true; do
              sleep 60

              # Fetch the console output of the awaited build; each fetch is
              # limited to one minute.  The output is saved to console.log
              # and echoed with the node name as prefix.
              (timeout 1m \
                "${ssh_cmd[@]}" ci.linaro.org console \
                tcwg-update-jenkins-containers $build_num || true) \
                | tee console.log | sed -e "s/^/$node: /"

              # The awaited build is done when the last line of its console
              # output starts with "Finished: ".
              build_status=$(tail -n 1 console.log)
              case "$build_status" in
                "Finished: "*) break ;;
              esac
            done
#endif