aboutsummaryrefslogtreecommitdiff
path: root/tcwg-update-jenkins-containers.yaml
blob: 9abc9064bb694f4441ccab6ed76c2686d0d8ef05 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# Auto generated by ./tcwg/generate-yamlfiles.sh from tcwg-update-containers.yaml.in and tcwg-update-containers/tcwg-update-jenkins-containers.def. Do not edit.
#BEGIN: tcwg/default.yaml.inc
# -*- mode: Yaml -*-

# Shared property macro "default-properties".
#   * authorization: anonymous users may read the job (including the
#     extended read permission); the "everyone-flat" entry may start and
#     cancel builds.
#   * build-discarder: builds older than 30 days are discarded.
- property:
    name: default-properties
    properties:
      - authorization:
          anonymous: [job-read, job-extended-read]
          everyone-flat: [job-build, job-cancel]
      - build-discarder:
          days-to-keep: 30

# SCM macro "jenkins-scripts": checks out the jenkins-scripts repository
# into the "jenkins-scripts" sub-directory of the workspace, at the branch
# given by the job's "scripts_branch" parameter.
- scm:
    name: jenkins-scripts
    scm:
      - git:
          url: 'https://git.linaro.org/toolchain/jenkins-scripts.git'
          # Two refspecs separated by a single space (the folded scalar
          # joins the lines below with one space): regular branch heads
          # plus refs/changes/*.
          refspec: >-
            +refs/heads/*:refs/remotes/origin/*
            +refs/changes/*:refs/changes/*
          branches:
            - '$scripts_branch'
          basedir: jenkins-scripts
          # Local reference repository passed to git for the clone.
          reference-repo: '/home/tcwg-buildslave/snapshots-ref/jenkins-scripts.git'
          skip-tag: true
          # Keep the workspace between builds, but clean the checkout
          # before each build and prune stale remote branches.
          wipe-workspace: false
          clean:
            before: true
          prune: true

#END:   tcwg/default.yaml.inc

# Job "tcwg-update-jenkins-containers": runs daily on every node matching
# the "nodes" label expression and updates the jenkins container on that
# node when tcwg-update-host-containers.sh reports that an update is needed.
#
# Build result conventions used below (see "unstable-return: 125"):
#   * exit 125 from a shell step marks the build UNSTABLE, which this job
#     uses to mean "skipped" (nothing to update, or gave up waiting).
#   * any other non-zero exit fails the build.
#
# NOTE: the file header says this file is generated from
# tcwg-update-containers.yaml.in; changes here need to be mirrored there.
- job:
    name: tcwg-update-jenkins-containers
    project-type: freestyle
    defaults: global
    properties:
        - authorization:
            anonymous:
                - job-read
                - job-extended-read
            everyone-flat:
                - job-build
                - job-cancel
        - build-discarder:
            days-to-keep: 30
            num-to-keep: 100
        # At most one build of this job per node at a time.
        - throttle:
            max-per-node: 1
            option: project
    parameters:
        - label:
            name: nodes
            # Run on all nodes with exception of benchmarking boards.
            # Jenkins containers of benchmarking boards are handled by
            # CONTAINER_bmk job.
            # Quoted so that "&&" and "!" can never be taken for YAML syntax.
            default: 'tcwg && !tcwg-bmk-hw'
            all-nodes: true
            matching-label: 'allCases'
            description: 'Machines to run on'
        - string:
            name: distro
            default: 'default'
            description: 'Distro version to use.'
        - bool:
            name: force
            default: 'false'
            description: 'Whether to force update even with no changes in image'
        - bool:
            name: verbose
            default: 'true'
            description: 'Whether to be verbose'
        - string:
            name: scripts_branch
            default: master
            description: 'Scripts revision to use'
    disabled: false
    node: tcwg-coordinator
    concurrent: true
    display-name: 'TCWG CCC Update jenkins containers'
    # We need to unshare workspace with $NODE_NAME in the path to
    # correctly run on tcwg-bmk-* nodes.
    workspace: workspace/tcwg-update-jenkins-containers_$EXECUTOR_NUMBER/$NODE_NAME
    scm:
      - jenkins-scripts
    triggers:
        - timed: '@daily'
    wrappers:
        - timeout:
            # Wait at most 5 hours (300 minutes) before giving up on
            # updating jenkins client container.
            timeout: 300
        - timestamps
        - ssh-agent-credentials:
            users:
                # tcwg-buildslave user id
                - 'e0958a95-204f-4c14-a66c-5e2be6c5d50a'
        - build-name:
            name: '#${BUILD_NUMBER}-${NODE_NAME}'
    builders:
      # Step 1: decide whether an update is needed; if so, take the node
      # offline and wait for other builds on it to drain.
      - shell:
          command: |
            #!/bin/bash
            set -ex

            # Check if we need to update the image -- run with --dryrun true.
            # The check is started in the background and reaped with "wait"
            # so that its exit status is captured in $res instead of
            # tripping "set -e".
            ./jenkins-scripts/tcwg-update-host-containers.sh \
            --distro "$distro" \
            --dryrun true \
            --node "$NODE_NAME" \
            --force "$force" \
            --verbose "$verbose" &
            res=0 && wait $! || res=$?

            if [ $res = 0 ]; then
              # Fast-path exit to avoid bringing the node offline.
              echo "$NODE_NAME is up-to-date"
              # Skip the rest and mark the build UNSTABLE (aka skipped).
              exit 125
            elif [ $res = 125 ]; then
              echo "$NODE_NAME needs container update"
            else
              # Any status other than 0 and 125 is a genuine failure.
              echo "ERROR: container check failed"
              exit $res
            fi

            # We are about to update the container that is running this.
            # The plan is:
            # 1. Prevent new builds from starting by putting the node into
            #    offline mode.
            # 2. Wait for current builds to finish.  We detect this by
            #    polling the number of busy executors on the node; this
            #    build itself accounts for one of them.
            # 3. Trigger a separate job to bring this node back online.
            #    Without this we would restart the jenkins container,
            #    but the node would still be marked as "offline" and no new
            #    builds will be scheduled to it.
            # 4. Restart the container.

            # If this step fails from here on, the build is marked FAILURE
            # and the conditional step that triggers the online-node job
            # does not run, which would leave the node offline.  In that
            # case bring the node back online ourselves.  On success the
            # node is deliberately left offline for the following steps.
            online_node_on_failure ()
            {
              local rc=$?
              if [ $rc != 0 ]; then
                ssh -p2222 -l tcwg-buildslave@linaro.org ci.linaro.org \
                  online-node "$NODE_NAME" || true
              fi
            }
            trap online_node_on_failure EXIT

            # Mark the node offline.
            ssh -p2222 -l tcwg-buildslave@linaro.org ci.linaro.org \
              offline-node "$NODE_NAME" \
              -m "Updating_jenkins_container:$BUILD_URL"

            start_date=$(date +%s)
            # Marker file read by the next shell step; remove any stale one.
            rm -f timeout

            # Wait for current builds to finish.
            while true; do
              n_busy=$(source jenkins-scripts/jenkins-helpers.sh
                       print_number_of_busy_executors "$NODE_NAME")

              if [ "$n_busy" = "1" ]; then
                # We are the only build left.
                break
              fi

              # Minutes elapsed since the node was taken offline.
              elapsed=$(($(date +%s) - $start_date))
              elapsed=$(($elapsed / 60))
              if [ "$elapsed" -gt "270" ]; then
                # We'll timeout in 30 minutes; give up on the update and
                # bring the node back online.
                # We budget 30 minutes for tcwg-cleanup-stale-workspaces.sh
                # below.
                touch timeout
                break
              fi

              # Wait for other builds to complete.
              sleep 60
            done
          unstable-return: 125
      # Step 2: runs only while the build status is still SUCCESS (JJB's
      # default range for "current-status"), i.e. not when step 1 marked
      # the build UNSTABLE or failed.
      - conditional-step:
          condition-kind: current-status
          steps:
            # Fire-and-forget: the triggered job waits for this build to
            # finish and then brings the node back online.
            - trigger-builds:
                - project: tcwg-update-jenkins-containers-online-node
                  predefined-parameters: |
                    node=$NODE_NAME
                    build_num=$BUILD_NUMBER
                  block: false
            - shell:
                command: |
                  #!/bin/bash
                  set -ex

                  # The wait loop in the previous step gave up; skip the
                  # update and mark the build UNSTABLE.
                  if [ -f timeout ]; then
                    exit 125
                  fi

                  # Cleanup workspace directory while the node is idle.
                  # This is best-effort: a failed cleanup must not prevent
                  # the container update.  (A "set +e" subshell is not
                  # enough for that, because the subshell's non-zero status
                  # would still trip "set -e" here.)
                  "$WORKSPACE/jenkins-scripts/tcwg-cleanup-stale-workspaces.sh" \
                    --days 3 --workspace_top "$HOME/workspace" \
                    || echo "WARNING: workspace cleanup failed; continuing"

                  # Perform the actual update; a failure here is reported
                  # as UNSTABLE rather than FAILURE.
                  ./jenkins-scripts/tcwg-update-host-containers.sh \
                    --distro "$distro" \
                    --dryrun false \
                    --node "$NODE_NAME" \
                    --force "$force" \
                    --verbose "$verbose" || exit 125
                unstable-return: 125

# Job "tcwg-update-jenkins-containers-online-node": polls the console log
# of build number $build_num of tcwg-update-jenkins-containers until its
# last line starts with "Finished: ", then brings node $node back online.
# The online-node command runs from an EXIT trap, so it is also issued when
# this job's shell step exits for any other reason.
- job:
    name: tcwg-update-jenkins-containers-online-node
    display-name: "TCWG CCC Update jenkins containers online-node"
    project-type: freestyle
    defaults: global
    disabled: false
    concurrent: true
    properties:
      - authorization:
          anonymous: [job-read, job-extended-read]
          everyone-flat: [job-build, job-cancel]
      - build-discarder:
          days-to-keep: 30
          num-to-keep: 100
    parameters:
      - string:
          name: node
          default: ""
          description: "NODE_NAME to bring online"
      - string:
          name: build_num
          default: ""
          description: "BUILD_NUMBER to wait for to finish"
    wrappers:
      # Give up after 60 minutes.
      - timeout:
          timeout: 60
      - timestamps
      - ssh-agent-credentials:
          users:
            # tcwg-buildslave user id
            - "e0958a95-204f-4c14-a66c-5e2be6c5d50a"
      - build-name:
          name: "#${BUILD_NUMBER}-#${build_num}-${node}"
    builders:
      - shell:
          command: |
            #!/bin/bash
            set -ex

            # Below logic was adapted from jenkins-scripts/tcwg_bmk-build.sh:
            # benchmark().

            ssh_cmd=(ssh -p2222 -l tcwg-buildslave@linaro.org
              -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null)

            # ??? Below loop can fail for reasons I can't understand.
            #     "|| true" should the subshell always exit with "0",
            #     and I can't see how "| tee | sed" can fail.
            # Bring the node back online as we exit due to any reason,
            # and ignore shell errors so that we exit only when see
            # "Finished: " line in the console.log.
            trap "${ssh_cmd[*]} ci.linaro.org online-node $node" EXIT
            set +e

            while true; do
              sleep 60

              (timeout 1m \
                "${ssh_cmd[@]}" ci.linaro.org console \
                tcwg-update-jenkins-containers $build_num || true) \
                | tee console.log | sed -e "s/^/$node: /"

              build_status=$(tail -n 1 console.log)
              case "$build_status" in
                "Finished: "*) break ;;
              esac
            done
# checksum: 6ffc6ff4f71c52329c02ca6e6f932e10