diff options
author | Chase Qi <chase.qi@linaro.org> | 2019-02-18 19:56:24 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-02-18 19:56:24 +0800 |
commit | 267cc24986b65ccaa0d1c26c4c88a469e15ad59e (patch) | |
tree | 85092864c639f447e0d4398ebe20088046242f76 | |
parent | 54ff3e53032361cd3e2f6bbb8481d3b57b33b32d (diff) | |
parent | 2c59069ade26da3233c9db55465571758c9c80f9 (diff) |
Merge pull request #15 from kateyy/tradefed_shards_with_retry
Add MultiNode TradeFed tests for --shards and with session retry
15 files changed, 1936 insertions, 0 deletions
diff --git a/automated/android/multinode/connect-to-remote-adb-tcpip-devices.yaml b/automated/android/multinode/connect-to-remote-adb-tcpip-devices.yaml new file mode 100644 index 0000000..3dc2e42 --- /dev/null +++ b/automated/android/multinode/connect-to-remote-adb-tcpip-devices.yaml @@ -0,0 +1,24 @@ +metadata: + name: connect-to-remote-adb-tcpip-devices + format: "Lava-Test-Shell Test Definition 1.0" + description: "adb MultiNode setup: connect to remote devices made accessible via adb TCP/IP." + maintainer: + - karsten@fairphone.com + - softwareteam@fairphone.com + os: + - debian + - ubuntu + devices: + - lxc + scope: + - functional + +params: + ADB_CONNECT_TIMEOUT_SECS: "60" + DEVICE_WORKER_MAPPING_FILE: "/tmp/deviceWorkerMapping" + +run: + steps: + - . ./automated/lib/sh-test-lib + - . ./automated/lib/android-multinode-test-lib + - connect_to_remote_adb_tcpip_devices "${ADB_CONNECT_TIMEOUT_SECS}" "${DEVICE_WORKER_MAPPING_FILE}" diff --git a/automated/android/multinode/release-remote-adb-tcpip-devices.yaml b/automated/android/multinode/release-remote-adb-tcpip-devices.yaml new file mode 100644 index 0000000..c28c22e --- /dev/null +++ b/automated/android/multinode/release-remote-adb-tcpip-devices.yaml @@ -0,0 +1,20 @@ +metadata: + name: release-remote-adb-tcpip-devices + format: "Lava-Test-Shell Test Definition 1.0" + description: "Disconnect from remote adb devices and cleanup." + maintainer: + - karsten@fairphone.com + - softwareteam@fairphone.com + os: + - debian + - ubuntu + devices: + - lxc + scope: + - functional + +run: + steps: + - lava-sync release_dut + # Cleanup adb server: LAVA expects only one device connected to adb. 
+ - adb kill-server diff --git a/automated/android/multinode/remote-adb-devices-smoke-test.yaml b/automated/android/multinode/remote-adb-devices-smoke-test.yaml new file mode 100644 index 0000000..838b866 --- /dev/null +++ b/automated/android/multinode/remote-adb-devices-smoke-test.yaml @@ -0,0 +1,27 @@ +metadata: + name: remote-adb-devices-smoke-test + format: "Lava-Test-Shell Test Definition 1.0" + description: "Smoke test demonstrating access to adb devices over TCP/IP." + maintainer: + - karsten@fairphone.com + - softwareteam@fairphone.com + os: + - debian + - ubuntu + devices: + - lxc + scope: + - functional + +params: + DEVICE_WORKER_MAPPING_FILE: "/tmp/deviceWorkerMapping" + +run: + steps: + - device_worker_mapping="$(cat "${DEVICE_WORKER_MAPPING_FILE}")" + - | + for device_to_worker in ${device_worker_mapping}; do + device="$(echo ${device_to_worker} | cut -d';' -f1)" + echo "${device}: $(adb -s "${device}" shell service call iphonesubinfo 1 | \ + grep -oE '(\.[0-9])|([0-9]\.)' | grep -oE '[0-9]' | tr -d '\n')" + done diff --git a/automated/android/multinode/share-local-device-over-adb-tcpip.yaml b/automated/android/multinode/share-local-device-over-adb-tcpip.yaml new file mode 100644 index 0000000..9ede1df --- /dev/null +++ b/automated/android/multinode/share-local-device-over-adb-tcpip.yaml @@ -0,0 +1,36 @@ +metadata: + name: share-local-device-over-adb-tcpip + format: "Lava-Test-Shell Test Definition 1.0" + description: "adb MultiNode setup: make local device remotely accessible via adb TCP/IP. 
+ Handles the device over to a role that responds to the following synchronization steps: + - lava-sync start_handover + - lava-send dut_address dut_address=${dut_address} + - lava-sync finish_handover" + maintainer: + - karsten@fairphone.com + - softwareteam@fairphone.com + os: + - debian + - ubuntu + devices: + - lxc + scope: + - functional + +params: + ADB_PORT: "5555" + ADB_TCPIP_ATTEMPTS: "5" + TIMEOUT_SECS: "60" + RAISE_ON_FAILURE: "true" + +run: + steps: + - . ./automated/lib/sh-test-lib + - . ./automated/lib/android-test-lib + - . ./automated/lib/android-multinode-test-lib + - ret_val=0 + - share_local_device_over_adb_tcpip "${ADB_TCPIP_ATTEMPTS}" "${TIMEOUT_SECS}" "${ADB_PORT}" || ret_val=$? + - | + if [ "${ret_val}" -ne 0 -a "${RAISE_ON_FAILURE}" = "true" ]; then + lava-test-raise "Could not share device of adb tcpip." + fi diff --git a/automated/android/multinode/tradefed/example-job-template-cts.yaml b/automated/android/multinode/tradefed/example-job-template-cts.yaml new file mode 100644 index 0000000..2be722b --- /dev/null +++ b/automated/android/multinode/tradefed/example-job-template-cts.yaml @@ -0,0 +1,204 @@ +job_name: MutliNode_xTS_template +timeouts: + job: + hours: 2 +priority: medium +visibility: public +reboot_to_fastboot: false + +protocols: + lava-lxc: + master: + name: lxc-xts-master + template: debian + distribution: debian + release: stretch + worker: + name: lxc-xts-worker + template: debian + distribution: debian + release: stretch + lava-multinode: + # There must be one master and an arbitrary number of additional workers, so + # that 1+n devices will be available in the TradeFed shell. 
+ roles: + master: + count: 1 + device_type: # TODO (a Android device, e.g., nexus4) + timeout: + minutes: 30 + worker: + expect_role: master + host_role: master + count: 2 + device_type: # TODO (a Android device, e.g., nexus4) + timeout: + minutes: 30 + +actions: +- deploy: + namespace: tlxc + to: lxc + os: debian + role: + - master + - worker + timeout: + minutes: 10 + packages: + - adb + - fastboot + - unzip + - wget + - zip + +- boot: + namespace: tlxc + role: + - master + - worker + prompts: + - 'root@(.*):/#' + timeout: + minutes: 5 + method: lxc + +# TODO: Device-type specific deployment. +# - deploy: +# namespace: droid +# to: fastboot +# role: +# - master +# - worker +# timeout: +# minutes: 30 +# images: +# # TODO +# os: debian + +- boot: + namespace: droid + role: + - master + - worker + timeout: + minutes: 10 + method: fastboot + +- test: + namespace: tlxc + role: + - master + - worker + timeout: + minutes: 20 + definitions: + - repository: https://review.linaro.org/qa/test-definitions + from: git + path: automated/android/wait-single-boot-completed.yaml + name: wait-single-boot-completed + +- test: + namespace: tlxc + role: + - worker + timeout: + minutes: 5 + definitions: + - repository: https://review.linaro.org/qa/test-definitions + from: git + path: automated/android/wait-single-network-connected.yaml + name: wait-single-network-connected + +- test: + namespace: tlxc + role: + - worker + timeout: + minutes: 15 + definitions: + - repository: https://review.linaro.org/qa/test-definitions + from: git + path: automated/android/multinode/share-local-device-over-adb-tcpip.yaml + name: share-local-device-over-adb-tcpip + params: + TIMEOUT_SECS: "600" + +- test: + namespace: tlxc + role: + - worker + timeout: + hours: 1 + definitions: + - repository: https://review.linaro.org/qa/test-definitions + from: git + path: automated/android/multinode/wait-and-keep-local-device-accessible.yaml + name: wait-and-keep-local-device-accessible + params: + # The sum of 
these timeouts must be smaller than the lava-multinode timeout for the master. + BOOT_TIMEOUT_SECS: "480" + NETWORK_TIMEOUT_SECS: "300" + ADB_CONNECT_TEST_TIMEOUT_SECS: "60" + +- test: + namespace: tlxc + role: + - worker + timeout: + minutes: 15 + definitions: + - repository: https://review.linaro.org/qa/test-definitions + from: git + path: automated/android/multinode/wait-for-release-and-reset.yaml + name: wait-for-release-and-reset + +- test: + namespace: tlxc + role: + - master + timeout: + minutes: 15 + definitions: + - repository: https://review.linaro.org/qa/test-definitions + from: git + path: automated/android/multinode/connect-to-remote-adb-tcpip-devices.yaml + name: connect-to-remote-adb-tcpip-devices + params: + ADB_CONNECT_TIMEOUT_SECS: 300 + +- test: + namespace: tlxc + role: + - master + timeout: + hours: 1 + definitions: + - repository: https://review.linaro.org/qa/test-definitions + from: git + path: automated/android/multinode/tradefed/tradefed-multinode.yaml + params: + TEST_PARAMS: "run cts --disable-reboot --include-filter CtsNetTestCases" + TEST_RETRY_PARAMS: "run cts --disable-reboot" + TEST_PATH: "android-cts" + TEST_URL: "https://dl.google.com/dl/android/cts/android-cts-7.1_r23-linux_x86-arm.zip" + STATE_CHECK_FREQUENCY_SECS: "300" + MAX_NUM_RUNS: "25" + RUNS_IF_UNCHANGED: "5" + FAILURES_PRINTED: "50" + # For Artifactorial: + # URL: "" + # TOKEN: "" + name: cts + +- test: + namespace: tlxc + role: + - master + timeout: + minutes: 10 + definitions: + - repository: https://review.linaro.org/qa/test-definitions + from: git + path: automated/android/multinode/release-remote-adb-tcpip-devices.yaml + name: release-remote-adb-tcpip-devices diff --git a/automated/android/multinode/tradefed/sts_util.py b/automated/android/multinode/tradefed/sts_util.py new file mode 100644 index 0000000..a90ce70 --- /dev/null +++ b/automated/android/multinode/tradefed/sts_util.py @@ -0,0 +1,144 @@ +"""Utilities for handling STS-specific behavior in TradeFed. 
+ +The following behavior was noted at least in the 2018-09 version of STS for +Android 7. When running STS, it manipulates the logged device fingerprint to +show up as a 'user' build with 'release-keys', even when using the required +setup with either 'userdebug' or 'eng' build. That behavior breaks the TradeFed +rerun feature, as the fingerprint read from the device will not match the logged +fingerprint of a previous run. + +StsUtil works around this behavior by reverting the manipulated fingerprint in +the log file to the string reported by the device. tradefed-runner-multinode.py +uses this module to apply STS workarounds automatically when STS is run. +""" + +import os +import shutil +import subprocess +import xml.etree.ElementTree as ET + + +class StsUtil: + """Interface for STS related workarounds when automating TradeFed. + + For applying StsUtil, use one instance per TradeFed STS invocation. Ideally, + construct it before running any tests, while the passed device is in a + known good state. Call fix_result_file_fingerprints() after each completed + run, before rerunning. + + Applying StsUtil to non-STS TradeFed runs does not help, but should also not + affect the results in any way. + """ + + def __init__( + self, device_serial_or_address, logger, device_access_timeout_secs=60 + ): + """Construct a StsUtil instance for a TradeFed invocation. + + Args: + device_serial_or_address (str): + Serial number or network address of the device that will be used + to determine the reference fingerprint. + logger (logging.Logger): + Logger instance to redirect messages to. + device_access_timeout_secs (int): + Timeout in seconds for `adb` calls. + """ + + self.device_serial_or_address = device_serial_or_address + self.logger = logger + self.device_access_timeout_secs = device_access_timeout_secs + # Try reading the device fingerprint now. There is a better chance that + # the device is in a good state now than after a test run.
If reading + # fails here, however, we can still retry in + # fix_result_file_fingerprints(). + try: + self.device_fingerprint = self.read_device_fingerprint() + except subprocess.CalledProcessError: + self.device_fingerprint = None + + def read_device_fingerprint(self): + """Read the fingerprint of device_serial_or_address via adb. + + Returns: + str: + Fingerprint of the device. + + Raises: + subprocess.CalledProcessError: + If the communication with `adb` does not lead to + expected results. + """ + + fingerprint = subprocess.check_output( + [ + "adb", + "-s", + self.device_serial_or_address, + "shell", + "getprop", + "ro.build.fingerprint", + ], + universal_newlines=True, + timeout=self.device_access_timeout_secs, + ).rstrip() + + self.logger.debug("Device reports fingerprint '%s'", fingerprint) + + return fingerprint + + def fix_result_file_fingerprints(self, result_dir): + """Fix STS-manipulated device fingerprints in result files. + + This will replace the fingerprint in the result files with the correct + fingerprint as reported by the device. + + Args: + result_dir (str): + Path to the result directory of the STS run to fix. This folder + must contain a test_result.xml and test_result_failures.html, + which are both present in a result folder of a completed + TradeFed run. + + Raises: + subprocess.CalledProcessError: + If the device fingerprint could not be determined via adb. + """ + + if self.device_fingerprint is None: + self.device_fingerprint = self.read_device_fingerprint() + + test_result_path = os.path.join(result_dir, "test_result.xml") + test_result_path_orig = test_result_path + ".orig" + shutil.move(test_result_path, test_result_path_orig) + + test_result_failures_path = os.path.join( + result_dir, "test_result_failures.html" + ) + test_result_failures_path_orig = test_result_failures_path + ".orig" + shutil.move(test_result_failures_path, test_result_failures_path_orig) + + # Find the manipulated fingerprint in the result XML. 
+ test_result_tree = ET.parse(test_result_path_orig) + result_build_node = test_result_tree.getroot().find("Build") + manipulated_fingerprint = result_build_node.get("build_fingerprint") + + self.logger.debug( + "Reverting STS manipulated device fingerprint: '%s' -> '%s'", + manipulated_fingerprint, + self.device_fingerprint, + ) + + # Fix the fingerprint in the result file. + result_build_node.set("build_fingerprint", self.device_fingerprint) + test_result_tree.write(test_result_path) + + # Fix the fingerprint in the failures overview HTML. + with open( + test_result_failures_path_orig, "r" + ) as test_result_failures_file: + test_result_failures = test_result_failures_file.read().replace( + manipulated_fingerprint, self.device_fingerprint + ) + with open(test_result_failures_path, "w") as test_result_failures_file: + test_result_failures_file.write(test_result_failures) diff --git a/automated/android/multinode/tradefed/tradefed-multinode.sh b/automated/android/multinode/tradefed/tradefed-multinode.sh new file mode 100755 index 0000000..567b09a --- /dev/null +++ b/automated/android/multinode/tradefed/tradefed-multinode.sh @@ -0,0 +1,158 @@ +#!/bin/sh -ex + +# shellcheck disable=SC1091 +. ../../../lib/sh-test-lib +# shellcheck disable=SC1091 +. 
../../../lib/android-test-lib + +TIMEOUT_SECS="300" +DEVICE_WORKER_MAPPING_FILE="" +TEST_URL="http://testdata.validation.linaro.org/cts/android-cts-7.1_r1.zip" +TEST_PARAMS="run cts -m CtsBionicTestCases --disable-reboot --skip-preconditions --skip-device-info" +TEST_RETRY_PARAMS="run cts --disable-reboot --skip-preconditions --skip-device-info" +MAX_NUM_RUNS="10" +RUNS_IF_UNCHANGED="3" +TEST_PATH="android-cts" +STATE_CHECK_FREQUENCY_SECS="60" +RESULT_FORMAT="aggregated" +RESULT_FILE="$(pwd)/output/result.txt" +export RESULT_FILE +# the default number of failed test cases to be printed +FAILURES_PRINTED="0" +# WIFI AP SSID +AP_SSID="" +# WIFI AP KEY +AP_KEY="" +JAVA_OPTIONS="-Xmx350M" + +usage() { + cat <<heredoc +Usage: +$0 [-o timeout_secs] [ -m device_worker_mapping_file] [-c cts_url] +[-t test_params] [-u test_retry_params] [-i max_num_runs] [-n runs_if_unchanged] +[-p test_path] [-s state_check_frequency_secs] [-r <aggregated|atomic>] +[-f failures_printed] [-a <ap_ssid>] [-k <ap_key>] [-j <java_options>] +heredoc + exit 1 +} + +while getopts ':o:m:c:t:u:i:n:p:s:r:f:a:k:j:' opt; do + case "${opt}" in + o) TIMEOUT_SECS="${OPTARG}" ;; + m) DEVICE_WORKER_MAPPING_FILE="${OPTARG}" ;; + c) TEST_URL="${OPTARG}" ;; + t) TEST_PARAMS="${OPTARG}" ;; + u) TEST_RETRY_PARAMS="${OPTARG}" ;; + i) MAX_NUM_RUNS="${OPTARG}" ;; + n) RUNS_IF_UNCHANGED="${OPTARG}" ;; + p) TEST_PATH="${OPTARG}" ;; + s) STATE_CHECK_FREQUENCY_SECS="${OPTARG}" ;; + r) RESULT_FORMAT="${OPTARG}" ;; + f) FAILURES_PRINTED="${OPTARG}" ;; + a) AP_SSID="${OPTARG}" ;; + k) AP_KEY="${OPTARG}" ;; + j) JAVA_OPTIONS="${OPTARG}" ;; + *) usage ;; + esac +done + +if [ -e "/home/testuser" ]; then + export HOME=/home/testuser +fi + +ANDROID_SERIALS="" +if [ -n "${DEVICE_WORKER_MAPPING_FILE}" ]; then + deviceWorkerMapping="$(grep -ve '^$' "${DEVICE_WORKER_MAPPING_FILE}")" + for deviceToWorker in ${deviceWorkerMapping}; do + ANDROID_SERIALS="${ANDROID_SERIALS}$(echo "${deviceToWorker}" | cut -d';' -f1)," + done +fi 
+ANDROID_SERIALS="${ANDROID_SERIALS%,}" + + +IFS=","; for ANDROID_SERIAL in ${ANDROID_SERIALS}; do + info_msg "Processing device ${ANDROID_SERIAL}" + export ANDROID_SERIAL; wait_boot_completed "${TIMEOUT_SECS}" +done; unset IFS + +IFS=","; for ANDROID_SERIAL in ${ANDROID_SERIALS}; do + info_msg "Processing device ${ANDROID_SERIAL}" + export ANDROID_SERIAL; disable_suspend +done; unset IFS + +# wait_homescreen() searches logcat output for +# 'Displayed com.android.launcher', but the log might be washed away when +# a lot of logs are generated after it. When the function is not executed in +# time, an error occurs. This has been observed several times on lkft +# testing. Refer to the following link: + # https://lkft.validation.linaro.org/scheduler/job/18918#L4721 +# We are already using wait_boot_completed() to check boot status, let's +# comment out wait_homescreen() and see if wait_boot_completed() is +# sufficient. +# wait_homescreen "${TIMEOUT}" + +# Increase the heap size. KVM devices in LAVA default to ~250M of heap +# This, however, breaks STS: the sts-tradefed script checks only the first line +# of `java -version` output, which becomes `Picked up _JAVA_OPTIONS: ...`. +# cts-tradefed checks for the first two lines and is therefore more robust here. +if [ "${TEST_PATH}" != "android-sts" ]; then + export _JAVA_OPTIONS="${JAVA_OPTIONS}" +fi +java -version + +# Download CTS/VTS test package or copy it from local disk.
+if echo "${TEST_URL}" | grep "^http" ; then + wget -S --progress=dot:giga "${TEST_URL}" +else + cp "${TEST_URL}" ./ +fi +file_name=$(basename "${TEST_URL}") +unzip -q "${file_name}" +rm -f "${file_name}" + +if [ -d "${TEST_PATH}/results" ]; then + mv "${TEST_PATH}/results" "${TEST_PATH}/results_$(date +%Y%m%d%H%M%S)" +fi + +# FIXME removing timer-suspend from vts test as it breaks the testing in lava +if [ -e "${TEST_PATH}/testcases/vts/testcases/kernel/linux_kselftest/kselftest_config.py" ]; then + sed -i "/suspend/d" "${TEST_PATH}"/testcases/vts/testcases/kernel/linux_kselftest/kselftest_config.py +fi + +# try to connect wifi if AP information specified +IFS=","; for ANDROID_SERIAL in ${ANDROID_SERIALS}; do + info_msg "Processing device ${ANDROID_SERIAL}" + export ANDROID_SERIAL; adb_join_wifi "${AP_SSID}" "${AP_KEY}" +done; unset IFS + +# Run tradefed test. +info_msg "About to run tradefed shell on following devices: ${ANDROID_SERIALS}" + +# This part is critical: if this is set, TradeFed will only connect to the one specified device. +unset ANDROID_SERIAL + +runner_exited_cleanly="pass" +./tradefed-runner-multinode.py -t "${TEST_PARAMS}" -u "${TEST_RETRY_PARAMS}" -i "${MAX_NUM_RUNS}" \ + -n "${RUNS_IF_UNCHANGED}" -p "${TEST_PATH}" -s "${STATE_CHECK_FREQUENCY_SECS}" \ + -r "${RESULT_FORMAT}" -f "${FAILURES_PRINTED}" -m "${DEVICE_WORKER_MAPPING_FILE}" \ + || runner_exited_cleanly="fail" + +# "fail" here means that an unexpected error/exception occurred in the runner. +# Expected exceptions will be caught in the runner and reported via +# `tradefed-test-run fail` +if [ "${runner_exited_cleanly}" = "fail" ]; then + warn_msg "The TradeFed runner reported failure." 
+fi +echo "TradeFed-runner-exited-cleanly ${runner_exited_cleanly}" | tee -a "${RESULT_FILE}" + +IFS=","; for ANDROID_SERIAL in ${ANDROID_SERIALS}; do + info_msg "Processing device ${ANDROID_SERIAL}" + export ANDROID_SERIAL; disable_suspend false || true +done; unset IFS + +unset ANDROID_SERIAL + +if [ "${runner_exited_cleanly}" = "fail" ]; then + # Report failure to complete back to the test shell. + exit 1 +fi diff --git a/automated/android/multinode/tradefed/tradefed-multinode.yaml b/automated/android/multinode/tradefed/tradefed-multinode.yaml new file mode 100644 index 0000000..8c02eb6 --- /dev/null +++ b/automated/android/multinode/tradefed/tradefed-multinode.yaml @@ -0,0 +1,131 @@ +metadata: + name: cts + format: "Lava-Test-Shell Test Definition 1.0" + description: "Run tradefed based tests in LAVA." + maintainer: + - milosz.wasilewski@linaro.org + - chase.qi@linaro.org + os: + - debian + - ubuntu + devices: + - lxc + scope: + - functional + +params: + SKIP_INSTALL: "false" + # Specify timeout in seconds for wait_boot_completed and wait_homescreen. + TIMEOUT_SECS: "300" + # Download CTS package or copy it from local disk. + # CTS_URL: "/root/android-cts/linaro/7.1_r1/android-cts-7.1_r1.zip" + TEST_URL: "http://testdata.validation.linaro.org/cts/android-cts-7.1_r1.zip" + TEST_PARAMS: "run cts -m CtsBionicTestCases --abi arm64-v8a --disable-reboot --skip-preconditions --skip-device-info" + TEST_RETRY_PARAMS: "" + # Determine the number of shards automatically and add the + # `${TEST_PARAM_SHARDS_COUNT} N` parameter to the CTS invocation. + TEST_PARAMS_AUTO_SHARDS: "true" + TEST_PARAM_SHARDS_COUNT: "--shards" + MAX_NUM_RUNS: "10" + RUNS_IF_UNCHANGED: "3" + # set to the name of the top directory in TEST_URL archive + # This should be 'android-cts' for CTS and android-vts for VTS + TEST_PATH: "android-cts" + STATE_CHECK_FREQUENCY_SECS: "60" + # Specify result format: aggregated or atomic + RESULTS_FORMAT: "aggregated" + # Specify url and token for file uploading. 
+ URL: "https://archive.validation.linaro.org/artifacts/team/qa/" + TOKEN: "" + ARTIFACTORIAL_UPLOAD_ATTEMPTS: 5 + ARTIFACTORIAL_UPLOAD_RETRY_WAIT_MINUTES: 5 + AP_SSID: "" + AP_KEY: "" + JAVA_OPTIONS: "-Xmx350M" + # Specify the failures number to be printed + FAILURES_PRINTED: "0" + # File listing local and remote adb devices to be used by TradeFed. + # This file must contain lines in the format <device>[;<workerId>] + # For devices attached via adb tcpip, <device> is the device network address + # with port and <workerId> is the LAVA MultiNode worker job id. + # For devices locally connected via USB, <device> the serial number of the + # device and <workerId> must be empty. + DEVICE_WORKER_MAPPING_FILE: "/tmp/deviceWorkerMapping" + # Let the whole test run fail if the test runner failed to exit cleanly. + RAISE_ON_FAILURE: "true" + +run: + steps: + - . ./automated/lib/sh-test-lib # for error_msg + - | + if [ -z "${DEVICE_WORKER_MAPPING_FILE}" -o ! -f "${DEVICE_WORKER_MAPPING_FILE}" ]; then + lava-test-raise "Parameter DEVICE_WORKER_MAPPING_FILE must be defined and point to an existing file." + fi + - lava-install-packages --no-install-recommends aapt curl default-jre-headless python3-pexpect sudo usbutils wget xz-utils zip + # delete the test user to clean environment + - userdel testuser -r -f || true + # create test use to run the cts/vts tests + - useradd -m testuser && echo "testuser created successfully" + - cd ./automated/android/multinode/tradefed + - chown testuser:testuser . + - | + if [ "${TEST_PARAMS_AUTO_SHARDS}" = "true" ]; then + num_shards="$(lava-role list | grep -v '^$' | wc -l)" + info_msg "Determined number of shards based on MultiNode role counts: ${num_shards}" + if [ "${num_shards}" -ne 1 ]; then + TEST_PARAMS="${TEST_PARAMS} ${TEST_PARAM_SHARDS_COUNT} ${num_shards}" + if [ -n "${TEST_RETRY_PARAMS}" ]; then + TEST_RETRY_PARAMS="${TEST_RETRY_PARAMS} ${TEST_PARAM_SHARDS_COUNT} ${num_shards}" + fi + fi + fi + # Run the actual TradeFed script. 
PATH is passed through to make lava MultiNode commands + # available in the CTS execution scripts. + - | + exec_result=0 + sudo -u testuser env "PATH=${PATH}" ./tradefed-multinode.sh \ + -o "${TIMEOUT_SECS}" -c "${TEST_URL}" -t "${TEST_PARAMS}" \ + -u "${TEST_RETRY_PARAMS}" -i "${MAX_NUM_RUNS}" \ + -n "${RUNS_IF_UNCHANGED}" -p "${TEST_PATH}" \ + -s "${STATE_CHECK_FREQUENCY_SECS}" -r "${RESULTS_FORMAT}" \ + -m "${DEVICE_WORKER_MAPPING_FILE}" -f "${FAILURES_PRINTED}" \ + -a "${AP_SSID}" -k "${AP_KEY}" -j "${JAVA_OPTIONS}" \ + || exec_result=$? + # Upload test log and result files to artifactorial. + - cp -r ./${TEST_PATH}/results ./output/ || true + - cp -r ./${TEST_PATH}/logs ./output/ || true + # Include logs dumped from TF shell 'd l' command. + - if ls /tmp/tradefed*; then cp -r /tmp/tradefed* ./output || true; fi + - tar caf tradefed-output-$(date +%Y%m%d%H%M%S).tar.xz ./output + - ATTACHMENT=$(ls tradefed-output-*.tar.xz) + - | + for i in $(seq "${ARTIFACTORIAL_UPLOAD_ATTEMPTS}"); do + attachment_result=0 + ../../../utils/upload-to-artifactorial.sh -a "${ATTACHMENT}" -u "${URL}" -t "${TOKEN}" -v -r || attachment_result=$? + if [ "${attachment_result}" -eq 0 ]; then break; fi + if [ "${i}" -lt "${ARTIFACTORIAL_UPLOAD_ATTEMPTS}" ]; then + echo "WARNING: Upload to Artifactorial failed, waiting ${ARTIFACTORIAL_UPLOAD_RETRY_WAIT_MINUTES} minutes and retrying..." + sleep "${ARTIFACTORIAL_UPLOAD_RETRY_WAIT_MINUTES}m" + else + echo "WARNING: Upload to Artifactorial failed too often, not retrying anymore." + fi + done + # Send test result to LAVA. + - ../../../utils/send-to-lava.sh ./output/result.txt || true + - userdel testuser -f -r || true + # When adb devices are lost, mark the test job as 'incomplete' + - | + lost_devices="" + for device in $(awk -F';' 'length {print $1}' "${DEVICE_WORKER_MAPPING_FILE}"); do + if ! 
adb -s "${device}" shell echo ok; then + lost_devices="${lost_devices}${device}, " + fi + done + lost_devices="${lost_devices%??}" + if [ "${lost_devices}" ]; then + warn_msg "Following adb devices are lost: ${lost_devices}" + fi + - | + if [ "${exec_result}" -ne 0 -a "${RAISE_ON_FAILURE}" = "true" ]; then + lava-test-raise "Test runner did not exit cleanly." + fi diff --git a/automated/android/multinode/tradefed/tradefed-runner-multinode.py b/automated/android/multinode/tradefed/tradefed-runner-multinode.py new file mode 100755 index 0000000..e2da6ec --- /dev/null +++ b/automated/android/multinode/tradefed/tradefed-runner-multinode.py @@ -0,0 +1,392 @@ +#!/usr/bin/env python3 + +import argparse +import datetime +import logging +import os +import pexpect +import re +import subprocess +import sys +import time + + +sys.path.insert(0, '../../../lib/') +sys.path.insert(1, '../../') +import py_test_lib # nopep8 +import tradefed.result_parser as result_parser # nopep8 +from multinode.tradefed.utils import * # nopep8 +from multinode.tradefed.sts_util import StsUtil # nopep8 + + +OUTPUT = '%s/output' % os.getcwd() +RESULT_FILE = '%s/result.txt' % OUTPUT +TRADEFED_STDOUT = '%s/tradefed-stdout.txt' % OUTPUT +TRADEFED_LOGCAT = '%s/tradefed-logcat-%s.txt' % (OUTPUT, '%s') + + +parser = argparse.ArgumentParser() +parser.add_argument('-t', dest='TEST_PARAMS', required=True, + help="TradeFed shell test parameters") +parser.add_argument('-u', dest='TEST_RETRY_PARAMS', required=False, + help="TradeFed shell test parameters for TradeFed session retry") +parser.add_argument('-i', dest='MAX_NUM_RUNS', required=False, default=10, type=int, + help="Maximum number of TradeFed runs. Based on the first run, retries can be \ + triggered to stabilize the results of the test suite.") +parser.add_argument('-n', dest='RUNS_IF_UNCHANGED', required=False, default=3, type=int, + help="Number of runs while the number of failures and completed modules does \ + not change. 
Results are considered to be stable after this number of runs.") +parser.add_argument('-p', dest='TEST_PATH', required=True, + help="path to TradeFed package top directory") +parser.add_argument('-s', dest='STATE_CHECK_FREQUENCY_SECS', required=False, default=60, type=int, + help="Every STATE_CHECK_FREQUENCY_SECS seconds, the state of connected devices is \ + checked and the last few lines TradeFed output are printed. Increase this time \ + for large test suite runs to reduce the noise in the LAVA logs.") +parser.add_argument('-r', dest='RESULTS_FORMAT', required=False, + default=result_parser.TradefedResultParser.AGGREGATED, + choices=[result_parser.TradefedResultParser.AGGREGATED, + result_parser.TradefedResultParser.ATOMIC], + help="The format of the saved results. 'aggregated' means number of \ + passed and failed tests are recorded for each module. 'atomic' means \ + each test result is recorded separately") +parser.add_argument('-m', dest='DEVICE_WORKER_MAPPING_FILE', required=True, + help="File listing adb devices to be used for testing. For devices connected \ + via adb TCP/IP, the LAVA worker job id should be given as second column, \ + separated by semicolon. Individual lines in that files will look like \ + \"some_device_serial\" or \"some_device_ip;worker_host_id\"") + +# The total number of failed test cases to be printed for this job +# Print too much failures would cause the lava job timed out +# Default to not print any failures +parser.add_argument('-f', dest='FAILURES_PRINTED', type=int, + required=False, default=0, + help="Specify the number of failed test cases to be\ + printed, 0 means not print any failures.") + +args = parser.parse_args() + +if os.path.exists(OUTPUT): + suffix = datetime.datetime.now().strftime('%Y%m%d%H%M%S') + shutil.move(OUTPUT, '%s_%s' % (OUTPUT, suffix)) +os.makedirs(OUTPUT) + +# Setup logger. +# There might be an issue in lava/local dispatcher, most likely problem of +# pexpect. 
It prints the messages from print() last, not by sequence. +# Use logging and subprocess.run() to work around this. +logger = logging.getLogger("TradefedRunnerMultinode") +logger.setLevel(logging.DEBUG) +ch = logging.StreamHandler() +ch.setLevel(logging.DEBUG) +formatter = logging.Formatter('%(asctime)s - %(name)s: %(levelname)s: %(message)s') +ch.setFormatter(formatter) +logger.addHandler(ch) + +devices = [] +try: + with open(args.DEVICE_WORKER_MAPPING_FILE) as mappingFile: + for line in filter(None, (line.rstrip() for line in mappingFile)): + deviceToWorker = line.split(sep=";") + device_address = deviceToWorker[0] + worker_job_id = (None if (len(deviceToWorker) == 1 or not deviceToWorker[1]) + else deviceToWorker[1]) + devices.append(Device(device_address, TRADEFED_LOGCAT % device_address, worker_job_id)) +except OSError as e: + logger.error("Mapping file cannot be opened: %s" % args.DEVICE_WORKER_MAPPING_FILE) + sys.exit(1) + +logger.info('Configured devices:') +for device in devices: + if device.worker_job_id is None: + logger.info("%s (locally connected via USB)" % device.serial_or_address) + else: + logger.info("%s (remote worker job id: %s)" % (device.serial_or_address, device.worker_job_id)) + + +def release_all_devices(): + for device in devices: + device.release() + + +def cleanup_and_exit(exit_code=0, message=None): + if message: + logger.error(message) + release_all_devices() + sys.exit(exit_code) + + +tradefed_stdout = open(TRADEFED_STDOUT, 'w') + +logger.info('Test params: %s' % args.TEST_PARAMS) +logger.info('Starting TradeFed shell and waiting for device detection...') + +command = None +prompt = None +results_heading_re = None +results_line_re = None +valid_test_paths = ['android-cts', 'android-gts', 'android-sts'] +if args.TEST_PATH in valid_test_paths: + suite = args.TEST_PATH[-3:] + command = "android-%s/tools/%s-tradefed" % (suite, suite) + prompt = "%s-tf >" % suite + results_heading_re = 
re.compile(r'Session\s+Pass\s+Fail\s+Modules\s+Complete\s+Result Directory\s+Test Plan\s+Device serial\(s\)\s+Build ID\s+Product') + results_line_re_without_session = r'\s+(\d+\s+){3,3}(of)\s+\d+\s+' + +if command is None: + cleanup_and_exit(1, "Not supported path: %s" % args.TEST_PATH) + +if args.TEST_PATH == 'android-sts': + stsUtil = StsUtil(devices[0].serial_or_address, logger) + +# Locate and parse test result. +result_dir_parent = os.path.join(args.TEST_PATH, 'results') + + +def last_result_dir(): + latest_subdir = next(reversed(sorted([ + d for d in os.listdir(result_dir_parent) + if os.path.isdir(os.path.join(result_dir_parent, d)) + ]))) + + return os.path.join(result_dir_parent, latest_subdir) + + +device_detected_re = re.compile(r'DeviceManager: Detected new device ') +device_detected_search_re = re.compile(r'DeviceManager: Detected new device .*$', flags=re.M) +tradefed_start_retry_count = 5 +all_devices_names = set(device.serial_or_address for device in devices) +for tradefed_start_retry in range(tradefed_start_retry_count): + child = pexpect.spawnu(command, logfile=tradefed_stdout) + try: + devices_to_detect = all_devices_names.copy() + while devices_to_detect: + # Find and parse output lines following this pattern: + # 04-23 12:30:33 I/DeviceManager: Detected new device serial_or_address + child.expect(device_detected_re, timeout=30) + output_lines = subprocess.check_output(['tail', TRADEFED_STDOUT]).decode("utf-8") + matches = [match[1].strip() for match in + (device_detected_re.split(line_match) + for line_match in device_detected_search_re.findall(output_lines)) + if len(match) == 2 and match[1]] + for match in matches: + try: + devices_to_detect.remove(match) + except KeyError: + if match not in all_devices_names: + logger.debug('Unexpected device detected: %s' % match) + + except (pexpect.TIMEOUT, pexpect.EOF) as e: + logger.warning('TradeFed did not detect all devices. 
Checking device availability and restarting TradeFed...') + print(e) + child.terminate(force=True) + missing_devices = [device for device in devices + if device.serial_or_address in devices_to_detect] + for device in missing_devices: + if not device.ensure_available(logger=logger): + cleanup_and_exit( + 1, + 'adb device %s is not available and reconnection attempts failed. Aborting.' + % device.serial_or_address) + +if devices_to_detect: + cleanup_and_exit( + 1, + 'TradeFed did not detect all available devices after %s retries. Aborting.' + % tradefed_start_retry_count) + +logger.info('Starting TradeFed shell test.') +try: + child.expect(prompt, timeout=60) + child.sendline(args.TEST_PARAMS) +except pexpect.TIMEOUT: + result = 'lunch-tf-shell fail' + py_test_lib.add_result(RESULT_FILE, result) + +retry_check = RetryCheck(args.MAX_NUM_RUNS, args.RUNS_IF_UNCHANGED) + +# Loop while TradeFed is running. +# This loop will rerun TradeFed if requested, until the number of failures stabilizes or a maximum +# number of retries is reached. +# Meanwhile, try to keep all devices accessible. For remote devices, use handshakes to inform remote +# workers that their locally connected device needs to be reset. +# The worker host side of the LAVA MultiNode messages is implemented in +# wait-and-keep-local-device-accessible.yaml +fail_to_complete = False +# Assuming TradeFed is started from a clean environment, the first run will have the id 0 +# Each retry gets a new session id. +tradefed_session_id = 0 +result_summary = None +while child.isalive(): + subprocess.run('echo') + subprocess.run(['echo', '--- line break ---']) + logger.info('Checking adb connectivity...') + for device in devices: + device.ensure_available(logger=logger) + num_available_devices = sum(device.is_available() for device in devices) + if num_available_devices < len(devices): + logger.debug('Some devices are lost. 
Dumping state of adb/USB devices.') + child.sendline('dump logs') + subprocess.run(['sh', '-c', '. ../../../lib/sh-test-lib && . ../../../lib/android-test-lib ' + '&& adb_debug_info']) + logger.debug('"adb devices" output') + subprocess.run(['adb', 'devices']) + + if num_available_devices == 0: + logger.error('adb connection to all devices lost!! Will wait for 5 minutes and ' + 'terminating TradeFed shell test!') + time.sleep(300) + child.terminate(force=True) + result = 'check-adb-connectivity fail' + py_test_lib.add_result(RESULT_FILE, result) + fail_to_complete = True + break + + logger.info("Currently available devices: %s" % + [device.serial_or_address for device in devices if device.is_available()]) + + # Check if all tests finished every minute. + m = child.expect(['ResultReporter: Full Result:', + 'ConsoleReporter:.*Test run failed to complete.', + pexpect.TIMEOUT], + timeout=args.STATE_CHECK_FREQUENCY_SECS) + + # TradeFed run not finished yet, continue to wait. + if m == 2: + # Flush pexpect input buffer. + child.expect(['.+', pexpect.TIMEOUT, pexpect.EOF], timeout=1) + logger.info('Printing tradefed recent output...') + subprocess.run(['tail', TRADEFED_STDOUT]) + continue + + # A module or test run failed to complete. This is a case for TradeFed retry + if m == 1: + fail_to_complete = True + logger.warning('TradeFed reported failure to complete a module.') + # TradeFed didn't report completion yet, so keep going. + continue + + assert m == 0 + + # All tests finished. Check if rerunning is necessary/sensible. + # Once all tests and reruns finished, exit from TradeFed shell to throw EOF, + # which sets child.isalive() to false. 
+ try: + logger.debug('Checking TradeFed session result...') + child.expect(prompt, timeout=60) + child.sendline('list results') + child.expect(results_heading_re, timeout=60) + results_line_re = \ + re.compile('(%s)%s' % + (str(tradefed_session_id), # Expect the current session ID in the output + results_line_re_without_session)) + child.expect(results_line_re, timeout=60) + output_lines = subprocess.check_output(['tail', TRADEFED_STDOUT]) + output_lines_match = results_line_re.search(str(output_lines)) + if output_lines_match is None: + cleanup_and_exit( + 1, + 'Unexpected TradeFed output. Could not find expected results line for the current ' + 'TradeFed session (%s)' % str(tradefed_session_id)) + # Expected column contents: see results_heading_re + result_line_columns = re.split(r'\s+', output_lines_match.group()) + pass_count = result_line_columns[1] + failure_count = result_line_columns[2] + modules_completed = result_line_columns[3] + modules_total = result_line_columns[5] + timestamp = result_line_columns[6] + result_summary = ResultSummary(failure_count, modules_completed, modules_total, timestamp) + retry_check.post_result(result_summary) + logger.info('Finished TradeFed session %s. %s of %s modules completed with %s passed ' + 'tests and %s failures.' + % (tradefed_session_id, str(modules_completed), + str(modules_total), str(pass_count), str(failure_count))) + except (pexpect.TIMEOUT, pexpect.EOF) as e: + logger.error('Unexpected TradeFed output/behavior while trying to fetch test run results. ' + 'Printing the exception and killing the TradeFed process...') + print(e) + child.terminate(force=True) + fail_to_complete = True + break + + # Preparing for rerunning or releasing results. + # A workaround is required here for STS; It patches the device fingerprint + # that is stored in the result files, to make it look like a 'user' build + # with 'release-keys'. 
+ # That actually breaks the TradeFed retry feature, as the stored fingerprint + # won't match anymore with the fingerprint reported by the device. + if suite == 'sts': + try: + stsUtil.fix_result_file_fingerprints(last_result_dir()) + except subprocess.CalledProcessError as e: + fail_to_complete = True + print(e) + logger.error('Could not apply workarounds for STS due to an ' + 'adb-related error. Cannot continue with TradeFed ' + 'reruns; results might be incomplete.') + child.terminate(force=True) + break + + # Retry if necessary and applicable. + # NOTE: both checks here should be equivalent, but checking both of them might make the TradeFed + # output parsing more reliable. + if not result_summary.was_successful() or fail_to_complete: + if args.TEST_RETRY_PARAMS is None: + logger.debug('NOT retrying TradeFed session as TEST_RETRY_PARAMS is not defined.') + elif not retry_check.should_continue(): + logger.info('NOT retrying TradeFed session as maximum number of retries is reached.') + else: + logger.info('Retrying with results of session %s' % tradefed_session_id) + try: + child.expect(prompt, timeout=60) + child.sendline('%s --retry %s' % (args.TEST_RETRY_PARAMS, str(tradefed_session_id))) + tradefed_session_id += 1 + fail_to_complete = False # Reset as we have a new chance to complete. + except pexpect.TIMEOUT: + print(e) + logger.error('Timeout while starting a TradeFed retry. Force killing the child process...') + child.terminate(force=True) + fail_to_complete = True + break + continue + + try: + child.expect(prompt, timeout=60) + logger.debug('Sending "exit" command to TF shell...') + child.sendline('exit') + child.expect(pexpect.EOF, timeout=60) + logger.debug('Child process ended properly.') + except pexpect.TIMEOUT as e: + # The Tradefed shell is hanging longer than expected for some reason. + # We need to kill it, but that most likely doesn't affect the results of + # previously finished test runs, so don't report failure. 
import logging
import re
import shutil
import subprocess
import time


class Device:
    """An Android device reachable through adb, together with its logcat capture.

    The device is addressed either by its serial number or, for adb TCP/IP
    connections, by an ``ip:port`` address. While the object is in use, an
    ``adb logcat`` child process streams the device log into a file.
    """

    # "a.b.c.d:port", i.e. the address form used for adb TCP/IP connections.
    tcpip_device_re = re.compile(
        r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}$"
    )
    # Worker handshakes are only performed when lava-send is on the PATH.
    EXEC_IN_LAVA = shutil.which("lava-send") is not None

    def __init__(
        self, serial_or_address, logcat_output_filename, worker_job_id=None
    ):
        """
        Open the logcat output file and start capturing the device log.

        Keyword arguments:
        serial_or_address -- adb serial number or "ip:port" address of the device
        logcat_output_filename -- file that receives the "adb logcat" output
        worker_job_id -- id used in the names of the handshake messages exchanged
                         with the hosting worker; None disables the handshakes
        """
        self.serial_or_address = serial_or_address
        self.is_tcpip_device = bool(
            Device.tcpip_device_re.match(self.serial_or_address)
        )
        self.logcat_output_file = open(logcat_output_filename, "w")
        try:
            self.logcat = self._start_logcat()
        except OSError:
            # Do not leak the file handle when adb cannot be started.
            self.logcat_output_file.close()
            raise
        self.worker_job_id = worker_job_id
        self.worker_handshake_iteration = 1
        self._is_available = True

    def _start_logcat(self):
        """Spawn "adb logcat" for this device, writing into the logcat file."""
        return subprocess.Popen(
            ["adb", "-s", self.serial_or_address, "logcat"],
            stdout=self.logcat_output_file,
        )

    def _stop_logcat(self):
        """Kill the logcat child and reap it, so that no zombie process is left."""
        self.logcat.kill()
        self.logcat.wait()

    def ensure_available(self, logger, timeout_secs=30):
        """
        High level function that encapsulates all logic for ensuring that a device is accessible.
        Returns a boolean indicating if this function succeeded. This function will only return once
        the device is available or no other options for reestablishing a connection are known.

        Keyword arguments:
        logger -- logging.getLogger() object to paste some debug information
        timeout_secs -- timeout passed to check_available() for the initial adb check
        """
        if self.check_available(timeout_secs=timeout_secs):
            self._is_available = True
            logger.info("adb device %s is alive" % self.serial_or_address)
            # Tell the hosting worker that everything is fine
            self.worker_handshake("continue")
            return self._is_available

        self._is_available = False

        logger.debug(
            "adb connection to %s lost! Trying to reconnect..."
            % self.serial_or_address
        )

        # Tell the hosting worker that something is broken
        # This call will only return once the device is up and running again, if possible.
        self.worker_handshake("reconnect")

        if not self.try_reconnect():
            logger.warning(
                "adb connection to %s lost and reconnect failed!"
                % self.serial_or_address
            )
            return self._is_available

        logger.debug("Successfully reconnected to %s!" % self.serial_or_address)

        # TODO should check if TradeFed detected the device.

        self._is_available = True
        return self._is_available

    def is_available(self):
        """
        High level function that checks if the last ensure_available()
        invocation led to a positive result.
        """
        return self._is_available

    def check_available(self, timeout_secs=30):
        """
        Return True if a trivial "adb shell echo" on the device exits with status 0
        within timeout_secs seconds (enforced by the external "timeout" command).
        """
        return (
            subprocess.run(
                [
                    "timeout",
                    str(timeout_secs),
                    "adb",
                    "-s",
                    self.serial_or_address,
                    "shell",
                    "echo",
                    "%s:" % self.serial_or_address,
                    "OK",
                ]
            ).returncode == 0
        )

    def try_reconnect(self, reconnectTimeoutSecs=60):
        """
        Try to bring the device back. Returns True on success, False otherwise.

        Devices that are not addressed via TCP/IP are rebooted out of fastboot and
        awaited with wait_boot_completed; TCP/IP devices are disconnected and
        connected again, after which the logcat capture is restarted.
        """
        # NOTE: When running inside LAVA, self.is_tcpip_device == (self.worker_job_id is not None).
        # However, when running this script directly, there is no such thing as a remote worker ID,
        # and reconnect attempts to remote devices may still be useful.
        if not self.is_tcpip_device:
            # On local devices, we can currently only try to recover from fastboot.
            # This would be a good point for a hard reset.
            # NOTE: If the boot/reboot process takes longer than the specified timeout, this
            # function will return failure, but the device can still become accessible in the next
            # iteration of device availability checks.
            fastboot_reboot_timeout_secs = (
                10
            )  # There is no point in waiting longer for fastboot
            subprocess.run(
                [
                    "timeout",
                    str(fastboot_reboot_timeout_secs),
                    "fastboot",
                    "-s",
                    self.serial_or_address,
                    "reboot",
                ]
            )
            boot_timeout_secs = max(
                10, int(reconnectTimeoutSecs) - fastboot_reboot_timeout_secs
            )
            return (
                subprocess.run(
                    [
                        "sh",
                        "-c",
                        ". ../../../lib/sh-test-lib && . ../../../lib/android-test-lib && "
                        'export ANDROID_SERIAL="%s" && wait_boot_completed %s'
                        % (self.serial_or_address, boot_timeout_secs),
                    ]
                ).returncode == 0
            )

        # adb may not yet have realized that the connection is broken
        subprocess.run(["adb", "disconnect", self.serial_or_address])
        time.sleep(
            5
        )  # adb connect ~often~ fails when called ~directly~ after disconnect.

        if (
            subprocess.run(
                [
                    "timeout",
                    str(reconnectTimeoutSecs),
                    "adb",
                    "connect",
                    self.serial_or_address,
                ]
            ).returncode != 0
        ):
            return False
        if not self.check_available():
            return False
        # reestablish logcat connection
        self._stop_logcat()
        self.logcat = self._start_logcat()
        return True

    def release(self):
        """Stop the logcat capture, close its file and send "release" to the worker."""
        self._stop_logcat()
        self.logcat_output_file.close()
        self.worker_handshake("release")

    def worker_handshake(self, command):
        """
        This function implements the counterpart of wait-and-keep-local-device-accessible.yaml
        It is basically a no-op when running outside LAVA.

        Keyword arguments:
        command -- value sent as "command=<command>" with lava-send; every command
                   except "release" is followed by a lava-wait for the worker's reply

        Always returns True.
        """

        # Nothing to do for local devices and nothing to do when not called by LAVA.
        if self.worker_job_id is None or not Device.EXEC_IN_LAVA:
            self.worker_handshake_iteration += 1
            return True

        # All commands except release are followed by a lava-send from the worker side.
        wait_for_ack = command != "release"

        subprocess.run(
            [
                "lava-send",
                "master-sync-%s-%s"
                % (self.worker_job_id, str(self.worker_handshake_iteration)),
                "command=%s" % command,
            ]
        )
        if wait_for_ack:
            subprocess.run(
                [
                    "lava-wait",
                    "worker-sync-%s-%s"
                    % (
                        self.worker_job_id,
                        str(self.worker_handshake_iteration),
                    ),
                ]
            )
            # TODO could check result variable from MultiNode cache
        self.worker_handshake_iteration += 1
        return True


class RetryCheck:
    """
    Decide whether another run is worthwhile, based on the results posted so far.

    should_continue() returns True while fewer than total_max_retries results
    were posted and the latest result has been seen fewer than
    retries_if_unchanged times in a row.
    """

    def __init__(self, total_max_retries, retries_if_unchanged):
        self.total_max_retries = total_max_retries
        self.retries_if_unchanged = retries_if_unchanged
        self.current_retry = 0
        self.current_unchanged = 0
        self.last_value = None

    def post_result(self, value):
        """Record the result of a finished run; value is compared with "=="."""
        self.current_retry += 1
        if value == self.last_value:
            self.current_unchanged += 1
        else:
            # A different result restarts the "unchanged" streak at one.
            self.current_unchanged = 1
            self.last_value = value

    def should_continue(self):
        """Return True if neither of the two limits has been reached yet."""
        return (
            self.current_retry < self.total_max_retries
            and self.current_unchanged < self.retries_if_unchanged
        )


class ResultSummary:
    """
    Key figures of one result listing line. The three counters are converted
    with int(), so numeric strings are accepted; timestamp is stored as given.
    """

    def __init__(
        self, failure_count, modules_completed, modules_total, timestamp
    ):
        self.failure_count = int(failure_count)
        self.modules_completed = int(modules_completed)
        self.modules_total = int(modules_total)
        self.timestamp = timestamp

    def was_successful(self):
        """Return True if there are no failures and all modules completed."""
        return self.failure_count == 0 and self.all_modules_completed()

    def all_modules_completed(self):
        """Return True if the completed module count equals the total."""
        return self.modules_completed == self.modules_total

    def __eq__(self, other):
        # Test the *other* operand: unrelated objects may not even have a
        # __dict__, and must fall back to Python's default comparison.
        if isinstance(other, ResultSummary):
            return self.__dict__ == other.__dict__
        return NotImplemented

    # Instances are mutable and compare by value, so they stay unhashable
    # (this is also what Python does implicitly when only __eq__ is defined).
    __hash__ = None
./automated/lib/android-test-lib \ + && wait_boot_completed \"${BOOT_TIMEOUT_SECS}\"" \ + || ret_val=$? + + if [ "${ret_val}" -ne 0 ]; then + result=false + echo "WARNING: Reconnect attempt failed: target did not boot up or is not accessible." + return + fi + + if [ "${ANDROID_ENABLE_WIFI}" = "true" ]; then + ./automated/lib/android_ui_wifi.py -a set_wifi_state on || ret_val=$? + if [ "${ret_val}" -ne 0 ]; then + echo "WARNING: Cannot ensure that Wi-Fi is enabled in the device settings; UI automation failed." + fi + fi + + ret_val=0 + sh -c ". ./automated/lib/sh-test-lib && . ./automated/lib/android-test-lib \ + && . ./automated/lib/android-multinode-test-lib \ + && wait_network_connected \"${NETWORK_TIMEOUT_SECS}\" \ + && open_adb_tcpip_on_local_device \ + \"${ADB_TCPIP_ATTEMPTS}\" \"${ADB_CONNECT_TEST_TIMEOUT_SECS}\" \"${ADB_PORT}\"" \ + || ret_val=$? + + if [ "${ret_val}" -ne 0 ]; then + result=false + echo "WARNING: Reconnect attempt failed." + fi + } + - iteration=1 + - | + while true; do + lava-wait master-sync-$(lava-self)-${iteration} + + command="$(cat /tmp/lava_multi_node_cache.txt | grep "command" | sed 's/.*command=//' | grep -v '^$')" + + result="pass" + + case "${command}" in + continue) + ;; + release) + break + ;; + reconnect) + echo "Reconnect requested by master." + adb kill-server || true + adb devices || true + reconnect_device + ;; + *) + lava-test-raise "Script error. Unexpected message from master to worker, command=${command}" + esac + + lava-send worker-sync-$(lava-self)-${iteration} result=$result + + iteration="$(expr ${iteration} + 1)" + done + - echo "master released the device." 
+ - lava-test-set stop diff --git a/automated/android/multinode/wait-for-release-and-reset.yaml b/automated/android/multinode/wait-for-release-and-reset.yaml new file mode 100644 index 0000000..78a52b4 --- /dev/null +++ b/automated/android/multinode/wait-for-release-and-reset.yaml @@ -0,0 +1,21 @@ +metadata: + name: wait-for-release-and-reset + format: "Lava-Test-Shell Test Definition 1.0" + description: "Wait until a remote MultiNode role (master) requests to release the device. + Then, bring the device back into adb USB state." + maintainer: + - karsten@fairphone.com + - softwareteam@fairphone.com + os: + - debian + - ubuntu + devices: + - lxc + scope: + - functional + +run: + steps: + - lava-sync release_dut + - adb kill-server || true + - adb usb diff --git a/automated/lib/android-multinode-test-lib b/automated/lib/android-multinode-test-lib new file mode 100644 index 0000000..d5fe843 --- /dev/null +++ b/automated/lib/android-multinode-test-lib @@ -0,0 +1,213 @@ +#!/bin/sh + +# Configure adb to accept adb connections via TCP/IP and make sure that the device is actually +# accessible. +# This function assumes that the device has a network address. Guards around `adb tcpip` and test +# connection setups using `adb connect` are used to check if the device is reachable after this +# call. +# Globals: +# dut_address Set to "ip_address:adb_port" by this function +# Arguments: +# adb_tcpip_attempts Number of tries for enabling adb TCP/IP mode on the device +# timeout_secs Timeout for waiting for getting the IP address from the device +# adb_port Network port to use for adb TCP/IP +# Returns: +# 0 only if the device is accessible via adb TCP/IP, 1 otherwise. 
open_adb_tcpip_on_local_device() {
    # Switch the local device to adb TCP/IP mode and verify with a test
    # connection that it can be reached at "ip_address:adb_port".
    if [ "$#" -lt 2 ] || [ "$#" -gt 3 ]; then
        error_fatal "Usage: open_adb_tcpip_on_local_device adb_tcpip_attempts timeout_secs [adb_port]"
    fi
    local adb_tcpip_attempts="$1"
    local timeout_secs="$2"
    # An omitted or empty port falls back to the default port assumed by adb connect.
    local adb_port="${3:-5555}"

    # Common deadline for the `adb tcpip` retries and the test connection below.
    local end=$(( $(date +%s) + timeout_secs ))

    local ret_val=0
    local ip_address
    ip_address="$(get_ip_address "${timeout_secs}")" || ret_val=$?
    if [ "${ret_val}" -ne 0 ]; then
        warn_msg "get_ip_address failed unexpectedly."
        return 1
    fi
    if [ -z "${ip_address}" ]; then
        warn_msg "Device has no ip address (network not connected?)"
        return 1
    fi
    dut_address="${ip_address}:${adb_port}"

    # adb tcpip may fail with different reasons
    # (e.g., "error: protocol fault (couldn't read status): Connection reset by peer").
    # Just hope that it works after a few retries.
    local adb_tcpip_retry_wait_secs=10
    local attempt=0
    while [ "${attempt}" -lt "${adb_tcpip_attempts}" ] && [ "$(date +%s)" -lt "${end}" ]; do
        # Count the attempt; without this, adb_tcpip_attempts would never limit the loop.
        attempt=$(( attempt + 1 ))
        ret_val=0
        adb tcpip "${adb_port}" || ret_val=$?
        if [ "${ret_val}" -eq 0 ]; then
            break
        fi
        info_msg "adb tcpip apparently failed. Retrying in a moment..."
        sleep "${adb_tcpip_retry_wait_secs}"
        adb usb || true  # In between, make sure to have some default state.
    done

    if [ "${ret_val}" -ne 0 ]; then
        warn_msg "Could not prepare the device for adb TCP/IP connections: adb tcpip failed."
        return 1
    fi

    # `adb tcpip` sometimes takes some time
    # (on some builds, up to 10 seconds were observed)
    local success=false
    while [ "$(date +%s)" -lt "${end}" ]; do
        if [ "$(adb connect "${dut_address}" | grep -c '^connected to ')" -eq 1 ]; then
            success=true
            break
        fi
        sleep 1
    done

    # Make sure the device is not reserved to the local adb server.
    adb disconnect "${dut_address}" >/dev/null 2>&1 || true

    if [ "${success}" = false ]; then
        warn_msg "Could not prepare the device for adb TCP/IP connections: device is not reachable via network."
        return 1
    fi
}

# Make this device accessible via adb TCP/IP and send its address via handshake to a waiting role.
# NOTE: This function must only be called once per role per test submission, as LAVA does not allow
# to use the same MultiNode message ID multiple times.
# One job instance must call connect_to_remote_adb_tcpip_devices to receive the sent addresses and
# complete the handshake.
# See open_adb_tcpip_on_local_device and connect_to_remote_adb_tcpip_devices
# Globals:
#   dut_address         Set to "ip_address:adb_port" by this function
# Arguments:
#   adb_tcpip_attempts  Number of tries for enabling adb TCP/IP mode on the device
#   timeout_secs        Timeout for waiting for getting the IP address from the device
#   adb_port            Network port to use for adb TCP/IP
# Returns:
#   0 only if the device is accessible via adb TCP/IP, 1 otherwise.
share_local_device_over_adb_tcpip() {
    if [ "$#" -lt 2 ] || [ "$#" -gt 3 ]; then
        error_fatal "Usage: share_local_device_over_adb_tcpip adb_tcpip_attempts timeout_secs [adb_port]"
    fi
    local adb_tcpip_attempts="$1"
    local timeout_secs="$2"
    local adb_port="$3"

    local ret_val=0
    open_adb_tcpip_on_local_device "${adb_tcpip_attempts}" "${timeout_secs}" "${adb_port}" || ret_val=$?
    if [ "${ret_val}" -ne 0 ]; then
        return "${ret_val}"
    fi

    # The address is sent between the two synchronization points of the handover.
    lava-sync start_handover
    lava-send dut_address dut_address="${dut_address}"
    lava-sync finish_handover
}

# Counterpart to share_local_device_over_adb_tcpip
# Wait for other job instances to send their device address, guarded by a handshake for
# synchronization.
+# Globals: +# None +# Arguments: +# adb_connect_timeout_secs Timeout for waiting for getting the IP address from the device +# device_worker_mapping_file File to store a mapping between devices and their LAVA worker host +# in the format 'serial_or_address;worker_host_id'. This file is relevant for following +# functions to communicate with the devices or workers. +# Optional: This file will not be created if no path is specified. +# Returns: +# 0 on success 1 otherwise. +connect_to_remote_adb_tcpip_devices() { + [ "$#" -lt 1 -o "$#" -gt 2 ] && \ + error_fatal "Usage: connect_to_remote_adb_tcpip_devices adb_connect_timeout_secs [device_worker_mapping_file]" + + local adb_connect_timeout_secs="$1" + local device_worker_mapping_file="$2" + + lava-sync start_handover + # For lava-wait-all, all involved nodes must invoke lava-send with the same message id, + # otherwise lava-wait-all would lead to a dead lock. + # However, only the nodes that make their device accessible (workers) add the value + # dut_address="address:port". + lava-send dut_address + lava-wait-all dut_address + + # The MultiNode cache file might not exist if there is no other worker. 
+ local cache_lines + local device_worker_mapping="" + if [ -f "/tmp/lava_multi_node_cache.txt" ]; then + cache_lines="$(cat "/tmp/lava_multi_node_cache.txt" | grep "dut_address" | grep -v '^$' || true)" + + for line in ${cache_lines}; do + # <worker_job_id>:dut_address=<dut_address> + local dut_address="$(echo "$line"| sed 's/.*dut_address=//')" + local worker_host="$(echo "$line" | cut -d: -f1)" + device_worker_mapping="${device_worker_mapping}${dut_address};${worker_host}\n" + done + device_worker_mapping="$(printf "${device_worker_mapping}" | grep -v '^$' || true)" + fi + + lava-sync finish_handover + # adb is not super reliable, it too often sees connected and authorized devices as "offline" + adb kill-server || true + + # Connect to remote devices and wait until they appear online + + for device_to_worker in ${device_worker_mapping}; do + local device="$(echo ${device_to_worker} | cut -d';' -f1)" + for i in $(seq 5); do + local ret_val=0 + adb connect "${device}" || ret_val="$?" + if [ "${ret_val}" -eq 0 ]; then + break + fi + adb disconnect "${device}" || true + warn_msg "adb connect failed. Retrying in a minute..." + sleep 1m + done + done + + for device_to_worker in ${device_worker_mapping}; do + local device="$(echo ${device_to_worker} | cut -d';' -f1)" + if ! timeout "${adb_connect_timeout_secs}" adb -s "${device}" wait-for-device; then + warn_msg "adb wait-for-device for ${device} timed out after ${adb_connect_timeout_secs} seconds." + return 1 + fi + done + + local num_remote_devices="$(echo "${device_worker_mapping}" | wc -l)" + info_msg "All ${num_remote_devices} remote devices are connected and online." + + info_msg "Now adding devices locally connected via USB." + local connected_devices + local ret_val=0 + connected_devices="$(adb devices | grep -E '^([:\.[:alnum:]]+)\s+device$' | cut -f1)" || ret_val=$? + if [ "${ret_val}" -ne 0 ]; then + warn_msg "\"adb devices\" did not exit cleanly. Cannot reliably determine the list of connected devices." 
import re
import subprocess
import time


# One "<serial><whitespace>device" line of `adb devices` output. Serials may
# contain upper-case characters, and adb TCP/IP devices are listed as "ip:port".
ADB_DEVICES_PATTERN = re.compile(
    r"^([A-Za-z0-9.:_-]+)\s+device$", flags=re.M
)


class DeviceCommandError(Exception):
    """An error happened while sending a command to a device."""

    def __init__(self, serial, command, error_message):
        self.serial = serial
        self.command = command
        self.error_message = error_message
        message = "Command `{}` failed on {}: {}".format(
            command, serial, error_message
        )
        super(DeviceCommandError, self).__init__(message)


def adb(*args, serial=None, raise_on_error=True):
    """Run ADB command attached to serial.

    Example:
    >>> process = adb('shell', 'getprop', 'ro.build.fingerprint', serial='aserialnumber')
    >>> process.returncode
    0
    >>> process.stdout.strip()
    'ExampleVendor/Device/version/tags'

    :param *args:
        List of options to ADB (including command).
    :param str serial:
        Identifier for ADB connection to device.
    :param raise_on_error bool:
        Whether to raise a DeviceCommandError exception if the return code is
        less than 0.
    :returns subprocess.CompletedProcess:
        Completed process, or None if `adb start-server` returned a code less
        than 0 while raise_on_error is false.
    :raises DeviceCommandError:
        If a command returned a code less than 0 and raise_on_error is true.
    """
    # NOTE(review): only negative return codes are treated as errors here; an
    # adb command that exits with a positive status is returned to the caller
    # without raising. Confirm whether `!= 0` was intended.

    # Make sure the adb server is started to avoid the infamous "out of date"
    # message that pollutes stdout.
    ret = subprocess.run(
        ["adb", "start-server"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    if ret.returncode < 0:
        if raise_on_error:
            raise DeviceCommandError(
                serial if serial else "??", str(args), ret.stderr
            )
        return None

    command = ["adb"]
    if serial:
        command += ["-s", serial]
    command += list(args)
    ret = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )

    if raise_on_error and ret.returncode < 0:
        raise DeviceCommandError(
            serial if serial else "??", str(args), ret.stderr
        )

    return ret


def list_devices():
    """List serial numbers of devices attached to adb.

    Only devices that `adb devices` reports in the "device" state are returned.

    Raises:
        DeviceCommandError: If the underlying adb command failed.
    """
    process = adb("devices")
    return ADB_DEVICES_PATTERN.findall(process.stdout)


def unlock(dut):
    """Wake-up the device and unlock it.

    Parameters:
        dut: Device object; its `info["screenOn"]` entry and its `serial`
            attribute are read.
    Raises:
        DeviceCommandError: If the underlying adb commands failed.
    """
    if not dut.info["screenOn"]:
        adb("shell", "input keyevent KEYCODE_POWER", serial=dut.serial)
        time.sleep(1)

    # Make sure we are on the home screen.
    adb("shell", "input keyevent KEYCODE_HOME", serial=dut.serial)
    # The KEYCODE_MENU input is enough to unlock a "swipe up to unlock"
    # lockscreen on Android 6, but unfortunately not Android 7. So we use a
    # swipe up (that depends on the screen resolution) instead.
    adb("shell", "input touchscreen swipe 930 880 930 380", serial=dut.serial)
    time.sleep(1)
    adb("shell", "input keyevent KEYCODE_HOME", serial=dut.serial)
import argparse
import sys

from android_adb_wrapper import DeviceCommandError, adb, list_devices, unlock
from uiautomator import Device


def set_wifi_state(dut, turn_on):
    """Turn WiFi on or off.

    This checks the current WiFi settings and turns it on or off. It does
    nothing if the settings are already in the desired state.

    Parameters:
        dut (Device): The device object; its `serial` attribute must be set.
        turn_on: Boolean, true for on, false for off
    Raises:
        DeviceCommandError: If the UI automation fails.
    """
    # Open the Wi-Fi settings
    adb(
        "shell",
        ("am start -a android.settings.WIFI_SETTINGS " "--activity-clear-task"),
        serial=dut.serial,
    )

    # Check if there is an option to turn WiFi on or off
    wifi_enabler = dut(
        text="OFF", resourceId="com.android.settings:id/switch_widget"
    )
    wifi_disabler = dut(
        text="ON", resourceId="com.android.settings:id/switch_widget"
    )

    # Exactly one of the two switch states is expected to be on screen.
    if not wifi_enabler.exists and not wifi_disabler.exists:
        raise DeviceCommandError(
            dut.serial,
            "UI: set Wi-Fi state",
            "Neither switch for turning Wi-Fi on nor for turning it off are present.",
        )
    if wifi_enabler.exists and wifi_disabler.exists:
        raise DeviceCommandError(
            dut.serial,
            "UI: set Wi-Fi state",
            "Unexpected UI: Both, a switch for turning Wi-Fi on and for turning it off are present.",
        )

    if turn_on:
        if wifi_enabler.exists:
            wifi_enabler.click()
        else:
            print("Wi-Fi is already enabled.")
    else:
        if wifi_disabler.exists:
            wifi_disabler.click()
        else:
            print("Wi-Fi is already disabled.")

    # Leave the settings
    dut.press.back()


def main():
    """Parse the command line and apply the requested action to each device.

    Exits with status 1 if the action is not supported or if the action failed
    with a DeviceCommandError on at least one device.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-a",
        dest="ACTION",
        required=True,
        nargs="+",
        help="Action to perform. Following action is currently implemented: "
        "set_wifi_state <on|off>",
    )
    parser.add_argument(
        "-s",
        dest="SERIALS",
        nargs="+",
        help="Serial numbers of devices to configure. "
        "If not present, all available devices will be configured.",
    )
    args = parser.parse_args()

    # The length check keeps "-a set_wifi_state" (without on/off) from
    # failing with an IndexError instead of the error message below.
    if (
        len(args.ACTION) < 2
        or args.ACTION[0] != "set_wifi_state"
        or args.ACTION[1] not in ("on", "off")
    ):
        print(
            "ERROR: Specified ACTION is not supported: {}".format(args.ACTION),
            file=sys.stderr,
        )
        sys.exit(1)

    turn_on = args.ACTION[1] == "on"
    serials = args.SERIALS if args.SERIALS is not None else list_devices()

    failed = False
    for serial in serials:
        print("Configuring device {}…".format(serial))

        dut = Device(serial)
        # Work around the not-so-easy Device class
        dut.serial = serial

        try:
            unlock(dut)

            set_wifi_state(dut, turn_on)

        except DeviceCommandError as e:
            # Keep going with the remaining devices, but remember the failure
            # so that callers checking the exit status can notice it.
            print("ERROR {}".format(e), file=sys.stderr)
            failed = True

    if failed:
        sys.exit(1)


if __name__ == "__main__":
    main()