Bug 1924533: Run macrobenchmark test on CI to measure Baseline Profile impact on performance r=releng-reviewers,android-reviewers,bhearsum,calu

This patch adds a job called `run-macrobenchmark-firebase-fenix` that runs a macrobenchmark on a physical device on Firebase Test Lab to measure the impact of the baseline profile on app startup.

Differential Revision: https://phabricator.services.mozilla.com/D229019
Author: Titouan Thibaud
Date: 2025-05-20 07:29:45 +00:00
Committed by: tthibaud@mozilla.com
Parent: f2dca3c70d
Commit: abe18b0bd1
7 changed files with 302 additions and 0 deletions


@@ -0,0 +1,39 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Google Cloud Documentation: https://cloud.google.com/sdk/gcloud/reference/firebase/test/android/run
# Flank Documentation: https://flank.github.io/flank/
gcloud:
  results-bucket: fenix_test_artifacts
  record-video: true
  timeout: 15m
  async: false
  num-flaky-test-attempts: 1
  app: /app/path
  test: /test/path
  auto-google-login: false
  use-orchestrator: false
  environment-variables:
    clearPackageData: true
  directories-to-pull:
    - /sdcard/Download
    - /sdcard/Android/media/org.mozilla.fenix.benchmark/
  performance-metrics: true
  test-targets:
    - class org.mozilla.fenix.benchmark.BaselineProfilesStartupBenchmark
  device:
    - model: shiba
      version: 34
      locale: en_US

flank:
  project: GOOGLE_PROJECT
  max-test-shards: 1
  num-test-runs: 1
  output-style: compact
  full-junit-result: true
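
Note that `app` and `test` above are placeholders; the CI task substitutes the real APK paths before handing the config to Flank. A minimal sketch of that substitute-and-run step (the `flank.jar` location, the helper name, and the exact Flank invocation are illustrative assumptions, not taken from this patch):

# Sketch only: patch the placeholder APK paths into the Flank config, then
# invoke Flank. Assumes a local flank.jar; Flank's "android run" subcommand
# reads a YAML config via --config.
import subprocess


def run_flank(config_path, app_apk, test_apk, flank_jar="flank.jar"):
    with open(config_path) as f:
        config = f.read()
    # Swap the /app/path and /test/path placeholders for the real APKs.
    config = config.replace("/app/path", app_apk).replace("/test/path", test_apk)
    patched_path = "flank-patched.yml"
    with open(patched_path, "w") as f:
        f.write(config)
    subprocess.run(
        ["java", "-jar", flank_jar, "android", "run", f"--config={patched_path}"],
        check=True,
    )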


@@ -66,5 +66,6 @@ class BaselineProfilesStartupBenchmark {
        },
    ) {
        startActivityAndWait()
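        // Kill the app process so the next iteration measures a full cold start.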
        killProcess()
    }
}


@@ -890,3 +890,11 @@ Run baseline profile generation for Android on Firebase TestLab.
update
------------
Run tests to see if the executable can be updated to the latest release.

run-macrobenchmark-firebase
---------------------------
Run the macrobenchmark for Android on Firebase TestLab.

instrumented-build-macrobenchmark-apk
-------------------------------------
Generate the instrumented APKs used to run the macrobenchmark for Android apps.


@@ -0,0 +1,54 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
---
loader: taskgraph.loader.transform:loader

transforms:
    - android_taskgraph.transforms.build_android_app:transforms
    - gecko_taskgraph.transforms.job:transforms
    - gecko_taskgraph.transforms.task:transforms

kind-dependencies:
    - toolchain
    - build-fat-aar
    - generate-baseline-profile-firebase

task-defaults:
    attributes:
        retrigger: true
    fetches:
        toolchain:
            - linux64-android-sdk-linux-repack
            - linux64-jdk-repack
            - linux64-node
    run:
        using: gradlew
    treeherder:
        kind: test
        tier: 1

tasks:
    fenix:
        description: 'Generate macrobenchmark apks for fenix with baseline profile.'
        attributes:
            shipping-product: fenix
        dependencies:
            build-fat-aar: build-fat-aar-android-geckoview-fat-aar/opt
            baseline-profile: generate-baseline-profile-firebase-fenix
        source-project-name: "fenix"
        run:
            gradle-package-command: ":benchmark:assembleBenchmark :app:assembleBenchmark"
            baseline-profile-path: /builds/worker/fetches/baselineProfiles
        treeherder:
            symbol: fenix(instr)
            platform: fenix-android-all/opt
        worker:
            artifacts:
                - name: "public/build/target.arm64-v8a.apk"
                  path: "/builds/worker/workspace/obj-build/gradle/build/mobile/android/fenix/app/outputs/apk/fenix/benchmark/app-fenix-arm64-v8a-benchmark.apk"
                  type: "file"
                - name: "public/build/target.noarch.apk"
                  path: "/builds/worker/workspace/obj-build/gradle/build/mobile/android/fenix/benchmark/outputs/apk/benchmark/benchmark-benchmark.apk"
                  type: "file"
            chain-of-trust: true


@@ -0,0 +1,81 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
---
loader: taskgraph.loader.transform:loader

transforms:
    - gecko_taskgraph.transforms.test_apk:transforms
    # While not a build, the build optimization works well for these tasks.
    - gecko_taskgraph.transforms.build_schedules:transforms
    - gecko_taskgraph.transforms.job:transforms
    - gecko_taskgraph.transforms.task:transforms

kind-dependencies:
    - toolchain
    - instrumented-build-macrobenchmark-apk

task-defaults:
    attributes:
        build_platform: android
        build-type: debug
        retrigger: true
    fetches:
        toolchain:
            - android-sdk-linux
    optimization:
        skip-unless-backstop: null
    worker-type: b-linux-medium-gcp
    worker:
        docker-image: {in-tree: android-ui-tests}
        max-run-time: 7200
        env:
            GOOGLE_APPLICATION_CREDENTIALS: '.firebase_token.json'
            ANDROID_SDK_ROOT: /builds/worker/fetches/android-sdk-linux
        artifacts:
            - name: public
              path: /builds/worker/artifacts
              type: directory
        retry-exit-status: [20]
    treeherder:
        kind: test
        tier: 1
    run:
        use-caches: false
        using: run-commands
        dummy-secrets:
            - content: "faketoken"
              path: .adjust_token

tasks:
    fenix:
        attributes:
            build-type: fenix-debug-firebase
            shipping-product: fenix
        description: Run macrobenchmark for Fenix
        dependencies:
            instrumented-apk: instrumented-build-macrobenchmark-apk-fenix
        fetches:
            instrumented-apk:
                - artifact: target.arm64-v8a.apk
                  extract: false
                - artifact: target.noarch.apk
                  extract: false
        run:
            secrets:
                - name: project/mobile/firefox-android/fenix/firebase
                  key: firebaseToken
                  path: .firebase_token.json
                  json: true
            pre-commands:
                - ["cd", "mobile/android/fenix"]
            commands:
                - [python3, ../../../taskcluster/scripts/tests/test-lab.py, arm64-v8a-macrobenchmark, /builds/worker/fetches/target.arm64-v8a.apk, --apk_test, /builds/worker/fetches/target.noarch.apk]
                - [python3, ../../../taskcluster/scripts/tests/copy-artifacts-from-ftl.py, "macrobenchmark"]
                - [python3, ../../../taskcluster/scripts/tests/compute-macrobenchmark-results.py, "/builds/worker/artifacts/build/macrobenchmark.json", "/builds/worker/artifacts/build/macrobenchmark.txt"]
        treeherder:
            platform: fenix-android-all/opt
            symbol: fenix(run)
        worker:
            env:
                GOOGLE_PROJECT: moz-fenix
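
The `commands` list above encodes a three-step pipeline: run the benchmark APKs on Firebase Test Lab, pull the raw `benchmarkData.json` out of the results bucket, then convert it into Perfherder output. A rough local equivalent using the same scripts and paths as the task (a sketch only, assuming the worker's directory layout and that it is run from `mobile/android/fenix`):

# Sketch of the task's three commands; assumes the worker's directory layout.
import subprocess

steps = [
    # 1. Upload the app and benchmark APKs to Firebase Test Lab and run the test.
    ["python3", "../../../taskcluster/scripts/tests/test-lab.py",
     "arm64-v8a-macrobenchmark",
     "/builds/worker/fetches/target.arm64-v8a.apk",
     "--apk_test", "/builds/worker/fetches/target.noarch.apk"],
    # 2. Copy the *benchmarkData.json artifact back from the FTL results bucket.
    ["python3", "../../../taskcluster/scripts/tests/copy-artifacts-from-ftl.py",
     "macrobenchmark"],
    # 3. Turn the raw benchmark JSON into PERFHERDER_DATA and a text summary.
    ["python3", "../../../taskcluster/scripts/tests/compute-macrobenchmark-results.py",
     "/builds/worker/artifacts/build/macrobenchmark.json",
     "/builds/worker/artifacts/build/macrobenchmark.txt"],
]

for step in steps:
    subprocess.run(step, check=True)  # stop at the first failing step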


@@ -0,0 +1,100 @@
import json
import sys


def read_benchmark_data(file_path):
    """Reads the JSON file and returns the benchmark results as a dictionary."""
    with open(file_path) as file:
        data = json.load(file)

    # Extract benchmarks data
    benchmarks = data["benchmarks"]
    results = {}
    for benchmark in benchmarks:
        name = benchmark["name"]
        time_metrics = benchmark["metrics"]["timeToInitialDisplayMs"]
        results[name] = {
            "median": time_metrics["median"],
            "minimum": time_metrics["minimum"],
            "maximum": time_metrics["maximum"],
        }
    return results


def calculate_improvements(results):
    """Calculates percentage improvements between startup with and without baseline profiles."""
    improvements = {}
    for metric in ("median", "minimum", "maximum"):
        without_profile = results["startupNone"][metric]
        with_profile = results["startupPartialWithBaselineProfiles"][metric]
        improvements[metric] = f"{((without_profile - with_profile) / without_profile) * 100:.2f}%"
    return improvements


def format_output_content(results):
    """Formats the output content into the specified JSON structure."""
    # Map to transform result names to subtest entries
    baseline_map = {
        "startupPartialWithBaselineProfiles": "baseline",
        "startupNone": "no_baseline",
    }

    # Construct the subtests list
    subtests = []
    for result_name, metrics in results.items():
        baseline_mode = baseline_map.get(result_name, "unknown")
        for metric_name, value in metrics.items():
            subtest = {
                "name": f"cold_startup.{baseline_mode}.{metric_name}",
                "lowerIsBetter": True,
                "value": value,
                "unit": "ms",
            }
            subtests.append(subtest)

    # Define the base JSON structure using the subtests list
    output_json = {
        "framework": {"name": "mozperftest"},
        "application": {"name": "fenix"},
        "suites": [
            {
                "name": "baseline-profile:fenix",
                "type": "coldstart",
                "unit": "ms",
                "extraOptions": [],
                "lowerIsBetter": True,
                "subtests": subtests,
            }
        ],
    }
    return output_json


def output_results(output_json, output_file_path):
    """Writes the output JSON to a specified file and prints it in a compacted format to the console."""
    # Convert JSON structure to a compacted one-line string
    compact_json = json.dumps(output_json)
    # Print in the format Perfherder ingests from the task log
    print(f"PERFHERDER_DATA: {compact_json}")

    # Write the pretty-formatted JSON to the file
    with open(output_file_path, "w") as output_file:
        output_file.write(json.dumps(output_json, indent=3))
    print(f"Results have been written to {output_file_path}")


# Main script logic
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: python compute-macrobenchmark-results.py <input_json_path> <output_file_path>")
    else:
        input_json_path = sys.argv[1]
        output_file_path = sys.argv[2]

        # Process the benchmark data
        results = read_benchmark_data(input_json_path)
        improvements = calculate_improvements(results)
        # Surface the relative improvement in the task log
        print(f"Baseline profile improvements: {improvements}")
        output_json = format_output_content(results)
        output_results(output_json, output_file_path)
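
For reference, the input shape `read_benchmark_data` expects can be read off the code above. The timing values in this sketch are invented for illustration; only the field names and the two benchmark names are what the script actually requires:

# Illustrative input for compute-macrobenchmark-results.py. Field names match
# what read_benchmark_data() accesses; all timing values are made up.
example_input = {
    "benchmarks": [
        {
            "name": "startupNone",
            "metrics": {
                "timeToInitialDisplayMs": {"median": 250.0, "minimum": 240.0, "maximum": 280.0}
            },
        },
        {
            "name": "startupPartialWithBaselineProfiles",
            "metrics": {
                "timeToInitialDisplayMs": {"median": 200.0, "minimum": 190.0, "maximum": 230.0}
            },
        },
    ]
}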


@@ -58,6 +58,7 @@ class Worker(Enum):
    RESULTS_DIR = "/builds/worker/artifacts/results"
    BASELINE_PROFILE_DEST = "/builds/worker/artifacts/build/baseline-prof.txt"
    MACROBENCHMARK_DEST = "/builds/worker/artifacts/build/macrobenchmark.json"
    ARTIFACTS_DIR = "/builds/worker/artifacts"
@@ -70,6 +71,9 @@ class ArtifactType(Enum):
"artifacts/sdcard/Android/media/org.mozilla.fenix.benchmark/*-baseline-prof.txt"
)
CRASH_LOG = "data_app_crash*.txt"
MACROBENCHMARK = (
"artifacts/sdcard/Android/media/org.mozilla.fenix.benchmark/*benchmarkData.json"
)
MATRIX_IDS = "matrix_ids.json"
@@ -247,6 +251,8 @@ def process_artifacts(artifact_type):
    if artifact_type == ArtifactType.BASELINE_PROFILE:
        return process_baseline_profile_artifact(root_gcs_path, device_names)
    elif artifact_type == ArtifactType.MACROBENCHMARK:
        return process_macrobenchmark_artifact(root_gcs_path, device_names)
    else:
        return process_crash_artifacts(root_gcs_path, device_names)
@@ -262,6 +268,17 @@ def process_baseline_profile_artifact(root_gcs_path, device_names):
    gsutil_cp(artifact, Worker.BASELINE_PROFILE_DEST.value)


def process_macrobenchmark_artifact(root_gcs_path, device_names):
    device = device_names[0]
    # Check the result list before indexing so an empty fetch exits cleanly
    # instead of raising an IndexError.
    artifacts = fetch_artifacts(
        root_gcs_path, device, ArtifactType.MACROBENCHMARK.value
    )
    if not artifacts:
        exit_with_error(f"No artifacts found for device: {device}")
    gsutil_cp(artifacts[0], Worker.MACROBENCHMARK_DEST.value)


def process_crash_artifacts(root_gcs_path, failed_device_names):
    crashes_reported = 0
    for device in failed_device_names:
@@ -295,6 +312,8 @@ def main():
    artifact_type_arg = sys.argv[1]
    if artifact_type_arg == "baseline_profile":
        process_artifacts(ArtifactType.BASELINE_PROFILE)
    elif artifact_type_arg == "macrobenchmark":
        process_artifacts(ArtifactType.MACROBENCHMARK)
    elif artifact_type_arg == "crash_log":
        process_artifacts(ArtifactType.CRASH_LOG)
    else: