Bug 1924533: Run macrobenchmark test on CI to measure Baseline Profile impact on performance r=releng-reviewers,android-reviewers,bhearsum,calu

This patch adds a job called `run-macrobenchmark-firebase-fenix` that runs a macrobenchmark on a physical device on Firebase Test Lab to measure the impact of the baseline profile on app startup.

Differential Revision: https://phabricator.services.mozilla.com/D229019
Author: Titouan Thibaud
Date: 2025-05-20 07:29:45 +00:00
Committed by: tthibaud@mozilla.com
Parent: f2dca3c70d
Commit: abe18b0bd1
7 changed files with 302 additions and 0 deletions


@@ -0,0 +1,39 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Google Cloud Documentation: https://cloud.google.com/sdk/gcloud/reference/firebase/test/android/run
# Flank Documentation: https://flank.github.io/flank/
gcloud:
  results-bucket: fenix_test_artifacts
  record-video: true
  timeout: 15m
  async: false
  num-flaky-test-attempts: 1
  app: /app/path
  test: /test/path
  auto-google-login: false
  use-orchestrator: false
  environment-variables:
    clearPackageData: true
  directories-to-pull:
    - /sdcard/Download
    - /sdcard/Android/media/org.mozilla.fenix.benchmark/
  performance-metrics: true
  test-targets:
    - class org.mozilla.fenix.benchmark.BaselineProfilesStartupBenchmark
  device:
    - model: shiba
      version: 34
      locale: en_US

flank:
  project: GOOGLE_PROJECT
  max-test-shards: 1
  num-test-runs: 1
  output-style: compact
  full-junit-result: true
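
Note that `app` and `test` above are placeholders; the CI task substitutes the real APK paths before handing the config to Flank. A minimal sketch of that substitute-and-run step (the `flank.jar` location, the helper name, and the exact Flank invocation are illustrative assumptions, not taken from this patch):

# Sketch only: patch the placeholder APK paths into the Flank config, then
# invoke Flank. Assumes a local flank.jar; Flank's "android run" subcommand
# reads a YAML config via --config.
import subprocess


def run_flank(config_path, app_apk, test_apk, flank_jar="flank.jar"):
    with open(config_path) as f:
        config = f.read()
    # Swap the /app/path and /test/path placeholders for the real APKs.
    config = config.replace("/app/path", app_apk).replace("/test/path", test_apk)
    patched_path = "flank-patched.yml"
    with open(patched_path, "w") as f:
        f.write(config)
    subprocess.run(
        ["java", "-jar", flank_jar, "android", "run", f"--config={patched_path}"],
        check=True,
    )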


@@ -66,5 +66,6 @@ class BaselineProfilesStartupBenchmark {
        },
    ) {
        startActivityAndWait()
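        // Kill the app process so the next iteration measures a full cold start.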
        killProcess()
    }
}


@@ -890,3 +890,11 @@ Run baseline profile generation for Android on Firebase TestLab.
update
------------
Run tests to see if the executable can be updated to the latest release.

run-macrobenchmark-firebase
---------------------------
Run the macrobenchmark for Android on Firebase TestLab.

instrumented-build-macrobenchmark-apk
-------------------------------------
Generate the instrumented APKs used to run the macrobenchmark for Android apps.


@@ -0,0 +1,54 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
---
loader: taskgraph.loader.transform:loader

transforms:
    - android_taskgraph.transforms.build_android_app:transforms
    - gecko_taskgraph.transforms.job:transforms
    - gecko_taskgraph.transforms.task:transforms

kind-dependencies:
    - toolchain
    - build-fat-aar
    - generate-baseline-profile-firebase

task-defaults:
    attributes:
        retrigger: true
    fetches:
        toolchain:
            - linux64-android-sdk-linux-repack
            - linux64-jdk-repack
            - linux64-node
    run:
        using: gradlew
    treeherder:
        kind: test
        tier: 1

tasks:
    fenix:
        description: 'Generate macrobenchmark apks for fenix with baseline profile.'
        attributes:
            shipping-product: fenix
        dependencies:
            build-fat-aar: build-fat-aar-android-geckoview-fat-aar/opt
            baseline-profile: generate-baseline-profile-firebase-fenix
        source-project-name: "fenix"
        run:
            gradle-package-command: ":benchmark:assembleBenchmark :app:assembleBenchmark"
            baseline-profile-path: /builds/worker/fetches/baselineProfiles
        treeherder:
            symbol: fenix(instr)
            platform: fenix-android-all/opt
        worker:
            artifacts:
                - name: "public/build/target.arm64-v8a.apk"
                  path: "/builds/worker/workspace/obj-build/gradle/build/mobile/android/fenix/app/outputs/apk/fenix/benchmark/app-fenix-arm64-v8a-benchmark.apk"
                  type: "file"
                - name: "public/build/target.noarch.apk"
                  path: "/builds/worker/workspace/obj-build/gradle/build/mobile/android/fenix/benchmark/outputs/apk/benchmark/benchmark-benchmark.apk"
                  type: "file"
            chain-of-trust: true


@@ -0,0 +1,81 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
---
loader: taskgraph.loader.transform:loader

transforms:
    - gecko_taskgraph.transforms.test_apk:transforms
    # While not a build, the build optimization works well for these tasks.
    - gecko_taskgraph.transforms.build_schedules:transforms
    - gecko_taskgraph.transforms.job:transforms
    - gecko_taskgraph.transforms.task:transforms

kind-dependencies:
    - toolchain
    - instrumented-build-macrobenchmark-apk

task-defaults:
    attributes:
        build_platform: android
        build-type: debug
        retrigger: true
    fetches:
        toolchain:
            - android-sdk-linux
    optimization:
        skip-unless-backstop: null
    worker-type: b-linux-medium-gcp
    worker:
        docker-image: {in-tree: android-ui-tests}
        max-run-time: 7200
        env:
            GOOGLE_APPLICATION_CREDENTIALS: '.firebase_token.json'
            ANDROID_SDK_ROOT: /builds/worker/fetches/android-sdk-linux
        artifacts:
            - name: public
              path: /builds/worker/artifacts
              type: directory
        retry-exit-status: [20]
    treeherder:
        kind: test
        tier: 1
    run:
        use-caches: false
        using: run-commands
        dummy-secrets:
            - content: "faketoken"
              path: .adjust_token

tasks:
    fenix:
        attributes:
            build-type: fenix-debug-firebase
            shipping-product: fenix
        description: Run macrobenchmark for Fenix
        dependencies:
            instrumented-apk: instrumented-build-macrobenchmark-apk-fenix
        fetches:
            instrumented-apk:
                - artifact: target.arm64-v8a.apk
                  extract: false
                - artifact: target.noarch.apk
                  extract: false
        run:
            secrets:
                - name: project/mobile/firefox-android/fenix/firebase
                  key: firebaseToken
                  path: .firebase_token.json
                  json: true
            pre-commands:
                - ["cd", "mobile/android/fenix"]
            commands:
                - [python3, ../../../taskcluster/scripts/tests/test-lab.py, arm64-v8a-macrobenchmark, /builds/worker/fetches/target.arm64-v8a.apk, --apk_test, /builds/worker/fetches/target.noarch.apk]
                - [python3, ../../../taskcluster/scripts/tests/copy-artifacts-from-ftl.py, "macrobenchmark"]
                - [python3, ../../../taskcluster/scripts/tests/compute-macrobenchmark-results.py, "/builds/worker/artifacts/build/macrobenchmark.json", "/builds/worker/artifacts/build/macrobenchmark.txt"]
        treeherder:
            platform: fenix-android-all/opt
            symbol: fenix(run)
        worker:
            env:
                GOOGLE_PROJECT: moz-fenix
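
The `commands` list above encodes a three-step pipeline: run the benchmark APKs on Firebase Test Lab, pull the raw `benchmarkData.json` out of the results bucket, then convert it into Perfherder output. A rough local equivalent using the same scripts and paths as the task (a sketch only, assuming the worker's directory layout and that it is run from `mobile/android/fenix`):

# Sketch of the task's three commands; assumes the worker's directory layout.
import subprocess

steps = [
    # 1. Upload the app and benchmark APKs to Firebase Test Lab and run the test.
    ["python3", "../../../taskcluster/scripts/tests/test-lab.py",
     "arm64-v8a-macrobenchmark",
     "/builds/worker/fetches/target.arm64-v8a.apk",
     "--apk_test", "/builds/worker/fetches/target.noarch.apk"],
    # 2. Copy the *benchmarkData.json artifact back from the FTL results bucket.
    ["python3", "../../../taskcluster/scripts/tests/copy-artifacts-from-ftl.py",
     "macrobenchmark"],
    # 3. Turn the raw benchmark JSON into PERFHERDER_DATA and a text summary.
    ["python3", "../../../taskcluster/scripts/tests/compute-macrobenchmark-results.py",
     "/builds/worker/artifacts/build/macrobenchmark.json",
     "/builds/worker/artifacts/build/macrobenchmark.txt"],
]

for step in steps:
    subprocess.run(step, check=True)  # stop at the first failing step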


@@ -0,0 +1,100 @@
import json
import sys


def read_benchmark_data(file_path):
    """Reads the JSON file and returns the benchmark results as a dictionary."""
    with open(file_path) as file:
        data = json.load(file)

    # Extract benchmarks data
    benchmarks = data["benchmarks"]
    results = {}
    for benchmark in benchmarks:
        name = benchmark["name"]
        time_metrics = benchmark["metrics"]["timeToInitialDisplayMs"]
        results[name] = {
            "median": time_metrics["median"],
            "minimum": time_metrics["minimum"],
            "maximum": time_metrics["maximum"],
        }
    return results


def calculate_improvements(results):
    """Calculates percentage improvements between startup with and without baseline profiles."""
    improvements = {}
    for metric in ("median", "minimum", "maximum"):
        without_profile = results["startupNone"][metric]
        with_profile = results["startupPartialWithBaselineProfiles"][metric]
        improvements[metric] = f"{((without_profile - with_profile) / without_profile) * 100:.2f}%"
    return improvements


def format_output_content(results):
    """Formats the output content into the specified JSON structure."""
    # Map to transform result names to subtest entries
    baseline_map = {
        "startupPartialWithBaselineProfiles": "baseline",
        "startupNone": "no_baseline",
    }

    # Construct the subtests list
    subtests = []
    for result_name, metrics in results.items():
        baseline_mode = baseline_map.get(result_name, "unknown")
        for metric_name, value in metrics.items():
            subtest = {
                "name": f"cold_startup.{baseline_mode}.{metric_name}",
                "lowerIsBetter": True,
                "value": value,
                "unit": "ms",
            }
            subtests.append(subtest)

    # Define the base JSON structure using the subtests list
    output_json = {
        "framework": {"name": "mozperftest"},
        "application": {"name": "fenix"},
        "suites": [
            {
                "name": "baseline-profile:fenix",
                "type": "coldstart",
                "unit": "ms",
                "extraOptions": [],
                "lowerIsBetter": True,
                "subtests": subtests,
            }
        ],
    }
    return output_json


def output_results(output_json, output_file_path):
    """Writes the output JSON to a specified file and prints it in a compacted format to the console."""
    # Convert JSON structure to a compacted one-line string
    compact_json = json.dumps(output_json)
    # Print in the format Perfherder ingests from the task log
    print(f"PERFHERDER_DATA: {compact_json}")

    # Write the pretty-formatted JSON to the file
    with open(output_file_path, "w") as output_file:
        output_file.write(json.dumps(output_json, indent=3))
    print(f"Results have been written to {output_file_path}")


# Main script logic
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: python compute-macrobenchmark-results.py <input_json_path> <output_file_path>")
    else:
        input_json_path = sys.argv[1]
        output_file_path = sys.argv[2]

        # Process the benchmark data
        results = read_benchmark_data(input_json_path)
        improvements = calculate_improvements(results)
        # Surface the relative improvement in the task log
        print(f"Baseline profile improvements: {improvements}")
        output_json = format_output_content(results)
        output_results(output_json, output_file_path)
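
For reference, the input shape `read_benchmark_data` expects can be read off the code above. The timing values in this sketch are invented for illustration; only the field names and the two benchmark names are what the script actually requires:

# Illustrative input for compute-macrobenchmark-results.py. Field names match
# what read_benchmark_data() accesses; all timing values are made up.
example_input = {
    "benchmarks": [
        {
            "name": "startupNone",
            "metrics": {
                "timeToInitialDisplayMs": {"median": 250.0, "minimum": 240.0, "maximum": 280.0}
            },
        },
        {
            "name": "startupPartialWithBaselineProfiles",
            "metrics": {
                "timeToInitialDisplayMs": {"median": 200.0, "minimum": 190.0, "maximum": 230.0}
            },
        },
    ]
}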


@@ -58,6 +58,7 @@ class Worker(Enum):
    RESULTS_DIR = "/builds/worker/artifacts/results"
    BASELINE_PROFILE_DEST = "/builds/worker/artifacts/build/baseline-prof.txt"
    MACROBENCHMARK_DEST = "/builds/worker/artifacts/build/macrobenchmark.json"
    ARTIFACTS_DIR = "/builds/worker/artifacts"
@@ -70,6 +71,9 @@ class ArtifactType(Enum):
"artifacts/sdcard/Android/media/org.mozilla.fenix.benchmark/*-baseline-prof.txt"
)
CRASH_LOG = "data_app_crash*.txt"
MACROBENCHMARK = (
"artifacts/sdcard/Android/media/org.mozilla.fenix.benchmark/*benchmarkData.json"
)
MATRIX_IDS = "matrix_ids.json"
@@ -247,6 +251,8 @@ def process_artifacts(artifact_type):
    if artifact_type == ArtifactType.BASELINE_PROFILE:
        return process_baseline_profile_artifact(root_gcs_path, device_names)
    elif artifact_type == ArtifactType.MACROBENCHMARK:
        return process_macrobenchmark_artifact(root_gcs_path, device_names)
    else:
        return process_crash_artifacts(root_gcs_path, device_names)
@@ -262,6 +268,17 @@ def process_baseline_profile_artifact(root_gcs_path, device_names):
    gsutil_cp(artifact, Worker.BASELINE_PROFILE_DEST.value)


def process_macrobenchmark_artifact(root_gcs_path, device_names):
    device = device_names[0]
    # Check the result list before indexing so an empty fetch exits cleanly
    # instead of raising an IndexError.
    artifacts = fetch_artifacts(
        root_gcs_path, device, ArtifactType.MACROBENCHMARK.value
    )
    if not artifacts:
        exit_with_error(f"No artifacts found for device: {device}")
    gsutil_cp(artifacts[0], Worker.MACROBENCHMARK_DEST.value)


def process_crash_artifacts(root_gcs_path, failed_device_names):
    crashes_reported = 0
    for device in failed_device_names:
@@ -295,6 +312,8 @@ def main():
    artifact_type_arg = sys.argv[1]
    if artifact_type_arg == "baseline_profile":
        process_artifacts(ArtifactType.BASELINE_PROFILE)
    elif artifact_type_arg == "macrobenchmark":
        process_artifacts(ArtifactType.MACROBENCHMARK)
    elif artifact_type_arg == "crash_log":
        process_artifacts(ArtifactType.CRASH_LOG)
    else: