updateTrackingSinglePageApp() contains the core logic for when we observe same document loads. Based on conditions, it may record telemetry (engagement/abandonment), and dispatch events to the appropriate SearchSERPTelemetry actor. SearchSERPTelemetryChild had to be modified to allow for manual dispatching of events, especially unloading event listeners in the case where a user navigates to a non-default search page, like Images or Shopping. One complication in observing clicks is that we're storing search pages based on the URL at the time of load, and then looking them up again using the originURL, but it might be different from the currentURL due to the way single page apps work. So this could cause confusion if multiple SERPs are open and different from their originURL, hence the usage of recovering the correct state map by first using the browser object and defaulting back to the originURL if the WeakMap doesn't contain any of the browser objects. I also created a mock SPA html file that, when given a search query parameter, shows mock results. The helper file head-spa.js contains a class that navigates the pages in tests. The tests are broken out into the following: - When a single search provider is opened in a single tab - When a single search provider has multiple tabs open - When multiple providers are open at the same time - Event listeners on in-content elements are unloaded properly when navigating away from the page Differential Revision: https://phabricator.services.mozilla.com/D193320
2509 lines
79 KiB
JavaScript
2509 lines
79 KiB
JavaScript
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
|
|
|
|
// Holder object for the lazily-defined modules, services and preferences
// used throughout this file.
const lazy = {};

ChromeUtils.defineESModuleGetters(lazy, {
  BrowserSearchTelemetry: "resource:///modules/BrowserSearchTelemetry.sys.mjs",
  PrivateBrowsingUtils: "resource://gre/modules/PrivateBrowsingUtils.sys.mjs",
  Region: "resource://gre/modules/Region.sys.mjs",
  RemoteSettings: "resource://services-settings/remote-settings.sys.mjs",
  SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs",
});

// An nsICryptoHash instance, defined lazily on `lazy.gCryptoHash`.
ChromeUtils.defineLazyGetter(lazy, "gCryptoHash", () =>
  Cc["@mozilla.org/security/hash;1"].createInstance(Ci.nsICryptoHash)
);
|
|
|
|
// Names (or name prefixes) of the histograms and scalars that we report to.
const SEARCH_CONTENT_SCALAR_BASE = "browser.search.content.";
const SEARCH_WITH_ADS_SCALAR_BASE = "browser.search.withads.";
const SEARCH_AD_CLICKS_SCALAR_BASE = "browser.search.adclicks.";
const SEARCH_DATA_TRANSFERRED_SCALAR = "browser.search.data_transferred";
// Key suffix for private browsing; its consumers are outside this section.
const SEARCH_TELEMETRY_PRIVATE_BROWSING_KEY_SUFFIX = "pb";
|
|
|
|
// Exported for tests.
export const ADLINK_CHECK_TIMEOUT_MS = 1000;

// Unlike the standard adlink check, the timeout for single page apps is not
// based on a content event within the page, like DOMContentLoaded or load.
// Thus, we aim for a longer timeout to account for when the server might be
// slow to update the content on the page.
export const SPA_ADLINK_CHECK_TIMEOUT_MS = 2500;

// Remote Settings collection names.
export const TELEMETRY_SETTINGS_KEY = "search-telemetry-v2";
export const TELEMETRY_CATEGORIZATION_KEY = "search-categorization";

export const TELEMETRY_CATEGORIZATION_DOWNLOAD_SETTINGS = {
  // Units are in milliseconds.
  base: 3600000,
  minAdjust: 60000,
  maxAdjust: 600000,
  maxTriesPerSession: 2,
};

// Keys shared between the parent process code in this file and its
// content-side counterparts.
export const SEARCH_TELEMETRY_SHARED = {
  PROVIDER_INFO: "SearchTelemetry:ProviderInfo",
  LOAD_TIMEOUT: "SearchTelemetry:LoadTimeout",
  SPA_LOAD_TIMEOUT: "SearchTelemetry:SPALoadTimeout",
};
|
|
|
|
// Impression ids that have been generated for a SERP but have not yet had an
// engagement or abandonment recorded against them. Ids are added when a SERP
// starts being tracked and removed when either event is recorded.
const impressionIdsWithoutEngagementsSet = new Set();

export const CATEGORIZATION_SETTINGS = {
  HIGHEST_SCORE_THRESHOLD: 50,
  MAX_DOMAINS_TO_CATEGORIZE: 10,
  MINIMUM_SCORE: 0,
  STARTING_RANK: 2,
  // One hour, expressed in seconds.
  IDLE_TIMEOUT_SECONDS: 60 * 60,
  // One hour, expressed in milliseconds.
  WAKE_TIMEOUT_MS: 60 * 60 * 1000,
};
|
|
|
|
// Console logger for this module. Debug output is only enabled when search
// logging is turned on; otherwise only warnings and errors are shown.
ChromeUtils.defineLazyGetter(lazy, "logConsole", () => {
  let maxLogLevel = lazy.SearchUtils.loggingEnabled ? "Debug" : "Warn";
  return console.createInstance({
    prefix: "SearchTelemetry",
    maxLogLevel,
  });
});

// Whether SERP event telemetry (impressions, engagements, abandonments) is
// enabled. Defaults to true.
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "serpEventsEnabled",
  "browser.search.serpEventTelemetry.enabled",
  true
);
|
|
|
|
const CATEGORIZATION_PREF =
  "browser.search.serpEventTelemetryCategorization.enabled";

// Whether SERP categorization telemetry is enabled. Defaults to false. When
// the preference changes, the categorization helpers are initialised or torn
// down to match the new value.
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "serpEventTelemetryCategorization",
  CATEGORIZATION_PREF,
  false,
  (aPreference, previousValue, newValue) => {
    if (!newValue) {
      SearchSERPDomainToCategoriesMap.uninit();
      SearchSERPCategorizationEventScheduler.uninit();
      return;
    }
    SearchSERPDomainToCategoriesMap.init();
    SearchSERPCategorizationEventScheduler.init();
  }
);
|
|
|
|
// String constants used when recording SERP telemetry.
export const SearchSERPTelemetryUtils = {
  // The kind of interaction recorded in an engagement event.
  ACTIONS: {
    CLICKED: "clicked",
    EXPANDED: "expanded",
    SUBMITTED: "submitted",
  },
  // The parts of a SERP that can be reported as an engagement target.
  COMPONENTS: {
    AD_CAROUSEL: "ad_carousel",
    AD_IMAGE_ROW: "ad_image_row",
    AD_LINK: "ad_link",
    AD_SIDEBAR: "ad_sidebar",
    AD_SITELINK: "ad_sitelink",
    INCONTENT_SEARCHBOX: "incontent_searchbox",
    NON_ADS_LINK: "non_ads_link",
    REFINED_SEARCH_BUTTONS: "refined_search_buttons",
    SHOPPING_TAB: "shopping_tab",
  },
  // Reasons a SERP can be recorded as abandoned.
  ABANDONMENTS: {
    NAVIGATION: "navigation",
    TAB_CLOSE: "tab_close",
    WINDOW_CLOSE: "window_close",
  },
  // Sources of a SERP load that originate from page content rather than from
  // the browser UI.
  INCONTENT_SOURCES: {
    OPENED_IN_NEW_TAB: "opened_in_new_tab",
    REFINE_ON_SERP: "follow_on_from_refine_on_SERP",
    SEARCHBOX: "follow_on_from_refine_on_incontent_search",
  },
  CATEGORIZATION: {
    INCONCLUSIVE: 0,
  },
};

// The subset of COMPONENTS that are adverts.
const AD_COMPONENTS = [
  SearchSERPTelemetryUtils.COMPONENTS.AD_CAROUSEL,
  SearchSERPTelemetryUtils.COMPONENTS.AD_IMAGE_ROW,
  SearchSERPTelemetryUtils.COMPONENTS.AD_LINK,
  SearchSERPTelemetryUtils.COMPONENTS.AD_SIDEBAR,
  SearchSERPTelemetryUtils.COMPONENTS.AD_SITELINK,
];
|
|
|
|
/**
|
|
* TelemetryHandler is the main class handling Search Engine Result Page (SERP)
|
|
* telemetry. It primarily deals with tracking of what pages are loaded into tabs.
|
|
*
|
|
* It handles the *in-content:sap* keys of the SEARCH_COUNTS histogram.
|
|
*/
|
|
class TelemetryHandler {
|
|
// Whether or not this class is initialised.
|
|
_initialized = false;
|
|
|
|
// An instance of ContentHandler.
|
|
_contentHandler;
|
|
|
|
// The original provider information, mainly used for tests.
|
|
_originalProviderInfo = null;
|
|
|
|
// The current search provider info.
|
|
_searchProviderInfo = null;
|
|
|
|
// An instance of remote settings that is used to access the provider info.
|
|
_telemetrySettings;
|
|
|
|
// Callback used when syncing telemetry settings.
|
|
#telemetrySettingsSync;
|
|
|
|
// _browserInfoByURL is a map of tracked search urls to objects containing:
|
|
// * {object} info
|
|
// the search provider information associated with the url.
|
|
// * {WeakMap} browserTelemetryStateMap
|
|
// a weak map of browsers that have the url loaded, their ad report state,
|
|
// and their impression id.
|
|
// * {integer} count
|
|
// a manual count of browsers logged.
|
|
// We keep a weak map of browsers, in case we miss something on our counts
|
|
// and cause a memory leak - worst case our map is slightly bigger than it
|
|
// needs to be.
|
|
// The manual count is because WeakMap doesn't give us size/length
|
|
// information, but we want to know when we can clean up our associated
|
|
// entry.
|
|
_browserInfoByURL = new Map();
|
|
|
|
// Browser objects mapped to the info in _browserInfoByURL.
|
|
#browserToItemMap = new WeakMap();
|
|
|
|
// _browserSourceMap is a map of the latest search source for a particular
|
|
// browser - one of the KNOWN_SEARCH_SOURCES in BrowserSearchTelemetry.
|
|
_browserSourceMap = new WeakMap();
|
|
|
|
/**
|
|
* A WeakMap whose key is a browser with value of a source type found in
|
|
* INCONTENT_SOURCES. Kept separate to avoid overlapping with legacy
|
|
* search sources. These sources are specific to the content of a search
|
|
* provider page rather than something from within the browser itself.
|
|
*/
|
|
#browserContentSourceMap = new WeakMap();
|
|
|
|
/**
 * Sets the source of a SERP visit from something that occurred in content
 * rather than from the browser.
 *
 * @param {browser} browser
 *   The browser object associated with the page that should be a SERP.
 * @param {string} source
 *   The source that started the load. One of
 *   SearchSERPTelemetryUtils.COMPONENTS.INCONTENT_SEARCHBOX,
 *   SearchSERPTelemetryUtils.INCONTENT_SOURCES.OPENED_IN_NEW_TAB or
 *   SearchSERPTelemetryUtils.INCONTENT_SOURCES.REFINE_ON_SERP.
 */
|
|
setBrowserContentSource(browser, source) {
|
|
this.#browserContentSourceMap.set(browser, source);
|
|
}
|
|
|
|
/**
 * A map of the newtab session id for particular browsers. An entry is kept
 * until the browser stops loading SERPs or its tab is closed.
 */
_browserNewtabSessionMap = new WeakMap();
|
|
|
|
constructor() {
|
|
this._contentHandler = new ContentHandler({
|
|
browserInfoByURL: this._browserInfoByURL,
|
|
findBrowserItemForURL: (...args) => this._findBrowserItemForURL(...args),
|
|
checkURLForSerpMatch: (...args) => this._checkURLForSerpMatch(...args),
|
|
findItemForBrowser: (...args) => this.findItemForBrowser(...args),
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Initializes the TelemetryHandler and its ContentHandler. It will add
|
|
* appropriate listeners to the window so that window opening and closing
|
|
* can be tracked.
|
|
*/
|
|
async init() {
|
|
if (this._initialized) {
|
|
return;
|
|
}
|
|
|
|
this._telemetrySettings = lazy.RemoteSettings(TELEMETRY_SETTINGS_KEY);
|
|
let rawProviderInfo = [];
|
|
try {
|
|
rawProviderInfo = await this._telemetrySettings.get();
|
|
} catch (ex) {
|
|
lazy.logConsole.error("Could not get settings:", ex);
|
|
}
|
|
|
|
this.#telemetrySettingsSync = event => this.#onSettingsSync(event);
|
|
this._telemetrySettings.on("sync", this.#telemetrySettingsSync);
|
|
|
|
// Send the provider info to the child handler.
|
|
this._contentHandler.init(rawProviderInfo);
|
|
this._originalProviderInfo = rawProviderInfo;
|
|
|
|
// Now convert the regexps into
|
|
this._setSearchProviderInfo(rawProviderInfo);
|
|
|
|
for (let win of Services.wm.getEnumerator("navigator:browser")) {
|
|
this._registerWindow(win);
|
|
}
|
|
Services.wm.addListener(this);
|
|
|
|
this._initialized = true;
|
|
}
|
|
|
|
async #onSettingsSync(event) {
|
|
let current = event.data?.current;
|
|
if (current) {
|
|
lazy.logConsole.debug(
|
|
"Update provider info due to Remote Settings sync."
|
|
);
|
|
this._originalProviderInfo = current;
|
|
this._setSearchProviderInfo(current);
|
|
Services.ppmm.sharedData.set(
|
|
SEARCH_TELEMETRY_SHARED.PROVIDER_INFO,
|
|
current
|
|
);
|
|
Services.ppmm.sharedData.flush();
|
|
} else {
|
|
lazy.logConsole.debug(
|
|
"Ignoring Remote Settings sync data due to missing records."
|
|
);
|
|
}
|
|
Services.obs.notifyObservers(null, "search-telemetry-v2-synced");
|
|
}
|
|
|
|
/**
|
|
* Uninitializes the TelemetryHandler and its ContentHandler.
|
|
*/
|
|
uninit() {
|
|
if (!this._initialized) {
|
|
return;
|
|
}
|
|
|
|
this._contentHandler.uninit();
|
|
|
|
for (let win of Services.wm.getEnumerator("navigator:browser")) {
|
|
this._unregisterWindow(win);
|
|
}
|
|
Services.wm.removeListener(this);
|
|
|
|
try {
|
|
this._telemetrySettings.off("sync", this.#telemetrySettingsSync);
|
|
} catch (ex) {
|
|
lazy.logConsole.error(
|
|
"Failed to shutdown SearchSERPTelemetry Remote Settings.",
|
|
ex
|
|
);
|
|
}
|
|
this._telemetrySettings = null;
|
|
this.#telemetrySettingsSync = null;
|
|
|
|
this._initialized = false;
|
|
}
|
|
|
|
/**
|
|
* Records the search source for particular browsers, in case it needs
|
|
* to be associated with a SERP.
|
|
*
|
|
* @param {browser} browser
|
|
* The browser where the search originated.
|
|
* @param {string} source
|
|
* Where the search originated from.
|
|
*/
|
|
recordBrowserSource(browser, source) {
|
|
this._browserSourceMap.set(browser, source);
|
|
}
|
|
|
|
/**
|
|
* Records the newtab source for particular browsers, in case it needs
|
|
* to be associated with a SERP.
|
|
*
|
|
* @param {browser} browser
|
|
* The browser where the search originated.
|
|
* @param {string} newtabSessionId
|
|
* The sessionId of the newtab session the search originated from.
|
|
*/
|
|
recordBrowserNewtabSession(browser, newtabSessionId) {
|
|
this._browserNewtabSessionMap.set(browser, newtabSessionId);
|
|
}
|
|
|
|
/**
|
|
* Helper function for recording the reason for a Glean abandonment event.
|
|
*
|
|
* @param {string} impressionId
|
|
* The impression id for the abandonment event about to be recorded.
|
|
* @param {string} reason
|
|
* The reason the SERP is deemed abandoned.
|
|
* One of SearchSERPTelemetryUtils.ABANDONMENTS.
|
|
*/
|
|
recordAbandonmentTelemetry(impressionId, reason) {
|
|
impressionIdsWithoutEngagementsSet.delete(impressionId);
|
|
|
|
lazy.logConsole.debug(
|
|
`Recording an abandonment event for impression id ${impressionId} with reason: ${reason}`
|
|
);
|
|
|
|
Glean.serp.abandonment.record({
|
|
impression_id: impressionId,
|
|
reason,
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Handles the TabClose event received from the listeners.
|
|
*
|
|
* @param {object} event
|
|
* The event object provided by the listener.
|
|
*/
|
|
handleEvent(event) {
|
|
if (event.type != "TabClose") {
|
|
console.error("Received unexpected event type", event.type);
|
|
return;
|
|
}
|
|
|
|
this._browserNewtabSessionMap.delete(event.target.linkedBrowser);
|
|
this.stopTrackingBrowser(
|
|
event.target.linkedBrowser,
|
|
SearchSERPTelemetryUtils.ABANDONMENTS.TAB_CLOSE
|
|
);
|
|
}
|
|
|
|
/**
|
|
* Test-only function, used to override the provider information, so that
|
|
* unit tests can set it to easy to test values.
|
|
*
|
|
* @param {Array} providerInfo
|
|
* See {@link https://searchfox.org/mozilla-central/search?q=search-telemetry-schema.json}
|
|
* for type information.
|
|
*/
|
|
overrideSearchTelemetryForTests(providerInfo) {
|
|
let info = providerInfo ? providerInfo : this._originalProviderInfo;
|
|
this._contentHandler.overrideSearchTelemetryForTests(info);
|
|
this._setSearchProviderInfo(info);
|
|
}
|
|
|
|
/**
|
|
* Used to set the local version of the search provider information.
|
|
* This automatically maps the regexps to RegExp objects so that
|
|
* we don't have to create a new instance each time.
|
|
*
|
|
* @param {Array} providerInfo
|
|
* A raw array of provider information to set.
|
|
*/
|
|
_setSearchProviderInfo(providerInfo) {
|
|
this._searchProviderInfo = providerInfo.map(provider => {
|
|
let newProvider = {
|
|
...provider,
|
|
searchPageRegexp: new RegExp(provider.searchPageRegexp),
|
|
};
|
|
if (provider.extraAdServersRegexps) {
|
|
newProvider.extraAdServersRegexps = provider.extraAdServersRegexps.map(
|
|
r => new RegExp(r)
|
|
);
|
|
}
|
|
|
|
newProvider.nonAdsLinkRegexps = provider.nonAdsLinkRegexps?.length
|
|
? provider.nonAdsLinkRegexps.map(r => new RegExp(r))
|
|
: [];
|
|
if (provider.shoppingTab?.regexp) {
|
|
newProvider.shoppingTab = {
|
|
selector: provider.shoppingTab.selector,
|
|
regexp: new RegExp(provider.shoppingTab.regexp),
|
|
};
|
|
}
|
|
return newProvider;
|
|
});
|
|
this._contentHandler._searchProviderInfo = this._searchProviderInfo;
|
|
}
|
|
|
|
reportPageAction(info, browser) {
|
|
this._contentHandler._reportPageAction(info, browser);
|
|
}
|
|
|
|
reportPageWithAds(info, browser) {
|
|
this._contentHandler._reportPageWithAds(info, browser);
|
|
}
|
|
|
|
reportPageWithAdImpressions(info, browser) {
|
|
this._contentHandler._reportPageWithAdImpressions(info, browser);
|
|
}
|
|
|
|
reportPageDomains(info, browser) {
|
|
this._contentHandler._reportPageDomains(info, browser);
|
|
}
|
|
|
|
reportPageImpression(info, browser) {
|
|
this._contentHandler._reportPageImpression(info, browser);
|
|
}
|
|
|
|
/**
|
|
* This may start tracking a tab based on the URL. If the URL matches a search
|
|
* partner, and it has a code, then we'll start tracking it. This will aid
|
|
* determining if it is a page we should be tracking for adverts.
|
|
*
|
|
* @param {object} browser
|
|
* The browser associated with the page.
|
|
* @param {string} url
|
|
* The url that was loaded in the browser.
|
|
* @param {nsIDocShell.LoadCommand} loadType
|
|
* The load type associated with the page load.
|
|
*/
|
|
updateTrackingStatus(browser, url, loadType) {
|
|
if (
|
|
!lazy.BrowserSearchTelemetry.shouldRecordSearchCount(
|
|
browser.getTabBrowser()
|
|
)
|
|
) {
|
|
return;
|
|
}
|
|
let info = this._checkURLForSerpMatch(url);
|
|
if (!info) {
|
|
this._browserNewtabSessionMap.delete(browser);
|
|
this.stopTrackingBrowser(browser);
|
|
return;
|
|
}
|
|
|
|
let source = "unknown";
|
|
if (loadType & Ci.nsIDocShell.LOAD_CMD_RELOAD) {
|
|
source = "reload";
|
|
} else if (loadType & Ci.nsIDocShell.LOAD_CMD_HISTORY) {
|
|
source = "tabhistory";
|
|
} else if (this._browserSourceMap.has(browser)) {
|
|
source = this._browserSourceMap.get(browser);
|
|
this._browserSourceMap.delete(browser);
|
|
}
|
|
|
|
// If it's a SERP but doesn't have a browser source, the source might be
|
|
// from something that happened in content. We keep this separate from
|
|
// source because legacy telemetry should not change its reporting.
|
|
let inContentSource;
|
|
if (
|
|
lazy.serpEventsEnabled &&
|
|
info.hasComponents &&
|
|
this.#browserContentSourceMap.has(browser)
|
|
) {
|
|
inContentSource = this.#browserContentSourceMap.get(browser);
|
|
this.#browserContentSourceMap.delete(browser);
|
|
}
|
|
|
|
let newtabSessionId;
|
|
if (this._browserNewtabSessionMap.has(browser)) {
|
|
newtabSessionId = this._browserNewtabSessionMap.get(browser);
|
|
// We leave the newtabSessionId in the map for this browser
|
|
// until we stop loading SERP pages or the tab is closed.
|
|
}
|
|
|
|
let impressionId;
|
|
if (lazy.serpEventsEnabled && info.hasComponents) {
|
|
// The UUID generated by Services.uuid contains leading and trailing braces.
|
|
// Need to trim them first.
|
|
impressionId = Services.uuid.generateUUID().toString().slice(1, -1);
|
|
|
|
impressionIdsWithoutEngagementsSet.add(impressionId);
|
|
}
|
|
|
|
this._reportSerpPage(info, source, url);
|
|
|
|
// For single page apps, we store the page by its original URI so the
|
|
// network observers can recover the browser in a context when they only
|
|
// have access to the originURL.
|
|
let urlKey =
|
|
info.isSPA && browser.originalURI?.spec ? browser.originalURI.spec : url;
|
|
let item = this._browserInfoByURL.get(urlKey);
|
|
|
|
let impressionInfo;
|
|
if (lazy.serpEventsEnabled && info.hasComponents) {
|
|
let partnerCode = "";
|
|
if (info.code != "none" && info.code != null) {
|
|
partnerCode = info.code;
|
|
}
|
|
impressionInfo = {
|
|
provider: info.provider,
|
|
tagged: info.type.startsWith("tagged"),
|
|
partnerCode,
|
|
source: inContentSource ?? source,
|
|
isShoppingPage: info.isShoppingPage,
|
|
isPrivate: lazy.PrivateBrowsingUtils.isBrowserPrivate(browser),
|
|
};
|
|
}
|
|
|
|
if (item) {
|
|
item.browserTelemetryStateMap.set(browser, {
|
|
adsReported: false,
|
|
adImpressionsReported: false,
|
|
impressionId,
|
|
urlToComponentMap: null,
|
|
impressionInfo,
|
|
searchBoxSubmitted: false,
|
|
categorizationInfo: null,
|
|
adsClicked: 0,
|
|
adsVisible: 0,
|
|
searchQuery: info.searchQuery,
|
|
});
|
|
item.count++;
|
|
item.source = source;
|
|
item.newtabSessionId = newtabSessionId;
|
|
} else {
|
|
item = {
|
|
browserTelemetryStateMap: new WeakMap().set(browser, {
|
|
adsReported: false,
|
|
adImpressionsReported: false,
|
|
impressionId,
|
|
urlToComponentMap: null,
|
|
impressionInfo,
|
|
searchBoxSubmitted: false,
|
|
categorizationInfo: null,
|
|
adsClicked: 0,
|
|
adsVisible: 0,
|
|
searchQuery: info.searchQuery,
|
|
}),
|
|
info,
|
|
count: 1,
|
|
source,
|
|
newtabSessionId,
|
|
majorVersion: parseInt(Services.appinfo.version),
|
|
channel: lazy.SearchUtils.MODIFIED_APP_CHANNEL,
|
|
region: lazy.Region.home,
|
|
isSPA: info.isSPA,
|
|
};
|
|
// For single page apps, we store the page by its original URI so that
|
|
// network observers can recover the browser in a context when they only
|
|
// have the originURL to work with.
|
|
this._browserInfoByURL.set(urlKey, item);
|
|
}
|
|
this.#browserToItemMap.set(browser, item);
|
|
}
|
|
|
|
/**
 * Determines whether or not a browser should be untracked or tracked for
 * SERPs that have single page app behaviour.
 *
 * The over-arching logic:
 * 1. Only inspect the browser if the url matches a SERP that is a SPA.
 * 2. Record an engagement if we're tracking the browser and we're going
 *    to another page, unless the load came from session history.
 * 3. Untrack the browser if we're tracking it and switching pages.
 * 4. Track the browser if we're now on a default search page.
 *
 * @param {BrowserElement} browser
 *   The browser element related to the request.
 * @param {string} url
 *   The url of the request.
 * @param {number} loadType
 *   The load type of the request.
 */
|
|
updateTrackingSinglePageApp(browser, url, loadType) {
|
|
let providerInfo = this._getProviderInfoForURL(url);
|
|
if (!providerInfo?.isSPA) {
|
|
return;
|
|
}
|
|
|
|
let item = this.findItemForBrowser(browser);
|
|
let telemetryState = item?.browserTelemetryStateMap.get(browser);
|
|
|
|
let searchTermChanged =
|
|
this.urlSearchTerms(url, providerInfo) != telemetryState?.searchQuery;
|
|
let isSerp = !!this._checkURLForSerpMatch(url, providerInfo);
|
|
let browserIsTracked = !!telemetryState;
|
|
let isTabHistory = loadType & Ci.nsIDocShell.LOAD_CMD_HISTORY;
|
|
|
|
// Step 2: Maybe record engagement.
|
|
if (browserIsTracked && !isTabHistory && (searchTermChanged || !isSerp)) {
|
|
impressionIdsWithoutEngagementsSet.delete(telemetryState.impressionId);
|
|
Glean.serp.engagement.record({
|
|
impression_id: telemetryState.impressionId,
|
|
action: SearchSERPTelemetryUtils.ACTIONS.CLICKED,
|
|
target: SearchSERPTelemetryUtils.COMPONENTS.NON_ADS_LINK,
|
|
});
|
|
lazy.logConsole.debug("Counting click:", {
|
|
impressionId: telemetryState.impressionId,
|
|
type: SearchSERPTelemetryUtils.COMPONENTS.NON_ADS_LINK,
|
|
URL: url,
|
|
});
|
|
}
|
|
|
|
// Step 3: Maybe untrack the browser.
|
|
if (browserIsTracked && (searchTermChanged || !isSerp)) {
|
|
let reason = "";
|
|
// If we have to untrack it, it might be due to the user using the
|
|
// back/forward button.
|
|
if (isTabHistory) {
|
|
reason = SearchSERPTelemetryUtils.ABANDONMENTS.NAVIGATION;
|
|
}
|
|
let actor = browser.browsingContext.currentWindowGlobal.getActor(
|
|
"SearchSERPTelemetry"
|
|
);
|
|
actor.sendAsyncMessage("SearchSERPTelemetry:RemoveEventListeners");
|
|
this.stopTrackingBrowser(browser, reason);
|
|
browserIsTracked = false;
|
|
}
|
|
|
|
// Step 4: Maybe track the browser.
|
|
if (isSerp && !browserIsTracked) {
|
|
this.updateTrackingStatus(browser, url, loadType);
|
|
let actor = browser.browsingContext.currentWindowGlobal.getActor(
|
|
"SearchSERPTelemetry"
|
|
);
|
|
actor.sendAsyncMessage("SearchSERPTelemetry:WaitForSPAPageLoad");
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Stops tracking of a tab, for example the tab has loaded a different URL.
|
|
* Also records a Glean abandonment event if appropriate.
|
|
*
|
|
* @param {object} browser The browser associated with the tab to stop being
|
|
* tracked.
|
|
* @param {string} abandonmentReason
|
|
* An optional parameter that specifies why the browser is deemed abandoned.
|
|
* The reason will be recorded as part of Glean abandonment telemetry.
|
|
* One of SearchSERPTelemetryUtils.ABANDONMENTS.
|
|
*/
|
|
stopTrackingBrowser(browser, abandonmentReason) {
|
|
for (let [url, item] of this._browserInfoByURL) {
|
|
if (item.browserTelemetryStateMap.has(browser)) {
|
|
let telemetryState = item.browserTelemetryStateMap.get(browser);
|
|
let impressionId = telemetryState.impressionId;
|
|
if (impressionIdsWithoutEngagementsSet.has(impressionId)) {
|
|
this.recordAbandonmentTelemetry(impressionId, abandonmentReason);
|
|
}
|
|
|
|
if (
|
|
lazy.serpEventTelemetryCategorization &&
|
|
telemetryState.categorizationInfo
|
|
) {
|
|
SearchSERPCategorizationEventScheduler.sendCallback(browser);
|
|
}
|
|
|
|
item.browserTelemetryStateMap.delete(browser);
|
|
item.count--;
|
|
}
|
|
|
|
if (!item.count) {
|
|
this._browserInfoByURL.delete(url);
|
|
}
|
|
}
|
|
this.#browserToItemMap.delete(browser);
|
|
}
|
|
|
|
/**
|
|
* Calculate how close two urls are in equality.
|
|
*
|
|
* The scoring system:
|
|
* - If the URLs look exactly the same, including the ordering of query
|
|
* parameters, the score is Infinity.
|
|
* - If the origin is the same, the score is increased by 1. Otherwise the
|
|
* score is 0.
|
|
* - If the path is the same, the score is increased by 1.
|
|
* - For each query parameter, if the key exists the score is increased by 1.
|
|
* Likewise if the query parameter values match.
|
|
* - If the hash is the same, the score is increased by 1. This includes if
|
|
* the hash is missing in both URLs.
|
|
*
|
|
* @param {URL} url1
|
|
* Url to compare.
|
|
* @param {URL} url2
|
|
* Other url to compare. Ordering shouldn't matter.
|
|
* @param {object} [matchOptions]
|
|
* Options for checking equality.
|
|
* @param {boolean} [matchOptions.path]
|
|
* Whether the path must match. Default to false.
|
|
* @param {boolean} [matchOptions.paramValues]
|
|
* Whether the values of the query parameters must match if the query
|
|
* parameter key exists in the other. Defaults to false.
|
|
* @returns {number}
|
|
* A score of how closely the two URLs match. Returns 0 if there is no
|
|
* match or the equality check failed for an enabled match option.
|
|
*/
|
|
compareUrls(url1, url2, matchOptions = {}) {
|
|
// In case of an exact match, well, that's an obvious winner.
|
|
if (url1.href == url2.href) {
|
|
return Infinity;
|
|
}
|
|
|
|
// Each step we get closer to the two URLs being the same, we increase the
|
|
// score. The consumer of this method will use these scores to see which
|
|
// of the URLs is the best match.
|
|
let score = 0;
|
|
if (url1.origin == url2.origin) {
|
|
++score;
|
|
if (url1.pathname == url2.pathname) {
|
|
++score;
|
|
for (let [key1, value1] of url1.searchParams) {
|
|
// Let's not fuss about the ordering of search params, since the
|
|
// score effect will solve that.
|
|
if (url2.searchParams.has(key1)) {
|
|
++score;
|
|
if (url2.searchParams.get(key1) == value1) {
|
|
++score;
|
|
} else if (matchOptions.paramValues) {
|
|
return 0;
|
|
}
|
|
}
|
|
}
|
|
if (url1.hash == url2.hash) {
|
|
++score;
|
|
}
|
|
} else if (matchOptions.path) {
|
|
return 0;
|
|
}
|
|
}
|
|
return score;
|
|
}
|
|
|
|
/**
|
|
* Extracts the search terms from the URL based on the provider info.
|
|
*
|
|
* @param {string} url
|
|
* The URL to inspect.
|
|
* @param {object} providerInfo
|
|
* The providerInfo associated with the URL.
|
|
* @returns {string}
|
|
* The search term or if none is found, a blank string.
|
|
*/
|
|
urlSearchTerms(url, providerInfo) {
|
|
if (providerInfo?.queryParamNames?.length) {
|
|
let { searchParams } = new URL(url);
|
|
for (let queryParamName of providerInfo.queryParamNames) {
|
|
let value = searchParams.get(queryParamName);
|
|
if (value) {
|
|
return value;
|
|
}
|
|
}
|
|
}
|
|
return "";
|
|
}
|
|
|
|
findItemForBrowser(browser) {
|
|
return this.#browserToItemMap.get(browser);
|
|
}
|
|
|
|
/**
 * Parts of the URL, like search params and hashes, may be mutated by scripts
 * on a page we're tracking. Since we don't want to keep track of that
 * ourselves in order to keep the list of browser objects a weak-referenced
 * set, we do optional fuzzy matching of URLs to fetch the most relevant item
 * that contains tracking information.
 *
 * @param {string} url URL to fetch the tracking data for.
 * @returns {object|null|undefined}
 *   Null if `url` can't be parsed and undefined if no tracked URL shares its
 *   origin. Otherwise, the best matching item, containing the following
 *   members:
 *   - {WeakMap} browserTelemetryStateMap
 *     Map of browser elements that belong to `url` and their ad report state.
 *   - {object} info
 *     Info dictionary as returned by `_checkURLForSerpMatch`.
 *   - {number} count
 *     The number of browser elements we can most accurately tell we're
 *     tracking, since they're inside a WeakMap.
 */
|
|
_findBrowserItemForURL(url) {
|
|
try {
|
|
url = new URL(url);
|
|
} catch (ex) {
|
|
return null;
|
|
}
|
|
|
|
let item;
|
|
let currentBestMatch = 0;
|
|
for (let [trackingURL, candidateItem] of this._browserInfoByURL) {
|
|
if (currentBestMatch === Infinity) {
|
|
break;
|
|
}
|
|
try {
|
|
// Make sure to cache the parsed URL object, since there's no reason to
|
|
// do it twice.
|
|
trackingURL =
|
|
candidateItem._trackingURL ||
|
|
(candidateItem._trackingURL = new URL(trackingURL));
|
|
} catch (ex) {
|
|
continue;
|
|
}
|
|
let score = this.compareUrls(url, trackingURL);
|
|
if (score > currentBestMatch) {
|
|
item = candidateItem;
|
|
currentBestMatch = score;
|
|
}
|
|
}
|
|
|
|
return item;
|
|
}
|
|
|
|
// nsIWindowMediatorListener
|
|
|
|
/**
|
|
* This is called when a new window is opened, and handles registration of
|
|
* that window if it is a browser window.
|
|
*
|
|
* @param {nsIAppWindow} appWin The xul window that was opened.
|
|
*/
|
|
onOpenWindow(appWin) {
|
|
let win = appWin.docShell.domWindow;
|
|
win.addEventListener(
|
|
"load",
|
|
() => {
|
|
if (
|
|
win.document.documentElement.getAttribute("windowtype") !=
|
|
"navigator:browser"
|
|
) {
|
|
return;
|
|
}
|
|
|
|
this._registerWindow(win);
|
|
},
|
|
{ once: true }
|
|
);
|
|
}
|
|
|
|
/**
|
|
* Listener that is called when a window is closed, and handles deregistration of
|
|
* that window if it is a browser window.
|
|
*
|
|
* @param {nsIAppWindow} appWin The xul window that was closed.
|
|
*/
|
|
onCloseWindow(appWin) {
|
|
let win = appWin.docShell.domWindow;
|
|
|
|
if (
|
|
win.document.documentElement.getAttribute("windowtype") !=
|
|
"navigator:browser"
|
|
) {
|
|
return;
|
|
}
|
|
|
|
this._unregisterWindow(win);
|
|
}
|
|
|
|
/**
 * Adds the TabClose event listener to the window's tab container, so that
 * tabs closed in the window stop being tracked.
 *
 * @param {object} win The window to register.
 */
|
|
_registerWindow(win) {
|
|
win.gBrowser.tabContainer.addEventListener("TabClose", this);
|
|
}
|
|
|
|
/**
 * Stops tracking every tab in the window, using "window_close" as the
 * abandonment reason, and removes the TabClose event listener from the
 * window's tab container.
 *
 * @param {object} win The window to unregister.
 */
|
|
_unregisterWindow(win) {
|
|
for (let tab of win.gBrowser.tabs) {
|
|
this.stopTrackingBrowser(
|
|
tab.linkedBrowser,
|
|
SearchSERPTelemetryUtils.ABANDONMENTS.WINDOW_CLOSE
|
|
);
|
|
}
|
|
|
|
win.gBrowser.tabContainer.removeEventListener("TabClose", this);
|
|
}
|
|
|
|
/**
 * Searches for provider information for a given url.
 *
 * @param {string} url The url to match for a provider.
 * @returns {object|undefined} The first provider info whose
 *   `searchPageRegexp` matches the url, or undefined if none match.
 */
|
|
_getProviderInfoForURL(url) {
|
|
return this._searchProviderInfo.find(info =>
|
|
info.searchPageRegexp.test(url)
|
|
);
|
|
}
|
|
|
|
/**
|
|
* Checks to see if a url is a search partner location, and determines the
|
|
* provider and codes used.
|
|
*
|
|
* @param {string} url The url to match.
|
|
* @returns {null|object} Returns null if there is no match found. Otherwise,
|
|
* returns an object of strings for provider, code and type.
|
|
*/
|
|
_checkURLForSerpMatch(url) {
|
|
let searchProviderInfo = this._getProviderInfoForURL(url);
|
|
if (!searchProviderInfo) {
|
|
return null;
|
|
}
|
|
|
|
let queries = new URLSearchParams(url.split("#")[0].split("?")[1]);
|
|
|
|
let isSPA = !!searchProviderInfo.isSPA;
|
|
if (isSPA) {
|
|
// A URL may have a specific query parameter denoting a search page.
|
|
// If the key was expected but doesn't currently exist, it could be due to
|
|
// the initial url containing it until after a page load.
|
|
// In that case, ignore this check since most SERPs missing the query
|
|
// param will go to the default search page.
|
|
let { key, value } = searchProviderInfo.defaultPageQueryParam;
|
|
if (key && queries.has(key) && queries.get(key) != value) {
|
|
return null;
|
|
}
|
|
}
|
|
|
|
// Some URLs can match provider info but also be the provider's homepage
|
|
// instead of a SERP.
|
|
// e.g. https://example.com/ vs. https://example.com/?foo=bar
|
|
// Look for the presence of the query parameter that contains a search term.
|
|
let hasQuery = false;
|
|
let searchQuery = "";
|
|
for (let queryParamName of searchProviderInfo.queryParamNames) {
|
|
searchQuery = queries.get(queryParamName);
|
|
if (searchQuery) {
|
|
hasQuery = true;
|
|
break;
|
|
}
|
|
}
|
|
if (!hasQuery) {
|
|
return null;
|
|
}
|
|
// Default to organic to simplify things.
|
|
// We override type in the sap cases.
|
|
let type = "organic";
|
|
let code;
|
|
if (searchProviderInfo.codeParamName) {
|
|
code = queries.get(searchProviderInfo.codeParamName);
|
|
if (code) {
|
|
// The code is only included if it matches one of the specific ones.
|
|
if (searchProviderInfo.taggedCodes.includes(code)) {
|
|
type = "tagged";
|
|
if (
|
|
searchProviderInfo.followOnParamNames &&
|
|
searchProviderInfo.followOnParamNames.some(p => queries.has(p))
|
|
) {
|
|
type += "-follow-on";
|
|
}
|
|
} else if (searchProviderInfo.organicCodes.includes(code)) {
|
|
type = "organic";
|
|
} else if (searchProviderInfo.expectedOrganicCodes?.includes(code)) {
|
|
code = "none";
|
|
} else {
|
|
code = "other";
|
|
}
|
|
} else if (searchProviderInfo.followOnCookies) {
|
|
// Especially Bing requires lots of extra work related to cookies.
|
|
for (let followOnCookie of searchProviderInfo.followOnCookies) {
|
|
if (followOnCookie.extraCodeParamName) {
|
|
let eCode = queries.get(followOnCookie.extraCodeParamName);
|
|
if (
|
|
!eCode ||
|
|
!followOnCookie.extraCodePrefixes.some(p => eCode.startsWith(p))
|
|
) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// If this cookie is present, it's probably an SAP follow-on.
|
|
// This might be an organic follow-on in the same session, but there
|
|
// is no way to tell the difference.
|
|
for (let cookie of Services.cookies.getCookiesFromHost(
|
|
followOnCookie.host,
|
|
{}
|
|
)) {
|
|
if (cookie.name != followOnCookie.name) {
|
|
continue;
|
|
}
|
|
|
|
let [cookieParam, cookieValue] = cookie.value
|
|
.split("=")
|
|
.map(p => p.trim());
|
|
if (
|
|
cookieParam == followOnCookie.codeParamName &&
|
|
searchProviderInfo.taggedCodes.includes(cookieValue)
|
|
) {
|
|
type = "tagged-follow-on";
|
|
code = cookieValue;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
let isShoppingPage = false;
|
|
let hasComponents = false;
|
|
if (lazy.serpEventsEnabled) {
|
|
if (searchProviderInfo.shoppingTab?.regexp) {
|
|
isShoppingPage = searchProviderInfo.shoppingTab.regexp.test(url);
|
|
}
|
|
if (searchProviderInfo.components?.length) {
|
|
hasComponents = true;
|
|
}
|
|
}
|
|
return {
|
|
provider: searchProviderInfo.telemetryId,
|
|
type,
|
|
code,
|
|
isShoppingPage,
|
|
hasComponents,
|
|
searchQuery,
|
|
isSPA,
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Logs telemetry for a search provider visit.
|
|
*
|
|
* @param {object} info The search provider information.
|
|
* @param {string} info.provider The name of the provider.
|
|
* @param {string} info.type The type of search.
|
|
* @param {string} [info.code] The code for the provider.
|
|
* @param {string} source Where the search originated from.
|
|
* @param {string} url The url that was matched (for debug logging only).
|
|
*/
|
|
_reportSerpPage(info, source, url) {
|
|
let payload = `${info.provider}:${info.type}:${info.code || "none"}`;
|
|
Services.telemetry.keyedScalarAdd(
|
|
SEARCH_CONTENT_SCALAR_BASE + source,
|
|
payload,
|
|
1
|
|
);
|
|
lazy.logConsole.debug("Impression:", payload, url);
|
|
}
|
|
}
|
|
|
|
/**
 * ContentHandler deals with handling telemetry of the content within a tab -
 * when ads detected and when they are selected.
 */
class ContentHandler {
  /**
   * Constructor.
   *
   * @param {object} options
   *   The options for the handler.
   * @param {Map} options.browserInfoByURL
   *   The map of urls from TelemetryHandler.
   * @param {Function} options.findBrowserItemForURL
   *   A function that, given a url, returns the tracked item for it (if any).
   * @param {Function} options.checkURLForSerpMatch
   *   A function that, given a url, returns SERP match information or null.
   * @param {Function} options.findItemForBrowser
   *   A function that, given a browser, returns the tracked item for it
   *   (if any).
   */
  constructor(options) {
    this._browserInfoByURL = options.browserInfoByURL;
    this._findBrowserItemForURL = options.findBrowserItemForURL;
    this._checkURLForSerpMatch = options.checkURLForSerpMatch;
    this._findItemForBrowser = options.findItemForBrowser;
    // NOTE(review): `_searchProviderInfo` is read by observeActivity() but is
    // not assigned anywhere in this class - presumably the owner of this
    // handler sets it; confirm against the caller.
  }

  /**
   * Initializes the content handler. This will also set up the shared data that is
   * shared with the SearchTelemetryChild actor.
   *
   * @param {Array} providerInfo
   *  The provider information for the search telemetry to record.
   */
  init(providerInfo) {
    Services.ppmm.sharedData.set(
      SEARCH_TELEMETRY_SHARED.PROVIDER_INFO,
      providerInfo
    );
    Services.ppmm.sharedData.set(
      SEARCH_TELEMETRY_SHARED.LOAD_TIMEOUT,
      ADLINK_CHECK_TIMEOUT_MS
    );
    Services.ppmm.sharedData.set(
      SEARCH_TELEMETRY_SHARED.SPA_LOAD_TIMEOUT,
      SPA_ADLINK_CHECK_TIMEOUT_MS
    );

    Services.obs.addObserver(this, "http-on-examine-response");
    Services.obs.addObserver(this, "http-on-examine-cached-response");
    Services.obs.addObserver(this, "http-on-stop-request");
  }

  /**
   * Uninitializes the content handler.
   */
  uninit() {
    Services.obs.removeObserver(this, "http-on-examine-response");
    Services.obs.removeObserver(this, "http-on-examine-cached-response");
    Services.obs.removeObserver(this, "http-on-stop-request");
  }

  /**
   * Test-only function to override the search provider information for use
   * with tests. Passes it to the SearchTelemetryChild actor.
   *
   * @param {object} providerInfo @see SEARCH_PROVIDER_INFO for type information.
   */
  overrideSearchTelemetryForTests(providerInfo) {
    Services.ppmm.sharedData.set("SearchTelemetry:ProviderInfo", providerInfo);
  }

  /**
   * Reports bandwidth used by the given channel if it is used by search requests.
   *
   * @param {object} aChannel The channel that generated the activity.
   */
  _reportChannelBandwidth(aChannel) {
    if (!(aChannel instanceof Ci.nsIChannel)) {
      return;
    }
    let wrappedChannel = ChannelWrapper.get(aChannel);

    // Resolves the url of the top-level page the channel belongs to, or null
    // if it cannot be determined.
    let getTopURL = channel => {
      // top-level document
      if (
        channel.loadInfo &&
        channel.loadInfo.externalContentPolicyType ==
          Ci.nsIContentPolicy.TYPE_DOCUMENT
      ) {
        return channel.finalURL;
      }

      // iframe
      let frameAncestors;
      try {
        frameAncestors = channel.frameAncestors;
      } catch (e) {
        frameAncestors = null;
      }
      if (frameAncestors) {
        let ancestor = frameAncestors.find(obj => obj.frameId == 0);
        if (ancestor) {
          return ancestor.url;
        }
      }

      // top-level resource
      if (channel.loadInfo && channel.loadInfo.loadingPrincipal) {
        return channel.loadInfo.loadingPrincipal.spec;
      }

      return null;
    };

    let topUrl = getTopURL(wrappedChannel);
    if (!topUrl) {
      return;
    }

    let info = this._checkURLForSerpMatch(topUrl);
    if (!info) {
      return;
    }

    let bytesTransferred =
      wrappedChannel.requestSize + wrappedChannel.responseSize;
    let { provider } = info;

    let isPrivate =
      wrappedChannel.loadInfo &&
      wrappedChannel.loadInfo.originAttributes.privateBrowsingId > 0;
    if (isPrivate) {
      provider += `-${SEARCH_TELEMETRY_PRIVATE_BROWSING_KEY_SUFFIX}`;
    }

    Services.telemetry.keyedScalarAdd(
      SEARCH_DATA_TRANSFERRED_SCALAR,
      provider,
      bytesTransferred
    );
  }

  /**
   * Observer entry point for the topics registered in init().
   *
   * @param {object} aSubject The subject of the notification (a channel).
   * @param {string} aTopic The notification topic.
   * @param {string} aData Unused.
   */
  observe(aSubject, aTopic, aData) {
    switch (aTopic) {
      case "http-on-stop-request":
        this._reportChannelBandwidth(aSubject);
        break;
      case "http-on-examine-response":
      case "http-on-examine-cached-response":
        this.observeActivity(aSubject);
        break;
    }
  }

  /**
   * Listener that observes network activity, so that we can determine if a link
   * from a search provider page was followed, and if then if that link was an
   * ad click or not.
   *
   * @param {nsIChannel} channel The channel that generated the activity.
   */
  observeActivity(channel) {
    if (!(channel instanceof Ci.nsIChannel)) {
      return;
    }

    let wrappedChannel = ChannelWrapper.get(channel);
    // The channel we're observing might be a redirect of a channel we've
    // observed before.
    if (wrappedChannel._adClickRecorded) {
      lazy.logConsole.debug("Ad click already recorded");
      return;
    }

    Services.tm.dispatchToMainThread(() => {
      // We suspect that No Content (204) responses are used to transfer or
      // update beacons. They used to lead to double-counting ad-clicks, so let's
      // ignore them.
      if (wrappedChannel.statusCode == 204) {
        lazy.logConsole.debug("Ignoring activity from ambiguous responses");
        return;
      }

      // The wrapper is consistent across redirects, so we can use it to track state.
      let originURL = wrappedChannel.originURI?.spec;
      if (!originURL) {
        return;
      }
      let item = this._findBrowserItemForURL(originURL);
      if (!item) {
        return;
      }

      let url = wrappedChannel.finalURL;

      let providerInfo = item.info.provider;
      let info = this._searchProviderInfo?.find(provider => {
        return provider.telemetryId == providerInfo;
      });
      // Without provider info there are no regular expressions to classify
      // the request with, so there is nothing to record.
      if (!info) {
        lazy.logConsole.debug(
          "No search provider info found for",
          providerInfo
        );
        return;
      }

      // If an error occurs with Glean SERP telemetry logic, avoid
      // disrupting legacy telemetry.
      try {
        this.#maybeRecordSERPTelemetry(wrappedChannel, item, info);
      } catch (ex) {
        lazy.logConsole.error(ex);
      }

      if (!info.extraAdServersRegexps?.some(regex => regex.test(url))) {
        return;
      }

      try {
        Services.telemetry.keyedScalarAdd(
          SEARCH_AD_CLICKS_SCALAR_BASE + item.source,
          `${info.telemetryId}:${item.info.type}`,
          1
        );
        wrappedChannel._adClickRecorded = true;
        if (item.newtabSessionId) {
          Glean.newtabSearchAd.click.record({
            newtab_visit_id: item.newtabSessionId,
            search_access_point: item.source,
            is_follow_on: item.info.type.endsWith("follow-on"),
            is_tagged: item.info.type.startsWith("tagged"),
            telemetry_id: item.info.provider,
          });
        }

        lazy.logConsole.debug("Counting ad click in page for:", {
          source: item.source,
          originURL,
          URL: url,
        });
      } catch (e) {
        console.error(e);
      }
    });
  }

  /**
   * Checks if a request should record an ad click if it can be traced to a
   * browser containing an observed SERP.
   *
   * @param {ChannelWrapper} wrappedChannel
   *   The wrapped channel.
   * @param {object} item
   *   The browser item associated with the origin URL of the request.
   * @param {object} info
   *   The search provider info associated with the item.
   */
  #maybeRecordSERPTelemetry(wrappedChannel, item, info) {
    if (!lazy.serpEventsEnabled) {
      return;
    }

    if (wrappedChannel._recordedClick) {
      lazy.logConsole.debug("Click already recorded.");
      return;
    }

    let originURL = wrappedChannel.originURI?.spec;
    let url = wrappedChannel.finalURL;
    // Some channels re-direct by loading pages that return 200. The result
    // is the channel will have an originURL that changes from the SERP to
    // either a nonAdsRegexp or an extraAdServersRegexps. This is typical
    // for loading a page in a new tab. The channel will have changed so any
    // properties attached to them to record state (e.g. _recordedClick)
    // won't be present.
    // The regexp lists are accessed with `?.` so that a provider lacking one
    // of them is treated as having no matches instead of throwing.
    if (
      info.nonAdsLinkRegexps?.some(r => r.test(originURL)) ||
      info.extraAdServersRegexps?.some(r => r.test(originURL))
    ) {
      return;
    }

    // A click event is recorded if a user loads a resource from an
    // originURL that is a SERP.
    //
    // Typically, we only want top level loads containing documents to avoid
    // recording any event on an in-page resource a SERP might load
    // (e.g. CSS files).
    //
    // The exception to this is if a subframe loads a resource that matches
    // a non ad link. Some SERPs encode non ad search results with a URL
    // that gets loaded into an iframe, which then tells the container of
    // the iframe to change the location of the page.
    if (
      wrappedChannel.channel.isDocument &&
      (wrappedChannel.channel.loadInfo.isTopLevelLoad ||
        info.nonAdsLinkRegexps?.some(r => r.test(url)))
    ) {
      let browser = wrappedChannel.browserElement;

      // If the load is from history, don't record an event.
      if (
        browser?.browsingContext.webProgress?.loadType &
        Ci.nsIDocShell.LOAD_CMD_HISTORY
      ) {
        lazy.logConsole.debug("Ignoring load from history");
        return;
      }

      // Step 1: Check if the browser associated with the request was a
      // tracked SERP.
      let start = Cu.now();
      let telemetryState;
      let isFromNewtab = false;
      if (item.browserTelemetryStateMap.has(browser)) {
        // If the map contains the browser, then it means that the request is
        // the SERP is going from one page to another. We know this because
        // previous conditions prevent non-top level loads from occurring here.
        telemetryState = item.browserTelemetryStateMap.get(browser);
      } else if (browser) {
        // Alternatively, it could be the case that the request is occurring in
        // a new tab but was triggered by one of the browsers in the state map.
        // If only one browser exists in the state map, it must be that one.
        if (item.count === 1) {
          let sourceBrowsers = ChromeUtils.nondeterministicGetWeakMapKeys(
            item.browserTelemetryStateMap
          );
          if (sourceBrowsers?.length) {
            telemetryState = item.browserTelemetryStateMap.get(
              sourceBrowsers[0]
            );
          }
        } else if (item.count > 1) {
          // If the count is more than 1, then multiple open SERPs contain the
          // same search term, so try to find the specific browser that opened
          // the request.
          // The lookups are optional-chained so that a browser without a
          // tabbrowser or a tab simply yields no opener.
          let tabBrowser = browser.getTabBrowser();
          let tab = tabBrowser?.getTabForBrowser(browser)?.openerTab;
          // A tab will not always have an openerTab, as first tabs in new
          // windows don't have an openerTab.
          // Bug 1867582: We should also handle the case where multiple tabs
          // contain the same search term.
          if (tab) {
            telemetryState = item.browserTelemetryStateMap.get(
              tab.linkedBrowser
            );
          }
        }
        if (telemetryState) {
          isFromNewtab = true;
        }
      }

      // Step 2: If we have telemetryState, the browser object must be
      // associated with another browser that is tracked. Try to find the
      // component type on the SERP responsible for the request.
      // Exceptions:
      // - If a searchbox was used to initiate the load, don't record another
      //   engagement because the event was logged elsewhere.
      // - If the ad impression hasn't been recorded yet, we have no way of
      //   knowing precisely what kind of component was selected.
      let isSerp = false;
      if (
        telemetryState &&
        telemetryState.adImpressionsReported &&
        !telemetryState.searchBoxSubmitted
      ) {
        if (info.searchPageRegexp?.test(originURL)) {
          isSerp = true;
        }

        let startFindComponent = Cu.now();
        let parsedUrl = new URL(url);
        // Determine the component type of the link.
        let type;
        for (let [
          storedUrl,
          componentType,
        ] of telemetryState.urlToComponentMap.entries()) {
          // The URL we're navigating to may have more query parameters if
          // the provider adds query parameters when the user clicks on a link.
          // On the other hand, the URL we are navigating to may have have
          // fewer query parameters because of query param stripping.
          // Thus, if a query parameter is missing, a match can still be made
          // provided keys that exist in both URLs contain equal values.
          let score = SearchSERPTelemetry.compareUrls(storedUrl, parsedUrl, {
            paramValues: true,
            path: true,
          });
          if (score) {
            type = componentType;
            break;
          }
        }
        ChromeUtils.addProfilerMarker(
          "SearchSERPTelemetry._observeActivity",
          startFindComponent,
          "Find component for URL"
        );

        // Default value for URLs that don't match any components categorized
        // on the page.
        if (!type) {
          type = SearchSERPTelemetryUtils.COMPONENTS.NON_ADS_LINK;
        }

        if (
          type == SearchSERPTelemetryUtils.COMPONENTS.REFINED_SEARCH_BUTTONS
        ) {
          SearchSERPTelemetry.setBrowserContentSource(
            browser,
            SearchSERPTelemetryUtils.INCONTENT_SOURCES.REFINE_ON_SERP
          );
        } else if (isSerp && isFromNewtab) {
          SearchSERPTelemetry.setBrowserContentSource(
            browser,
            SearchSERPTelemetryUtils.INCONTENT_SOURCES.OPENED_IN_NEW_TAB
          );
        }

        // Step 3: Record the engagement.
        impressionIdsWithoutEngagementsSet.delete(telemetryState.impressionId);
        if (AD_COMPONENTS.includes(type)) {
          telemetryState.adsClicked += 1;
        }
        Glean.serp.engagement.record({
          impression_id: telemetryState.impressionId,
          action: SearchSERPTelemetryUtils.ACTIONS.CLICKED,
          target: type,
        });
        lazy.logConsole.debug("Counting click:", {
          impressionId: telemetryState.impressionId,
          type,
          URL: url,
        });
        // Prevent re-directed channels from being examined more than once.
        wrappedChannel._recordedClick = true;
      }
      ChromeUtils.addProfilerMarker(
        "SearchSERPTelemetry._observeActivity",
        start,
        "Maybe record user engagement."
      );
    }
  }

  /**
   * Logs telemetry for a page with adverts, if it is one of the partner search
   * provider pages that we're tracking.
   *
   * @param {object} info
   *     The search provider information for the page.
   * @param {boolean} info.hasAds
   *     Whether or not the page has adverts.
   * @param {string} info.url
   *     The url of the page.
   * @param {object} browser
   *     The browser associated with the page.
   */
  _reportPageWithAds(info, browser) {
    let item = this._findItemForBrowser(browser);
    if (!item) {
      lazy.logConsole.warn(
        "Expected to report URI for",
        info.url,
        "with ads but couldn't find the information"
      );
      return;
    }

    let telemetryState = item.browserTelemetryStateMap.get(browser);
    // Without telemetry state for this browser there is no way to tell
    // whether the ad was already reported, so don't count it.
    if (!telemetryState) {
      lazy.logConsole.warn(
        "Expected to report URI for",
        info.url,
        "with ads but couldn't find the telemetry state"
      );
      return;
    }
    if (telemetryState.adsReported) {
      lazy.logConsole.debug(
        "Ad was previously reported for browser with URI",
        info.url
      );
      return;
    }

    lazy.logConsole.debug(
      "Counting ads in page for",
      item.info.provider,
      item.info.type,
      item.source,
      info.url
    );
    Services.telemetry.keyedScalarAdd(
      SEARCH_WITH_ADS_SCALAR_BASE + item.source,
      `${item.info.provider}:${item.info.type}`,
      1
    );
    Services.obs.notifyObservers(null, "reported-page-with-ads");

    telemetryState.adsReported = true;

    if (item.newtabSessionId) {
      Glean.newtabSearchAd.impression.record({
        newtab_visit_id: item.newtabSessionId,
        search_access_point: item.source,
        is_follow_on: item.info.type.endsWith("follow-on"),
        is_tagged: item.info.type.startsWith("tagged"),
        telemetry_id: item.info.provider,
      });
    }
  }

  /**
   * Logs ad impression telemetry for a page with adverts, if it is
   * one of the partner search provider pages that we're tracking.
   *
   * @param {object} info
   *     The search provider information for the page.
   * @param {string} info.url
   *     The url of the page.
   * @param {Map<string, object>} info.adImpressions
   *     A map of ad impressions found for the page, where the key
   *     is the type of ad component and the value is an object
   *     containing the number of ads that were loaded, visible,
   *     and hidden.
   * @param {Map<string, string>} info.hrefToComponentMap
   *     A map of hrefs to their component type. Contains both ads
   *     and non-ads.
   * @param {object} browser
   *     The browser associated with the page.
   */
  _reportPageWithAdImpressions(info, browser) {
    let item = this._findItemForBrowser(browser);
    if (!item) {
      return;
    }
    let telemetryState = item.browserTelemetryStateMap.get(browser);
    if (
      lazy.serpEventsEnabled &&
      info.adImpressions &&
      telemetryState &&
      !telemetryState.adImpressionsReported
    ) {
      for (let [componentType, data] of info.adImpressions.entries()) {
        telemetryState.adsVisible += data.adsVisible;

        lazy.logConsole.debug("Counting ad:", { type: componentType, ...data });
        Glean.serp.adImpression.record({
          impression_id: telemetryState.impressionId,
          component: componentType,
          ads_loaded: data.adsLoaded,
          ads_visible: data.adsVisible,
          ads_hidden: data.adsHidden,
        });
      }
      // Convert hrefToComponentMap to a urlToComponentMap in order to cache
      // the query parameters of the href.
      let urlToComponentMap = new Map();
      for (let [href, adType] of info.hrefToComponentMap) {
        urlToComponentMap.set(new URL(href), adType);
      }
      telemetryState.urlToComponentMap = urlToComponentMap;
      telemetryState.adImpressionsReported = true;
      Services.obs.notifyObservers(null, "reported-page-with-ad-impressions");
    }
  }

  /**
   * Records a page action from a SERP page. Normally, actions are tracked in
   * parent process by observing network events but some actions are not
   * possible to detect outside of subscribing to the child process.
   *
   * @param {object} info
   *   The search provider information for the page.
   * @param {string} info.type
   *   The component type that was clicked on.
   * @param {string} info.action
   *   The action taken on the page.
   * @param {object} browser
   *   The browser associated with the page.
   */
  _reportPageAction(info, browser) {
    let item = this._findItemForBrowser(browser);
    if (!item) {
      return;
    }
    let telemetryState = item.browserTelemetryStateMap.get(browser);
    let impressionId = telemetryState?.impressionId;
    if (info.type && impressionId) {
      lazy.logConsole.debug(`Recorded page action:`, {
        impressionId: telemetryState.impressionId,
        type: info.type,
        action: info.action,
      });
      Glean.serp.engagement.record({
        impression_id: impressionId,
        action: info.action,
        target: info.type,
      });
      impressionIdsWithoutEngagementsSet.delete(impressionId);
      // In-content searches are not categorized with a type, so they will
      // not be picked up in the network processes.
      if (
        info.type == SearchSERPTelemetryUtils.COMPONENTS.INCONTENT_SEARCHBOX &&
        info.action == SearchSERPTelemetryUtils.ACTIONS.SUBMITTED
      ) {
        telemetryState.searchBoxSubmitted = true;
        SearchSERPTelemetry.setBrowserContentSource(
          browser,
          SearchSERPTelemetryUtils.INCONTENT_SOURCES.SEARCHBOX
        );
      }
    } else {
      lazy.logConsole.warn(
        "Expected to report a",
        info.action,
        "engagement for",
        info.url,
        "but couldn't find an impression id."
      );
    }
  }

  /**
   * Records the Glean SERP impression event for a page, using the impression
   * info stored in the telemetry state of the browser.
   *
   * @param {object} info
   *   The information reported for the page.
   * @param {boolean} info.shoppingTabDisplayed
   *   Reported as `shopping_tab_displayed` on the impression.
   * @param {object} browser
   *   The browser associated with the page.
   */
  _reportPageImpression(info, browser) {
    let item = this._findItemForBrowser(browser);
    // The browser may not be tracked, in which case there is no item.
    let telemetryState = item?.browserTelemetryStateMap.get(browser);
    if (!telemetryState?.impressionInfo) {
      lazy.logConsole.debug(
        "Could not find telemetry state or impression info."
      );
      return;
    }
    let impressionId = telemetryState.impressionId;
    if (impressionId) {
      let impressionInfo = telemetryState.impressionInfo;
      Glean.serp.impression.record({
        impression_id: impressionId,
        provider: impressionInfo.provider,
        tagged: impressionInfo.tagged,
        partner_code: impressionInfo.partnerCode,
        source: impressionInfo.source,
        shopping_tab_displayed: info.shoppingTabDisplayed,
        is_shopping_page: impressionInfo.isShoppingPage,
        is_private: impressionInfo.isPrivate,
      });
      lazy.logConsole.debug(`Reported Impression:`, {
        impressionId,
        ...impressionInfo,
        shoppingTabDisplayed: info.shoppingTabDisplayed,
      });
      Services.obs.notifyObservers(null, "reported-page-with-impression");
    } else {
      lazy.logConsole.debug("Could not find an impression id.");
    }
  }

  /**
   * Initiates the categorization and reporting of domains extracted from
   * SERPs.
   *
   * @param {object} info
   *   The search provider information for the page.
   * @param {Set} info.nonAdDomains
   *   The non-ad domains extracted from the page.
   * @param {Set} info.adDomains
   *   The ad domains extracted from the page.
   * @param {object} browser
   *   The browser associated with the page.
   */
  _reportPageDomains(info, browser) {
    let item = this._findItemForBrowser(browser);
    // The browser may not be tracked, in which case there is no item and
    // nothing is categorized. Observers are still notified below.
    let telemetryState = item?.browserTelemetryStateMap.get(browser);
    if (lazy.serpEventTelemetryCategorization && telemetryState) {
      let result = SearchSERPCategorization.maybeCategorizeSERP(
        info.nonAdDomains,
        info.adDomains,
        item.info.provider
      );
      if (result) {
        telemetryState.categorizationInfo = result;
        // Recording is deferred: the scheduler decides when to invoke the
        // callback, so the ad counts are read at that time.
        let callback = () => {
          let impressionInfo = telemetryState.impressionInfo;
          SERPCategorizationRecorder.recordCategorizationTelemetry({
            ...telemetryState.categorizationInfo,
            app_version: item.majorVersion,
            channel: item.channel,
            region: item.region,
            partner_code: impressionInfo.partnerCode,
            provider: impressionInfo.provider,
            tagged: impressionInfo.tagged,
            num_ads_clicked: telemetryState.adsClicked,
            num_ads_visible: telemetryState.adsVisible,
          });
        };
        SearchSERPCategorizationEventScheduler.addCallback(browser, callback);
      }
    }
    Services.obs.notifyObservers(
      null,
      "reported-page-with-categorized-domains"
    );
  }
}
|
|
|
|
/**
|
|
* @typedef {object} CategorizationResult
|
|
* @property {string} organic_category
|
|
* The category for the organic result.
|
|
* @property {number} organic_num_domains
|
|
* The number of domains examined to determine the organic category result.
|
|
* @property {number} organic_num_inconclusive
|
|
* The number of inconclusive domains when determining the organic result.
|
|
* @property {number} organic_num_unknown
|
|
* The number of unknown domains when determining the organic result.
|
|
* @property {string} sponsored_category
|
|
* The category for the sponsored result.
|
|
* @property {number} sponsored_num_domains
|
|
* The number of domains examined to determine the sponsored category.
|
|
* @property {number} sponsored_num_inconclusive
|
|
* The number of inconclusive domains when determining the sponsored category.
|
|
* @property {number} sponsored_num_unknown
|
|
* The number of unknown domains when determining the sponsored category.
|
|
* @property {string} mappings_version
|
|
* The category mapping version used to determine the categories.
|
|
*/
|
|
|
|
/**
|
|
* @typedef {object} CategorizationExtraParams
|
|
* @property {number} num_ads_clicked
|
|
* The total number of ads clicked on a SERP.
|
|
* @property {number} num_ads_visible
|
|
* The total number of ads visible to the user when categorization occurred.
|
|
*/
|
|
|
|
/* eslint-disable jsdoc/valid-types */
|
|
/**
|
|
* @typedef {CategorizationResult & CategorizationExtraParams} RecordCategorizationParameters
|
|
*/
|
|
/* eslint-enable jsdoc/valid-types */
|
|
|
|
/**
 * Categorizes SERPs.
 */
class SERPCategorizer {
  /**
   * Categorizes domains extracted from SERPs. Note that we don't process
   * domains if the domain-to-categories map is empty (if the client couldn't
   * download Remote Settings attachments, for example).
   *
   * @param {Set} nonAdDomains
   *   Domains from organic results extracted from the page.
   * @param {Set} adDomains
   *   Domains from ad results extracted from the page.
   * @param {string} provider
   *   The provider associated with the page.
   * @returns {CategorizationResult | null}
   *   The final categorization result. Returns null if the map was empty.
   */
  maybeCategorizeSERP(nonAdDomains, adDomains, provider) {
    // Per DS, if the map was empty (e.g. because of a technical issue
    // downloading the data), we shouldn't report telemetry.
    // Thus, there is no point attempting to categorize the SERP.
    if (SearchSERPDomainToCategoriesMap.empty) {
      return null;
    }

    let organic = this.applyCategorizationLogic(
      this.processDomains(nonAdDomains, provider)
    );
    let sponsored = this.applyCategorizationLogic(
      this.processDomains(adDomains, provider)
    );

    return {
      organic_category: organic.category,
      organic_num_domains: organic.num_domains,
      organic_num_unknown: organic.num_unknown,
      organic_num_inconclusive: organic.num_inconclusive,
      sponsored_category: sponsored.category,
      sponsored_num_domains: sponsored.num_domains,
      sponsored_num_unknown: sponsored.num_unknown,
      sponsored_num_inconclusive: sponsored.num_inconclusive,
      mappings_version: SearchSERPDomainToCategoriesMap.version,
    };
  }

  /**
   * Applies the logic for reducing extracted domains to a single category for
   * the SERP.
   *
   * @param {Set} domains
   *   The domains extracted from the page.
   * @returns {object} resultsToReport
   *   The final categorization results. Keys are: "category", "num_domains",
   *   "num_unknown" and "num_inconclusive".
   */
  applyCategorizationLogic(domains) {
    let domainInfo = {};
    let domainsCount = 0;
    let unknownsCount = 0;
    let inconclusivesCount = 0;

    // Per a request from Data Science, we need to limit the number of domains
    // categorized to 10 non-ad domains and 10 ad domains.
    domains = new Set(
      [...domains].slice(0, CATEGORIZATION_SETTINGS.MAX_DOMAINS_TO_CATEGORIZE)
    );

    for (let domain of domains) {
      domainsCount++;

      let categoryCandidates = SearchSERPDomainToCategoriesMap.get(domain);

      // A domain with no candidates is unknown to the map.
      if (!categoryCandidates.length) {
        unknownsCount++;
        continue;
      }

      // A domain is inconclusive when its only candidate is the inconclusive
      // category, or when the inconclusive category is one of its candidates
      // with a score at or above the highest score threshold.
      let isInconclusive =
        (categoryCandidates.length == 1 &&
          categoryCandidates[0].category ==
            SearchSERPTelemetryUtils.CATEGORIZATION.INCONCLUSIVE) ||
        categoryCandidates.some(
          c =>
            c.category ==
              SearchSERPTelemetryUtils.CATEGORIZATION.INCONCLUSIVE &&
            c.score >= CATEGORIZATION_SETTINGS.HIGHEST_SCORE_THRESHOLD
        );
      if (isInconclusive) {
        inconclusivesCount++;
        continue;
      }

      domainInfo[domain] = categoryCandidates;
    }

    let finalCategory;
    let topCategories = [];
    // Determine if all domains were unknown or inconclusive.
    if (unknownsCount + inconclusivesCount == domainsCount) {
      finalCategory = SearchSERPTelemetryUtils.CATEGORIZATION.INCONCLUSIVE;
    } else {
      let maxScore = CATEGORIZATION_SETTINGS.MINIMUM_SCORE;
      let rank = CATEGORIZATION_SETTINGS.STARTING_RANK;
      for (let categoryCandidates of Object.values(domainInfo)) {
        for (let { category, score } of categoryCandidates) {
          // The score is discounted by the logarithm of the rank, which
          // advances with every candidate examined.
          let adjustedScore = score / Math.log2(rank);
          // Categories are normalized to numbers in both branches so that
          // the type of the reported category doesn't depend on whether a
          // tie occurred.
          if (adjustedScore > maxScore) {
            maxScore = adjustedScore;
            topCategories = [Number(category)];
          } else if (adjustedScore == maxScore) {
            topCategories.push(Number(category));
          }
          rank++;
        }
      }
      // Ties between top categories are broken randomly.
      finalCategory =
        topCategories.length > 1
          ? this.#chooseRandomlyFrom(topCategories)
          : topCategories[0];
    }

    return {
      category: finalCategory,
      num_domains: domainsCount,
      num_unknown: unknownsCount,
      num_inconclusive: inconclusivesCount,
    };
  }

  /**
   * Processes raw domains extracted from the SERP into their final form before
   * categorization.
   *
   * @param {Set} domains
   *   The domains extracted from the page.
   * @param {string} provider
   *   The provider associated with the page.
   * @returns {Set} processedDomains
   *   The final set of processed domains for a page.
   */
  processDomains(domains, provider) {
    let processedDomains = new Set();

    for (let domain of domains) {
      // Don't include domains associated with the search provider.
      if (
        domain.startsWith(`${provider}.`) ||
        domain.includes(`.${provider}.`)
      ) {
        continue;
      }
      let domainWithoutSubdomains = this.#stripDomainOfSubdomains(domain);
      // We may have come across the same domain twice, once with www. prefixed
      // and another time without. The Set takes care of the de-duplication.
      if (domainWithoutSubdomains) {
        processedDomains.add(domainWithoutSubdomains);
      }
    }

    return processedDomains;
  }

  /**
   * Helper to strip domains of any subdomains.
   *
   * @param {string} domain
   *   The domain to strip of any subdomains.
   * @returns {string}
   *   The given domain with any subdomains removed, or an empty string if
   *   the public suffix lookup throws or no label precedes the suffix.
   */
  #stripDomainOfSubdomains(domain) {
    let tld;
    // Can throw an exception if the input has too few domain levels.
    try {
      tld = Services.eTLD.getKnownPublicSuffixFromHost(domain);
    } catch (ex) {
      return "";
    }

    // Removing the suffix leaves a trailing ".", so the label right before
    // the suffix is the second to last element after splitting.
    let domainWithoutTLD = domain.substring(0, domain.length - tld.length);
    let secondLevelDomain = domainWithoutTLD.split(".").at(-2);

    return secondLevelDomain ? `${secondLevelDomain}.${tld}` : "";
  }

  /**
   * Picks one element of the given array at random.
   *
   * @param {Array} categories
   *   The candidates to choose from.
   * @returns {*}
   *   One of the elements of the array.
   */
  #chooseRandomlyFrom(categories) {
    let randIdx = Math.floor(Math.random() * categories.length);
    return categories[randIdx];
  }
}
|
|
|
|
/**
 * Contains outstanding categorizations of browser objects that have yet to be
 * scheduled to be reported into a Glean event.
 * They are kept here until one of the conditions are met:
 * 1. The browser that was tracked is no longer being tracked.
 * 2. A user has been idle for IDLE_TIMEOUT_SECONDS
 * 3. The user has awoken their computer and the time elapsed from the last
 *    categorization event exceeds WAKE_TIMEOUT_MS.
 */
class CategorizationEventScheduler {
  /**
   * A WeakMap containing browser objects mapped to a callback.
   *
   * @type {WeakMap | null}
   */
  #browserToCallbackMap = null;

  /**
   * An instance of user idle service. Cached for testing purposes.
   *
   * @type {nsIUserIdleService | null}
   */
  #idleService = null;

  /**
   * Whether it has been initialized.
   *
   * @type {boolean}
   */
  #init = false;

  /**
   * The last Date.now() of a callback insertion.
   *
   * @type {number | null}
   */
  #mostRecentMs = null;

  constructor() {
    this.init();
  }

  /**
   * Creates the callback map and registers this object as an idle observer
   * and as an observer of "quit-application" and "wake_notification". Does
   * nothing when categorization is disabled or when already initialized.
   */
  init() {
    if (!lazy.serpEventTelemetryCategorization || this.#init) {
      return;
    }

    lazy.logConsole.debug("Initializing categorization event scheduler.");

    this.#browserToCallbackMap = new WeakMap();

    // In tests, we simulate idleness as it is more reliable and easier than
    // trying to replicate idleness. The way to do so is by creating a mock
    // idle service and having the component subscribe to it. If we used a
    // lazy instantiation of the idle service, the test could only ever be
    // subscribed to the real one.
    this.#idleService = Cc["@mozilla.org/widget/useridleservice;1"].getService(
      Ci.nsIUserIdleService
    );

    this.#idleService.addIdleObserver(
      this,
      CATEGORIZATION_SETTINGS.IDLE_TIMEOUT_SECONDS
    );

    Services.obs.addObserver(this, "quit-application");
    Services.obs.addObserver(this, "wake_notification");

    this.#init = true;
  }

  /**
   * Drops the callback map (pending callbacks are discarded, not sent) and
   * removes every observer registered by init(). Does nothing when not
   * initialized.
   */
  uninit() {
    if (!this.#init) {
      return;
    }

    this.#browserToCallbackMap = null;

    lazy.logConsole.debug("Un-initializing categorization event scheduler.");
    this.#idleService.removeIdleObserver(
      this,
      CATEGORIZATION_SETTINGS.IDLE_TIMEOUT_SECONDS
    );

    Services.obs.removeObserver(this, "quit-application");
    Services.obs.removeObserver(this, "wake_notification");

    this.#idleService = null;
    this.#init = false;
  }

  /**
   * Observer entry point for the topics subscribed to in init().
   *
   * @param {*} subject
   *   Unused.
   * @param {string} topic
   *   "idle" and "wake_notification" may flush all pending callbacks;
   *   "quit-application" un-initializes the scheduler.
   * @param {*} data
   *   Unused.
   */
  observe(subject, topic, data) {
    switch (topic) {
      case "idle":
        lazy.logConsole.debug("Triggering all callbacks due to idle.");
        this.#sendAllCallbacks();
        break;
      case "quit-application":
        this.uninit();
        break;
      case "wake_notification":
        // Only flush when a callback was added and enough time has passed
        // since the most recent insertion.
        if (
          this.#mostRecentMs &&
          Date.now() - this.#mostRecentMs >=
            CATEGORIZATION_SETTINGS.WAKE_TIMEOUT_MS
        ) {
          lazy.logConsole.debug(
            "Triggering all callbacks due to a wake notification."
          );
          this.#sendAllCallbacks();
        }
        break;
    }
  }

  /**
   * Stores a callback for a browser, replacing any callback already stored
   * for it, and records the insertion time.
   *
   * @param {object} browser
   *   The browser object used as the key.
   * @param {Function} callback
   *   The function to invoke when the categorization should be reported.
   */
  addCallback(browser, callback) {
    lazy.logConsole.debug("Adding callback to queue.");
    this.#mostRecentMs = Date.now();
    this.#browserToCallbackMap?.set(browser, callback);
  }

  /**
   * Invokes and removes the callback stored for a browser, if any, and
   * notifies observers of "recorded-single-categorization-event" once the
   * callback has returned. If the callback throws, the error propagates to
   * the caller but the entry is still removed so that a failing callback is
   * never invoked a second time.
   *
   * @param {object} browser
   *   The browser object whose callback should be sent.
   */
  sendCallback(browser) {
    let callback = this.#browserToCallbackMap?.get(browser);
    if (!callback) {
      return;
    }

    lazy.logConsole.debug("Triggering callback.");
    try {
      callback();
      Services.obs.notifyObservers(
        null,
        "recorded-single-categorization-event"
      );
    } finally {
      // The map can be null here if the scheduler was un-initialized while
      // the callback was running.
      this.#browserToCallbackMap?.delete(browser);
    }
  }

  /**
   * Sends every pending callback, clears the last insertion time, and
   * notifies observers of "recorded-all-categorization-events". A callback
   * that throws is logged and does not prevent the remaining callbacks from
   * being sent.
   */
  #sendAllCallbacks() {
    let browsers = ChromeUtils.nondeterministicGetWeakMapKeys(
      this.#browserToCallbackMap
    );
    // A falsy result (e.g. when there is no map) means nothing to send.
    if (browsers) {
      lazy.logConsole.debug("Triggering all callbacks.");
      for (let browser of browsers) {
        try {
          this.sendCallback(browser);
        } catch (ex) {
          lazy.logConsole.error("Categorization callback failed:", ex);
        }
      }
    }
    this.#mostRecentMs = null;
    Services.obs.notifyObservers(null, "recorded-all-categorization-events");
  }
}
|
|
|
|
/**
 * Handles reporting SERP categorization telemetry to Glean. Reporting to
 * Glean is still pending (Bug 1868476); for now results are only written to
 * the debug log.
 */
class CategorizationRecorder {
  /**
   * Records a single SERP categorization result.
   *
   * @param {RecordCategorizationParameters} resultToReport
   *   The object containing all the data required to report.
   */
  recordCategorizationTelemetry(resultToReport) {
    const logPrefix = "Reporting the following categorization result:";
    lazy.logConsole.debug(logPrefix, resultToReport);
    // TODO: Bug 1868476 - Report result to Glean.
  }
}
|
|
|
|
/**
|
|
* @typedef {object} DomainToCategoriesRecord
|
|
* @property {number} version
|
|
* The version of the record.
|
|
*/
|
|
|
|
/**
|
|
* @typedef {object} DomainCategoryScore
|
|
* @property {number} category
|
|
* The index of the category.
|
|
* @property {number} score
|
|
* The score associated with the category.
|
|
*/
|
|
|
|
/**
 * Maps domain to categories, with data synced with Remote Settings.
 * Lookups are keyed by a hash of the domain, and the map is rebuilt from
 * downloaded attachments whenever the collection syncs.
 */
class DomainToCategoriesMap {
  /**
   * Contains the domain to category scores. Keys are hashed domains and each
   * value is a flat array of alternating category and score entries
   * ([category, score, category, score, ...]), which get() converts into
   * DomainCategoryScore objects.
   *
   * @type {Object<string, Array<number>> | null}
   */
  #map = null;

  /**
   * Latest version number of the attachments.
   *
   * @type {number | null}
   */
  #version = null;

  /**
   * The Remote Settings client.
   *
   * @type {object | null}
   */
  #client = null;

  /**
   * Whether this is synced with Remote Settings.
   *
   * @type {boolean}
   */
  #init = false;

  /**
   * Callback when Remote Settings syncs.
   *
   * @type {Function | null}
   */
  #onSettingsSync = null;

  /**
   * When downloading an attachment from Remote Settings fails, this will
   * contain a timer which will eventually attempt to retry downloading
   * attachments.
   *
   * @type {nsITimer | null}
   */
  #downloadTimer = null;

  /**
   * Number of times this has attempted to try another download. Will reset
   * if the categorization preference has been toggled, or a sync event has
   * been detected.
   *
   * @type {number}
   */
  #downloadRetries = 0;

  /**
   * Runs at application startup with startup idle tasks. If the SERP
   * categorization preference is enabled, it creates a Remote Settings
   * client to listen to updates, and populates the map. Population runs in
   * the background: the returned promise does not wait for it, and setup
   * failures are logged.
   */
  async init() {
    if (!lazy.serpEventTelemetryCategorization || this.#init) {
      return;
    }
    lazy.logConsole.debug("Initializing domain-to-categories map.");
    // Not awaited, so callers are not held up by downloads. Attach a handler
    // so a failure is logged rather than becoming an unhandled rejection.
    this.#setupClientAndMap().catch(ex => {
      lazy.logConsole.error("Could not set up domain-to-categories map:", ex);
    });
    this.#init = true;
  }

  /**
   * Removes the Remote Settings client, clears the map and cancels any
   * pending download retry. Does nothing when not initialized.
   */
  uninit() {
    if (this.#init) {
      lazy.logConsole.debug("Un-initializing domain-to-categories map.");
      this.#clearClientAndMap();
      this.#cancelAndNullifyTimer();
      this.#init = false;
    }
  }

  /**
   * Given a domain, find categories and relevant scores.
   *
   * @param {string} domain Domain to lookup.
   * @returns {Array<DomainCategoryScore>}
   *   An array containing categories and their respective score. If no record
   *   for the domain is available, return an empty array.
   */
  get(domain) {
    if (this.empty) {
      return [];
    }
    lazy.gCryptoHash.init(lazy.gCryptoHash.MD5);
    let bytes = new TextEncoder().encode(domain);
    // Pass the number of encoded bytes rather than domain.length: for
    // non-ASCII input the UTF-8 encoding is longer than the string's UTF-16
    // length, and a shorter count would hash a truncated buffer.
    lazy.gCryptoHash.update(bytes, bytes.length);
    let hash = lazy.gCryptoHash.finish(true);
    let rawValues = this.#map[hash] ?? [];

    // Transform data into a more readable format.
    // [x, y] => { category: x, score: y }
    let output = [];
    for (let i = 0; i < rawValues.length; i += 2) {
      output.push({ category: rawValues[i], score: rawValues[i + 1] });
    }
    return output;
  }

  /**
   * If the map was initialized, returns the version number for the data.
   * The version number is determined by the record with the highest version
   * number. Even if the records have different versions, only records from the
   * latest version should be available. Returns null if the map was not
   * initialized.
   *
   * @returns {null | number} The version number.
   */
  get version() {
    return this.#version;
  }

  /**
   * Whether the map is empty of data.
   *
   * @returns {boolean}
   */
  get empty() {
    return !this.#map;
  }

  /**
   * Unit test-only function, used to override the domainToCategoriesMap so
   * that tests can set it to easy to test values.
   *
   * @param {object} domainToCategoriesMap
   *   An object where the key is a hashed domain and the value is a flat
   *   array of alternating category and score entries, in the shape that
   *   get() reads.
   */
  overrideMapForTests(domainToCategoriesMap) {
    this.#map = domainToCategoriesMap;
  }

  /**
   * Creates the Remote Settings client, subscribes to its "sync" event and
   * populates the map from the records currently in the collection. Does
   * nothing when a client exists and the map already has data.
   */
  async #setupClientAndMap() {
    if (this.#client && !this.empty) {
      return;
    }
    lazy.logConsole.debug("Setting up domain-to-categories map.");
    this.#client = lazy.RemoteSettings(TELEMETRY_CATEGORIZATION_KEY);

    this.#onSettingsSync = event => this.#sync(event.data);
    this.#client.on("sync", this.#onSettingsSync);

    let records = await this.#client.get();
    await this.#clearAndPopulateMap(records);
  }

  /**
   * Unsubscribes from and drops the Remote Settings client (resetting the
   * retry counter), then clears the map and its version.
   */
  #clearClientAndMap() {
    if (this.#client) {
      lazy.logConsole.debug("Removing Remote Settings client.");
      this.#client.off("sync", this.#onSettingsSync);
      this.#client = null;
      this.#onSettingsSync = null;
      this.#downloadRetries = 0;
    }

    if (this.#map) {
      lazy.logConsole.debug("Clearing domain-to-categories map.");
      this.#map = null;
      this.#version = null;
    }
  }

  /**
   * Inspects a list of records from the categorization domain bucket and finds
   * the maximum version number from the set of records. Each record should
   * have the same version number but if for any reason one entry has a lower
   * version number, the latest version can be used to filter it out.
   *
   * @param {Array<DomainToCategoriesRecord>} records
   *   An array containing the records from a Remote Settings collection.
   * @returns {number}
   *   The highest version found, or 0 when no record has a version greater
   *   than 0.
   */
  #retrieveLatestVersion(records) {
    return records.reduce((version, record) => {
      if (record.version > version) {
        return record.version;
      }
      return version;
    }, 0);
  }

  /**
   * Callback when Remote Settings has indicated the collection has been
   * synced. Since the records in the collection will be updated all at once,
   * use the array of current records which at this point in time would have
   * the latest records from Remote Settings. Additionally, delete any
   * attachment for records that no longer exist.
   *
   * @param {object} data
   *   Object containing records that are current, deleted, created, or updated.
   */
  async #sync(data) {
    lazy.logConsole.debug("Syncing domain-to-categories with Remote Settings.");

    // Remove local files of deleted records. Tolerate a payload without a
    // list of deleted records.
    let toDelete = data?.deleted?.filter(d => d.attachment) ?? [];
    await Promise.all(
      toDelete.map(record => this.#client.attachments.deleteDownloaded(record))
    );

    // In case a user encountered network failures in the past and kept their
    // session on, this will ensure the next sync event will retry downloading
    // again in case there's a new download error.
    this.#downloadRetries = 0;

    // Not awaited, as before; failures are logged instead of surfacing as an
    // unhandled rejection.
    this.#clearAndPopulateMap(data?.current).catch(ex => {
      lazy.logConsole.error(
        "Could not populate domain-to-categories map:",
        ex
      );
    });
  }

  /**
   * Clear the existing map and populate it with attachments found in the
   * records. The map and version are reset up front, so if there are no
   * records, a download fails, or no record carries a version number, the
   * map is left empty. A failed download schedules a retry.
   *
   * @param {Array<DomainToCategoriesRecord>} records
   *   The records containing attachments.
   */
  async #clearAndPopulateMap(records) {
    // Set map to null so that if there are errors in the downloads, consumers
    // will be able to know whether the map has information. Once we've
    // successfully downloaded attachments and are parsing them, a non-null
    // object will be created.
    this.#map = null;
    this.#version = null;
    this.#cancelAndNullifyTimer();

    if (!records?.length) {
      lazy.logConsole.debug("No records found for domain-to-categories map.");
      return;
    }

    let fileContents = [];
    for (let record of records) {
      let result;
      // Downloading attachments can fail.
      try {
        result = await this.#client.attachments.download(record);
      } catch (ex) {
        lazy.logConsole.error("Could not download file:", ex);
        this.#createTimerToPopulateMap();
        return;
      }
      fileContents.push(result.buffer);
    }

    // All attachments should have the same version number.
    // NOTE(review): every downloaded attachment is merged below regardless of
    // its record's version; nothing here restricts the merge to records of
    // the latest version. Confirm whether filtering is intended.
    this.#version = this.#retrieveLatestVersion(records);

    if (!this.#version) {
      lazy.logConsole.debug("Could not find a version number for any record.");
      return;
    }

    // Queue the series of assignments.
    for (let i = 0; i < fileContents.length; ++i) {
      let buffer = fileContents[i];
      Services.tm.idleDispatchToMainThread(() => {
        let start = Cu.now();
        let json;
        try {
          json = JSON.parse(new TextDecoder().decode(buffer));
        } catch (ex) {
          // TODO: If there was an error decoding the buffer, we may want to
          // dispatch an error in telemetry or try again.
          return;
        }
        ChromeUtils.addProfilerMarker(
          "SearchSERPTelemetry.#clearAndPopulateMap",
          start,
          "Convert buffer to JSON."
        );
        if (!this.#map) {
          this.#map = {};
        }
        Object.assign(this.#map, json);
        lazy.logConsole.debug("Updated domain-to-categories map.");
        // Announce completion once the final attachment has been merged.
        if (i == fileContents.length - 1) {
          Services.obs.notifyObservers(
            null,
            "domain-to-categories-map-update-complete"
          );
        }
      });
    }
  }

  /**
   * Cancels the pending download retry timer, if there is one.
   */
  #cancelAndNullifyTimer() {
    if (this.#downloadTimer) {
      lazy.logConsole.debug("Cancel and nullify download timer.");
      this.#downloadTimer.cancel();
      this.#downloadTimer = null;
    }
  }

  /**
   * Schedules a one-shot timer that re-fetches the records and tries to
   * populate the map again. The delay is the configured base plus a random
   * adjustment. Nothing is scheduled once the per-session retry limit has
   * been reached, or when there is no Remote Settings client left to retry
   * with.
   */
  #createTimerToPopulateMap() {
    if (
      this.#downloadRetries >=
      TELEMETRY_CATEGORIZATION_DOWNLOAD_SETTINGS.maxTriesPerSession
    ) {
      return;
    }
    // Without a client (e.g. after uninit()) a retry could only fail.
    if (!this.#client) {
      return;
    }
    if (!this.#downloadTimer) {
      this.#downloadTimer = Cc["@mozilla.org/timer;1"].createInstance(
        Ci.nsITimer
      );
    }
    lazy.logConsole.debug("Create timer to retry downloading attachments.");
    let delay =
      TELEMETRY_CATEGORIZATION_DOWNLOAD_SETTINGS.base +
      randomInteger(
        TELEMETRY_CATEGORIZATION_DOWNLOAD_SETTINGS.minAdjust,
        TELEMETRY_CATEGORIZATION_DOWNLOAD_SETTINGS.maxAdjust
      );
    this.#downloadTimer.initWithCallback(
      async () => {
        this.#downloadRetries += 1;
        // Nothing awaits a timer callback, so handle failures here.
        try {
          let records = await this.#client.get();
          await this.#clearAndPopulateMap(records);
        } catch (ex) {
          lazy.logConsole.error(
            "Could not retry populating domain-to-categories map:",
            ex
          );
        }
      },
      delay,
      Ci.nsITimer.TYPE_ONE_SHOT
    );
  }
}
|
|
|
|
/**
 * Returns a pseudo-random integer in the inclusive range [min, max], based
 * on Math.random().
 *
 * @param {number} min
 *   The smallest value that can be returned.
 * @param {number} max
 *   The largest value that can be returned.
 * @returns {number}
 */
function randomInteger(min, max) {
  // +1 so that max itself is a possible outcome.
  let span = max - min + 1;
  let offset = Math.floor(Math.random() * span);
  return offset + min;
}
|
|
|
|
// Module-level singletons. Each export is a single instance created when
// this module is first loaded.

// Domain-to-categories lookup backed by Remote Settings.
export var SearchSERPDomainToCategoriesMap = new DomainToCategoriesMap();

// Main SERP telemetry handler.
export var SearchSERPTelemetry = new TelemetryHandler();

// Categorizer for the domains found on a SERP.
export var SearchSERPCategorization = new SERPCategorizer();

// Recorder used to report categorization results.
export var SERPCategorizationRecorder = new CategorizationRecorder();

// Scheduler for pending categorization callbacks; its constructor calls
// init() immediately.
export var SearchSERPCategorizationEventScheduler =
  new CategorizationEventScheduler();
|