Bug 1960959 - Improve lookup performance of url classifier exception list. r=bvandersloot

Switching the data structure from an array to nested hash maps. Differential Revision: https://phabricator.services.mozilla.com/D247625
2025-05-07 09:47:58 +00:00
parent e01fee1518
commit eda1e20339
3 changed files with 242 additions and 40 deletions
--- a/netwerk/url-classifier/UrlClassifierExceptionList.cpp
+++ b/netwerk/url-classifier/UrlClassifierExceptionList.cpp
@@ -5,10 +5,14 @@
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

 #include "UrlClassifierExceptionList.h"
+#include "nsIEffectiveTLDService.h"
 #include "nsIUrlClassifierExceptionListEntry.h"
 #include "nsIURI.h"
 #include "mozilla/net/UrlClassifierCommon.h"
 #include "mozilla/ProfilerMarkers.h"
+#include "nsNetCID.h"
+#include "nsServiceManagerUtils.h"
+#include "mozilla/RustRegex.h"

 namespace mozilla::net {

@@ -25,12 +29,58 @@ UrlClassifierExceptionList::AddEntry(
    nsIUrlClassifierExceptionListEntry* aEntry) {
  NS_ENSURE_ARG_POINTER(aEntry);

-  nsAutoCString entryString;
-  Unused << aEntry->Describe(entryString);
-  UC_LOG_DEBUG(("UrlClassifierExceptionList::%s - Adding entry: %s",
-                __FUNCTION__, entryString.get()));
+  // From the url patterns in the entry, extract the site and top level site.
+  // They are used as keys in the exception entry maps.
+
+  nsAutoCString urlPattern;
+  nsresult rv = aEntry->GetUrlPattern(urlPattern);
+  NS_ENSURE_SUCCESS(rv, rv);
+
+  nsAutoCString site;
+  rv = GetSchemelessSiteFromUrlPattern(urlPattern, site);
+  NS_ENSURE_SUCCESS(rv, rv);
+
+  // We must be able to parse a site from the url pattern.
+  NS_ENSURE_TRUE(!site.IsEmpty(), NS_ERROR_INVALID_ARG);
+
+  nsAutoCString topLevelUrlPattern;
+  rv = aEntry->GetTopLevelUrlPattern(topLevelUrlPattern);
+  NS_ENSURE_SUCCESS(rv, rv);
+
+  nsAutoCString topLevelSite;
+  rv = GetSchemelessSiteFromUrlPattern(topLevelUrlPattern, topLevelSite);
+  NS_ENSURE_SUCCESS(rv, rv);
+
+  // topLevelUrlPattern is not mandatory, but if topLevelUrlPattern is set,
+  // topLevelSite populated as well.
+  NS_ENSURE_TRUE(topLevelUrlPattern.IsEmpty() == topLevelSite.IsEmpty(),
+                 NS_ERROR_INVALID_ARG);
+
+  if (MOZ_LOG_TEST(UrlClassifierCommon::sLog, LogLevel::Debug)) {
+    nsAutoCString entryString;
+    Unused << aEntry->Describe(entryString);
+    UC_LOG_DEBUG(("UrlClassifierExceptionList::%s - Adding entry: %s",
+                  __FUNCTION__, entryString.get()));
+  }
+
+  // If the top level site is empty, the exception applies across all top
+  // level sites. Store it in the global exceptions map.
+  if (topLevelSite.IsEmpty()) {
+    mGlobalExceptions.LookupOrInsert(site).AppendElement(aEntry);
+    return NS_OK;
+  }
+
+  // Otherwise, store it in the site specific exception map.
+  mExceptions
+      // Outer map keyed by top level site.
+      // topLevelSite may be the empty string. We still use that a key. These
+      // entries apply to all top-level sites.
+      .LookupOrInsert(topLevelSite)
+      // Inner map keyed by site of the load.
+      .LookupOrInsert(site)
+      // Append the entry.
+      .AppendElement(aEntry);

-  mEntries.AppendElement(aEntry);
  return NS_OK;
 }

@@ -53,13 +103,77 @@ UrlClassifierExceptionList::Matches(nsIURI* aURI, nsIURI* aTopLevelURI,
       aTopLevelURI ? aTopLevelURI->GetSpecOrDefault().get() : "null",
       aIsPrivateBrowsing));

-  for (auto& entry : mEntries) {
+  // Get the eTLD service so we can compute sites from URIs.
+  nsresult rv;
+  nsCOMPtr<nsIEffectiveTLDService> eTLDService(
+      do_GetService(NS_EFFECTIVETLDSERVICE_CONTRACTID, &rv));
+  NS_ENSURE_SUCCESS(rv, rv);
+
+  // If given, compute the (schemeless) site from the top level URI.
+  // If not we will leave it empty and only look for global exceptions.
+  nsAutoCString aTopLevelSite;
+  if (aTopLevelURI) {
+    rv = eTLDService->GetSchemelessSite(aTopLevelURI, aTopLevelSite);
+    NS_ENSURE_SUCCESS(rv, rv);
+  }
+
+  // Compute the (schemeless) site from the URI of the load.
+  nsAutoCString aSite;
+  rv = eTLDService->GetSchemelessSite(aURI, aSite);
+  NS_ENSURE_SUCCESS(rv, rv);
+
+  // Get the list of exceptions that apply to the current load.
+  // We need to check both global and site specific exceptions
+
+  // 1. Check global exceptions, which apply to all top level sites and lookup
+  //    entries matching the current load (aSite).
+  ExceptionEntryArray* globalExceptions =
+      mGlobalExceptions.Lookup(aSite).DataPtrOrNull();
+
+  *aResult = ExceptionListMatchesLoad(globalExceptions, aURI, aTopLevelURI,
+                                      aIsPrivateBrowsing);
+  if (*aResult) {
+    // We found a match, no need to check the site specific exceptions.
+    return NS_OK;
+  }
+
+  // 2. Get exceptions which apply only to the current top level site.
+  SiteToEntries* topLevelSiteToEntries =
+      mExceptions.Lookup(aTopLevelSite).DataPtrOrNull();
+  if (topLevelSiteToEntries) {
+    ExceptionEntryArray* siteSpecificExceptions =
+        topLevelSiteToEntries->Lookup(aSite).DataPtrOrNull();
+
+    *aResult = ExceptionListMatchesLoad(siteSpecificExceptions, aURI,
+                                        aTopLevelURI, aIsPrivateBrowsing);
+    if (*aResult) {
+      return NS_OK;
+    }
+  }
+
+  if (!(*aResult)) {
+    UC_LOG_DEBUG(("%s - No match found", __FUNCTION__));
+  }
+
+  return NS_OK;
+}
+
+bool UrlClassifierExceptionList::ExceptionListMatchesLoad(
+    ExceptionEntryArray* aExceptions, nsIURI* aURI, nsIURI* aTopLevelURI,
+    bool aIsPrivateBrowsing) {
+  MOZ_ASSERT(aURI);
+
+  if (!aExceptions) {
+    return false;
+  }
+  for (const auto& entry : *aExceptions) {
+    bool match = false;
    nsresult rv =
-        entry->Matches(aURI, aTopLevelURI, aIsPrivateBrowsing, aResult);
+        entry->Matches(aURI, aTopLevelURI, aIsPrivateBrowsing, &match);
    if (NS_WARN_IF(NS_FAILED(rv))) {
      continue;
    }
-    if (*aResult) {
+    if (match) {
      // Match found, return immediately.
      if (MOZ_LOG_TEST(UrlClassifierCommon::sLog, LogLevel::Debug)) {
        nsAutoCString entryString;
@@ -69,20 +183,66 @@ UrlClassifierExceptionList::Matches(nsIURI* aURI, nsIURI* aTopLevelURI,
             "entry: %s",
             __FUNCTION__, entryString.get()));
      }
-      return NS_OK;
+      return true;
    }
  }
+  return false;
+}

-  // No match found, return false.
-  UC_LOG_DEBUG(("%s - No match found", __FUNCTION__));
+NS_IMETHODIMP
+UrlClassifierExceptionList::GetSchemelessSiteFromUrlPattern(
+    const nsACString& aUrlPattern, nsACString& aSite) {
+  if (aUrlPattern.IsEmpty()) {
+    aSite.Truncate();
+    return NS_OK;
+  }

-  return NS_OK;
+  // Extract the host portion from the url pattern. This regex only supports url
+  // patterns with a host.
+  mozilla::RustRegex regex("://(?:\\*\\.)?([^/*]+)");
+  mozilla::RustRegexCaptures captures = regex.FindCaptures(aUrlPattern);
+  NS_ENSURE_TRUE(captures.IsValid(), NS_ERROR_INVALID_ARG);
+
+  // Get the host from the first capture group
+  auto maybeMatch = captures[1];
+  NS_ENSURE_TRUE(maybeMatch, NS_ERROR_INVALID_ARG);
+
+  nsAutoCString host;
+  host.Assign(Substring(aUrlPattern, maybeMatch->start,
+                        maybeMatch->end - maybeMatch->start));
+  NS_ENSURE_TRUE(!host.IsEmpty(), NS_ERROR_INVALID_ARG);
+
+  // Get the eTLD service to convert host to schemeless site
+  nsresult rv;
+  nsCOMPtr<nsIEffectiveTLDService> eTLDService(
+      do_GetService(NS_EFFECTIVETLDSERVICE_CONTRACTID, &rv));
+  NS_ENSURE_SUCCESS(rv, rv);
+
+  return eTLDService->GetSchemelessSiteFromHost(host, aSite);
 }

 NS_IMETHODIMP
 UrlClassifierExceptionList::TestGetEntries(
    nsTArray<RefPtr<nsIUrlClassifierExceptionListEntry>>& aEntries) {
-  aEntries = mEntries.Clone();
+  // Global entries (not top-level specific)
+  for (const auto& entry : mGlobalExceptions) {
+    const ExceptionEntryArray& entries = entry.GetData();
+    aEntries.AppendElements(entries);
+  }
+
+  // Site specific entries.
+  // Iterate through the outer map (top-level sites)
+  for (const auto& outerEntry : mExceptions) {
+    const SiteToEntries& innerMap = outerEntry.GetData();
+
+    // Iterate through the inner map (sites to exception entries)
+    for (const auto& innerEntry : innerMap) {
+      const ExceptionEntryArray& entries = innerEntry.GetData();
+      // Append all entries from this array to the result
+      aEntries.AppendElements(entries);
+    }
+  }
+
  return NS_OK;
 }
 }  // namespace mozilla::net
--- a/netwerk/url-classifier/UrlClassifierExceptionList.h
+++ b/netwerk/url-classifier/UrlClassifierExceptionList.h
@@ -7,6 +7,8 @@
 #ifndef mozilla_UrlClassifierExceptionList_h
 #define mozilla_UrlClassifierExceptionList_h

+#include "nsHashKeys.h"
+#include "nsTHashMap.h"
 #include "nsIUrlClassifierExceptionList.h"
 #include "nsISupports.h"
 #include "nsTArray.h"
@@ -27,8 +29,35 @@ class UrlClassifierExceptionList final : public nsIUrlClassifierExceptionList {
 private:
  ~UrlClassifierExceptionList() = default;

+  // A list of exception entries
+  using ExceptionEntryArray =
+      nsTArray<RefPtr<nsIUrlClassifierExceptionListEntry>>;
+
+  // A map from (schemeless) site to a list of exception entries.
+  using SiteToEntries = nsTHashMap<nsCStringHashKey, ExceptionEntryArray>;
+
+  // Helper method to check if any exception in the array matches the given
+  // load.
+  static bool ExceptionListMatchesLoad(ExceptionEntryArray* aExceptions,
+                                       nsIURI* aURI, nsIURI* aTopLevelURI,
+                                       bool aIsPrivateBrowsing);
+
+  // Helper method to extract the schemeless site from a URL pattern.
+  NS_IMETHODIMP GetSchemelessSiteFromUrlPattern(const nsACString& aUrlPattern,
+                                                nsACString& aSite);
+
+  // The feature this exception list is for, e.g. "tracking-protection".
  nsCString mFeature;
-  nsTArray<RefPtr<nsIUrlClassifierExceptionListEntry>> mEntries;
+
+  // A two stage hash map to store the (top level) site-specific exception
+  // entries.
+  // The outer hash map key is the top level (schemeless) site.
+  // The inner hash map key is the (schemeless) site of the load to be checked.
+  nsTHashMap<nsCStringHashKey, SiteToEntries> mExceptions;
+
+  // A map of exception list entries which apply across all top level sites.
+  // The hash map key is the (schemeless) site of the load to be checked.
+  nsTHashMap<nsCStringHashKey, ExceptionEntryArray> mGlobalExceptions;
 };

 }  // namespace mozilla::net
--- a/toolkit/components/url-classifier/tests/unit/test_exceptionListService.js
+++ b/toolkit/components/url-classifier/tests/unit/test_exceptionListService.js
@@ -110,33 +110,35 @@ add_task(async function test_list_changes() {

  let entries = list.testGetEntries();
  Assert.equal(entries.length, 3, "Has three items in the list");
+  entries = entries.sort((a, b) => a.urlPattern.localeCompare(b.urlPattern));
+
  Assert.equal(
-    entries[0].urlPattern,
+    entries[1].urlPattern,
    "*://example.com/*",
    "First item is example.com"
  );
  Assert.equal(
-    entries[1].urlPattern,
+    entries[2].urlPattern,
    "*://MOZILLA.ORG/*",
    "Second item is mozilla.org"
  );
  Assert.equal(
-    entries[1].topLevelUrlPattern,
+    entries[2].topLevelUrlPattern,
    "*://example.com/*",
    "Top level url pattern of second item is correctly set."
  );
  Assert.equal(
-    entries[1].isPrivateBrowsingOnly,
+    entries[2].isPrivateBrowsingOnly,
    true,
    "isPrivateBrowsingOnly flag of second item is correctly set."
  );
  Assert.deepEqual(
-    entries[1].filterContentBlockingCategories,
+    entries[2].filterContentBlockingCategories,
    ["standard"],
    "filterContentBlockingCategories of second item is correctly set."
  );
  Assert.equal(
-    entries[2].urlPattern,
+    entries[0].urlPattern,
    "*://*.example.org/*",
    "Third item is *.example.org"
  );
@@ -151,6 +153,8 @@ add_task(async function test_list_changes() {

  Assert.equal(entries.length, 4, "Has four items in the list");

+  entries = entries.sort((a, b) => a.urlPattern.localeCompare(b.urlPattern));
+
  Assert.equal(
    entries[1].urlPattern,
    "*://example.com/*",
@@ -162,12 +166,12 @@ add_task(async function test_list_changes() {
    "Second item is mozilla.org"
  );
  Assert.equal(
-    entries[3].urlPattern,
+    entries[0].urlPattern,
    "*://*.example.org/*",
    "Third item is *.example.org"
  );
  Assert.equal(
-    entries[0].urlPattern,
+    entries[3].urlPattern,
    "*://test.com/*",
    "Fourth item is test.com"
  );
@@ -183,33 +187,36 @@ add_task(async function test_list_changes() {

  entries = list.testGetEntries();
  Assert.equal(entries.length, 6, "Has six items in the list");
+
+  entries = entries.sort((a, b) => a.urlPattern.localeCompare(b.urlPattern));
+
  Assert.equal(
-    entries[0].urlPattern,
+    entries[4].urlPattern,
    "*://test.com/*",
    "First item is test.com"
  );
  Assert.equal(
-    entries[1].urlPattern,
+    entries[5].urlPattern,
    "*://whatever.com/*",
    "Second item is whatever.com"
  );
  Assert.equal(
-    entries[2].urlPattern,
+    entries[0].urlPattern,
    "*://*.abc.com/*",
    "Third item is *.abc.com"
  );
  Assert.equal(
-    entries[3].urlPattern,
+    entries[2].urlPattern,
    "*://example.com/*",
    "Fourth item is example.com"
  );
  Assert.equal(
-    entries[4].urlPattern,
+    entries[3].urlPattern,
    "*://MOZILLA.ORG/*",
    "Fifth item is mozilla.org"
  );
  Assert.equal(
-    entries[5].urlPattern,
+    entries[1].urlPattern,
    "*://*.example.org/*",
    "Sixth item is *.example.org"
  );
@@ -288,16 +295,19 @@ add_task(async function test_list_init_data() {

  list = await waitForEvent(updateEvent, "update");
  let entries = list.testGetEntries();
+  entries = entries.sort((a, b) => a.urlPattern.localeCompare(b.urlPattern));
+
  Assert.equal(entries.length, 2, "Has two items in the list");
+
  Assert.equal(
    entries[0].urlPattern,
-    "*://tracking.example.com/*",
-    "First item is tracking.example.com"
+    "*://*.tracking.org/*",
+    "First item is *.tracking.org"
  );
  Assert.equal(
    entries[1].urlPattern,
-    "*://*.tracking.org/*",
-    "Second item is *.tracking.org"
+    "*://tracking.example.com/*",
+    "Second item is tracking.example.com"
  );

  // Register another feature after ExceptionListService got the initial data.
@@ -311,16 +321,18 @@ add_task(async function test_list_init_data() {

  list = await promise;
  entries = list.testGetEntries();
+  entries = entries.sort((a, b) => a.urlPattern.localeCompare(b.urlPattern));
+
  Assert.equal(entries.length, 2, "Has two items in the list");
  Assert.equal(
    entries[0].urlPattern,
-    "*://social.example.com/*",
-    "First item is social.example.com"
+    "*://MOZILLA.ORG/*",
+    "First item is mozilla.org"
  );
  Assert.equal(
    entries[1].urlPattern,
-    "*://MOZILLA.ORG/*",
-    "Second item is mozilla.org"
+    "*://social.example.com/*",
+    "Second item is social.example.com"
  );

  // Test registering a feature after ExceptionListService recieved the synced data.
@@ -360,16 +372,17 @@ add_task(async function test_list_init_data() {
  list = await promise;

  entries = list.testGetEntries();
+  entries = entries.sort((a, b) => a.urlPattern.localeCompare(b.urlPattern));
  Assert.equal(entries.length, 2, "Has two items in the list");
  Assert.equal(
    entries[0].urlPattern,
-    "*://fingerprinting.example.com/*",
-    "First item is fingerprinting.example.com"
+    "*://*.fingerprinting.org/*",
+    "First item is *.fingerprinting.org"
  );
  Assert.equal(
    entries[1].urlPattern,
-    "*://*.fingerprinting.org/*",
-    "Second item is *.fingerprinting.org"
+    "*://fingerprinting.example.com/*",
+    "Second item is fingerprinting.example.com"
  );

  exceptionListService.unregisterExceptionListObserver(