From eda1e203391196d83fe9e702bf65abbeab5afdfe Mon Sep 17 00:00:00 2001 From: Emma Zuehlcke Date: Wed, 7 May 2025 09:47:58 +0000 Subject: [PATCH] Bug 1960959 - Improve lookup performance of url classifier exception list. r=bvandersloot Switching the data structure from an array to nested hash maps. Differential Revision: https://phabricator.services.mozilla.com/D247625 --- .../UrlClassifierExceptionList.cpp | 186 ++++++++++++++++-- .../UrlClassifierExceptionList.h | 31 ++- .../tests/unit/test_exceptionListService.js | 65 +++--- 3 files changed, 242 insertions(+), 40 deletions(-) diff --git a/netwerk/url-classifier/UrlClassifierExceptionList.cpp b/netwerk/url-classifier/UrlClassifierExceptionList.cpp index 91a747e43b06..22fa38c38283 100644 --- a/netwerk/url-classifier/UrlClassifierExceptionList.cpp +++ b/netwerk/url-classifier/UrlClassifierExceptionList.cpp @@ -5,10 +5,14 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #include "UrlClassifierExceptionList.h" +#include "nsIEffectiveTLDService.h" #include "nsIUrlClassifierExceptionListEntry.h" #include "nsIURI.h" #include "mozilla/net/UrlClassifierCommon.h" #include "mozilla/ProfilerMarkers.h" +#include "nsNetCID.h" +#include "nsServiceManagerUtils.h" +#include "mozilla/RustRegex.h" namespace mozilla::net { @@ -25,12 +29,58 @@ UrlClassifierExceptionList::AddEntry( nsIUrlClassifierExceptionListEntry* aEntry) { NS_ENSURE_ARG_POINTER(aEntry); - nsAutoCString entryString; - Unused << aEntry->Describe(entryString); - UC_LOG_DEBUG(("UrlClassifierExceptionList::%s - Adding entry: %s", - __FUNCTION__, entryString.get())); + // From the url patterns in the entry, extract the site and top level site. + // They are used as keys in the exception entry maps. + + nsAutoCString urlPattern; + nsresult rv = aEntry->GetUrlPattern(urlPattern); + NS_ENSURE_SUCCESS(rv, rv); + + nsAutoCString site; + rv = GetSchemelessSiteFromUrlPattern(urlPattern, site); + NS_ENSURE_SUCCESS(rv, rv); + + // We must be able to parse a site from the url pattern. + NS_ENSURE_TRUE(!site.IsEmpty(), NS_ERROR_INVALID_ARG); + + nsAutoCString topLevelUrlPattern; + rv = aEntry->GetTopLevelUrlPattern(topLevelUrlPattern); + NS_ENSURE_SUCCESS(rv, rv); + + nsAutoCString topLevelSite; + rv = GetSchemelessSiteFromUrlPattern(topLevelUrlPattern, topLevelSite); + NS_ENSURE_SUCCESS(rv, rv); + + // topLevelUrlPattern is not mandatory, but if topLevelUrlPattern is set, + // topLevelSite populated as well. + NS_ENSURE_TRUE(topLevelUrlPattern.IsEmpty() == topLevelSite.IsEmpty(), + NS_ERROR_INVALID_ARG); + + if (MOZ_LOG_TEST(UrlClassifierCommon::sLog, LogLevel::Debug)) { + nsAutoCString entryString; + Unused << aEntry->Describe(entryString); + UC_LOG_DEBUG(("UrlClassifierExceptionList::%s - Adding entry: %s", + __FUNCTION__, entryString.get())); + } + + // If the top level site is empty, the exception applies across all top + // level sites. Store it in the global exceptions map. + if (topLevelSite.IsEmpty()) { + mGlobalExceptions.LookupOrInsert(site).AppendElement(aEntry); + return NS_OK; + } + + // Otherwise, store it in the site specific exception map. + mExceptions + // Outer map keyed by top level site. + // topLevelSite may be the empty string. We still use that a key. These + // entries apply to all top-level sites. + .LookupOrInsert(topLevelSite) + // Inner map keyed by site of the load. + .LookupOrInsert(site) + // Append the entry. + .AppendElement(aEntry); - mEntries.AppendElement(aEntry); return NS_OK; } @@ -53,13 +103,77 @@ UrlClassifierExceptionList::Matches(nsIURI* aURI, nsIURI* aTopLevelURI, aTopLevelURI ? aTopLevelURI->GetSpecOrDefault().get() : "null", aIsPrivateBrowsing)); - for (auto& entry : mEntries) { + // Get the eTLD service so we can compute sites from URIs. + nsresult rv; + nsCOMPtr eTLDService( + do_GetService(NS_EFFECTIVETLDSERVICE_CONTRACTID, &rv)); + NS_ENSURE_SUCCESS(rv, rv); + + // If given, compute the (schemeless) site from the top level URI. + // If not we will leave it empty and only look for global exceptions. + nsAutoCString aTopLevelSite; + if (aTopLevelURI) { + rv = eTLDService->GetSchemelessSite(aTopLevelURI, aTopLevelSite); + NS_ENSURE_SUCCESS(rv, rv); + } + + // Compute the (schemeless) site from the URI of the load. + nsAutoCString aSite; + rv = eTLDService->GetSchemelessSite(aURI, aSite); + NS_ENSURE_SUCCESS(rv, rv); + + // Get the list of exceptions that apply to the current load. + // We need to check both global and site specific exceptions + + // 1. Check global exceptions, which apply to all top level sites and lookup + // entries matching the current load (aSite). + ExceptionEntryArray* globalExceptions = + mGlobalExceptions.Lookup(aSite).DataPtrOrNull(); + + *aResult = ExceptionListMatchesLoad(globalExceptions, aURI, aTopLevelURI, + aIsPrivateBrowsing); + if (*aResult) { + // We found a match, no need to check the site specific exceptions. + return NS_OK; + } + + // 2. Get exceptions which apply only to the current top level site. + SiteToEntries* topLevelSiteToEntries = + mExceptions.Lookup(aTopLevelSite).DataPtrOrNull(); + if (topLevelSiteToEntries) { + ExceptionEntryArray* siteSpecificExceptions = + topLevelSiteToEntries->Lookup(aSite).DataPtrOrNull(); + + *aResult = ExceptionListMatchesLoad(siteSpecificExceptions, aURI, + aTopLevelURI, aIsPrivateBrowsing); + if (*aResult) { + return NS_OK; + } + } + + if (!(*aResult)) { + UC_LOG_DEBUG(("%s - No match found", __FUNCTION__)); + } + + return NS_OK; +} + +bool UrlClassifierExceptionList::ExceptionListMatchesLoad( + ExceptionEntryArray* aExceptions, nsIURI* aURI, nsIURI* aTopLevelURI, + bool aIsPrivateBrowsing) { + MOZ_ASSERT(aURI); + + if (!aExceptions) { + return false; + } + for (const auto& entry : *aExceptions) { + bool match = false; nsresult rv = - entry->Matches(aURI, aTopLevelURI, aIsPrivateBrowsing, aResult); + entry->Matches(aURI, aTopLevelURI, aIsPrivateBrowsing, &match); if (NS_WARN_IF(NS_FAILED(rv))) { continue; } - if (*aResult) { + if (match) { // Match found, return immediately. if (MOZ_LOG_TEST(UrlClassifierCommon::sLog, LogLevel::Debug)) { nsAutoCString entryString; @@ -69,20 +183,66 @@ UrlClassifierExceptionList::Matches(nsIURI* aURI, nsIURI* aTopLevelURI, "entry: %s", __FUNCTION__, entryString.get())); } - return NS_OK; + return true; } } + return false; +} - // No match found, return false. - UC_LOG_DEBUG(("%s - No match found", __FUNCTION__)); +NS_IMETHODIMP +UrlClassifierExceptionList::GetSchemelessSiteFromUrlPattern( + const nsACString& aUrlPattern, nsACString& aSite) { + if (aUrlPattern.IsEmpty()) { + aSite.Truncate(); + return NS_OK; + } - return NS_OK; + // Extract the host portion from the url pattern. This regex only supports url + // patterns with a host. + mozilla::RustRegex regex("://(?:\\*\\.)?([^/*]+)"); + mozilla::RustRegexCaptures captures = regex.FindCaptures(aUrlPattern); + NS_ENSURE_TRUE(captures.IsValid(), NS_ERROR_INVALID_ARG); + + // Get the host from the first capture group + auto maybeMatch = captures[1]; + NS_ENSURE_TRUE(maybeMatch, NS_ERROR_INVALID_ARG); + + nsAutoCString host; + host.Assign(Substring(aUrlPattern, maybeMatch->start, + maybeMatch->end - maybeMatch->start)); + NS_ENSURE_TRUE(!host.IsEmpty(), NS_ERROR_INVALID_ARG); + + // Get the eTLD service to convert host to schemeless site + nsresult rv; + nsCOMPtr eTLDService( + do_GetService(NS_EFFECTIVETLDSERVICE_CONTRACTID, &rv)); + NS_ENSURE_SUCCESS(rv, rv); + + return eTLDService->GetSchemelessSiteFromHost(host, aSite); } NS_IMETHODIMP UrlClassifierExceptionList::TestGetEntries( nsTArray>& aEntries) { - aEntries = mEntries.Clone(); + // Global entries (not top-level specific) + for (const auto& entry : mGlobalExceptions) { + const ExceptionEntryArray& entries = entry.GetData(); + aEntries.AppendElements(entries); + } + + // Site specific entries. + // Iterate through the outer map (top-level sites) + for (const auto& outerEntry : mExceptions) { + const SiteToEntries& innerMap = outerEntry.GetData(); + + // Iterate through the inner map (sites to exception entries) + for (const auto& innerEntry : innerMap) { + const ExceptionEntryArray& entries = innerEntry.GetData(); + // Append all entries from this array to the result + aEntries.AppendElements(entries); + } + } + return NS_OK; } } // namespace mozilla::net diff --git a/netwerk/url-classifier/UrlClassifierExceptionList.h b/netwerk/url-classifier/UrlClassifierExceptionList.h index 2c46f160fcfd..65cf175dc07a 100644 --- a/netwerk/url-classifier/UrlClassifierExceptionList.h +++ b/netwerk/url-classifier/UrlClassifierExceptionList.h @@ -7,6 +7,8 @@ #ifndef mozilla_UrlClassifierExceptionList_h #define mozilla_UrlClassifierExceptionList_h +#include "nsHashKeys.h" +#include "nsTHashMap.h" #include "nsIUrlClassifierExceptionList.h" #include "nsISupports.h" #include "nsTArray.h" @@ -27,8 +29,35 @@ class UrlClassifierExceptionList final : public nsIUrlClassifierExceptionList { private: ~UrlClassifierExceptionList() = default; + // A list of exception entries + using ExceptionEntryArray = + nsTArray>; + + // A map from (schemeless) site to a list of exception entries. + using SiteToEntries = nsTHashMap; + + // Helper method to check if any exception in the array matches the given + // load. + static bool ExceptionListMatchesLoad(ExceptionEntryArray* aExceptions, + nsIURI* aURI, nsIURI* aTopLevelURI, + bool aIsPrivateBrowsing); + + // Helper method to extract the schemeless site from a URL pattern. + NS_IMETHODIMP GetSchemelessSiteFromUrlPattern(const nsACString& aUrlPattern, + nsACString& aSite); + + // The feature this exception list is for, e.g. "tracking-protection". nsCString mFeature; - nsTArray> mEntries; + + // A two stage hash map to store the (top level) site-specific exception + // entries. + // The outer hash map key is the top level (schemeless) site. + // The inner hash map key is the (schemeless) site of the load to be checked. + nsTHashMap mExceptions; + + // A map of exception list entries which apply across all top level sites. + // The hash map key is the (schemeless) site of the load to be checked. + nsTHashMap mGlobalExceptions; }; } // namespace mozilla::net diff --git a/toolkit/components/url-classifier/tests/unit/test_exceptionListService.js b/toolkit/components/url-classifier/tests/unit/test_exceptionListService.js index f5da7aa7ab8e..9d77f01f043b 100644 --- a/toolkit/components/url-classifier/tests/unit/test_exceptionListService.js +++ b/toolkit/components/url-classifier/tests/unit/test_exceptionListService.js @@ -110,33 +110,35 @@ add_task(async function test_list_changes() { let entries = list.testGetEntries(); Assert.equal(entries.length, 3, "Has three items in the list"); + entries = entries.sort((a, b) => a.urlPattern.localeCompare(b.urlPattern)); + Assert.equal( - entries[0].urlPattern, + entries[1].urlPattern, "*://example.com/*", "First item is example.com" ); Assert.equal( - entries[1].urlPattern, + entries[2].urlPattern, "*://MOZILLA.ORG/*", "Second item is mozilla.org" ); Assert.equal( - entries[1].topLevelUrlPattern, + entries[2].topLevelUrlPattern, "*://example.com/*", "Top level url pattern of second item is correctly set." ); Assert.equal( - entries[1].isPrivateBrowsingOnly, + entries[2].isPrivateBrowsingOnly, true, "isPrivateBrowsingOnly flag of second item is correctly set." ); Assert.deepEqual( - entries[1].filterContentBlockingCategories, + entries[2].filterContentBlockingCategories, ["standard"], "filterContentBlockingCategories of second item is correctly set." ); Assert.equal( - entries[2].urlPattern, + entries[0].urlPattern, "*://*.example.org/*", "Third item is *.example.org" ); @@ -151,6 +153,8 @@ add_task(async function test_list_changes() { Assert.equal(entries.length, 4, "Has four items in the list"); + entries = entries.sort((a, b) => a.urlPattern.localeCompare(b.urlPattern)); + Assert.equal( entries[1].urlPattern, "*://example.com/*", @@ -162,12 +166,12 @@ add_task(async function test_list_changes() { "Second item is mozilla.org" ); Assert.equal( - entries[3].urlPattern, + entries[0].urlPattern, "*://*.example.org/*", "Third item is *.example.org" ); Assert.equal( - entries[0].urlPattern, + entries[3].urlPattern, "*://test.com/*", "Fourth item is test.com" ); @@ -183,33 +187,36 @@ add_task(async function test_list_changes() { entries = list.testGetEntries(); Assert.equal(entries.length, 6, "Has six items in the list"); + + entries = entries.sort((a, b) => a.urlPattern.localeCompare(b.urlPattern)); + Assert.equal( - entries[0].urlPattern, + entries[4].urlPattern, "*://test.com/*", "First item is test.com" ); Assert.equal( - entries[1].urlPattern, + entries[5].urlPattern, "*://whatever.com/*", "Second item is whatever.com" ); Assert.equal( - entries[2].urlPattern, + entries[0].urlPattern, "*://*.abc.com/*", "Third item is *.abc.com" ); Assert.equal( - entries[3].urlPattern, + entries[2].urlPattern, "*://example.com/*", "Fourth item is example.com" ); Assert.equal( - entries[4].urlPattern, + entries[3].urlPattern, "*://MOZILLA.ORG/*", "Fifth item is mozilla.org" ); Assert.equal( - entries[5].urlPattern, + entries[1].urlPattern, "*://*.example.org/*", "Sixth item is *.example.org" ); @@ -288,16 +295,19 @@ add_task(async function test_list_init_data() { list = await waitForEvent(updateEvent, "update"); let entries = list.testGetEntries(); + entries = entries.sort((a, b) => a.urlPattern.localeCompare(b.urlPattern)); + Assert.equal(entries.length, 2, "Has two items in the list"); + Assert.equal( entries[0].urlPattern, - "*://tracking.example.com/*", - "First item is tracking.example.com" + "*://*.tracking.org/*", + "First item is *.tracking.org" ); Assert.equal( entries[1].urlPattern, - "*://*.tracking.org/*", - "Second item is *.tracking.org" + "*://tracking.example.com/*", + "Second item is tracking.example.com" ); // Register another feature after ExceptionListService got the initial data. @@ -311,16 +321,18 @@ add_task(async function test_list_init_data() { list = await promise; entries = list.testGetEntries(); + entries = entries.sort((a, b) => a.urlPattern.localeCompare(b.urlPattern)); + Assert.equal(entries.length, 2, "Has two items in the list"); Assert.equal( entries[0].urlPattern, - "*://social.example.com/*", - "First item is social.example.com" + "*://MOZILLA.ORG/*", + "First item is mozilla.org" ); Assert.equal( entries[1].urlPattern, - "*://MOZILLA.ORG/*", - "Second item is mozilla.org" + "*://social.example.com/*", + "Second item is social.example.com" ); // Test registering a feature after ExceptionListService recieved the synced data. @@ -360,16 +372,17 @@ add_task(async function test_list_init_data() { list = await promise; entries = list.testGetEntries(); + entries = entries.sort((a, b) => a.urlPattern.localeCompare(b.urlPattern)); Assert.equal(entries.length, 2, "Has two items in the list"); Assert.equal( entries[0].urlPattern, - "*://fingerprinting.example.com/*", - "First item is fingerprinting.example.com" + "*://*.fingerprinting.org/*", + "First item is *.fingerprinting.org" ); Assert.equal( entries[1].urlPattern, - "*://*.fingerprinting.org/*", - "Second item is *.fingerprinting.org" + "*://fingerprinting.example.com/*", + "Second item is fingerprinting.example.com" ); exceptionListService.unregisterExceptionListObserver(