tubestation/browser/components/urlbar/UrlbarTokenizer.sys.mjs
mcheang f8ea71ac19 Bug 1933003 - Add support to use restricted search keywords in both en-US and the localized language. r=mak,fluent-reviewers,settings-reviewers,urlbar-reviewers,mossop
This patch adds English restrict keyword strings to the enUS-searchFeatures.ftl
file. The restrictKeywords providers return an array of keywords for their
l10nRestrictKeywords property: the first element is the localized keyword and
the second is the English keyword. If the user is in the English locale, the
providers return an array with one element, the English keyword.
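As a rough illustration of the resulting keyword arrays (the Spanish strings
below are assumed example translations, not taken from the patch):

  "*" -> ["Marcadores", "Bookmarks"]  (es-ES: localized keyword plus English keyword)
  "*" -> ["Bookmarks"]                (en-US: the duplicate collapses to one element)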

Differential Revision: https://phabricator.services.mozilla.com/D230009
2024-12-24 22:40:09 +00:00

511 lines
17 KiB
JavaScript

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/**
* This module exports a tokenizer to be used by the urlbar model.
* Emitted tokens are objects in the shape { type, value }, where type is one
* of UrlbarTokenizer.TYPE.
*/
const lazy = {};
ChromeUtils.defineESModuleGetters(lazy, {
UrlbarPrefs: "resource:///modules/UrlbarPrefs.sys.mjs",
UrlbarUtils: "resource:///modules/UrlbarUtils.sys.mjs",
PlacesUtils: "resource://gre/modules/PlacesUtils.sys.mjs",
});
ChromeUtils.defineLazyGetter(lazy, "logger", () =>
lazy.UrlbarUtils.getLogger({ prefix: "Tokenizer" })
);
ChromeUtils.defineLazyGetter(lazy, "gFluentStrings", function () {
return new Localization(["browser/browser.ftl"]);
});
/*
* This Map stores key-value pairs where each key is a restrict token
* and each value is an array containing the localized keyword and the
* English keyword.
*
* For example,
* "*" maps to ["Bookmarks"] for English locales
* "*" maps to ["Marcadores", "Bookmarks"] for es-ES
*/
let tokenToKeywords = new Map();
export var UrlbarTokenizer = {
// Regexes matching whitespace.
REGEXP_SPACES: /\s+/,
REGEXP_SPACES_START: /^\s+/,
// Regexes used to guess url-like strings.
// These are not expected to be 100% correct; we accept some user mistypes
// and we're unlikely to cover 100% of the cases.
REGEXP_LIKE_PROTOCOL: /^[A-Z+.-]+:\/*(?!\/)/i,
REGEXP_USERINFO_INVALID_CHARS: /[^\w.~%!$&'()*+,;=:-]/,
REGEXP_HOSTPORT_INVALID_CHARS: /[^\[\]A-Z0-9.:-]/i,
REGEXP_SINGLE_WORD_HOST: /^[^.:]+$/i,
REGEXP_HOSTPORT_IP_LIKE: /^(?=(.*[.:].*){2})[a-f0-9\.\[\]:]+$/i,
// This accepts partial IPv4.
REGEXP_HOSTPORT_INVALID_IP:
/\.{2,}|\d{5,}|\d{4,}(?![:\]])|^\.|^(\d+\.){4,}\d+$|^\d{4,}$/,
// This only accepts complete IPv4.
REGEXP_HOSTPORT_IPV4: /^(\d{1,3}\.){3,}\d{1,3}(:\d+)?$/,
// This accepts partial IPv6.
REGEXP_HOSTPORT_IPV6: /^\[([0-9a-f]{0,4}:){0,7}[0-9a-f]{0,4}\]?$/i,
REGEXP_COMMON_EMAIL: /^[\w!#$%&'*+/=?^`{|}~.-]+@[\[\]A-Z0-9.-]+$/i,
REGEXP_HAS_PORT: /:\d+$/,
// Regex matching a percent encoded char at the beginning of a string.
REGEXP_PERCENT_ENCODED_START: /^(%[0-9a-f]{2}){2,}/i,
// Regex matching scheme and colon, plus, if present, two slashes.
REGEXP_PREFIX: /^[a-z-]+:(?:\/){0,2}/i,
TYPE: {
TEXT: 1,
POSSIBLE_ORIGIN: 2, // It may be an IP, a domain, or even just a single word used as a host.
POSSIBLE_URL: 3, // Consumers should still check this with a fixup.
RESTRICT_HISTORY: 4,
RESTRICT_BOOKMARK: 5,
RESTRICT_TAG: 6,
RESTRICT_OPENPAGE: 7,
RESTRICT_SEARCH: 8,
RESTRICT_TITLE: 9,
RESTRICT_URL: 10,
RESTRICT_ACTION: 11,
},
// The special characters below can be typed into the urlbar to restrict
// the search to a certain category, like history, bookmarks or open pages; or
// to force a match on just the title or url.
// These restriction characters can be typed alone, or at word boundaries,
// provided their meaning cannot be confused; for example, # could be present
// in a valid url, and thus it should not be interpreted as a restriction.
RESTRICT: {
HISTORY: "^",
BOOKMARK: "*",
TAG: "+",
OPENPAGE: "%",
SEARCH: "?",
TITLE: "#",
URL: "$",
ACTION: ">",
},
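// Illustrative typed inputs (a sketch, not an exhaustive list): "* firefox"
// limits matching to bookmarks, "% mdn" to open pages, and "firefox ?" forces
// a search. Whether a leading or trailing character is honored is decided
// later by splitString() and filterTokens().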
// The restriction characters from RESTRICT that will enter search mode.
get SEARCH_MODE_RESTRICT() {
const keys = [
this.RESTRICT.HISTORY,
this.RESTRICT.BOOKMARK,
this.RESTRICT.OPENPAGE,
this.RESTRICT.SEARCH,
];
if (lazy.UrlbarPrefs.get("scotchBonnet.enableOverride")) {
keys.push(this.RESTRICT.ACTION);
}
return new Set(keys);
},
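// For example, when the "scotchBonnet.enableOverride" pref is false this
// getter yields the set { "^", "*", "%", "?" }; when it is true, ">" is
// included as well.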
async loadL10nRestrictKeywords() {
let l10nKeywords = await lazy.gFluentStrings.formatValues(
lazy.UrlbarUtils.LOCAL_SEARCH_MODES.map(mode => {
let name = lazy.UrlbarUtils.getResultSourceName(mode.source);
return { id: `urlbar-search-mode-${name}` };
})
);
let englishSearchStrings = new Localization([
"preview/enUS-searchFeatures.ftl",
]);
let englishKeywords = await englishSearchStrings.formatValues(
lazy.UrlbarUtils.LOCAL_SEARCH_MODES.map(mode => {
let name = lazy.UrlbarUtils.getResultSourceName(mode.source);
return { id: `urlbar-search-mode-${name}-en` };
})
);
for (let { restrict } of lazy.UrlbarUtils.LOCAL_SEARCH_MODES) {
let uniqueKeywords = [
...new Set([l10nKeywords.shift(), englishKeywords.shift()]),
];
tokenToKeywords.set(restrict, uniqueKeywords);
}
},
/**
* Gets the cached localized restrict keywords. If the keywords are not cached
* yet, load them first and then return them.
*
* @returns {Map} The tokenToKeywords Map.
*/
async getL10nRestrictKeywords() {
if (tokenToKeywords.size === 0) {
await this.loadL10nRestrictKeywords();
}
return tokenToKeywords;
},
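// A minimal usage sketch for getL10nRestrictKeywords() (hypothetical caller;
// the Spanish string is an assumed example translation):
//   let keywords = await UrlbarTokenizer.getL10nRestrictKeywords();
//   keywords.get(UrlbarTokenizer.RESTRICT.BOOKMARK);
//   // es-ES -> ["Marcadores", "Bookmarks"]; en-US -> ["Bookmarks"]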
/**
* Returns whether the passed in token looks like a URL.
* This is based on guessing and heuristics, which means that if this function
* returns false, it's surely not a URL; if it returns true, the result must
* still be verified through URIFixup.
*
* @param {string} token
* The string token to verify
* @param {boolean} [requirePath] The url must have a path
* @returns {boolean} whether the token looks like a URL.
*/
looksLikeUrl(token, { requirePath = false } = {}) {
if (token.length < 2) {
return false;
}
// Ignore spaces and require path for the data: protocol.
if (token.startsWith("data:")) {
return token.length > 5;
}
if (this.REGEXP_SPACES.test(token)) {
return false;
}
// If it starts with something that looks like a protocol, it's likely a url.
if (this.REGEXP_LIKE_PROTOCOL.test(token)) {
return true;
}
// Guess path and prePath. At this point we should be analyzing strings not
// having a protocol.
let slashIndex = token.indexOf("/");
let prePath = slashIndex != -1 ? token.slice(0, slashIndex) : token;
if (!this.looksLikeOrigin(prePath, { ignoreKnownDomains: true })) {
return false;
}
let path = slashIndex != -1 ? token.slice(slashIndex) : "";
lazy.logger.debug("path", path);
if (requirePath && !path) {
return false;
}
// If there are both path and userinfo, it's likely a url.
let atIndex = prePath.indexOf("@");
let userinfo = atIndex != -1 ? prePath.slice(0, atIndex) : "";
if (path.length && userinfo.length) {
return true;
}
// If the first character after the slash in the path is a letter, then the
// token may be an "abc/def" url.
if (/^\/[a-z]/i.test(path)) {
return true;
}
// If the path contains special chars, it is likely a url.
if (["%", "?", "#"].some(c => path.includes(c))) {
return true;
}
// The above looksLikeOrigin call told us the prePath looks like an origin;
// now we go into details, checking some common origins.
let hostPort = atIndex != -1 ? prePath.slice(atIndex + 1) : prePath;
if (this.REGEXP_HOSTPORT_IPV4.test(hostPort)) {
return true;
}
// ipv6 is very complex to support, just check for a few chars.
if (
this.REGEXP_HOSTPORT_IPV6.test(hostPort) &&
["[", "]", ":"].some(c => hostPort.includes(c))
) {
return true;
}
if (Services.uriFixup.isDomainKnown(hostPort)) {
return true;
}
return false;
},
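// Illustrative calls to looksLikeUrl (expected outcomes inferred from the
// heuristics above; treat this as a sketch rather than a specification):
//   looksLikeUrl("https://example.com")                 // true, protocol-like prefix
//   looksLikeUrl("example.com/path")                    // true, origin-like prePath plus a path
//   looksLikeUrl("user@example.com")                    // false, treated as an email address
//   looksLikeUrl("example.com", { requirePath: true })  // false, no path present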
/**
* Returns whether the passed in token looks like an origin.
* This is based on guessing and heuristics, which means that if this function
* returns false, it's surely not an origin; if it returns true, the result
* must still be verified through URIFixup.
*
* @param {string} token
* The string token to verify
* @param {object} options Options object
* @param {boolean} [options.ignoreKnownDomains] If true, the origin doesn't have to be
* in the known domain list
* @param {boolean} [options.noIp] If true, the origin cannot be an IP address
* @param {boolean} [options.noPort] If true, the origin cannot have a port number
* @returns {boolean} whether the token looks like an origin.
*/
looksLikeOrigin(
token,
{ ignoreKnownDomains = false, noIp = false, noPort = false } = {}
) {
if (!token.length) {
return false;
}
let atIndex = token.indexOf("@");
if (atIndex != -1 && this.REGEXP_COMMON_EMAIL.test(token)) {
// We prefer handling it as an email rather than an origin with userinfo.
return false;
}
let userinfo = atIndex != -1 ? token.slice(0, atIndex) : "";
let hostPort = atIndex != -1 ? token.slice(atIndex + 1) : token;
let hasPort = this.REGEXP_HAS_PORT.test(hostPort);
lazy.logger.debug("userinfo", userinfo);
lazy.logger.debug("hostPort", hostPort);
if (noPort && hasPort) {
return false;
}
if (
this.REGEXP_HOSTPORT_IPV4.test(hostPort) ||
this.REGEXP_HOSTPORT_IPV6.test(hostPort)
) {
return !noIp;
}
// Check for invalid chars.
if (
this.REGEXP_LIKE_PROTOCOL.test(hostPort) ||
this.REGEXP_USERINFO_INVALID_CHARS.test(userinfo) ||
this.REGEXP_HOSTPORT_INVALID_CHARS.test(hostPort) ||
(!this.REGEXP_SINGLE_WORD_HOST.test(hostPort) &&
this.REGEXP_HOSTPORT_IP_LIKE.test(hostPort) &&
this.REGEXP_HOSTPORT_INVALID_IP.test(hostPort))
) {
return false;
}
// If it looks like a single word host, check the known domains.
if (
!ignoreKnownDomains &&
!userinfo &&
!hasPort &&
this.REGEXP_SINGLE_WORD_HOST.test(hostPort)
) {
return Services.uriFixup.isDomainKnown(hostPort);
}
return true;
},
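// Illustrative calls to looksLikeOrigin (a sketch of expected outcomes, not a
// specification):
//   looksLikeOrigin("192.168.1.1")                         // true, complete IPv4
//   looksLikeOrigin("192.168.1.1", { noIp: true })         // false, IPs excluded
//   looksLikeOrigin("example.com:8080", { noPort: true })  // false, port not allowed
//   looksLikeOrigin("user@example.com")                    // false, looks like an email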
/**
* Tokenizes the searchString from a UrlbarQueryContext.
*
* @param {UrlbarQueryContext} queryContext
* The query context object to tokenize
* @returns {UrlbarQueryContext} the same query context object with a new
* tokens property.
*/
tokenize(queryContext) {
lazy.logger.debug("Tokenizing search string", {
searchString: queryContext.searchString,
});
if (!queryContext.trimmedSearchString) {
queryContext.tokens = [];
return queryContext;
}
let unfiltered = splitString(queryContext);
let tokens = filterTokens(unfiltered);
queryContext.tokens = tokens;
return queryContext;
},
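// As a rough example (assuming no search mode and that "firefox" is neither a
// cached keyword nor a known domain), tokenize() turns the search string
// "* firefox" into tokens shaped like:
//   [{ value: "*", type: UrlbarTokenizer.TYPE.RESTRICT_BOOKMARK, ... },
//    { value: "firefox", type: UrlbarTokenizer.TYPE.TEXT, ... }]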
/**
* Given a token, tells if it's a restriction token.
*
* @param {object} token
* The token to check.
* @returns {boolean} Whether the token is a restriction character.
*/
isRestrictionToken(token) {
return (
token &&
token.type >= this.TYPE.RESTRICT_HISTORY &&
token.type <= this.TYPE.RESTRICT_URL
);
},
};
const CHAR_TO_TYPE_MAP = new Map(
Object.entries(UrlbarTokenizer.RESTRICT).map(([type, char]) => [
char,
UrlbarTokenizer.TYPE[`RESTRICT_${type}`],
])
);
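// For instance, this produces entries such as "^" -> TYPE.RESTRICT_HISTORY and
// "*" -> TYPE.RESTRICT_BOOKMARK, so a lone restriction character can be mapped
// directly to its token type.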
/**
* Given a queryContext object, splits its searchString into string tokens.
*
* @param {UrlbarQueryContext} queryContext
* The query context object to tokenize.
* @param {string} queryContext.searchString
* The search string to split.
* @param {object} queryContext.searchMode
* A search mode object.
* @returns {Array} An array of string tokens.
*/
function splitString({ searchString, searchMode }) {
// The first step is splitting on unicode whitespaces. We ignore whitespaces
// if the search string starts with "data:", to better support Web developers
// and compatibility with other browsers.
let trimmed = searchString.trim();
let tokens;
if (trimmed.startsWith("data:")) {
tokens = [trimmed];
} else if (trimmed.length < 500) {
tokens = trimmed.split(UrlbarTokenizer.REGEXP_SPACES);
} else {
// If the string is very long, tokenizing all of it would be expensive. So
// we only tokenize a part of it, then let the last token become a
// catch-all.
tokens = trimmed.substring(0, 500).split(UrlbarTokenizer.REGEXP_SPACES);
tokens[tokens.length - 1] += trimmed.substring(500);
}
if (!tokens.length) {
return tokens;
}
// If there is no separate restriction token, we may have to split a token: the
// first one if it includes a leading restriction char, or the last one if it
// includes a trailing restriction char.
// This avoids requiring the user to add artificial whitespace to enforce
// restrictions; for example, typing a question ending in "?" restricts to
// search results.
const hasRestrictionToken = tokens.some(t => CHAR_TO_TYPE_MAP.has(t));
const firstToken = tokens[0];
const isFirstTokenAKeyword =
!Object.values(UrlbarTokenizer.RESTRICT).includes(firstToken) &&
lazy.PlacesUtils.keywords.isKeywordFromCache(firstToken);
if (hasRestrictionToken || isFirstTokenAKeyword) {
return tokens;
}
// Check for an unambiguous restriction char at the beginning of the first
// token, or at the end of the last token. We only count trailing restriction
// chars if they are the search restriction char, which is "?". This is to
// allow for a typed question to yield only search results.
if (
CHAR_TO_TYPE_MAP.has(firstToken[0]) &&
!UrlbarTokenizer.REGEXP_PERCENT_ENCODED_START.test(firstToken) &&
!searchMode
) {
tokens[0] = firstToken.substring(1);
tokens.splice(0, 0, firstToken[0]);
return tokens;
}
const lastIndex = tokens.length - 1;
const lastToken = tokens[lastIndex];
if (
lastToken[lastToken.length - 1] == UrlbarTokenizer.RESTRICT.SEARCH &&
!UrlbarTokenizer.looksLikeUrl(lastToken, { requirePath: true })
) {
tokens[lastIndex] = lastToken.substring(0, lastToken.length - 1);
tokens.push(lastToken[lastToken.length - 1]);
}
return tokens;
}
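// Rough splitting examples (a sketch; actual results also depend on the
// keyword cache and on search mode):
//   "*firefox"      -> ["*", "firefox"]            leading restrict char split off
//   "what is dns?"  -> ["what", "is", "dns", "?"]  trailing search char split off
//   "example.com/a?" stays a single token because it looks like a URL.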
/**
* Given an array of unfiltered tokens, this function filters them and converts
* them to token objects with a type.
*
* @param {Array} tokens
* An array of strings, representing search tokens.
* @returns {Array} An array of token objects.
* Note: restriction characters are only considered if they appear at the start
* or at the end of the tokens list. In case of restriction characters
* conflict, the most external ones win. Leading ones win over trailing
* ones. Discarded restriction characters are considered text.
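* For instance (an illustrative sketch, not an exhaustive spec), in
* ["*", "foo", "#"] both the leading "*" (a type restriction) and the trailing
* "#" (a matching restriction) are honored, whereas in ["*", "foo", "^"] only
* the leading "*" wins and the trailing "^" is kept as text.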
*/
function filterTokens(tokens) {
let filtered = [];
let restrictions = [];
const isFirstTokenAKeyword =
!Object.values(UrlbarTokenizer.RESTRICT).includes(tokens[0]) &&
lazy.PlacesUtils.keywords.isKeywordFromCache(tokens[0]);
for (let i = 0; i < tokens.length; ++i) {
let token = tokens[i];
let tokenObj = {
value: token,
lowerCaseValue: token.toLocaleLowerCase(),
type: UrlbarTokenizer.TYPE.TEXT,
};
// For privacy reasons, we don't want to send a data (or other kind of) URI
// to a search engine. So we want to parse any single long token below.
if (tokens.length > 1 && token.length > 500) {
filtered.push(tokenObj);
break;
}
if (isFirstTokenAKeyword) {
filtered.push(tokenObj);
continue;
}
let restrictionType = CHAR_TO_TYPE_MAP.get(token);
if (restrictionType) {
restrictions.push({ index: i, type: restrictionType });
} else if (UrlbarTokenizer.looksLikeOrigin(token)) {
tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_ORIGIN;
} else if (UrlbarTokenizer.looksLikeUrl(token, { requirePath: true })) {
tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_URL;
}
filtered.push(tokenObj);
}
// Handle restriction characters.
if (restrictions.length) {
// We can apply two kinds of restrictions: type (bookmark, search, ...) and
// matching (url, title). These kinds of restrictions can be combined, but we
// can only have one restriction per kind.
let matchingRestrictionFound = false;
let typeRestrictionFound = false;
function assignRestriction(r) {
if (r && !(matchingRestrictionFound && typeRestrictionFound)) {
if (
[
UrlbarTokenizer.TYPE.RESTRICT_TITLE,
UrlbarTokenizer.TYPE.RESTRICT_URL,
].includes(r.type)
) {
if (!matchingRestrictionFound) {
matchingRestrictionFound = true;
filtered[r.index].type = r.type;
return true;
}
} else if (!typeRestrictionFound) {
typeRestrictionFound = true;
filtered[r.index].type = r.type;
return true;
}
}
return false;
}
// Look at the first token.
let found = assignRestriction(restrictions.find(r => r.index == 0));
if (found) {
// If the first token was assigned, look at the next one.
assignRestriction(restrictions.find(r => r.index == 1));
}
// Then look at the last token.
let lastIndex = tokens.length - 1;
found = assignRestriction(restrictions.find(r => r.index == lastIndex));
if (found) {
// If the last token was assigned, look at the previous one.
assignRestriction(restrictions.find(r => r.index == lastIndex - 1));
}
}
lazy.logger.info("Filtered Tokens", filtered);
return filtered;
}