Hardcode some text that gets shown on first time and additional links Differential Revision: https://phabricator.services.mozilla.com/D242045
389 lines
10 KiB
JavaScript
389 lines
10 KiB
JavaScript
/**
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
*/
|
|
import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
|
|
|
|
// On average, each token represents about 4 characters. A factor of 3.5 is used
// instead of 4 to account for edge cases. Used to turn the processed input
// length into the number of tokens to predict.
const CHARACTERS_PER_TOKEN = 3.5;

// On average, one token corresponds to approximately 4 characters, meaning 0.25
// times the character count would suffice under normal conditions. To ensure
// robustness and handle edge cases, we use a more conservative factor of 0.69.
// Applied to the combined length of the prompt and the processed input when
// sizing the context.
const CONTEXT_SIZE_MULTIPLIER = 0.69;

// Fallback for the "browser.ml.linkPreview.inputSentences" preference: how many
// cleaned-up sentences are kept as model input.
const DEFAULT_INPUT_SENTENCES = 6;

// Sentences shorter than this many characters are dropped during
// preprocessing.
const MIN_SENTENCE_LENGTH = 14;

// Sentences with fewer than this many space-separated words are dropped during
// preprocessing.
const MIN_WORD_COUNT = 5;
|
|
|
|
// Holder for lazily resolved module exports and preference values.
const lazy = {};

ChromeUtils.defineESModuleGetters(lazy, {
  createEngine: "chrome://global/content/ml/EngineProcess.sys.mjs",
  Progress: "chrome://global/content/ml/Utils.sys.mjs",
});

// JSON string whose parsed properties are spread over the built-in engine
// options, so any key present here overrides the corresponding default.
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "config",
  "browser.ml.linkPreview.config",
  "{}"
);

// Maximum number of preprocessed sentences passed to the model.
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "inputSentences",
  "browser.ml.linkPreview.inputSentences",
  DEFAULT_INPUT_SENTENCES
);

// Default sentence limit for SentencePostProcessor. No fallback value is
// supplied here.
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "outputSentences",
  "browser.ml.linkPreview.outputSentences"
);

// System prompt sent to the model ahead of the user content.
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "prompt",
  "browser.ml.linkPreview.prompt",
  "Provide a concise, objective summary of the input text in up to three sentences, focusing on key actions and intentions without using second or third person pronouns."
);
|
|
|
|
/**
 * Abbreviations whose periods must not be read as sentence terminators.
 *
 * Order is significant: the patterns are applied one after another to the
 * progressively rewritten text, so an earlier entry can prevent a later,
 * overlapping entry from matching.
 */
const ABBREVIATIONS = [
  "Mr.",
  "Mrs.",
  "Ms.",
  "Dr.",
  "Prof.",
  "Inc.",
  "Ltd.",
  "Jr.",
  "Sr.",
  "St.",
  "e.g.",
  "i.e.",
  "U.S.A",
  "D.C.",
  "U.K.",
  "etc.",
  "a.m.",
  "p.m.",
  "D.",
  "Mass.",
  "Sen.",
  "Rep.",
  "No.",
  "Fig.",
  "vs.",
  "Mx.",
  "Ph.D.",
  "M.D.",
  "D.D.S.",
  "B.A.",
  "M.A.",
  "LL.B.",
  "LL.M.",
  "J.D.",
  "D.O.",
  "D.V.M.",
  "Psy.D.",
  "Ed.D.",
  "Eng.",
  "Co.",
  "Corp.",
  "Mt.",
  "Ft.",
  "U.S.",
  "U.S.A.",
  "E.U.",
  "et al.",
  "Nos.",
  "pp.",
  "Vol.",
  "Rev.",
  "Gen.",
  "Lt.",
  "Col.",
  "Maj.",
  "Capt.",
  "Sgt.",
  "Cpl.",
  "Pvt.",
  "Adm.",
  "Cmdr.",
  "Ave.",
  "Blvd.",
  "Rd.",
  "Ln.",
  "Jan.",
  "Feb.",
  "Mar.",
  "Apr.",
  "May.",
  "Jun.",
  "Jul.",
  "Aug.",
  "Sep.",
  "Sept.",
  "Oct.",
  "Nov.",
  "Dec.",
  "Mon.",
  "Tue.",
  "Tues.",
  "Wed.",
  "Thu.",
  "Thur.",
  "Thurs.",
  "Fri.",
  "Sat.",
  "Sun.",
  "Dept.",
  "Univ.",
  "Est.",
  "Calif.",
  "Fla.",
  "N.Y.",
  "Conn.",
  "Va.",
  "Ill.",
  "Assoc.",
  "Bros.",
  "Dist.",
  "Msgr.",
  "S.P.",
  "P.S.",
  "U.S.S.R.",
  "Mlle.",
  "Mme.",
  "Hon.",
  "Messrs.",
  "Mmes.",
  "v.",
];

/**
 * One precompiled pattern per abbreviation, built once instead of on every
 * getSentences() call. The lookbehind rejects matches that start in the
 * middle of a word, so "pp." does not match inside "app." and "v." does not
 * match inside "Kiev.". Digits and punctuation may still precede a match
 * (e.g. "3p.m.", or the "D." in "Ph.D.").
 */
const ABBREVIATION_PATTERNS = ABBREVIATIONS.map(abbreviation => {
  const escaped = abbreviation.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  return new RegExp(`(?<!\\p{L})${escaped}`, "gu");
});

// Preferred stand-in for a period that belongs to an abbreviation.
const DEFAULT_PLACEHOLDER = "∯";

/**
 * Chooses a single-character placeholder that does not occur in the text, so
 * that restoring placeholders to periods can never rewrite a character that
 * was part of the original input.
 *
 * @param {string} text text that is about to be segmented
 * @returns {string} placeholder character absent from text
 */
function pickPlaceholder(text) {
  let candidate = DEFAULT_PLACEHOLDER;
  // Only iterates when the preferred symbol is already present; the
  // alternatives start at U+2A00 (Supplemental Mathematical Operators).
  for (let codePoint = 0x2a00; text.includes(candidate); codePoint++) {
    candidate = String.fromCodePoint(codePoint);
  }
  return candidate;
}

export const LinkPreviewModel = {
  /**
   * Extracts sentences from a given text.
   *
   * Periods inside known abbreviations are temporarily swapped for a
   * placeholder so that Intl.Segmenter does not split on them; they are put
   * back before the sentences are returned. Each returned segment keeps any
   * whitespace the segmenter attached to it.
   *
   * @param {string} text text to process
   * @returns {Array<string>} sentences
   */
  getSentences(text) {
    const placeholder = pickPlaceholder(text);
    const maskPeriods = match => match.replaceAll(".", placeholder);

    // Replace periods in abbreviations with the placeholder.
    let maskedText = text;
    for (const pattern of ABBREVIATION_PATTERNS) {
      maskedText = maskedText.replace(pattern, maskPeriods);
    }

    const segmenter = new Intl.Segmenter("en", {
      granularity: "sentence",
    });

    // Restore the periods in abbreviations.
    return Array.from(segmenter.segment(maskedText), ({ segment }) =>
      segment.replaceAll(placeholder, ".")
    );
  },

  /**
   * Clean up text for text generation AI.
   *
   * Splits the text into sentences, normalizes whitespace in each one, drops
   * sentences that are too short or do not end in punctuation, keeps at most
   * `lazy.inputSentences` of them and joins the result with single spaces.
   *
   * @param {string} text to process
   * @returns {string} cleaned up text
   */
  preprocessText(text) {
    const normalized = this.getSentences(text).map(sentence =>
      // Trim, then collapse whitespace runs: a run containing a newline
      // (first capture group) becomes a single "\n", any other run of two or
      // more whitespace characters becomes a single space.
      sentence
        .trim()
        .replace(/(\s*\n\s*)|\s{2,}/g, (_, newline) => (newline ? "\n" : " "))
    );

    // Remove sentences that are too short or lack final punctuation.
    const usable = normalized.filter(
      sentence =>
        sentence.length >= MIN_SENTENCE_LENGTH &&
        sentence.split(" ").length >= MIN_WORD_COUNT &&
        /\p{P}$/u.test(sentence)
    );

    return usable.slice(0, lazy.inputSentences).join(" ");
  },

  /**
   * Generate summary text using AI.
   *
   * Failures raised while creating or running the engine are passed to
   * `onError` rather than rethrown. The engine, if one was created, is
   * terminated before this method resolves.
   *
   * @param {string} inputText
   * @param {object} callbacks for progress and error
   * @param {Function} callbacks.onDownload optional for download active
   * @param {Function} callbacks.onText optional for text chunks
   * @param {Function} callbacks.onError optional for error
   */
  async generateTextAI(inputText, { onDownload, onText, onError } = {}) {
    const processedInput = this.preprocessText(inputText);
    // Assume generated text is approximately the same length as the input.
    const nPredict = Math.ceil(processedInput.length / CHARACTERS_PER_TOKEN);
    const systemPrompt = lazy.prompt;
    // Estimate an upper bound for the required number of tokens. This estimate
    // must be large enough to include prompt tokens, input tokens, and
    // generated tokens.
    const numContext =
      Math.ceil(
        (processedInput.length + systemPrompt.length) * CONTEXT_SIZE_MULTIPLIER
      ) + nPredict;

    let engine;
    try {
      engine = await lazy.createEngine(
        {
          backend: "wllama",
          engineId: "wllamapreview",
          kvCacheDtype: "q8_0",
          modelFile: "smollm2-360m-instruct-q8_0.gguf",
          modelHubRootUrl: "https://model-hub.mozilla.org",
          modelHubUrlTemplate: "{model}/{revision}",
          modelId: "HuggingFaceTB/SmolLM2-360M-Instruct-GGUF",
          modelRevision: "main",
          numBatch: numContext,
          numContext,
          numUbatch: numContext,
          runtimeFilename: "wllama.wasm",
          taskName: "wllama-text-generation",
          timeoutMS: -1,
          useMlock: false,
          useMmap: true,
          // Preference-provided options take precedence over the defaults.
          ...JSON.parse(lazy.config),
        },
        data => {
          if (data.type == lazy.Progress.ProgressType.DOWNLOAD) {
            // Reports true while a download is in progress, false once done.
            onDownload?.(
              data.statusText != lazy.Progress.ProgressStatusText.DONE
            );
          }
        }
      );

      const postProcessor = new SentencePostProcessor();
      for await (const val of engine.runWithGenerator({
        nPredict,
        prompt: [
          { role: "system", content: systemPrompt },
          { role: "user", content: processedInput },
        ],
      })) {
        const sentence = postProcessor.put(val.text);
        if (sentence) {
          onText?.(sentence);
        } else if (!val.text) {
          // An empty chunk is treated as end of output: emit what is buffered.
          const remaining = postProcessor.flush();
          if (remaining) {
            onText?.(remaining);
          }
        }
      }
    } catch (error) {
      onError?.(error);
    } finally {
      await engine?.terminate();
    }
  },
};
|
|
|
|
/**
 * Buffers streamed text and hands it back one complete sentence at a time.
 *
 * Each call to put() appends to an internal buffer and re-segments it with
 * LinkPreviewModel.getSentences(). A sentence is only released once a
 * following sentence has started, and output stops after an optional maximum
 * number of sentences.
 */
export class SentencePostProcessor {
  /**
   * The maximum number of sentences to output before truncating the buffer.
   * Use -1 for unlimited.
   *
   * @type {number}
   */
  maxNumOutputSentences = -1;

  /**
   * Stores the current text being processed.
   *
   * @type {string}
   */
  currentText = "";

  /**
   * Tracks the number of sentences processed so far.
   *
   * @type {number}
   */
  currentNumSentences = 0;

  /**
   * @param {number} maxNumOutputSentences - The maximum number of sentences to
   * output before truncating the buffer.
   */
  constructor(maxNumOutputSentences = lazy.outputSentences) {
    this.maxNumOutputSentences = maxNumOutputSentences;
  }

  /**
   * Processes incoming text, checking if a full sentence has been completed. If
   * a full sentence is detected, it returns the first complete sentence.
   * Otherwise, it returns an empty string.
   *
   * Once the sentence limit has been reached, further input is ignored and the
   * buffer stays empty.
   *
   * @param {string} text to process
   * @returns {string} first complete sentence if available, otherwise ""
   */
  put(text) {
    if (this.currentNumSentences == this.maxNumOutputSentences) {
      return "";
    }
    this.currentText += text;

    // We need to ensure that the current sentence is complete and the next
    // has started before reporting that a sentence is ready.
    const [first, ...rest] = LinkPreviewModel.getSentences(this.currentText);
    if (!rest.length) {
      return "";
    }

    this.currentNumSentences += 1;
    // Keep the unfinished tail for the next call, unless the limit was just
    // reached, in which case nothing more will be emitted.
    this.currentText =
      this.currentNumSentences == this.maxNumOutputSentences
        ? ""
        : rest.join("");
    return first;
  }

  /**
   * Flushes the remaining text buffer. This ensures that any last remaining
   * sentence is returned.
   *
   * The buffer is emptied as part of the call, so calling flush() again
   * without an intervening put() returns "" instead of repeating the same
   * text.
   *
   * @returns {string} remaining text that hasn't been processed yet
   */
  flush() {
    const remaining = this.currentText;
    this.currentText = "";
    return remaining;
  }
}
|