We developed the new translations codebase using an "s" at the end, and retained the old translations code under "translation". At this point I'm unifying it so that it's all under "translations", which involves a rename of the existing code. This way we will be consistent in our naming practice. Differential Revision: https://phabricator.services.mozilla.com/D239047
193 lines
6.3 KiB
JavaScript
193 lines
6.3 KiB
JavaScript
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
// workerManager is exported for tests.
|
|
import { clearTimeout, setTimeout } from "resource://gre/modules/Timer.sys.mjs";
|
|
|
|
// URL of the script that workerManager hands to `new Worker` in order to run
// language detection off the calling thread.
// NOTE(review): the change description accompanying this file talks about
// unifying paths under "translations"; confirm that "translation/" is still
// where cld-worker.js is packaged.
const WORKER_URL = "resource://gre/modules/translation/cld-worker.js";
|
|
|
|
/**
 * The length of the substring to pull from the document's text for language
 * identification. It is the maximum length passed to the document encoder in
 * LanguageDetector.detectLanguageFromDocument.
 *
 * This value should ideally be one that is large enough to yield a confident
 * identification result without being too large or expensive to extract.
 *
 * At this time, this value is not driven by statistical data or analysis.
 *
 * For the moment, while we investigate which language identification library
 * we would like to use, keep this logic in sync with
 * language-id-engine.sys.mjs.
 */
const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;
|
|
|
|
export var workerManager = {
  // Since Emscripten can handle heap growth, but not heap shrinkage, we
  // need to refresh the worker after we've processed a particularly large
  // string in order to prevent unnecessary resident memory growth.
  //
  // These values define the cut-off string length and the idle timeout
  // (in milliseconds) before destroying a worker. Once a string of the
  // maximum size has been processed, the worker is marked for
  // destruction, and is terminated as soon as it has been idle for the
  // given timeout.
  //
  // 1.5MB. This is the approximate string length that forces heap growth
  // for a 2MB heap.
  LARGE_STRING: 1.5 * 1024 * 1024,
  IDLE_TIMEOUT: 10 * 1000,

  // FIFO list of `{ resolve }` entries, one per request that has been posted
  // to the worker and not answered yet. The message handler installed by
  // `workerReady` resolves the oldest entry with each non-"ready" message, so
  // this relies on the worker answering requests in the order it got them.
  detectionQueue: [],

  /**
   * Posts a detection request to the worker and resolves with its answer.
   *
   * @param {object} aParams
   *   The message to post to the worker. Its `text` property is also read
   *   here to decide whether the worker should be recycled afterwards.
   * @returns {Promise<*>}
   *   Resolves with the `data` of the worker message answering this request.
   *   Rejects if posting the message to the worker throws.
   */
  async detectLanguage(aParams) {
    const worker = await this.workerReady;

    const result = await new Promise(resolve => {
      // Post before enqueueing. If postMessage throws (for instance because
      // aParams cannot be cloned), this promise rejects and no stale resolver
      // is left in the queue. A stale entry would shift every later answer
      // onto the wrong caller. The answer is delivered asynchronously, so it
      // cannot arrive before the resolver has been queued.
      worker.postMessage(aParams);
      this.detectionQueue.push({ resolve });
    });

    // We have our asynchronous result from the worker.
    //
    // Determine if our input was large enough to trigger heap growth,
    // or if we're already waiting to destroy the worker when it's
    // idle. If so, schedule termination after the idle timeout.
    if (
      aParams.text.length >= this.LARGE_STRING ||
      this._idleTimeout != null
    ) {
      this.flushWorker();
    }

    return result;
  },

  // The live worker, or null when none has been created or after it has been
  // terminated by _flushWorker.
  _worker: null,
  // Cached promise returned by `workerReady`; reset together with _worker.
  _workerReadyPromise: null,

  /**
   * Lazily creates the worker on first access.
   *
   * @returns {Promise<Worker>}
   *   Resolves with the worker once it has posted its "ready" message. The
   *   same promise is returned until the worker is terminated.
   */
  get workerReady() {
    if (!this._workerReadyPromise) {
      this._workerReadyPromise = new Promise(resolve => {
        const worker = new Worker(WORKER_URL);
        worker.onmessage = aMsg => {
          if (aMsg.data === "ready") {
            resolve(worker);
          } else {
            // Any other message is the answer to the oldest pending request.
            this.detectionQueue.shift().resolve(aMsg.data);
          }
        };
        this._worker = worker;
      });
    }

    return this._workerReadyPromise;
  },

  // Holds the ID of the current pending idle cleanup setTimeout.
  _idleTimeout: null,

  // Schedule the current worker to be terminated after the idle timeout.
  // Any termination that was already scheduled is cancelled first, so calling
  // this repeatedly keeps pushing the deadline back.
  flushWorker() {
    if (this._idleTimeout != null) {
      clearTimeout(this._idleTimeout);
    }

    this._idleTimeout = setTimeout(
      this._flushWorker.bind(this),
      this.IDLE_TIMEOUT
    );
  },

  // Immediately terminate the worker, as long as there are no pending
  // results. Otherwise, reschedule termination until after the next
  // idle timeout.
  _flushWorker() {
    if (this.detectionQueue.length) {
      this.flushWorker();
      return;
    }

    if (this._worker) {
      this._worker.terminate();
    }

    // Forget the worker so that the next `workerReady` access creates a
    // fresh one.
    this._worker = null;
    this._workerReadyPromise = null;
    this._idleTimeout = null;
  },
};
|
|
|
|
export var LanguageDetector = {
  /**
   * Detect the language of a given string.
   *
   * The argument may be either a string containing the text to analyze,
   * or an object with the following properties:
   *
   *  - 'text' The text to analyze.
   *
   *  - 'isHTML' (optional) A boolean, indicating whether the text
   *      should be analyzed as HTML rather than plain text.
   *
   *  - 'language' (optional) A string indicating the expected language.
   *      For text extracted from HTTP documents, this is expected to
   *      come from the Content-Language header.
   *
   *  - 'tld' (optional) A string indicating the top-level domain of the
   *      document the text was extracted from.
   *
   *  - 'encoding' (optional) A string describing the encoding of the
   *      document the string was extracted from. Note that, regardless
   *      of the value of this property, the 'text' property must be a
   *      UTF-16 JavaScript string.
   *
   * @param {string | object} aParams
   *   The text itself, or an object as described above. An object is passed
   *   on to the worker manager as-is; a string is wrapped as `{ text }`.
   * @returns {Promise<Object>}
   * @resolves When detection is finished, with an object containing
   * these fields:
   *  - 'language' (string with a language code)
   *  - 'confident' (boolean) Whether the detector is confident of the
   *      result.
   *  - 'languages' (array) An array of up to three elements, containing
   *      the most prevalent languages detected. It contains a
   *      'languageCode' property, containing the ISO language code of
   *      the language, and a 'percent' property, describing the
   *      approximate percentage of the input which is in that language.
   *      For text of an unknown language, the result may contain an
   *      entry with the language code 'un', indicating the percent of
   *      the text which is unknown.
   */
  detectLanguage(aParams) {
    const params = typeof aParams === "string" ? { text: aParams } : aParams;

    return workerManager.detectLanguage(params);
  },

  /**
   * Attempts to determine the language in which the document's content is written.
   *
   * For the moment, while we investigate which language identification library
   * we would like to use, keep this logic in sync with language-id-engine.sys.mjs
   *
   * @param {Document} aDocument
   *   The document whose text is sampled for identification.
   * @returns {Promise<string | null>}
   *   Resolves with the detected language code when the detector reports that
   *   it is confident, and with null otherwise.
   */
  async detectLanguageFromDocument(aDocument) {
    // Grab a selection of text: at most DOC_TEXT_TO_IDENTIFY_LENGTH of the
    // document's plain-text serialization, skipping invisible content.
    const encoder = Cu.createDocumentEncoder("text/plain");
    encoder.init(aDocument, "text/plain", encoder.SkipInvisibleContent);

    // Drop carriage returns and turn newlines into spaces so the sample is
    // handed to the detector as a single line.
    const text = encoder
      .encodeToStringWithMaxLength(DOC_TEXT_TO_IDENTIFY_LENGTH)
      .replaceAll("\r", "")
      .replaceAll("\n", " ");

    const { language, confident } = await workerManager.detectLanguage({
      text,
    });

    // Always schedule the worker for termination once it goes idle,
    // regardless of how large the sample was.
    workerManager.flushWorker();

    return confident ? language : null;
  },
};
|