Files
tubestation/browser/components/genai/LinkPreviewChild.sys.mjs
Tim Xia 736aabff21 Bug 1952011 - Implement fetch for Link Preview - r=Mardak,firefox-desktop-core-reviewers ,firefox-ai-ml-reviewers,mconley
- add fetch in child process
- DOM parsing only once
- check content-type to be text/html before fetching content
- add custom request header x-firefox-ai so publisher has option to allow/disallow the fetch request
- more fetch safeguards (max content-length) may come in the near future

Differential Revision: https://phabricator.services.mozilla.com/D240654
2025-03-11 05:15:02 +00:00

181 lines
5.2 KiB
JavaScript

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// Holder object for module exports that are attached as getters below; the
// rest of this file reaches ReaderMode through `lazy.ReaderMode`.
const lazy = {};
// NOTE(review): presumably this defers loading ReaderMode.sys.mjs until
// `lazy.ReaderMode` is first read — confirm against the
// ChromeUtils.defineESModuleGetters documentation.
ChromeUtils.defineESModuleGetters(lazy, {
ReaderMode: "moz-src:///toolkit/components/reader/ReaderMode.sys.mjs",
});
/**
 * Child actor for link previews. On request from the parent actor it fetches
 * a URL, parses the returned HTML once, and reports the page's meta tags and
 * the article data extracted by ReaderMode.
 *
 * @class LinkPreviewChild
 * @augments {JSWindowActorChild}
 */
export class LinkPreviewChild extends JSWindowActorChild {
  /**
   * Handles incoming messages from the parent actor.
   *
   * @param {object} message - The message object containing name and data.
   * @param {string} message.name - The name of the message.
   * @param {object} message.data - Data associated with the message. For
   *   "LinkPreview:FetchPageData" its `url` property is the page to fetch.
   * @returns {Promise<object|undefined>} The result of fetchPageData for
   *   "LinkPreview:FetchPageData"; undefined for any other message name.
   */
  async receiveMessage({ name, data }) {
    if (name === "LinkPreview:FetchPageData") {
      return this.fetchPageData(data.url);
    }
    // Explicit return keeps ESLint's consistent-return rule satisfied.
    return undefined;
  }

  /**
   * Fetches the HTML content from the given URL.
   *
   * A HEAD request is issued first so unsuitable resources can be rejected
   * before any body is downloaded. The GET response is then validated with
   * the same rules, because a server is free to answer GET differently from
   * HEAD and it is the GET body that is actually read.
   *
   * @param {string} url - The URL to fetch.
   * @returns {Promise<string>} The HTML content as a string.
   * @throws {Error} If either request is not OK, the content type is not
   *   text/html, or the declared content length exceeds the limit.
   */
  async fetchHTML(url) {
    // The custom header lets publishers recognise (and allow or disallow)
    // these requests.
    const requestOptions = {
      mode: "cors",
      headers: {
        "x-firefox-ai": "true",
      },
    };

    const headResponse = await fetch(url, {
      ...requestOptions,
      method: "HEAD",
    });
    this.#assertPreviewableResponse(headResponse);

    const response = await fetch(url, { ...requestOptions, method: "GET" });
    this.#assertPreviewableResponse(response);

    return response.text();
  }

  /**
   * Validates that a response is OK, is HTML, and does not declare a body
   * larger than the allowed maximum.
   *
   * @param {Response} response - The HEAD or GET response to validate.
   * @throws {Error} If any of the checks fails.
   */
  #assertPreviewableResponse(response) {
    const MAX_CONTENT_LENGTH = 2 * 1024 * 1024; // 2 MB limit

    if (!response.ok) {
      throw new Error(
        `Failed to fetch: ${response.status} ${response.statusText}`
      );
    }

    const contentType = response.headers.get("content-type") ?? "";
    // Media type names are case-insensitive, so "Text/HTML" must be accepted.
    if (!contentType.toLowerCase().startsWith("text/html")) {
      throw new Error(`Invalid content-type: ${contentType}`);
    }

    const contentLength = Number.parseInt(
      response.headers.get("content-length"),
      10
    );
    // A missing or malformed header parses to NaN, which compares false here,
    // so the limit only applies when a length is actually declared.
    if (contentLength > MAX_CONTENT_LENGTH) {
      throw new Error(`Content length exceeds limit: ${contentLength} bytes`);
    }
  }

  /**
   * Fetches HTML content from a URL, parses it once, and extracts its meta
   * tags and article data.
   *
   * @param {string} url - The URL to fetch and parse.
   * @returns {Promise<object>} An object of the form `{ metaInfo, article }`.
   *   If fetching or parsing fails, the error is logged and both properties
   *   are empty objects.
   */
  async fetchPageData(url) {
    try {
      const htmlCode = await this.fetchHTML(url);
      // Parse a single time and share the document between both extractors.
      const doc = new DOMParser().parseFromString(htmlCode, "text/html");
      const metaInfo = this.parseMetaTagsFromDoc(doc);
      const article = await this.getArticleDataFromDoc(doc);
      return { metaInfo, article };
    } catch (error) {
      console.error(`Failed to fetch and parse page data: ${error.message}`);
      return {
        metaInfo: {},
        article: {},
      };
    }
  }

  /**
   * Parses meta tags from the provided Document into a key-value object.
   * Also extracts the document title, stored under the "html:title" key.
   *
   * Tag names are read from the `name` attribute, falling back to `property`,
   * and are matched case-insensitively. Matching entries are stored under
   * the lowercased name so that lookups by the allow-list name succeed
   * regardless of how the page capitalised the attribute. When several tags
   * share a name, the last one wins.
   *
   * @param {Document} doc - The parsed HTML document.
   * @returns {object} An object containing meta tag key-value pairs.
   */
  parseMetaTagsFromDoc(doc) {
    // TODO: Define the meta tags we are interested in
    const desiredMetaNames = new Set(["description", "og:image", "title"]);
    const metaInfo = {};

    for (const tag of doc.querySelectorAll("meta")) {
      const name = tag.getAttribute("name") || tag.getAttribute("property");
      const content = tag.getAttribute("content");
      if (!name || !content) {
        continue;
      }
      const key = name.toLowerCase();
      if (desiredMetaNames.has(key)) {
        metaInfo[key] = content;
      }
    }

    const title = doc.querySelector("title")?.textContent;
    if (title) {
      metaInfo["html:title"] = title;
    }
    return metaInfo;
  }

  /**
   * Extracts article data from the provided Document using ReaderMode.
   *
   * @param {Document} doc - The parsed HTML document.
   * @returns {Promise<object>} An object with the article's `title`,
   *   `byline`, `textContent`, `length` and `siteName`. An empty object is
   *   returned when ReaderMode yields no article or throws (the error is
   *   logged).
   */
  async getArticleDataFromDoc(doc) {
    try {
      const article = await lazy.ReaderMode.parseDocument(doc);
      if (article) {
        // Forward only the fields the preview needs rather than the whole
        // ReaderMode result.
        //TODO getReadingTime from Readermode
        const { title, byline, textContent, length, siteName } = article;
        return {
          title,
          byline,
          textContent,
          length,
          siteName,
        };
      }
    } catch (error) {
      console.error("Error parsing document with ReaderMode:", error);
    }
    return {};
  }
}