Files
tubestation/browser/components/pagedata/SchemaOrgPageData.jsm

184 lines
4.9 KiB
JavaScript

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
"use strict";
var EXPORTED_SYMBOLS = ["SchemaOrgPageData"];
const { PageDataCollector } = ChromeUtils.import(
"resource:///modules/pagedata/PageDataCollector.jsm"
);
/**
* @typedef {object} ProductData
* Data about a product.
* @property {string | undefined} gtin
* The Global Trade Item Number for the product.
* @property {string | undefined} name
* The name of the product.
* @property {string | undefined} url
* The URL of the product, serialized as a string.
* @property {string | undefined} image
* The URL of a product image.
* @property {string | undefined} price
* The price of the product.
* @property {string | undefined} currency
* The currency of the price.
*/
/**
 * Finds the values for a given property.
 * See https://html.spec.whatwg.org/multipage/microdata.html#values for the parsing spec
 *
 * The value of each matching element is extracted according to its element
 * type: `content` for `meta`, a resolved URL for elements carrying `src`,
 * `data` or `href`, `value` for `data`/`meter`, `datetime` (falling back to the
 * text content) for `time`, and `content` or the text content for anything
 * else.
 *
 * TODO: Currently this will find item properties of inner-items. Need to use itemscope as a
 * boundary.
 *
 * @param {Element} element
 *   The item scope.
 * @param {string} prop
 *   The property to find. It is interpolated into a CSS selector unescaped, so
 *   it must not contain quote characters.
 * @returns {string[]}
 *   The values of the property, one per matching element in the order returned
 *   by `querySelectorAll`. Missing attributes and unparseable URLs yield an
 *   empty string.
 */
function getProp(element, prop) {
  // Resolves the URL held in `attr` against the URI of the owning document.
  // Returns an empty string when the attribute is absent or cannot be parsed.
  const parseUrl = (urlElement, attr) => {
    if (!urlElement.hasAttribute(attr)) {
      return "";
    }

    try {
      const url = new URL(
        urlElement.getAttribute(attr),
        urlElement.ownerDocument.documentURI
      );
      return url.toString();
    } catch (e) {
      // An invalid URL is deliberately treated the same as a missing one.
      return "";
    }
  };

  return Array.from(
    // Ignores properties that are scopes.
    element.querySelectorAll(`[itemprop~='${prop}']:not([itemscope])`),
    propElement => {
      switch (propElement.localName) {
        case "meta":
          return propElement.getAttribute("content") ?? "";
        case "audio":
        case "embed":
        case "iframe":
        case "img":
        case "source":
        case "track":
        case "video":
          return parseUrl(propElement, "src");
        case "object":
          return parseUrl(propElement, "data");
        case "a":
        case "area":
        case "link":
          return parseUrl(propElement, "href");
        case "data":
        case "meter":
          // Fall back to the empty string when there is no value attribute,
          // matching the handling of `meta` above.
          return propElement.getAttribute("value") ?? "";
        case "time":
          if (propElement.hasAttribute("datetime")) {
            return propElement.getAttribute("datetime");
          }
          return propElement.textContent;
        default:
          // Not mentioned in the spec but sites seem to use it.
          if (propElement.hasAttribute("content")) {
            return propElement.getAttribute("content");
          }
          return propElement.textContent;
      }
    }
  );
}
/**
 * Page data collector for schema.org markup.
 *
 * Only HTML Microdata is understood at present; RDFa and JSON-LD are not
 * supported. Product items are the only kind of data gathered.
 *
 * TODO: Respond to DOM mutations to trigger recollection.
 */
class SchemaOrgPageData extends PageDataCollector {
  /**
   * @see PageDataCollector.init
   */
  async init() {
    return this.#collect();
  }

  /**
   * Builds the product record for a single product item scope.
   *
   * @param {Element} element
   *   The DOM element representing the product.
   *
   * @returns {ProductData}
   *   The product data.
   */
  #collectProduct(element) {
    // Only the first value found for each property is used for now; a smarter
    // choice may be needed in future.
    const firstValue = name => getProp(element, name)[0];

    return {
      gtin: firstValue("gtin"),
      name: firstValue("name"),
      // Empty strings are normalised to undefined for the URL-like fields.
      image: firstValue("image") || undefined,
      url: firstValue("url") || undefined,
      price: firstValue("price"),
      currency: firstValue("priceCurrency"),
    };
  }

  /**
   * Gathers the data currently present in the page.
   *
   * @returns {Data[]}
   *   One `{ type, data }` entry per data type found, where `data` is the list
   *   of items of that type in the order they were encountered.
   */
  #collect() {
    // Maps a data type to the list of items of that type found in the page.
    const grouped = new Map();

    const selector =
      "[itemscope][itemtype^='https://schema.org/'], [itemscope][itemtype^='http://schema.org/']";

    for (const scope of this.document.querySelectorAll(selector)) {
      const rawType = scope.getAttribute("itemtype");

      // Drop the protocol ("https://" is 8 characters, "http://" is 7).
      const protocolLength = rawType.startsWith("https://") ? 8 : 7;
      const itemType = rawType.substring(protocolLength);

      if (itemType === "schema.org/Product") {
        const type = PageDataCollector.DATA_TYPE.PRODUCT;
        const product = this.#collectProduct(scope);

        if (grouped.has(type)) {
          grouped.get(type).push(product);
        } else {
          grouped.set(type, [product]);
        }
      }
    }

    return Array.from(grouped, ([type, data]) => ({ type, data }));
  }
}