184 lines
4.9 KiB
JavaScript
184 lines
4.9 KiB
JavaScript
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
"use strict";
|
|
|
|
var EXPORTED_SYMBOLS = ["SchemaOrgPageData"];
|
|
|
|
const { PageDataCollector } = ChromeUtils.import(
|
|
"resource:///modules/pagedata/PageDataCollector.jsm"
|
|
);
|
|
|
|
/**
 * @typedef {object} ProductData
 *   Data about a product.
 * @property {string | undefined} gtin
 *   The Global Trade Item Number for the product.
 * @property {string | undefined} name
 *   The name of the product.
 * @property {string | undefined} url
 *   The url of the product, serialized as a string.
 * @property {string | undefined} image
 *   The url of a product image.
 * @property {string | undefined} price
 *   The price of the product.
 * @property {string | undefined} currency
 *   The currency of the price.
 */
|
|
|
|
/**
 * Finds the values for a given property.
 * See https://html.spec.whatwg.org/multipage/microdata.html#values for the parsing spec
 *
 * Every element under `element` whose `itemprop` token list contains `prop`
 * and which is not itself an `itemscope` contributes one value. How the value
 * is read depends on the element's local name (URL attribute, `content`,
 * `value`, `datetime` or text content).
 *
 * TODO: Currently this will find item properties of inner-items. Need to use itemscope as a
 * boundary.
 *
 * @param {Element} element
 *   The item scope.
 * @param {string} prop
 *   The property to find.
 * @returns {any[]}
 *   The values of the property, one per matching element, in the order the
 *   elements are returned by `querySelectorAll`.
 */
function getProp(element, prop) {
  // Resolves the URL held in `attr` against the owning document's URI.
  // A missing attribute or an unparsable URL yields the empty string.
  const parseUrl = (urlElement, attr) => {
    if (!urlElement.hasAttribute(attr)) {
      return "";
    }

    try {
      let url = new URL(
        urlElement.getAttribute(attr),
        urlElement.ownerDocument.documentURI
      );
      return url.toString();
    } catch (e) {
      return "";
    }
  };

  return Array.from(
    // Ignores properties that are scopes.
    element.querySelectorAll(`[itemprop~='${prop}']:not([itemscope])`),
    propElement => {
      switch (propElement.localName) {
        case "meta":
          return propElement.getAttribute("content") ?? "";
        case "audio":
        case "embed":
        case "iframe":
        case "img":
        case "source":
        case "track":
        case "video":
          return parseUrl(propElement, "src");
        case "object":
          return parseUrl(propElement, "data");
        case "a":
        case "area":
        case "link":
          return parseUrl(propElement, "href");
        case "data":
        case "meter":
          // Per the spec a missing `value` attribute means the empty string,
          // the same fallback the `meta` branch uses for `content`.
          return propElement.getAttribute("value") ?? "";
        case "time":
          // Prefer the machine-readable `datetime` attribute when present.
          if (propElement.hasAttribute("datetime")) {
            return propElement.getAttribute("datetime");
          }
          return propElement.textContent;
        default:
          // Not mentioned in the spec but sites seem to use it.
          if (propElement.hasAttribute("content")) {
            return propElement.getAttribute("content");
          }
          return propElement.textContent;
      }
    }
  );
}
|
|
|
|
/**
 * Collects schema.org related data from a page.
 *
 * Currently only supports HTML Microdata, not RDFa or JSON-LD formats.
 * Currently only collects product data.
 *
 * TODO: Respond to DOM mutations to trigger recollection.
 */
class SchemaOrgPageData extends PageDataCollector {
  /**
   * @see PageDataCollector.init
   */
  async init() {
    return this.#collect();
  }

  /**
   * Builds the product record for a single product item scope.
   *
   * @param {Element} element
   *   The DOM element representing the product.
   *
   * @returns {ProductData}
   *   The product data.
   */
  #collectProduct(element) {
    // Only the first match for each property is used for now; something
    // smarter may be needed later.
    const firstValue = prop => getProp(element, prop)[0];

    const gtin = firstValue("gtin");
    const name = firstValue("name");
    // Falsy results (e.g. the empty string) are reported as undefined.
    const image = firstValue("image") || undefined;
    const url = firstValue("url") || undefined;
    const price = firstValue("price");
    const currency = firstValue("priceCurrency");

    return { gtin, name, image, url, price, currency };
  }

  /**
   * Scans the document for schema.org item scopes and gathers their data.
   *
   * @returns {Data[]}
   */
  #collect() {
    // Item type -> list of items of that type found in the page.
    const grouped = new Map();

    const scopeElements = this.document.querySelectorAll(
      "[itemscope][itemtype^='https://schema.org/'], [itemscope][itemtype^='http://schema.org/']"
    );

    for (const scopeElement of scopeElements) {
      const rawType = scopeElement.getAttribute("itemtype");
      // Drop the protocol: "https://" is 8 characters, "http://" is 7.
      const prefixLength = rawType.startsWith("https://") ? 8 : 7;
      const bareType = rawType.substring(prefixLength);

      if (bareType === "schema.org/Product") {
        const type = PageDataCollector.DATA_TYPE.PRODUCT;
        const product = this.#collectProduct(scopeElement);

        let bucket = grouped.get(type);
        if (!bucket) {
          bucket = [];
          grouped.set(type, bucket);
        }
        bucket.push(product);
      }
    }

    return Array.from(grouped, ([type, data]) => ({ type, data }));
  }
}
|