160 lines
4.3 KiB
JavaScript
160 lines
4.3 KiB
JavaScript
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
"use strict";
|
|
|
|
var EXPORTED_SYMBOLS = ["SchemaOrgPageData"];
|
|
|
|
const { PageDataSchema } = ChromeUtils.import(
|
|
"resource:///modules/pagedata/PageDataSchema.jsm"
|
|
);
|
|
|
|
/**
 * Finds the values for a given property.
 * See https://html.spec.whatwg.org/multipage/microdata.html#values for the parsing spec
 *
 * TODO: Currently this will find item properties of inner-items. Need to use itemscope as a
 * boundary.
 *
 * @param {Element} element
 *   The item scope.
 * @param {string} prop
 *   The property to find.
 * @returns {string[]}
 *   One value per matching element, in the order `querySelectorAll` returned
 *   them. Empty when no element carries the property.
 */
function getProp(element, prop) {
  // Resolves a URL-valued attribute against the owning document's URI.
  // A missing attribute or an unparsable URL deliberately yields "" rather
  // than throwing, so a single bad element does not abort the whole lookup.
  const parseUrl = (urlElement, attr) => {
    if (!urlElement.hasAttribute(attr)) {
      return "";
    }

    try {
      const url = new URL(
        urlElement.getAttribute(attr),
        urlElement.ownerDocument.documentURI
      );
      return url.toString();
    } catch (e) {
      return "";
    }
  };

  return Array.from(
    // Ignores properties that are scopes.
    element.querySelectorAll(`[itemprop~='${prop}']:not([itemscope])`),
    propElement => {
      switch (propElement.localName) {
        case "meta":
          return propElement.getAttribute("content") ?? "";
        case "audio":
        case "embed":
        case "iframe":
        case "img":
        case "source":
        case "track":
        case "video":
          return parseUrl(propElement, "src");
        case "object":
          return parseUrl(propElement, "data");
        case "a":
        case "area":
        case "link":
          return parseUrl(propElement, "href");
        case "data":
        case "meter":
          // Fall back to the empty string when `value` is absent, matching
          // the `meta` branch above instead of leaking `null` to callers.
          return propElement.getAttribute("value") ?? "";
        case "time":
          // Prefer the machine-readable `datetime` attribute when present.
          if (propElement.hasAttribute("datetime")) {
            return propElement.getAttribute("datetime");
          }
          return propElement.textContent;
        default:
          // Not mentioned in the spec but sites seem to use it.
          if (propElement.hasAttribute("content")) {
            return propElement.getAttribute("content");
          }
          return propElement.textContent;
      }
    }
  );
}
|
|
|
|
/**
 * Collects product data from an item scope.
 *
 * Writes `image` and `description` onto `pageData` when the scope provides
 * them, and always stores a product record (name plus optional price) under
 * `pageData.data[PageDataSchema.DATA_TYPE.PRODUCT]`.
 *
 * @param {PageData} pageData
 *   The pageData object to add to.
 * @param {Element} element
 *   The product item scope element.
 */
function collectProduct(pageData, element) {
  // At the moment we simply grab the first element found for each property.
  // In future we may need to do something better.

  const images = getProp(element, "image");
  if (images.length) {
    pageData.image = images[0];
  }

  const descriptions = getProp(element, "description");
  if (descriptions.length) {
    pageData.description = descriptions[0];
  }

  const product = {
    name: getProp(element, "name")[0],
  };

  const prices = getProp(element, "price");
  if (prices.length) {
    // Prices are decimal strings such as "19.99"; parseInt would silently
    // drop the fractional part, so parse as a float instead.
    const value = Number.parseFloat(prices[0]);
    // Skip the price entirely when it is not a usable number.
    if (Number.isFinite(value)) {
      product.price = {
        value,
        currency: getProp(element, "priceCurrency")[0],
      };
    }
  }

  pageData.data[PageDataSchema.DATA_TYPE.PRODUCT] = product;
}
|
|
|
|
/**
 * Gathers schema.org structured data out of a document.
 *
 * Only HTML Microdata is understood; RDFa and JSON-LD are not handled.
 * Product item scopes are collected (first one wins) and the name of an
 * Organization item scope is recorded as the site name.
 */
const SchemaOrgPageData = {
  /**
   * Scans `document` for schema.org item scopes and builds a page data
   * object from them.
   *
   * @param {Document} document
   *   The document to inspect.
   * @returns {object}
   *   An object with a `data` map and, when found, a `siteName`, plus
   *   whatever `collectProduct` adds for the first Product scope.
   */
  collect(document) {
    const SCHEMA_SCOPE_SELECTOR =
      "[itemscope][itemtype^='https://schema.org/'], [itemscope][itemtype^='http://schema.org/']";

    const result = { data: {} };

    for (const scopeElement of document.querySelectorAll(
      SCHEMA_SCOPE_SELECTOR
    )) {
      const rawType = scopeElement.getAttribute("itemtype");

      // Drop the protocol; the selector only admits https:// or http://.
      const prefixLength = rawType.startsWith("https://")
        ? "https://".length
        : "http://".length;
      const schemaType = rawType.substring(prefixLength);

      if (schemaType === "schema.org/Product") {
        // Only the first product on the page is collected.
        const alreadyCollected =
          PageDataSchema.DATA_TYPE.PRODUCT in result.data;
        if (!alreadyCollected) {
          collectProduct(result, scopeElement);
        }
      } else if (schemaType === "schema.org/Organization") {
        result.siteName = getProp(scopeElement, "name")[0];
      }
    }

    return result;
  },
};
|