Ghost/core/frontend/services/sitemap/base-generator.js
Hannah Wolfe 7d1d6ec6eb
🐛 Fixed error in sitemap with >50k posts (#13317)
closes: CORE-34
refs: https://github.com/TryGhost/Team/issues/1044

- this is a super basic fix: it adds a "max nodes" concept and limits the nodes in each sub-sitemap to 50k by default
- this will prevent the error in google console
- a better fix is in progress, but we want to at least solve the errors ASAP
2021-09-17 11:13:42 +01:00

189 lines
5.0 KiB
JavaScript

const _ = require('lodash');
const xml = require('xml');
const moment = require('moment');
const path = require('path');
const urlUtils = require('../../../shared/url-utils');
const localUtils = require('./utils');
// Sitemap-specific XML namespace declarations that should not change.
// This object is placed first in the `urlset` array built by
// generateXmlFromNodes(); `_attr` is the key the `xml` package uses for
// element attributes, so these end up as attributes of the root element.
// - xmlns:       the sitemaps.org 0.9 schema
// - xmlns:image: Google's image sitemap extension (used for `image:image` nodes)
const XMLNS_DECLS = {
_attr: {
xmlns: 'http://www.sitemaps.org/schemas/sitemap/0.9',
'xmlns:image': 'http://www.google.com/schemas/sitemap-image/1.1'
}
};
/**
 * Collects sitemap `url` nodes keyed by datum id and renders them into a
 * sitemap `urlset` XML document. The rendered XML is cached until a url is
 * added, removed, or the generator is reset.
 */
class BaseSiteMapGenerator {
    constructor() {
        // datum id -> xml node for that datum's `url` entry
        this.nodeLookup = {};
        // datum id -> last-modified value of the matching node
        this.nodeTimeLookup = {};
        // Cached xml string; `null` forces regeneration on the next getXml()
        this.siteMapContent = null;
        // Most recent modification seen so far. Holds 0 initially, a moment
        // after addUrl() and a millisecond timestamp after removeUrl().
        this.lastModified = 0;
        // Upper bound on the number of nodes rendered; a falsy value disables the limit
        this.maxNodes = 50000;
    }

    /**
     * Renders the currently known nodes, newest first, into the sitemap XML.
     * When more than `maxNodes` nodes are known, only the newest `maxNodes`
     * are rendered.
     *
     * @returns {string} the full sitemap xml
     */
    generateXmlFromNodes() {
        // Get a mapping of node to timestamp
        let nodesToProcess = _.map(this.nodeLookup, (node, id) => {
            return {
                id: id,
                // Using negative here to sort newest to oldest
                ts: -(this.nodeTimeLookup[id] || 0),
                node: node
            };
        });

        // Sort nodes by timestamp. This has to happen BEFORE the limit is
        // applied, otherwise the limit keeps whichever ids happen to be
        // enumerated first and can drop the newest entries.
        nodesToProcess = _.sortBy(nodesToProcess, 'ts');

        // Limit to 50k nodes - this is a quick fix to prevent errors in google console
        if (this.maxNodes) {
            nodesToProcess = nodesToProcess.slice(0, this.maxNodes);
        }

        // Grab just the nodes
        nodesToProcess = _.map(nodesToProcess, 'node');

        const data = {
            // Concat the elements to the _attr declaration
            urlset: [XMLNS_DECLS].concat(nodesToProcess)
        };

        // Generate full xml
        const sitemapXml = localUtils.getDeclarations() + xml(data);

        // Perform url transformations
        // - Necessary because sitemap data is supplied by the router which
        //   uses knex directly bypassing model-layer attribute transforms
        return urlUtils.transformReadyToAbsolute(sitemapXml);
    }

    /**
     * Registers (or replaces) the node for a datum and invalidates the cached xml.
     *
     * @param {string} url - location written into the node's `loc`
     * @param {Object} datum - resource data; `id` and the date/image fields are read
     */
    addUrl(url, datum) {
        const node = this.createUrlNodeFromDatum(url, datum);

        if (node) {
            this.updateLastModified(datum);
            this.updateLookups(datum, node);
            // force regeneration of xml
            this.siteMapContent = null;
        }
    }

    /**
     * Drops the node for a datum, invalidates the cached xml and marks the
     * sitemap as modified "now".
     *
     * @param {string} url - unused here; kept for interface symmetry with addUrl
     * @param {Object} datum - resource data; only `id` is read
     */
    removeUrl(url, datum) {
        this.removeFromLookups(datum);

        // force regeneration of xml
        this.siteMapContent = null;
        this.lastModified = Date.now();
    }

    /**
     * Picks the first available date of `updated_at`, `published_at`,
     * `created_at` and wraps it in a moment.
     *
     * @param {Object} datum
     * @returns {Object} a moment; the current time when the datum has no date
     */
    getLastModifiedForDatum(datum) {
        const modifiedDate = datum.updated_at || datum.published_at || datum.created_at;

        return modifiedDate ? moment(modifiedDate) : moment();
    }

    /**
     * Advances `lastModified` when the datum is newer than what was seen so far.
     *
     * @param {Object} datum
     */
    updateLastModified(datum) {
        const lastModified = this.getLastModifiedForDatum(datum);

        // Relational comparison coerces both sides via valueOf(), so this works
        // whether `this.lastModified` currently holds a number or a moment
        if (lastModified > this.lastModified) {
            this.lastModified = lastModified;
        }
    }

    /**
     * Builds the `url` node (loc, lastmod and optional image) for a datum.
     *
     * @param {string} url
     * @param {Object} datum
     * @returns {Object} node in the array-based structure passed to `xml()`
     */
    createUrlNodeFromDatum(url, datum) {
        const node = {
            url: [
                {loc: url},
                {lastmod: moment(this.getLastModifiedForDatum(datum)).toISOString()}
            ]
        };

        const imgNode = this.createImageNodeFromDatum(datum);

        if (imgNode) {
            node.url.push(imgNode);
        }

        return node;
    }

    /**
     * Builds the `image:image` node for a datum, if it has a usable image.
     *
     * @param {Object} datum
     * @returns {Object|undefined} the image node, or undefined when the datum
     *   has no image or its url fails validateImageUrl()
     */
    createImageNodeFromDatum(datum) {
        // Check for cover first because user has cover but the rest only have image
        const image = datum.cover_image || datum.profile_image || datum.feature_image;

        if (!image) {
            return;
        }

        // Grab the image url
        const imageUrl = urlUtils.urlFor('image', {image: image}, true);

        // Verify the url structure
        if (!this.validateImageUrl(imageUrl)) {
            return;
        }

        // Create the weird xml node syntax structure that is expected
        const imageEl = [
            {'image:loc': imageUrl},
            {'image:caption': path.basename(imageUrl)}
        ];

        // Return the node to be added to the url xml node
        return {
            'image:image': imageEl
        };
    }

    /**
     * @param {string} imageUrl
     * @returns {boolean} true for any truthy url
     */
    validateImageUrl(imageUrl) {
        return !!imageUrl;
    }

    /**
     * Returns the cached sitemap xml, generating and caching it first if needed.
     *
     * @returns {string}
     */
    getXml() {
        if (this.siteMapContent) {
            return this.siteMapContent;
        }

        const content = this.generateXmlFromNodes();
        this.siteMapContent = content;
        return content;
    }

    /**
     * Stores the node and its last-modified value under the datum's id.
     *
     * @NOTE
     * The url service currently has no url update event.
     * It removes and adds the url. If the url service extends its
     * feature set, we can detect if a node has changed.
     *
     * @param {Object} datum
     * @param {Object} node
     */
    updateLookups(datum, node) {
        this.nodeLookup[datum.id] = node;
        this.nodeTimeLookup[datum.id] = this.getLastModifiedForDatum(datum);
    }

    /**
     * Removes the node and its last-modified value stored under the datum's id.
     *
     * @param {Object} datum
     */
    removeFromLookups(datum) {
        delete this.nodeLookup[datum.id];
        delete this.nodeTimeLookup[datum.id];
    }

    /**
     * Forgets all nodes and the cached xml. `lastModified` and `maxNodes` are
     * left untouched.
     */
    reset() {
        this.nodeLookup = {};
        this.nodeTimeLookup = {};
        this.siteMapContent = null;
    }
}
module.exports = BaseSiteMapGenerator;