🐛 Added pagination to sitemap.xml to avoid max 50,000 entries limit

refs https://github.com/TryGhost/Team/issues/1044
refs https://github.com/TryGhost/Ghost/pull/13298

- This splits the sitemaps according to the limit set by Google https://developers.google.com/search/docs/advanced/sitemaps/large-sitemaps

Co-authored-by:
- Kevin Ansfield (@kevinansfield)
This commit is contained in:
Thibaut Patel 2022-01-05 11:54:35 +01:00 committed by GitHub
parent 9427886610
commit 01e833376b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 113 additions and 46 deletions

View File

@ -17,12 +17,12 @@ class BaseSiteMapGenerator {
constructor() {
this.nodeLookup = {};
this.nodeTimeLookup = {};
this.siteMapContent = null;
this.siteMapContent = new Map();
this.lastModified = 0;
this.maxNodes = 50000;
this.maxPerPage = 50000;
}
generateXmlFromNodes() {
generateXmlFromNodes(page) {
// Get a mapping of node to timestamp
let nodesToProcess = _.map(this.nodeLookup, (node, id) => {
return {
@ -33,20 +33,23 @@ class BaseSiteMapGenerator {
};
});
// Limit to 50k nodes - this is a quick fix to prevent errors in google console
if (this.maxNodes) {
nodesToProcess = nodesToProcess.slice(0, this.maxNodes);
}
// Sort nodes by timestamp
nodesToProcess = _.sortBy(nodesToProcess, 'ts');
// Get the page of nodes that was requested
nodesToProcess = nodesToProcess.slice((page - 1) * this.maxPerPage, page * this.maxPerPage);
// Do not generate empty sitemaps
if (nodesToProcess.length === 0) {
return null;
}
// Grab just the nodes
nodesToProcess = _.map(nodesToProcess, 'node');
const nodes = _.map(nodesToProcess, 'node');
const data = {
// Concat the elements to the _attr declaration
urlset: [XMLNS_DECLS].concat(nodesToProcess)
urlset: [XMLNS_DECLS].concat(nodes)
};
// Generate full xml
@ -67,7 +70,7 @@ class BaseSiteMapGenerator {
this.updateLastModified(datum);
this.updateLookups(datum, node);
// force regeneration of xml
this.siteMapContent = null;
this.siteMapContent.clear();
}
}
@ -75,7 +78,7 @@ class BaseSiteMapGenerator {
this.removeFromLookups(datum);
// force regeneration of xml
this.siteMapContent = null;
this.siteMapContent.clear();
this.lastModified = Date.now();
}
@ -152,13 +155,13 @@ class BaseSiteMapGenerator {
return !!imageUrl;
}
getXml() {
if (this.siteMapContent) {
return this.siteMapContent;
getXml(page = 1) {
if (this.siteMapContent.has(page)) {
return this.siteMapContent.get(page);
}
const content = this.generateXmlFromNodes();
this.siteMapContent = content;
const content = this.generateXmlFromNodes(page);
this.siteMapContent.set(page, content);
return content;
}
@ -181,7 +184,7 @@ class BaseSiteMapGenerator {
reset() {
this.nodeLookup = {};
this.nodeTimeLookup = {};
this.siteMapContent = null;
this.siteMapContent.clear();
}
}

View File

@ -5,7 +5,8 @@ const manager = new Manager();
// Responsible for handling requests for sitemap files
module.exports = function handler(siteApp) {
const verifyResourceType = function verifyResourceType(req, res, next) {
if (!Object.prototype.hasOwnProperty.call(manager, req.params.resource)) {
const resourceWithoutPage = req.params.resource.replace(/-\d+$/, '');
if (!Object.prototype.hasOwnProperty.call(manager, resourceWithoutPage)) {
return res.sendStatus(404);
}
@ -22,14 +23,22 @@ module.exports = function handler(siteApp) {
});
siteApp.get('/sitemap-:resource.xml', verifyResourceType, function sitemapResourceXML(req, res) {
const type = req.params.resource;
const page = 1;
const type = req.params.resource.replace(/-\d+$/, '');
const pageParam = (req.params.resource.match(/-(\d+)$/) || [null, null])[1];
const page = pageParam ? parseInt(pageParam, 10) : 1;
const content = manager.getSiteMapXml(type, page);
// Prevent x-1.xml as it is a duplicate of x.xml and empty sitemaps
// (except for the first page so that at least one sitemap exists per type)
if (pageParam === '1' || (!content && page !== 1)) {
return res.sendStatus(404);
}
res.set({
'Cache-Control': 'public, max-age=' + config.get('caching:sitemap:maxAge'),
'Content-Type': 'text/xml'
});
res.send(manager.getSiteMapXml(type, page));
res.send(content);
});
};

View File

@ -14,6 +14,7 @@ class SiteMapIndexGenerator {
constructor(options) {
options = options || {};
this.types = options.types;
this.maxPerPage = options.maxPerPage;
}
getXml() {
@ -30,16 +31,25 @@ class SiteMapIndexGenerator {
generateSiteMapUrlElements() {
return _.map(this.types, (resourceType) => {
const url = urlUtils.urlFor({relativeUrl: '/sitemap-' + resourceType.name + '.xml'}, true);
const lastModified = resourceType.lastModified;
// `|| 1` = even if there are no items we still have an empty sitemap file
const noOfPages = Math.ceil(Object.keys(resourceType.nodeLookup).length / this.maxPerPage) || 1;
const pages = [];
return {
sitemap: [
{loc: url},
{lastmod: moment(lastModified).toISOString()}
]
};
});
for (let i = 0; i < noOfPages; i++) {
const page = i === 0 ? '' : `-${i + 1}`;
const url = urlUtils.urlFor({relativeUrl: '/sitemap-' + resourceType.name + page + '.xml'}, true);
const lastModified = resourceType.lastModified;
pages.push({
sitemap: [
{loc: url},
{lastmod: moment(lastModified).toISOString()}
]
});
}
return pages;
}).flat();
}
}

View File

@ -11,11 +11,13 @@ class SiteMapManager {
constructor(options) {
options = options || {};
options.maxPerPage = options.maxPerPage || 50000;
this.pages = options.pages || this.createPagesGenerator(options);
this.posts = options.posts || this.createPostsGenerator(options);
this.users = this.authors = options.authors || this.createUsersGenerator(options);
this.tags = options.tags || this.createTagsGenerator(options);
this.index = options.index || this.createIndexGenerator();
this.index = options.index || this.createIndexGenerator(options);
events.on('router.created', (router) => {
if (router.name === 'StaticRoutesRouter') {
@ -43,14 +45,15 @@ class SiteMapManager {
});
}
createIndexGenerator() {
createIndexGenerator(options) {
return new IndexMapGenerator({
types: {
pages: this.pages,
posts: this.posts,
authors: this.authors,
tags: this.tags
}
},
maxPerPage: options.maxPerPage
});
}
@ -74,8 +77,8 @@ class SiteMapManager {
return this.index.getXml();
}
getSiteMapXml(type) {
return this[type].getXml();
getSiteMapXml(type, page) {
return this[type].getXml(page);
}
}

View File

@ -53,7 +53,7 @@ describe('Generators', function () {
});
it('max node setting results in the right number of nodes', function () {
generator = new PostGenerator({maxNodes: 5});
generator = new PostGenerator({maxPerPage: 5});
for (let i = 0; i < 10; i++) {
generator.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({
@ -70,12 +70,12 @@ describe('Generators', function () {
Object.keys(generator.nodeLookup).should.be.Array().with.lengthOf(10);
// But only 5 are output in the xml
generator.siteMapContent.match(/<loc>/g).should.be.Array().with.lengthOf(5);
generator.siteMapContent.get(1).match(/<loc>/g).should.be.Array().with.lengthOf(5);
});
it('default is 50k', function () {
generator = new PostGenerator();
generator.maxNodes.should.eql(50000);
generator.maxPerPage.should.eql(50000);
});
describe('IndexGenerator', function () {
@ -86,7 +86,8 @@ describe('Generators', function () {
pages: new PageGenerator(),
tags: new TagGenerator(),
authors: new UserGenerator()
}
},
maxPerPage: 5
});
});
@ -99,6 +100,21 @@ describe('Generators', function () {
xml.should.match(/sitemap-pages.xml/);
xml.should.match(/sitemap-authors.xml/);
});
it('creates multiple pages when there are too many posts', function () {
for (let i = 0; i < 10; i++) {
generator.types.posts.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({
created_at: (Date.UTC(2014, 11, 22, 12) - 360000) + 200,
updated_at: null,
published_at: null,
slug: `episode-${i}`
}));
}
const xml = generator.getXml();
xml.should.match(/sitemap-posts.xml/);
xml.should.match(/sitemap-posts-2.xml/);
});
});
});
@ -126,9 +142,9 @@ describe('Generators', function () {
it('get cached xml', function () {
sinon.spy(generator, 'generateXmlFromNodes');
generator.siteMapContent = 'something';
generator.siteMapContent.set(1, 'something');
generator.getXml().should.eql('something');
generator.siteMapContent = null;
generator.siteMapContent.clear();
generator.generateXmlFromNodes.called.should.eql(false);
});
@ -184,6 +200,32 @@ describe('Generators', function () {
idxFirst.should.be.below(idxSecond);
idxSecond.should.be.below(idxThird);
});
it('creates multiple pages when there are too many posts', function () {
generator.maxPerPage = 5;
urlUtils.urlFor.withArgs('sitemap_xsl', true).returns('http://my-ghost-blog.com/sitemap.xsl');
for (let i = 0; i < 10; i++) {
generator.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({
created_at: (Date.UTC(2014, 11, 22, 12) - 360000) + 200,
updated_at: null,
published_at: null,
slug: `episode-${i}`
}));
}
const pages = [generator.getXml(), generator.getXml(2)];
for (let i = 0; i < 10; i++) {
const pageIndex = Math.floor(i / 5);
pages[pageIndex].should.containEql(`<loc>http://my-ghost-blog.com/episode-${i}/</loc>`);
}
});
it('shouldn\'t break with out of bounds pages', function () {
should.not.exist(generator.getXml(-1));
should.not.exist(generator.getXml(99999));
should.not.exist(generator.getXml(0));
});
});
describe('fn: removeUrl', function () {
@ -224,12 +266,12 @@ describe('Generators', function () {
generator.getXml();
generator.siteMapContent.should.containEql('<loc>http://my-ghost-blog.com/home/</loc>');
generator.siteMapContent.should.containEql('<loc>http://my-ghost-blog.com/magic/</loc>');
generator.siteMapContent.should.containEql('<loc>http://my-ghost-blog.com/subscribe/</loc>');
generator.siteMapContent.get(1).should.containEql('<loc>http://my-ghost-blog.com/home/</loc>');
generator.siteMapContent.get(1).should.containEql('<loc>http://my-ghost-blog.com/magic/</loc>');
generator.siteMapContent.get(1).should.containEql('<loc>http://my-ghost-blog.com/subscribe/</loc>');
// <loc> should exist exactly one time
generator.siteMapContent.match(/<loc>/g).length.should.eql(3);
generator.siteMapContent.get(1).match(/<loc>/g).length.should.eql(3);
});
});
});