diff --git a/ghost/external-media-inliner/lib/ExternalMediaInliner.js b/ghost/external-media-inliner/lib/ExternalMediaInliner.js index 14546e7051..4fb5ff69d7 100644 --- a/ghost/external-media-inliner/lib/ExternalMediaInliner.js +++ b/ghost/external-media-inliner/lib/ExternalMediaInliner.js @@ -1,7 +1,9 @@ const mime = require('mime-types'); +const FileType = require('file-type'); const request = require('@tryghost/request'); const errors = require('@tryghost/errors'); const logging = require('@tryghost/logging'); +const string = require('@tryghost/string'); const path = require('path'); class ExternalMediaInliner { @@ -39,14 +41,22 @@ class ExternalMediaInliner { * @param {string} requestURL - url of remote media * @returns {Promise} */ - async #getRemoteMedia(requestURL) { + async getRemoteMedia(requestURL) { // @NOTE: this is the most expensive operation in the whole inlining process // we should consider caching the results to improve performance + + // Enforce http - http > https redirects are commonplace + requestURL = requestURL.replace(/^\/\//g, 'http://'); + + // Encode to handle special characters in URLs + requestURL = encodeURI(requestURL); try { - return await request(requestURL, { + const response = await request(requestURL, { followRedirect: true, responseType: 'buffer' }); + + return response; } catch (error) { // NOTE: add special case for 404s logging.error(`Error downloading remote media: ${requestURL}`); @@ -63,21 +73,34 @@ class ExternalMediaInliner { * @param {Object} response - response from request * @returns {Object} */ - #extractFileDataFromResponse(requestURL, response) { - const headers = response.headers; - const contentType = headers['content-type']; + async extractFileDataFromResponse(requestURL, response) { + let extension; - const filename = requestURL - .split('/') - .pop() - .split('#')[0] - .split('?')[0]; + // Attempt to get the file extension from the file itself + // If that fails, or if `.ext` is undefined, get the extension from the file path in the catch + try { + const fileInfo = await FileType.fromBuffer(response.body); + extension = fileInfo.ext; + } catch { + const headers = response.headers; + const contentType = headers['content-type']; + const extensionFromPath = path.parse(requestURL).ext.split(/[^a-z]/i).filter(Boolean)[0]; + extension = mime.extension(contentType) || extensionFromPath; + } - const extension = mime.extension(contentType) || filename.split('.').pop(); + const removeExtRegExp = new RegExp(`.${extension}`, ''); + const fileNameNoExt = path.parse(requestURL).base.replace(removeExtRegExp, ''); + + // CASE: Query strings _can_ form part of the unique image URL, so rather that strip them include the in the file name + // Then trim to last 248 chars (this will be more unique than the first 248), and trim leading & trailing dashes. + // 248 is on the lower end of limits from various OSes and file systems + const fileName = string.slugify(path.parse(fileNameNoExt).base, { + requiredChangesOnly: true + }).slice(-248).replace(/^-|-$/, ''); return { fileBuffer: response.body, - filename: filename, + filename: `${fileName}.${extension}`, extension: `.${extension}` }; } @@ -87,7 +110,7 @@ class ExternalMediaInliner { * @param {Object} media - media to store locally * @returns {Promise} - path to stored media */ - async #storeMediaLocally(media) { + async storeMediaLocally(media) { const storage = this.getMediaStorage(media.extension); if (!storage) { @@ -106,7 +129,7 @@ class ExternalMediaInliner { } } - async #inlineMibiledoc(mobiledoc, domains) { + async inlineMobiledoc(mobiledoc, domains) { for (const domain of domains) { // NOTE: the src could end with a quote, apostrophe or double-backslash. backlashes are added to mobiledoc // as an escape character @@ -115,15 +138,15 @@ class ExternalMediaInliner { const matches = mobiledoc.matchAll(regex); for (const [,src] of matches) { - const response = await this.#getRemoteMedia(src); + const response = await this.getRemoteMedia(src); let media; if (response) { - media = this.#extractFileDataFromResponse(src, response); + media = await this.extractFileDataFromResponse(src, response); } if (media) { - const filePath = await this.#storeMediaLocally(media); + const filePath = await this.storeMediaLocally(media); if (filePath) { const inlinedSrc = `__GHOST_URL__${filePath}`; @@ -147,7 +170,7 @@ class ExternalMediaInliner { * @param {String[]} domains - domains to inline media from * @returns Promise - updated fields map with local media paths */ - async #inlineFields(resourceModel, fields, domains) { + async inlineFields(resourceModel, fields, domains) { const updatedFields = {}; for (const field of fields) { @@ -155,15 +178,15 @@ class ExternalMediaInliner { const src = resourceModel.get(field); if (src && src.startsWith(domain)) { - const response = await this.#getRemoteMedia(src); + const response = await this.getRemoteMedia(src); let media; if (response) { - media = this.#extractFileDataFromResponse(src, response); + media = await this.extractFileDataFromResponse(src, response); } if (media) { - const filePath = await this.#storeMediaLocally(media); + const filePath = await this.storeMediaLocally(media); if (filePath) { const inlinedSrc = `__GHOST_URL__${filePath}`; @@ -186,12 +209,12 @@ class ExternalMediaInliner { * @param {string[]} fields - fields to inline * @param {string[]} domains - domains to inline media from */ - async #inlineSimpleFields(resources, model, fields, domains) { + async inlineSimpleFields(resources, model, fields, domains) { logging.info(`Starting inlining external media for ${resources?.length} resources and with ${fields.join(', ')} fields`); for (const resource of resources) { try { - const updatedFields = await this.#inlineFields(resource, fields, domains); + const updatedFields = await this.inlineFields(resource, fields, domains); if (Object.keys(updatedFields).length > 0) { await model.edit(updatedFields, { @@ -227,8 +250,8 @@ class ExternalMediaInliner { for (const post of posts) { try { - const inlinedMobiledoc = await this.#inlineMibiledoc(post.get('mobiledoc'), domains); - const updatedFields = await this.#inlineFields(post, postsInilingFields, domains); + const inlinedMobiledoc = await this.inlineMobiledoc(post.get('mobiledoc'), domains); + const updatedFields = await this.inlineFields(post, postsInilingFields, domains); if (inlinedMobiledoc !== post.get('mobiledoc')) { updatedFields.mobiledoc = inlinedMobiledoc; @@ -258,7 +281,7 @@ class ExternalMediaInliner { 'twitter_image' ]; - await this.#inlineSimpleFields(postsMetas, this.#PostMetaModel, postsMetaInilingFields, domains); + await this.inlineSimpleFields(postsMetas, this.#PostMetaModel, postsMetaInilingFields, domains); const {data: tags} = await this.#TagModel.findPage({ limit: 'all' @@ -269,7 +292,7 @@ class ExternalMediaInliner { 'twitter_image' ]; - await this.#inlineSimpleFields(tags, this.#TagModel, tagInliningFields, domains); + await this.inlineSimpleFields(tags, this.#TagModel, tagInliningFields, domains); const {data: users} = await this.#UserModel.findPage({ limit: 'all' @@ -279,7 +302,7 @@ class ExternalMediaInliner { 'cover_image' ]; - await this.#inlineSimpleFields(users, this.#UserModel, userInliningFields, domains); + await this.inlineSimpleFields(users, this.#UserModel, userInliningFields, domains); logging.info('Finished inlining external media for posts, tags, and users'); } diff --git a/ghost/external-media-inliner/package.json b/ghost/external-media-inliner/package.json index 547448f217..f59306e199 100644 --- a/ghost/external-media-inliner/package.json +++ b/ghost/external-media-inliner/package.json @@ -22,7 +22,7 @@ "mocha": "10.2.0", "sinon": "15.2.0" }, -"dependencies": { - "mime-types": "2.1.35" -} + "dependencies": { + "file-type": "16.5.4" + } } diff --git a/ghost/external-media-inliner/test/ExternalMediaInliner.test.js b/ghost/external-media-inliner/test/ExternalMediaInliner.test.js index 5bf3507f86..7a6ed9528a 100644 --- a/ghost/external-media-inliner/test/ExternalMediaInliner.test.js +++ b/ghost/external-media-inliner/test/ExternalMediaInliner.test.js @@ -1,4 +1,5 @@ const assert = require('assert/strict'); +const fs = require('fs'); const sinon = require('sinon'); const nock = require('nock'); const path = require('path'); @@ -7,6 +8,8 @@ const ExternalMediaInliner = require('../index'); describe('ExternalMediaInliner', function () { let logging; + let ghostLogoPng; + let exeFile; let GIF1x1; let postModelStub; let postMetaModelStub; @@ -15,6 +18,8 @@ describe('ExternalMediaInliner', function () { beforeEach(function () { // use a 1x1 gif in nock responses because it's really small and easy to work with + ghostLogoPng = fs.readFileSync(path.join(__dirname, 'fixtures', 'ghost-logo.png')); + exeFile = fs.readFileSync(path.join(__dirname, 'fixtures', 'fixture.exe')); GIF1x1 = Buffer.from('R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==', 'base64'); logging = { info: sinon.stub(loggingLib, 'info'), @@ -222,7 +227,7 @@ describe('ExternalMediaInliner', function () { const fileURL = 'https://img.stockfresh.com/files/f/inlined.exe'; const requestMock = nock('https://img.stockfresh.com') .get('/files/f/inlined.exe') - .reply(200, GIF1x1); + .reply(200, exeFile); const postModelInstanceStub = { id: 'inlined-post-id', @@ -526,4 +531,145 @@ describe('ExternalMediaInliner', function () { }); }); }); + + describe('Special URL & file type handling', function () { + it('Handles URLs with quotes', async function () { + const imageURL = 'https://img.stockfresh.com/files/f/ghost-logo’s-cool.png'; + const requestMock = nock('https://img.stockfresh.com') + .get(encodeURI('/files/f/ghost-logo’s-cool.png')) + .reply(200, ghostLogoPng); + + const inliner = new ExternalMediaInliner({}); + const response = await inliner.getRemoteMedia(imageURL); + const fileData = await inliner.extractFileDataFromResponse(imageURL, response); + + assert.ok(requestMock.isDone()); + assert.equal(fileData.filename, 'ghost-logos-cool.png'); + assert.equal(fileData.extension, '.png'); + }); + + it('Handles URLs with spaces', async function () { + const imageURL = 'https://img.stockfresh.com/files/f/ghost logo with spaces.png'; + const requestMock = nock('https://img.stockfresh.com') + .get(encodeURI('/files/f/ghost logo with spaces.png')) + .reply(200, ghostLogoPng); + + const inliner = new ExternalMediaInliner({}); + const response = await inliner.getRemoteMedia(imageURL); + const fileData = await inliner.extractFileDataFromResponse(imageURL, response); + + assert.ok(requestMock.isDone()); + assert.equal(fileData.filename, 'ghost-logo-with-spaces.png'); + assert.equal(fileData.extension, '.png'); + }); + + it('Handles URLs with no extension', async function () { + const imageURL = 'https://img.stockfresh.com/files/f/ghost-logo'; + const requestMock = nock('https://img.stockfresh.com') + .get('/files/f/ghost-logo') + .reply(200, ghostLogoPng); + + const inliner = new ExternalMediaInliner({}); + const response = await inliner.getRemoteMedia(imageURL); + const fileData = await inliner.extractFileDataFromResponse(imageURL, response); + + assert.ok(requestMock.isDone()); + assert.equal(fileData.filename, 'ghost-logo.png'); + assert.equal(fileData.extension, '.png'); + }); + + it('Handles URLs with unicode characters', async function () { + const imageURL = 'https://img.stockfresh.com/files/f/你好.png'; + const requestMock = nock('https://img.stockfresh.com') + .get(encodeURI('/files/f/你好.png')) + .reply(200, ghostLogoPng); + + const inliner = new ExternalMediaInliner({}); + const response = await inliner.getRemoteMedia(imageURL); + const fileData = await inliner.extractFileDataFromResponse(imageURL, response); + + assert.ok(requestMock.isDone()); + assert.equal(fileData.filename, 'ni-hao.png'); + assert.equal(fileData.extension, '.png'); + }); + + it('Handles URLs with no scheme', async function () { + const imageURL = '//img.stockfresh.com/files/f/ghost-logo.png'; + const requestMock = nock('https://img.stockfresh.com') + .get('/files/f/ghost-logo.png') + .reply(200, ghostLogoPng); + + const inliner = new ExternalMediaInliner({}); + const response = await inliner.getRemoteMedia(imageURL); + const fileData = await inliner.extractFileDataFromResponse(imageURL, response); + + assert.ok(requestMock.isDone()); + assert.equal(fileData.filename, 'ghost-logo.png'); + assert.equal(fileData.extension, '.png'); + }); + + it('Handles URLs with query params', async function () { + const imageURL = 'https://img.stockfresh.com/files/f/ghost-logo.png?version=1&size=large'; + const requestMock = nock('https://img.stockfresh.com') + .get('/files/f/ghost-logo.png?version=1&size=large') + .reply(200, ghostLogoPng); + + const inliner = new ExternalMediaInliner({}); + const response = await inliner.getRemoteMedia(imageURL); + const fileData = await inliner.extractFileDataFromResponse(imageURL, response); + + assert.ok(requestMock.isDone()); + assert.equal(fileData.filename, 'ghost-logo-version-1-size-large.png'); + assert.equal(fileData.extension, '.png'); + }); + + it('Handles URLs with duplicated characters', async function () { + const imageURL = 'https://img.stockfresh.com/files/f/ghost---logo.png'; + const requestMock = nock('https://img.stockfresh.com') + .get('/files/f/ghost---logo.png') + .reply(200, ghostLogoPng); + + const inliner = new ExternalMediaInliner({}); + const response = await inliner.getRemoteMedia(imageURL); + const fileData = await inliner.extractFileDataFromResponse(imageURL, response); + + assert.ok(requestMock.isDone()); + assert.equal(fileData.filename, 'ghost---logo.png'); + assert.equal(fileData.extension, '.png'); + }); + + it('Handles falling back to `content-type` for type', async function () { + const imageURL = 'https://img.stockfresh.com/files/f/photo.gif?v=1&s=2'; + const requestMock = nock('https://img.stockfresh.com') + .defaultReplyHeaders({ + 'content-type': 'image/gif' + }) + .get('/files/f/photo.gif?v=1&s=2') + .reply(200); + + const inliner = new ExternalMediaInliner({}); + const response = await inliner.getRemoteMedia(imageURL); + const fileData = await inliner.extractFileDataFromResponse(imageURL, response); + + assert.ok(requestMock.isDone()); + assert.equal(fileData.filename, 'photo-v-1-s-2.gif'); + assert.equal(fileData.extension, '.gif'); + }); + + it('Handles falling back to file path for type', async function () { + const imageURL = 'https://img.stockfresh.com/files/f/photo.gif?v=1&s=2'; + const requestMock = nock('https://img.stockfresh.com') + .get('/files/f/photo.gif?v=1&s=2') + .reply(200); + + const inliner = new ExternalMediaInliner({}); + const response = await inliner.getRemoteMedia(imageURL); + const fileData = await inliner.extractFileDataFromResponse(imageURL, response); + + assert.ok(requestMock.isDone()); + assert.equal(fileData.filename, 'photo-v-1-s-2.gif'); + assert.equal(fileData.extension, '.gif'); + }); + }); }); + diff --git a/ghost/external-media-inliner/test/fixtures/fixture.exe b/ghost/external-media-inliner/test/fixtures/fixture.exe new file mode 100644 index 0000000000..9b0c9e0594 Binary files /dev/null and b/ghost/external-media-inliner/test/fixtures/fixture.exe differ diff --git a/ghost/external-media-inliner/test/fixtures/ghost-logo.png b/ghost/external-media-inliner/test/fixtures/ghost-logo.png new file mode 100644 index 0000000000..8ade814dc5 Binary files /dev/null and b/ghost/external-media-inliner/test/fixtures/ghost-logo.png differ diff --git a/yarn.lock b/yarn.lock index f68206d6ea..f31dfdbf5a 100644 --- a/yarn.lock +++ b/yarn.lock @@ -7499,6 +7499,11 @@ "@tiptap/extension-bubble-menu" "^2.1.12" "@tiptap/extension-floating-menu" "^2.1.12" +"@tokenizer/token@^0.3.0": + version "0.3.0" + resolved "https://registry.yarnpkg.com/@tokenizer/token/-/token-0.3.0.tgz#fe98a93fe789247e998c75e74e9c7c63217aa276" + integrity sha512-OvjF+z51L3ov0OyAU0duzsYuvO01PH7x4t6DJx+guahgTnBHkhJdG7soQeTSFLWN3efnHyibZ4Z8l2EuWwJN3A== + "@tootallnate/once@1": version "1.1.2" resolved "https://registry.yarnpkg.com/@tootallnate/once/-/once-1.1.2.tgz#ccb91445360179a04e7fe6aff78c00ffc1eeaf82" @@ -17704,6 +17709,15 @@ file-system-cache@2.3.0, file-system-cache@^2.0.0: fs-extra "11.1.1" ramda "0.29.0" +file-type@16: + version "16.5.4" + resolved "https://registry.yarnpkg.com/file-type/-/file-type-16.5.4.tgz#474fb4f704bee427681f98dd390058a172a6c2fd" + integrity sha512-/yFHK0aGjFEgDJjEKP0pWCplsPFPhwyfwevf/pVxiN0tmE4L9LmwWxWukdJSHdoCli4VgQLehjJtwQBnqmsKcw== + dependencies: + readable-web-to-node-stream "^3.0.0" + strtok3 "^6.2.4" + token-types "^4.1.1" + file-uri-to-path@1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/file-uri-to-path/-/file-uri-to-path-1.0.0.tgz#553a7b8446ff6f684359c445f1e37a05dacc33dd" @@ -23101,7 +23115,7 @@ mime-types@2.1.13: dependencies: mime-db "~1.25.0" -mime-types@2.1.35, mime-types@^2.1.12, mime-types@^2.1.18, mime-types@^2.1.25, mime-types@^2.1.26, mime-types@^2.1.27, mime-types@~2.1.19, mime-types@~2.1.24, mime-types@~2.1.34: +mime-types@^2.1.12, mime-types@^2.1.18, mime-types@^2.1.25, mime-types@^2.1.26, mime-types@^2.1.27, mime-types@~2.1.19, mime-types@~2.1.24, mime-types@~2.1.34: version "2.1.35" resolved "https://registry.yarnpkg.com/mime-types/-/mime-types-2.1.35.tgz#381a871b62a734450660ae3deee44813f70d959a" integrity sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw== @@ -25085,6 +25099,11 @@ pbkdf2@^3.0.3: safe-buffer "^5.0.1" sha.js "^2.4.8" +peek-readable@^4.1.0: + version "4.1.0" + resolved "https://registry.yarnpkg.com/peek-readable/-/peek-readable-4.1.0.tgz#4ece1111bf5c2ad8867c314c81356847e8a62e72" + integrity sha512-ZI3LnwUv5nOGbQzD9c2iDG6toheuXSZP5esSHBjopsXH4dg19soufvpUGA3uohi5anFtGb2lhAVdHzH6R/Evvg== + peek-stream@^1.1.0: version "1.1.3" resolved "https://registry.yarnpkg.com/peek-stream/-/peek-stream-1.1.3.tgz#3b35d84b7ccbbd262fff31dc10da56856ead6d67" @@ -27123,6 +27142,13 @@ readable-stream@~1.0.2, readable-stream@~1.0.26, readable-stream@~1.0.26-4: isarray "0.0.1" string_decoder "~0.10.x" +readable-web-to-node-stream@^3.0.0: + version "3.0.2" + resolved "https://registry.yarnpkg.com/readable-web-to-node-stream/-/readable-web-to-node-stream-3.0.2.tgz#5d52bb5df7b54861fd48d015e93a2cb87b3ee0bb" + integrity sha512-ePeK6cc1EcKLEhJFt/AebMCLL+GgSKhuygrZ/GLaKZYEecIgIECf4UaUuaByiGtzckwR4ain9VzUh95T1exYGw== + dependencies: + readable-stream "^3.6.0" + readdir-glob@^1.0.0: version "1.1.2" resolved "https://registry.yarnpkg.com/readdir-glob/-/readdir-glob-1.1.2.tgz#b185789b8e6a43491635b6953295c5c5e3fd224c" @@ -29319,6 +29345,14 @@ strong-log-transformer@^2.1.0: minimist "^1.2.0" through "^2.3.4" +strtok3@^6.2.4: + version "6.3.0" + resolved "https://registry.yarnpkg.com/strtok3/-/strtok3-6.3.0.tgz#358b80ffe6d5d5620e19a073aa78ce947a90f9a0" + integrity sha512-fZtbhtvI9I48xDSywd/somNqgUHl2L2cstmXCCif0itOf96jeW18MBSyrLuNicYQVkvpOxkZtkzujiTJ9LW5Jw== + dependencies: + "@tokenizer/token" "^0.3.0" + peek-readable "^4.1.0" + style-loader@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/style-loader/-/style-loader-2.0.0.tgz#9669602fd4690740eaaec137799a03addbbc393c" @@ -30126,6 +30160,14 @@ toidentifier@1.0.1: resolved "https://registry.yarnpkg.com/toidentifier/-/toidentifier-1.0.1.tgz#3be34321a88a820ed1bd80dfaa33e479fbb8dd35" integrity sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA== +token-types@^4.1.1: + version "4.2.1" + resolved "https://registry.yarnpkg.com/token-types/-/token-types-4.2.1.tgz#0f897f03665846982806e138977dbe72d44df753" + integrity sha512-6udB24Q737UD/SDsKAHI9FCRP7Bqc9D/MQUV02ORQg5iskjtLJlZJNdN4kKtcdtwCeWIwIHDGaUsTsCCAa8sFQ== + dependencies: + "@tokenizer/token" "^0.3.0" + ieee754 "^1.2.1" + toml@3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/toml/-/toml-3.0.0.tgz#342160f1af1904ec9d204d03a5d61222d762c5ee"