Improve external media inliner URL handling (#18428)

This commit is contained in:
Paul Davis 2023-10-19 11:58:41 +01:00 committed by GitHub
parent 2debf686e6
commit 489fae98eb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 244 additions and 33 deletions

View File

@ -1,7 +1,9 @@
const mime = require('mime-types');
const FileType = require('file-type');
const request = require('@tryghost/request');
const errors = require('@tryghost/errors');
const logging = require('@tryghost/logging');
const string = require('@tryghost/string');
const path = require('path');
class ExternalMediaInliner {
@ -39,14 +41,22 @@ class ExternalMediaInliner {
* @param {string} requestURL - url of remote media
* @returns {Promise<Object>}
*/
async #getRemoteMedia(requestURL) {
async getRemoteMedia(requestURL) {
// @NOTE: this is the most expensive operation in the whole inlining process
// we should consider caching the results to improve performance
// Enforce http - http > https redirects are commonplace
requestURL = requestURL.replace(/^\/\//g, 'http://');
// Encode to handle special characters in URLs
requestURL = encodeURI(requestURL);
try {
return await request(requestURL, {
const response = await request(requestURL, {
followRedirect: true,
responseType: 'buffer'
});
return response;
} catch (error) {
// NOTE: add special case for 404s
logging.error(`Error downloading remote media: ${requestURL}`);
@ -63,21 +73,34 @@ class ExternalMediaInliner {
* @param {Object} response - response from request
* @returns {Object}
*/
#extractFileDataFromResponse(requestURL, response) {
async extractFileDataFromResponse(requestURL, response) {
// Builds the descriptor used to store a downloaded file: the raw buffer, a file name, and a dot-prefixed extension.
let extension;
// Attempt to detect the file extension from the file contents themselves.
// If that throws (presumably including the case where no type is detected and reading `.ext` fails),
// fall back in the catch to the `content-type` header, and then to the extension found in the URL path.
try {
const fileInfo = await FileType.fromBuffer(response.body);
extension = fileInfo.ext;
} catch {
const headers = response.headers;
const contentType = headers['content-type'];
// `path.parse(...).ext` still carries any query string or fragment, so split on non-letters
// and keep only the first non-empty run of letters after the dot
const extensionFromPath = path.parse(requestURL).ext.split(/[^a-z]/i).filter(Boolean)[0];
extension = mime.extension(contentType) || extensionFromPath;
}
const filename = requestURL
.split('/')
.pop()
.split('#')[0]
.split('?')[0];
// Strip the detected extension from the last URL segment.
// NOTE(review): the leading `.` in this pattern is not escaped and the pattern is not anchored or global,
// so it removes the first occurrence of "any character + extension" anywhere in the segment — confirm this is intended.
const removeExtRegExp = new RegExp(`.${extension}`, '');
const fileNameNoExt = path.parse(requestURL).base.replace(removeExtRegExp, '');
const extension = mime.extension(contentType) || filename.split('.').pop();
// CASE: Query strings _can_ form part of the unique image URL, so rather than strip them, include them in the file name.
// Then trim to the last 248 chars (this will be more unique than the first 248), and trim leading & trailing dashes.
// 248 is on the lower end of limits from various OSes and file systems
// NOTE(review): `/^-|-$/` has no `g` flag, so only the first match (leading OR trailing dash) is removed — confirm.
const fileName = string.slugify(path.parse(fileNameNoExt).base, {
requiredChangesOnly: true
}).slice(-248).replace(/^-|-$/, '');
return {
fileBuffer: response.body,
filename: filename,
filename: `${fileName}.${extension}`,
extension: `.${extension}`
};
}
@ -87,7 +110,7 @@ class ExternalMediaInliner {
* @param {Object} media - media to store locally
* @returns {Promise<string>} - path to stored media
*/
async #storeMediaLocally(media) {
async storeMediaLocally(media) {
const storage = this.getMediaStorage(media.extension);
if (!storage) {
@ -106,7 +129,7 @@ class ExternalMediaInliner {
}
}
async #inlineMibiledoc(mobiledoc, domains) {
async inlineMobiledoc(mobiledoc, domains) {
for (const domain of domains) {
// NOTE: the src could end with a quote, apostrophe or double-backslash. Backslashes are added to mobiledoc
// as an escape character
@ -115,15 +138,15 @@ class ExternalMediaInliner {
const matches = mobiledoc.matchAll(regex);
for (const [,src] of matches) {
const response = await this.#getRemoteMedia(src);
const response = await this.getRemoteMedia(src);
let media;
if (response) {
media = this.#extractFileDataFromResponse(src, response);
media = await this.extractFileDataFromResponse(src, response);
}
if (media) {
const filePath = await this.#storeMediaLocally(media);
const filePath = await this.storeMediaLocally(media);
if (filePath) {
const inlinedSrc = `__GHOST_URL__${filePath}`;
@ -147,7 +170,7 @@ class ExternalMediaInliner {
* @param {String[]} domains - domains to inline media from
* @returns Promise<Object> - updated fields map with local media paths
*/
async #inlineFields(resourceModel, fields, domains) {
async inlineFields(resourceModel, fields, domains) {
const updatedFields = {};
for (const field of fields) {
@ -155,15 +178,15 @@ class ExternalMediaInliner {
const src = resourceModel.get(field);
if (src && src.startsWith(domain)) {
const response = await this.#getRemoteMedia(src);
const response = await this.getRemoteMedia(src);
let media;
if (response) {
media = this.#extractFileDataFromResponse(src, response);
media = await this.extractFileDataFromResponse(src, response);
}
if (media) {
const filePath = await this.#storeMediaLocally(media);
const filePath = await this.storeMediaLocally(media);
if (filePath) {
const inlinedSrc = `__GHOST_URL__${filePath}`;
@ -186,12 +209,12 @@ class ExternalMediaInliner {
* @param {string[]} fields - fields to inline
* @param {string[]} domains - domains to inline media from
*/
async #inlineSimpleFields(resources, model, fields, domains) {
async inlineSimpleFields(resources, model, fields, domains) {
logging.info(`Starting inlining external media for ${resources?.length} resources and with ${fields.join(', ')} fields`);
for (const resource of resources) {
try {
const updatedFields = await this.#inlineFields(resource, fields, domains);
const updatedFields = await this.inlineFields(resource, fields, domains);
if (Object.keys(updatedFields).length > 0) {
await model.edit(updatedFields, {
@ -227,8 +250,8 @@ class ExternalMediaInliner {
for (const post of posts) {
try {
const inlinedMobiledoc = await this.#inlineMibiledoc(post.get('mobiledoc'), domains);
const updatedFields = await this.#inlineFields(post, postsInilingFields, domains);
const inlinedMobiledoc = await this.inlineMobiledoc(post.get('mobiledoc'), domains);
const updatedFields = await this.inlineFields(post, postsInilingFields, domains);
if (inlinedMobiledoc !== post.get('mobiledoc')) {
updatedFields.mobiledoc = inlinedMobiledoc;
@ -258,7 +281,7 @@ class ExternalMediaInliner {
'twitter_image'
];
await this.#inlineSimpleFields(postsMetas, this.#PostMetaModel, postsMetaInilingFields, domains);
await this.inlineSimpleFields(postsMetas, this.#PostMetaModel, postsMetaInilingFields, domains);
const {data: tags} = await this.#TagModel.findPage({
limit: 'all'
@ -269,7 +292,7 @@ class ExternalMediaInliner {
'twitter_image'
];
await this.#inlineSimpleFields(tags, this.#TagModel, tagInliningFields, domains);
await this.inlineSimpleFields(tags, this.#TagModel, tagInliningFields, domains);
const {data: users} = await this.#UserModel.findPage({
limit: 'all'
@ -279,7 +302,7 @@ class ExternalMediaInliner {
'cover_image'
];
await this.#inlineSimpleFields(users, this.#UserModel, userInliningFields, domains);
await this.inlineSimpleFields(users, this.#UserModel, userInliningFields, domains);
logging.info('Finished inlining external media for posts, tags, and users');
}

View File

@ -23,6 +23,6 @@
"sinon": "15.2.0"
},
"dependencies": {
"mime-types": "2.1.35"
"file-type": "16.5.4"
}
}

View File

@ -1,4 +1,5 @@
const assert = require('assert/strict');
const fs = require('fs');
const sinon = require('sinon');
const nock = require('nock');
const path = require('path');
@ -7,6 +8,8 @@ const ExternalMediaInliner = require('../index');
describe('ExternalMediaInliner', function () {
let logging;
let ghostLogoPng;
let exeFile;
let GIF1x1;
let postModelStub;
let postMetaModelStub;
@ -15,6 +18,8 @@ describe('ExternalMediaInliner', function () {
beforeEach(function () {
// use a 1x1 gif in nock responses because it's really small and easy to work with
ghostLogoPng = fs.readFileSync(path.join(__dirname, 'fixtures', 'ghost-logo.png'));
exeFile = fs.readFileSync(path.join(__dirname, 'fixtures', 'fixture.exe'));
GIF1x1 = Buffer.from('R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==', 'base64');
logging = {
info: sinon.stub(loggingLib, 'info'),
@ -222,7 +227,7 @@ describe('ExternalMediaInliner', function () {
const fileURL = 'https://img.stockfresh.com/files/f/inlined.exe';
const requestMock = nock('https://img.stockfresh.com')
.get('/files/f/inlined.exe')
.reply(200, GIF1x1);
.reply(200, exeFile);
const postModelInstanceStub = {
id: 'inlined-post-id',
@ -526,4 +531,145 @@ describe('ExternalMediaInliner', function () {
});
});
});
// Exercises getRemoteMedia + extractFileDataFromResponse directly against nock-mocked responses,
// checking the derived `filename` and `extension` for a range of awkward source URLs.
describe('Special URL & file type handling', function () {
it('Handles URLs with quotes', async function () {
// NOTE(review): as shown, this URL contains no quote character — confirm the fixture actually exercises quoting
const imageURL = 'https://img.stockfresh.com/files/f/ghost-logos-cool.png';
const requestMock = nock('https://img.stockfresh.com')
.get(encodeURI('/files/f/ghost-logos-cool.png'))
.reply(200, ghostLogoPng);
const inliner = new ExternalMediaInliner({});
const response = await inliner.getRemoteMedia(imageURL);
const fileData = await inliner.extractFileDataFromResponse(imageURL, response);
assert.ok(requestMock.isDone());
assert.equal(fileData.filename, 'ghost-logos-cool.png');
assert.equal(fileData.extension, '.png');
});
it('Handles URLs with spaces', async function () {
const imageURL = 'https://img.stockfresh.com/files/f/ghost logo with spaces.png';
// The mock path is encoded the same way the request URL is expected to be encoded
const requestMock = nock('https://img.stockfresh.com')
.get(encodeURI('/files/f/ghost logo with spaces.png'))
.reply(200, ghostLogoPng);
const inliner = new ExternalMediaInliner({});
const response = await inliner.getRemoteMedia(imageURL);
const fileData = await inliner.extractFileDataFromResponse(imageURL, response);
assert.ok(requestMock.isDone());
// Spaces are expected to become dashes in the stored file name
assert.equal(fileData.filename, 'ghost-logo-with-spaces.png');
assert.equal(fileData.extension, '.png');
});
it('Handles URLs with no extension', async function () {
const imageURL = 'https://img.stockfresh.com/files/f/ghost-logo';
const requestMock = nock('https://img.stockfresh.com')
.get('/files/f/ghost-logo')
.reply(200, ghostLogoPng);
const inliner = new ExternalMediaInliner({});
const response = await inliner.getRemoteMedia(imageURL);
const fileData = await inliner.extractFileDataFromResponse(imageURL, response);
assert.ok(requestMock.isDone());
// The URL carries no extension, so `.png` has to come from the PNG response body
assert.equal(fileData.filename, 'ghost-logo.png');
assert.equal(fileData.extension, '.png');
});
it('Handles URLs with unicode characters', async function () {
const imageURL = 'https://img.stockfresh.com/files/f/你好.png';
const requestMock = nock('https://img.stockfresh.com')
.get(encodeURI('/files/f/你好.png'))
.reply(200, ghostLogoPng);
const inliner = new ExternalMediaInliner({});
const response = await inliner.getRemoteMedia(imageURL);
const fileData = await inliner.extractFileDataFromResponse(imageURL, response);
assert.ok(requestMock.isDone());
// The non-ASCII name is expected to be transliterated in the stored file name
assert.equal(fileData.filename, 'ni-hao.png');
assert.equal(fileData.extension, '.png');
});
it('Handles URLs with no scheme', async function () {
const imageURL = '//img.stockfresh.com/files/f/ghost-logo.png';
// NOTE(review): getRemoteMedia is shown replacing a leading `//` with `http://`, while this mock is
// registered on `https://` — confirm the mock scope matches the request that is actually issued
const requestMock = nock('https://img.stockfresh.com')
.get('/files/f/ghost-logo.png')
.reply(200, ghostLogoPng);
const inliner = new ExternalMediaInliner({});
const response = await inliner.getRemoteMedia(imageURL);
const fileData = await inliner.extractFileDataFromResponse(imageURL, response);
assert.ok(requestMock.isDone());
assert.equal(fileData.filename, 'ghost-logo.png');
assert.equal(fileData.extension, '.png');
});
it('Handles URLs with query params', async function () {
const imageURL = 'https://img.stockfresh.com/files/f/ghost-logo.png?version=1&size=large';
const requestMock = nock('https://img.stockfresh.com')
.get('/files/f/ghost-logo.png?version=1&size=large')
.reply(200, ghostLogoPng);
const inliner = new ExternalMediaInliner({});
const response = await inliner.getRemoteMedia(imageURL);
const fileData = await inliner.extractFileDataFromResponse(imageURL, response);
assert.ok(requestMock.isDone());
// The query string is expected to be kept (slugified) as part of the stored file name
assert.equal(fileData.filename, 'ghost-logo-version-1-size-large.png');
assert.equal(fileData.extension, '.png');
});
it('Handles URLs with duplicated characters', async function () {
const imageURL = 'https://img.stockfresh.com/files/f/ghost---logo.png';
const requestMock = nock('https://img.stockfresh.com')
.get('/files/f/ghost---logo.png')
.reply(200, ghostLogoPng);
const inliner = new ExternalMediaInliner({});
const response = await inliner.getRemoteMedia(imageURL);
const fileData = await inliner.extractFileDataFromResponse(imageURL, response);
assert.ok(requestMock.isDone());
// Repeated dashes are expected to survive unchanged in the stored file name
assert.equal(fileData.filename, 'ghost---logo.png');
assert.equal(fileData.extension, '.png');
});
it('Handles falling back to `content-type` for type', async function () {
const imageURL = 'https://img.stockfresh.com/files/f/photo.gif?v=1&s=2';
// The reply has no body, so presumably content sniffing cannot determine a type
// and the `content-type` header set here decides the extension
const requestMock = nock('https://img.stockfresh.com')
.defaultReplyHeaders({
'content-type': 'image/gif'
})
.get('/files/f/photo.gif?v=1&s=2')
.reply(200);
const inliner = new ExternalMediaInliner({});
const response = await inliner.getRemoteMedia(imageURL);
const fileData = await inliner.extractFileDataFromResponse(imageURL, response);
assert.ok(requestMock.isDone());
assert.equal(fileData.filename, 'photo-v-1-s-2.gif');
assert.equal(fileData.extension, '.gif');
});
it('Handles falling back to file path for type', async function () {
const imageURL = 'https://img.stockfresh.com/files/f/photo.gif?v=1&s=2';
// No body and no `content-type` header are supplied, so presumably only the URL path
// is left to supply the `.gif` extension
const requestMock = nock('https://img.stockfresh.com')
.get('/files/f/photo.gif?v=1&s=2')
.reply(200);
const inliner = new ExternalMediaInliner({});
const response = await inliner.getRemoteMedia(imageURL);
const fileData = await inliner.extractFileDataFromResponse(imageURL, response);
assert.ok(requestMock.isDone());
assert.equal(fileData.filename, 'photo-v-1-s-2.gif');
assert.equal(fileData.extension, '.gif');
});
});
});

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 44 KiB

View File

@ -7499,6 +7499,11 @@
"@tiptap/extension-bubble-menu" "^2.1.12"
"@tiptap/extension-floating-menu" "^2.1.12"
"@tokenizer/token@^0.3.0":
version "0.3.0"
resolved "https://registry.yarnpkg.com/@tokenizer/token/-/token-0.3.0.tgz#fe98a93fe789247e998c75e74e9c7c63217aa276"
integrity sha512-OvjF+z51L3ov0OyAU0duzsYuvO01PH7x4t6DJx+guahgTnBHkhJdG7soQeTSFLWN3efnHyibZ4Z8l2EuWwJN3A==
"@tootallnate/once@1":
version "1.1.2"
resolved "https://registry.yarnpkg.com/@tootallnate/once/-/once-1.1.2.tgz#ccb91445360179a04e7fe6aff78c00ffc1eeaf82"
@ -17704,6 +17709,15 @@ file-system-cache@2.3.0, file-system-cache@^2.0.0:
fs-extra "11.1.1"
ramda "0.29.0"
file-type@16:
version "16.5.4"
resolved "https://registry.yarnpkg.com/file-type/-/file-type-16.5.4.tgz#474fb4f704bee427681f98dd390058a172a6c2fd"
integrity sha512-/yFHK0aGjFEgDJjEKP0pWCplsPFPhwyfwevf/pVxiN0tmE4L9LmwWxWukdJSHdoCli4VgQLehjJtwQBnqmsKcw==
dependencies:
readable-web-to-node-stream "^3.0.0"
strtok3 "^6.2.4"
token-types "^4.1.1"
file-uri-to-path@1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/file-uri-to-path/-/file-uri-to-path-1.0.0.tgz#553a7b8446ff6f684359c445f1e37a05dacc33dd"
@ -23101,7 +23115,7 @@ mime-types@2.1.13:
dependencies:
mime-db "~1.25.0"
mime-types@2.1.35, mime-types@^2.1.12, mime-types@^2.1.18, mime-types@^2.1.25, mime-types@^2.1.26, mime-types@^2.1.27, mime-types@~2.1.19, mime-types@~2.1.24, mime-types@~2.1.34:
mime-types@^2.1.12, mime-types@^2.1.18, mime-types@^2.1.25, mime-types@^2.1.26, mime-types@^2.1.27, mime-types@~2.1.19, mime-types@~2.1.24, mime-types@~2.1.34:
version "2.1.35"
resolved "https://registry.yarnpkg.com/mime-types/-/mime-types-2.1.35.tgz#381a871b62a734450660ae3deee44813f70d959a"
integrity sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==
@ -25085,6 +25099,11 @@ pbkdf2@^3.0.3:
safe-buffer "^5.0.1"
sha.js "^2.4.8"
peek-readable@^4.1.0:
version "4.1.0"
resolved "https://registry.yarnpkg.com/peek-readable/-/peek-readable-4.1.0.tgz#4ece1111bf5c2ad8867c314c81356847e8a62e72"
integrity sha512-ZI3LnwUv5nOGbQzD9c2iDG6toheuXSZP5esSHBjopsXH4dg19soufvpUGA3uohi5anFtGb2lhAVdHzH6R/Evvg==
peek-stream@^1.1.0:
version "1.1.3"
resolved "https://registry.yarnpkg.com/peek-stream/-/peek-stream-1.1.3.tgz#3b35d84b7ccbbd262fff31dc10da56856ead6d67"
@ -27123,6 +27142,13 @@ readable-stream@~1.0.2, readable-stream@~1.0.26, readable-stream@~1.0.26-4:
isarray "0.0.1"
string_decoder "~0.10.x"
readable-web-to-node-stream@^3.0.0:
version "3.0.2"
resolved "https://registry.yarnpkg.com/readable-web-to-node-stream/-/readable-web-to-node-stream-3.0.2.tgz#5d52bb5df7b54861fd48d015e93a2cb87b3ee0bb"
integrity sha512-ePeK6cc1EcKLEhJFt/AebMCLL+GgSKhuygrZ/GLaKZYEecIgIECf4UaUuaByiGtzckwR4ain9VzUh95T1exYGw==
dependencies:
readable-stream "^3.6.0"
readdir-glob@^1.0.0:
version "1.1.2"
resolved "https://registry.yarnpkg.com/readdir-glob/-/readdir-glob-1.1.2.tgz#b185789b8e6a43491635b6953295c5c5e3fd224c"
@ -29319,6 +29345,14 @@ strong-log-transformer@^2.1.0:
minimist "^1.2.0"
through "^2.3.4"
strtok3@^6.2.4:
version "6.3.0"
resolved "https://registry.yarnpkg.com/strtok3/-/strtok3-6.3.0.tgz#358b80ffe6d5d5620e19a073aa78ce947a90f9a0"
integrity sha512-fZtbhtvI9I48xDSywd/somNqgUHl2L2cstmXCCif0itOf96jeW18MBSyrLuNicYQVkvpOxkZtkzujiTJ9LW5Jw==
dependencies:
"@tokenizer/token" "^0.3.0"
peek-readable "^4.1.0"
style-loader@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/style-loader/-/style-loader-2.0.0.tgz#9669602fd4690740eaaec137799a03addbbc393c"
@ -30126,6 +30160,14 @@ toidentifier@1.0.1:
resolved "https://registry.yarnpkg.com/toidentifier/-/toidentifier-1.0.1.tgz#3be34321a88a820ed1bd80dfaa33e479fbb8dd35"
integrity sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA==
token-types@^4.1.1:
version "4.2.1"
resolved "https://registry.yarnpkg.com/token-types/-/token-types-4.2.1.tgz#0f897f03665846982806e138977dbe72d44df753"
integrity sha512-6udB24Q737UD/SDsKAHI9FCRP7Bqc9D/MQUV02ORQg5iskjtLJlZJNdN4kKtcdtwCeWIwIHDGaUsTsCCAa8sFQ==
dependencies:
"@tokenizer/token" "^0.3.0"
ieee754 "^1.2.1"
toml@3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/toml/-/toml-3.0.0.tgz#342160f1af1904ec9d204d03a5d61222d762c5ee"