2023-03-03 14:08:18 +03:00
|
|
|
const mime = require('mime-types');
|
2023-10-19 13:58:41 +03:00
|
|
|
const FileType = require('file-type');
|
2023-03-03 14:08:18 +03:00
|
|
|
const request = require('@tryghost/request');
|
|
|
|
const errors = require('@tryghost/errors');
|
|
|
|
const logging = require('@tryghost/logging');
|
2023-10-19 13:58:41 +03:00
|
|
|
const string = require('@tryghost/string');
|
2023-03-07 13:41:30 +03:00
|
|
|
const path = require('path');
|
2023-03-03 14:08:18 +03:00
|
|
|
|
2023-03-03 11:15:44 +03:00
|
|
|
/**
 * Downloads media referenced from a set of external domains, stores it through
 * the configured media storage adapter and rewrites the references found in
 * posts, post meta, tags and users so that they point at the stored copy.
 */
class ExternalMediaInliner {
    /** @type {object} */
    #PostModel;

    /** @type {object} */
    #PostMetaModel;

    /** @type {object} */
    #TagModel;

    /** @type {object} */
    #UserModel;

    /**
     *
     * @param {Object} deps
     * @param {Object} deps.PostModel - Post model
     * @param {Object} deps.PostMetaModel - PostMeta model
     * @param {Object} deps.TagModel - Tag model
     * @param {Object} deps.UserModel - User model
     * @param {(extension) => import('ghost-storage-base')} deps.getMediaStorage - getMediaStorage
     */
    constructor(deps) {
        this.#PostModel = deps.PostModel;
        this.#PostMetaModel = deps.PostMetaModel;
        this.#TagModel = deps.TagModel;
        this.#UserModel = deps.UserModel;
        this.getMediaStorage = deps.getMediaStorage;
    }

    /**
     * Escapes every character that has a special meaning inside a regular
     * expression so that the text can be embedded in a pattern and matched literally.
     *
     * @param {string} text - text to match literally
     * @returns {string} - text that is safe to interpolate into a RegExp source
     */
    static #escapeRegExp(text) {
        return String(text).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    }

    /**
     * Downloads the media behind a URL.
     *
     * @param {string} requestURL - url of remote media
     * @returns {Promise<Object|null>} - the response from `request`, or null when the download failed
     */
    async getRemoteMedia(requestURL) {
        // @NOTE: this is the most expensive operation in the whole inlining process
        //        we should consider caching the results to improve performance

        // Enforce http - http > https redirects are commonplace
        const absoluteURL = requestURL.replace(/^\/\//g, 'http://');

        // Encode to handle special characters in URLs
        // NOTE(review): encodeURI also escapes `%`, so a URL that is already
        // percent-encoded gets encoded a second time - confirm this is intended
        const encodedURL = encodeURI(absoluteURL);

        try {
            // `return await` keeps a rejected request inside this try block
            return await request(encodedURL, {
                followRedirect: true,
                responseType: 'buffer'
            });
        } catch (error) {
            // NOTE: add special case for 404s
            logging.error(`Error downloading remote media: ${encodedURL}`);
            logging.error(new errors.DataImportError({
                err: error
            }));

            return null;
        }
    }

    /**
     * Works out the file name and extension for a downloaded file.
     *
     * @param {string} requestURL - url the media was downloaded from
     * @param {Object} response - response from request
     * @returns {Promise<Object|null>} - `{fileBuffer, filename, extension}`, or null
     *     when no file extension could be determined
     */
    async extractFileDataFromResponse(requestURL, response) {
        let extension;

        // Attempt to get the file extension from the file itself
        try {
            const fileInfo = await FileType.fromBuffer(response.body);
            extension = fileInfo?.ext;
        } catch {
            // Detection is best effort - the fallbacks below take over
            extension = undefined;
        }

        // If that fails, fall back to the content-type header and then to the URL path
        if (!extension) {
            const contentType = response.headers['content-type'];
            const extensionFromPath = path.parse(requestURL).ext.split(/[^a-z]/i).filter(Boolean)[0];
            extension = mime.extension(contentType) || extensionFromPath;
        }

        // Without an extension the file name would end in `.undefined`,
        // callers already skip falsy media
        if (!extension) {
            logging.warn(`Could not determine file extension for: ${requestURL}`);
            return null;
        }

        // Remove only a literal `.ext` that is not part of a longer word, e.g. `.png`
        // is removed from `image.png?v=1` but nothing is removed from `apng-image`
        const removeExtRegExp = new RegExp(`\\.${ExternalMediaInliner.#escapeRegExp(extension)}(?![a-z0-9])`, 'i');
        const fileNameNoExt = path.parse(requestURL).base.replace(removeExtRegExp, '');

        // CASE: Query strings _can_ form part of the unique image URL, so rather than strip them include them in the file name
        // Then trim to last 248 chars (this will be more unique than the first 248), and trim leading & trailing dashes.
        // 248 is on the lower end of limits from various OSes and file systems
        const fileName = string.slugify(path.parse(fileNameNoExt).base, {
            requiredChangesOnly: true
        }).slice(-248).replace(/^-+|-+$/g, '');

        return {
            fileBuffer: response.body,
            filename: `${fileName}.${extension}`,
            extension: `.${extension}`
        };
    }

    /**
     * Saves a downloaded file through the storage adapter matching its extension.
     *
     * @param {Object} media - media to store locally
     * @returns {Promise<string|null>} - path to stored media, or null when no
     *     storage adapter is available for the file extension
     */
    async storeMediaLocally(media) {
        const storage = this.getMediaStorage(media.extension);

        if (!storage) {
            logging.warn(`No storage adapter found for file extension: ${media.extension}`);
            return null;
        }

        // @NOTE: this is extremely convoluted and should live on a
        //        storage adapter level
        const targetDir = storage.getTargetDir(storage.storagePath);
        const uniqueFileName = await storage.getUniqueFileName({
            name: media.filename
        }, targetDir);
        const targetPath = path.relative(storage.storagePath, uniqueFileName);

        return await storage.saveRaw(media.fileBuffer, targetPath);
    }

    /**
     * Runs the download -> extract -> store pipeline for a single URL.
     *
     * @param {string} src - url of remote media
     * @returns {Promise<string|null>} - stored path prefixed with the `__GHOST_URL__`
     *     placeholder, or null when any step of the pipeline produced nothing
     */
    async #inlineMedia(src) {
        const response = await this.getRemoteMedia(src);
        if (!response) {
            return null;
        }

        const media = await this.extractFileDataFromResponse(src, response);
        if (!media) {
            return null;
        }

        const filePath = await this.storeMediaLocally(media);
        if (!filePath) {
            return null;
        }

        return `__GHOST_URL__${filePath}`;
    }

    /**
     * Persists changed fields with an internal context, skipping the write
     * entirely when nothing changed.
     *
     * @param {Object} model - model exposing `edit(data, options)`
     * @param {string} id - id of the record to update
     * @param {Object} updatedFields - map of field name to new value
     * @returns {Promise<void>}
     */
    async #saveUpdatedFields(model, id, updatedFields) {
        if (Object.keys(updatedFields).length === 0) {
            return;
        }

        await model.edit(updatedFields, {
            id,
            context: {
                internal: true
            }
        });
    }

    /**
     * Find & inline external media from a JSON string.
     * This works with both Lexical & Mobiledoc, so no separate methods are needed here.
     *
     * Every distinct URL is downloaded at most once per call and all of its
     * occurrences are rewritten to the same stored file.
     *
     * @param {string} content - stringified JSON of post Lexical or Mobiledoc content
     * @param {String[]} domains - domains to inline media from
     * @returns {Promise<string>} - updated stringified JSON of post content
     */
    async inlineContent(content, domains) {
        // NOTE: the src could end with a quote, apostrophe or double-backslash. backslashes are added to content
        //       as an escape character
        const srcTerminationSymbols = `"|'|\\\\`;
        const attempted = new Set();
        let inlinedContent = content;

        for (const domain of domains) {
            // The domain is matched literally, consistent with the `startsWith` check in inlineFields
            const regex = new RegExp(`(${ExternalMediaInliner.#escapeRegExp(domain)}.*?)(${srcTerminationSymbols})`, 'igm');
            const sources = new Set(Array.from(inlinedContent.matchAll(regex), ([, src]) => src));

            for (const src of sources) {
                if (attempted.has(src)) {
                    continue;
                }
                attempted.add(src);

                const inlinedSrc = await this.#inlineMedia(src);
                if (!inlinedSrc) {
                    continue;
                }

                // Only replace occurrences that end at a termination symbol so that a longer
                // URL sharing this prefix is left alone. The replacer function keeps `$`
                // sequences in the stored path from being interpreted.
                const occurrences = new RegExp(`${ExternalMediaInliner.#escapeRegExp(src)}(?=${srcTerminationSymbols})`, 'g');
                inlinedContent = inlinedContent.replace(occurrences, () => inlinedSrc);
                logging.info(`Inlined media: ${src} -> ${inlinedSrc}`);
            }
        }

        return inlinedContent;
    }

    /**
     * Inlines the media referenced by individual fields of a model instance.
     *
     * @param {Object} resourceModel - one of PostModel, TagModel, UserModel instances
     * @param {String[]} fields - fields to inline
     * @param {String[]} domains - domains to inline media from
     * @returns {Promise<Object>} - updated fields map with local media paths
     */
    async inlineFields(resourceModel, fields, domains) {
        const updatedFields = {};

        for (const field of fields) {
            const src = resourceModel.get(field);

            if (typeof src !== 'string' || !src) {
                continue;
            }

            // One matching domain is enough - overlapping domains must not
            // download and store the same file more than once
            if (!domains.some(domain => src.startsWith(domain))) {
                continue;
            }

            const inlinedSrc = await this.#inlineMedia(src);

            if (inlinedSrc) {
                updatedFields[field] = inlinedSrc;
                logging.info(`Added media to inline: ${src} -> ${inlinedSrc}`);
            }
        }

        return updatedFields;
    }

    /**
     * Inlines media fields for a list of resources and saves each changed resource.
     * A failure on one resource is logged and does not stop the remaining ones.
     *
     * @param {Object[]} resources - array of model instances
     * @param {Object} model - resource model
     * @param {string[]} fields - fields to inline
     * @param {string[]} domains - domains to inline media from
     * @returns {Promise<void>}
     */
    async inlineSimpleFields(resources, model, fields, domains) {
        logging.info(`Starting inlining external media for ${resources?.length} resources and with ${fields.join(', ')} fields`);

        // A missing list is treated as an empty one
        for (const resource of resources ?? []) {
            try {
                const updatedFields = await this.inlineFields(resource, fields, domains);
                await this.#saveUpdatedFields(model, resource.id, updatedFields);
            } catch (err) {
                logging.error(`Error inlining media for: ${resource.id}`);
                logging.error(new errors.DataImportError({
                    err
                }));
            }
        }
    }

    /**
     * Inlines external media for all posts (feature image, mobiledoc and lexical
     * content), post meta, tags and users.
     *
     * @param {string[]} domains domains to inline media from
     * @returns {Promise<void>}
     */
    async inline(domains) {
        const {data: posts} = await this.#PostModel.findPage({
            limit: 'all',
            status: 'all'
        });
        const postsInliningFields = [
            'feature_image'
        ];

        logging.info(`Starting inlining external media for posts: ${posts?.length}`);

        for (const post of posts ?? []) {
            try {
                const contents = {
                    mobiledoc: post.get('mobiledoc'),
                    lexical: post.get('lexical')
                };

                const updatedFields = await this.inlineFields(post, postsInliningFields, domains);

                for (const [contentField, originalContent] of Object.entries(contents)) {
                    if (!originalContent) {
                        continue;
                    }

                    const inlinedContent = await this.inlineContent(originalContent, domains);

                    // If content has changed, update the post
                    if (inlinedContent !== originalContent) {
                        updatedFields[contentField] = inlinedContent;
                    }
                }

                await this.#saveUpdatedFields(this.#PostModel, post.id, updatedFields);
            } catch (err) {
                logging.error(`Error inlining media for post: ${post.id}`);
                logging.error(new errors.DataImportError({
                    err
                }));
            }
        }

        const {data: postsMetas} = await this.#PostMetaModel.findPage({
            limit: 'all'
        });
        const postsMetaInliningFields = [
            'og_image',
            'twitter_image'
        ];

        await this.inlineSimpleFields(postsMetas, this.#PostMetaModel, postsMetaInliningFields, domains);

        const {data: tags} = await this.#TagModel.findPage({
            limit: 'all'
        });
        const tagInliningFields = [
            'feature_image',
            'og_image',
            'twitter_image'
        ];

        await this.inlineSimpleFields(tags, this.#TagModel, tagInliningFields, domains);

        const {data: users} = await this.#UserModel.findPage({
            limit: 'all'
        });
        const userInliningFields = [
            'profile_image',
            'cover_image'
        ];

        await this.inlineSimpleFields(users, this.#UserModel, userInliningFields, domains);

        logging.info('Finished inlining external media for posts, tags, and users');
    }
}
|
|
|
|
|
|
|
|
module.exports = ExternalMediaInliner;
|