2021-01-12 20:16:05 +03:00
|
|
|
const errors = require('@tryghost/errors');
|
2021-10-08 18:12:19 +03:00
|
|
|
const tpl = require('@tryghost/tpl');
|
2021-11-17 16:31:10 +03:00
|
|
|
const logging = require('@tryghost/logging');
|
2021-01-12 20:16:05 +03:00
|
|
|
const cheerio = require('cheerio');
|
|
|
|
const _ = require('lodash');
|
2022-10-13 14:19:47 +03:00
|
|
|
const charset = require('charset');
|
|
|
|
const iconv = require('iconv-lite');
|
2021-01-12 20:16:05 +03:00
|
|
|
|
2021-10-08 17:32:16 +03:00
|
|
|
const messages = {
|
|
|
|
noUrlProvided: 'No url provided.',
|
2022-03-21 12:07:54 +03:00
|
|
|
insufficientMetadata: 'URL contains insufficient metadata.',
|
2023-05-05 02:04:58 +03:00
|
|
|
unknownProvider: 'No provider found for supplied URL.',
|
|
|
|
unauthorized: 'URL contains a private resource.'
|
2021-10-08 17:32:16 +03:00
|
|
|
};
|
|
|
|
|
2021-11-10 17:14:04 +03:00
|
|
|
/**
|
|
|
|
* @param {string} url
|
|
|
|
* @returns {{url: string, provider: boolean}}
|
|
|
|
*/
|
2021-01-12 20:16:05 +03:00
|
|
|
const findUrlWithProvider = (url) => {
|
2023-05-09 15:30:38 +03:00
|
|
|
const {hasProvider} = require('@extractus/oembed-extractor');
|
|
|
|
|
2021-01-12 20:16:05 +03:00
|
|
|
let provider;
|
|
|
|
|
|
|
|
// build up a list of URL variations to test against because the oembed
|
|
|
|
// providers list is not always up to date with scheme or www vs non-www
|
|
|
|
let baseUrl = url.replace(/^\/\/|^https?:\/\/(?:www\.)?/, '');
|
|
|
|
let testUrls = [
|
|
|
|
`http://${baseUrl}`,
|
|
|
|
`https://${baseUrl}`,
|
|
|
|
`http://www.${baseUrl}`,
|
|
|
|
`https://www.${baseUrl}`
|
|
|
|
];
|
|
|
|
|
|
|
|
for (let testUrl of testUrls) {
|
|
|
|
provider = hasProvider(testUrl);
|
|
|
|
if (provider) {
|
|
|
|
url = testUrl;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return {url, provider};
|
|
|
|
};
|
|
|
|
|
2021-01-14 20:01:43 +03:00
|
|
|
/**
 * @typedef {Object} IConfig
 * @prop {(key: string) => string} get
 */

/**
 * @typedef {(url: string, config: Object) => Promise} IExternalRequest
 */

/**
 * @typedef {object} ICustomProvider
 * @prop {(url: URL) => Promise<boolean>} canSupportRequest
 * @prop {(url: URL, externalRequest: IExternalRequest) => Promise<import('@extractus/oembed-extractor').OembedData>} getOEmbedData
 */
|
|
|
|
|
2023-05-02 23:43:47 +03:00
|
|
|
class OEmbedService {
|
2021-01-14 20:01:43 +03:00
|
|
|
/**
|
|
|
|
*
|
|
|
|
* @param {Object} dependencies
|
|
|
|
* @param {IConfig} dependencies.config
|
|
|
|
* @param {IExternalRequest} dependencies.externalRequest
|
|
|
|
*/
|
2021-10-08 18:12:19 +03:00
|
|
|
constructor({config, externalRequest}) {
|
2021-01-14 20:01:43 +03:00
|
|
|
this.config = config;
|
2021-11-23 13:58:38 +03:00
|
|
|
|
2021-11-10 17:14:04 +03:00
|
|
|
/** @type {IExternalRequest} */
|
2023-02-20 18:33:11 +03:00
|
|
|
this.externalRequest = externalRequest;
|
2021-11-23 13:58:38 +03:00
|
|
|
|
2021-11-10 17:14:04 +03:00
|
|
|
/** @type {ICustomProvider[]} */
|
|
|
|
this.customProviders = [];
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @param {ICustomProvider} provider
|
|
|
|
*/
|
|
|
|
registerProvider(provider) {
|
|
|
|
this.customProviders.push(provider);
|
2021-01-12 20:16:05 +03:00
|
|
|
}
|
|
|
|
|
2021-11-10 17:14:04 +03:00
|
|
|
/**
|
|
|
|
* @param {string} url
|
|
|
|
*/
|
|
|
|
async unknownProvider(url) {
|
|
|
|
throw new errors.ValidationError({
|
2021-10-08 18:12:19 +03:00
|
|
|
message: tpl(messages.unknownProvider),
|
2021-01-14 20:01:43 +03:00
|
|
|
context: url
|
2021-11-10 17:14:04 +03:00
|
|
|
});
|
2021-01-12 20:16:05 +03:00
|
|
|
}
|
|
|
|
|
2021-11-10 17:14:04 +03:00
|
|
|
/**
|
|
|
|
* @param {string} url
|
|
|
|
*/
|
|
|
|
async knownProvider(url) {
|
2023-05-09 15:30:38 +03:00
|
|
|
const {extract} = require('@extractus/oembed-extractor');
|
|
|
|
|
2021-11-10 17:14:04 +03:00
|
|
|
try {
|
|
|
|
return await extract(url);
|
|
|
|
} catch (err) {
|
2023-05-05 02:04:58 +03:00
|
|
|
if (err.message === 'Request failed with error code 401') {
|
|
|
|
throw new errors.UnauthorizedError({
|
|
|
|
message: messages.unauthorized
|
|
|
|
});
|
|
|
|
} else {
|
|
|
|
throw new errors.InternalServerError({
|
|
|
|
message: err.message
|
|
|
|
});
|
|
|
|
}
|
2021-11-10 17:14:04 +03:00
|
|
|
}
|
2021-01-12 20:16:05 +03:00
|
|
|
}
|
|
|
|
|
2022-10-13 14:19:47 +03:00
|
|
|
/**
|
|
|
|
* @param {string} url
|
|
|
|
* @param {Object} options
|
2023-01-20 16:32:50 +03:00
|
|
|
*
|
2023-02-20 18:33:11 +03:00
|
|
|
* @returns {GotPromise<any>}
|
2022-10-13 14:19:47 +03:00
|
|
|
*/
|
2023-02-20 18:33:11 +03:00
|
|
|
fetchPage(url, options) {
|
2022-10-13 14:19:47 +03:00
|
|
|
return this.externalRequest(
|
|
|
|
url,
|
|
|
|
{
|
2023-02-20 18:33:11 +03:00
|
|
|
timeout: 2000,
|
2022-10-13 14:19:47 +03:00
|
|
|
followRedirect: true,
|
|
|
|
...options
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @param {string} url
|
2023-01-20 16:32:50 +03:00
|
|
|
*
|
2023-08-31 17:57:18 +03:00
|
|
|
* @returns {Promise<{url: string, body: string, contentType: string|undefined}>}
|
2022-10-13 14:19:47 +03:00
|
|
|
*/
|
|
|
|
async fetchPageHtml(url) {
|
|
|
|
// Fetch url and get response as binary buffer to
|
|
|
|
// avoid implicit cast
|
2023-02-20 18:33:11 +03:00
|
|
|
let {headers, body, url: responseUrl} = await this.fetchPage(
|
2022-10-13 14:19:47 +03:00
|
|
|
url,
|
|
|
|
{
|
|
|
|
encoding: 'binary',
|
|
|
|
responseType: 'buffer'
|
|
|
|
});
|
|
|
|
|
|
|
|
try {
|
|
|
|
// Detect page encoding which might not be utf-8
|
|
|
|
// and decode content
|
|
|
|
const encoding = charset(
|
|
|
|
headers,
|
|
|
|
body);
|
|
|
|
|
|
|
|
if (encoding === null) {
|
|
|
|
return {
|
|
|
|
body: body.toString(),
|
2023-08-31 17:57:18 +03:00
|
|
|
url: responseUrl,
|
|
|
|
contentType: headers['content-type']
|
2022-10-13 14:19:47 +03:00
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
const decodedBody = iconv.decode(
|
2023-02-20 18:33:11 +03:00
|
|
|
body, encoding);
|
2022-10-13 14:19:47 +03:00
|
|
|
|
|
|
|
return {
|
|
|
|
body: decodedBody,
|
2023-08-31 17:57:18 +03:00
|
|
|
url: responseUrl,
|
|
|
|
contentType: headers['content-type']
|
2022-10-13 14:19:47 +03:00
|
|
|
};
|
|
|
|
} catch (err) {
|
|
|
|
logging.error(err);
|
|
|
|
//return non decoded body anyway
|
|
|
|
return {
|
|
|
|
body: body.toString(),
|
2023-08-31 17:57:18 +03:00
|
|
|
url: responseUrl,
|
|
|
|
contentType: headers['content-type']
|
2022-10-13 14:19:47 +03:00
|
|
|
};
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @param {string} url
|
2023-01-20 16:32:50 +03:00
|
|
|
*
|
2022-10-13 14:19:47 +03:00
|
|
|
* @returns {Promise<{url: string, body: Object}>}
|
|
|
|
*/
|
|
|
|
async fetchPageJson(url) {
|
2023-02-20 18:33:11 +03:00
|
|
|
const res = await this.fetchPage(url, {responseType: 'json'});
|
|
|
|
const body = res.body;
|
|
|
|
const pageUrl = res.url;
|
2022-10-13 14:19:47 +03:00
|
|
|
return {
|
|
|
|
body,
|
|
|
|
url: pageUrl
|
|
|
|
};
|
|
|
|
}
|
2023-01-20 16:32:50 +03:00
|
|
|
|
2022-10-13 14:19:47 +03:00
|
|
|
/**
|
|
|
|
* @param {string} url
|
|
|
|
* @param {string} html
|
2023-01-20 16:32:50 +03:00
|
|
|
*
|
2022-10-13 14:19:47 +03:00
|
|
|
* @returns {Promise<Object>}
|
|
|
|
*/
|
|
|
|
async fetchBookmarkData(url, html) {
|
2023-03-24 12:46:14 +03:00
|
|
|
const gotOpts = {};
|
|
|
|
|
|
|
|
if (process.env.NODE_ENV?.startsWith('test')) {
|
|
|
|
gotOpts.retry = 0;
|
|
|
|
}
|
|
|
|
|
2023-08-31 12:26:12 +03:00
|
|
|
const pickFn = (sizes, pickDefault) => {
|
|
|
|
// Prioritize apple touch icon with sizes > 180
|
2023-09-06 13:22:45 +03:00
|
|
|
const appleTouchIcon = sizes.find(item => item.rel?.includes('apple') && item.sizes && item.size.width >= 180);
|
|
|
|
const svgIcon = sizes.find(item => item.href?.endsWith('svg'));
|
2023-08-31 12:26:12 +03:00
|
|
|
return appleTouchIcon || svgIcon || pickDefault(sizes);
|
|
|
|
};
|
|
|
|
|
2021-01-14 20:01:43 +03:00
|
|
|
const metascraper = require('metascraper')([
|
|
|
|
require('metascraper-url')(),
|
|
|
|
require('metascraper-title')(),
|
|
|
|
require('metascraper-description')(),
|
|
|
|
require('metascraper-author')(),
|
|
|
|
require('metascraper-publisher')(),
|
|
|
|
require('metascraper-image')(),
|
2023-03-24 12:46:14 +03:00
|
|
|
require('metascraper-logo-favicon')({
|
2023-08-31 12:26:12 +03:00
|
|
|
gotOpts,
|
|
|
|
pickFn
|
2023-03-24 12:46:14 +03:00
|
|
|
}),
|
2021-01-14 20:01:43 +03:00
|
|
|
require('metascraper-logo')()
|
|
|
|
]);
|
|
|
|
|
|
|
|
let scraperResponse;
|
2023-01-20 16:32:50 +03:00
|
|
|
|
2021-12-01 19:46:19 +03:00
|
|
|
try {
|
|
|
|
scraperResponse = await metascraper({html, url});
|
|
|
|
} catch (err) {
|
|
|
|
// Log to avoid being blind to errors happenning in metascraper
|
|
|
|
logging.error(err);
|
|
|
|
return this.unknownProvider(url);
|
|
|
|
}
|
2021-01-12 20:16:05 +03:00
|
|
|
|
2021-01-14 20:01:43 +03:00
|
|
|
const metadata = Object.assign({}, scraperResponse, {
|
|
|
|
thumbnail: scraperResponse.image,
|
|
|
|
icon: scraperResponse.logo
|
|
|
|
});
|
|
|
|
// We want to use standard naming for image and logo
|
|
|
|
delete metadata.image;
|
|
|
|
delete metadata.logo;
|
|
|
|
|
2021-11-10 17:14:04 +03:00
|
|
|
if (!metadata.title) {
|
|
|
|
throw new errors.ValidationError({
|
|
|
|
message: tpl(messages.insufficientMetadata),
|
|
|
|
context: url
|
2021-01-14 20:01:43 +03:00
|
|
|
});
|
|
|
|
}
|
|
|
|
|
2021-11-10 17:14:04 +03:00
|
|
|
return {
|
|
|
|
version: '1.0',
|
|
|
|
type: 'bookmark',
|
|
|
|
url,
|
|
|
|
metadata
|
|
|
|
};
|
2021-01-14 20:01:43 +03:00
|
|
|
}
|
|
|
|
|
2021-08-23 09:36:18 +03:00
|
|
|
/**
|
2022-10-13 14:19:47 +03:00
|
|
|
* @param {string} url
|
|
|
|
* @param {string} html
|
2021-08-23 09:53:44 +03:00
|
|
|
* @param {string} [cardType]
|
2021-08-23 09:36:18 +03:00
|
|
|
*
|
|
|
|
* @returns {Promise<Object>}
|
|
|
|
*/
|
2022-10-13 14:19:47 +03:00
|
|
|
async fetchOembedData(url, html, cardType) {
|
2021-11-10 17:14:04 +03:00
|
|
|
// check for <link rel="alternate" type="application/json+oembed"> element
|
|
|
|
let oembedUrl;
|
|
|
|
try {
|
2022-10-13 14:19:47 +03:00
|
|
|
oembedUrl = cheerio('link[type="application/json+oembed"]', html).attr('href');
|
2021-11-10 17:14:04 +03:00
|
|
|
} catch (e) {
|
|
|
|
return this.unknownProvider(url);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (oembedUrl) {
|
|
|
|
// for standard WP oembed's we want to insert a bookmark card rather than their blockquote+script
|
|
|
|
// which breaks in the editor and most Ghost themes. Only fallback if card type was not explicitly chosen
|
|
|
|
if (!cardType && oembedUrl.match(/wp-json\/oembed/)) {
|
|
|
|
return;
|
2021-01-12 20:16:05 +03:00
|
|
|
}
|
|
|
|
|
2021-11-10 17:14:04 +03:00
|
|
|
// fetch oembed response from embedded rel="alternate" url
|
2022-10-13 14:19:47 +03:00
|
|
|
const oembedResponse = await this.fetchPageJson(oembedUrl);
|
2021-11-10 17:14:04 +03:00
|
|
|
// validate the fetched json against the oembed spec to avoid
|
|
|
|
// leaking non-oembed responses
|
|
|
|
const body = oembedResponse.body;
|
|
|
|
const hasRequiredFields = body.type && body.version;
|
|
|
|
const hasValidType = ['photo', 'video', 'link', 'rich'].includes(body.type);
|
|
|
|
|
|
|
|
if (hasRequiredFields && hasValidType) {
|
|
|
|
// extract known oembed fields from the response to limit leaking of unrecognised data
|
|
|
|
const knownFields = [
|
|
|
|
'type',
|
|
|
|
'version',
|
|
|
|
'html',
|
|
|
|
'url',
|
|
|
|
'title',
|
|
|
|
'width',
|
|
|
|
'height',
|
|
|
|
'author_name',
|
|
|
|
'author_url',
|
|
|
|
'provider_name',
|
|
|
|
'provider_url',
|
|
|
|
'thumbnail_url',
|
|
|
|
'thumbnail_width',
|
|
|
|
'thumbnail_height'
|
|
|
|
];
|
|
|
|
const oembed = _.pick(body, knownFields);
|
|
|
|
|
|
|
|
// ensure we have required data for certain types
|
|
|
|
if (oembed.type === 'photo' && !oembed.url) {
|
|
|
|
return;
|
2021-01-14 20:01:43 +03:00
|
|
|
}
|
2021-11-10 17:14:04 +03:00
|
|
|
if ((oembed.type === 'video' || oembed.type === 'rich') && (!oembed.html || !oembed.width || !oembed.height)) {
|
2021-01-14 20:01:43 +03:00
|
|
|
return;
|
2021-01-12 20:16:05 +03:00
|
|
|
}
|
2021-01-14 20:01:43 +03:00
|
|
|
|
2021-11-10 17:14:04 +03:00
|
|
|
// return the extracted object, don't pass through the response body
|
|
|
|
return oembed;
|
2021-01-14 20:01:43 +03:00
|
|
|
}
|
2021-11-10 17:14:04 +03:00
|
|
|
}
|
2021-01-14 20:01:43 +03:00
|
|
|
}
|
2021-08-23 09:36:18 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* @param {string} url - oembed URL
|
|
|
|
* @param {string} type - card type
|
|
|
|
*
|
|
|
|
* @returns {Promise<Object>}
|
|
|
|
*/
|
|
|
|
async fetchOembedDataFromUrl(url, type) {
|
2021-08-23 09:53:44 +03:00
|
|
|
try {
|
2021-11-10 17:14:04 +03:00
|
|
|
const urlObject = new URL(url);
|
2021-11-23 13:58:38 +03:00
|
|
|
|
2021-12-01 17:14:59 +03:00
|
|
|
// Trimming solves the difference of url validation between `new URL(url)`
|
|
|
|
// and metascraper.
|
|
|
|
url = url.trim();
|
|
|
|
|
2021-11-10 17:14:04 +03:00
|
|
|
for (const provider of this.customProviders) {
|
|
|
|
if (await provider.canSupportRequest(urlObject)) {
|
|
|
|
const result = await provider.getOEmbedData(urlObject, this.externalRequest);
|
|
|
|
if (result !== null) {
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-02-28 17:39:28 +03:00
|
|
|
if (type !== 'bookmark' && type !== 'mention') {
|
2022-10-13 14:19:47 +03:00
|
|
|
// if not a bookmark request, first
|
|
|
|
// check against known oembed list
|
|
|
|
const {url: providerUrl, provider} = findUrlWithProvider(url);
|
|
|
|
if (provider) {
|
|
|
|
return this.knownProvider(providerUrl);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Not in the list, we need to fetch the content
|
2023-08-31 17:57:18 +03:00
|
|
|
const {url: pageUrl, body, contentType} = await this.fetchPageHtml(url);
|
2022-10-13 14:19:47 +03:00
|
|
|
|
2021-11-23 13:58:38 +03:00
|
|
|
// fetch only bookmark when explicitly requested
|
2021-08-23 09:53:44 +03:00
|
|
|
if (type === 'bookmark') {
|
2022-10-13 14:19:47 +03:00
|
|
|
return this.fetchBookmarkData(url, body);
|
2021-08-23 09:36:18 +03:00
|
|
|
}
|
2021-08-23 09:53:44 +03:00
|
|
|
|
2023-02-28 17:39:28 +03:00
|
|
|
// mentions need to return bookmark data (metadata) and body (html) for link verification
|
|
|
|
if (type === 'mention') {
|
2023-08-31 17:57:18 +03:00
|
|
|
if (contentType.includes('application/json')) {
|
|
|
|
// No need to fetch metadata: we have none
|
|
|
|
const bookmark = {
|
|
|
|
version: '1.0',
|
|
|
|
type: 'bookmark',
|
|
|
|
url,
|
|
|
|
metadata: {
|
|
|
|
title: null,
|
|
|
|
description: null,
|
|
|
|
publisher: null,
|
|
|
|
author: null,
|
|
|
|
thumbnail: null,
|
|
|
|
icon: null
|
|
|
|
},
|
|
|
|
contentType
|
|
|
|
};
|
|
|
|
return {...bookmark, body};
|
|
|
|
}
|
2023-02-28 17:39:28 +03:00
|
|
|
const bookmark = await this.fetchBookmarkData(url, body);
|
2023-08-31 17:57:18 +03:00
|
|
|
return {...bookmark, body, contentType};
|
2023-02-28 17:39:28 +03:00
|
|
|
}
|
|
|
|
|
2021-11-23 13:58:38 +03:00
|
|
|
// attempt to fetch oembed
|
2022-10-13 14:19:47 +03:00
|
|
|
|
2023-01-20 16:32:50 +03:00
|
|
|
// In case response was a redirect, see if we were
|
2022-10-13 14:19:47 +03:00
|
|
|
// redirected to a known oembed
|
|
|
|
if (pageUrl !== url) {
|
|
|
|
const {url: providerUrl, provider} = findUrlWithProvider(pageUrl);
|
|
|
|
if (provider) {
|
|
|
|
return this.knownProvider(providerUrl);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
let data = await this.fetchOembedData(url, body);
|
2021-08-23 09:53:44 +03:00
|
|
|
|
2021-11-23 13:58:38 +03:00
|
|
|
// fallback to bookmark when we can't get oembed
|
2021-08-23 09:53:44 +03:00
|
|
|
if (!data && !type) {
|
2022-10-13 14:19:47 +03:00
|
|
|
data = await this.fetchBookmarkData(url, body);
|
2021-08-23 09:36:18 +03:00
|
|
|
}
|
2021-08-23 09:53:44 +03:00
|
|
|
|
2021-11-23 13:58:38 +03:00
|
|
|
// couldn't get anything, throw a validation error
|
2021-08-23 09:53:44 +03:00
|
|
|
if (!data) {
|
2021-11-23 13:58:38 +03:00
|
|
|
return this.unknownProvider(url);
|
2021-08-23 09:53:44 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return data;
|
2021-11-23 13:58:38 +03:00
|
|
|
} catch (err) {
|
|
|
|
// allow specific validation errors through for better error messages
|
2021-12-01 13:22:01 +03:00
|
|
|
if (errors.utils.isGhostError(err) && err.errorType === 'ValidationError') {
|
2021-11-23 13:58:38 +03:00
|
|
|
throw err;
|
|
|
|
}
|
|
|
|
|
|
|
|
// log the real error because we're going to throw a generic "Unknown provider" error
|
2021-12-01 13:22:01 +03:00
|
|
|
logging.error(new errors.InternalServerError({
|
2021-11-23 13:58:38 +03:00
|
|
|
message: 'Encountered error when fetching oembed',
|
|
|
|
err
|
|
|
|
}));
|
|
|
|
|
|
|
|
// default to unknown provider to avoid leaking any app specifics
|
|
|
|
return this.unknownProvider(url);
|
2021-08-23 09:53:44 +03:00
|
|
|
}
|
2021-08-23 09:36:18 +03:00
|
|
|
}
|
2021-01-12 20:16:05 +03:00
|
|
|
}
|
|
|
|
|
2023-05-02 23:43:47 +03:00
|
|
|
// The service class is this module's sole export.
module.exports = OEmbedService;
|