🐛 Fixed embedded cards for non-utf8 content (#15578)

closes: https://github.com/TryGhost/Ghost/issues/14973

- When fetching content using a non-standard charset, characters were notproperly decoded to utf-8 resulting in mangled text in the editor -> Detect charset and use iconv to decode the page text

- When requesting a non bookmark card, if no oembed data could be foundand we fallback to bookmark, a second network request to fetch the content was issued. This seemed unnecessary -> refactored to avoid that
This commit is contained in:
jbenezech 2022-10-13 18:19:47 +07:00 committed by GitHub
parent 76e906d498
commit 75811f35bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 184 additions and 44 deletions

View File

@ -86,7 +86,7 @@ describe('Oembed API', function () {
it('falls back to bookmark without ?type=embed and no oembed metatag', async function () { it('falls back to bookmark without ?type=embed and no oembed metatag', async function () {
const pageMock = nock('http://example.com') const pageMock = nock('http://example.com')
.get('/') .get('/')
.times(2) // 1st = oembed metatag check, 2nd = metascraper .times(1) // url should not be fetched twice
.reply( .reply(
200, 200,
'<html><head><title>TESTING</title></head><body></body></html>', '<html><head><title>TESTING</title></head><body></body></html>',
@ -550,7 +550,6 @@ describe('Oembed API', function () {
it('falls back to bookmark card for WP oembeds', async function () { it('falls back to bookmark card for WP oembeds', async function () {
const pageMock = nock('http://test.com') const pageMock = nock('http://test.com')
.get('/') .get('/')
.twice() // oembed fetch then bookmark fetch
.reply( .reply(
200, 200,
'<html><head><link rel="alternate" type="application/json+oembed" href="http://test.com/wp-json/oembed/embed?url=https%3A%2F%2Ftest.com%2F"><title>TESTING</title></head><body></body></html>', '<html><head><link rel="alternate" type="application/json+oembed" href="http://test.com/wp-json/oembed/embed?url=https%3A%2F%2Ftest.com%2F"><title>TESTING</title></head><body></body></html>',
@ -574,5 +573,57 @@ describe('Oembed API', function () {
pageMock.isDone().should.be.true(); pageMock.isDone().should.be.true();
oembedMock.isDone().should.be.false(); oembedMock.isDone().should.be.false();
}); });
it('decodes non utf-8 charsets', async function () {
const utfString = '中国abc';
const encodedBytes = [0xd6,0xd0,0xb9,0xfa,0x61,0x62,0x63];
const replyBuffer = Buffer.concat([
Buffer.from('<html><head><title>'),
Buffer.from(encodedBytes),
Buffer.from('</title><meta charset="gb2312"></head><body></body></html>')
]);
const pageMock = nock('http://example.com')
.get('/')
.reply(
200,
replyBuffer,
{'content-type': 'text/html'}
);
const url = encodeURIComponent(' http://example.com\t '); // Whitespaces are to make sure urls are trimmed
const res = await request.get(localUtils.API.getApiQuery(`oembed/?url=${url}&type=bookmark`))
.set('Origin', config.get('url'))
.expect('Content-Type', /json/)
.expect('Cache-Control', testUtils.cacheRules.private)
.expect(200);
pageMock.isDone().should.be.true();
res.body.type.should.eql('bookmark');
res.body.url.should.eql('http://example.com');
res.body.metadata.title.should.eql(utfString);
});
it('does not fail on unknown charset', async function () {
const pageMock = nock('http://example.com')
.get('/')
.reply(
200,
'<html><head><title>TESTING</title><meta charset="notacharset"></head><body></body></html>',
{'content-type': 'text/html'}
);
const url = encodeURIComponent(' http://example.com\t '); // Whitespaces are to make sure urls are trimmed
const res = await request.get(localUtils.API.getApiQuery(`oembed/?url=${url}&type=bookmark`))
.set('Origin', config.get('url'))
.expect('Content-Type', /json/)
.expect('Cache-Control', testUtils.cacheRules.private)
.expect(200);
pageMock.isDone().should.be.true();
res.body.type.should.eql('bookmark');
res.body.url.should.eql('http://example.com');
res.body.metadata.title.should.eql('TESTING');
});
}); });
}); });

View File

@ -5,6 +5,8 @@ const {extract, hasProvider} = require('oembed-parser');
const cheerio = require('cheerio'); const cheerio = require('cheerio');
const _ = require('lodash'); const _ = require('lodash');
const {CookieJar} = require('tough-cookie'); const {CookieJar} = require('tough-cookie');
const charset = require('charset');
const iconv = require('iconv-lite');
const messages = { const messages = {
noUrlProvided: 'No url provided.', noUrlProvided: 'No url provided.',
@ -111,7 +113,96 @@ class OEmbed {
} }
} }
async fetchBookmarkData(url) { /**
* @param {string} url
* @param {Object} options
*
* @returns {Promise<{url: string, body: any, headers: any}>}
*/
async fetchPage(url, options) {
const cookieJar = new CookieJar();
return this.externalRequest(
url,
{
cookieJar,
method: 'GET',
timeout: 2 * 1000,
followRedirect: true,
...options
});
}
/**
* @param {string} url
*
* @returns {Promise<{url: string, body: string}>}
*/
async fetchPageHtml(url) {
// Fetch url and get response as binary buffer to
// avoid implicit cast
const {headers, body, url: responseUrl} = await this.fetchPage(
url,
{
encoding: 'binary',
responseType: 'buffer'
});
try {
// Detect page encoding which might not be utf-8
// and decode content
const encoding = charset(
headers,
body);
if (encoding === null) {
return {
body: body.toString(),
url: responseUrl
};
}
const decodedBody = iconv.decode(
Buffer.from(body, 'binary'), encoding);
return {
body: decodedBody,
url: responseUrl
};
} catch (err) {
logging.error(err);
//return non decoded body anyway
return {
body: body.toString(),
url: responseUrl
};
}
}
/**
* @param {string} url
*
* @returns {Promise<{url: string, body: Object}>}
*/
async fetchPageJson(url) {
const {body, url: pageUrl} = await this.fetchPage(
url,
{
json: true
});
return {
body,
url: pageUrl
};
}
/**
* @param {string} url
* @param {string} html
*
* @returns {Promise<Object>}
*/
async fetchBookmarkData(url, html) {
const metascraper = require('metascraper')([ const metascraper = require('metascraper')([
require('metascraper-url')(), require('metascraper-url')(),
require('metascraper-title')(), require('metascraper-title')(),
@ -125,10 +216,6 @@ class OEmbed {
let scraperResponse; let scraperResponse;
const cookieJar = new CookieJar();
const response = await this.externalRequest(url, {cookieJar});
const html = response.body;
try { try {
scraperResponse = await metascraper({html, url}); scraperResponse = await metascraper({html, url});
} catch (err) { } catch (err) {
@ -189,39 +276,17 @@ class OEmbed {
} }
/** /**
* @param {string} _url * @param {string} url
* @param {string} html
* @param {string} [cardType] * @param {string} [cardType]
* *
* @returns {Promise<Object>} * @returns {Promise<Object>}
*/ */
async fetchOembedData(_url, cardType) { async fetchOembedData(url, html, cardType) {
// check against known oembed list
let {url, provider} = findUrlWithProvider(_url);
if (provider) {
return this.knownProvider(url);
}
// url not in oembed list so fetch it in case it's a redirect or has a
// <link rel="alternate" type="application/json+oembed"> element
const cookieJar = new CookieJar();
const pageResponse = await this.externalRequest(url, {
method: 'GET',
timeout: 2 * 1000,
followRedirect: true,
cookieJar
});
// url changed after fetch, see if we were redirected to a known oembed
if (pageResponse.url !== url) {
({url, provider} = findUrlWithProvider(pageResponse.url));
if (provider) {
return this.knownProvider(url);
}
}
// check for <link rel="alternate" type="application/json+oembed"> element // check for <link rel="alternate" type="application/json+oembed"> element
let oembedUrl; let oembedUrl;
try { try {
oembedUrl = cheerio('link[type="application/json+oembed"]', pageResponse.body).attr('href'); oembedUrl = cheerio('link[type="application/json+oembed"]', html).attr('href');
} catch (e) { } catch (e) {
return this.unknownProvider(url); return this.unknownProvider(url);
} }
@ -234,13 +299,8 @@ class OEmbed {
} }
// fetch oembed response from embedded rel="alternate" url // fetch oembed response from embedded rel="alternate" url
const oembedResponse = await this.externalRequest(oembedUrl, { const oembedResponse = await this.fetchPageJson(oembedUrl);
method: 'GET',
json: true,
timeout: 2 * 1000,
followRedirect: true,
cookieJar
});
// validate the fetched json against the oembed spec to avoid // validate the fetched json against the oembed spec to avoid
// leaking non-oembed responses // leaking non-oembed responses
const body = oembedResponse.body; const body = oembedResponse.body;
@ -304,17 +364,39 @@ class OEmbed {
} }
} }
if (type !== 'bookmark') {
// if not a bookmark request, first
// check against known oembed list
const {url: providerUrl, provider} = findUrlWithProvider(url);
if (provider) {
return this.knownProvider(providerUrl);
}
}
// Not in the list, we need to fetch the content
const {url: pageUrl, body} = await this.fetchPageHtml(url);
// fetch only bookmark when explicitly requested // fetch only bookmark when explicitly requested
if (type === 'bookmark') { if (type === 'bookmark') {
return this.fetchBookmarkData(url); return this.fetchBookmarkData(url, body);
} }
// attempt to fetch oembed // attempt to fetch oembed
let data = await this.fetchOembedData(url);
// In case response was a redirect, see if we were
// redirected to a known oembed
if (pageUrl !== url) {
const {url: providerUrl, provider} = findUrlWithProvider(pageUrl);
if (provider) {
return this.knownProvider(providerUrl);
}
}
let data = await this.fetchOembedData(url, body);
// fallback to bookmark when we can't get oembed // fallback to bookmark when we can't get oembed
if (!data && !type) { if (!data && !type) {
data = await this.fetchBookmarkData(url); data = await this.fetchBookmarkData(url, body);
} }
// couldn't get anything, throw a validation error // couldn't get anything, throw a validation error

View File

@ -25,7 +25,9 @@
"@tryghost/errors": "1.2.18", "@tryghost/errors": "1.2.18",
"@tryghost/logging": "2.3.2", "@tryghost/logging": "2.3.2",
"@tryghost/tpl": "0.1.18", "@tryghost/tpl": "0.1.18",
"charset": "^1.0.1",
"cheerio": "0.22.0", "cheerio": "0.22.0",
"iconv-lite": "^0.6.3",
"lodash": "4.17.21", "lodash": "4.17.21",
"metascraper": "5.31.1", "metascraper": "5.31.1",
"metascraper-author": "5.31.1", "metascraper-author": "5.31.1",

View File

@ -8799,6 +8799,11 @@ charm@^1.0.0:
dependencies: dependencies:
inherits "^2.0.1" inherits "^2.0.1"
charset@^1.0.1:
version "1.0.1"
resolved "https://registry.yarnpkg.com/charset/-/charset-1.0.1.tgz#8d59546c355be61049a8fa9164747793319852bd"
integrity sha512-6dVyOOYjpfFcL1Y4qChrAoQLRHvj2ziyhcm0QJlhOcAhykL/k1kTUPbeo+87MNRTRdk2OIIsIXbuF3x2wi5EXg==
chart.js@^2.9.0: chart.js@^2.9.0:
version "2.9.4" version "2.9.4"
resolved "https://registry.yarnpkg.com/chart.js/-/chart.js-2.9.4.tgz#0827f9563faffb2dc5c06562f8eb10337d5b9684" resolved "https://registry.yarnpkg.com/chart.js/-/chart.js-2.9.4.tgz#0827f9563faffb2dc5c06562f8eb10337d5b9684"