mirror of
https://github.com/TryGhost/Ghost.git
synced 2024-12-23 19:02:29 +03:00
🐛 Fixed embedded cards for non-utf8 content (#15578)
closes: https://github.com/TryGhost/Ghost/issues/14973 - When fetching content using a non-standard charset, characters were notproperly decoded to utf-8 resulting in mangled text in the editor -> Detect charset and use iconv to decode the page text - When requesting a non bookmark card, if no oembed data could be foundand we fallback to bookmark, a second network request to fetch the content was issued. This seemed unnecessary -> refactored to avoid that
This commit is contained in:
parent
76e906d498
commit
75811f35bc
@ -86,7 +86,7 @@ describe('Oembed API', function () {
|
||||
it('falls back to bookmark without ?type=embed and no oembed metatag', async function () {
|
||||
const pageMock = nock('http://example.com')
|
||||
.get('/')
|
||||
.times(2) // 1st = oembed metatag check, 2nd = metascraper
|
||||
.times(1) // url should not be fetched twice
|
||||
.reply(
|
||||
200,
|
||||
'<html><head><title>TESTING</title></head><body></body></html>',
|
||||
@ -550,7 +550,6 @@ describe('Oembed API', function () {
|
||||
it('falls back to bookmark card for WP oembeds', async function () {
|
||||
const pageMock = nock('http://test.com')
|
||||
.get('/')
|
||||
.twice() // oembed fetch then bookmark fetch
|
||||
.reply(
|
||||
200,
|
||||
'<html><head><link rel="alternate" type="application/json+oembed" href="http://test.com/wp-json/oembed/embed?url=https%3A%2F%2Ftest.com%2F"><title>TESTING</title></head><body></body></html>',
|
||||
@ -574,5 +573,57 @@ describe('Oembed API', function () {
|
||||
pageMock.isDone().should.be.true();
|
||||
oembedMock.isDone().should.be.false();
|
||||
});
|
||||
|
||||
it('decodes non utf-8 charsets', async function () {
|
||||
const utfString = '中国abc';
|
||||
const encodedBytes = [0xd6,0xd0,0xb9,0xfa,0x61,0x62,0x63];
|
||||
const replyBuffer = Buffer.concat([
|
||||
Buffer.from('<html><head><title>'),
|
||||
Buffer.from(encodedBytes),
|
||||
Buffer.from('</title><meta charset="gb2312"></head><body></body></html>')
|
||||
]);
|
||||
|
||||
const pageMock = nock('http://example.com')
|
||||
.get('/')
|
||||
.reply(
|
||||
200,
|
||||
replyBuffer,
|
||||
{'content-type': 'text/html'}
|
||||
);
|
||||
|
||||
const url = encodeURIComponent(' http://example.com\t '); // Whitespaces are to make sure urls are trimmed
|
||||
const res = await request.get(localUtils.API.getApiQuery(`oembed/?url=${url}&type=bookmark`))
|
||||
.set('Origin', config.get('url'))
|
||||
.expect('Content-Type', /json/)
|
||||
.expect('Cache-Control', testUtils.cacheRules.private)
|
||||
.expect(200);
|
||||
|
||||
pageMock.isDone().should.be.true();
|
||||
res.body.type.should.eql('bookmark');
|
||||
res.body.url.should.eql('http://example.com');
|
||||
res.body.metadata.title.should.eql(utfString);
|
||||
});
|
||||
|
||||
it('does not fail on unknown charset', async function () {
|
||||
const pageMock = nock('http://example.com')
|
||||
.get('/')
|
||||
.reply(
|
||||
200,
|
||||
'<html><head><title>TESTING</title><meta charset="notacharset"></head><body></body></html>',
|
||||
{'content-type': 'text/html'}
|
||||
);
|
||||
|
||||
const url = encodeURIComponent(' http://example.com\t '); // Whitespaces are to make sure urls are trimmed
|
||||
const res = await request.get(localUtils.API.getApiQuery(`oembed/?url=${url}&type=bookmark`))
|
||||
.set('Origin', config.get('url'))
|
||||
.expect('Content-Type', /json/)
|
||||
.expect('Cache-Control', testUtils.cacheRules.private)
|
||||
.expect(200);
|
||||
|
||||
pageMock.isDone().should.be.true();
|
||||
res.body.type.should.eql('bookmark');
|
||||
res.body.url.should.eql('http://example.com');
|
||||
res.body.metadata.title.should.eql('TESTING');
|
||||
});
|
||||
});
|
||||
});
|
||||
|
@ -5,6 +5,8 @@ const {extract, hasProvider} = require('oembed-parser');
|
||||
const cheerio = require('cheerio');
|
||||
const _ = require('lodash');
|
||||
const {CookieJar} = require('tough-cookie');
|
||||
const charset = require('charset');
|
||||
const iconv = require('iconv-lite');
|
||||
|
||||
const messages = {
|
||||
noUrlProvided: 'No url provided.',
|
||||
@ -111,7 +113,96 @@ class OEmbed {
|
||||
}
|
||||
}
|
||||
|
||||
async fetchBookmarkData(url) {
|
||||
/**
|
||||
* @param {string} url
|
||||
* @param {Object} options
|
||||
*
|
||||
* @returns {Promise<{url: string, body: any, headers: any}>}
|
||||
*/
|
||||
async fetchPage(url, options) {
|
||||
const cookieJar = new CookieJar();
|
||||
return this.externalRequest(
|
||||
url,
|
||||
{
|
||||
cookieJar,
|
||||
method: 'GET',
|
||||
timeout: 2 * 1000,
|
||||
followRedirect: true,
|
||||
...options
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} url
|
||||
*
|
||||
* @returns {Promise<{url: string, body: string}>}
|
||||
*/
|
||||
async fetchPageHtml(url) {
|
||||
// Fetch url and get response as binary buffer to
|
||||
// avoid implicit cast
|
||||
const {headers, body, url: responseUrl} = await this.fetchPage(
|
||||
url,
|
||||
{
|
||||
encoding: 'binary',
|
||||
responseType: 'buffer'
|
||||
});
|
||||
|
||||
try {
|
||||
// Detect page encoding which might not be utf-8
|
||||
// and decode content
|
||||
const encoding = charset(
|
||||
headers,
|
||||
body);
|
||||
|
||||
if (encoding === null) {
|
||||
return {
|
||||
body: body.toString(),
|
||||
url: responseUrl
|
||||
};
|
||||
}
|
||||
|
||||
const decodedBody = iconv.decode(
|
||||
Buffer.from(body, 'binary'), encoding);
|
||||
|
||||
return {
|
||||
body: decodedBody,
|
||||
url: responseUrl
|
||||
};
|
||||
} catch (err) {
|
||||
logging.error(err);
|
||||
//return non decoded body anyway
|
||||
return {
|
||||
body: body.toString(),
|
||||
url: responseUrl
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} url
|
||||
*
|
||||
* @returns {Promise<{url: string, body: Object}>}
|
||||
*/
|
||||
async fetchPageJson(url) {
|
||||
const {body, url: pageUrl} = await this.fetchPage(
|
||||
url,
|
||||
{
|
||||
json: true
|
||||
});
|
||||
|
||||
return {
|
||||
body,
|
||||
url: pageUrl
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} url
|
||||
* @param {string} html
|
||||
*
|
||||
* @returns {Promise<Object>}
|
||||
*/
|
||||
async fetchBookmarkData(url, html) {
|
||||
const metascraper = require('metascraper')([
|
||||
require('metascraper-url')(),
|
||||
require('metascraper-title')(),
|
||||
@ -125,10 +216,6 @@ class OEmbed {
|
||||
|
||||
let scraperResponse;
|
||||
|
||||
const cookieJar = new CookieJar();
|
||||
const response = await this.externalRequest(url, {cookieJar});
|
||||
|
||||
const html = response.body;
|
||||
try {
|
||||
scraperResponse = await metascraper({html, url});
|
||||
} catch (err) {
|
||||
@ -189,39 +276,17 @@ class OEmbed {
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} _url
|
||||
* @param {string} url
|
||||
* @param {string} html
|
||||
* @param {string} [cardType]
|
||||
*
|
||||
* @returns {Promise<Object>}
|
||||
*/
|
||||
async fetchOembedData(_url, cardType) {
|
||||
// check against known oembed list
|
||||
let {url, provider} = findUrlWithProvider(_url);
|
||||
if (provider) {
|
||||
return this.knownProvider(url);
|
||||
}
|
||||
|
||||
// url not in oembed list so fetch it in case it's a redirect or has a
|
||||
// <link rel="alternate" type="application/json+oembed"> element
|
||||
const cookieJar = new CookieJar();
|
||||
const pageResponse = await this.externalRequest(url, {
|
||||
method: 'GET',
|
||||
timeout: 2 * 1000,
|
||||
followRedirect: true,
|
||||
cookieJar
|
||||
});
|
||||
// url changed after fetch, see if we were redirected to a known oembed
|
||||
if (pageResponse.url !== url) {
|
||||
({url, provider} = findUrlWithProvider(pageResponse.url));
|
||||
if (provider) {
|
||||
return this.knownProvider(url);
|
||||
}
|
||||
}
|
||||
|
||||
async fetchOembedData(url, html, cardType) {
|
||||
// check for <link rel="alternate" type="application/json+oembed"> element
|
||||
let oembedUrl;
|
||||
try {
|
||||
oembedUrl = cheerio('link[type="application/json+oembed"]', pageResponse.body).attr('href');
|
||||
oembedUrl = cheerio('link[type="application/json+oembed"]', html).attr('href');
|
||||
} catch (e) {
|
||||
return this.unknownProvider(url);
|
||||
}
|
||||
@ -234,13 +299,8 @@ class OEmbed {
|
||||
}
|
||||
|
||||
// fetch oembed response from embedded rel="alternate" url
|
||||
const oembedResponse = await this.externalRequest(oembedUrl, {
|
||||
method: 'GET',
|
||||
json: true,
|
||||
timeout: 2 * 1000,
|
||||
followRedirect: true,
|
||||
cookieJar
|
||||
});
|
||||
const oembedResponse = await this.fetchPageJson(oembedUrl);
|
||||
|
||||
// validate the fetched json against the oembed spec to avoid
|
||||
// leaking non-oembed responses
|
||||
const body = oembedResponse.body;
|
||||
@ -304,17 +364,39 @@ class OEmbed {
|
||||
}
|
||||
}
|
||||
|
||||
if (type !== 'bookmark') {
|
||||
// if not a bookmark request, first
|
||||
// check against known oembed list
|
||||
const {url: providerUrl, provider} = findUrlWithProvider(url);
|
||||
if (provider) {
|
||||
return this.knownProvider(providerUrl);
|
||||
}
|
||||
}
|
||||
|
||||
// Not in the list, we need to fetch the content
|
||||
const {url: pageUrl, body} = await this.fetchPageHtml(url);
|
||||
|
||||
// fetch only bookmark when explicitly requested
|
||||
if (type === 'bookmark') {
|
||||
return this.fetchBookmarkData(url);
|
||||
return this.fetchBookmarkData(url, body);
|
||||
}
|
||||
|
||||
// attempt to fetch oembed
|
||||
let data = await this.fetchOembedData(url);
|
||||
|
||||
// In case response was a redirect, see if we were
|
||||
// redirected to a known oembed
|
||||
if (pageUrl !== url) {
|
||||
const {url: providerUrl, provider} = findUrlWithProvider(pageUrl);
|
||||
if (provider) {
|
||||
return this.knownProvider(providerUrl);
|
||||
}
|
||||
}
|
||||
|
||||
let data = await this.fetchOembedData(url, body);
|
||||
|
||||
// fallback to bookmark when we can't get oembed
|
||||
if (!data && !type) {
|
||||
data = await this.fetchBookmarkData(url);
|
||||
data = await this.fetchBookmarkData(url, body);
|
||||
}
|
||||
|
||||
// couldn't get anything, throw a validation error
|
||||
|
@ -25,7 +25,9 @@
|
||||
"@tryghost/errors": "1.2.18",
|
||||
"@tryghost/logging": "2.3.2",
|
||||
"@tryghost/tpl": "0.1.18",
|
||||
"charset": "^1.0.1",
|
||||
"cheerio": "0.22.0",
|
||||
"iconv-lite": "^0.6.3",
|
||||
"lodash": "4.17.21",
|
||||
"metascraper": "5.31.1",
|
||||
"metascraper-author": "5.31.1",
|
||||
|
@ -8799,6 +8799,11 @@ charm@^1.0.0:
|
||||
dependencies:
|
||||
inherits "^2.0.1"
|
||||
|
||||
charset@^1.0.1:
|
||||
version "1.0.1"
|
||||
resolved "https://registry.yarnpkg.com/charset/-/charset-1.0.1.tgz#8d59546c355be61049a8fa9164747793319852bd"
|
||||
integrity sha512-6dVyOOYjpfFcL1Y4qChrAoQLRHvj2ziyhcm0QJlhOcAhykL/k1kTUPbeo+87MNRTRdk2OIIsIXbuF3x2wi5EXg==
|
||||
|
||||
chart.js@^2.9.0:
|
||||
version "2.9.4"
|
||||
resolved "https://registry.yarnpkg.com/chart.js/-/chart.js-2.9.4.tgz#0827f9563faffb2dc5c06562f8eb10337d5b9684"
|
||||
|
Loading…
Reference in New Issue
Block a user