mirror of
https://github.com/TryGhost/Ghost.git
synced 2024-12-23 19:02:29 +03:00
🐛 Fixed embedded cards for non-utf8 content (#15578)
closes: https://github.com/TryGhost/Ghost/issues/14973 - When fetching content using a non-standard charset, characters were notproperly decoded to utf-8 resulting in mangled text in the editor -> Detect charset and use iconv to decode the page text - When requesting a non bookmark card, if no oembed data could be foundand we fallback to bookmark, a second network request to fetch the content was issued. This seemed unnecessary -> refactored to avoid that
This commit is contained in:
parent
76e906d498
commit
75811f35bc
@ -86,7 +86,7 @@ describe('Oembed API', function () {
|
|||||||
it('falls back to bookmark without ?type=embed and no oembed metatag', async function () {
|
it('falls back to bookmark without ?type=embed and no oembed metatag', async function () {
|
||||||
const pageMock = nock('http://example.com')
|
const pageMock = nock('http://example.com')
|
||||||
.get('/')
|
.get('/')
|
||||||
.times(2) // 1st = oembed metatag check, 2nd = metascraper
|
.times(1) // url should not be fetched twice
|
||||||
.reply(
|
.reply(
|
||||||
200,
|
200,
|
||||||
'<html><head><title>TESTING</title></head><body></body></html>',
|
'<html><head><title>TESTING</title></head><body></body></html>',
|
||||||
@ -550,7 +550,6 @@ describe('Oembed API', function () {
|
|||||||
it('falls back to bookmark card for WP oembeds', async function () {
|
it('falls back to bookmark card for WP oembeds', async function () {
|
||||||
const pageMock = nock('http://test.com')
|
const pageMock = nock('http://test.com')
|
||||||
.get('/')
|
.get('/')
|
||||||
.twice() // oembed fetch then bookmark fetch
|
|
||||||
.reply(
|
.reply(
|
||||||
200,
|
200,
|
||||||
'<html><head><link rel="alternate" type="application/json+oembed" href="http://test.com/wp-json/oembed/embed?url=https%3A%2F%2Ftest.com%2F"><title>TESTING</title></head><body></body></html>',
|
'<html><head><link rel="alternate" type="application/json+oembed" href="http://test.com/wp-json/oembed/embed?url=https%3A%2F%2Ftest.com%2F"><title>TESTING</title></head><body></body></html>',
|
||||||
@ -574,5 +573,57 @@ describe('Oembed API', function () {
|
|||||||
pageMock.isDone().should.be.true();
|
pageMock.isDone().should.be.true();
|
||||||
oembedMock.isDone().should.be.false();
|
oembedMock.isDone().should.be.false();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('decodes non utf-8 charsets', async function () {
|
||||||
|
const utfString = '中国abc';
|
||||||
|
const encodedBytes = [0xd6,0xd0,0xb9,0xfa,0x61,0x62,0x63];
|
||||||
|
const replyBuffer = Buffer.concat([
|
||||||
|
Buffer.from('<html><head><title>'),
|
||||||
|
Buffer.from(encodedBytes),
|
||||||
|
Buffer.from('</title><meta charset="gb2312"></head><body></body></html>')
|
||||||
|
]);
|
||||||
|
|
||||||
|
const pageMock = nock('http://example.com')
|
||||||
|
.get('/')
|
||||||
|
.reply(
|
||||||
|
200,
|
||||||
|
replyBuffer,
|
||||||
|
{'content-type': 'text/html'}
|
||||||
|
);
|
||||||
|
|
||||||
|
const url = encodeURIComponent(' http://example.com\t '); // Whitespaces are to make sure urls are trimmed
|
||||||
|
const res = await request.get(localUtils.API.getApiQuery(`oembed/?url=${url}&type=bookmark`))
|
||||||
|
.set('Origin', config.get('url'))
|
||||||
|
.expect('Content-Type', /json/)
|
||||||
|
.expect('Cache-Control', testUtils.cacheRules.private)
|
||||||
|
.expect(200);
|
||||||
|
|
||||||
|
pageMock.isDone().should.be.true();
|
||||||
|
res.body.type.should.eql('bookmark');
|
||||||
|
res.body.url.should.eql('http://example.com');
|
||||||
|
res.body.metadata.title.should.eql(utfString);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('does not fail on unknown charset', async function () {
|
||||||
|
const pageMock = nock('http://example.com')
|
||||||
|
.get('/')
|
||||||
|
.reply(
|
||||||
|
200,
|
||||||
|
'<html><head><title>TESTING</title><meta charset="notacharset"></head><body></body></html>',
|
||||||
|
{'content-type': 'text/html'}
|
||||||
|
);
|
||||||
|
|
||||||
|
const url = encodeURIComponent(' http://example.com\t '); // Whitespaces are to make sure urls are trimmed
|
||||||
|
const res = await request.get(localUtils.API.getApiQuery(`oembed/?url=${url}&type=bookmark`))
|
||||||
|
.set('Origin', config.get('url'))
|
||||||
|
.expect('Content-Type', /json/)
|
||||||
|
.expect('Cache-Control', testUtils.cacheRules.private)
|
||||||
|
.expect(200);
|
||||||
|
|
||||||
|
pageMock.isDone().should.be.true();
|
||||||
|
res.body.type.should.eql('bookmark');
|
||||||
|
res.body.url.should.eql('http://example.com');
|
||||||
|
res.body.metadata.title.should.eql('TESTING');
|
||||||
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
@ -5,6 +5,8 @@ const {extract, hasProvider} = require('oembed-parser');
|
|||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
const _ = require('lodash');
|
const _ = require('lodash');
|
||||||
const {CookieJar} = require('tough-cookie');
|
const {CookieJar} = require('tough-cookie');
|
||||||
|
const charset = require('charset');
|
||||||
|
const iconv = require('iconv-lite');
|
||||||
|
|
||||||
const messages = {
|
const messages = {
|
||||||
noUrlProvided: 'No url provided.',
|
noUrlProvided: 'No url provided.',
|
||||||
@ -111,7 +113,96 @@ class OEmbed {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fetchBookmarkData(url) {
|
/**
|
||||||
|
* @param {string} url
|
||||||
|
* @param {Object} options
|
||||||
|
*
|
||||||
|
* @returns {Promise<{url: string, body: any, headers: any}>}
|
||||||
|
*/
|
||||||
|
async fetchPage(url, options) {
|
||||||
|
const cookieJar = new CookieJar();
|
||||||
|
return this.externalRequest(
|
||||||
|
url,
|
||||||
|
{
|
||||||
|
cookieJar,
|
||||||
|
method: 'GET',
|
||||||
|
timeout: 2 * 1000,
|
||||||
|
followRedirect: true,
|
||||||
|
...options
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param {string} url
|
||||||
|
*
|
||||||
|
* @returns {Promise<{url: string, body: string}>}
|
||||||
|
*/
|
||||||
|
async fetchPageHtml(url) {
|
||||||
|
// Fetch url and get response as binary buffer to
|
||||||
|
// avoid implicit cast
|
||||||
|
const {headers, body, url: responseUrl} = await this.fetchPage(
|
||||||
|
url,
|
||||||
|
{
|
||||||
|
encoding: 'binary',
|
||||||
|
responseType: 'buffer'
|
||||||
|
});
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Detect page encoding which might not be utf-8
|
||||||
|
// and decode content
|
||||||
|
const encoding = charset(
|
||||||
|
headers,
|
||||||
|
body);
|
||||||
|
|
||||||
|
if (encoding === null) {
|
||||||
|
return {
|
||||||
|
body: body.toString(),
|
||||||
|
url: responseUrl
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
const decodedBody = iconv.decode(
|
||||||
|
Buffer.from(body, 'binary'), encoding);
|
||||||
|
|
||||||
|
return {
|
||||||
|
body: decodedBody,
|
||||||
|
url: responseUrl
|
||||||
|
};
|
||||||
|
} catch (err) {
|
||||||
|
logging.error(err);
|
||||||
|
//return non decoded body anyway
|
||||||
|
return {
|
||||||
|
body: body.toString(),
|
||||||
|
url: responseUrl
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param {string} url
|
||||||
|
*
|
||||||
|
* @returns {Promise<{url: string, body: Object}>}
|
||||||
|
*/
|
||||||
|
async fetchPageJson(url) {
|
||||||
|
const {body, url: pageUrl} = await this.fetchPage(
|
||||||
|
url,
|
||||||
|
{
|
||||||
|
json: true
|
||||||
|
});
|
||||||
|
|
||||||
|
return {
|
||||||
|
body,
|
||||||
|
url: pageUrl
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param {string} url
|
||||||
|
* @param {string} html
|
||||||
|
*
|
||||||
|
* @returns {Promise<Object>}
|
||||||
|
*/
|
||||||
|
async fetchBookmarkData(url, html) {
|
||||||
const metascraper = require('metascraper')([
|
const metascraper = require('metascraper')([
|
||||||
require('metascraper-url')(),
|
require('metascraper-url')(),
|
||||||
require('metascraper-title')(),
|
require('metascraper-title')(),
|
||||||
@ -125,10 +216,6 @@ class OEmbed {
|
|||||||
|
|
||||||
let scraperResponse;
|
let scraperResponse;
|
||||||
|
|
||||||
const cookieJar = new CookieJar();
|
|
||||||
const response = await this.externalRequest(url, {cookieJar});
|
|
||||||
|
|
||||||
const html = response.body;
|
|
||||||
try {
|
try {
|
||||||
scraperResponse = await metascraper({html, url});
|
scraperResponse = await metascraper({html, url});
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
@ -189,39 +276,17 @@ class OEmbed {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param {string} _url
|
* @param {string} url
|
||||||
|
* @param {string} html
|
||||||
* @param {string} [cardType]
|
* @param {string} [cardType]
|
||||||
*
|
*
|
||||||
* @returns {Promise<Object>}
|
* @returns {Promise<Object>}
|
||||||
*/
|
*/
|
||||||
async fetchOembedData(_url, cardType) {
|
async fetchOembedData(url, html, cardType) {
|
||||||
// check against known oembed list
|
|
||||||
let {url, provider} = findUrlWithProvider(_url);
|
|
||||||
if (provider) {
|
|
||||||
return this.knownProvider(url);
|
|
||||||
}
|
|
||||||
|
|
||||||
// url not in oembed list so fetch it in case it's a redirect or has a
|
|
||||||
// <link rel="alternate" type="application/json+oembed"> element
|
|
||||||
const cookieJar = new CookieJar();
|
|
||||||
const pageResponse = await this.externalRequest(url, {
|
|
||||||
method: 'GET',
|
|
||||||
timeout: 2 * 1000,
|
|
||||||
followRedirect: true,
|
|
||||||
cookieJar
|
|
||||||
});
|
|
||||||
// url changed after fetch, see if we were redirected to a known oembed
|
|
||||||
if (pageResponse.url !== url) {
|
|
||||||
({url, provider} = findUrlWithProvider(pageResponse.url));
|
|
||||||
if (provider) {
|
|
||||||
return this.knownProvider(url);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// check for <link rel="alternate" type="application/json+oembed"> element
|
// check for <link rel="alternate" type="application/json+oembed"> element
|
||||||
let oembedUrl;
|
let oembedUrl;
|
||||||
try {
|
try {
|
||||||
oembedUrl = cheerio('link[type="application/json+oembed"]', pageResponse.body).attr('href');
|
oembedUrl = cheerio('link[type="application/json+oembed"]', html).attr('href');
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
return this.unknownProvider(url);
|
return this.unknownProvider(url);
|
||||||
}
|
}
|
||||||
@ -234,13 +299,8 @@ class OEmbed {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// fetch oembed response from embedded rel="alternate" url
|
// fetch oembed response from embedded rel="alternate" url
|
||||||
const oembedResponse = await this.externalRequest(oembedUrl, {
|
const oembedResponse = await this.fetchPageJson(oembedUrl);
|
||||||
method: 'GET',
|
|
||||||
json: true,
|
|
||||||
timeout: 2 * 1000,
|
|
||||||
followRedirect: true,
|
|
||||||
cookieJar
|
|
||||||
});
|
|
||||||
// validate the fetched json against the oembed spec to avoid
|
// validate the fetched json against the oembed spec to avoid
|
||||||
// leaking non-oembed responses
|
// leaking non-oembed responses
|
||||||
const body = oembedResponse.body;
|
const body = oembedResponse.body;
|
||||||
@ -304,17 +364,39 @@ class OEmbed {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (type !== 'bookmark') {
|
||||||
|
// if not a bookmark request, first
|
||||||
|
// check against known oembed list
|
||||||
|
const {url: providerUrl, provider} = findUrlWithProvider(url);
|
||||||
|
if (provider) {
|
||||||
|
return this.knownProvider(providerUrl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Not in the list, we need to fetch the content
|
||||||
|
const {url: pageUrl, body} = await this.fetchPageHtml(url);
|
||||||
|
|
||||||
// fetch only bookmark when explicitly requested
|
// fetch only bookmark when explicitly requested
|
||||||
if (type === 'bookmark') {
|
if (type === 'bookmark') {
|
||||||
return this.fetchBookmarkData(url);
|
return this.fetchBookmarkData(url, body);
|
||||||
}
|
}
|
||||||
|
|
||||||
// attempt to fetch oembed
|
// attempt to fetch oembed
|
||||||
let data = await this.fetchOembedData(url);
|
|
||||||
|
// In case response was a redirect, see if we were
|
||||||
|
// redirected to a known oembed
|
||||||
|
if (pageUrl !== url) {
|
||||||
|
const {url: providerUrl, provider} = findUrlWithProvider(pageUrl);
|
||||||
|
if (provider) {
|
||||||
|
return this.knownProvider(providerUrl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let data = await this.fetchOembedData(url, body);
|
||||||
|
|
||||||
// fallback to bookmark when we can't get oembed
|
// fallback to bookmark when we can't get oembed
|
||||||
if (!data && !type) {
|
if (!data && !type) {
|
||||||
data = await this.fetchBookmarkData(url);
|
data = await this.fetchBookmarkData(url, body);
|
||||||
}
|
}
|
||||||
|
|
||||||
// couldn't get anything, throw a validation error
|
// couldn't get anything, throw a validation error
|
||||||
|
@ -25,7 +25,9 @@
|
|||||||
"@tryghost/errors": "1.2.18",
|
"@tryghost/errors": "1.2.18",
|
||||||
"@tryghost/logging": "2.3.2",
|
"@tryghost/logging": "2.3.2",
|
||||||
"@tryghost/tpl": "0.1.18",
|
"@tryghost/tpl": "0.1.18",
|
||||||
|
"charset": "^1.0.1",
|
||||||
"cheerio": "0.22.0",
|
"cheerio": "0.22.0",
|
||||||
|
"iconv-lite": "^0.6.3",
|
||||||
"lodash": "4.17.21",
|
"lodash": "4.17.21",
|
||||||
"metascraper": "5.31.1",
|
"metascraper": "5.31.1",
|
||||||
"metascraper-author": "5.31.1",
|
"metascraper-author": "5.31.1",
|
||||||
|
@ -8799,6 +8799,11 @@ charm@^1.0.0:
|
|||||||
dependencies:
|
dependencies:
|
||||||
inherits "^2.0.1"
|
inherits "^2.0.1"
|
||||||
|
|
||||||
|
charset@^1.0.1:
|
||||||
|
version "1.0.1"
|
||||||
|
resolved "https://registry.yarnpkg.com/charset/-/charset-1.0.1.tgz#8d59546c355be61049a8fa9164747793319852bd"
|
||||||
|
integrity sha512-6dVyOOYjpfFcL1Y4qChrAoQLRHvj2ziyhcm0QJlhOcAhykL/k1kTUPbeo+87MNRTRdk2OIIsIXbuF3x2wi5EXg==
|
||||||
|
|
||||||
chart.js@^2.9.0:
|
chart.js@^2.9.0:
|
||||||
version "2.9.4"
|
version "2.9.4"
|
||||||
resolved "https://registry.yarnpkg.com/chart.js/-/chart.js-2.9.4.tgz#0827f9563faffb2dc5c06562f8eb10337d5b9684"
|
resolved "https://registry.yarnpkg.com/chart.js/-/chart.js-2.9.4.tgz#0827f9563faffb2dc5c06562f8eb10337d5b9684"
|
||||||
|
Loading…
Reference in New Issue
Block a user