diff --git a/ghost/core/test/e2e-api/admin/oembed.test.js b/ghost/core/test/e2e-api/admin/oembed.test.js index ec8484ea03..54db7dacaf 100644 --- a/ghost/core/test/e2e-api/admin/oembed.test.js +++ b/ghost/core/test/e2e-api/admin/oembed.test.js @@ -86,7 +86,7 @@ describe('Oembed API', function () { it('falls back to bookmark without ?type=embed and no oembed metatag', async function () { const pageMock = nock('http://example.com') .get('/') - .times(2) // 1st = oembed metatag check, 2nd = metascraper + .times(1) // url should not be fetched twice .reply( 200, 'TESTING', @@ -550,7 +550,6 @@ describe('Oembed API', function () { it('falls back to bookmark card for WP oembeds', async function () { const pageMock = nock('http://test.com') .get('/') - .twice() // oembed fetch then bookmark fetch .reply( 200, 'TESTING', @@ -574,5 +573,57 @@ describe('Oembed API', function () { pageMock.isDone().should.be.true(); oembedMock.isDone().should.be.false(); }); + + it('decodes non utf-8 charsets', async function () { + const utfString = '中国abc'; + const encodedBytes = [0xd6,0xd0,0xb9,0xfa,0x61,0x62,0x63]; + const replyBuffer = Buffer.concat([ + Buffer.from(''), + Buffer.from(encodedBytes), + Buffer.from('') + ]); + + const pageMock = nock('http://example.com') + .get('/') + .reply( + 200, + replyBuffer, + {'content-type': 'text/html'} + ); + + const url = encodeURIComponent(' http://example.com\t '); // Whitespaces are to make sure urls are trimmed + const res = await request.get(localUtils.API.getApiQuery(`oembed/?url=${url}&type=bookmark`)) + .set('Origin', config.get('url')) + .expect('Content-Type', /json/) + .expect('Cache-Control', testUtils.cacheRules.private) + .expect(200); + + pageMock.isDone().should.be.true(); + res.body.type.should.eql('bookmark'); + res.body.url.should.eql('http://example.com'); + res.body.metadata.title.should.eql(utfString); + }); + + it('does not fail on unknown charset', async function () { + const pageMock = nock('http://example.com') + .get('/') + .reply( + 200, + 'TESTING', + {'content-type': 'text/html'} + ); + + const url = encodeURIComponent(' http://example.com\t '); // Whitespaces are to make sure urls are trimmed + const res = await request.get(localUtils.API.getApiQuery(`oembed/?url=${url}&type=bookmark`)) + .set('Origin', config.get('url')) + .expect('Content-Type', /json/) + .expect('Cache-Control', testUtils.cacheRules.private) + .expect(200); + + pageMock.isDone().should.be.true(); + res.body.type.should.eql('bookmark'); + res.body.url.should.eql('http://example.com'); + res.body.metadata.title.should.eql('TESTING'); + }); }); }); diff --git a/ghost/oembed-service/lib/oembed-service.js b/ghost/oembed-service/lib/oembed-service.js index c780cf5e3c..77fe8ba181 100644 --- a/ghost/oembed-service/lib/oembed-service.js +++ b/ghost/oembed-service/lib/oembed-service.js @@ -5,6 +5,8 @@ const {extract, hasProvider} = require('oembed-parser'); const cheerio = require('cheerio'); const _ = require('lodash'); const {CookieJar} = require('tough-cookie'); +const charset = require('charset'); +const iconv = require('iconv-lite'); const messages = { noUrlProvided: 'No url provided.', @@ -111,7 +113,96 @@ class OEmbed { } } - async fetchBookmarkData(url) { + /** + * @param {string} url + * @param {Object} options + * + * @returns {Promise<{url: string, body: any, headers: any}>} + */ + async fetchPage(url, options) { + const cookieJar = new CookieJar(); + return this.externalRequest( + url, + { + cookieJar, + method: 'GET', + timeout: 2 * 1000, + followRedirect: true, + ...options + }); + } + + /** + * @param {string} url + * + * @returns {Promise<{url: string, body: string}>} + */ + async fetchPageHtml(url) { + // Fetch url and get response as binary buffer to + // avoid implicit cast + const {headers, body, url: responseUrl} = await this.fetchPage( + url, + { + encoding: 'binary', + responseType: 'buffer' + }); + + try { + // Detect page encoding which might not be utf-8 + // and decode content + const encoding = charset( + headers, + body); + + if (encoding === null) { + return { + body: body.toString(), + url: responseUrl + }; + } + + const decodedBody = iconv.decode( + Buffer.from(body, 'binary'), encoding); + + return { + body: decodedBody, + url: responseUrl + }; + } catch (err) { + logging.error(err); + //return non decoded body anyway + return { + body: body.toString(), + url: responseUrl + }; + } + } + + /** + * @param {string} url + * + * @returns {Promise<{url: string, body: Object}>} + */ + async fetchPageJson(url) { + const {body, url: pageUrl} = await this.fetchPage( + url, + { + json: true + }); + + return { + body, + url: pageUrl + }; + } + + /** + * @param {string} url + * @param {string} html + * + * @returns {Promise} + */ + async fetchBookmarkData(url, html) { const metascraper = require('metascraper')([ require('metascraper-url')(), require('metascraper-title')(), @@ -124,11 +215,7 @@ class OEmbed { ]); let scraperResponse; - - const cookieJar = new CookieJar(); - const response = await this.externalRequest(url, {cookieJar}); - - const html = response.body; + try { scraperResponse = await metascraper({html, url}); } catch (err) { @@ -189,39 +276,17 @@ class OEmbed { } /** - * @param {string} _url + * @param {string} url + * @param {string} html * @param {string} [cardType] * * @returns {Promise} */ - async fetchOembedData(_url, cardType) { - // check against known oembed list - let {url, provider} = findUrlWithProvider(_url); - if (provider) { - return this.knownProvider(url); - } - - // url not in oembed list so fetch it in case it's a redirect or has a - // element - const cookieJar = new CookieJar(); - const pageResponse = await this.externalRequest(url, { - method: 'GET', - timeout: 2 * 1000, - followRedirect: true, - cookieJar - }); - // url changed after fetch, see if we were redirected to a known oembed - if (pageResponse.url !== url) { - ({url, provider} = findUrlWithProvider(pageResponse.url)); - if (provider) { - return this.knownProvider(url); - } - } - + async fetchOembedData(url, html, cardType) { // check for element let oembedUrl; try { - oembedUrl = cheerio('link[type="application/json+oembed"]', pageResponse.body).attr('href'); + oembedUrl = cheerio('link[type="application/json+oembed"]', html).attr('href'); } catch (e) { return this.unknownProvider(url); } @@ -234,13 +299,8 @@ class OEmbed { } // fetch oembed response from embedded rel="alternate" url - const oembedResponse = await this.externalRequest(oembedUrl, { - method: 'GET', - json: true, - timeout: 2 * 1000, - followRedirect: true, - cookieJar - }); + const oembedResponse = await this.fetchPageJson(oembedUrl); + // validate the fetched json against the oembed spec to avoid // leaking non-oembed responses const body = oembedResponse.body; @@ -304,17 +364,39 @@ class OEmbed { } } + if (type !== 'bookmark') { + // if not a bookmark request, first + // check against known oembed list + const {url: providerUrl, provider} = findUrlWithProvider(url); + if (provider) { + return this.knownProvider(providerUrl); + } + } + + // Not in the list, we need to fetch the content + const {url: pageUrl, body} = await this.fetchPageHtml(url); + // fetch only bookmark when explicitly requested if (type === 'bookmark') { - return this.fetchBookmarkData(url); + return this.fetchBookmarkData(url, body); } // attempt to fetch oembed - let data = await this.fetchOembedData(url); + + // In case response was a redirect, see if we were + // redirected to a known oembed + if (pageUrl !== url) { + const {url: providerUrl, provider} = findUrlWithProvider(pageUrl); + if (provider) { + return this.knownProvider(providerUrl); + } + } + + let data = await this.fetchOembedData(url, body); // fallback to bookmark when we can't get oembed if (!data && !type) { - data = await this.fetchBookmarkData(url); + data = await this.fetchBookmarkData(url, body); } // couldn't get anything, throw a validation error diff --git a/ghost/oembed-service/package.json b/ghost/oembed-service/package.json index 0076f0ffe7..6d4c5fcfc0 100644 --- a/ghost/oembed-service/package.json +++ b/ghost/oembed-service/package.json @@ -25,7 +25,9 @@ "@tryghost/errors": "1.2.18", "@tryghost/logging": "2.3.2", "@tryghost/tpl": "0.1.18", + "charset": "^1.0.1", "cheerio": "0.22.0", + "iconv-lite": "^0.6.3", "lodash": "4.17.21", "metascraper": "5.31.1", "metascraper-author": "5.31.1", diff --git a/yarn.lock b/yarn.lock index 2e521a026a..5444306587 100644 --- a/yarn.lock +++ b/yarn.lock @@ -8799,6 +8799,11 @@ charm@^1.0.0: dependencies: inherits "^2.0.1" +charset@^1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/charset/-/charset-1.0.1.tgz#8d59546c355be61049a8fa9164747793319852bd" + integrity sha512-6dVyOOYjpfFcL1Y4qChrAoQLRHvj2ziyhcm0QJlhOcAhykL/k1kTUPbeo+87MNRTRdk2OIIsIXbuF3x2wi5EXg== + chart.js@^2.9.0: version "2.9.4" resolved "https://registry.yarnpkg.com/chart.js/-/chart.js-2.9.4.tgz#0827f9563faffb2dc5c06562f8eb10337d5b9684"