diff --git a/ghost/core/test/e2e-api/admin/oembed.test.js b/ghost/core/test/e2e-api/admin/oembed.test.js
index ec8484ea03..54db7dacaf 100644
--- a/ghost/core/test/e2e-api/admin/oembed.test.js
+++ b/ghost/core/test/e2e-api/admin/oembed.test.js
@@ -86,7 +86,7 @@ describe('Oembed API', function () {
it('falls back to bookmark without ?type=embed and no oembed metatag', async function () {
const pageMock = nock('http://example.com')
.get('/')
- .times(2) // 1st = oembed metatag check, 2nd = metascraper
+ .times(1) // url should not be fetched twice
.reply(
200,
'
TESTING',
@@ -550,7 +550,6 @@ describe('Oembed API', function () {
it('falls back to bookmark card for WP oembeds', async function () {
const pageMock = nock('http://test.com')
.get('/')
- .twice() // oembed fetch then bookmark fetch
.reply(
200,
'TESTING',
@@ -574,5 +573,57 @@ describe('Oembed API', function () {
pageMock.isDone().should.be.true();
oembedMock.isDone().should.be.false();
});
+
+ it('decodes non utf-8 charsets', async function () {
+ const utfString = '中国abc';
+ const encodedBytes = [0xd6,0xd0,0xb9,0xfa,0x61,0x62,0x63]; // utfString encoded as gb2312
+ const replyBuffer = Buffer.concat([
+ Buffer.from('<html><head><meta charset="gb2312"><title>'),
+ Buffer.from(encodedBytes),
+ Buffer.from('</title></head><body></body></html>')
+ ]);
+
+ const pageMock = nock('http://example.com')
+ .get('/')
+ .reply(
+ 200,
+ replyBuffer,
+ {'content-type': 'text/html'} // no charset in the header: it must be read from the <meta> tag
+ );
+
+ const url = encodeURIComponent(' http://example.com\t '); // Surrounding whitespace checks that urls are trimmed
+ const res = await request.get(localUtils.API.getApiQuery(`oembed/?url=${url}&type=bookmark`))
+ .set('Origin', config.get('url'))
+ .expect('Content-Type', /json/)
+ .expect('Cache-Control', testUtils.cacheRules.private)
+ .expect(200);
+
+ pageMock.isDone().should.be.true();
+ res.body.type.should.eql('bookmark');
+ res.body.url.should.eql('http://example.com');
+ res.body.metadata.title.should.eql(utfString); // title bytes must have been decoded from gb2312
+ });
+
+ it('does not fail on unknown charset', async function () {
+ const pageMock = nock('http://example.com')
+ .get('/')
+ .reply(
+ 200,
+ '<html><head><meta charset="notacharset"><title>TESTING</title></head><body></body></html>',
+ {'content-type': 'text/html'} // no charset in the header: only the bogus <meta> value is declared
+ );
+
+ const url = encodeURIComponent(' http://example.com\t '); // Surrounding whitespace checks that urls are trimmed
+ const res = await request.get(localUtils.API.getApiQuery(`oembed/?url=${url}&type=bookmark`))
+ .set('Origin', config.get('url'))
+ .expect('Content-Type', /json/)
+ .expect('Cache-Control', testUtils.cacheRules.private)
+ .expect(200);
+
+ pageMock.isDone().should.be.true();
+ res.body.type.should.eql('bookmark');
+ res.body.url.should.eql('http://example.com');
+ res.body.metadata.title.should.eql('TESTING'); // undecoded body is still usable for metadata
+ });
});
});
diff --git a/ghost/oembed-service/lib/oembed-service.js b/ghost/oembed-service/lib/oembed-service.js
index c780cf5e3c..77fe8ba181 100644
--- a/ghost/oembed-service/lib/oembed-service.js
+++ b/ghost/oembed-service/lib/oembed-service.js
@@ -5,6 +5,8 @@ const {extract, hasProvider} = require('oembed-parser');
const cheerio = require('cheerio');
const _ = require('lodash');
const {CookieJar} = require('tough-cookie');
+const charset = require('charset');
+const iconv = require('iconv-lite');
const messages = {
noUrlProvided: 'No url provided.',
@@ -111,7 +113,96 @@ class OEmbed {
}
}
- async fetchBookmarkData(url) {
+ /** Requests `url` through `this.externalRequest` using a fresh cookie jar.
+ * @param {string} url - passed to `externalRequest` unchanged
+ * @param {Object} options - spread last, so its keys override the defaults set below
+ *
+ * @returns {Promise<{url: string, body: any, headers: any}>} the result of `externalRequest`
+ */
+ async fetchPage(url, options) {
+ const cookieJar = new CookieJar();
+ return this.externalRequest(
+ url,
+ {
+ cookieJar,
+ method: 'GET',
+ timeout: 2 * 1000, // presumably milliseconds (2s) - confirm against externalRequest
+ followRedirect: true,
+ ...options
+ });
+ }
+
+ /** Fetches a page and returns its body as a string, decoded with the charset detected for the page.
+ * @param {string} url - page to fetch via `fetchPage`
+ *
+ * @returns {Promise<{url: string, body: string}>} body text plus the `url` reported by the response
+ */
+ async fetchPageHtml(url) {
+ // Ask for the raw response bytes so the body is not implicitly
+ // decoded before the page's own charset has been detected
+ const {headers, body, url: responseUrl} = await this.fetchPage(
+ url,
+ {
+ encoding: 'binary',
+ responseType: 'buffer'
+ });
+
+ try {
+ // Detect the page encoding (it might not be utf-8) from the
+ // response headers and body, then decode the content with it
+ const encoding = charset(
+ headers,
+ body);
+
+ if (encoding === null) { // nothing detected: return the body via plain toString()
+ return {
+ body: body.toString(),
+ url: responseUrl
+ };
+ }
+
+ const decodedBody = iconv.decode(
+ Buffer.from(body, 'binary'), encoding);
+
+ return {
+ body: decodedBody,
+ url: responseUrl
+ };
+ } catch (err) {
+ logging.error(err);
+ // Detection or decoding failed: return the undecoded body anyway
+ return {
+ body: body.toString(),
+ url: responseUrl
+ };
+ }
+ }
+
+ /** Fetches `url` via `fetchPage` with `json: true`, returning only the body and url (headers are dropped).
+ * @param {string} url - endpoint to fetch
+ *
+ * @returns {Promise<{url: string, body: Object}>} response body plus the `url` reported by the response
+ */
+ async fetchPageJson(url) {
+ const {body, url: pageUrl} = await this.fetchPage(
+ url,
+ {
+ json: true
+ });
+
+ return {
+ body,
+ url: pageUrl
+ };
+ }
+
+ /**
+ * @param {string} url
+ * @param {string} html
+ *
+ * @returns {Promise