Made server unreachable and maintenance error request retries application-wide

closes https://github.com/TryGhost/Team/issues/837

We previously added automatic retries to the editor controller for post saves; reviewing the resulting logs in Sentry we can see this stopped the "Server unreachable" error alerts showing to users because the requests typically succeeded on the first retry that was made 5 seconds later. However the problem is not limited to post saves and we can see other requests hitting the same issue, including when working in the editor such as adding embed cards, uploading images, or fetching member counts before publishing.

All of the API network requests we make in Admin run through an `ajax` service that makes and handles the request/response. By moving the retry logic for specific errors out of the editor controller and into the ajax service we can make temporary connection handling more graceful across the app.

- move retry behaviour from the editor controller to the `ajax` service so we can retry any request rather than just post save requests
- speed up retries so we reconnect as soon as possible
  - first retry at 500ms, then every 1000ms (previous was every 5s which meant overly long waits)
- reduce total retry time from >30s to 15s
- improve reporting to Sentry
  - report when a retry was required
  - report when a retry failed
  - include the total time taken for both success and failure reports
  - include the `server` header value from requests to distinguish between CDNs
  - include type of error so we can distinguish "server unreachable" from "maintenance" retries
This commit is contained in:
Kevin Ansfield 2021-06-30 14:51:40 +01:00
parent 91ec4435b5
commit 73d4ac8a26
2 changed files with 83 additions and 40 deletions

View File

@ -7,13 +7,12 @@ import moment from 'moment';
import {action, computed} from '@ember/object';
import {alias, mapBy} from '@ember/object/computed';
import {capitalize} from '@ember/string';
import {captureException, captureMessage} from '@sentry/browser';
import {inject as controller} from '@ember/controller';
import {get} from '@ember/object';
import {htmlSafe} from '@ember/template';
import {isBlank} from '@ember/utils';
import {isArray as isEmberArray} from '@ember/array';
import {isHostLimitError, isMaintenanceError, isServerUnreachableError} from 'ghost-admin/services/ajax';
import {isHostLimitError, isServerUnreachableError} from 'ghost-admin/services/ajax';
import {isInvalidError} from 'ember-ajax/errors';
import {isVersionMismatchError} from 'ghost-admin/services/ajax';
import {inject as service} from '@ember/service';
@ -603,44 +602,22 @@ export default Controller.extend({
_savePostTask: task(function* (options) {
let {post} = this;
// retry save every 5 seconds for a total of 30secs
// only retry if we get a ServerUnreachable error (code 0) from the browser or a MaintenanceError from Ghost
let attempts = 0;
const maxAttempts = 6;
const startTime = moment();
const retryErrorChecks = [isServerUnreachableError, isMaintenanceError];
let success = false;
try {
yield post.save(options);
} catch (error) {
if (isServerUnreachableError(error)) {
const [prevStatus, newStatus] = this.post.changedAttributes().status || [this.post.status, this.post.status];
this._showErrorAlert(prevStatus, newStatus, error);
while (attempts < maxAttempts && !success) {
try {
yield post.save(options);
success = true;
this.notifications.closeAlerts('post.save');
if (attempts !== 0 && this.config.get('sentry_dsn')) {
let totalSeconds = moment().diff(startTime, 'seconds');
captureMessage('Saving post required multiple attempts', {extra: {attempts, totalSeconds}});
}
} catch (error) {
attempts += 1;
if (retryErrorChecks.some(check => check(error)) && attempts < maxAttempts) {
yield timeout(5 * 1000);
} else if (isServerUnreachableError(error)) {
const [prevStatus, newStatus] = this.post.changedAttributes().status || [this.post.status, this.post.status];
this._showErrorAlert(prevStatus, newStatus, error);
if (this.config.get('sentry_dsn')) {
captureException(error);
}
// simulate a validation error so we don't end up on a 500 screen
throw undefined;
} else {
throw error;
}
// simulate a validation error so we don't end up on a 500 screen
throw undefined;
}
throw error;
}
this.notifications.closeAlerts('post.save');
// remove any unsaved tags
// NOTE: `updateTags` changes `hasDirtyAttributes => true`.
// For a saved post it would otherwise be false.

View File

@ -1,10 +1,13 @@
import AjaxService from 'ember-ajax/services/ajax';
import config from 'ghost-admin/config/environment';
import moment from 'moment';
import {AjaxError, isAjaxError} from 'ember-ajax/errors';
import {captureMessage} from '@sentry/browser';
import {get} from '@ember/object';
import {isArray as isEmberArray} from '@ember/array';
import {isNone} from '@ember/utils';
import {inject as service} from '@ember/service';
import {timeout} from 'ember-concurrency';
const JSON_CONTENT_TYPE = 'application/json';
const GHOST_REQUEST = /\/ghost\/api\//;
@ -158,6 +161,7 @@ export function isAcceptedResponse(errorOrStatus) {
}
let ajaxService = AjaxService.extend({
config: service(),
session: service(),
// flag to tell our ESA authenticator not to try an invalidate DELETE request
@ -180,9 +184,9 @@ let ajaxService = AjaxService.extend({
}
},
// ember-ajax recognises `application/vnd.api+json` as a JSON-API request
// and formats appropriately, we want to handle `application/json` the same
_makeRequest(hash) {
async _makeRequest(hash) {
// ember-ajax recognises `application/vnd.api+json` as a JSON-API request
// and formats appropriately, we want to handle `application/json` the same
if (isJSONContentType(hash.contentType) && hash.type !== 'GET') {
if (typeof hash.data === 'object') {
hash.data = JSON.stringify(hash.data);
@ -191,7 +195,64 @@ let ajaxService = AjaxService.extend({
hash.withCredentials = true;
return this._super(hash);
// attempt retries for 15 seconds in two situations:
// 1. Server Unreachable error from the browser (code 0), typically from short internet blips
// 2. Maintenance error from Ghost, upgrade in progress so API is temporarily unavailable
let success = false;
let errorName = null;
let attempts = 0;
let startTime = new Date();
let retryingMs = 0;
const maxRetryingMs = 15_000;
const retryPeriods = [500, 1000];
const retryErrorChecks = [this.isServerUnreachableError, this.isMaintenanceError];
const getErrorData = () => {
const data = {
errorName,
attempts,
totalSeconds: moment().diff(moment(startTime), 'seconds')
};
if (this._responseServer) {
data.server = this._responseServer;
}
return data;
};
const makeRequest = this._super.bind(this);
while (retryingMs <= maxRetryingMs && !success) {
try {
const result = await makeRequest(hash);
success = true;
if (attempts !== 0 && this.config.get('sentry_dsn')) {
captureMessage('Request took multiple attempts', {extra: getErrorData()});
}
return result;
} catch (error) {
errorName = error.response?.constructor?.name;
retryingMs = (new Date()) - startTime;
// avoid retries in tests because it slows things down and is not expected in mocks
// isTesting can be overridden in individual tests if required
if (this.isTesting) {
throw error;
}
if (retryErrorChecks.some(check => check(error.response)) && retryingMs <= maxRetryingMs) {
await timeout(retryPeriods[attempts] || retryPeriods[retryPeriods.length - 1]);
attempts += 1;
} else if (attempts > 0 && this.config.get('sentry_dsn')) {
captureMessage('Request failed after multiple attempts', {extra: getErrorData()});
throw error;
} else {
throw error;
}
}
}
},
handleResponse(status, headers, payload, request) {
@ -219,6 +280,11 @@ let ajaxService = AjaxService.extend({
let isAuthenticated = this.get('session.isAuthenticated');
let isUnauthorized = this.isUnauthorizedError(status, headers, payload);
// used when reporting connection errors, helps distinguish CDN
if (isGhostRequest) {
this._responseServer = headers.server;
}
if (isAuthenticated && isGhostRequest && isUnauthorized) {
this.skipSessionDeletion = true;
this.session.invalidate();