proxy: make proxyCheckInterval and proxyTimeout configurable

make proxyCheckInterval and proxyTimeout configurable in the cluster spec.

The proxy will publish its current proxyTimeout so the sentinel knows for how
long it should consider the proxy active.
Simone Gotti 2020-02-17 12:04:27 +01:00
parent 1305446441
commit 9cc800de05
9 changed files with 153 additions and 77 deletions
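
In short, the contract this change establishes: the proxy stores its ProxyInfo with a TTL of twice its current proxyTimeout, and the sentinel drops a proxy from the active set once that info is older than twice the timeout the proxy itself published. A runnable, simplified sketch of the sentinel's side of the rule (proxyInfo and isActive are illustrative stand-ins, not code from this commit):

package main

import (
	"fmt"
	"time"
)

// proxyInfo mirrors the fields of cluster.ProxyInfo relevant here (simplified).
type proxyInfo struct {
	UID          string
	ProxyTimeout time.Duration // timeout published by the proxy itself
}

// isActive applies the sentinel's liveness rule: a proxy whose info hasn't
// been refreshed within twice its own published timeout is considered gone.
func isActive(pi proxyInfo, lastSeen, now time.Time) bool {
	return now.Sub(lastSeen) <= 2*pi.ProxyTimeout
}

func main() {
	pi := proxyInfo{UID: "proxy1", ProxyTimeout: 15 * time.Second}
	now := time.Now()
	fmt.Println(isActive(pi, now.Add(-20*time.Second), now)) // true: 20s <= 30s
	fmt.Println(isActive(pi, now.Add(-40*time.Second), now)) // false: 40s > 30s
}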


@@ -89,6 +89,10 @@ type ClusterChecker struct {
endPollonProxyCh chan error
pollonMutex sync.Mutex
+ proxyCheckInterval time.Duration
+ proxyTimeout time.Duration
+ configMutex sync.Mutex
}
func NewClusterChecker(uid string, cfg config) (*ClusterChecker, error) {
@@ -104,6 +108,9 @@ func NewClusterChecker(uid string, cfg config) (*ClusterChecker, error) {
stopListening: cfg.stopListening,
e: e,
endPollonProxyCh: make(chan error),
+ proxyCheckInterval: cluster.DefaultProxyCheckInterval,
+ proxyTimeout: cluster.DefaultProxyTimeout,
}, nil
}
@@ -164,15 +171,16 @@ func (c *ClusterChecker) sendPollonConfData(confData pollon.ConfData) {
}
}
- func (c *ClusterChecker) SetProxyInfo(e store.Store, generation int64, ttl time.Duration) error {
+ func (c *ClusterChecker) SetProxyInfo(e store.Store, generation int64, proxyTimeout time.Duration) error {
proxyInfo := &cluster.ProxyInfo{
- InfoUID: common.UID(),
- UID: c.uid,
- Generation: generation,
+ InfoUID: common.UID(),
+ UID: c.uid,
+ Generation: generation,
+ ProxyTimeout: proxyTimeout,
}
log.Debugf("proxyInfo dump: %s", spew.Sdump(proxyInfo))
- if err := c.e.SetProxyInfo(context.TODO(), proxyInfo, ttl); err != nil {
+ if err := c.e.SetProxyInfo(context.TODO(), proxyInfo, 2*proxyTimeout); err != nil {
return err
}
return nil
@@ -205,13 +213,31 @@ func (c *ClusterChecker) Check() error {
return fmt.Errorf("clusterdata validation failed: %v", err)
}
+ cdProxyCheckInterval := cd.Cluster.DefSpec().ProxyCheckInterval.Duration
+ cdProxyTimeout := cd.Cluster.DefSpec().ProxyTimeout.Duration
+ // use the greater of the current proxy timeout and the one defined in the cluster spec if they differ.
+ // this way we update our proxyInfo with a timeout that is greater than or equal to the currently active timeout timer.
+ c.configMutex.Lock()
+ proxyTimeout := c.proxyTimeout
+ if cdProxyTimeout > proxyTimeout {
+ proxyTimeout = cdProxyTimeout
+ }
+ c.configMutex.Unlock()
proxy := cd.Proxy
if proxy == nil {
log.Infow("no proxy object available, closing connections to master")
c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
// ignore errors on setting proxy info
- if err = c.SetProxyInfo(c.e, cluster.NoGeneration, 2*cluster.DefaultProxyTimeoutInterval); err != nil {
+ if err = c.SetProxyInfo(c.e, cluster.NoGeneration, proxyTimeout); err != nil {
log.Errorw("failed to update proxyInfo", zap.Error(err))
+ } else {
+ // update proxyCheckInterval and proxyTimeout only if we successfully updated our proxy info
+ c.configMutex.Lock()
+ c.proxyCheckInterval = cdProxyCheckInterval
+ c.proxyTimeout = cdProxyTimeout
+ c.configMutex.Unlock()
}
return nil
}
@@ -221,8 +247,14 @@ func (c *ClusterChecker) Check() error {
log.Infow("no db object available, closing connections to master", "db", proxy.Spec.MasterDBUID)
c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
// ignore errors on setting proxy info
- if err = c.SetProxyInfo(c.e, proxy.Generation, 2*cluster.DefaultProxyTimeoutInterval); err != nil {
+ if err = c.SetProxyInfo(c.e, proxy.Generation, proxyTimeout); err != nil {
log.Errorw("failed to update proxyInfo", zap.Error(err))
+ } else {
+ // update proxyCheckInterval and proxyTimeout only if we successfully updated our proxy info
+ c.configMutex.Lock()
+ c.proxyCheckInterval = cdProxyCheckInterval
+ c.proxyTimeout = cdProxyTimeout
+ c.configMutex.Unlock()
}
return nil
}
@@ -234,12 +266,18 @@ func (c *ClusterChecker) Check() error {
return nil
}
log.Infow("master address", "address", addr)
- if err = c.SetProxyInfo(c.e, proxy.Generation, 2*cluster.DefaultProxyTimeoutInterval); err != nil {
+ if err = c.SetProxyInfo(c.e, proxy.Generation, proxyTimeout); err != nil {
// if we failed to update our proxy info when a master is defined we
// cannot ignore this error since the sentinel won't know that we exist
// and are sending connections to a master so, when electing a new
// master, it'll not wait for us to close connections to the old one.
return fmt.Errorf("failed to update proxyInfo: %v", err)
+ } else {
+ // update proxyCheckInterval and proxyTimeout only if we successfully updated our proxy info
+ c.configMutex.Lock()
+ c.proxyCheckInterval = cdProxyCheckInterval
+ c.proxyTimeout = cdProxyTimeout
+ c.configMutex.Unlock()
}
// start proxying only if we are inside enabledProxies; this ensures that the
@@ -256,7 +294,9 @@ func (c *ClusterChecker) Check() error {
}
func (c *ClusterChecker) TimeoutChecker(checkOkCh chan struct{}) {
- timeoutTimer := time.NewTimer(cluster.DefaultProxyTimeoutInterval)
+ c.configMutex.Lock()
+ timeoutTimer := time.NewTimer(c.proxyTimeout)
+ c.configMutex.Unlock()
for {
select {
@@ -275,7 +315,10 @@ func (c *ClusterChecker) TimeoutChecker(checkOkCh chan struct{}) {
// ignore if stop succeeded or not due to timer already expired
timeoutTimer.Stop()
- timeoutTimer = time.NewTimer(cluster.DefaultProxyTimeoutInterval)
+ c.configMutex.Lock()
+ timeoutTimer = time.NewTimer(c.proxyTimeout)
+ c.configMutex.Unlock()
}
}
}
@@ -305,7 +348,10 @@ func (c *ClusterChecker) Start() error {
// report that check was ok
checkOkCh <- struct{}{}
}
- timerCh = time.NewTimer(cluster.DefaultProxyCheckInterval).C
+ c.configMutex.Lock()
+ timerCh = time.NewTimer(c.proxyCheckInterval).C
+ c.configMutex.Unlock()
case err := <-c.endPollonProxyCh:
if err != nil {
return fmt.Errorf("proxy error: %v", err)

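The ordering above is deliberate: when proxyTimeout is lowered in the cluster spec, the proxy keeps publishing the old, larger value until it has successfully written its proxyInfo and adopted the new configuration, so the sentinel's expiry window never becomes shorter than the timer the proxy still has armed. A hedged sketch of that selection rule (publishTimeout is a hypothetical helper, not a function from this commit):

package main

import (
	"fmt"
	"time"
)

// publishTimeout returns the timeout the proxy should advertise while a spec
// change is being adopted: never less than the value its timeout timer is
// currently armed with.
func publishTimeout(current, fromSpec time.Duration) time.Duration {
	if fromSpec > current {
		return fromSpec
	}
	return current
}

func main() {
	armed := 15 * time.Second // value the running TimeoutChecker timer still uses
	spec := 10 * time.Second  // lowered value just read from the cluster spec
	// The proxy still advertises 15s; only after a successful SetProxyInfo
	// does it swap in 10s for the following rounds.
	fmt.Println(publishTimeout(armed, spec)) // 15s
}
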

@@ -342,7 +342,7 @@ func (s *Sentinel) activeProxiesInfos(proxiesInfo cluster.ProxiesInfo) cluster.P
for _, pi := range proxiesInfo {
if pih, ok := pihs[pi.UID]; ok {
if pih.ProxyInfo.InfoUID == pi.InfoUID {
- if timer.Since(pih.Timer) > 2*cluster.DefaultProxyTimeoutInterval {
+ if timer.Since(pih.Timer) > 2*pi.ProxyTimeout {
delete(activeProxiesInfo, pi.UID)
}
} else {
@@ -1820,7 +1820,6 @@ func (s *Sentinel) clusterSentinelCheck(pctx context.Context) {
s.sleepInterval = cd.Cluster.DefSpec().SleepInterval.Duration
s.requestTimeout = cd.Cluster.DefSpec().RequestTimeout.Duration
}
- }
log.Debugf("cd dump: %s", spew.Sdump(cd))


@@ -5000,8 +5000,8 @@ func TestUpdateCluster(t *testing.T) {
}
func TestActiveProxiesInfos(t *testing.T) {
- proxyInfo1 := cluster.ProxyInfo{UID: "proxy1", InfoUID: "infoUID1"}
- proxyInfo2 := cluster.ProxyInfo{UID: "proxy2", InfoUID: "infoUID2"}
+ proxyInfo1 := cluster.ProxyInfo{UID: "proxy1", InfoUID: "infoUID1", ProxyTimeout: cluster.DefaultProxyTimeout}
+ proxyInfo2 := cluster.ProxyInfo{UID: "proxy2", InfoUID: "infoUID2", ProxyTimeout: cluster.DefaultProxyTimeout}
proxyInfoWithDifferentInfoUID := cluster.ProxyInfo{UID: "proxy2", InfoUID: "differentInfoUID"}
var secToNanoSecondMultiplier int64 = 1000000000
tests := []struct {
@@ -5033,7 +5033,7 @@ func TestActiveProxiesInfos(t *testing.T) {
expectedProxyInfoHistories: ProxyInfoHistories{"proxy1": &ProxyInfoHistory{ProxyInfo: &proxyInfo1}, "proxy2": &ProxyInfoHistory{ProxyInfo: &proxyInfoWithDifferentInfoUID}},
},
{
name: "should remove from active proxies if is not updated for twice the DefaultProxyTimeoutInterval",
name: "should remove from active proxies if is not updated for twice the DefaultProxyTimeout",
proxyInfoHistories: ProxyInfoHistories{"proxy1": &ProxyInfoHistory{ProxyInfo: &proxyInfo1, Timer: timer.Now() - (3 * 15 * secToNanoSecondMultiplier)}, "proxy2": &ProxyInfoHistory{ProxyInfo: &proxyInfo2, Timer: timer.Now() - (1 * 15 * secToNanoSecondMultiplier)}},
proxiesInfos: cluster.ProxiesInfo{"proxy1": &proxyInfo1, "proxy2": &proxyInfo2},
expectedActiveProxies: cluster.ProxiesInfo{"proxy2": &proxyInfo2},


@@ -50,6 +50,8 @@ type ClusterSpecNoDefaults struct {
DBWaitReadyTimeout *cluster.Duration `json:"dbWaitReadyTimeout,omitempty"`
FailInterval *cluster.Duration `json:"failInterval,omitempty"`
DeadKeeperRemovalInterval *cluster.Duration `json:"deadKeeperRemovalInterval,omitempty"`
+ ProxyCheckInterval *cluster.Duration `json:"proxyCheckInterval,omitempty"`
+ ProxyTimeout *cluster.Duration `json:"proxyTimeout,omitempty"`
MaxStandbys *uint16 `json:"maxStandbys,omitempty"`
MaxStandbysPerSender *uint16 `json:"maxStandbysPerSender,omitempty"`
MaxStandbyLag *uint32 `json:"maxStandbyLag,omitempty"`
@@ -81,6 +83,8 @@ type ClusterSpecDefaults struct {
DBWaitReadyTimeout *cluster.Duration `json:"dbWaitReadyTimeout"`
FailInterval *cluster.Duration `json:"failInterval"`
DeadKeeperRemovalInterval *cluster.Duration `json:"deadKeeperRemovalInterval"`
+ ProxyCheckInterval *cluster.Duration `json:"proxyCheckInterval"`
+ ProxyTimeout *cluster.Duration `json:"proxyTimeout"`
MaxStandbys *uint16 `json:"maxStandbys"`
MaxStandbysPerSender *uint16 `json:"maxStandbysPerSender"`
MaxStandbyLag *uint32 `json:"maxStandbyLag"`


@@ -12,34 +12,36 @@ Some options in a running cluster specification can be changed to update the desired state.
### Cluster Specification Format.
- | Name | Description | Required | Type | Default |
- |---------------------------|-------------|---------------------------|-------------------|---------|
- | sleepInterval | interval to wait before next check (for every component: keeper, sentinel, proxy). | no | string (duration) | 5s |
- | requestTimeout | time after which any request (keepers checks from sentinel etc...) will fail. | no | string (duration) | 10s |
- | failInterval | interval after the first fail to declare a keeper as not healthy. | no | string (duration) | 20s |
- | deadKeeperRemovalInterval | interval after which a dead keeper will be removed from the cluster data | no | string (duration) | 48h |
- | maxStandbys | max number of standbys. This needs to be greater enough to cover both standby managed by stolon and additional standbys configured by the user. Its value affect different postgres parameters like max_replication_slots and max_wal_senders. Setting this to a number lower than the sum of stolon managed standbys and user managed standbys will have unpredicatable effects due to problems creating replication slots or replication problems due to exhausted wal senders. | no | uint16 | 20 |
- | maxStandbysPerSender | max number of standbys for every sender. A sender can be a master or another standby (with cascading replication). | no | uint16 | 3 |
- | maxStandbyLag | maximum lag (from the last reported master state, in bytes) that an asynchronous standby can have to be elected in place of a failed master. | no | uint32 | 1MiB |
- | synchronousReplication | use synchronous replication between the master and its standbys | no | bool | false |
- | minSynchronousStandbys | minimum number of required synchronous standbys when synchronous replication is enabled (only set this to a value > 1 when using PostgreSQL >= 9.6) | no | uint16 | 1 |
- | maxSynchronousStandbys | maximum number of required synchronous standbys when synchronous replication is enabled (only set this to a value > 1 when using PostgreSQL >= 9.6) | no | uint16 | 1 |
- | additionalWalSenders | number of additional wal_senders in addition to the ones internally defined by stolon, useful to provide enough wal senders for external standbys (changing this value requires an instance restart) | no | uint16 | 5 |
- | additionalMasterReplicationSlots | a list of additional physical replication slots to be created on the master postgres instance. They will be prefixed with `stolon_` (like internal replication slots used for standby replication) to make them "namespaced" from other replication slots. Replication slots starting with `stolon_` and not defined here (and not used for standby replication) will be dropped from the master instance. | no | []string | null |
- | usePgrewind | try to use pg_rewind for faster instance resyncronization. | no | bool | false |
- | initMode | The cluster initialization mode. Can be *new* or *existing*. *new* means that a new db cluster will be created on a random keeper and the other keepers will sync with it. *existing* means that a keeper (that needs to have an already created db cluster) will be choosed as the initial master and the other keepers will sync with it. In this case the `existingConfig` object needs to be populated. | yes | string | |
- | existingConfig | configuration for initMode of type "existing" | if initMode is "existing" | ExistingConfig | |
- | mergePgParameters | merge pgParameters of the initialized db cluster, useful the retain initdb generated parameters when InitMode is new, retain current parameters when initMode is existing or pitr. | no | bool | true |
- | role | cluster role (master or standby) | no | bool | master |
- | defaultSUReplAccessMode | mode for the default hba rules used for replication by standby keepers (the su and repl auth methods will be the one provided in the keeper command line options). Values can be *all* or *strict*. *all* allow access from all ips, *strict* restrict master access to standby servers ips. | no | string | all |
- | newConfig | configuration for initMode of type "new" | if initMode is "new" | NewConfig | |
- | pitrConfig | configuration for initMode of type "pitr" | if initMode is "pitr" | PITRConfig | |
- | standbyConfig | standby config when the cluster is a standby cluster | if role is "standby" | StandbyConfig | |
- | pgParameters | a map containing the postgres server parameters and their values. The parameters value don't have to be quoted and single quotes don't have to be doubled since this is already done by the keeper when writing the postgresql.conf file | no | map[string]string | |
- | pgHBA | a list containing additional pg_hba.conf entries. They will be added to the pg_hba.conf generated by stolon. **NOTE**: these lines aren't validated so if some of them are wrong postgres will refuse to start or, on reload, will log a warning and ignore the updated pg_hba.conf file | no | []string | null. Will use the default behiavior of accepting connections from all hosts for all dbs and users with md5 password authentication |
- | automaticPgRestart | restart postgres automatically after changing the pgParameters that requires restart. Refer `pending_restart` in [pg_settings](https://www.postgresql.org/docs/9.5/static/view-pg-settings.html) | no | bool | false |
- | dbWaitReadyTimeout | Time to wait for the database to become ready after starting. Increase this value if your Postgres takes longer to boot, e.g. because it has to recover a lot of WAL. | no | string (duration) | 60s |
- | syncTimeout | Time to wait for a database recovery (including the replay of WAL files in case of Point-In-Time-Recovery) | no | string (duration) | 0 (no timeout, waits until recovery has finished) |
+ | Name | Description | Required | Type | Default |
+ |----------------------------------|-------------|---------------------------|-------------------|---------|
+ | sleepInterval | interval to wait before next check (for keepers and sentinels). | no | string (duration) | 5s |
+ | requestTimeout | time after which any request to external resources (store, postgres queries etc.) will fail. | no | string (duration) | 10s |
+ | failInterval | interval after the first fail to declare a keeper as not healthy. | no | string (duration) | 20s |
+ | proxyCheckInterval | interval to wait before the next proxy check. | no | string (duration) | 5s |
+ | proxyTimeout | interval within which a proxy check must successfully complete; otherwise the proxy will close all connections to the master. | no | string (duration) | 15s |
+ | deadKeeperRemovalInterval | interval after which a dead keeper will be removed from the cluster data | no | string (duration) | 48h |
+ | maxStandbys | max number of standbys. This needs to be large enough to cover both standbys managed by stolon and additional standbys configured by the user. Its value affects different postgres parameters like max_replication_slots and max_wal_senders. Setting this to a number lower than the sum of stolon managed standbys and user managed standbys will have unpredictable effects due to problems creating replication slots or replication problems due to exhausted wal senders. | no | uint16 | 20 |
+ | maxStandbysPerSender | max number of standbys for every sender. A sender can be a master or another standby (with cascading replication). | no | uint16 | 3 |
+ | maxStandbyLag | maximum lag (from the last reported master state, in bytes) that an asynchronous standby can have to be elected in place of a failed master. | no | uint32 | 1MiB |
+ | synchronousReplication | use synchronous replication between the master and its standbys | no | bool | false |
+ | minSynchronousStandbys | minimum number of required synchronous standbys when synchronous replication is enabled (only set this to a value > 1 when using PostgreSQL >= 9.6) | no | uint16 | 1 |
+ | maxSynchronousStandbys | maximum number of required synchronous standbys when synchronous replication is enabled (only set this to a value > 1 when using PostgreSQL >= 9.6) | no | uint16 | 1 |
+ | additionalWalSenders | number of additional wal_senders in addition to the ones internally defined by stolon, useful to provide enough wal senders for external standbys (changing this value requires an instance restart) | no | uint16 | 5 |
+ | additionalMasterReplicationSlots | a list of additional physical replication slots to be created on the master postgres instance. They will be prefixed with `stolon_` (like internal replication slots used for standby replication) to make them "namespaced" from other replication slots. Replication slots starting with `stolon_` and not defined here (and not used for standby replication) will be dropped from the master instance. | no | []string | null |
+ | usePgrewind | try to use pg_rewind for faster instance resynchronization. | no | bool | false |
+ | initMode | The cluster initialization mode. Can be *new* or *existing*. *new* means that a new db cluster will be created on a random keeper and the other keepers will sync with it. *existing* means that a keeper (that needs to have an already created db cluster) will be chosen as the initial master and the other keepers will sync with it. In this case the `existingConfig` object needs to be populated. | yes | string | |
+ | existingConfig | configuration for initMode of type "existing" | if initMode is "existing" | ExistingConfig | |
+ | mergePgParameters | merge pgParameters of the initialized db cluster, useful to retain initdb generated parameters when InitMode is new, and to retain current parameters when initMode is existing or pitr. | no | bool | true |
+ | role | cluster role (master or standby) | no | string | master |
+ | defaultSUReplAccessMode | mode for the default hba rules used for replication by standby keepers (the su and repl auth methods will be the ones provided in the keeper command line options). Values can be *all* or *strict*. *all* allows access from all ips, *strict* restricts master access to standby server ips. | no | string | all |
+ | newConfig | configuration for initMode of type "new" | if initMode is "new" | NewConfig | |
+ | pitrConfig | configuration for initMode of type "pitr" | if initMode is "pitr" | PITRConfig | |
+ | standbyConfig | standby config when the cluster is a standby cluster | if role is "standby" | StandbyConfig | |
+ | pgParameters | a map containing the postgres server parameters and their values. The parameter values don't have to be quoted and single quotes don't have to be doubled, since this is already done by the keeper when writing the postgresql.conf file | no | map[string]string | |
+ | pgHBA | a list containing additional pg_hba.conf entries. They will be added to the pg_hba.conf generated by stolon. **NOTE**: these lines aren't validated so if some of them are wrong postgres will refuse to start or, on reload, will log a warning and ignore the updated pg_hba.conf file | no | []string | null. Will use the default behaviour of accepting connections from all hosts for all dbs and users with md5 password authentication |
+ | automaticPgRestart | restart postgres automatically after changing the pgParameters that require a restart. Refer to `pending_restart` in [pg_settings](https://www.postgresql.org/docs/9.5/static/view-pg-settings.html) | no | bool | false |
+ | dbWaitReadyTimeout | Time to wait for the database to become ready after starting. Increase this value if your Postgres takes longer to boot, e.g. because it has to recover a lot of WAL. | no | string (duration) | 60s |
+ | syncTimeout | Time to wait for a database recovery (including the replay of WAL files in case of Point-In-Time-Recovery) | no | string (duration) | 0 (no timeout, waits until recovery has finished) |
#### ExistingConfig
@@ -63,41 +65,41 @@ Some options in a running cluster specification can be changed to update the des
|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|-------------------------|---------|
| dataRestoreCommand | defines the command to execute for restoring the db cluster data. %d is replaced with the full path to the db cluster datadir. Use %% to embed an actual % character. Must return a 0 exit code only on success. | yes | string | |
| archiveRecoverySettings | archive recovery configuration | yes | ArchiveRecoverySettings | |
- | recoveryTargetSettings | recovery target configuration | no | RecoveryTargetSettings | |
+ | recoveryTargetSettings | recovery target configuration | no | RecoveryTargetSettings | |
#### StandbyConfig
- | Name | Description | Required | Type | Default |
- |-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|-------------------------|---------|
- | standbySettings | standby configuration | no | StandbySettings | |
- | archiveRecoverySettings | archive recovery configuration | no | ArchiveRecoverySettings | |
+ | Name | Description | Required | Type | Default |
+ |-------------------------|--------------------------------|----------|-------------------------|---------|
+ | standbySettings | standby configuration | no | StandbySettings | |
+ | archiveRecoverySettings | archive recovery configuration | no | ArchiveRecoverySettings | |
#### ArchiveRecoverySettings
- | Name | Description | Required | Type | Default |
- |-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|-------------------------|---------|
- | restoreCommand | defines the command to execute for restoring the archives. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/archive-recovery-settings.html) | yes | string | |
+ | Name | Description | Required | Type | Default |
+ |----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|--------|---------|
+ | restoreCommand | defines the command to execute for restoring the archives. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/archive-recovery-settings.html) | yes | string | |
#### RecoveryTargetSettings
These parameters are the same as defined in [postgresql recovery target settings doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html)
- | Name | Description | Required | Type | Default |
- |-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------|----------|-------------------------|---------|
- | recoveryTarget | See `recovery_target` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
- | recoveryTargetLsn | See `recovery_target_lsn` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
- | recoveryTargetName | See `recovery_target_name` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
- | recoveryTargetTime | See `recovery_target_time` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
- | recoveryTargetXid | See `recovery_target_xid` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
- | recoveryTargetTimeline | See `recovery_target_timeline` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
+ | Name | Description | Required | Type | Default |
+ |------------------------|----------------------------------------------------------------------------------------------------------------------------------------------|----------|--------|---------|
+ | recoveryTarget | See `recovery_target` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
+ | recoveryTargetLsn | See `recovery_target_lsn` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
+ | recoveryTargetName | See `recovery_target_name` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
+ | recoveryTargetTime | See `recovery_target_time` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
+ | recoveryTargetXid | See `recovery_target_xid` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
+ | recoveryTargetTimeline | See `recovery_target_timeline` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
#### StandbySettings
- | Name | Description | Required | Type | Default |
- |-------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|-------------------------|---------|
- | primaryConnInfo | connection string to connect to the primary server (its value will be placed in the `primary_conninfo` parameter of the instance `recovery.conf` file. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | yes | string | |
- | primarySlotName | optional replication slot to use (its value will be placed in the `primary_slot_name` parameter of the instance `recovery.conf` file. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | no | string | |
- | recoveryMinApplyDelay | delay recovery for a fixed period of time (its value will be placed in the `recovery_min_apply_delay` parameter of the instance `recovery.conf` file. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | no | string | |
+ | Name | Description | Required | Type | Default |
+ |-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|--------|---------|
+ | primaryConnInfo | connection string to connect to the primary server (its value will be placed in the `primary_conninfo` parameter of the instance `recovery.conf` file). See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | yes | string | |
+ | primarySlotName | optional replication slot to use (its value will be placed in the `primary_slot_name` parameter of the instance `recovery.conf` file). See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | no | string | |
+ | recoveryMinApplyDelay | delay recovery for a fixed period of time (its value will be placed in the `recovery_min_apply_delay` parameter of the instance `recovery.conf` file). See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | no | string | |
#### Special Types
duration types (as described in https://golang.org/pkg/time/#ParseDuration) are signed sequence of decimal numbers, each with optional fraction and a unit suffix, such as "300ms", "-1.5h" or "2h45m". Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h".
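
With these defaults in place, the two new options can be changed at runtime by patching the cluster specification, e.g. with stolonctl update --patch. An illustrative patch (the values are examples only):

{
	"proxyCheckInterval": "10s",
	"proxyTimeout": "30s"
}

Note that the validation added by this commit requires proxyCheckInterval to be strictly less than proxyTimeout, so a patch that inverts that ordering will be rejected.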


@@ -43,11 +43,7 @@ const (
)
const (
- DefaultStoreTimeout = 5 * time.Second
- DefaultProxyCheckInterval = 5 * time.Second
- DefaultProxyTimeoutInterval = 15 * time.Second
- DefaultDBWaitReadyTimeout = 60 * time.Second
+ DefaultStoreTimeout = 5 * time.Second
DefaultDBNotIncreasingXLogPosTimes = 10
@@ -56,8 +52,11 @@ const (
DefaultConvergenceTimeout = 30 * time.Second
DefaultInitTimeout = 5 * time.Minute
DefaultSyncTimeout = 0
+ DefaultDBWaitReadyTimeout = 60 * time.Second
DefaultFailInterval = 20 * time.Second
DefaultDeadKeeperRemovalInterval = 48 * time.Hour
+ DefaultProxyCheckInterval = 5 * time.Second
+ DefaultProxyTimeout = 15 * time.Second
DefaultMaxStandbys uint16 = 20
DefaultMaxStandbysPerSender uint16 = 3
DefaultMaxStandbyLag = 1024 * 1204
@@ -228,6 +227,10 @@ type ClusterSpec struct {
FailInterval *Duration `json:"failInterval,omitempty"`
// Interval after which a dead keeper will be removed from the cluster data
DeadKeeperRemovalInterval *Duration `json:"deadKeeperRemovalInterval,omitempty"`
+ // Interval to wait before the next proxy check
+ ProxyCheckInterval *Duration `json:"proxyCheckInterval,omitempty"`
+ // Interval within which a proxy check must successfully complete
+ ProxyTimeout *Duration `json:"proxyTimeout,omitempty"`
// Max number of standbys. This needs to be greater enough to cover both
// standby managed by stolon and additional standbys configured by the
// user. Its value affect different postgres parameters like
@@ -364,6 +367,12 @@ func (os *ClusterSpec) WithDefaults() *ClusterSpec {
if s.DeadKeeperRemovalInterval == nil {
s.DeadKeeperRemovalInterval = &Duration{Duration: DefaultDeadKeeperRemovalInterval}
}
+ if s.ProxyCheckInterval == nil {
+ s.ProxyCheckInterval = &Duration{Duration: DefaultProxyCheckInterval}
+ }
+ if s.ProxyTimeout == nil {
+ s.ProxyTimeout = &Duration{Duration: DefaultProxyTimeout}
+ }
if s.MaxStandbys == nil {
s.MaxStandbys = Uint16P(DefaultMaxStandbys)
}
@@ -426,11 +435,20 @@ func (os *ClusterSpec) Validate() error {
if s.DBWaitReadyTimeout.Duration < 0 {
return fmt.Errorf("dbWaitReadyTimeout must be positive")
}
+ if s.FailInterval.Duration < 0 {
+ return fmt.Errorf("failInterval must be positive")
+ }
if s.DeadKeeperRemovalInterval.Duration < 0 {
return fmt.Errorf("deadKeeperRemovalInterval must be positive")
}
- if s.FailInterval.Duration < 0 {
- return fmt.Errorf("failInterval must be positive")
- }
+ if s.ProxyCheckInterval.Duration < 0 {
+ return fmt.Errorf("proxyCheckInterval must be positive")
+ }
+ if s.ProxyTimeout.Duration < 0 {
+ return fmt.Errorf("proxyTimeout must be positive")
+ }
+ if s.ProxyCheckInterval.Duration >= s.ProxyTimeout.Duration {
+ return fmt.Errorf("proxyCheckInterval should be less than proxyTimeout")
+ }
if *s.MaxStandbys < 1 {
return fmt.Errorf("maxStandbys must be at least 1")


@@ -16,6 +16,7 @@ package cluster
import (
"reflect"
"time"
"github.com/sorintlab/stolon/internal/common"
@@ -133,6 +134,12 @@ type ProxyInfo struct {
UID string
Generation int64
+ // ProxyTimeout is the current proxyTimeout used by the proxy
+ // at the time of publishing its state.
+ // It's used by the sentinel to know for how long the proxy
+ // should be considered active.
+ ProxyTimeout time.Duration
}
type ProxiesInfo map[string]*ProxyInfo


@@ -151,7 +151,7 @@ func TestProxyListening(t *testing.T) {
}
// tp should not listen because it cannot talk with the store
- if err := tp.WaitNotListening(cluster.DefaultProxyTimeoutInterval * 2); err != nil {
+ if err := tp.WaitNotListening(cluster.DefaultProxyTimeout * 2); err != nil {
t.Fatalf("expecting tp not listening due to failed store communication, but it's listening.")
}
@@ -174,8 +174,8 @@ func TestProxyListening(t *testing.T) {
if err := tstore.WaitDown(10 * time.Second); err != nil {
t.Fatalf("error waiting on store down: %v", err)
}
- // wait less than DefaultProxyTimeoutInterval
- time.Sleep(cluster.DefaultProxyTimeoutInterval / 3)
+ // wait less than DefaultProxyTimeout
+ time.Sleep(cluster.DefaultProxyTimeout / 3)
// Start store
if err := tstore.Start(); err != nil {
t.Fatalf("unexpected err: %v", err)
@@ -239,7 +239,7 @@ func TestProxyListening(t *testing.T) {
}
// tp should not listen because it cannot talk with the store
- if err := tp.WaitNotListening(cluster.DefaultProxyTimeoutInterval * 2); err != nil {
+ if err := tp.WaitNotListening(cluster.DefaultProxyTimeout * 2); err != nil {
t.Fatalf("expecting tp not listening due to failed store communication, but it's listening.")
}


@@ -152,7 +152,7 @@ func TestSentinelEnabledProxies(t *testing.T) {
t.Fatalf("unexpected err: %v", err)
}
- if err := WaitClusterDataEnabledProxiesNum(sm, 1, 3*cluster.DefaultProxyTimeoutInterval); err != nil {
+ if err := WaitClusterDataEnabledProxiesNum(sm, 1, 3*cluster.DefaultProxyTimeout); err != nil {
t.Fatalf("unexpected err: %v", err)
}