From 9cc800de058a9e282ae77dc851146007c23aaa2b Mon Sep 17 00:00:00 2001 From: Simone Gotti Date: Mon, 17 Feb 2020 12:04:27 +0100 Subject: [PATCH] proxy: make proxyCheckInterval and proxyTimeout configurable make proxyCheckInterval and proxyTimeout configurable in the cluster spec. The proxy will publish its current proxyTimeout so the sentinel will know when to consider it as active. --- cmd/proxy/cmd/proxy.go | 68 ++++++++++++++++---- cmd/sentinel/cmd/sentinel.go | 3 +- cmd/sentinel/cmd/sentinel_test.go | 6 +- cmd/stolonctl/cmd/spec.go | 4 ++ doc/cluster_spec.md | 100 +++++++++++++++-------------- internal/cluster/cluster.go | 32 +++++++-- internal/cluster/member.go | 7 ++ tests/integration/proxy_test.go | 8 +-- tests/integration/sentinel_test.go | 2 +- 9 files changed, 153 insertions(+), 77 deletions(-) diff --git a/cmd/proxy/cmd/proxy.go b/cmd/proxy/cmd/proxy.go index d165038..39f1f3b 100644 --- a/cmd/proxy/cmd/proxy.go +++ b/cmd/proxy/cmd/proxy.go @@ -89,6 +89,10 @@ type ClusterChecker struct { endPollonProxyCh chan error pollonMutex sync.Mutex + + proxyCheckInterval time.Duration + proxyTimeout time.Duration + configMutex sync.Mutex } func NewClusterChecker(uid string, cfg config) (*ClusterChecker, error) { @@ -104,6 +108,9 @@ func NewClusterChecker(uid string, cfg config) (*ClusterChecker, error) { stopListening: cfg.stopListening, e: e, endPollonProxyCh: make(chan error), + + proxyCheckInterval: cluster.DefaultProxyCheckInterval, + proxyTimeout: cluster.DefaultProxyTimeout, }, nil } @@ -164,15 +171,16 @@ func (c *ClusterChecker) sendPollonConfData(confData pollon.ConfData) { } } -func (c *ClusterChecker) SetProxyInfo(e store.Store, generation int64, ttl time.Duration) error { +func (c *ClusterChecker) SetProxyInfo(e store.Store, generation int64, proxyTimeout time.Duration) error { proxyInfo := &cluster.ProxyInfo{ - InfoUID: common.UID(), - UID: c.uid, - Generation: generation, + InfoUID: common.UID(), + UID: c.uid, + Generation: generation, + ProxyTimeout: proxyTimeout, } log.Debugf("proxyInfo dump: %s", spew.Sdump(proxyInfo)) - if err := c.e.SetProxyInfo(context.TODO(), proxyInfo, ttl); err != nil { + if err := c.e.SetProxyInfo(context.TODO(), proxyInfo, 2*proxyTimeout); err != nil { return err } return nil @@ -205,13 +213,31 @@ func (c *ClusterChecker) Check() error { return fmt.Errorf("clusterdata validation failed: %v", err) } + cdProxyCheckInterval := cd.Cluster.DefSpec().ProxyCheckInterval.Duration + cdProxyTimeout := cd.Cluster.DefSpec().ProxyTimeout.Duration + + // use the greater between the current proxy timeout and the one defined in the cluster spec if they're different. + // in this way we're updating our proxyInfo using a timeout that is greater or equal the current active timeout timer. 
+ c.configMutex.Lock() + proxyTimeout := c.proxyTimeout + if cdProxyTimeout > proxyTimeout { + proxyTimeout = cdProxyTimeout + } + c.configMutex.Unlock() + proxy := cd.Proxy if proxy == nil { log.Infow("no proxy object available, closing connections to master") c.sendPollonConfData(pollon.ConfData{DestAddr: nil}) // ignore errors on setting proxy info - if err = c.SetProxyInfo(c.e, cluster.NoGeneration, 2*cluster.DefaultProxyTimeoutInterval); err != nil { + if err = c.SetProxyInfo(c.e, cluster.NoGeneration, proxyTimeout); err != nil { log.Errorw("failed to update proxyInfo", zap.Error(err)) + } else { + // update proxyCheckInterval and proxyTimeout only if we successfully updated our proxy info + c.configMutex.Lock() + c.proxyCheckInterval = cdProxyCheckInterval + c.proxyTimeout = cdProxyTimeout + c.configMutex.Unlock() } return nil } @@ -221,8 +247,14 @@ func (c *ClusterChecker) Check() error { log.Infow("no db object available, closing connections to master", "db", proxy.Spec.MasterDBUID) c.sendPollonConfData(pollon.ConfData{DestAddr: nil}) // ignore errors on setting proxy info - if err = c.SetProxyInfo(c.e, proxy.Generation, 2*cluster.DefaultProxyTimeoutInterval); err != nil { + if err = c.SetProxyInfo(c.e, proxy.Generation, proxyTimeout); err != nil { log.Errorw("failed to update proxyInfo", zap.Error(err)) + } else { + // update proxyCheckInterval and proxyTimeout only if we successfully updated our proxy info + c.configMutex.Lock() + c.proxyCheckInterval = cdProxyCheckInterval + c.proxyTimeout = cdProxyTimeout + c.configMutex.Unlock() } return nil } @@ -234,12 +266,18 @@ func (c *ClusterChecker) Check() error { return nil } log.Infow("master address", "address", addr) - if err = c.SetProxyInfo(c.e, proxy.Generation, 2*cluster.DefaultProxyTimeoutInterval); err != nil { + if err = c.SetProxyInfo(c.e, proxy.Generation, proxyTimeout); err != nil { // if we failed to update our proxy info when a master is defined we // cannot ignore this error since the sentinel won't know that we exist // and are sending connections to a master so, when electing a new // master, it'll not wait for us to close connections to the old one.
return fmt.Errorf("failed to update proxyInfo: %v", err) + } else { + // update proxyCheckInterval and proxyTimeout only if we successfully updated our proxy info + c.configMutex.Lock() + c.proxyCheckInterval = cdProxyCheckInterval + c.proxyTimeout = cdProxyTimeout + c.configMutex.Unlock() } // start proxing only if we are inside enabledProxies, this ensures that the @@ -256,7 +294,9 @@ func (c *ClusterChecker) Check() error { } func (c *ClusterChecker) TimeoutChecker(checkOkCh chan struct{}) { - timeoutTimer := time.NewTimer(cluster.DefaultProxyTimeoutInterval) + c.configMutex.Lock() + timeoutTimer := time.NewTimer(c.proxyTimeout) + c.configMutex.Unlock() for { select { @@ -275,7 +315,10 @@ func (c *ClusterChecker) TimeoutChecker(checkOkCh chan struct{}) { // ignore if stop succeeded or not due to timer already expired timeoutTimer.Stop() - timeoutTimer = time.NewTimer(cluster.DefaultProxyTimeoutInterval) + + c.configMutex.Lock() + timeoutTimer = time.NewTimer(c.proxyTimeout) + c.configMutex.Unlock() } } } @@ -305,7 +348,10 @@ func (c *ClusterChecker) Start() error { // report that check was ok checkOkCh <- struct{}{} } - timerCh = time.NewTimer(cluster.DefaultProxyCheckInterval).C + c.configMutex.Lock() + timerCh = time.NewTimer(c.proxyCheckInterval).C + c.configMutex.Unlock() + case err := <-c.endPollonProxyCh: if err != nil { return fmt.Errorf("proxy error: %v", err) diff --git a/cmd/sentinel/cmd/sentinel.go b/cmd/sentinel/cmd/sentinel.go index ede28cd..be109ad 100644 --- a/cmd/sentinel/cmd/sentinel.go +++ b/cmd/sentinel/cmd/sentinel.go @@ -342,7 +342,7 @@ func (s *Sentinel) activeProxiesInfos(proxiesInfo cluster.ProxiesInfo) cluster.P for _, pi := range proxiesInfo { if pih, ok := pihs[pi.UID]; ok { if pih.ProxyInfo.InfoUID == pi.InfoUID { - if timer.Since(pih.Timer) > 2*cluster.DefaultProxyTimeoutInterval { + if timer.Since(pih.Timer) > 2*pi.ProxyTimeout { delete(activeProxiesInfo, pi.UID) } } else { @@ -1820,7 +1820,6 @@ func (s *Sentinel) clusterSentinelCheck(pctx context.Context) { s.sleepInterval = cd.Cluster.DefSpec().SleepInterval.Duration s.requestTimeout = cd.Cluster.DefSpec().RequestTimeout.Duration } - } log.Debugf("cd dump: %s", spew.Sdump(cd)) diff --git a/cmd/sentinel/cmd/sentinel_test.go b/cmd/sentinel/cmd/sentinel_test.go index 519d09f..c8bb705 100644 --- a/cmd/sentinel/cmd/sentinel_test.go +++ b/cmd/sentinel/cmd/sentinel_test.go @@ -5000,8 +5000,8 @@ func TestUpdateCluster(t *testing.T) { } func TestActiveProxiesInfos(t *testing.T) { - proxyInfo1 := cluster.ProxyInfo{UID: "proxy1", InfoUID: "infoUID1"} - proxyInfo2 := cluster.ProxyInfo{UID: "proxy2", InfoUID: "infoUID2"} + proxyInfo1 := cluster.ProxyInfo{UID: "proxy1", InfoUID: "infoUID1", ProxyTimeout: cluster.DefaultProxyTimeout} + proxyInfo2 := cluster.ProxyInfo{UID: "proxy2", InfoUID: "infoUID2", ProxyTimeout: cluster.DefaultProxyTimeout} proxyInfoWithDifferentInfoUID := cluster.ProxyInfo{UID: "proxy2", InfoUID: "differentInfoUID"} var secToNanoSecondMultiplier int64 = 1000000000 tests := []struct { @@ -5033,7 +5033,7 @@ func TestActiveProxiesInfos(t *testing.T) { expectedProxyInfoHistories: ProxyInfoHistories{"proxy1": &ProxyInfoHistory{ProxyInfo: &proxyInfo1}, "proxy2": &ProxyInfoHistory{ProxyInfo: &proxyInfoWithDifferentInfoUID}}, }, { - name: "should remove from active proxies if is not updated for twice the DefaultProxyTimeoutInterval", + name: "should remove from active proxies if it is not updated for twice the DefaultProxyTimeout", proxyInfoHistories: ProxyInfoHistories{"proxy1": &ProxyInfoHistory{ProxyInfo:
&proxyInfo1, Timer: timer.Now() - (3 * 15 * secToNanoSecondMultiplier)}, "proxy2": &ProxyInfoHistory{ProxyInfo: &proxyInfo2, Timer: timer.Now() - (1 * 15 * secToNanoSecondMultiplier)}}, proxiesInfos: cluster.ProxiesInfo{"proxy1": &proxyInfo1, "proxy2": &proxyInfo2}, expectedActiveProxies: cluster.ProxiesInfo{"proxy2": &proxyInfo2}, diff --git a/cmd/stolonctl/cmd/spec.go b/cmd/stolonctl/cmd/spec.go index d266890..5649c27 100644 --- a/cmd/stolonctl/cmd/spec.go +++ b/cmd/stolonctl/cmd/spec.go @@ -50,6 +50,8 @@ type ClusterSpecNoDefaults struct { DBWaitReadyTimeout *cluster.Duration `json:"dbWaitReadyTimeout,omitempty"` FailInterval *cluster.Duration `json:"failInterval,omitempty"` DeadKeeperRemovalInterval *cluster.Duration `json:"deadKeeperRemovalInterval,omitempty"` + ProxyCheckInterval *cluster.Duration `json:"proxyCheckInterval,omitempty"` + ProxyTimeout *cluster.Duration `json:"proxyTimeout,omitempty"` MaxStandbys *uint16 `json:"maxStandbys,omitempty"` MaxStandbysPerSender *uint16 `json:"maxStandbysPerSender,omitempty"` MaxStandbyLag *uint32 `json:"maxStandbyLag,omitempty"` @@ -81,6 +83,8 @@ type ClusterSpecDefaults struct { DBWaitReadyTimeout *cluster.Duration `json:"dbWaitReadyTimeout"` FailInterval *cluster.Duration `json:"failInterval"` DeadKeeperRemovalInterval *cluster.Duration `json:"deadKeeperRemovalInterval"` + ProxyCheckInterval *cluster.Duration `json:"proxyCheckInterval"` + ProxyTimeout *cluster.Duration `json:"proxyTimeout"` MaxStandbys *uint16 `json:"maxStandbys"` MaxStandbysPerSender *uint16 `json:"maxStandbysPerSender"` MaxStandbyLag *uint32 `json:"maxStandbyLag"` diff --git a/doc/cluster_spec.md b/doc/cluster_spec.md index 8e37f53..92eaff9 100644 --- a/doc/cluster_spec.md +++ b/doc/cluster_spec.md @@ -12,34 +12,36 @@ Some options in a running cluster specification can be changed to update the des ### Cluster Specification Format. -| Name | Description | Required | Type | Default | -|---------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------|-------------------|-------------------------------------------------------------------------------------------------------------------------------------| -| sleepInterval | interval to wait before next check (for every component: keeper, sentinel, proxy). | no | string (duration) | 5s | -| requestTimeout | time after which any request (keepers checks from sentinel etc...) will fail. | no | string (duration) | 10s | -| failInterval | interval after the first fail to declare a keeper as not healthy. | no | string (duration) | 20s | -| deadKeeperRemovalInterval | interval after which a dead keeper will be removed from the cluster data | no | string (duration) | 48h | -| maxStandbys | max number of standbys. This needs to be greater enough to cover both standby managed by stolon and additional standbys configured by the user. Its value affect different postgres parameters like max_replication_slots and max_wal_senders. 
Setting this to a number lower than the sum of stolon managed standbys and user managed standbys will have unpredicatable effects due to problems creating replication slots or replication problems due to exhausted wal senders. | no | uint16 | 20 | -| maxStandbysPerSender | max number of standbys for every sender. A sender can be a master or another standby (with cascading replication). | no | uint16 | 3 | -| maxStandbyLag | maximum lag (from the last reported master state, in bytes) that an asynchronous standby can have to be elected in place of a failed master. | no | uint32 | 1MiB | -| synchronousReplication | use synchronous replication between the master and its standbys | no | bool | false | -| minSynchronousStandbys | minimum number of required synchronous standbys when synchronous replication is enabled (only set this to a value > 1 when using PostgreSQL >= 9.6) | no | uint16 | 1 | -| maxSynchronousStandbys | maximum number of required synchronous standbys when synchronous replication is enabled (only set this to a value > 1 when using PostgreSQL >= 9.6) | no | uint16 | 1 | -| additionalWalSenders | number of additional wal_senders in addition to the ones internally defined by stolon, useful to provide enough wal senders for external standbys (changing this value requires an instance restart) | no | uint16 | 5 | -| additionalMasterReplicationSlots | a list of additional physical replication slots to be created on the master postgres instance. They will be prefixed with `stolon_` (like internal replication slots used for standby replication) to make them "namespaced" from other replication slots. Replication slots starting with `stolon_` and not defined here (and not used for standby replication) will be dropped from the master instance. | no | []string | null | -| usePgrewind | try to use pg_rewind for faster instance resyncronization. | no | bool | false | -| initMode | The cluster initialization mode. Can be *new* or *existing*. *new* means that a new db cluster will be created on a random keeper and the other keepers will sync with it. *existing* means that a keeper (that needs to have an already created db cluster) will be choosed as the initial master and the other keepers will sync with it. In this case the `existingConfig` object needs to be populated. | yes | string | | -| existingConfig | configuration for initMode of type "existing" | if initMode is "existing" | ExistingConfig | | -| mergePgParameters | merge pgParameters of the initialized db cluster, useful the retain initdb generated parameters when InitMode is new, retain current parameters when initMode is existing or pitr. | no | bool | true | -| role | cluster role (master or standby) | no | bool | master | -| defaultSUReplAccessMode | mode for the default hba rules used for replication by standby keepers (the su and repl auth methods will be the one provided in the keeper command line options). Values can be *all* or *strict*. *all* allow access from all ips, *strict* restrict master access to standby servers ips. | no | string | all | -| newConfig | configuration for initMode of type "new" | if initMode is "new" | NewConfig | | -| pitrConfig | configuration for initMode of type "pitr" | if initMode is "pitr" | PITRConfig | | -| standbyConfig | standby config when the cluster is a standby cluster | if role is "standby" | StandbyConfig | | -| pgParameters | a map containing the postgres server parameters and their values. 
The parameters value don't have to be quoted and single quotes don't have to be doubled since this is already done by the keeper when writing the postgresql.conf file | no | map[string]string | | -| pgHBA | a list containing additional pg_hba.conf entries. They will be added to the pg_hba.conf generated by stolon. **NOTE**: these lines aren't validated so if some of them are wrong postgres will refuse to start or, on reload, will log a warning and ignore the updated pg_hba.conf file | no | []string | null. Will use the default behiavior of accepting connections from all hosts for all dbs and users with md5 password authentication | -| automaticPgRestart | restart postgres automatically after changing the pgParameters that requires restart. Refer `pending_restart` in [pg_settings](https://www.postgresql.org/docs/9.5/static/view-pg-settings.html) | no | bool | false | -| dbWaitReadyTimeout | Time to wait for the database to become ready after starting. Increase this value if your Postgres takes longer to boot, e.g. because it has to recover a lot of WAL. | no | string (duration) | 60s | -| syncTimeout | Time to wait for a database recovery (including the replay of WAL files in case of Point-In-Time-Recovery) | no | string (duration) | 0 (no timeout, waits until recovery has finished) | +| Name | Description | Required | Type | Default | +|----------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------|-------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| sleepInterval | interval to wait before next check (for keepers and sentinels). | no | string (duration) | 5s | +| requestTimeout | time after which any request to external resources (store, postgres queries etc...) will fail. | no | string (duration) | 10s | +| failInterval | interval after the first fail to declare a keeper as not healthy. | no | string (duration) | 20s | +| proxyCheckInterval | interval to wait before next proxy check. | no | string (duration) | 5s | +| proxyTimeout | interval where a proxy check must successfully complete or the proxy will close all connections to the master. | no | string (duration) | 15s | +| deadKeeperRemovalInterval | interval after which a dead keeper will be removed from the cluster data | no | string (duration) | 48h | +| maxStandbys | max number of standbys. This needs to be greater enough to cover both standby managed by stolon and additional standbys configured by the user. Its value affect different postgres parameters like max_replication_slots and max_wal_senders. Setting this to a number lower than the sum of stolon managed standbys and user managed standbys will have unpredicatable effects due to problems creating replication slots or replication problems due to exhausted wal senders. | no | uint16 | 20 | +| maxStandbysPerSender | max number of standbys for every sender. A sender can be a master or another standby (with cascading replication). 
| no | uint16 | 3 | +| maxStandbyLag | maximum lag (from the last reported master state, in bytes) that an asynchronous standby can have to be elected in place of a failed master. | no | uint32 | 1MiB | +| synchronousReplication | use synchronous replication between the master and its standbys | no | bool | false | +| minSynchronousStandbys | minimum number of required synchronous standbys when synchronous replication is enabled (only set this to a value > 1 when using PostgreSQL >= 9.6) | no | uint16 | 1 | +| maxSynchronousStandbys | maximum number of required synchronous standbys when synchronous replication is enabled (only set this to a value > 1 when using PostgreSQL >= 9.6) | no | uint16 | 1 | +| additionalWalSenders | number of additional wal_senders in addition to the ones internally defined by stolon, useful to provide enough wal senders for external standbys (changing this value requires an instance restart) | no | uint16 | 5 | +| additionalMasterReplicationSlots | a list of additional physical replication slots to be created on the master postgres instance. They will be prefixed with `stolon_` (like internal replication slots used for standby replication) to make them "namespaced" from other replication slots. Replication slots starting with `stolon_` and not defined here (and not used for standby replication) will be dropped from the master instance. | no | []string | null | +| usePgrewind | try to use pg_rewind for faster instance resyncronization. | no | bool | false | +| initMode | The cluster initialization mode. Can be *new* or *existing*. *new* means that a new db cluster will be created on a random keeper and the other keepers will sync with it. *existing* means that a keeper (that needs to have an already created db cluster) will be choosed as the initial master and the other keepers will sync with it. In this case the `existingConfig` object needs to be populated. | yes | string | | +| existingConfig | configuration for initMode of type "existing" | if initMode is "existing" | ExistingConfig | | +| mergePgParameters | merge pgParameters of the initialized db cluster, useful the retain initdb generated parameters when InitMode is new, retain current parameters when initMode is existing or pitr. | no | bool | true | +| role | cluster role (master or standby) | no | bool | master | +| defaultSUReplAccessMode | mode for the default hba rules used for replication by standby keepers (the su and repl auth methods will be the one provided in the keeper command line options). Values can be *all* or *strict*. *all* allow access from all ips, *strict* restrict master access to standby servers ips. | no | string | all | +| newConfig | configuration for initMode of type "new" | if initMode is "new" | NewConfig | | +| pitrConfig | configuration for initMode of type "pitr" | if initMode is "pitr" | PITRConfig | | +| standbyConfig | standby config when the cluster is a standby cluster | if role is "standby" | StandbyConfig | | +| pgParameters | a map containing the postgres server parameters and their values. The parameters value don't have to be quoted and single quotes don't have to be doubled since this is already done by the keeper when writing the postgresql.conf file | no | map[string]string | | +| pgHBA | a list containing additional pg_hba.conf entries. They will be added to the pg_hba.conf generated by stolon. 
**NOTE**: these lines aren't validated so if some of them are wrong postgres will refuse to start or, on reload, will log a warning and ignore the updated pg_hba.conf file | no | []string | null. Will use the default behaviour of accepting connections from all hosts for all dbs and users with md5 password authentication | +| automaticPgRestart | restart postgres automatically after changing the pgParameters that requires restart. Refer `pending_restart` in [pg_settings](https://www.postgresql.org/docs/9.5/static/view-pg-settings.html) | no | bool | false | +| dbWaitReadyTimeout | Time to wait for the database to become ready after starting. Increase this value if your Postgres takes longer to boot, e.g. because it has to recover a lot of WAL. | no | string (duration) | 60s | +| syncTimeout | Time to wait for a database recovery (including the replay of WAL files in case of Point-In-Time-Recovery) | no | string (duration) | 0 (no timeout, waits until recovery has finished) | #### ExistingConfig @@ -63,41 +65,41 @@ Some options in a running cluster specification can be changed to update the des |-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|-------------------------|---------| | dataRestoreCommand | defines the command to execute for restoring the db cluster data. %d is replaced with the full path to the db cluster datadir. Use %% to embed an actual % character. Must return a 0 exit code only on success. | yes | string | | | archiveRecoverySettings | archive recovery configuration | yes | ArchiveRecoverySettings | | -| recoveryTargetSettings | recovery target configuration | no | RecoveryTargetSettings | | +| recoveryTargetSettings | recovery target configuration | no | RecoveryTargetSettings | | #### StandbyConfig -| Name | Description | Required | Type | Default | -|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|-------------------------|---------| -| standbySettings | standby configuration | no | StandbySettings | | -| archiveRecoverySettings | archive recovery configuration | no | ArchiveRecoverySettings | | +| Name | Description | Required | Type | Default | +|-------------------------|--------------------------------|----------|-------------------------|---------| +| standbySettings | standby configuration | no | StandbySettings | | +| archiveRecoverySettings | archive recovery configuration | no | ArchiveRecoverySettings | | #### ArchiveRecoverySettings -| Name | Description | Required | Type | Default | -|-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|-------------------------|---------| -| restoreCommand | defines the command to execute for restoring the archives. 
See the related [postgresql doc](https://www.postgresql.org/docs/current/static/archive-recovery-settings.html) | yes | string | | +| Name | Description | Required | Type | Default | +|----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|--------|---------| +| restoreCommand | defines the command to execute for restoring the archives. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/archive-recovery-settings.html) | yes | string | | #### RecoveryTargetSettings These parameters are the same as defined in [postgresql recovery target settings doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) -| Name | Description | Required | Type | Default | -|-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------|----------|-------------------------|---------| -| recoveryTarget | See `recovery_target` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | | -| recoveryTargetLsn | See `recovery_target_lsn` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | | -| recoveryTargetName | See `recovery_target_name` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | | -| recoveryTargetTime | See `recovery_target_time` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | | -| recoveryTargetXid | See `recovery_target_xid` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | | -| recoveryTargetTimeline | See `recovery_target_timeline` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | | +| Name | Description | Required | Type | Default | +|------------------------|----------------------------------------------------------------------------------------------------------------------------------------------|----------|--------|---------| +| recoveryTarget | See `recovery_target` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | | +| recoveryTargetLsn | See `recovery_target_lsn` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | | +| recoveryTargetName | See `recovery_target_name` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | | +| recoveryTargetTime | See `recovery_target_time` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | | +| recoveryTargetXid | See `recovery_target_xid` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | | +| recoveryTargetTimeline | See `recovery_target_timeline` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | | #### StandbySettings -| Name | Description | Required | Type | Default | 
-|-------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|-------------------------|---------| -| primaryConnInfo | connection string to connect to the primary server (its value will be placed in the `primary_conninfo` parameter of the instance `recovery.conf` file. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | yes | string | | -| primarySlotName | optional replication slot to use (its value will be placed in the `primary_slot_name` parameter of the instance `recovery.conf` file. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | no | string | | -| recoveryMinApplyDelay | delay recovery for a fixed period of time (its value will be placed in the `recovery_min_apply_delay` parameter of the instance `recovery.conf` file. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | no | string | | +| Name | Description | Required | Type | Default | +|-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|--------|---------| +| primaryConnInfo | connection string to connect to the primary server (its value will be placed in the `primary_conninfo` parameter of the instance `recovery.conf` file. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | yes | string | | +| primarySlotName | optional replication slot to use (its value will be placed in the `primary_slot_name` parameter of the instance `recovery.conf` file. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | no | string | | +| recoveryMinApplyDelay | delay recovery for a fixed period of time (its value will be placed in the `recovery_min_apply_delay` parameter of the instance `recovery.conf` file. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | no | string | | #### Special Types duration types (as described in https://golang.org/pkg/time/#ParseDuration) are signed sequence of decimal numbers, each with optional fraction and a unit suffix, such as "300ms", "-1.5h" or "2h45m". Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h". 
diff --git a/internal/cluster/cluster.go b/internal/cluster/cluster.go index 1d60388..ca78a77 100644 --- a/internal/cluster/cluster.go +++ b/internal/cluster/cluster.go @@ -43,11 +43,7 @@ const ( ) const ( - DefaultStoreTimeout = 5 * time.Second - DefaultProxyCheckInterval = 5 * time.Second - DefaultProxyTimeoutInterval = 15 * time.Second - - DefaultDBWaitReadyTimeout = 60 * time.Second + DefaultStoreTimeout = 5 * time.Second DefaultDBNotIncreasingXLogPosTimes = 10 @@ -56,8 +52,11 @@ const ( DefaultConvergenceTimeout = 30 * time.Second DefaultInitTimeout = 5 * time.Minute DefaultSyncTimeout = 0 + DefaultDBWaitReadyTimeout = 60 * time.Second DefaultFailInterval = 20 * time.Second DefaultDeadKeeperRemovalInterval = 48 * time.Hour + DefaultProxyCheckInterval = 5 * time.Second + DefaultProxyTimeout = 15 * time.Second DefaultMaxStandbys uint16 = 20 DefaultMaxStandbysPerSender uint16 = 3 DefaultMaxStandbyLag = 1024 * 1204 @@ -228,6 +227,10 @@ type ClusterSpec struct { FailInterval *Duration `json:"failInterval,omitempty"` // Interval after which a dead keeper will be removed from the cluster data DeadKeeperRemovalInterval *Duration `json:"deadKeeperRemovalInterval,omitempty"` + // Interval to wait before next proxy check + ProxyCheckInterval *Duration `json:"proxyCheckInterval,omitempty"` + // Interval where the proxy must successfully complete a check + ProxyTimeout *Duration `json:"proxyTimeout,omitempty"` // Max number of standbys. This needs to be greater enough to cover both // standby managed by stolon and additional standbys configured by the // user. Its value affect different postgres parameters like @@ -364,6 +367,12 @@ func (os *ClusterSpec) WithDefaults() *ClusterSpec { if s.DeadKeeperRemovalInterval == nil { s.DeadKeeperRemovalInterval = &Duration{Duration: DefaultDeadKeeperRemovalInterval} } + if s.ProxyCheckInterval == nil { + s.ProxyCheckInterval = &Duration{Duration: DefaultProxyCheckInterval} + } + if s.ProxyTimeout == nil { + s.ProxyTimeout = &Duration{Duration: DefaultProxyTimeout} + } if s.MaxStandbys == nil { s.MaxStandbys = Uint16P(DefaultMaxStandbys) } @@ -426,11 +435,20 @@ func (os *ClusterSpec) Validate() error { if s.DBWaitReadyTimeout.Duration < 0 { return fmt.Errorf("dbWaitReadyTimeout must be positive") } + if s.FailInterval.Duration < 0 { + return fmt.Errorf("failInterval must be positive") + } if s.DeadKeeperRemovalInterval.Duration < 0 { return fmt.Errorf("deadKeeperRemovalInterval must be positive") } - if s.FailInterval.Duration < 0 { - return fmt.Errorf("failInterval must be positive") + if s.ProxyCheckInterval.Duration < 0 { + return fmt.Errorf("proxyCheckInterval must be positive") + } + if s.ProxyTimeout.Duration < 0 { + return fmt.Errorf("proxyTimeout must be positive") + } + if s.ProxyCheckInterval.Duration >= s.ProxyTimeout.Duration { + return fmt.Errorf("proxyCheckInterval should be less than proxyTimeout") } if *s.MaxStandbys < 1 { return fmt.Errorf("maxStandbys must be at least 1") diff --git a/internal/cluster/member.go b/internal/cluster/member.go index 7a8b062..e6ff3c0 100644 --- a/internal/cluster/member.go +++ b/internal/cluster/member.go @@ -16,6 +16,7 @@ package cluster import ( "reflect" + "time" "github.com/sorintlab/stolon/internal/common" @@ -133,6 +134,12 @@ type ProxyInfo struct { UID string Generation int64 + + // ProxyTimeout is the current proxyTimeout used by the proxy + // at the time of publishing its state. + // It's used by the sentinel to know for how much time the + // proxy should be considered active. 
+ ProxyTimeout time.Duration } type ProxiesInfo map[string]*ProxyInfo diff --git a/tests/integration/proxy_test.go b/tests/integration/proxy_test.go index 0c2f4aa..f3bb6e5 100644 --- a/tests/integration/proxy_test.go +++ b/tests/integration/proxy_test.go @@ -151,7 +151,7 @@ func TestProxyListening(t *testing.T) { } // tp should not listen because it cannot talk with the store - if err := tp.WaitNotListening(cluster.DefaultProxyTimeoutInterval * 2); err != nil { + if err := tp.WaitNotListening(cluster.DefaultProxyTimeout * 2); err != nil { t.Fatalf("expecting tp not listening due to failed store communication, but it's listening.") } @@ -174,8 +174,8 @@ func TestProxyListening(t *testing.T) { if err := tstore.WaitDown(10 * time.Second); err != nil { t.Fatalf("error waiting on store down: %v", err) } - // wait less than DefaultProxyTimeoutInterval - time.Sleep(cluster.DefaultProxyTimeoutInterval / 3) + // wait less than DefaultProxyTimeout + time.Sleep(cluster.DefaultProxyTimeout / 3) // Start store if err := tstore.Start(); err != nil { t.Fatalf("unexpected err: %v", err) @@ -239,7 +239,7 @@ func TestProxyListening(t *testing.T) { } // tp should not listen because it cannot talk with the store - if err := tp.WaitNotListening(cluster.DefaultProxyTimeoutInterval * 2); err != nil { + if err := tp.WaitNotListening(cluster.DefaultProxyTimeout * 2); err != nil { t.Fatalf("expecting tp not listening due to failed store communication, but it's listening.") } diff --git a/tests/integration/sentinel_test.go b/tests/integration/sentinel_test.go index dee1d82..73ec964 100644 --- a/tests/integration/sentinel_test.go +++ b/tests/integration/sentinel_test.go @@ -152,7 +152,7 @@ func TestSentinelEnabledProxies(t *testing.T) { t.Fatalf("unexpected err: %v", err) } - if err := WaitClusterDataEnabledProxiesNum(sm, 1, 3*cluster.DefaultProxyTimeoutInterval); err != nil { + if err := WaitClusterDataEnabledProxiesNum(sm, 1, 3*cluster.DefaultProxyTimeout); err != nil { t.Fatalf("unexpected err: %v", err) }
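Usage sketch (an assumed example, not part of the patch): with this change applied, the new options can be changed at runtime through the cluster specification with stolonctl update --patch; the cluster name and store backend below are placeholders, and the chosen values must keep proxyCheckInterval below proxyTimeout as required by the new validation in internal/cluster/cluster.go:

    stolonctl --cluster-name stolon-cluster --store-backend=etcdv3 update --patch '{ "proxyCheckInterval": "2s", "proxyTimeout": "10s" }'

Both values are duration strings in Go's time.ParseDuration format (e.g. "500ms", "2s", "1m"), as documented in doc/cluster_spec.md.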