proxy: make proxyCheckInterval and proxyTimeout configurable

Make proxyCheckInterval and proxyTimeout configurable in the cluster spec.

The proxy now publishes its current proxyTimeout so the sentinel knows for
how long to consider it active.
Simone Gotti 2020-02-17 12:04:27 +01:00
parent 1305446441
commit 9cc800de05
9 changed files with 153 additions and 77 deletions
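With this change both knobs become per-cluster options that can be patched on a running cluster; an illustrative invocation (assuming the store and cluster flags are already configured) would be `stolonctl update --patch '{ "proxyCheckInterval": "10s", "proxyTimeout": "30s" }'`.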

View File

@@ -89,6 +89,10 @@ type ClusterChecker struct {
 	endPollonProxyCh chan error

 	pollonMutex sync.Mutex

+	proxyCheckInterval time.Duration
+	proxyTimeout       time.Duration
+
+	configMutex sync.Mutex
 }
@@ -104,6 +108,9 @@ func NewClusterChecker(uid string, cfg config) (*ClusterChecker, error) {
 		stopListening:    cfg.stopListening,
 		e:                e,
 		endPollonProxyCh: make(chan error),
+
+		proxyCheckInterval: cluster.DefaultProxyCheckInterval,
+		proxyTimeout:       cluster.DefaultProxyTimeout,
 	}, nil
 }
@@ -164,15 +171,16 @@ func (c *ClusterChecker) sendPollonConfData(confData pollon.ConfData) {
 	}
 }

-func (c *ClusterChecker) SetProxyInfo(e store.Store, generation int64, ttl time.Duration) error {
+func (c *ClusterChecker) SetProxyInfo(e store.Store, generation int64, proxyTimeout time.Duration) error {
 	proxyInfo := &cluster.ProxyInfo{
 		InfoUID:    common.UID(),
 		UID:        c.uid,
 		Generation: generation,
+		ProxyTimeout: proxyTimeout,
 	}
 	log.Debugf("proxyInfo dump: %s", spew.Sdump(proxyInfo))

-	if err := c.e.SetProxyInfo(context.TODO(), proxyInfo, ttl); err != nil {
+	if err := c.e.SetProxyInfo(context.TODO(), proxyInfo, 2*proxyTimeout); err != nil {
 		return err
 	}
 	return nil
@@ -205,13 +213,31 @@ func (c *ClusterChecker) Check() error {
 		return fmt.Errorf("clusterdata validation failed: %v", err)
 	}

+	cdProxyCheckInterval := cd.Cluster.DefSpec().ProxyCheckInterval.Duration
+	cdProxyTimeout := cd.Cluster.DefSpec().ProxyTimeout.Duration
+
+	// use the greater of the current proxy timeout and the one defined in the cluster spec if they differ:
+	// this way we update our proxyInfo using a timeout greater than or equal to the currently active timeout timer.
+	c.configMutex.Lock()
+	proxyTimeout := c.proxyTimeout
+	if cdProxyTimeout > proxyTimeout {
+		proxyTimeout = cdProxyTimeout
+	}
+	c.configMutex.Unlock()
+
 	proxy := cd.Proxy
 	if proxy == nil {
 		log.Infow("no proxy object available, closing connections to master")
 		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
 		// ignore errors on setting proxy info
-		if err = c.SetProxyInfo(c.e, cluster.NoGeneration, 2*cluster.DefaultProxyTimeoutInterval); err != nil {
+		if err = c.SetProxyInfo(c.e, cluster.NoGeneration, proxyTimeout); err != nil {
 			log.Errorw("failed to update proxyInfo", zap.Error(err))
+		} else {
+			// update proxyCheckInterval and proxyTimeout only if we successfully updated our proxy info
+			c.configMutex.Lock()
+			c.proxyCheckInterval = cdProxyCheckInterval
+			c.proxyTimeout = cdProxyTimeout
+			c.configMutex.Unlock()
 		}
 		return nil
 	}
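The reason for publishing the greater of the two timeouts is easiest to see in isolation. The sketch below (hypothetical helper name, not part of this commit) shows the rule: when the spec shrinks the timeout, the proxy keeps publishing the old, larger value until the new one has actually been adopted, so the sentinel never expires the proxy while an older, longer timeout timer is still running.

```go
package main

import (
	"fmt"
	"time"
)

// effectiveTimeout returns the timeout to publish: the greater of the
// timeout currently in use and the one requested by the cluster spec.
// Publishing a value smaller than the one backing the currently running
// timeout timer could let the sentinel expire this proxy while the proxy
// still believes it is inside its check window.
func effectiveTimeout(current, fromSpec time.Duration) time.Duration {
	if fromSpec > current {
		return fromSpec
	}
	return current
}

func main() {
	// The spec lowered the timeout from 15s to 5s: keep publishing 15s until
	// the proxy has actually adopted the new value (which the real code does
	// only after a successful publish).
	fmt.Println(effectiveTimeout(15*time.Second, 5*time.Second)) // 15s
	// The spec raised the timeout: publish the larger value right away.
	fmt.Println(effectiveTimeout(15*time.Second, 30*time.Second)) // 30s
}
```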
@@ -221,8 +247,14 @@ func (c *ClusterChecker) Check() error {
 		log.Infow("no db object available, closing connections to master", "db", proxy.Spec.MasterDBUID)
 		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
 		// ignore errors on setting proxy info
-		if err = c.SetProxyInfo(c.e, proxy.Generation, 2*cluster.DefaultProxyTimeoutInterval); err != nil {
+		if err = c.SetProxyInfo(c.e, proxy.Generation, proxyTimeout); err != nil {
 			log.Errorw("failed to update proxyInfo", zap.Error(err))
+		} else {
+			// update proxyCheckInterval and proxyTimeout only if we successfully updated our proxy info
+			c.configMutex.Lock()
+			c.proxyCheckInterval = cdProxyCheckInterval
+			c.proxyTimeout = cdProxyTimeout
+			c.configMutex.Unlock()
 		}
 		return nil
 	}
@@ -234,12 +266,18 @@ func (c *ClusterChecker) Check() error {
 		return nil
 	}
 	log.Infow("master address", "address", addr)
-	if err = c.SetProxyInfo(c.e, proxy.Generation, 2*cluster.DefaultProxyTimeoutInterval); err != nil {
+	if err = c.SetProxyInfo(c.e, proxy.Generation, proxyTimeout); err != nil {
 		// if we failed to update our proxy info when a master is defined we
 		// cannot ignore this error since the sentinel won't know that we exist
 		// and are sending connections to a master so, when electing a new
 		// master, it won't wait for us to close connections to the old one.
 		return fmt.Errorf("failed to update proxyInfo: %v", err)
+	} else {
+		// update proxyCheckInterval and proxyTimeout only if we successfully updated our proxy info
+		c.configMutex.Lock()
+		c.proxyCheckInterval = cdProxyCheckInterval
+		c.proxyTimeout = cdProxyTimeout
+		c.configMutex.Unlock()
 	}

 	// start proxying only if we are inside enabledProxies, this ensures that the
@@ -256,7 +294,9 @@ func (c *ClusterChecker) Check() error {
 }

 func (c *ClusterChecker) TimeoutChecker(checkOkCh chan struct{}) {
-	timeoutTimer := time.NewTimer(cluster.DefaultProxyTimeoutInterval)
+	c.configMutex.Lock()
+	timeoutTimer := time.NewTimer(c.proxyTimeout)
+	c.configMutex.Unlock()

 	for {
 		select {
@@ -275,7 +315,10 @@ func (c *ClusterChecker) TimeoutChecker(checkOkCh chan struct{}) {
 			// ignore if stop succeeded or not due to timer already expired
 			timeoutTimer.Stop()

-			timeoutTimer = time.NewTimer(cluster.DefaultProxyTimeoutInterval)
+			c.configMutex.Lock()
+			timeoutTimer = time.NewTimer(c.proxyTimeout)
+			c.configMutex.Unlock()
 		}
 	}
 }
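TimeoutChecker is a watchdog: every successful check re-arms a timer sized by proxyTimeout, and if the timer fires first the proxy drops its connections to the master. A self-contained sketch of the pattern (simplified, with a print standing in for closing connections):

```go
package main

import (
	"fmt"
	"time"
)

// watchdog drops connections (represented here by a print) when no
// successful check is reported on checkOkCh within timeout; every report
// re-arms the timer, mirroring the TimeoutChecker loop above.
func watchdog(checkOkCh <-chan struct{}, timeout time.Duration) {
	timeoutTimer := time.NewTimer(timeout)
	for {
		select {
		case <-timeoutTimer.C:
			fmt.Println("no check-ok within timeout: closing connections to master")
			return
		case <-checkOkCh:
			// ignore whether Stop succeeded: the timer may already have expired
			timeoutTimer.Stop()
			timeoutTimer = time.NewTimer(timeout)
		}
	}
}

func main() {
	checkOkCh := make(chan struct{})
	go func() {
		checkOkCh <- struct{}{} // one successful check, then silence
	}()
	watchdog(checkOkCh, 100*time.Millisecond)
}
```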
@@ -305,7 +348,10 @@ func (c *ClusterChecker) Start() error {
 				// report that check was ok
 				checkOkCh <- struct{}{}
 			}
+			c.configMutex.Lock()
-			timerCh = time.NewTimer(cluster.DefaultProxyCheckInterval).C
+			timerCh = time.NewTimer(c.proxyCheckInterval).C
+			c.configMutex.Unlock()
 		case err := <-c.endPollonProxyCh:
 			if err != nil {
 				return fmt.Errorf("proxy error: %v", err)

View File

@@ -342,7 +342,7 @@ func (s *Sentinel) activeProxiesInfos(proxiesInfo cluster.ProxiesInfo) cluster.ProxiesInfo {
 	for _, pi := range proxiesInfo {
 		if pih, ok := pihs[pi.UID]; ok {
 			if pih.ProxyInfo.InfoUID == pi.InfoUID {
-				if timer.Since(pih.Timer) > 2*cluster.DefaultProxyTimeoutInterval {
+				if timer.Since(pih.Timer) > 2*pi.ProxyTimeout {
 					delete(activeProxiesInfo, pi.UID)
 				}
 			} else {
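On the sentinel side, the hardcoded 2*DefaultProxyTimeoutInterval becomes twice whatever timeout each proxy last published. A standalone sketch of the liveness rule (simplified types and a hypothetical isActive helper, not the actual sentinel code):

```go
package main

import (
	"fmt"
	"time"
)

// proxyInfo carries only the fields relevant to the liveness decision.
type proxyInfo struct {
	UID          string
	ProxyTimeout time.Duration // timeout published by the proxy itself
}

// isActive reports whether a proxy whose info was last refreshed at
// lastUpdate should still be considered active: it now gets twice its own
// published timeout before being expired, instead of a global constant.
func isActive(pi proxyInfo, lastUpdate, now time.Time) bool {
	return now.Sub(lastUpdate) <= 2*pi.ProxyTimeout
}

func main() {
	pi := proxyInfo{UID: "proxy1", ProxyTimeout: 15 * time.Second}
	now := time.Now()
	fmt.Println(isActive(pi, now.Add(-20*time.Second), now)) // true: 20s <= 30s
	fmt.Println(isActive(pi, now.Add(-45*time.Second), now)) // false: 45s > 30s
}
```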
@@ -1820,7 +1820,6 @@ func (s *Sentinel) clusterSentinelCheck(pctx context.Context) {
 			s.sleepInterval = cd.Cluster.DefSpec().SleepInterval.Duration
 			s.requestTimeout = cd.Cluster.DefSpec().RequestTimeout.Duration
 		}
-
 	}

 	log.Debugf("cd dump: %s", spew.Sdump(cd))

View File

@@ -5000,8 +5000,8 @@ func TestUpdateCluster(t *testing.T) {
 }

 func TestActiveProxiesInfos(t *testing.T) {
-	proxyInfo1 := cluster.ProxyInfo{UID: "proxy1", InfoUID: "infoUID1"}
-	proxyInfo2 := cluster.ProxyInfo{UID: "proxy2", InfoUID: "infoUID2"}
+	proxyInfo1 := cluster.ProxyInfo{UID: "proxy1", InfoUID: "infoUID1", ProxyTimeout: cluster.DefaultProxyTimeout}
+	proxyInfo2 := cluster.ProxyInfo{UID: "proxy2", InfoUID: "infoUID2", ProxyTimeout: cluster.DefaultProxyTimeout}
 	proxyInfoWithDifferentInfoUID := cluster.ProxyInfo{UID: "proxy2", InfoUID: "differentInfoUID"}
 	var secToNanoSecondMultiplier int64 = 1000000000
 	tests := []struct {

@@ -5033,7 +5033,7 @@ func TestActiveProxiesInfos(t *testing.T) {
 			expectedProxyInfoHistories: ProxyInfoHistories{"proxy1": &ProxyInfoHistory{ProxyInfo: &proxyInfo1}, "proxy2": &ProxyInfoHistory{ProxyInfo: &proxyInfoWithDifferentInfoUID}},
 		},
 		{
-			name: "should remove from active proxies if it is not updated for twice the DefaultProxyTimeoutInterval",
+			name: "should remove from active proxies if it is not updated for twice the DefaultProxyTimeout",
 			proxyInfoHistories: ProxyInfoHistories{"proxy1": &ProxyInfoHistory{ProxyInfo: &proxyInfo1, Timer: timer.Now() - (3 * 15 * secToNanoSecondMultiplier)}, "proxy2": &ProxyInfoHistory{ProxyInfo: &proxyInfo2, Timer: timer.Now() - (1 * 15 * secToNanoSecondMultiplier)}},
 			proxiesInfos: cluster.ProxiesInfo{"proxy1": &proxyInfo1, "proxy2": &proxyInfo2},
 			expectedActiveProxies: cluster.ProxiesInfo{"proxy2": &proxyInfo2},

View File

@@ -50,6 +50,8 @@ type ClusterSpecNoDefaults struct {
 	DBWaitReadyTimeout        *cluster.Duration `json:"dbWaitReadyTimeout,omitempty"`
 	FailInterval              *cluster.Duration `json:"failInterval,omitempty"`
 	DeadKeeperRemovalInterval *cluster.Duration `json:"deadKeeperRemovalInterval,omitempty"`
+	ProxyCheckInterval        *cluster.Duration `json:"proxyCheckInterval,omitempty"`
+	ProxyTimeout              *cluster.Duration `json:"proxyTimeout,omitempty"`
 	MaxStandbys               *uint16           `json:"maxStandbys,omitempty"`
 	MaxStandbysPerSender      *uint16           `json:"maxStandbysPerSender,omitempty"`
 	MaxStandbyLag             *uint32           `json:"maxStandbyLag,omitempty"`

@@ -81,6 +83,8 @@ type ClusterSpecDefaults struct {
 	DBWaitReadyTimeout        *cluster.Duration `json:"dbWaitReadyTimeout"`
 	FailInterval              *cluster.Duration `json:"failInterval"`
 	DeadKeeperRemovalInterval *cluster.Duration `json:"deadKeeperRemovalInterval"`
+	ProxyCheckInterval        *cluster.Duration `json:"proxyCheckInterval"`
+	ProxyTimeout              *cluster.Duration `json:"proxyTimeout"`
 	MaxStandbys               *uint16           `json:"maxStandbys"`
 	MaxStandbysPerSender      *uint16           `json:"maxStandbysPerSender"`
 	MaxStandbyLag             *uint32           `json:"maxStandbyLag"`

View File

@@ -12,34 +12,36 @@ Some options in a running cluster specification can be changed to update the desired state of the cluster

 ### Cluster Specification Format.

 | Name | Description | Required | Type | Default |
 |------|-------------|----------|------|---------|
-| sleepInterval | interval to wait before next check (for every component: keeper, sentinel, proxy). | no | string (duration) | 5s |
+| sleepInterval | interval to wait before next check (for keepers and sentinels). | no | string (duration) | 5s |
-| requestTimeout | time after which any request (keepers checks from sentinel etc...) will fail. | no | string (duration) | 10s |
+| requestTimeout | time after which any request to external resources (store, postgres queries etc...) will fail. | no | string (duration) | 10s |
 | failInterval | interval after the first fail to declare a keeper as not healthy. | no | string (duration) | 20s |
+| proxyCheckInterval | interval to wait before next proxy check. | no | string (duration) | 5s |
+| proxyTimeout | interval within which a proxy check must successfully complete, or the proxy will close all connections to the master. | no | string (duration) | 15s |
 | deadKeeperRemovalInterval | interval after which a dead keeper will be removed from the cluster data | no | string (duration) | 48h |
 | maxStandbys | max number of standbys. This needs to be large enough to cover both standbys managed by stolon and additional standbys configured by the user. Its value affects different postgres parameters like max_replication_slots and max_wal_senders. Setting this to a number lower than the sum of stolon managed standbys and user managed standbys will have unpredictable effects due to problems creating replication slots or replication problems due to exhausted wal senders. | no | uint16 | 20 |
 | maxStandbysPerSender | max number of standbys for every sender. A sender can be a master or another standby (with cascading replication). | no | uint16 | 3 |
 | maxStandbyLag | maximum lag (from the last reported master state, in bytes) that an asynchronous standby can have to be elected in place of a failed master. | no | uint32 | 1MiB |
 | synchronousReplication | use synchronous replication between the master and its standbys | no | bool | false |
 | minSynchronousStandbys | minimum number of required synchronous standbys when synchronous replication is enabled (only set this to a value > 1 when using PostgreSQL >= 9.6) | no | uint16 | 1 |
 | maxSynchronousStandbys | maximum number of required synchronous standbys when synchronous replication is enabled (only set this to a value > 1 when using PostgreSQL >= 9.6) | no | uint16 | 1 |
 | additionalWalSenders | number of additional wal_senders in addition to the ones internally defined by stolon, useful to provide enough wal senders for external standbys (changing this value requires an instance restart) | no | uint16 | 5 |
 | additionalMasterReplicationSlots | a list of additional physical replication slots to be created on the master postgres instance. They will be prefixed with `stolon_` (like internal replication slots used for standby replication) to make them "namespaced" from other replication slots. Replication slots starting with `stolon_` and not defined here (and not used for standby replication) will be dropped from the master instance. | no | []string | null |
 | usePgrewind | try to use pg_rewind for faster instance resynchronization. | no | bool | false |
 | initMode | The cluster initialization mode. Can be *new* or *existing*. *new* means that a new db cluster will be created on a random keeper and the other keepers will sync with it. *existing* means that a keeper (that needs to have an already created db cluster) will be chosen as the initial master and the other keepers will sync with it. In this case the `existingConfig` object needs to be populated. | yes | string | |
 | existingConfig | configuration for initMode of type "existing" | if initMode is "existing" | ExistingConfig | |
 | mergePgParameters | merge pgParameters of the initialized db cluster, useful to retain initdb generated parameters when initMode is new, and to retain current parameters when initMode is existing or pitr. | no | bool | true |
 | role | cluster role (master or standby) | no | bool | master |
 | defaultSUReplAccessMode | mode for the default hba rules used for replication by standby keepers (the su and repl auth methods will be the ones provided in the keeper command line options). Values can be *all* or *strict*. *all* allows access from all ips, *strict* restricts master access to standby server ips. | no | string | all |
 | newConfig | configuration for initMode of type "new" | if initMode is "new" | NewConfig | |
 | pitrConfig | configuration for initMode of type "pitr" | if initMode is "pitr" | PITRConfig | |
 | standbyConfig | standby config when the cluster is a standby cluster | if role is "standby" | StandbyConfig | |
 | pgParameters | a map containing the postgres server parameters and their values. The parameter values don't have to be quoted and single quotes don't have to be doubled since this is already done by the keeper when writing the postgresql.conf file | no | map[string]string | |
-| pgHBA | a list containing additional pg_hba.conf entries. They will be added to the pg_hba.conf generated by stolon. **NOTE**: these lines aren't validated so if some of them are wrong postgres will refuse to start or, on reload, will log a warning and ignore the updated pg_hba.conf file | no | []string | null. Will use the default behiavior of accepting connections from all hosts for all dbs and users with md5 password authentication |
+| pgHBA | a list containing additional pg_hba.conf entries. They will be added to the pg_hba.conf generated by stolon. **NOTE**: these lines aren't validated so if some of them are wrong postgres will refuse to start or, on reload, will log a warning and ignore the updated pg_hba.conf file | no | []string | null. Will use the default behaviour of accepting connections from all hosts for all dbs and users with md5 password authentication |
 | automaticPgRestart | restart postgres automatically after changing the pgParameters that require a restart. Refer to `pending_restart` in [pg_settings](https://www.postgresql.org/docs/9.5/static/view-pg-settings.html) | no | bool | false |
 | dbWaitReadyTimeout | Time to wait for the database to become ready after starting. Increase this value if your Postgres takes longer to boot, e.g. because it has to recover a lot of WAL. | no | string (duration) | 60s |
 | syncTimeout | Time to wait for a database recovery (including the replay of WAL files in case of Point-In-Time-Recovery) | no | string (duration) | 0 (no timeout, waits until recovery has finished) |

 #### ExistingConfig
@@ -63,41 +65,41 @@ Some options in a running cluster specification can be changed to update the desired state of the cluster

 |------|-------------|----------|------|---------|
 | dataRestoreCommand | defines the command to execute for restoring the db cluster data. %d is replaced with the full path to the db cluster datadir. Use %% to embed an actual % character. Must return a 0 exit code only on success. | yes | string | |
 | archiveRecoverySettings | archive recovery configuration | yes | ArchiveRecoverySettings | |
 | recoveryTargetSettings | recovery target configuration | no | RecoveryTargetSettings | |

 #### StandbyConfig

 | Name | Description | Required | Type | Default |
 |------|-------------|----------|------|---------|
 | standbySettings | standby configuration | no | StandbySettings | |
 | archiveRecoverySettings | archive recovery configuration | no | ArchiveRecoverySettings | |

 #### ArchiveRecoverySettings

 | Name | Description | Required | Type | Default |
 |------|-------------|----------|------|---------|
 | restoreCommand | defines the command to execute for restoring the archives. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/archive-recovery-settings.html) | yes | string | |

 #### RecoveryTargetSettings

 These parameters are the same as defined in [postgresql recovery target settings doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html)

 | Name | Description | Required | Type | Default |
 |------|-------------|----------|------|---------|
 | recoveryTarget | See `recovery_target` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
 | recoveryTargetLsn | See `recovery_target_lsn` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
 | recoveryTargetName | See `recovery_target_name` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
 | recoveryTargetTime | See `recovery_target_time` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
 | recoveryTargetXid | See `recovery_target_xid` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
 | recoveryTargetTimeline | See `recovery_target_timeline` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |

 #### StandbySettings

 | Name | Description | Required | Type | Default |
 |------|-------------|----------|------|---------|
 | primaryConnInfo | connection string to connect to the primary server (its value will be placed in the `primary_conninfo` parameter of the instance `recovery.conf` file). See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | yes | string | |
 | primarySlotName | optional replication slot to use (its value will be placed in the `primary_slot_name` parameter of the instance `recovery.conf` file). See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | no | string | |
 | recoveryMinApplyDelay | delay recovery for a fixed period of time (its value will be placed in the `recovery_min_apply_delay` parameter of the instance `recovery.conf` file). See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | no | string | |

 #### Special Types

 duration types (as described in https://golang.org/pkg/time/#ParseDuration) are a signed sequence of decimal numbers, each with optional fraction and a unit suffix, such as "300ms", "-1.5h" or "2h45m". Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h".
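For the two new options this means values like `"proxyCheckInterval": "10s"` or `"proxyTimeout": "1m"` are accepted; note that the spec validation added in this commit requires proxyCheckInterval to be strictly less than proxyTimeout.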

View File

@@ -43,11 +43,7 @@ const (
 )

 const (
 	DefaultStoreTimeout = 5 * time.Second

-	DefaultProxyCheckInterval   = 5 * time.Second
-	DefaultProxyTimeoutInterval = 15 * time.Second
-
-	DefaultDBWaitReadyTimeout = 60 * time.Second

 	DefaultDBNotIncreasingXLogPosTimes = 10

@@ -56,8 +52,11 @@ const (
 	DefaultConvergenceTimeout        = 30 * time.Second
 	DefaultInitTimeout               = 5 * time.Minute
 	DefaultSyncTimeout               = 0
+	DefaultDBWaitReadyTimeout        = 60 * time.Second
 	DefaultFailInterval              = 20 * time.Second
 	DefaultDeadKeeperRemovalInterval = 48 * time.Hour
+	DefaultProxyCheckInterval        = 5 * time.Second
+	DefaultProxyTimeout              = 15 * time.Second
 	DefaultMaxStandbys               uint16 = 20
 	DefaultMaxStandbysPerSender      uint16 = 3
 	DefaultMaxStandbyLag             = 1024 * 1204
@@ -228,6 +227,10 @@ type ClusterSpec struct {
 	FailInterval *Duration `json:"failInterval,omitempty"`
 	// Interval after which a dead keeper will be removed from the cluster data
 	DeadKeeperRemovalInterval *Duration `json:"deadKeeperRemovalInterval,omitempty"`
+	// Interval to wait before next proxy check
+	ProxyCheckInterval *Duration `json:"proxyCheckInterval,omitempty"`
+	// Interval within which the proxy must successfully complete a check
+	ProxyTimeout *Duration `json:"proxyTimeout,omitempty"`
 	// Max number of standbys. This needs to be large enough to cover both
 	// standbys managed by stolon and additional standbys configured by the
 	// user. Its value affects different postgres parameters like
@@ -364,6 +367,12 @@ func (os *ClusterSpec) WithDefaults() *ClusterSpec {
 	if s.DeadKeeperRemovalInterval == nil {
 		s.DeadKeeperRemovalInterval = &Duration{Duration: DefaultDeadKeeperRemovalInterval}
 	}
+	if s.ProxyCheckInterval == nil {
+		s.ProxyCheckInterval = &Duration{Duration: DefaultProxyCheckInterval}
+	}
+	if s.ProxyTimeout == nil {
+		s.ProxyTimeout = &Duration{Duration: DefaultProxyTimeout}
+	}
 	if s.MaxStandbys == nil {
 		s.MaxStandbys = Uint16P(DefaultMaxStandbys)
 	}
@@ -426,11 +435,20 @@ func (os *ClusterSpec) Validate() error {
 	if s.DBWaitReadyTimeout.Duration < 0 {
 		return fmt.Errorf("dbWaitReadyTimeout must be positive")
 	}
+	if s.FailInterval.Duration < 0 {
+		return fmt.Errorf("failInterval must be positive")
+	}
 	if s.DeadKeeperRemovalInterval.Duration < 0 {
 		return fmt.Errorf("deadKeeperRemovalInterval must be positive")
 	}
-	if s.FailInterval.Duration < 0 {
-		return fmt.Errorf("failInterval must be positive")
-	}
+	if s.ProxyCheckInterval.Duration < 0 {
+		return fmt.Errorf("proxyCheckInterval must be positive")
+	}
+	if s.ProxyTimeout.Duration < 0 {
+		return fmt.Errorf("proxyTimeout must be positive")
+	}
+	if s.ProxyCheckInterval.Duration >= s.ProxyTimeout.Duration {
+		return fmt.Errorf("proxyCheckInterval should be less than proxyTimeout")
+	}
 	if *s.MaxStandbys < 1 {
 		return fmt.Errorf("maxStandbys must be at least 1")
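The new checks also introduce a cross-field constraint between the two durations. A minimal sketch of the added validation logic, extracted into a standalone function with assumed names:

```go
package main

import (
	"fmt"
	"time"
)

// validateProxyDurations mirrors the checks added to ClusterSpec.Validate:
// both durations must be non-negative, and the check interval must be
// strictly smaller than the timeout so at least one check can complete
// within every timeout window.
func validateProxyDurations(checkInterval, timeout time.Duration) error {
	if checkInterval < 0 {
		return fmt.Errorf("proxyCheckInterval must be positive")
	}
	if timeout < 0 {
		return fmt.Errorf("proxyTimeout must be positive")
	}
	if checkInterval >= timeout {
		return fmt.Errorf("proxyCheckInterval should be less than proxyTimeout")
	}
	return nil
}

func main() {
	fmt.Println(validateProxyDurations(5*time.Second, 15*time.Second))  // <nil>
	fmt.Println(validateProxyDurations(20*time.Second, 15*time.Second)) // error
}
```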

View File

@@ -16,6 +16,7 @@ package cluster

 import (
 	"reflect"
+	"time"

 	"github.com/sorintlab/stolon/internal/common"

@@ -133,6 +134,12 @@ type ProxyInfo struct {
 	UID        string
 	Generation int64
+
+	// ProxyTimeout is the current proxyTimeout used by the proxy
+	// at the time of publishing its state.
+	// It's used by the sentinel to know for how long
+	// the proxy should be considered active.
+	ProxyTimeout time.Duration
 }

 type ProxiesInfo map[string]*ProxyInfo

View File

@@ -151,7 +151,7 @@ func TestProxyListening(t *testing.T) {
 	}

 	// tp should not listen because it cannot talk with the store
-	if err := tp.WaitNotListening(cluster.DefaultProxyTimeoutInterval * 2); err != nil {
+	if err := tp.WaitNotListening(cluster.DefaultProxyTimeout * 2); err != nil {
 		t.Fatalf("expecting tp not listening due to failed store communication, but it's listening.")
 	}

@@ -174,8 +174,8 @@ func TestProxyListening(t *testing.T) {
 	if err := tstore.WaitDown(10 * time.Second); err != nil {
 		t.Fatalf("error waiting on store down: %v", err)
 	}
-	// wait less than DefaultProxyTimeoutInterval
-	time.Sleep(cluster.DefaultProxyTimeoutInterval / 3)
+	// wait less than DefaultProxyTimeout
+	time.Sleep(cluster.DefaultProxyTimeout / 3)
 	// Start store
 	if err := tstore.Start(); err != nil {
 		t.Fatalf("unexpected err: %v", err)
 	}

@@ -239,7 +239,7 @@ func TestProxyListening(t *testing.T) {
 	}

 	// tp should not listen because it cannot talk with the store
-	if err := tp.WaitNotListening(cluster.DefaultProxyTimeoutInterval * 2); err != nil {
+	if err := tp.WaitNotListening(cluster.DefaultProxyTimeout * 2); err != nil {
 		t.Fatalf("expecting tp not listening due to failed store communication, but it's listening.")
 	}

View File

@@ -152,7 +152,7 @@ func TestSentinelEnabledProxies(t *testing.T) {
 		t.Fatalf("unexpected err: %v", err)
 	}

-	if err := WaitClusterDataEnabledProxiesNum(sm, 1, 3*cluster.DefaultProxyTimeoutInterval); err != nil {
+	if err := WaitClusterDataEnabledProxiesNum(sm, 1, 3*cluster.DefaultProxyTimeout); err != nil {
 		t.Fatalf("unexpected err: %v", err)
 	}