proxy: make proxyCheckInterval and proxyTimeout configurable

make proxyCheckInterval and proxyTimeout configurable in the cluster spec.

The proxy will publish its current proxyTimeout so the sentinel knows for how
long it should consider the proxy active.
Simone Gotti 2020-02-17 12:04:27 +01:00
parent 1305446441
commit 9cc800de05
9 changed files with 153 additions and 77 deletions
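
In short, the contract this change establishes: the proxy stores its ProxyInfo with a TTL of twice its current proxyTimeout, and the sentinel drops a proxy from the active set once that info is older than twice the timeout the proxy itself published. A runnable, simplified sketch of the sentinel's side of the rule (proxyInfo and isActive are illustrative stand-ins, not code from this commit):

package main

import (
	"fmt"
	"time"
)

// proxyInfo mirrors the fields of cluster.ProxyInfo relevant here (simplified).
type proxyInfo struct {
	UID          string
	ProxyTimeout time.Duration // timeout published by the proxy itself
}

// isActive applies the sentinel's liveness rule: a proxy whose info hasn't
// been refreshed within twice its own published timeout is considered gone.
func isActive(pi proxyInfo, lastSeen, now time.Time) bool {
	return now.Sub(lastSeen) <= 2*pi.ProxyTimeout
}

func main() {
	pi := proxyInfo{UID: "proxy1", ProxyTimeout: 15 * time.Second}
	now := time.Now()
	fmt.Println(isActive(pi, now.Add(-20*time.Second), now)) // true: 20s <= 30s
	fmt.Println(isActive(pi, now.Add(-40*time.Second), now)) // false: 40s > 30s
}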


@@ -89,6 +89,10 @@ type ClusterChecker struct {
endPollonProxyCh chan error
pollonMutex sync.Mutex
+ proxyCheckInterval time.Duration
+ proxyTimeout time.Duration
+ configMutex sync.Mutex
}
func NewClusterChecker(uid string, cfg config) (*ClusterChecker, error) {
@@ -104,6 +108,9 @@ func NewClusterChecker(uid string, cfg config) (*ClusterChecker, error) {
stopListening: cfg.stopListening,
e: e,
endPollonProxyCh: make(chan error),
+ proxyCheckInterval: cluster.DefaultProxyCheckInterval,
+ proxyTimeout: cluster.DefaultProxyTimeout,
}, nil
}
@@ -164,15 +171,16 @@ func (c *ClusterChecker) sendPollonConfData(confData pollon.ConfData) {
}
}
- func (c *ClusterChecker) SetProxyInfo(e store.Store, generation int64, ttl time.Duration) error {
+ func (c *ClusterChecker) SetProxyInfo(e store.Store, generation int64, proxyTimeout time.Duration) error {
proxyInfo := &cluster.ProxyInfo{
- InfoUID: common.UID(),
- UID: c.uid,
- Generation: generation,
+ InfoUID: common.UID(),
+ UID: c.uid,
+ Generation: generation,
+ ProxyTimeout: proxyTimeout,
}
log.Debugf("proxyInfo dump: %s", spew.Sdump(proxyInfo))
- if err := c.e.SetProxyInfo(context.TODO(), proxyInfo, ttl); err != nil {
+ if err := c.e.SetProxyInfo(context.TODO(), proxyInfo, 2*proxyTimeout); err != nil {
return err
}
return nil
@@ -205,13 +213,31 @@ func (c *ClusterChecker) Check() error {
return fmt.Errorf("clusterdata validation failed: %v", err)
}
+ cdProxyCheckInterval := cd.Cluster.DefSpec().ProxyCheckInterval.Duration
+ cdProxyTimeout := cd.Cluster.DefSpec().ProxyTimeout.Duration
+ // use the greater of the current proxy timeout and the one defined in the cluster spec if they differ.
+ // this way we update our proxyInfo with a timeout that is greater than or equal to the currently active timeout timer.
+ c.configMutex.Lock()
+ proxyTimeout := c.proxyTimeout
+ if cdProxyTimeout > proxyTimeout {
+ proxyTimeout = cdProxyTimeout
+ }
+ c.configMutex.Unlock()
proxy := cd.Proxy
if proxy == nil {
log.Infow("no proxy object available, closing connections to master")
c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
// ignore errors on setting proxy info
- if err = c.SetProxyInfo(c.e, cluster.NoGeneration, 2*cluster.DefaultProxyTimeoutInterval); err != nil {
+ if err = c.SetProxyInfo(c.e, cluster.NoGeneration, proxyTimeout); err != nil {
log.Errorw("failed to update proxyInfo", zap.Error(err))
+ } else {
+ // update proxyCheckInterval and proxyTimeout only if we successfully updated our proxy info
+ c.configMutex.Lock()
+ c.proxyCheckInterval = cdProxyCheckInterval
+ c.proxyTimeout = cdProxyTimeout
+ c.configMutex.Unlock()
}
return nil
}
@@ -221,8 +247,14 @@ func (c *ClusterChecker) Check() error {
log.Infow("no db object available, closing connections to master", "db", proxy.Spec.MasterDBUID)
c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
// ignore errors on setting proxy info
- if err = c.SetProxyInfo(c.e, proxy.Generation, 2*cluster.DefaultProxyTimeoutInterval); err != nil {
+ if err = c.SetProxyInfo(c.e, proxy.Generation, proxyTimeout); err != nil {
log.Errorw("failed to update proxyInfo", zap.Error(err))
+ } else {
+ // update proxyCheckInterval and proxyTimeout only if we successfully updated our proxy info
+ c.configMutex.Lock()
+ c.proxyCheckInterval = cdProxyCheckInterval
+ c.proxyTimeout = cdProxyTimeout
+ c.configMutex.Unlock()
}
return nil
}
@@ -234,12 +266,18 @@ func (c *ClusterChecker) Check() error {
return nil
}
log.Infow("master address", "address", addr)
- if err = c.SetProxyInfo(c.e, proxy.Generation, 2*cluster.DefaultProxyTimeoutInterval); err != nil {
+ if err = c.SetProxyInfo(c.e, proxy.Generation, proxyTimeout); err != nil {
// if we failed to update our proxy info when a master is defined we
// cannot ignore this error since the sentinel won't know that we exist
// and are sending connections to a master so, when electing a new
// master, it'll not wait for us to close connections to the old one.
return fmt.Errorf("failed to update proxyInfo: %v", err)
+ } else {
+ // update proxyCheckInterval and proxyTimeout only if we successfully updated our proxy info
+ c.configMutex.Lock()
+ c.proxyCheckInterval = cdProxyCheckInterval
+ c.proxyTimeout = cdProxyTimeout
+ c.configMutex.Unlock()
}
// start proxying only if we are inside enabledProxies; this ensures that the
@@ -256,7 +294,9 @@ func (c *ClusterChecker) Check() error {
}
func (c *ClusterChecker) TimeoutChecker(checkOkCh chan struct{}) {
- timeoutTimer := time.NewTimer(cluster.DefaultProxyTimeoutInterval)
+ c.configMutex.Lock()
+ timeoutTimer := time.NewTimer(c.proxyTimeout)
+ c.configMutex.Unlock()
for {
select {
@@ -275,7 +315,10 @@ func (c *ClusterChecker) TimeoutChecker(checkOkCh chan struct{}) {
// ignore if stop succeeded or not due to timer already expired
timeoutTimer.Stop()
- timeoutTimer = time.NewTimer(cluster.DefaultProxyTimeoutInterval)
+ c.configMutex.Lock()
+ timeoutTimer = time.NewTimer(c.proxyTimeout)
+ c.configMutex.Unlock()
}
}
}
@@ -305,7 +348,10 @@ func (c *ClusterChecker) Start() error {
// report that check was ok
checkOkCh <- struct{}{}
}
- timerCh = time.NewTimer(cluster.DefaultProxyCheckInterval).C
+ c.configMutex.Lock()
+ timerCh = time.NewTimer(c.proxyCheckInterval).C
+ c.configMutex.Unlock()
case err := <-c.endPollonProxyCh:
if err != nil {
return fmt.Errorf("proxy error: %v", err)

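The ordering above is deliberate: when proxyTimeout is lowered in the cluster spec, the proxy keeps publishing the old, larger value until it has successfully written its proxyInfo and adopted the new configuration, so the sentinel's expiry window never becomes shorter than the timer the proxy still has armed. A hedged sketch of that selection rule (publishTimeout is a hypothetical helper, not a function from this commit):

package main

import (
	"fmt"
	"time"
)

// publishTimeout returns the timeout the proxy should advertise while a spec
// change is being adopted: never less than the value its timeout timer is
// currently armed with.
func publishTimeout(current, fromSpec time.Duration) time.Duration {
	if fromSpec > current {
		return fromSpec
	}
	return current
}

func main() {
	armed := 15 * time.Second // value the running TimeoutChecker timer still uses
	spec := 10 * time.Second  // lowered value just read from the cluster spec
	// The proxy still advertises 15s; only after a successful SetProxyInfo
	// does it swap in 10s for the following rounds.
	fmt.Println(publishTimeout(armed, spec)) // 15s
}
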

@@ -342,7 +342,7 @@ func (s *Sentinel) activeProxiesInfos(proxiesInfo cluster.ProxiesInfo) cluster.P
for _, pi := range proxiesInfo {
if pih, ok := pihs[pi.UID]; ok {
if pih.ProxyInfo.InfoUID == pi.InfoUID {
- if timer.Since(pih.Timer) > 2*cluster.DefaultProxyTimeoutInterval {
+ if timer.Since(pih.Timer) > 2*pi.ProxyTimeout {
delete(activeProxiesInfo, pi.UID)
}
} else {
@@ -1820,7 +1820,6 @@ func (s *Sentinel) clusterSentinelCheck(pctx context.Context) {
s.sleepInterval = cd.Cluster.DefSpec().SleepInterval.Duration
s.requestTimeout = cd.Cluster.DefSpec().RequestTimeout.Duration
}
- }
log.Debugf("cd dump: %s", spew.Sdump(cd))


@@ -5000,8 +5000,8 @@ func TestUpdateCluster(t *testing.T) {
}
func TestActiveProxiesInfos(t *testing.T) {
- proxyInfo1 := cluster.ProxyInfo{UID: "proxy1", InfoUID: "infoUID1"}
- proxyInfo2 := cluster.ProxyInfo{UID: "proxy2", InfoUID: "infoUID2"}
+ proxyInfo1 := cluster.ProxyInfo{UID: "proxy1", InfoUID: "infoUID1", ProxyTimeout: cluster.DefaultProxyTimeout}
+ proxyInfo2 := cluster.ProxyInfo{UID: "proxy2", InfoUID: "infoUID2", ProxyTimeout: cluster.DefaultProxyTimeout}
proxyInfoWithDifferentInfoUID := cluster.ProxyInfo{UID: "proxy2", InfoUID: "differentInfoUID"}
var secToNanoSecondMultiplier int64 = 1000000000
tests := []struct {
@@ -5033,7 +5033,7 @@ func TestActiveProxiesInfos(t *testing.T) {
expectedProxyInfoHistories: ProxyInfoHistories{"proxy1": &ProxyInfoHistory{ProxyInfo: &proxyInfo1}, "proxy2": &ProxyInfoHistory{ProxyInfo: &proxyInfoWithDifferentInfoUID}},
},
{
name: "should remove from active proxies if is not updated for twice the DefaultProxyTimeoutInterval",
name: "should remove from active proxies if is not updated for twice the DefaultProxyTimeout",
proxyInfoHistories: ProxyInfoHistories{"proxy1": &ProxyInfoHistory{ProxyInfo: &proxyInfo1, Timer: timer.Now() - (3 * 15 * secToNanoSecondMultiplier)}, "proxy2": &ProxyInfoHistory{ProxyInfo: &proxyInfo2, Timer: timer.Now() - (1 * 15 * secToNanoSecondMultiplier)}},
proxiesInfos: cluster.ProxiesInfo{"proxy1": &proxyInfo1, "proxy2": &proxyInfo2},
expectedActiveProxies: cluster.ProxiesInfo{"proxy2": &proxyInfo2},


@@ -50,6 +50,8 @@ type ClusterSpecNoDefaults struct {
DBWaitReadyTimeout *cluster.Duration `json:"dbWaitReadyTimeout,omitempty"`
FailInterval *cluster.Duration `json:"failInterval,omitempty"`
DeadKeeperRemovalInterval *cluster.Duration `json:"deadKeeperRemovalInterval,omitempty"`
+ ProxyCheckInterval *cluster.Duration `json:"proxyCheckInterval,omitempty"`
+ ProxyTimeout *cluster.Duration `json:"proxyTimeout,omitempty"`
MaxStandbys *uint16 `json:"maxStandbys,omitempty"`
MaxStandbysPerSender *uint16 `json:"maxStandbysPerSender,omitempty"`
MaxStandbyLag *uint32 `json:"maxStandbyLag,omitempty"`
@@ -81,6 +83,8 @@ type ClusterSpecDefaults struct {
DBWaitReadyTimeout *cluster.Duration `json:"dbWaitReadyTimeout"`
FailInterval *cluster.Duration `json:"failInterval"`
DeadKeeperRemovalInterval *cluster.Duration `json:"deadKeeperRemovalInterval"`
+ ProxyCheckInterval *cluster.Duration `json:"proxyCheckInterval"`
+ ProxyTimeout *cluster.Duration `json:"proxyTimeout"`
MaxStandbys *uint16 `json:"maxStandbys"`
MaxStandbysPerSender *uint16 `json:"maxStandbysPerSender"`
MaxStandbyLag *uint32 `json:"maxStandbyLag"`


@@ -12,34 +12,36 @@ Some options in a running cluster specification can be changed to update the desired state.
### Cluster Specification Format.
- | Name | Description | Required | Type | Default |
- |---------------------------|-------------|---------------------------|-------------------|---------|
- | sleepInterval | interval to wait before next check (for every component: keeper, sentinel, proxy). | no | string (duration) | 5s |
- | requestTimeout | time after which any request (keepers checks from sentinel etc...) will fail. | no | string (duration) | 10s |
- | failInterval | interval after the first fail to declare a keeper as not healthy. | no | string (duration) | 20s |
- | deadKeeperRemovalInterval | interval after which a dead keeper will be removed from the cluster data | no | string (duration) | 48h |
- | maxStandbys | max number of standbys. This needs to be greater enough to cover both standby managed by stolon and additional standbys configured by the user. Its value affect different postgres parameters like max_replication_slots and max_wal_senders. Setting this to a number lower than the sum of stolon managed standbys and user managed standbys will have unpredicatable effects due to problems creating replication slots or replication problems due to exhausted wal senders. | no | uint16 | 20 |
- | maxStandbysPerSender | max number of standbys for every sender. A sender can be a master or another standby (with cascading replication). | no | uint16 | 3 |
- | maxStandbyLag | maximum lag (from the last reported master state, in bytes) that an asynchronous standby can have to be elected in place of a failed master. | no | uint32 | 1MiB |
- | synchronousReplication | use synchronous replication between the master and its standbys | no | bool | false |
- | minSynchronousStandbys | minimum number of required synchronous standbys when synchronous replication is enabled (only set this to a value > 1 when using PostgreSQL >= 9.6) | no | uint16 | 1 |
- | maxSynchronousStandbys | maximum number of required synchronous standbys when synchronous replication is enabled (only set this to a value > 1 when using PostgreSQL >= 9.6) | no | uint16 | 1 |
- | additionalWalSenders | number of additional wal_senders in addition to the ones internally defined by stolon, useful to provide enough wal senders for external standbys (changing this value requires an instance restart) | no | uint16 | 5 |
- | additionalMasterReplicationSlots | a list of additional physical replication slots to be created on the master postgres instance. They will be prefixed with `stolon_` (like internal replication slots used for standby replication) to make them "namespaced" from other replication slots. Replication slots starting with `stolon_` and not defined here (and not used for standby replication) will be dropped from the master instance. | no | []string | null |
- | usePgrewind | try to use pg_rewind for faster instance resyncronization. | no | bool | false |
- | initMode | The cluster initialization mode. Can be *new* or *existing*. *new* means that a new db cluster will be created on a random keeper and the other keepers will sync with it. *existing* means that a keeper (that needs to have an already created db cluster) will be choosed as the initial master and the other keepers will sync with it. In this case the `existingConfig` object needs to be populated. | yes | string | |
- | existingConfig | configuration for initMode of type "existing" | if initMode is "existing" | ExistingConfig | |
- | mergePgParameters | merge pgParameters of the initialized db cluster, useful the retain initdb generated parameters when InitMode is new, retain current parameters when initMode is existing or pitr. | no | bool | true |
- | role | cluster role (master or standby) | no | bool | master |
- | defaultSUReplAccessMode | mode for the default hba rules used for replication by standby keepers (the su and repl auth methods will be the one provided in the keeper command line options). Values can be *all* or *strict*. *all* allow access from all ips, *strict* restrict master access to standby servers ips. | no | string | all |
- | newConfig | configuration for initMode of type "new" | if initMode is "new" | NewConfig | |
- | pitrConfig | configuration for initMode of type "pitr" | if initMode is "pitr" | PITRConfig | |
- | standbyConfig | standby config when the cluster is a standby cluster | if role is "standby" | StandbyConfig | |
- | pgParameters | a map containing the postgres server parameters and their values. The parameters value don't have to be quoted and single quotes don't have to be doubled since this is already done by the keeper when writing the postgresql.conf file | no | map[string]string | |
- | pgHBA | a list containing additional pg_hba.conf entries. They will be added to the pg_hba.conf generated by stolon. **NOTE**: these lines aren't validated so if some of them are wrong postgres will refuse to start or, on reload, will log a warning and ignore the updated pg_hba.conf file | no | []string | null. Will use the default behiavior of accepting connections from all hosts for all dbs and users with md5 password authentication |
- | automaticPgRestart | restart postgres automatically after changing the pgParameters that requires restart. Refer `pending_restart` in [pg_settings](https://www.postgresql.org/docs/9.5/static/view-pg-settings.html) | no | bool | false |
- | dbWaitReadyTimeout | Time to wait for the database to become ready after starting. Increase this value if your Postgres takes longer to boot, e.g. because it has to recover a lot of WAL. | no | string (duration) | 60s |
- | syncTimeout | Time to wait for a database recovery (including the replay of WAL files in case of Point-In-Time-Recovery) | no | string (duration) | 0 (no timeout, waits until recovery has finished) |
+ | Name | Description | Required | Type | Default |
+ |----------------------------------|-------------|---------------------------|-------------------|---------|
+ | sleepInterval | interval to wait before next check (for keepers and sentinels). | no | string (duration) | 5s |
+ | requestTimeout | time after which any request to external resources (store, postgres queries etc.) will fail. | no | string (duration) | 10s |
+ | failInterval | interval after the first fail to declare a keeper as not healthy. | no | string (duration) | 20s |
+ | proxyCheckInterval | interval to wait before the next proxy check. | no | string (duration) | 5s |
+ | proxyTimeout | interval within which a proxy check must successfully complete; otherwise the proxy will close all connections to the master. | no | string (duration) | 15s |
+ | deadKeeperRemovalInterval | interval after which a dead keeper will be removed from the cluster data | no | string (duration) | 48h |
+ | maxStandbys | max number of standbys. This needs to be large enough to cover both standbys managed by stolon and additional standbys configured by the user. Its value affects different postgres parameters like max_replication_slots and max_wal_senders. Setting this to a number lower than the sum of stolon managed standbys and user managed standbys will have unpredictable effects due to problems creating replication slots or replication problems due to exhausted wal senders. | no | uint16 | 20 |
+ | maxStandbysPerSender | max number of standbys for every sender. A sender can be a master or another standby (with cascading replication). | no | uint16 | 3 |
+ | maxStandbyLag | maximum lag (from the last reported master state, in bytes) that an asynchronous standby can have to be elected in place of a failed master. | no | uint32 | 1MiB |
+ | synchronousReplication | use synchronous replication between the master and its standbys | no | bool | false |
+ | minSynchronousStandbys | minimum number of required synchronous standbys when synchronous replication is enabled (only set this to a value > 1 when using PostgreSQL >= 9.6) | no | uint16 | 1 |
+ | maxSynchronousStandbys | maximum number of required synchronous standbys when synchronous replication is enabled (only set this to a value > 1 when using PostgreSQL >= 9.6) | no | uint16 | 1 |
+ | additionalWalSenders | number of additional wal_senders in addition to the ones internally defined by stolon, useful to provide enough wal senders for external standbys (changing this value requires an instance restart) | no | uint16 | 5 |
+ | additionalMasterReplicationSlots | a list of additional physical replication slots to be created on the master postgres instance. They will be prefixed with `stolon_` (like internal replication slots used for standby replication) to make them "namespaced" from other replication slots. Replication slots starting with `stolon_` and not defined here (and not used for standby replication) will be dropped from the master instance. | no | []string | null |
+ | usePgrewind | try to use pg_rewind for faster instance resynchronization. | no | bool | false |
+ | initMode | The cluster initialization mode. Can be *new* or *existing*. *new* means that a new db cluster will be created on a random keeper and the other keepers will sync with it. *existing* means that a keeper (that needs to have an already created db cluster) will be chosen as the initial master and the other keepers will sync with it. In this case the `existingConfig` object needs to be populated. | yes | string | |
+ | existingConfig | configuration for initMode of type "existing" | if initMode is "existing" | ExistingConfig | |
+ | mergePgParameters | merge pgParameters of the initialized db cluster, useful to retain initdb generated parameters when InitMode is new, and to retain current parameters when initMode is existing or pitr. | no | bool | true |
+ | role | cluster role (master or standby) | no | string | master |
+ | defaultSUReplAccessMode | mode for the default hba rules used for replication by standby keepers (the su and repl auth methods will be the ones provided in the keeper command line options). Values can be *all* or *strict*. *all* allows access from all ips, *strict* restricts master access to standby server ips. | no | string | all |
+ | newConfig | configuration for initMode of type "new" | if initMode is "new" | NewConfig | |
+ | pitrConfig | configuration for initMode of type "pitr" | if initMode is "pitr" | PITRConfig | |
+ | standbyConfig | standby config when the cluster is a standby cluster | if role is "standby" | StandbyConfig | |
+ | pgParameters | a map containing the postgres server parameters and their values. The parameter values don't have to be quoted and single quotes don't have to be doubled, since this is already done by the keeper when writing the postgresql.conf file | no | map[string]string | |
+ | pgHBA | a list containing additional pg_hba.conf entries. They will be added to the pg_hba.conf generated by stolon. **NOTE**: these lines aren't validated so if some of them are wrong postgres will refuse to start or, on reload, will log a warning and ignore the updated pg_hba.conf file | no | []string | null. Will use the default behaviour of accepting connections from all hosts for all dbs and users with md5 password authentication |
+ | automaticPgRestart | restart postgres automatically after changing the pgParameters that require a restart. Refer to `pending_restart` in [pg_settings](https://www.postgresql.org/docs/9.5/static/view-pg-settings.html) | no | bool | false |
+ | dbWaitReadyTimeout | Time to wait for the database to become ready after starting. Increase this value if your Postgres takes longer to boot, e.g. because it has to recover a lot of WAL. | no | string (duration) | 60s |
+ | syncTimeout | Time to wait for a database recovery (including the replay of WAL files in case of Point-In-Time-Recovery) | no | string (duration) | 0 (no timeout, waits until recovery has finished) |
#### ExistingConfig
@@ -63,41 +65,41 @@ Some options in a running cluster specification can be changed to update the des
|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|-------------------------|---------|
| dataRestoreCommand | defines the command to execute for restoring the db cluster data. %d is replaced with the full path to the db cluster datadir. Use %% to embed an actual % character. Must return a 0 exit code only on success. | yes | string | |
| archiveRecoverySettings | archive recovery configuration | yes | ArchiveRecoverySettings | |
- | recoveryTargetSettings | recovery target configuration | no | RecoveryTargetSettings | |
+ | recoveryTargetSettings | recovery target configuration | no | RecoveryTargetSettings | |
#### StandbyConfig
- | Name | Description | Required | Type | Default |
- |-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|-------------------------|---------|
- | standbySettings | standby configuration | no | StandbySettings | |
- | archiveRecoverySettings | archive recovery configuration | no | ArchiveRecoverySettings | |
+ | Name | Description | Required | Type | Default |
+ |-------------------------|--------------------------------|----------|-------------------------|---------|
+ | standbySettings | standby configuration | no | StandbySettings | |
+ | archiveRecoverySettings | archive recovery configuration | no | ArchiveRecoverySettings | |
#### ArchiveRecoverySettings
- | Name | Description | Required | Type | Default |
- |-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|-------------------------|---------|
- | restoreCommand | defines the command to execute for restoring the archives. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/archive-recovery-settings.html) | yes | string | |
+ | Name | Description | Required | Type | Default |
+ |----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|--------|---------|
+ | restoreCommand | defines the command to execute for restoring the archives. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/archive-recovery-settings.html) | yes | string | |
#### RecoveryTargetSettings
These parameters are the same as defined in [postgresql recovery target settings doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html)
- | Name | Description | Required | Type | Default |
- |-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------|----------|-------------------------|---------|
- | recoveryTarget | See `recovery_target` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
- | recoveryTargetLsn | See `recovery_target_lsn` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
- | recoveryTargetName | See `recovery_target_name` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
- | recoveryTargetTime | See `recovery_target_time` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
- | recoveryTargetXid | See `recovery_target_xid` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
- | recoveryTargetTimeline | See `recovery_target_timeline` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
+ | Name | Description | Required | Type | Default |
+ |------------------------|----------------------------------------------------------------------------------------------------------------------------------------------|----------|--------|---------|
+ | recoveryTarget | See `recovery_target` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
+ | recoveryTargetLsn | See `recovery_target_lsn` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
+ | recoveryTargetName | See `recovery_target_name` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
+ | recoveryTargetTime | See `recovery_target_time` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
+ | recoveryTargetXid | See `recovery_target_xid` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
+ | recoveryTargetTimeline | See `recovery_target_timeline` in the related [postgresql doc](https://www.postgresql.org/docs/current/static/recovery-target-settings.html) | no | string | |
#### StandbySettings
- | Name | Description | Required | Type | Default |
- |-------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|-------------------------|---------|
- | primaryConnInfo | connection string to connect to the primary server (its value will be placed in the `primary_conninfo` parameter of the instance `recovery.conf` file. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | yes | string | |
- | primarySlotName | optional replication slot to use (its value will be placed in the `primary_slot_name` parameter of the instance `recovery.conf` file. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | no | string | |
- | recoveryMinApplyDelay | delay recovery for a fixed period of time (its value will be placed in the `recovery_min_apply_delay` parameter of the instance `recovery.conf` file. See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | no | string | |
+ | Name | Description | Required | Type | Default |
+ |-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|--------|---------|
+ | primaryConnInfo | connection string to connect to the primary server (its value will be placed in the `primary_conninfo` parameter of the instance `recovery.conf` file). See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | yes | string | |
+ | primarySlotName | optional replication slot to use (its value will be placed in the `primary_slot_name` parameter of the instance `recovery.conf` file). See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | no | string | |
+ | recoveryMinApplyDelay | delay recovery for a fixed period of time (its value will be placed in the `recovery_min_apply_delay` parameter of the instance `recovery.conf` file). See the related [postgresql doc](https://www.postgresql.org/docs/current/static/standby-settings.html) | no | string | |
#### Special Types
duration types (as described in https://golang.org/pkg/time/#ParseDuration) are signed sequence of decimal numbers, each with optional fraction and a unit suffix, such as "300ms", "-1.5h" or "2h45m". Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h".
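
With these defaults in place, the two new options can be changed at runtime by patching the cluster specification, e.g. with stolonctl update --patch. An illustrative patch (the values are examples only):

{
	"proxyCheckInterval": "10s",
	"proxyTimeout": "30s"
}

Note that the validation added by this commit requires proxyCheckInterval to be strictly less than proxyTimeout, so a patch that inverts that ordering will be rejected.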


@@ -43,11 +43,7 @@ const (
)
const (
- DefaultStoreTimeout = 5 * time.Second
- DefaultProxyCheckInterval = 5 * time.Second
- DefaultProxyTimeoutInterval = 15 * time.Second
- DefaultDBWaitReadyTimeout = 60 * time.Second
+ DefaultStoreTimeout = 5 * time.Second
DefaultDBNotIncreasingXLogPosTimes = 10
@@ -56,8 +52,11 @@ const (
DefaultConvergenceTimeout = 30 * time.Second
DefaultInitTimeout = 5 * time.Minute
DefaultSyncTimeout = 0
+ DefaultDBWaitReadyTimeout = 60 * time.Second
DefaultFailInterval = 20 * time.Second
DefaultDeadKeeperRemovalInterval = 48 * time.Hour
+ DefaultProxyCheckInterval = 5 * time.Second
+ DefaultProxyTimeout = 15 * time.Second
DefaultMaxStandbys uint16 = 20
DefaultMaxStandbysPerSender uint16 = 3
DefaultMaxStandbyLag = 1024 * 1204
@@ -228,6 +227,10 @@ type ClusterSpec struct {
FailInterval *Duration `json:"failInterval,omitempty"`
// Interval after which a dead keeper will be removed from the cluster data
DeadKeeperRemovalInterval *Duration `json:"deadKeeperRemovalInterval,omitempty"`
+ // Interval to wait before the next proxy check
+ ProxyCheckInterval *Duration `json:"proxyCheckInterval,omitempty"`
+ // Interval within which a proxy check must successfully complete
+ ProxyTimeout *Duration `json:"proxyTimeout,omitempty"`
// Max number of standbys. This needs to be greater enough to cover both
// standby managed by stolon and additional standbys configured by the
// user. Its value affect different postgres parameters like
@@ -364,6 +367,12 @@ func (os *ClusterSpec) WithDefaults() *ClusterSpec {
if s.DeadKeeperRemovalInterval == nil {
s.DeadKeeperRemovalInterval = &Duration{Duration: DefaultDeadKeeperRemovalInterval}
}
+ if s.ProxyCheckInterval == nil {
+ s.ProxyCheckInterval = &Duration{Duration: DefaultProxyCheckInterval}
+ }
+ if s.ProxyTimeout == nil {
+ s.ProxyTimeout = &Duration{Duration: DefaultProxyTimeout}
+ }
if s.MaxStandbys == nil {
s.MaxStandbys = Uint16P(DefaultMaxStandbys)
}
@@ -426,11 +435,20 @@ func (os *ClusterSpec) Validate() error {
if s.DBWaitReadyTimeout.Duration < 0 {
return fmt.Errorf("dbWaitReadyTimeout must be positive")
}
+ if s.FailInterval.Duration < 0 {
+ return fmt.Errorf("failInterval must be positive")
+ }
if s.DeadKeeperRemovalInterval.Duration < 0 {
return fmt.Errorf("deadKeeperRemovalInterval must be positive")
}
- if s.FailInterval.Duration < 0 {
- return fmt.Errorf("failInterval must be positive")
- }
+ if s.ProxyCheckInterval.Duration < 0 {
+ return fmt.Errorf("proxyCheckInterval must be positive")
+ }
+ if s.ProxyTimeout.Duration < 0 {
+ return fmt.Errorf("proxyTimeout must be positive")
+ }
+ if s.ProxyCheckInterval.Duration >= s.ProxyTimeout.Duration {
+ return fmt.Errorf("proxyCheckInterval should be less than proxyTimeout")
+ }
if *s.MaxStandbys < 1 {
return fmt.Errorf("maxStandbys must be at least 1")


@@ -16,6 +16,7 @@ package cluster
import (
"reflect"
"time"
"github.com/sorintlab/stolon/internal/common"
@@ -133,6 +134,12 @@ type ProxyInfo struct {
UID string
Generation int64
+ // ProxyTimeout is the current proxyTimeout used by the proxy
+ // at the time of publishing its state.
+ // It's used by the sentinel to know for how long the proxy
+ // should be considered active.
+ ProxyTimeout time.Duration
}
type ProxiesInfo map[string]*ProxyInfo


@@ -151,7 +151,7 @@ func TestProxyListening(t *testing.T) {
}
// tp should not listen because it cannot talk with the store
- if err := tp.WaitNotListening(cluster.DefaultProxyTimeoutInterval * 2); err != nil {
+ if err := tp.WaitNotListening(cluster.DefaultProxyTimeout * 2); err != nil {
t.Fatalf("expecting tp not listening due to failed store communication, but it's listening.")
}
@@ -174,8 +174,8 @@ func TestProxyListening(t *testing.T) {
if err := tstore.WaitDown(10 * time.Second); err != nil {
t.Fatalf("error waiting on store down: %v", err)
}
- // wait less than DefaultProxyTimeoutInterval
- time.Sleep(cluster.DefaultProxyTimeoutInterval / 3)
+ // wait less than DefaultProxyTimeout
+ time.Sleep(cluster.DefaultProxyTimeout / 3)
// Start store
if err := tstore.Start(); err != nil {
t.Fatalf("unexpected err: %v", err)
@@ -239,7 +239,7 @@ func TestProxyListening(t *testing.T) {
}
// tp should not listen because it cannot talk with the store
- if err := tp.WaitNotListening(cluster.DefaultProxyTimeoutInterval * 2); err != nil {
+ if err := tp.WaitNotListening(cluster.DefaultProxyTimeout * 2); err != nil {
t.Fatalf("expecting tp not listening due to failed store communication, but it's listening.")
}


@@ -152,7 +152,7 @@ func TestSentinelEnabledProxies(t *testing.T) {
t.Fatalf("unexpected err: %v", err)
}
- if err := WaitClusterDataEnabledProxiesNum(sm, 1, 3*cluster.DefaultProxyTimeoutInterval); err != nil {
+ if err := WaitClusterDataEnabledProxiesNum(sm, 1, 3*cluster.DefaultProxyTimeout); err != nil {
t.Fatalf("unexpected err: %v", err)
}