Merge pull request #16289 from hashicorp/jbardin/consul-lock

some consul lock improvements
This commit is contained in:
James Bardin 2017-10-17 15:32:26 -04:00 committed by GitHub
commit 344fa54638
4 changed files with 136 additions and 29 deletions

View File

@ -2,7 +2,9 @@ package consul
import (
"context"
"net"
"strings"
"time"
consulapi "github.com/hashicorp/consul/api"
"github.com/hashicorp/terraform/backend"
@ -100,6 +102,7 @@ type Backend struct {
*schema.Backend
// The fields below are set from configure
client *consulapi.Client
configData *schema.ResourceData
lock bool
}
@ -111,16 +114,14 @@ func (b *Backend) configure(ctx context.Context) error {
// Store the lock information
b.lock = b.configData.Get("lock").(bool)
// Initialize a client to test config
_, err := b.clientRaw()
return err
}
func (b *Backend) clientRaw() (*consulapi.Client, error) {
data := b.configData
// Configure the client
config := consulapi.DefaultConfig()
// replace the default Transport Dialer to reduce the KeepAlive
config.Transport.DialContext = dialContext
if v, ok := data.GetOk("access_token"); ok && v.(string) != "" {
config.Token = v.(string)
}
@ -162,5 +163,18 @@ func (b *Backend) clientRaw() (*consulapi.Client, error) {
}
}
return consulapi.NewClient(config)
client, err := consulapi.NewClient(config)
if err != nil {
return err
}
b.client = client
return nil
}
// dialContext is the DialContext function for the consul client transport.
// This is stored in a package var to inject a different dialer for tests.
var dialContext = (&net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 17 * time.Second,
}).DialContext

View File

@ -15,15 +15,9 @@ const (
)
func (b *Backend) States() ([]string, error) {
// Get the Consul client
client, err := b.clientRaw()
if err != nil {
return nil, err
}
// List our raw path
prefix := b.configData.Get("path").(string) + keyEnvPrefix
keys, _, err := client.KV().Keys(prefix, "/", nil)
keys, _, err := b.client.KV().Keys(prefix, "/", nil)
if err != nil {
return nil, err
}
@ -60,28 +54,16 @@ func (b *Backend) DeleteState(name string) error {
return fmt.Errorf("can't delete default state")
}
// Get the Consul API client
client, err := b.clientRaw()
if err != nil {
return err
}
// Determine the path of the data
path := b.path(name)
// Delete it. We just delete it without any locking since
// the DeleteState API is documented as such.
_, err = client.KV().Delete(path, nil)
_, err := b.client.KV().Delete(path, nil)
return err
}
func (b *Backend) State(name string) (state.State, error) {
// Get the Consul API client
client, err := b.clientRaw()
if err != nil {
return nil, err
}
// Determine the path of the data
path := b.path(name)
@ -91,7 +73,7 @@ func (b *Backend) State(name string) (state.State, error) {
// Build the state client
var stateMgr state.State = &remote.State{
Client: &RemoteClient{
Client: client,
Client: b.client,
Path: path,
GZip: gzip,
lockState: b.lock,
@ -103,6 +85,11 @@ func (b *Backend) State(name string) (state.State, error) {
stateMgr = &state.LockDisabled{Inner: stateMgr}
}
// the default state always exists
if name == backend.DefaultStateName {
return stateMgr, nil
}
// Grab a lock, we use this to write an empty state if one doesn't
// exist already. We have to write an empty state as a sentinel value
// so States() knows it exists.

View File

@ -32,6 +32,8 @@ const (
lockReacquireInterval = 2 * time.Second
)
var lostLockErr = errors.New("consul lock was lost")
// RemoteClient is a remote client that stores data in Consul.
type RemoteClient struct {
Client *consulapi.Client
@ -228,6 +230,9 @@ func (c *RemoteClient) lock() (string, error) {
return "", err
}
// store the session ID for correlation with consul logs
c.info.Info = "consul session: " + lockSession
opts := &consulapi.LockOptions{
Key: c.Path + lockSuffix,
Session: lockSession,
@ -406,7 +411,7 @@ func (c *RemoteClient) unlock(id string) error {
select {
case <-c.lockCh:
return errors.New("consul lock was lost")
return lostLockErr
default:
}

View File

@ -1,7 +1,10 @@
package consul
import (
"context"
"fmt"
"net"
"sync"
"testing"
"time"
@ -173,6 +176,7 @@ func TestConsul_lostLock(t *testing.T) {
reLocked := make(chan struct{})
testLockHook = func() {
close(reLocked)
testLockHook = nil
}
// now we use the second client to break the lock
@ -188,3 +192,100 @@ func TestConsul_lostLock(t *testing.T) {
t.Fatal(err)
}
}
func TestConsul_lostLockConnection(t *testing.T) {
srv := newConsulTestServer(t)
defer srv.Stop()
// create an "unreliable" network by closing all the consul client's
// network connections
conns := &unreliableConns{}
origDialFn := dialContext
defer func() {
dialContext = origDialFn
}()
dialContext = conns.DialContext
path := fmt.Sprintf("tf-unit/%s", time.Now().String())
b := backend.TestBackendConfig(t, New(), map[string]interface{}{
"address": srv.HTTPAddr,
"path": path,
})
s, err := b.State(backend.DefaultStateName)
if err != nil {
t.Fatal(err)
}
info := state.NewLockInfo()
info.Operation = "test-lost-lock-connection"
id, err := s.Lock(info)
if err != nil {
t.Fatal(err)
}
// set a callback to know when the monitor loop re-connects
dialed := make(chan struct{})
conns.dialCallback = func() {
close(dialed)
conns.dialCallback = nil
}
// kill any open connections
// once the consul client is fixed, we should loop over this a few time to
// be sure, since we can't hook into the client's internal lock monitor
// loop.
conns.Kill()
// wait for a new connection to be dialed, and kill it again
<-dialed
conns.Kill()
// since the lock monitor loop is hidden in the consul api client, we can
// only wait a bit to make sure we were notified of the failure
time.Sleep(time.Second)
// once the consul client can reconnect properly, there will no longer be
// an error here
//if err := s.Unlock(id); err != nil {
if err := s.Unlock(id); err != lostLockErr {
t.Fatalf("expected lost lock error, got %v", err)
}
}
type unreliableConns struct {
sync.Mutex
conns []net.Conn
dialCallback func()
}
func (u *unreliableConns) DialContext(ctx context.Context, netw, addr string) (net.Conn, error) {
u.Lock()
defer u.Unlock()
dialer := &net.Dialer{}
conn, err := dialer.DialContext(ctx, netw, addr)
if err != nil {
return nil, err
}
u.conns = append(u.conns, conn)
if u.dialCallback != nil {
u.dialCallback()
}
return conn, nil
}
// Kill these with a deadline, just to make sure we don't end up with any EOFs
// that get ignored.
func (u *unreliableConns) Kill() {
u.Lock()
defer u.Unlock()
for _, conn := range u.conns {
conn.(*net.TCPConn).SetDeadline(time.Now())
}
u.conns = nil
}