Fix logic race with Context.watchStop

Always wait for watchStop to return during Context.walk.

Context.walk would often return immediately after sending the close
signal to watchStop, which in turn ran the deferred releaseRun,
canceling the runContext.

Without any synchronization point after the select statement in
watchStop, that goroutine was not guaranteed to be scheduled
immediately, and in fact it often didn't continue until after the
runContext had been canceled. That left the select statement with
multiple ready cases, and roughly half the time it would choose to Stop
the providers.
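
The random choice is a direct consequence of Go's select semantics: when
more than one case can proceed, one is chosen pseudo-randomly. Below is a
minimal standalone sketch (illustrative only, not part of this change) of
the state the old watchStop could observe once the walk had finished:

package main

import "fmt"

func main() {
	// Both channels are already closed, mirroring the moment when the
	// walk has finished (doneCh closed) and the deferred releaseRun has
	// also canceled the runContext (stopCh closed).
	doneCh := make(chan struct{})
	stopCh := make(chan struct{})
	close(doneCh)
	close(stopCh)

	counts := map[string]int{}
	for i := 0; i < 1000; i++ {
		select {
		case <-stopCh:
			counts["stop"]++ // the old watchStop would call provider Stop() here
		case <-doneCh:
			counts["done"]++ // the old watchStop would simply return here
		}
	}

	// Prints roughly a 50/50 split between the two cases.
	fmt.Println(counts)
}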

Stopping the providers after the walk caused no immediate failures, but
if another walk was performed, the provider StopContext was no longer
valid and could produce cancellation errors in the provider.
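
To illustrate why a stale StopContext matters, here is a rough sketch
with hypothetical names (stubProvider and Apply are stand-ins, not the
actual provider code): once canceled, a context stays canceled, so any
later operation that consults it fails immediately.

package main

import (
	"context"
	"fmt"
)

// stubProvider stands in for a provider that keeps a long-lived stop
// context; the type and its methods are illustrative only.
type stubProvider struct {
	stopCtx    context.Context
	stopCancel context.CancelFunc
}

// Stop permanently cancels the provider's stop context.
func (p *stubProvider) Stop() {
	p.stopCancel()
}

// Apply stands in for any later operation that honors the stop context.
func (p *stubProvider) Apply() error {
	if err := p.stopCtx.Err(); err != nil {
		return fmt.Errorf("cancellation error: %v", err)
	}
	return nil
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	p := &stubProvider{stopCtx: ctx, stopCancel: cancel}

	// The race could Stop providers even though the walk had completed...
	p.Stop()

	// ...so a later walk's operations see a permanently canceled context.
	fmt.Println(p.Apply()) // cancellation error: context canceled
}
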
James Bardin 2017-03-08 14:43:31 -05:00
parent 4b2e96b2e2
commit 38d2a8f6ac
1 changed file with 65 additions and 44 deletions

@@ -775,15 +775,14 @@ func (c *Context) walk(
 	}
 
 	// Watch for a stop so we can call the provider Stop() API.
-	doneCh := make(chan struct{})
-	stopCh := c.runContext.Done()
-	go c.watchStop(walker, doneCh, stopCh)
+	watchStop, watchWait := c.watchStop(walker)
 
 	// Walk the real graph, this will block until it completes
 	realErr := graph.Walk(walker)
 
-	// Close the done channel so the watcher stops
-	close(doneCh)
+	// Close the channel so the watcher stops, and wait for it to return.
+	close(watchStop)
+	<-watchWait
 
 	// If we have a shadow graph and we interrupted the real graph, then
 	// we just close the shadow and never verify it. It is non-trivial to
@@ -872,52 +871,74 @@ func (c *Context) walk(
 	return walker, realErr
 }
 
-func (c *Context) watchStop(walker *ContextGraphWalker, doneCh, stopCh <-chan struct{}) {
-	// Wait for a stop or completion
-	select {
-	case <-stopCh:
-		// Stop was triggered. Fall out of the select
-	case <-doneCh:
-		// Done, just exit completely
-		return
-	}
+// watchStop immediately returns a `stop` and a `wait` chan after dispatching
+// the watchStop goroutine. This will watch the runContext for cancellation and
+// stop the providers accordingly. When the watch is no longer needed, the
+// `stop` chan should be closed before waiting on the `wait` chan.
+// The `wait` chan is important, because without synchronizing with the end of
+// the watchStop goroutine, the runContext may also be closed during the select
+// incorrectly causing providers to be stopped. Even if the graph walk is done
+// at that point, stopping a provider permanently cancels its StopContext which
+// can cause later actions to fail.
+func (c *Context) watchStop(walker *ContextGraphWalker) (chan struct{}, <-chan struct{}) {
+	stop := make(chan struct{})
+	wait := make(chan struct{})
+
+	// get the runContext cancellation channel now, because releaseRun will
+	// write to the runContext field.
+	done := c.runContext.Done()
+
+	go func() {
+		defer close(wait)
+		// Wait for a stop or completion
+		select {
+		case <-done:
+			// done means the context was canceled, so we need to try and stop
+			// providers.
+		case <-stop:
+			// our own stop channel was closed.
+			return
+		}
 
-	// If we're here, we're stopped, trigger the call.
+		// If we're here, we're stopped, trigger the call.
 
-	{
-		// Copy the providers so that a misbehaved blocking Stop doesn't
-		// completely hang Terraform.
-		walker.providerLock.Lock()
-		ps := make([]ResourceProvider, 0, len(walker.providerCache))
-		for _, p := range walker.providerCache {
-			ps = append(ps, p)
-		}
-		defer walker.providerLock.Unlock()
+		{
+			// Copy the providers so that a misbehaved blocking Stop doesn't
+			// completely hang Terraform.
+			walker.providerLock.Lock()
+			ps := make([]ResourceProvider, 0, len(walker.providerCache))
+			for _, p := range walker.providerCache {
+				ps = append(ps, p)
+			}
+			defer walker.providerLock.Unlock()
 
-		for _, p := range ps {
-			// We ignore the error for now since there isn't any reasonable
-			// action to take if there is an error here, since the stop is still
-			// advisory: Terraform will exit once the graph node completes.
-			p.Stop()
-		}
-	}
+			for _, p := range ps {
+				// We ignore the error for now since there isn't any reasonable
+				// action to take if there is an error here, since the stop is still
+				// advisory: Terraform will exit once the graph node completes.
+				p.Stop()
+			}
+		}
 
-	{
-		// Call stop on all the provisioners
-		walker.provisionerLock.Lock()
-		ps := make([]ResourceProvisioner, 0, len(walker.provisionerCache))
-		for _, p := range walker.provisionerCache {
-			ps = append(ps, p)
-		}
-		defer walker.provisionerLock.Unlock()
+		{
+			// Call stop on all the provisioners
+			walker.provisionerLock.Lock()
+			ps := make([]ResourceProvisioner, 0, len(walker.provisionerCache))
+			for _, p := range walker.provisionerCache {
+				ps = append(ps, p)
+			}
+			defer walker.provisionerLock.Unlock()
 
-		for _, p := range ps {
-			// We ignore the error for now since there isn't any reasonable
-			// action to take if there is an error here, since the stop is still
-			// advisory: Terraform will exit once the graph node completes.
-			p.Stop()
-		}
-	}
+			for _, p := range ps {
+				// We ignore the error for now since there isn't any reasonable
+				// action to take if there is an error here, since the stop is still
+				// advisory: Terraform will exit once the graph node completes.
+				p.Stop()
+			}
+		}
+	}()
+
+	return stop, wait
 }
 
 // parseVariableAsHCL parses the value of a single variable as would have been specified