From db6ad08f5e8fb28bf1aa1196d2ad25ed1f83e448 Mon Sep 17 00:00:00 2001 From: Tom Limoncelli Date: Wed, 18 Nov 2020 09:06:45 -0500 Subject: [PATCH] GCLOUD: Retry on ratelimit (#946) * Add exponential delay retry on 429, 503; and single retry on 404s. --- integrationTest/integration_test.go | 2 +- providers/gcloud/gcloudProvider.go | 95 +++++++++++++++++++++++++++-- 2 files changed, 90 insertions(+), 7 deletions(-) diff --git a/integrationTest/integration_test.go b/integrationTest/integration_test.go index 968d87a2d..1be315426 100644 --- a/integrationTest/integration_test.go +++ b/integrationTest/integration_test.go @@ -756,7 +756,7 @@ func makeTests(t *testing.T) []*TestGroup { testgroup("pager601", // AWS: https://github.com/StackExchange/dnscontrol/issues/493 //only("AZURE_DNS", "HEXONET", "ROUTE53"), - only("HEXONET", "ROUTE53"), // AZURE_DNS is failing. + only("HEXONET", "ROUTE53", "GCLOUD"), // AZURE_DNS is failing. tc("601 records", manyA("rec%04d", "1.2.3.4", 600)...), tc("Update 601 records", manyA("rec%04d", "1.2.3.5", 600)...), ), diff --git a/providers/gcloud/gcloudProvider.go b/providers/gcloud/gcloudProvider.go index 535368e3b..fdf216cb3 100644 --- a/providers/gcloud/gcloudProvider.go +++ b/providers/gcloud/gcloudProvider.go @@ -4,14 +4,18 @@ import ( "context" "encoding/json" "fmt" + "log" "strings" + "time" + + "google.golang.org/api/googleapi" gauth "golang.org/x/oauth2/google" - gdns "google.golang.org/api/dns/v1" "github.com/StackExchange/dnscontrol/v3/models" "github.com/StackExchange/dnscontrol/v3/pkg/diff" "github.com/StackExchange/dnscontrol/v3/providers" + gdns "google.golang.org/api/dns/v1" ) var features = providers.DocumentationNotes{ @@ -173,11 +177,11 @@ func (g *gcloudProvider) getZoneSets(domain string) (models.Records, map[key]*gd func (g *gcloudProvider) GetDomainCorrections(dc *models.DomainConfig) ([]*models.Correction, error) { if err := dc.Punycode(); err != nil { - return nil, err + return nil, fmt.Errorf("punycode error: %w", err) } existingRecords, oldRRs, zoneName, err := g.getZoneSets(dc.Name) if err != nil { - return nil, err + return nil, fmt.Errorf("getzonesets error: %w", err) } // Normalize @@ -187,7 +191,7 @@ func (g *gcloudProvider) GetDomainCorrections(dc *models.DomainConfig) ([]*model differ := diff.New(dc) _, create, delete, modify, err := differ.IncrementalDiff(existingRecords) if err != nil { - return nil, err + return nil, fmt.Errorf("incdiff error: %w", err) } changedKeys := map[key]bool{} @@ -230,10 +234,26 @@ func (g *gcloudProvider) GetDomainCorrections(dc *models.DomainConfig) ([]*model } } + // FIXME(tlim): Google will return an error if too many changes are + // specified in a single request. We should split up very large + // batches. This can be reliably reproduced with the 1201 + // integration test. The error you get is: + // googleapi: Error 403: The change would exceed quota for additions per change., quotaExceeded + //log.Printf("PAUSE STT = %+v %v\n", err, resp) + //log.Printf("PAUSE ERR = %+v %v\n", err, resp) + runChange := func() error { - _, err := g.client.Changes.Create(g.project, zoneName, chg).Do() - return err + retry: + resp, err := g.client.Changes.Create(g.project, zoneName, chg).Do() + if retryNeeded(resp, err) { + goto retry + } + if err != nil { + return fmt.Errorf("runChange error: %w", err) + } + return nil } + return []*models.Correction{{ Msg: desc, F: runChange, @@ -310,3 +330,66 @@ func (g *gcloudProvider) EnsureDomainExists(domain string) error { _, err = g.client.ManagedZones.Create(g.project, mz).Do() return err } + +const initialBackoff = time.Second * 10 // First delay duration +const maxBackoff = time.Minute * 3 // Maximum backoff delay + +// backoff is the amount of time to sleep if a 429 or 504 is received. +// It is doubled after each use. +var backoff = initialBackoff +var backoff404 = false // Set if the last call requested a retry of a 404 + +func retryNeeded(resp *gdns.Change, err error) bool { + if err != nil { + return false // Not an error. + } + serr, ok := err.(*googleapi.Error) + if !ok { + return false // Not a google error. + } + if serr.Code == 200 { + backoff = initialBackoff // Reset + return false // Success! No need to retry. + } + + if serr.Code == 404 { + // serr.Code == 404 happens occasionally when GCLOUD hasn't + // finished updating the database yet. We pause and retry + // exactly once. There should be a better way to do this, such as + // a callback that would tell us a transaction is complete. + if backoff404 { + backoff404 = false + return false // Give up. We've done this already. + } + log.Printf("Special 404 pause-and-retry for GCLOUD: Pausing %s\n", backoff) + time.Sleep(backoff) + backoff404 = true + return true // Request a retry. + } + backoff404 = false + + if serr.Code != 429 && serr.Code != 503 { + return false // Not an error that permits retrying. + } + + // TODO(tlim): In theory, resp.Header has a header that says how + // long to wait but I haven't been able to capture that header in + // the wild. If you get these "RUNCHANGE HEAD" messages, please + // file a bug with the contents! + + if resp != nil { + log.Printf("NOTE: If you see this message, please file a bug with the output below:\n") + log.Printf("RUNCHANGE CODE = %+v\n", resp.HTTPStatusCode) + log.Printf("RUNCHANGE HEAD = %+v\n", resp.Header) + } + + // a simple exponential back-off + log.Printf("Pausing due to ratelimit: %v seconds\n", backoff) + time.Sleep(backoff) + backoff = backoff + (backoff / 2) + if backoff > maxBackoff { + backoff = maxBackoff + } + + return true // Request the API call be re-tried. +}