skia2/infra/bots/task_drivers/canary/canary.go
Ravi Mistry 8e4a44fb92 Add Retries to Canaries
Bug: skia:10563
Bug: skia:10639
Change-Id: I0b500e4996f8edc0328c1d63d50ac876c231734c
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/313079
Commit-Queue: Ravi Mistry <rmistry@google.com>
Reviewed-by: Eric Boren <borenet@google.com>
2020-08-26 15:04:16 +00:00

175 lines
5.7 KiB
Go

// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package main
import (
"context"
"errors"
"flag"
"fmt"
"time"
"cloud.google.com/go/datastore"
"go.skia.org/infra/autoroll/go/manual"
"go.skia.org/infra/go/auth"
"go.skia.org/infra/go/firestore"
"go.skia.org/infra/go/skerr"
"go.skia.org/infra/task_driver/go/lib/auth_steps"
"go.skia.org/infra/task_driver/go/lib/checkout"
"go.skia.org/infra/task_driver/go/td"
)
const (
MinWaitDuration = 4 * time.Minute
)
var (
canaryRollNotCreatedErr = errors.New("Canary roll could not be created. Ask the trooper to investigate (or directly ping rmistry@) if this happens consistently.")
canaryRollSuccessTooQuicklyErr = fmt.Errorf("Canary roll returned success in less than %s. Failing canary due to skbug.com/10563.", MinWaitDuration)
// Lets add the roll link only once to step data.
addedRollLinkStepData = false
)
func main() {
var (
projectId = flag.String("project_id", "", "ID of the Google Cloud project.")
taskId = flag.String("task_id", "", "ID of this task.")
taskName = flag.String("task_name", "", "Name of the task.")
output = flag.String("o", "", "If provided, dump a JSON blob of step data to the given file. Prints to stdout if '-' is given.")
local = flag.Bool("local", true, "True if running locally (as opposed to on the bots)")
checkoutFlags = checkout.SetupFlags(nil)
rollerName = flag.String("roller_name", "", "The roller we will use to create the canary with.")
)
ctx := td.StartRun(projectId, taskId, taskName, output, local)
defer td.EndRun(ctx)
if *rollerName == "" {
td.Fatalf(ctx, "--roller_name must be specified")
}
rs, err := checkout.GetRepoState(checkoutFlags)
if err != nil {
td.Fatal(ctx, skerr.Wrap(err))
}
if rs.Issue == "" || rs.Patchset == "" {
td.Fatalf(ctx, "This task driver should be run only as a try bot")
}
// Create token source with scope for datastore access.
ts, err := auth_steps.Init(ctx, *local, auth.SCOPE_USERINFO_EMAIL, datastore.ScopeDatastore)
if err != nil {
td.Fatal(ctx, skerr.Wrap(err))
}
// Add documentation link for canary rolls.
td.StepText(ctx, "Canary roll doc", "https://goto.google.com/autoroller-canary-bots")
// Instantiate firestore DB.
manualRollDB, err := manual.NewDBWithParams(ctx, firestore.FIRESTORE_PROJECT, "production", ts)
if err != nil {
td.Fatal(ctx, skerr.Wrap(err))
}
// Retry if canary roll could not be created or if canary roll returned
// success too quickly (skbug.com/10563).
retryAttempts := 3
for retry := 0; ; retry++ {
retryText := ""
if retry > 0 {
retryText = fmt.Sprintf(" (Retry #%d)", retry)
}
req := manual.ManualRollRequest{
Requester: *rollerName,
RollerName: *rollerName,
Status: manual.STATUS_PENDING,
Timestamp: firestore.FixTimestamp(time.Now()),
Revision: rs.GetPatchRef(),
DryRun: true,
NoEmail: true,
NoResolveRevision: true,
}
if err := td.Do(ctx, td.Props(fmt.Sprintf("Trigger canary roll%s", retryText)).Infra(), func(ctx context.Context) error {
return manualRollDB.Put(&req)
}); err != nil {
// Immediately fail for errors in triggering.
td.Fatal(ctx, skerr.Wrap(err))
}
if err := waitForCanaryRoll(ctx, manualRollDB, req.Id, fmt.Sprintf("Wait for canary roll%s", retryText)); err != nil {
// Retry these errors.
if err == canaryRollNotCreatedErr || err == canaryRollSuccessTooQuicklyErr {
if retry >= (retryAttempts - 1) {
td.Fatal(ctx, skerr.Wrapf(err, "failed inspite of 3 retries"))
}
time.Sleep(time.Minute)
continue
}
// Immediately fail for all other errors.
td.Fatal(ctx, skerr.Wrap(err))
} else {
// The canary roll was successful, break out of the
// retry loop.
break
}
}
}
func waitForCanaryRoll(parentCtx context.Context, manualRollDB manual.DB, rollId, stepName string) error {
ctx := td.StartStep(parentCtx, td.Props(stepName))
defer td.EndStep(ctx)
startTime := time.Now()
// For writing to the step's log stream.
stdout := td.NewLogStream(ctx, "stdout", td.Info)
for {
roll, err := manualRollDB.Get(ctx, rollId)
if err != nil {
return td.FailStep(ctx, fmt.Errorf("Could not find canary roll with ID: %s", rollId))
}
cl := roll.Url
var rollStatus string
if cl == "" {
rollStatus = fmt.Sprintf("Canary roll has status %s", roll.Status)
} else {
if !addedRollLinkStepData {
// Add the roll link to both the current step and it's parent.
td.StepText(ctx, "Canary roll CL", cl)
td.StepText(parentCtx, "Canary roll CL", cl)
addedRollLinkStepData = true
}
rollStatus = fmt.Sprintf("Canary roll [ %s ] has status %s", roll.Url, roll.Status)
}
if _, err := stdout.Write([]byte(rollStatus)); err != nil {
return td.FailStep(ctx, fmt.Errorf("Could not write to stdout: %s", err))
}
if roll.Status == manual.STATUS_COMPLETE {
if roll.Result == manual.RESULT_SUCCESS {
// This is a hopefully temperory workaround for skbug.com/10563. Sometimes
// Canary-Chromium returns success immediately after creating a change and
// before the tryjobs have a chance to run. If we have waited
// for < MinWaitDuration then be cautious and assume failure.
if time.Now().Before(startTime.Add(MinWaitDuration)) {
return td.FailStep(ctx, canaryRollSuccessTooQuicklyErr)
}
return nil
} else if roll.Result == manual.RESULT_FAILURE {
if cl == "" {
return td.FailStep(ctx, canaryRollNotCreatedErr)
}
return td.FailStep(ctx, fmt.Errorf("Canary roll [ %s ] failed", cl))
} else if roll.Result == manual.RESULT_UNKNOWN {
return td.FailStep(ctx, fmt.Errorf("Canary roll [ %s ] completed with an unknown result", cl))
}
}
time.Sleep(30 * time.Second)
}
}