Add new g3_canary_infra_failure metric

Use Prometheus's Pushgateway, since the metric will be reported from a task driver (a short-lived service-level job).

Updated go.mod/go.sum to get pushgateway libraries with:
$ go get go.skia.org/infra@2dce552b7a
$ go mod download
$ make -C infra/bots train

Bug: skia:12521
Change-Id: Ic63487b08d06163ef92556999c4920e5f178e285
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/487227
Reviewed-by: Joe Gregorio <jcgregorio@google.com>
Commit-Queue: Ravi Mistry <rmistry@google.com>
This commit is contained in:
Ravi Mistry 2021-12-21 13:52:09 -05:00 committed by SkCQ
parent 4c4cf43db4
commit bd7cb72770
4 changed files with 492 additions and 9 deletions

2
go.mod
View File

@ -15,7 +15,7 @@ require (
github.com/prometheus/common v0.15.0 // indirect
github.com/stretchr/testify v1.6.1
go.chromium.org/luci v0.0.0-20201121231857-b9ab316d7198 // indirect
go.skia.org/infra v0.0.0-20211220211329-847b896d0605
go.skia.org/infra v0.0.0-20211221155757-2dce552b7a06
golang.org/x/crypto v0.0.0-20210220033148-5ea612d1eb83
golang.org/x/oauth2 v0.0.0-20201109201403-9fd604954f58
google.golang.org/api v0.35.0

468
go.sum

File diff suppressed because it is too large Load Diff

View File

@ -11,7 +11,7 @@ g3_canary_infra_failures
------------------------
Happens when the skia_try_service in G3 returns an exception.
Check the skia_try_service's error logs in [go/skia-borg-jobs](go/skia-borg-jobs).
Check the skia_try_service's error logs in [go/skia-borg-jobs](go/skia-borg-jobs)
For errors that do not seem to be transient, restarting the borg job has worked
in the past:

View File

@ -10,6 +10,7 @@ import (
"errors"
"flag"
"fmt"
"net/http"
"strconv"
"time"
@ -22,6 +23,7 @@ import (
"go.skia.org/infra/go/httputils"
"go.skia.org/infra/go/skerr"
"go.skia.org/infra/go/sklog"
"go.skia.org/infra/promk/go/pushgateway"
"go.skia.org/infra/task_driver/go/lib/auth_steps"
"go.skia.org/infra/task_driver/go/lib/checkout"
"go.skia.org/infra/task_driver/go/td"
@ -35,6 +37,12 @@ const (
MergeConflictErrorMsg = "G3 tryjob failed because the change is causing a merge conflict when applying it to the Skia hash in G3."
PatchingInformation = "Tip: If needed, could try patching in the CL into a local G3 client with \"g4 patch\" and then hacking on it."
// Metric constants for pushgateway.
jobName = "g3-canary"
metricName = "g3_canary_infra_failure"
metricValue_NoInfraFailure = "0"
metricValue_InfraFailure = "1"
)
type CanaryStatusType string
@ -122,7 +130,7 @@ func main() {
td.StepText(ctx, "Canary roll doc", "https://goto.google.com/autoroller-canary-bots")
// Wait for the canary roll to finish.
if err := waitForCanaryRoll(ctx, taskFileName, taskStoragePath, gcsClient); err != nil {
if err := waitForCanaryRoll(ctx, taskFileName, taskStoragePath, client, gcsClient); err != nil {
td.Fatal(ctx, skerr.Wrap(err))
}
}
@ -154,10 +162,13 @@ func triggerCanaryRoll(ctx context.Context, issue, patchset, taskFileName, taskS
return nil
}
func waitForCanaryRoll(parentCtx context.Context, taskFileName, taskStoragePath string, gcsClient gcs.GCSClient) error {
func waitForCanaryRoll(parentCtx context.Context, taskFileName, taskStoragePath string, httpClient *http.Client, gcsClient gcs.GCSClient) error {
ctx := td.StartStep(parentCtx, td.Props("Wait for canary roll"))
defer td.EndStep(ctx)
// For updating g3_canary_infra_failure metric after run completes.
pg := pushgateway.New(httpClient, jobName, pushgateway.DefaultPushgatewayURL)
// For writing to the step's log stream.
stdout := td.NewLogStream(ctx, "stdout", td.SeverityInfo)
// Let's add the roll link only once to step data.
@ -196,20 +207,24 @@ func waitForCanaryRoll(parentCtx context.Context, taskFileName, taskStoragePath
time.Sleep(30 * time.Second)
continue
case ExceptionStatus:
if task.Error == "" {
return td.FailStep(ctx, fmt.Errorf("Run failed with: %s", task.Error))
} else {
// Use a general purpose error message.
return td.FailStep(ctx, errors.New(InfraFailureErrorMsg))
if task.Error != "" {
sklog.Errorf("Run failed with: %s", task.Error)
}
pg.Push(ctx, metricName, metricValue_InfraFailure)
// Use a general purpose error message.
return td.FailStep(ctx, errors.New(InfraFailureErrorMsg))
case MissingApprovalStatus:
pg.Push(ctx, metricName, metricValue_NoInfraFailure)
return td.FailStep(ctx, errors.New(MissingApprovalErrorMsg))
case MergeConflictStatus:
pg.Push(ctx, metricName, metricValue_NoInfraFailure)
return td.FailStep(ctx, errors.New(MergeConflictErrorMsg))
case FailureStatus:
pg.Push(ctx, metricName, metricValue_NoInfraFailure)
return td.FailStep(ctx, fmt.Errorf("Run failed G3 TAP.\n%s", PatchingInformation))
case SuccessStatus:
// Run passed G3 TAP.
pg.Push(ctx, metricName, metricValue_NoInfraFailure)
return nil
}
}