Add new g3_canary_infra_failure metric
Uses Prometheus's Pushgateway because the metric is reported from a task driver (a short-lived, service-level job), which may exit before Prometheus could scrape it directly.

Updated go.mod/go.sum to get the pushgateway libraries with:
  $ go get go.skia.org/infra@2dce552b7a
  $ go mod download
  $ make -C infra/bots train

Bug: skia:12521
Change-Id: Ic63487b08d06163ef92556999c4920e5f178e285
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/487227
Reviewed-by: Joe Gregorio <jcgregorio@google.com>
Commit-Queue: Ravi Mistry <rmistry@google.com>
This commit is contained in:
parent
4c4cf43db4
commit
bd7cb72770
2
go.mod
2
go.mod
@ -15,7 +15,7 @@ require (
|
||||
github.com/prometheus/common v0.15.0 // indirect
|
||||
github.com/stretchr/testify v1.6.1
|
||||
go.chromium.org/luci v0.0.0-20201121231857-b9ab316d7198 // indirect
|
||||
go.skia.org/infra v0.0.0-20211220211329-847b896d0605
|
||||
go.skia.org/infra v0.0.0-20211221155757-2dce552b7a06
|
||||
golang.org/x/crypto v0.0.0-20210220033148-5ea612d1eb83
|
||||
golang.org/x/oauth2 v0.0.0-20201109201403-9fd604954f58
|
||||
google.golang.org/api v0.35.0
|
||||
|
@ -11,7 +11,7 @@ g3_canary_infra_failures
|
||||
------------------------
|
||||
|
||||
Happens when the skia_try_service in G3 returns an exception.
|
||||
Check the skia_try_service's error logs in [go/skia-borg-jobs](go/skia-borg-jobs).
|
||||
Check the skia_try_service's error logs in [go/skia-borg-jobs](go/skia-borg-jobs)
|
||||
|
||||
For errors that do not seem to be transient, restarting the borg job has worked
|
||||
in the past:
|
||||
|
@ -10,6 +10,7 @@ import (
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
@ -22,6 +23,7 @@ import (
|
||||
"go.skia.org/infra/go/httputils"
|
||||
"go.skia.org/infra/go/skerr"
|
||||
"go.skia.org/infra/go/sklog"
|
||||
"go.skia.org/infra/promk/go/pushgateway"
|
||||
"go.skia.org/infra/task_driver/go/lib/auth_steps"
|
||||
"go.skia.org/infra/task_driver/go/lib/checkout"
|
||||
"go.skia.org/infra/task_driver/go/td"
|
||||
@ -35,6 +37,12 @@ const (
|
||||
MergeConflictErrorMsg = "G3 tryjob failed because the change is causing a merge conflict when applying it to the Skia hash in G3."
|
||||
|
||||
PatchingInformation = "Tip: If needed, could try patching in the CL into a local G3 client with \"g4 patch\" and then hacking on it."
|
||||
|
||||
// Metric constants for pushgateway.
|
||||
jobName = "g3-canary"
|
||||
metricName = "g3_canary_infra_failure"
|
||||
metricValue_NoInfraFailure = "0"
|
||||
metricValue_InfraFailure = "1"
|
||||
)
|
||||
|
||||
type CanaryStatusType string
|
||||
@ -122,7 +130,7 @@ func main() {
|
||||
td.StepText(ctx, "Canary roll doc", "https://goto.google.com/autoroller-canary-bots")
|
||||
|
||||
// Wait for the canary roll to finish.
|
||||
if err := waitForCanaryRoll(ctx, taskFileName, taskStoragePath, gcsClient); err != nil {
|
||||
if err := waitForCanaryRoll(ctx, taskFileName, taskStoragePath, client, gcsClient); err != nil {
|
||||
td.Fatal(ctx, skerr.Wrap(err))
|
||||
}
|
||||
}
|
||||
@ -154,10 +162,13 @@ func triggerCanaryRoll(ctx context.Context, issue, patchset, taskFileName, taskS
|
||||
return nil
|
||||
}
|
||||
|
||||
func waitForCanaryRoll(parentCtx context.Context, taskFileName, taskStoragePath string, gcsClient gcs.GCSClient) error {
|
||||
func waitForCanaryRoll(parentCtx context.Context, taskFileName, taskStoragePath string, httpClient *http.Client, gcsClient gcs.GCSClient) error {
|
||||
ctx := td.StartStep(parentCtx, td.Props("Wait for canary roll"))
|
||||
defer td.EndStep(ctx)
|
||||
|
||||
// For updating g3_canary_infra_failure metric after run completes.
|
||||
pg := pushgateway.New(httpClient, jobName, pushgateway.DefaultPushgatewayURL)
|
||||
|
||||
// For writing to the step's log stream.
|
||||
stdout := td.NewLogStream(ctx, "stdout", td.SeverityInfo)
|
||||
// Let's add the roll link only once to step data.
|
||||
@ -196,20 +207,24 @@ func waitForCanaryRoll(parentCtx context.Context, taskFileName, taskStoragePath
|
||||
time.Sleep(30 * time.Second)
|
||||
continue
|
||||
case ExceptionStatus:
|
||||
if task.Error == "" {
|
||||
return td.FailStep(ctx, fmt.Errorf("Run failed with: %s", task.Error))
|
||||
} else {
|
||||
// Use a general purpose error message.
|
||||
return td.FailStep(ctx, errors.New(InfraFailureErrorMsg))
|
||||
if task.Error != "" {
|
||||
sklog.Errorf("Run failed with: %s", task.Error)
|
||||
}
|
||||
pg.Push(ctx, metricName, metricValue_InfraFailure)
|
||||
// Use a general purpose error message.
|
||||
return td.FailStep(ctx, errors.New(InfraFailureErrorMsg))
|
||||
case MissingApprovalStatus:
|
||||
pg.Push(ctx, metricName, metricValue_NoInfraFailure)
|
||||
return td.FailStep(ctx, errors.New(MissingApprovalErrorMsg))
|
||||
case MergeConflictStatus:
|
||||
pg.Push(ctx, metricName, metricValue_NoInfraFailure)
|
||||
return td.FailStep(ctx, errors.New(MergeConflictErrorMsg))
|
||||
case FailureStatus:
|
||||
pg.Push(ctx, metricName, metricValue_NoInfraFailure)
|
||||
return td.FailStep(ctx, fmt.Errorf("Run failed G3 TAP.\n%s", PatchingInformation))
|
||||
case SuccessStatus:
|
||||
// Run passed G3 TAP.
|
||||
pg.Push(ctx, metricName, metricValue_NoInfraFailure)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user