From 873ab93cc96e59b89e203928490f310586baec95 Mon Sep 17 00:00:00 2001 From: Samuel Laferriere Date: Tue, 29 Oct 2024 17:51:31 +0000 Subject: [PATCH] feat: dealing with new eigenda-client grpc errors + ErrorFailover convention --- flags/eigendaflags/cli.go | 11 +++++++ go.mod | 3 +- go.sum | 4 +-- server/config.go | 22 ++++++++------ server/errors.go | 13 +++++--- server/handlers.go | 3 +- server/load_store.go | 7 +++-- store/generated_key/eigenda/eigenda.go | 42 ++++++++++++++++++-------- 8 files changed, 69 insertions(+), 36 deletions(-) diff --git a/flags/eigendaflags/cli.go b/flags/eigendaflags/cli.go index 9a13c772..1130329d 100644 --- a/flags/eigendaflags/cli.go +++ b/flags/eigendaflags/cli.go @@ -27,6 +27,8 @@ var ( ConfirmationDepthFlagName = withFlagPrefix("confirmation-depth") EthRPCURLFlagName = withFlagPrefix("eth-rpc") SvcManagerAddrFlagName = withFlagPrefix("svc-manager-addr") + // Flags that are proxy specific, and not used by the eigenda-client + RetriesBeforeFailoverFlagName = withFlagPrefix("retries-before-failover") ) func withFlagPrefix(s string) string { @@ -137,6 +139,15 @@ func CLIFlags(envPrefix, category string) []cli.Flag { Category: category, Required: true, }, + // Flags that are proxy specific, and not used by the eigenda-client + // TODO: should we move this to a more specific category, like EIGENDA_STORE? + &cli.UintFlag{ + Name: RetriesBeforeFailoverFlagName, + Usage: "Number of times to retry blob dispersal before returning a 503 to the client, signifying that the client should failover to ethda.", + Value: 3, + EnvVars: []string{withEnvPrefix(envPrefix, "RETRIES_BEFORE_FAILOVER")}, + Category: category, + }, } } diff --git a/go.mod b/go.mod index 1c4c94b9..d2545bd4 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ toolchain go1.22.0 require ( // this is the latest commit of https://github.com/Layr-Labs/eigenda/pull/828 // TODO: merge upstream first and the use the latest commit of the main branch - github.com/Layr-Labs/eigenda v0.8.5-0.20241025144504-69ff616c1e5b + github.com/Layr-Labs/eigenda v0.8.5-0.20241029154833-b5efda80c3ca github.com/avast/retry-go/v4 v4.6.0 github.com/consensys/gnark-crypto v0.12.1 github.com/ethereum-optimism/optimism v1.9.4-0.20240927020138-a9c7f349d10b @@ -42,6 +42,7 @@ require ( github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.5 // indirect github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.5 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0 // indirect + github.com/aws/aws-sdk-go-v2/service/dynamodb v1.31.0 // indirect github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.2 // indirect github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.7 // indirect github.com/aws/aws-sdk-go-v2/service/kms v1.31.0 // indirect diff --git a/go.sum b/go.sum index 045cbf6f..356799bb 100644 --- a/go.sum +++ b/go.sum @@ -22,8 +22,8 @@ github.com/BurntSushi/toml v1.4.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2 github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ= github.com/DataDog/zstd v1.5.6-0.20230824185856-869dae002e5e h1:ZIWapoIRN1VqT8GR8jAwb1Ie9GyehWjVcGh32Y2MznE= github.com/DataDog/zstd v1.5.6-0.20230824185856-869dae002e5e/go.mod h1:g4AWEaM3yOg3HYfnJ3YIawPnVdXJh9QME85blwSAmyw= -github.com/Layr-Labs/eigenda v0.8.5-0.20241025144504-69ff616c1e5b h1:wQQhuxeFIK069pKIPcjNWI+ak/wuxlBybt/n787YUBQ= -github.com/Layr-Labs/eigenda v0.8.5-0.20241025144504-69ff616c1e5b/go.mod h1:sqUNf9Ak+EfAX82jDxrb4QbT/g3DViWD3b7YIk36skk= +github.com/Layr-Labs/eigenda v0.8.5-0.20241029154833-b5efda80c3ca h1:b0twiz1+FpVC1JfN96smFOhWwPDxpY3D8SYw7tr7hXY= +github.com/Layr-Labs/eigenda v0.8.5-0.20241029154833-b5efda80c3ca/go.mod h1:sqUNf9Ak+EfAX82jDxrb4QbT/g3DViWD3b7YIk36skk= github.com/Layr-Labs/eigensdk-go v0.1.7-0.20240507215523-7e4891d5099a h1:L/UsJFw9M31FD/WgXTPFB0oxbq9Cu4Urea1xWPMQS7Y= github.com/Layr-Labs/eigensdk-go v0.1.7-0.20240507215523-7e4891d5099a/go.mod h1:OF9lmS/57MKxS0xpSpX0qHZl0SKkDRpvJIvsGvMN1y8= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= diff --git a/server/config.go b/server/config.go index 3dc9d55c..826a9a00 100644 --- a/server/config.go +++ b/server/config.go @@ -19,8 +19,9 @@ import ( ) type Config struct { - EdaClientConfig clients.EigenDAClientConfig - VerifierConfig verify.Config + EdaClientConfig clients.EigenDAClientConfig + VerifierConfig verify.Config + RetriesBeforeFailover uint MemstoreEnabled bool MemstoreConfig memstore.Config @@ -39,14 +40,15 @@ type Config struct { func ReadConfig(ctx *cli.Context) Config { edaClientConfig := eigendaflags.ReadConfig(ctx) return Config{ - RedisConfig: redis.ReadConfig(ctx), - S3Config: s3.ReadConfig(ctx), - EdaClientConfig: edaClientConfig, - VerifierConfig: verify.ReadConfig(ctx, edaClientConfig), - MemstoreEnabled: ctx.Bool(memstore.EnabledFlagName), - MemstoreConfig: memstore.ReadConfig(ctx), - FallbackTargets: ctx.StringSlice(flags.FallbackTargetsFlagName), - CacheTargets: ctx.StringSlice(flags.CacheTargetsFlagName), + RedisConfig: redis.ReadConfig(ctx), + S3Config: s3.ReadConfig(ctx), + EdaClientConfig: edaClientConfig, + VerifierConfig: verify.ReadConfig(ctx, edaClientConfig), + RetriesBeforeFailover: ctx.Uint(eigendaflags.RetriesBeforeFailoverFlagName), + MemstoreEnabled: ctx.Bool(memstore.EnabledFlagName), + MemstoreConfig: memstore.ReadConfig(ctx), + FallbackTargets: ctx.StringSlice(flags.FallbackTargetsFlagName), + CacheTargets: ctx.StringSlice(flags.CacheTargetsFlagName), } } diff --git a/server/errors.go b/server/errors.go index ea6cc03d..49351125 100644 --- a/server/errors.go +++ b/server/errors.go @@ -6,7 +6,6 @@ import ( "github.com/Layr-Labs/eigenda-proxy/commitments" "github.com/Layr-Labs/eigenda-proxy/store" - "github.com/Layr-Labs/eigenda/api" ) // MetaError includes both an error and commitment metadata @@ -27,8 +26,12 @@ func (me MetaError) Unwrap() error { } func is400(err error) bool { - var errAPI api.ErrorAPI - // TODO: the ErrProxyOversizedBlob - return errors.Is(err, store.ErrProxyOversizedBlob) || - errors.As(err, &errAPI) && errAPI.ErrorFault() == api.ErrorFaultClient + // proxy requests are super simple (clients basically only pass bytes), so the only 400 possible + // is passing a blog that's too big. + // + // Any 400s returned by the disperser are due to formatting bugs in proxy code, for eg. badly + // IFFT'ing or encoding the blob, so we shouldn't return a 400 to the client. + // See https://github.com/Layr-Labs/eigenda/blob/bee55ed9207f16153c3fd8ebf73c219e68685def/api/errors.go#L22 + // for the 400s returned by the disperser server (currently only INVALID_ARGUMENT). + return errors.Is(err, store.ErrProxyOversizedBlob) } diff --git a/server/handlers.go b/server/handlers.go index 89bcad20..32161d72 100644 --- a/server/handlers.go +++ b/server/handlers.go @@ -180,11 +180,10 @@ func (svr *Server) handlePostShared(w http.ResponseWriter, r *http.Request, comm Err: fmt.Errorf("put request failed with commitment %v (commitment mode %v): %w", comm, meta.Mode, err), Meta: meta, } - var errAPI api.ErrorAPI switch { case is400(err): http.Error(w, err.Error(), http.StatusBadRequest) - case errors.As(err, &errAPI) && errAPI.ErrorCode() == http.StatusServiceUnavailable: + case errors.Is(err, &api.ErrorFailover{}): // this tells the caller (batcher) to failover to ethda b/c eigenda is temporarily down http.Error(w, err.Error(), http.StatusServiceUnavailable) default: diff --git a/server/load_store.go b/server/load_store.go index be298c87..121d4d99 100644 --- a/server/load_store.go +++ b/server/load_store.go @@ -108,9 +108,10 @@ func LoadStoreManager(ctx context.Context, cfg CLIConfig, log log.Logger, m metr verifier, log, &eigenda.StoreConfig{ - MaxBlobSizeBytes: cfg.EigenDAConfig.MemstoreConfig.MaxBlobSizeBytes, - EthConfirmationDepth: cfg.EigenDAConfig.VerifierConfig.EthConfirmationDepth, - StatusQueryTimeout: cfg.EigenDAConfig.EdaClientConfig.StatusQueryTimeout, + MaxBlobSizeBytes: cfg.EigenDAConfig.MemstoreConfig.MaxBlobSizeBytes, + EthConfirmationDepth: cfg.EigenDAConfig.VerifierConfig.EthConfirmationDepth, + StatusQueryTimeout: cfg.EigenDAConfig.EdaClientConfig.StatusQueryTimeout, + RetriesBeforeFailover: cfg.EigenDAConfig.RetriesBeforeFailover, }, ) } diff --git a/store/generated_key/eigenda/eigenda.go b/store/generated_key/eigenda/eigenda.go index dc664e28..2f2401cd 100644 --- a/store/generated_key/eigenda/eigenda.go +++ b/store/generated_key/eigenda/eigenda.go @@ -2,19 +2,19 @@ package eigenda import ( "context" - "errors" "fmt" "time" "github.com/Layr-Labs/eigenda-proxy/store" "github.com/Layr-Labs/eigenda-proxy/verify" - "github.com/Layr-Labs/eigenda/api" "github.com/Layr-Labs/eigenda/api/clients" "github.com/Layr-Labs/eigenda/api/grpc/disperser" "github.com/avast/retry-go/v4" "github.com/ethereum/go-ethereum/log" "github.com/ethereum/go-ethereum/rlp" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" ) type StoreConfig struct { @@ -25,6 +25,10 @@ type StoreConfig struct { // total duration time that client waits for blob to confirm StatusQueryTimeout time.Duration + + // number of times to retry blob dispersal before returning a 503 to the client + // signifying that the client should failover to ethda + RetriesBeforeFailover uint } // Store does storage interactions and verifications for blobs with DA. @@ -81,20 +85,32 @@ func (e Store) Put(ctx context.Context, value []byte) ([]byte, error) { return e.client.PutBlob(ctx, value) }, retry.RetryIf(func(err error) bool { - var errAPI api.ErrorAPI - if errors.As(err, &errAPI) { - if errAPI.ErrorFault() == api.ErrorFaultClient { - // we don't retry 400 errors because there is no point, - // we are passing invalid data - return false - } + st, isGRPCError := status.FromError(err) + if !isGRPCError { + // api.ErrorFailover is returned, so we should retry + return true + } + switch st.Code() { + case codes.InvalidArgument: + // we don't retry 400 errors because there is no point, + // we are passing invalid data + return false + case codes.ResourceExhausted: + // we retry on 429s because *can* mean we are being rate limited + // we sleep 1 second... very arbitrarily, because we don't have more info. + // grpc error itself should return a backoff time, + // see https://github.com/Layr-Labs/eigenda/issues/845 for more details + time.Sleep(1 * time.Second) + return true + default: + return true } - return true }), - // only return the last error. If it is a 503, then the handler will convert it to an http 503 - // to signify to the client (batcher) to failover to ethda b/c eigenda is temporarily down. + // only return the last error. If it is an api.ErrorFailover, then the handler will convert + // it to an http 503 to signify to the client (batcher) to failover to ethda + // b/c eigenda is temporarily down. retry.LastErrorOnly(true), - retry.Attempts(3), + retry.Attempts(e.cfg.RetriesBeforeFailover), ) if err != nil { // TODO: we will want to filter for errors here and return a 503 when needed