Skip to content

Commit

Permalink
feat: dealing with new eigenda-client grpc errors + ErrorFailover con…
Browse files Browse the repository at this point in the history
…vention
  • Loading branch information
samlaf committed Oct 29, 2024
1 parent 7a697b4 commit 873ab93
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 36 deletions.
11 changes: 11 additions & 0 deletions flags/eigendaflags/cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ var (
ConfirmationDepthFlagName = withFlagPrefix("confirmation-depth")
EthRPCURLFlagName = withFlagPrefix("eth-rpc")
SvcManagerAddrFlagName = withFlagPrefix("svc-manager-addr")
// Flags that are proxy specific, and not used by the eigenda-client
RetriesBeforeFailoverFlagName = withFlagPrefix("retries-before-failover")
)

func withFlagPrefix(s string) string {
Expand Down Expand Up @@ -137,6 +139,15 @@ func CLIFlags(envPrefix, category string) []cli.Flag {
Category: category,
Required: true,
},
// Flags that are proxy specific, and not used by the eigenda-client
// TODO: should we move this to a more specific category, like EIGENDA_STORE?
&cli.UintFlag{
Name: RetriesBeforeFailoverFlagName,
Usage: "Number of times to retry blob dispersal before returning a 503 to the client, signifying that the client should failover to ethda.",
Value: 3,
EnvVars: []string{withEnvPrefix(envPrefix, "RETRIES_BEFORE_FAILOVER")},
Category: category,
},
}
}

Expand Down
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ toolchain go1.22.0
require (
// this is the latest commit of https://github.com/Layr-Labs/eigenda/pull/828
// TODO: merge upstream first and the use the latest commit of the main branch
github.com/Layr-Labs/eigenda v0.8.5-0.20241025144504-69ff616c1e5b
github.com/Layr-Labs/eigenda v0.8.5-0.20241029154833-b5efda80c3ca
github.com/avast/retry-go/v4 v4.6.0
github.com/consensys/gnark-crypto v0.12.1
github.com/ethereum-optimism/optimism v1.9.4-0.20240927020138-a9c7f349d10b
Expand Down Expand Up @@ -42,6 +42,7 @@ require (
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.5 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.5 // indirect
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0 // indirect
github.com/aws/aws-sdk-go-v2/service/dynamodb v1.31.0 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.2 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.7 // indirect
github.com/aws/aws-sdk-go-v2/service/kms v1.31.0 // indirect
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ github.com/BurntSushi/toml v1.4.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2
github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ=
github.com/DataDog/zstd v1.5.6-0.20230824185856-869dae002e5e h1:ZIWapoIRN1VqT8GR8jAwb1Ie9GyehWjVcGh32Y2MznE=
github.com/DataDog/zstd v1.5.6-0.20230824185856-869dae002e5e/go.mod h1:g4AWEaM3yOg3HYfnJ3YIawPnVdXJh9QME85blwSAmyw=
github.com/Layr-Labs/eigenda v0.8.5-0.20241025144504-69ff616c1e5b h1:wQQhuxeFIK069pKIPcjNWI+ak/wuxlBybt/n787YUBQ=
github.com/Layr-Labs/eigenda v0.8.5-0.20241025144504-69ff616c1e5b/go.mod h1:sqUNf9Ak+EfAX82jDxrb4QbT/g3DViWD3b7YIk36skk=
github.com/Layr-Labs/eigenda v0.8.5-0.20241029154833-b5efda80c3ca h1:b0twiz1+FpVC1JfN96smFOhWwPDxpY3D8SYw7tr7hXY=
github.com/Layr-Labs/eigenda v0.8.5-0.20241029154833-b5efda80c3ca/go.mod h1:sqUNf9Ak+EfAX82jDxrb4QbT/g3DViWD3b7YIk36skk=
github.com/Layr-Labs/eigensdk-go v0.1.7-0.20240507215523-7e4891d5099a h1:L/UsJFw9M31FD/WgXTPFB0oxbq9Cu4Urea1xWPMQS7Y=
github.com/Layr-Labs/eigensdk-go v0.1.7-0.20240507215523-7e4891d5099a/go.mod h1:OF9lmS/57MKxS0xpSpX0qHZl0SKkDRpvJIvsGvMN1y8=
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
Expand Down
22 changes: 12 additions & 10 deletions server/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@ import (
)

type Config struct {
EdaClientConfig clients.EigenDAClientConfig
VerifierConfig verify.Config
EdaClientConfig clients.EigenDAClientConfig
VerifierConfig verify.Config
RetriesBeforeFailover uint

MemstoreEnabled bool
MemstoreConfig memstore.Config
Expand All @@ -39,14 +40,15 @@ type Config struct {
func ReadConfig(ctx *cli.Context) Config {
edaClientConfig := eigendaflags.ReadConfig(ctx)
return Config{
RedisConfig: redis.ReadConfig(ctx),
S3Config: s3.ReadConfig(ctx),
EdaClientConfig: edaClientConfig,
VerifierConfig: verify.ReadConfig(ctx, edaClientConfig),
MemstoreEnabled: ctx.Bool(memstore.EnabledFlagName),
MemstoreConfig: memstore.ReadConfig(ctx),
FallbackTargets: ctx.StringSlice(flags.FallbackTargetsFlagName),
CacheTargets: ctx.StringSlice(flags.CacheTargetsFlagName),
RedisConfig: redis.ReadConfig(ctx),
S3Config: s3.ReadConfig(ctx),
EdaClientConfig: edaClientConfig,
VerifierConfig: verify.ReadConfig(ctx, edaClientConfig),
RetriesBeforeFailover: ctx.Uint(eigendaflags.RetriesBeforeFailoverFlagName),
MemstoreEnabled: ctx.Bool(memstore.EnabledFlagName),
MemstoreConfig: memstore.ReadConfig(ctx),
FallbackTargets: ctx.StringSlice(flags.FallbackTargetsFlagName),
CacheTargets: ctx.StringSlice(flags.CacheTargetsFlagName),
}
}

Expand Down
13 changes: 8 additions & 5 deletions server/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (

"github.com/Layr-Labs/eigenda-proxy/commitments"
"github.com/Layr-Labs/eigenda-proxy/store"
"github.com/Layr-Labs/eigenda/api"
)

// MetaError includes both an error and commitment metadata
Expand All @@ -27,8 +26,12 @@ func (me MetaError) Unwrap() error {
}

func is400(err error) bool {
var errAPI api.ErrorAPI
// TODO: the ErrProxyOversizedBlob
return errors.Is(err, store.ErrProxyOversizedBlob) ||
errors.As(err, &errAPI) && errAPI.ErrorFault() == api.ErrorFaultClient
// proxy requests are super simple (clients basically only pass bytes), so the only 400 possible
// is passing a blog that's too big.
//
// Any 400s returned by the disperser are due to formatting bugs in proxy code, for eg. badly
// IFFT'ing or encoding the blob, so we shouldn't return a 400 to the client.
// See https://github.com/Layr-Labs/eigenda/blob/bee55ed9207f16153c3fd8ebf73c219e68685def/api/errors.go#L22
// for the 400s returned by the disperser server (currently only INVALID_ARGUMENT).
return errors.Is(err, store.ErrProxyOversizedBlob)
}
3 changes: 1 addition & 2 deletions server/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,11 +180,10 @@ func (svr *Server) handlePostShared(w http.ResponseWriter, r *http.Request, comm
Err: fmt.Errorf("put request failed with commitment %v (commitment mode %v): %w", comm, meta.Mode, err),
Meta: meta,
}
var errAPI api.ErrorAPI
switch {
case is400(err):
http.Error(w, err.Error(), http.StatusBadRequest)
case errors.As(err, &errAPI) && errAPI.ErrorCode() == http.StatusServiceUnavailable:
case errors.Is(err, &api.ErrorFailover{}):
// this tells the caller (batcher) to failover to ethda b/c eigenda is temporarily down
http.Error(w, err.Error(), http.StatusServiceUnavailable)
default:
Expand Down
7 changes: 4 additions & 3 deletions server/load_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,10 @@ func LoadStoreManager(ctx context.Context, cfg CLIConfig, log log.Logger, m metr
verifier,
log,
&eigenda.StoreConfig{
MaxBlobSizeBytes: cfg.EigenDAConfig.MemstoreConfig.MaxBlobSizeBytes,
EthConfirmationDepth: cfg.EigenDAConfig.VerifierConfig.EthConfirmationDepth,
StatusQueryTimeout: cfg.EigenDAConfig.EdaClientConfig.StatusQueryTimeout,
MaxBlobSizeBytes: cfg.EigenDAConfig.MemstoreConfig.MaxBlobSizeBytes,
EthConfirmationDepth: cfg.EigenDAConfig.VerifierConfig.EthConfirmationDepth,
StatusQueryTimeout: cfg.EigenDAConfig.EdaClientConfig.StatusQueryTimeout,
RetriesBeforeFailover: cfg.EigenDAConfig.RetriesBeforeFailover,
},
)
}
Expand Down
42 changes: 29 additions & 13 deletions store/generated_key/eigenda/eigenda.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,19 @@ package eigenda

import (
"context"
"errors"
"fmt"
"time"

"github.com/Layr-Labs/eigenda-proxy/store"
"github.com/Layr-Labs/eigenda-proxy/verify"
"github.com/Layr-Labs/eigenda/api"
"github.com/Layr-Labs/eigenda/api/clients"
"github.com/Layr-Labs/eigenda/api/grpc/disperser"

"github.com/avast/retry-go/v4"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum/go-ethereum/rlp"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)

type StoreConfig struct {
Expand All @@ -25,6 +25,10 @@ type StoreConfig struct {

// total duration time that client waits for blob to confirm
StatusQueryTimeout time.Duration

// number of times to retry blob dispersal before returning a 503 to the client
// signifying that the client should failover to ethda
RetriesBeforeFailover uint
}

// Store does storage interactions and verifications for blobs with DA.
Expand Down Expand Up @@ -81,20 +85,32 @@ func (e Store) Put(ctx context.Context, value []byte) ([]byte, error) {
return e.client.PutBlob(ctx, value)
},
retry.RetryIf(func(err error) bool {
var errAPI api.ErrorAPI
if errors.As(err, &errAPI) {
if errAPI.ErrorFault() == api.ErrorFaultClient {
// we don't retry 400 errors because there is no point,
// we are passing invalid data
return false
}
st, isGRPCError := status.FromError(err)
if !isGRPCError {
// api.ErrorFailover is returned, so we should retry
return true
}
switch st.Code() {

Check failure on line 93 in store/generated_key/eigenda/eigenda.go

View workflow job for this annotation

GitHub Actions / Linter

missing cases in switch of type codes.Code: codes.OK, codes.Canceled, codes.Unknown, codes.DeadlineExceeded, codes.NotFound, codes.AlreadyExists, codes.PermissionDenied, codes.FailedPrecondition, codes.Aborted, codes.OutOfRange, codes.Unimplemented, codes.Internal, codes.Unavailable, codes.DataLoss, codes.Unauthenticated (exhaustive)
case codes.InvalidArgument:
// we don't retry 400 errors because there is no point,
// we are passing invalid data
return false
case codes.ResourceExhausted:
// we retry on 429s because *can* mean we are being rate limited
// we sleep 1 second... very arbitrarily, because we don't have more info.
// grpc error itself should return a backoff time,
// see https://github.com/Layr-Labs/eigenda/issues/845 for more details
time.Sleep(1 * time.Second)
return true
default:
return true
}
return true
}),
// only return the last error. If it is a 503, then the handler will convert it to an http 503
// to signify to the client (batcher) to failover to ethda b/c eigenda is temporarily down.
// only return the last error. If it is an api.ErrorFailover, then the handler will convert
// it to an http 503 to signify to the client (batcher) to failover to ethda
// b/c eigenda is temporarily down.
retry.LastErrorOnly(true),
retry.Attempts(3),
retry.Attempts(e.cfg.RetriesBeforeFailover),
)
if err != nil {
// TODO: we will want to filter for errors here and return a 503 when needed
Expand Down

0 comments on commit 873ab93

Please sign in to comment.