# feat: rename DisableTraceLocality to TraceCache (#1450)
## Which problem is this PR solving?

To improve readability of the config option for our users, this PR
changes the binary trace locality feature flag to a mode structure.

## Short description of the changes

- Rename `DisableTraceLocality` to `TraceCache`, which comes with two
  modes: `concentrated` and `distributed` (a before/after sketch follows
  the change summary below)
VinozzZ authored Dec 2, 2024
1 parent b357020 commit 5ce285d
Showing 14 changed files with 137 additions and 114 deletions.
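
For anyone upgrading, the user-facing change is a single configuration field. Below is a minimal before/after sketch (not part of the commit) using the `config` package types shown in this diff; it assumes the repository's module path `github.com/honeycombio/refinery`.

```go
package main

import (
    "fmt"

    "github.com/honeycombio/refinery/config"
)

func main() {
    // Before this commit the feature was a boolean flag:
    //   cfg := config.CollectionConfig{DisableTraceLocality: true}
    // After this commit the same behavior is selected with a mode string:
    cfg := config.CollectionConfig{TraceLocalityMode: "distributed"}

    // The new helper maps the mode back to the old boolean semantics:
    // "concentrated" (the default) -> true, "distributed" -> false.
    fmt.Println(cfg.TraceLocalityEnabled()) // prints: false
}
```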
2 changes: 1 addition & 1 deletion app/app_test.go
@@ -766,7 +766,7 @@ func TestPeerRouting_TraceLocalityDisabled(t *testing.T) {
redisDB := 12 + i
cfg := defaultConfig(basePort, redisDB)
collectionCfg := cfg.GetCollectionConfig()
-    collectionCfg.DisableTraceLocality = true
+    collectionCfg.TraceLocalityMode = "distributed"
cfg.GetCollectionConfigVal = collectionCfg

apps[i], graph = newStartedApp(t, senders[i], nil, peers, cfg)
40 changes: 20 additions & 20 deletions collect/collect.go
@@ -210,7 +210,7 @@ func (i *InMemCollector) Start() error {
i.Peers.RegisterUpdatedPeersCallback(i.redistributeTimer.Reset)
}

-    if i.Config.GetCollectionConfig().DisableTraceLocality {
+    if !i.Config.GetCollectionConfig().TraceLocalityEnabled() {
i.keptDecisionMessages = make(chan string, decisionMessageBufferSize)
i.dropDecisionMessages = make(chan string, decisionMessageBufferSize)
i.PubSub.Subscribe(context.Background(), keptTraceDecisionTopic, i.signalKeptTraceDecisions)
@@ -524,7 +524,7 @@ func (i *InMemCollector) redistributeTraces(ctx context.Context) {

for _, sp := range trace.GetSpans() {

-    if i.Config.GetCollectionConfig().DisableTraceLocality {
+    if !i.Config.GetCollectionConfig().TraceLocalityEnabled() {
dc := i.createDecisionSpan(sp, trace, newTarget)
i.PeerTransmission.EnqueueEvent(dc)
continue
@@ -669,21 +669,21 @@ func (i *InMemCollector) processSpan(ctx context.Context, sp *types.Span, source
isMyTrace bool
)
// if trace locality is enabled, we should forward all spans to its correct peer
-    if i.Config.GetCollectionConfig().DisableTraceLocality {
-        targetShard, isMyTrace = i.IsMyTrace(sp.TraceID)
-        // if the span is a decision span and the trace no longer belong to us, we should not forward it to the peer
-        if !isMyTrace && sp.IsDecisionSpan() {
-            return
-        }
-
-    } else {
+    if i.Config.GetCollectionConfig().TraceLocalityEnabled() {
        targetShard = i.Sharder.WhichShard(sp.TraceID)
        isMyTrace = true
        if !targetShard.Equals(i.Sharder.MyShard()) {
            sp.APIHost = targetShard.GetAddress()
            i.PeerTransmission.EnqueueSpan(sp)
            return
        }
+    } else {
+        targetShard, isMyTrace = i.IsMyTrace(sp.TraceID)
+        // if the span is a decision span and the trace no longer belong to us, we should not forward it to the peer
+        if !isMyTrace && sp.IsDecisionSpan() {
+            return
+        }
+
    }

tcfg := i.Config.GetTracesConfig()
@@ -1069,7 +1069,7 @@ func (i *InMemCollector) Stop() error {
close(i.fromPeer)
close(i.outgoingTraces)

-    if i.Config.GetCollectionConfig().DisableTraceLocality {
+    if !i.Config.GetCollectionConfig().TraceLocalityEnabled() {
close(i.dropDecisionBuffer)
close(i.keptDecisionBuffer)
}
@@ -1527,23 +1527,23 @@ func (i *InMemCollector) makeDecision(ctx context.Context, trace *types.Trace, s
HasRoot: hasRoot,
}

-    if i.Config.GetCollectionConfig().DisableTraceLocality {
+    if !i.Config.GetCollectionConfig().TraceLocalityEnabled() {
i.publishTraceDecision(ctx, td)
}

return &td, nil
}

func (i *InMemCollector) IsMyTrace(traceID string) (sharder.Shard, bool) {
+    if i.Config.GetCollectionConfig().TraceLocalityEnabled() {
+        return i.Sharder.MyShard(), true
+    }
+
    // if trace locality is disabled, we should only process
    // traces that belong to the current refinery
-    if i.Config.GetCollectionConfig().DisableTraceLocality {
-        targeShard := i.Sharder.WhichShard(traceID)
-
-        return targeShard, i.Sharder.MyShard().Equals(targeShard)
-    }
+    targeShard := i.Sharder.WhichShard(traceID)

-    return i.Sharder.MyShard(), true
+    return targeShard, i.Sharder.MyShard().Equals(targeShard)

}

@@ -1578,7 +1578,7 @@ func (i *InMemCollector) publishTraceDecision(ctx context.Context, td TraceDecis
}

func (i *InMemCollector) sendKeptDecisions() {
-    if !i.Config.GetCollectionConfig().DisableTraceLocality {
+    if i.Config.GetCollectionConfig().TraceLocalityEnabled() {
return
}
interval := time.Duration(i.Config.GetCollectionConfig().KeptDecisionSendInterval)
@@ -1589,7 +1589,7 @@ func (i *InMemCollector) sendKeptDecisions() {
}

func (i *InMemCollector) sendDropDecisions() {
-    if !i.Config.GetCollectionConfig().DisableTraceLocality {
+    if i.Config.GetCollectionConfig().TraceLocalityEnabled() {
return
}
interval := time.Duration(i.Config.GetCollectionConfig().DropDecisionSendInterval)
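
Taken together, the collect.go changes hinge on the ownership check above: in `concentrated` mode a trace is always "mine" because spans are forwarded to their owner first, while in `distributed` mode ownership follows the shard hash of the trace ID and only the owning peer publishes kept/dropped decisions over pub/sub. A condensed, free-standing restatement of `IsMyTrace` (illustrative only; the real method hangs off `*InMemCollector`, and the `sharder.Sharder`/`sharder.Shard` interfaces are assumed to be the ones used in the diff):

```go
package collect

import (
    "github.com/honeycombio/refinery/config"
    "github.com/honeycombio/refinery/sharder"
)

// isMyTrace restates the post-commit IsMyTrace logic as a standalone
// function for illustration; it is not part of this diff.
func isMyTrace(cfg config.CollectionConfig, sh sharder.Sharder, traceID string) (sharder.Shard, bool) {
    if cfg.TraceLocalityEnabled() {
        // concentrated: spans are routed to their owner before this check
        // runs, so any trace held locally is ours by construction.
        return sh.MyShard(), true
    }
    // distributed: spans stay where they land, so ownership is decided by
    // consistent hashing on the trace ID.
    target := sh.WhichShard(traceID)
    return target, sh.MyShard().Equals(target)
}
```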
14 changes: 7 additions & 7 deletions collect/collect_test.go
@@ -98,7 +98,7 @@ func newTestCollector(conf config.Config, transmission transmit.Transmission, pe
redistributeTimer: redistributeNotifier,
}

-    if conf.GetCollectionConfig().DisableTraceLocality {
+    if !conf.GetCollectionConfig().TraceLocalityEnabled() {
localPubSub.Subscribe(context.Background(), keptTraceDecisionTopic, c.signalKeptTraceDecisions)
}

@@ -196,7 +196,7 @@ func TestAddRootSpan(t *testing.T) {

conf.Mux.Lock()
collectionCfg := conf.GetCollectionConfigVal
-    collectionCfg.DisableTraceLocality = true
+    collectionCfg.TraceLocalityMode = "distributed"
conf.GetCollectionConfigVal = collectionCfg
conf.Mux.Unlock()

@@ -527,8 +527,8 @@ func TestDryRunMode(t *testing.T) {
DryRun: true,
ParentIdFieldNames: []string{"trace.parent_id", "parentId"},
GetCollectionConfigVal: config.CollectionConfig{
-        ShutdownDelay:        config.Duration(1 * time.Millisecond),
-        DisableTraceLocality: true,
+        ShutdownDelay:     config.Duration(1 * time.Millisecond),
+        TraceLocalityMode: "distributed",
},
}
transmission := &transmit.MockTransmission{}
@@ -2233,8 +2233,8 @@ func TestSendDropDecisions(t *testing.T) {
GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1},
ParentIdFieldNames: []string{"trace.parent_id", "parentId"},
GetCollectionConfigVal: config.CollectionConfig{
-        ShutdownDelay:        config.Duration(1 * time.Millisecond),
-        DisableTraceLocality: true,
+        ShutdownDelay:     config.Duration(1 * time.Millisecond),
+        TraceLocalityMode: "distributed",
},
}
transmission := &transmit.MockTransmission{}
@@ -2318,7 +2318,7 @@ func TestExpiredTracesCleanup(t *testing.T) {
MaxBatchSize: 1500,
},
GetCollectionConfigVal: config.CollectionConfig{
-        DisableTraceLocality: true,
+        TraceLocalityMode: "distributed",
},
GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1},
AddSpanCountToRoot: true,
2 changes: 1 addition & 1 deletion collect/stressRelief.go
@@ -428,7 +428,7 @@ func (s *StressRelief) Recalc() uint {
// If a single node is under significant stress, it can activate stress relief mode
overallStressLevel := uint(math.Max(float64(clusterStressLevel), float64(localLevel)))

-    if s.Config.GetCollectionConfig().DisableTraceLocality {
+    if !s.Config.GetCollectionConfig().TraceLocalityEnabled() {
overallStressLevel = clusterStressLevel
}
s.overallStressLevel = overallStressLevel
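
The stress-relief change is small but easy to miss: in `concentrated` mode a single hot node can activate stress relief on its own, while in `distributed` mode activation keys off the cluster-wide level alone, presumably so that peers sharing trace decisions activate consistently. A sketch of the selection logic (not a verbatim excerpt; `clusterStressLevel` and `localLevel` are computed earlier in `Recalc`):

```go
package collect

import "math"

// overallStress mirrors the post-commit selection in (*StressRelief).Recalc;
// the standalone function shape is illustrative only.
func overallStress(traceLocalityEnabled bool, clusterStressLevel, localLevel uint) uint {
    // Concentrated mode: either a stressed cluster or a single stressed
    // node can trigger stress relief.
    overall := uint(math.Max(float64(clusterStressLevel), float64(localLevel)))
    if !traceLocalityEnabled {
        // Distributed mode: only the cluster-wide level counts.
        overall = clusterStressLevel
    }
    return overall
}
```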
2 changes: 1 addition & 1 deletion collect/stress_relief_test.go
@@ -158,7 +158,7 @@ func TestStressRelief_OverallStressLevel_DisableTraceLocality(t *testing.T) {
MinimumActivationDuration: config.Duration(5 * time.Second),
},
GetCollectionConfigVal: config.CollectionConfig{
-        DisableTraceLocality: true,
+        TraceLocalityMode: "distributed",
},
}
// On startup, the stress relief should not be active
22 changes: 12 additions & 10 deletions config.md
@@ -1,7 +1,7 @@
# Honeycomb Refinery Configuration Documentation

This is the documentation for the configuration file for Honeycomb's Refinery.
-It was automatically generated on 2024-11-26 at 18:43:37 UTC.
+It was automatically generated on 2024-12-02 at 16:52:55 UTC.

## The Config file

@@ -402,7 +402,7 @@ This is useful for evaluating sampling rules.
When DryRun is enabled, traces is decorated with `meta.refinery.dryrun.kept` that is set to `true` or `false`, based on whether the trace would be kept or dropped.
In addition, `SampleRate` will be set to the incoming rate for all traces, and the field `meta.refinery.dryrun.sample_rate` will be set to the sample rate that would have been used.
-NOTE: This setting is not compatible with `DisableTraceLocality=true`, because drop trace decisions shared among peers do not contain all the relevant information needed to send traces to Honeycomb.
+NOTE: This setting is not compatible with `TraceCache=distributed`, because drop trace decisions shared among peers do not contain all the relevant information needed to send traces to Honeycomb.

- Eligible for live reload.
- Type: `bool`
@@ -1021,21 +1021,23 @@ This value should be set to a bit less than the normal timeout period for shutti
- Type: `duration`
- Default: `15s`

-### `DisableTraceLocality`
+### `TraceLocalityMode`

-DisableTraceLocality controls whether all spans that belongs to the same trace are sent to a single Refinery for processing.
+TraceLocalityMode controls how Refinery handles spans that belong to the same trace in a clustered environment.

-When `false`, Refinery will route all spans that belong to the same trace to a single peer.
+When `concentrated`, Refinery will route all spans that belong to the same trace to a single peer.
This is the default behavior ("Trace Locality") and the way Refinery has worked in the past.
-When `true`, Refinery will instead keep spans on the node where they were received, and forward proxy spans that contain only the key information needed to make a trace decision.
-This can reduce the amount of traffic between peers in most cases, and can help avoid a situation where a single large trace can cause a memory overrun on a single node.
-If `true`, the amount of traffic between peers will be reduced, but the amount of traffic between Refinery and Redis will significantly increase, because Refinery uses Redis to distribute the trace decisions to all nodes in the cluster.
+When `distributed`, Refinery will instead keep spans on the node where they were received, and forward proxy spans that contain only the key information needed to make a trace decision.
+This can reduce the amount of traffic between peers, and can help avoid a situation where a single large trace can cause a memory overrun on a single node.
+If `distributed`, the amount of traffic between peers will be reduced, but the amount of traffic between Refinery and Redis will significantly increase, because Refinery uses Redis to distribute the trace decisions to all nodes in the cluster.
It is important to adjust the size of the Redis cluster in this case.
-NOTE: This setting is not compatible with `DryRun` when set to true.
+The total volume of network traffic in `distributed` mode should be expected to decrease unless the cluster size is very large (hundreds of nodes).
+NOTE: This setting is not compatible with `DryRun` when set to `distributed`.
See `DryRun` for more information.

- Not eligible for live reload.
-- Type: `bool`
+- Type: `string`
+- Default: `concentrated`

### `HealthCheckTimeout`

17 changes: 15 additions & 2 deletions config/file_config.go
@@ -317,8 +317,8 @@ type CollectionConfig struct {
DisableRedistribution bool `yaml:"DisableRedistribution"`
RedistributionDelay Duration `yaml:"RedistributionDelay" default:"30s"`

-    ShutdownDelay        Duration `yaml:"ShutdownDelay" default:"15s"`
-    DisableTraceLocality bool     `yaml:"DisableTraceLocality"`
+    ShutdownDelay     Duration `yaml:"ShutdownDelay" default:"15s"`
+    TraceLocalityMode string   `yaml:"TraceLocalityMode" default:"concentrated"`

MaxDropDecisionBatchSize int `yaml:"MaxDropDecisionBatchSize" default:"1000"`
DropDecisionSendInterval Duration `yaml:"DropDecisionSendInterval" default:"1s"`
@@ -355,6 +355,19 @@ func (c CollectionConfig) GetIncomingQueueSize() int {
return c.IncomingQueueSize
}

+// TraceLocalityEnabled returns whether trace locality is enabled.
+func (c CollectionConfig) TraceLocalityEnabled() bool {
+    switch c.TraceLocalityMode {
+    case "concentrated":
+        return true
+    case "distributed":
+        return false
+    default:
+        // Default to true for backwards compatibility
+        return true
+    }
+}
+
type BufferSizeConfig struct {
UpstreamBufferSize int `yaml:"UpstreamBufferSize" default:"10_000"`
PeerBufferSize int `yaml:"PeerBufferSize" default:"100_000"`
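
Because the `default` branch of the new helper maps any unrecognized mode back to `true`, the backwards-compatibility contract is worth pinning down. A hypothetical table test (not part of this commit) against the switch shown above:

```go
package config

import "testing"

func TestTraceLocalityEnabled(t *testing.T) {
    cases := map[string]bool{
        "concentrated": true,  // explicit default mode
        "distributed":  false, // decisions shared via pub/sub
        "":             true,  // unset field: backwards-compatible default
        "typo-mode":    true,  // unknown values also fall back to true
    }
    for mode, want := range cases {
        c := CollectionConfig{TraceLocalityMode: mode}
        if got := c.TraceLocalityEnabled(); got != want {
            t.Errorf("TraceLocalityMode=%q: TraceLocalityEnabled() = %v, want %v", mode, got, want)
        }
    }
}
```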
24 changes: 12 additions & 12 deletions config/metadata/configMeta.yaml
@@ -489,7 +489,7 @@ groups:
`meta.refinery.dryrun.sample_rate` will be set to the sample rate
that would have been used.
-      NOTE: This setting is not compatible with `DisableTraceLocality=true`,
+      NOTE: This setting is not compatible with `TraceCache=distributed`,
because drop trace decisions shared among peers do not contain all
the relevant information needed to send traces to Honeycomb.
@@ -1340,26 +1340,26 @@ groups:
This value should be set to a bit less than the normal timeout period
for shutting down without forcibly terminating the process.
-  - name: DisableTraceLocality
-    type: bool
+  - name: TraceLocalityMode
+    type: string
valuetype: nondefault
firstversion: v2.9
-    default: false
+    default: "concentrated"
reload: false
-    summary: controls whether all spans that belongs to the same trace are sent to a single Refinery for processing.
+    summary: controls how Refinery handles spans that belong to the same trace in a clustered environment.
description: >
-      When `false`, Refinery will route all spans that belong to the same trace to a single peer. This is the
-      default behavior ("Trace Locality") and the way Refinery has worked in the past. When `true`, Refinery
+      When `concentrated`, Refinery will route all spans that belong to the same trace to a single peer. This is the
+      default behavior ("Trace Locality") and the way Refinery has worked in the past. When `distributed`, Refinery
       will instead keep spans on the node where they were received, and forward proxy spans that contain only
-      the key information needed to make a trace decision. This can reduce the amount of traffic between peers
-      in most cases, and can help avoid a situation where a single large trace can cause a memory overrun on
-      a single node.
+      the key information needed to make a trace decision. This can reduce the amount of traffic between peers,
+      and can help avoid a situation where a single large trace can cause a memory overrun on a single node.
-      If `true`, the amount of traffic between peers will be reduced, but the amount of traffic between Refinery
+      If `distributed`, the amount of traffic between peers will be reduced, but the amount of traffic between Refinery
       and Redis will significantly increase, because Refinery uses Redis to distribute the trace decisions to all
       nodes in the cluster. It is important to adjust the size of the Redis cluster in this case.
-      NOTE: This setting is not compatible with `DryRun` when set to true. See `DryRun` for more information.
+      The total volume of network traffic in `distributed` mode should be expected to decrease unless the cluster size is very large (hundreds of nodes).
+      NOTE: This setting is not compatible with `DryRun` when set to `distributed`. See `DryRun` for more information.
- name: HealthCheckTimeout
type: duration
49 changes: 26 additions & 23 deletions config_complete.yaml
@@ -2,7 +2,7 @@
## Honeycomb Refinery Configuration ##
######################################
#
-# created on 2024-11-22 at 17:59:57 UTC from ../../config.yaml using a template generated on 2024-11-22 at 17:59:50 UTC
+# created on 2024-12-02 at 16:40:51 UTC from ../../config.yaml using a template generated on 2024-12-02 at 16:40:45 UTC

# This file contains a configuration for the Honeycomb Refinery. It is in YAML
# format, organized into named groups, each of which contains a set of
@@ -427,7 +427,7 @@ Debugging:
## to the incoming rate for all traces, and the field
## `meta.refinery.dryrun.sample_rate` will be set to the sample rate that
## would have been used.
-  ## NOTE: This setting is not compatible with `DisableTraceLocality=true`,
+  ## NOTE: This setting is not compatible with `TraceCache=distributed`,
## because drop trace decisions shared among peers do not contain all the
## relevant information needed to send traces to Honeycomb.
##
@@ -1083,27 +1083,30 @@ Collection:
## Eligible for live reload.
# ShutdownDelay: 15s

-  ## DisableTraceLocality controls whether all spans that belongs to the
-  ## same trace are sent to a single Refinery for processing.
-  ##
-  ## When `false`, Refinery will route all spans that belong to the same
-  ## trace to a single peer. This is the default behavior ("Trace
-  ## Locality") and the way Refinery has worked in the past. When `true`,
-  ## Refinery will instead keep spans on the node where they were received,
-  ## and forward proxy spans that contain only the key information needed
-  ## to make a trace decision. This can reduce the amount of traffic
-  ## between peers in most cases, and can help avoid a situation where a
-  ## single large trace can cause a memory overrun on a single node.
-  ## If `true`, the amount of traffic between peers will be reduced, but
-  ## the amount of traffic between Refinery and Redis will significantly
-  ## increase, because Refinery uses Redis to distribute the trace
-  ## decisions to all nodes in the cluster. It is important to adjust the
-  ## size of the Redis cluster in this case.
-  ## NOTE: This setting is not compatible with `DryRun` when set to true.
-  ## See `DryRun` for more information.
-  ##
-  ## Not eligible for live reload.
-  # DisableTraceLocality: false
+  ## TraceLocalityMode controls how Refinery handles spans that belongs to
+  ## the same trace in a clustered environment.
+  ##
+  ## When `concentrated`, Refinery will route all spans that belong to the
+  ## same trace to a single peer. This is the default behavior ("Trace
+  ## Locality") and the way Refinery has worked in the past. When
+  ## `distributed`, Refinery will instead keep spans on the node where they
+  ## were received, and forward proxy spans that contain only the key
+  ## information needed to make a trace decision. This can reduce the
+  ## amount of traffic between peers, and can help avoid a situation where
+  ## a single large trace can cause a memory overrun on a single node.
+  ## If `distributed`, the amount of traffic between peers will be reduced,
+  ## but the amount of traffic between Refinery and Redis will
+  ## significantly increase, because Refinery uses Redis to distribute the
+  ## trace decisions to all nodes in the cluster. It is important to adjust
+  ## the size of the Redis cluster in this case.
+  ## The total volume of network traffic in `distributed` mode should be
+  ## expected to decrease unless the cluster size is very large (hundreds
+  ## of nodes). NOTE: This setting is not compatible with `DryRun` when set
+  ## to `distributed`. See `DryRun` for more information.
+  ##
+  ## default: concentrated
+  ## Not eligible for live reload.
+  # TraceLocalityMode: concentrated

## HealthCheckTimeout controls the maximum duration allowed for
## collection health checks to complete.