Skip to content

Commit

Permalink
Experiment: job worker stream is resilient to gateway crashes (#453)
Browse files Browse the repository at this point in the history
Experiment: job worker stream is resilient to gateway crashes
  • Loading branch information
npepinpe authored Dec 11, 2023
2 parents c9b90f8 + 6c1ee94 commit 95e3c9e
Show file tree
Hide file tree
Showing 6 changed files with 320 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
{
"version": "0.1.0",
"title": "Job push graceful gateway restart experiment",
"description": "Job workers with streaming enabled should be fault-tolerant. The stream should be recreated and reused to complete instances even after gateway restarts.",
"contributions": {
"performance": "high",
"reliability": "high",
"availability": "high"
},
"steady-state-hypothesis": {
"title": "Zeebe is alive",
"probes": [
{
"name": "All pods should be ready",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["verify", "readiness"],
"timeout": 900
}
},
{
"name": "Can deploy process model",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["deploy", "process"],
"timeout": 900
}
},
{
"name": "Can deploy workers",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["deploy", "worker", "--pollingDelay", "86400000"],
"timeout": 900
},
"pauses": {
"after": 5
}
},
{
"name": "Should be able to create process instances",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["verify", "instance-creation", "--awaitResult"],
"timeout": 900
}
}
]
},
"method": [
{
"name": "Restart gateways",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["restart", "gateway", "--all"],
"timeout": 900
}
}
],
"rollbacks": []
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
{
"version": "0.1.0",
"title": "Job push gateway termination experiment",
"description": "Job workers with streaming enabled should be fault-tolerant. The stream should be recreated and reused to complete instances even after all gateways crash.",
"contributions": {
"performance": "high",
"reliability": "high",
"availability": "high"
},
"steady-state-hypothesis": {
"title": "Zeebe is alive",
"probes": [
{
"name": "All pods should be ready",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["verify", "readiness"],
"timeout": 900
}
},
{
"name": "Can deploy process model",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["deploy", "process"],
"timeout": 900
}
},
{
"name": "Can deploy workers",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["deploy", "worker", "--pollingDelay", "86400000"],
"timeout": 900
},
"pauses": {
"after": 5
}
},
{
"name": "Should be able to create process instances",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["verify", "instance-creation", "--awaitResult"],
"timeout": 900
}
}
]
},
"method": [
{
"name": "Terminate gateways",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["terminate", "gateway", "--all"],
"timeout": 900
}
}
],
"rollbacks": []
}
16 changes: 16 additions & 0 deletions go-chaos/internal/chaos-experiments/camunda-cloud/manifest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,22 @@ experiments:
- path: worker-restart/experiment.json
clusterPlans:
- production-s
- path: job-push-gateway/restart.json
clusterPlans:
- production-s
minVersion: 8.4
- path: job-push-gateway/terminate.json
clusterPlans:
- production-s
minVersion: 8.4
- path: worker-resilience/gateway-restart.json
clusterPlans:
- production-s
maxVersion: 8.3
- path: worker-resilience/gateway-terminate.json
clusterPlans:
- production-s
minVersion: 8.3
# only for testing
- path: test/experiment.json
clusterPlans:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
{
"version": "0.1.0",
"title": "Gateways restart",
"description": "Job workers should be fault-tolerant, even if all gateways are restarted.",
"contributions": {
"reliability": "high",
"availability": "high"
},
"steady-state-hypothesis": {
"title": "Zeebe is alive",
"probes": [
{
"name": "All pods should be ready",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["verify", "readiness"],
"timeout": 900
}
},
{
"name": "Can deploy process model",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["deploy", "process"],
"timeout": 900
}
},
{
"name": "Can deploy workers",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["deploy", "worker"],
"timeout": 900
},
"pauses": {
"after": 5
}
},
{
"name": "Should be able to create process instances",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["verify", "instance-creation", "--awaitResult"],
"timeout": 900
}
}
]
},
"method": [
{
"name": "Restart gateways",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["restart", "gateway", "--all"],
"timeout": 900
}
}
],
"rollbacks": []
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
{
"version": "0.1.0",
"title": "Gateways termination",
"description": "Job workers should be fault-tolerant, even if all gateways crash.",
"contributions": {
"reliability": "high",
"availability": "high"
},
"steady-state-hypothesis": {
"title": "Zeebe is alive",
"probes": [
{
"name": "All pods should be ready",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["verify", "readiness"],
"timeout": 900
}
},
{
"name": "Can deploy process model",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["deploy", "process"],
"timeout": 900
}
},
{
"name": "Can deploy workers",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["deploy", "worker"],
"timeout": 900
},
"pauses": {
"after": 5
}
},
{
"name": "Should be able to create process instances",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["verify", "instance-creation", "--awaitResult"],
"timeout": 900
}
}
]
},
"method": [
{
"name": "Terminate gateways",
"type": "probe",
"tolerance": 0,
"provider": {
"type": "process",
"path": "zbchaos",
"arguments": ["terminate", "gateway", "--all"],
"timeout": 900
}
}
],
"rollbacks": []
}
3 changes: 2 additions & 1 deletion go-chaos/worker/chaos_worker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,8 @@ func Test_ShouldSendExperimentsForClusterPlan(t *testing.T) {
// then
assert.True(t, fakeJobClient.Succeeded)
assert.Equal(t, 123, fakeJobClient.Key)
experiments, err := chaos_experiments.ReadExperimentsForClusterPlan("Production - S", "8.4.0-SNAPSHOT")
// As this test does not provide a version, version-bounded experiments should be omitted.
experiments, err := chaos_experiments.ReadExperimentsForClusterPlan("Production - S", "")
require.NoError(t, err)
assert.Equal(t, experiments, fakeJobClient.Variables)
}
Expand Down

0 comments on commit 95e3c9e

Please sign in to comment.