Hi - I’ve been scale testing my Temporal workflow structure. I’m looking to scale my service, and my main concern is the Temporal limits (history size, transitions/sec, etc.). Because of this, I’ve split some of my logic into batches, and a child workflow is created for each batch. While doing small-scale tests (mostly varying the batch size, which in turn determines the number of child workflows), I’ve run into an odd issue: starting roughly 20 or more child workflows fails with an error whose message is “context timeout deadline exceeded”.
In the Temporal UI I see 0 child workflows created, but if I configure this to produce 20 or fewer child workflows, they are all created without an issue.
Here is the main logic I am using:
// Derive a cancellable context so all batch child workflows can be cancelled together.
childCtx, cancelHandler := workflow.WithCancel(ctx)
// Start the batch workflows; one future is returned per batch.
futures := t.startBatchWorkflows(childCtx, batches)
// Block until every future has resolved; cancelHandler is invoked on the first failure.
err := t.waitForBatchesWithCancellation(childCtx, futures, cancelHandler)
// startBatchWorkflows starts one TaskExecutor child workflow per batch and
// returns their futures in the same order as batches.
//
// Calling Get on a returned future blocks until the corresponding child
// workflow has finished and yields the child's error (nil on success). The
// children are started on a context derived from ctx, so cancelling ctx
// requests cancellation of every child; WaitForCancellation is set so that a
// cancelled child's future resolves only after the child has actually ended.
func (t *taskRunnerWorkflow) startBatchWorkflows(ctx workflow.Context, batches []experiment.TaskInstanceBatch) []workflow.Future {
	// The child options are identical for every batch, so derive the child
	// context once instead of on every iteration.
	batchCtx := workflow.WithChildOptions(ctx, workflow.ChildWorkflowOptions{
		WaitForCancellation: true,
	})

	futures := make([]workflow.Future, 0, len(batches))
	for _, batch := range batches {
		input := TaskExecutorWorkflowInput{
			Batch:          batch,
			ExecutableTask: t.executableTask,
			ExperimentID:   t.experimentID,
			Fabric:         t.fabric,
			TaskRun:        t.taskRun,
			Principal:      t.principal,
		}
		// ExecuteChildWorkflow does not block and its result already satisfies
		// workflow.Future, so no extra coroutine or settable future is needed
		// to hand completion back to the caller.
		futures = append(futures, workflow.ExecuteChildWorkflow(batchCtx, TaskExecutor, input))
	}
	return futures
}
// waitForBatchesWithCancellation blocks until every future in futures has
// resolved and returns the first error that was observed, or nil when no
// future failed.
//
// cancelHandler is invoked exactly once, at the moment the first failing
// future is observed; errors from futures that resolve afterwards are ignored.
func (t *taskRunnerWorkflow) waitForBatchesWithCancellation(ctx workflow.Context, futures []workflow.Future, cancelHandler workflow.CancelFunc) error {
	var firstError error

	// One callback is shared by all futures: it records only the earliest
	// failure and triggers cancellation at that point.
	onDone := func(done workflow.Future) {
		err := done.Get(ctx, nil)
		if err == nil || firstError != nil {
			return
		}
		firstError = err
		cancelHandler()
	}

	selector := workflow.NewSelector(ctx)
	for i := range futures {
		selector.AddFuture(futures[i], onDone)
	}

	// Each Select call handles one ready future, so call it once per future to
	// drain them all (whether they succeeded, failed, or were cancelled).
	for pending := len(futures); pending > 0; pending-- {
		selector.Select(ctx)
	}
	return firstError
}
Has anyone run into this issue before? Any help would be greatly appreciated.