I have a workflow with these steps:
- Perform some workflow argument input validation, throw an exception if invalid
- Execute an activity that performs some work
- Based on the status code of the activity result:
- If a feature flag is disabled, wait 5min and continue-as-new
- If the workflow should not continue-as-new, return with a workflow result
- Otherwise, sleep until a specific time based on the activity’s result, and then continue-as-new
I’ve witnessed a few times (likely <1% of workflow runs) on different days where the sleep in step #4 has its timer fired twice. Both TimerFired
events get recorded in the workflow history, causing replays to fail. The workflow doesn’t have a workflow run or workflow execution timeout, so the affected workflows were stuck in a Running state after a WorkflowTaskFailed
event until they were manually reset via the web UI.
The Java code for the sleep looks like this:
Duration timeToSleep = Duration.between(Instant.ofEpochMilli(Workflow.currentTimeMillis()), activityResult.getNextRun());
if (!timeToSleep.isNegative() && !timeToSleep.isZero()) {
Workflow.sleep(timeToSleep.toMillis());
}
Question: is the sleep time calculated in a problematic way? Is there a better way to sleep until a specific date & time?
Question: is the double timer fire an expected edge case with Temporal? How can we protect against this in the future?
This was with:
- Server v1.191
- Java SDK v1.18.2
Here is the full event history of an affected workflow:
[
{
"eventId": "1",
"eventTime": "2023-05-22T07:07:46.530702820Z",
"eventType": "WorkflowExecutionStarted",
"version": "1",
"taskId": "5114599506",
"workerMayIgnore": false,
"workflowExecutionStartedEventAttributes": {
"workflowType": {
"name": "ProductsSyncWorkflow"
},
"parentWorkflowNamespace": "",
"parentWorkflowNamespaceId": "",
"parentWorkflowExecution": null,
"parentInitiatedEventId": "0",
"taskQueue": {
"name": "products-sync-task-queue",
"kind": "Normal"
},
"input": {
"payloads": [
{
"metadata": {
"class-type": "...",
"encoding": "YmluYXJ5L2VuY3J5cHRlZA==",
"encryption-cipher": "QUVTL0dDTS9Ob1BhZGRpbmc=",
"encryption-key-id": "..."
},
"data": "..."
}
]
},
"workflowExecutionTimeout": "0s",
"workflowRunTimeout": "0s",
"workflowTaskTimeout": "10s",
"continuedExecutionRunId": "bb88090d-2e8c-4ebe-9191-289775fe7ed1",
"initiator": "Workflow",
"continuedFailure": null,
"lastCompletionResult": null,
"originalExecutionRunId": "a16f083f-33ea-4741-aff4-734ccc9169e8",
"identity": "",
"firstExecutionRunId": "bf752333-2d47-4725-9c2e-cf0e5b59c96d",
"retryPolicy": null,
"attempt": 1,
"workflowExecutionExpirationTime": null,
"cronSchedule": "",
"firstWorkflowTaskBackoff": null,
"memo": null,
"searchAttributes": null,
"prevAutoResetPoints": {
"points": []
},
"header": {
"fields": {
"_tracer-data": {
"metadata": {
"encoding": "anNvbi9wbGFpbg=="
},
"data": "..."
}
}
},
"parentInitiatedEventVersion": "0"
}
},
{
"eventId": "2",
"eventTime": "2023-05-22T07:07:46.530756520Z",
"eventType": "WorkflowTaskScheduled",
"version": "1",
"taskId": "5114599507",
"workerMayIgnore": false,
"workflowTaskScheduledEventAttributes": {
"taskQueue": {
"name": "products-sync-task-queue",
"kind": "Normal"
},
"startToCloseTimeout": "10s",
"attempt": 1
}
},
{
"eventId": "3",
"eventTime": "2023-05-22T07:07:46.669630969Z",
"eventType": "WorkflowTaskStarted",
"version": "1",
"taskId": "5114599515",
"workerMayIgnore": false,
"workflowTaskStartedEventAttributes": {
"scheduledEventId": "2",
"identity": "7@products-service-5645c64c7b-w4wg2",
"requestId": "3faba282-1f81-425e-9877-c73ba7b1cedc",
"suggestContinueAsNew": false,
"historySizeBytes": "0"
}
},
{
"eventId": "4",
"eventTime": "2023-05-22T07:07:46.741643267Z",
"eventType": "WorkflowTaskCompleted",
"version": "1",
"taskId": "5114599520",
"workerMayIgnore": false,
"workflowTaskCompletedEventAttributes": {
"scheduledEventId": "2",
"startedEventId": "3",
"identity": "7@products-service-5645c64c7b-w4wg2",
"binaryChecksum": "",
"workerVersioningId": null,
"sdkMetadata": null,
"meteringMetadata": null
}
},
{
"eventId": "5",
"eventTime": "2023-05-22T07:07:46.741689267Z",
"eventType": "ActivityTaskScheduled",
"version": "1",
"taskId": "5114599521",
"workerMayIgnore": false,
"activityTaskScheduledEventAttributes": {
"activityId": "e15fb579-82a5-3fef-9549-d30fe4d0597d",
"activityType": {
"name": "ProductsSyncActivity"
},
"taskQueue": {
"name": "products-sync-task-queue",
"kind": "Normal"
},
"header": {
"fields": {
"_tracer-data": {
"metadata": {
"encoding": "anNvbi9wbGFpbg=="
},
"data": "..."
}
}
},
"input": {
"payloads": [
{
"metadata": {
"class-type": "...",
"encoding": "YmluYXJ5L2VuY3J5cHRlZA==",
"encryption-cipher": "QUVTL0dDTS9Ob1BhZGRpbmc=",
"encryption-key-id": "..."
},
"data": "..."
}
]
},
"scheduleToCloseTimeout": "0s",
"scheduleToStartTimeout": "0s",
"startToCloseTimeout": "900s",
"heartbeatTimeout": "0s",
"workflowTaskCompletedEventId": "4",
"retryPolicy": {
"initialInterval": "1s",
"backoffCoefficient": 4,
"maximumInterval": "3600s",
"maximumAttempts": 0,
"nonRetryableErrorTypes": [
"....workflows.common.StopWorkflowException"
]
}
}
},
{
"eventId": "6",
"eventTime": "2023-05-22T07:07:46.816267458Z",
"eventType": "ActivityTaskStarted",
"version": "1",
"taskId": "5114599528",
"workerMayIgnore": false,
"activityTaskStartedEventAttributes": {
"scheduledEventId": "5",
"identity": "7@products-service-5645c64c7b-n7fb7",
"requestId": "2a604f5b-d22b-4c2e-a274-bcd085cfe6f9",
"attempt": 1,
"lastFailure": null
}
},
{
"eventId": "7",
"eventTime": "2023-05-22T07:07:46.895939988Z",
"eventType": "ActivityTaskCompleted",
"version": "1",
"taskId": "5114599529",
"workerMayIgnore": false,
"activityTaskCompletedEventAttributes": {
"result": {
"payloads": [
{
"metadata": {
"class-type": "...",
"encoding": "YmluYXJ5L2VuY3J5cHRlZA==",
"encryption-cipher": "QUVTL0dDTS9Ob1BhZGRpbmc=",
"encryption-key-id": "..."
},
"data": "..."
}
]
},
"scheduledEventId": "5",
"startedEventId": "6",
"identity": "7@products-service-5645c64c7b-n7fb7"
}
},
{
"eventId": "8",
"eventTime": "2023-05-22T07:07:46.895946578Z",
"eventType": "WorkflowTaskScheduled",
"version": "1",
"taskId": "5114599530",
"workerMayIgnore": false,
"workflowTaskScheduledEventAttributes": {
"taskQueue": {
"name": "7@products-service-5645c64c7b-w4wg2:37f3e3a8-dc06-4574-8f80-f2d66aee54bd",
"kind": "Sticky"
},
"startToCloseTimeout": "10s",
"attempt": 1
}
},
{
"eventId": "9",
"eventTime": "2023-05-22T07:07:47.007794989Z",
"eventType": "WorkflowTaskStarted",
"version": "1",
"taskId": "5114599535",
"workerMayIgnore": false,
"workflowTaskStartedEventAttributes": {
"scheduledEventId": "8",
"identity": "7@products-service-5645c64c7b-w4wg2",
"requestId": "de11742b-eaab-4bbb-8aff-f326d253216e",
"suggestContinueAsNew": false,
"historySizeBytes": "0"
}
},
{
"eventId": "10",
"eventTime": "2023-05-22T07:07:47.073839021Z",
"eventType": "WorkflowTaskCompleted",
"version": "1",
"taskId": "5114599540",
"workerMayIgnore": false,
"workflowTaskCompletedEventAttributes": {
"scheduledEventId": "8",
"startedEventId": "9",
"identity": "7@products-service-5645c64c7b-w4wg2",
"binaryChecksum": "",
"workerVersioningId": null,
"sdkMetadata": null,
"meteringMetadata": null
}
},
{
"eventId": "11",
"eventTime": "2023-05-22T07:07:47.073856165Z",
"eventType": "TimerStarted",
"version": "1",
"taskId": "5114599541",
"workerMayIgnore": false,
"timerStartedEventAttributes": {
"timerId": "2c7aaffa-b7b5-36d8-95ac-981c7b549da8",
"startToFireTimeout": "59.886s",
"workflowTaskCompletedEventId": "10"
}
},
{
"eventId": "12",
"eventTime": "2023-05-22T07:08:46.961276652Z",
"eventType": "TimerFired",
"version": "1",
"taskId": "5114599780",
"workerMayIgnore": false,
"timerFiredEventAttributes": {
"timerId": "2c7aaffa-b7b5-36d8-95ac-981c7b549da8",
"startedEventId": "11"
}
},
{
"eventId": "13",
"eventTime": "2023-05-22T07:08:46.961281757Z",
"eventType": "WorkflowTaskScheduled",
"version": "1",
"taskId": "5114599781",
"workerMayIgnore": false,
"workflowTaskScheduledEventAttributes": {
"taskQueue": {
"name": "7@products-service-5645c64c7b-w4wg2:37f3e3a8-dc06-4574-8f80-f2d66aee54bd",
"kind": "Sticky"
},
"startToCloseTimeout": "10s",
"attempt": 1
}
},
{
"eventId": "14",
"eventTime": "2023-05-22T07:08:47.868004653Z",
"eventType": "TimerFired",
"version": "1",
"taskId": "5114599786",
"workerMayIgnore": false,
"timerFiredEventAttributes": {
"timerId": "2c7aaffa-b7b5-36d8-95ac-981c7b549da8",
"startedEventId": "11"
}
},
{
"eventId": "15",
"eventTime": "2023-05-22T07:08:49.520594348Z",
"eventType": "WorkflowTaskStarted",
"version": "1",
"taskId": "5114599789",
"workerMayIgnore": false,
"workflowTaskStartedEventAttributes": {
"scheduledEventId": "13",
"identity": "7@products-service-5645c64c7b-w4wg2",
"requestId": "e70d2291-af6d-41c5-a3bd-070018916137",
"suggestContinueAsNew": false,
"historySizeBytes": "0"
}
},
{
"eventId": "16",
"eventTime": "2023-05-22T07:08:49.602267895Z",
"eventType": "WorkflowTaskFailed",
"version": "1",
"taskId": "5114599794",
"workerMayIgnore": false,
"workflowTaskFailedEventAttributes": {
"scheduledEventId": "13",
"startedEventId": "15",
"cause": "Unspecified",
"failure": {
"message": "Failure handling event 14 of type 'EVENT_TYPE_TIMER_FIRED' during replay. {WorkflowTaskStartedEventId=15, CurrentStartedEventId=9}",
"source": "JavaSDK",
"stackTrace": "io.temporal.internal.statemachines.WorkflowStateMachines.createEventProcessingException(WorkflowStateMachines.java:257)\nio.temporal.internal.statemachines.WorkflowStateMachines.handleEventsBatch(WorkflowStateMachines.java:236)\nio.temporal.internal.statemachines.WorkflowStateMachines.handleEvent(WorkflowStateMachines.java:208)\nio.temporal.internal.replay.ReplayWorkflowRunTaskHandler.applyServerHistory(ReplayWorkflowRunTaskHandler.java:220)\nio.temporal.internal.replay.ReplayWorkflowRunTaskHandler.handleWorkflowTaskImpl(ReplayWorkflowRunTaskHandler.java:204)\nio.temporal.internal.replay.ReplayWorkflowRunTaskHandler.handleWorkflowTask(ReplayWorkflowRunTaskHandler.java:153)\nio.temporal.internal.replay.ReplayWorkflowTaskHandler.handleWorkflowTaskWithQuery(ReplayWorkflowTaskHandler.java:131)\nio.temporal.internal.replay.ReplayWorkflowTaskHandler.handleWorkflowTask(ReplayWorkflowTaskHandler.java:96)\nio.temporal.internal.worker.WorkflowWorker$TaskHandlerImpl.handleTask(WorkflowWorker.java:337)\nio.temporal.internal.worker.WorkflowWorker$TaskHandlerImpl.handle(WorkflowWorker.java:247)\nio.temporal.internal.worker.WorkflowWorker$TaskHandlerImpl.handle(WorkflowWorker.java:189)\nio.temporal.internal.worker.PollTaskExecutor.lambda$process$0(PollTaskExecutor.java:103)\njava.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)\njava.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)\njava.base/java.lang.Thread.run(Thread.java:829)\n",
"encodedAttributes": null,
"cause": {
"message": "Unexpected event:event_id: 14\nevent_time {\n seconds: 1684739327\n nanos: 868004653\n}\nevent_type: EVENT_TYPE_TIMER_FIRED\nversion: 1\ntask_id: 5114599786\ntimer_fired_event_attributes {\n timer_id: \"2c7aaffa-b7b5-36d8-95ac-981c7b549da8\"\n started_event_id: 11\n}\n",
"source": "JavaSDK",
"stackTrace": "io.temporal.internal.statemachines.WorkflowStateMachines.handleNonStatefulEvent(WorkflowStateMachines.java:521)\nio.temporal.internal.statemachines.WorkflowStateMachines.handleSingleEvent(WorkflowStateMachines.java:282)\nio.temporal.internal.statemachines.WorkflowStateMachines.handleEventsBatch(WorkflowStateMachines.java:234)\nio.temporal.internal.statemachines.WorkflowStateMachines.handleEvent(WorkflowStateMachines.java:208)\nio.temporal.internal.replay.ReplayWorkflowRunTaskHandler.applyServerHistory(ReplayWorkflowRunTaskHandler.java:220)\nio.temporal.internal.replay.ReplayWorkflowRunTaskHandler.handleWorkflowTaskImpl(ReplayWorkflowRunTaskHandler.java:204)\nio.temporal.internal.replay.ReplayWorkflowRunTaskHandler.handleWorkflowTask(ReplayWorkflowRunTaskHandler.java:153)\nio.temporal.internal.replay.ReplayWorkflowTaskHandler.handleWorkflowTaskWithQuery(ReplayWorkflowTaskHandler.java:131)\nio.temporal.internal.replay.ReplayWorkflowTaskHandler.handleWorkflowTask(ReplayWorkflowTaskHandler.java:96)\nio.temporal.internal.worker.WorkflowWorker$TaskHandlerImpl.handleTask(WorkflowWorker.java:337)\nio.temporal.internal.worker.WorkflowWorker$TaskHandlerImpl.handle(WorkflowWorker.java:247)\nio.temporal.internal.worker.WorkflowWorker$TaskHandlerImpl.handle(WorkflowWorker.java:189)\nio.temporal.internal.worker.PollTaskExecutor.lambda$process$0(PollTaskExecutor.java:103)\njava.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)\njava.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)\njava.base/java.lang.Thread.run(Thread.java:829)\n",
"encodedAttributes": null,
"cause": null,
"applicationFailureInfo": {
"type": "java.lang.IllegalArgumentException",
"nonRetryable": false,
"details": null
}
},
"applicationFailureInfo": {
"type": "io.temporal.internal.statemachines.InternalWorkflowTaskException",
"nonRetryable": false,
"details": null
}
},
"identity": "7@products-service-5645c64c7b-w4wg2",
"baseRunId": "",
"newRunId": "",
"forkEventVersion": "0",
"binaryChecksum": ""
}
},
{
"eventId": "17",
"eventTime": "2023-06-01T15:12:06.162611701Z",
"eventType": "WorkflowExecutionTerminated",
"version": "1",
"taskId": "5118629361",
"workerMayIgnore": false,
"workflowExecutionTerminatedEventAttributes": {
"reason": "",
"details": null,
"identity": "history-resetter"
}
}
]