Temporal TIMEOUT_TYPE_START_TO_CLOSE

Dear team,

I have Temporal workers that run workflows.
Occasionally, but consistently for around 0.15% of executions, activities don’t get executed. They time out with a TIMEOUT_TYPE_START_TO_CLOSE error.

As a temporary solution we added retries. This helps; however, the problem is that we need speed from Temporal, and every retry affects our business.
Let’s say our SLA is 2 seconds; each activity takes ~900 ms, but it could take as long as 1.5 s or as little as 0.5 s.

My question is: where is the bottleneck? Where should I pay attention? I already have 6 workers; maybe that’s too many? Maybe it’s related to the temporal-history pods? Or maybe I need to change the polling settings?

Hi,

It can be several factors. Can you check the schedule_to_start latency for activities? If you don’t have this metric, you can share a workflow history (JSON) that shows this issue, with the payloads removed, and we can take a look.

Please see:

Antonio

Sure, here is the payload of one of the workflows:

{
  "events": [
    {
      "eventId": "1",
      "eventTime": "2025-07-28T13:31:04.684264570Z",
      "eventType": "EVENT_TYPE_WORKFLOW_EXECUTION_STARTED",
      "taskId": "10585501637",
      "workflowExecutionStartedEventAttributes": {
        "workflowType": {
          "name": "myType"
        },
        "taskQueue": {
          "name": "TASK_QUEUE",
          "kind": "TASK_QUEUE_KIND_NORMAL"
        },
        "input": {
          "payloads": [
            {
              "metadata": {
                "encoding": "dfas=="
              },
              "data": {
                "some object": {}
                },
                "Event": {
                  "event_type": "Created",
                  "alias": "01971200-64b6-7d4b-a412-b9317721c61f_ORIGINAL_1753709464647",                  },
                  "start_time": "2025-07-28T13:31:04.647Z",
                  "response": {
                    "id": "",
                  },
                  "produce_time": "2025-07-28T13:31:04.674763056Z",
                }
              }
            }
          ]
        },
        "workflowExecutionTimeout": "0s",
        "workflowRunTimeout": "0s",
        "workflowTaskTimeout": "10s",
        "originalExecutionRunId": "0198513a-cc6c-7405-8c25-a162a23b3bf0",
        "identity": "1@tov-services-prd-execution-67546cf5fd-2b24n@",
        "firstExecutionRunId": "0198513a-cc6c-7405-8c25-a162a23b3bf0",
        "retryPolicy": {
          "initialInterval": "1s",
          "backoffCoefficient": 2,
          "maximumInterval": "100s",
          "maximumAttempts": 1
        },
        "attempt": 1,
        "firstWorkflowTaskBackoff": "0s",
        "header": {},
        "workflowId": "bc4129d7-6c22-4e6e-bb38-f92cb449013a"
      }
    },
    {
      "eventId": "2",
      "eventTime": "2025-07-28T13:31:04.684361001Z",
      "eventType": "EVENT_TYPE_WORKFLOW_TASK_SCHEDULED",
      "taskId": "10585501638",
      "workflowTaskScheduledEventAttributes": {
        "taskQueue": {
          "name": "TASK_QUEUE",
          "kind": "TASK_QUEUE_KIND_NORMAL"
        },
        "startToCloseTimeout": "10s",
        "attempt": 1
      }
    },
    {
      "eventId": "3",
      "eventTime": "2025-07-28T13:31:04.740833203Z",
      "eventType": "EVENT_TYPE_WORKFLOW_TASK_STARTED",
      "taskId": "10585501643",
      "workflowTaskStartedEventAttributes": {
        "scheduledEventId": "2",
        "identity": "1@worker-7865476bc5-pq9ff@",
        "requestId": "88e7655e-7153-4fe3-9197-23f810945287",
        "historySizeBytes": "2717",
        "workerVersion": {
          "buildId": "0b4eccf6492fff7fc9aab65a044c83f9"
        }
      }
    },
    {
      "eventId": "4",
      "eventTime": "2025-07-28T13:31:04.769487733Z",
      "eventType": "EVENT_TYPE_WORKFLOW_TASK_COMPLETED",
      "taskId": "10585501647",
      "workflowTaskCompletedEventAttributes": {
        "scheduledEventId": "2",
        "startedEventId": "3",
        "identity": "1@temporalworker-7865476bc5-pq9ff@",
        "workerVersion": {
          "buildId": "0b4eccf6492fff7fc9aab65a044c83f9"
        },
        "sdkMetadata": {
          "langUsedFlags": [
            3
          ],
          "sdkName": "temporal-go",
          "sdkVersion": "1.34.0"
        },
        "meteringMetadata": {}
      }
    },
    {
      "eventId": "5",
      "eventTime": "2025-07-28T13:31:04.769551384Z",
      "eventType": "EVENT_TYPE_ACTIVITY_TASK_SCHEDULED",
      "taskId": "10585501648",
      "activityTaskScheduledEventAttributes": {
        "activityId": "5",
        "activityType": {
          "name": "myDefinition"
        },
        "taskQueue": {
          "name": "INGESTOR_TASK_QUEUE",
          "kind": "TASK_QUEUE_KIND_NORMAL"
        },
        "header": {},
        "input": {
          "payloads": [
            {
              "metadata": {
                "encoding": "anNvbi9wbGFpbg=="
              },
              "data": {
                },
                "Event": {
                  "event_type": "Created",
                  "alias": "01971200-64b6-7d4b-a412-b9317721c61f_ORIGINAL_1753709464647",
                  "start_time": "2025-07-28T13:31:04.647Z",
                  "response": {
                    "id": "",
                  },
                  "produce_time": "2025-07-28T13:31:04.674763056Z",
                }
              }
            }
          ]
        },
        "scheduleToCloseTimeout": "0s",
        "scheduleToStartTimeout": "0s",
        "startToCloseTimeout": "10s",
        "heartbeatTimeout": "0s",
        "workflowTaskCompletedEventId": "4",
        "retryPolicy": {
          "initialInterval": "1s",
          "backoffCoefficient": 2,
          "maximumInterval": "100s",
          "maximumAttempts": 3
        },
        "useWorkflowBuildId": true
      }
    },
    {
      "eventId": "6",
      "eventTime": "2025-07-28T13:31:15.810250073Z",
      "eventType": "EVENT_TYPE_ACTIVITY_TASK_STARTED",
      "taskId": "10585501657",
      "activityTaskStartedEventAttributes": {
        "scheduledEventId": "5",
        "identity": "1@temporalworker-7865476bc5-8pbfh@",
        "requestId": "741bd76a-3907-4010-859b-3a509534291a",
        "attempt": 2,
        "lastFailure": {
          "message": "activity StartToClose timeout",
          "source": "Server",
          "timeoutFailureInfo": {
            "timeoutType": "TIMEOUT_TYPE_START_TO_CLOSE"
          }
        },
        "workerVersion": {
          "buildId": "0b4eccf6492fff7fc9aab65a044c83f9"
        }
      }
    },
    {
      "eventId": "7",
      "eventTime": "2025-07-28T13:31:18.055572715Z",
      "eventType": "EVENT_TYPE_ACTIVITY_TASK_COMPLETED",
      "taskId": "10585501658",
      "activityTaskCompletedEventAttributes": {
        "result": {
          "payloads": [
            {
              "metadata": {
                "encoding": "anNvbi9wbGFpbg=="
              },
              "data": {
                "success": true
              }
            }
          ]
        },
        "scheduledEventId": "5",
        "startedEventId": "6",
        "identity": "1@prd-temporalworker-7865476bc5-8pbfh@"
      }
    },
    {
      "eventId": "8",
      "eventTime": "2025-07-28T13:31:18.055580785Z",
      "eventType": "EVENT_TYPE_WORKFLOW_TASK_SCHEDULED",
      "taskId": "10585501659",
      "workflowTaskScheduledEventAttributes": {
        "taskQueue": {
          "name": "temporalworker-7865476bc5-pq9ff:1723737f-28fb-4ddb-80fc-545ad6231cfd",
          "kind": "TASK_QUEUE_KIND_STICKY",
          "normalName": "TASK_QUEUE"
        },
        "startToCloseTimeout": "10s",
        "attempt": 1
      }
    },
    {
      "eventId": "9",
      "eventTime": "2025-07-28T13:31:18.091037662Z",
      "eventType": "EVENT_TYPE_WORKFLOW_TASK_STARTED",
      "taskId": "10585501663",
      "workflowTaskStartedEventAttributes": {
        "scheduledEventId": "8",
        "identity": "1@prd-temporalworker-7865476bc5-pq9ff@",
        "requestId": "0b2bbab7-983d-40f2-9896-f1a4667ae15a",
        "historySizeBytes": "6009",
        "workerVersion": {
          "buildId": "0b4eccf6492fff7fc9aab65a044c83f9"
        }
      }
    },
    {
      "eventId": "10",
      "eventTime": "2025-07-28T13:31:18.124535904Z",
      "eventType": "EVENT_TYPE_WORKFLOW_TASK_COMPLETED",
      "taskId": "10585501667",
      "workflowTaskCompletedEventAttributes": {
        "scheduledEventId": "8",
        "startedEventId": "9",
        "identity": "1@prd-temporalworker-7865476bc5-pq9ff@",
        "workerVersion": {
          "buildId": "0b4eccf6492fff7fc9aab65a044c83f9"
        },
        "sdkMetadata": {},
        "meteringMetadata": {}
      }
    },
    {
      "eventId": "11",
      "eventTime": "2025-07-28T13:31:18.124576445Z",
      "eventType": "EVENT_TYPE_WORKFLOW_EXECUTION_COMPLETED",
      "taskId": "10585501668",
      "workflowExecutionCompletedEventAttributes": {
        "workflowTaskCompletedEventId": "10"
      }
    }
  ]
}

Hi, thanks,

If the maximum time the activity can take is 1.5 s, maybe you can reduce the start-to-close timeout to 2 seconds (instead of 10 seconds). Using activity heartbeats (see “Detecting Activity failures | Temporal Platform Documentation”) is another option to make the server schedule the next retry faster if the activity is not making progress.

Antonio