How to track Resource Exhausted

Hi team:

Resource Exhaustion happened in our environment. Could you tell me how to track this issue?

Env:
temporalio/server:1.22.3
temporalio/admin-tools:1.22.3
temporalio/ui:latest

Workflow Status:

SELECT 
    sum(t1.wsuc) AS success, 
    sum(t2.wfai) AS fail, 
    sumIf(t3.wtim, t3.time IS NOT NULL) AS terminate,
    sumIf(t4.wsuc, t4.time IS NOT NULL) AS cancel,
    (sum(t1.wsuc) + sum(t2.wfai) + sumIf(t3.wtim, t3.time IS NOT NULL) + sumIf(t4.wsuc, t4.time IS NOT NULL)) as total,
    t1.time AS time
FROM (
    SELECT 
        sumIf("workflow_success", operation='CompletionStats') AS wsuc, 
        toStartOfInterval(time, INTERVAL :INTERVAL:) AS time 
    FROM "temporal"."workflow_success" 
    WHERE time > :startTime: AND time < :endTime: 
    GROUP BY time 
    ORDER BY time
) t1 
LEFT JOIN (
    SELECT 
        sumIf("workflow_failed", operation='CompletionStats') AS wfai, 
        toStartOfInterval(time, INTERVAL :INTERVAL:) AS time 
    FROM "temporal"."workflow_failed" 
    WHERE time > :startTime: AND time < :endTime: 
    GROUP BY time 
    ORDER BY time
) t2 ON t1.time = t2.time 
LEFT JOIN (
    SELECT 
        sumIf("workflow_terminate", operation='CompletionStats') AS wtim, 
        toStartOfInterval(time, INTERVAL :INTERVAL:) AS time 
    FROM "temporal"."workflow_terminate" 
    WHERE time > :startTime: AND time < :endTime: 
    GROUP BY time 
    ORDER BY time
) t3 ON t1.time = t3.time 
LEFT JOIN (
    SELECT 
        sumIf("workflow_cancel", operation='CompletionStats') AS wsuc, 
        toStartOfInterval(time, INTERVAL :INTERVAL:) AS time 
    FROM "temporal"."workflow_cancel" 
    WHERE time > :startTime: AND time < :endTime: 
    GROUP BY time 
    ORDER BY time
) t4 ON t1.time = t4.time 
GROUP BY time 
ORDER BY time;

Activity Status

SELECT
  sumIf ("service_requests", operation = 'AddActivityTask') AS "Scheduled",
  sumIf (
    "service_requests",
    operation = 'RecordActivityTaskStarted'
  ) AS "Started",
  sumIf (
    "service_requests",
    operation IN (
      'RespondActivityTaskCompleted',
      'RespondActivityTaskFailed',
      'RespondActivityTaskCanceled'
    )
  ) AS "Completed",
    "Scheduled" + "Started" + "Completed" AS "Total",
  toStartOfInterval (time, INTERVAL :INTERVAL:) AS time
FROM
  "temporal"."service_requests"
WHERE
  time > :startTime:
  AND time < :endTime:
GROUP BY
  time
ORDER BY
  time

Cache Latency:

SELECT histogram_quantile(sumForEach(cache_latency), [0.001,0.002,0.005,0.01,0.02,0.05,0.1,0.2,0.5,1,2,5,10,20,50,100,200,500,1000], 0.99) AS "latency(HistoryCacheGetOrCreate)", toStartOfInterval(time, INTERVAL :INTERVAL:) AS time
FROM "temporal"."cache_latency" 
WHERE time > :startTime: AND time < :endTime: AND ( "operation"='HistoryCacheGetOrCreate' ) 
GROUP BY time 
ORDER BY time 

Slots_Available:

SELECT
  avg("temporal_worker_task_slots_available") AS "worker_task_slots_available",
  toStartOfInterval (time, INTERVAL :INTERVAL:) AS time,
  "worker_type"
FROM
  "temporal"."temporal_worker_task_slots_available"
WHERE
  time > :startTime:
  AND time < :endTime:
  AND (
    "worker_type" = 'WorkflowWorker'
    or "worker_type" = 'ActivityWorker'
  )
GROUP BY
  time,
  "worker_type"
ORDER BY
  time

Poll Success Rate:

SELECT
    ((sum(t1.suc) + sum(t2.suc_sync)) / (sum(t1.suc) + sum(t2.suc_sync) + sum(t3.timeout))) * 100 as poll_success_rate,
    t1.time AS time
FROM
    (
        SELECT
            sum("poll_success") AS suc,
            toStartOfInterval(time, INTERVAL :INTERVAL:) AS time
        FROM
            "temporal"."poll_success"
        WHERE
            time > :startTime: and time < :endTime:
        GROUP BY
            time
        ORDER BY
            time
    ) t1
    LEFT JOIN (
        SELECT
            sum("poll_success_sync") AS suc_sync,
            toStartOfInterval(time, INTERVAL :INTERVAL:) AS time
        FROM
            "temporal"."poll_success_sync"
        WHERE
            time > :startTime: and time < :endTime:
        GROUP BY
            time
        ORDER BY
            time
    ) t2 ON t1.time = t2.time
    LEFT JOIN (
        SELECT
            sum( "poll_timeouts") AS timeout,
            toStartOfInterval(time, INTERVAL :INTERVAL:) AS time
        FROM
            "temporal"."poll_timeouts"
        WHERE
            time > :startTime: and time < :endTime:
        GROUP BY
            time
        ORDER BY
            time
    ) t3 ON t1.time = t3.time
GROUP BY
    time
ORDER BY
    time;

schedule to start latency:

SELECT
  sum(t1.tstsl) AS task_latency,
  sum(t2.tastsl) AS activity_latency,
  sum (t3.twlstsl) AS workflow_task_latency,
  t1.time AS time
FROM
  (
    SELECT
      sum(task_schedule_to_start_latency[20]) AS tstsl,
      toStartOfInterval (time, INTERVAL :INTERVAL:) AS time
    FROM
      "temporal"."task_schedule_to_start_latency"
    WHERE
      time > :startTime:
      AND time < :endTime:
    GROUP BY
      time
    ORDER BY
      time
  ) t1
  LEFT JOIN (
    SELECT
      sum(temporal_activity_schedule_to_start_latency[20]) AS tastsl,
      toStartOfInterval (time, INTERVAL :INTERVAL:) AS time
    FROM
      "temporal"."temporal_activity_schedule_to_start_latency"
    WHERE
      time > :startTime:
      AND time < :endTime:
    GROUP BY
      time
    ORDER BY
      time
  ) t2 ON t1.time = t2.time
  LEFT JOIN (
    SELECT
      sum(temporal_workflow_task_schedule_to_start_latency[20]) AS twlstsl,
      toStartOfInterval (time, INTERVAL :INTERVAL:) AS time
    FROM
      "temporal"."temporal_workflow_task_schedule_to_start_latency"
    WHERE
      time > :startTime:
      AND time < :endTime:
    GROUP BY
      time
    ORDER BY
      time
  ) t3 ON t1.time = t3.time
GROUP BY
  time
ORDER BY
  time;

sticky cache size:

SELECT
  sum("temporal_sticky_cache_size") AS "sum_temporal_sticky_cache_size",
  toStartOfInterval (time, INTERVAL :INTERVAL:) AS time
FROM
  "temporal"."temporal_sticky_cache_size"
WHERE
  time > :startTime:
  AND time < :endTime:
GROUP BY
  time
ORDER BY
  time

:rofl: Resource Exhausted:

SELECT
  sum("service_errors_resource_exhausted") AS "sum_service_errors_resource_exhausted",
  toStartOfInterval (time, INTERVAL :INTERVAL:) AS time,
  "service_name"
FROM
  "temporal"."service_errors_resource_exhausted"
WHERE
  time > :startTime:
  AND time < :endTime:
GROUP BY
  time,
  "service_name"
ORDER BY
  time

Need to understand the resource exhausted cause to see whats really going on,
in your last query group also by resource_exhausted_cause

See if cause is BusyWorkflow (which would be my guess here), if so then you are getting throttled on workflow lock latency, meaning your use case starting too many async activities/child workflows is most likely cause.

Other things you could be getting throttled on is ConcurrentLimit cause, meaning you are have over max num of pollers for namespace (dynamic config frontend.namespaceCount)
or could be SystemExhausted, PersistenceLimi, or ApsLimit. See which ones you are running into and we can go from there.