Hi team:
Resource Exhaustion happened in our environment. Could you tell me how to track this issue?
Env:
temporalio/server:1.22.3
temporalio/admin-tools:1.22.3
temporalio/ui:latest
Workflow Status:
SELECT
sum(t1.wsuc) AS success,
sum(t2.wfai) AS fail,
sumIf(t3.wtim, t3.time IS NOT NULL) AS terminate,
sumIf(t4.wsuc, t4.time IS NOT NULL) AS cancel,
(sum(t1.wsuc) + sum(t2.wfai) + sumIf(t3.wtim, t3.time IS NOT NULL) + sumIf(t4.wsuc, t4.time IS NOT NULL)) as total,
t1.time AS time
FROM (
SELECT
sumIf("workflow_success", operation='CompletionStats') AS wsuc,
toStartOfInterval(time, INTERVAL :INTERVAL:) AS time
FROM "temporal"."workflow_success"
WHERE time > :startTime: AND time < :endTime:
GROUP BY time
ORDER BY time
) t1
LEFT JOIN (
SELECT
sumIf("workflow_failed", operation='CompletionStats') AS wfai,
toStartOfInterval(time, INTERVAL :INTERVAL:) AS time
FROM "temporal"."workflow_failed"
WHERE time > :startTime: AND time < :endTime:
GROUP BY time
ORDER BY time
) t2 ON t1.time = t2.time
LEFT JOIN (
SELECT
sumIf("workflow_terminate", operation='CompletionStats') AS wtim,
toStartOfInterval(time, INTERVAL :INTERVAL:) AS time
FROM "temporal"."workflow_terminate"
WHERE time > :startTime: AND time < :endTime:
GROUP BY time
ORDER BY time
) t3 ON t1.time = t3.time
LEFT JOIN (
SELECT
sumIf("workflow_cancel", operation='CompletionStats') AS wsuc,
toStartOfInterval(time, INTERVAL :INTERVAL:) AS time
FROM "temporal"."workflow_cancel"
WHERE time > :startTime: AND time < :endTime:
GROUP BY time
ORDER BY time
) t4 ON t1.time = t4.time
GROUP BY time
ORDER BY time;
Activity Status
SELECT
sumIf ("service_requests", operation = 'AddActivityTask') AS "Scheduled",
sumIf (
"service_requests",
operation = 'RecordActivityTaskStarted'
) AS "Started",
sumIf (
"service_requests",
operation IN (
'RespondActivityTaskCompleted',
'RespondActivityTaskFailed',
'RespondActivityTaskCanceled'
)
) AS "Completed",
"Scheduled" + "Started" + "Completed" AS "Total",
toStartOfInterval (time, INTERVAL :INTERVAL:) AS time
FROM
"temporal"."service_requests"
WHERE
time > :startTime:
AND time < :endTime:
GROUP BY
time
ORDER BY
time
Cache Latency:
SELECT histogram_quantile(sumForEach(cache_latency), [0.001,0.002,0.005,0.01,0.02,0.05,0.1,0.2,0.5,1,2,5,10,20,50,100,200,500,1000], 0.99) AS "latency(HistoryCacheGetOrCreate)", toStartOfInterval(time, INTERVAL :INTERVAL:) AS time
FROM "temporal"."cache_latency"
WHERE time > :startTime: AND time < :endTime: AND ( "operation"='HistoryCacheGetOrCreate' )
GROUP BY time
ORDER BY time
Slots_Available:
SELECT
avg("temporal_worker_task_slots_available") AS "worker_task_slots_available",
toStartOfInterval (time, INTERVAL :INTERVAL:) AS time,
"worker_type"
FROM
"temporal"."temporal_worker_task_slots_available"
WHERE
time > :startTime:
AND time < :endTime:
AND (
"worker_type" = 'WorkflowWorker'
or "worker_type" = 'ActivityWorker'
)
GROUP BY
time,
"worker_type"
ORDER BY
time
Poll Success Rate:
SELECT
((sum(t1.suc) + sum(t2.suc_sync)) / (sum(t1.suc) + sum(t2.suc_sync) + sum(t3.timeout))) * 100 as poll_success_rate,
t1.time AS time
FROM
(
SELECT
sum("poll_success") AS suc,
toStartOfInterval(time, INTERVAL :INTERVAL:) AS time
FROM
"temporal"."poll_success"
WHERE
time > :startTime: and time < :endTime:
GROUP BY
time
ORDER BY
time
) t1
LEFT JOIN (
SELECT
sum("poll_success_sync") AS suc_sync,
toStartOfInterval(time, INTERVAL :INTERVAL:) AS time
FROM
"temporal"."poll_success_sync"
WHERE
time > :startTime: and time < :endTime:
GROUP BY
time
ORDER BY
time
) t2 ON t1.time = t2.time
LEFT JOIN (
SELECT
sum( "poll_timeouts") AS timeout,
toStartOfInterval(time, INTERVAL :INTERVAL:) AS time
FROM
"temporal"."poll_timeouts"
WHERE
time > :startTime: and time < :endTime:
GROUP BY
time
ORDER BY
time
) t3 ON t1.time = t3.time
GROUP BY
time
ORDER BY
time;
schedule to start latency:
SELECT
sum(t1.tstsl) AS task_latency,
sum(t2.tastsl) AS activity_latency,
sum (t3.twlstsl) AS workflow_task_latency,
t1.time AS time
FROM
(
SELECT
sum(task_schedule_to_start_latency[20]) AS tstsl,
toStartOfInterval (time, INTERVAL :INTERVAL:) AS time
FROM
"temporal"."task_schedule_to_start_latency"
WHERE
time > :startTime:
AND time < :endTime:
GROUP BY
time
ORDER BY
time
) t1
LEFT JOIN (
SELECT
sum(temporal_activity_schedule_to_start_latency[20]) AS tastsl,
toStartOfInterval (time, INTERVAL :INTERVAL:) AS time
FROM
"temporal"."temporal_activity_schedule_to_start_latency"
WHERE
time > :startTime:
AND time < :endTime:
GROUP BY
time
ORDER BY
time
) t2 ON t1.time = t2.time
LEFT JOIN (
SELECT
sum(temporal_workflow_task_schedule_to_start_latency[20]) AS twlstsl,
toStartOfInterval (time, INTERVAL :INTERVAL:) AS time
FROM
"temporal"."temporal_workflow_task_schedule_to_start_latency"
WHERE
time > :startTime:
AND time < :endTime:
GROUP BY
time
ORDER BY
time
) t3 ON t1.time = t3.time
GROUP BY
time
ORDER BY
time;
sticky cache size:
SELECT
sum("temporal_sticky_cache_size") AS "sum_temporal_sticky_cache_size",
toStartOfInterval (time, INTERVAL :INTERVAL:) AS time
FROM
"temporal"."temporal_sticky_cache_size"
WHERE
time > :startTime:
AND time < :endTime:
GROUP BY
time
ORDER BY
time
Resource Exhausted:
SELECT
sum("service_errors_resource_exhausted") AS "sum_service_errors_resource_exhausted",
toStartOfInterval (time, INTERVAL :INTERVAL:) AS time,
"service_name"
FROM
"temporal"."service_errors_resource_exhausted"
WHERE
time > :startTime:
AND time < :endTime:
GROUP BY
time,
"service_name"
ORDER BY
time