Temporal Cloud seems to not receive heartbeats from Go SDK after some time

VS

{"level":"debug","ts":1703713878.2367544,"caller":"heartbeat/heartbeat.go:60","msg":"HB","Namespace":"core-apac.t0lvw","TaskQueue":"worker-golang","WorkerID":"1@worker-0@","ActivityID":"11","ActivityType":"ParseChunk","Attempt":9,"WorkflowType":"ParseChunk","WorkflowID":"ParseChunk_2023-12-25T06:18:03.990801Z_3","RunID":"6ce7aae5-4826-4856-aa23-ee1e881acf10","details":[{"Step":0,"Progress":"2023-12-27T21:51:18.236733406Z"}]}
{"level":"debug","ts":1703713879.0549097,"caller":"heartbeat/heartbeat.go:60","msg":"HB","Namespace":"core-apac.t0lvw","TaskQueue":"worker-golang","WorkerID":"1@worker-0@","ActivityID":"11","ActivityType":"ParseChunk","Attempt":9,"WorkflowType":"ParseChunk","WorkflowID":"ParseChunk_2023-12-25T06:18:03.990801Z_3","RunID":"6ce7aae5-4826-4856-aa23-ee1e881acf10","details":[{"Step":0,"Progress":"2023-12-27T21:51:19.05489105Z"}]}
{"level":"debug","ts":1703713880.2367814,"caller":"heartbeat/heartbeat.go:60","msg":"HB","Namespace":"core-apac.t0lvw","TaskQueue":"worker-golang","WorkerID":"1@worker-0@","ActivityID":"11","ActivityType":"ParseChunk","Attempt":9,"WorkflowType":"ParseChunk","WorkflowID":"ParseChunk_2023-12-25T06:18:03.990801Z_3","RunID":"6ce7aae5-4826-4856-aa23-ee1e881acf10","details":[{"Step":0,"Progress":"2023-12-27T21:51:20.236763624Z"}]}
{"level":"debug","ts":1703713881.13676,"caller":"heartbeat/heartbeat.go:60","msg":"HB","Namespace":"core-apac.t0lvw","TaskQueue":"worker-golang","WorkerID":"1@worker-0@","ActivityID":"11","ActivityType":"ParseChunk","Attempt":9,"WorkflowType":"ParseChunk","WorkflowID":"ParseChunk_2023-12-25T06:18:03.990801Z_3","RunID":"6ce7aae5-4826-4856-aa23-ee1e881acf10","details":[{"Step":0,"Progress":"2023-12-27T21:51:21.136738353Z"}]}
{"level":"debug","ts":1703713882.1367195,"caller":"heartbeat/heartbeat.go:60","msg":"HB","Namespace":"core-apac.t0lvw","TaskQueue":"worker-golang","WorkerID":"1@worker-0@","ActivityID":"11","ActivityType":"ParseChunk","Attempt":9,"WorkflowType":"ParseChunk","WorkflowID":"ParseChunk_2023-12-25T06:18:03.990801Z_3","RunID":"6ce7aae5-4826-4856-aa23-ee1e881acf10","details":[{"Step":0,"Progress":"2023-12-27T21:51:22.136708378Z"}]}
{"level":"debug","ts":1703713883.236777,"caller":"heartbeat/heartbeat.go:60","msg":"HB","Namespace":"core-apac.t0lvw","TaskQueue":"worker-golang","WorkerID":"1@worker-0@","ActivityID":"11","ActivityType":"ParseChunk","Attempt":9,"WorkflowType":"ParseChunk","WorkflowID":"ParseChunk_2023-12-25T06:18:03.990801Z_3","RunID":"6ce7aae5-4826-4856-aa23-ee1e881acf10","details":[{"Step":0,"Progress":"2023-12-27T21:51:23.236757028Z"}]}
{"level":"debug","ts":1703713884.136784,"caller":"heartbeat/heartbeat.go:60","msg":"HB","Namespace":"core-apac.t0lvw","TaskQueue":"worker-golang","WorkerID":"1@worker-0@","ActivityID":"11","ActivityType":"ParseChunk","Attempt":9,"WorkflowType":"ParseChunk","WorkflowID":"ParseChunk_2023-12-25T06:18:03.990801Z_3","RunID":"6ce7aae5-4826-4856-aa23-ee1e881acf10","details":[{"Step":0,"Progress":"2023-12-27T21:51:24.136706416Z"}]}
{"level":"debug","ts":1703713885.2367258,"caller":"heartbeat/heartbeat.go:60","msg":"HB","Namespace":"core-apac.t0lvw","TaskQueue":"worker-golang","WorkerID":"1@worker-0@","ActivityID":"11","ActivityType":"ParseChunk","Attempt":9,"WorkflowType":"ParseChunk","WorkflowID":"ParseChunk_2023-12-25T06:18:03.990801Z_3","RunID":"6ce7aae5-4826-4856-aa23-ee1e881acf10","details":[{"Step":0,"Progress":"2023-12-27T21:51:25.236706396Z"}]}
{"level":"debug","ts":1703713886.136722,"caller":"heartbeat/heartbeat.go:60","msg":"HB","Namespace":"core-apac.t0lvw","TaskQueue":"worker-golang","WorkerID":"1@worker-0@","ActivityID":"11","ActivityType":"ParseChunk","Attempt":9,"WorkflowType":"ParseChunk","WorkflowID":"ParseChunk_2023-12-25T06:18:03.990801Z_3","RunID":"6ce7aae5-4826-4856-aa23-ee1e881acf10","details":[{"Step":0,"Progress":"2023-12-27T21:51:26.136702614Z"}]}
{"level":"debug","ts":1703713887.3372765,"caller":"heartbeat/heartbeat.go:60","msg":"HB","Namespace":"core-apac.t0lvw","TaskQueue":"worker-golang","WorkerID":"1@worker-0@","ActivityID":"11","ActivityType":"ParseChunk","Attempt":9,"WorkflowType":"ParseChunk","WorkflowID":"ParseChunk_2023-12-25T06:18:03.990801Z_3","RunID":"6ce7aae5-4826-4856-aa23-ee1e881acf10","details":[{"Step":0,"Progress":"2023-12-27T21:51:27.337261718Z"}]}
{"level":"debug","ts":1703713889.0368614,"caller":"heartbeat/heartbeat.go:60","msg":"HB","Namespace":"core-apac.t0lvw","TaskQueue":"worker-golang","WorkerID":"1@worker-0@","ActivityID":"11","ActivityType":"ParseChunk","Attempt":9,"WorkflowType":"ParseChunk","WorkflowID":"ParseChunk_2023-12-25T06:18:03.990801Z_3","RunID":"6ce7aae5-4826-4856-aa23-ee1e881acf10","details":[{"Step":0,"Progress":"2023-12-27T21:51:29.036839823Z"}]}
{"level":"debug","ts":1703713899.8397164,"caller":"heartbeat/heartbeat.go:60","msg":"HB","Namespace":"core-apac.t0lvw","TaskQueue":"worker-golang","WorkerID":"1@worker-0@","ActivityID":"11","ActivityType":"ParseChunk","Attempt":9,"WorkflowType":"ParseChunk","WorkflowID":"ParseChunk_2023-12-25T06:18:03.990801Z_3","RunID":"6ce7aae5-4826-4856-aa23-ee1e881acf10","details":[{"Step":0,"Progress":"2023-12-27T21:51:39.838884754Z"}]}
{"level":"debug","ts":1703713900.2367902,"caller":"heartbeat/heartbeat.go:60","msg":"HB","Namespace":"core-apac.t0lvw","TaskQueue":"worker-golang","WorkerID":"1@worker-0@","ActivityID":"11","ActivityType":"ParseChunk","Attempt":9,"WorkflowType":"ParseChunk","WorkflowID":"ParseChunk_2023-12-25T06:18:03.990801Z_3","RunID":"6ce7aae5-4826-4856-aa23-ee1e881acf10","details":[{"Step":0,"Progress":"2023-12-27T21:51:40.236770773Z"}]}
{"level":"warn","ts":1703713900.6369581,"caller":"internal/internal_task_handlers.go:1959","msg":"RecordActivityHeartbeat with error","Namespace":"core-apac.t0lvw","TaskQueue":"worker-golang","WorkerID":"1@worker-0@","ActivityID":"11","ActivityType":"ParseChunk","Attempt":9,"WorkflowType":"ParseChunk","WorkflowID":"ParseChunk_2023-12-25T06:18:03.990801Z_3","RunID":"6ce7aae5-4826-4856-aa23-ee1e881acf10","Error":"workflow execution already completed"}
{"level":"info","ts":1703713900.9368887,"caller":"internal/internal_worker_base.go:510","msg":"Task processing failed with error","Namespace":"core-apac.t0lvw","TaskQueue":"worker-golang","WorkerID":"1@worker-0@","WorkerType":"ActivityWorker","Error":"workflow execution already completed"}

Hi, could you please open a support ticket (support.temporal.io) for your cloud questions? Thanks!

Ok, will forward it there, thanks.

For the record, here is the code producing the log messages above.

		logger.Info("Starting download", "dst", rawPath)
		if _, _, err := heartbeat.WithHB(ctx, time.Second,
			func(x time.Time) any { return heartbeat.HBDownload(x) },
			func() (any, error) { return DownloadFile(ctx, s3EU, data.ResolveResp.File, rawPath) }); err != nil {
			return nil, err
		}

and heartbeat.go:

func SendHBs(ctx context.Context, span time.Duration, done chan struct{}, hf ...func(time.Time) any) {
	logger := activity.GetLogger(ctx)
	ticker := time.NewTicker(span)

	// Record heartbeat when it ticks, or finish.
	go func() {
		for {
			select {
			case <-ctx.Done(): // context is canceled
				ticker.Stop()
				return
			case <-done: // requested to stop
				ticker.Stop()
				return
			case now := <-ticker.C:
				details := []any{}
				for _, f := range hf {
					details = append(details, f(now))
				}
				logger.Debug("HB", "details", details)
				activity.RecordHeartbeat(ctx, details)
			}
		}
	}()
}

func WithHB(ctx context.Context, span time.Duration, hf func(time.Time) any, f func() (any, error)) (time.Duration, any, error) {
	preTime := time.Now()
	done := make(chan struct{})
	defer close(done)
	SendHBs(ctx, span, done, hf)
	ret, err := f()
	postTime := time.Now()
	return postTime.Sub(preTime), ret, err
}

This means that the workflow is completed without waiting for the activity to complete. As soon as the workflow is completed, all its activities are considered canceled, and the next heartbeat cancels the Context.

1 Like

Mmh, right, this particular copy paste happens like this because I terminated the workflow, unfortunately it kept on failing because of heartbeat activities before I terminated it.

Solved, the issue was excessive throttling on the VM that was running the Go worker, hence disrupting heartbeat emission. Thanks all!