I found many error logs like the following in my Temporal server’s log:
{"level":"error","ts":"2023-04-07T10:36:11.662+0800","msg":"Operation failed with internal error.","error":"UpdateTaskQueue failed. Failed to start transaction. Error: invalid connection","operation":"UpdateTaskQueue","logging-call-at":"persistenceMetricClients.go:1171","stacktrace":"go.temporal.io/server/common/log.(*zapLogger).Error\n\t/gaia/workspace-job/git.xiaojukeji.com/soda-server/temporal-server/internal/common/log/zap_logger.go:150\ngo.temporal.io/server/common/persistence.updateErrorMetric\n\t/gaia/workspace-job/git.xiaojukeji.com/soda-server/temporal-server/internal/common/persistence/persistenceMetricClients.go:1171\ngo.temporal.io/server/common/persistence.(*metricEmitter).recordRequestMetrics\n\t/gaia/workspace-job/git.xiaojukeji.com/soda-server/temporal-server/internal/common/persistence/persistenceMetricClients.go:1148\ngo.temporal.io/server/common/persistence.(*taskPersistenceClient).UpdateTaskQueue.func1\n\t/gaia/workspace-job/git.xiaojukeji.com/soda-server/temporal-server/internal/common/persistence/persistenceMetricClients.go:555\ngo.temporal.io/server/common/persistence.(*taskPersistenceClient).UpdateTaskQueue\n\t/gaia/workspace-job/git.xiaojukeji.com/soda-server/temporal-server/internal/common/persistence/persistenceMetricClients.go:557\ngo.temporal.io/server/service/matching.(*taskQueueDB).updateTaskQueue\n\t/gaia/workspace-job/git.xiaojukeji.com/soda-server/temporal-server/internal/service/matching/db.go:376\ngo.temporal.io/server/service/matching.(*taskQueueDB).UpdateState\n\t/gaia/workspace-job/git.xiaojukeji.com/soda-server/temporal-server/internal/service/matching/db.go:186\ngo.temporal.io/server/service/matching.(*taskReader).persistAckLevel\n\t/gaia/workspace-job/git.xiaojukeji.com/soda-server/temporal-server/internal/service/matching/taskReader.go:297\ngo.temporal.io/server/service/matching.(*taskReader).getTasksPump\n\t/gaia/workspace-job/git.xiaojukeji.com/soda-server/temporal-server/internal/service/matching/taskReader.go:209\ngo.temporal.io/server/internal/goro.(*Group).Go.func1\n\t/gaia/workspace-job/git.xiaojukeji.com/soda-server/temporal-server/internal/internal/goro/group.go:58"}
{"level":"error","ts":"2023-04-07T10:36:11.662+0800","msg":"Persistent store operation failure","service":"matching","component":"matching-engine","wf-task-queue-name":"/_sys/temporal-sys-tq-scanner-taskqueue-0/1","wf-task-queue-type":"Activity","wf-namespace":"temporal-system","store-operation":"update-task-queue","error":"UpdateTaskQueue failed. Failed to start transaction. Error: invalid connection","logging-call-at":"taskReader.go:212","stacktrace":"go.temporal.io/server/common/log.(*zapLogger).Error\n\t/gaia/workspace-job/git.xiaojukeji.com/soda-server/temporal-server/internal/common/log/zap_logger.go:150\ngo.temporal.io/server/service/matching.(*taskReader).getTasksPump\n\t/gaia/workspace-job/git.xiaojukeji.com/soda-server/temporal-server/internal/service/matching/taskReader.go:212\ngo.temporal.io/server/internal/goro.(*Group).Go.func1\n\t/gaia/workspace-job/git.xiaojukeji.com/soda-server/temporal-server/internal/internal/goro/group.go:58"}
Temporal server version: 1.20.0
MySQL version: 5.7
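For context, "invalid connection" is the error string that go-sql-driver/mysql returns when the pool hands out a connection that the MySQL server has already closed, e.g. one that sat idle longer than wait_timeout or was cut by a proxy in between. My working assumption (not confirmed) is that this is what is happening here, and that keeping the pool's connection lifetime below the server-side timeout would avoid it. Here is a minimal sketch of the mechanism using plain database/sql; the DSN is a placeholder, and Temporal itself should expose the equivalent pool settings (maxConns, maxIdleConns, maxConnLifetime) in the SQL datastore config, if I am reading the config right:

package main

import (
	"database/sql"
	"log"
	"time"

	_ "github.com/go-sql-driver/mysql" // the driver that emits "invalid connection"
)

func main() {
	// Placeholder DSN; substitute real host/credentials.
	db, err := sql.Open("mysql", "user:pass@tcp(127.0.0.1:3306)/temporal")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// If connections are recycled before MySQL's wait_timeout (or any
	// proxy/LB idle timeout) kicks in, the pool never hands out a dead
	// connection, so the driver never reports "invalid connection".
	db.SetConnMaxLifetime(time.Minute) // keep well below wait_timeout
	db.SetMaxIdleConns(10)
	db.SetMaxOpenConns(20)

	if err := db.Ping(); err != nil {
		log.Fatal(err)
	}
}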
It seems that the error was reported from this part of the code, specifically from the m.txExecute call:
func (m *sqlTaskManager) UpdateTaskQueue(
	ctx context.Context,
	request *persistence.InternalUpdateTaskQueueRequest,
) (*persistence.UpdateTaskQueueResponse, error) {
	nidBytes, err := primitives.ParseUUID(request.NamespaceID)
	if err != nil {
		return nil, serviceerror.NewInternal(err.Error())
	}
	tqId, tqHash := m.taskQueueIdAndHash(nidBytes, request.TaskQueue, request.TaskType)
	var resp *persistence.UpdateTaskQueueResponse
	err = m.txExecute(ctx, "UpdateTaskQueue", func(tx sqlplugin.Tx) error {
		if err := lockTaskQueue(ctx,
			tx,
			tqHash,
			tqId,
			request.PrevRangeID,
		); err != nil {
			return err
		}
		result, err := tx.UpdateTaskQueues(ctx, &sqlplugin.TaskQueuesRow{
			RangeHash:    tqHash,
			TaskQueueID:  tqId,
			RangeID:      request.RangeID,
			Data:         request.TaskQueueInfo.Data,
			DataEncoding: request.TaskQueueInfo.EncodingType.String(),
		})
		if err != nil {
			return err
		}
		rowsAffected, err := result.RowsAffected()
		if err != nil {
			return err
		}
		if rowsAffected != 1 {
			return fmt.Errorf("%v rows were affected instead of 1", rowsAffected)
		}
		resp = &persistence.UpdateTaskQueueResponse{}
		return nil
	})
	return resp, err
}
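The wording "Failed to start transaction" suggests the failure happens before anything inside the closure above runs: m.txExecute apparently cannot even begin the transaction, so the driver error surfaces while acquiring a usable connection. I have not dug into that helper, but from the error format I assume it is shaped roughly like this (a sketch inferred from the log message, not the actual Temporal source):

// Hypothetical shape of the txExecute helper, inferred from the logged
// error format; names and details are assumptions.
func (m *sqlStore) txExecute(ctx context.Context, operation string, fn func(tx sqlplugin.Tx) error) error {
	tx, err := m.Db.BeginTx(ctx)
	if err != nil {
		// This branch would produce the logged text:
		// "UpdateTaskQueue failed. Failed to start transaction. Error: invalid connection"
		// i.e. BeginTx failed, before lockTaskQueue or UpdateTaskQueues ever ran.
		return serviceerror.NewUnavailable(
			fmt.Sprintf("%v failed. Failed to start transaction. Error: %v", operation, err))
	}
	if err := fn(tx); err != nil {
		_ = tx.Rollback() // best-effort rollback; the original error wins
		return err
	}
	return tx.Commit()
}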
The actual issue is that the sample workflow occasionally gets stuck at either the WorkflowTaskScheduled event or the ActivityTaskScheduled event, and remains there until the workflow eventually times out.
I’m not sure whether this issue is related to the error logs mentioned above.
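One data point I can add: tasks stuck at a *Scheduled event usually mean the matching service never dispatched them to a poller, which is why I suspect the UpdateTaskQueue failures above. To rule out a missing worker, I have been checking the queue with DescribeTaskQueue from the Go SDK (a diagnostic sketch; the task queue name is a placeholder):

package main

import (
	"context"
	"fmt"
	"log"

	enumspb "go.temporal.io/api/enums/v1"
	"go.temporal.io/sdk/client"
)

func main() {
	c, err := client.Dial(client.Options{}) // defaults to localhost:7233
	if err != nil {
		log.Fatal(err)
	}
	defer c.Close()

	// "sample-task-queue" is a placeholder for the stuck workflow's queue;
	// use TASK_QUEUE_TYPE_ACTIVITY to check the activity side as well.
	resp, err := c.DescribeTaskQueue(context.Background(),
		"sample-task-queue", enumspb.TASK_QUEUE_TYPE_WORKFLOW)
	if err != nil {
		log.Fatal(err)
	}
	// An empty poller list (or a stale LastAccessTime) would explain tasks
	// sitting at WorkflowTaskScheduled without ever being picked up.
	for _, p := range resp.Pollers {
		fmt.Printf("poller %s, last access %v\n", p.GetIdentity(), p.GetLastAccessTime())
	}
}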