From 5cfcb3122298c186faa88fa913c94e3b3ad49b4c Mon Sep 17 00:00:00 2001 From: Jan Safranek Date: Thu, 19 Aug 2021 13:36:07 +0200 Subject: [PATCH] Fix deadlock in recursive metric locks RecordMetrics() grabs a mutex and calls recordCancelMetric(), which wants to grab the same mutex. Go mutexes are not recursive, so recordCancelMetric blocks forever. recordCancelMetric should not grab the mutex, it can be sure that the caller did it already. --- pkg/metrics/metrics.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index c7f5f895..82188e45 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -231,7 +231,7 @@ func (opMgr *operationMetricsManager) RecordMetrics(opKey OperationKey, opStatus obj, exists := opMgr.cache[createKey] if exists { // record a cancel metric if found - opMgr.recordCancelMetric(obj, createKey, operationDuration) + opMgr.recordCancelMetricLocked(obj, createKey, operationDuration) } // check if we have a CreateSnapshotAndReady operation pending for this @@ -239,7 +239,7 @@ func (opMgr *operationMetricsManager) RecordMetrics(opKey OperationKey, opStatus obj, exists = opMgr.cache[createAndReadyKey] if exists { // record a cancel metric if found - opMgr.recordCancelMetric(obj, createAndReadyKey, operationDuration) + opMgr.recordCancelMetricLocked(obj, createAndReadyKey, operationDuration) } } @@ -248,9 +248,8 @@ func (opMgr *operationMetricsManager) RecordMetrics(opKey OperationKey, opStatus } // recordCancelMetric records a metric for a create operation that hasn't finished -func (opMgr *operationMetricsManager) recordCancelMetric(val OperationValue, key OperationKey, duration float64) { - opMgr.mu.Lock() - defer opMgr.mu.Unlock() +// This function must be called with opMgr mutex locked (to prevent recursive locks). +func (opMgr *operationMetricsManager) recordCancelMetricLocked(val OperationValue, key OperationKey, duration float64) { // record a cancel metric if found opMgr.opLatencyMetrics.WithLabelValues(