Linux 5.13 compat: retry zvol_open() when contended

Due to a possible lock inversion the zvol open call path on Linux needs to be able to retry in the case where the spa_namespace_lock cannot be acquired. For Linux 5.12 an older kernel this was accomplished by returning -ERESTARTSYS from zvol_open() to request that blkdev_get() drop the bdev->bd_mutex lock, reaquire it, then call the open callback again. However, as of the 5.13 kernel this behavior was removed. Therefore, for 5.12 and older kernels we preserved the existing retry logic, but for 5.13 and newer kernels we retry internally in zvol_open(). This should always succeed except in the case where a pool's vdev are layed on zvols, in which case it may fail. To handle this case vdev_disk_open() has been updated to retry when opening a device when -ERESTARTSYS is returned. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #12301 Closes #12759
author: Brian Behlendorf <behlendorf1@llnl.gov> 2021-12-01 16:07:12 -0800
committer: GitHub <noreply@github.com> 2021-12-01 17:07:12 -0700
commit: 77e2756de08ccd4e8ce755f78bdcda1e3e0c55e5 (patch)
tree: 230b1de25c7af90496defdd1c5914f75223a8c38
parent: 31d2f42b2a55485b017f428b8f7c1836750ecde9 (diff)
5 files changed, 90 insertions, 38 deletions
diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4
index 61e66421f..9c60e5dd4 100644
--- a/config/kernel-blkdev.m4
+++ b/config/kernel-blkdev.m4
@@ -294,6 +294,27 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE], [
 	])
 ])
 
+dnl #
+dnl # 5.13 API change
+dnl # blkdev_get_by_path() no longer handles ERESTARTSYS
+dnl #
+dnl # Unfortunately we're forced to rely solely on the kernel version
+dnl # number in order to determine the expected behavior.  This was an
+dnl # internal change to blkdev_get_by_dev(), see commit a8ed1a0607.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS], [
+	AC_MSG_CHECKING([whether blkdev_get_by_path() handles ERESTARTSYS])
+	AS_VERSION_COMPARE([$LINUX_VERSION], [5.13.0], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLKDEV_GET_ERESTARTSYS, 1,
+			[blkdev_get_by_path() handles ERESTARTSYS])
+	],[
+		AC_MSG_RESULT(no)
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
 	ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH
 	ZFS_AC_KERNEL_SRC_BLKDEV_PUT
@@ -318,4 +339,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
 	ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE
 	ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE
 	ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE
+	ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS
 ])
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 42144322a..82af0a975 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -265,6 +265,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 	 * a ENOENT failure at this point is highly likely to be transient
 	 * and it is reasonable to sleep and retry before giving up.  In
 	 * practice delays have been observed to be on the order of 100ms.
+	 *
+	 * When ERESTARTSYS is returned it indicates the block device is
+	 * a zvol which could not be opened due to the deadlock detection
+	 * logic in zvol_open().  Extend the timeout and retry the open
+	 * subsequent attempts are expected to eventually succeed.
 	 */
 	hrtime_t start = gethrtime();
 	bdev = ERR_PTR(-ENXIO);
@@ -273,6 +278,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 		    zfs_vdev_holder);
 		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
 			schedule_timeout(MSEC_TO_TICK(10));
+		} else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) {
+			timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
+			continue;
 		} else if (IS_ERR(bdev)) {
 			break;
 		}
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index c17423426..75a31b38d 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -46,6 +46,7 @@ unsigned int zvol_request_sync = 0;
 unsigned int zvol_prefetch_bytes = (128 * 1024);
 unsigned long zvol_max_discard_blocks = 16384;
 unsigned int zvol_threads = 32;
+unsigned int zvol_open_timeout_ms = 1000;
 
 struct zvol_state_os {
 	struct gendisk		*zvo_disk;	/* generic disk */
@@ -490,7 +491,13 @@ zvol_open(struct block_device *bdev, fmode_t flag)
 	zvol_state_t *zv;
 	int error = 0;
 	boolean_t drop_suspend = B_TRUE;
+	boolean_t drop_namespace = B_FALSE;
+#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
+	hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
+	hrtime_t start = gethrtime();
 
+retry:
+#endif
 	rw_enter(&zvol_state_lock, RW_READER);
 	/*
 	 * Obtain a copy of private_data under the zvol_state_lock to make
@@ -504,6 +511,49 @@ zvol_open(struct block_device *bdev, fmode_t flag)
 		return (SET_ERROR(-ENXIO));
 	}
 
+	if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
+		/*
+		 * In all other call paths the spa_namespace_lock is taken
+		 * before the bdev->bd_mutex lock.  However, on open(2)
+		 * the __blkdev_get() function calls fops->open() with the
+		 * bdev->bd_mutex lock held.  This can result in a deadlock
+		 * when zvols from one pool are used as vdevs in another.
+		 *
+		 * To prevent a lock inversion deadlock we preemptively
+		 * take the spa_namespace_lock.  Normally the lock will not
+		 * be contended and this is safe because spa_open_common()
+		 * handles the case where the caller already holds the
+		 * spa_namespace_lock.
+		 *
+		 * When the lock cannot be aquired after multiple retries
+		 * this must be the vdev on zvol deadlock case and we have
+		 * no choice but to return an error.  For 5.12 and older
+		 * kernels returning -ERESTARTSYS will result in the
+		 * bdev->bd_mutex being dropped, then reacquired, and
+		 * fops->open() being called again.  This process can be
+		 * repeated safely until both locks are acquired.  For 5.13
+		 * and newer the -ERESTARTSYS retry logic was removed from
+		 * the kernel so the only option is to return the error for
+		 * the caller to handle it.
+		 */
+		if (!mutex_tryenter(&spa_namespace_lock)) {
+			rw_exit(&zvol_state_lock);
+
+#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
+			schedule();
+			return (SET_ERROR(-ERESTARTSYS));
+#else
+			if ((gethrtime() - start) > timeout)
+				return (SET_ERROR(-ERESTARTSYS));
+
+			schedule_timeout(MSEC_TO_TICK(10));
+			goto retry;
+#endif
+		} else {
+			drop_namespace = B_TRUE;
+		}
+	}
+
 	mutex_enter(&zv->zv_state_lock);
 	/*
 	 * make sure zvol is not suspended during first open
@@ -543,6 +593,8 @@ zvol_open(struct block_device *bdev, fmode_t flag)
 	zv->zv_open_count++;
 
 	mutex_exit(&zv->zv_state_lock);
+	if (drop_namespace)
+		mutex_exit(&spa_namespace_lock);
 	if (drop_suspend)
 		rw_exit(&zv->zv_suspend_lock);
 
@@ -556,12 +608,11 @@ out_open_count:
 
 out_mutex:
 	mutex_exit(&zv->zv_state_lock);
+	if (drop_namespace)
+		mutex_exit(&spa_namespace_lock);
 	if (drop_suspend)
 		rw_exit(&zv->zv_suspend_lock);
-	if (error == -EINTR) {
-		error = -ERESTARTSYS;
-		schedule();
-	}
+
 	return (SET_ERROR(error));
 }
 
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index d50cce7d7..cb5ee483c 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -867,54 +867,26 @@ int
 zvol_first_open(zvol_state_t *zv, boolean_t readonly)
 {
 	objset_t *os;
-	int error, locked = 0;
-	boolean_t ro;
+	int error;
 
 	ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+	ASSERT(mutex_owned(&spa_namespace_lock));
 
-	/*
-	 * In all other cases the spa_namespace_lock is taken before the
-	 * bdev->bd_mutex lock.	 But in this case the Linux __blkdev_get()
-	 * function calls fops->open() with the bdev->bd_mutex lock held.
-	 * This deadlock can be easily observed with zvols used as vdevs.
-	 *
-	 * To avoid a potential lock inversion deadlock we preemptively
-	 * try to take the spa_namespace_lock().  Normally it will not
-	 * be contended and this is safe because spa_open_common() handles
-	 * the case where the caller already holds the spa_namespace_lock.
-	 *
-	 * When it is contended we risk a lock inversion if we were to
-	 * block waiting for the lock.	Luckily, the __blkdev_get()
-	 * function allows us to return -ERESTARTSYS which will result in
-	 * bdev->bd_mutex being dropped, reacquired, and fops->open() being
-	 * called again.  This process can be repeated safely until both
-	 * locks are acquired.
-	 */
-	if (!mutex_owned(&spa_namespace_lock)) {
-		locked = mutex_tryenter(&spa_namespace_lock);
-		if (!locked)
-			return (SET_ERROR(EINTR));
-	}
-
-	ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
+	boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
 	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os);
 	if (error)
-		goto out_mutex;
+		return (SET_ERROR(error));
 
 	zv->zv_objset = os;
 
 	error = zvol_setup_zv(zv);
-
 	if (error) {
 		dmu_objset_disown(os, 1, zv);
 		zv->zv_objset = NULL;
 	}
 
-out_mutex:
-	if (locked)
-		mutex_exit(&spa_namespace_lock);
-	return (SET_ERROR(error));
+	return (error);
 }
 
 void
diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index 3a3e357d5..27369d12b 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -294,7 +294,6 @@ elif sys.platform.startswith('linux'):
         'alloc_class/alloc_class_011_neg': ['FAIL', known_reason],
         'alloc_class/alloc_class_012_pos': ['FAIL', known_reason],
         'alloc_class/alloc_class_013_pos': ['FAIL', '11888'],
-        'cli_root/zfs_copies/zfs_copies_003_pos': ['FAIL', '12301'],
         'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason],
         'cli_root/zpool_expand/zpool_expand_001_pos': ['FAIL', known_reason],
         'cli_root/zpool_expand/zpool_expand_005_pos': ['FAIL', known_reason],
author	Brian Behlendorf <behlendorf1@llnl.gov>	2021-12-01 16:07:12 -0800
committer	GitHub <noreply@github.com>	2021-12-01 17:07:12 -0700
commit	77e2756de08ccd4e8ce755f78bdcda1e3e0c55e5 (patch)
tree	230b1de25c7af90496defdd1c5914f75223a8c38
parent	31d2f42b2a55485b017f428b8f7c1836750ecde9 (diff)