Merge tag 'for-6.15/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mikulas Patocka:

 - dm-crypt: switch to using the crc32 library

 - dm-verity, dm-integrity, dm-crypt: documentation improvement

 - dm-vdo fixes

 - dm-stripe: enable inline crypto passthrough

 - dm-integrity: set ti->error on memory allocation failure

 - dm-bufio: remove unused return value

 - dm-verity: do forward error correction on metadata I/O errors

 - dm: fix unconditional IO throttle caused by REQ_PREFLUSH

 - dm cache: prevent BUG_ON by blocking retries on failed device resumes

 - dm cache: support shrinking the origin device

 - dm: restrict dm device size to 2^63-512 bytes

 - dm-delay: support zoned devices

 - dm-verity: support block number limits for different ioprio classes

 - dm-integrity: fix non-constant-time tag verification (security bug)

 - dm-verity, dm-ebs: fix prefetch-vs-suspend race

* tag 'for-6.15/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (27 commits)
  dm-ebs: fix prefetch-vs-suspend race
  dm-verity: fix prefetch-vs-suspend race
  dm-integrity: fix non-constant-time tag verification
  dm-verity: support block number limits for different ioprio classes
  dm-delay: support zoned devices
  dm: restrict dm device size to 2^63-512 bytes
  dm cache: support shrinking the origin device
  dm cache: prevent BUG_ON by blocking retries on failed device resumes
  dm vdo indexer: reorder uds_request to reduce padding
  dm: fix unconditional IO throttle caused by REQ_PREFLUSH
  dm vdo: rework processing of loaded refcount byte arrays
  dm vdo: remove remaining ring references
  dm-verity: do forward error correction on metadata I/O errors
  dm-bufio: remove unused return value
  dm-integrity: set ti->error on memory allocation failure
  dm: Enable inline crypto passthrough for striped target
  dm vdo slab-depot: read refcount blocks in large chunks at load time
  dm vdo vio-pool: allow variable-sized metadata vios
  dm vdo vio-pool: support pools with multiple data blocks per vio
  dm vdo vio-pool: add a pool pointer to pooled_vio
  ...
Linus Torvalds 2025-04-02 21:27:59 -07:00
commit 5014bebee0
33 changed files with 522 additions and 246 deletions

View File

@ -146,6 +146,11 @@ integrity:<bytes>:<type>
integrity for the encrypted device. The additional space is then
used for storing authentication tag (and persistent IV if needed).
integrity_key_size:<bytes>
Optionally set the integrity key size if it differs from the digest size.
It allows the use of wrapped key algorithms where the key size is
independent of the cryptographic key size.
sector_size:<bytes>
Use <bytes> as the encryption unit instead of 512 bytes sectors.
This option can be in range 512 - 4096 bytes and must be power of two.
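
As a rough illustration (not taken from this patch), a crypt table line using 4096-byte encryption sectors might look like the sketch below; the length, key and device are placeholders, and the trailing count reflects however many optional parameters are actually passed:

  0 2097152 crypt aes-xts-plain64 <key-in-hex> 0 /dev/sdb 0 1 sector_size:4096

integrity_key_size:<bytes> would be passed the same way, alongside the integrity:<bytes>:<type> option it qualifies.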

View File

@ -92,6 +92,11 @@ Target arguments:
allowed. This mode is useful for data recovery if the
device cannot be activated in any of the other standard
modes.
I - inline mode - in this mode, dm-integrity will store integrity
data directly in the underlying device sectors.
The underlying device must have an integrity profile that
allows storing user integrity data and provides enough
space for the selected integrity tag.
5. the number of additional arguments
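
A loose sketch of an inline-mode table line (not from this patch), assuming a device such as an NVMe namespace formatted with per-sector metadata large enough for a 4-byte crc32c tag; the positional arguments follow the parameter list above (device, reserved start sectors, tag size, mode, number of additional arguments):

  0 1000000 integrity /dev/nvme0n1 0 4 I 1 internal_hash:crc32c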

View File

@ -87,6 +87,15 @@ panic_on_corruption
Panic the device when a corrupted block is discovered. This option is
not compatible with ignore_corruption and restart_on_corruption.
restart_on_error
Restart the system when an I/O error is detected.
This option can be combined with the restart_on_corruption option.
panic_on_error
Panic the device when an I/O error is detected. This option is
not compatible with the restart_on_error option but can be combined
with the panic_on_corruption option.
ignore_zero_blocks
Do not verify blocks that are expected to contain zeroes and always return
zeroes instead. This may be useful if the partition contains unused blocks
@ -142,8 +151,15 @@ root_hash_sig_key_desc <key_description>
already in the secondary trusted keyring.
try_verify_in_tasklet
If verity hashes are in cache, verify data blocks in kernel tasklet instead
of workqueue. This option can reduce IO latency.
If verity hashes are in cache and the IO size does not exceed the limit,
verify data blocks in bottom half instead of workqueue. This option can
reduce IO latency. The size limits can be configured via
/sys/module/dm_verity/parameters/use_bh_bytes. The four parameters
correspond to limits for IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT,
IOPRIO_CLASS_BE and IOPRIO_CLASS_IDLE in turn.
For example:
<none>,<rt>,<be>,<idle>
4096,4096,4096,4096
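
As a concrete (illustrative) setting, and assuming the parameter accepts writes in the same comma-separated form it reports, something like

  echo 4096,4096,4096,0 > /sys/module/dm_verity/parameters/use_bh_bytes

would allow bottom-half verification for I/O up to 4096 bytes in the NONE, RT and BE classes while keeping IDLE-class I/O on the workqueue path.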
Theory of operation
===================

View File

@ -267,6 +267,7 @@ config DM_CRYPT
depends on BLK_DEV_DM
depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
depends on (TRUSTED_KEYS || TRUSTED_KEYS=n)
select CRC32
select CRYPTO
select CRYPTO_CBC
select CRYPTO_ESSIV

View File

@ -2234,7 +2234,7 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
static bool forget_buffer(struct dm_bufio_client *c, sector_t block)
static void forget_buffer(struct dm_bufio_client *c, sector_t block)
{
struct dm_buffer *b;
@ -2249,8 +2249,6 @@ static bool forget_buffer(struct dm_bufio_client *c, sector_t block)
cache_put_and_wake(c, b);
}
}
return b ? true : false;
}
/*

View File

@ -406,6 +406,12 @@ struct cache {
mempool_t migration_pool;
struct bio_set bs;
/*
* Cache_size entries. Set bits indicate blocks mapped beyond the
* target length, which are marked for invalidation.
*/
unsigned long *invalid_bitset;
};
struct per_bio_data {
@ -1922,6 +1928,9 @@ static void __destroy(struct cache *cache)
if (cache->discard_bitset)
free_bitset(cache->discard_bitset);
if (cache->invalid_bitset)
free_bitset(cache->invalid_bitset);
if (cache->copier)
dm_kcopyd_client_destroy(cache->copier);
@ -2510,6 +2519,13 @@ static int cache_create(struct cache_args *ca, struct cache **result)
}
clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
cache->invalid_bitset = alloc_bitset(from_cblock(cache->cache_size));
if (!cache->invalid_bitset) {
*error = "could not allocate bitset for invalid blocks";
goto bad;
}
clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size));
cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
if (IS_ERR(cache->copier)) {
*error = "could not create kcopyd client";
@ -2808,6 +2824,24 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
}
static int load_filtered_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
bool dirty, uint32_t hint, bool hint_valid)
{
struct cache *cache = context;
if (from_oblock(oblock) >= from_oblock(cache->origin_blocks)) {
if (dirty) {
DMERR("%s: unable to shrink origin; cache block %u is dirty",
cache_device_name(cache), from_cblock(cblock));
return -EFBIG;
}
set_bit(from_cblock(cblock), cache->invalid_bitset);
return 0;
}
return load_mapping(context, oblock, cblock, dirty, hint, hint_valid);
}
/*
* The discard block size in the on disk metadata is not
* necessarily the same as we're currently using. So we have to
@ -2899,6 +2933,27 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache)
return to_cblock(size);
}
static bool can_resume(struct cache *cache)
{
/*
* Disallow retrying the resume operation for devices that failed the
* first resume attempt, as the failure leaves the policy object partially
* initialized. Retrying could trigger BUG_ON when loading cache mappings
* into the incomplete policy object.
*/
if (cache->sized && !cache->loaded_mappings) {
if (get_cache_mode(cache) != CM_WRITE)
DMERR("%s: unable to resume a failed-loaded cache, please check metadata.",
cache_device_name(cache));
else
DMERR("%s: unable to resume cache due to missing proper cache table reload",
cache_device_name(cache));
return false;
}
return true;
}
static bool can_resize(struct cache *cache, dm_cblock_t new_size)
{
if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
@ -2941,12 +2996,33 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
return 0;
}
static int truncate_oblocks(struct cache *cache)
{
uint32_t nr_blocks = from_cblock(cache->cache_size);
uint32_t i;
int r;
for_each_set_bit(i, cache->invalid_bitset, nr_blocks) {
r = dm_cache_remove_mapping(cache->cmd, to_cblock(i));
if (r) {
DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
cache_device_name(cache));
return r;
}
}
return 0;
}
static int cache_preresume(struct dm_target *ti)
{
int r = 0;
struct cache *cache = ti->private;
dm_cblock_t csize = get_cache_dev_size(cache);
if (!can_resume(cache))
return -EINVAL;
/*
* Check to see if the cache has resized.
*/
@ -2962,11 +3038,25 @@ static int cache_preresume(struct dm_target *ti)
}
if (!cache->loaded_mappings) {
/*
* The fast device could have been resized since the last
* failed preresume attempt. To be safe we start by a blank
* bitset for cache blocks.
*/
clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size));
r = dm_cache_load_mappings(cache->cmd, cache->policy,
load_mapping, cache);
load_filtered_mapping, cache);
if (r) {
DMERR("%s: could not load cache mappings", cache_device_name(cache));
metadata_operation_failed(cache, "dm_cache_load_mappings", r);
if (r != -EFBIG)
metadata_operation_failed(cache, "dm_cache_load_mappings", r);
return r;
}
r = truncate_oblocks(cache);
if (r) {
metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
return r;
}
@ -3426,7 +3516,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
.version = {2, 2, 0},
.version = {2, 3, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,

View File

@ -17,6 +17,7 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/crc32.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/crypto.h>
@ -125,7 +126,6 @@ struct iv_lmk_private {
#define TCW_WHITENING_SIZE 16
struct iv_tcw_private {
struct crypto_shash *crc32_tfm;
u8 *iv_seed;
u8 *whitening;
};
@ -607,10 +607,6 @@ static void crypt_iv_tcw_dtr(struct crypt_config *cc)
tcw->iv_seed = NULL;
kfree_sensitive(tcw->whitening);
tcw->whitening = NULL;
if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm))
crypto_free_shash(tcw->crc32_tfm);
tcw->crc32_tfm = NULL;
}
static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
@ -628,13 +624,6 @@ static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
return -EINVAL;
}
tcw->crc32_tfm = crypto_alloc_shash("crc32", 0,
CRYPTO_ALG_ALLOCATES_MEMORY);
if (IS_ERR(tcw->crc32_tfm)) {
ti->error = "Error initializing CRC32 in TCW";
return PTR_ERR(tcw->crc32_tfm);
}
tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL);
tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL);
if (!tcw->iv_seed || !tcw->whitening) {
@ -668,36 +657,28 @@ static int crypt_iv_tcw_wipe(struct crypt_config *cc)
return 0;
}
static int crypt_iv_tcw_whitening(struct crypt_config *cc,
struct dm_crypt_request *dmreq,
u8 *data)
static void crypt_iv_tcw_whitening(struct crypt_config *cc,
struct dm_crypt_request *dmreq, u8 *data)
{
struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
__le64 sector = cpu_to_le64(dmreq->iv_sector);
u8 buf[TCW_WHITENING_SIZE];
SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm);
int i, r;
int i;
/* xor whitening with sector number */
crypto_xor_cpy(buf, tcw->whitening, (u8 *)&sector, 8);
crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)&sector, 8);
/* calculate crc32 for every 32bit part and xor it */
desc->tfm = tcw->crc32_tfm;
for (i = 0; i < 4; i++) {
r = crypto_shash_digest(desc, &buf[i * 4], 4, &buf[i * 4]);
if (r)
goto out;
}
for (i = 0; i < 4; i++)
put_unaligned_le32(crc32(0, &buf[i * 4], 4), &buf[i * 4]);
crypto_xor(&buf[0], &buf[12], 4);
crypto_xor(&buf[4], &buf[8], 4);
/* apply whitening (8 bytes) to whole sector */
for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++)
crypto_xor(data + i * 8, buf, 8);
out:
memzero_explicit(buf, sizeof(buf));
return r;
}
static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
@ -707,13 +688,12 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
__le64 sector = cpu_to_le64(dmreq->iv_sector);
u8 *src;
int r = 0;
/* Remove whitening from ciphertext */
if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
sg = crypt_get_sg_data(cc, dmreq->sg_in);
src = kmap_local_page(sg_page(sg));
r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset);
crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset);
kunmap_local(src);
}
@ -723,7 +703,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)&sector,
cc->iv_size - 8);
return r;
return 0;
}
static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
@ -731,7 +711,6 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
{
struct scatterlist *sg;
u8 *dst;
int r;
if (bio_data_dir(dmreq->ctx->bio_in) != WRITE)
return 0;
@ -739,10 +718,10 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
/* Apply whitening on ciphertext */
sg = crypt_get_sg_data(cc, dmreq->sg_out);
dst = kmap_local_page(sg_page(sg));
r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset);
crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset);
kunmap_local(dst);
return r;
return 0;
}
static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv,

View File

@ -369,6 +369,21 @@ static int delay_map(struct dm_target *ti, struct bio *bio)
return delay_bio(dc, c, bio);
}
#ifdef CONFIG_BLK_DEV_ZONED
static int delay_report_zones(struct dm_target *ti,
struct dm_report_zones_args *args, unsigned int nr_zones)
{
struct delay_c *dc = ti->private;
struct delay_class *c = &dc->read;
return dm_report_zones(c->dev->bdev, c->start,
c->start + dm_target_offset(ti, args->next_sector),
args, nr_zones);
}
#else
#define delay_report_zones NULL
#endif
#define DMEMIT_DELAY_CLASS(c) \
DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay)
@ -424,11 +439,12 @@ out:
static struct target_type delay_target = {
.name = "delay",
.version = {1, 4, 0},
.features = DM_TARGET_PASSES_INTEGRITY,
.features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
.module = THIS_MODULE,
.ctr = delay_ctr,
.dtr = delay_dtr,
.map = delay_map,
.report_zones = delay_report_zones,
.presuspend = delay_presuspend,
.resume = delay_resume,
.status = delay_status,

View File

@ -390,6 +390,12 @@ static int ebs_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
static void ebs_postsuspend(struct dm_target *ti)
{
struct ebs_c *ec = ti->private;
dm_bufio_client_reset(ec->bufio);
}
static void ebs_status(struct dm_target *ti, status_type_t type,
unsigned int status_flags, char *result, unsigned int maxlen)
{
@ -447,6 +453,7 @@ static struct target_type ebs_target = {
.ctr = ebs_ctr,
.dtr = ebs_dtr,
.map = ebs_map,
.postsuspend = ebs_postsuspend,
.status = ebs_status,
.io_hints = ebs_io_hints,
.prepare_ioctl = ebs_prepare_ioctl,

View File

@ -21,6 +21,7 @@
#include <linux/reboot.h>
#include <crypto/hash.h>
#include <crypto/skcipher.h>
#include <crypto/utils.h>
#include <linux/async_tx.h>
#include <linux/dm-bufio.h>
@ -516,7 +517,7 @@ static int sb_mac(struct dm_integrity_c *ic, bool wr)
dm_integrity_io_error(ic, "crypto_shash_digest", r);
return r;
}
if (memcmp(mac, actual_mac, mac_size)) {
if (crypto_memneq(mac, actual_mac, mac_size)) {
dm_integrity_io_error(ic, "superblock mac", -EILSEQ);
dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0);
return -EILSEQ;
@ -859,7 +860,7 @@ static void rw_section_mac(struct dm_integrity_c *ic, unsigned int section, bool
if (likely(wr))
memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR);
else {
if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) {
if (crypto_memneq(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) {
dm_integrity_io_error(ic, "journal mac", -EILSEQ);
dm_audit_log_target(DM_MSG_PREFIX, "mac-journal", ic->ti, 0);
}
@ -1401,10 +1402,9 @@ static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_
static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
unsigned int *metadata_offset, unsigned int total_size, int op)
{
#define MAY_BE_FILLER 1
#define MAY_BE_HASH 2
unsigned int hash_offset = 0;
unsigned int may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
unsigned char mismatch_hash = 0;
unsigned char mismatch_filler = !ic->discard;
do {
unsigned char *data, *dp;
@ -1425,7 +1425,7 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
if (op == TAG_READ) {
memcpy(tag, dp, to_copy);
} else if (op == TAG_WRITE) {
if (memcmp(dp, tag, to_copy)) {
if (crypto_memneq(dp, tag, to_copy)) {
memcpy(dp, tag, to_copy);
dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
}
@ -1433,29 +1433,30 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
/* e.g.: op == TAG_CMP */
if (likely(is_power_of_2(ic->tag_size))) {
if (unlikely(memcmp(dp, tag, to_copy)))
if (unlikely(!ic->discard) ||
unlikely(memchr_inv(dp, DISCARD_FILLER, to_copy) != NULL)) {
goto thorough_test;
}
if (unlikely(crypto_memneq(dp, tag, to_copy)))
goto thorough_test;
} else {
unsigned int i, ts;
thorough_test:
ts = total_size;
for (i = 0; i < to_copy; i++, ts--) {
if (unlikely(dp[i] != tag[i]))
may_be &= ~MAY_BE_HASH;
if (likely(dp[i] != DISCARD_FILLER))
may_be &= ~MAY_BE_FILLER;
/*
* Warning: the control flow must not be
* dependent on match/mismatch of
* individual bytes.
*/
mismatch_hash |= dp[i] ^ tag[i];
mismatch_filler |= dp[i] ^ DISCARD_FILLER;
hash_offset++;
if (unlikely(hash_offset == ic->tag_size)) {
if (unlikely(!may_be)) {
if (unlikely(mismatch_hash) && unlikely(mismatch_filler)) {
dm_bufio_release(b);
return ts;
}
hash_offset = 0;
may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
mismatch_hash = 0;
mismatch_filler = !ic->discard;
}
}
}
@ -1476,8 +1477,6 @@ thorough_test:
} while (unlikely(total_size));
return 0;
#undef MAY_BE_FILLER
#undef MAY_BE_HASH
}
struct flush_request {
@ -2076,7 +2075,7 @@ retry_kmap:
char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
if (unlikely(crypto_memneq(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx",
logical_sector);
dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum",
@ -2595,7 +2594,7 @@ static void dm_integrity_inline_recheck(struct work_struct *w)
bio_put(outgoing_bio);
integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest);
if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx",
ic->dev->bdev, dio->bio_details.bi_iter.bi_sector);
atomic64_inc(&ic->number_of_mismatches);
@ -2634,7 +2633,7 @@ static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status
char *mem = bvec_kmap_local(&bv);
//memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT);
integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest);
if (unlikely(memcmp(digest, dio->integrity_payload + pos,
if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos,
min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
kunmap_local(mem);
dm_integrity_free_payload(dio);
@ -2911,7 +2910,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start
integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
(char *)access_journal_data(ic, i, l), test_tag);
if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) {
if (unlikely(crypto_memneq(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) {
dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ);
dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0);
}
@ -5072,16 +5071,19 @@ try_smaller_buffer:
ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
if (!ic->recalc_bitmap) {
ti->error = "Could not allocate memory for bitmap";
r = -ENOMEM;
goto bad;
}
ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
if (!ic->may_write_bitmap) {
ti->error = "Could not allocate memory for bitmap";
r = -ENOMEM;
goto bad;
}
ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL);
if (!ic->bbs) {
ti->error = "Could not allocate memory for bitmap";
r = -ENOMEM;
goto bad;
}

View File

@ -467,7 +467,7 @@ static struct target_type stripe_target = {
.name = "striped",
.version = {1, 7, 0},
.features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT |
DM_TARGET_ATOMIC_WRITES,
DM_TARGET_ATOMIC_WRITES | DM_TARGET_PASSES_CRYPTO,
.module = THIS_MODULE,
.ctr = stripe_ctr,
.dtr = stripe_dtr,

View File

@ -697,6 +697,10 @@ int dm_table_add_target(struct dm_table *t, const char *type,
DMERR("%s: zero-length target", dm_device_name(t->md));
return -EINVAL;
}
if (start + len < start || start + len > LLONG_MAX >> SECTOR_SHIFT) {
DMERR("%s: too large device", dm_device_name(t->md));
return -EINVAL;
}
ti->type = dm_get_target_type(type);
if (!ti->type) {
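
For reference, this check is where the "2^63-512 bytes" figure in the summary comes from: with SECTOR_SHIFT = 9, LLONG_MAX >> SECTOR_SHIFT equals (2^63 - 1) / 512 = 2^54 - 1 sectors, and (2^54 - 1) sectors of 512 bytes is 2^63 - 512 bytes, the largest table a constructor will now accept.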

View File

@ -451,7 +451,7 @@ static struct page_info * __must_check find_page(struct vdo_page_cache *cache,
* select_lru_page() - Determine which page is least recently used.
*
* Picks the least recently used from among the non-busy entries at the front of each of the lru
* ring. Since whenever we mark a page busy we also put it to the end of the ring it is unlikely
* list. Since whenever we mark a page busy we also put it to the end of the list it is unlikely
* that the entries at the front are busy unless the queue is very short, but not impossible.
*
* Return: A pointer to the info structure for a relevant page, or NULL if no such page can be
@ -1544,7 +1544,7 @@ static void write_page_if_not_dirtied(struct vdo_waiter *waiter, void *context)
static void return_to_pool(struct block_map_zone *zone, struct pooled_vio *vio)
{
return_vio_to_pool(zone->vio_pool, vio);
return_vio_to_pool(vio);
check_for_drain_complete(zone);
}
@ -1837,7 +1837,7 @@ static void finish_block_map_page_load(struct vdo_completion *completion)
if (!vdo_copy_valid_page(vio->data, nonce, pbn, page))
vdo_format_block_map_page(page, nonce, pbn, false);
return_vio_to_pool(zone->vio_pool, pooled);
return_vio_to_pool(pooled);
/* Release our claim to the load and wake any waiters */
release_page_lock(data_vio, "load");
@ -1851,10 +1851,9 @@ static void handle_io_error(struct vdo_completion *completion)
struct vio *vio = as_vio(completion);
struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio);
struct data_vio *data_vio = completion->parent;
struct block_map_zone *zone = pooled->context;
vio_record_metadata_io_error(vio);
return_vio_to_pool(zone->vio_pool, pooled);
return_vio_to_pool(pooled);
abort_load(data_vio, result);
}
@ -2499,7 +2498,7 @@ static void finish_cursor(struct cursor *cursor)
struct cursors *cursors = cursor->parent;
struct vdo_completion *completion = cursors->completion;
return_vio_to_pool(cursors->pool, vdo_forget(cursor->vio));
return_vio_to_pool(vdo_forget(cursor->vio));
if (--cursors->active_roots > 0)
return;
@ -2746,7 +2745,7 @@ static int __must_check initialize_block_map_zone(struct block_map *map,
if (result != VDO_SUCCESS)
return result;
result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE,
result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE, 1,
zone->thread_id, VIO_TYPE_BLOCK_MAP_INTERIOR,
VIO_PRIORITY_METADATA, zone, &zone->vio_pool);
if (result != VDO_SUCCESS)

View File

@ -44,9 +44,6 @@ enum {
/* The default size of each slab journal, in blocks */
DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224,
/* Unit test minimum */
MINIMUM_VDO_SLAB_JOURNAL_BLOCKS = 2,
/*
* The initial size of lbn_operations and pbn_operations, which is based upon the expected
* maximum number of outstanding VIOs. This value was chosen to make it highly unlikely

View File

@ -226,7 +226,7 @@ struct hash_lock {
* A list containing the data VIOs sharing this lock, all having the same record name and
* data block contents, linked by their hash_lock_node fields.
*/
struct list_head duplicate_ring;
struct list_head duplicate_vios;
/* The number of data_vios sharing this lock instance */
data_vio_count_t reference_count;
@ -343,7 +343,7 @@ static void return_hash_lock_to_pool(struct hash_zone *zone, struct hash_lock *l
{
memset(lock, 0, sizeof(*lock));
INIT_LIST_HEAD(&lock->pool_node);
INIT_LIST_HEAD(&lock->duplicate_ring);
INIT_LIST_HEAD(&lock->duplicate_vios);
vdo_waitq_init(&lock->waiters);
list_add_tail(&lock->pool_node, &zone->lock_pool);
}
@ -441,7 +441,7 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock)
VDO_ASSERT_LOG_ONLY(data_vio->hash_zone != NULL,
"must have a hash zone when holding a hash lock");
VDO_ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry),
"must be on a hash lock ring when holding a hash lock");
"must be on a hash lock list when holding a hash lock");
VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 0,
"hash lock reference must be counted");
@ -464,10 +464,10 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock)
if (new_lock != NULL) {
/*
* Keep all data_vios sharing the lock on a ring since they can complete in any
* Keep all data_vios sharing the lock on a list since they can complete in any
* order and we'll always need a pointer to one to compare data.
*/
list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_ring);
list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_vios);
new_lock->reference_count += 1;
if (new_lock->max_references < new_lock->reference_count)
new_lock->max_references = new_lock->reference_count;
@ -1789,10 +1789,10 @@ static bool is_hash_collision(struct hash_lock *lock, struct data_vio *candidate
struct hash_zone *zone;
bool collides;
if (list_empty(&lock->duplicate_ring))
if (list_empty(&lock->duplicate_vios))
return false;
lock_holder = list_first_entry(&lock->duplicate_ring, struct data_vio,
lock_holder = list_first_entry(&lock->duplicate_vios, struct data_vio,
hash_lock_entry);
zone = candidate->hash_zone;
collides = !blocks_equal(lock_holder->vio.data, candidate->vio.data);
@ -1815,7 +1815,7 @@ static inline int assert_hash_lock_preconditions(const struct data_vio *data_vio
return result;
result = VDO_ASSERT(list_empty(&data_vio->hash_lock_entry),
"must not already be a member of a hash lock ring");
"must not already be a member of a hash lock list");
if (result != VDO_SUCCESS)
return result;
@ -1942,8 +1942,8 @@ void vdo_release_hash_lock(struct data_vio *data_vio)
"returned hash lock must not be in use with state %s",
get_hash_lock_state_name(lock->state));
VDO_ASSERT_LOG_ONLY(list_empty(&lock->pool_node),
"hash lock returned to zone must not be in a pool ring");
VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring),
"hash lock returned to zone must not be in a pool list");
VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_vios),
"hash lock returned to zone must not reference DataVIOs");
return_hash_lock_to_pool(zone, lock);

View File

@ -711,24 +711,11 @@ int vdo_configure_slab(block_count_t slab_size, block_count_t slab_journal_block
ref_blocks = vdo_get_saved_reference_count_size(slab_size - slab_journal_blocks);
meta_blocks = (ref_blocks + slab_journal_blocks);
/* Make sure test code hasn't configured slabs to be too small. */
/* Make sure configured slabs are not too small. */
if (meta_blocks >= slab_size)
return VDO_BAD_CONFIGURATION;
/*
* If the slab size is very small, assume this must be a unit test and override the number
* of data blocks to be a power of two (wasting blocks in the slab). Many tests need their
* data_blocks fields to be the exact capacity of the configured volume, and that used to
* fall out since they use a power of two for the number of data blocks, the slab size was
* a power of two, and every block in a slab was a data block.
*
* TODO: Try to figure out some way of structuring testParameters and unit tests so this
* hack isn't needed without having to edit several unit tests every time the metadata size
* changes by one block.
*/
data_blocks = slab_size - meta_blocks;
if ((slab_size < 1024) && !is_power_of_2(data_blocks))
data_blocks = ((block_count_t) 1 << ilog2(data_blocks));
/*
* Configure the slab journal thresholds. The flush threshold is 168 of 224 blocks in
@ -1221,11 +1208,6 @@ int vdo_validate_config(const struct vdo_config *config,
if (result != VDO_SUCCESS)
return result;
result = VDO_ASSERT(config->slab_journal_blocks >= MINIMUM_VDO_SLAB_JOURNAL_BLOCKS,
"slab journal size meets minimum size");
if (result != VDO_SUCCESS)
return result;
result = VDO_ASSERT(config->slab_journal_blocks <= config->slab_size,
"slab journal size is within expected bound");
if (result != VDO_SUCCESS)

View File

@ -54,7 +54,6 @@
* Each save also has a unique nonce.
*/
#define MAGIC_SIZE 32
#define NONCE_INFO_SIZE 32
#define MAX_SAVES 2
@ -98,9 +97,11 @@ enum region_type {
#define SUPER_VERSION_CURRENT 3
#define SUPER_VERSION_MAXIMUM 7
static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*";
static const u8 LAYOUT_MAGIC[] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*";
static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */
#define MAGIC_SIZE (sizeof(LAYOUT_MAGIC) - 1)
struct region_header {
u64 magic;
u64 region_blocks;

View File

@ -100,7 +100,6 @@ static int get_index_session(struct uds_index_session *index_session)
int uds_launch_request(struct uds_request *request)
{
size_t internal_size;
int result;
if (request->callback == NULL) {
@ -121,10 +120,7 @@ int uds_launch_request(struct uds_request *request)
}
/* Reset all internal fields before processing. */
internal_size =
sizeof(struct uds_request) - offsetof(struct uds_request, zone_number);
// FIXME should be using struct_group for this instead
memset((char *) request + sizeof(*request) - internal_size, 0, internal_size);
memset(&request->internal, 0, sizeof(request->internal));
result = get_index_session(request->session);
if (result != UDS_SUCCESS)

View File

@ -8,6 +8,7 @@
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/wait.h>
@ -73,7 +74,7 @@ enum uds_request_type {
/* Remove any mapping for a name. */
UDS_DELETE,
};
} __packed;
enum uds_open_index_type {
/* Create a new index. */
@ -226,7 +227,7 @@ struct uds_zone_message {
enum uds_zone_message_type type;
/* The virtual chapter number to which the message applies */
u64 virtual_chapter;
};
} __packed;
struct uds_index_session;
struct uds_index;
@ -253,34 +254,32 @@ struct uds_request {
/* The existing data associated with the request name, if any */
struct uds_record_data old_metadata;
/* Either UDS_SUCCESS or an error code for the request */
int status;
/* True if the record name had an existing entry in the index */
bool found;
/* Either UDS_SUCCESS or an error code for the request */
int status;
/*
* The remaining fields are used internally and should not be altered by clients. The index
* relies on zone_number being the first field in this section.
*/
/* The number of the zone which will process this request*/
unsigned int zone_number;
/* A link for adding a request to a lock-free queue */
struct funnel_queue_entry queue_link;
/* A link for adding a request to a standard linked list */
struct uds_request *next_request;
/* A pointer to the index processing this request */
struct uds_index *index;
/* Control message for coordinating between zones */
struct uds_zone_message zone_message;
/* If true, process request immediately by waking the worker thread */
bool unbatched;
/* If true, continue this request before processing newer requests */
bool requeued;
/* The virtual chapter containing the record name, if known */
u64 virtual_chapter;
/* The region of the index containing the record name */
enum uds_index_region location;
/* The remaining fields are used internally and should not be altered by clients. */
struct_group(internal,
/* The virtual chapter containing the record name, if known */
u64 virtual_chapter;
/* The region of the index containing the record name */
enum uds_index_region location;
/* If true, process request immediately by waking the worker thread */
bool unbatched;
/* If true, continue this request before processing newer requests */
bool requeued;
/* Control message for coordinating between zones */
struct uds_zone_message zone_message;
/* The number of the zone which will process this request*/
unsigned int zone_number;
/* A link for adding a request to a lock-free queue */
struct funnel_queue_entry queue_link;
/* A link for adding a request to a standard linked list */
struct uds_request *next_request;
/* A pointer to the index processing this request */
struct uds_index *index;
);
};
/* A session is required for most index operations. */

View File

@ -327,6 +327,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio)
* @error_handler: the handler for submission or I/O errors (may be NULL)
* @operation: the type of I/O to perform
* @data: the buffer to read or write (may be NULL)
* @size: the I/O amount in bytes
*
* The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block
* other vdo threads.
@ -338,7 +339,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio)
*/
void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
bio_end_io_t callback, vdo_action_fn error_handler,
blk_opf_t operation, char *data)
blk_opf_t operation, char *data, int size)
{
int result;
struct vdo_completion *completion = &vio->completion;
@ -349,7 +350,8 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
vdo_reset_completion(completion);
completion->error_handler = error_handler;
result = vio_reset_bio(vio, data, callback, operation | REQ_META, physical);
result = vio_reset_bio_with_size(vio, data, size, callback, operation | REQ_META,
physical);
if (result != VDO_SUCCESS) {
continue_vio(vio, result);
return;

View File

@ -8,6 +8,7 @@
#include <linux/bio.h>
#include "constants.h"
#include "types.h"
struct io_submitter;
@ -26,14 +27,25 @@ void vdo_submit_data_vio(struct data_vio *data_vio);
void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
bio_end_io_t callback, vdo_action_fn error_handler,
blk_opf_t operation, char *data);
blk_opf_t operation, char *data, int size);
static inline void vdo_submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
bio_end_io_t callback, vdo_action_fn error_handler,
blk_opf_t operation)
{
__submit_metadata_vio(vio, physical, callback, error_handler,
operation, vio->data);
operation, vio->data, vio->block_count * VDO_BLOCK_SIZE);
}
static inline void vdo_submit_metadata_vio_with_size(struct vio *vio,
physical_block_number_t physical,
bio_end_io_t callback,
vdo_action_fn error_handler,
blk_opf_t operation,
int size)
{
__submit_metadata_vio(vio, physical, callback, error_handler,
operation, vio->data, size);
}
static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback,
@ -41,7 +53,7 @@ static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback,
{
/* FIXME: Can we just use REQ_OP_FLUSH? */
__submit_metadata_vio(vio, 0, callback, error_handler,
REQ_OP_WRITE | REQ_PREFLUSH, NULL);
REQ_OP_WRITE | REQ_PREFLUSH, NULL, 0);
}
#endif /* VDO_IO_SUBMITTER_H */

View File

@ -46,7 +46,7 @@ struct compressed_block {
/*
* Each packer_bin holds an incomplete batch of data_vios that only partially fill a compressed
* block. The bins are kept in a ring sorted by the amount of unused space so the first bin with
* block. The bins are kept in a list sorted by the amount of unused space so the first bin with
* enough space to hold a newly-compressed data_vio can easily be found. When the bin fills up or
* is flushed, the first uncanceled data_vio in the bin is selected to be the agent for that bin.
* Upon entering the packer, each data_vio already has its compressed data in the first slot of the

View File

@ -199,7 +199,7 @@ void vdo_priority_table_remove(struct priority_table *table, struct list_head *e
/*
* Remove the entry from the bucket list, remembering a pointer to another entry in the
* ring.
* list.
*/
next_entry = entry->next;
list_del_init(entry);

View File

@ -43,9 +43,9 @@
* has a vio which is used to commit that block to disk. The vio's data is the on-disk
* representation of the journal block. In addition each in-memory block has a buffer which is used
* to accumulate entries while a partial commit of the block is in progress. In-memory blocks are
* kept on two rings. Free blocks live on the 'free_tail_blocks' ring. When a block becomes active
* (see below) it is moved to the 'active_tail_blocks' ring. When a block is fully committed, it is
* moved back to the 'free_tail_blocks' ring.
* kept on two lists. Free blocks live on the 'free_tail_blocks' list. When a block becomes active
* (see below) it is moved to the 'active_tail_blocks' list. When a block is fully committed, it is
* moved back to the 'free_tail_blocks' list.
*
* When entries are added to the journal, they are added to the active in-memory block, as
* indicated by the 'active_block' field. If the caller wishes to wait for the entry to be

View File

@ -139,7 +139,7 @@ static bool is_slab_journal_blank(const struct vdo_slab *slab)
}
/**
* mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct
* mark_slab_journal_dirty() - Put a slab journal on the dirty list of its allocator in the correct
* order.
* @journal: The journal to be marked dirty.
* @lock: The recovery journal lock held by the slab journal.
@ -414,8 +414,7 @@ static void complete_reaping(struct vdo_completion *completion)
{
struct slab_journal *journal = completion->parent;
return_vio_to_pool(journal->slab->allocator->vio_pool,
vio_as_pooled_vio(as_vio(vdo_forget(completion))));
return_vio_to_pool(vio_as_pooled_vio(as_vio(completion)));
finish_reaping(journal);
reap_slab_journal(journal);
}
@ -698,7 +697,7 @@ static void complete_write(struct vdo_completion *completion)
sequence_number_t committed = get_committing_sequence_number(pooled);
list_del_init(&pooled->list_entry);
return_vio_to_pool(journal->slab->allocator->vio_pool, vdo_forget(pooled));
return_vio_to_pool(pooled);
if (result != VDO_SUCCESS) {
vio_record_metadata_io_error(as_vio(completion));
@ -822,7 +821,7 @@ static void commit_tail(struct slab_journal *journal)
/*
* Since we are about to commit the tail block, this journal no longer needs to be on the
* ring of journals which the recovery journal might ask to commit.
* list of journals which the recovery journal might ask to commit.
*/
mark_slab_journal_clean(journal);
@ -1076,7 +1075,7 @@ static void finish_reference_block_write(struct vdo_completion *completion)
/* Release the slab journal lock. */
adjust_slab_journal_block_reference(&slab->journal,
block->slab_journal_lock_to_release, -1);
return_vio_to_pool(slab->allocator->vio_pool, pooled);
return_vio_to_pool(pooled);
/*
* We can't clear the is_writing flag earlier as releasing the slab journal lock may cause
@ -1170,8 +1169,8 @@ static void handle_io_error(struct vdo_completion *completion)
struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab;
vio_record_metadata_io_error(vio);
return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
slab->active_count--;
return_vio_to_pool(vio_as_pooled_vio(vio));
slab->active_count -= vio->io_size / VDO_BLOCK_SIZE;
vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
check_if_slab_drained(slab);
}
@ -1372,7 +1371,7 @@ static unsigned int calculate_slab_priority(struct vdo_slab *slab)
static void prioritize_slab(struct vdo_slab *slab)
{
VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
"a slab must not already be on a ring when prioritizing");
"a slab must not already be on a list when prioritizing");
slab->priority = calculate_slab_priority(slab);
vdo_priority_table_enqueue(slab->allocator->prioritized_slabs,
slab->priority, &slab->allocq_entry);
@ -2165,23 +2164,6 @@ static void dirty_all_reference_blocks(struct vdo_slab *slab)
dirty_block(&slab->reference_blocks[i]);
}
/**
* clear_provisional_references() - Clear the provisional reference counts from a reference block.
* @block: The block to clear.
*/
static void clear_provisional_references(struct reference_block *block)
{
vdo_refcount_t *counters = get_reference_counters_for_block(block);
block_count_t j;
for (j = 0; j < COUNTS_PER_BLOCK; j++) {
if (counters[j] == PROVISIONAL_REFERENCE_COUNT) {
counters[j] = EMPTY_REFERENCE_COUNT;
block->allocated_count--;
}
}
}
static inline bool journal_points_equal(struct journal_point first,
struct journal_point second)
{
@ -2189,6 +2171,90 @@ static inline bool journal_points_equal(struct journal_point first,
(first.entry_count == second.entry_count));
}
/**
* match_bytes() - Check an 8-byte word for bytes matching the value specified
* @input: A word to examine the bytes of
* @match: The byte value sought
*
* Return: 1 in each byte when the corresponding input byte matched, 0 otherwise
*/
static inline u64 match_bytes(u64 input, u8 match)
{
u64 temp = input ^ (match * 0x0101010101010101ULL);
/* top bit of each byte is set iff top bit of temp byte is clear; rest are 0 */
u64 test_top_bits = ~temp & 0x8080808080808080ULL;
/* top bit of each byte is set iff low 7 bits of temp byte are clear; rest are useless */
u64 test_low_bits = 0x8080808080808080ULL - (temp & 0x7f7f7f7f7f7f7f7fULL);
/* return 1 when both tests indicate temp byte is 0 */
return (test_top_bits & test_low_bits) >> 7;
}
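
A quick worked check of the helper above (illustrative, not part of the change): for input 0x1122014401667701 and match 0x01, the bytes equal to 0x01 are the third, fifth and eighth (least significant) bytes, so match_bytes() returns 0x0000010001000001 - a 1 in each matching byte lane and 0 everywhere else.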
/**
* count_valid_references() - Process a newly loaded refcount array
* @counters: the array of counters from a metadata block
*
* Scan a 8-byte-aligned array of counters, fixing up any "provisional" values that weren't
* cleaned up at shutdown, changing them internally to "empty".
*
* Return: the number of blocks that are referenced (counters not "empty")
*/
static unsigned int count_valid_references(vdo_refcount_t *counters)
{
u64 *words = (u64 *)counters;
/* It's easier to count occurrences of a specific byte than its absences. */
unsigned int empty_count = 0;
/* For speed, we process 8 bytes at once. */
unsigned int words_left = COUNTS_PER_BLOCK / sizeof(u64);
/*
* Sanity check assumptions used for optimizing this code: Counters are bytes. The counter
* array is a multiple of the word size.
*/
BUILD_BUG_ON(sizeof(vdo_refcount_t) != 1);
BUILD_BUG_ON((COUNTS_PER_BLOCK % sizeof(u64)) != 0);
while (words_left > 0) {
/*
* This is used effectively as 8 byte-size counters. Byte 0 counts how many words
* had the target value found in byte 0, etc. We just have to avoid overflow.
*/
u64 split_count = 0;
/*
* The counter "% 255" trick used below to fold split_count into empty_count
* imposes a limit of 254 bytes examined each iteration of the outer loop. We
* process a word at a time, so that limit gets rounded down to 31 u64 words.
*/
const unsigned int max_words_per_iteration = 254 / sizeof(u64);
unsigned int iter_words_left = min_t(unsigned int, words_left,
max_words_per_iteration);
words_left -= iter_words_left;
while (iter_words_left--) {
u64 word = *words;
u64 temp;
/* First, if we have any provisional refcount values, clear them. */
temp = match_bytes(word, PROVISIONAL_REFERENCE_COUNT);
if (temp) {
/*
* 'temp' has 0x01 bytes where 'word' has PROVISIONAL; this xor
* will alter just those bytes, changing PROVISIONAL to EMPTY.
*/
word ^= temp * (PROVISIONAL_REFERENCE_COUNT ^ EMPTY_REFERENCE_COUNT);
*words = word;
}
/* Now count the EMPTY_REFERENCE_COUNT bytes, updating the 8 counters. */
split_count += match_bytes(word, EMPTY_REFERENCE_COUNT);
words++;
}
empty_count += split_count % 255;
}
return COUNTS_PER_BLOCK - empty_count;
}
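
Why the "% 255" fold above works (a short aside): split_count is treated as eight byte-wide counters, and any u64 is congruent to the sum of its bytes modulo 255 because 256 ≡ 1 (mod 255). Each inner pass handles at most 31 words, so every byte lane is at most 31 and the true total is at most 8 * 31 = 248 < 255; the modulus therefore yields the exact count of matching bytes.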
/**
* unpack_reference_block() - Unpack reference counts blocks into the internal memory structure.
* @packed: The written reference block to be unpacked.
@ -2197,7 +2263,6 @@ static inline bool journal_points_equal(struct journal_point first,
static void unpack_reference_block(struct packed_reference_block *packed,
struct reference_block *block)
{
block_count_t index;
sector_count_t i;
struct vdo_slab *slab = block->slab;
vdo_refcount_t *counters = get_reference_counters_for_block(block);
@ -2223,11 +2288,7 @@ static void unpack_reference_block(struct packed_reference_block *packed,
}
}
block->allocated_count = 0;
for (index = 0; index < COUNTS_PER_BLOCK; index++) {
if (counters[index] != EMPTY_REFERENCE_COUNT)
block->allocated_count++;
}
block->allocated_count = count_valid_references(counters);
}
/**
@ -2240,13 +2301,19 @@ static void finish_reference_block_load(struct vdo_completion *completion)
struct pooled_vio *pooled = vio_as_pooled_vio(vio);
struct reference_block *block = completion->parent;
struct vdo_slab *slab = block->slab;
unsigned int block_count = vio->io_size / VDO_BLOCK_SIZE;
unsigned int i;
char *data = vio->data;
unpack_reference_block((struct packed_reference_block *) vio->data, block);
return_vio_to_pool(slab->allocator->vio_pool, pooled);
slab->active_count--;
clear_provisional_references(block);
for (i = 0; i < block_count; i++, block++, data += VDO_BLOCK_SIZE) {
struct packed_reference_block *packed = (struct packed_reference_block *) data;
unpack_reference_block(packed, block);
slab->free_blocks -= block->allocated_count;
}
return_vio_to_pool(pooled);
slab->active_count -= block_count;
slab->free_blocks -= block->allocated_count;
check_if_slab_drained(slab);
}
@ -2260,23 +2327,25 @@ static void load_reference_block_endio(struct bio *bio)
}
/**
* load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the
* block.
* @waiter: The waiter of the block to load.
* load_reference_block_group() - After a block waiter has gotten a VIO from the VIO pool, load
* a set of blocks.
* @waiter: The waiter of the first block to load.
* @context: The VIO returned by the pool.
*/
static void load_reference_block(struct vdo_waiter *waiter, void *context)
static void load_reference_block_group(struct vdo_waiter *waiter, void *context)
{
struct pooled_vio *pooled = context;
struct vio *vio = &pooled->vio;
struct reference_block *block =
container_of(waiter, struct reference_block, waiter);
size_t block_offset = (block - block->slab->reference_blocks);
u32 block_offset = block - block->slab->reference_blocks;
u32 max_block_count = block->slab->reference_block_count - block_offset;
u32 block_count = min_t(int, vio->block_count, max_block_count);
vio->completion.parent = block;
vdo_submit_metadata_vio(vio, block->slab->ref_counts_origin + block_offset,
load_reference_block_endio, handle_io_error,
REQ_OP_READ);
vdo_submit_metadata_vio_with_size(vio, block->slab->ref_counts_origin + block_offset,
load_reference_block_endio, handle_io_error,
REQ_OP_READ, block_count * VDO_BLOCK_SIZE);
}
/**
@ -2286,14 +2355,21 @@ static void load_reference_block(struct vdo_waiter *waiter, void *context)
static void load_reference_blocks(struct vdo_slab *slab)
{
block_count_t i;
u64 blocks_per_vio = slab->allocator->refcount_blocks_per_big_vio;
struct vio_pool *pool = slab->allocator->refcount_big_vio_pool;
if (!pool) {
pool = slab->allocator->vio_pool;
blocks_per_vio = 1;
}
slab->free_blocks = slab->block_count;
slab->active_count = slab->reference_block_count;
for (i = 0; i < slab->reference_block_count; i++) {
for (i = 0; i < slab->reference_block_count; i += blocks_per_vio) {
struct vdo_waiter *waiter = &slab->reference_blocks[i].waiter;
waiter->callback = load_reference_block;
acquire_vio_from_pool(slab->allocator->vio_pool, waiter);
waiter->callback = load_reference_block_group;
acquire_vio_from_pool(pool, waiter);
}
}
@ -2429,7 +2505,7 @@ static void finish_loading_journal(struct vdo_completion *completion)
initialize_journal_state(journal);
}
return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
return_vio_to_pool(vio_as_pooled_vio(vio));
vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab));
}
@ -2449,7 +2525,7 @@ static void handle_load_error(struct vdo_completion *completion)
struct vio *vio = as_vio(completion);
vio_record_metadata_io_error(vio);
return_vio_to_pool(journal->slab->allocator->vio_pool, vio_as_pooled_vio(vio));
return_vio_to_pool(vio_as_pooled_vio(vio));
vdo_finish_loading_with_result(&journal->slab->state, result);
}
@ -2547,7 +2623,7 @@ static void queue_slab(struct vdo_slab *slab)
int result;
VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
"a requeued slab must not already be on a ring");
"a requeued slab must not already be on a list");
if (vdo_is_read_only(allocator->depot->vdo))
return;
@ -2700,6 +2776,7 @@ static void finish_scrubbing(struct slab_scrubber *scrubber, int result)
vdo_log_info("VDO commencing normal operation");
else if (prior_state == VDO_RECOVERING)
vdo_log_info("Exiting recovery mode");
free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool));
}
/*
@ -3281,7 +3358,7 @@ int vdo_release_block_reference(struct block_allocator *allocator,
* This is a min_heap callback function orders slab_status structures using the 'is_clean' field as
* the primary key and the 'emptiness' field as the secondary key.
*
* Slabs need to be pushed onto the rings in the same order they are to be popped off. Popping
* Slabs need to be pushed onto the lists in the same order they are to be popped off. Popping
* should always get the most empty first, so pushing should be from most empty to least empty.
* Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements
* before larger ones.
@ -3983,6 +4060,7 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot,
struct vdo *vdo = depot->vdo;
block_count_t max_free_blocks = depot->slab_config.data_blocks;
unsigned int max_priority = (2 + ilog2(max_free_blocks));
u32 reference_block_count, refcount_reads_needed, refcount_blocks_per_vio;
*allocator = (struct block_allocator) {
.depot = depot,
@ -4000,12 +4078,24 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot,
return result;
vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION);
result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, allocator->thread_id,
result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, 1, allocator->thread_id,
VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA,
allocator, &allocator->vio_pool);
if (result != VDO_SUCCESS)
return result;
/* Initialize the refcount-reading vio pool. */
reference_block_count = vdo_get_saved_reference_count_size(depot->slab_config.slab_blocks);
refcount_reads_needed = DIV_ROUND_UP(reference_block_count, MAX_BLOCKS_PER_VIO);
refcount_blocks_per_vio = DIV_ROUND_UP(reference_block_count, refcount_reads_needed);
allocator->refcount_blocks_per_big_vio = refcount_blocks_per_vio;
result = make_vio_pool(vdo, BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE,
allocator->refcount_blocks_per_big_vio, allocator->thread_id,
VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA,
NULL, &allocator->refcount_big_vio_pool);
if (result != VDO_SUCCESS)
return result;
result = initialize_slab_scrubber(allocator);
if (result != VDO_SUCCESS)
return result;
@ -4223,6 +4313,7 @@ void vdo_free_slab_depot(struct slab_depot *depot)
uninitialize_allocator_summary(allocator);
uninitialize_scrubber_vio(&allocator->scrubber);
free_vio_pool(vdo_forget(allocator->vio_pool));
free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool));
vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs));
}

View File

@ -45,6 +45,13 @@
enum {
/* The number of vios in the vio pool is proportional to the throughput of the VDO. */
BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128,
/*
* The number of vios in the vio pool used for loading reference count data. A slab's
* refcounts is capped at ~8MB, and we process one at a time in a zone, so 9 should be
* plenty.
*/
BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE = 9,
};
/*
@ -248,7 +255,7 @@ struct vdo_slab {
/* A list of the dirty blocks waiting to be written out */
struct vdo_wait_queue dirty_blocks;
/* The number of blocks which are currently writing */
/* The number of blocks which are currently reading or writing */
size_t active_count;
/* A waiter object for updating the slab summary */
@ -425,6 +432,10 @@ struct block_allocator {
/* The vio pool for reading and writing block allocator metadata */
struct vio_pool *vio_pool;
/* The vio pool for large initial reads of ref count areas */
struct vio_pool *refcount_big_vio_pool;
/* How many ref count blocks are read per vio at initial load */
u32 refcount_blocks_per_big_vio;
/* The dm_kcopyd client for erasing slab journals */
struct dm_kcopyd_client *eraser;
/* Iterator over the slabs to be erased */

View File

@ -376,6 +376,9 @@ struct vio {
/* The size of this vio in blocks */
unsigned int block_count;
/* The amount of data to be read or written, in bytes */
unsigned int io_size;
/* The data being read or written. */
char *data;

View File

@ -31,9 +31,7 @@
#include <linux/completion.h>
#include <linux/device-mapper.h>
#include <linux/kernel.h>
#include <linux/lz4.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/types.h>
@ -142,12 +140,6 @@ static void finish_vdo_request_queue(void *ptr)
vdo_unregister_allocating_thread();
}
#ifdef MODULE
#define MODULE_NAME THIS_MODULE->name
#else
#define MODULE_NAME "dm-vdo"
#endif /* MODULE */
static const struct vdo_work_queue_type default_queue_type = {
.start = start_vdo_request_queue,
.finish = finish_vdo_request_queue,
@ -559,8 +551,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason,
*vdo_ptr = vdo;
snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix),
"%s%u", MODULE_NAME, instance);
BUG_ON(vdo->thread_name_prefix[0] == '\0');
"vdo%u", instance);
result = vdo_allocate(vdo->thread_config.thread_count,
struct vdo_thread, __func__, &vdo->threads);
if (result != VDO_SUCCESS) {

View File

@ -188,14 +188,23 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb
/*
* Prepares the bio to perform IO with the specified buffer. May only be used on a VDO-allocated
* bio, as it assumes the bio wraps a 4k buffer that is 4k aligned, but there does not have to be a
* vio associated with the bio.
* bio, as it assumes the bio wraps a 4k-multiple buffer that is 4k aligned, but there does not
* have to be a vio associated with the bio.
*/
int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
blk_opf_t bi_opf, physical_block_number_t pbn)
{
int bvec_count, offset, len, i;
return vio_reset_bio_with_size(vio, data, vio->block_count * VDO_BLOCK_SIZE,
callback, bi_opf, pbn);
}
int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback,
blk_opf_t bi_opf, physical_block_number_t pbn)
{
int bvec_count, offset, i;
struct bio *bio = vio->bio;
int vio_size = vio->block_count * VDO_BLOCK_SIZE;
int remaining;
bio_reset(bio, bio->bi_bdev, bi_opf);
vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn);
@ -205,22 +214,21 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
bio->bi_ioprio = 0;
bio->bi_io_vec = bio->bi_inline_vecs;
bio->bi_max_vecs = vio->block_count + 1;
len = VDO_BLOCK_SIZE * vio->block_count;
if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d",
size, vio_size) != VDO_SUCCESS)
size = vio_size;
vio->io_size = size;
offset = offset_in_page(data);
bvec_count = DIV_ROUND_UP(offset + len, PAGE_SIZE);
bvec_count = DIV_ROUND_UP(offset + size, PAGE_SIZE);
remaining = size;
/*
* If we knew that data was always on one page, or contiguous pages, we wouldn't need the
* loop. But if we're using vmalloc, it's not impossible that the data is in different
* pages that can't be merged in bio_add_page...
*/
for (i = 0; (i < bvec_count) && (len > 0); i++) {
for (i = 0; (i < bvec_count) && (remaining > 0); i++) {
struct page *page;
int bytes_added;
int bytes = PAGE_SIZE - offset;
if (bytes > len)
bytes = len;
if (bytes > remaining)
bytes = remaining;
page = is_vmalloc_addr(data) ? vmalloc_to_page(data) : virt_to_page(data);
bytes_added = bio_add_page(bio, page, bytes, offset);
@ -232,7 +240,7 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
}
data += bytes;
len -= bytes;
remaining -= bytes;
offset = 0;
}
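As a usage sketch (not part of the patch), the new entry point lets a caller size a read to cover only part of a vio's allocated buffer. Everything below except vio_reset_bio_with_size(), VDO_BLOCK_SIZE and VDO_SUCCESS is a hypothetical caller:

/* Hypothetical caller: read only the first `nblocks` 4k blocks into a buffer
 * that was allocated for vio->block_count blocks. */
int result = vio_reset_bio_with_size(vio, buffer, nblocks * VDO_BLOCK_SIZE,
				     read_endio /* hypothetical bio_end_io_t */,
				     REQ_OP_READ, pbn);
if (result != VDO_SUCCESS)
	return result;
/* The bio now describes nblocks * VDO_BLOCK_SIZE bytes starting at pbn and can
 * be submitted through the usual vdo metadata I/O path. */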
@ -301,6 +309,7 @@ void vio_record_metadata_io_error(struct vio *vio)
* make_vio_pool() - Create a new vio pool.
* @vdo: The vdo.
* @pool_size: The number of vios in the pool.
* @block_count: The number of 4k blocks per vio.
* @thread_id: The ID of the thread using this pool.
* @vio_type: The type of vios in the pool.
* @priority: The priority with which vios from the pool should be enqueued.
@ -309,13 +318,14 @@ void vio_record_metadata_io_error(struct vio *vio)
*
* Return: A success or error code.
*/
int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
int make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, thread_id_t thread_id,
enum vio_type vio_type, enum vio_priority priority, void *context,
struct vio_pool **pool_ptr)
{
struct vio_pool *pool;
char *ptr;
int result;
size_t per_vio_size = VDO_BLOCK_SIZE * block_count;
result = vdo_allocate_extended(struct vio_pool, pool_size, struct pooled_vio,
__func__, &pool);
@ -326,7 +336,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
INIT_LIST_HEAD(&pool->available);
INIT_LIST_HEAD(&pool->busy);
result = vdo_allocate(pool_size * VDO_BLOCK_SIZE, char,
result = vdo_allocate(pool_size * per_vio_size, char,
"VIO pool buffer", &pool->buffer);
if (result != VDO_SUCCESS) {
free_vio_pool(pool);
@ -334,10 +344,10 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
}
ptr = pool->buffer;
for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += VDO_BLOCK_SIZE) {
for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += per_vio_size) {
struct pooled_vio *pooled = &pool->vios[pool->size];
result = allocate_vio_components(vdo, vio_type, priority, NULL, 1, ptr,
result = allocate_vio_components(vdo, vio_type, priority, NULL, block_count, ptr,
&pooled->vio);
if (result != VDO_SUCCESS) {
free_vio_pool(pool);
@ -345,6 +355,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
}
pooled->context = context;
pooled->pool = pool;
list_add_tail(&pooled->pool_entry, &pool->available);
}
@ -419,12 +430,13 @@ void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter)
}
/**
* return_vio_to_pool() - Return a vio to the pool
* @pool: The vio pool.
* return_vio_to_pool() - Return a vio to its pool
* @vio: The pooled vio to return.
*/
void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio)
void return_vio_to_pool(struct pooled_vio *vio)
{
struct vio_pool *pool = vio->pool;
VDO_ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()),
"vio pool entry returned on same thread as it was acquired");

@ -30,6 +30,8 @@ struct pooled_vio {
void *context;
/* The list entry used by the pool */
struct list_head pool_entry;
/* The pool this vio is allocated from */
struct vio_pool *pool;
};
/**
@ -123,6 +125,8 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb
int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
blk_opf_t bi_opf, physical_block_number_t pbn);
int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback,
blk_opf_t bi_opf, physical_block_number_t pbn);
void update_vio_error_stats(struct vio *vio, const char *format, ...)
__printf(2, 3);
@ -188,12 +192,13 @@ static inline struct pooled_vio *vio_as_pooled_vio(struct vio *vio)
struct vio_pool;
int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
enum vio_type vio_type, enum vio_priority priority,
void *context, struct vio_pool **pool_ptr);
int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count,
thread_id_t thread_id, enum vio_type vio_type,
enum vio_priority priority, void *context,
struct vio_pool **pool_ptr);
void free_vio_pool(struct vio_pool *pool);
bool __must_check is_vio_pool_busy(struct vio_pool *pool);
void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter);
void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio);
void return_vio_to_pool(struct pooled_vio *vio);
#endif /* VIO_H */
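A hedged sketch of the reworked pool API as a caller such as the block allocator might use it; BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE, the refcount_big_vio_pool and refcount_blocks_per_big_vio fields, and vio_as_pooled_vio() appear in the hunks above, while the vio type, priority and local variable names are assumptions:

/* Create a pool whose vios each carry several 4k blocks (the new block_count
 * argument); type and priority constants are illustrative. */
result = make_vio_pool(vdo, BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE,
		       allocator->refcount_blocks_per_big_vio, thread_id,
		       VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA,
		       allocator, &allocator->refcount_big_vio_pool);
if (result != VDO_SUCCESS)
	return result;

/* A pooled vio now records its owning pool, so it can be returned without
 * passing the pool at the call site: */
return_vio_to_pool(vio_as_pooled_vio(vio));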

@ -34,7 +34,7 @@ void vdo_waitq_enqueue_waiter(struct vdo_wait_queue *waitq, struct vdo_waiter *w
waitq->last_waiter->next_waiter = waiter;
}
/* In both cases, the waiter we added to the ring becomes the last waiter. */
/* In both cases, the waiter we added to the list becomes the last waiter. */
waitq->last_waiter = waiter;
waitq->length += 1;
}

@ -30,6 +30,7 @@
#define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR"
#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
#define DM_VERITY_USE_BH_DEFAULT_BYTES 8192
#define DM_VERITY_MAX_CORRUPTED_ERRS 100
@ -49,6 +50,15 @@ static unsigned int dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE
module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, 0644);
static unsigned int dm_verity_use_bh_bytes[4] = {
DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_NONE
DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_RT
DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_BE
0 // IOPRIO_CLASS_IDLE
};
module_param_array_named(use_bh_bytes, dm_verity_use_bh_bytes, uint, NULL, 0644);
static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled);
/* Is at least one dm-verity instance using ahash_tfm instead of shash_tfm? */
@ -311,7 +321,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) {
data = dm_bufio_get(v->bufio, hash_block, &buf);
if (data == NULL) {
if (IS_ERR_OR_NULL(data)) {
/*
* In tasklet and the hash was not in the bufio cache.
* Return early and resume execution from a work-queue
@ -324,8 +334,24 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
&buf, bio->bi_ioprio);
}
if (IS_ERR(data))
return PTR_ERR(data);
if (IS_ERR(data)) {
if (skip_unverified)
return 1;
r = PTR_ERR(data);
data = dm_bufio_new(v->bufio, hash_block, &buf);
if (IS_ERR(data))
return r;
if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA,
hash_block, data) == 0) {
aux = dm_bufio_get_aux_data(buf);
aux->hash_verified = 1;
goto release_ok;
} else {
dm_bufio_release(buf);
dm_bufio_forget(v->bufio, hash_block);
return r;
}
}
aux = dm_bufio_get_aux_data(buf);
@ -366,6 +392,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
}
}
release_ok:
data += offset;
memcpy(want_digest, data, v->digest_size);
r = 0;
@ -652,9 +679,17 @@ static void verity_bh_work(struct work_struct *w)
verity_finish_io(io, errno_to_blk_status(err));
}
static inline bool verity_use_bh(unsigned int bytes, unsigned short ioprio)
{
return ioprio <= IOPRIO_CLASS_IDLE &&
bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]);
}
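With the default table above (8192, 8192, 8192, 0) and 4 KiB data blocks, a worked example of the gate this helper adds; the local variables are illustrative:

bool small_be = verity_use_bh(4096, IOPRIO_CLASS_BE);    /* true: 4 KiB <= 8192 */
bool large_be = verity_use_bh(16384, IOPRIO_CLASS_BE);   /* false: 16 KiB > 8192 */
bool any_idle = verity_use_bh(4096, IOPRIO_CLASS_IDLE);  /* false: idle threshold is 0 */

Since the array is exported with module_param_array_named(..., 0644), the thresholds should normally be tunable at runtime (typically under /sys/module/dm_verity/parameters/use_bh_bytes), though the exact path depends on the kernel build.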
static void verity_end_io(struct bio *bio)
{
struct dm_verity_io *io = bio->bi_private;
unsigned short ioprio = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
unsigned int bytes = io->n_blocks << io->v->data_dev_block_bits;
if (bio->bi_status &&
(!verity_fec_is_enabled(io->v) ||
@ -664,9 +699,14 @@ static void verity_end_io(struct bio *bio)
return;
}
if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq) {
INIT_WORK(&io->bh_work, verity_bh_work);
queue_work(system_bh_wq, &io->bh_work);
if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq &&
verity_use_bh(bytes, ioprio)) {
if (in_hardirq() || irqs_disabled()) {
INIT_WORK(&io->bh_work, verity_bh_work);
queue_work(system_bh_wq, &io->bh_work);
} else {
verity_bh_work(&io->bh_work);
}
} else {
INIT_WORK(&io->work, verity_work);
queue_work(io->v->verify_wq, &io->work);
@ -796,6 +836,13 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
static void verity_postsuspend(struct dm_target *ti)
{
struct dm_verity *v = ti->private;
flush_workqueue(v->verify_wq);
dm_bufio_client_reset(v->bufio);
}
/*
* Status: V (valid) or C (corruption found)
*/
@ -1761,11 +1808,12 @@ static struct target_type verity_target = {
.name = "verity",
/* Note: the LSMs depend on the singleton and immutable features */
.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
.version = {1, 10, 0},
.version = {1, 11, 0},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
.map = verity_map,
.postsuspend = verity_postsuspend,
.status = verity_status,
.prepare_ioctl = verity_prepare_ioctl,
.iterate_devices = verity_iterate_devices,

@ -1540,14 +1540,18 @@ static void __send_empty_flush(struct clone_info *ci)
{
struct dm_table *t = ci->map;
struct bio flush_bio;
blk_opf_t opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
if ((ci->io->orig_bio->bi_opf & (REQ_IDLE | REQ_SYNC)) ==
(REQ_IDLE | REQ_SYNC))
opf |= REQ_IDLE;
/*
* Use an on-stack bio for this, it's safe since we don't
* need to reference it after submit. It's just used as
* the basis for the clone(s).
*/
bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, opf);
ci->bio = &flush_bio;
ci->sector_count = 0;
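As a standalone restatement of the flag selection above (a sketch, not part of the patch): the flush keeps REQ_IDLE only when the original bio carried both REQ_SYNC and REQ_IDLE, which is what lets such flushes avoid the writeback-throttling path that previously throttled them unconditionally.

static blk_opf_t flush_opf_for(blk_opf_t orig_opf)
{
	blk_opf_t opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;

	if ((orig_opf & (REQ_IDLE | REQ_SYNC)) == (REQ_IDLE | REQ_SYNC))
		opf |= REQ_IDLE;

	return opf;
}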