bcachefs: Split up bch_dev.io_ref

We now have separate per-device io_refs for read and write access.

This fixes a device removal bug where the discard workers could still be
running while we were removing alloc info for that device.

It's also a bit of hardening; we no longer allow writes to devices that
are read-only.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2025-03-29 23:11:08 -04:00
parent f1350c2c74
commit dcffc3b1ae
19 changed files with 142 additions and 87 deletions
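
In rough outline, the change replaces the single struct percpu_ref io_ref in
struct bch_dev with a two-element array indexed by I/O direction (the kernel's
READ/WRITE constants, 0 and 1). A simplified sketch of the new shape, not the
verbatim tree:

#include <linux/percpu-refcount.h>
#include <linux/completion.h>

/* Simplified sketch of the new bch_dev layout: */
struct bch_dev_sketch {
	struct percpu_ref	io_ref[2];		/* indexed by READ/WRITE */
	struct completion	io_ref_completion[2];	/* signaled when a ref drains */
};

/*
 * Callers take the ref matching the access they need; tryget on
 * io_ref[WRITE] fails once the device has gone read-only, which is the
 * hardening the commit message refers to.
 */
static inline bool dev_get_io_ref_sketch(struct bch_dev_sketch *ca, int rw)
{
	return percpu_ref_tryget(&ca->io_ref[rw]);
}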


@@ -1950,7 +1950,7 @@ static void bch2_do_discards_work(struct work_struct *work)
 trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
 bch2_err_str(ret));
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[WRITE]);
 bch2_write_ref_put(c, BCH_WRITE_REF_discard);
 }
@@ -1967,7 +1967,7 @@ void bch2_dev_do_discards(struct bch_dev *ca)
 if (queue_work(c->write_ref_wq, &ca->discard_work))
 return;
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[WRITE]);
 put_write_ref:
 bch2_write_ref_put(c, BCH_WRITE_REF_discard);
 }
@@ -2045,7 +2045,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
 trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
 bch2_trans_put(trans);
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[WRITE]);
 bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
 }
@@ -2065,7 +2065,7 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
 if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
 return;
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[WRITE]);
 put_ref:
 bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
 }
@@ -2256,7 +2256,7 @@ restart_err:
 bch2_trans_iter_exit(trans, &iter);
 err:
 bch2_trans_put(trans);
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[WRITE]);
 bch2_bkey_buf_exit(&last_flushed, c);
 bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
 }
@@ -2274,7 +2274,7 @@ void bch2_dev_do_invalidates(struct bch_dev *ca)
 if (queue_work(c->write_ref_wq, &ca->invalidate_work))
 return;
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[WRITE]);
 put_ref:
 bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
 }
@@ -2506,7 +2506,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
 bch2_set_ra_pages(c, ra_pages);
-for_each_rw_member(c, ca) {
+__for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) {
 u64 dev_reserve = 0;
 /*
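
The hunks above are the device-removal fix from the commit message: the
discard and invalidate workers now pin the device's WRITE ref for their whole
runtime. A simplified sketch of the guard pattern, with error labels and the
filesystem-level write ref elided:

static void dev_do_discards_sketch(struct bch_dev *ca)
{
	struct bch_fs *c = ca->fs;

	/* Refuse to queue work against a read-only device: */
	if (!percpu_ref_tryget(&ca->io_ref[WRITE]))
		return;

	/* If queued, the worker owns the ref and puts it when done: */
	if (queue_work(c->write_ref_wq, &ca->discard_work))
		return;

	/* Work was already pending; drop the extra ref ourselves: */
	percpu_ref_put(&ca->io_ref[WRITE]);
}

Device removal can then kill io_ref[WRITE] and wait for it to drain, knowing
no discard worker can still touch the alloc info it is about to delete.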


@@ -462,7 +462,7 @@ err:
 if (bio)
 bio_put(bio);
 kvfree(data_buf);
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 printbuf_exit(&buf);
 return ret;
 }


@@ -524,8 +524,8 @@ struct bch_dev {
 struct percpu_ref ref;
 #endif
 struct completion ref_completion;
-struct percpu_ref io_ref;
-struct completion io_ref_completion;
+struct percpu_ref io_ref[2];
+struct completion io_ref_completion[2];
 struct bch_fs *fs;


@@ -1353,7 +1353,7 @@ start:
 "btree read error %s for %s",
 bch2_blk_status_to_str(bio->bi_status), buf.buf);
 if (rb->have_ioref)
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 rb->have_ioref = false;
 bch2_mark_io_failure(&failed, &rb->pick, false);
@@ -1609,7 +1609,7 @@ static void btree_node_read_all_replicas_endio(struct bio *bio)
 struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
 bch2_latency_acct(ca, rb->start_time, READ);
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 }
 ra->err[rb->idx] = bio->bi_status;
@@ -1928,7 +1928,7 @@ err:
 printbuf_exit(&err);
 bch2_bkey_buf_exit(&scrub->key, c);;
 btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
-percpu_ref_put(&scrub->ca->io_ref);
+percpu_ref_put(&scrub->ca->io_ref[READ]);
 kfree(scrub);
 bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
 }
@@ -1997,7 +1997,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans,
 return 0;
 err_free:
 btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 err:
 bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
 return ret;
@@ -2159,8 +2163,12 @@ static void btree_node_write_endio(struct bio *bio)
 spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
 }
+/*
+ * XXX: we should be using io_ref[WRITE], but we aren't retrying failed
+ * btree writes yet (due to device removal/ro):
+ */
 if (wbio->have_ioref)
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 if (parent) {
 bio_put(bio);


@@ -271,7 +271,7 @@ static int read_btree_nodes_worker(void *p)
 err:
 bio_put(bio);
 free_page((unsigned long) buf);
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 closure_put(w->cl);
 kfree(w);
 return 0;
@@ -291,7 +291,7 @@ static int read_btree_nodes(struct find_btree_nodes *f)
 struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
 if (!w) {
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 ret = -ENOMEM;
 goto err;
 }
@@ -303,14 +303,14 @@ static int read_btree_nodes(struct find_btree_nodes *f)
 struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
 ret = PTR_ERR_OR_ZERO(t);
 if (ret) {
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 kfree(w);
 bch_err_msg(c, ret, "starting kthread");
 break;
 }
 closure_get(&cl);
-percpu_ref_get(&ca->io_ref);
+percpu_ref_get(&ca->io_ref[READ]);
 wake_up_process(t);
 }
 err:


@@ -1132,7 +1132,7 @@ int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
 for_each_online_member(c, ca) {
 int ret = bch2_trans_mark_dev_sb(c, ca, flags);
 if (ret) {
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 return ret;
 }
 }


@@ -615,7 +615,7 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
 for_each_online_member(c, ca)
 if (ca->dev == dev) {
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 return ca->dev_idx;
 }


@@ -57,7 +57,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
 submit_bio_wait(bio);
 bio_put(bio);
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
@@ -297,7 +297,7 @@ out:
 if (bio)
 bio_put(bio);
 kvfree(n_ondisk);
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 }
 #ifdef CONFIG_DEBUG_FS


@@ -555,9 +555,9 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
 ? rcu_dereference(c->devs[t.dev])
 : NULL;
-if (ca && percpu_ref_tryget(&ca->io_ref)) {
+if (ca && percpu_ref_tryget(&ca->io_ref[READ])) {
 prt_printf(out, "/dev/%s", ca->name);
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 } else if (ca) {
 prt_printf(out, "offline device %u", t.dev);
 } else {


@@ -105,6 +105,7 @@ struct ec_bio {
 struct bch_dev *ca;
 struct ec_stripe_buf *buf;
 size_t idx;
+int rw;
 u64 submit_time;
 struct bio bio;
 };
@@ -704,6 +705,7 @@ static void ec_block_endio(struct bio *bio)
 struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
 struct bch_dev *ca = ec_bio->ca;
 struct closure *cl = bio->bi_private;
+int rw = ec_bio->rw;
 bch2_account_io_completion(ca, bio_data_dir(bio),
 ec_bio->submit_time, !bio->bi_status);
@@ -725,7 +727,7 @@ static void ec_block_endio(struct bio *bio)
 }
 bio_put(&ec_bio->bio);
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[rw]);
 closure_put(cl);
 }
@@ -776,6 +778,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
 ec_bio->ca = ca;
 ec_bio->buf = buf;
 ec_bio->idx = idx;
+ec_bio->rw = rw;
 ec_bio->submit_time = local_clock();
 ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
@@ -785,14 +788,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
 bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
 closure_get(cl);
-percpu_ref_get(&ca->io_ref);
+percpu_ref_get(&ca->io_ref[rw]);
 submit_bio(&ec_bio->bio);
 offset += b;
 }
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[rw]);
 }
 static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
@@ -1265,7 +1268,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
 ob->sectors_free,
 GFP_KERNEL, 0);
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[WRITE]);
 if (ret)
 s->err = ret;
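
Since ec_block_io() is used for both reads and writes of a stripe buffer, the
wrapper now records which direction's ref was taken so the completion drops
the matching one. A minimal sketch of that bookkeeping, with fields beyond the
ref handling elided:

struct ec_bio_sketch {
	struct bch_dev	*ca;
	int		rw;	/* which of ca->io_ref[] we hold */
	struct bio	bio;
};

static void ec_block_endio_sketch(struct bio *bio)
{
	struct ec_bio_sketch *ec_bio = container_of(bio, struct ec_bio_sketch, bio);
	struct bch_dev *ca = ec_bio->ca;
	int rw = ec_bio->rw;	/* copy out before bio_put() frees the wrapper */

	bio_put(&ec_bio->bio);
	percpu_ref_put(&ca->io_ref[rw]);
}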


@@ -48,7 +48,7 @@ static void nocow_flush_endio(struct bio *_bio)
 struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
 closure_put(bio->cl);
-percpu_ref_put(&bio->ca->io_ref);
+percpu_ref_put(&bio->ca->io_ref[WRITE]);
 bio_put(&bio->bio);
 }
@@ -71,7 +71,7 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
 for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
 rcu_read_lock();
 ca = rcu_dereference(c->devs[dev]);
-if (ca && !percpu_ref_tryget(&ca->io_ref))
+if (ca && !percpu_ref_tryget(&ca->io_ref[WRITE]))
 ca = NULL;
 rcu_read_unlock();


@@ -2237,7 +2237,7 @@ got_sb:
 /* XXX: create an anonymous device for multi device filesystems */
 sb->s_bdev = bdev;
 sb->s_dev = bdev->bd_dev;
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 break;
 }


@@ -394,7 +394,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
 if (rbio->have_ioref) {
 struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 }
 if (rbio->split) {
@@ -1003,7 +1003,7 @@ retry_pick:
 unlikely(dev_ptr_stale(ca, &pick.ptr))) {
 read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
 bch2_mark_io_failure(failed, &pick, false);
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 goto retry_pick;
 }
@@ -1036,7 +1036,7 @@ retry_pick:
 */
 if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
 if (ca)
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 rbio->ret = -BCH_ERR_data_read_buffer_too_small;
 goto out_read_done;
 }


@@ -445,6 +445,11 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 BUG_ON(c->opts.nochanges);
 bkey_for_each_ptr(ptrs, ptr) {
+/*
+ * XXX: btree writes should be using io_ref[WRITE], but we
+ * aren't retrying failed btree writes yet (due to device
+ * removal/ro):
+ */
 struct bch_dev *ca = nocow
 ? bch2_dev_have_ref(c, ptr->dev)
 : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE);
@@ -722,7 +727,7 @@ static void bch2_write_endio(struct bio *bio)
 }
 if (wbio->have_ioref)
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[WRITE]);
 if (wbio->bounce)
 bch2_bio_free_pages_pool(c, bio);
@@ -1421,7 +1426,7 @@ err:
 return;
 err_get_ioref:
 darray_for_each(buckets, i)
-percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref);
+percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE]);
 /* Fall back to COW path: */
 goto out;


@@ -1315,7 +1315,7 @@ int bch2_fs_journal_alloc(struct bch_fs *c)
 int ret = bch2_dev_journal_alloc(ca, true);
 if (ret) {
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 return ret;
 }
 }
@@ -1461,11 +1461,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
 j->reservations.idx = journal_cur_seq(j);
 c->last_bucket_seq_cleanup = journal_cur_seq(j);
-bch2_journal_space_available(j);
 spin_unlock(&j->lock);
-return bch2_journal_reclaim_start(j);
+return 0;
 }
 /* init/exit: */


@@ -1218,7 +1218,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
 out:
 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
 kvfree(buf.data);
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 closure_return(cl);
 return;
 err:
@@ -1253,7 +1253,7 @@ int bch2_journal_read(struct bch_fs *c,
 if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
 ca->mi.state == BCH_MEMBER_STATE_ro) &&
-percpu_ref_tryget(&ca->io_ref))
+percpu_ref_tryget(&ca->io_ref[READ]))
 closure_call(&ca->journal.read,
 bch2_journal_read_device,
 system_unbound_wq,
@@ -1768,7 +1768,7 @@ static void journal_write_endio(struct bio *bio)
 }
 closure_put(&w->io);
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[WRITE]);
 }
 static CLOSURE_CALLBACK(journal_write_submit)
@@ -1843,7 +1843,7 @@ static CLOSURE_CALLBACK(journal_write_preflush)
 if (w->separate_flush) {
 for_each_rw_member(c, ca) {
-percpu_ref_get(&ca->io_ref);
+percpu_ref_get(&ca->io_ref[WRITE]);
 struct journal_device *ja = &ca->journal;
 struct bio *bio = &ja->bio[w->idx]->bio;

@@ -20,7 +20,7 @@ struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
 static inline bool bch2_dev_is_online(struct bch_dev *ca)
 {
-return !percpu_ref_is_zero(&ca->io_ref);
+return !percpu_ref_is_zero(&ca->io_ref[READ]);
 }
 static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
@@ -156,33 +156,34 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev
 static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
 struct bch_dev *ca,
-unsigned state_mask)
+unsigned state_mask,
+int rw)
 {
 rcu_read_lock();
 if (ca)
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[rw]);
 while ((ca = __bch2_next_dev(c, ca, NULL)) &&
 (!((1 << ca->mi.state) & state_mask) ||
-!percpu_ref_tryget(&ca->io_ref)))
+!percpu_ref_tryget(&ca->io_ref[rw])))
 ;
 rcu_read_unlock();
 return ca;
 }
-#define __for_each_online_member(_c, _ca, state_mask) \
+#define __for_each_online_member(_c, _ca, state_mask, rw) \
 for (struct bch_dev *_ca = NULL; \
-(_ca = bch2_get_next_online_dev(_c, _ca, state_mask));)
+(_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw));)
 #define for_each_online_member(c, ca) \
-__for_each_online_member(c, ca, ~0)
+__for_each_online_member(c, ca, ~0, READ)
 #define for_each_rw_member(c, ca) \
-__for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw))
+__for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE)
 #define for_each_readable_member(c, ca) \
-__for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro))
+__for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ)
 static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev)
 {
@@ -287,7 +288,7 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev,
 rcu_read_lock();
 struct bch_dev *ca = bch2_dev_rcu(c, dev);
-if (ca && !percpu_ref_tryget(&ca->io_ref))
+if (ca && !percpu_ref_tryget(&ca->io_ref[rw]))
 ca = NULL;
 rcu_read_unlock();
@@ -297,7 +298,7 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev,
 return ca;
 if (ca)
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[rw]);
 return NULL;
 }
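
The rw argument tells the member iterators which per-device ref to pin:
for_each_online_member() pins io_ref[READ] and so still visits RO devices,
while for_each_rw_member() pins io_ref[WRITE]. A usage sketch, where the loop
body and the nbuckets access are illustrative only:

static u64 count_rw_dev_buckets_sketch(struct bch_fs *c)
{
	u64 nr = 0;

	/* Pins ca->io_ref[WRITE] across each iteration; skips RO devices: */
	for_each_rw_member(c, ca)
		nr += ca->mi.nbuckets;

	return nr;
}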


@@ -248,7 +248,7 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
 struct bch_sb_handle *dev_sb = &ca->disk_sb;
 if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 return NULL;
 }
 }
@@ -945,7 +945,7 @@ static void write_super_endio(struct bio *bio)
 }
 closure_put(&ca->fs->sb_write);
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 }
 static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
@@ -963,7 +963,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio));
-percpu_ref_get(&ca->io_ref);
+percpu_ref_get(&ca->io_ref[READ]);
 closure_bio_submit(bio, &c->sb_write);
 }
@@ -989,7 +989,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
 bio_sectors(bio));
-percpu_ref_get(&ca->io_ref);
+percpu_ref_get(&ca->io_ref[READ]);
 closure_bio_submit(bio, &c->sb_write);
 }
@@ -1014,13 +1014,20 @@ int bch2_write_super(struct bch_fs *c)
 closure_init_stack(cl);
 memset(&sb_written, 0, sizeof(sb_written));
+/*
+ * Note: we do writes to RO devices here, and we might want to change
+ * that in the future.
+ *
+ * For now, we expect to be able to call write_super() when we're not
+ * yet RW:
+ */
 for_each_online_member(c, ca) {
 ret = darray_push(&online_devices, ca);
 if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) {
-percpu_ref_put(&ca->io_ref);
+percpu_ref_put(&ca->io_ref[READ]);
 goto out;
 }
-percpu_ref_get(&ca->io_ref);
+percpu_ref_get(&ca->io_ref[READ]);
 }
 /* Make sure we're using the new magic numbers: */
@@ -1186,7 +1193,7 @@ out:
 /* Make new options visible after they're persistent: */
 bch2_sb_update(c);
 darray_for_each(online_devices, ca)
-percpu_ref_put(&(*ca)->io_ref);
+percpu_ref_put(&(*ca)->io_ref[READ]);
 darray_exit(&online_devices);
 printbuf_exit(&err);
 return ret;


@@ -185,6 +185,7 @@ static void bch2_dev_unlink(struct bch_dev *);
 static void bch2_dev_free(struct bch_dev *);
 static int bch2_dev_alloc(struct bch_fs *, unsigned);
 static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
+static void bch2_dev_io_ref_stop(struct bch_dev *, int);
 static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
 struct bch_fs *bch2_dev_to_fs(dev_t dev)
@@ -294,8 +295,10 @@ static void __bch2_fs_read_only(struct bch_fs *c)
 /*
 * After stopping journal:
 */
-for_each_member_device(c, ca)
+for_each_member_device(c, ca) {
+bch2_dev_io_ref_stop(ca, WRITE);
 bch2_dev_allocator_remove(c, ca);
+}
 }
 #ifndef BCH_WRITE_REF_DEBUG
@@ -465,10 +468,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
 if (ret)
 goto err;
-ret = bch2_fs_mark_dirty(c);
-if (ret)
-goto err;
 clear_bit(BCH_FS_clean_shutdown, &c->flags);
 /*
@@ -480,10 +479,24 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
 set_bit(JOURNAL_need_flush_write, &c->journal.flags);
 set_bit(JOURNAL_running, &c->journal.flags);
-for_each_rw_member(c, ca)
+__for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) {
 bch2_dev_allocator_add(c, ca);
+percpu_ref_reinit(&ca->io_ref[WRITE]);
+}
 bch2_recalc_capacity(c);
+ret = bch2_fs_mark_dirty(c);
+if (ret)
+goto err;
+spin_lock(&c->journal.lock);
+bch2_journal_space_available(&c->journal);
+spin_unlock(&c->journal.lock);
+ret = bch2_journal_reclaim_start(&c->journal);
+if (ret)
+goto err;
 set_bit(BCH_FS_rw, &c->flags);
 set_bit(BCH_FS_was_rw, &c->flags);
@@ -495,11 +508,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
 atomic_long_inc(&c->writes[i]);
 }
 #endif
-ret = bch2_journal_reclaim_start(&c->journal);
-if (ret)
-goto err;
 if (!early) {
 ret = bch2_fs_read_write_late(c);
 if (ret)
@@ -675,6 +683,7 @@ void bch2_fs_free(struct bch_fs *c)
 if (ca) {
 EBUG_ON(atomic_long_read(&ca->ref) != 1);
+bch2_dev_io_ref_stop(ca, READ);
 bch2_free_super(&ca->disk_sb);
 bch2_dev_free(ca);
 }
@@ -1199,6 +1208,15 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
 /* Device startup/shutdown: */
+static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw)
+{
+if (!percpu_ref_is_zero(&ca->io_ref[rw])) {
+reinit_completion(&ca->io_ref_completion[rw]);
+percpu_ref_kill(&ca->io_ref[rw]);
+wait_for_completion(&ca->io_ref_completion[rw]);
+}
+}
 static void bch2_dev_release(struct kobject *kobj)
 {
 struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
@@ -1208,6 +1226,9 @@ static void bch2_dev_release(struct kobject *kobj)
 static void bch2_dev_free(struct bch_dev *ca)
 {
+WARN_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE]));
+WARN_ON(!percpu_ref_is_zero(&ca->io_ref[READ]));
 cancel_work_sync(&ca->io_error_work);
 bch2_dev_unlink(ca);
@@ -1226,7 +1247,8 @@ static void bch2_dev_free(struct bch_dev *ca)
 bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
 bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
-percpu_ref_exit(&ca->io_ref);
+percpu_ref_exit(&ca->io_ref[WRITE]);
+percpu_ref_exit(&ca->io_ref[READ]);
 #ifndef CONFIG_BCACHEFS_DEBUG
 percpu_ref_exit(&ca->ref);
 #endif
@@ -1238,14 +1260,12 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
 lockdep_assert_held(&c->state_lock);
-if (percpu_ref_is_zero(&ca->io_ref))
+if (percpu_ref_is_zero(&ca->io_ref[READ]))
 return;
 __bch2_dev_read_only(c, ca);
-reinit_completion(&ca->io_ref_completion);
-percpu_ref_kill(&ca->io_ref);
-wait_for_completion(&ca->io_ref_completion);
+bch2_dev_io_ref_stop(ca, READ);
 bch2_dev_unlink(ca);
@@ -1262,11 +1282,18 @@ static void bch2_dev_ref_complete(struct percpu_ref *ref)
 }
 #endif
-static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
+static void bch2_dev_io_ref_read_complete(struct percpu_ref *ref)
 {
-struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
+struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[READ]);
-complete(&ca->io_ref_completion);
+complete(&ca->io_ref_completion[READ]);
+}
+static void bch2_dev_io_ref_write_complete(struct percpu_ref *ref)
+{
+struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[WRITE]);
+complete(&ca->io_ref_completion[WRITE]);
 }
 static void bch2_dev_unlink(struct bch_dev *ca)
@@ -1330,7 +1357,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
 kobject_init(&ca->kobj, &bch2_dev_ktype);
 init_completion(&ca->ref_completion);
-init_completion(&ca->io_ref_completion);
+init_completion(&ca->io_ref_completion[READ]);
+init_completion(&ca->io_ref_completion[WRITE]);
 INIT_WORK(&ca->io_error_work, bch2_io_error_work);
@@ -1356,7 +1384,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
 bch2_dev_allocator_background_init(ca);
-if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
+if (percpu_ref_init(&ca->io_ref[READ], bch2_dev_io_ref_read_complete,
 PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+percpu_ref_init(&ca->io_ref[WRITE], bch2_dev_io_ref_write_complete,
+PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
 !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
 bch2_dev_buckets_alloc(c, ca) ||
@@ -1419,7 +1449,8 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
 return -BCH_ERR_device_size_too_small;
 }
-BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
+BUG_ON(!percpu_ref_is_zero(&ca->io_ref[READ]));
+BUG_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE]));
 ret = bch2_dev_journal_init(ca, sb->sb);
 if (ret)
@@ -1438,7 +1469,7 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
 ca->dev = ca->disk_sb.bdev->bd_dev;
-percpu_ref_reinit(&ca->io_ref);
+percpu_ref_reinit(&ca->io_ref[READ]);
 return 0;
 }
@@ -1568,6 +1599,8 @@ static bool bch2_fs_may_start(struct bch_fs *c)
 static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
 {
+bch2_dev_io_ref_stop(ca, WRITE);
 /*
 * The allocator thread itself allocates btree nodes, so stop it first:
 */
@@ -1584,6 +1617,10 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
 bch2_dev_allocator_add(c, ca);
 bch2_recalc_capacity(c);
+if (percpu_ref_is_zero(&ca->io_ref[WRITE]))
+percpu_ref_reinit(&ca->io_ref[WRITE]);
 bch2_dev_do_discards(ca);
 }
@@ -1731,7 +1768,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 return 0;
 err:
 if (ca->mi.state == BCH_MEMBER_STATE_rw &&
-!percpu_ref_is_zero(&ca->io_ref))
+!percpu_ref_is_zero(&ca->io_ref[READ]))
 __bch2_dev_read_write(c, ca);
 up_write(&c->state_lock);
 return ret;
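
Taken together, the super.c changes give each ref a distinct lifecycle:
io_ref[READ] is reinitialized when a device's bdev is attached and killed when
the device goes offline or the filesystem is freed, while io_ref[WRITE] is
cycled on every RO/RW transition. A sketch of the transition half, assuming
the helpers shown above:

/* Sketch: how an RO/RW transition drives the WRITE ref, per the hunks above. */
static void dev_set_state_sketch(struct bch_fs *c, struct bch_dev *ca, bool rw)
{
	if (rw) {
		/* Going RW: revive the WRITE ref so tryget can succeed again. */
		if (percpu_ref_is_zero(&ca->io_ref[WRITE]))
			percpu_ref_reinit(&ca->io_ref[WRITE]);
	} else {
		/* Going RO: kill the ref, then wait for in-flight writers to drain. */
		bch2_dev_io_ref_stop(ca, WRITE);
	}
}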