Skip to content

Commit 5f5321e

Browse files
authored
Handle interaction between gang blocks, copies, and FDT.
With the advent of fast dedup, there are no longer separate dedup tables for different copies values. There is now logic that will add DVAs to the dedup table entry if more copies are needed for new writes. However, this interacts poorly with ganging. There are two different cases that can result in mixed gang/non-gang BPs, which are illegal in ZFS. This change modifies updates of existing FDT; if there are already gang DVAs in the FDT, we prevent the new write from extending the DDT entry. We cannot safely mix different gang trees in one block pointer. if there are non-gang DVAs in the FDT, then this allocation may not be gangs. If it would gang, we have to redo the whole write as a non-dedup write. This change also fixes a refcount leak that could occur if the lead DDT write failed. Sponsored by: iXsystems, Inc. Sponsored-by: Klara, Inc. Reviewed-by: Alexander Motin <[email protected]> Reviewed-by: Tony Hutter <[email protected]> Signed-off-by: Paul Dagnelie <[email protected]> Closes: #17123
1 parent 1d8f625 commit 5f5321e

File tree

6 files changed

+156
-14
lines changed

6 files changed

+156
-14
lines changed

include/sys/ddt.h

+2
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,8 @@ extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt,
352352
const ddt_entry_t *dde, const blkptr_t *bp);
353353
extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
354354
ddt_phys_variant_t v);
355+
extern int ddt_phys_is_gang(const ddt_univ_phys_t *ddp,
356+
ddt_phys_variant_t v);
355357
extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
356358
boolean_t encrypted);
357359

module/zfs/ddt.c

+11
Original file line numberDiff line numberDiff line change
@@ -856,6 +856,17 @@ ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
856856
return (ddp->ddp_trad[v].ddp_phys_birth);
857857
}
858858

859+
int
860+
ddt_phys_is_gang(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
861+
{
862+
ASSERT3U(v, <, DDT_PHYS_NONE);
863+
864+
const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
865+
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
866+
867+
return (DVA_GET_GANG(&dvas[0]));
868+
}
869+
859870
int
860871
ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
861872
boolean_t encrypted)

module/zfs/zio.c

+62-13
Original file line numberDiff line numberDiff line change
@@ -3145,6 +3145,17 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
31453145
* This value respects the redundant_metadata property.
31463146
*/
31473147
int gbh_copies = gio->io_prop.zp_gang_copies;
3148+
if (gbh_copies == 0) {
3149+
/*
3150+
* This should only happen in the case where we're filling in
3151+
* DDT entries for a parent that wants more copies than the DDT
3152+
* has. In that case, we cannot gang without creating a mixed
3153+
* blkptr, which is illegal.
3154+
*/
3155+
ASSERT3U(gio->io_child_type, ==, ZIO_CHILD_DDT);
3156+
pio->io_error = EAGAIN;
3157+
return (pio);
3158+
}
31483159
ASSERT3S(gbh_copies, >, 0);
31493160
ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP);
31503161

@@ -3614,16 +3625,6 @@ zio_ddt_child_write_done(zio_t *zio)
36143625
return;
36153626
}
36163627

3617-
/*
3618-
* We've successfully added new DVAs to the entry. Clear the saved
3619-
* state or, if there's still outstanding IO, remember it so we can
3620-
* revert to a known good state if that IO fails.
3621-
*/
3622-
if (dde->dde_io->dde_lead_zio[p] == NULL)
3623-
ddt_phys_clear(orig, v);
3624-
else
3625-
ddt_phys_copy(orig, ddp, v);
3626-
36273628
/*
36283629
* Add references for all dedup writes that were waiting on the
36293630
* physical one, skipping any other physical writes that are waiting.
@@ -3635,6 +3636,16 @@ zio_ddt_child_write_done(zio_t *zio)
36353636
ddt_phys_addref(ddp, v);
36363637
}
36373638

3639+
/*
3640+
* We've successfully added new DVAs to the entry. Clear the saved
3641+
* state or, if there's still outstanding IO, remember it so we can
3642+
* revert to a known good state if that IO fails.
3643+
*/
3644+
if (dde->dde_io->dde_lead_zio[p] == NULL)
3645+
ddt_phys_clear(orig, v);
3646+
else
3647+
ddt_phys_copy(orig, ddp, v);
3648+
36383649
ddt_exit(ddt);
36393650
}
36403651

@@ -3650,6 +3661,16 @@ zio_ddt_child_write_ready(zio_t *zio)
36503661
int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
36513662
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
36523663

3664+
if (ddt_phys_is_gang(dde->dde_phys, v)) {
3665+
for (int i = 0; i < BP_GET_NDVAS(zio->io_bp); i++) {
3666+
dva_t *d = &zio->io_bp->blk_dva[i];
3667+
metaslab_group_alloc_decrement(zio->io_spa,
3668+
DVA_GET_VDEV(d), zio->io_allocator,
3669+
METASLAB_ASYNC_ALLOC, zio->io_size, zio);
3670+
}
3671+
zio->io_error = EAGAIN;
3672+
}
3673+
36533674
if (zio->io_error != 0)
36543675
return;
36553676

@@ -3769,9 +3790,12 @@ zio_ddt_write(zio_t *zio)
37693790
*/
37703791
int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp));
37713792
IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0);
3793+
boolean_t is_ganged = ddt_phys_is_gang(ddp, v);
37723794

3773-
/* Number of DVAs requested bya the IO. */
3795+
/* Number of DVAs requested by the IO. */
37743796
uint8_t need_dvas = zp->zp_copies;
3797+
/* Number of DVAs in outstanding writes for this dde. */
3798+
uint8_t parent_dvas = 0;
37753799

37763800
/*
37773801
* What we do next depends on whether or not there's IO outstanding that
@@ -3901,7 +3925,7 @@ zio_ddt_write(zio_t *zio)
39013925
zio_t *pio =
39023926
zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl);
39033927
ASSERT(pio);
3904-
uint8_t parent_dvas = pio->io_prop.zp_copies;
3928+
parent_dvas = pio->io_prop.zp_copies;
39053929

39063930
if (parent_dvas >= need_dvas) {
39073931
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
@@ -3916,13 +3940,27 @@ zio_ddt_write(zio_t *zio)
39163940
need_dvas -= parent_dvas;
39173941
}
39183942

3943+
if (is_ganged) {
3944+
zp->zp_dedup = B_FALSE;
3945+
BP_SET_DEDUP(bp, B_FALSE);
3946+
zio->io_pipeline = ZIO_WRITE_PIPELINE;
3947+
ddt_exit(ddt);
3948+
return (zio);
3949+
}
3950+
39193951
/*
39203952
* We need to write. We will create a new write with the copies
39213953
* property adjusted to match the number of DVAs we need to need to
39223954
* grow the DDT entry by to satisfy the request.
39233955
*/
39243956
zio_prop_t czp = *zp;
3925-
czp.zp_copies = czp.zp_gang_copies = need_dvas;
3957+
if (have_dvas > 0 || parent_dvas > 0) {
3958+
czp.zp_copies = need_dvas;
3959+
czp.zp_gang_copies = 0;
3960+
} else {
3961+
ASSERT3U(czp.zp_copies, ==, need_dvas);
3962+
}
3963+
39263964
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
39273965
zio->io_orig_size, zio->io_orig_size, &czp,
39283966
zio_ddt_child_write_ready, NULL,
@@ -4106,6 +4144,7 @@ zio_dva_allocate(zio_t *zio)
41064144
ASSERT(BP_IS_HOLE(bp));
41074145
ASSERT0(BP_GET_NDVAS(bp));
41084146
ASSERT3U(zio->io_prop.zp_copies, >, 0);
4147+
41094148
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
41104149
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
41114150

@@ -5391,6 +5430,16 @@ zio_done(zio_t *zio)
53915430
}
53925431

53935432
if (zio->io_error && zio == zio->io_logical) {
5433+
5434+
/*
5435+
* A DDT child tried to create a mixed gang/non-gang BP. We're
5436+
* going to have to just retry as a non-dedup IO.
5437+
*/
5438+
if (zio->io_error == EAGAIN && IO_IS_ALLOCATING(zio) &&
5439+
zio->io_prop.zp_dedup) {
5440+
zio->io_reexecute |= ZIO_REEXECUTE_NOW;
5441+
zio->io_prop.zp_dedup = B_FALSE;
5442+
}
53945443
/*
53955444
* Determine whether zio should be reexecuted. This will
53965445
* propagate all the way to the root via zio_notify_parent().

tests/runfiles/common.run

+1-1
Original file line numberDiff line numberDiff line change
@@ -726,7 +726,7 @@ tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg',
726726
tags = ['functional', 'features', 'large_dnode']
727727

728728
[tests/functional/gang_blocks]
729-
tests = ['gang_blocks_redundant']
729+
tests = ['gang_blocks_redundant', 'gang_blocks_ddt_copies']
730730
tags = ['functional', 'gang_blocks']
731731

732732
[tests/functional/grow]

tests/zfs-tests/tests/Makefile.am

+1
Original file line numberDiff line numberDiff line change
@@ -1561,6 +1561,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
15611561
functional/features/large_dnode/large_dnode_009_pos.ksh \
15621562
functional/features/large_dnode/setup.ksh \
15631563
functional/gang_blocks/cleanup.ksh \
1564+
functional/gang_blocks/gang_blocks_ddt_copies.ksh \
15641565
functional/gang_blocks/gang_blocks_redundant.ksh \
15651566
functional/gang_blocks/setup.ksh \
15661567
functional/grow/grow_pool_001_pos.ksh \
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#!/bin/ksh
2+
# SPDX-License-Identifier: CDDL-1.0
3+
#
4+
# This file and its contents are supplied under the terms of the
5+
# Common Development and Distribution License ("CDDL"), version 1.0.
6+
# You may only use this file in accordance with the terms of version
7+
# 1.0 of the CDDL.
8+
#
9+
# A full copy of the text of the CDDL should have accompanied this
10+
# source. A copy of the CDDL is also available via the Internet at
11+
# http://www.illumos.org/license/CDDL.
12+
#
13+
14+
#
15+
# Copyright (c) 2025 by Klara Inc.
16+
#
17+
18+
#
19+
# Description:
20+
# Verify that mixed gang blocks and copies interact correctly in FDT
21+
#
22+
# Strategy:
23+
# 1. Store a block with copies = 1 in the DDT unganged.
24+
# 2. Add a new entry with copies = 2 that gangs, ensure it doesn't panic
25+
# 3. Store a block with copies = 1 in the DDT ganged.
26+
# 4. Add a new entry with copies = 3 that doesn't gang, ensure that it doesn't panic.
27+
#
28+
29+
. $STF_SUITE/include/libtest.shlib
30+
. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
31+
32+
log_assert "Verify that mixed gang blocks and copies interact correctly in FDT"
33+
34+
save_tunable DEDUP_LOG_TXG_MAX
35+
36+
function cleanup2
37+
{
38+
zfs destroy $TESTPOOL/fs1
39+
zfs destroy $TESTPOOL/fs2
40+
restore_tunable DEDUP_LOG_TXG_MAX
41+
cleanup
42+
}
43+
44+
preamble
45+
log_onexit cleanup2
46+
47+
log_must zpool create -f -o ashift=9 -o feature@block_cloning=disabled $TESTPOOL $DISKS
48+
log_must zfs create -o recordsize=64k -o dedup=on $TESTPOOL/fs1
49+
log_must zfs create -o recordsize=64k -o dedup=on -o copies=3 $TESTPOOL/fs2
50+
set_tunable32 DEDUP_LOG_TXG_MAX 1
51+
log_must dd if=/dev/urandom of=/$TESTPOOL/fs1/f1 bs=64k count=1
52+
log_must sync_pool $TESTPOOL
53+
set_tunable32 METASLAB_FORCE_GANGING 20000
54+
set_tunable32 METASLAB_FORCE_GANGING_PCT 100
55+
log_must dd if=/$TESTPOOL/fs1/f1 of=/$TESTPOOL/fs2/f1 bs=64k count=1
56+
log_must sync_pool $TESTPOOL
57+
58+
log_must rm /$TESTPOOL/fs*/f1
59+
log_must sync_pool $TESTPOOL
60+
log_must dd if=/dev/urandom of=/$TESTPOOL/fs1/f1 bs=64k count=1
61+
log_must sync_pool $TESTPOOL
62+
log_must zdb -D $TESTPOOL
63+
set_tunable32 METASLAB_FORCE_GANGING_PCT 0
64+
log_must dd if=/$TESTPOOL/fs1/f1 of=/$TESTPOOL/fs2/f1 bs=64k count=1
65+
log_must sync_pool $TESTPOOL
66+
67+
log_must rm /$TESTPOOL/fs*/f1
68+
log_must sync_pool $TESTPOOL
69+
set_tunable32 METASLAB_FORCE_GANGING_PCT 50
70+
set_tunable32 METASLAB_FORCE_GANGING 40000
71+
log_must dd if=/dev/urandom of=/$TESTPOOL/f1 bs=64k count=1
72+
for i in `seq 1 16`; do
73+
log_must cp /$TESTPOOL/f1 /$TESTPOOL/fs2/f1
74+
log_must cp /$TESTPOOL/f1 /$TESTPOOL/fs1/f1
75+
log_must sync_pool $TESTPOOL
76+
log_must zdb -D $TESTPOOL
77+
done
78+
79+
log_pass "Verify that mixed gang blocks and copies interact correctly in FDT"

0 commit comments

Comments
 (0)