diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala
index 1c70d290..f9e233a1 100644
--- a/src/main/scala/gemmini/ConfigsFP.scala
+++ b/src/main/scala/gemmini/ConfigsFP.scala
@@ -105,7 +105,7 @@ object GemminiFPConfigs {
     ex_write_to_spad=false,
     has_training_convs = false,
     hardcode_d_to_garbage_addr = true,
-    has_loop_conv = false,
+    has_loop_conv = true, //false,
     acc_read_full_width = false,
     //has_loop_conv = false,
     max_in_flight_mem_reqs = 16,
diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala
index 07dfefdc..62b7a428 100644
--- a/src/main/scala/gemmini/LoopConv.scala
+++ b/src/main/scala/gemmini/LoopConv.scala
@@ -11,13 +11,13 @@ import LocalAddr._
 import Util._
 
 class LoopConvOuterBounds(val large_iterator_bitwidth: Int, val small_iterator_bitwidth: Int, val tiny_iterator_bitwidth: Int) extends Bundle {
-  val batch_size = UInt(large_iterator_bitwidth.W)
+  //val batch_size = UInt(large_iterator_bitwidth.W)
   val in_row_dim = UInt(small_iterator_bitwidth.W)
   val in_col_dim = UInt(small_iterator_bitwidth.W)
   val in_channels = UInt(large_iterator_bitwidth.W)
   val out_channels = UInt(large_iterator_bitwidth.W)
-  val out_col_dim = UInt(large_iterator_bitwidth.W)
-  val out_row_dim = UInt(large_iterator_bitwidth.W)
+  val out_col_dim = UInt(small_iterator_bitwidth.W)
+  val out_row_dim = UInt(small_iterator_bitwidth.W)
   val out_stride = UInt(large_iterator_bitwidth.W) //stride for output activation
   val in_stride = UInt(large_iterator_bitwidth.W) //stride for input activation
   val weight_stride = UInt(large_iterator_bitwidth.W) //stride for weight
@@ -33,12 +33,12 @@ class LoopConvOuterBounds(val large_iterator_bitwidth: Int, val small_iterator_b
 }
 
 class LoopConvInnerBounds(val large_iterator_bitwidth: Int, val small_iterator_bitwidth: Int, val tiny_iterator_bitwidth: Int) extends Bundle {
-  val batches = UInt(large_iterator_bitwidth.W)
+  //val batches = UInt(large_iterator_bitwidth.W)
   val porows = UInt(small_iterator_bitwidth.W)
   val pocols = UInt(small_iterator_bitwidth.W)
   val pochs = UInt(large_iterator_bitwidth.W)
-  val krows = UInt(tiny_iterator_bitwidth.W)
-  val kcols = UInt(tiny_iterator_bitwidth.W)
+  //val krows = UInt(tiny_iterator_bitwidth.W)
+  //val kcols = UInt(tiny_iterator_bitwidth.W)
   val kchs = UInt(large_iterator_bitwidth.W)
   val lpad = UInt(tiny_iterator_bitwidth.W)
   val rpad = UInt(tiny_iterator_bitwidth.W)
@@ -77,7 +77,7 @@ class LoopConvLdBiasReq(val coreMaxAddrBits: Int, val large_iterator_bitwidth: I
   val derived_params = new LoopConvDerivedParams(large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth)
   val addr_start = UInt(log2Up(max_acc_addr).W)
   val dram_addr = UInt(coreMaxAddrBits.W)
-  val no_bias = Bool()
+  //val no_bias = Bool()
   val loop_id = UInt(log2Up(concurrent_loops).W)
 }
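Reviewer note (not part of the patch): with no_bias gone from the request bundle, a bias DRAM address of zero is what disables the bias load, matching the `skip` signal in the next hunk. A minimal software model of that convention, in plain Scala (names are mine, not from the patch):

    // Bias load is skipped exactly when the driver passes a null bias pointer.
    def biasDramAddr(dramAddr: BigInt, och: Long, accBytes: Long): BigInt =
      if (dramAddr == 0) 0 else dramAddr + och * accBytes  // accBytes = acc_w/8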
@@ -114,15 +114,16 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi
   val skip = req.dram_addr === 0.U
 
   // Iterators
-  val b = Reg(UInt(large_iterator_bitwidth.W))
+  //val b = Reg(UInt(large_iterator_bitwidth.W))
   val orow = Reg(UInt(small_iterator_bitwidth.W))
   val ocol = Reg(UInt(small_iterator_bitwidth.W))
   val och = Reg(UInt(large_iterator_bitwidth.W))
 
   // Addresses
   val dram_offset = och * (acc_w/8).U
-  val dram_addr = Mux(req.no_bias, 0.U, req.dram_addr + LoopConv.castDramOffset(dram_offset))
-  val spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol
+  val dram_addr = Mux(skip, 0.U, req.dram_addr + LoopConv.castDramOffset(dram_offset))
+  //val spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol
+  val spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * orows * ocols +& orow * ocols +& ocol
 
   // Sizes
   val I = Mux(ocols - ocol > block_size.U, block_size.U, ocols - ocol)
@@ -196,14 +197,14 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi
     val next_och = floorAdd(och, max_ochs_per_mvin, ochs)
     val next_ocol = floorAdd(ocol, block_size.U, ocols, next_och === 0.U)
     val next_orow = floorAdd(orow, 1.U, orows, next_ocol === 0.U && next_och === 0.U)
-    val next_b = floorAdd(b, 1.U, batches, next_orow === 0.U && next_ocol === 0.U && next_och === 0.U)
+    //val next_b = floorAdd(b, 1.U, batches, next_orow === 0.U && next_ocol === 0.U && next_och === 0.U)
 
     och := next_och
     ocol := next_ocol
     orow := next_orow
-    b := next_b
+    //b := next_b
 
-    state := Mux(next_b === 0.U && next_orow === 0.U && next_ocol === 0.U && next_och === 0.U,
+    state := Mux(next_orow === 0.U && next_ocol === 0.U && next_och === 0.U,
       idle, ld)
   }
 }
@@ -212,7 +213,7 @@ class LoopConvLdBias(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwi
   when (io.req.fire) {
     req := io.req.bits
     state := config
-    b := 0.U
+    //b := 0.U
     orow := 0.U
     ocol := 0.U
     och := 0.U
@@ -264,11 +265,12 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
 
   // Derived parameters
   val max_ichs_per_mvin = Mux(ichs < (max_block_len * block_size).U, ichs, (max_block_len * block_size).U).zext
-  val max_batches_per_mvin = Mux(batches < (max_block_len * block_size).U, batches, (max_block_len * block_size).U).zext
+  //val max_batches_per_mvin = Mux(batches < (max_block_len * block_size).U, batches, (max_block_len * block_size).U).zext
+  val max_batches_per_mvin = 1.U.zext
   val max_chs_per_mvin = Mux(req.trans_input_3120, max_batches_per_mvin, max_ichs_per_mvin)
 
   // Iterators
-  val b = Reg(SInt(large_iterator_bitwidth.W))
+  //val b = Reg(SInt(large_iterator_bitwidth.W))
   val irow = Reg(SInt(small_iterator_bitwidth.W))
   val icol = Reg(SInt(small_iterator_bitwidth.W))
   val ich = Reg(SInt(large_iterator_bitwidth.W))
@@ -278,9 +280,11 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
   val icol_padded = icol +& undilated(lpad).zext
   val is_zeros = irow < 0.S || irow >= irows_unpadded.zext || icol < 0.S || icol >= icols_unpadded.zext
 
-  val dram_stride = Mux(req.trans_input_3120, batch_size * (input_w/8).U, in_stride * (input_w/8).U)
+  //val dram_stride = Mux(req.trans_input_3120, batch_size * (input_w/8).U, in_stride * (input_w/8).U)
+  val dram_stride = in_stride * (input_w/8).U
 
   // Addresses
+  /*
   val dram_offset = Mux(req.trans_input_3120, (((ich * in_col_dim * in_row_dim +& irow*in_col_dim +& icol) * batches +& b) * (input_w/8).U).asUInt,
     (((b * in_row_dim * in_col_dim +& irow*in_col_dim +& icol) * in_stride +& ich) * (input_w/8).U).asUInt)
   val dram_addr = Mux(is_zeros, 0.U, req.dram_addr + LoopConv.castDramOffset(dram_offset))
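Reviewer note (not part of the patch): with `trans_input_3120` effectively disabled, the input loader walks a single batch in NHWC order with a fixed row stride. A plain-Scala model of the new `dram_offset` introduced in the next hunk (names are mine; `inStride` is in elements, as in the hardware):

    def inputDramOffset(irow: Long, icol: Long, ich: Long,
                        inColDim: Long, inStride: Long, elemBytes: Long): Long =
      ((irow * inColDim + icol) * inStride + ich) * elemBytes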
@@ -288,6 +292,11 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
     // To prevent Verilator errors, we replace some "/ block_size.U" calls here with ">> log2Up(block_size)"
     req.addr_start.zext +& (b >> log2Up(block_size)) * input_spad_stride +& ich * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample),
     req.addr_start.zext +& (ich >> log2Up(block_size)) * input_spad_stride +& b * (irows >> req.downsample) * (icols >> req.downsample) +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample))
+  */
+
+  val dram_offset = (((irow*in_col_dim +& icol) * in_stride +& ich) * (input_w/8).U).asUInt
+  val dram_addr = Mux(is_zeros, 0.U, req.dram_addr + LoopConv.castDramOffset(dram_offset))
+  val spad_addr = req.addr_start.zext +& (ich >> log2Up(block_size)) * input_spad_stride +& (irow_padded >> req.downsample) * (icols >> req.downsample) +& (icol_padded >> req.downsample)
 
   // Sizes
   val block_size_downsampled = (block_size.U << req.downsample).asUInt.zext
@@ -299,10 +308,13 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
       (icol >= icols_unpadded.zext) -> Mux(icols_unpadded.zext +& undilated(rpad).zext -& icol > block_size.S, block_size.S, icols_unpadded.zext +& undilated(rpad).zext -& icol)
     )
   )
+  /*
   val K = Mux(req.trans_input_3120,
     Mux(batches.zext -& b > max_chs_per_mvin, max_chs_per_mvin, batches.zext -& b),
     Mux(ichs.zext -& ich > max_chs_per_mvin, max_chs_per_mvin, ichs.zext -& ich))
-
+*/
+  val K = Mux(ichs.zext -& ich > max_chs_per_mvin, max_chs_per_mvin, ichs.zext -& ich)
+
   class RoCCCommandWithAddr extends Bundle {
     val cmd = new RoCCCommand
     val dram_addr = UInt()
@@ -367,7 +379,7 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw
     when (state === config) {
       state := ld
     }.otherwise {
-      val b_it = Mux(req.trans_input_3120, max_chs_per_mvin.asUInt, 1.U)
+      //val b_it = Mux(req.trans_input_3120, max_chs_per_mvin.asUInt, 1.U)
       val ich_it = Mux(req.trans_input_3120, 1.U, max_chs_per_mvin.asUInt)
 
       val next_ich = sFloorAdd(ich, ich_it, ichs.zext, 0.S)
@@ -375,15 +387,15 @@
         next_ich === 0.S)
       val next_irow = sFloorAdd(irow, 1.U << req.downsample, (irows_unpadded +& undilated(dpad)).zext, 0.S-&undilated(upad).zext,
         next_icol === 0.S-&undilated(lpad).zext && next_ich === 0.S)
-      val next_b = sFloorAdd(b, b_it, batches.zext, 0.S,
-        next_irow === 0.S-&undilated(upad).zext && next_icol === 0.S-&undilated(lpad).zext && next_ich === 0.S)
+      //val next_b = sFloorAdd(b, b_it, batches.zext, 0.S,
+      //  next_irow === 0.S-&undilated(upad).zext && next_icol === 0.S-&undilated(lpad).zext && next_ich === 0.S)
 
       ich := next_ich
       icol := next_icol
       irow := next_irow
-      b := next_b
+      //b := next_b
 
-      state := Mux(next_b === 0.S && next_irow === 0.S-&undilated(upad).zext && next_icol === 0.S-&undilated(lpad).zext && next_ich === 0.S,
+      state := Mux(next_irow === 0.S-&undilated(upad).zext && next_icol === 0.S-&undilated(lpad).zext && next_ich === 0.S,
         idle, ld)
     }
   }
@@ -392,7 +404,6 @@
   when (io.req.fire) {
     req := io.req.bits
     state := config
-    b := 0.S
     irow := 0.S -& ((io.req.bits.inner_bounds.upad +& io.req.bits.input_dilated) >> io.req.bits.input_dilated).zext
     icol := 0.S -& ((io.req.bits.inner_bounds.lpad +& io.req.bits.input_dilated) >> io.req.bits.input_dilated).zext
     ich := 0.S
@@ -446,15 +457,19 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit
     Mux(req.trans_weight_0132, max_kchs_per_mvin, max_ochs_per_mvin)
   }
 
-  val B_rows = Mux(req.trans_weight_0132, in_channels_per_bank * kcols * krows * ochs,
-    out_channels_per_bank * kcols * krows * kchs)
+  //val B_rows = Mux(req.trans_weight_0132, in_channels_per_bank * kcols * krows * ochs,
+  //  out_channels_per_bank * kcols * krows * kchs)
+  val B_rows = out_channels_per_bank * kernel_dim * kernel_dim * kchs
 
   val addr_start = req.addr_end - B_rows
 
+  /*
   val dram_stride = MuxCase(weight_stride, Seq(
     req.dw -> 1.U,
     req.trans_weight_1203 -> (kernel_dim * kernel_dim * out_channels),
     req.trans_weight_0132 -> in_channels
   )) * (input_w/8).U
+  */
+  val dram_stride = (Mux(req.dw, 1.U, weight_stride)) * (input_w/8).U
 
   // Iterators
   val och = Reg(UInt(large_iterator_bitwidth.W))
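Reviewer note (not part of the patch): the weights now always occupy `out_channels_per_bank * kernel_dim * kernel_dim * kchs` rows at the tail of the B scratchpad region, with no transposed variants. A sketch of that footprint in plain Scala (the ceiling division mirrors `out_channels_per_bank`):

    def weightRows(ochs: Int, kernelDim: Int, kchs: Int, blockSize: Int): Int = {
      val ochsPerBank = (ochs + blockSize - 1) / blockSize  // ceil(ochs / blockSize)
      ochsPerBank * kernelDim * kernelDim * kchs
    }
    // addr_start = addr_end - weightRows(ochs, kernelDim, kchs, blockSize)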
@@ -463,25 +478,36 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit
   val kch = Reg(UInt(large_iterator_bitwidth.W))
 
   // Addresses
+  /*
   val dram_offset = MuxCase(((krow*kernel_dim*in_channels +& kcol*in_channels +& kch) * weight_stride +& och) * (input_w/8).U, Seq(
     req.dw -> (krow * kernel_dim +& kcol) * (input_w/8).U,
     req.trans_weight_1203 -> (((kch*kernel_dim*kernel_dim +& krow*kernel_dim +& kcol) * out_channels +& och) * (input_w/8).U),
     req.trans_weight_0132 -> (((krow*kernel_dim*out_channels +& kcol*out_channels +& och) * in_channels +& kch) * (input_w/8).U)
   ))
+  */
+  val dram_offset = Mux(req.dw, (krow * kernel_dim +& kcol) * (input_w/8).U, ((krow*kernel_dim*in_channels +& kcol*in_channels +& kch) * weight_stride +& och) * (input_w/8).U)
+
   val dram_addr = req.dram_addr + LoopConv.castDramOffset(dram_offset)
+  /*
   val spad_addr = Mux(req.trans_weight_0132,
     // The width expansions are added here solely to prevent Verilator's "WIDTH" warnings, despite making the code uglier
     addr_start + (kch / block_size.U(kch.getWidth.W)) * krows * kcols * ochs + krow * kcols * ochs + kcol * ochs + och,
     addr_start + (och / block_size.U(och.getWidth.W)) * krows * kcols * kchs + krow * kcols * kchs + kcol * kchs + kch)
+  */
+  val spad_addr = addr_start + (och / block_size.U(och.getWidth.W)) * kernel_dim * kernel_dim * kchs + krow * kernel_dim * kchs + kcol * kchs + kch
 
   // Sizes
+  /*
   val J = Mux(req.trans_weight_0132,
     Mux(kchs - kch > max_chs_per_mvin, max_chs_per_mvin, kchs - kch),
     Mux(ochs - och > max_chs_per_mvin, max_chs_per_mvin, ochs - och))
   val K = Mux(req.trans_weight_0132,
     Mux(ochs - och > block_size.U, block_size.U, ochs - och),
     Mux(kchs - kch > block_size.U, block_size.U, kchs - kch))
+  */
+  val J = Mux(ochs - och > max_chs_per_mvin, max_chs_per_mvin, ochs - och)
+  val K = Mux(kchs - kch > block_size.U, block_size.U, kchs - kch)
 
   class RoCCCommandWithAddr extends Bundle {
     val cmd = new RoCCCommand
@@ -548,12 +574,14 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit
     when (state === config) {
       state := ld
     }.otherwise {
-      val och_it = Mux(req.trans_weight_0132, block_size.U, max_chs_per_mvin)
-      val kch_it = Mux(req.trans_weight_0132, max_chs_per_mvin, block_size.U)
+      //val och_it = Mux(req.trans_weight_0132, block_size.U, max_chs_per_mvin)
+      //val kch_it = Mux(req.trans_weight_0132, max_chs_per_mvin, block_size.U)
+      val och_it = max_chs_per_mvin
+      val kch_it = block_size.U
 
       val next_kch = floorAdd(kch, kch_it, kchs)
-      val next_kcol = floorAdd(kcol, 1.U, kcols, next_kch === 0.U)
-      val next_krow = floorAdd(krow, 1.U, krows, next_kcol === 0.U && next_kch === 0.U)
+      val next_kcol = floorAdd(kcol, 1.U, kernel_dim, next_kch === 0.U)
+      val next_krow = floorAdd(krow, 1.U, kernel_dim, next_kcol === 0.U && next_kch === 0.U)
       val next_och = floorAdd(och, och_it, ochs, next_krow === 0.U && next_kcol === 0.U && next_kch === 0.U)
 
       kch := next_kch
@@ -561,7 +589,7 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit
       krow := next_krow
       och := next_och
 
-      state := Mux(next_och === 0.U && next_krow === 0.U && next_kcol === 0.U && next_kch === 0.U,
+      state := Mux(next_och === 0.U && next_kch === 0.U && next_krow === 0.U && next_kcol === 0.U,
         idle, ld)
     }
   }
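Reviewer note (not part of the patch): dropping the `trans_weight_0132` path leaves a single och-major weight layout in the scratchpad. A plain-Scala model of the simplified `spad_addr` above (names are mine):

    def weightSpadAddr(addrStart: Int, och: Int, krow: Int, kcol: Int, kch: Int,
                       blockSize: Int, kernelDim: Int, kchs: Int): Int =
      addrStart + (och / blockSize) * kernelDim * kernelDim * kchs +
        krow * kernelDim * kchs + kcol * kchs + kch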
@@ -590,6 +618,7 @@ class LoopConvExecuteReq(val large_iterator_bitwidth: Int, val small_iterator_bi
   val input_dilated = Bool()
   val trans_weight_0132 = Bool()
   val trans_input_3120 = Bool()
+  val accumulate = Bool()
   val loop_id = UInt(log2Up(concurrent_loops).W)
 }
 
@@ -626,8 +655,9 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
   def undilated(x: UInt): UInt = (x +& req.input_dilated) >> req.input_dilated
 
   // Derived parameters
-  val B_rows = Mux(req.trans_weight_0132, in_channels_per_bank * kcols * krows * ochs,
-    out_channels_per_bank * kcols * krows * kchs)
+  //val B_rows = Mux(req.trans_weight_0132, in_channels_per_bank * kcols * krows * ochs,
+  //  out_channels_per_bank * kcols * krows * kchs)
+  val B_rows = out_channels_per_bank * kernel_dim * kernel_dim * kchs
 
   val a_addr_start = req.a_addr_start
   val b_addr_start = req.b_addr_end - B_rows
@@ -638,45 +668,57 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
   val krow = Reg(UInt(tiny_iterator_bitwidth.W))
   val kcol = Reg(UInt(tiny_iterator_bitwidth.W))
   val kch = Reg(UInt(large_iterator_bitwidth.W))
-  val b = Reg(UInt(large_iterator_bitwidth.W))
+  //val b = Reg(UInt(large_iterator_bitwidth.W))
   val orow = Reg(UInt(small_iterator_bitwidth.W))
   val ocol = Reg(UInt(small_iterator_bitwidth.W))
 
   // TODO kernel-dilation and input-dilation can never be activated at the same time, so we can optimize out some multiplications by kernel_dilation
   val skip_iteration = state >= pre && req.input_dilated && (((krow * kernel_dilation +& orow -& upad)(0) & req.input_dilated).asBool ||
     ((kcol * kernel_dilation +& ocol -& lpad)(0) & req.input_dilated).asBool)
+  //val skip_iteration = state >= pre && req.input_dilated && (((orow -& upad)(0) & req.input_dilated).asBool ||
+  //  ((ocol -& lpad)(0) & req.input_dilated).asBool)
 
-  val pixels = Mux(kcols - kcol > req.max_pixels_per_row, req.max_pixels_per_row, kcols - kcol)
+  val pixels = Mux(kernel_dim - kcol > req.max_pixels_per_row, req.max_pixels_per_row, kernel_dim - kcol)
+  //val pixels = Mux(kernel_dim > req.max_pixels_per_row, req.max_pixels_per_row, kernel_dim)
 
   val irow = undilated(orow * stride +& krow * kernel_dilation)
   val icol = undilated(ocol * stride +& kcol * kernel_dilation)
 
+  /*
   val I = Mux(req.trans_input_3120, Mux(batches - b > block_size.U, block_size.U, batches - b),
     undilated(Mux(ocols - ocol > (block_size.U << req.input_dilated).asUInt, (block_size.U << req.input_dilated).asUInt, ocols - ocol)))
+  */
+  val I = undilated(Mux(ocols - ocol > (block_size.U << req.input_dilated).asUInt, (block_size.U << req.input_dilated).asUInt, ocols - ocol))
   val J = Mux(ochs - och > block_size.U, block_size.U, ochs - och)
   val K = pixels * Mux(kchs - kch > block_size.U, block_size.U, kchs - kch)
 
   // Addresses
+  /*
   val a_addr = Mux(req.trans_input_3120,
     a_addr_start +& (b / block_size.U) * input_spad_stride +& kch * (irows >> req.downsample) * (icols >> req.downsample) +& (irow >> req.downsample) * (icols >> req.downsample) +& (icol >> req.downsample),
     a_addr_start +& (kch / block_size.U(kch.getWidth.W)) * input_spad_stride +& b * (irows >> req.downsample) * (icols >> req.downsample) +& (irow >> req.downsample) * (icols >> req.downsample) +& (icol >> req.downsample))
 
-  // val c_addr = Mux(ex_overwrite && krow === 0.U && kcol === 0.U && kch === 0.U, d_addr_start, c_addr_start) +&
-  //   (och / block_size.U) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol
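Reviewer note (not part of the patch): with the batch path gone, the execute unit's tile sizes reduce to simple clamps against the systolic-array dimension. A model in plain Scala, ignoring the input-dilation shift that the hardware applies through `undilated`:

    def clamp(rem: Int, blockSize: Int): Int = math.min(rem, blockSize)
    // I = clamp(ocols - ocol, blockSize); J = clamp(ochs - och, blockSize)
    // K = pixels * clamp(kchs - kch, blockSize)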
+  */
+  val a_addr = a_addr_start +& (kch / block_size.U(kch.getWidth.W)) * input_spad_stride +& (irow >> req.downsample) * (icols >> req.downsample) +& (icol >> req.downsample)
 
   // The width expansions are added here solely to prevent Verilator's "WIDTH" warnings, despite making the code uglier
+  //val c_addr = c_addr_start +&
+  //  (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol
   val c_addr = c_addr_start +&
-    (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol
+    (och / block_size.U(och.getWidth.W)) * orows * ocols +& orow * ocols +& ocol
 
   // val new_weights = b === 0.U && orow === 0.U && ocol === 0.U
   val new_weights = Reg(Bool())
 
-  val krow_rot = Mux(req.wrot180, krows - krow - 1.U, krow)
-  val kcol_rot = Mux(req.wrot180, kcols - kcol - 1.U, kcol)
+  // these will be krow/kcol (which is 0)
+  val krow_rot = Mux(req.wrot180, kernel_dim - krow - 1.U, krow)
+  val kcol_rot = Mux(req.wrot180, kernel_dim - kcol - 1.U, kcol)
 
+  /*
   val b_addr = Mux(req.trans_weight_0132,
     b_addr_start +& (kch / block_size.U(och.getWidth.W)) * krows * kcols * ochs +& krow_rot * kcols * ochs +& kcol_rot * ochs +& och,
     b_addr_start +& (och / block_size.U(och.getWidth.W)) * krows * kcols * kchs +& krow_rot * kcols * kchs +& kcol_rot * kchs +& kch)
+  */
+  val b_addr = b_addr_start +& (och / block_size.U(och.getWidth.W)) * kernel_dim * kernel_dim * kchs +& krow_rot * kernel_dim * kchs +& kcol_rot * kchs +& kch
 
   class RoCCCommandWithAddr extends Bundle {
     val cmd = new RoCCCommand
     val dram_addr = UInt()
@@ -687,6 +729,7 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
     val J = UInt()
     val K = UInt()
     val new_weights = Bool()
+    val k_start = Bool()
   }
 
   val command_p = Module(new Pipeline[RoCCCommandWithAddr](new RoCCCommandWithAddr, latency)())
@@ -736,10 +779,12 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
   command_p.io.in.bits.J := J
   command_p.io.in.bits.K := K
   command_p.io.in.bits.new_weights := new_weights
+  command_p.io.in.bits.k_start := kch === 0.U && kcol === 0.U && krow === 0.U
 
   command_p.io.out.ready := io.cmd.ready && !io.rob_overloaded
   io.cmd.valid := command_p.io.out.valid && !io.rob_overloaded
   io.cmd.bits := command_p.io.out.bits.cmd
+  val k_start = command_p.io.out.bits.k_start
 
   when (command_p.io.out.bits.cmd.inst.funct === PRELOAD_CMD) {
     val o = command_p.io.out.bits
@@ -754,7 +799,8 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
     pre_cmd_rs2 := DontCare
     pre_cmd_rs2.num_rows := o.I.asUInt
     pre_cmd_rs2.num_cols := o.J.asUInt
-    pre_cmd_rs2.local_addr := cast_to_acc_addr(pre_cmd_rs2.local_addr, o.c_addr, accumulate = true.B, read_full = false.B)
+    pre_cmd_rs2.local_addr := cast_to_acc_addr(pre_cmd_rs2.local_addr, o.c_addr, accumulate = req.accumulate || !k_start, read_full = false.B)
+    //pre_cmd_rs2.local_addr := cast_to_acc_addr(pre_cmd_rs2.local_addr, o.c_addr, accumulate = true.B, read_full = false.B)
 
     io.cmd.bits.rs1 := pre_cmd_rs1.asUInt
     io.cmd.bits.rs2 := pre_cmd_rs2.asUInt
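Reviewer note (not part of the patch): this is the functional core of the new `accumulate` flag. The first kernel position (krow = kcol = kch = 0) overwrites the accumulator unless software requested accumulation; every later kernel position accumulates partial sums on top. In plain Scala:

    def preloadAccumulates(reqAccumulate: Boolean,
                           krow: Int, kcol: Int, kch: Int): Boolean = {
      val kStart = krow == 0 && kcol == 0 && kch == 0
      reqAccumulate || !kStart  // mirrors `req.accumulate || !k_start`
    }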
@@ -788,45 +834,45 @@ class LoopConvExecute(block_size: Int, large_iterator_bitwidth: Int, small_itera
     }.elsewhen (state === pre) {
       state := comp
     }.otherwise {
-      val b_it = Mux(req.trans_input_3120, block_size.U, 1.U)
-      val ocol_it = Mux(skip_iteration || req.trans_input_3120, 1.U, block_size.U << req.input_dilated).asUInt
+      //val b_it = Mux(req.trans_input_3120, block_size.U, 1.U)
+      //val ocol_it = Mux(skip_iteration || req.trans_input_3120, 1.U, block_size.U << req.input_dilated).asUInt
+      val ocol_it = Mux(skip_iteration, 1.U, block_size.U << req.input_dilated).asUInt
 
       val next_ocol = floorAdd(ocol, ocol_it, ocols)
       val next_orow = floorAdd(orow, 1.U, orows, next_ocol === 0.U)
-      val next_b = floorAdd(b, b_it, batches, next_orow === 0.U && next_ocol === 0.U)
+      //val next_b = floorAdd(b, b_it, batches, next_orow === 0.U && next_ocol === 0.U)
       val next_kch = floorAdd(kch, block_size.U, kchs,
-        next_b === 0.U && next_orow === 0.U && next_ocol === 0.U)
-      val next_kcol = floorAdd(kcol, req.max_pixels_per_row, kcols,
-        next_kch === 0.U && next_b === 0.U && next_orow === 0.U && next_ocol === 0.U)
-      val next_krow = floorAdd(krow, 1.U, krows,
-        next_kcol === 0.U && next_kch === 0.U && next_b === 0.U && next_orow === 0.U && next_ocol === 0.U)
+        next_orow === 0.U && next_ocol === 0.U)
+      val next_kcol = floorAdd(kcol, req.max_pixels_per_row, kernel_dim,
+        next_kch === 0.U && next_orow === 0.U && next_ocol === 0.U)
+      val next_krow = floorAdd(krow, 1.U, kernel_dim,
+        next_kcol === 0.U && next_kch === 0.U && next_orow === 0.U && next_ocol === 0.U)
       val next_och = floorAdd(och, block_size.U, ochs, next_krow === 0.U &&
-        next_kcol === 0.U && next_kch === 0.U && next_b === 0.U && next_orow === 0.U && next_ocol === 0.U)
+        next_kcol === 0.U && next_kch === 0.U && next_orow === 0.U && next_ocol === 0.U)
 
       ocol := next_ocol
       orow := next_orow
-      b := next_b
+      //b := next_b
      kch := next_kch
       kcol := next_kcol
       krow := next_krow
       och := next_och
 
-      when (next_b === 0.U && next_orow === 0.U && next_ocol === 0.U) {
+      when (next_orow === 0.U && next_ocol === 0.U) {
        new_weights := true.B
       }
 
-      state := Mux(next_och === 0.U && next_krow === 0.U && next_kcol === 0.U && next_kch === 0.U && next_b === 0.U &&
-        next_orow === 0.U && next_ocol === 0.U,
-        idle, pre)
+      state := Mux(next_och === 0.U && next_kch === 0.U && next_orow === 0.U && next_ocol === 0.U && next_krow === 0.U && next_kcol === 0.U, idle, pre)
     }
   }
 
   // Accepting requests
   when (io.req.fire) {
     req := io.req.bits
-    state := Mux(io.req.bits.trans_input_3120, config, pre)
+    //state := Mux(io.req.bits.trans_input_3120, config, pre)
+    state := pre
 
-    b := 0.U
+    //b := 0.U
     orow := 0.U
     ocol := 0.U
     och := 0.U
@@ -882,20 +928,26 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
   val skip = req.dram_addr === 0.U
 
   // Iterators
-  val b = Reg(UInt(large_iterator_bitwidth.W))
+  //val b = Reg(UInt(large_iterator_bitwidth.W))
   val orow = Reg(UInt(small_iterator_bitwidth.W))
   val ocol = Reg(UInt(small_iterator_bitwidth.W))
   val och = Reg(UInt(large_iterator_bitwidth.W))
 
   // Addresses
+  /*
   val dram_offset = Mux(req.trans_output_1203,
     ((orow*out_col_dim*batch_size +& ocol*batch_size +& b) * out_channels +& och) * (input_w/8).U,
     ((b*out_row_dim*out_col_dim +& orow*out_col_dim +& ocol) * out_stride +& och) * (input_w/8).U)
+  */
+  val dram_offset = ((orow*out_col_dim +& ocol) * out_stride +& och) * (input_w/8).U
   val dram_addr = req.dram_addr + LoopConv.castDramOffset(dram_offset)
-  val spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol
+  //val spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol
+  val spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * orows * ocols +& orow * ocols +& ocol
 
-  val pool_dram_addr = req.dram_addr + ((b * pool_out_col_dim * pool_out_row_dim) * out_stride + och) * (input_w/8).U
-  val pool_spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols
+  val pool_dram_addr = req.dram_addr + och * (input_w/8).U
+  val pool_spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * orows * ocols
+  //val pool_dram_addr = req.dram_addr + ((b * pool_out_col_dim * pool_out_row_dim) * out_stride + och) * (input_w/8).U
+  //val pool_spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols
 
   // Sizes
   val I = Mux(ocols - ocol > block_size.U, block_size.U, ocols - ocol)
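Reviewer note (not part of the patch): the store unit mirrors the input loader — one batch, NHWC layout, fixed `out_stride`. A plain-Scala model of the new `dram_offset` above (names are mine):

    def outputDramOffset(orow: Long, ocol: Long, och: Long,
                         outColDim: Long, outStride: Long, elemBytes: Long): Long =
      ((orow * outColDim + ocol) * outStride + och) * elemBytes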
@@ -1021,14 +1073,14 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
     val next_och = floorAdd(och, block_size.U, ochs)
     val next_ocol = floorAdd(ocol, block_size.U, ocols, next_och === 0.U)
     val next_orow = floorAdd(orow, 1.U, orows, next_ocol === 0.U && next_och === 0.U)
-    val next_b = floorAdd(b, 1.U, batches, next_orow === 0.U && next_ocol === 0.U && next_och === 0.U)
+    //val next_b = floorAdd(b, 1.U, batches, next_orow === 0.U && next_ocol === 0.U && next_och === 0.U)
 
     och := next_och
     ocol := next_ocol
     orow := next_orow
-    b := next_b
+    //b := next_b
 
-    state := Mux(next_b === 0.U && next_orow === 0.U && next_ocol === 0.U && next_och === 0.U,
+    state := Mux(next_orow === 0.U && next_ocol === 0.U && next_och === 0.U,
       idle, st)
   }.elsewhen(state === pre_pool_config) {
     state := pool
@@ -1036,12 +1088,12 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
     state := idle
   }.otherwise {
     val next_och = floorAdd(och, block_size.U, ochs)
-    val next_b = floorAdd(b, 1.U, batches, next_och === 0.U)
+    //val next_b = floorAdd(b, 1.U, batches, next_och === 0.U)
 
     och := next_och
-    b := next_b
+    //b := next_b
 
-    state := Mux(next_b === 0.U && next_och === 0.U,
+    state := Mux(next_och === 0.U,
       post_pool_config, pool)
   }
 }
@@ -1051,7 +1103,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
     req := io.req.bits
     state := Mux(io.req.bits.no_pool, st, pre_pool_config)
 
-    b := 0.U
+    //b := 0.U
     orow := 0.U
     ocol := 0.U
     och := 0.U
@@ -1067,7 +1119,7 @@ class LoopConvState(val block_size: Int, val large_iterator_bitwidth: Int, val s
   val input_dram_addr = UInt(coreMaxAddrBits.W)
   val output_dram_addr = UInt(coreMaxAddrBits.W)
 
-  val no_bias = Bool()
+  val accumulate = Bool()
   val wrot180 = Bool()
   val no_pool = Bool()
   val downsample = Bool()
@@ -1105,15 +1157,17 @@ class LoopConvState(val block_size: Int, val large_iterator_bitwidth: Int, val s
   val b_addr_end = UInt(log2Up(max_addr+1).W)
 
   def derived_params(dummy: Int=0): LoopConvDerivedParams = {
-    import outer_bounds.{stride, kernel_dilation}
-    import inner_bounds.{batches, pochs, orows, ocols, krows, kcols, upad, dpad, lpad, rpad, kchs}
+    import outer_bounds.{stride, kernel_dilation, kernel_dim}
+    import inner_bounds.{pochs, orows, ocols, upad, dpad, lpad, rpad, kchs}
 
     val result = Wire(new LoopConvDerivedParams(large_iterator_bitwidth, small_iterator_bitwidth, tiny_iterator_bitwidth))
 
     result.ochs := pochs
 
-    val dilated_krows = krows + (kernel_dilation - 1.U)*(krows - 1.U)
-    val dilated_kcols = kcols + (kernel_dilation - 1.U)*(kcols - 1.U)
+    //val dilated_krows = kernel_dim + (kernel_dilation - 1.U)*(krows - 1.U)
+    //val dilated_kcols = kernel_dim + (kernel_dilation - 1.U)*(kcols - 1.U)
+    val dilated_krows = kernel_dim + (kernel_dilation - 1.U)*(kernel_dim - 1.U)
+    val dilated_kcols = kernel_dim + (kernel_dilation - 1.U)*(kernel_dim - 1.U)
 
     val irows_without_dilation = orows * stride +& dilated_krows -& 1.U
     val icols_without_dilation = ocols * stride +& dilated_kcols -& 1.U
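Reviewer note (not part of the patch): `dilated_krows`/`dilated_kcols` compute the effective receptive-field size of a dilated kernel; the commented-out lines above mixed `kernel_dim` with the removed `krows`/`kcols`, which is presumably why they were replaced. The formula in plain Scala:

    def dilatedDim(kernelDim: Int, dilation: Int): Int =
      kernelDim + (dilation - 1) * (kernelDim - 1)
    // dilatedDim(3, 1) == 3; dilatedDim(3, 2) == 5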
@@ -1136,11 +1190,15 @@ class LoopConvState(val block_size: Int, val large_iterator_bitwidth: Int, val s
     result.out_channels_per_bank := result.ochs / block_size.U(result.ochs.getWidth.W) +& (result.ochs % block_size.U =/= 0.U)
     result.in_channels_per_bank := result.ichs / block_size.U(result.ochs.getWidth.W) +& (result.ichs % block_size.U =/= 0.U)
 
-    result.bias_spad_stride := batches * orows * ocols
+    result.bias_spad_stride := orows * ocols
+    /*
     result.input_spad_stride := Mux(trans_input_3120,
       result.ichs * (result.irows >> downsample) * (result.icols >> downsample),
       batches * (result.irows >> downsample) * (result.icols >> downsample))
     result.weight_spad_stride := Mux(trans_weight_0132, krows * kcols * pochs, krows * kcols * kchs)
+    */
+    result.input_spad_stride := (result.irows >> downsample) * (result.icols >> downsample)
+    result.weight_spad_stride := kernel_dim * kernel_dim * kchs
 
     // result.ex_overwrite := bias_dram_addr =/= 0.U && no_bias
 
@@ -1174,9 +1232,9 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size:
                 has_training_convs: Boolean, has_max_pool: Boolean, has_first_layer_optimizations: Boolean,
                 has_dw_convs: Boolean)
                (implicit p: Parameters) extends Module {
-  val large_iterator_bitwidth = 16
-  val small_iterator_bitwidth = 16 // 8
-  val tiny_iterator_bitwidth = 16 // 4
+  val large_iterator_bitwidth = 12 //16
+  val small_iterator_bitwidth = 10
+  val tiny_iterator_bitwidth = 4
 
   val max_block_len = (dma_max_bytes / (block_size * (input_w / 8))) max 1
   val max_block_len_acc = (dma_max_bytes / (block_size * (acc_w / 8))) max 1
@@ -1283,7 +1341,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size:
           loop_being_configured.outer_bounds.out_channels := cmd.bits.cmd.rs1(63, 48)
           loop_being_configured.outer_bounds.in_channels := cmd.bits.cmd.rs1(47, 32)
           loop_being_configured.outer_bounds.in_row_dim := cmd.bits.cmd.rs1(31, 16)
-          loop_being_configured.outer_bounds.batch_size := cmd.bits.cmd.rs1(15, 0)
+          //loop_being_configured.outer_bounds.batch_size := cmd.bits.cmd.rs1(15, 0)
 
           loop_being_configured.outer_bounds.padding := cmd.bits.cmd.rs2(63, 56)
           loop_being_configured.outer_bounds.stride := cmd.bits.cmd.rs2(55, 48)
@@ -1299,15 +1357,15 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size:
           loop_being_configured.outer_bounds.pool_stride := (if (!has_max_pool) 1.U else cmd.bits.cmd.rs1(15, 8))
           loop_being_configured.outer_bounds.pool_padding := (if (!has_max_pool) 0.U else cmd.bits.cmd.rs1(7, 0))
 
-          loop_being_configured.inner_bounds.batches := cmd.bits.cmd.rs2(63, 48)
+          //loop_being_configured.inner_bounds.batches := cmd.bits.cmd.rs2(63, 48)
           loop_being_configured.inner_bounds.porows := cmd.bits.cmd.rs2(47, 32)
           loop_being_configured.inner_bounds.pocols := cmd.bits.cmd.rs2(31, 16)
           loop_being_configured.inner_bounds.pochs := cmd.bits.cmd.rs2(15, 0)
         }
 
         is (LOOP_CONV_WS_CONFIG_3) {
-          loop_being_configured.inner_bounds.krows := cmd.bits.cmd.rs1(63, 48)
-          loop_being_configured.inner_bounds.kcols := cmd.bits.cmd.rs1(47, 32)
+          //loop_being_configured.inner_bounds.krows := cmd.bits.cmd.rs1(63, 48)
+          //loop_being_configured.inner_bounds.kcols := cmd.bits.cmd.rs1(47, 32)
          loop_being_configured.inner_bounds.kchs := cmd.bits.cmd.rs1(31, 16)
 
           loop_being_configured.inner_bounds.lpad := cmd.bits.cmd.rs1(15, 0)
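Reviewer note (not part of the patch): the narrowed iterator widths (12/10/4 bits) now bound the layer shapes this unit can iterate over, even though the ISA config fields above still carry 16-bit values. A driver-side sanity check one might add (hypothetical helper, plain Scala):

    def fitsLoopConv(channels: Int, outDim: Int, kernelDim: Int): Boolean =
      channels < (1 << 12) && outDim < (1 << 10) && kernelDim < (1 << 4)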
@@ -1344,7 +1402,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size:
         }
 
         is (LOOP_CONV_WS) {
-          loop_being_configured.no_bias := cmd.bits.cmd.rs1(0)
+          loop_being_configured.accumulate := cmd.bits.cmd.rs1(0)
 
           // TODO we added a default value for max_pixels_per_row just to maintain backwards compatibility. we should deprecate and remove it later
           val config_max_pixels_per_row = cmd.bits.cmd.rs1(15, 8)
@@ -1388,7 +1446,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size:
   ld_bias.io.req.bits.derived_params := loop_requesting_ld_bias.derived_params()
   ld_bias.io.req.bits.addr_start := ld_bias_addr_start
   ld_bias.io.req.bits.dram_addr := loop_requesting_ld_bias.bias_dram_addr
-  ld_bias.io.req.bits.no_bias := loop_requesting_ld_bias.no_bias
+  //ld_bias.io.req.bits.no_bias := loop_requesting_ld_bias.no_bias
   ld_bias.io.req.bits.loop_id := loop_requesting_ld_bias_id
 
   ld_bias.io.req.valid := !loop_requesting_ld_bias.ld_bias_started && loop_requesting_ld_bias.configured
@@ -1457,6 +1515,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size:
   ex.io.req.bits.trans_weight_0132 := loop_requesting_ex.trans_weight_0132
   ex.io.req.bits.trans_input_3120 := loop_requesting_ex.trans_input_3120
   ex.io.req.bits.loop_id := loop_requesting_ex_id
+  ex.io.req.bits.accumulate := loop_requesting_ex.accumulate
 
   ex.io.req.valid := !loop_requesting_ex.ex_started && loop_requesting_ex.ld_bias_started &&
     loop_requesting_ex.ld_input_started && loop_requesting_ex.ld_weights_started && loop_requesting_ex.configured
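Reviewer note (not part of the patch): software-visible change — bit 0 of the LOOP_CONV_WS rs1 operand now means "accumulate into the output" rather than "skip the bias load" (bias skipping is keyed off a null bias pointer instead, per the ld_bias hunks). A hypothetical encoding helper covering just the fields visible in this diff:

    def loopConvWsRs1(accumulate: Boolean, maxPixelsPerRow: Int): BigInt =
      (BigInt(maxPixelsPerRow & 0xff) << 8) | (if (accumulate) BigInt(1) else BigInt(0))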
diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala
index 5e96dea5..82d5a9bd 100644
--- a/src/main/scala/gemmini/LoopMatmul.scala
+++ b/src/main/scala/gemmini/LoopMatmul.scala
@@ -547,7 +547,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   })
 
   object State extends ChiselEnum {
-    val idle, st, ln_config, ln_st = Value
+    val idle, st = Value
   }
   import State._
   val state = RegInit(idle)
@@ -585,57 +585,6 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   mvout_cmd_rs2.local_addr := cast_to_acc_addr(mvout_cmd_rs2.local_addr, sp_addr, accumulate = false.B, read_full = req.full_c)
   mvout_cmd.rs2 := mvout_cmd_rs2.asUInt
 
-  // Layernorm iterators and calculations
-  val ln_row = Reg(UInt(iterator_bitwidth.W))
-  val ln_cmd = Reg(UInt(iterator_bitwidth.W))
-  val ln_stat_id = Reg(UInt(iterator_bitwidth.W))
-
-  val NORM_STAT_IDS = 2 // TODO magic number
-
-  val ln_norm_cmds = VecInit(VecInit(NormCmd.SUM, NormCmd.MEAN), VecInit(NormCmd.VARIANCE, NormCmd.INV_STDDEV),
-    VecInit(NormCmd.RESET, NormCmd.RESET))
-
-  val sm_norm_cmds = VecInit(VecInit(NormCmd.MAX, NormCmd.MAX), VecInit(NormCmd.SUM_EXP, NormCmd.INV_SUM_EXP),
-    VecInit(NormCmd.RESET, NormCmd.RESET))
-
-  val ln_stat_ids = Mux(rows -& ln_row > NORM_STAT_IDS.U, NORM_STAT_IDS.U, rows -& ln_row)
-
-  val ln_r = ln_row +& ln_stat_id
-
-  val ln_sp_addr = acc_addr_start +& (i * req.max_j +& j) * block_size.U +& ln_r
-  val ln_norm_cmd = Mux(j +& max_blocks >= req.max_j,
-    Mux(req.act === Activation.LAYERNORM, ln_norm_cmds(ln_cmd)(1), sm_norm_cmds(ln_cmd)(1)),
-    Mux(req.act === Activation.LAYERNORM, ln_norm_cmds(ln_cmd)(0), sm_norm_cmds(ln_cmd)(0)))
-
-  // TODO we assume for now that full_C and layernorm aren't true at the same
-  val ln_dram_offset = ((i * req.dram_stride +& j) * block_size.U +& ln_r * req.dram_stride) * (input_w/8).U
-  val ln_dram_addr = req.dram_addr + LoopMatmul.castDramOffset(ln_dram_offset)
-
-  val ln_config_norm_rs1 = Wire(new GemminiISA.ConfigNormRs1)
-  ln_config_norm_rs1 := DontCare
-  ln_config_norm_rs1.set_stats_id_only := 1.U
-  ln_config_norm_rs1.cmd_type := CONFIG_NORM
-  ln_config_norm_rs1.norm_stats_id := ln_stat_id
-
-  val ln_config_norm = Wire(new RoCCCommand)
-  ln_config_norm := DontCare
-  ln_config_norm.inst.funct := CONFIG_CMD
-  ln_config_norm.rs1 := ln_config_norm_rs1.asUInt
-  ln_config_norm.rs2 := DontCare
-
-  val ln_mvout_cmd = Wire(new RoCCCommand)
-  ln_mvout_cmd := DontCare
-  ln_mvout_cmd.inst.funct := STORE_CMD
-  ln_mvout_cmd.rs1 := ln_dram_addr
-
-  val ln_mvout_cmd_rs2 = Wire(mvout_rs2_t.cloneType)
-  ln_mvout_cmd_rs2 := DontCare
-  ln_mvout_cmd_rs2.num_rows := 1.U
-  ln_mvout_cmd_rs2.num_cols := cols.asUInt
-  ln_mvout_cmd_rs2.local_addr := cast_to_acc_addr(ln_mvout_cmd_rs2.local_addr, ln_sp_addr, accumulate = false.B, read_full = req.full_c)
-  ln_mvout_cmd_rs2.local_addr.norm_cmd := ln_norm_cmd
-  ln_mvout_cmd.rs2 := ln_mvout_cmd_rs2.asUInt
-
   io.req.ready := state === idle
   io.j := j
   io.i := i
@@ -652,11 +601,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
   }
 
   io.cmd.valid := state =/= idle && !io.rob_overloaded && ex_ahead && req.dram_addr =/= 0.U
-  io.cmd.bits := MuxCase(mvout_cmd, Seq(
-    (state === ln_config) -> ln_config_norm,
-    (state === ln_st) -> ln_mvout_cmd,
-  ))
-
+  io.cmd.bits := mvout_cmd
   io.loop_id := req.loop_id
 
   when (req.dram_addr === 0.U) {
@@ -672,38 +617,14 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
     when (next_i === 0.U && next_j === 0.U) {
       state := idle
     }
-  }.elsewhen (io.cmd.fire() && state === ln_config) {
-    state := ln_st
-  }.elsewhen (io.cmd.fire() && state === ln_st) {
-    val next_j = floorAdd(j, max_blocks, req.max_j)
-    val next_stat_id = floorAdd(ln_stat_id, 1.U, ln_stat_ids, next_j === 0.U)
-    val next_cmd = floorAdd(ln_cmd, 1.U, ln_norm_cmds.size.U, next_j === 0.U && next_stat_id === 0.U)
-    val next_row = floorAdd(ln_row, NORM_STAT_IDS.U, rows, next_j === 0.U && next_stat_id === 0.U && next_cmd === 0.U)
-    val next_i = floorAdd(i, 1.U, req.max_i,
-      next_j === 0.U && next_stat_id === 0.U && next_cmd === 0.U && next_row === 0.U)
-
-    j := next_j
-    ln_stat_id := next_stat_id
-    ln_cmd := next_cmd
-    ln_row := next_row
-    i := next_i
-
-    when (next_i === 0.U && next_row === 0.U && next_cmd === 0.U && next_stat_id === 0.U && next_j === 0.U) {
-      state := idle
-    }.elsewhen (next_j === 0.U) {
-      state := ln_config
-    }
   }
 
   when (io.req.fire) {
     req := io.req.bits
-    state := Mux((io.req.bits.act === Activation.LAYERNORM) || (io.req.bits.act === Activation.SOFTMAX), ln_config, st)
+    state := st // Mux((io.req.bits.act === Activation.LAYERNORM) || (io.req.bits.act === Activation.SOFTMAX), ln_config, st)
 
     j := 0.U
     i := 0.U
-
-    ln_row := 0.U
-    ln_cmd := 0.U
-    ln_stat_id := 0.U
   }
 }
 
@@ -787,7 +708,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size
                  mvin_rs2_t: MvinRs2, preload_rs1_t: PreloadRs, preload_rs2_t: PreloadRs,
                  compute_rs1_t: ComputeRs, compute_rs2_t: ComputeRs, mvout_rs2_t: MvoutRs2)
                 (implicit p: Parameters) extends Module {
-  val iterator_bitwidth = 16
+  val iterator_bitwidth = 10 //16
 
   val max_block_len = (dma_max_bytes / (block_size * input_w / 8)) max 1
   val max_block_len_acc = (dma_max_bytes / (block_size * acc_w / 8)) max 1
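Reviewer note (not part of the patch): with the ln_* states removed, LoopMatmulStC is a plain two-state (idle/st) walk over the output tiles, j fastest, then i; layernorm/softmax mvouts would have to be sequenced some other way. The visiting order it now produces, modeled in plain Scala:

    def storeOrder(maxI: Int, maxJ: Int): Seq[(Int, Int)] =
      for { i <- 0 until maxI; j <- 0 until maxJ } yield (i, j)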
@@ -916,18 +837,19 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size
   val loops_configured = RegInit(0.U(16.W))
   dontTouch(loops_configured)
 
+  val config_bitwidth = 16
   // Create config registers
   when(cmd.valid && is_loop_cmd && !loop_being_configured.configured) {
     switch (cmd.bits.cmd.inst.funct) {
       is (LOOP_WS_CONFIG_BOUNDS) {
-        loop_being_configured.max_k := cmd.bits.cmd.rs2(iterator_bitwidth * 3 - 1, iterator_bitwidth * 2)
-        loop_being_configured.max_j := cmd.bits.cmd.rs2(iterator_bitwidth * 2 - 1, iterator_bitwidth)
-        loop_being_configured.max_i := cmd.bits.cmd.rs2(iterator_bitwidth-1, 0)
+        loop_being_configured.max_k := cmd.bits.cmd.rs2(config_bitwidth * 3 - 1, config_bitwidth * 2)
+        loop_being_configured.max_j := cmd.bits.cmd.rs2(config_bitwidth * 2 - 1, config_bitwidth)
+        loop_being_configured.max_i := cmd.bits.cmd.rs2(config_bitwidth - 1, 0)
 
-        loop_being_configured.pad_k := cmd.bits.cmd.rs1(iterator_bitwidth * 3 - 1, iterator_bitwidth * 2)
-        loop_being_configured.pad_j := cmd.bits.cmd.rs1(iterator_bitwidth * 2 - 1, iterator_bitwidth)
-        loop_being_configured.pad_i := cmd.bits.cmd.rs1(iterator_bitwidth-1, 0)
+        loop_being_configured.pad_k := cmd.bits.cmd.rs1(config_bitwidth * 3 - 1, config_bitwidth * 2)
+        loop_being_configured.pad_j := cmd.bits.cmd.rs1(config_bitwidth * 2 - 1, config_bitwidth)
+        loop_being_configured.pad_i := cmd.bits.cmd.rs1(config_bitwidth - 1, 0)
       }
 
       is (LOOP_WS_CONFIG_ADDRS_AB) {
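Reviewer note (not part of the patch): `iterator_bitwidth` drops to 10, but the LOOP_WS_CONFIG_BOUNDS operand layout keeps fixed 16-bit fields — hence the new `config_bitwidth` constant, used above in place of the raw literal 16s the patch originally carried. A hypothetical driver-side packing helper matching that layout:

    def packBounds(maxK: Int, maxJ: Int, maxI: Int): BigInt =
      (BigInt(maxK & 0xffff) << 32) | (BigInt(maxJ & 0xffff) << 16) | BigInt(maxI & 0xffff)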