@@ -39,12 +39,12 @@ case class BatchParam(config: GatewayConfig, bundles: Seq[DifftestBundle]) {
39
39
val StepGroupSize = bundles.distinctBy(_.desiredCppName).length
40
40
val StepDataByteLen = bundles.map(_.getByteAlignWidth).map { w => w / 8 }.sum
41
41
val StepDataBitLen = StepDataByteLen * 8
42
- val StepInfoByteLen = StepGroupSize * (infoWidth / 8 )
42
+ val StepInfoByteLen = ( StepGroupSize + 1 ) * (infoWidth / 8 ) // Include BatchStep to update buffer index
43
43
val StepInfoBitLen = StepInfoByteLen * 8
44
44
45
45
// Bit widths of the statistics counters for data byte length and info size
46
46
val StatsDataWidth = log2Ceil(math.max(MaxDataByteLen , StepDataByteLen ))
47
- val StatsInfoWidth = log2Ceil(math.max(MaxInfoSize , StepGroupSize ))
47
+ val StatsInfoWidth = log2Ceil(math.max(MaxInfoSize , StepGroupSize + 1 ))
48
48
49
49
// Truncate width when shifting to reduce useless gates
50
50
val TruncDataBitLen = math.min(MaxDataBitLen , StepDataBitLen )
@@ -123,7 +123,7 @@ class BatchCollector(bundles: Seq[Valid[DifftestBundle]], param: BatchParam) ext
123
123
val step_enable = IO (Output (Bool ()))
124
124
125
125
val sorted =
126
- in.groupBy(_.bits.desiredCppName).values.toSeq.sortBy(gens => gens .length * gens .head.bits.getByteAlignWidth)
126
+ in.groupBy(_.bits.desiredCppName).values.toSeq.sortBy(gen => gen .length * gen .head.bits.getByteAlignWidth).reverse
127
127
// Stage 1: concat bundles with same desiredCppName
128
128
val group_bitlen = sorted.map(_.head.bits.getByteAlignWidth)
129
129
val group_length = sorted.map(_.length)
@@ -175,23 +175,33 @@ class BatchCollector(bundles: Seq[Valid[DifftestBundle]], param: BatchParam) ext
175
175
val info_num = delay_group_status.last.info_size
176
176
step_enable := info_num =/= 0 .U
177
177
step_status := delay_group_status
178
+ // append BatchStep to last step_status
179
+ step_status.last.info_size := delay_group_status.last.info_size + 1 .U
180
+ // Use BatchStep to update index of software buffer
181
+ val BatchStep = Wire (new BatchInfo )
182
+ BatchStep .id := Batch .getTemplate.length.U
183
+ BatchStep .num := info_num // unused, only for debugging
184
+ // Collect from the tail: collect(i) includes the last i + 1 groups (reversed indices 0 to i)
185
+ val toCollect_data = delay_group_data.reverse
186
+ val toCollect_info = delay_group_info.reverse
187
+ val toCollect_vsize = delay_group_vsize.reverse
178
188
val collect_data = Wire (MixedVec (Seq .tabulate(param.StepGroupSize ) { idx =>
179
- UInt (delay_group_data .take(idx + 1 ).map(_.getWidth).sum.W )
189
+ UInt (toCollect_data .take(idx + 1 ).map(_.getWidth).sum.W )
180
190
}))
181
191
val collect_info = Wire (MixedVec (Seq .tabulate(param.StepGroupSize ) { idx =>
182
- UInt (((idx + 1 ) * param.infoWidth).W )
192
+ UInt (((idx + 2 ) * param.infoWidth).W )
183
193
}))
184
- // Collect from head, collect(i) include 0~i
185
- collect_data(0 ) := delay_group_data (0 )
186
- collect_info(0 ) := delay_group_info( 0 )
194
+
195
+ collect_data(0 ) := toCollect_data (0 )
196
+ collect_info(0 ) := Mux (toCollect_vsize( 0 ) =/= 0 . U , Cat ( BatchStep .asUInt, toCollect_info( 0 )), BatchStep .asUInt )
187
197
(1 until param.StepGroupSize ).foreach { idx =>
188
- val cat_map = Seq .tabulate(group_length(idx) + 1 ) { len =>
189
- (len.U , Cat (collect_data(idx - 1 ), delay_group_data (idx)(len * group_bitlen(idx) - 1 , 0 )))
198
+ val cat_map = Seq .tabulate(group_length.reverse (idx) + 1 ) { len =>
199
+ (len.U , Cat (collect_data(idx - 1 ), toCollect_data (idx)(len * group_bitlen.reverse (idx) - 1 , 0 )))
190
200
}
191
- collect_data(idx) := LookupTree (delay_group_vsize (idx), cat_map)
201
+ collect_data(idx) := LookupTree (toCollect_vsize (idx), cat_map)
192
202
collect_info(idx) := Mux (
193
- delay_group_vsize (idx) =/= 0 .U ,
194
- Cat (collect_info(idx - 1 ), delay_group_info (idx)),
203
+ toCollect_vsize (idx) =/= 0 .U ,
204
+ Cat (collect_info(idx - 1 ), toCollect_info (idx)),
195
205
collect_info(idx - 1 ),
196
206
)
197
207
}
@@ -227,8 +237,8 @@ class BatchAssembler(
227
237
val delay_step_enable = RegNext (step_enable)
228
238
val delay_step_trace_info = Option .when(config.hasReplay)(RegNext (step_trace_info.get))
229
239
val data_bytes_avail = param.MaxDataByteLen .U -& state_status.data_bytes
230
- // Always leave space for BatchFinish and BatchInterval , use MaxInfoSize - 2
231
- val info_size_avail = (param.MaxInfoSize - 2 ).U -& state_status.info_size
240
+ // Always reserve one info slot for BatchFinish, hence MaxInfoSize - 1
241
+ val info_size_avail = (param.MaxInfoSize - 1 ).U -& state_status.info_size
232
242
val data_exceed = Wire (Bool ())
233
243
val info_exceed = Wire (Bool ())
234
244
val append_data = Wire (UInt (param.TruncDataBitLen .W ))
@@ -239,10 +249,6 @@ class BatchAssembler(
239
249
val next_state_info = Wire (UInt (param.MaxInfoBitLen .W ))
240
250
val next_state_stats = Wire (new BatchStats (param))
241
251
242
- // Use BatchInterval to update index of software buffer
243
- val BatchInterval = Wire (new BatchInfo )
244
- BatchInterval .id := Batch .getTemplate.length.U
245
- BatchInterval .num := delay_step_status.last.info_size // unused, only for debugging
246
252
val BatchFinish = Wire (new BatchInfo )
247
253
BatchFinish .id := (Batch .getTemplate.length + 1 ).U
248
254
BatchFinish .num := finish_step
@@ -274,13 +280,13 @@ class BatchAssembler(
274
280
assert(remain_stats.data_bytes <= param.MaxDataByteLen .U )
275
281
assert(remain_stats.info_size + 1 .U <= param.MaxInfoSize .U )
276
282
277
- val concat_data = (delay_step_data >> (remain_stats.data_bytes << 3 ).asUInt).asUInt
278
- val concat_info = (delay_step_info >> (remain_stats.info_size * param.infoWidth.U )).asUInt
279
283
// Note we need only lowest bits to update state, truncate high bits to reduce gates
280
- val remain_data = (~ (~ 0 .U (param.TruncDataBitLen .W ) <<
281
- (remain_stats.data_bytes << 3 ).asUInt)).asUInt & delay_step_data
282
- val remain_info = (~ (~ 0 .U (param.StepInfoBitLen .W ) <<
283
- (remain_stats.info_size * param.infoWidth.U ))).asUInt & delay_step_info
284
+ val concat_data = (~ (~ 0 .U (param.TruncDataBitLen .W ) <<
285
+ (concat_stats.data_bytes << 3 ).asUInt)).asUInt & delay_step_data
286
+ val concat_info = (~ (~ 0 .U (param.StepInfoBitLen .W ) <<
287
+ (concat_stats.info_size * param.infoWidth.U ))).asUInt & delay_step_info
288
+ val remain_data = (delay_step_data >> (concat_stats.data_bytes << 3 ).asUInt).asUInt
289
+ val remain_info = (delay_step_info >> (concat_stats.info_size * param.infoWidth.U )).asUInt
284
290
285
291
// The delayed step can be partially appended to the output to make full use of the transmission parameters
286
292
// Avoid appending when the step count equals batchSize (delay_step_exceed); otherwise the last appended data would overwrite the first step's data
@@ -290,20 +296,20 @@ class BatchAssembler(
290
296
finish_step := state_step_cnt + Mux (append_whole, 1 .U , 0 .U )
291
297
292
298
append_data := Mux (has_append, concat_data(param.TruncDataBitLen - 1 , 0 ), 0 .U )
293
- val append_finish_map = Seq .tabulate(param.StepGroupSize ) { g =>
299
+ val append_finish_map = Seq .tabulate(param.StepGroupSize + 2 ) { g =>
294
300
(g.U , (BatchFinish .asUInt << (g * param.infoWidth)).asUInt)
295
301
}
296
302
append_info := Mux (
297
303
has_append,
298
- Cat ( concat_info | LookupTree (concat_stats.info_size, append_finish_map), BatchInterval .asUInt ),
304
+ concat_info | LookupTree (concat_stats.info_size, append_finish_map),
299
305
BatchFinish .asUInt,
300
306
)
301
307
302
308
next_state_step_cnt := Mux (has_append && append_whole, 0 .U , 1 .U )
303
309
next_state_data := Mux (has_append, remain_data, delay_step_data)
304
- next_state_info := Mux (has_append, remain_info, Cat ( delay_step_info, BatchInterval .asUInt) )
310
+ next_state_info := Mux (has_append, remain_info, delay_step_info)
305
311
next_state_stats.data_bytes := Mux (has_append, remain_stats.data_bytes, delay_step_status.last.data_bytes)
306
- next_state_stats.info_size := Mux (has_append, remain_stats.info_size, delay_step_status.last.info_size + 1 . U )
312
+ next_state_stats.info_size := Mux (has_append, remain_stats.info_size, delay_step_status.last.info_size)
307
313
} else {
308
314
data_exceed := delay_step_enable && delay_step_status.last.data_bytes > data_bytes_avail
309
315
info_exceed := delay_step_enable && delay_step_status.last.info_size > info_size_avail
@@ -316,9 +322,9 @@ class BatchAssembler(
316
322
317
323
next_state_step_cnt := 1 .U
318
324
next_state_data := delay_step_data
319
- next_state_info := Cat ( delay_step_info, BatchInterval .asUInt)
325
+ next_state_info := delay_step_info
320
326
next_state_stats.data_bytes := delay_step_status.last.data_bytes
321
- next_state_stats.info_size := delay_step_status.last.info_size + 1 . U
327
+ next_state_stats.info_size := delay_step_status.last.info_size
322
328
}
323
329
324
330
// Stage 2:
@@ -352,6 +358,7 @@ class BatchAssembler(
352
358
out.step := Mux (out.enable, finish_step, 0 .U )
353
359
354
360
val state_update = delay_step_enable || state_flush || timeout
361
+
355
362
when(state_update) {
356
363
when(delay_step_enable) {
357
364
when(should_tick) {
@@ -365,9 +372,9 @@ class BatchAssembler(
365
372
state_data := state_data |
366
373
(delay_step_data(param.TruncDataBitLen - 1 , 0 ) << (state_status.data_bytes << 3 ).asUInt).asUInt
367
374
state_info := state_info |
368
- (Cat ( delay_step_info, BatchInterval .asUInt) << (state_status.info_size * param.infoWidth.U )).asUInt
375
+ (delay_step_info << (state_status.info_size * param.infoWidth.U )).asUInt
369
376
state_status.data_bytes := state_status.data_bytes + delay_step_status.last.data_bytes
370
- state_status.info_size := state_status.info_size + delay_step_status.last.info_size + 1 . U
377
+ state_status.info_size := state_status.info_size + delay_step_status.last.info_size
371
378
if (config.hasReplay) state_trace_size.get := state_trace_size.get + delay_step_trace_info.get.trace_size
372
379
}
373
380
}.otherwise { // state_flush without new-coming step
0 commit comments