@@ -43,7 +43,9 @@ use crate::FUSE_OPT_KEY_ROW_PER_BLOCK;
43
43
#[ allow( clippy:: large_enum_variant) ]
44
44
enum State {
45
45
Consume ,
46
- Serialize ( DataBlock ) ,
46
+ Collect ( DataBlock ) ,
47
+ Serialize ,
48
+ Finalize ,
47
49
Flush ,
48
50
Write ( BlockSerialization ) ,
49
51
}
@@ -56,6 +58,10 @@ pub struct TransformBlockWriter {
56
58
properties : Arc < StreamBlockProperties > ,
57
59
58
60
builder : Option < StreamBlockBuilder > ,
61
+ need_flush : bool ,
62
+ input_data_size : usize ,
63
+ input_num_rows : usize ,
64
+
59
65
dal : Operator ,
60
66
// Only used in multi table insert
61
67
table_id : Option < u64 > ,
@@ -86,8 +92,11 @@ impl TransformBlockWriter {
86
92
properties,
87
93
builder : None ,
88
94
dal : table. get_operator ( ) ,
95
+ need_flush : false ,
89
96
table_id : if with_tid { Some ( table. get_id ( ) ) } else { None } ,
90
97
input_data : VecDeque :: new ( ) ,
98
+ input_data_size : 0 ,
99
+ input_num_rows : 0 ,
91
100
output_data : None ,
92
101
max_block_size,
93
102
} ) ) )
@@ -126,12 +135,12 @@ impl Processor for TransformBlockWriter {
126
135
}
127
136
128
137
fn event ( & mut self ) -> Result < Event > {
129
- if matches ! ( self . state, State :: Serialize { .. } | State :: Flush ) {
130
- return Ok ( Event :: Sync ) ;
131
- }
132
-
133
- if matches ! ( self . state , State :: Write { .. } ) {
134
- return Ok ( Event :: Async ) ;
138
+ match & self . state {
139
+ State :: Collect ( _ ) | State :: Serialize | State :: Flush | State :: Finalize => {
140
+ return Ok ( Event :: Sync )
141
+ }
142
+ State :: Write ( _ ) => return Ok ( Event :: Async ) ,
143
+ _ => { }
135
144
}
136
145
137
146
if self . output . is_finished ( ) {
@@ -147,59 +156,80 @@ impl Processor for TransformBlockWriter {
147
156
return Ok ( Event :: NeedConsume ) ;
148
157
}
149
158
150
- if let Some ( block) = self . input_data . pop_front ( ) {
151
- self . state = State :: Serialize ( block) ;
159
+ // To avoid tail fragments, flush only when the input is large enough.
160
+ if self . need_flush
161
+ && self
162
+ . properties
163
+ . block_thresholds
164
+ . check_large_enough ( self . input_num_rows , self . input_data_size )
165
+ {
166
+ self . state = State :: Flush ;
167
+ return Ok ( Event :: Sync ) ;
168
+ }
169
+
170
+ if !self . need_flush && !self . input_data . is_empty ( ) {
171
+ self . state = State :: Serialize ;
172
+ return Ok ( Event :: Sync ) ;
173
+ }
174
+
175
+ if self . input . has_data ( ) {
176
+ let input_data = self . input . pull_data ( ) . unwrap ( ) ?;
177
+ self . state = State :: Collect ( input_data) ;
152
178
return Ok ( Event :: Sync ) ;
153
179
}
154
180
155
181
if self . input . is_finished ( ) {
156
- if self . builder . is_some ( ) {
157
- self . state = State :: Flush ;
182
+ if ! self . input_data . is_empty ( ) || self . builder . is_some ( ) {
183
+ self . state = State :: Finalize ;
158
184
return Ok ( Event :: Sync ) ;
159
185
}
160
186
self . output . finish ( ) ;
161
187
return Ok ( Event :: Finished ) ;
162
188
}
163
189
164
- if !self . input . has_data ( ) {
165
- self . input . set_need_data ( ) ;
166
- return Ok ( Event :: NeedData ) ;
167
- }
168
-
169
- let input_data = self . input . pull_data ( ) . unwrap ( ) ?;
170
- self . state = State :: Serialize ( input_data) ;
171
- Ok ( Event :: Sync )
190
+ self . input . set_need_data ( ) ;
191
+ Ok ( Event :: NeedData )
172
192
}
173
193
174
194
fn process ( & mut self ) -> Result < ( ) > {
175
195
match std:: mem:: replace ( & mut self . state , State :: Consume ) {
176
- State :: Serialize ( block) => {
196
+ State :: Collect ( block) => {
177
197
// Check if the datablock is valid, this is needed to ensure data is correct
178
198
block. check_valid ( ) ?;
199
+ self . input_data_size += block. estimate_block_size ( ) ;
200
+ self . input_num_rows += block. num_rows ( ) ;
179
201
let max_rows_per_block = self . calc_max_block_size ( & block) ;
180
202
let blocks = block. split_by_rows_if_needed_no_tail ( max_rows_per_block) ;
181
- let mut blocks = VecDeque :: from ( blocks) ;
203
+ self . input_data . extend ( blocks) ;
204
+ }
205
+ State :: Serialize => {
206
+ while let Some ( b) = self . input_data . pop_front ( ) {
207
+ self . input_data_size -= b. estimate_block_size ( ) ;
208
+ self . input_num_rows -= b. num_rows ( ) ;
182
209
183
- let builder = self . get_or_create_builder ( ) ?;
184
- while let Some ( b) = blocks. pop_front ( ) {
210
+ let builder = self . get_or_create_builder ( ) ?;
185
211
builder. write ( b) ?;
186
212
187
213
if builder. need_flush ( ) {
188
- self . state = State :: Flush ;
189
-
190
- for left in blocks {
191
- self . input_data . push_back ( left) ;
192
- }
214
+ self . need_flush = true ;
193
215
return Ok ( ( ) ) ;
194
216
}
195
217
}
196
218
}
219
+ State :: Finalize => {
220
+ while let Some ( b) = self . input_data . pop_front ( ) {
221
+ let builder = self . get_or_create_builder ( ) ?;
222
+ builder. write ( b) ?;
223
+ }
224
+ self . state = State :: Flush ;
225
+ }
197
226
State :: Flush => {
198
227
let builder = self . builder . take ( ) . unwrap ( ) ;
199
228
if !builder. is_empty ( ) {
200
229
let serialized = builder. finish ( ) ?;
201
230
self . state = State :: Write ( serialized) ;
202
231
}
232
+ self . need_flush = false ;
203
233
}
204
234
_ => return Err ( ErrorCode :: Internal ( "It's a bug." ) ) ,
205
235
}
0 commit comments