@@ -219,26 +219,27 @@ def construct_schema_pymongo(
     """
 
     aggregations: List[Dict] = []
+
+    # The order of the aggregations impacts execution time. By setting the sample/limit aggregation first,
+    # the subsequent aggregations process a much smaller dataset, improving performance.
+    if sample_size:
+        if use_random_sampling:
+            aggregations.append({"$sample": {"size": sample_size}})
+        else:
+            aggregations.append({"$limit": sample_size})
+
     if should_add_document_size_filter:
         doc_size_field = "temporary_doc_size_field"
         # create a temporary field to store the size of the document. filter on it and then remove it.
-        aggregations = [
-            {"$addFields": {doc_size_field: {"$bsonSize": "$$ROOT"}}},
-            {"$match": {doc_size_field: {"$lt": max_document_size}}},
-            {"$project": {doc_size_field: 0}},
-        ]
-    if use_random_sampling:
-        # get sample documents in collection
-        if sample_size:
-            aggregations.append({"$sample": {"size": sample_size}})
-        documents = collection.aggregate(
-            aggregations,
-            allowDiskUse=True,
+        aggregations.extend(
+            [
+                {"$addFields": {doc_size_field: {"$bsonSize": "$$ROOT"}}},
+                {"$match": {doc_size_field: {"$lt": max_document_size}}},
+                {"$project": {doc_size_field: 0}},
+            ]
         )
-    else:
-        if sample_size:
-            aggregations.append({"$limit": sample_size})
-        documents = collection.aggregate(aggregations, allowDiskUse=True)
+
+    documents = collection.aggregate(aggregations, allowDiskUse=True)
 
 
     return construct_schema(list(documents), delimiter)
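For context, here is a minimal sketch of the pipeline the reordered code assembles when both random sampling and the document-size filter are enabled. The MongoDB URI, database and collection names, and the sample_size / max_document_size values are illustrative assumptions, not values taken from this PR.

from pymongo import MongoClient

# Hypothetical connection details and parameter values, for illustration only.
client = MongoClient("mongodb://localhost:27017")
collection = client["example_db"]["example_collection"]
sample_size = 1000
max_document_size = 16 * 1024 * 1024  # assumed ~16 MB cutoff

doc_size_field = "temporary_doc_size_field"
pipeline = [
    # The $sample stage runs first, so the size-filter stages below only
    # process sample_size documents instead of the whole collection.
    {"$sample": {"size": sample_size}},
    {"$addFields": {doc_size_field: {"$bsonSize": "$$ROOT"}}},
    {"$match": {doc_size_field: {"$lt": max_document_size}}},
    {"$project": {doc_size_field: 0}},
]
documents = collection.aggregate(pipeline, allowDiskUse=True)
print(len(list(documents)))

With the previous ordering, $addFields and $match computed $bsonSize for every document before the sample or limit was applied; putting the sample/limit stage first keeps that work proportional to the sample size.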