datahub-project
diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py
Lines changed: 17 additions & 16 deletions
@@ -219,26 +219,27 @@ def construct_schema_pymongo(
     """
 
     aggregations: List[Dict] = []
+
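+    # The order of the aggregation stages impacts execution time. By placing the $sample/$limit stage first,
+    # the subsequent stages process a much smaller dataset, improving performance.
+    # Note: the document-size filter below now runs after sampling, so any sampled document at or above
+    # max_document_size is dropped and fewer than sample_size documents may be returned.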
+    if sample_size:
+        if use_random_sampling:
+            aggregations.append({"$sample": {"size": sample_size}})
+        else:
+            aggregations.append({"$limit": sample_size})
+
     if should_add_document_size_filter:
         doc_size_field = "temporary_doc_size_field"
         # create a temporary field to store the size of the document. filter on it and then remove it.
-        aggregations = [
-            {"$addFields": {doc_size_field: {"$bsonSize": "$$ROOT"}}},
-            {"$match": {doc_size_field: {"$lt": max_document_size}}},
-            {"$project": {doc_size_field: 0}},
-        ]
-    if use_random_sampling:
-        # get sample documents in collection
-        if sample_size:
-            aggregations.append({"$sample": {"size": sample_size}})
-        documents = collection.aggregate(
-            aggregations,
-            allowDiskUse=True,
+        aggregations.extend(
+            [
+                {"$addFields": {doc_size_field: {"$bsonSize": "$$ROOT"}}},
+                {"$match": {doc_size_field: {"$lt": max_document_size}}},
+                {"$project": {doc_size_field: 0}},
+            ]
         )
-    else:
-        if sample_size:
-            aggregations.append({"$limit": sample_size})
-        documents = collection.aggregate(aggregations, allowDiskUse=True)
+
+    documents = collection.aggregate(aggregations, allowDiskUse=True)
 
     return construct_schema(list(documents), delimiter)