Skip to content

Commit e8dc3d2

Browse files
authored
Merge pull request #73 from ScrapeGraphAI/breadth-parameter
feat: add breadth
2 parents f8181fd + 5d996c5 commit e8dc3d2

File tree

4 files changed: +30 −0 lines changed

scrapegraph-js/src/crawl.js

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import { getMockResponse } from './utils/mockResponse.js';
1616
* @param {boolean} [options.extractionMode=true] - true for AI extraction, false for markdown conversion (NO AI/LLM)
1717
* @param {boolean} [options.cacheWebsite=true] - Whether to cache the website content
1818
* @param {number} [options.depth=2] - Maximum depth of the crawl (1-10)
19+
* @param {number|null} [options.breadth] - Maximum number of links to crawl per depth level. If null/undefined, unlimited (default). Controls the 'width' of exploration at each depth. Useful for limiting crawl scope on large sites. Note: maxPages always takes priority. Ignored when sitemap=true.
1920
* @param {number} [options.maxPages=2] - Maximum number of pages to crawl (1-100)
2021
* @param {boolean} [options.sameDomainOnly=true] - Whether to only crawl pages from the same domain
2122
* @param {boolean} [options.sitemap] - Whether to use sitemap for better page discovery
@@ -68,6 +69,7 @@ export async function crawl(
6869
const {
6970
cacheWebsite = true,
7071
depth = 2,
72+
breadth = null,
7173
maxPages = 2,
7274
sameDomainOnly = true,
7375
sitemap = false,
@@ -87,6 +89,10 @@ export async function crawl(
8789
render_heavy_js: renderHeavyJs,
8890
};
8991

92+
if (breadth !== null && breadth !== undefined) {
93+
payload.breadth = breadth;
94+
}
95+
9096
if (stealth) {
9197
payload.stealth = stealth;
9298
}

scrapegraph-py/scrapegraph_py/async_client.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -855,6 +855,7 @@ async def crawl(
855855
extraction_mode: bool = True,
856856
cache_website: bool = True,
857857
depth: int = 2,
858+
breadth: Optional[int] = None,
858859
max_pages: int = 2,
859860
same_domain_only: bool = True,
860861
batch_size: Optional[int] = None,
@@ -877,6 +878,9 @@ async def crawl(
877878
extraction_mode: Whether to use AI extraction (True) or markdown (False)
878879
cache_website: Whether to cache the website
879880
depth: Maximum depth of link traversal
881+
breadth: Maximum number of links to crawl per depth level. If None, unlimited (default).
882+
Controls the 'width' of exploration at each depth. Useful for limiting crawl scope
883+
on large sites. Note: max_pages always takes priority. Ignored when sitemap=True.
880884
max_pages: Maximum number of pages to crawl
881885
same_domain_only: Only crawl pages within the same domain
882886
batch_size: Number of pages to process in batch
@@ -905,6 +909,8 @@ async def crawl(
905909
)
906910
logger.debug(f"💾 Cache website: {cache_website}")
907911
logger.debug(f"🔍 Depth: {depth}")
912+
if breadth is not None:
913+
logger.debug(f"📏 Breadth: {breadth}")
908914
logger.debug(f"📄 Max pages: {max_pages}")
909915
logger.debug(f"🏠 Same domain only: {same_domain_only}")
910916
logger.debug(f"🗺️ Use sitemap: {sitemap}")
@@ -941,6 +947,8 @@ async def crawl(
941947
request_data["prompt"] = prompt
942948
if data_schema is not None:
943949
request_data["data_schema"] = data_schema
950+
if breadth is not None:
951+
request_data["breadth"] = breadth
944952
if batch_size is not None:
945953
request_data["batch_size"] = batch_size
946954
if headers is not None:

scrapegraph-py/scrapegraph_py/client.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -865,6 +865,7 @@ def crawl(
865865
extraction_mode: bool = True,
866866
cache_website: bool = True,
867867
depth: int = 2,
868+
breadth: Optional[int] = None,
868869
max_pages: int = 2,
869870
same_domain_only: bool = True,
870871
batch_size: Optional[int] = None,
@@ -887,6 +888,9 @@ def crawl(
887888
extraction_mode: Whether to use AI extraction (True) or markdown (False)
888889
cache_website: Whether to cache the website
889890
depth: Maximum depth of link traversal
891+
breadth: Maximum number of links to crawl per depth level. If None, unlimited (default).
892+
Controls the 'width' of exploration at each depth. Useful for limiting crawl scope
893+
on large sites. Note: max_pages always takes priority. Ignored when sitemap=True.
890894
max_pages: Maximum number of pages to crawl
891895
same_domain_only: Only crawl pages within the same domain
892896
batch_size: Number of pages to process in batch
@@ -915,6 +919,8 @@ def crawl(
915919
)
916920
logger.debug(f"💾 Cache website: {cache_website}")
917921
logger.debug(f"🔍 Depth: {depth}")
922+
if breadth is not None:
923+
logger.debug(f"📏 Breadth: {breadth}")
918924
logger.debug(f"📄 Max pages: {max_pages}")
919925
logger.debug(f"🏠 Same domain only: {same_domain_only}")
920926
logger.debug(f"🗺️ Use sitemap: {sitemap}")
@@ -951,6 +957,8 @@ def crawl(
951957
request_data["prompt"] = prompt
952958
if data_schema is not None:
953959
request_data["data_schema"] = data_schema
960+
if breadth is not None:
961+
request_data["breadth"] = breadth
954962
if batch_size is not None:
955963
request_data["batch_size"] = batch_size
956964
if headers is not None:

scrapegraph-py/scrapegraph_py/models/crawl.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,14 @@ class CrawlRequest(BaseModel):
5454
depth: conint(ge=1, le=10) = Field(
5555
default=2, description="Maximum depth of the crawl (1-10)"
5656
)
57+
breadth: Optional[conint(ge=1)] = Field(
58+
default=None,
59+
description="Maximum number of links to crawl per depth level. "
60+
"If None, unlimited (default). Controls the 'width' of exploration at each depth. "
61+
"Useful for limiting crawl scope on large sites. Note: max_pages always takes priority - "
62+
"the total crawled pages will never exceed max_pages regardless of breadth setting. "
63+
"Ignored when sitemap=True (sitemap mode uses sitemap URLs directly instead of link discovery).",
64+
)
5765
max_pages: conint(ge=1, le=100) = Field(
5866
default=2, description="Maximum number of pages to crawl (1-100)"
5967
)

0 commit comments

Comments (0)