Skip to content

Commit e8dc3d2

Browse files
authored
Merge pull request #73 from ScrapeGraphAI/breadth-parameter
feat: add breadth
2 parents f8181fd + 5d996c5 commit e8dc3d2

File tree

4 files changed: +30 −0 lines changed

scrapegraph-js/src/crawl.js

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import { getMockResponse } from './utils/mockResponse.js';
1616
* @param {boolean} [options.extractionMode=true] - true for AI extraction, false for markdown conversion (NO AI/LLM)
1717
* @param {boolean} [options.cacheWebsite=true] - Whether to cache the website content
1818
* @param {number} [options.depth=2] - Maximum depth of the crawl (1-10)
19+
* @param {number|null} [options.breadth] - Maximum number of links to crawl per depth level. If null/undefined, unlimited (default). Controls the 'width' of exploration at each depth. Useful for limiting crawl scope on large sites. Note: maxPages always takes priority. Ignored when sitemap=true.
1920
* @param {number} [options.maxPages=2] - Maximum number of pages to crawl (1-100)
2021
* @param {boolean} [options.sameDomainOnly=true] - Whether to only crawl pages from the same domain
2122
* @param {boolean} [options.sitemap] - Whether to use sitemap for better page discovery
@@ -68,6 +69,7 @@ export async function crawl(
6869
const {
6970
cacheWebsite = true,
7071
depth = 2,
72+
breadth = null,
7173
maxPages = 2,
7274
sameDomainOnly = true,
7375
sitemap = false,
@@ -87,6 +89,10 @@ export async function crawl(
8789
render_heavy_js: renderHeavyJs,
8890
};
8991

92+
if (breadth !== null && breadth !== undefined) {
93+
payload.breadth = breadth;
94+
}
95+
9096
if (stealth) {
9197
payload.stealth = stealth;
9298
}

scrapegraph-py/scrapegraph_py/async_client.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -855,6 +855,7 @@ async def crawl(
855855
extraction_mode: bool = True,
856856
cache_website: bool = True,
857857
depth: int = 2,
858+
breadth: Optional[int] = None,
858859
max_pages: int = 2,
859860
same_domain_only: bool = True,
860861
batch_size: Optional[int] = None,
@@ -877,6 +878,9 @@ async def crawl(
877878
extraction_mode: Whether to use AI extraction (True) or markdown (False)
878879
cache_website: Whether to cache the website
879880
depth: Maximum depth of link traversal
881+
breadth: Maximum number of links to crawl per depth level. If None, unlimited (default).
882+
Controls the 'width' of exploration at each depth. Useful for limiting crawl scope
883+
on large sites. Note: max_pages always takes priority. Ignored when sitemap=True.
880884
max_pages: Maximum number of pages to crawl
881885
same_domain_only: Only crawl pages within the same domain
882886
batch_size: Number of pages to process in batch
@@ -905,6 +909,8 @@ async def crawl(
905909
)
906910
logger.debug(f"💾 Cache website: {cache_website}")
907911
logger.debug(f"🔍 Depth: {depth}")
912+
if breadth is not None:
913+
logger.debug(f"📏 Breadth: {breadth}")
908914
logger.debug(f"📄 Max pages: {max_pages}")
909915
logger.debug(f"🏠 Same domain only: {same_domain_only}")
910916
logger.debug(f"🗺️ Use sitemap: {sitemap}")
@@ -941,6 +947,8 @@ async def crawl(
941947
request_data["prompt"] = prompt
942948
if data_schema is not None:
943949
request_data["data_schema"] = data_schema
950+
if breadth is not None:
951+
request_data["breadth"] = breadth
944952
if batch_size is not None:
945953
request_data["batch_size"] = batch_size
946954
if headers is not None:

scrapegraph-py/scrapegraph_py/client.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -865,6 +865,7 @@ def crawl(
865865
extraction_mode: bool = True,
866866
cache_website: bool = True,
867867
depth: int = 2,
868+
breadth: Optional[int] = None,
868869
max_pages: int = 2,
869870
same_domain_only: bool = True,
870871
batch_size: Optional[int] = None,
@@ -887,6 +888,9 @@ def crawl(
887888
extraction_mode: Whether to use AI extraction (True) or markdown (False)
888889
cache_website: Whether to cache the website
889890
depth: Maximum depth of link traversal
891+
breadth: Maximum number of links to crawl per depth level. If None, unlimited (default).
892+
Controls the 'width' of exploration at each depth. Useful for limiting crawl scope
893+
on large sites. Note: max_pages always takes priority. Ignored when sitemap=True.
890894
max_pages: Maximum number of pages to crawl
891895
same_domain_only: Only crawl pages within the same domain
892896
batch_size: Number of pages to process in batch
@@ -915,6 +919,8 @@ def crawl(
915919
)
916920
logger.debug(f"💾 Cache website: {cache_website}")
917921
logger.debug(f"🔍 Depth: {depth}")
922+
if breadth is not None:
923+
logger.debug(f"📏 Breadth: {breadth}")
918924
logger.debug(f"📄 Max pages: {max_pages}")
919925
logger.debug(f"🏠 Same domain only: {same_domain_only}")
920926
logger.debug(f"🗺️ Use sitemap: {sitemap}")
@@ -951,6 +957,8 @@ def crawl(
951957
request_data["prompt"] = prompt
952958
if data_schema is not None:
953959
request_data["data_schema"] = data_schema
960+
if breadth is not None:
961+
request_data["breadth"] = breadth
954962
if batch_size is not None:
955963
request_data["batch_size"] = batch_size
956964
if headers is not None:

scrapegraph-py/scrapegraph_py/models/crawl.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,14 @@ class CrawlRequest(BaseModel):
5454
depth: conint(ge=1, le=10) = Field(
5555
default=2, description="Maximum depth of the crawl (1-10)"
5656
)
57+
breadth: Optional[conint(ge=1)] = Field(
58+
default=None,
59+
description="Maximum number of links to crawl per depth level. "
60+
"If None, unlimited (default). Controls the 'width' of exploration at each depth. "
61+
"Useful for limiting crawl scope on large sites. Note: max_pages always takes priority - "
62+
"the total crawled pages will never exceed max_pages regardless of breadth setting. "
63+
"Ignored when sitemap=True (sitemap mode uses sitemap URLs directly instead of link discovery).",
64+
)
5765
max_pages: conint(ge=1, le=100) = Field(
5866
default=2, description="Maximum number of pages to crawl (1-100)"
5967
)

0 commit comments

Comments (0)