Skip to content

Commit

Permalink
Feature/body collect timeout (#2)
Browse files (browse the repository at this point in the history)
* Body collect timeout

* Body collect timeout

* Publish items

* Bump version to 0.3.0
  • Loading branch information
mariotaddeucci authored Feb 10, 2025
1 parent dd8fcd5 commit 604d37f
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 18 deletions.
16 changes: 4 additions & 12 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ build-backend = "hatchling.build"

[project]
name = "whiscraper"
# dynamic = ["version"]
version = "0.2.0"
dynamic = [
"version",
]
description = "Your Stealthy Scraping Python Framework"
readme = "README.md"
requires-python = ">=3.9"
Expand All @@ -19,15 +20,8 @@ exclude = ["/tests", "/docs", "/examples"]
[tool.hatch.build.targets.wheel]
packages = ["src/whiscraper"]

[tool.hatch.envs.hatch-test]
dependencies = ["pytest >=8.3.3,<9", "ruff >=0.1.0,<1"]
extra-args = ["-vv"]

[tool.hatch.envs.hatch-test.scripts]
run = "pytest{env:HATCH_TEST_ARGS:} {args}"

[tool.hatch.version]
source = "vcs"
path = "src/whiscraper/__init__.py"

[tool.pytest.ini_options]
pythonpath = "src"
Expand All @@ -36,7 +30,5 @@ testpaths = ["tests"]
[tool.ruff]
line-length = 120


[tool.ruff.lint]
extend-unsafe-fixes = ["UP"]
ignore = ["F401"]
5 changes: 5 additions & 0 deletions src/whiscraper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
from .browser.context import Browser as BrowserManager
from .browser.context import BrowserManagerConfig as BrowserConfig
from .browser.context import browser, get_page

__all__ = ["BrowserConfig", "browser", "get_page", "BrowserManager"]

__version__ = "0.3.0"
19 changes: 13 additions & 6 deletions src/whiscraper/browser/tools/request_interceptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ async def _cdp_receive_handler(self, event: nodriver.cdp.network.ResponseReceive

await self._intercepted_responses.put(event)

async def take(self, total: int, include_body: bool = True, timeout: float = 10):
async def take(self, total: int, include_body: bool = True, timeout: float = 10, body_collect_timeout: int = 5):
for _ in range(total):
item: nodriver.cdp.network.ResponseReceived = await asyncio.wait_for(
self._intercepted_responses.get(), timeout
Expand All @@ -65,14 +65,19 @@ async def take(self, total: int, include_body: bool = True, timeout: float = 10)
status_code=item.response.status,
)

if not include_body:
if not include_body or item.response.status == 204:
yield resp_factory_fn(body=None)
return

cdp_command = nodriver.cdp.network.get_response_body(item.request_id)
response_body: Tuple[str, bool] | None = await self._tab.send(cdp_command)

if response_body is None:
for i in range(body_collect_timeout):
response_body: Tuple[str, bool] | None = await self._tab.send(cdp_command)
if response_body is not None:
break
elif i < body_collect_timeout - 1:
await asyncio.sleep(1)
else:
yield resp_factory_fn(body=None)
return

Expand All @@ -83,8 +88,10 @@ async def take(self, total: int, include_body: bool = True, timeout: float = 10)

yield resp_factory_fn(body=body_text)

async def get(self, include_body: bool = True, timeout: float = 10):
async for vl in self.take(1, include_body=include_body, timeout=timeout):
async def get(self, include_body: bool = True, timeout: float = 10, body_collect_timeout: int = 5):
async for vl in self.take(
1, include_body=include_body, timeout=timeout, body_collect_timeout=body_collect_timeout
):
return vl

def filter(self, fn: Callable[[nodriver.cdp.network.ResponseReceived], bool]):
Expand Down

0 comments on commit 604d37f

Please sign in to comment.