
Commit d1b2bf1

Multiple browser contexts (#13)
1 parent 68d8fd1 commit d1b2bf1

6 files changed: +479 -87 lines changed

README.md

+128-37
@@ -55,6 +55,8 @@ Also, be sure to [install the `asyncio`-based Twisted reactor](https://docs.scra
 TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 ```
 
+### Settings
+
 `scrapy-playwright` accepts the following settings:
 
 * `PLAYWRIGHT_BROWSER_TYPE` (type `str`, default `chromium`)
@@ -67,7 +69,28 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 
 * `PLAYWRIGHT_CONTEXT_ARGS` (type `dict`, default `{}`)
 
-  A dictionary with keyword arguments to be passed when creating the default Browser context.
+  A dictionary with default keyword arguments to be passed when creating the
+  "default" Browser context.
+
+  **Deprecated: use `PLAYWRIGHT_CONTEXTS` instead**
+
+* `PLAYWRIGHT_CONTEXTS` (type `dict[str, dict]`, default `{}`)
+
+  A dictionary which defines Browser contexts to be created on startup.
+  It should be a mapping of (name, keyword arguments). For instance:
+  ```python
+  {
+      "first": {
+          "context_arg1": "value",
+          "context_arg2": "value",
+      },
+      "second": {
+          "context_arg1": "value",
+      },
+  }
+  ```
+  If no contexts are defined, a default context (called `default`) is created.
+  The arguments passed here take precedence over the ones defined in `PLAYWRIGHT_CONTEXT_ARGS`.
   See the docs for [`Browser.new_context`](https://playwright.dev/python/docs/api/class-browser#browsernew_contextkwargs).
 
 * `PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT` (type `Optional[int]`, default `None`)
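For reference, a minimal sketch of how the settings above might be combined in a project's `settings.py`. This is not part of the commit: the context names (`first`, `second`) and all keyword-argument values are illustrative, and each per-context dictionary is simply forwarded to Playwright's `Browser.new_context`:

```python
# settings.py -- illustrative sketch; context names and values are made up.
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}

# Two named contexts created at startup; each inner dict is passed
# as keyword arguments to Browser.new_context.
PLAYWRIGHT_CONTEXTS = {
    "first": {
        "viewport": {"width": 1280, "height": 720},
        "java_script_enabled": False,
    },
    "second": {
        "ignore_https_errors": True,
        "proxy": {"server": "http://myproxy.com:3128"},
    },
}
```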
@@ -104,42 +127,7 @@ class AwesomeSpider(scrapy.Spider):
 ```
 
 
-## Page coroutines
-
-A sorted iterable (`list`, `tuple` or `dict`, for instance) can be passed
-in the `playwright_page_coroutines`
-[Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta)
-key to request coroutines to be awaited on the `Page` before returning the final
-`Response` to the callback.
-
-This is useful when you need to perform certain actions on a page, like scrolling
-down or clicking links, and you want everything to count as a single Scrapy
-Response, containing the final result.
-
-### Supported actions
-
-* `scrapy_playwright.page.PageCoroutine(method: str, *args, **kwargs)`:
-
-  _Represents a coroutine to be awaited on a `playwright.page.Page` object,
-  such as "click", "screenshot", "evaluate", etc.
-  `method` should be the name of the coroutine, `*args` and `**kwargs`
-  are passed to the function call._
-
-  _The coroutine result will be stored in the `PageCoroutine.result` attribute._
-
-  For instance,
-  ```python
-  PageCoroutine("screenshot", path="quotes.png", fullPage=True)
-  ```
-
-  produces the same effect as:
-  ```python
-  # 'page' is a playwright.async_api.Page object
-  await page.screenshot(path="quotes.png", fullPage=True)
-  ```
-
-
-### Receiving the Page object in the callback
+## Receiving the Page object in the callback
 
 Specifying a non-False value for the `playwright_include_page` `meta` key for a
 request will result in the corresponding `playwright.async_api.Page` object
@@ -176,6 +164,109 @@ class AwesomeSpiderWithPage(scrapy.Spider):
 Scrapy request workflow (Scheduler, Middlewares, etc).
 
 
+## Multiple browser contexts
+
+Multiple [browser contexts](https://playwright.dev/python/docs/core-concepts/#browser-contexts)
+to be launched at startup can be defined via the `PLAYWRIGHT_CONTEXTS` [setting](#settings).
+
+### Choosing a specific context for a request
+
+Pass the name of the desired context in the `playwright_context` meta key:
+
+```python
+yield scrapy.Request(
+    url="https://example.org",
+    meta={"playwright": True, "playwright_context": "first"},
+)
+```
+
+### Creating a context during a crawl
+
+If the context specified in the `playwright_context` meta key does not exist, it will be created.
+You can specify keyword arguments to be passed to
+[`Browser.new_context`](https://playwright.dev/python/docs/api/class-browser#browsernew_contextkwargs)
+in the `playwright_context_kwargs` meta key:
+
+```python
+yield scrapy.Request(
+    url="https://example.org",
+    meta={
+        "playwright": True,
+        "playwright_context": "new",
+        "playwright_context_kwargs": {
+            "java_script_enabled": False,
+            "ignore_https_errors": True,
+            "proxy": {
+                "server": "http://myproxy.com:3128",
+                "username": "user",
+                "password": "pass",
+            },
+        },
+    },
+)
+```
+
+Please note that if a context with the specified name already exists,
+that context is used and `playwright_context_kwargs` are ignored.
+
+### Closing a context during a crawl
+
+After [receiving the Page object in your callback](#receiving-the-page-object-in-the-callback),
+you can access a context through the corresponding [`Page.context`](https://playwright.dev/python/docs/api/class-page#page-context)
+attribute, and await [`close`](https://playwright.dev/python/docs/api/class-browsercontext#browser-context-close) on it.
+
+```python
+def parse(self, response):
+    yield scrapy.Request(
+        url="https://example.org",
+        callback=self.parse_in_new_context,
+        meta={"playwright": True, "playwright_context": "new", "playwright_include_page": True},
+    )
+
+async def parse_in_new_context(self, response):
+    page = response.meta["playwright_page"]
+    title = await page.title()
+    await page.context.close()  # close the context
+    await page.close()
+    return {"title": title}
+```
+
+
+## Page coroutines
+
+A sorted iterable (`list`, `tuple` or `dict`, for instance) can be passed
+in the `playwright_page_coroutines`
+[Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta)
+key to request coroutines to be awaited on the `Page` before returning the final
+`Response` to the callback.
+
+This is useful when you need to perform certain actions on a page, like scrolling
+down or clicking links, and you want everything to count as a single Scrapy
+Response, containing the final result.
+
+### Supported actions
+
+* `scrapy_playwright.page.PageCoroutine(method: str, *args, **kwargs)`:
+
+  _Represents a coroutine to be awaited on a `playwright.page.Page` object,
+  such as "click", "screenshot", "evaluate", etc.
+  `method` should be the name of the coroutine, `*args` and `**kwargs`
+  are passed to the function call._
+
+  _The coroutine result will be stored in the `PageCoroutine.result` attribute._
+
+  For instance,
+  ```python
+  PageCoroutine("screenshot", path="quotes.png", fullPage=True)
+  ```
+
+  produces the same effect as:
+  ```python
+  # 'page' is a playwright.async_api.Page object
+  await page.screenshot(path="quotes.png", fullPage=True)
+  ```
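Putting the pieces together, here is a minimal sketch (not part of this commit; the spider name, URL, selector, and file name are illustrative) of passing several `PageCoroutine` objects through `playwright_page_coroutines` and reading a `result` back in the callback:

```python
import scrapy
from scrapy_playwright.page import PageCoroutine


class SketchSpider(scrapy.Spider):
    # Illustrative sketch: the name, URL, and selectors are made up.
    name = "sketch"

    def start_requests(self):
        # Keep a reference to the coroutine object to read its result later.
        title_coro = PageCoroutine("evaluate", "document.title")
        yield scrapy.Request(
            url="https://example.org",
            meta={
                "playwright": True,
                # Awaited in order on the Page before the Response is returned.
                "playwright_page_coroutines": [
                    PageCoroutine("wait_for_selector", "h1"),
                    PageCoroutine("screenshot", path="example.png"),
                    title_coro,
                ],
            },
            cb_kwargs={"title_coro": title_coro},
        )

    def parse(self, response, title_coro):
        # The awaited coroutine's return value is stored on the object.
        yield {"url": response.url, "title": title_coro.result}
```

Passing the object through `cb_kwargs` is just one way to keep a reference around so its `result` attribute can be read once the response arrives.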
+
+
 ## Examples
 
 **Click on a link, save the resulting page as PDF**

examples/books.py

+18-12
@@ -1,6 +1,7 @@
 import hashlib
+import logging
 from pathlib import Path
-from typing import Generator
+from typing import Generator, Optional
 
 from scrapy import Spider
 from scrapy.crawler import CrawlerProcess
@@ -12,25 +13,23 @@ class BooksSpider(Spider):
 
     name = "books"
     start_urls = ["http://books.toscrape.com"]
-    custom_settings = {
-        "CLOSESPIDER_ITEMCOUNT": 100,
-        "CONCURRENT_REQUESTS": 32,
-        "FEEDS": {
-            "books.json": {"format": "json", "encoding": "utf-8", "indent": 4},
-        },
-    }
-
-    def parse(self, response: Response) -> Generator:
+
+    def parse(self, response: Response, current_page: Optional[int] = None) -> Generator:
         page_count = response.css(".pager .current::text").re_first(r"Page \d+ of (\d+)")
         page_count = int(page_count)
         for page in range(2, page_count + 1):
-            yield response.follow(f"/catalogue/page-{page}.html")
+            yield response.follow(f"/catalogue/page-{page}.html", cb_kwargs={"current_page": page})
 
+        current_page = current_page or 1
         for book in response.css("article.product_pod a"):
             yield response.follow(
                 book,
                 callback=self.parse_book,
-                meta={"playwright": True, "playwright_include_page": True},
+                meta={
+                    "playwright": True,
+                    "playwright_include_page": True,
+                    "playwright_context": f"page-{current_page}",
+                },
             )
 
     async def parse_book(self, response: Response) -> dict:
@@ -57,7 +56,14 @@ async def parse_book(self, response: Response) -> dict:
             # "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
             "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
         },
+        "CONCURRENT_REQUESTS": 32,
+        "CLOSESPIDER_ITEMCOUNT": 100,
+        "FEEDS": {
+            "books.json": {"format": "json", "encoding": "utf-8", "indent": 4},
+        },
     }
 )
 process.crawl(BooksSpider)
+logging.getLogger("scrapy.core.engine").setLevel(logging.WARNING)
+logging.getLogger("scrapy.core.scraper").setLevel(logging.WARNING)
 process.start()

examples/contexts.py

+107
@@ -0,0 +1,107 @@
+from scrapy import Spider, Request
+from scrapy.crawler import CrawlerProcess
+
+
+class MultipleContextsSpider(Spider):
+    """Handle multiple browser contexts"""
+
+    name = "contexts"
+    custom_settings = {
+        "PLAYWRIGHT_CONTEXTS": {
+            "first": {
+                "storage_state": {
+                    "cookies": [
+                        {
+                            "url": "https://httpbin.org/headers",
+                            "name": "context",
+                            "value": "first",
+                        },
+                    ],
+                },
+            },
+            "second": {
+                "storage_state": {
+                    "cookies": [
+                        {
+                            "url": "https://httpbin.org/headers",
+                            "name": "context",
+                            "value": "second",
+                        },
+                    ],
+                },
+            },
+        },
+    }
+
+    def start_requests(self):
+        # using existing contexts
+        yield Request(
+            url="https://httpbin.org/headers",
+            meta={
+                "playwright": True,
+                "playwright_context": "first",
+                "playwright_include_page": True,
+            },
+            dont_filter=True,
+        )
+        yield Request(
+            url="https://httpbin.org/headers",
+            meta={
+                "playwright": True,
+                "playwright_context": "second",
+                "playwright_include_page": True,
+            },
+            dont_filter=True,
+        )
+        # create a new context
+        yield Request(
+            url="https://httpbin.org/headers",
+            meta={
+                "playwright": True,
+                "playwright_context": "third",
+                "playwright_context_kwargs": {
+                    "storage_state": {
+                        "cookies": [
+                            {
+                                "url": "https://httpbin.org/headers",
+                                "name": "context",
+                                "value": "third",
+                            },
+                        ],
+                    },
+                },
+                "playwright_include_page": True,
+            },
+            dont_filter=True,
+        )
+        # default context
+        yield Request(
+            url="https://httpbin.org/headers",
+            meta={"playwright": True, "playwright_include_page": True},
+            dont_filter=True,
+        )
+
+    async def parse(self, response):
+        page = response.meta["playwright_page"]
+        context_name = response.meta["playwright_context"]
+        storage_state = await page.context.storage_state()
+        await page.context.close()
+        return {
+            "url": response.url,
+            "context": context_name,
+            "cookies": storage_state["cookies"],
+        }
+
+
+if __name__ == "__main__":
+    process = CrawlerProcess(
+        settings={
+            "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
+            "DOWNLOAD_HANDLERS": {
+                "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+                # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+            },
+        }
+    )
+    process.crawl(MultipleContextsSpider)
+    process.start()
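Presumably the example can be run directly with `python examples/contexts.py`, provided `scrapy-playwright` is installed and the Playwright browsers have been downloaded (e.g. via `playwright install`). Each scraped item then reports which context served the request, together with that context's cookies.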
