"""
This is an example web scraper for vestiairecollective.com.
To run this scraper set env variable $SCRAPFLY_KEY with your scrapfly API key:
$ export SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
import os
import json
from typing import Dict, List
from pathlib import Path
from loguru import logger as log
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse

SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])

BASE_CONFIG = {
    # bypass vestiairecollective.com web scraping blocking
    "asp": True,
    # set the proxy country to US
    "country": "US",
}

output = Path(__file__).parent / "results"
output.mkdir(exist_ok=True)


def find_hidden_data(result: ScrapeApiResponse) -> dict:
    """extract hidden NEXT_DATA from page html"""
    data = result.selector.css("script#__NEXT_DATA__::text").get()
    data = json.loads(data)
    return data


def parse_xhr_call(result: ScrapeApiResponse) -> Dict:
    """extract JSON data from xhr_calls"""
    _xhr_calls = result.scrape_result["browser_data"]["xhr_call"]
    try:
        # extract the search xhr call
        search_call = [call for call in _xhr_calls if "search" in call["url"]][0]
    except IndexError:
        log.warning("couldn't find the search xhr call - is the search URL a valid search page?")
        raise
    # extract the product listings data from the first search page
    data = json.loads(search_call["response"]["body"])
    result = {
        "headers": search_call["headers"],
        "payload": json.loads(search_call["body"]),
        "total_pages": data["paginationStats"]["totalPages"],
        "data": data["items"],
    }
    return result


async def send_api_request(headers, payload, offset) -> ScrapeApiResponse:
    """send a POST request to the search API"""
    # change the offset to control which page of products to retrieve
    payload["pagination"]["offset"] = offset
    # send a POST request to the search API
    response = await SCRAPFLY.async_scrape(
        ScrapeConfig(
            url="https://search.vestiairecollective.com/v1/product/search",
            headers=headers,
            body=json.dumps(payload),
            country="US",
            method="POST",
        )
    )
    return response


def parse_search_api(result: ScrapeApiResponse) -> List[Dict]:
    """extract JSON data from the search API response"""
    data = json.loads(result.scrape_result["content"])
    return data["items"]


async def scrape_products(urls: List[str]) -> List[Dict]:
    """scrape vestiairecollective.com product pages for product data"""
    to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
    products = []
    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
        data = find_hidden_data(response)
        product = data["props"]["pageProps"]["product"]
        products.append(product)
    log.success(f"scraped {len(products)} product listings from product pages")
    return products


async def retry_failure(url: str, _retries: int = 0):
    """retry failed requests with a maximum number of retries"""
    max_retries = 3
    try:
        response = await SCRAPFLY.async_scrape(
            ScrapeConfig(url, **BASE_CONFIG, render_js=True, proxy_pool="public_residential_pool")
        )
        if response.status_code != 200:
            if _retries < max_retries:
                log.debug("Retrying failed request")
                return await retry_failure(url, _retries=_retries + 1)
            else:
                raise Exception("Unable to scrape first search page, max retries exceeded")
        return response
    except Exception:
        if _retries < max_retries:
            log.debug("Retrying failed request")
            return await retry_failure(url, _retries=_retries + 1)
        else:
            raise Exception("Unable to scrape first search page, max retries exceeded")


async def scrape_search(url: str, max_pages: int = 10) -> List[Dict]:
    """scrape product listings from a search page, paginating through the search API"""
    log.info(f"scraping search page {url}")
    # first, scrape the first search page while enabling render_js to capture the xhr calls
    result_first_page = await retry_failure(url)
    # then, parse the first page response to get the headers, payload, data and the number of total pages
    first_page_api_result = parse_xhr_call(result_first_page)
    headers = first_page_api_result["headers"]
    payload = first_page_api_result["payload"]
    results = first_page_api_result["data"]
    total_pages = first_page_api_result["total_pages"]
    # find total page count
    if max_pages and max_pages < total_pages:
        total_pages = max_pages
    total_products = total_pages * 48  # each page contains 48 listings
    # next, scrape the remaining search pages directly from the API
    log.info(f"scraping search pagination, remaining ({total_pages - 1}) more pages")
    for offset in range(48, total_products, 48):
        try:
            result = await send_api_request(headers, payload, offset)
            results.extend(parse_search_api(result))
        except Exception as e:
            log.debug(f"Error occurred while requesting search API: {e}")
    log.success(f"scraped {len(results)} product listings from search pages")
    return results
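

# --- usage example: a minimal sketch, not part of the original scraper ---
# This shows how the scrape_search() and scrape_products() functions above are
# assumed to be driven and how results could be written to the ./results folder.
# The search and product URLs below are hypothetical placeholders; replace them
# with real vestiairecollective.com URLs before running.
import asyncio


async def run():
    # scrape a search page and save the listings to ./results/search.json
    search_data = await scrape_search(
        url="https://www.vestiairecollective.com/search/?q=handbag",  # placeholder URL
        max_pages=2,
    )
    with open(output / "search.json", "w", encoding="utf-8") as file:
        json.dump(search_data, file, indent=2, ensure_ascii=False)

    # scrape individual product pages and save them to ./results/products.json
    product_data = await scrape_products(
        urls=[
            # placeholder URL - swap in real product page URLs
            "https://www.vestiairecollective.com/women-bags/handbags/example-product-12345678.shtml",
        ]
    )
    with open(output / "products.json", "w", encoding="utf-8") as file:
        json.dump(product_data, file, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    asyncio.run(run())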