scrape.py
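"""Scrape G2 product reviews using Playwright attached to a remote Browserbase
browser session, parse the rendered pages with BeautifulSoup, and save the
results to a CSV file."""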
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import pandas as pd
import random
from typing import List, Dict
import time
import os
from datetime import datetime
class G2Scraper:
    def __init__(self):
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None

    async def initialize(self):
        """Start Playwright and attach to a remote Browserbase browser over CDP."""
        self.playwright = await async_playwright().start()
        chromium = self.playwright.chromium
        # Requires a BROWSERBASE_API_KEY environment variable for the remote session
        self.browser = await chromium.connect_over_cdp(
            "wss://connect.browserbase.com?apiKey=" + os.environ["BROWSERBASE_API_KEY"]
        )
        self.context = self.browser.contexts[0]
        self.page = self.context.pages[0]
    async def get_product_reviews(self, product_url: str, num_pages: int = 1) -> List[Dict]:
        reviews = []
        try:
            # First visit the main product page
            print("Visiting main page...")
            await self.page.goto(product_url)
            await asyncio.sleep(2)  # Short wait for page load

            for page in range(1, num_pages + 1):
                print(f"Processing page {page}...")
                # Add a timestamp to the URL as a cache-buster
                timestamp = int(time.time() * 1000)
                url = f"{product_url}?page={page}&_={timestamp}"
                await asyncio.sleep(5)  # Short delay between pages

                # Navigate to the page and wait for content to load
                await self.page.goto(url)

                # Get the rendered HTML and parse it with BeautifulSoup
                content = await self.page.content()
                print("found content")
                soup = BeautifulSoup(content, 'html.parser')

                review_elements = soup.find_all(
                    "div",
                    class_="paper paper--white paper--box mb-2 position-relative border-bottom",
                )
                if not review_elements:
                    print(f"No reviews found on page {page}")
                    return reviews

                for review in review_elements:
                    review_data = {
                        'text': self._extract_text(review),
                        'rating': self._extract_rating(review),
                        'date': self._extract_date(review),
                        'reviewer': self._extract_reviewer(review)
                    }
                    print(review_data)
                    reviews.append(review_data)

            return reviews
        except Exception as e:
            print(f"An error occurred: {e}")
            return reviews
    def _extract_text(self, review_element) -> str:
        review_body = review_element.find("div", attrs={"itemprop": "reviewBody"}).text
        return review_body

    def _extract_rating(self, review_element) -> float:
        # The trailing number in the stars-* class is twice the star rating, so halve it
        rating_container = review_element.find("div", class_="f-1 d-f ai-c mb-half-small-only")
        rating_div = rating_container.find("div")
        rating_class = rating_div.get("class")
        stars_string = rating_class[-1]
        stars_large_number = float(stars_string.split("-")[-1])
        stars_clean_number = stars_large_number / 2
        return stars_clean_number

    def _extract_date(self, review_element) -> str:
        review_date = review_element.find("time")
        date = review_date.get("datetime")
        return date

    def _extract_reviewer(self, review_element) -> str:
        name_present = review_element.find("a", class_="link--header-color")
        name = name_present.text if name_present else "anonymous"
        return name
async def run(product_name: str):
    scraper = G2Scraper()
    try:
        print("Initializing scraper...")
        await scraper.initialize()
        print("Connected to Chrome successfully!")

        # Add a random initial delay before starting
        await asyncio.sleep(random.uniform(5, 10))

        product_url = f"https://www.g2.com/products/{product_name}/reviews"
        reviews = await scraper.get_product_reviews(product_url, num_pages=10)

        filename = None
        if reviews:
            # Make sure the output directory exists before writing the CSV
            os.makedirs("csv", exist_ok=True)
            df = pd.DataFrame(reviews)
            filename = f'csv/g2_reviews_{product_name}_{datetime.now().strftime("%Y-%m-%d")}.csv'
            df.to_csv(filename, index=False)
            print(f"Successfully saved {len(reviews)} reviews to {filename}")
        else:
            print("No reviews were collected")

        return filename
    except Exception as e:
        print(f"An error occurred: {e}")
        raise
    finally:
        # Clean up in reverse order of creation, tolerating partial initialization
        try:
            if scraper.context:
                await scraper.context.close()
            if scraper.browser:
                await scraper.browser.close()
            if scraper.playwright:
                await scraper.playwright.stop()
        except Exception as e:
            print(f"Error during cleanup: {e}")
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        print("Usage: python scrape.py <product_name>")
        sys.exit(1)

    asyncio.run(run(sys.argv[1]))
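# Example invocation (assumes BROWSERBASE_API_KEY is exported; the product slug
# "slack" is only illustrative, not taken from this script):
#   BROWSERBASE_API_KEY=... python scrape.py slack
# On success this writes a file such as csv/g2_reviews_slack_<date>.csv.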