Skip to content

Commit a7f17cb

Browse files
committed
Add initial scraper implementation
1 parent 6394df3 commit a7f17cb

14 files changed

+1580
-0
lines changed

Diff for: .flake8

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
[flake8]
2+
min_python_version = 3.11
3+
max-line-length = 88
4+
ban-relative-imports = true
5+
format-greedy = 1
6+
inline-quotes = double
7+
mypy-init-return = true
8+
enable-extensions = TC, TC1
9+
type-checking-exempt-modules = typing, typing-extensions
10+
eradicate-whitelist-extend = ^-.*;
11+
extend-ignore =
12+
E203,
13+
SIM106,
14+
E501,
15+
INP001,
16+
W503,
17+
ANN001,
18+
ANN201,
19+
ANN002,
20+
ANN003,
21+
ANN101,
22+
ANN102,
23+
ANN204,
24+
ANN205,
25+
FS001,
26+
SIM119,

Diff for: .gitignore

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# mypy cache
7+
.mypy_cache
8+
9+
# Environments
10+
.env
11+
.venv
12+
env/
13+
venv/
14+
ENV/
15+
env.bak/
16+
venv.bak/
17+
18+
# Generated CSV files
19+
*.csv
20+
21+
.vscode/

Diff for: .pre-commit-config.yaml

+77
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
repos:
2+
- repo: https://github.com/pre-commit/pre-commit-hooks
3+
rev: v4.6.0
4+
hooks:
5+
- id: trailing-whitespace
6+
- id: end-of-file-fixer
7+
- id: check-merge-conflict
8+
- id: check-case-conflict
9+
- id: check-json
10+
- id: check-toml
11+
- id: check-yaml
12+
- id: pretty-format-json
13+
args: [--autofix, --no-ensure-ascii, --no-sort-keys]
14+
- id: check-ast
15+
- id: debug-statements
16+
- id: check-docstring-first
17+
18+
- repo: https://github.com/pre-commit/pygrep-hooks
19+
rev: v1.10.0
20+
hooks:
21+
- id: python-check-mock-methods
22+
- id: python-use-type-annotations
23+
- id: python-check-blanket-type-ignore
24+
- id: python-check-blanket-noqa
25+
26+
- repo: https://github.com/asottile/yesqa
27+
rev: v1.5.0
28+
hooks:
29+
- id: yesqa
30+
additional_dependencies: &flake8_deps
31+
- flake8-annotations==3.1.1
32+
- flake8-broken-line==1.0.0
33+
- flake8-bugbear==24.4.26
34+
- flake8-comprehensions==3.14.0
35+
- flake8-eradicate==1.5.0
36+
- flake8-no-pep420==2.7.0
37+
- flake8-quotes==3.4.0
38+
- flake8-simplify==0.21.0
39+
- flake8-tidy-imports==4.10.0
40+
- flake8-typing-imports==1.15.0
41+
- flake8-use-fstring==1.4
42+
43+
- repo: https://github.com/pycqa/isort
44+
rev: 5.13.2
45+
hooks:
46+
- id: isort
47+
name: "isort (python)"
48+
types: [python]
49+
- id: isort
50+
name: "isort (pyi)"
51+
types: [pyi]
52+
args: [--lines-after-imports, "-1"]
53+
54+
- repo: https://github.com/psf/black
55+
rev: 24.4.2
56+
hooks:
57+
- id: black
58+
59+
- repo: https://github.com/pycqa/flake8
60+
rev: 7.1.0
61+
hooks:
62+
- id: flake8
63+
additional_dependencies: *flake8_deps
64+
65+
- repo: https://github.com/pre-commit/mirrors-mypy
66+
rev: v1.10.1
67+
hooks:
68+
- id: mypy
69+
args: [--python-version=3.11]
70+
pass_filenames: false
71+
additional_dependencies:
72+
- types-requests
73+
74+
- repo: https://github.com/pre-commit/pre-commit
75+
rev: v2.17.0
76+
hooks:
77+
- id: validate_manifest

Diff for: .python-version

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.11.4

Diff for: Makefile

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Makefile for running Amazon review scraper
2+
3+
4+
.PHONY: install
5+
install:
6+
pip install poetry==1.8.2
7+
poetry install
8+
9+
10+
.PHONY: scrape
11+
scrape:
12+
@if [ -z "$(ASIN_CODE)" ]; then \
13+
echo 'Error: An asin code of an Amazon product is required. Use make scrape ASIN_CODE="<asin_code>"'; \
14+
exit 1; \
15+
else \
16+
poetry run python -m amazon_review_scraper --asin-code="$(ASIN_CODE)"; \
17+
fi

Diff for: poetry.lock

+1,161
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: pyproject.toml

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
[tool.poetry]
2+
name = "amazon-review-scraper"
3+
version = "0.1.0"
4+
description = "A tool for scraping Amazon Reviews."
5+
packages = [
6+
{ include = "amazon_review_scraper", from = "src" },
7+
]
8+
authors = ["Ignas Šimkūnas <[email protected]>"]
9+
readme = "README.md"
10+
11+
[tool.poetry.dependencies]
12+
python = "^3.11"
13+
pydantic = "^2.7.4"
14+
click = "^8.1.7"
15+
pandas = "^2.2.2"
16+
selenium = "^4.22.0"
17+
webdriver-manager = "^4.0.1"
18+
pydantic-settings = "^2.3.4"
19+
blinker="<1.8.0"
20+
21+
22+
[build-system]
23+
requires = ["poetry-core"]
24+
build-backend = "poetry.core.masonry.api"
25+
26+
[tool.mypy]
27+
files = "."
28+
ignore_missing_imports = true
29+
follow_imports = "silent"
30+
show_error_codes = true
31+
warn_redundant_casts = true
32+
warn_unused_configs = true
33+
warn_unused_ignores = true
34+
disallow_incomplete_defs = true
35+
36+
[tool.isort]
37+
py_version = 311
38+
combine_as_imports = true
39+
profile = "black"
40+
lines_between_types = 1
41+
lines_after_imports = 2

Diff for: src/amazon_review_scraper/__init__.py

Whitespace-only changes.

Diff for: src/amazon_review_scraper/__main__.py

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
"""
2+
Main module for amazon_review_scraper.
3+
"""
4+
5+
import logging
6+
7+
import click
8+
9+
from amazon_review_scraper.collector import AmazonReviewDataCollector
10+
11+
12+
logging.basicConfig(level=logging.INFO)
13+
14+
15+
@click.command()
@click.option(
    "--asin-code",
    required=True,
    help="The ASIN code of the product for which to scrape Amazon reviews for.",
)
def scrape_amazon_reviews(asin_code: str) -> None:
    """CLI entry point: scrape Amazon reviews for the given ASIN and persist them to CSV."""
    AmazonReviewDataCollector().collect_amazon_review_data(asin_code)


if __name__ == "__main__":
    scrape_amazon_reviews()

Diff for: src/amazon_review_scraper/collector.py

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""
2+
Main module for collecting Amazon Review data.
3+
"""
4+
5+
import logging
6+
7+
from typing import List
8+
9+
import pandas as pd
10+
11+
from amazon_review_scraper.models import Review
12+
from amazon_review_scraper.scraper import AmazonReviewScraper
13+
14+
15+
DEFAULT_OUTPUT_FILE = "amazon_reviews.csv"


class AmazonReviewDataCollector:
    """Collects Amazon product reviews via AmazonReviewScraper and saves them to a CSV file."""

    def __init__(
        self,
        output_file: str | None = None,
        logger: logging.Logger | None = None,
    ) -> None:
        """
        Args:
            output_file: Destination CSV path; defaults to DEFAULT_OUTPUT_FILE.
            logger: Logger to use; defaults to this module's logger.
        """
        self._scraper = AmazonReviewScraper()
        self._output_file = output_file if output_file else DEFAULT_OUTPUT_FILE
        self._logger = logger if logger else logging.getLogger(__name__)

    def _save_to_csv(self, reviews: list[Review]) -> None:
        """Saves the given list of product reviews to a CSV file."""
        # Lazy %-style args: the message is only formatted if the level is enabled.
        self._logger.info(
            "Writing %d reviews to %s..", len(reviews), self._output_file
        )
        review_objects = [review.model_dump() for review in reviews]
        df = pd.DataFrame(review_objects)
        # index=False: don't emit pandas' positional index as a spurious unnamed column.
        df.to_csv(self._output_file, index=False)

    def collect_amazon_review_data(self, asin_code: str) -> None:
        """
        Scrapes reviews from a given Amazon product page based on given ASIN code and stores them in a CSV file.

        Args:
            asin_code (str): The ASIN code of the Amazon product for which to scrape reviews.
        """
        self._logger.info("Getting Amazon reviews for ASIN code %s..", asin_code)
        try:
            reviews = self._scraper.scrape_amazon_reviews(asin_code)
        except Exception:
            # Best-effort: log the full traceback and return instead of raising,
            # so one failing product doesn't abort a larger run.
            self._logger.exception(
                "Error when scraping Amazon reviews for product %s.", asin_code
            )
            return

        if not reviews:
            self._logger.info("No reviews found for given product %s.", asin_code)
            return

        self._save_to_csv(reviews)

Diff for: src/amazon_review_scraper/conf.py

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
"""
2+
Config module for amazon_review_scraper.
3+
"""
4+
5+
from pydantic_settings import BaseSettings
6+
7+
8+
class AmazonReviewScraperSettings(BaseSettings):
    """Settings class for the Amazon Review Scraper."""

    def get_amazon_product_url(self, asin_code: str) -> str:
        """Builds the amazon.com product-page URL for the given ASIN code."""
        base_url = "https://www.amazon.com/dp"
        return f"{base_url}/{asin_code}"


# Shared module-level settings instance used across the scraper.
amazon_review_scraper_settings = AmazonReviewScraperSettings()

Diff for: src/amazon_review_scraper/exception.py

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
"""
2+
Module for base exception class.
3+
"""
4+
5+
6+
class BaseException(Exception):
7+
"""Base exception class"""
8+
9+
message: str = ""
10+
11+
def __init__(self, message: str | None = None) -> None:
12+
super().__init__(message or self.message)

Diff for: src/amazon_review_scraper/models.py

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
"""
2+
Pydantic models for Amazon Review scraper.
3+
"""
4+
5+
from pydantic import BaseModel
6+
7+
8+
class Review(BaseModel):
    """A single Amazon product review.

    Field order is preserved deliberately: it determines the column
    order of the CSV produced from ``model_dump()``.
    """

    # Display name of the reviewer.
    author: str
    # Full review body text.
    content: str
    # Star rating as an integer.
    rating: int
    # Review headline.
    title: str

0 commit comments

Comments
 (0)