
Commit 38b780d

Merge pull request #117 from arjuaman/master
Amazon Reviews Scraper for any product and putting it into a csv file
2 parents 0f38310 + 744547f

File tree

8 files changed, +131 -0 lines changed
@@ -0,0 +1,24 @@
## USE: As part of projects where you need to perform sentiment analysis on customer review data.

Scrapy is a web crawling framework that lets a developer write code defining how a particular site (or a group of websites) will be scraped.

### Steps:

1) Install Scrapy from conda-forge:

>> conda install -c conda-forge scrapy

If you would rather install it system-wide with pip, use:

>> pip install scrapy

2) Start a project:

>> scrapy startproject amazon_reviews_scraping

3) A spider is a chunk of Python code that determines how a web page will be scraped; it is the main component that crawls the page and extracts content from it.

Copy the link of the product reviews you want to scrape and run:

>> scrapy genspider amazon_review <your-link-here>

4) Now you'll need to define a Scrapy parser, which is already done in:

amazon_reviews_scraping/amazon_reviews_scraping/spiders/amazon_reviews.py

5) Run the following to store the results in a CSV file titled "reviews.csv" (rename it as you prefer); a quick way to inspect the output is sketched right after this README:

>> scrapy runspider amazon_reviews_scraping/amazon_reviews_scraping/spiders/amazon_reviews.py -o reviews.csv
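Once reviews.csv exists, a quick sanity check before the sentiment-analysis step is to load it and pull a numeric rating out of the star text. A minimal sketch, assuming pandas is installed, that the columns are the 'stars' and 'comment' keys the spider yields, and that the star text follows Amazon's usual "4.0 out of 5 stars" format (the 'rating' column name is my own):

# Minimal sketch, not part of this commit: inspect the scraped CSV.
# Assumes pandas is installed; 'stars' and 'comment' are the columns
# the spider yields, and the star text looks like "4.0 out of 5 stars".
import pandas as pd

reviews = pd.read_csv("reviews.csv")
print(reviews.shape)  # (number of reviews scraped, 2)

# Extract the leading number from the rating text for numeric analysis.
reviews["rating"] = reviews["stars"].str.extract(r"(\d+\.\d+)", expand=False).astype(float)
print(reviews[["rating", "comment"]].head())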
@@ -0,0 +1,7 @@
import scrapy


class AmazonReviewsScrapingItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
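The item class is left as the empty template in this commit; the spider below yields plain dicts instead. If you wanted typed items, a hypothetical way to fill it in would be to declare the two fields the spider yields (these field names are taken from the spider's output, not from this file):

# Hypothetical sketch, not part of this commit: declare the two fields the
# spider yields, so typed items (rather than plain dicts) can flow through
# pipelines and exporters.
import scrapy


class AmazonReviewsScrapingItem(scrapy.Item):
    stars = scrapy.Field()    # raw rating text, e.g. "4.0 out of 5 stars"
    comment = scrapy.Field()  # full review body text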
@@ -0,0 +1,59 @@
from scrapy import signals

from itemadapter import is_item, ItemAdapter


class AmazonReviewsScrapingSpiderMiddleware:
    # Default Scrapy spider-middleware template; all hooks are left as no-ops.

    @classmethod
    def from_crawler(cls, crawler):
        # Used by Scrapy to create the middleware and hook up signals.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response going through the spider middleware.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the spider.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when the spider or process_spider_input() raises an exception.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider.
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class AmazonReviewsScrapingDownloaderMiddleware:
    # Default Scrapy downloader-middleware template; all hooks are left as no-ops.

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Returning None lets the request continue through the chain.
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
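These template middlewares are inactive unless registered in settings.py, which this commit does not do. A sketch of how they could be enabled (543 is the priority value Scrapy's own generated settings template suggests; it is used here as an assumption):

# Hypothetical settings.py additions, not part of this commit: register the
# template middlewares so Scrapy actually runs them.
SPIDER_MIDDLEWARES = {
    'amazon_reviews_scraping.middlewares.AmazonReviewsScrapingSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'amazon_reviews_scraping.middlewares.AmazonReviewsScrapingDownloaderMiddleware': 543,
}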
@@ -0,0 +1,7 @@
from itemadapter import ItemAdapter


class AmazonReviewsScrapingPipeline:
    def process_item(self, item, spider):
        return item
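The pipeline is a pass-through in this commit. Since ''.join(...extract()) in the spider keeps the whitespace around the review text nodes, a natural extension would be a cleaning step; a hypothetical sketch (the 'stars' and 'comment' keys come from the spider's output, and the pipeline would still need to be registered in ITEM_PIPELINES):

# Hypothetical sketch, not part of this commit: trim the whitespace that
# joining the raw text nodes leaves around the scraped strings.
from itemadapter import ItemAdapter


class ReviewCleaningPipeline:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        for field in ('stars', 'comment'):  # keys the spider yields
            if adapter.get(field):
                adapter[field] = adapter[field].strip()
        return item

# Activating it would also require, in settings.py:
# ITEM_PIPELINES = {'amazon_reviews_scraping.pipelines.ReviewCleaningPipeline': 300}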
@@ -0,0 +1,6 @@
BOT_NAME = 'amazon_reviews_scraping'

SPIDER_MODULES = ['amazon_reviews_scraping.spiders']
NEWSPIDER_MODULE = 'amazon_reviews_scraping.spiders'

ROBOTSTXT_OBEY = True
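Since the spider below queues 120 start URLs against one domain, it can be worth throttling the crawl. These options are not in the commit; a sketch using standard Scrapy settings:

# Hypothetical additions, not part of this commit: standard Scrapy
# throttling options to keep the crawl polite.
DOWNLOAD_DELAY = 1                   # seconds between requests to the same domain
CONCURRENT_REQUESTS_PER_DOMAIN = 2   # cap parallel requests per domain
AUTOTHROTTLE_ENABLED = True          # back off automatically if the server slows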
@@ -0,0 +1 @@
# This package will contain the spiders of your Scrapy project
@@ -0,0 +1,27 @@
import scrapy


class AmazonReviewsSpider(scrapy.Spider):
    name = 'amazon_reviews'
    # Must match the domain in myBaseUrl (the original listed 'amazon.in',
    # which does not match the amazon.com URL below).
    allowed_domains = ['amazon.com']

    # Review-list URL for the product; one start URL is built per page.
    myBaseUrl = "https://www.amazon.com/OnePlus-Interstellar-Unlocked-Android-Smartphone/product-reviews/B0872473BF/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
    start_urls = []
    for i in range(1, 121):
        # Appending the bare page number would corrupt the query string;
        # Amazon paginates review lists with the pageNumber parameter
        # (assumed here to be the intended behavior).
        start_urls.append(myBaseUrl + "&pageNumber=" + str(i))

    def parse(self, response):
        # The container that holds the list of reviews on the page.
        data = response.css('#cm_cr-review_list')
        star_rating = data.css('.review-rating')
        comments = data.css('.review-text-content')

        # Pair each star rating with the review text at the same index.
        for count, review in enumerate(star_rating):
            yield {
                'stars': ''.join(review.xpath('.//text()').extract()),
                'comment': ''.join(comments[count].xpath('.//text()').extract()),
            }
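One thing to note about parse: it pairs ratings and comments by list index, which assumes both selector lists are the same length on every page. A hypothetical, functionally equivalent variant (not part of this commit) zips the two lists so a page with fewer comment nodes than rating nodes cannot raise an IndexError:

# Hypothetical variant, not part of this commit: zip stops at the shorter
# list, so mismatched selector lists end the pairing instead of crashing.
def parse(self, response):
    data = response.css('#cm_cr-review_list')
    for rating, comment in zip(data.css('.review-rating'),
                               data.css('.review-text-content')):
        yield {
            'stars': ''.join(rating.xpath('.//text()').extract()),
            'comment': ''.join(comment.xpath('.//text()').extract()),
        }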
