From d13ab778b232d9d61ab89aabc7ec3de7a848908e Mon Sep 17 00:00:00 2001 From: Rohit Nair Date: Wed, 9 Oct 2024 21:55:17 +0530 Subject: [PATCH 1/2] rewrote the package, added download feature too --- package-lock.json | 4 +- src/example.js | 6 +- src/google/index.js | 175 ++++++++++++++++++++++++++++++++++ src/google/scraper.js | 215 ------------------------------------------ src/logger.js | 18 ++-- 5 files changed, 190 insertions(+), 228 deletions(-) create mode 100644 src/google/index.js delete mode 100644 src/google/scraper.js diff --git a/package-lock.json b/package-lock.json index 9757f5e..ad123b5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "images-scraper", - "version": "6.4.3", + "version": "6.4.6", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "images-scraper", - "version": "6.4.3", + "version": "6.4.6", "license": "ISC", "dependencies": { "cheerio": "^1.0.0-rc.12", diff --git a/src/example.js b/src/example.js index 331a4bd..6d3a548 100644 --- a/src/example.js +++ b/src/example.js @@ -1,10 +1,10 @@ 'use strict'; -var Scraper = require('./google/scraper'); +const Scraper = require('./google'); -let google = new Scraper(); +const google = new Scraper(); (async () => { - const results = await google.scrape('banana', 10); // Or ['banana', 'strawberry'] for multi-queries + const results = await google.downloadImages('cat', 10); console.log('results', results); })(); diff --git a/src/google/index.js b/src/google/index.js new file mode 100644 index 0000000..8fb7c6a --- /dev/null +++ b/src/google/index.js @@ -0,0 +1,175 @@ +'use strict'; + +const puppeteer = require('puppeteer'); +const fs = require("fs"); +const axios = require("axios"); +const path = require('path'); +const logger = require('../logger'); + +/** + * @param {string | array} userAgent user agent + * @param {object} puppeteer puppeteer options + * @param {object} tbs extra options for TBS request parameter + */ +class GoogleScraper { + constructor({ + userAgent = [ + 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15', + 'Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1', + 'Mozilla/5.0 (Linux; Android 10; SM-G970F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Mobile Safari/537.36', + ], + scrollDelay = 500, + puppeteer = { headless: true }, + tbs = {}, + safe = false, + } = {}) { + this.userAgent = Array.isArray(userAgent) + ? userAgent[Math.floor(Math.random() * userAgent.length)] + : userAgent; + this.scrollDelay = scrollDelay; + this.puppeteerOptions = puppeteer; + this.tbs = this._parseRequestParameters(tbs); + this.safe = this._isQuerySafe(safe); + this.browser = null; + } + + /** + * Method to download images based on query + * @param {string | string[]} queries + * @param {number} limit + * @param {string} directory + * @returns {object} + */ + async downloadImages(queries, limit = 5, directory = 'downloads') { + const downloadFolder = path.join(process.cwd(), directory); + + if (!fs.existsSync(downloadFolder)) { + fs.mkdirSync(downloadFolder); + } + + const imageUrls = await this.getImageUrl(queries, limit); + + for (const queryKey in imageUrls) { + const imageUrlList = imageUrls[queryKey]; + for (let i = 0; i < imageUrlList.length; i++) { + const { url } = imageUrlList[i]; + let extension = '.jpg'; + try { + const response = await axios.head(url); + const contentType = response.headers['content-type']; + if (contentType) { + if (contentType.includes('image/jpeg')) extension = '.jpg'; + else if (contentType.includes('image/png')) extension = '.png'; + else if (contentType.includes('image/gif')) extension = '.gif'; + else if (contentType.includes('image/webp')) extension = '.webp'; + } + } catch (error) { + logger.info(`Error fetching headers for ${url}: ${error.message}`); + } + const fileName = `${queryKey}_${i + 1}${extension}`; + const queryDownloadPath = path.join(downloadFolder, queryKey); + if (!fs.existsSync(queryDownloadPath)) { + fs.mkdirSync(queryDownloadPath); + } + + const filePath = path.join(queryDownloadPath, fileName); + + try { + const imageResponse = await axios.get(url, { responseType: 'arraybuffer' }); + fs.writeFileSync(filePath, imageResponse.data); + logger.info(`Downloaded ${fileName}`); + } catch (error) { + logger.error(`Error downloading image from ${url}: ${error.message}`); + } + } + logger.info(`Saved files at ${downloadFolder}`); + } + + return imageUrls; + } + + /** + * Method to get an object with image urls + * @param {string | string[]} queries + * @param {number} limit + * @returns {object} + */ + async getImageUrl(queries, limit = 5) { + try { + const browser = await puppeteer.launch({ ...this.puppeteerOptions }); + const page = await browser.newPage(); + await page.setBypassCSP(true); + await page.setUserAgent(this.userAgent); + const queriesIsArray = Array.isArray(queries); + let imageUrlObject = {}; + + /** + * Used for DRY + * @param {string} query + */ + const getUrls = async (query) => { + const pageUrl = `https://www.google.com/search?${this.safe}&source=lnms&tbs=${this.tbs}&tbm=isch&q=${this._parseRequestQueries(query)}`; + logger.debug(pageUrl); + await page.goto(pageUrl); + + await page.evaluate(async () => { + for (let i = 0; i < 10; i++) { + window.scrollBy(0, window.innerHeight); + await new Promise(resolve => setTimeout(resolve, this.scrollDelay)); + } + }); + + await page.waitForSelector('img'); + + const images = await page.evaluate(() => { + const imageElements = document.querySelectorAll('img'); + return Array.from(imageElements) + .map(img => img.src) + .filter(url => url.startsWith('http') && !url.includes('google')); + }); + + const queryKey = query.replace(/\s/g, ''); + imageUrlObject[queryKey] = images.slice(0, limit).map(url => ({ query, url })); + } + + if (queriesIsArray) { + for (const query of queries) { + await getUrls(query); + } + } else { + await getUrls(queries); + } + + await browser.close(); + return imageUrlObject; + + } catch (err) { + logger.error('An error occurred:', err); + } + } + + _parseRequestParameters(tbs) { + if (!tbs) { + return ''; + } + + return encodeURIComponent( + Object.entries(tbs) + .filter(([, value]) => value) + .map(([key, value]) => `${key}:${value}`) + .join(',') + ); + } + + _parseRequestQueries(query) { + return query ? encodeURIComponent(query) : ''; + } + + _isQuerySafe(safe) { + return safe ? '&safe=active' : ''; + } +} + +module.exports = GoogleScraper; diff --git a/src/google/scraper.js b/src/google/scraper.js deleted file mode 100644 index 1881966..0000000 --- a/src/google/scraper.js +++ /dev/null @@ -1,215 +0,0 @@ -'use strict'; - -const puppeteer = require('puppeteer'); -const cheerio = require('cheerio'); -const url = require('url'); -const logger = require('../logger'); - -/** - * @param {string} userAgent user agent - * @param {object} puppeteer puppeteer options - * @param {object} tbs extra options for TBS request parameter - */ -class GoogleScraper { - constructor({ - userAgent = 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0', - scrollDelay = 500, - puppeteer = {}, - tbs = {}, - safe = false, - } = {}) { - this.userAgent = userAgent; - this.scrollDelay = scrollDelay; - this.puppeteerOptions = puppeteer; - this.tbs = this._parseRequestParameters(tbs); - this.safe = this._isQuerySafe(safe); - this.browser = null; - } - - _parseRequestParameters(tbs) { - if (tbs === undefined) { - return ''; - } - - const options = Object.keys(tbs) - .filter((key) => tbs[key]) - .map((key) => `${key}:${tbs[key]}`) - .join(','); - return encodeURIComponent(options); - } - - _isQuerySafe(safe) { - if (safe === true) { - return '&safe=active'; - } else return ''; - } - - async _scrapePage(searchQuery, limit = 100) { - const query = `https://www.google.com/search?${this.safe}&source=lnms&tbm=isch&sa=X&tbs=${this.tbs}&q=${searchQuery}`; - - logger.debug(`Start Google search for "${searchQuery}"`); - - const page = await this.browser.newPage(); - await page.setBypassCSP(true); - await page.goto(query, { - waitUntil: 'networkidle0', - }); - - const [button] = await page.$x("//button[contains(., 'Accept all')]"); - if (button) { - await button.click(); - await page.waitForNavigation({ - waitUntil: 'networkidle0', - }); - } - - await page.setViewport({ width: 1920, height: 1080 }); - await page.setUserAgent(this.userAgent); - - let results = []; - let previousCount = -1; - while (results.length < limit) { - await this._scrollToEnd(page); - await this._clickAllImages(page); - await page - .waitForTimeout("#islrg a[href^='/imgres']", { timeout: 1000 }) // Wait for the selector to appear in page. - .catch(() => logger.debug('No results on this page')); // Unblock the flow - - const html = await page.content(); - const links = this._parseLinksFromHTML(html); - previousCount = results.length; - results = links.slice(0, limit); - if (previousCount === results.length) { - logger.debug('End of the page is reached'); - break; - } - - logger.debug(`Got ${results.length} results so far`); - } - - await page.close(); - - return results; - } - - async scrape(searchQuery, limit) { - if (searchQuery === undefined || searchQuery === '') { - throw new Error('Invalid search query provided'); - } - - this.browser = await puppeteer.launch({ - ...this.puppeteerOptions, - }); - - let results = []; - - if (Array.isArray(searchQuery)) { - const promises = searchQuery.map(async (query) => { - const images = await this._scrapePage(query, limit); - return { query, images }; - }); - results = await Promise.all(promises); - } else { - results = await this._scrapePage(searchQuery, limit); - } - await this.browser.close(); - return results; - } - - /** - * Scroll to the end of the page. - * @param {page} Puppeteer page to scroll - */ - async _scrollToEnd(page) { - logger.debug('Scrolling to the end of the page'); - - const isScrollable = await this._isScrollable(page); - if (!isScrollable) { - logger.debug('No results on this page'); - return; - } - - const buttonIsVisible = await this._isButtonVisible(page); - await page.evaluate('window.scrollTo(0, document.body.scrollHeight)'); - logger.debug(`Scrolled to bottom of the page`); - - if (buttonIsVisible) { - await page.click("#islmp input[type='button']"); - logger.debug('Clicked on show more results'); - } - - await page.waitForTimeout(this.scrollDelay); - await page.evaluate('window.scrollTo(0, document.body.scrollHeight)'); - } - - _isScrollable(page) { - return page.evaluate(() => { - return document.querySelector("#islmp input[type='button']") !== null; - }); - } - - _isButtonVisible(page) { - return page.evaluate(() => { - function isVisible(e) { - return !!(e.offsetWidth || e.offsetHeight || e.getClientRects().length); - } - return isVisible(document.querySelector("#islmp input[type='button']")); - }); - } - - async _clickAllImages(page) { - logger.debug('Scrolling to the end of the page'); - return page.evaluate(() => { - let elements = document.querySelectorAll('#islrg img'); - - function rightClick(element) { - return new Promise((resolve) => { - let event = new MouseEvent('mousedown', { - bubbles: true, - cancelable: false, - view: window, - button: 2, - buttons: 2, - clientX: element.getBoundingClientRect().x, - clientY: element.getBoundingClientRect().y, - }); - element.dispatchEvent(event); - resolve(); - }); - } - - async function rightClickAll(elements) { - for (const element of elements) { - await rightClick(element); - } - } - rightClickAll(elements); - }); - } - - _parseLinksFromHTML(html) { - const links = []; - - const $ = cheerio.load(html); - - $('#islrg div[jsaction][data-tbnid]').each(function (_i, containerElement) { - const containerElement_ = $(containerElement); - const linkElementHrefExpectedSelectors = ["a[href*='/imgres']", 'a[jsaction]']; - const linkElementHref = linkElementHrefExpectedSelectors - .map((s) => containerElement_.find(s).attr('href')) - .find((e) => e); - if (linkElementHref) { - // linkElementHref could be undefined - const imageElementAlt = containerElement_.find('img').attr('alt'); - const parsedLink = url.parse(linkElementHref, { parseQueryString: true }); - const imageurl = parsedLink.query.imgurl; - const source = parsedLink.query.imgrefurl; - links.push({ url: imageurl, source, title: imageElementAlt }); - } - }); - - return links; - } -} - -module.exports = GoogleScraper; diff --git a/src/logger.js b/src/logger.js index 0ddce73..7375e0b 100644 --- a/src/logger.js +++ b/src/logger.js @@ -1,9 +1,11 @@ const winston = require('winston'); - -const consoleTransport = new winston.transports.Console(); - -const winstonOptions = { - level: process.env.LOG_LEVEL || 'info', - transports: [consoleTransport], -}; -module.exports = new winston.createLogger(winstonOptions); +module.exports = winston.createLogger({ + level: 'info', + format: winston.format.combine( + winston.format.timestamp(), + winston.format.json() + ), + transports: [ + new winston.transports.Console(), + ], +}); From d01f2f10c2d16edd8ad34a622649f5e29c7c11f6 Mon Sep 17 00:00:00 2001 From: Rohit Nair Date: Wed, 9 Oct 2024 21:57:01 +0530 Subject: [PATCH 2/2] used const instead of var --- src/google/index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/google/index.js b/src/google/index.js index 8fb7c6a..c8678a3 100644 --- a/src/google/index.js +++ b/src/google/index.js @@ -103,7 +103,7 @@ class GoogleScraper { await page.setBypassCSP(true); await page.setUserAgent(this.userAgent); const queriesIsArray = Array.isArray(queries); - let imageUrlObject = {}; + const imageUrlObject = {}; /** * Used for DRY