import csv
from urllib.request import urlopen

import bs4
from bs4 import BeautifulSoup as soup

# Scrape the Newegg graphics-card listing page and save, for every product
# tile, its brand, product name and shipping text to products.csv (the same
# values are also echoed to stdout).
my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'

# opening url and grabbing the web page; the context manager closes the
# connection even when read() raises
with urlopen(my_url) as uClient:
    page_html = uClient.read()

# html parsing
page_soup = soup(page_html, 'html.parser')

# grabbing all containers with class name = item-container
containers = page_soup.findAll('div', {'class': 'item-container'})

filename = "products.csv"
headers = "brands, product_name, shipping\n"

# newline='' is what the csv module asks for; lineterminator='\n' keeps the
# row endings the script has always produced. UTF-8 is set explicitly so that
# non-ASCII product names cannot fail on a narrow locale encoding.
with open(filename, 'w', newline='', encoding='utf-8') as f:
    f.write(headers)
    writer = csv.writer(f, lineterminator='\n')

    for container in containers:
        # The brand is the title of the logo image nested in the tile. A tile
        # lacking any element on that path yields an empty brand instead of
        # aborting the whole run.
        try:
            brand = container.div.div.a.img['title']
        except (AttributeError, TypeError, KeyError):
            brand = ''

        title_container = container.findAll('a', {'class': 'item-title'})
        product_name = title_container[0].text if title_container else ''

        ship_container = container.findAll('li', {'class': 'price-ship'})
        # use strip() to remove blank spaces before and after text
        shipping = ship_container[0].text.strip() if ship_container else ''

        print("brand:" + brand)
        print("product_name:" + product_name)
        print("shipping:" + shipping)

        # Commas in the product name are still turned into '|' so existing
        # consumers see the same value; csv.writer additionally quotes any
        # field (brand, shipping) that contains a delimiter, quote or newline.
        writer.writerow([brand, product_name.replace(',', '|'), shipping])

# 0 commit comments