-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcatcher_eurocvbay_parts.py
84 lines (77 loc) · 3.19 KB
/
catcher_eurocvbay_parts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import re

import pymongo
from bs4 import BeautifulSoup
from selenium import webdriver
current_version = 1
client = pymongo.MongoClient('mongodb://localhost:27017/')
catcher_db = client['truck_catcher_db']
id_collect = catcher_db['id_collect']
# truck_model = catcher_db["truck_model"]
truck_model = catcher_db["truck_model_new"]
truck_model_detail = catcher_db["truck_model_detail"]
engine_model = catcher_db["engine_model"]
engine_model_detail = catcher_db["engine_model_detail_new"]
def init_engine(url):
driver = webdriver.Chrome()
driver.get(url)
html_str = BeautifulSoup(driver.page_source, 'html.parser')
product_list_ul = html_str.findAll(attrs={"class": "products-list"})[0]
products_li = product_list_ul.findAll("li")
for product_li in products_li:
engine = {}
product_name_a = product_li.find("h5").find("a")
product_index_href = product_name_a.attrs["href"]
product_index_name = product_name_a.text
product_span = product_li.findAll(
attrs={"class": "content"})[0].find("span")
product_items = product_span.findAll("p")
for product_item in product_items:
item_type = product_item.find("span")
engine[item_type.next_element.strip(
)] = item_type.next_sibling.strip()
engine['product_index_href'] = product_index_href
engine['product_index_name'] = product_index_name
engine['_id'] = get_next_id('engine_model')
engine['version'] = current_version
engine_model.insert(engine)
# links = html_str.findAll(attrs={"class": "pages-wd"})
# link = links[0].attrs["href"]
# init_engine('https://product.360che.com' + link)
def eurocvbay_parts_init(url):
print("current url is" + url)
html = gethtml(url)
# f = requests.get(url)
# html = BeautifulSoup(f.content, "html.parser")
part = {}
product_name = html.find(attrs={
"class": "J_productTitle title g_minor"
}).find("span").text
detailedDesc_container = html.find(attrs={"id": "detailedDesc"})
product_detail = detailedDesc_container.find("table")
replaces = []
if product_detail:
trs = product_detail.findAll("tr")
if len(trs) > 0:
for index in range(len(trs)):
if index == 0:
continue
replace = {}
tds = trs[index]
if (len(tds) < 4):
continue
replace["brand"] = tds.contents[0].text
replace["replace_prod_no"] = tds.contents[1].text
replace["properties"] = tds.contents[2].text
replace["data"] = tds.contents[3].text
replaces.insert(index, replace)
part["product_name"] = product_name
part["replaces"] = replaces
part["_id"] = getNextValue("eurocvbay_parts")
part["url"] = url
eurocvbay_parts.insert(part)
page_nav_container = html.findAll(attrs={"class": "pagenation2"})
for page_nav in page_nav_container:
tag = page_nav.find_all(text=re.compile('下一个*'))
if tag:
page_nav = page_nav.findAll("a")
url = page_nav[1].attrs["href"]
eurocvbay_parts_init("http://www.eurocvbay.com/" + url)