Dahua product information crawler
Language/environment: Python 3.7
The script walks the product listing pages on www.dahuatech.com, extracts the product IDs found on each listing, and saves every product's ajax spec page as an HTML file in a date-stamped output directory.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import time
import urllib.request

import requests
from bs4 import BeautifulSoup


def whtml(title, ajax_url):
    """Download the product's ajax page and save it as <now>\\<title>.html."""
    print(title, ajax_url)
    wp = urllib.request.urlopen(ajax_url)
    content = wp.read()
    if not os.path.exists(now):
        os.mkdir(now)
    name = now + "\\" + title + ".html"
    fp = open(name, "w+b")
    fp.write(content)
    fp.close()


def h2class(url_info, ajax_url):
    """Read the product info page, build filenames from its <h2> titles, and save the ajax page."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    req = urllib.request.Request(url=url_info, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    soup = BeautifulSoup(html, "html.parser")
    all_a = soup.find('div', class_='info-font fr').find_all('h2')
    now2 = '--' + time.strftime("%H%M%S", time.localtime(time.time()))
    for h2 in all_a:
        title = h2.get_text()
        # Replace characters that are awkward in filenames, then append a time suffix.
        title = re.sub('[/@#$]', '_', title) + now2
        whtml(title, ajax_url)


def dhinfo(url):
    """Collect the product IDs on one listing page and crawl each product."""
    data = requests.get(url).text
    dhinfo = re.findall(r"https://www.dahuatech.com/product/info/(\d+)\.html", data)
    # Drop duplicate IDs while keeping first-seen order.
    dhinfo1 = sorted(set(dhinfo), key=dhinfo.index)
    for url in dhinfo1:
        time.sleep(1)
        url_info = "https://www.dahuatech.com/product/info/%s.html" % url
        ajax_url = "https://www.dahuatech.com/ajax/product/%s/2" % url
        h2class(url_info, ajax_url)


def product():
    """Collect the listing-page URLs from the product index and crawl each listing."""
    data = requests.get('https://www.dahuatech.com/product.html').text
    product = re.findall(r"https://www.dahuatech.com/product/lists/[1-9]\d*\.html\?area=[1-9]\d*", data)
    # Drop duplicate listing URLs while keeping first-seen order.
    product1 = sorted(set(product), key=product.index)
    for url in product1:
        print(url)
        time.sleep(1)
        dhinfo(url)


if __name__ == '__main__':
    # Output directory name: 'DH' plus today's date, e.g. DH20190801.
    now = 'DH' + time.strftime("%Y%m%d", time.localtime(time.time()))
    product()
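The plain urllib download in whtml has no timeout or error handling, so a single slow or failed request stops the whole crawl. If that becomes a problem, the file-writing step could be swapped for a requests-based helper along these lines. This is only a sketch, not part of the original script: the function name save_ajax_html and the retries/timeout values are assumptions.

# Sketch of a more defensive download helper (assumed name and defaults).
import os
import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # any reasonable browser UA string

def save_ajax_html(out_dir, title, ajax_url, retries=3):
    """Fetch the ajax page with a timeout and write it into out_dir, retrying on failure."""
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, title + ".html")
    for attempt in range(retries):
        try:
            resp = requests.get(ajax_url, headers=HEADERS, timeout=10)
            resp.raise_for_status()
            with open(path, "wb") as fp:
                fp.write(resp.content)
            return path
        except requests.RequestException:
            time.sleep(2 * (attempt + 1))  # simple backoff before the next attempt
    return None  # give up after the configured number of retries

Using os.path.join instead of hard-coding "\\" also keeps the paths portable if the crawler is ever run outside Windows.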