Dahua product information crawler
Language/environment: Python 3.7
The script walks the product listing pages on www.dahuatech.com, extracts the product IDs found on each listing, and saves every product's ajax spec page as an HTML file in a date-stamped output directory.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import time
import urllib.request

import requests
from bs4 import BeautifulSoup


def whtml(title, ajax_url):
    """Download the product's ajax page and save it as <now>\\<title>.html."""
    print(title, ajax_url)
    wp = urllib.request.urlopen(ajax_url)
    content = wp.read()
    if not os.path.exists(now):
        os.mkdir(now)
    name = now + "\\" + title + ".html"
    fp = open(name, "w+b")
    fp.write(content)
    fp.close()


def h2class(url_info, ajax_url):
    """Read the product info page, build filenames from its <h2> titles, and save the ajax page."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    req = urllib.request.Request(url=url_info, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    soup = BeautifulSoup(html, "html.parser")
    all_a = soup.find('div', class_='info-font fr').find_all('h2')
    now2 = '--' + time.strftime("%H%M%S", time.localtime(time.time()))
    for h2 in all_a:
        title = h2.get_text()
        # Replace characters that are awkward in filenames, then append a time suffix.
        title = re.sub('[/@#$]', '_', title) + now2
        whtml(title, ajax_url)


def dhinfo(url):
    """Collect the product IDs on one listing page and crawl each product."""
    data = requests.get(url).text
    dhinfo = re.findall(r"https://www.dahuatech.com/product/info/(\d+)\.html", data)
    # Drop duplicate IDs while keeping first-seen order.
    dhinfo1 = sorted(set(dhinfo), key=dhinfo.index)
    for url in dhinfo1:
        time.sleep(1)
        url_info = "https://www.dahuatech.com/product/info/%s.html" % url
        ajax_url = "https://www.dahuatech.com/ajax/product/%s/2" % url
        h2class(url_info, ajax_url)


def product():
    """Collect the listing-page URLs from the product index and crawl each listing."""
    data = requests.get('https://www.dahuatech.com/product.html').text
    product = re.findall(r"https://www.dahuatech.com/product/lists/[1-9]\d*\.html\?area=[1-9]\d*", data)
    # Drop duplicate listing URLs while keeping first-seen order.
    product1 = sorted(set(product), key=product.index)
    for url in product1:
        print(url)
        time.sleep(1)
        dhinfo(url)


if __name__ == '__main__':
    # Output directory name: 'DH' plus today's date, e.g. DH20190801.
    now = 'DH' + time.strftime("%Y%m%d", time.localtime(time.time()))
    product()
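The plain urllib download in whtml has no timeout or error handling, so a single slow or failed request stops the whole crawl. If that becomes a problem, the file-writing step could be swapped for a requests-based helper along these lines. This is only a sketch, not part of the original script: the function name save_ajax_html and the retries/timeout values are assumptions.

# Sketch of a more defensive download helper (assumed name and defaults).
import os
import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # any reasonable browser UA string

def save_ajax_html(out_dir, title, ajax_url, retries=3):
    """Fetch the ajax page with a timeout and write it into out_dir, retrying on failure."""
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, title + ".html")
    for attempt in range(retries):
        try:
            resp = requests.get(ajax_url, headers=HEADERS, timeout=10)
            resp.raise_for_status()
            with open(path, "wb") as fp:
                fp.write(resp.content)
            return path
        except requests.RequestException:
            time.sleep(2 * (attempt + 1))  # simple backoff before the next attempt
    return None  # give up after the configured number of retries

Using os.path.join instead of hard-coding "\\" also keeps the paths portable if the crawler is ever run outside Windows.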