A summary of web crawler knowledge points.

The basic workflow of a web crawler is as follows:

1. Select the seed URL;

2. Put these URLs into the URL queue to be crawled;

3. Take out the URL to be crawled from the queue of URLs to be crawled. Resolve DNS, get the host’s ip, download the web page corresponding to the URL, and store it in the downloaded web page library.

4. Analyze the pages for the URLs in the crawled-URL queue, extract the other URLs they contain, and put those URLs into the URL queue to be crawled, thus entering the next cycle.

When extracting data by matching tags, there are three common approaches: re (regular expressions), XPath, and BeautifulSoup4.

I recommend learning regular-expression matching with re, because on some websites XPath and BeautifulSoup4 are not as efficient as regular-expression matching.

I got this understanding when I crawled the Youku website. Regular matching will accurately filter out the corresponding data.

If you are a beginner, here is a first trick for not getting your IP blocked: exit(-1).

The second trick is to use a breakpoint in the crawler. First set a breakpoint so that only one piece of data is crawled, so that the website does not think you are a crawler. Once all the logic has been confirmed to work, remove the breakpoint and crawl the data you want.

Here is the code for crawling a video:

# Import packages
import requests
import re
from lxml import etree
import os
class PearVideo(object):
    """Small crawler for pearvideo.com.

    Pages are cached on disk so that repeated runs read the local copy
    instead of requesting the site again, then one video is downloaded.
    """

    # Define the capture method
    def get_content(self, url, type):
        """Return the HTML of ``url``, reading from the on-disk cache if present.

        Args:
            url: Address of the page to download on a cache miss.
            type: ``'index'`` uses the cache file ``test_pear.html``; any
                other value uses ``inner_pear.html``.

        Returns:
            The page source as text, whether it came from the cache file
            or from a fresh download.

        Note:
            The cache file name depends only on ``type``, not on ``url``;
            delete the cache file to force a new download.
        """
        if type == 'index':
            fil_name = 'test_pear.html'
        else:
            fil_name = 'inner_pear.html'
        path = './' + fil_name

        # Use the os module to determine whether the file exists
        if os.path.exists(path):
            with open(path, encoding='utf-8') as f:
                return f.read()

        # Send http request; the timeout keeps a stalled connection from
        # hanging forever, and the status check keeps error pages out of
        # the cache.
        r = requests.get(url, timeout=30)
        r.raise_for_status()

        # Decoding
        html = r.content.decode('utf-8')

        # Write file
        with open(path, 'w', encoding='utf-8') as f:
            f.write(html)

        # Return the downloaded page too, so the first run (cache miss)
        # yields the same result as later runs (cache hit).
        return html

    # Define the data matching method
    def get_xpath(self, html, index=8):
        """Extract the inner-page links from ``html`` and download one video.

        Args:
            html: Source of the index page.
            index: Position in the extracted link list of the inner page
                to crawl (defaults to 8).

        Raises:
            IndexError: If ``index`` is outside the extracted link list,
                or if the inner page contains no ``srcUrl="..."`` entry.
        """
        # Conversion format
        tree = etree.HTML(html)
        hrefs = tree.xpath("//div[@class='actcontbd']/a/@href")

        # Process the inner page URL
        url_list = ['https://www.pearvideo.com/' + item for item in hrefs]

        # Crawling the inner page
        try:
            url_page = url_list[index]
        except IndexError:
            raise IndexError(
                'inner page %d requested but only %d links were found'
                % (index, len(url_list))
            ) from None
        inner_html = self.get_content(url_page, 'inner')

        # Match the real video address (search once, reuse the result)
        regex = re.compile(r'srcUrl="(.+?)"')
        video_urls = regex.findall(inner_html)
        print(video_urls)
        if not video_urls:
            raise IndexError('no srcUrl entry found in the inner page')

        # Download video
        r = requests.get(video_urls[0], timeout=60)
        r.raise_for_status()
        # "wb" rather than "ab": appending on a re-run would concatenate a
        # second copy of the video onto the existing file.
        with open("./test_pear.mp4", "wb") as f:
            f.write(r.content)

if __name__ == "__main__":
    # Instantiate an object
    pearvideo = PearVideo()
    # Fetch the index page (read from test_pear.html when it already exists)
    html = pearvideo.get_content('https://www.pearvideo.com/', 'index')
    # Extract the inner-page links and download the video
    pearvideo.get_xpath(html)
If anything here needs improvement, suggestions from more experienced developers are welcome.

# Import packages
import requests
import re
from lxml import etree
import os
class PearVideo(object):
    """Small crawler for pearvideo.com.

    Pages are cached on disk so that repeated runs read the local copy
    instead of requesting the site again, then one video is downloaded.
    """

    # Define the capture method
    def get_content(self, url, type):
        """Return the HTML of ``url``, reading from the on-disk cache if present.

        Args:
            url: Address of the page to download on a cache miss.
            type: ``'index'`` uses the cache file ``test_pear.html``; any
                other value uses ``inner_pear.html``.

        Returns:
            The page source as text, whether it came from the cache file
            or from a fresh download.

        Note:
            The cache file name depends only on ``type``, not on ``url``;
            delete the cache file to force a new download.
        """
        if type == 'index':
            fil_name = 'test_pear.html'
        else:
            fil_name = 'inner_pear.html'
        path = './' + fil_name

        # Use the os module to determine whether the file exists
        if os.path.exists(path):
            with open(path, encoding='utf-8') as f:
                return f.read()

        # Send http request; the timeout keeps a stalled connection from
        # hanging forever, and the status check keeps error pages out of
        # the cache.
        r = requests.get(url, timeout=30)
        r.raise_for_status()

        # Decoding
        html = r.content.decode('utf-8')

        # Write file
        with open(path, 'w', encoding='utf-8') as f:
            f.write(html)

        # Return the downloaded page too, so the first run (cache miss)
        # yields the same result as later runs (cache hit).
        return html

    # Define the data matching method
    def get_xpath(self, html, index=8):
        """Extract the inner-page links from ``html`` and download one video.

        Args:
            html: Source of the index page.
            index: Position in the extracted link list of the inner page
                to crawl (defaults to 8).

        Raises:
            IndexError: If ``index`` is outside the extracted link list,
                or if the inner page contains no ``srcUrl="..."`` entry.
        """
        # Conversion format
        tree = etree.HTML(html)
        hrefs = tree.xpath("//div[@class='actcontbd']/a/@href")

        # Process the inner page URL
        url_list = ['https://www.pearvideo.com/' + item for item in hrefs]

        # Crawling the inner page
        try:
            url_page = url_list[index]
        except IndexError:
            raise IndexError(
                'inner page %d requested but only %d links were found'
                % (index, len(url_list))
            ) from None
        inner_html = self.get_content(url_page, 'inner')

        # Match the real video address (search once, reuse the result)
        regex = re.compile(r'srcUrl="(.+?)"')
        video_urls = regex.findall(inner_html)
        print(video_urls)
        if not video_urls:
            raise IndexError('no srcUrl entry found in the inner page')

        # Download video
        r = requests.get(video_urls[0], timeout=60)
        r.raise_for_status()
        # "wb" rather than "ab": appending on a re-run would concatenate a
        # second copy of the video onto the existing file.
        with open("./test_pear.mp4", "wb") as f:
            f.write(r.content)

# Import packages

import requests

import re

from lxml import etree

import os

class PearVideo(object):
    """Small crawler for pearvideo.com.

    Pages are cached on disk so that repeated runs read the local copy
    instead of requesting the site again, then one video is downloaded.
    """

    # Define the capture method
    def get_content(self, url, type):
        """Return the HTML of ``url``, reading from the on-disk cache if present.

        Args:
            url: Address of the page to download on a cache miss.
            type: ``'index'`` uses the cache file ``test_pear.html``; any
                other value uses ``inner_pear.html``.

        Returns:
            The page source as text, whether it came from the cache file
            or from a fresh download.

        Note:
            The cache file name depends only on ``type``, not on ``url``;
            delete the cache file to force a new download.
        """
        if type == 'index':
            fil_name = 'test_pear.html'
        else:
            fil_name = 'inner_pear.html'
        path = './' + fil_name

        # Use the os module to determine whether the file exists
        if os.path.exists(path):
            with open(path, encoding='utf-8') as f:
                return f.read()

        # Send http request; the timeout keeps a stalled connection from
        # hanging forever, and the status check keeps error pages out of
        # the cache.
        r = requests.get(url, timeout=30)
        r.raise_for_status()

        # Decode
        html = r.content.decode('utf-8')

        # Write file
        with open(path, 'w', encoding='utf-8') as f:
            f.write(html)

        # Return the downloaded page too, so the first run (cache miss)
        # yields the same result as later runs (cache hit).
        return html

    # Define the data matching method
    def get_xpath(self, html, index=8):
        """Extract the inner-page links from ``html`` and download one video.

        Args:
            html: Source of the index page.
            index: Position in the extracted link list of the inner page
                to crawl (defaults to 8).

        Raises:
            IndexError: If ``index`` is outside the extracted link list,
                or if the inner page contains no ``srcUrl="..."`` entry.
        """
        # Conversion format
        tree = etree.HTML(html)
        hrefs = tree.xpath("//div[@class='actcontbd']/a/@href")

        # Process the inner page URL
        url_list = ['https://www.pearvideo.com/' + item for item in hrefs]

        # Crawling the inner page
        try:
            url_page = url_list[index]
        except IndexError:
            raise IndexError(
                'inner page %d requested but only %d links were found'
                % (index, len(url_list))
            ) from None
        inner_html = self.get_content(url_page, 'inner')

        # Match the real video address (search once, reuse the result)
        regex = re.compile(r'srcUrl="(.+?)"')
        video_urls = regex.findall(inner_html)
        print(video_urls)
        if not video_urls:
            raise IndexError('no srcUrl entry found in the inner page')

        # Download video
        r = requests.get(video_urls[0], timeout=60)
        r.raise_for_status()
        # "wb" rather than "ab": appending on a re-run would concatenate a
        # second copy of the video onto the existing file.
        with open("./test_pear.mp4", "wb") as f:
            f.write(r.content)

if __name__ == "__main__":
    # Instantiate an object
    pearvideo = PearVideo()
    # Fetch the index page (read from test_pear.html when it already exists)
    html = pearvideo.get_content('https://www.pearvideo.com/', 'index')
    # Extract the inner-page links and download the video
    pearvideo.get_xpath(html)
If anything here needs improvement, suggestions from more experienced developers are welcome.

if __name__ == "__main__":
    # Instantiate an object
    pearvideo = PearVideo()
    # Fetch the index page (read from test_pear.html when it already exists)
    html = pearvideo.get_content('https://www.pearvideo.com/', 'index')
    # Extract the inner-page links and download the video
    pearvideo.get_xpath(html)

If anything here needs improvement, suggestions from more experienced developers are welcome.

WordPress database error: [Table 'yf99682.wp_s6mz6tyggq_comments' doesn't exist]
SELECT SQL_CALC_FOUND_ROWS wp_s6mz6tyggq_comments.comment_ID FROM wp_s6mz6tyggq_comments WHERE ( comment_approved = '1' ) AND comment_post_ID = 2033 ORDER BY wp_s6mz6tyggq_comments.comment_date_gmt ASC, wp_s6mz6tyggq_comments.comment_ID ASC

Leave a Comment

Your email address will not be published.