Scrapy components: request passing with meta, POST requests, and middleware

post request

Use the post request in the scrapy component to call

def start_requests(self): 
Transfer parameters and then return
yield scrapy.FormRequest(url=url,formdata=data,callback=self.parse)
Make a post request, where FormRequest() is post Request method

import scrapy


class PostSpider(scrapy.Spider):
    """Demo spider that issues a POST request with scrapy.FormRequest.

    Sends the form payload to Baidu's translation ``sug`` endpoint and
    prints the raw response body in ``parse``.
    """
    name = 'post'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://fanyi.baidu.com/sug']

    def start_requests(self):
        # Form payload submitted as the POST body.
        data = {
            'kw': 'dog'
        }
        for url in self.start_urls:
            # scrapy.FormRequest() issues a POST request; formdata is the form body.
            yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse)

    def parse(self, response):
        # Print the raw response text (JSON returned by the sug endpoint).
        print(response.text)


Request to pass parameters

Scrapy requests to pass parameters to the main core

meta ={'item':item}

is a dictionary structure used to store items etc.

Access via the return url of the callback function

import scrapy

from moviePro.items import MovieproItem

class MovieSpider(scrapy.Spider):
    """Crawl a movie listing page, then follow each detail page.

    Demonstrates passing an Item between callbacks through ``Request.meta``:
    ``parse`` fills the name and hands the item to ``parse_detail`` via meta,
    which adds the actor and yields the finished item.
    """
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']

    # Parse the data on the detail page.
    def parse_detail(self, response):
        # response.meta is the meta dict attached to the originating Request.
        item = response.meta['item']
        actor = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[3]/a/text()').extract_first()
        item['actor'] = actor
        yield item

    def parse(self, response):
        li_list = response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]')
        for li in li_list:
            item = MovieproItem()
            name = li.xpath('./div/a/@title').extract_first()
            detail_url = 'https://www.4567tv.tv' + li.xpath('./div/a/@href').extract_first()
            item['name'] = name
            # meta parameter: the meta dict is delivered to the callback's
            # response (response.meta) — this is how the item travels along.
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

        # Here, start_urls with a given url value is called for direct access

Note: The fields stored here must be the same as items. py is created consistently, that is, the fields of items.py are the main ones

items.py

import scrapy



class MovieproItem(scrapy.Item):
    """Item whose fields must match what the spider stores (items.py is authoritative)."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    actor = scrapy.Field()

pipelines.py

import pymysql


class MovieproPipeline(object):
    """Persist scraped movie items into a MySQL table via pymysql."""
    conn = None    # DB connection, opened once per spider run
    cursor = None  # cursor of the most recent insert

    def open_spider(self, spider):
        print("Start crawling")
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password="", db='movie', charset='utf8')

    def process_item(self, item, spider):
        # Insert one item; roll back on failure so the connection stays usable.
        self.cursor = self.conn.cursor()
        try:
            # Parameterized query: scraped names/actors are untrusted input,
            # so never build the SQL by string formatting.
            self.cursor.execute(
                'insert into av values(%s, %s)', (item['name'], item['actor']))
            self.conn.commit()
        except Exception:
            self.conn.rollback()
        # Return the item so any later pipeline stage still receives it.
        return item

    def close_spider(self, spider):
        print('End crawler')
        self.cursor.close()
        self.conn.close()

You can omit –nolog during execution, in setting LOG_LEVEL=”ERROR”

You can also define the write file, configure LOG_FILE = “./log.txt” in the setting

Five core components

Share pictures

Among them, the downloader middleware is the most important; it exposes three key methods:


(1)
def process_request(self, request, spider):
return None
It should return None — like Django middleware, it runs on every request before it is sent out.
When used as a sample, replace the user_agent, request.headers['User-Agent'] = random.choice([ ...])


(2)
def process_response(self, request, response, spider):

Response needs to be returned to spider for data processing
When used as selenium simulation access, if it is accessed by spider, a piece of data needs to generate a bro, so Add it here, just once
it will get the data

return HtmlResponse(url=spider.bro.current_url,body=page_text,encoding='utf-8',request=request) 
Return to spider for data analysis
return response

(3)
def process_exception(self, request , exception, spider):
pass

refers to the use of error reporting
Used as a proxy to request proxy ip

request.meta['proxy'] = random.choice([])



Use
share picture

 1 import random

2
3 class MiddleproDownloaderMiddleware(object):
4 user_agent_list = [
5 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
6 " (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
7 "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
8 " (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
9 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
10 " (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
11 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
12 " (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
13 "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
14 " (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
15 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
16 " (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
17 "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
18 " (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
19 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
20 " (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
21 "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
22 " (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
23 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
24 " (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
25 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
26 " (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
27 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
28 " (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
29 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
30 " (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
31 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
32 " (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
33 "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
34 " (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
35 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
36 " (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
37 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
38 " (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
39 "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
40 " (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
41 ]
42 # Proxy IP that can be selected
43 PROXY_http = [
44 '153.180.102.104:80',
45 '195.208.131.189:56055',
46 ]
47 PROXY_https = [
48 '120.83.49.90:9000',
49 '95.189.112.214:35508',
50 ]
51 #Intercept all requests without exceptions
52 def process_request(self, request, spider):
53
54
55 # UA camouflage using UA pool for request
56 print('this is process_request')
57 request.headers['User-Agent'] = random.choice( self.user_agent_list)
58 print(request.headers['User-Agent' ])
59
60
61 return None
62 #Intercept all responses
63 def process_response(self, request, response, spider):
64
65 return response
66 #An abnormal request was intercepted
67 def process_exception(self, request, exception, spider):
68
69 print('this is process_exception!')
70 if request.url.split(':') [0] == 'http':
71 request.meta['proxy'] = random.choice(self. PROXY_http)
72 else:
73 request.meta['proxy'] = random.choice(self.PROXY_https)

View Code


Use of selenium middleware

Note: To use middleware, you need to open the seal of middleware (p56-58)

DOWNLOADER_MIDDLEWARES = {
'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}

By instantiating the bro object, access the def response(self) of the download middleware after the request is over:

By obtaining Data

return HtmlResponse(url=spider.bro.current_url,body=page_text,encoding='utf-8',request=request) 
Return to spider
import scrapy

from selenium import webdriver


'''
The coding process of using selenium in scrapy:
1. Create a browser object in the spider's constructor (as an attribute of the current spider)
2. Override the spider's closed(self, spider) method and close the browser there
3. In the download middleware's process_response method, obtain the browser object through the spider parameter
4. Put the browser-automation code in the middleware's process_response (fetch the dynamically loaded page source)
5. Instantiate a response object and wrap the page_source page source into it
6. Return the new response object
'''

class WangyiSpider(scrapy.Spider):
    """Spider that renders dynamic pages through a shared selenium browser.

    The browser instance lives on the spider (``self.bro``) so the download
    middleware can reach it via the ``spider`` argument.
    """
    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://war.163.com/']

    def __init__(self):
        # One browser per spider run; closed in closed() below.
        self.bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Desktop\爬虫+数据\day_03_爬虫\chromedriver.exe')

    def parse(self, response):
        div_list = response.xpath('//div[@class="data_row news_article clearfix "]')
        for div in div_list:
            title = div.xpath('.//div[@class="news_title"]/h3/a/text()').extract_first()
            print(title)

    def closed(self, spider):
        # Called when the spider finishes; release the browser.
        print('关闭浏览器对象!')
        self.bro.quit()


 


def process_response(self, request, response, spider):
    """Replace the downloader's response with a selenium-rendered one.

    Must either return a Response object, return a Request object,
    or raise IgnoreRequest.

    NOTE(review): requires ``from time import sleep`` and
    ``from scrapy.http import HtmlResponse`` at the top of middlewares.py.
    """
    print('即将返回一个新的响应对象!!!')
    # Fetch the dynamically loaded data by driving the spider's browser.
    bro = spider.bro
    bro.get(url=request.url)
    sleep(3)
    # page_source now contains the dynamically loaded news data.
    page_text = bro.page_source
    sleep(3)
    # Wrap the rendered source in a new HtmlResponse for the spider to parse.
    return HtmlResponse(url=spider.bro.current_url, body=page_text, encoding='utf-8', request=request)

import scrapy


class PostSpider(scrapy.Spider):
    """Demo spider that issues a POST request with scrapy.FormRequest."""
    name = 'post'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://fanyi.baidu.com/sug']

    def start_requests(self):
        # Form payload submitted as the POST body.
        data = {
            'kw': 'dog'
        }
        for url in self.start_urls:
            # scrapy.FormRequest() issues a POST request.
            yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse)

    def parse(self, response):
        print(response.text)

import scrapy

from moviePro.items import MovieproItem

class MovieSpider(scrapy.Spider):
    """Crawl a movie listing page and pass the Item to the detail callback via meta."""
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']

    # Parse the data on the detail page.
    def parse_detail(self, response):
        # response.meta is the meta dict received from the originating Request.
        item = response.meta['item']
        actor = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[3]/a/text()').extract_first()
        item['actor'] = actor
        yield item

    def parse(self, response):
        li_list = response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]')
        for li in li_list:
            item = MovieproItem()
            name = li.xpath('./div/a/@title').extract_first()
            detail_url = 'https://www.4567tv.tv' + li.xpath('./div/a/@href').extract_first()
            item['name'] = name
            # meta parameter: the meta dict is delivered to the callback's response.
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

        #这里url 值定的start_urls 调用直接访问

import scrapy



class MovieproItem(scrapy.Item):
    """Item whose fields must match what the spider stores."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    actor = scrapy.Field()

import pymysql


class MovieproPipeline(object):
    """Persist scraped movie items into a MySQL table via pymysql."""
    conn = None    # DB connection, opened once per spider run
    cursor = None  # cursor of the most recent insert

    def open_spider(self, spider):
        print("开始爬虫")
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password="", db='movie', charset='utf8')

    def process_item(self, item, spider):
        # Insert one item; roll back on failure so the connection stays usable.
        self.cursor = self.conn.cursor()
        try:
            # Parameterized query: scraped values are untrusted input,
            # so never build the SQL by string formatting.
            self.cursor.execute(
                'insert into av values(%s, %s)', (item['name'], item['actor']))
            self.conn.commit()
        except Exception:
            self.conn.rollback()
        # Return the item so any later pipeline stage still receives it.
        return item

    def close_spider(self, spider):
        print('结束爬虫')
        self.cursor.close()
        self.conn.close()

分享图片

 1 import random

2
3 class MiddleproDownloaderMiddleware(object):
4 user_agent_list = [
5 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
6 "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
7 "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
8 "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
9 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
10 "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
11 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
12 "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
13 "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
14 "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
15 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
16 "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
17 "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
18 "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
19 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
20 "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
21 "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
22 "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
23 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
24 "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
25 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
26 "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
27 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
28 "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
29 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
30 "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
31 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
32 "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
33 "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
34 "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
35 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
36 "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
37 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
38 "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
39 "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
40 "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
41 ]
42 # 可被选用的代理IP
43 PROXY_http = [
44 153.180.102.104:80,
45 195.208.131.189:56055,
46 ]
47 PROXY_https = [
48 120.83.49.90:9000,
49 95.189.112.214:35508,
50 ]
51 #拦截所有未发生异常的请求
52 def process_request(self, request, spider):
53
54
55 #使用UA池进行请求的UA伪装
56 print(this is process_request)
57 request.headers[User-Agent] = random.choice(self.user_agent_list)
58 print(request.headers[User-Agent])
59
60
61 return None
62 #拦截所有的响应
63 def process_response(self, request, response, spider):
64
65 return response
66 #拦截到产生异常的请求
67 def process_exception(self, request, exception, spider):
68
69 print(this is process_exception!)
70 if request.url.split(:)[0] == http:
71 request.meta[proxy] = random.choice(self.PROXY_http)
72 else:
73 request.meta[proxy] = random.choice(self.PROXY_https)

View Code

 1 import random

2
3 class MiddleproDownloaderMiddleware(object):
4 user_agent_list = [
5 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
6 "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
7 "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
8 "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
9 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
10 "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
11 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
12 "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
13 "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
14 "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
15 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
16 "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
17 "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
18 "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
19 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
20 "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
21 "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
22 "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
23 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
24 "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
25 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
26 "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
27 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
28 "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
29 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
30 "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
31 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
32 "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
33 "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
34 "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
35 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
36 "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
37 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
38 "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
39 "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
40 "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
41 ]
42 # 可被选用的代理IP
43 PROXY_http = [
44 153.180.102.104:80,
45 195.208.131.189:56055,
46 ]
47 PROXY_https = [
48 120.83.49.90:9000,
49 95.189.112.214:35508,
50 ]
51 #拦截所有未发生异常的请求
52 def process_request(self, request, spider):
53
54
55 #使用UA池进行请求的UA伪装
56 print(this is process_request)
57 request.headers[User-Agent] = random.choice(self.user_agent_list)
58 print(request.headers[User-Agent])
59
60
61 return None
62 #拦截所有的响应
63 def process_response(self, request, response, spider):
64
65 return response
66 #拦截到产生异常的请求
67 def process_exception(self, request, exception, spider):
68
69 print(this is process_exception!)
70 if request.url.split(:)[0] == http:
71 request.meta[proxy] = random.choice(self.PROXY_http)
72 else:
73 request.meta[proxy] = random.choice(self.PROXY_https)

import scrapy

from selenium import webdriver


'''
The coding process of using selenium in scrapy:
1. Create a browser object in the spider's constructor (as an attribute of the current spider)
2. Override the spider's closed(self, spider) method and close the browser there
3. In the download middleware's process_response method, obtain the browser object through the spider parameter
4. Put the browser-automation code in the middleware's process_response (fetch the dynamically loaded page source)
5. Instantiate a response object and wrap the page_source page source into it
6. Return the new response object
'''

class WangyiSpider(scrapy.Spider):
    """Spider that renders dynamic pages through a shared selenium browser."""
    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://war.163.com/']

    def __init__(self):
        # One browser per spider run; the middleware reaches it via spider.bro.
        self.bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Desktop\爬虫+数据\day_03_爬虫\chromedriver.exe')

    def parse(self, response):
        div_list = response.xpath('//div[@class="data_row news_article clearfix "]')
        for div in div_list:
            title = div.xpath('.//div[@class="news_title"]/h3/a/text()').extract_first()
            print(title)

    def closed(self, spider):
        # Called when the spider finishes; release the browser.
        print('关闭浏览器对象!')
        self.bro.quit()

    def process_response(self, request, response, spider):
        """Replace the downloader's response with a selenium-rendered one.

        Must either return a Response object, return a Request object,
        or raise IgnoreRequest.

        NOTE(review): requires ``from time import sleep`` and
        ``from scrapy.http import HtmlResponse`` at the top of middlewares.py.
        """
        print('即将返回一个新的响应对象!!!')
        # Fetch the dynamically loaded data by driving the spider's browser.
        bro = spider.bro
        bro.get(url=request.url)
        sleep(3)
        # page_source now contains the dynamically loaded news data.
        page_text = bro.page_source
        sleep(3)
        # Wrap the rendered source in a new HtmlResponse for the spider to parse.
        return HtmlResponse(url=spider.bro.current_url, body=page_text, encoding='utf-8', request=request)

Leave a Comment

Your email address will not be published.