One: Create a project and a spider
Create a project named firstblood (cmd): scrapy startproject firstblood
Enter the project directory (cmd): cd firstblood
Create a crawler file (cmd): scrapy genspider first www.xxx.com (first is the crawler file name, www.xxx.com is the starting url)
Open the project in PyCharm, go into the spiders directory, find the first.py crawler file, and write the crawler code there (a sketch of the generated file follows). Comment out allowed_domains.
Start the crawler (cmd): scrapy crawl first
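For reference, a minimal sketch of what scrapy genspider puts into spiders/first.py (the exact template text varies slightly between Scrapy versions); allowed_domains is the line to comment out so that responses from other domains are not filtered:

import scrapy

class FirstSpider(scrapy.Spider):
    name = 'first'
    # allowed_domains = ['www.xxx.com']   # commented out, as described in the step above
    start_urls = ['https://www.xxx.com/']

    def parse(self, response):
        # parsing logic goes here
        pass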
Two: Spider anti-anti-crawling configuration
robots.txt
In settings.py, change to: ROBOTSTXT_OBEY = False
UA spoofing
In the settings.py file, change
USER_AGENT = 'firstblood (+http://www.yourdomain.com)' to:
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
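Taken together, the relevant lines in settings.py after both changes look roughly like this (the Chrome UA string above is just one example of a real browser User-Agent):

# settings.py -- ignore robots.txt and present a browser User-Agent
ROBOTSTXT_OBEY = False
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/72.0.3626.119 Safari/537.36')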
Three: Summary of basic commands
scrapy startproject firstblood            # create a new project
scrapy genspider first www.xxx.com        # create a new crawler file
scrapy crawl first                        # run the crawler and print the log
scrapy crawl first --nolog                # run the crawler without printing the log
scrapy crawl qiubai -o qiushibaike.csv    # store the return value of the parse function in a csv file
Four: Storage
Persistent storage based on terminal commands (only the return value of the parse function is persisted locally)
Command: scrapy crawl qiubai -o qiushibaike.csv
Limitation: only files with these suffixes can be written ('json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle')
class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # xpath in scrapy returns Selector objects; extract() pulls the data out of them
        div_list = response.xpath("//div[@id='content-left']/div")
        res_list = []
        for div in div_list:
            author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            res_list.append({'author': author})
        return res_list   # this returned list is what -o writes to the file
Persistent storage based on pipeline operations (the persistence logic must be written in the pipelines file)
Recommended:
pip install redis==2.10.6
How to encapsulate data into item objects
1. Define the storage fields in the items.py file
import scrapy

class QiubaiproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()   (scrapy.Field() acts as a universal field type)
    author = scrapy.Field()
    content = scrapy.Field()
2. Import the item class defined above in spiders/qiubai.py:
from qiubaiPro.items import QiubaiproItem
3. Instantiate the item object and fill in its fields
# instantiate the item object
item = QiubaiproItem()
item['author'] = author
item['content'] = content
# Note: each piece of data is one item object; the pipeline receives one item at a time and stores one record
4. Submit the item to the pipeline. Scrapy handles the delivery automatically; we only need to write:
yield item   # submit each piece of data once
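Putting steps 2-4 together, the parse function for pipeline-based storage looks roughly like the sketch below. The author xpath is the one used in these notes; the content xpath is only an illustrative placeholder and has to be adapted to the actual page structure:

import scrapy
from qiubaiPro.items import QiubaiproItem

class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        div_list = response.xpath("//div[@id='content-left']/div")
        for div in div_list:
            author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            # placeholder xpath for the post body; adjust it to the real page
            content = ''.join(div.xpath('./a[1]/div/span//text()').extract())
            item = QiubaiproItem()
            item['author'] = author
            item['content'] = content
            yield item   # one item per record, handed to the pipelines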
5. The storage logic is written in the pipelines.py file (three storage methods)
class QiubaiproPipeline(object):
    def process_item(self, item, spider):
        print(item)
        return item
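For process_item to be called at all, the pipeline class must also be enabled in settings.py. The module path below assumes the project is named qiubaiPro, matching the import in step 2; the number is the execution priority (lower runs earlier):

# settings.py
ITEM_PIPELINES = {
    'qiubaiPro.pipelines.QiubaiproPipeline': 300,
}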
# Persistent storage based on pipeline operations: the spider (spiders/qiubai.py)
class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        div_list = response.xpath("//div[@id='content-left']/div")
        for div in div_list:
            try:
                author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            except Exception as e:
                # print(e)
                # fallback xpath for entries where the first one does not match
                author = div.xpath('./div[1]/span[2]/h2/text()')[0].extract()
            # ... then build the item and yield it, as in steps 3-4 above
# pipelines.py: one class corresponds to one storage method
# Save to a file: qiubai.txt
class QiubaiproPipeline(object):
    fp = None   # file handle

    # open_spider overrides the parent-class hook; it runs only once, when the crawl starts
    def open_spider(self, spider):
        self.fp = open('qiubai.txt', 'w', encoding='utf-8')

    # process_item runs once per item, so opening/closing the file here would be far too slow
    def process_item(self, item, spider):
        # print(item)
        self.fp.write(item['author'] + ':' + item['content'])
        return item

    # close_spider also runs only once, when the crawl ends
    def close_spider(self, spider):
        self.fp.close()
# Save to a MySQL database (remember to add this pipeline's path to ITEM_PIPELINES in settings as well)
import pymysql

class Mysql_PipeLine(object):
    # class-level conn and cursor
    conn = None
    cursor = None

    def open_spider(self, spider):
        # note: the port is a number, not a string
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password='123', db='scrapy')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # print(item)
        try:
            self.cursor.execute('insert into qiubai values ("%s","%s");'
                                % (item['author'], item['content']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item
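The notes mention three storage methods and pip install redis==2.10.6, but only the file and MySQL pipelines appear above. A minimal sketch of a third, Redis-based pipeline (assuming a Redis server on localhost; the class name Redis_PipeLine and the key name 'qiubai' are just illustrative), together with registering all three classes in settings.py:

import json
import redis

class Redis_PipeLine(object):
    conn = None

    def open_spider(self, spider):
        self.conn = redis.Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        # push each record onto a Redis list as a JSON string
        self.conn.lpush('qiubai', json.dumps(dict(item)))
        return item

# settings.py -- every item flows through all registered pipelines, lower numbers first
ITEM_PIPELINES = {
    'qiubaiPro.pipelines.QiubaiproPipeline': 300,
    'qiubaiPro.pipelines.Mysql_PipeLine': 301,
    'qiubaiPro.pipelines.Redis_PipeLine': 302,
}

Note that each process_item returns the item, which is what hands it on to the next pipeline in the chain.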
The data returned by an xpath expression in scrapy is not a raw tag but a Selector object:
author = div.xpath('./div[1]/a[2]/h2/text()')[0]

Getting the data out of the Selector object:
Method 1: author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
Method 2: author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
author = div.xpath('./div[1]/a[2]/h2/text()').extract()   # without [0], the return value is a list of strings
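The practical difference between the two methods shows up when the xpath matches nothing. A small self-contained illustration (the HTML snippet is made up for the example):

from scrapy.selector import Selector

sel = Selector(text='<div><h2>author-name</h2></div>')
sel.xpath('//h2/text()').extract()           # ['author-name']  -- a list of strings
sel.xpath('//h2/text()').extract_first()     # 'author-name'    -- just the first string
sel.xpath('//span/text()').extract_first()   # None             -- no match, no exception
# sel.xpath('//span/text()')[0].extract()    # IndexError       -- why the spider above wraps this in try/except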
Seven: Log handling
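Besides the --nolog flag shown in the command summary, log output is usually controlled from settings.py. A minimal sketch using standard Scrapy settings (the log file name is arbitrary):

# settings.py
LOG_LEVEL = 'ERROR'          # only print errors instead of the full DEBUG/INFO output
# LOG_FILE = './scrapy.log'  # optionally write the log to a file instead of the console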