Crawler Series: Getting Started with Scrapy Web Crawling

One: Basic workflow

  1. Create a project named firstblood (cmd): scrapy startproject firstblood
  2. Enter the project directory (cmd): cd firstblood
  3. Create a crawler file (cmd): scrapy genspider first www.xxx.com (first is the crawler file name, www.xxx.com is the starting URL)
  4. Open the project in PyCharm, go to the spiders directory, find the first.py crawler file, and write the crawler code. Comment out allowed_domains; a sketch of the generated file is shown just below this list.
  5. Start the crawler (cmd): scrapy crawl first
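For reference, this is roughly what scrapy genspider first www.xxx.com generates in spiders/first.py (a minimal sketch; the exact template varies slightly by Scrapy version):

import scrapy


class FirstSpider(scrapy.Spider):
    name = 'first'
    # allowed_domains = ['www.xxx.com']   # commented out as in step 4
    start_urls = ['http://www.xxx.com/']

    def parse(self, response):
        # response holds the page downloaded from each start URL; parsing logic goes here
        pass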


Two: Spider anti-anti-crawling configuration

  • robots.txt
In settings.py, change to: ROBOTSTXT_OBEY = False

  • UA (User-Agent) spoofing
In the settings.py file, change

USER_AGENT = 'firstblood (+http://www.yourdomain.com)'

to:

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'

Three: Summary of basic commands

  1. scrapy startproject firstblood  # new project
  2. scrapy genspider first www.xxx.com  # new crawler file
  3. scrapy crawl first  # run the crawler and print the log
  4. scrapy crawl first --nolog  # run the crawler without printing the log
  5. scrapy crawl qiubai -o qiushibaike.csv  # store the return value of the parse function in a CSV file (see the note below on -o vs -O)
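One addition that is not in the original post: in newer Scrapy releases (2.0 and later), -o appends to an existing output file, while the uppercase -O overwrites it:

  scrapy crawl qiubai -O qiushibaike.csv  # overwrite the output file instead of appending to it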

Four: Persistent storage

  • Persistent storage based on terminal commands (only the return value of the parse function is persisted locally)

    Command: scrapy crawl qiubai -o qiushibaike.csv

    Limitations: only files with these suffixes can be stored ('json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle')

class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        div_list = response.xpath("//div[@id='content-left']/div")
        res_list = []
        for div in div_list:
            # author = div.xpath('./div[1]/a[2]/h2/text()')[0]
            # xpath in scrapy returns Selector objects.
            # Getting the data out of the Selector object:
            # Method 1: author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            # Method 2: author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            content = div.xpath('./a[1]/div[@class="content"]/span//text()').extract()
            content = "".join(content)

            # print("author......", author)
            # print("content......", content)
            # break
            dic = {
                'author': author,
                'content': content
            }
            res_list.append(dic)
        return res_list

  • Persistent storage based on pipelines (the persistence logic must be written in the pipeline file)

Recommended:

pip install redis==2.10.6

(Presumably pinned because newer redis-py releases no longer accept a dict as an lpush value; see the sketch after the Redis pipeline code below.)

How to encapsulate data into item objects

1. Define the storage fields in the items.py file:

class QiubaiproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()  (name field = scrapy's generic Field)
    # Example:
    author = scrapy.Field()
    content = scrapy.Field()

2. Import the Item class in spiders/qiubai.py:

from qiubaiPro.items import QiubaiproItem

3. Instantiate the Item object:

# instantiate the item object
item = QiubaiproItem()
item['author'] = author
item['content'] = content
# Note: one piece of data is one item object; for each item it receives, the pipeline stores one record

4. Submit the instantiated object to the pipeline. Scrapy delivers it automatically; we only need to write:

yield item  # submit each piece of data once

5. The pipeline storage logic is written in pipelines.py (three storage methods):

class QiubaiproPipeline(object):
    def process_item(self, item, spider):
        print(item)
        return item


import pymysql

class Mysql_PipeLine(object):
    # define the pipeline's conn and cursor at class level
    # (pymysql must be imported)
    conn = None
    cursor = None

    def open_spider(self, spider):
        # the port number is an integer, not a string
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='scrapy')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # print(item)
        try:
            self.cursor.execute('insert into qiubai values ("%s","%s");' % (item['author'], item['content']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        # self.cursor.close()
        self.conn.close()
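Mysql_PipeLine assumes that a scrapy database and a qiubai table with two text columns already exist. The original post does not show the schema, so the following one-off setup script is only a sketch with assumed column types:

# hypothetical setup script; the column types are assumptions, not from the original post
import pymysql

conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123')
cursor = conn.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS scrapy;')
cursor.execute('CREATE TABLE IF NOT EXISTS scrapy.qiubai (author VARCHAR(255), content TEXT);')
conn.commit()
conn.close()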

from redis import Redis

class Redis_PipeLine(object):
    conn = None

    def open_spider(self, spider):
        # connect to the database
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        dic = {
            'author': item['author'],
            'content': item['content']
        }
        self.conn.lpush('qiubai', dic)
        return item  # return the item so any later pipeline still receives it
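If a newer redis-py (3.0+) is installed instead of the pinned 2.10.6, lpush raises an error for dict values, so the record has to be serialized first. A minimal sketch, assuming JSON serialization is acceptable:

import json
from redis import Redis


class Redis_PipeLine(object):
    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        dic = {'author': item['author'], 'content': item['content']}
        # redis-py 3.x only accepts bytes/str/numbers as values, so store the dict as a JSON string
        self.conn.lpush('qiubai', json.dumps(dic, ensure_ascii=False))
        return item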

6. Enable ITEM_PIPELINES in the settings file:

# multiple pipelines (i.e. multiple storage methods) may be registered
ITEM_PIPELINES = {
    'qiubaiPro.pipelines.QiubaiproPipeline': 300,
    # 'pipeline path.pipeline name': priority (lower values run first)
}

ITEM_PIPELINES = {
    'qiubaiPro.pipelines.QiubaiproPipeline': 300,
    # newly added pipelines
    'qiubaiPro.pipelines.Mysql_PipeLine': 301,
    'qiubaiPro.pipelines.Redis_PipeLine': 302,
}

Five: A simple example

  • New crawler file qiubai.py
# -*- coding: utf-8 -*-
import scrapy
from qiubaiPro.items import QiubaiproItem


'''
1. Persistent storage based on terminal commands (only the return value of the parse function is persisted locally)
   Command: scrapy crawl qiubai -o qiushibaike.csv

   Limitations: only files with these suffixes can be stored ('json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle')
'''
# Persistent storage based on terminal commands (only the return value of the parse function is persisted locally)
class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        div_list = response.xpath("//div[@id='content-left']/div")
        res_list = []
        for div in div_list:
            author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            content = div.xpath('./a[1]/div[@class="content"]/span//text()').extract()
            content = "".join(content)

            dic = {
                'author': author,
                'content': content
            }
            res_list.append(dic)
        return res_list


# Persistent storage based on pipelines (the persistence logic must be written in the pipeline file)
class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        div_list = response.xpath("//div[@id='content-left']/div")
        for div in div_list:
            try:
                author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            except Exception as e:
                # print(e)
                author = div.xpath('./div[1]/span[2]/h2/text()')[0].extract()

            content = div.xpath('./a[1]/div[@class="content"]/span//text()').extract()
            content = "".join(content)

            # instantiate the item object
            item = QiubaiproItem()
            item['author'] = author
            item['content'] = content
            # print(item['author'])
            # submit to the pipeline
            yield item

  • items.py
import scrapy


class QiubaiproItem(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()

  • pipelines.py
# -*- coding: utf-8 -*-
import pymysql
from redis import Redis


# One class corresponds to one storage method

# Save to the file qiubai.txt
class QiubaiproPipeline(object):
    fp = None  # file handle

    # open_spider overrides the parent class method; it is executed only once during the crawl
    def open_spider(self, spider):
        self.fp = open('qiubai.txt', 'w', encoding='utf-8')

    # process_item is executed many times, so opening and closing the file should not happen
    # inside this function, otherwise execution would be far too slow
    def process_item(self, item, spider):
        # print(item)
        self.fp.write(item['author'] + ':' + item['content'])
        return item

    # close_spider overrides the parent class method; it is executed only once during the crawl
    def close_spider(self, spider):
        self.fp.close()


# Save to a MySQL database
# (also add this pipeline's path in settings)
class Mysql_PipeLine(object):
    # define the pipeline's conn and cursor at class level
    # (pymysql must be imported)
    conn = None
    cursor = None

    def open_spider(self, spider):
        # the port number is an integer, not a string
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='scrapy')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # print(item)
        try:
            self.cursor.execute('insert into qiubai values ("%s","%s");' % (item['author'], item['content']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        # self.cursor.close()
        self.conn.close()


class Redis_PipeLine(object):
    conn = None

    def open_spider(self, spider):
        # connect to the database
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        dic = {
            'author': item['author'],
            'content': item['content']
        }
        self.conn.lpush('qiubai', dic)
        return item  # return the item so any later pipeline still receives it

Six: How xpath differs in Scrapy

  • The value returned by an xpath expression in Scrapy is not the tag's data itself but a list of Selector objects
author = div.xpath('./div[1]/a[2]/h2/text()')[0]

  • Getting the data out of a Selector object
Method 1: author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()

Method 2: author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()

author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()  # xpath returns a list, hence the [0]
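As a side note not in the original post, newer Scrapy versions also provide get() and getall() as the preferred aliases of extract_first() and extract(). A small self-contained sketch:

from scrapy.selector import Selector

sel = Selector(text='<div><h2>author-name</h2></div>')
print(sel.xpath('//h2/text()').extract_first())  # 'author-name'
print(sel.xpath('//h2/text()').get())            # 'author-name' (newer alias of extract_first)
print(sel.xpath('//h2/text()').getall())         # ['author-name'] (newer alias of extract)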

Seven: Log handling

As listed in the command summary above, scrapy crawl first --nolog runs a crawler without printing the log.
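Alternatively (an addition, not from the original post), the log can be kept but limited to errors with the LOG_LEVEL setting:

# settings.py: show only error-level log output instead of suppressing the log entirely
LOG_LEVEL = 'ERROR'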
