Crawler Series: Getting Started with Scrapy Web Crawling

One: Basic workflow

  1. Create a project named firstblood (cmd): scrapy startproject firstblood
  2. Enter the project directory (cmd): cd firstblood
  3. Create a crawler file (cmd): scrapy genspider first www.xxx.com (first is the crawler file name, www.xxx.com is the starting URL)
  4. Open the project in PyCharm, go to the spiders directory, find the first.py crawler file, and write the crawler code. Comment out allowed_domains; a sketch of the generated file is shown just below this list.
  5. Start the crawler (cmd): scrapy crawl first
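For reference, this is roughly what scrapy genspider first www.xxx.com generates in spiders/first.py (a minimal sketch; the exact template varies slightly by Scrapy version):

import scrapy


class FirstSpider(scrapy.Spider):
    name = 'first'
    # allowed_domains = ['www.xxx.com']   # commented out as in step 4
    start_urls = ['http://www.xxx.com/']

    def parse(self, response):
        # response holds the page downloaded from each start URL; parsing logic goes here
        pass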


Two: Spider anti-anti-crawling configuration

  • robots.txt
In settings.py, change to: ROBOTSTXT_OBEY = False

  • UA (User-Agent) spoofing
In the settings.py file, change

USER_AGENT = 'firstblood (+http://www.yourdomain.com)'

to:

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'

Three: Summary of basic commands

  1. scrapy startproject firstblood  # new project
  2. scrapy genspider first www.xxx.com  # new crawler file
  3. scrapy crawl first  # run the crawler and print the log
  4. scrapy crawl first --nolog  # run the crawler without printing the log
  5. scrapy crawl qiubai -o qiushibaike.csv  # store the return value of the parse function in a CSV file (see the note below on -o vs -O)
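One addition that is not in the original post: in newer Scrapy releases (2.0 and later), -o appends to an existing output file, while the uppercase -O overwrites it:

  scrapy crawl qiubai -O qiushibaike.csv  # overwrite the output file instead of appending to it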

Four: Persistent storage

  • Persistent storage based on terminal commands (only the return value of the parse function is persisted locally)

    Command: scrapy crawl qiubai -o qiushibaike.csv

    Limitations: only files with these suffixes can be stored ('json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle')

class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        div_list = response.xpath("//div[@id='content-left']/div")
        res_list = []
        for div in div_list:
            # author = div.xpath('./div[1]/a[2]/h2/text()')[0]
            # xpath in scrapy returns Selector objects.
            # Getting the data out of the Selector object:
            # Method 1: author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            # Method 2: author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            content = div.xpath('./a[1]/div[@class="content"]/span//text()').extract()
            content = "".join(content)

            # print("author......", author)
            # print("content......", content)
            # break
            dic = {
                'author': author,
                'content': content
            }
            res_list.append(dic)
        return res_list

  • Persistent storage based on pipelines (the persistence logic must be written in the pipeline file)

Recommended:

pip install redis==2.10.6

(Presumably pinned because newer redis-py releases no longer accept a dict as an lpush value; see the sketch after the Redis pipeline code below.)

How to encapsulate data into item objects

1. Define the storage fields in the items.py file:

class QiubaiproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()  (name field = scrapy's generic Field)
    # Example:
    author = scrapy.Field()
    content = scrapy.Field()

2. Import the Item class in spiders/qiubai.py:

from qiubaiPro.items import QiubaiproItem

3. Instantiate the Item object:

# instantiate the item object
item = QiubaiproItem()
item['author'] = author
item['content'] = content
# Note: one piece of data is one item object; for each item it receives, the pipeline stores one record

4. Submit the instantiated object to the pipeline. Scrapy delivers it automatically; we only need to write:

yield item  # submit each piece of data once

5. The pipeline storage logic is written in pipelines.py (three storage methods):

class QiubaiproPipeline(object):
    def process_item(self, item, spider):
        print(item)
        return item


import pymysql

class Mysql_PipeLine(object):
    # define the pipeline's conn and cursor at class level
    # (pymysql must be imported)
    conn = None
    cursor = None

    def open_spider(self, spider):
        # the port number is an integer, not a string
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='scrapy')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # print(item)
        try:
            self.cursor.execute('insert into qiubai values ("%s","%s");' % (item['author'], item['content']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        # self.cursor.close()
        self.conn.close()
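Mysql_PipeLine assumes that a scrapy database and a qiubai table with two text columns already exist. The original post does not show the schema, so the following one-off setup script is only a sketch with assumed column types:

# hypothetical setup script; the column types are assumptions, not from the original post
import pymysql

conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123')
cursor = conn.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS scrapy;')
cursor.execute('CREATE TABLE IF NOT EXISTS scrapy.qiubai (author VARCHAR(255), content TEXT);')
conn.commit()
conn.close()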

from redis import Redis

class Redis_PipeLine(object):
    conn = None

    def open_spider(self, spider):
        # connect to the database
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        dic = {
            'author': item['author'],
            'content': item['content']
        }
        self.conn.lpush('qiubai', dic)
        return item  # return the item so any later pipeline still receives it
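If a newer redis-py (3.0+) is installed instead of the pinned 2.10.6, lpush raises an error for dict values, so the record has to be serialized first. A minimal sketch, assuming JSON serialization is acceptable:

import json
from redis import Redis


class Redis_PipeLine(object):
    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        dic = {'author': item['author'], 'content': item['content']}
        # redis-py 3.x only accepts bytes/str/numbers as values, so store the dict as a JSON string
        self.conn.lpush('qiubai', json.dumps(dic, ensure_ascii=False))
        return item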

6. Enable ITEM_PIPELINES in the settings file:

# multiple pipelines (i.e. multiple storage methods) may be registered
ITEM_PIPELINES = {
    'qiubaiPro.pipelines.QiubaiproPipeline': 300,
    # 'pipeline path.pipeline name': priority (lower values run first)
}

ITEM_PIPELINES = {
    'qiubaiPro.pipelines.QiubaiproPipeline': 300,
    # newly added pipelines
    'qiubaiPro.pipelines.Mysql_PipeLine': 301,
    'qiubaiPro.pipelines.Redis_PipeLine': 302,
}

Five: A simple example

  • New crawler file qiubai.py
# -*- coding: utf-8 -*-
import scrapy
from qiubaiPro.items import QiubaiproItem


'''
1. Persistent storage based on terminal commands (only the return value of the parse function is persisted locally)
   Command: scrapy crawl qiubai -o qiushibaike.csv

   Limitations: only files with these suffixes can be stored ('json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle')
'''
# Persistent storage based on terminal commands (only the return value of the parse function is persisted locally)
class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        div_list = response.xpath("//div[@id='content-left']/div")
        res_list = []
        for div in div_list:
            author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            content = div.xpath('./a[1]/div[@class="content"]/span//text()').extract()
            content = "".join(content)

            dic = {
                'author': author,
                'content': content
            }
            res_list.append(dic)
        return res_list


# Persistent storage based on pipelines (the persistence logic must be written in the pipeline file)
class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        div_list = response.xpath("//div[@id='content-left']/div")
        for div in div_list:
            try:
                author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            except Exception as e:
                # print(e)
                author = div.xpath('./div[1]/span[2]/h2/text()')[0].extract()

            content = div.xpath('./a[1]/div[@class="content"]/span//text()').extract()
            content = "".join(content)

            # instantiate the item object
            item = QiubaiproItem()
            item['author'] = author
            item['content'] = content
            # print(item['author'])
            # submit to the pipeline
            yield item

  • items.py
import scrapy


class QiubaiproItem(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()

  • pipelines.py
# -*- coding: utf-8 -*-
import pymysql
from redis import Redis


# One class corresponds to one storage method

# Save to the file qiubai.txt
class QiubaiproPipeline(object):
    fp = None  # file handle

    # open_spider overrides the parent class method; it is executed only once during the crawl
    def open_spider(self, spider):
        self.fp = open('qiubai.txt', 'w', encoding='utf-8')

    # process_item is executed many times, so opening and closing the file should not happen
    # inside this function, otherwise execution would be far too slow
    def process_item(self, item, spider):
        # print(item)
        self.fp.write(item['author'] + ':' + item['content'])
        return item

    # close_spider overrides the parent class method; it is executed only once during the crawl
    def close_spider(self, spider):
        self.fp.close()


# Save to a MySQL database
# (also add this pipeline's path in settings)
class Mysql_PipeLine(object):
    # define the pipeline's conn and cursor at class level
    # (pymysql must be imported)
    conn = None
    cursor = None

    def open_spider(self, spider):
        # the port number is an integer, not a string
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='scrapy')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # print(item)
        try:
            self.cursor.execute('insert into qiubai values ("%s","%s");' % (item['author'], item['content']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        # self.cursor.close()
        self.conn.close()


class Redis_PipeLine(object):
    conn = None

    def open_spider(self, spider):
        # connect to the database
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        dic = {
            'author': item['author'],
            'content': item['content']
        }
        self.conn.lpush('qiubai', dic)
        return item  # return the item so any later pipeline still receives it

Six: How xpath differs in Scrapy

  • The value returned by an xpath expression in Scrapy is not the tag's data itself but a list of Selector objects
author = div.xpath('./div[1]/a[2]/h2/text()')[0]

  • Getting the data out of a Selector object
Method 1: author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()

Method 2: author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()

author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()  # xpath returns a list, hence the [0]
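As a side note not in the original post, newer Scrapy versions also provide get() and getall() as the preferred aliases of extract_first() and extract(). A small self-contained sketch:

from scrapy.selector import Selector

sel = Selector(text='<div><h2>author-name</h2></div>')
print(sel.xpath('//h2/text()').extract_first())  # 'author-name'
print(sel.xpath('//h2/text()').get())            # 'author-name' (newer alias of extract_first)
print(sel.xpath('//h2/text()').getall())         # ['author-name'] (newer alias of extract)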

Seven: Log handling

As listed in the command summary above, scrapy crawl first --nolog runs a crawler without printing the log.
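Alternatively (an addition, not from the original post), the log can be kept but limited to errors with the LOG_LEVEL setting:

# settings.py: show only error-level log output instead of suppressing the log entirely
LOG_LEVEL = 'ERROR'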
