Chinese-language documentation for the Requests library: http://requests.kennethreitz.org/zh_CN/latest/
Installation
pip install requests
Get webpage
# coding=utf-8
import requests

# Fetch the page (URL fixed: the pasted text had a stray space in the host name).
response = requests.get('http://www.baidu.com')

# First way to read the response body: as decoded text.
# Inspect the encoding that requests guessed from the response headers.
print(response.encoding)
# Override the encoding so that .text decodes correctly.
response.encoding = 'utf-8'
# Decoded response body.
print(response.text)

# Second way to read the response body: as raw bytes.
print(response.content)
# decode('codec') turns the bytes into a str; the codec defaults to utf-8.
print(response.content.decode())
# coding=utf-8
import requests

# Fetch the page (URL fixed: the pasted text had a stray space in the host name).
response = requests.get('http://www.baidu.com')

# First way to read the response body: as decoded text.
# Inspect the encoding that requests guessed from the response headers.
print(response.encoding)
# Override the encoding so that .text decodes correctly.
response.encoding = 'utf-8'
# Decoded response body.
print(response.text)

# Second way to read the response body: as raw bytes.
print(response.content)
# decode('codec') turns the bytes into a str; the codec defaults to utf-8.
print(response.content.decode())
Save the picture
import requests

# Download the image (URL fixed: the pasted text had a stray space in the host name).
response = requests.get('http://requests.kennethreitz.org/zh_CN/latest/_static/requests-sidebar.png')
# Save the image; open in 'wb' because the body is binary data.
# (Indentation of the write call fixed — it must be inside the `with` block.)
with open('a.png', 'wb') as f:
    f.write(response.content)
Get the status code and determine whether the request is successful
import requests

# Fetch the page (URL fixed: the pasted text had a stray space in the host name).
r = requests.get('http://www.baidu.com')
# Status code. Note: 200 does not always mean success — after a redirect you
# may have received a different page than the one you requested.
print(r.status_code)
# Assert-style success check: silent on success, AssertionError on failure.
# NOTE(review): asserts are stripped under `python -O`; prefer
# r.raise_for_status() in production code.
assert r.status_code == 200
# Response headers sent by the server.
print(r.headers)
# Request headers that were actually sent.
print(r.request.headers)
# URL that was requested.
print(r.request.url)
# Final URL of the response (after any redirects).
print(r.url)
Disguise the crawler as a browser by sending a User-Agent request header
import requests

# Simulated browser request headers so the server treats us like a real browser.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
# Fetch the page (URL fixed: removed HTML extraction residue from the string literal).
r = requests.get('http://www.baidu.com', headers=headers)
# Decoded response body.
print(r.text)
Crawl the post bar content
import requests
class WebSpider():
    """Crawl Baidu Tieba ("post bar") result pages for a forum name and save each page as HTML.

    All string literals below had HTML extraction residue (`< span ...>` fragments)
    removed, and the tieba URL had a stray space in the host path deleted.
    """

    def __init__(self, name):
        # Browser-like User-Agent so the server does not reject the crawler.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
        # URL template; `pn` is the post offset (50 posts per result page).
        self.url_temp = "http://tieba.baidu.com/f?kw=" + name + "&ie=utf-8&pn={}"
        self.name = name

    # Build the list of page URLs (1000 pages, offset stepping by 50).
    def get_url_list(self):
        return [self.url_temp.format(i * 50) for i in range(1000)]

    # Fetch one page and return its HTML decoded as utf-8.
    def parse_url(self, url):
        print(url)
        r = requests.get(url, headers=self.headers)
        return r.content.decode()

    def run(self):
        # Build the address list.
        urls = self.get_url_list()
        # Crawl each page and save it to its own file.
        # (enumerate replaces the original `urls.index(url) + 1`, which was an
        # accidental O(n) lookup per iteration — O(n^2) overall.)
        for page_num, url in enumerate(urls, start=1):
            html_str = self.parse_url(url)
            file_name = self.name + "page {}.html".format(page_num)
            with open(file_name, "w", encoding="utf-8") as f:
                f.write(html_str)


if __name__ == '__main__':
    r = WebSpider(input("Please enter the name of the post: "))
    r.run()
pip install requests
# coding=utf-8
import requests

# Fetch the page (URL fixed: the pasted text had a stray space in the host name).
response = requests.get('http://www.baidu.com')

# First way to read the response body: as decoded text.
# Inspect the encoding that requests guessed from the response headers.
print(response.encoding)
# Override the encoding so that .text decodes correctly.
response.encoding = 'utf-8'
# Decoded response body.
print(response.text)

# Second way to read the response body: as raw bytes.
print(response.content)
# decode('codec') turns the bytes into a str; the codec defaults to utf-8.
print(response.content.decode())
# coding=utf-8
import requests

# Fetch the page (URL fixed: the pasted text had a stray space in the host name).
response = requests.get('http://www.baidu.com')

# First way to read the response body: as decoded text.
# Inspect the encoding that requests guessed from the response headers.
print(response.encoding)
# Override the encoding so that .text decodes correctly.
response.encoding = 'utf-8'
# Decoded response body.
print(response.text)

# Second way to read the response body: as raw bytes.
print(response.content)
# decode('codec') turns the bytes into a str; the codec defaults to utf-8.
print(response.content.decode())
import requests

# Download the image (URL fixed: the pasted text had a stray space in the host name).
response = requests.get('http://requests.kennethreitz.org/zh_CN/latest/_static/requests-sidebar.png')
# Save the image; open in 'wb' because the body is binary data.
with open('a.png', 'wb') as f:
    f.write(response.content)
import requests

# Fetch the page (URL fixed: the pasted text had a stray space in the host name).
r = requests.get('http://www.baidu.com')
# Status code. Note: 200 does not always mean success — after a redirect you
# may have received a different page than the one you requested.
print(r.status_code)
# Assert-style success check: silent on success, AssertionError on failure.
# NOTE(review): asserts are stripped under `python -O`; prefer
# r.raise_for_status() in production code.
assert r.status_code == 200
# Response headers sent by the server.
print(r.headers)
# Request headers that were actually sent.
print(r.request.headers)
# URL that was requested.
print(r.request.url)
# Final URL of the response (after any redirects).
print(r.url)
import requests

# Simulated browser request headers so the server treats us like a real browser.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
# Fetch the page (URL fixed: removed HTML extraction residue from the string literal).
r = requests.get('http://www.baidu.com', headers=headers)
# Decoded response body.
print(r.text)
import requests
class WebSpider():
    """Crawl Baidu Tieba ("post bar") result pages for a forum name and save each page as HTML.

    All string literals below had HTML extraction residue (`< span ...>` fragments)
    removed, and the tieba URL had a stray space in the host path deleted.
    """

    def __init__(self, name):
        # Browser-like User-Agent so the server does not reject the crawler.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
        # URL template; `pn` is the post offset (50 posts per result page).
        self.url_temp = "http://tieba.baidu.com/f?kw=" + name + "&ie=utf-8&pn={}"
        self.name = name

    # Build the list of page URLs (1000 pages, offset stepping by 50).
    def get_url_list(self):
        return [self.url_temp.format(i * 50) for i in range(1000)]

    # Fetch one page and return its HTML decoded as utf-8.
    def parse_url(self, url):
        print(url)
        r = requests.get(url, headers=self.headers)
        return r.content.decode()

    def run(self):
        # Build the address list.
        urls = self.get_url_list()
        # Crawl each page and save it to its own file.
        # (enumerate replaces the original `urls.index(url) + 1`, which was an
        # accidental O(n) lookup per iteration — O(n^2) overall.)
        for page_num, url in enumerate(urls, start=1):
            html_str = self.parse_url(url)
            file_name = self.name + "page {}.html".format(page_num)
            with open(file_name, "w", encoding="utf-8") as f:
                f.write(html_str)


if __name__ == '__main__':
    r = WebSpider(input("Please enter the name of the post: "))
    r.run()