High performance
How to run multiple tasks at the same time with high efficiency
Serial implementation
The least efficient approach, and therefore the least desirable
import requests
# Pages to fetch. Every entry must be a bare URL with no trailing markup.
urls = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
    'https://www.cnblogs.com/news/',
    'https://cn.bing.com/',
    'https://stackoverflow.com/',
]
# Fetch the pages one after another: each requests.get call returns before
# the next one is issued.
# NOTE(review): no timeout is passed to requests.get; consider adding one.
for url in urls:
    response = requests.get(url)
    print(response)
Multithreading
Multithreading has the problem of low thread utilization
import requests
import threading
# Pages to fetch. Every entry must be a bare URL with no trailing markup.
urls = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
    'https://www.cnblogs.com/news/',
    'https://cn.bing.com/',
    'https://stackoverflow.com/',
]
def task(url):
    """Fetch ``url`` with ``requests.get`` and print the response object.

    Used as the thread target, so nothing is returned.
    NOTE(review): no timeout is passed to requests.get; consider adding one.
    """
    response = requests.get(url)
    print(response)
# Start one thread per URL. The threads are started but not joined here.
for url in urls:
    t = threading.Thread(target=task, args=(url,))
    t.start()
Coroutines + I/O switching
gevent internally uses greenlet (which implements coroutines)
Because it is based on coroutines, it uses fewer resources than threads.
from gevent import monkey; monkey.patch_all()
import gevent
import requests
def func(url):
    """Fetch ``url`` with ``requests.get`` and print the response object.

    Passed to ``gevent.spawn``, so nothing is returned.
    NOTE(review): no timeout is passed to requests.get; consider adding one.
    """
    response = requests.get(url)
    print(response)
# Pages to fetch. Every entry must be a bare URL with no trailing markup.
urls = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
    'https://www.cnblogs.com/news/',
    'https://cn.bing.com/',
    'https://stackoverflow.com/',
]
# Spawn one coroutine (greenlet) per URL, then wait for all of them.
spawn_list = []
for url in urls:
    spawn_list.append(gevent.spawn(func, url))  # Create a coroutine
gevent.joinall(spawn_list)  # Block until every spawned coroutine is done
Event loop
Twisted: an asynchronous, non-blocking module based on an event loop
from twisted.web.client import getPage, defer
from twisted.internet import reactor
def stop_loop(arg):
    """Stop the Twisted reactor.

    ``arg`` is whatever the Deferred passes to its callback; it is ignored.
    """
    reactor.stop()
def get_response(contents):
    """Print ``contents``, the value handed over by the request's Deferred."""
    print(contents)
# Collects the Deferred created for each request.
deferred_list = []
# Pages to fetch. Every entry must be a bare URL with no trailing markup.
url_list = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
    'https://www.cnblogs.com/news/',
    'https://cn.bing.com/',
    'https://stackoverflow.com/',
]
for url in url_list:
    # Create the crawl task for this URL. Per the original note, the task is
    # only registered here; the crawl itself has not been carried out yet.
    # NOTE(review): getPage is deprecated in newer Twisted releases --
    # confirm against the installed version.
    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(get_response)  # Callback that receives the result
    deferred_list.append(deferred)  # Add every task to one list
# Fires once every Deferred in the list has a result.
dlist = defer.DeferredList(deferred_list)
# Stop the event loop once all tasks are finished, on success or failure.
dlist.addBoth(stop_loop)
reactor.run()
import requests
# Pages to fetch. Every entry must be a bare URL with no trailing markup.
urls = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
    'https://www.cnblogs.com/news/',
    'https://cn.bing.com/',
    'https://stackoverflow.com/',
]
# Fetch the pages one after another: each requests.get call returns before
# the next one is issued.
# NOTE(review): no timeout is passed to requests.get; consider adding one.
for url in urls:
    response = requests.get(url)
    print(response)
import requests
import threading
# Pages to fetch. Every entry must be a bare URL with no trailing markup.
urls = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
    'https://www.cnblogs.com/news/',
    'https://cn.bing.com/',
    'https://stackoverflow.com/',
]
def task(url):
    """Fetch ``url`` with ``requests.get`` and print the response object.

    Used as the thread target, so nothing is returned.
    NOTE(review): no timeout is passed to requests.get; consider adding one.
    """
    response = requests.get(url)
    print(response)
# Start one thread per URL. The threads are started but not joined here.
for url in urls:
    t = threading.Thread(target=task, args=(url,))
    t.start()
from gevent import monkey; monkey.patch_all()
import gevent
import requests
def func(url):
    """Fetch ``url`` with ``requests.get`` and print the response object.

    Passed to ``gevent.spawn``, so nothing is returned.
    NOTE(review): no timeout is passed to requests.get; consider adding one.
    """
    response = requests.get(url)
    print(response)
# Pages to fetch. Every entry must be a bare URL with no trailing markup.
urls = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
    'https://www.cnblogs.com/news/',
    'https://cn.bing.com/',
    'https://stackoverflow.com/',
]
# Spawn one coroutine (greenlet) per URL, then wait for all of them.
spawn_list = []
for url in urls:
    spawn_list.append(gevent.spawn(func, url))  # Create a coroutine
gevent.joinall(spawn_list)  # Block until every spawned coroutine is done
from twisted.web.client import getPage, defer
from twisted.internet import reactor
def stop_loop(arg):
    """Stop the Twisted reactor.

    ``arg`` is whatever the Deferred passes to its callback; it is ignored.
    """
    reactor.stop()
def get_response(contents):
    """Print ``contents``, the value handed over by the request's Deferred."""
    print(contents)
# Collects the Deferred created for each request.
deferred_list = []
# Pages to fetch. Every entry must be a bare URL with no trailing markup.
url_list = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
    'https://www.cnblogs.com/news/',
    'https://cn.bing.com/',
    'https://stackoverflow.com/',
]
for url in url_list:
    # Create the crawl task for this URL. Per the original note, the task is
    # only registered here; the crawl itself has not been carried out yet.
    # NOTE(review): getPage is deprecated in newer Twisted releases --
    # confirm against the installed version.
    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(get_response)  # Callback that receives the result
    deferred_list.append(deferred)  # Add every task to one list
# Fires once every Deferred in the list has a result.
dlist = defer.DeferredList(deferred_list)
# Stop the event loop once all tasks are finished, on success or failure.
dlist.addBoth(stop_loop)
reactor.run()