Crawler high-performance related

High-performance crawling

How can we perform multiple tasks at the same time with high efficiency?

Serial implementation

This is the least efficient approach and the least desirable: each request blocks until the previous one has finished.

import requests

urls = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
    'https://www.cnblogs.com/news/',
    'https://cn.bing.com/',
    'https://stackoverflow.com/',
]

# Fetch each URL one after another; the next request cannot start
# until the previous response has arrived.
for url in urls:
    response = requests.get(url)
    print(response)
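To make the inefficiency concrete, a minimal timing sketch (the elapsed-time measurement is my addition, not part of the original code) can wrap the serial loop:

import time

import requests

urls = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
]

start = time.time()
for url in urls:
    requests.get(url)
# Total time is roughly the sum of all individual request latencies.
print('serial elapsed: %.2fs' % (time.time() - start))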

Multithread

Multithreading works, but it suffers from low thread utilization: each thread spends most of its time blocked waiting for network I/O.

import requests
import threading

urls = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
    'https://www.cnblogs.com/news/',
    'https://cn.bing.com/',
    'https://stackoverflow.com/',
]

def task(url):
    response = requests.get(url)
    print(response)

# Start one thread per URL; the requests now run concurrently.
for url in urls:
    t = threading.Thread(target=task, args=(url,))
    t.start()
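One common way to cap the number of threads is a thread pool. The sketch below uses concurrent.futures.ThreadPoolExecutor (my addition; the original post only shows raw threads, and the pool size of 5 is an arbitrary choice):

import requests
from concurrent.futures import ThreadPoolExecutor

urls = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
]

def task(url):
    return requests.get(url)

# A pool of 5 worker threads reuses threads instead of creating one per URL.
with ThreadPoolExecutor(max_workers=5) as pool:
    for response in pool.map(task, urls):
        print(response)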

Coroutine + IO switch

Gevent internally uses greenlet, which implements coroutines.

Because it is based on coroutines, it consumes fewer resources than threads.

from gevent import monkey; monkey.patch_all()

import gevent
import requests

def func(url):
    response = requests.get(url)
    print(response)

urls = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
    'https://www.cnblogs.com/news/',
    'https://cn.bing.com/',
    'https://stackoverflow.com/',
]

spawn_list = []
for url in urls:
    spawn_list.append(gevent.spawn(func, url))  # create a coroutine for each URL

gevent.joinall(spawn_list)  # wait for all coroutines to finish
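If the URL list is large, spawning one greenlet per URL may open too many connections at once. A minimal sketch using gevent.pool.Pool (my addition; the pool size of 5 is arbitrary) limits how many greenlets run concurrently:

from gevent import monkey; monkey.patch_all()

import requests
from gevent.pool import Pool

def func(url):
    print(requests.get(url))

urls = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
]

pool = Pool(5)  # at most 5 greenlets run at the same time
for url in urls:
    pool.spawn(func, url)
pool.join()  # wait for every greenlet in the pool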

Event loop

Twisted is an asynchronous, non-blocking framework built around an event loop (the reactor).

from twisted.web.client import getPage, defer
from twisted.internet import reactor

def stop_loop(arg):
    reactor.stop()

def get_response(contents):
    print(contents)

deferred_list = []

url_list = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
    'https://www.cnblogs.com/news/',
    'https://cn.bing.com/',
    'https://stackoverflow.com/',
]

for url in url_list:
    # getPage returns a Deferred for the request; nothing is actually
    # fetched until the reactor starts running.
    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(get_response)  # callback invoked when the response arrives
    deferred_list.append(deferred)      # collect all tasks in one list

# DeferredList fires once every task in the list has completed.
dlist = defer.DeferredList(deferred_list)
dlist.addBoth(stop_loop)  # stop the event loop when everything is done

reactor.run()
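Note that getPage has been deprecated in recent Twisted releases. A minimal sketch of the same idea using the Agent API (my adaptation, assuming a reasonably modern Twisted; not part of the original post) could look like this:

from twisted.internet import reactor, defer
from twisted.web.client import Agent, readBody

agent = Agent(reactor)

@defer.inlineCallbacks
def fetch(url):
    # Agent.request returns a Deferred that fires with the response headers;
    # readBody returns another Deferred that fires with the body bytes.
    response = yield agent.request(b'GET', url.encode('utf8'))
    body = yield readBody(response)
    print(url, len(body))

url_list = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
]

dlist = defer.DeferredList([fetch(url) for url in url_list])
dlist.addBoth(lambda _: reactor.stop())  # stop the loop when all fetches finish

reactor.run()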

