# coding:utf-8
import requests
import json
from lxml import etree
import threading
from queue import Queue
class QiushiSpide(object):
    def __init__(self):
        self.url_tmp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}
        self.pre_url = "https://www.qiushibaike.com"
        self.url_queue = Queue()      # URLs waiting to be fetched
        self.html_queue = Queue()     # raw HTML waiting to be parsed
        self.content_queue = Queue()  # extracted items waiting to be saved

    def get_url_list(self):
        for i in range(1, 14):
            self.url_queue.put(self.url_tmp.format(i))
        print(self.url_queue.qsize())
        # return [self.url_tmp.format(i) for i in range(1, 14)]

    def parse_url(self):
        while True:
            url = self.url_queue.get()
            print(url)
            # headers must be passed by keyword: the second positional
            # argument of requests.get is params, not headers
            response = requests.get(url, headers=self.header)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()
            print("url_queue completed one task")
            # return response.content.decode()

    def get_content_list(self):
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            li_list = html.xpath("//li[contains(@class,'item typs_')]")
            content_list = []
            for li in li_list:
                item = {}
                img_list = li.xpath(".//a[contains(@class,'recmd-left')]")
                for img in img_list:
                    item["img_url"] = "https:" + img.xpath("./img/@src")[0] if len(img.xpath("./img/@src")) > 0 else None
                div_list = li.xpath(".//div[@class='recmd-right']")
                for div in div_list:
                    item["text"] = div.xpath("./a/text()")[0] if len(div.xpath("./a/text()")) > 0 else None
                    item["a_href"] = self.pre_url + div.xpath("./a/@href")[0] if len(div.xpath("./a/@href")) > 0 else None
                    item["smile_num"] = div.xpath(".//div[@class='recmd-num']/span[1]/text()")[0] if len(div.xpath(".//div[@class='recmd-num']/span[1]")) > 0 else None
                    item["comment_num"] = div.xpath(".//div[@class='recmd-num']/span[4]/text()")[0] if len(div.xpath(".//div[@class='recmd-num']/span[4]")) > 0 else None
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()
            print("html_queue completed one task")
            # return content_list

    def save_content(self):
        while True:
            content = self.content_queue.get()
            with open("qiushibaike.txt", "a", encoding="utf-8") as f:
                f.write(json.dumps(content, ensure_ascii=False, indent=2))
                f.write("\n")
            self.content_queue.task_done()

    def run(self):
        # url_list = self.get_url_list()
        # for url in url_list:
        #     print(url)
        #     html_str = self.parse_url(url)
        #     content = self.get_content_list(html_str)
        #     self.save_content(content)
        t_list = []
        self.get_url_list()
        for i in range(4):
            p = threading.Thread(target=self.parse_url)
            t_list.append(p)
        print("Finished adding the parse_url threads")
        for i in range(4):
            g = threading.Thread(target=self.get_content_list)
            t_list.append(g)
        print("Finished adding the get_content_list threads")
        s = threading.Thread(target=self.save_content)
        t_list.append(s)
        for t in t_list:
            t.setDaemon(True)  # daemon threads: they exit as soon as the main thread exits
            t.start()
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()  # block the main thread until every task in this queue is marked done
        print("Main thread ends")


if __name__ == "__main__":
    q = QiushiSpide()
    q.run()
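The spider above is an instance of the producer-consumer pattern: get_url_list produces the work, each worker thread pulls from one queue and pushes to the next, task_done() is called once per get(), and queue.join() in the main thread blocks until every queued task has been marked done, at which point the daemon workers are abandoned. Below is a minimal, self-contained sketch of just that skeleton; the names in_q, out_q, worker, consumer, and the doubling step are illustrative placeholders, not part of the spider.

# Minimal producer-consumer skeleton matching the structure of the spider
# (illustrative sketch; names and the "work" are placeholders)
import threading
from queue import Queue

in_q, out_q = Queue(), Queue()

def producer():
    for i in range(10):
        in_q.put(i)

def worker():
    while True:
        task = in_q.get()
        out_q.put(task * 2)  # stand-in for the fetch/parse work
        in_q.task_done()     # one task_done() per get(), so in_q.join() can return

def consumer():
    while True:
        result = out_q.get()
        print(result)
        out_q.task_done()

producer()
for fn in (worker, consumer):
    t = threading.Thread(target=fn)
    t.daemon = True          # daemon threads die when the main thread exits
    t.start()

for q in (in_q, out_q):
    q.join()                 # block until every queued task is marked done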