# coding: utf-8

import requests
import json
from lxml import etree
import threading
from queue import Queue

class QiushiSpide(object):
    def __init__(self):
        self.url_tmp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}
        self.pre_url = "https://www.qiushibaike.com"
        # Three queues hand work between the pipeline stages:
        # URLs -> raw HTML -> extracted items.
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_queue = Queue()

    def get_url_list(self):
        # Enqueue the 13 hot-list pages.
        for i in range(1, 14):
            self.url_queue.put(self.url_tmp.format(i))
        print(self.url_queue.qsize())
        # return [self.url_tmp.format(i) for i in range(1, 14)]

    def parse_url(self):
        while True:
            url = self.url_queue.get()
            print(url)
            # headers must be passed as a keyword argument: the second
            # positional argument of requests.get() is params, not headers.
            response = requests.get(url, headers=self.header)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()
            print("url_queue: completed one task")
            # return response.content.decode()

    def get_content_list(self):
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            li_list = html.xpath("//li[contains(@class,'item typs_')]")
            content_list = []
            for li in li_list:
                item = {}
                img_list = li.xpath(".//a[contains(@class,'recmd-left')]")
                for img in img_list:
                    item["img_url"] = "https:" + img.xpath("./img/@src")[0] if len(img.xpath("./img/@src")) > 0 else None
                div_list = li.xpath(".//div[@class='recmd-right']")
                for div in div_list:
                    item["text"] = div.xpath("./a/text()")[0] if len(div.xpath("./a/text()")) > 0 else None
                    item["a_href"] = self.pre_url + div.xpath("./a/@href")[0] if len(div.xpath("./a/@href")) > 0 else None
                    item["smile_num"] = div.xpath(".//div[@class='recmd-num']/span[1]/text()")[0] if len(div.xpath(".//div[@class='recmd-num']/span[1]")) > 0 else None
                    item["comment_num"] = div.xpath(".//div[@class='recmd-num']/span[4]/text()")[0] if len(div.xpath(".//div[@class='recmd-num']/span[4]")) > 0 else None
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()
            print("html_queue: completed one task")
            # return content_list

    def save_content(self):
        while True:
            content = self.content_queue.get()
            with open("qiushibaike_multithread.txt", 'a', encoding='utf-8') as f:
                f.write(json.dumps(content, ensure_ascii=False, indent=2))
                f.write("\n")
            self.content_queue.task_done()

    def run(self):
        # Single-threaded version, kept for reference:
        # url_list = self.get_url_list()
        # for url in url_list:
        #     print(url)
        #     html_str = self.parse_url(url)
        #     content = self.get_content_list(html_str)
        #     self.save_content(content)
        t_list = []
        self.get_url_list()
        for i in range(4):
            p = threading.Thread(target=self.parse_url)
            t_list.append(p)
        print("finished adding parse_url threads")
        for i in range(4):
            g = threading.Thread(target=self.get_content_list)
            t_list.append(g)
        print("finished adding get_content_list threads")
        s = threading.Thread(target=self.save_content)
        t_list.append(s)
        for t in t_list:
            # Daemon threads are expendable: when the main thread ends,
            # these worker threads end with it.
            t.daemon = True  # setDaemon() is deprecated; assign the attribute instead
            t.start()

        for q in [self.url_queue, self.html_queue, self.content_queue]:
            # Block the main thread until every task put on the queue
            # has been matched by a task_done() call.
            q.join()
        print("main thread ends")

if __name__ == "__main__":
    q = QiushiSpide()
    q.run()
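
The shutdown logic is the part worth isolating. The workers loop forever, so they can never be join()ed directly; instead the queues are joined and the workers are marked as daemons so they die with the main thread. A minimal sketch of that pattern, independent of the spider (the queue contents and worker function are made up for illustration):

# coding: utf-8
# Minimal producer-consumer shutdown sketch (illustrative names only).
import threading
from queue import Queue

task_queue = Queue()

def worker():
    while True:                  # infinite loop: the thread never returns,
        n = task_queue.get()     # so it cannot be join()ed directly
        print(n * n)
        task_queue.task_done()   # exactly one task_done() per get()

for n in range(10):
    task_queue.put(n)

for _ in range(3):
    t = threading.Thread(target=worker)
    t.daemon = True              # dies together with the main thread
    t.start()

task_queue.join()                # blocks until all 10 tasks are done
# The main thread exits here, taking the daemon workers with it.

The same idea scales to the three chained queues above: because each stage calls task_done() only after putting its result on the next queue, joining the queues in pipeline order guarantees every URL has been fetched, parsed, and written before the process exits.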

