# coding: utf-8

import requests
import json
from lxml import etree
import threading
from queue import Queue

class QiushiSpide(object):
    def __init__(self):
        self.url_tmp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}
        self.pre_url = "https://www.qiushibaike.com"
        # Three queues hand work between the pipeline stages:
        # URLs -> raw HTML -> extracted items.
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_queue = Queue()

    def get_url_list(self):
        # Enqueue the 13 hot-list pages.
        for i in range(1, 14):
            self.url_queue.put(self.url_tmp.format(i))
        print(self.url_queue.qsize())
        # return [self.url_tmp.format(i) for i in range(1, 14)]

    def parse_url(self):
        while True:
            url = self.url_queue.get()
            print(url)
            # headers must be passed as a keyword argument: the second
            # positional argument of requests.get() is params, not headers.
            response = requests.get(url, headers=self.header)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()
            print("url_queue: completed one task")
            # return response.content.decode()

    def get_content_list(self):
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            li_list = html.xpath("//li[contains(@class,'item typs_')]")
            content_list = []
            for li in li_list:
                item = {}
                img_list = li.xpath(".//a[contains(@class,'recmd-left')]")
                for img in img_list:
                    item["img_url"] = "https:" + img.xpath("./img/@src")[0] if len(img.xpath("./img/@src")) > 0 else None
                div_list = li.xpath(".//div[@class='recmd-right']")
                for div in div_list:
                    item["text"] = div.xpath("./a/text()")[0] if len(div.xpath("./a/text()")) > 0 else None
                    item["a_href"] = self.pre_url + div.xpath("./a/@href")[0] if len(div.xpath("./a/@href")) > 0 else None
                    item["smile_num"] = div.xpath(".//div[@class='recmd-num']/span[1]/text()")[0] if len(div.xpath(".//div[@class='recmd-num']/span[1]")) > 0 else None
                    item["comment_num"] = div.xpath(".//div[@class='recmd-num']/span[4]/text()")[0] if len(div.xpath(".//div[@class='recmd-num']/span[4]")) > 0 else None
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()
            print("html_queue: completed one task")
            # return content_list

    def save_content(self):
        while True:
            content = self.content_queue.get()
            with open("qiushibaike_multithread.txt", 'a', encoding='utf-8') as f:
                f.write(json.dumps(content, ensure_ascii=False, indent=2))
                f.write("\n")
            self.content_queue.task_done()

    def run(self):
        # Single-threaded version, kept for reference:
        # url_list = self.get_url_list()
        # for url in url_list:
        #     print(url)
        #     html_str = self.parse_url(url)
        #     content = self.get_content_list(html_str)
        #     self.save_content(content)
        t_list = []
        self.get_url_list()
        for i in range(4):
            p = threading.Thread(target=self.parse_url)
            t_list.append(p)
        print("finished adding parse_url threads")
        for i in range(4):
            g = threading.Thread(target=self.get_content_list)
            t_list.append(g)
        print("finished adding get_content_list threads")
        s = threading.Thread(target=self.save_content)
        t_list.append(s)
        for t in t_list:
            # Daemon threads are expendable: when the main thread ends,
            # these worker threads end with it.
            t.daemon = True  # setDaemon() is deprecated; assign the attribute instead
            t.start()

        for q in [self.url_queue, self.html_queue, self.content_queue]:
            # Block the main thread until every task put on the queue
            # has been matched by a task_done() call.
            q.join()
        print("main thread ends")

if __name__ == "__main__":
    q = QiushiSpide()
    q.run()
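
The shutdown logic is the part worth isolating. The workers loop forever, so they can never be join()ed directly; instead the queues are joined and the workers are marked as daemons so they die with the main thread. A minimal sketch of that pattern, independent of the spider (the queue contents and worker function are made up for illustration):

# coding: utf-8
# Minimal producer-consumer shutdown sketch (illustrative names only).
import threading
from queue import Queue

task_queue = Queue()

def worker():
    while True:                  # infinite loop: the thread never returns,
        n = task_queue.get()     # so it cannot be join()ed directly
        print(n * n)
        task_queue.task_done()   # exactly one task_done() per get()

for n in range(10):
    task_queue.put(n)

for _ in range(3):
    t = threading.Thread(target=worker)
    t.daemon = True              # dies together with the main thread
    t.start()

task_queue.join()                # blocks until all 10 tasks are done
# The main thread exits here, taking the daemon workers with it.

The same idea scales to the three chained queues above: because each stage calls task_done() only after putting its result on the next queue, joining the queues in pipeline order guarantees every URL has been fetched, parsed, and written before the process exits.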

