当前位置: 移动技术网 > IT编程>开发语言>JavaScript > 多线程爬取狗妈表情包

多线程爬取狗妈表情包

2020年07月30日  | 移动技术网IT编程  | 我要评论

 通过多线程爬取狗妈表情包

import requests
import json
from jsonpath import jsonpath
import threading
import queue
import time
# Download worker thread
class downlodethread(threading.Thread):
    """Worker thread that downloads image URLs taken from a shared queue.

    Every URL pulled from ``urlpage`` is fetched with ``requests`` and the
    response body is written to ``SAVE_PREFIX + url[58:78] + '.jpg'``.
    """

    # Destination prefix for the saved files. Change this to the folder you
    # want to download into (the folder must already exist).
    SAVE_PREFIX = r'E:\\狗妈表情包\\'

    # Browser-like User-Agent sent with every image request.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }

    # Seconds to wait for the server before giving up on one image.
    TIMEOUT = 10

    def __init__(self, name, urlpage):
        """Create the worker.

        Args:
            name: Thread name, shown in the progress messages.
            urlpage: Queue holding the image URLs to download.
        """
        super().__init__(name=name)
        self.urlpage = urlpage

    def run(self):
        """Download URLs from the queue until it is empty."""
        print(f"{self.name}正在运行")
        while True:
            # Several workers share the queue, so "check empty, then get" can
            # race; take the item directly and stop once nothing is left.
            try:
                url = self.urlpage.get(block=False)
            except queue.Empty:
                return
            # Fixed slice of the URL used as the file name.
            # NOTE(review): presumably the image id inside the thumbnail URL
            # -- confirm against the URLs Baidu actually returns.
            d = url[58:78]
            try:
                reponse = requests.get(url=url, headers=self.HEADERS,
                                       timeout=self.TIMEOUT)
                # Do not store an HTTP error page under a .jpg name.
                reponse.raise_for_status()
            except requests.RequestException as err:
                # One failed image must not stop the remaining downloads.
                print(self.name, '下载失败', url, err)
                continue
            with open(self.SAVE_PREFIX + str(d) + '.jpg', 'wb') as f:
                print(self.name, '正在下载')
                f.write(reponse.content)
# Crawl worker: requests the search-result pages and collects the raw text
class crawl(threading.Thread):
    """Worker thread that fetches page URLs and queues the response text.

    URLs are taken from ``first_queue``; the text of every response is put on
    ``second_queue`` for later parsing.
    """

    # Browser-like User-Agent sent with every page request.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }

    # Seconds to wait for the server before giving up on one page.
    TIMEOUT = 10

    def __init__(self, name, first_queue, second_queue=None):
        """Create the worker.

        Args:
            name: Thread name, shown in the progress messages.
            first_queue: Queue holding the page URLs to request.
            second_queue: Queue receiving the response text. When omitted,
                the module-level ``second_queue`` is used, which keeps
                existing two-argument callers working.
        """
        super().__init__(name=name)
        # Keep the queue on the instance instead of reading a module global.
        self.first_queue = first_queue
        self.second_queue = second_queue

    def run(self):
        """Request pages from the input queue until it is empty."""
        print(f"{self.name}正在运行")
        while True:
            # Several workers share the queue, so "check empty, then get" can
            # race; take the item directly and stop once nothing is left.
            try:
                t = self.first_queue.get(block=False)
            except queue.Empty:
                return
            print(f"{self.name}正在运行")
            try:
                response = requests.get(url=t, headers=self.HEADERS,
                                        timeout=self.TIMEOUT)
            except requests.RequestException as err:
                # One failed page must not stop the remaining requests.
                print(f"{self.name}请求失败", t, err)
                continue
            # Fall back to the module-level queue for two-argument callers.
            results = self.second_queue
            if results is None:
                results = second_queue
            # queue.Queue does its own locking, so no extra Lock is needed.
            results.put(response.text)
            print(f"{self.name}运行结束")
def parse_response(t):
    """Extract the thumbnail URLs from one JSON response and queue them.

    Args:
        t: Response text of one search-result request (a JSON document).

    Every value matched by the JSONPath ``$..data..thumbURL`` is put on the
    module-level ``urlpage`` queue. A response with no match adds nothing.

    Raises:
        json.JSONDecodeError: If ``t`` is not valid JSON.
    """
    data = json.loads(t)
    x = jsonpath(data, '$..data..thumbURL')
    print('解析线程启动')
    # jsonpath() returns False, not an empty list, when nothing matches;
    # iterating over that would raise TypeError.
    if not x:
        return
    for i in x:
        urlpage.put(i)

if __name__ == '__main__':
    # Pipeline: page URLs -> raw response text -> image URLs -> files on disk.
    first_queue = queue.Queue()   # search-result page URLs
    second_queue = queue.Queue()  # raw response text of each page
    third_queue = queue.Queue()   # not used by the visible script
    urlpage = queue.Queue()       # image URLs to download
    # The URL was obtained by inspecting the page's ajax requests; `pn` is the
    # result offset, advanced in steps of 30.
    for i in range(0,480,30):
        first_queue.put(f"https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E7%A5%9E%E4%B9%90%E4%B8%83%E5%A5%88%E8%A1%A8%E6%83%85%E5%8C%85&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&word=%E7%A5%9E%E4%B9%90%E4%B8%83%E5%A5%88%E8%A1%A8%E6%83%85%E5%8C%85&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn={i}&rn=30&gsm=5a&1594005271382=")

    # Start the crawl workers and wait for ALL of them, not only the last
    # one, so every response is queued before parsing starts.
    crawl_workers = [crawl(f'爬取{c}', first_queue) for c in range(0, 3)]
    for worker in crawl_workers:
        worker.start()
    for worker in crawl_workers:
        worker.join()

    # Parse every response in its own thread. The function and its argument
    # are passed separately; `target=parse_response(t)` would run the parser
    # right here and hand Thread a None target.
    parse_workers = []
    while not second_queue.empty():
        page_text = second_queue.get()
        parser = threading.Thread(target=parse_response, args=(page_text,))
        parser.start()
        parse_workers.append(parser)
    # Joining the parsers replaces the fixed sleep the script used to rely on.
    for parser in parse_workers:
        parser.join()

    # Start the download workers and wait for all of them to finish.
    download_workers = [downlodethread(f'下载{e}', urlpage) for e in range(0, 3)]
    for worker in download_workers:
        worker.start()
    for worker in download_workers:
        worker.join()

这是从百度图片中抓取下来的狗妈表情包。因为百度图片使用的是 Ajax 请求,所以需要先分析网页的请求,再解析返回的 JSON 数据。

本文地址:https://blog.csdn.net/snake_boy_/article/details/107666775

如对本文有疑问, 点击进行留言回复!!

相关文章:

验证码:
移动技术网