【多线程增量式爬虫】- 爬取王者荣耀英雄+皮肤高清壁纸

csx · 发表于 2023-10-8 01:37

图片仅供个人用作壁纸；未经授权请勿商用。

运行环境：Python 3.x
运行前需要先安装 requests 和 lxml：
pip install requests lxml
安装完成后直接运行脚本即可，图片会保存到当前工作目录下的 skins/ 文件夹。

[Python] 纯文本查看 复制代码

import os
import time
import json
import requests
import threading
from queue import Queue
from lxml import etree

# HTTP headers sent with the page and image requests (desktop Chrome user-agent).
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/89.0.4389.90 Safari/537.36'
    ),
}

# Hero detail page; {hid} is the hero id.
DETAIL_URL = 'https://pvp.qq.com/web201605/herodetail/{hid}.shtml'
# Full-size skin image; {index} is the skin's position on the detail page.
SKIN_URL = 'https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{hid}/{hid}-bigskin-{index}.jpg'

# Work queue shared by the main thread (producer) and the download workers.
RUNTIME_QUEUE = Queue()
# Every worker thread that has been started, kept so they can be joined later.
THREADS = []
# Number of download worker threads to start.
THREAD_NUMBER = 100
# Directory (relative to the working directory) the images are written to.
SKINS_PATH = 'skins/'

def parse_skins(*hero, timeout=30):
    """Lazily yield the skins of one hero as ``{'name': ..., 'url': ...}`` dicts.

    This is a generator function: nothing in the body (including the HTTP
    request) runs until the first item is requested.

    Args:
        *hero: Exactly two positional values, ``(hid, cname)``. ``hid`` is
            substituted into DETAIL_URL and SKIN_URL; ``cname`` becomes the
            prefix of each skin's name.
        timeout: Seconds to wait for the detail page, so that a stalled
            connection cannot block the consuming thread forever.

    Yields:
        dict: ``'name'`` is ``"<cname>-<skin name>"`` and ``'url'`` is the
        SKIN_URL for that skin. Skins are numbered from 1 in page order.

    Raises:
        ValueError: If ``hero`` does not hold exactly two values.
        Exception: If the detail page does not answer with HTTP 200.
            Errors raised by ``requests.get`` itself propagate as well.
    """
    hid, cname = hero
    resp = requests.get(DETAIL_URL.format(hid=hid), headers=headers, timeout=timeout)
    if resp.status_code != 200:
        raise Exception('status_code error!')
    try:
        content = resp.content.decode('gbk')
        selector = etree.HTML(content)
        # The attribute holds one entry per skin, separated by '|'.
        res = selector.xpath('//div[@class="pic-pf"]/ul/@data-imgname')[0]
    except Exception as e:
        # Best effort: a page that cannot be decoded or parsed yields no skins.
        print(e)
        return
    for index, _skin in enumerate(res.split('|'), start=1):
        # Only the text before the first '&' is used as the skin name.
        skin_name = _skin.split('&')[0]
        yield {
            'name': '%s-%s' % (cname, skin_name),
            'url': SKIN_URL.format(hid=hid, index=index)
        }

def create_thread():
    """Start THREAD_NUMBER threads running ``downloader`` and record them in THREADS."""
    started = 0
    while started < THREAD_NUMBER:
        worker = threading.Thread(target=downloader)
        worker.start()
        THREADS.append(worker)
        started += 1

def close_thread():
    """Wait for the queued work to finish, then stop and join every worker.

    One ``None`` sentinel is queued per worker; a worker leaves its loop when
    it receives one.
    """
    # Block until every item put on the queue has been marked done.
    RUNTIME_QUEUE.join()

    remaining = THREAD_NUMBER
    while remaining > 0:
        RUNTIME_QUEUE.put(None)
        remaining -= 1

    for worker in THREADS:
        worker.join()

def _save_skin(file_name, url, timeout):
    """Download one skin image into SKINS_PATH unless the file already exists."""
    file_path = '%s%s.jpg' % (SKINS_PATH, file_name)
    if os.path.exists(file_path):
        # Incremental crawl: anything already on disk is skipped.
        return
    print('开始下载>>>>>%s, url:%s' % (file_name, url))
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Never store an error response as a .jpg: the existence check above
    # would then skip this skin on every later run.
    resp.raise_for_status()
    # Write under a temporary name and rename afterwards, so an interrupted
    # write is not mistaken for a finished download on the next run.
    tmp_path = file_path + '.part'
    with open(tmp_path, 'wb') as fb:
        fb.write(resp.content)
    os.replace(tmp_path, file_path)
    print('下载完成<<<<保存在: %s' % os.path.abspath(file_path))


def downloader(timeout=30):
    """Worker loop: take skin generators from RUNTIME_QUEUE and download them.

    Each queue item is an iterable of ``{'name': ..., 'url': ...}`` dicts, or
    ``None`` as the signal to exit. Every item taken from the queue is
    acknowledged with ``task_done()`` exactly once, whether it succeeded or
    failed, so ``RUNTIME_QUEUE.join()`` cannot block on a failed item.

    Args:
        timeout: Seconds to wait for each image request.
    """
    while True:
        generator = RUNTIME_QUEUE.get()
        try:
            if generator is None:
                break
            for skin in generator:
                try:
                    _save_skin(skin['name'], skin['url'], timeout)
                except Exception as e:
                    # One failed image must not abort the rest of this item.
                    print(e)
        except Exception as e:
            # Raised while advancing the generator itself; drop this item
            # and carry on with the next one.
            print(e)
        finally:
            RUNTIME_QUEUE.task_done()

if __name__ == "__main__":
    # Script entry point: start the workers, queue one skin generator per
    # hero, then wait for the downloads and shut the workers down.
    os.makedirs(SKINS_PATH, exist_ok=True)
    try:
        create_thread()
        if not os.path.exists('herolist.json'):
            print('正在请求最新数据... >>> from url: https://pvp.qq.com/web201605/js/herolist.json')
            # The fetched list is not written to disk; a herolist.json file is
            # only used when it is already present in the working directory.
            hero_list = requests.get(
                'https://pvp.qq.com/web201605/js/herolist.json', timeout=30
            ).json()
        else:
            print('正在读取数据... >>> from file: herolist.json')
            with open('herolist.json', 'r', encoding='utf-8') as fp:
                hero_list = json.load(fp=fp)
        for item in hero_list:
            RUNTIME_QUEUE.put(parse_skins(item['ename'], item['cname']))
    except Exception as e:
        print(e)
    finally:
        # Shut down exactly once, on success and on failure alike. Whatever
        # was queued before a failure is still processed by the workers.
        close_thread()

账号		自动登录	找回密码
密码			立即注册

【多线程增量式爬虫】- 爬取王者荣耀英雄+皮肤高清壁纸

论坛元老

灌水之王