发个爬小姐姐的源码

csx · 发表于 2023-7-19 20:28

最近也在学爬虫，发个爬小姐姐的源码：只用了多线程，没有做查重处理。
图片默认保存在 J:\xiezhen\ 文件夹下，可在 process_page 函数中自行修改保存路径。

[Python] 纯文本查看 复制代码

import time
import requests
from lxml import etree
import os
import concurrent.futures

def download_image(url, img_path):
    """Download a single image and save it into an existing directory.

    The file name is the last path segment of ``url``. The query string and
    fragment are ignored so they cannot leak into the file name (characters
    such as ``?`` are not valid in Windows file names).

    Args:
        url: Absolute URL of the image.
        img_path: Directory the file is written into; it must already exist.

    Returns:
        None. A URL without a usable file name is reported and skipped.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
        OSError: If the target file cannot be written.
    """
    from urllib.parse import urlsplit

    # Derive the name before doing any network work so unusable URLs are
    # rejected cheaply.
    img_name = urlsplit(url).path.rsplit('/', 1)[-1]
    if not img_name:
        print(f'跳过无法确定文件名的链接：{url}')
        return

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    # Without a timeout a stalled connection would block this worker forever.
    response = requests.get(url, headers=headers, timeout=30)
    # Do not save an HTML error page under an image file name.
    response.raise_for_status()

    with open(os.path.join(img_path, img_name), 'wb') as f:
        f.write(response.content)
    print(f'图片：{img_path}' + '/' + f'{img_name}下载完成！')

def process_page(page, save_root='J:/xiezhen'):
    """Scrape one listing page and download the images of every article on it.

    For each article linked from the listing page, a directory named after
    the article title (the text before the first ``-`` in ``<title>``) is
    created under ``save_root`` and every ``<img>`` found under
    ``//article/p`` is downloaded into it concurrently.

    Args:
        page: Listing page number, interpolated into the site's
            ``/page/{page}`` URL.
        save_root: Directory under which per-article folders are created.
            Defaults to the originally hard-coded ``J:/xiezhen``.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched. Failures of individual articles or images are reported
            with ``print`` and skipped instead.
    """
    from urllib.parse import urljoin

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    # Characters that Windows does not allow in file or directory names.
    forbidden = str.maketrans('', '', '\\/:*?"<>|')

    page_url = f'https://www.xiezhen.xyz/page/{page}'
    response = requests.get(page_url, headers=headers, timeout=30)
    response.raise_for_status()
    listing = etree.HTML(response.content)
    if listing is None:
        # Defensive: nothing parseable came back, so there is nothing to do.
        return
    article_urls = listing.xpath('//div[@class="excerpts"]/article/a/@href')

    for article_url in article_urls:
        try:
            response = requests.get(article_url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One broken article must not abort the rest of the page.
            print(f'页面获取失败：{article_url}（{exc}）')
            continue
        article = etree.HTML(response.content)
        if article is None:
            continue

        titles = article.xpath('//title/text()')
        raw_title = titles[0].split('-')[0] if titles else ''
        # Trailing dots/spaces are also stripped because Windows rejects
        # directory names that end with them.
        img_title = raw_title.translate(forbidden).strip().rstrip('. ') or 'untitled'
        img_path = f'{save_root}/{img_title}'
        # exist_ok avoids the check-then-create race between worker threads
        # that handle articles sharing the same title.
        os.makedirs(img_path, exist_ok=True)

        sources = [img.get('src') for img in article.xpath('//article/p/img')]
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = {
                # urljoin leaves absolute URLs untouched and resolves
                # relative ones against the article URL.
                executor.submit(download_image, urljoin(article_url, src), img_path): src
                for src in sources
                if src  # skip <img> tags without a src attribute
            }
            for future in concurrent.futures.as_completed(futures):
                try:
                    # result() re-raises whatever the download raised, so
                    # failures are reported instead of silently dropped.
                    future.result()
                except (requests.RequestException, OSError) as exc:
                    print(f'图片下载失败：{futures[future]}（{exc}）')
        # Short pause between articles to go easier on the server.
        time.sleep(0.5)

if __name__ == '__main__':
    # Fan the listing pages (1..572) out over a thread pool; each worker
    # handles one listing page via process_page.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {
            executor.submit(process_page, page): page
            for page in range(1, 573)
        }
        for future in concurrent.futures.as_completed(futures):
            try:
                # Exceptions raised inside a worker are only surfaced when
                # result() is called; without this, failed pages vanish.
                future.result()
            except Exception as exc:  # top-level boundary: report and go on
                print(f'第{futures[future]}页处理失败：{exc}')

账号		自动登录	找回密码
密码			立即注册

发个爬小姐姐的源码

浏览过的版块

论坛元老

灌水之王