I've been learning web scraping recently, so here is the source code for a crawler that grabs photo sets. It only uses multithreading and does no duplicate checking (a simple skip-if-exists check is sketched after the code).
Images are saved under the J:\xiezhen\ folder; change the path to whatever you like.
[Python]
import os
import time
import concurrent.futures

import requests
from lxml import etree

# Shared request headers (the original defined the same dict in both functions)
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}


def download_image(url, img_path):
    """Download a single image and save it under img_path."""
    response = requests.get(url, headers=HEADERS)
    img_name = url.split('/')[-1]
    with open(os.path.join(img_path, img_name), 'wb') as f:
        f.write(response.content)
    print(f'Image {img_path}/{img_name} downloaded.')


def process_page(page):
    """Fetch one list page, then download every image from each article on it."""
    url = f'https://www.xiezhen.xyz/page/{page}'
    response = requests.get(url, headers=HEADERS)
    html = etree.HTML(response.content)
    # Links to the individual article pages on this list page
    article_urls = html.xpath('//div[@class="excerpts"]/article/a/@href')
    for article_url in article_urls:
        response = requests.get(article_url, headers=HEADERS)
        html = etree.HTML(response.content)
        img_nodes = html.xpath('//article/p/img')
        img_title = html.xpath('//title/text()')[0].split('-')[0]
        img_path = f'J:/xiezhen/{img_title}'  # change this to your own save directory
        if not os.path.exists(img_path):
            os.makedirs(img_path)
        # Download all images of this article concurrently
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(download_image, node.attrib['src'], img_path)
                       for node in img_nodes]
            for future in concurrent.futures.as_completed(futures):
                pass
        time.sleep(0.5)  # brief pause between articles to go easy on the server


if __name__ == '__main__':
    # One worker thread per list page
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_page, page) for page in range(1, 573)]
        for future in concurrent.futures.as_completed(futures):
            pass
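Since the script does no duplicate checking, re-running it downloads everything again. Below is a minimal sketch (not part of the original script) of one way to add it: check whether the target file already exists on disk before requesting it. The check is by file name only, so it won't catch the same image saved under a different name.

[Python]
import os

import requests

# Sketch only: drop-in replacement for download_image with a skip-if-exists check.
def download_image(url, img_path):
    headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder UA for this sketch
    img_name = url.split('/')[-1]
    target = os.path.join(img_path, img_name)
    if os.path.exists(target):
        # Crude duplicate check by file name; skips the HTTP request entirely.
        print(f'Skipping {target}: already downloaded.')
        return
    response = requests.get(url, headers=headers)
    with open(target, 'wb') as f:
        f.write(response.content)
    print(f'Image {target} downloaded.')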