|
注册登录后全站资源免费查看下载
您需要 登录 才可以下载或查看,没有账号?立即注册
×
【多线程爬虫成品EXE下载地址】链接: https://pan.baidu.com/s/1QymE_5q7e3bkHax7nIvkGw?pwd=tspe 提取码: tspe
【多线程爬虫成品EXE下载地址】链接:https://share.weiyun.com/86w0CFW7 密码:6jt488
这个网站有2万多张美图,请不要把人家的网站搞垮,所以我只开了5个线程;会动手的朋友可以自行修改线程数,成品EXE固定使用5个线程爬取。
【异步爬虫成品下载地址】https://www.paijishu.com/forum.php?mod=viewthread&tid=8752
多线程爬虫代码
- from collections import namedtuple
- from concurrent.futures import ThreadPoolExecutor
- from typing import Dict, List
- import re
- import requests
- import os
- from datetime import datetime
- import keyboard
- from fake_useragent import UserAgent
- from lxml import etree
- from rich.console import Console
-
# Shared rich console used for all coloured status output.
console = Console()

# Headers sent with every HTTP request; the random User-Agent is chosen
# once, when this module is loaded.
headers = {'User-Agent': UserAgent().random}

# One archive entry: year / month / day labels plus the post title and link.
DATA = namedtuple('DATA', ['year', 'month', 'day', 'title', 'href'])

# Archive index page that start_requests() scrapes.
url = 'https://www.vmgirls.com/archives.html'

# File extensions that save_img() treats as downloadable images.
img_list = ['jpg', 'png', 'gif', 'jpeg']
-
def start_requests(timeout=30):
    '''
    Yield one DATA record for every post listed on the archive page.

    The archive page is fetched once and then walked year -> month -> day
    with positional XPath queries.

    Args:
        timeout: Seconds to wait for the archive page before giving up, so
            a stalled connection cannot hang the program forever.

    Yields:
        DATA(year, month, day, title, href) for each archive entry that has
        a day label, a title and a link.

    Raises:
        requests.HTTPError: If the archive page answers with an error status.
        requests.RequestException: On connection problems or timeout.
    '''
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were the archive.
    res.raise_for_status()
    et = etree.HTML(res.text)

    archive = '//div[@id="archives"]'
    # One <h4> heading per year.
    years = et.xpath(f'{archive}/h4/text()')
    for year_idx, year in enumerate(years, start=1):
        # Month labels of the year_idx-th list (XPath positions are 1-based).
        months = et.xpath(f'{archive}//ul[{year_idx}]/li/span/text()')
        for month_idx, month in enumerate(months, start=1):
            day_base = f'{archive}//ul[{year_idx}]/li[{month_idx}]/ul/li'
            day_count = len(et.xpath(day_base))
            for day_idx in range(1, day_count + 1):
                item = f'{day_base}[{day_idx}]'
                day = et.xpath(f'{item}/text()')
                href = et.xpath(f'{item}/a/@href')
                title = et.xpath(f'{item}/a/text()')
                # Skip incomplete entries rather than letting an IndexError
                # abort the whole generator.
                if not (day and href and title):
                    continue
                yield DATA(year, month, day[0], title[0], href[0])
-
-
def get_data(yield_func):
    '''
    Re-yield every item produced by ``yield_func`` unchanged.

    Args:
        yield_func: Any iterable (typically the generator returned by
            start_requests()).

    Yields:
        The items of ``yield_func`` in their original order.
    '''
    for record in yield_func:
        yield record
-
-
def save_img(url, path, title, timeout=30):
    '''
    Download every gallery image linked from one post page.

    Args:
        url: Address of the post page to scan for gallery links.
        path: Directory the images are written to (created on demand via
            mkdir_path, which also strips whitespace from it).
        title: Post title, used as the file-name prefix.
        timeout: Seconds to wait for each HTTP response.

    Raises:
        KeyboardInterrupt: If the Esc key is held down while the gallery is
            being processed.
        requests.HTTPError: If the post page answers with an error status.
        requests.RequestException: On connection problems or timeout.
    '''
    from urllib.parse import urljoin

    page = requests.get(url, headers=headers, timeout=timeout)
    page.raise_for_status()
    tree = etree.HTML(page.text)
    links = tree.xpath('//div[@class="nc-light-gallery"]//@href')
    if not links:
        return

    # The target directory does not change per image, so create it once.
    path = mkdir_path(path)
    # Characters that are illegal in file names would make open() fail.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)

    # Iterate over ALL links; the previous range(0, len - 1) dropped the last.
    for i, link in enumerate(links):
        # Non-blocking check: read_key() would park this worker thread until
        # some key event arrives, stalling the download.
        if keyboard.is_pressed('esc'):
            raise KeyboardInterrupt

        ext = link.rsplit('.', 1)[-1].lower()
        if ext not in img_list:
            continue

        # Resolve relative and scheme-less ("//host/...") links against the page.
        res = requests.get(urljoin(url, link), headers=headers, timeout=timeout)
        if not res.ok:
            # Do not store an error page as if it were an image.
            continue

        # Keep the real extension instead of labelling every file ".jpg".
        filename = f'{safe_title}_{i}.{ext}'
        with open(os.path.join(path, filename), 'wb') as f:
            f.write(res.content)

        nowdate = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        console.print(f'[yellow]创建时间:{nowdate}\n[yellow]保存路径:{path}\n[yellow]文件:{filename} 保存成功!\n[green]提示:按esc退出')
        console.print('[blue]-'*70)
-
-
def mkdir_path(path):
    '''
    Strip all whitespace from ``path`` and make sure that directory exists.

    Args:
        path: Directory path; every whitespace character in it is removed
            before the directory is created.

    Returns:
        The whitespace-free path that now exists on disk.

    Raises:
        FileExistsError: If the cleaned path exists but is not a directory.
    '''
    path = re.sub(r'[\s]', '', path)
    # exist_ok avoids the check-then-create race between worker threads that
    # target the same directory at the same time.
    os.makedirs(path, exist_ok=True)
    return path
-
def main():
    '''
    Entry point: download every archived post with a pool of 5 threads.

    Each record from start_requests() becomes one save_img task. The
    futures are kept and their results inspected, so errors raised inside
    worker threads are reported and an Esc-triggered KeyboardInterrupt
    actually stops the program instead of being silently stored.

    The process always ends through os._exit(0), which also discards any
    tasks still waiting in the queue.
    '''
    with ThreadPoolExecutor(max_workers=5) as executor:
        try:
            futures = []
            for data in get_data(start_requests()):
                # data.day[:-2] drops the last two characters of the day label.
                path = os.path.join(
                    os.getcwd(), '美女壁纸', data.year, data.month, data.day[:-2]
                )
                futures.append(
                    executor.submit(save_img, data.href, path, data.title)
                )

            for future in futures:
                try:
                    # result() re-raises whatever the worker raised.
                    future.result()
                except Exception as e:
                    # One failed post must not stop the remaining downloads.
                    print(e)
        except KeyboardInterrupt:
            # Esc inside a worker (or Ctrl+C here): fall through and exit.
            pass
        except Exception as e:
            print(e)

        console.print('[red]程序即将退出!')
        # Exit inside the with-block: leaving it normally would wait for
        # every queued task to finish first.
        os._exit(0)
-
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
复制代码 效果图
|
|