import requests
import urllib.parse
from tqdm import tqdm
import os
import concurrent.futures


def check_local_file(url, destination, count, total_count):
    # If the local file already exists, compare its size with the remote file
    if os.path.exists(destination):
        response = requests.head(url)  # HEAD request: fetch headers only (content length etc.)
        remote_file_size = int(response.headers.get('content-length', 0))
        local_file_size = os.path.getsize(destination)  # Size of the local copy
        # If local and remote sizes match, treat the file as already downloaded
        if local_file_size == remote_file_size:
            print("no." + str(count) + "/" + str(total_count) + " " + destination
                  + " already exists with matching size, skipping download")
            return True
    return False


def download_file(url, destination, count, total_count):
    if not check_local_file(url, destination, count, total_count):
        response = requests.get(url, stream=True)
        total_size = int(response.headers.get('content-length', 0))
        block_size = 1024  # Bytes read per chunk
        # Log every URL that is actually downloaded
        with open("down.txt", "a") as file:
            file.write(url + '\n')
        with open(destination, 'wb') as file, tqdm(
            desc="Downloading no." + str(count) + "/" + str(total_count),
            total=total_size,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024
        ) as bar:
            for data in response.iter_content(block_size):
                file.write(data)
                bar.update(len(data))


# Links are stored one per line in the text file 'full-links.txt'
with open('full-links.txt', 'r') as file:
    links = file.readlines()

# Collect the cleaned URLs before downloading
urls = []
for link in links:
    link = link.strip()  # Drop the trailing newline and surrounding whitespace
    urls.append(link)

num_threads = 32  # Tune this to your machine's performance and bandwidth
total_len = len(urls)
# Make sure the output directory exists before the workers start writing into it
os.makedirs("./opennueroQTAB", exist_ok=True)
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    for index, url in enumerate(urls):
        file_name = urllib.parse.urlparse(url).path.split('/')[-1]  # Take the file name from the URL path
        file_path = "./opennueroQTAB/" + file_name
        executor.submit(download_file, url, file_path, index, total_len)