0%

Python 多线程下载器

一般网上的下载工具都会提供「下载线程数量」的参数,那么多个线程是怎么提高同一个文件的下载速度的呢?其实主要是利用了 HTTP 请求头的 Range 字段:多个线程分别下载同一个文件的不同部分,最后按位置拼接起来,就达到了用多线程提高下载速度的目的。这里使用 Python 的并发库 concurrent.futures[1] 进行多线程下载。

首先需要了解 HTTP 的请求头 Range[2],它用来指定要下载文件的哪一部分,例如 `Range: bytes=0-499` 表示前 500 个字节(起止位置都包含在内)。为了防止不同线程互相干扰,每个线程只负责互不重叠的一段。

然后就是创建好线程池,省去了每次都去新建线程,直接创建一定数量的线程。获取到文件大小后平均分配给各个线程。

1
ThreadPoolExecutor(max_workers=workers_num)

获取远程文件名时,首先从 HTTP 响应头的 Content-Disposition 字段获取;如果没有获取到,就从下载链接中解析[3]。

每个线程下载好相应部分的内容之后,需要将它写入本地文件中对应的位置。使用 f.seek(start) 方法可以把文件的读写位置移动到该分片的起始偏移处。

最后使用 wait 方法等待线程池中的所有线程下载完毕,那么本次的下载也就完毕了。

可以使用 Browser Download Test Page 提供的测试样例来测试下载器。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import re
from os import path
import requests
from threading import Lock
from six.moves.urllib.parse import unquote, urlparse
from concurrent.futures import ThreadPoolExecutor, wait


class Downloader(object):
    """Download a file over HTTP(S) in parallel using byte-range requests.

    The file size is obtained with a HEAD request, the local file is
    pre-allocated to that size, and each worker thread fetches one
    non-overlapping byte range and writes it at its own offset.

    Args:
        workers_num: Number of worker threads (and maximum number of
            ranges a single download is split into).
    """

    # Size of the pieces streamed from the socket to disk.
    _CHUNK_SIZE = 64 * 1024

    # Name used when neither the response header nor the URL yields one.
    _DEFAULT_NAME = 'download'

    # Grammar fragments used to parse the Content-Disposition header.
    _TOKEN = r"[-!#-'*+.\dA-Z^-z|~]+"
    _QDTEXT = r"[]-~\t !#-[]"
    _MIME_CHARSET = r"[-!#-&+\dA-Z^-z]+"
    _LANGUAGE = (
        r"(?:[A-Za-z]{2,3}(?:-[A-Za-z]{3}(?:-[A-Za-z]{3}){,2})?|[A-Za-z]{4,8})"
        r"(?:-[A-Za-z]{4})?"
        # The region subtag is optional, so that a bare "en" is accepted.
        r"(?:-(?:[A-Za-z]{2}|\d{3}))?"
        r"(?:-(?:[\dA-Za-z]{5,8}|\d[\dA-Za-z]{3}))*"
        r"(?:-[\dA-WY-Za-wy-z](?:-[\dA-Za-z]{2,8})+)*"
        r"(?:-[Xx](?:-[\dA-Za-z]{1,8})+)?"
        r"|[Xx](?:-[\dA-Za-z]{1,8})+"
        r"|[Ee][Nn]-[Gg][Bb]-[Oo][Ee][Dd]"
        r"|[Ii]-[Aa][Mm][Ii]"
        r"|[Ii]-[Bb][Nn][Nn]"
        r"|[Ii]-[Dd][Ee][Ff][Aa][Uu][Ll][Tt]"
        r"|[Ii]-[Ee][Nn][Oo][Cc][Hh][Ii][Aa][Nn]"
        r"|[Ii]-[Hh][Aa][Kk]"
        r"|[Ii]-[Kk][Ll][Ii][Nn][Gg][Oo][Nn]"
        r"|[Ii]-[Ll][Uu][Xx]"
        r"|[Ii]-[Mm][Ii][Nn][Gg][Oo]"
        r"|[Ii]-[Nn][Aa][Vv][Aa][Jj][Oo]"
        r"|[Ii]-[Pp][Ww][Nn]"
        r"|[Ii]-[Tt][Aa][Oo]"
        r"|[Ii]-[Tt][Aa][Yy]"
        r"|[Ii]-[Tt][Ss][Uu]"
        r"|[Ss][Gg][Nn]-[Bb][Ee]-[Ff][Rr]"
        r"|[Ss][Gg][Nn]-[Bb][Ee]-[Nn][Ll]"
        r"|[Ss][Gg][Nn]-[Cc][Hh]-[Dd][Ee]"
    )
    # Percent-escapes accept both upper- and lower-case hex digits.
    _VALUE_CHARS = r"(?:%[\dA-Fa-f][\dA-Fa-f]|[-!#$&+.\dA-Z^-z|~])*"

    # One header parameter. Capturing groups, in order:
    #   1 = filename as a bare token, 2 = filename as a quoted string,
    #   3 = charset of filename*,     4 = percent-encoded value of filename*.
    # Parameters other than filename / filename* are matched but not captured.
    _DISPOSITION_PARM = (
        r'[Ff][Ii][Ll][Ee][Nn][Aa][Mm][Ee]\s*=\s*'
        r'(?:({token})|"((?:{qdtext}|\\[\t !-~])*)")'
        r"|[Ff][Ii][Ll][Ee][Nn][Aa][Mm][Ee]\*\s*=\s*"
        r"({mime_charset})'(?:{language})?'({value_chars})"
        r'|{token}\s*=\s*(?:{token}|"(?:{qdtext}|\\[\t !-~])*")'
        r"|{token}\*\s*=\s*{mime_charset}'(?:{language})?'{value_chars}"
    ).format(
        token=_TOKEN,
        qdtext=_QDTEXT,
        mime_charset=_MIME_CHARSET,
        language=_LANGUAGE,
        value_chars=_VALUE_CHARS,
    )

    # The parameter pattern appears twice, so groups 1-4 belong to the first
    # parameter and groups 5-8 to the repeated (later) parameters.
    _CONTENT_DISPOSITION_RE = re.compile(
        r"(?:{token}\s*;\s*)?(?:{parm})(?:\s*;\s*(?:{parm}))*|{token}".format(
            token=_TOKEN, parm=_DISPOSITION_PARM
        )
    )

    def __init__(self, workers_num=8):
        self.session = requests.Session()
        self.pool = ThreadPoolExecutor(max_workers=workers_num)
        self.workers_num = workers_num

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Shut down the worker pool and close the HTTP session."""
        self.pool.shutdown(wait=True)
        self.session.close()

    @staticmethod
    def _decode_ext_value(value, charset):
        """Decode a percent-encoded ``filename*`` value using ``charset``.

        Returns None when ``charset`` is not a codec known to Python, so the
        caller can fall back to another source for the name.
        """
        try:
            try:
                return unquote(value, encoding=charset)
            except TypeError:
                # Python 2's unquote() has no ``encoding`` parameter and
                # returns a byte string that must be decoded explicitly.
                return unquote(value).decode(charset)
        except LookupError:
            return None

    @classmethod
    def _parse_content_disposition(cls, header):
        """Extract the file name from a Content-Disposition header value.

        ``filename*`` is preferred over ``filename``, and a later parameter
        is preferred over the first one. Returns None when the header
        carries no usable file name.
        """
        match = cls._CONTENT_DISPOSITION_RE.match(header)
        if not match:
            return None
        if match.group(8) is not None:
            return cls._decode_ext_value(match.group(8), match.group(7))
        if match.group(4) is not None:
            return cls._decode_ext_value(match.group(4), match.group(3))
        if match.group(6) is not None:
            # Drop the backslash of each quoted-pair, keeping the character.
            return re.sub(r'\\(.)', r'\1', match.group(6))
        if match.group(5) is not None:
            return match.group(5)
        if match.group(2) is not None:
            return re.sub(r'\\(.)', r'\1', match.group(2))
        return match.group(1)

    @staticmethod
    def _split_ranges(file_size, workers_num):
        """Split ``file_size`` bytes into inclusive ``(start, end)`` ranges.

        At most ``workers_num`` ranges are produced and none of them is
        empty; the last range absorbs the remainder of the division. An
        empty file yields no ranges.
        """
        if file_size <= 0:
            return []
        count = max(1, min(workers_num, file_size))
        part = file_size // count
        ranges = []
        for i in range(count):
            start = i * part
            # HTTP byte ranges are inclusive, so the last byte is size - 1.
            end = file_size - 1 if i == count - 1 else start + part - 1
            ranges.append((start, end))
        return ranges

    def get_file_name(self, url):
        """Return the local file name to use for ``url``.

        The name is taken from the Content-Disposition response header when
        it provides one, otherwise from the last path component of ``url``.
        Only the base name is kept, so a name supplied by the server cannot
        point outside the current directory.
        """
        rsp = self.session.head(url, allow_redirects=True)
        name = None
        header = rsp.headers.get('Content-Disposition')
        if header:
            name = self._parse_content_disposition(header)
        if name:
            name = path.basename(name)
        if not name:
            name = path.basename(unquote(urlparse(url).path))
        # A URL such as "http://host/" has no path component to use.
        return name or self._DEFAULT_NAME

    def get_file_size(self, url):
        """Return the size in bytes of the resource at ``url``.

        Redirects are followed by the HTTP library. An HTTP error status
        raises the library's HTTP error, and a response without a
        Content-Length header raises KeyError.
        """
        # Ask for the unencoded representation so the reported length
        # matches the byte offsets used by the range requests.
        rsp = self.session.head(
            url, allow_redirects=True, headers={'Accept-Encoding': 'identity'}
        )
        rsp.raise_for_status()
        return int(rsp.headers["content-length"])

    def handler(self, url, file_name, start, end):
        """Download bytes ``start``..``end`` (inclusive) into ``file_name``.

        The data is streamed to disk at offset ``start``; the file must
        already exist. Raises IOError when the server ignores the Range
        header for a range that does not start at offset 0, because writing
        the full body there would corrupt the file.
        """
        headers = {
            'Range': 'bytes={}-{}'.format(start, end),
            'Accept-Encoding': 'identity',
        }
        rsp = self.session.get(url, headers=headers, stream=True)
        try:
            rsp.raise_for_status()
            if rsp.status_code != 206 and start != 0:
                raise IOError(
                    'server did not honour the Range request for bytes '
                    '{}-{} (HTTP status {})'.format(start, end, rsp.status_code)
                )
            with open(file_name, 'rb+') as f:
                f.seek(start)
                for chunk in rsp.iter_content(chunk_size=self._CHUNK_SIZE):
                    f.write(chunk)
        finally:
            rsp.close()

    def run(self, url):
        """Download ``url`` into the current directory using the thread pool.

        Blocks until every range has been fetched and re-raises the first
        exception raised by a worker, so a failed download is not silently
        left behind as a partially filled file.
        """
        file_name = self.get_file_name(url)
        file_size = self.get_file_size(url)
        # Pre-allocate the file so each worker can write at its own offset.
        with open(file_name, 'wb') as f:
            f.truncate(file_size)

        futures = [
            self.pool.submit(self.handler, url, file_name, start, end)
            for start, end in self._split_ranges(file_size, self.workers_num)
        ]
        wait(futures)
        for future in futures:
            future.result()


if __name__ == "__main__":
    import time

    # Demo: fetch three sample files one after another and print the total
    # elapsed wall-clock time in seconds.
    sample_urls = (
        "http://demo.borland.com/testsite/downloads/downloadfile.php?file=Small.zip&cd=attachment+filename",
        "http://static-aliyun-doc.oss-cn-hangzhou.aliyuncs.com/download/pdf/DNHCS_MGW1842487_zh-CN_cn_181112170048_public_92408e650bbaaab8b146f371082a0ac3.pdf",
        "https://p.pstatp.com/origin/ff670000482866725305",
    )
    downloader = Downloader()
    started_at = time.time()
    for sample_url in sample_urls:
        downloader.run(sample_url)
    finished_at = time.time()
    print(finished_at - started_at)

相关代码放到 Gist 上了。


  1. https://docs.python.org/3/library/concurrent.futures.html ↩︎

  2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Range ↩︎

  3. How to get filename from Content-Disposition in headers ↩︎