Skip to content

Commit 7f48af8

Browse files
committed
nice download,增加数十倍下载速度
1 parent 353c780 commit 7f48af8

File tree

4 files changed

+140
-2
lines changed

4 files changed

+140
-2
lines changed

README.md

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,23 @@
11
# Python
22

3+
### nice_download.py 多线程文件下载器
4+
```
5+
理论上,在下载大型文件且带宽充足的情况下,下载速度可提升数十倍
6+
原理是多线程对目标文件分块下载
7+
1,发送head请求获取目标文件总大小,以及当前是否支持分块下载(详情:http协议header头range及response的content-range),现在基本都支持
8+
2,下载前创建一个和要下载文件一样大小的文件
9+
3,根据1中获得的文件大小分块多线程,各个线程下载不同的数据块
10+
文件太小时可能看不出加速效果,在大型文件上才会与普通下载拉开差距
11+
关于http的range特性:
12+
有些文件下载器在下载中断之后可以从中断位置继续下载,而不必重新开始,原因就是利用了服务器支持range的特性
13+
记录了中断时的文件偏移位置,在实现时只要在中断异常的时候记录文件偏移位置到临时文件
14+
下次继续下载读取临时文件中的偏移即可支持断点下载,下载完成时删除记录文件偏移的临时文件即可
15+
说明:
16+
nice_download.py是多线程模式,所以去除断点下载功能,否则维护临时文件偏移位置比维护单一进程的临时文件偏移位置要复杂的多
17+
查看帮助:python nice_download.py -h
18+
```
19+
![](https://github.com/LockGit/Py/blob/master/img/download.gif)
20+
321
### ac.py 一个字符串搜索算法(trie树+AC自动机)
422
```
523
学习记录:
@@ -188,7 +206,7 @@ GET: https://github.com/
188206
```
189207

190208

191-
### base64.py base64加密原理
209+
### base64_str.py base64加密原理
192210
```
193211
Base64加密原理,使用Python实现Base64加密,可能有bug,未完全完善版
194212
1,准备一个包含64个字符的数组
@@ -200,7 +218,7 @@ Base64加密原理,使用Python实现Base64加密,可能有bug,未完全
200218
Base64编码会把3字节的二进制数据编码为4字节的文本数据,长度增加33%
201219
202220
例:
203-
➜ Py git:(master) ✗ python base64.py lock
221+
➜ Py git:(master) ✗ python base64_str.py lock
204222
bG9jaw==
205223
➜ Py git:(master) ✗ echo -n lock|base64
206224
bG9jaw==

base64.py renamed to base64_str.py

File renamed without changes.

img/download.gif

1.42 MB
Loading

nice_download.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
# author: Lock
4+
# time: 2017/12/21 18:28
5+
# 多线程文件下载器,默认单线程
6+
7+
import sys
8+
import optparse
9+
import threading
10+
import requests
11+
import re
12+
import time
13+
14+
15+
class Download(object):
    """Multi-threaded HTTP file downloader based on HTTP Range requests.

    The target size is probed with a ranged HEAD request, a local file of
    that size is pre-allocated, and each worker thread fetches one
    contiguous byte range and writes it at the matching file offset.
    When the server does not answer the probe with a Content-Range header,
    the file is fetched as a single plain GET instead.

    ``config_dict`` keys read by the constructor:
        url        -- resource to download
        thread     -- requested number of worker threads
        user_agent -- value sent as the User-Agent header
        show_print -- the string 'yes' enables per-part progress output
    """

    # Size of the pieces streamed from the response to disk.
    CHUNK_SIZE = 64 * 1024
    # Local file name used when the URL has no usable last path segment.
    DEFAULT_FILENAME = 'download'

    def __init__(self, config_dict):
        self.url = config_dict['url']
        # A URL ending in '/' yields an empty name, which open() cannot use.
        self.filename = (self.clear_name(config_dict['url'].split('/')[-1])
                         or self.DEFAULT_FILENAME)
        self.thread = config_dict['thread']
        self.user_agent = config_dict['user_agent']
        self.fileSize = 0
        self.supportThread = True
        self.show_print = config_dict['show_print'] == 'yes'
        # thread_id of every part that could not be downloaded
        self.failed_parts = []

    def clear_name(self, filename):
        """Return ``filename`` without the characters \\ / : * ? " < > |."""
        return re.sub(r'[\\/:*?"<>|]', '', filename)

    @staticmethod
    def _part_ranges(file_size, parts):
        """Split ``file_size`` bytes into inclusive ``(start, stop)`` ranges.

        The number of ranges is ``parts`` clamped to ``1..file_size`` so no
        range is empty. HTTP byte ranges include both ends, hence
        ``stop = start + size - 1``; the last range absorbs the remainder.
        Returns an empty list when ``file_size`` is not positive.
        """
        if file_size <= 0:
            return []
        parts = max(1, min(parts, file_size))
        size = file_size // parts
        ranges = []
        for index in range(parts):
            start = size * index
            if index == parts - 1:
                stop = file_size - 1
            else:
                stop = start + size - 1
            ranges.append((start, stop))
        return ranges

    def init_file_info(self):
        """Probe the target and set ``fileSize`` and ``supportThread``.

        Sends a HEAD request carrying ``Range: bytes=0-4``. A Content-Range
        answer gives the total size and means ranged (multi-thread) download
        is possible. Otherwise ``supportThread`` is cleared and the size is
        taken from Content-Length when that header is present.

        Returns True only when ranged download is supported.
        """
        headers = {
            'User-Agent': self.user_agent,
            'Range': 'bytes=0-4'
        }
        try:
            # requests.head() does not follow redirects unless asked to,
            # while the later GET does; probe the same final resource.
            r = requests.head(self.url, headers=headers, allow_redirects=True)
        except Exception as e:
            self.supportThread = False
            print('can not support multi thread download , error:%s' % (e,))
            return False

        if r.status_code >= 400:
            # Headers of an error response describe the error page, not the file.
            self.supportThread = False
            print('can not support multi thread download , error:%s'
                  % ('HTTP status %s' % (r.status_code,),))
            return False

        # Accept any "bytes a-b/total" so files shorter than 5 bytes work too.
        matched = re.match(r'^bytes \d+-\d+/(\d+)$',
                           r.headers.get('content-range', ''))
        if matched:
            self.fileSize = int(matched.group(1))
            return True

        print('can not support breakpoint download,msg:%s'
              % ('no usable content-range header',))
        # Without Content-Range a ranged GET would be answered with the whole
        # body, so only a single plain download is safe.
        self.supportThread = False
        try:
            self.fileSize = int(r.headers['content-length'])
        except (KeyError, ValueError) as e:
            print('can not support multi thread download , error:%s' % (e,))
        return False

    def start_part_download(self, thread_id, start_index, stop_index):
        """Download bytes ``start_index..stop_index`` (inclusive) into the file.

        With range support a ranged GET is sent and status 206 is required;
        without it a plain GET is sent and status 200 is required. The body
        is streamed to ``self.filename`` starting at ``start_index``.
        Failures do not propagate: the ``thread_id`` is appended to
        ``self.failed_parts`` and, if ``show_print`` is set, a message is
        written to stdout.
        """
        headers = {'User-Agent': self.user_agent}
        expected_status = 200
        if self.supportThread:
            headers['Range'] = 'bytes=%d-%d' % (start_index, stop_index,)
            expected_status = 206
        # Stays None when the request itself fails, so the error report
        # below never touches an unbound response object.
        status_code = None
        try:
            r = requests.get(self.url, headers=headers, stream=True, allow_redirects=True)
            try:
                status_code = r.status_code
                if status_code != expected_status:
                    raise IOError('unexpected HTTP status, expected %s' % (expected_status,))
                written = 0
                with open(self.filename, "rb+") as fp:
                    fp.seek(start_index)
                    # Stream in chunks instead of buffering the whole part.
                    for chunk in r.iter_content(chunk_size=self.CHUNK_SIZE):
                        if chunk:
                            fp.write(chunk)
                            written += len(chunk)
            finally:
                r.close()
            if self.show_print:
                sys.stdout.write('thread %s download part size:%.2f KB\n' % (thread_id, written / 1024.0))
                sys.stdout.flush()
        except Exception as e:
            self.failed_parts.append(thread_id)
            if self.show_print:
                sys.stdout.write('下载出现错误,错误位置:%s,状态码:%s,错误信息:%s\n' % (start_index, status_code, e))
                sys.stdout.flush()

    def run(self):
        """Probe the target, pre-allocate the file and download all parts."""
        print('Start...')
        start_time = time.time()
        self.init_file_info()
        if self.fileSize <= 0:
            # Checked before touching the disk so a failed probe neither
            # creates an empty file nor truncates an existing one.
            print('Can not download')
            return

        if self.supportThread is False and self.thread > 1:
            print('sorry,only support single thread')
            self.thread = 1
        ranges = self._part_ranges(self.fileSize, self.thread)
        self.thread = len(ranges)
        print('Thread count is:%s' % (self.thread,))

        # Create a file of the same size as the target so every worker can
        # seek to its own offset and write independently.
        with open(self.filename, "wb") as fp:
            fp.truncate(self.fileSize)

        self.failed_parts = []
        workers = []
        for thread_id, (start_index, stop_index) in enumerate(ranges):
            download_args = {'thread_id': thread_id, 'start_index': start_index, 'stop_index': stop_index}
            worker = threading.Thread(target=self.start_part_download, kwargs=download_args)
            worker.daemon = True
            worker.start()
            workers.append(worker)

        # Wait only for the threads started here. Joining with a timeout
        # keeps the main thread responsive to Ctrl-C (an untimed join is
        # not interruptible on Python 2).
        for worker in workers:
            while worker.is_alive():
                worker.join(0.5)

        elapsed = time.time() - start_time
        if self.failed_parts:
            print('Failed.\nTime:%.2fs , failed parts:%s' % (elapsed, sorted(self.failed_parts)))
        else:
            print('Success.\nTime:%.2fs , Size:%.2fKB' % (elapsed, self.fileSize / 1024.0))
99+
100+
101+
if __name__ == '__main__':
    # Command-line entry point: parse the options, build the config dict
    # expected by Download and start the download.
    # %prog is expanded by optparse to the script's base name, which already
    # carries the ".py" suffix.
    parser = optparse.OptionParser(usage='python %prog [options]')
    parser.add_option('-u', dest='url', type='string', help='specify download resource url')
    parser.add_option('-t', dest='thread', type='int', help='specify download thread count', default=1)
    parser.add_option('-p', dest='show_print', type='string', help='yes/no,show print info,default enable', default='yes')
    parser.add_option("-a", dest="user_agent", help="specify request user agent", default='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:57.0) Gecko/20100101 Firefox/57.0')
    (options, args) = parser.parse_args()
    if options.url is None:
        parser.print_help()
        # sys.exit is always available; the bare exit() helper is only
        # injected by the site module.
        sys.exit()
    if options.thread < 1:
        # The file is split by the thread count, so it must be positive.
        parser.error('thread count must be at least 1')
    config = {
        'url': options.url,
        'thread': options.thread,
        'user_agent': options.user_agent,
        'show_print': options.show_print
    }
    try:
        Download(config).run()
    except KeyboardInterrupt:
        print('\nCancel Download')

0 commit comments

Comments
 (0)