-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy paththebox.py
executable file
·213 lines (161 loc) · 6.22 KB
/
thebox.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/usr/bin/env python3
'''
Download script for thebox website
Developed by: Henry Liang
Last modified: 4th Jun
'''
import re
import sys
import json
import math
import time
import requests
from bs4 import BeautifulSoup
# basic value definition
# Chunk size, in bytes, passed to iter_content() when streaming a download (1024*500).
chunk_size = 1024*500
# Width, in characters, of the textual progress bar printed by download().
bar_len = 25
# Encoding assigned to the fetched page response before its text is parsed in main().
# NOTE(review): the name looks like a typo of "encoding"; kept because main() refers to it.
ecoding = 'utf-8'
# Browser-like HTTP headers; get_source() sends them when it creates a new session.
header = {
'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-encoding': 'gzip, deflate, br',
'Accept-language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive'
}
def show(file_size):
    '''
    Format a byte count as a human-readable string such as '1.50 MB'.

    file_size: size in bytes (int or float); may be fractional or negative.
    Returns '0 B' for zero, otherwise the value with two decimals followed by
    a 1024-based unit from B up to PB (anything larger stays expressed in PB).
    '''
    # when is zero
    if file_size == 0:
        return '0 B'
    unit_list = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
    # Pick the unit by repeated division instead of a logarithm: dividing a
    # float by 1024 is exact, it works for values below one byte (which stay
    # in 'B'), and it does not reject negative input.
    magnitude = abs(float(file_size))
    i = 0
    # if over the range of unit list, stop at the last unit
    while magnitude >= 1024 and i < len(unit_list) - 1:
        magnitude /= 1024
        i += 1
    # Restore the sign that was dropped for the unit selection.
    if file_size < 0:
        magnitude = -magnitude
    return '{size:.2f} {unit}'.format(size=magnitude, unit=unit_list[i])
def eta(rate, remain_size):
    '''
    Estimate the remaining transfer time as an 'Xh Ym Zs' string.

    rate: current transfer speed, in bytes per second.
    remain_size: amount of data still to transfer, in bytes.
    Returns '0h 0m 0s' when no estimate can be made (rate is zero or
    negative) or when nothing remains; the seconds part is truncated.
    '''
    # A non-positive rate cannot be divided by meaningfully, and a
    # non-positive remainder means there is nothing left to wait for.
    # Both cases use the same format as the normal result.
    if rate <= 0 or remain_size <= 0:
        return '0h 0m 0s'
    eta_time = int(remain_size / rate)
    m, s = divmod(eta_time, 60)
    h, m = divmod(m, 60)
    return '%dh %dm %ds' % (h, m, s)
def time_stamp():
    ''' Return the current local time as a bracketed HH:MM:SS tag for log output '''
    clock = time.strftime("%H:%M:%S", time.localtime())
    return f'[{clock}]'
def get_source(url, session=None):
    '''
    Fetch a page and return the response together with the session used.

    url: address to request.
    session: requests session to reuse; a new one is created when omitted.
    Returns a (response, session) tuple.
    Raises requests.exceptions.Timeout, after logging it, when the request
    times out.
    '''
    if session is None:
        # first time: new session, module-level headers, 10 second timeout
        session = requests.Session()
        request_args = {'headers': header, 'timeout': 10}
    else:
        # for second or more request: no explicit headers, 20 second timeout
        request_args = {'timeout': 20}
    try:
        html_data = session.get(url, **request_args)
    except requests.exceptions.Timeout:
        # requests reports timeouts through its own exception class rather
        # than the builtin TimeoutError.
        print(time_stamp(), 'Time out of url:', url)
        # There is no response to hand back, so let the caller see the failure.
        raise
    return html_data, session
def download(url, file_name, session=None):
    '''
    Stream a file from url into file_name while printing a progress line.

    url: direct link of the file to download.
    file_name: path of the local file to write (binary mode, overwritten).
    session: requests session to reuse; a new one is created when omitted.
    Returns the session so it can be reused for further requests.

    A keyboard interrupt or any error during the transfer is reported on
    standard output instead of being raised; the completion message is only
    printed when the whole body was written.
    '''
    # if no session provided
    if session is None:
        session = requests.Session()
    # The context manager releases the connection once streaming is over.
    with session.get(url, stream=True) as video:
        finish_size = 0
        # The header may be missing (e.g. chunked responses); 0 means unknown.
        length = float(video.headers.get('content-length', 0))
        if length > 0:
            print(time_stamp(), 'Start to download file, size: {size}'.format(size = show(length)))
        else:
            print(time_stamp(), 'Start to download file, size: unknown')
        # record start time; perf_counter never runs backwards
        last_tick = time.perf_counter()
        # save content
        with open(file_name, 'wb') as fp:
            try:
                for data in video.iter_content(chunk_size=chunk_size):
                    if not data:
                        continue
                    fp.write(data)
                    finish_size += len(data)
                    # for process bar display
                    cur = time.perf_counter()
                    elapsed = cur - last_tick
                    last_tick = cur
                    # Two chunks can share a timestamp; report no rate then
                    # instead of dividing by zero.
                    rate = len(data)/elapsed if elapsed > 0 else 0
                    if length > 0:
                        # The body may be decoded (e.g. gzip) and so exceed
                        # content-length; keep the display within bounds.
                        remain = max(length - finish_size, 0)
                        percent = min(finish_size/length*100, 100.0)
                        finish_unit = min(int(finish_size*bar_len/length), bar_len)
                        print('\r'+'[%.2f%%]: |%s| Remain: %s %s/s ETA:%s -' % (percent, '>'*finish_unit+' '*(bar_len-finish_unit), show(remain), show(rate), eta(rate, remain)), end='')
                    else:
                        print('\r'+'Downloaded: %s %s/s -' % (show(finish_size), show(rate)), end='')
            # when user choose to stop download
            except KeyboardInterrupt:
                print()
                print(time_stamp(), 'HINTS: Program finish, provide download link for further process:')
                print(url)
            # when other exception
            except Exception as e:
                print()
                print(time_stamp(),'ERROR:', e)
            # finish download
            else:
                print()
                print(time_stamp(), 'Download completed and save as', file_name)
    # return session for further processing
    return session
def extract_video_link(html_data):
    '''
    Parse a video page and return its title and the list of video sources.

    html_data: HTML text of the video page.
    Returns (title, video_list). title is the text of the page header with
    newlines and dots removed and spaces turned into underscores. video_list
    holds one dict per source decoded from the page script; each gets an
    extra 'title' key set to the last path component of its 'file' entry.
    Raises ValueError when the page lacks the header, the script tag or the
    source markers, or when a source entry is not valid JSON.
    '''
    bs = BeautifulSoup(html_data, 'html.parser')
    body = bs.body
    if body is None or body.header is None:
        raise ValueError('Video page has no header to read the title from')
    # extract video title
    title = body.header.get_text()
    title = title.replace('\n', '').replace(' ', '_').replace('.', '')
    # extract video source link
    scripts = body.find_all(type="text/javascript")
    if not scripts:
        raise ValueError('Video page has no text/javascript script tag')
    script_js = scripts[0].prettify()
    m = re.search("// html5 files(.+)// flash", script_js)
    if m is None:
        raise ValueError('Video source list not found in the page script')
    extract_source = m.group(1)
    # delete useless characters (escape sequences written out literally, spaces)
    script_link = extract_source.replace('\\t', '').replace('\\n', '').replace('\\r', '')
    script_link = script_link.replace(' ', '').replace('\'', '\"').replace('},{', '}+{')
    # NOTE(review): splitting on '+' would also cut an entry whose own text
    # contains a '+' character - confirm that source links never do.
    source_list = script_link.split('+')
    # transfer to dict format for storage or further usage
    video_list = []
    for source in source_list:
        source_obj = json.loads(source)
        source_obj['title'] = source_obj['file'].split('/')[-1]
        video_list.append(source_obj)
    # return the title of video and video link list
    return title, video_list
def main():
    '''
    Command-line entry point.

    Reads the video page link from the first command-line argument, lists the
    video sources found on that page, then either prints their links (when
    the user enters -1) or downloads the chosen one as '<title>.mp4'.
    Exits with status 1 when the entered order number is not valid.
    '''
    # get link from user
    args = sys.argv
    if len(args) < 2:
        print('ERROR: (LINK MISSING) Usage:', args[0], 'https://thebox.unsw.edu.au/video/xxxxxxx')
        print('Detail please follow GitHub page instruction')
        return
    # check user input link
    link = str(args[1])
    if not link.startswith('https://thebox.unsw.edu.au/video/'):
        print('ERROR: (LINK FORMAT ERROR) Source link should follow the format: https://thebox.unsw.edu.au/video/xxxxxxx')
        print('Detail please follow GitHub page instruction')
        return
    # scratch the html source code from link
    print(time_stamp(),'Getting the source page...', end=' ')
    html_data, session = get_source(link)
    print('Done!')
    html_data.encoding = ecoding
    title, video_list = extract_video_link(html_data.text)
    # for user to choose download different file
    print(time_stamp(), f'Detect {len(video_list)} video source as follow, enter order number to download or enter -1 to provide download link for download by yourself')
    print('OrderNum', 'FileName')
    for order_num, video_obj in enumerate(video_list, start=1):
        print(order_num, video_obj['title'])
    try:
        chosen = int(input('Enter order num: '))
    except ValueError:
        # Non-numeric input is handled like any other invalid choice.
        chosen = None
    # Valid answers are -1 or an order number that was actually listed;
    # 0 and other negatives would otherwise index from the end of the list.
    if chosen != -1 and (chosen is None or not 1 <= chosen <= len(video_list)):
        print('ERROR: Wrong input, program exit')
        sys.exit(1)
    # if user want to download by itself
    if chosen == -1:
        for video in video_list:
            print(video['title'], ':\t', video['file'])
    # else download by program
    else:
        chosen_obj = video_list[chosen-1]
        # start to download
        download(chosen_obj['file'], title+'.mp4', session=session)
# Run the downloader only when this file is executed as a script.
if __name__ == "__main__":
    main()