This repository was archived by the owner on Mar 16, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathreqsitesource_Cristian.py
151 lines (131 loc) · 7.45 KB
/
reqsitesource_Cristian.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import urllib.request
import re
import os
import requests
from bs4 import BeautifulSoup
import glob
import shutil
#-----------------------------------------------------------------------------------------------------------#
# Web-Scraper Program for TED Talks
#
# TO-DO:
# (a) improve video embedding by using .html versions from SingleFile
# UPDATE: integration is initiated and works well (SingleFile is run while on a given TED Talk,
# and then the Python program uses the generated .html file to embed the downloaded video). Next,
# the embedding aesthetics must be improved (I can't replace the downloaded video with the TED Talk
# placeholder yet ...)
# (b) scale the program so that offline copies of multiple TED Talks can be generated simultaneously
# UPDATE: It can create a local copy (+ embedded video) for 10 arbitrarily selected TED Talks at once.
# Next, we must determine how the user will supply the URLs of the TED Talks they want to save
# (text file, navigation bar, etc.)
# BUGS FIXED:
# (a) fixed a bug for TED Talks older than 2020.
#-----------------------------------------------------------------------------------------------------------#
# function to write at a specified line in a given file
def replace_line(file_name, line_num, text):
    """Overwrite a single line of a UTF-8 text file in place.

    The whole file is read, the entry at ``line_num`` is swapped for ``text``
    and the file is written back.

    Args:
        file_name: Path of the file to rewrite.
        line_num: Zero-based index of the line to replace (ordinary list
            indexing, so negative values count from the end).
        text: Replacement, written verbatim. No line terminator is appended,
            so a value without a trailing newline is joined to the line
            that follows it.

    Raises:
        IndexError: If ``line_num`` is outside the file's line range. The
            file is not modified in that case because it is only reopened
            for writing after the replacement succeeded in memory.
    """
    # Read everything first and close the handle before reopening for write.
    with open(file_name, 'r', encoding="utf-8") as src:
        lines = src.readlines()
    lines[line_num] = text
    with open(file_name, 'w', encoding="utf-8") as out:
        out.writelines(lines)
# function to download any video from the Internet in a .mp4 format
def download_file(url):
    """Stream the resource at ``url`` into a file in the current directory.

    The local file name is the part of ``url`` after its last "/".

    Args:
        url: Address of the resource to download.

    Returns:
        The name of the local file that was written.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Nothing is written in that case, so an error page is never
            saved under the video's name.
    """
    local_filename = url.split('/')[-1]
    # stream=True keeps the body out of memory; the with-block releases the
    # connection once the transfer is finished or has failed.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    return local_filename
# function to create a local copy of a TED Talk with the embedded low/medium/high quality video link
def create_local_copy(url):
    """Create a local copy of a TED Talk page with the downloaded video embedded.

    Steps:
      (A) fetch ``url`` and copy the most recently created entry of the
          Downloads folder (expected to be the SingleFile .html export of the
          same talk) next to this script;
      (B) extract the low/medium/high quality .mp4 links from the fetched page;
      (C) download the high and medium quality videos;
      (D) write one <iframe> tag per downloaded video into the copied page.

    The page copy is addressed by its bare file name and the videos are saved
    by ``download_file`` into the current working directory, so the script is
    expected to be run from its own folder.

    Args:
        url: Address of the TED Talk page.

    Raises:
        ValueError: If the fetched page has no text mentioning ".mp4" (also
            raised by ``max`` when the Downloads folder is empty).
    """
    # (A) Pull URL and obtain .html file generated by the SingleFile Chrome extension
    with urllib.request.urlopen(url) as response:
        webcontent = response.read()

    # find the SingleFile .html file in the Downloads folder
    recent_download = max(glob.iglob('C:/Users/mrciu/Downloads/*'),
                          key=os.path.getctime)

    # copy it to the Web-Scraper folder; the folder is taken from the real
    # location of this script so the copy can never land on the script itself
    project_dir = os.path.dirname(os.path.abspath(__file__))
    shutil.copy(os.path.abspath(recent_download), project_dir)

    # bare file name of the copied page (basename copes with either separator)
    videoname = os.path.basename(str(recent_download))

    # (B) Get the text nodes holding the ".mp4" links and slice the links out
    soup = BeautifulSoup(webcontent, "lxml")
    mp4_nodes = soup.findAll(text=re.compile(r'\.mp4'))
    if not mp4_nodes:
        raise ValueError("no .mp4 link found in page: " + str(url))
    mp4Links = _extract_mp4_links(str(mp4_nodes))  # str() makes the block parsable

    # printing out the links for validation
    print(mp4Links)

    # (C) download high- and medium-quality versions (saved in the working directory)
    download_file(mp4Links[2])
    download_file(mp4Links[1])

    # (D) build the video paths: everything up to the last "/" of this
    # script's path, followed by the video file name. The prefix is empty when
    # the script path contains no "/", which yields a relative src.
    script_path = __file__
    project_prefix = script_path[:script_path.rfind("/") + 1]
    high_name = mp4Links[2][mp4Links[2].rfind("/") + 1:]  # e.g. 'video_name.mp4'
    medium_name = mp4Links[1][mp4Links[1].rfind("/") + 1:]

    # NOTE(review): the tags carry no trailing newline, so replace_line joins
    # each tag to the line that follows it in the page; the second call
    # therefore lands on what was originally line index 86. Confirm intended.
    replace_line(videoname, 84, _iframe_tag(project_prefix + high_name))
    replace_line(videoname, 85, _iframe_tag(project_prefix + medium_name))


def _iframe_tag(src):
    """Return the <iframe> markup that embeds the video located at ``src``."""
    return ("<iframe width='560' height='315' src='" + src
            + "' type='video/mp4' frameborder='0' allowfullscreen></iframe>")


def _extract_mp4_links(block_text):
    """Slice the [low, medium, high] quality .mp4 links out of ``block_text``.

    ``block_text`` is the stringified list of page text nodes that mention
    ".mp4". The links are located purely by string offsets.
    """
    # The first link starts right after the last double quote that precedes
    # the first ".mp4" (low quality).
    first_mp4_end = block_text.find(".mp4") + 4
    first_link_start = block_text[:first_mp4_end].rfind('"') + 1
    videos_block = block_text[first_link_start:]

    # Cut the block after the last ".mp4" that is followed by "?a".
    last_mp4_end = videos_block.rfind(".mp4?a") + 4
    all_versions = videos_block[:last_mp4_end]

    # Low quality: everything before the first "?api".
    low_end = videos_block.find('?api')
    low = videos_block[:low_end]

    # Medium quality: between '"medium":"' and the next ".mp4".
    medium_high = all_versions[low_end:]
    medium_start = medium_high.find('"medium":"') + 10
    medium_end = medium_high.find('.mp4?a') + 4
    medium = medium_high[medium_start:medium_end]

    # High quality: what follows the next '":"' after the medium link.
    high = medium_high[medium_end:]
    high = high[high.find('":"') + 3:]

    # Extra step for talks older than 2020: when the high-quality link is not
    # the medium link with a "-480p" suffix, cut it right after its ".mp4".
    expected_high = medium[:medium.find(".mp4")] + "-480p.mp4"
    if high != expected_high:
        high = high[:high.find(".mp4") + 4]

    return [low, medium, high]
#-----------------------------------------------------------------------------------------------#
# First Test Run
# (put a few TED Talks URLs in a list and iterate through it to create local copies for each)
# TED Talks processed on this run.
TEDTalks = [
    "https://www.ted.com/talks/kate_lister_an_honest_history_of_an_ancient_and_nasty_word",
]
# Further talks, currently disabled. They used to sit in a discarded string
# literal; they are kept here as comments so they can be moved back into the
# list above one by one:
#   "https://www.ted.com/talks/archie_crowley_language_around_gender_and_identity_evolves_and_always_has",
#   "https://www.ted.com/talks/kayla_wolf_ugly_history_the_spanish_inquisition",
#   "https://www.ted.com/talks/theresa_a_yugar_history_s_worst_nun#t-1754",
#   "https://www.ted.com/talks/michael_levin_the_electrical_blueprints_that_orchestrate_life",
#   "https://www.ted.com/talks/dennis_e_shasha_can_you_solve_the_fantasy_election_riddle",
#   "https://www.ted.com/talks/jacques_s_abramowicz_how_does_ultrasound_work",
#   "https://www.ted.com/talks/kate_lister_an_honest_history_of_an_ancient_and_nasty_word",
#   "https://www.ted.com/talks/william_collis_how_video_game_skills_can_get_you_ahead_in_life",
#   "https://www.ted.com/talks/ted_ed_the_world_s_biggest_battery_looks_nothing_like_a_battery",
#   "https://www.ted.com/talks/matt_langione_the_promise_of_quantum_computers",

# NOTE(review): this loop runs whenever the module is loaded, including on
# import; consider moving it behind an `if __name__ == "__main__":` guard.
for talk in TEDTalks:
    create_local_copy(talk)