# reqsitesource_Cristian.py

import urllib.request
import re
import os
import requests
from bs4 import BeautifulSoup
import glob
import shutil

#-----------------------------------------------------------------------------------------------------------#
# Web-Scraper Program for TED Talks
#
# TO-DO:
# (a) improve video embedding by using .html versions from SingleFile
#       UPDATE: integration is initiated and works well (SingleFile is run while on a given TED Talk,
#       and then the Python program uses the generated .html file to embed the downloaded video). Next,
#       the embedding aesthetics must be improved (I can't replace the downloaded video with the TED Talk
#       placeholder yet ...)
# (b) scale the program so that offline copies of multiple TED Talks can be generated simultaneously
#       UPDATE: It can create a local copy (+embedded video) for 10 arbitrarily selected TED Talks at once.
#       Next, we must determine how the user will input the URLs of the TED Talks they want to save
#       (text file, navigation bar, etc.)
# BUGS FIXED:
# (a) fixed a bug for TED Talks older than 2020.
#-----------------------------------------------------------------------------------------------------------#

# function to write at a specified line in a given file
def replace_line(file_name, line_num, text):
    """Overwrite a single line of a UTF-8 text file in place.

    Args:
        file_name: Path of the file to modify.
        line_num: Zero-based index of the line to replace. Negative values
            count from the end of the file, as with list indexing.
        text: Replacement content, written verbatim. No newline is appended,
            so a ``text`` without a trailing newline ends up on the same
            line as whatever followed the replaced line.

    Raises:
        IndexError: If ``line_num`` is outside the file. This happens before
            the file is reopened for writing, so the file is left untouched.
    """
    # Read everything first; the ``with`` blocks guarantee that both the
    # read and the write handle are closed, even when an error occurs.
    with open(file_name, 'r', encoding="utf-8") as src:
        lines = src.readlines()
    lines[line_num] = text
    with open(file_name, 'w', encoding="utf-8") as out:
        out.writelines(lines)

# function to download any video from the Internet in a .mp4 format
def download_file(url, timeout=30):
    """Stream the resource at ``url`` into the current working directory.

    The local file name is the last ``/``-separated segment of ``url``.

    Args:
        url: Address of the file to download (e.g. a direct ``.mp4`` link).
        timeout: Seconds to wait for the server to connect or to send more
            data before giving up.

    Returns:
        The name of the file that was written (relative to the current
        working directory).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    local_filename = url.split('/')[-1]
    # stream=True keeps the whole body out of memory; the ``with`` block
    # releases the connection once the download finishes or fails.
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Fail before touching the disk so an error page is never saved
        # under the video's file name.
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    return local_filename

# helper: slice the low/medium/high quality .mp4 links out of the page text
def _extract_mp4_links(block):
    """Return ``[low, medium, high]`` video links sliced out of ``block``.

    ``block`` is the stringified list of page text nodes that mention
    ".mp4". The slicing relies on the markers ``?api``, ``"medium":"``,
    ``.mp4?a`` and ``":"`` appearing in that order.
    NOTE(review): presumably the page embeds the links as
    ``"low":"...mp4?api...","medium":"...","high":"..."`` -- confirm against
    a live page before changing any offset below.
    """
    mp4_links = []

    # The first link starts right after the last double quote that precedes
    # the first ".mp4" occurrence.
    first_mp4_end = block.find(".mp4") + 4
    url_start = block[:first_mp4_end].rfind('"') + 1
    videos_block = block[url_start:]

    # Keep everything up to (and including) the ".mp4" of the last ".mp4?a".
    last_mp4_end = videos_block.rfind(".mp4?a") + 4
    all_versions = videos_block[:last_mp4_end]

    # Low quality: everything before the first "?api" query string.
    low_end = videos_block.find('?api')
    mp4_links.append(videos_block[:low_end])

    # Medium quality: between '"medium":"' and the next ".mp4".
    medium_high = all_versions[low_end:]
    medium_start = medium_high.find('"medium":"') + 10
    medium_end = medium_high.find('.mp4?a') + 4
    mp4_links.append(medium_high[medium_start:medium_end])

    # High quality: whatever follows the next '":"' separator.
    high = medium_high[medium_end:]
    high = high[high.find('":"') + 3:]
    mp4_links.append(high)

    # Talks older than 2020: when the high-quality link is not simply the
    # medium link with a "-480p.mp4" suffix, it carries trailing text, so
    # cut it right after its first ".mp4".
    medium = mp4_links[1]
    expected_high = medium[:medium.find(".mp4")] + "-480p.mp4"
    if mp4_links[2] != expected_high:
        mp4_links[2] = mp4_links[2][:mp4_links[2].find(".mp4") + 4]

    return mp4_links


# function to create a local copy of a TED Talk with the embedded low/medium/high quality video link
def create_local_copy(url, downloads_dir='C:/Users/mrciu/Downloads'):
    """Build an offline copy of one TED Talk page with its videos embedded.

    Steps:
      (A) fetch ``url`` and copy the most recently created entry of
          ``downloads_dir`` (expected to be the page saved with the
          SingleFile browser extension) next to this script;
      (B) extract the low/medium/high quality ``.mp4`` links from the
          fetched page;
      (C) download the high and medium quality videos and write an
          ``<iframe>`` for each into the copied HTML file.

    Args:
        url: Address of the TED Talk page.
        downloads_dir: Folder searched for the SingleFile export.

    Raises:
        FileNotFoundError: If ``downloads_dir`` contains no entries.
        ValueError: If the fetched page contains no text mentioning ".mp4".
    """
    # (A) Pull URL and obtain .html file generated by the SingleFile extension
    with urllib.request.urlopen(url) as response:
        webcontent = response.read()

    recent_download = max(
        glob.iglob(os.path.join(downloads_dir, '*')),
        key=os.path.getctime,
        default=None,
    )
    if recent_download is None:
        raise FileNotFoundError("no downloaded file found in " + downloads_dir)

    # Resolve the project folder with os.path: splitting __file__ on "/"
    # fails for backslash or relative paths, and the copy below would then
    # target (and overwrite) this script instead of its folder.
    project_dir = os.path.dirname(os.path.abspath(__file__))
    shutil.copy(os.path.abspath(recent_download), project_dir)
    # Address the copy by its full path so later edits hit the right file
    # regardless of the current working directory.
    local_html = os.path.join(project_dir, os.path.basename(recent_download))

    # (B) Collect the text nodes mentioning ".mp4" and slice out the links
    soup = BeautifulSoup(webcontent, "lxml")
    mp4_nodes = soup.findAll(text=re.compile(r'\.mp4'))  # literal dot
    if not mp4_nodes:
        raise ValueError("no .mp4 links found on " + url)
    mp4Links = _extract_mp4_links(str(mp4_nodes))
    print(mp4Links)  # printed for manual validation

    # (C) Download the videos and embed them into the local HTML copy.
    # download_file saves into the current working directory, while the
    # iframes point at the project folder, so the script is expected to be
    # run from the project folder.
    download_file(mp4Links[2])  # high quality
    download_file(mp4Links[1])  # medium quality

    # Zero-based line indices of the HTML file that receive the iframes.
    # NOTE(review): presumably the player area of a SingleFile export --
    # confirm before changing.
    for line_index, link in ((84, mp4Links[2]), (85, mp4Links[1])):
        video_name = link[link.rfind("/") + 1:]  # e.g. 'video_name.mp4'
        # Forward slashes keep the src usable as a path inside HTML.
        video_path = os.path.join(project_dir, video_name).replace(os.sep, "/")
        iframe = ("<iframe width='560' height='315' src='" + video_path
                  + "' type='video/mp4' frameborder='0' allowfullscreen></iframe>")
        replace_line(local_html, line_index, iframe)

#-----------------------------------------------------------------------------------------------#

# First Test Run
# (put a few TED Talks URLs in a list and iterate through it to create local copies for each)


# URLs of the talks to process; currently a single talk.
TEDTalks = ["https://www.ted.com/talks/kate_lister_an_honest_history_of_an_ancient_and_nasty_word"]


# NOTE: the triple-quoted text below is a bare string expression, not part of
# TEDTalks. It is evaluated and discarded, so the URLs inside it are not processed.
""""https://www.ted.com/talks/archie_crowley_language_around_gender_and_identity_evolves_and_always_has",
"https://www.ted.com/talks/kayla_wolf_ugly_history_the_spanish_inquisition",
"https://www.ted.com/talks/theresa_a_yugar_history_s_worst_nun#t-1754",
"https://www.ted.com/talks/michael_levin_the_electrical_blueprints_that_orchestrate_life",
"https://www.ted.com/talks/dennis_e_shasha_can_you_solve_the_fantasy_election_riddle",
"https://www.ted.com/talks/jacques_s_abramowicz_how_does_ultrasound_work",
"https://www.ted.com/talks/kate_lister_an_honest_history_of_an_ancient_and_nasty_word",
"https://www.ted.com/talks/william_collis_how_video_game_skills_can_get_you_ahead_in_life",
"https://www.ted.com/talks/ted_ed_the_world_s_biggest_battery_looks_nothing_like_a_battery",
"https://www.ted.com/talks/matt_langione_the_promise_of_quantum_computers",]"""

# Runs at import time (there is no __main__ guard): one local copy is
# created for each URL in TEDTalks.
for talk in  TEDTalks:
    create_local_copy(talk)