forked from makiisthenes/TiktokAutoUploader
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathyoutube_downloader.py
60 lines (51 loc) · 2.48 KB
/
youtube_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
import uuid

from bs4 import BeautifulSoup
from pytube import YouTube
from requests_html import HTMLSession, AsyncHTMLSession
YOUTUBE_BASE_URL = "https://www.youtube.com"
VIDEO_BANK_URL = "https://www.youtube.com/source/NPdgPZ0u3zQ/shorts?bp=8gUeChwSGgoLTlBkZ1BaMHUzelESC05QZGdQWjB1M3pR"  # OVER 1 MILLION YT SHORTS.
HREF_LIST_PATH = "output/href_list.txt"

# Collect the anchor URL of every Short listed on VIDEO_BANK_URL and write the
# absolute URLs, one per line, to HREF_LIST_PATH. The stated goal is to later
# download the first 10k videos and store each caption in a separate txt file.
# NOTE(review): only the anchors present after a single render are collected;
# nothing here scrolls the page, so far fewer than 10k links are likely -- confirm.

sess = HTMLSession()
# Not used by any live code in this script; kept so the module still exposes it.
asess = AsyncHTMLSession()
sess.headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Fetch the listing page, then render it so that markup produced by the page's
# JavaScript is present in r.html.html before it is parsed.
r = sess.get(VIDEO_BANK_URL)
r.html.render(retries=5, sleep=2)
soup = BeautifulSoup(r.html.html, 'html.parser')

# a.ShortsLockupViewModelHostEndpoint -> href holds the relative link of a Short.
# h3.ShortsLockupViewModelHostMetadataTitle -> aria-label holds its title.
# href=True skips anchors without an href, which would otherwise raise KeyError.
endpoint_tags = soup.find_all('a', class_='ShortsLockupViewModelHostEndpoint', href=True)
href_list = [tag['href'] for tag in endpoint_tags]  # relative paths, e.g. "/shorts/<id>"

# Make sure the target directory exists; open() does not create it.
os.makedirs(os.path.dirname(HREF_LIST_PATH), exist_ok=True)
with open(HREF_LIST_PATH, "w", encoding="utf-8") as f:
    # Prefix EVERY href with the base URL. Using the base URL as the join
    # separator left the first entry as a bare relative path.
    f.write("\n".join(f"{YOUTUBE_BASE_URL}{href}" for href in href_list))
#
# # Extract aria-label from h3.ShortsLockupViewModelHostMetadataTitle to get titles
# title_tags = soup.find_all('h3', class_='ShortsLockupViewModelHostMetadataTitle')
# title_list = [tag.get('aria-label', '') for tag in title_tags]
#
# # print(href_list)
# # print(title_list)
#
# # Download each video using pytube and save the titles in a comments file
# for i, (href, title) in enumerate(zip(href_list[:10000], title_list[:10000]), start=1): # Assuming you want to download the first 10k videos
# try:
# yt = YouTube("https://www.youtube.com" + href)
# stream = yt.streams.filter(adaptive=True, file_extension="mp4").first()
#
# # Generate a random UUID
# random_uuid = str(uuid.uuid4())
#
# # Download video
# video_file_path = f"output/{random_uuid}.mp4"
# print(f"Downloading video {i}: {video_file_path}")
# stream.download(output_path="output", filename=f"{random_uuid}.mp4")
# print(f"Downloaded: {video_file_path}")
#
# # Save title in a comments file
# comments_file_path = f"output/{random_uuid}_title.txt"
# with open(comments_file_path, 'w', encoding='utf-8') as comments_file:
# comments_file.write(title)
# except Exception as e:
# print(f"Error downloading video {i}: {str(e)}")