Skip to content

Commit 10e16a5

Browse files
committed
edited extract youtube data tutorial
1 parent e8c4869 commit 10e16a5

File tree

2 files changed

+30
-16
lines changed

2 files changed

+30
-16
lines changed
Original file line number | Diff line number | Diff line change
@@ -1,34 +1,46 @@
1-
import requests
1+
from requests_html import HTMLSession
22
from bs4 import BeautifulSoup as bs
33

4+
# init session
5+
session = HTMLSession()
6+
47

58
def get_video_info(url):
69
# download HTML code
7-
content = requests.get(url)
10+
response = session.get(url)
11+
# execute Javascript
12+
response.html.render(sleep=1)
813
# create beautiful soup object to parse HTML
9-
soup = bs(content.content, "html.parser")
14+
soup = bs(response.html.html, "html.parser")
15+
# open("index.html", "w").write(response.html.html)
1016
# initialize the result
1117
result = {}
1218
# video title
13-
result['title'] = soup.find("span", attrs={"class": "watch-title"}).text.strip()
19+
result["title"] = soup.find("h1").text.strip()
1420
# video views (converted to integer)
15-
result['views'] = int(soup.find("div", attrs={"class": "watch-view-count"}).text[:-6].replace(",", ""))
21+
result["views"] = int(''.join([ c for c in soup.find("span", attrs={"class": "view-count"}).text if c.isdigit() ]))
1622
# video description
17-
result['description'] = soup.find("p", attrs={"id": "eow-description"}).text
23+
result["description"] = soup.find("yt-formatted-string", {"class": "content"}).text
1824
# date published
19-
result['date_published'] = soup.find("strong", attrs={"class": "watch-time-text"}).text
20-
# number of likes as integer
21-
result['likes'] = int(soup.find("button", attrs={"title": "I like this"}).text.replace(",", ""))
22-
# number of dislikes as integer
23-
result['dislikes'] = int(soup.find("button", attrs={"title": "I dislike this"}).text.replace(",", ""))
25+
result["date_published"] = soup.find("div", {"id": "date"}).text[1:]
26+
# get the duration of the video
27+
result["duration"] = soup.find("span", {"class": "ytp-time-duration"}).text
28+
# get the video tags
29+
result["tags"] = ', '.join([ meta.attrs.get("content") for meta in soup.find_all("meta", {"property": "og:video:tag"}) ])
30+
# number of likes
31+
text_yt_formatted_strings = soup.find_all("yt-formatted-string", {"id": "text", "class": "ytd-toggle-button-renderer"})
32+
result["likes"] = int(''.join([ c for c in text_yt_formatted_strings[0].attrs.get("aria-label") if c.isdigit() ]))
33+
# number of dislikes
34+
result["dislikes"] = int(''.join([ c for c in text_yt_formatted_strings[1].attrs.get("aria-label") if c.isdigit() ]))
35+
2436
# channel details
25-
channel_tag = soup.find("div", attrs={"class": "yt-user-info"}).find("a")
37+
channel_tag = soup.find("yt-formatted-string", {"class": "ytd-channel-name"}).find("a")
2638
# channel name
2739
channel_name = channel_tag.text
2840
# channel URL
2941
channel_url = f"https://www.youtube.com{channel_tag['href']}"
3042
# number of subscribers as str
31-
channel_subscribers = soup.find("span", attrs={"class": "yt-subscriber-count"}).text.strip()
43+
channel_subscribers = soup.find("yt-formatted-string", {"id": "owner-sub-count"}).text.strip()
3244
result['channel'] = {'name': channel_name, 'url': channel_url, 'subscribers': channel_subscribers}
3345
return result
3446

@@ -46,10 +58,12 @@ def get_video_info(url):
4658
# print in nice format
4759
print(f"Title: {data['title']}")
4860
print(f"Views: {data['views']}")
49-
print(f"\nDescription: {data['description']}\n")
50-
print(data['date_published'])
61+
print(f"Published at: {data['date_published']}")
62+
print(f"Video Duration: {data['duration']}")
63+
print(f"Video tags: {data['tags']}")
5164
print(f"Likes: {data['likes']}")
5265
print(f"Dislikes: {data['dislikes']}")
66+
print(f"\nDescription: {data['description']}\n")
5367
print(f"\nChannel Name: {data['channel']['name']}")
5468
print(f"Channel URL: {data['channel']['url']}")
5569
print(f"Channel Subscribers: {data['channel']['subscribers']}")
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,2 @@
1-
requests
1+
requests_html
22
bs4

0 commit comments

Comments
 (0)