Skip to content

Commit 02652ec

Browse files
committed
Some slightly messy quickfixes to get_video_info()
1 parent bdd5df2 commit 02652ec

File tree

1 file changed

+21
-11
lines changed

1 file changed

+21
-11
lines changed

Diff for: web-scraping/youtube-extractor/extract_video_info.py

+21-11
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,7 @@
11
from requests_html import HTMLSession
22
from bs4 import BeautifulSoup as bs
3+
import re
4+
import json
35

46
# init session
57
session = HTMLSession()
@@ -27,22 +29,30 @@ def get_video_info(url):
2729
result["duration"] = soup.find("span", {"class": "ytp-time-duration"}).text
2830
# get the video tags
2931
result["tags"] = ', '.join([ meta.attrs.get("content") for meta in soup.find_all("meta", {"property": "og:video:tag"}) ])
30-
# number of likes
31-
text_yt_formatted_strings = soup.find_all("yt-formatted-string", {"id": "text", "class": "ytd-toggle-button-renderer"})
32-
result["likes"] = ''.join([ c for c in text_yt_formatted_strings[0].attrs.get("aria-label") if c.isdigit() ])
33-
result["likes"] = 0 if result['likes'] == '' else int(result['likes'])
34-
# number of dislikes
35-
result["dislikes"] = ''.join([ c for c in text_yt_formatted_strings[1].attrs.get("aria-label") if c.isdigit() ])
36-
result['dislikes'] = 0 if result['dislikes'] == '' else int(result['dislikes'])
3732

33+
# Additional video and channel information (with help from: https://stackoverflow.com/a/68262735)
34+
data = re.search(r"var ytInitialData = ({.*?});", soup.prettify()).group(1)
35+
data_json = json.loads(data)
36+
videoPrimaryInfoRenderer = data_json['contents']['twoColumnWatchNextResults']['results']['results']['contents'][0]['videoPrimaryInfoRenderer']
37+
videoSecondaryInfoRenderer = data_json['contents']['twoColumnWatchNextResults']['results']['results']['contents'][1]['videoSecondaryInfoRenderer']
38+
# number of likes
39+
likes_label = videoPrimaryInfoRenderer['videoActions']['menuRenderer']['topLevelButtons'][0]['toggleButtonRenderer']['defaultText']['accessibility']['accessibilityData']['label'] # "No likes" or "###,### likes"
40+
likes_str = likes_label.split(' ')[0].replace(',','')
41+
result["likes"] = '0' if likes_str == 'No' else likes_str
42+
# number of dislikes - YouTube no longer seems to publish this count, so it cannot be scraped here
43+
# result["dislikes"] = ''.join([ c for c in text_yt_formatted_strings[1].attrs.get("aria-label") if c.isdigit() ])
44+
# result["dislikes"] = '0' if result['dislikes'] == '' else result['dislikes']
45+
result['dislikes'] = 'UNKNOWN'
46+
3847
# channel details
39-
channel_tag = soup.find("yt-formatted-string", {"class": "ytd-channel-name"}).find("a")
48+
channel_tag = soup.find("meta", itemprop="channelId")['content']
4049
# channel name
41-
channel_name = channel_tag.text
50+
channel_name = soup.find("span", itemprop="author").next.next['content']
4251
# channel URL
43-
channel_url = f"https://www.youtube.com{channel_tag['href']}"
52+
# channel_url = soup.find("span", itemprop="author").next['href']
53+
channel_url = f"https://www.youtube.com{channel_tag}"
4454
# number of subscribers as str
45-
channel_subscribers = soup.find("yt-formatted-string", {"id": "owner-sub-count"}).text.strip()
55+
channel_subscribers = videoSecondaryInfoRenderer['owner']['videoOwnerRenderer']['subscriberCountText']['accessibility']['accessibilityData']['label']
4656
result['channel'] = {'name': channel_name, 'url': channel_url, 'subscribers': channel_subscribers}
4757
return result
4858

0 commit comments

Comments
 (0)