|
1 | 1 | from requests_html import HTMLSession
|
2 | 2 | from bs4 import BeautifulSoup as bs
|
| 3 | +import re |
| 4 | +import json |
3 | 5 |
|
4 | 6 | # init session
|
5 | 7 | session = HTMLSession()
|
@@ -27,22 +29,30 @@ def get_video_info(url):
|
27 | 29 | result["duration"] = soup.find("span", {"class": "ytp-time-duration"}).text
|
28 | 30 | # get the video tags
|
29 | 31 | result["tags"] = ', '.join([ meta.attrs.get("content") for meta in soup.find_all("meta", {"property": "og:video:tag"}) ])
|
30 |
| - # number of likes |
31 |
| - text_yt_formatted_strings = soup.find_all("yt-formatted-string", {"id": "text", "class": "ytd-toggle-button-renderer"}) |
32 |
| - result["likes"] = ''.join([ c for c in text_yt_formatted_strings[0].attrs.get("aria-label") if c.isdigit() ]) |
33 |
| - result["likes"] = 0 if result['likes'] == '' else int(result['likes']) |
34 |
| - # number of dislikes |
35 |
| - result["dislikes"] = ''.join([ c for c in text_yt_formatted_strings[1].attrs.get("aria-label") if c.isdigit() ]) |
36 |
| - result['dislikes'] = 0 if result['dislikes'] == '' else int(result['dislikes']) |
37 | 32 |
|
| 33 | + # Additional video and channel information (with help from: https://stackoverflow.com/a/68262735) |
| 34 | + data = re.search(r"var ytInitialData = ({.*?});", soup.prettify()).group(1) |
| 35 | + data_json = json.loads(data) |
| 36 | + videoPrimaryInfoRenderer = data_json['contents']['twoColumnWatchNextResults']['results']['results']['contents'][0]['videoPrimaryInfoRenderer'] |
| 37 | + videoSecondaryInfoRenderer = data_json['contents']['twoColumnWatchNextResults']['results']['results']['contents'][1]['videoSecondaryInfoRenderer'] |
| 38 | + # number of likes |
| 39 | + likes_label = videoPrimaryInfoRenderer['videoActions']['menuRenderer']['topLevelButtons'][0]['toggleButtonRenderer']['defaultText']['accessibility']['accessibilityData']['label'] # "No likes" or "###,### likes" |
| 40 | + likes_str = likes_label.split(' ')[0].replace(',','') |
| 41 | + result["likes"] = '0' if likes_str == 'No' else likes_str |
| 42 | + # number of dislikes - apparently no longer published by YouTube, so a placeholder value is stored instead |
| 43 | + # result["dislikes"] = ''.join([ c for c in text_yt_formatted_strings[1].attrs.get("aria-label") if c.isdigit() ]) |
| 44 | + # result["dislikes"] = '0' if result['dislikes'] == '' else result['dislikes'] |
| 45 | + result['dislikes'] = 'UNKNOWN' |
| 46 | + |
38 | 47 | # channel details
|
39 |
| - channel_tag = soup.find("yt-formatted-string", {"class": "ytd-channel-name"}).find("a") |
| 48 | + channel_tag = soup.find("meta", itemprop="channelId")['content'] |
40 | 49 | # channel name
|
41 |
| - channel_name = channel_tag.text |
| 50 | + channel_name = soup.find("span", itemprop="author").next.next['content'] |
42 | 51 | # channel URL
|
43 |
| - channel_url = f"https://www.youtube.com{channel_tag['href']}" |
| 52 | + # channel_url = soup.find("span", itemprop="author").next['href'] |
| 53 | + channel_url = f"https://www.youtube.com{channel_tag}" |
44 | 54 | # number of subscribers as str
|
45 |
| - channel_subscribers = soup.find("yt-formatted-string", {"id": "owner-sub-count"}).text.strip() |
| 55 | + channel_subscribers = videoSecondaryInfoRenderer['owner']['videoOwnerRenderer']['subscriberCountText']['accessibility']['accessibilityData']['label'] |
46 | 56 | result['channel'] = {'name': channel_name, 'url': channel_url, 'subscribers': channel_subscribers}
|
47 | 57 | return result
|
48 | 58 |
|
|
0 commit comments