Skip to content

Commit aa48ebe

Browse files
committed
Update the YouTube video data extractor
1 parent 4c199cd commit aa48ebe

File tree

1 file changed

+14
-3
lines changed

1 file changed

+14
-3
lines changed

web-scraping/youtube-extractor/extract_video_info.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -39,20 +39,31 @@ def get_video_info(url):
3939
likes_label = videoPrimaryInfoRenderer['videoActions']['menuRenderer']['topLevelButtons'][0]['toggleButtonRenderer']['defaultText']['accessibility']['accessibilityData']['label'] # "No likes" or "###,### likes"
4040
likes_str = likes_label.split(' ')[0].replace(',','')
4141
result["likes"] = '0' if likes_str == 'No' else likes_str
42-
# number of dislikes - YouTube does not publish this anymore...?
42+
# number of likes (old way) - does not always work
43+
# text_yt_formatted_strings = soup.find_all("yt-formatted-string", {"id": "text", "class": "ytd-toggle-button-renderer"})
44+
# result["likes"] = ''.join([ c for c in text_yt_formatted_strings[0].attrs.get("aria-label") if c.isdigit() ])
45+
# result["likes"] = 0 if result['likes'] == '' else int(result['likes'])
46+
# number of dislikes - YouTube does not publish this anymore...
4347
# result["dislikes"] = ''.join([ c for c in text_yt_formatted_strings[1].attrs.get("aria-label") if c.isdigit() ])
4448
# result["dislikes"] = '0' if result['dislikes'] == '' else result['dislikes']
4549
result['dislikes'] = 'UNKNOWN'
46-
4750
# channel details
4851
channel_tag = soup.find("meta", itemprop="channelId")['content']
4952
# channel name
5053
channel_name = soup.find("span", itemprop="author").next.next['content']
5154
# channel URL
5255
# channel_url = soup.find("span", itemprop="author").next['href']
53-
channel_url = f"https://www.youtube.com{channel_tag}"
56+
channel_url = f"https://www.youtube.com/{channel_tag}"
5457
# number of subscribers as str
5558
channel_subscribers = videoSecondaryInfoRenderer['owner']['videoOwnerRenderer']['subscriberCountText']['accessibility']['accessibilityData']['label']
59+
# channel details (old way)
60+
# channel_tag = soup.find("yt-formatted-string", {"class": "ytd-channel-name"}).find("a")
61+
# # channel name (old way)
62+
# channel_name = channel_tag.text
63+
# # channel URL (old way)
64+
# channel_url = f"https://www.youtube.com{channel_tag['href']}"
65+
# number of subscribers as str (old way)
66+
# channel_subscribers = soup.find("yt-formatted-string", {"id": "owner-sub-count"}).text.strip()
5667
result['channel'] = {'name': channel_name, 'url': channel_url, 'subscribers': channel_subscribers}
5768
return result
5869

0 commit comments

Comments
 (0)