From 5b0a6a801084cced4b71c255270f53c881203ca8 Mon Sep 17 00:00:00 2001
From: insaneracist
Date: Thu, 29 Oct 2020 16:11:14 -0700
Subject: [PATCH] [youtube] fix: extract mix playlist ids from ytInitialData (#33)

---
 youtube_dlc/extractor/youtube.py | 35 ++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py
index 5fd22081a..0354866ef 100644
--- a/youtube_dlc/extractor/youtube.py
+++ b/youtube_dlc/extractor/youtube.py
@@ -279,6 +279,15 @@ def _download_webpage_handle(self, *args, **kwargs):
         return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
             *args, **compat_kwargs(kwargs))
 
+    def _get_yt_initial_data(self, video_id, webpage):
+        config = self._search_regex(
+            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
+             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
+            webpage, 'ytInitialData', default=None)
+        if config:
+            return self._parse_json(
+                uppercase_escape(config), video_id, fatal=False)
+
     def _real_initialize(self):
         if self._downloader is None:
             return
@@ -1397,15 +1406,6 @@ def _get_ytplayer_config(self, video_id, webpage):
             return self._parse_json(
                 uppercase_escape(config), video_id, fatal=False)
 
-    def _get_yt_initial_data(self, video_id, webpage):
-        config = self._search_regex(
-            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
-             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
-            webpage, 'ytInitialData', default=None)
-        if config:
-            return self._parse_json(
-                uppercase_escape(config), video_id, fatal=False)
-
     def _get_automatic_captions(self, video_id, webpage):
         """We need the webpage for getting the captions url, pass it as an
            argument to speed up the process."""
@@ -2765,6 +2765,16 @@ def extract_videos_from_page(self, page):
 
         return zip(ids_in_page, titles_in_page)
 
+    def _extract_mix_ids_from_yt_initial(self, yt_initial):
+        ids = []
+        playlist_contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents'])
+        if type(playlist_contents) is list:
+            for item in playlist_contents:
+                videoId = try_get(item, lambda x: x['playlistPanelVideoRenderer']['videoId'])
+                if type(videoId) is str:
+                    ids.append(videoId)
+        return ids
+
     def _extract_mix(self, playlist_id):
         # The mixes are generated from a single video
         # the id of the playlist is just 'RD' + video_id
@@ -2778,6 +2788,13 @@ def _extract_mix(self, playlist_id):
                 r'''(?xs)data-video-username=".*?".*?
                            href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id),
                 webpage))
+
+            # if no ids in html of page, try using embedded json
+            if (len(new_ids) == 0):
+                yt_initial = self._get_yt_initial_data(playlist_id, webpage)
+                if yt_initial:
+                    new_ids = self._extract_mix_ids_from_yt_initial(yt_initial)
+
             # Fetch new pages until all the videos are repeated, it seems that
             # there are always 51 unique videos.
             new_ids = [_id for _id in new_ids if _id not in ids]