From 7a7b1376fbce0067cf37566bb47131bc0022638d Mon Sep 17 00:00:00 2001 From: makeworld <25111343+makew0rld@users.noreply.github.com> Date: Thu, 27 Apr 2023 22:42:25 -0400 Subject: [PATCH] [extractor/cbc] Fix live extractor, playlist `_VALID_URL` (#6625) Authored by: makew0rld --- yt_dlp/extractor/cbc.py | 120 +++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 37 deletions(-) diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index eadb3f8c0..e42f06246 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -8,14 +8,16 @@ compat_str, ) from ..utils import ( + ExtractorError, int_or_none, join_nonempty, js_to_json, orderedSet, + parse_iso8601, smuggle_url, strip_or_none, + traverse_obj, try_get, - ExtractorError, ) @@ -404,7 +406,7 @@ def _real_extract(self, url): class CBCGemPlaylistIE(InfoExtractor): IE_NAME = 'gem.cbc.ca:playlist' - _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P(?P[0-9a-z-]+)/s(?P[0-9]+))/?(?:[?#]|$)' + _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P(?P[0-9a-z-]+)/s(?P[0-9]+))/?(?:[?#]|$)' _TESTS = [{ # TV show playlist, all public videos 'url': 'https://gem.cbc.ca/media/schitts-creek/s06', @@ -414,6 +416,9 @@ class CBCGemPlaylistIE(InfoExtractor): 'title': 'Season 6', 'description': 'md5:6a92104a56cbeb5818cc47884d4326a2', }, + }, { + 'url': 'https://gem.cbc.ca/schitts-creek/s06', + 'only_matching': True, }] _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/shows/' @@ -473,49 +478,90 @@ def _real_extract(self, url): class CBCGemLiveIE(InfoExtractor): IE_NAME = 'gem.cbc.ca:live' - _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P\d+)' - _TEST = { - 'url': 'https://gem.cbc.ca/live/920604739687', - 'info_dict': { - 'title': 'Ottawa', - 'description': 'The live TV channel and local programming from Ottawa', - 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg', - 'is_live': True, - 'id': 'AyqZwxRqh8EH', - 'ext': 'mp4', - 'timestamp': 1492106160, - 'upload_date': '20170413', - 'uploader': 'CBCC-NEW', + _VALID_URL = r'https?://gem\.cbc\.ca/live(?:-event)?/(?P\d+)' + _TESTS = [ + { + 'url': 'https://gem.cbc.ca/live/920604739687', + 'info_dict': { + 'title': 'Ottawa', + 'description': 'The live TV channel and local programming from Ottawa', + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg', + 'is_live': True, + 'id': 'AyqZwxRqh8EH', + 'ext': 'mp4', + 'timestamp': 1492106160, + 'upload_date': '20170413', + 'uploader': 'CBCC-NEW', + }, + 'skip': 'Live might have ended', }, - 'skip': 'Live might have ended', - } - - # It's unclear where the chars at the end come from, but they appear to be - # constant. Might need updating in the future. - # There are two URLs, some livestreams are in one, and some - # in the other. The JSON schema is the same for both. - _API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT'] + { + 'url': 'https://gem.cbc.ca/live/44', + 'info_dict': { + 'id': '44', + 'ext': 'mp4', + 'is_live': True, + 'title': r're:^Ottawa [0-9\-: ]+', + 'description': 'The live TV channel and local programming from Ottawa', + 'live_status': 'is_live', + 'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*' + }, + 'params': {'skip_download': True}, + 'skip': 'Live might have ended', + }, + { + 'url': 'https://gem.cbc.ca/live-event/10835', + 'info_dict': { + 'id': '10835', + 'ext': 'mp4', + 'is_live': True, + 'title': r're:^The National \| Biden’s trip wraps up, Paltrow testifies, Bird flu [0-9\-: ]+', + 'description': 'March 24, 2023 | President Biden’s Ottawa visit ends with big pledges from both countries. Plus, Gwyneth Paltrow testifies in her ski collision trial.', + 'live_status': 'is_live', + 'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*', + 'timestamp': 1679706000, + 'upload_date': '20230325', + }, + 'params': {'skip_download': True}, + 'skip': 'Live might have ended', + } + ] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_info = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['data'] - for api_url in self._API_URLS: - video_info = next(( - stream for stream in self._download_json(api_url, video_id)['entries'] - if stream.get('guid') == video_id), None) - if video_info: - break - else: + # Two types of metadata JSON + if not video_info.get('formattedIdMedia'): + video_info = traverse_obj( + video_info, (('freeTv', ('streams', ...)), 'items', lambda _, v: v['key'] == video_id, {dict}), + get_all=False, default={}) + + video_stream_id = video_info.get('formattedIdMedia') + if not video_stream_id: raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True) + stream_data = self._download_json( + 'https://services.radio-canada.ca/media/validation/v2/', video_id, query={ + 'appCode': 'mpx', + 'connectionType': 'hd', + 'deviceType': 'ipad', + 'idMedia': video_stream_id, + 'multibitrate': 'true', + 'output': 'json', + 'tech': 'hls', + 'manifestType': 'desktop', + }) + return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': video_info['content'][0]['url'], 'id': video_id, - 'title': video_info.get('title'), - 'description': video_info.get('description'), - 'tags': try_get(video_info, lambda x: x['keywords'].split(', ')), - 'thumbnail': video_info.get('cbc$staticImage'), + 'formats': self._extract_m3u8_formats(stream_data['url'], video_id, 'mp4', live=True), 'is_live': True, + **traverse_obj(video_info, { + 'title': 'title', + 'description': 'description', + 'thumbnail': ('images', 'card', 'url'), + 'timestamp': ('airDate', {parse_iso8601}), + }) }