From a1ddaa899ca8693f31f34770f7263ace7e8c8841 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 22 Apr 2022 13:16:24 +0530 Subject: [PATCH] [hotstar] Refactor extractors Closes #3517 --- yt_dlp/extractor/extractors.py | 1 + yt_dlp/extractor/hotstar.py | 235 ++++++++++++++++++--------------- 2 files changed, 126 insertions(+), 110 deletions(-) diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index d67b2eeec..a4ccf07a4 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -616,6 +616,7 @@ from .hotnewhiphop import HotNewHipHopIE from .hotstar import ( HotStarIE, + HotStarPrefixIE, HotStarPlaylistIE, HotStarSeriesIE, ) diff --git a/yt_dlp/extractor/hotstar.py b/yt_dlp/extractor/hotstar.py index d82e1aead..fe16de665 100644 --- a/yt_dlp/extractor/hotstar.py +++ b/yt_dlp/extractor/hotstar.py @@ -14,6 +14,7 @@ determine_ext, ExtractorError, int_or_none, + join_nonempty, str_or_none, try_get, url_or_none, @@ -21,6 +22,8 @@ class HotStarBaseIE(InfoExtractor): + _BASE_URL = 'https://www.hotstar.com' + _API_URL = 'https://api.hotstar.com' _AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee' def _call_api_impl(self, path, video_id, query, st=None, cookies=None): @@ -33,7 +36,7 @@ def _call_api_impl(self, path, video_id, query, st=None, cookies=None): token = cookies.get('userUP').value else: token = self._download_json( - 'https://api.hotstar.com/um/v3/users', + f'{self._API_URL}/um/v3/users', video_id, note='Downloading token', data=json.dumps({"device_ids": [{"id": compat_str(uuid.uuid4()), "type": "device_id"}]}).encode('utf-8'), headers={ @@ -43,12 +46,13 @@ def _call_api_impl(self, path, video_id, query, st=None, cookies=None): })['user_identity'] response = self._download_json( - 'https://api.hotstar.com/' + path, video_id, headers={ + f'{self._API_URL}/{path}', video_id, query=query, + headers={ 'hotstarauth': auth, 'x-hs-appversion': '6.72.2', 'x-hs-platform': 'web', 'x-hs-usertoken': token, - }, query=query) + }) if response['message'] != "Playback URL's fetched successfully": raise ExtractorError( @@ -56,17 +60,20 @@ def _call_api_impl(self, path, video_id, query, st=None, cookies=None): return response['data'] def _call_api(self, path, video_id, query_name='contentId'): - return self._download_json('https://api.hotstar.com/' + path, video_id=video_id, query={ - query_name: video_id, - 'tas': 10000, - }, headers={ - 'x-country-code': 'IN', - 'x-platform-code': 'PCTV', - }) + return self._download_json( + f'{self._API_URL}/{path}', video_id=video_id, + query={ + query_name: video_id, + 'tas': 10000, + }, headers={ + 'x-country-code': 'IN', + 'x-platform-code': 'PCTV', + }) - def _call_api_v2(self, path, video_id, st=None, cookies=None): + def _call_api_v2(self, path, video_id, st=None): + cookies = self._get_cookies(self._BASE_URL) return self._call_api_impl( - '%s/content/%s' % (path, video_id), video_id, st=st, cookies=cookies, query={ + f'{path}/content/{video_id}', video_id, st=st, cookies=cookies, query={ 'desired-config': 'audio_channel:stereo|container:fmp4|dynamic_range:hdr|encryption:plain|ladder:tv|package:dash|resolution:fhd|subs-tag:HotstarVIP|video_codec:h265', 'device-id': cookies.get('device_id').value if cookies.get('device_id') else compat_str(uuid.uuid4()), 'os-name': 'Windows', @@ -77,24 +84,15 @@ def _call_api_v2(self, path, video_id, st=None, cookies=None): class HotStarIE(HotStarBaseIE): IE_NAME = 'hotstar' _VALID_URL = r'''(?x) - (?: - hotstar\:| - https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/) - ) - (?: - (?Pmovies|sports|episode|(?Ptv)) - (?: - \:| - /[^/?#]+/ - (?(tv) - (?:[^/?#]+/){2}| - (?:[^/?#]+/)* - ) - )| - [^/?#]+/ - )? - (?P\d{10}) - ''' + https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/) + (?: + (?Pmovies|sports|episode|(?Ptv))/ + (?(tv)(?:[^/?#]+/){2}|[^?#]*) + )? + [^/?#]+/ + (?P\d{10}) + ''' + _TESTS = [{ 'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273', 'info_dict': { @@ -105,38 +103,8 @@ class HotStarIE(HotStarBaseIE): 'timestamp': 1447248600, 'upload_date': '20151111', 'duration': 381, + 'episode': 'Can You Not Spread Rumours?', }, - }, { - 'url': 'hotstar:1000076273', - 'only_matching': True, - }, { - 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', - 'info_dict': { - 'id': '1000057157', - 'ext': 'mp4', - 'title': 'Radha Gopalam', - 'description': 'md5:be3bc342cc120bbc95b3b0960e2b0d22', - 'timestamp': 1140805800, - 'upload_date': '20060224', - 'duration': 9182, - }, - }, { - 'url': 'hotstar:movies:1000057157', - 'only_matching': True, - }, { - 'url': 'https://www.hotstar.com/in/sports/cricket/follow-the-blues-2021/recap-eng-fight-back-on-day-2/1260066104', - 'only_matching': True, - }, { - 'url': 'https://www.hotstar.com/in/sports/football/most-costly-pl-transfers-ft-grealish/1260065956', - 'only_matching': True, - }, { - # contentData - 'url': 'hotstar:sports:1260065956', - 'only_matching': True, - }, { - # contentData - 'url': 'hotstar:sports:1260066104', - 'only_matching': True, }, { 'url': 'https://www.hotstar.com/tv/ek-bhram-sarvagun-sampanna/s-2116/janhvi-targets-suman/1000234847', 'info_dict': { @@ -155,12 +123,19 @@ class HotStarIE(HotStarBaseIE): 'season_id': 6771, 'episode': 'Janhvi Targets Suman', 'episode_number': 8, - }, + } }, { - 'url': 'hotstar:episode:1000234847', + 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', + 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/in/sports/cricket/follow-the-blues-2021/recap-eng-fight-back-on-day-2/1260066104', + 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/in/sports/football/most-costly-pl-transfers-ft-grealish/1260065956', 'only_matching': True, }] _GEO_BYPASS = False + _TYPE = { 'movies': 'movie', 'sports': 'match', @@ -169,41 +144,52 @@ class HotStarIE(HotStarBaseIE): None: 'content', } - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - video_type = mobj.group('type') - cookies = self._get_cookies(url) - video_type = self._TYPE.get(video_type, video_type) - video_data = self._call_api(f'o/v1/{video_type}/detail', video_id)['body']['results']['item'] - title = video_data['title'] + _IGNORE_MAP = { + 'res': 'resolution', + 'vcodec': 'video_codec', + 'dr': 'dynamic_range', + } + @classmethod + def _video_url(cls, video_id, video_type=None, *, slug='ignore_me', root=None): + assert None in (video_type, root) + if not root: + root = join_nonempty(cls._BASE_URL, video_type, delim='/') + return f'{root}/{slug}/{video_id}' + + def _real_extract(self, url): + video_id, video_type = self._match_valid_url(url).group('id', 'type') + video_type = self._TYPE.get(video_type, video_type) + + video_data = self._call_api(f'o/v1/{video_type}/detail', video_id)['body']['results']['item'] if not self.get_param('allow_unplayable_formats') and video_data.get('drmProtected'): self.report_drm(video_id) - headers = {'Referer': 'https://www.hotstar.com/in'} - formats = [] - subs = {} + # See https://github.com/yt-dlp/yt-dlp/issues/396 + st = self._download_webpage_handle(f'{self._BASE_URL}/in', video_id)[1].headers.get('x-origin-date') + geo_restricted = False - _, urlh = self._download_webpage_handle('https://www.hotstar.com/in', video_id) - # Required to fix https://github.com/yt-dlp/yt-dlp/issues/396 - st = urlh.headers.get('x-origin-date') + formats, subs = [], {} + headers = {'Referer': f'{self._BASE_URL}/in'} + # change to v2 in the future - playback_sets = self._call_api_v2('play/v1/playback', video_id, st=st, cookies=cookies)['playBackSets'] + playback_sets = self._call_api_v2('play/v1/playback', video_id, st=st)['playBackSets'] for playback_set in playback_sets: if not isinstance(playback_set, dict): continue - dr = re.search(r'dynamic_range:(?P[a-z]+)', playback_set.get('tagsCombination')).group('dr') + tags = str_or_none(playback_set.get('tagsCombination')) or '' + if any(f'{prefix}:{ignore}' in tags + for key, prefix in self._IGNORE_MAP.items() + for ignore in self._configuration_arg(key)): + continue + format_url = url_or_none(playback_set.get('playbackUrl')) if not format_url: continue - format_url = re.sub( - r'(?<=//staragvod)(\d)', r'web\1', format_url) - tags = str_or_none(playback_set.get('tagsCombination')) or '' - ingored_res, ignored_vcodec, ignored_dr = self._configuration_arg('res'), self._configuration_arg('vcodec'), self._configuration_arg('dr') - if any(f'resolution:{ig_res}' in tags for ig_res in ingored_res) or any(f'video_codec:{ig_vc}' in tags for ig_vc in ignored_vcodec) or any(f'dynamic_range:{ig_dr}' in tags for ig_dr in ignored_dr): - continue + format_url = re.sub(r'(?<=//staragvod)(\d)', r'web\1', format_url) + dr = re.search(r'dynamic_range:(?P[a-z]+)', playback_set.get('tagsCombination')).group('dr') ext = determine_ext(format_url) + current_formats, current_subs = [], {} try: if 'package:hls' in tags or ext == 'm3u8': @@ -215,8 +201,7 @@ def _real_extract(self, url): current_formats, current_subs = self._extract_mpd_formats_and_subtitles( format_url, video_id, mpd_id=f'{dr}-dash', headers=headers) elif ext == 'f4m': - # produce broken files - pass + pass # XXX: produce broken files else: current_formats = [{ 'url': format_url, @@ -227,6 +212,7 @@ def _real_extract(self, url): if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: geo_restricted = True continue + if tags and 'encryption:plain' not in tags: for f in current_formats: f['has_drm'] = True @@ -235,18 +221,19 @@ def _real_extract(self, url): for f in current_formats: if not f.get('langauge'): f['language'] = lang + formats.extend(current_formats) subs = self._merge_subtitles(subs, current_subs) + if not formats and geo_restricted: self.raise_geo_restricted(countries=['IN'], metadata_available=True) self._sort_formats(formats) - for f in formats: f.setdefault('http_headers', {}).update(headers) return { 'id': video_id, - 'title': title, + 'title': video_data.get('title'), 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': int_or_none(video_data.get('broadcastDate') or video_data.get('startDate')), @@ -258,14 +245,48 @@ def _real_extract(self, url): 'season': video_data.get('seasonName'), 'season_number': int_or_none(video_data.get('seasonNo')), 'season_id': video_data.get('seasonId'), - 'episode': title, + 'episode': video_data.get('title'), 'episode_number': int_or_none(video_data.get('episodeNo')), - 'http_headers': { - 'Referer': 'https://www.hotstar.com/in', - } } +class HotStarPrefixIE(InfoExtractor): + """ The "hotstar:" prefix is no longer in use, but this is kept for backward compatibility """ + IE_DESC = False + _VALID_URL = r'hotstar:(?:(?P\w+):)?(?P\d+)$' + _TESTS = [{ + 'url': 'hotstar:1000076273', + 'only_matching': True, + }, { + 'url': 'hotstar:movies:1000057157', + 'info_dict': { + 'id': '1000057157', + 'ext': 'mp4', + 'title': 'Radha Gopalam', + 'description': 'md5:be3bc342cc120bbc95b3b0960e2b0d22', + 'timestamp': 1140805800, + 'upload_date': '20060224', + 'duration': 9182, + 'episode': 'Radha Gopalam', + }, + }, { + 'url': 'hotstar:episode:1000234847', + 'only_matching': True, + }, { + # contentData + 'url': 'hotstar:sports:1260065956', + 'only_matching': True, + }, { + # contentData + 'url': 'hotstar:sports:1260066104', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, video_type = self._match_valid_url(url).group('id', 'type') + return self.url_result(HotStarIE._video_url(video_id, video_type), HotStarIE, video_id) + + class HotStarPlaylistIE(HotStarBaseIE): IE_NAME = 'hotstar:playlist' _VALID_URL = r'https?://(?:www\.)?hotstar\.com/tv/[^/]+/s-\w+/list/[^/]+/t-(?P\w+)' @@ -285,11 +306,8 @@ def _real_extract(self, url): collection = self._call_api('o/v1/tray/find', playlist_id, 'uqId')['body']['results'] entries = [ - self.url_result( - 'https://www.hotstar.com/%s' % video['contentId'], - ie=HotStarIE.ie_key(), video_id=video['contentId']) - for video in collection['assets']['items'] - if video.get('contentId')] + self.url_result(HotStarIE._video_url(video['contentId']), HotStarIE, video['contentId']) + for video in collection['assets']['items'] if video.get('contentId')] return self.playlist_result(entries, playlist_id) @@ -323,16 +341,13 @@ def _real_extract(self, url): 'x-country-code': 'IN', 'x-platform-code': 'PCTV', } - detail_json = self._download_json('https://api.hotstar.com/o/v1/show/detail?contentId=' + series_id, - video_id=series_id, headers=headers) - id = compat_str(try_get(detail_json, lambda x: x['body']['results']['item']['id'], int)) - item_json = self._download_json('https://api.hotstar.com/o/v1/tray/g/1/items?etid=0&tao=0&tas=10000&eid=' + id, - video_id=series_id, headers=headers) - entries = [ - self.url_result( - '%s/ignoreme/%d' % (url, video['contentId']), - ie=HotStarIE.ie_key(), video_id=video['contentId']) - for video in item_json['body']['results']['items'] - if video.get('contentId')] + detail_json = self._download_json( + f'{self._API_URL}/o/v1/show/detail?contentId={series_id}', series_id, headers=headers) + id = try_get(detail_json, lambda x: x['body']['results']['item']['id'], int) + item_json = self._download_json( + f'{self._API_URL}/o/v1/tray/g/1/items?etid=0&tao=0&tas=10000&eid={id}', series_id, headers=headers) - return self.playlist_result(entries, series_id) + return self.playlist_result([ + self.url_result(HotStarIE._video_url(video['contentId'], root=url), HotStarIE, video['contentId']) + for video in item_json['body']['results']['items'] if video.get('contentId') + ], series_id)