From 4b183d49620e564219c01714ca8639199f6b1cc0 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 29 Dec 2022 14:29:08 +0000 Subject: [PATCH] [extractor/videoken] Add extractors (#5824) Closes #5818 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 7 + yt_dlp/extractor/videoken.py | 336 ++++++++++++++++++++++++++++++++ 2 files changed, 343 insertions(+) create mode 100644 yt_dlp/extractor/videoken.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 1b76d8264..e51228aff 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2097,6 +2097,13 @@ ) from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE +from .videoken import ( + VideoKenIE, + VideoKenPlayerIE, + VideoKenPlaylistIE, + VideoKenCategoryIE, + VideoKenTopicIE, +) from .videomore import ( VideomoreIE, VideomoreVideoIE, diff --git a/yt_dlp/extractor/videoken.py b/yt_dlp/extractor/videoken.py new file mode 100644 index 000000000..560b41a6d --- /dev/null +++ b/yt_dlp/extractor/videoken.py @@ -0,0 +1,336 @@ +import base64 +import functools +import math +import re +import time +import urllib.parse + +from .common import InfoExtractor +from .slideslive import SlidesLiveIE +from ..utils import ( + ExtractorError, + InAdvancePagedList, + int_or_none, + traverse_obj, + update_url_query, + url_or_none, +) + + +class VideoKenBaseIE(InfoExtractor): + _ORGANIZATIONS = { + 'videos.icts.res.in': 'icts', + 'videos.cncf.io': 'cncf', + 'videos.neurips.cc': 'neurips', + } + _BASE_URL_RE = rf'https?://(?P{"|".join(map(re.escape, _ORGANIZATIONS))})/' + + _PAGE_SIZE = 12 + + def _get_org_id_and_api_key(self, org, video_id): + details = self._download_json( + f'https://analytics.videoken.com/api/videolake/{org}/details', video_id, + note='Downloading organization ID and API key', headers={ + 'Accept': 'application/json', + }) + return details['id'], details['apikey'] + + def _create_slideslive_url(self, video_url, video_id, referer): + if not video_url and not video_id: + return + elif not video_url or 'embed/sign-in' in video_url: + video_url = f'https://slideslive.com/embed/{video_id.lstrip("slideslive-")}' + if url_or_none(referer): + return update_url_query(video_url, { + 'embed_parent_url': referer, + 'embed_container_origin': f'https://{urllib.parse.urlparse(referer).netloc}', + }) + return video_url + + def _extract_videos(self, videos, url): + for video in traverse_obj(videos, (('videos', 'results'), ...)): + video_id = traverse_obj(video, 'youtube_id', 'videoid') + if not video_id: + continue + ie_key = None + if traverse_obj(video, 'type', 'source') == 'youtube': + video_url = video_id + ie_key = 'Youtube' + else: + video_url = traverse_obj(video, 'embed_url', 'embeddableurl') + if urllib.parse.urlparse(video_url).netloc == 'slideslive.com': + ie_key = SlidesLiveIE + video_url = self._create_slideslive_url(video_url, video_id, url) + if not video_url: + continue + yield self.url_result(video_url, ie_key, video_id) + + +class VideoKenIE(VideoKenBaseIE): + _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:(?:topic|category)/[^/#?]+/)?video/(?P[\w-]+)' + _TESTS = [{ + # neurips -> videoken -> slideslive + 'url': 'https://videos.neurips.cc/video/slideslive-38922815', + 'info_dict': { + 'id': '38922815', + 'ext': 'mp4', + 'title': 'Efficient Processing of Deep Neural Network: from Algorithms to Hardware Architectures', + 'timestamp': 1630939331, + 'upload_date': '20210906', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:330', + 'chapters': 'count:329', + }, + 'params': { + 'skip_download': 'm3u8', + }, + 'expected_warnings': ['Failed to download VideoKen API JSON'], + }, { + # neurips -> videoken -> slideslive -> youtube + 'url': 'https://videos.neurips.cc/topic/machine%20learning/video/slideslive-38923348', + 'info_dict': { + 'id': '2Xa_dt78rJE', + 'ext': 'mp4', + 'display_id': '38923348', + 'title': 'Machine Education', + 'description': 'Watch full version of this video at https://slideslive.com/38923348.', + 'channel': 'SlidesLive Videos - G2', + 'channel_id': 'UCOExahQQ588Da8Nft_Ltb9w', + 'channel_url': 'https://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w', + 'uploader': 'SlidesLive Videos - G2', + 'uploader_id': 'UCOExahQQ588Da8Nft_Ltb9w', + 'uploader_url': 'http://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w', + 'duration': 2504, + 'timestamp': 1618922125, + 'upload_date': '20200131', + 'age_limit': 0, + 'channel_follower_count': int, + 'view_count': int, + 'availability': 'unlisted', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'categories': ['People & Blogs'], + 'tags': [], + 'thumbnail': r're:^https?://.*\.(?:jpg|webp)', + 'thumbnails': 'count:78', + 'chapters': 'count:77', + }, + 'params': { + 'skip_download': 'm3u8', + }, + 'expected_warnings': ['Failed to download VideoKen API JSON'], + }, { + # icts -> videoken -> youtube + 'url': 'https://videos.icts.res.in/topic/random%20variable/video/zysIsojYdvc', + 'info_dict': { + 'id': 'zysIsojYdvc', + 'ext': 'mp4', + 'title': 'Small-worlds, complex networks and random graphs (Lecture 3) by Remco van der Hofstad', + 'description': 'md5:87433069d79719eeadc1962cc2ace00b', + 'channel': 'International Centre for Theoretical Sciences', + 'channel_id': 'UCO3xnVTHzB7l-nc8mABUJIQ', + 'channel_url': 'https://www.youtube.com/channel/UCO3xnVTHzB7l-nc8mABUJIQ', + 'uploader': 'International Centre for Theoretical Sciences', + 'uploader_id': 'ICTStalks', + 'uploader_url': 'http://www.youtube.com/user/ICTStalks', + 'duration': 3372, + 'upload_date': '20191004', + 'age_limit': 0, + 'live_status': 'not_live', + 'availability': 'public', + 'playable_in_embed': True, + 'channel_follower_count': int, + 'like_count': int, + 'view_count': int, + 'categories': ['Science & Technology'], + 'tags': [], + 'thumbnail': r're:^https?://.*\.(?:jpg|webp)', + 'thumbnails': 'count:42', + 'chapters': 'count:20', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + 'url': 'https://videos.cncf.io/category/478/video/IL4nxbmUIX8', + 'only_matching': True, + }, { + 'url': 'https://videos.cncf.io/topic/kubernetes/video/YAM2d7yTrrI', + 'only_matching': True, + }, { + 'url': 'https://videos.icts.res.in/video/d7HuP_abpKU', + 'only_matching': True, + }] + + def _real_extract(self, url): + hostname, video_id = self._match_valid_url(url).group('host', 'id') + org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], video_id) + details = self._download_json( + 'https://analytics.videoken.com/api/videoinfo_private', video_id, query={ + 'videoid': video_id, + 'org_id': org_id, + }, headers={'Accept': 'application/json'}, note='Downloading VideoKen API JSON', + errnote='Failed to download VideoKen API JSON', fatal=False) + if details: + return next(self._extract_videos({'videos': [details]}, url)) + # fallback for API error 400 response + elif video_id.startswith('slideslive-'): + return self.url_result( + self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id) + elif re.match(r'^[\w-]{11}$', video_id): + self.url_result(video_id, 'Youtube', video_id) + else: + raise ExtractorError('Unable to extract without VideoKen API response') + + +class VideoKenPlayerIE(VideoKenBaseIE): + _VALID_URL = r'https?://player\.videoken\.com/embed/slideslive-(?P\d+)' + _TESTS = [{ + 'url': 'https://player.videoken.com/embed/slideslive-38968434', + 'info_dict': { + 'id': '38968434', + 'ext': 'mp4', + 'title': 'Deep Learning with Label Differential Privacy', + 'timestamp': 1643377020, + 'upload_date': '20220128', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:30', + 'chapters': 'count:29', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id) + + +class VideoKenPlaylistIE(VideoKenBaseIE): + _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:category/\d+/)?playlist/(?P\d+)' + _TESTS = [{ + 'url': 'https://videos.icts.res.in/category/1822/playlist/381', + 'playlist_mincount': 117, + 'info_dict': { + 'id': '381', + 'title': 'Cosmology - The Next Decade', + }, + }] + + def _real_extract(self, url): + hostname, playlist_id = self._match_valid_url(url).group('host', 'id') + org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], playlist_id) + videos = self._download_json( + f'https://analytics.videoken.com/api/{org_id}/playlistitems/{playlist_id}/', + playlist_id, headers={'Accept': 'application/json'}, note='Downloading API JSON') + return self.playlist_result(self._extract_videos(videos, url), playlist_id, videos.get('title')) + + +class VideoKenCategoryIE(VideoKenBaseIE): + _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'category/(?P\d+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://videos.icts.res.in/category/1822/', + 'playlist_mincount': 500, + 'info_dict': { + 'id': '1822', + 'title': 'Programs', + }, + }, { + 'url': 'https://videos.neurips.cc/category/350/', + 'playlist_mincount': 34, + 'info_dict': { + 'id': '350', + 'title': 'NeurIPS 2018', + }, + }, { + 'url': 'https://videos.cncf.io/category/479/', + 'playlist_mincount': 328, + 'info_dict': { + 'id': '479', + 'title': 'KubeCon + CloudNativeCon Europe\'19', + }, + }] + + def _get_category_page(self, category_id, org_id, page=1, note=None): + return self._download_json( + f'https://analytics.videoken.com/api/videolake/{org_id}/category_videos', category_id, + fatal=False, note=note if note else f'Downloading category page {page}', + query={ + 'category_id': category_id, + 'page_number': page, + 'length': self._PAGE_SIZE, + }, headers={'Accept': 'application/json'}) or {} + + def _entries(self, category_id, org_id, url, page): + videos = self._get_category_page(category_id, org_id, page + 1) + yield from self._extract_videos(videos, url) + + def _real_extract(self, url): + hostname, category_id = self._match_valid_url(url).group('host', 'id') + org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], category_id) + category_info = self._get_category_page(category_id, org_id, note='Downloading category info') + category = category_info['category_name'] + total_pages = math.ceil(int(category_info['recordsTotal']) / self._PAGE_SIZE) + return self.playlist_result(InAdvancePagedList( + functools.partial(self._entries, category_id, org_id, url), + total_pages, self._PAGE_SIZE), category_id, category) + + +class VideoKenTopicIE(VideoKenBaseIE): + _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'topic/(?P[^/#?]+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://videos.neurips.cc/topic/machine%20learning/', + 'playlist_mincount': 500, + 'info_dict': { + 'id': 'machine_learning', + 'title': 'machine learning', + }, + }, { + 'url': 'https://videos.icts.res.in/topic/gravitational%20waves/', + 'playlist_mincount': 77, + 'info_dict': { + 'id': 'gravitational_waves', + 'title': 'gravitational waves' + }, + }, { + 'url': 'https://videos.cncf.io/topic/prometheus/', + 'playlist_mincount': 134, + 'info_dict': { + 'id': 'prometheus', + 'title': 'prometheus', + }, + }] + + def _get_topic_page(self, topic, org_id, search_id, api_key, page=1, note=None): + return self._download_json( + 'https://es.videoken.com/api/v1.0/get_results', topic, fatal=False, query={ + 'orgid': org_id, + 'size': self._PAGE_SIZE, + 'query': topic, + 'page': page, + 'sort': 'upload_desc', + 'filter': 'all', + 'token': api_key, + 'is_topic': 'true', + 'category': '', + 'searchid': search_id, + }, headers={'Accept': 'application/json'}, + note=note if note else f'Downloading topic page {page}') or {} + + def _entries(self, topic, org_id, search_id, api_key, url, page): + videos = self._get_topic_page(topic, org_id, search_id, api_key, page + 1) + yield from self._extract_videos(videos, url) + + def _real_extract(self, url): + hostname, topic_id = self._match_valid_url(url).group('host', 'id') + topic = urllib.parse.unquote(topic_id) + topic_id = topic.replace(' ', '_') + org_id, api_key = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], topic) + search_id = base64.b64encode(f':{topic}:{int(time.time())}:transient'.encode()).decode() + total_pages = int_or_none(self._get_topic_page( + topic, org_id, search_id, api_key, note='Downloading topic info')['total_no_of_pages']) + return self.playlist_result(InAdvancePagedList( + functools.partial(self._entries, topic, org_id, search_id, api_key, url), + total_pages, self._PAGE_SIZE), topic_id, topic)