# yt-dlp/yt_dlp/extractor/videoken.py
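# Extractors for VideoKen-hosted conference video portals (videos.neurips.cc,
# videos.icts.res.in, videos.cncf.io) and the player.videoken.com embed player.
# Most entries resolve to SlidesLive or YouTube and are delegated to those
# extractors. Illustrative usage (URL taken from the test cases below):
#   yt-dlp "https://videos.neurips.cc/video/slideslive-38922815"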

import base64
import functools
import math
import re
import time
import urllib.parse

from .common import InfoExtractor
from .slideslive import SlidesLiveIE
from ..utils import (
    ExtractorError,
    InAdvancePagedList,
    int_or_none,
    remove_start,
    traverse_obj,
    update_url_query,
    url_or_none,
)


class VideoKenBaseIE(InfoExtractor):
    _ORGANIZATIONS = {
        'videos.icts.res.in': 'icts',
        'videos.cncf.io': 'cncf',
        'videos.neurips.cc': 'neurips',
    }
    _BASE_URL_RE = rf'https?://(?P<host>{"|".join(map(re.escape, _ORGANIZATIONS))})/'
    _PAGE_SIZE = 12
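    # The analytics API maps an organization slug to its numeric ID and an API
    # key; both are needed by the category, playlist and search endpoints below.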
    def _get_org_id_and_api_key(self, org, video_id):
        details = self._download_json(
            f'https://analytics.videoken.com/api/videolake/{org}/details', video_id,
            note='Downloading organization ID and API key', headers={
                'Accept': 'application/json',
            })
        return details['id'], details['apikey']
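    # If no usable embed URL was provided (or it points at a sign-in page),
    # build one from the 'slideslive-<id>' video ID; the referring page is
    # appended as embed_parent_url/embed_container_origin query parameters.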
    def _create_slideslive_url(self, video_url, video_id, referer):
        if not video_url and not video_id:
            return
        elif not video_url or 'embed/sign-in' in video_url:
            video_url = f'https://slideslive.com/embed/{remove_start(video_id, "slideslive-")}'
        if url_or_none(referer):
            return update_url_query(video_url, {
                'embed_parent_url': referer,
                'embed_container_origin': f'https://{urllib.parse.urlparse(referer).hostname}',
            })
        return video_url
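    # Yield url_result entries for each video in an API response, delegating
    # YouTube-sourced videos to the YouTube extractor and slideslive.com embeds
    # to SlidesLiveIE; anything else is resolved by normal extractor matching
    # on its embed URL.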
    def _extract_videos(self, videos, url):
        for video in traverse_obj(videos, (('videos', 'results'), ...)):
            video_id = traverse_obj(video, 'youtube_id', 'videoid')
            if not video_id:
                continue
            ie_key = None
            if traverse_obj(video, 'type', 'source') == 'youtube':
                video_url = video_id
                ie_key = 'Youtube'
            else:
                video_url = traverse_obj(video, 'embed_url', 'embeddableurl', expected_type=url_or_none)
                if not video_url:
                    continue
                elif urllib.parse.urlparse(video_url).hostname == 'slideslive.com':
                    ie_key = SlidesLiveIE
                    video_url = self._create_slideslive_url(video_url, video_id, url)
            yield self.url_result(video_url, ie_key, video_id)


class VideoKenIE(VideoKenBaseIE):
    _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:(?:topic|category)/[^/#?]+/)?video/(?P<id>[\w-]+)'
    _TESTS = [{
        # neurips -> videoken -> slideslive
        'url': 'https://videos.neurips.cc/video/slideslive-38922815',
        'info_dict': {
            'id': '38922815',
            'ext': 'mp4',
            'title': 'Efficient Processing of Deep Neural Network: from Algorithms to Hardware Architectures',
            'timestamp': 1630939331,
            'upload_date': '20210906',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'thumbnails': 'count:330',
            'chapters': 'count:329',
        },
        'params': {
            'skip_download': 'm3u8',
        },
        'expected_warnings': ['Failed to download VideoKen API JSON'],
    }, {
        # neurips -> videoken -> slideslive -> youtube
        'url': 'https://videos.neurips.cc/topic/machine%20learning/video/slideslive-38923348',
        'info_dict': {
            'id': '2Xa_dt78rJE',
            'ext': 'mp4',
            'display_id': '38923348',
            'title': 'Machine Education',
            'description': 'Watch full version of this video at https://slideslive.com/38923348.',
            'channel': 'SlidesLive Videos - G2',
            'channel_id': 'UCOExahQQ588Da8Nft_Ltb9w',
            'channel_url': 'https://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w',
            'uploader': 'SlidesLive Videos - G2',
            'uploader_id': 'UCOExahQQ588Da8Nft_Ltb9w',
            'uploader_url': 'http://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w',
            'duration': 2504,
            'timestamp': 1618922125,
            'upload_date': '20200131',
            'age_limit': 0,
            'channel_follower_count': int,
            'view_count': int,
            'availability': 'unlisted',
            'live_status': 'not_live',
            'playable_in_embed': True,
            'categories': ['People & Blogs'],
            'tags': [],
            'thumbnail': r're:^https?://.*\.(?:jpg|webp)',
            'thumbnails': 'count:78',
            'chapters': 'count:77',
        },
        'params': {
            'skip_download': 'm3u8',
        },
        'expected_warnings': ['Failed to download VideoKen API JSON'],
    }, {
        # icts -> videoken -> youtube
        'url': 'https://videos.icts.res.in/topic/random%20variable/video/zysIsojYdvc',
        'info_dict': {
            'id': 'zysIsojYdvc',
            'ext': 'mp4',
            'title': 'Small-worlds, complex networks and random graphs (Lecture 3) by Remco van der Hofstad',
            'description': 'md5:87433069d79719eeadc1962cc2ace00b',
            'channel': 'International Centre for Theoretical Sciences',
            'channel_id': 'UCO3xnVTHzB7l-nc8mABUJIQ',
            'channel_url': 'https://www.youtube.com/channel/UCO3xnVTHzB7l-nc8mABUJIQ',
            'uploader': 'International Centre for Theoretical Sciences',
            'uploader_id': 'ICTStalks',
            'uploader_url': 'http://www.youtube.com/user/ICTStalks',
            'duration': 3372,
            'upload_date': '20191004',
            'age_limit': 0,
            'live_status': 'not_live',
            'availability': 'public',
            'playable_in_embed': True,
            'channel_follower_count': int,
            'like_count': int,
            'view_count': int,
            'categories': ['Science & Technology'],
            'tags': [],
            'thumbnail': r're:^https?://.*\.(?:jpg|webp)',
            'thumbnails': 'count:42',
            'chapters': 'count:20',
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        'url': 'https://videos.cncf.io/category/478/video/IL4nxbmUIX8',
        'only_matching': True,
    }, {
        'url': 'https://videos.cncf.io/topic/kubernetes/video/YAM2d7yTrrI',
        'only_matching': True,
    }, {
        'url': 'https://videos.icts.res.in/video/d7HuP_abpKU',
        'only_matching': True,
    }]
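    # Try the private video-info endpoint first; if it fails (e.g. a 400
    # response), fall back to deriving a SlidesLive or YouTube URL directly
    # from the video ID in the page URL.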
    def _real_extract(self, url):
        hostname, video_id = self._match_valid_url(url).group('host', 'id')
        org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], video_id)
        details = self._download_json(
            'https://analytics.videoken.com/api/videoinfo_private', video_id, query={
                'videoid': video_id,
                'org_id': org_id,
            }, headers={'Accept': 'application/json'}, note='Downloading VideoKen API JSON',
            errnote='Failed to download VideoKen API JSON', fatal=False)
        if details:
            return next(self._extract_videos({'videos': [details]}, url))
        # fallback for API error 400 response
        elif video_id.startswith('slideslive-'):
            return self.url_result(
                self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id)
        elif re.match(r'^[\w-]{11}$', video_id):
            return self.url_result(video_id, 'Youtube', video_id)
        else:
            raise ExtractorError('Unable to extract without VideoKen API response')


class VideoKenPlayerIE(VideoKenBaseIE):
    _VALID_URL = r'https?://player\.videoken\.com/embed/slideslive-(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://player.videoken.com/embed/slideslive-38968434',
        'info_dict': {
            'id': '38968434',
            'ext': 'mp4',
            'title': 'Deep Learning with Label Differential Privacy',
            'timestamp': 1643377020,
            'upload_date': '20220128',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'thumbnails': 'count:30',
            'chapters': 'count:29',
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }]
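    # Embed URLs encode the SlidesLive presentation ID directly, so no API
    # round-trip is needed here.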
    def _real_extract(self, url):
        video_id = self._match_id(url)
        return self.url_result(
            self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id)


class VideoKenPlaylistIE(VideoKenBaseIE):
    _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:category/\d+/)?playlist/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://videos.icts.res.in/category/1822/playlist/381',
        'playlist_mincount': 117,
        'info_dict': {
            'id': '381',
            'title': 'Cosmology - The Next Decade',
        },
    }]
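    # Playlist items are returned by a single API request; the same response
    # also carries the playlist title.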
    def _real_extract(self, url):
        hostname, playlist_id = self._match_valid_url(url).group('host', 'id')
        org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], playlist_id)
        videos = self._download_json(
            f'https://analytics.videoken.com/api/{org_id}/playlistitems/{playlist_id}/',
            playlist_id, headers={'Accept': 'application/json'}, note='Downloading API JSON')
        return self.playlist_result(self._extract_videos(videos, url), playlist_id, videos.get('title'))


class VideoKenCategoryIE(VideoKenBaseIE):
    _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'category/(?P<id>\d+)/?(?:$|[?#])'
    _TESTS = [{
        'url': 'https://videos.icts.res.in/category/1822/',
        'playlist_mincount': 500,
        'info_dict': {
            'id': '1822',
            'title': 'Programs',
        },
    }, {
        'url': 'https://videos.neurips.cc/category/350/',
        'playlist_mincount': 34,
        'info_dict': {
            'id': '350',
            'title': 'NeurIPS 2018',
        },
    }, {
        'url': 'https://videos.cncf.io/category/479/',
        'playlist_mincount': 328,
        'info_dict': {
            'id': '479',
            'title': 'KubeCon + CloudNativeCon Europe\'19',
        },
    }]
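    # Fetch one page of a category listing (1-based page numbers); errors are
    # non-fatal, so a failing page simply yields no entries.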
    def _get_category_page(self, category_id, org_id, page=1, note=None):
        return self._download_json(
            f'https://analytics.videoken.com/api/videolake/{org_id}/category_videos', category_id,
            fatal=False, note=note if note else f'Downloading category page {page}',
            query={
                'category_id': category_id,
                'page_number': page,
                'length': self._PAGE_SIZE,
            }, headers={'Accept': 'application/json'}) or {}
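    # InAdvancePagedList passes 0-based page indices; the API expects 1-based
    # page numbers, hence the +1 below.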
    def _entries(self, category_id, org_id, url, page):
        videos = self._get_category_page(category_id, org_id, page + 1)
        yield from self._extract_videos(videos, url)

    def _real_extract(self, url):
        hostname, category_id = self._match_valid_url(url).group('host', 'id')
        org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], category_id)
        category_info = self._get_category_page(category_id, org_id, note='Downloading category info')
        category = category_info['category_name']
        total_pages = math.ceil(int(category_info['recordsTotal']) / self._PAGE_SIZE)
        return self.playlist_result(InAdvancePagedList(
            functools.partial(self._entries, category_id, org_id, url),
            total_pages, self._PAGE_SIZE), category_id, category)


class VideoKenTopicIE(VideoKenBaseIE):
    _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'topic/(?P<id>[^/#?]+)/?(?:$|[?#])'
    _TESTS = [{
        'url': 'https://videos.neurips.cc/topic/machine%20learning/',
        'playlist_mincount': 500,
        'info_dict': {
            'id': 'machine_learning',
            'title': 'machine learning',
        },
    }, {
        'url': 'https://videos.icts.res.in/topic/gravitational%20waves/',
        'playlist_mincount': 77,
        'info_dict': {
            'id': 'gravitational_waves',
            'title': 'gravitational waves',
        },
    }, {
        'url': 'https://videos.cncf.io/topic/prometheus/',
        'playlist_mincount': 134,
        'info_dict': {
            'id': 'prometheus',
            'title': 'prometheus',
        },
    }]
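    # Topic listings come from the VideoKen search backend (es.videoken.com)
    # rather than the analytics API, so they need the API key and a search ID.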
    def _get_topic_page(self, topic, org_id, search_id, api_key, page=1, note=None):
        return self._download_json(
            'https://es.videoken.com/api/v1.0/get_results', topic, fatal=False, query={
                'orgid': org_id,
                'size': self._PAGE_SIZE,
                'query': topic,
                'page': page,
                'sort': 'upload_desc',
                'filter': 'all',
                'token': api_key,
                'is_topic': 'true',
                'category': '',
                'searchid': search_id,
            }, headers={'Accept': 'application/json'},
            note=note if note else f'Downloading topic page {page}') or {}

    def _entries(self, topic, org_id, search_id, api_key, url, page):
        videos = self._get_topic_page(topic, org_id, search_id, api_key, page + 1)
        yield from self._extract_videos(videos, url)

    def _real_extract(self, url):
        hostname, topic_id = self._match_valid_url(url).group('host', 'id')
        topic = urllib.parse.unquote(topic_id)
        topic_id = topic.replace(' ', '_')
        org_id, api_key = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], topic)
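        # The search backend expects an opaque base64 "searchid"; a value built
        # from the topic, the current Unix timestamp and the literal 'transient'
        # (as below) appears to be accepted as a transient search session.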
        search_id = base64.b64encode(f':{topic}:{int(time.time())}:transient'.encode()).decode()
        total_pages = int_or_none(self._get_topic_page(
            topic, org_id, search_id, api_key, note='Downloading topic info')['total_no_of_pages'])
        return self.playlist_result(InAdvancePagedList(
            functools.partial(self._entries, topic, org_id, search_id, api_key, url),
            total_pages, self._PAGE_SIZE), topic_id, topic)