From 9200bc70c94546b2191bb6fbfc9cea98a919cc56 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 1 Jul 2024 13:11:33 -0400 Subject: [PATCH] [ie/microsoftembed] Add extractors for dev materials (#9177) Closes #7112 Authored by: c-basalt --- yt_dlp/extractor/_extractors.py | 13 +- yt_dlp/extractor/microsoftembed.py | 258 +++++++++++++++++++- yt_dlp/extractor/microsoftvirtualacademy.py | 188 -------------- 3 files changed, 265 insertions(+), 194 deletions(-) delete mode 100644 yt_dlp/extractor/microsoftvirtualacademy.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 09dfa73ff..7f6507def 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1119,12 +1119,15 @@ from .melonvod import MelonVODIE from .metacritic import MetacriticIE from .mgtv import MGTVIE -from .microsoftembed import MicrosoftEmbedIE -from .microsoftstream import MicrosoftStreamIE -from .microsoftvirtualacademy import ( - MicrosoftVirtualAcademyCourseIE, - MicrosoftVirtualAcademyIE, +from .microsoftembed import ( + MicrosoftBuildIE, + MicrosoftEmbedIE, + MicrosoftLearnEpisodeIE, + MicrosoftLearnPlaylistIE, + MicrosoftLearnSessionIE, + MicrosoftMediusIE, ) +from .microsoftstream import MicrosoftStreamIE from .mildom import ( MildomClipIE, MildomIE, diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py index 98d50b18a..d0135f5a9 100644 --- a/yt_dlp/extractor/microsoftembed.py +++ b/yt_dlp/extractor/microsoftembed.py @@ -1,5 +1,14 @@ +import re + from .common import InfoExtractor -from ..utils import int_or_none, traverse_obj, unified_timestamp +from ..utils import ( + int_or_none, + parse_iso8601, + traverse_obj, + unified_timestamp, + url_basename, + url_or_none, +) class MicrosoftEmbedIE(InfoExtractor): @@ -63,3 +72,250 @@ def _real_extract(self, url): 'subtitles': subtitles, 'thumbnails': thumbnails, } + + +class MicrosoftMediusBaseIE(InfoExtractor): + @staticmethod + def _sub_to_dict(subtitle_list): + subtitles = {} + for sub in subtitle_list: + subtitles.setdefault(sub.pop('tag', 'und'), []).append(sub) + return subtitles + + def _extract_ism(self, ism_url, video_id): + formats = self._extract_ism_formats(ism_url, video_id) + for fmt in formats: + if fmt['language'] != 'eng' and 'English' not in fmt['format_id']: + fmt['language_preference'] = -10 + return formats + + +class MicrosoftMediusIE(MicrosoftMediusBaseIE): + _VALID_URL = r'https?://medius\.microsoft\.com/Embed/(?:Video\?id=|video-nc/|VideoDetails/)(?P[\da-f-]+)' + + _TESTS = [{ + 'url': 'https://medius.microsoft.com/Embed/video-nc/9640d86c-f513-4889-959e-5dace86e7d2b', + 'info_dict': { + 'id': '9640d86c-f513-4889-959e-5dace86e7d2b', + 'ext': 'ismv', + 'title': 'Rapidly code, test and ship from secure cloud developer environments', + 'description': 'md5:33c8e4facadc438613476eea24165f71', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + 'subtitles': 'count:30', + }, + }, { + 'url': 'https://medius.microsoft.com/Embed/video-nc/81215af5-c813-4dcd-aede-94f4e1a7daa3', + 'info_dict': { + 'id': '81215af5-c813-4dcd-aede-94f4e1a7daa3', + 'ext': 'ismv', + 'title': 'Microsoft Build opening', + 'description': 'md5:43455096141077a1f23144cab8cec1cb', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + 'subtitles': 'count:31', + }, + }, { + 'url': 'https://medius.microsoft.com/Embed/VideoDetails/78493569-9b3b-4a85-a409-ee76e789e25c', + 'info_dict': { + 'id': '78493569-9b3b-4a85-a409-ee76e789e25c', + 'ext': 'ismv', + 'title': ' Anomaly Detection & Root cause at Edge', + 'description': 'md5:f8f1ad93d7918649bfb97fa081b03b83', + 'thumbnail': r're:https://mediusdownload.event.microsoft.com/asset.*\.jpg.*', + 'subtitles': 'count:17', + }, + }, { + 'url': 'https://medius.microsoft.com/Embed/Video?id=0dc69bda-079b-4070-a7db-a8da1a06a9c7', + 'only_matching': True, + }, { + 'url': 'https://medius.microsoft.com/Embed/video-nc/fe823a91-959c-465b-96d4-8f4db624f72c', + 'only_matching': True, + }] + + def _extract_subtitle(self, webpage, video_id): + captions = traverse_obj( + self._search_json(r'const\s+captionsConfiguration\s*=', webpage, 'captions', video_id, default=None), + ('languageList', lambda _, v: url_or_none(v['src']), { + 'url': 'src', + 'tag': ('srclang', {str}), + 'name': ('kind', {str}), + })) or [{'url': url, 'tag': url_basename(url).split('.vtt')[0].split('_')[-1]} + for url in re.findall(r'var\s+file\s+=\s+\{[^}]+\'(https://[^\']+\.vtt\?[^\']+)', webpage)] + + return self._sub_to_dict(captions) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(f'https://medius.microsoft.com/Embed/video-nc/{video_id}', video_id) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'formats': self._extract_ism( + self._search_regex(r'StreamUrl\s*=\s*"([^"]+manifest)"', webpage, 'ism url'), video_id), + 'thumbnail': self._og_search_thumbnail(webpage), + 'subtitles': self._extract_subtitle(webpage, video_id), + } + + +class MicrosoftLearnPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://learn\.microsoft\.com/(?:[\w-]+/)?(?Pshows|events)/(?P[\w-]+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://learn.microsoft.com/en-us/shows/bash-for-beginners', + 'info_dict': { + 'id': 'bash-for-beginners', + 'title': 'Bash for Beginners', + 'description': 'md5:16a91c07222117d1e00912f0dbc02c2c', + }, + 'playlist_count': 20, + }, { + 'url': 'https://learn.microsoft.com/en-us/events/build-2022', + 'info_dict': { + 'id': 'build-2022', + 'title': 'Microsoft Build 2022 - Events', + 'description': 'md5:c16b43848027df837b22c6fbac7648d3', + }, + 'playlist_count': 201, + }] + + def _entries(self, url_base, video_id): + skip = 0 + while True: + playlist_info = self._download_json(url_base, video_id, f'Downloading entries {skip}', query={ + 'locale': 'en-us', + '$skip': skip, + }) + url_paths = traverse_obj(playlist_info, ('results', ..., 'url', {str})) + for url_path in url_paths: + yield self.url_result(f'https://learn.microsoft.com/en-us{url_path}') + skip += len(url_paths) + if skip >= playlist_info.get('count', 0) or not url_paths: + break + + def _real_extract(self, url): + playlist_id, playlist_type = self._match_valid_url(url).group('id', 'type') + webpage = self._download_webpage(url, playlist_id) + + metainfo = { + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + } + sub_type = 'episodes' if playlist_type == 'shows' else 'sessions' + + url_base = f'https://learn.microsoft.com/api/contentbrowser/search/{playlist_type}/{playlist_id}/{sub_type}' + return self.playlist_result(self._entries(url_base, playlist_id), playlist_id, **metainfo) + + +class MicrosoftLearnEpisodeIE(MicrosoftMediusBaseIE): + _VALID_URL = r'https?://learn\.microsoft\.com/(?:[\w-]+/)?shows/[\w-]+/(?P[^?#/]+)' + _TESTS = [{ + 'url': 'https://learn.microsoft.com/en-us/shows/bash-for-beginners/what-is-the-difference-between-a-terminal-and-a-shell-2-of-20-bash-for-beginners/', + 'info_dict': { + 'id': 'd44e1a03-a0e5-45c2-9496-5c9fa08dc94c', + 'ext': 'ismv', + 'title': 'What is the Difference Between a Terminal and a Shell? (Part 2 of 20)', + 'description': 'md5:7bbbfb593d21c2cf2babc3715ade6b88', + 'timestamp': 1676339547, + 'upload_date': '20230214', + 'thumbnail': r're:https://learn\.microsoft\.com/video/media/.*\.png', + 'subtitles': 'count:14', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + entry_id = self._html_search_meta('entryId', webpage, 'entryId', fatal=True) + video_info = self._download_json( + f'https://learn.microsoft.com/api/video/public/v1/entries/{entry_id}', video_id) + return { + 'id': entry_id, + 'formats': self._extract_ism(video_info['publicVideo']['adaptiveVideoUrl'], video_id), + 'subtitles': self._sub_to_dict(traverse_obj(video_info, ( + 'publicVideo', 'captions', lambda _, v: url_or_none(v['url']), { + 'tag': ('language', {str}), + 'url': 'url', + }))), + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + **traverse_obj(video_info, { + 'timestamp': ('createTime', {parse_iso8601}), + 'thumbnails': ('publicVideo', 'thumbnailOtherSizes', ..., {'url': {url_or_none}}), + }), + } + + +class MicrosoftLearnSessionIE(InfoExtractor): + _VALID_URL = r'https?://learn\.microsoft\.com/(?:[\w-]+/)?events/[\w-]+/(?P[^?#/]+)' + _TESTS = [{ + 'url': 'https://learn.microsoft.com/en-us/events/build-2022/ts01-rapidly-code-test-ship-from-secure-cloud-developer-environments', + 'info_dict': { + 'id': '9640d86c-f513-4889-959e-5dace86e7d2b', + 'ext': 'ismv', + 'title': 'Rapidly code, test and ship from secure cloud developer environments - Events', + 'description': 'md5:f26c1a85d41c1cffd27a0279254a25c3', + 'timestamp': 1653408600, + 'upload_date': '20220524', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + metainfo = { + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'timestamp': parse_iso8601(self._html_search_meta('startDate', webpage, 'startDate')), + } + + return self.url_result( + self._html_search_meta('externalVideoUrl', webpage, 'videoUrl', fatal=True), + url_transparent=True, ie=MicrosoftMediusIE, **metainfo) + + +class MicrosoftBuildIE(InfoExtractor): + _VALID_URL = [ + r'https?://build\.microsoft\.com/[\w-]+/sessions/(?P[\da-f-]+)', + r'https?://build\.microsoft\.com/[\w-]+/(?Psessions)/?(?:[?#]|$)', + ] + + _TESTS = [{ + 'url': 'https://build.microsoft.com/en-US/sessions/b49feb31-afcd-4217-a538-d3ca1d171198?source=sessions', + 'info_dict': { + 'id': 'aee55fb5-fcf9-4b38-b764-a3527cb57554', + 'ext': 'ismv', + 'title': 'Microsoft Build opening keynote', + 'description': 'md5:d38338f336ef4b6ef9ad2a7466a76655', + 'timestamp': 1716307200, + 'upload_date': '20240521', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + }, + }, { + 'url': 'https://build.microsoft.com/en-US/sessions', + 'info_dict': { + 'id': 'sessions', + }, + 'playlist_mincount': 418, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + entries = [ + self.url_result( + video_info['onDemand'], ie=MicrosoftMediusIE, url_transparent=True, **traverse_obj(video_info, { + 'id': ('sessionId', {str}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('startDateTime', {parse_iso8601}), + })) + for video_info in self._download_json( + 'https://api-v2.build.microsoft.com/api/session/all/en-US', video_id, 'Downloading video info') + ] + if video_id == 'sessions': + return self.playlist_result(entries, video_id) + else: + return traverse_obj(entries, (lambda _, v: v['id'] == video_id), get_all=False) diff --git a/yt_dlp/extractor/microsoftvirtualacademy.py b/yt_dlp/extractor/microsoftvirtualacademy.py deleted file mode 100644 index e354d8a50..000000000 --- a/yt_dlp/extractor/microsoftvirtualacademy.py +++ /dev/null @@ -1,188 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, - smuggle_url, - unsmuggle_url, - xpath_text, -) - - -class MicrosoftVirtualAcademyBaseIE(InfoExtractor): - def _extract_base_url(self, course_id, display_id): - return self._download_json( - f'https://api-mlxprod.microsoft.com/services/products/anonymous/{course_id}', - display_id, 'Downloading course base URL') - - def _extract_chapter_and_title(self, title): - if not title: - return None, None - m = re.search(r'(?P\d+)\s*\|\s*(?P.+)', title) - return (int(m.group('chapter')), m.group('title')) if m else (None, title) - - -class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): - IE_NAME = 'mva' - IE_DESC = 'Microsoft Virtual Academy videos' - _VALID_URL = rf'(?:{IE_NAME}:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' - - _TESTS = [{ - 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382', - 'md5': '7826c44fc31678b12ad8db11f6b5abb9', - 'info_dict': { - 'id': 'gfVXISmEB_6804984382', - 'ext': 'mp4', - 'title': 'Course Introduction', - 'formats': 'mincount:3', - 'subtitles': { - 'en': [{ - 'ext': 'ttml', - }], - }, - }, - }, { - 'url': 'mva:11788:gfVXISmEB_6804984382', - 'only_matching': True, - }] - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - mobj = self._match_valid_url(url) - course_id = mobj.group('course_id') - video_id = mobj.group('id') - - base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id) - - settings = self._download_xml( - f'{base_url}/content/content_{video_id}/videosettings.xml?v=1', - video_id, 'Downloading video settings XML') - - _, title = self._extract_chapter_and_title(xpath_text( - settings, './/Title', 'title', fatal=True)) - - formats = [] - - for sources in settings.findall('.//MediaSources'): - sources_type = sources.get('videoType') - for source in sources.findall('./MediaSource'): - video_url = source.text - if not video_url or not video_url.startswith('http'): - continue - if sources_type == 'smoothstreaming': - formats.extend(self._extract_ism_formats( - video_url, video_id, 'mss', fatal=False)) - continue - video_mode = source.get('videoMode') - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', video_mode or '', 'height', default=None)) - codec = source.get('codec') - acodec, vcodec = [None] * 2 - if codec: - codecs = codec.split(',') - if len(codecs) == 2: - acodec, vcodec = codecs - elif len(codecs) == 1: - vcodec = codecs[0] - formats.append({ - 'url': video_url, - 'format_id': video_mode, - 'height': height, - 'acodec': acodec, - 'vcodec': vcodec, - }) - - subtitles = {} - for source in settings.findall('.//MarkerResourceSource'): - subtitle_url = source.text - if not subtitle_url: - continue - subtitles.setdefault('en', []).append({ - 'url': f'{base_url}/{subtitle_url}', - 'ext': source.get('type'), - }) - - return { - 'id': video_id, - 'title': title, - 'subtitles': subtitles, - 'formats': formats, - } - - -class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE): - IE_NAME = 'mva:course' - IE_DESC = 'Microsoft Virtual Academy courses' - _VALID_URL = rf'(?:{IE_NAME}:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' - - _TESTS = [{ - 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', - 'info_dict': { - 'id': '11788', - 'title': 'Microsoft Azure Fundamentals: Virtual Machines', - }, - 'playlist_count': 36, - }, { - # with emphasized chapters - 'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335', - 'info_dict': { - 'id': '16335', - 'title': 'Developing Windows 10 Games with Construct 2', - }, - 'playlist_count': 10, - }, { - 'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', - 'only_matching': True, - }, { - 'url': 'mva:course:11788', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if MicrosoftVirtualAcademyIE.suitable(url) else super().suitable(url) - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - course_id = mobj.group('id') - display_id = mobj.group('display_id') - - base_url = self._extract_base_url(course_id, display_id) - - manifest = self._download_json( - f'{base_url}/imsmanifestlite.json', - display_id, 'Downloading course manifest JSON')['manifest'] - - organization = manifest['organizations']['organization'][0] - - entries = [] - for chapter in organization['item']: - chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title')) - chapter_id = chapter.get('@identifier') - for item in chapter.get('item', []): - item_id = item.get('@identifier') - if not item_id: - continue - metadata = item.get('resource', {}).get('metadata') or {} - if metadata.get('learningresourcetype') != 'Video': - continue - _, title = self._extract_chapter_and_title(item.get('title')) - duration = parse_duration(metadata.get('duration')) - description = metadata.get('description') - entries.append({ - '_type': 'url_transparent', - 'url': smuggle_url( - f'mva:{course_id}:{item_id}', {'base_url': base_url}), - 'title': title, - 'description': description, - 'duration': duration, - 'chapter': chapter_title, - 'chapter_number': chapter_number, - 'chapter_id': chapter_id, - }) - - title = organization.get('title') or manifest.get('metadata', {}).get('title') - - return self.playlist_result(entries, course_id, title)