From 4310b6650eeb5630295f4591b37720877878c57a Mon Sep 17 00:00:00 2001 From: divStar Date: Fri, 19 Jan 2024 21:27:16 +0100 Subject: [PATCH] [ie/getcourseru] Add extractors (#8873) Authored by: divStar, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> --- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/getcourseru.py | 179 ++++++++++++++++++++++++++++++++ 2 files changed, 183 insertions(+) create mode 100644 yt_dlp/extractor/getcourseru.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b72b53fdd..3d5c3eb60 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -687,6 +687,10 @@ GeniusIE, GeniusLyricsIE, ) +from .getcourseru import ( + GetCourseRuPlayerIE, + GetCourseRuIE +) from .gettr import ( GettrIE, GettrStreamingIE, diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py new file mode 100644 index 000000000..6fdbcd736 --- /dev/null +++ b/yt_dlp/extractor/getcourseru.py @@ -0,0 +1,179 @@ +import re +import time +import urllib.parse + +from .common import InfoExtractor +from ..utils import ExtractorError, int_or_none, url_or_none, urlencode_postdata +from ..utils.traversal import traverse_obj + + +class GetCourseRuPlayerIE(InfoExtractor): + _VALID_URL = r'https?://player02\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=[^#&]+' + _EMBED_REGEX = [rf']+\bsrc=[\'"](?P{_VALID_URL}[^\'"]*)'] + _TESTS = [{ + 'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1NDQ5NjQyLCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=354ad2c993d95d5ac629e3133d6cefea&vh-static-feature=zigzag', + 'info_dict': { + 'id': '513573381', + 'title': '190bdf93f1b29735309853a7a19e24b3', + 'ext': 'mp4', + 'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80', + 'duration': 1693 + }, + 'skip': 'JWT expired', + }] + + def _real_extract(self, url): + webpage = self._download_webpage(url, None, 'Downloading player page') + window_configs = self._search_json( + r'window\.configs\s*=', webpage, 'config', None) + video_id = str(window_configs['gcFileId']) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + window_configs['masterPlaylistUrl'], video_id) + + return { + **traverse_obj(window_configs, { + 'title': ('videoHash', {str}), + 'thumbnail': ('previewUrl', {url_or_none}), + 'duration': ('videoDuration', {int_or_none}), + }), + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles + } + + +class GetCourseRuIE(InfoExtractor): + _NETRC_MACHINE = 'getcourseru' + _DOMAINS = [ + 'academymel.online', + 'marafon.mani-beauty.com', + 'on.psbook.ru' + ] + _BASE_URL_RE = rf'https?://(?:(?!player02\.)[^.]+\.getcourse\.(?:ru|io)|{"|".join(map(re.escape, _DOMAINS))})' + _VALID_URL = [ + rf'{_BASE_URL_RE}/(?!pl/|teach/)(?P[^?#]+)', + rf'{_BASE_URL_RE}/(:?pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P\d+)', + ] + _TESTS = [{ + 'url': 'http://academymel.online/3video_1', + 'info_dict': { + 'id': '3059742', + 'display_id': '3video_1', + 'title': 'Промоуроки Академии МЕЛ', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '513573381', + 'ext': 'mp4', + 'title': 'Промоуроки Академии МЕЛ', + 'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80', + 'duration': 1693 + }, + }] + }, { + 'url': 'https://academymel.getcourse.ru/3video_1', + 'info_dict': { + 'id': '3059742', + 'display_id': '3video_1', + 'title': 'Промоуроки Академии МЕЛ', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '513573381', + 'ext': 'mp4', + 'title': 'Промоуроки Академии МЕЛ', + 'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80', + 'duration': 1693 + }, + }] + }, { + 'url': 'https://academymel.getcourse.ru/pl/teach/control/lesson/view?id=319141781&editMode=0', + 'info_dict': { + 'id': '319141781', + 'title': '1. Разминка у стены', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '4919601', + 'ext': 'mp4', + 'title': '1. Разминка у стены', + 'thumbnail': 'https://preview-htz.vhcdn.com/preview/5a521788e7dc25b4f70c3dff6512d90e/preview.jpg?version=1703223532&host=vh-81', + 'duration': 704 + }, + }], + 'skip': 'paid lesson' + }, { + 'url': 'https://manibeauty.getcourse.ru/pl/teach/control/lesson/view?id=272499894', + 'info_dict': { + 'id': '272499894', + 'title': 'Мотивация к тренировкам', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '447479687', + 'ext': 'mp4', + 'title': 'Мотивация к тренировкам', + 'thumbnail': 'https://preview-htz.vhcdn.com/preview/70ed5b9f489dd03b4aff55bfdff71a26/preview.jpg?version=1685115787&host=vh-71', + 'duration': 30 + }, + }], + 'skip': 'paid lesson' + }, { + 'url': 'https://gaismasmandalas.getcourse.io/ATLAUTSEVBUT', + 'only_matching': True, + }] + + _LOGIN_URL_PATH = '/cms/system/login' + + def _login(self, hostname, username, password): + if self._get_cookies(f'https://{hostname}').get('PHPSESSID5'): + return + login_url = f'https://{hostname}{self._LOGIN_URL_PATH}' + webpage = self._download_webpage(login_url, None) + + self._request_webpage( + login_url, None, 'Logging in', 'Failed to log in', + data=urlencode_postdata({ + 'action': 'processXdget', + 'xdgetId': self._html_search_regex( + r']+\bclass="[^"]*\bstate-login[^"]*"[^>]+\bdata-xdget-id="([^"]+)"', + webpage, 'xdgetId'), + 'params[action]': 'login', + 'params[url]': login_url, + 'params[object_type]': 'cms_page', + 'params[object_id]': -1, + 'params[email]': username, + 'params[password]': password, + 'requestTime': int(time.time()), + 'requestSimpleSign': self._html_search_regex( + r'window.requestSimpleSign\s*=\s*"([\da-f]+)"', webpage, 'simple sign'), + })) + + def _real_extract(self, url): + hostname = urllib.parse.urlparse(url).hostname + username, password = self._get_login_info(netrc_machine=hostname) + if username: + self._login(hostname, username, password) + + display_id = self._match_id(url) + # NB: 404 is returned due to yt-dlp not properly following redirects #9020 + webpage, urlh = self._download_webpage_handle(url, display_id, expected_status=404) + if self._LOGIN_URL_PATH in urlh.url or urlh.status == 404: + raise ExtractorError( + f'This video is only available for registered users. {self._login_hint("any", netrc=hostname)}', + expected=True) + + playlist_id = self._search_regex( + r'window\.(?:lessonId|gcsObjectId)\s*=\s*(\d+)', webpage, 'playlist id', default=display_id) + title = self._og_search_title(webpage) or self._html_extract_title(webpage) + + return self.playlist_from_matches( + re.findall(GetCourseRuPlayerIE._EMBED_REGEX[0], webpage), + playlist_id, title, display_id=display_id, ie=GetCourseRuPlayerIE, video_kwargs={ + 'url_transparent': True, + 'title': title, + })