From c91af948e43570025e4aa887e248fd025abae394 Mon Sep 17 00:00:00 2001 From: Tristan Charpentier Date: Sun, 17 Dec 2023 09:07:55 -0500 Subject: [PATCH 01/32] [ie/RinseFM] Add extractor (#8778) Authored by: hashFactory --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/rinsefm.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 yt_dlp/extractor/rinsefm.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 9b96bd5b4..94369ca66 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1590,6 +1590,7 @@ from .restudy import RestudyIE from .reuters import ReutersIE from .reverbnation import ReverbNationIE from .rheinmaintv import RheinMainTVIE +from .rinsefm import RinseFMIE from .rmcdecouverte import RMCDecouverteIE from .rockstargames import RockstarGamesIE from .rokfin import ( diff --git a/yt_dlp/extractor/rinsefm.py b/yt_dlp/extractor/rinsefm.py new file mode 100644 index 000000000..760adf0eb --- /dev/null +++ b/yt_dlp/extractor/rinsefm.py @@ -0,0 +1,33 @@ +from .common import InfoExtractor +from ..utils import format_field, parse_iso8601 + + +class RinseFMIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rinse\.fm/episodes/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://rinse.fm/episodes/club-glow-15-12-2023-2000/', + 'md5': '76ee0b719315617df42e15e710f46c7b', + 'info_dict': { + 'id': '1536535', + 'ext': 'mp3', + 'title': 'Club Glow - 15/12/2023 - 20:00', + 'thumbnail': r're:^https://.+\.(?:jpg|JPG)$', + 'release_timestamp': 1702598400, + 'release_date': '20231215' + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + entry = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['entry'] + + return { + 'id': entry['id'], + 'title': entry.get('title'), + 'url': entry['fileUrl'], + 'vcodec': 'none', + 'release_timestamp': parse_iso8601(entry.get('episodeDate')), + 'thumbnail': format_field( + entry, [('featuredImage', 0, 'filename')], 'https://rinse.imgix.net/media/%s', default=None), + } From c5f01bf7d4b9426c87c3f8248de23934a56579e0 Mon Sep 17 00:00:00 2001 From: "Amir Y. Perehodnik" Date: Mon, 18 Dec 2023 17:52:43 +0200 Subject: [PATCH 02/32] [ie/Maariv] Add extractor (#8331) Authored by: amir16yp --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/maariv.py | 62 +++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 yt_dlp/extractor/maariv.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 94369ca66..b3c411394 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -991,6 +991,7 @@ from .lynda import ( LyndaIE, LyndaCourseIE ) +from .maariv import MaarivIE from .magellantv import MagellanTVIE from .magentamusik360 import MagentaMusik360IE from .mailru import ( diff --git a/yt_dlp/extractor/maariv.py b/yt_dlp/extractor/maariv.py new file mode 100644 index 000000000..425a8b3b4 --- /dev/null +++ b/yt_dlp/extractor/maariv.py @@ -0,0 +1,62 @@ +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_resolution, + unified_timestamp, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class MaarivIE(InfoExtractor): + IE_NAME = 'maariv.co.il' + _VALID_URL = r'https?://player\.maariv\.co\.il/public/player\.html\?(?:[^#]+&)?media=(?P\d+)' + _EMBED_REGEX = [rf']+\bsrc=[\'"](?P{_VALID_URL})'] + _TESTS = [{ + 'url': 'https://player.maariv.co.il/public/player.html?player=maariv-desktop&media=3611585', + 'info_dict': { + 'id': '3611585', + 'duration': 75, + 'ext': 'mp4', + 'upload_date': '20231009', + 'title': 'מבצע חרבות ברזל', + 'timestamp': 1696851301, + }, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.maariv.co.il/news/law/Article-1044008', + 'info_dict': { + 'id': '3611585', + 'duration': 75, + 'ext': 'mp4', + 'upload_date': '20231009', + 'title': 'מבצע חרבות ברזל', + 'timestamp': 1696851301, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._download_json( + f'https://dal.walla.co.il/media/{video_id}?origin=player.maariv.co.il', video_id)['data'] + + formats = [] + if hls_url := traverse_obj(data, ('video', 'url', {url_or_none})): + formats.extend(self._extract_m3u8_formats(hls_url, video_id, m3u8_id='hls', fatal=False)) + + for http_format in traverse_obj(data, ('video', 'stream_urls', ..., 'stream_url', {url_or_none})): + formats.append({ + 'url': http_format, + 'format_id': 'http', + **parse_resolution(http_format), + }) + + return { + 'id': video_id, + **traverse_obj(data, { + 'title': 'title', + 'duration': ('video', 'duration', {int_or_none}), + 'timestamp': ('upload_date', {unified_timestamp}), + }), + 'formats': formats, + } From 00a3e47bf5440c96025a76e08337ff2a475ed83e Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Mon, 18 Dec 2023 21:32:08 +0100 Subject: [PATCH 03/32] [ie/bundestag] Add extractor (#8783) Authored by: Grub4K --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/bundestag.py | 123 ++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 yt_dlp/extractor/bundestag.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b3c411394..572d79fba 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -276,6 +276,7 @@ from .brilliantpala import ( ) from .businessinsider import BusinessInsiderIE from .bundesliga import BundesligaIE +from .bundestag import BundestagIE from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE diff --git a/yt_dlp/extractor/bundestag.py b/yt_dlp/extractor/bundestag.py new file mode 100644 index 000000000..9fd7c7de1 --- /dev/null +++ b/yt_dlp/extractor/bundestag.py @@ -0,0 +1,123 @@ +import re +from functools import partial + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + bug_reports_message, + clean_html, + format_field, + get_element_text_and_html_by_tag, + int_or_none, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class BundestagIE(InfoExtractor): + _VALID_URL = [ + r'https?://dbtg\.tv/[cf]vid/(?P\d+)', + r'https?://www\.bundestag\.de/mediathek/?\?(?:[^#]+&)?videoid=(?P\d+)', + ] + _TESTS = [{ + 'url': 'https://dbtg.tv/cvid/7605304', + 'info_dict': { + 'id': '7605304', + 'ext': 'mp4', + 'title': '145. Sitzung vom 15.12.2023, TOP 24 Barrierefreiheit', + 'description': 'md5:321a9dc6bdad201264c0045efc371561', + }, + }, { + 'url': 'https://www.bundestag.de/mediathek?videoid=7602120&url=L21lZGlhdGhla292ZXJsYXk=&mod=mediathek', + 'info_dict': { + 'id': '7602120', + 'ext': 'mp4', + 'title': '130. Sitzung vom 18.10.2023, TOP 1 Befragung der Bundesregierung', + 'description': 'Befragung der Bundesregierung', + }, + }, { + 'url': 'https://www.bundestag.de/mediathek?videoid=7604941#url=L21lZGlhdGhla292ZXJsYXk/dmlkZW9pZD03NjA0OTQx&mod=mediathek', + 'only_matching': True, + }, { + 'url': 'http://dbtg.tv/fvid/3594346', + 'only_matching': True, + }] + + _OVERLAY_URL = 'https://www.bundestag.de/mediathekoverlay' + _INSTANCE_FORMAT = 'https://cldf-wzw-od.r53.cdn.tv1.eu/13014bundestagod/_definst_/13014bundestag/ondemand/3777parlamentsfernsehen/archiv/app144277506/145293313/{0}/{0}_playlist.smil/playlist.m3u8' + + _SHARE_URL = 'https://webtv.bundestag.de/player/macros/_x_s-144277506/shareData.json?contentId=' + _SHARE_AUDIO_REGEX = r'/\d+_(?P\w+)_(?P\d+)kb_(?P\w+)_\w+_\d+\.(?P\w+)' + _SHARE_VIDEO_REGEX = r'/\d+_(?P\w+)_(?P\w+)_(?P\w+)_(?P\d+)kb_\w+_\w+_\d+\.(?P\w+)' + + def _bt_extract_share_formats(self, video_id): + share_data = self._download_json( + f'{self._SHARE_URL}{video_id}', video_id, note='Downloading share format JSON') + if traverse_obj(share_data, ('status', 'code', {int})) != 1: + self.report_warning(format_field( + share_data, [('status', 'message', {str})], + 'Share API response: %s', default='Unknown Share API Error') + + bug_reports_message()) + return + + for name, url in share_data.items(): + if not isinstance(name, str) or not url_or_none(url): + continue + + elif name.startswith('audio'): + match = re.search(self._SHARE_AUDIO_REGEX, url) + yield { + 'format_id': name, + 'url': url, + 'vcodec': 'none', + **traverse_obj(match, { + 'acodec': 'codec', + 'audio_channels': ('channels', {{'mono': 1, 'stereo': 2}.get}), + 'abr': ('bitrate', {int_or_none}), + 'ext': 'ext', + }), + } + + elif name.startswith('download'): + match = re.search(self._SHARE_VIDEO_REGEX, url) + yield { + 'format_id': name, + 'url': url, + **traverse_obj(match, { + 'vcodec': 'codec', + 'tbr': ('bitrate', {int_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'ext': 'ext', + }), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = [] + result = {'id': video_id, 'formats': formats} + + try: + formats.extend(self._extract_m3u8_formats( + self._INSTANCE_FORMAT.format(video_id), video_id, m3u8_id='instance')) + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 404: + raise ExtractorError('Could not find video id', expected=True) + self.report_warning(f'Error extracting hls formats: {error}', video_id) + formats.extend(self._bt_extract_share_formats(video_id)) + if not formats: + self.raise_no_formats('Could not find suitable formats', video_id=video_id) + + result.update(traverse_obj(self._download_webpage( + self._OVERLAY_URL, video_id, + query={'videoid': video_id, 'view': 'main'}, + note='Downloading metadata overlay', fatal=False, + ), { + 'title': ( + {partial(get_element_text_and_html_by_tag, 'h3')}, 0, + {partial(re.sub, r']*>[^<]+', '')}, {clean_html}), + 'description': ({partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}), + })) + + return result From 1c54a98e19d047e7c15184237b6ef8ad50af489c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 19 Dec 2023 07:24:55 -0600 Subject: [PATCH 04/32] [ie/twitter] Extract stale tweets (#8724) Closes #8691 Authored by: bashonly --- yt_dlp/extractor/twitter.py | 85 ++++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 24 deletions(-) diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index d7609bc81..932b478d4 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -479,9 +479,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 18, + '_old_archive_ids': ['twitter 643211948184596480'], }, }, { 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', @@ -515,6 +515,7 @@ class TwitterIE(TwitterBaseIE): 'like_count': int, 'tags': ['TV', 'StarWars', 'TheForceAwakens'], 'age_limit': 0, + '_old_archive_ids': ['twitter 665052190608723968'], }, }, { 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', @@ -558,9 +559,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': ['Damndaniel'], 'age_limit': 0, + '_old_archive_ids': ['twitter 700207533655363584'], }, }, { 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', @@ -599,9 +600,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 0, + '_old_archive_ids': ['twitter 719944021058060289'], }, }, { 'url': 'https://twitter.com/OPP_HSD/status/779210622571536384', @@ -616,6 +617,7 @@ class TwitterIE(TwitterBaseIE): 'thumbnail': r're:^https?://.*\.jpg', }, 'add_ie': ['Periscope'], + 'skip': 'Broadcast not found', }, { # has mp4 formats via mobile API 'url': 'https://twitter.com/news_al3alm/status/852138619213144067', @@ -635,9 +637,9 @@ class TwitterIE(TwitterBaseIE): 'thumbnail': r're:^https?://.*\.jpg', 'tags': [], 'repost_count': int, - 'view_count': int, 'like_count': int, 'comment_count': int, + '_old_archive_ids': ['twitter 852138619213144067'], }, }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', @@ -657,9 +659,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': ['Maria'], 'age_limit': 0, + '_old_archive_ids': ['twitter 910031516746514432'], }, 'params': { 'skip_download': True, # requires ffmpeg @@ -683,9 +685,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 0, + '_old_archive_ids': ['twitter 1001551623938805763'], }, 'params': { 'skip_download': True, # requires ffmpeg @@ -749,6 +751,7 @@ class TwitterIE(TwitterBaseIE): 'like_count': int, 'tags': [], 'age_limit': 0, + '_old_archive_ids': ['twitter 1349794411333394432'], }, 'params': { 'skip_download': True, @@ -771,18 +774,18 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 0, + '_old_archive_ids': ['twitter 1577855540407197696'], }, 'params': {'skip_download': True}, }, { 'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464', 'info_dict': { 'id': '1577719286659006464', - 'title': 'Ultima📛| New Era - Test', + 'title': 'Ultima - Test', 'description': 'Test https://t.co/Y3KEZD7Dad', - 'uploader': 'Ultima📛| New Era', + 'uploader': 'Ultima', 'uploader_id': 'UltimaShadowX', 'uploader_url': 'https://twitter.com/UltimaShadowX', 'upload_date': '20221005', @@ -813,9 +816,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': ['HurricaneIan'], 'age_limit': 0, + '_old_archive_ids': ['twitter 1575560063510810624'], }, }, { # Adult content, fails if not logged in @@ -951,10 +954,10 @@ class TwitterIE(TwitterBaseIE): 'uploader_url': 'https://twitter.com/CTVJLaidlaw', 'display_id': '1600649710662213632', 'like_count': int, - 'view_count': int, 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c', 'upload_date': '20221208', 'age_limit': 0, + '_old_archive_ids': ['twitter 1600649710662213632'], }, 'params': {'noplaylist': True}, }, { @@ -979,7 +982,7 @@ class TwitterIE(TwitterBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, - 'view_count': int, + '_old_archive_ids': ['twitter 1621117700482416640'], }, }, { 'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2', @@ -995,13 +998,13 @@ class TwitterIE(TwitterBaseIE): 'repost_count': int, 'duration': 9.531, 'comment_count': int, - 'view_count': int, 'upload_date': '20221203', 'age_limit': 0, 'timestamp': 1670092210.0, 'tags': [], 'uploader': '\u06ea', 'description': '\U0001F48B https://t.co/bTj9Qz7vQP', + '_old_archive_ids': ['twitter 1599108751385972737'], }, 'params': {'noplaylist': True}, }, { @@ -1012,7 +1015,6 @@ class TwitterIE(TwitterBaseIE): 'ext': 'mp4', 'uploader_url': 'https://twitter.com/MunTheShinobi', 'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml', - 'view_count': int, 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig', 'age_limit': 0, 'uploader': 'Mün', @@ -1025,6 +1027,7 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': 'MunTheShinobi', 'duration': 139.987, 'timestamp': 1670306984.0, + '_old_archive_ids': ['twitter 1600009574919962625'], }, }, { # retweeted_status (private) @@ -1068,8 +1071,8 @@ class TwitterIE(TwitterBaseIE): 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', 'like_count': int, 'repost_count': int, - 'view_count': int, 'comment_count': int, + '_old_archive_ids': ['twitter 1695424220702888009'], }, }, { # retweeted_status w/ legacy API @@ -1091,18 +1094,24 @@ class TwitterIE(TwitterBaseIE): 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', 'like_count': int, 'repost_count': int, + '_old_archive_ids': ['twitter 1695424220702888009'], }, 'params': {'extractor_args': {'twitter': {'api': ['legacy']}}}, }, { # Broadcast embedded in tweet - 'url': 'https://twitter.com/JessicaDobsonWX/status/1693057346933600402', + 'url': 'https://twitter.com/JessicaDobsonWX/status/1731121063248175384', 'info_dict': { - 'id': '1yNGaNLjEblJj', + 'id': '1rmxPMjLzAXKN', 'ext': 'mp4', - 'title': 'Jessica Dobson - WAVE Weather Now - Saturday 8/19/23 Update', + 'title': 'WAVE Weather Now - Saturday 12/2/23 Update', 'uploader': 'Jessica Dobson', - 'uploader_id': '1DZEoDwDovRQa', - 'thumbnail': r're:^https?://.*\.jpg', + 'uploader_id': 'JessicaDobsonWX', + 'uploader_url': 'https://twitter.com/JessicaDobsonWX', + 'timestamp': 1701566398, + 'upload_date': '20231203', + 'live_status': 'was_live', + 'thumbnail': r're:https://[^/]+pscp\.tv/.+\.jpg', + 'concurrent_view_count': int, 'view_count': int, }, 'add_ie': ['TwitterBroadcast'], @@ -1125,6 +1134,30 @@ class TwitterIE(TwitterBaseIE): }, 'params': {'extractor_args': {'twitter': {'api': ['syndication']}}}, 'expected_warnings': ['Not all metadata'], + }, { + # "stale tweet" with typename "TweetWithVisibilityResults" + 'url': 'https://twitter.com/RobertKennedyJr/status/1724884212803834154', + 'md5': '62b1e11cdc2cdd0e527f83adb081f536', + 'info_dict': { + 'id': '1724883339285544960', + 'ext': 'mp4', + 'title': 'md5:cc56716f9ed0b368de2ba54c478e493c', + 'description': 'md5:9dc14f5b0f1311fc7caf591ae253a164', + 'display_id': '1724884212803834154', + 'uploader': 'Robert F. Kennedy Jr', + 'uploader_id': 'RobertKennedyJr', + 'uploader_url': 'https://twitter.com/RobertKennedyJr', + 'upload_date': '20231115', + 'timestamp': 1700079417.0, + 'duration': 341.048, + 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', + 'tags': ['Kennedy24'], + 'repost_count': int, + 'like_count': int, + 'comment_count': int, + 'age_limit': 0, + '_old_archive_ids': ['twitter 1724884212803834154'], + }, }, { # onion route 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', @@ -1179,19 +1212,23 @@ class TwitterIE(TwitterBaseIE): ), default={}, get_all=False) if self.is_logged_in else traverse_obj( data, ('tweetResult', 'result', {dict}), default={}) - if result.get('__typename') not in ('Tweet', 'TweetTombstone', 'TweetUnavailable', None): - self.report_warning(f'Unknown typename: {result.get("__typename")}', twid, only_once=True) + typename = result.get('__typename') + if typename not in ('Tweet', 'TweetWithVisibilityResults', 'TweetTombstone', 'TweetUnavailable', None): + self.report_warning(f'Unknown typename: {typename}', twid, only_once=True) if 'tombstone' in result: cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. Learn more') raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True) - elif result.get('__typename') == 'TweetUnavailable': + elif typename == 'TweetUnavailable': reason = result.get('reason') if reason == 'NsfwLoggedOut': self.raise_login_required('NSFW tweet requires authentication') elif reason == 'Protected': self.raise_login_required('You are not authorized to view this protected tweet') raise ExtractorError(reason or 'Requested tweet is unavailable', expected=True) + # Result for "stale tweet" needs additional transformation + elif typename == 'TweetWithVisibilityResults': + result = traverse_obj(result, ('tweet', {dict})) or {} status = result.get('legacy', {}) status.update(traverse_obj(result, { @@ -1377,7 +1414,7 @@ class TwitterIE(TwitterBaseIE): 'formats': formats, 'subtitles': subtitles, 'thumbnails': thumbnails, - 'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), + 'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), # No longer available 'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000), # The codec of http formats are unknown '_format_sort_fields': ('res', 'br', 'size', 'proto'), From db8b4edc7d0bd27da462f6fe82ff6e13e3d68a04 Mon Sep 17 00:00:00 2001 From: Mozi <29089388+pzhlkj6612@users.noreply.github.com> Date: Tue, 19 Dec 2023 22:21:47 +0800 Subject: [PATCH 05/32] [ie/JoqrAg] Add extractor (#8384) Authored by: pzhlkj6612 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/joqrag.py | 112 ++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 yt_dlp/extractor/joqrag.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 572d79fba..d5f030c6b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -865,6 +865,7 @@ from .jiosaavn import ( ) from .jove import JoveIE from .joj import JojIE +from .joqrag import JoqrAgIE from .jstream import JStreamIE from .jtbc import ( JTBCIE, diff --git a/yt_dlp/extractor/joqrag.py b/yt_dlp/extractor/joqrag.py new file mode 100644 index 000000000..3bb28af94 --- /dev/null +++ b/yt_dlp/extractor/joqrag.py @@ -0,0 +1,112 @@ +import datetime +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + clean_html, + datetime_from_str, + unified_timestamp, + urljoin, +) + + +class JoqrAgIE(InfoExtractor): + IE_DESC = '超!A&G+ 文化放送 (f.k.a. AGQR) Nippon Cultural Broadcasting, Inc. (JOQR)' + _VALID_URL = [r'https?://www\.uniqueradio\.jp/agplayer5/(?:player|inc-player-hls)\.php', + r'https?://(?:www\.)?joqr\.co\.jp/ag/', + r'https?://(?:www\.)?joqr\.co\.jp/qr/ag(?:daily|regular)program/?(?:$|[#?])'] + _TESTS = [{ + 'url': 'https://www.uniqueradio.jp/agplayer5/player.php', + 'info_dict': { + 'id': 'live', + 'title': str, + 'channel': '超!A&G+', + 'description': str, + 'live_status': 'is_live', + 'release_timestamp': int, + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + }, { + 'url': 'https://www.uniqueradio.jp/agplayer5/inc-player-hls.php', + 'only_matching': True, + }, { + 'url': 'https://www.joqr.co.jp/ag/article/103760/', + 'only_matching': True, + }, { + 'url': 'http://www.joqr.co.jp/qr/agdailyprogram/', + 'only_matching': True, + }, { + 'url': 'http://www.joqr.co.jp/qr/agregularprogram/', + 'only_matching': True, + }] + + def _extract_metadata(self, variable, html): + return clean_html(urllib.parse.unquote_plus(self._search_regex( + rf'var\s+{variable}\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + html, 'metadata', group='value', default=''))) or None + + def _extract_start_timestamp(self, video_id, is_live): + def extract_start_time_from(date_str): + dt = datetime_from_str(date_str) + datetime.timedelta(hours=9) + date = dt.strftime('%Y%m%d') + start_time = self._search_regex( + r']+\bclass="dailyProgram-itemHeaderTime"[^>]*>[\s\d:]+–\s*(\d{1,2}:\d{1,2})', + self._download_webpage( + f'https://www.joqr.co.jp/qr/agdailyprogram/?date={date}', video_id, + note=f'Downloading program list of {date}', fatal=False, + errnote=f'Failed to download program list of {date}') or '', + 'start time', default=None) + if start_time: + return unified_timestamp(f'{dt.strftime("%Y/%m/%d")} {start_time} +09:00') + return None + + start_timestamp = extract_start_time_from('today') + if not start_timestamp: + return None + + if not is_live or start_timestamp < datetime_from_str('now').timestamp(): + return start_timestamp + else: + return extract_start_time_from('yesterday') + + def _real_extract(self, url): + video_id = 'live' + + metadata = self._download_webpage( + 'https://www.uniqueradio.jp/aandg', video_id, + note='Downloading metadata', errnote='Failed to download metadata') + title = self._extract_metadata('Program_name', metadata) + + if title == '放送休止': + formats = [] + live_status = 'is_upcoming' + release_timestamp = self._extract_start_timestamp(video_id, False) + msg = 'This stream is not currently live' + if release_timestamp: + msg += (' and will start at ' + + datetime.datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S')) + self.raise_no_formats(msg, expected=True) + else: + m3u8_path = self._search_regex( + r']*\bsrc="([^"]+)"', + self._download_webpage( + 'https://www.uniqueradio.jp/agplayer5/inc-player-hls.php', video_id, + note='Downloading player data', errnote='Failed to download player data'), + 'm3u8 url') + formats = self._extract_m3u8_formats( + urljoin('https://www.uniqueradio.jp/', m3u8_path), video_id) + live_status = 'is_live' + release_timestamp = self._extract_start_timestamp(video_id, True) + + return { + 'id': video_id, + 'title': title, + 'channel': '超!A&G+', + 'description': self._extract_metadata('Program_text', metadata), + 'formats': formats, + 'live_status': live_status, + 'release_timestamp': release_timestamp, + } From 196eb0fe77b78e2e5ca02c506c3837c2b1a7964c Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Wed, 20 Dec 2023 19:15:38 +1300 Subject: [PATCH 06/32] [networking] Strip whitespace around header values (#8802) Fixes https://github.com/yt-dlp/yt-dlp/issues/8729 Authored by: coletdjnz --- test/test_utils.py | 5 +++++ yt_dlp/utils/networking.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 100f11788..6c8571f98 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -2370,6 +2370,11 @@ Line 1 headers4 = HTTPHeaderDict({'ytdl-test': 'data;'}) self.assertEqual(set(headers4.items()), {('Ytdl-Test', 'data;')}) + # common mistake: strip whitespace from values + # https://github.com/yt-dlp/yt-dlp/issues/8729 + headers5 = HTTPHeaderDict({'ytdl-test': ' data; '}) + self.assertEqual(set(headers5.items()), {('Ytdl-Test', 'data;')}) + def test_extract_basic_auth(self): assert extract_basic_auth('http://:foo.bar') == ('http://:foo.bar', None) assert extract_basic_auth('http://foo.bar') == ('http://foo.bar', None) diff --git a/yt_dlp/utils/networking.py b/yt_dlp/utils/networking.py index ed0250011..4b73252cb 100644 --- a/yt_dlp/utils/networking.py +++ b/yt_dlp/utils/networking.py @@ -67,7 +67,7 @@ class HTTPHeaderDict(collections.UserDict, dict): def __setitem__(self, key, value): if isinstance(value, bytes): value = value.decode('latin-1') - super().__setitem__(key.title(), str(value)) + super().__setitem__(key.title(), str(value).strip()) def __getitem__(self, key): return super().__getitem__(key.title()) From 37755a037e612bfc608c3d4722e8ef2ce6a022ee Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 20 Dec 2023 13:03:54 -0600 Subject: [PATCH 07/32] [test:networking] Update tests for OpenSSL 3.2 (#8814) Authored by: bashonly --- test/test_networking.py | 2 +- test/test_websockets.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_networking.py b/test/test_networking.py index 64af6e459..dc60ca699 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -328,7 +328,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase): https_server_thread.start() with handler(verify=False) as rh: - with pytest.raises(SSLError, match='sslv3 alert handshake failure') as exc_info: + with pytest.raises(SSLError, match=r'ssl(?:v3|/tls) alert handshake failure') as exc_info: validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers')) assert not issubclass(exc_info.type, CertificateVerifyError) diff --git a/test/test_websockets.py b/test/test_websockets.py index 39d3c7d72..af6142ea3 100644 --- a/test/test_websockets.py +++ b/test/test_websockets.py @@ -148,7 +148,7 @@ class TestWebsSocketRequestHandlerConformance: @pytest.mark.parametrize('handler', ['Websockets'], indirect=True) def test_ssl_error(self, handler): with handler(verify=False) as rh: - with pytest.raises(SSLError, match='sslv3 alert handshake failure') as exc_info: + with pytest.raises(SSLError, match=r'ssl(?:v3|/tls) alert handshake failure') as exc_info: validate_and_send(rh, Request(self.bad_wss_host)) assert not issubclass(exc_info.type, CertificateVerifyError) From 19741ab8a401ec64d5e84fdbfcfb141d105e7bc8 Mon Sep 17 00:00:00 2001 From: bashonly Date: Thu, 21 Dec 2023 14:46:00 -0600 Subject: [PATCH 08/32] [ie/bbc] Fix JSON parsing bug Authored by: bashonly --- yt_dlp/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index d1d6e04fa..c94184bf0 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1188,7 +1188,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE if initial_data is None: initial_data = self._search_regex( r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage, - 'preload state', default={}) + 'preload state', default='{}') else: initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) initial_data = self._parse_json(initial_data, playlist_id, fatal=False) From c919b68f7e79ea5010f75f648d3c9e45405a8011 Mon Sep 17 00:00:00 2001 From: barsnick Date: Thu, 21 Dec 2023 21:47:32 +0100 Subject: [PATCH 09/32] [ie/bbc] Extract more formats (#8321) Closes #4902 Authored by: barsnick, dirkf --- yt_dlp/extractor/bbc.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index c94184bf0..015af9e1d 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -317,16 +317,25 @@ class BBCCoUkIE(InfoExtractor): def _download_media_selector(self, programme_id): last_exception = None + formats, subtitles = [], {} for media_set in self._MEDIA_SETS: try: - return self._download_media_selector_url( + fmts, subs = self._download_media_selector_url( self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id) + formats.extend(fmts) + if subs: + self._merge_subtitles(subs, target=subtitles) except BBCCoUkIE.MediaSelectionError as e: if e.id in ('notukerror', 'geolocation', 'selectionunavailable'): last_exception = e continue self._raise_extractor_error(e) - self._raise_extractor_error(last_exception) + if last_exception: + if formats or subtitles: + self.report_warning(f'{self.IE_NAME} returned error: {last_exception.id}') + else: + self._raise_extractor_error(last_exception) + return formats, subtitles def _download_media_selector_url(self, url, programme_id=None): media_selection = self._download_json( From 632b8ee54eb2df8ac6e20746a0bd95b7ebb053aa Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 21 Dec 2023 15:06:26 -0600 Subject: [PATCH 10/32] [core] Release workflow and Updater cleanup (#8640) - Only use trusted publishing with PyPI and remove support for PyPI tokens from release workflow - Clean up improper actions syntax in the build workflow inputs - Refactor Updater to allow for consistent unit testing with `UPDATE_SOURCES` Authored by: bashonly --- .github/workflows/build.yml | 8 ++++---- .github/workflows/release.yml | 24 +----------------------- test/test_update.py | 9 +++++++++ yt_dlp/update.py | 7 ++++--- 4 files changed, 18 insertions(+), 30 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d944659b8..036ce4348 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -80,12 +80,12 @@ on: default: true type: boolean origin: - description: . + description: Origin required: false - default: '' + default: 'current repo' type: choice options: - - '' + - 'current repo' permissions: contents: read @@ -99,7 +99,7 @@ jobs: - name: Process origin id: process_origin run: | - echo "origin=${{ inputs.origin || github.repository }}" >> "$GITHUB_OUTPUT" + echo "origin=${{ inputs.origin == 'current repo' && github.repository || inputs.origin }}" | tee "$GITHUB_OUTPUT" unix: needs: process diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 84e892ffe..69b5e3152 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -64,7 +64,6 @@ jobs: target_tag: ${{ steps.setup_variables.outputs.target_tag }} pypi_project: ${{ steps.setup_variables.outputs.pypi_project }} pypi_suffix: ${{ steps.setup_variables.outputs.pypi_suffix }} - pypi_token: ${{ steps.setup_variables.outputs.pypi_token }} head_sha: ${{ steps.get_target.outputs.head_sha }} steps: @@ -153,7 +152,6 @@ jobs: ${{ !!secrets[format('{0}_archive_repo_token', env.target_repo)] }} || fallback_token pypi_project='${{ vars[format('{0}_pypi_project', env.target_repo)] }}' pypi_suffix='${{ vars[format('{0}_pypi_suffix', env.target_repo)] }}' - ${{ !secrets[format('{0}_pypi_token', env.target_repo)] }} || pypi_token='${{ env.target_repo }}_pypi_token' fi else target_tag="${source_tag:-${version}}" @@ -163,7 +161,6 @@ jobs: ${{ !!secrets[format('{0}_archive_repo_token', env.source_repo)] }} || fallback_token pypi_project='${{ vars[format('{0}_pypi_project', env.source_repo)] }}' pypi_suffix='${{ vars[format('{0}_pypi_suffix', env.source_repo)] }}' - ${{ !secrets[format('{0}_pypi_token', env.source_repo)] }} || pypi_token='${{ env.source_repo }}_pypi_token' else target_repo='${{ github.repository }}' fi @@ -172,13 +169,6 @@ jobs: if [[ "${target_repo}" == '${{ github.repository }}' ]] && ${{ !inputs.prerelease }}; then pypi_project='${{ vars.PYPI_PROJECT }}' fi - if [[ -z "${pypi_token}" && "${pypi_project}" ]]; then - if ${{ !secrets.PYPI_TOKEN }}; then - pypi_token=OIDC - else - pypi_token=PYPI_TOKEN - fi - fi echo "::group::Output variables" cat << EOF | tee -a "$GITHUB_OUTPUT" @@ -189,7 +179,6 @@ jobs: target_tag=${target_tag} pypi_project=${pypi_project} pypi_suffix=${pypi_suffix} - pypi_token=${pypi_token} EOF echo "::endgroup::" @@ -286,18 +275,7 @@ jobs: python devscripts/set-variant.py pip -M "You installed yt-dlp with pip or using the wheel from PyPi; Use that to update" python setup.py sdist bdist_wheel - - name: Publish to PyPI via token - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets[needs.prepare.outputs.pypi_token] }} - if: | - needs.prepare.outputs.pypi_token != 'OIDC' && env.TWINE_PASSWORD - run: | - twine upload dist/* - - - name: Publish to PyPI via trusted publishing - if: | - needs.prepare.outputs.pypi_token == 'OIDC' + - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: verbose: true diff --git a/test/test_update.py b/test/test_update.py index 2a5647e44..a5a388c10 100644 --- a/test/test_update.py +++ b/test/test_update.py @@ -11,6 +11,14 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL, report_warning from yt_dlp.update import Updater, UpdateInfo + +# XXX: Keep in sync with yt_dlp.update.UPDATE_SOURCES +TEST_UPDATE_SOURCES = { + 'stable': 'yt-dlp/yt-dlp', + 'nightly': 'yt-dlp/yt-dlp-nightly-builds', + 'master': 'yt-dlp/yt-dlp-master-builds', +} + TEST_API_DATA = { 'yt-dlp/yt-dlp/latest': { 'tag_name': '2023.12.31', @@ -104,6 +112,7 @@ class FakeUpdater(Updater): _channel = 'stable' _origin = 'yt-dlp/yt-dlp' + _update_sources = TEST_UPDATE_SOURCES def _download_update_spec(self, *args, **kwargs): return TEST_LOCKFILE_ACTUAL diff --git a/yt_dlp/update.py b/yt_dlp/update.py index f99583b08..ba7eadf81 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -206,13 +206,14 @@ class Updater: # XXX: use class variables to simplify testing _channel = CHANNEL _origin = ORIGIN + _update_sources = UPDATE_SOURCES def __init__(self, ydl, target: str | None = None): self.ydl = ydl # For backwards compat, target needs to be treated as if it could be None self.requested_channel, sep, self.requested_tag = (target or self._channel).rpartition('@') # Check if requested_tag is actually the requested repo/channel - if not sep and ('/' in self.requested_tag or self.requested_tag in UPDATE_SOURCES): + if not sep and ('/' in self.requested_tag or self.requested_tag in self._update_sources): self.requested_channel = self.requested_tag self.requested_tag: str = None # type: ignore (we set it later) elif not self.requested_channel: @@ -237,11 +238,11 @@ class Updater: self._block_restart('Automatically restarting into custom builds is disabled for security reasons') else: # Check if requested_channel resolves to a known repository or else raise - self.requested_repo = UPDATE_SOURCES.get(self.requested_channel) + self.requested_repo = self._update_sources.get(self.requested_channel) if not self.requested_repo: self._report_error( f'Invalid update channel {self.requested_channel!r} requested. ' - f'Valid channels are {", ".join(UPDATE_SOURCES)}', True) + f'Valid channels are {", ".join(self._update_sources)}', True) self._identifier = f'{detect_variant()} {system_identifier()}' From bc4ab17b38f01000d99c5c2bedec89721fee65ec Mon Sep 17 00:00:00 2001 From: barsnick Date: Fri, 22 Dec 2023 02:32:29 +0100 Subject: [PATCH 11/32] [cleanup] Fix spelling of `IE_NAME` (#8810) Authored by: barsnick --- yt_dlp/extractor/iheart.py | 2 +- yt_dlp/extractor/kinja.py | 2 +- yt_dlp/extractor/nba.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/iheart.py b/yt_dlp/extractor/iheart.py index 2c6a5b6a1..fb6f51e2c 100644 --- a/yt_dlp/extractor/iheart.py +++ b/yt_dlp/extractor/iheart.py @@ -23,7 +23,7 @@ class IHeartRadioBaseIE(InfoExtractor): class IHeartRadioIE(IHeartRadioBaseIE): - IENAME = 'iheartradio' + IE_NAME = 'iheartradio' _VALID_URL = r'(?:https?://(?:www\.)?iheart\.com/podcast/[^/]+/episode/(?P[^/?&#]+)-|iheartradio:)(?P\d+)' _TEST = { 'url': 'https://www.iheart.com/podcast/105-behind-the-bastards-29236323/episode/part-one-alexander-lukashenko-the-dictator-70346499/?embed=true', diff --git a/yt_dlp/extractor/kinja.py b/yt_dlp/extractor/kinja.py index a225d0a0d..f4e5c4c47 100644 --- a/yt_dlp/extractor/kinja.py +++ b/yt_dlp/extractor/kinja.py @@ -12,7 +12,7 @@ from ..utils import ( class KinjaEmbedIE(InfoExtractor): - IENAME = 'kinja:embed' + IE_NAME = 'kinja:embed' _DOMAIN_REGEX = r'''(?:[^.]+\.)? (?: avclub| diff --git a/yt_dlp/extractor/nba.py b/yt_dlp/extractor/nba.py index d8fc82488..81d11e3a5 100644 --- a/yt_dlp/extractor/nba.py +++ b/yt_dlp/extractor/nba.py @@ -97,7 +97,7 @@ class NBAWatchBaseIE(NBACVPBaseIE): class NBAWatchEmbedIE(NBAWatchBaseIE): - IENAME = 'nba:watch:embed' + IE_NAME = 'nba:watch:embed' _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'embed\?.*?\bid=(?P\d+)' _TESTS = [{ 'url': 'http://watch.nba.com/embed?id=659395', @@ -339,7 +339,7 @@ class NBABaseIE(NBACVPBaseIE): class NBAEmbedIE(NBABaseIE): - IENAME = 'nba:embed' + IE_NAME = 'nba:embed' _VALID_URL = r'https?://secure\.nba\.com/assets/amp/include/video/(?:topI|i)frame\.html\?.*?\bcontentId=(?P[^?#&]+)' _TESTS = [{ 'url': 'https://secure.nba.com/assets/amp/include/video/topIframe.html?contentId=teams/bulls/2020/12/04/3478774/1607105587854-20201204_SCHEDULE_RELEASE_FINAL_DRUPAL-3478774&team=bulls&adFree=false&profile=71&videoPlayerName=TAMPCVP&baseUrl=&videoAdsection=nba.com_mobile_web_teamsites_chicagobulls&Env=', @@ -361,7 +361,7 @@ class NBAEmbedIE(NBABaseIE): class NBAIE(NBABaseIE): - IENAME = 'nba' + IE_NAME = 'nba' _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?!%s)video/(?P(?:[^/]+/)*[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX _TESTS = [{ 'url': 'https://www.nba.com/bulls/video/teams/bulls/2020/12/04/3478774/1607105587854-20201204schedulereleasefinaldrupal-3478774', @@ -388,7 +388,7 @@ class NBAIE(NBABaseIE): class NBAChannelIE(NBABaseIE): - IENAME = 'nba:channel' + IE_NAME = 'nba:channel' _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?:%s)/(?P[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX _TESTS = [{ 'url': 'https://www.nba.com/blazers/video/channel/summer_league', From 0d531c35eca4c2eb36e160530a7a333edbc727cc Mon Sep 17 00:00:00 2001 From: Nicolas Dato <67328748+nicodato@users.noreply.github.com> Date: Fri, 22 Dec 2023 18:52:07 -0300 Subject: [PATCH 12/32] [ie/RudoVideo] Add extractor (#8664) Authored by: nicodato --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/rudovideo.py | 135 ++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 yt_dlp/extractor/rudovideo.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index d5f030c6b..5c34bb7f4 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1647,6 +1647,7 @@ from .rumble import ( RumbleIE, RumbleChannelIE, ) +from .rudovideo import RudoVideoIE from .rutube import ( RutubeIE, RutubeChannelIE, diff --git a/yt_dlp/extractor/rudovideo.py b/yt_dlp/extractor/rudovideo.py new file mode 100644 index 000000000..1b8595593 --- /dev/null +++ b/yt_dlp/extractor/rudovideo.py @@ -0,0 +1,135 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + js_to_json, + traverse_obj, + update_url_query, + url_or_none, +) + + +class RudoVideoIE(InfoExtractor): + _VALID_URL = r'https?://rudo\.video/(?Pvod|podcast|live)/(?P[^/?&#]+)' + _EMBED_REGEX = [r']+src=[\'"](?P(?:https?:)//rudo\.video/(?:vod|podcast|live)/[^\'"]+)'] + _TESTS = [{ + 'url': 'https://rudo.video/podcast/cz2wrUy8l0o', + 'md5': '28ed82b477708dc5e12e072da2449221', + 'info_dict': { + 'id': 'cz2wrUy8l0o', + 'title': 'Diego Cabot', + 'ext': 'mp4', + 'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$', + }, + }, { + 'url': 'https://rudo.video/podcast/bQkt07', + 'md5': '36b22a9863de0f47f00fc7532a32a898', + 'info_dict': { + 'id': 'bQkt07', + 'title': 'Tubular Bells', + 'ext': 'mp4', + 'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$', + }, + }, { + 'url': 'https://rudo.video/podcast/b42ZUznHX0', + 'md5': 'b91c70d832938871367f8ad10c895821', + 'info_dict': { + 'id': 'b42ZUznHX0', + 'title': 'Columna Ruperto Concha', + 'ext': 'mp3', + 'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$', + }, + }, { + 'url': 'https://rudo.video/vod/bN5AaJ', + 'md5': '01324a329227e2591530ecb4f555c881', + 'info_dict': { + 'id': 'bN5AaJ', + 'title': 'Ucrania 19.03', + 'creator': 'La Tercera', + 'ext': 'mp4', + 'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$', + }, + }, { + 'url': 'https://rudo.video/live/bbtv', + 'info_dict': { + 'id': 'bbtv', + 'ext': 'mp4', + 'creator': 'BioBioTV', + 'live_status': 'is_live', + 'title': r're:^LIVE BBTV\s\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}$', + 'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$', + }, + }, { + 'url': 'https://rudo.video/live/c13', + 'info_dict': { + 'id': 'c13', + 'title': 'CANAL13', + 'ext': 'mp4', + }, + 'skip': 'Geo-restricted to Chile', + }, { + 'url': 'https://rudo.video/live/t13-13cl', + 'info_dict': { + 'id': 't13-13cl', + 'title': 'T13', + 'ext': 'mp4', + }, + 'skip': 'Geo-restricted to Chile', + }] + + def _real_extract(self, url): + video_id, type_ = self._match_valid_url(url).group('id', 'type') + is_live = type_ == 'live' + + webpage = self._download_webpage(url, video_id) + if 'Streaming is not available in your area' in webpage: + self.raise_geo_restricted() + + media_url = ( + self._search_regex( + r'var\s+streamURL\s*=\s*[\'"]([^?\'"]+)', webpage, 'stream url', default=None) + # Source URL must be used only if streamURL is unavailable + or self._search_regex( + r']+src=[\'"]([^\'"]+)', webpage, 'source url', default=None)) + if not media_url: + youtube_url = self._search_regex(r'file:\s*[\'"]((?:https?:)//(?:www\.)?youtube\.com[^\'"]+)', + webpage, 'youtube url', default=None) + if youtube_url: + return self.url_result(youtube_url, 'Youtube') + raise ExtractorError('Unable to extract stream url') + + token_array = self._search_json( + r'