From cb73b8460c3ce6d37ab651a4e44bb23b10056154 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 14 Jan 2023 10:40:42 -0600 Subject: [PATCH] [extractor/nbc] Fix `NBC` and `NBCStations` extractors (#6033) Improve `InfoExtractor._parse_smil_formats` extension detection Closes #6019 Authored by: bashonly --- yt_dlp/extractor/common.py | 5 +- yt_dlp/extractor/nbc.py | 249 ++++++++++++++++++++++--------------- 2 files changed, 151 insertions(+), 103 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index ef9759974..e37595ffd 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -32,6 +32,7 @@ FormatSorter, GeoRestrictedError, GeoUtils, + HEADRequest, LenientJSONDecoder, RegexNotFoundError, RetryManager, @@ -80,6 +81,7 @@ update_Request, update_url_query, url_basename, + urlhandle_detect_ext, url_or_none, urljoin, variadic, @@ -2311,7 +2313,8 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para height = int_or_none(medium.get('height')) proto = medium.get('proto') ext = medium.get('ext') - src_ext = determine_ext(src) + src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext( + self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False)) streamer = medium.get('streamer') or base if proto == 'rtmp' or streamer.startswith('rtmp'): diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index 00c592cc3..82d759f75 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -8,24 +8,26 @@ from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, + HEADRequest, + RegexNotFoundError, + UserNotLive, + clean_html, int_or_none, parse_age_limit, parse_duration, - RegexNotFoundError, smuggle_url, - str_or_none, traverse_obj, try_get, - unified_strdate, + unescapeHTML, unified_timestamp, update_url_query, url_basename, - variadic, + xpath_attr, ) class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?(?P://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?Pn?\d+))' + _VALID_URL = r'https?(?P://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P(?:NBCE|n)?\d+))' _TESTS = [ { @@ -38,10 +40,18 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'timestamp': 1424246400, 'upload_date': '20150218', 'uploader': 'NBCU-COM', + 'episode': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s', + 'episode_number': 86, + 'season': 'Season 2', + 'season_number': 2, + 'series': 'Tonight Show: Jimmy Fallon', + 'duration': 237.0, + 'chapters': 'count:1', + 'tags': 'count:4', + 'thumbnail': r're:https?://.+\.jpg', }, 'params': { - # m3u8 download - 'skip_download': True, + 'skip_download': 'm3u8', }, }, { @@ -55,11 +65,7 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'upload_date': '20141206', 'uploader': 'NBCU-COM', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'Only works from US', + 'skip': 'page not found', }, { # HLS streams requires the 'hdnea3' cookie @@ -73,10 +79,59 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'upload_date': '20090315', 'uploader': 'NBCU-COM', }, - 'params': { - 'skip_download': True, + 'skip': 'page not found', + }, + { + # manifest url does not have extension + 'url': 'https://www.nbc.com/the-golden-globe-awards/video/oprah-winfrey-receives-cecil-b-de-mille-award-at-the-2018-golden-globes/3646439', + 'info_dict': { + 'id': '3646439', + 'ext': 'mp4', + 'title': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes', + 'episode': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes', + 'episode_number': 1, + 'season': 'Season 75', + 'season_number': 75, + 'series': 'The Golden Globe Awards', + 'description': 'Oprah Winfrey receives the Cecil B. de Mille Award at the 75th Annual Golden Globe Awards.', + 'uploader': 'NBCU-COM', + 'upload_date': '20180107', + 'timestamp': 1515312000, + 'duration': 570.0, + 'tags': 'count:8', + 'thumbnail': r're:https?://.+\.jpg', + 'chapters': 'count:1', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, + { + # new video_id format + 'url': 'https://www.nbc.com/quantum-leap/video/bens-first-leap-nbcs-quantum-leap/NBCE125189978', + 'info_dict': { + 'id': 'NBCE125189978', + 'ext': 'mp4', + 'title': 'Ben\'s First Leap | NBC\'s Quantum Leap', + 'description': 'md5:a82762449b7ec4bb83291a7b355ebf8e', + 'uploader': 'NBCU-COM', + 'series': 'Quantum Leap', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Ben\'s First Leap | NBC\'s Quantum Leap', + 'episode_number': 1, + 'duration': 170.171, + 'chapters': [], + 'timestamp': 1663956155, + 'upload_date': '20220923', + 'tags': 'count:10', + 'age_limit': 0, + 'thumbnail': r're:https?://.+\.jpg', + }, + 'expected_warnings': ['Ignoring subtitle tracks'], + 'params': { + 'skip_download': 'm3u8', }, - 'skip': 'Only works from US', }, { 'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310', @@ -600,32 +655,36 @@ class NBCStationsIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.nbclosangeles.com/news/local/large-structure-fire-in-downtown-la-prompts-smoke-odor-advisory/2968618/', - 'md5': '462041d91bd762ef5a38b7d85d6dc18f', 'info_dict': { 'id': '2968618', 'ext': 'mp4', 'title': 'Large Structure Fire in Downtown LA Prompts Smoke Odor Advisory', - 'description': None, + 'description': 'md5:417ed3c2d91fe9d301e6db7b0942f182', 'timestamp': 1661135892, - 'upload_date': '20220821', + 'upload_date': '20220822', 'uploader': 'NBC 4', - 'uploader_id': 'KNBC', + 'channel_id': 'KNBC', 'channel': 'nbclosangeles', }, + 'params': { + 'skip_download': 'm3u8', + }, }, { 'url': 'https://www.telemundoarizona.com/responde/huracan-complica-reembolso-para-televidente-de-tucson/2247002/', - 'md5': '0917dcf7885be1023a9220630d415f67', 'info_dict': { 'id': '2247002', 'ext': 'mp4', - 'title': 'Huracán complica que televidente de Tucson reciba reembolso', + 'title': 'Huracán complica que televidente de Tucson reciba reembolso', 'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf', 'timestamp': 1660886507, 'upload_date': '20220819', 'uploader': 'Telemundo Arizona', - 'uploader_id': 'KTAZ', + 'channel_id': 'KTAZ', 'channel': 'telemundoarizona', }, + 'params': { + 'skip_download': 'm3u8', + }, }] _RESOLUTIONS = { @@ -644,48 +703,39 @@ def _real_extract(self, url): r'