From 339c339fec095ff4141b20e6aa83629117fb26df Mon Sep 17 00:00:00 2001 From: trainman261 Date: Sun, 13 Aug 2023 01:58:55 +0200 Subject: [PATCH] [ie/CBCPlayer] Extract HLS formats and subtitles (#7484) Authored by: trainman261 --- yt_dlp/extractor/cbc.py | 28 +++++++++++++++++++++++++++- yt_dlp/extractor/scrippsnetworks.py | 1 + yt_dlp/extractor/theplatform.py | 22 ++++++++++++++++++++-- 3 files changed, 48 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index 41e092422..9413281a5 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -161,7 +161,7 @@ class CBCPlayerIE(InfoExtractor): 'upload_date': '20160210', 'uploader': 'CBCC-NEW', }, - 'skip': 'Geo-restricted to Canada', + 'skip': 'Geo-restricted to Canada and no longer available', }, { # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ 'url': 'http://www.cbc.ca/player/play/2657631896', @@ -174,6 +174,9 @@ class CBCPlayerIE(InfoExtractor): 'timestamp': 1425704400, 'upload_date': '20150307', 'uploader': 'CBCC-NEW', + 'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg', + 'chapters': [], + 'duration': 494.811, }, }, { 'url': 'http://www.cbc.ca/player/play/2164402062', @@ -186,6 +189,28 @@ class CBCPlayerIE(InfoExtractor): 'timestamp': 1320410746, 'upload_date': '20111104', 'uploader': 'CBCC-NEW', + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg', + 'chapters': [], + 'duration': 186.867, + }, + }, { + # Has subtitles + # These broadcasts expire after ~1 month, can find new test URL here: + # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast + 'url': 'http://www.cbc.ca/player/play/2249992771553', + 'md5': '2f2fb675dd4f0f8a5bb7588d1b13bacd', + 'info_dict': { + 'id': '2249992771553', + 'ext': 'mp4', + 'title': 'The National | Women’s soccer pay, Florida seawater, Swift quake', + 'description': 'md5:adba28011a56cfa47a080ff198dad27a', + 'timestamp': 1690596000, + 'duration': 2716.333, + 'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]}, + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/481/326/thumbnail.jpeg', + 'uploader': 'CBCC-NEW', + 'chapters': 'count:5', + 'upload_date': '20230729', }, }] @@ -199,6 +224,7 @@ def _real_extract(self, url): 'force_smil_url': True }), 'id': video_id, + '_format_sort_fields': ('res', 'proto') # Prioritize direct http formats over HLS } diff --git a/yt_dlp/extractor/scrippsnetworks.py b/yt_dlp/extractor/scrippsnetworks.py index c3cee6e4a..adfd7e5f2 100644 --- a/yt_dlp/extractor/scrippsnetworks.py +++ b/yt_dlp/extractor/scrippsnetworks.py @@ -115,6 +115,7 @@ class ScrippsNetworksIE(InfoExtractor): 'uploader': 'SCNI-SCND', }, 'add_ie': ['ThePlatform'], + 'expected_warnings': ['No HLS formats found'], }, { 'url': 'https://www.diynetwork.com/videos/diy-barnwood-tablet-stand-0265790', 'only_matching': True, diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py index 8307b912d..99caeb5f9 100644 --- a/yt_dlp/extractor/theplatform.py +++ b/yt_dlp/extractor/theplatform.py @@ -19,7 +19,11 @@ xpath_with_ns, mimetype2ext, find_xpath_attr, + traverse_obj, + update_url, + urlhandle_detect_ext, ) +from ..networking import HEADRequest default_ns = 'http://www.w3.org/2005/SMIL21/Language' _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) @@ -162,7 +166,8 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): 'params': { # rtmp download 'skip_download': True, - } + }, + 'skip': '404 Not Found', }, { 'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD', 'info_dict': { @@ -171,7 +176,8 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): 'description': 'md5:644ad9188d655b742f942bf2e06b002d', 'title': 'HIGHLIGHTS: USA bag first ever series Cup win', 'uploader': 'EGSM', - } + }, + 'skip': '404 Not Found', }, { 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7', 'only_matching': True, @@ -189,6 +195,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): 'upload_date': '20150701', 'uploader': 'NBCU-NEWS', }, + 'skip': '404 Not Found', }, { # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1 # geo-restricted (US), HLS encrypted with AES-128 @@ -295,6 +302,17 @@ def _real_extract(self, url): formats, subtitles = self._extract_theplatform_smil(smil_url, video_id) + # With some sites, manifest URL must be forced to extract HLS formats + if not traverse_obj(formats, lambda _, v: v['format_id'].startswith('hls')): + m3u8_url = update_url(url, query='mbr=true&manifest=m3u', fragment=None) + urlh = self._request_webpage( + HEADRequest(m3u8_url), video_id, 'Checking for HLS formats', 'No HLS formats found', fatal=False) + if urlh and urlhandle_detect_ext(urlh) == 'm3u8': + m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, m3u8_id='hls', fatal=False) + formats.extend(m3u8_fmts) + self._merge_subtitles(m3u8_subs, target=subtitles) + ret = self._extract_theplatform_metadata(path, video_id) combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles) ret.update({