[ie/cbc.ca:player] Fix extractor (#10302)

Closes #10170
Authored by: trainman261, bashonly

Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
2024-12-22 06:00:00 +00:00 · 2024-07-29 23:58:26 +02:00 · 2024-07-29 23:58:26 +02:00 · 94a1c5e642
commit 94a1c5e642
parent 2b6df93a24
1 changed files with 214 additions and 55 deletions
--- a/yt_dlp/extractor/cbc.py
+++ b/yt_dlp/extractor/cbc.py
@@ -1,4 +1,5 @@
 import base64
 import functools
 import json
 import re
 import time
@@ -6,17 +7,24 @@
 import xml.etree.ElementTree
 from .common import InfoExtractor
 from ..networking import HEADRequest
 from ..utils import (
    ExtractorError,
    float_or_none,
    int_or_none,
    join_nonempty,
    js_to_json,
    mimetype2ext,
    orderedSet,
    parse_iso8601,
    replace_extension,
    smuggle_url,
    strip_or_none,
    traverse_obj,
    try_get,
    update_url,
    url_basename,
    url_or_none,
 )
@@ -149,6 +157,7 @@ def _real_extract(self, url):
 class CBCPlayerIE(InfoExtractor):
    IE_NAME = 'cbc.ca:player'
    _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/(?:video/)?|i/caffeine/syndicate/\?mediaId=))(?P<id>(?:\d\.)?\d+)'
    _GEO_COUNTRIES = ['CA']
    _TESTS = [{
        'url': 'http://www.cbc.ca/player/play/2683190193',
        'md5': '64d25f841ddf4ddb28a235338af32e2c',
@@ -172,21 +181,20 @@ class CBCPlayerIE(InfoExtractor):
            'description': 'md5:dd3b692f0a139b0369943150bd1c46a9',
            'timestamp': 1425704400,
            'upload_date': '20150307',
-            'uploader': 'CBCC-NEW',
+            'thumbnail': 'https://i.cbc.ca/ais/1.2985700,1717262248558/full/max/0/default.jpg',
            'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg',
            'chapters': [],
            'duration': 494.811,
-            'categories': ['AudioMobile/All in a Weekend Montreal'],
+            'categories': ['All in a Weekend Montreal'],
-            'tags': 'count:8',
+            'tags': 'count:11',
            'location': 'Quebec',
            'series': 'All in a Weekend Montreal',
            'season': 'Season 2015',
            'season_number': 2015,
            'media_type': 'Excerpt',
            'genres': ['Other'],
        },
    }, {
        'url': 'http://www.cbc.ca/i/caffeine/syndicate/?mediaId=2164402062',
        'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',
        'info_dict': {
            'id': '2164402062',
            'ext': 'mp4',
@@ -194,107 +202,168 @@ class CBCPlayerIE(InfoExtractor):
            'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
            'timestamp': 1320410746,
            'upload_date': '20111104',
-            'uploader': 'CBCC-NEW',
+            'thumbnail': 'https://i.cbc.ca/ais/1.1711287,1717139372111/full/max/0/default.jpg',
            'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg',
            'chapters': [],
            'duration': 186.867,
            'series': 'CBC News: Windsor at 6:00',
-            'categories': ['News/Canada/Windsor'],
+            'categories': ['Windsor'],
            'location': 'Windsor',
-            'tags': ['cancer'],
+            'tags': ['Cancer', 'News/Canada/Windsor', 'Windsor'],
            'creators': ['Allison Johnson'],
            'media_type': 'Excerpt',
            'genres': ['News'],
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/
        'url': 'https://www.cbc.ca/player/play/1.2985700',
        'md5': 'e5e708c34ae6fca156aafe17c43e8b75',
        'info_dict': {
-            'id': '2657631896',
+            'id': '1.2985700',
            'ext': 'mp3',
            'title': 'CBC Montreal is organizing its first ever community hackathon!',
            'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.',
            'timestamp': 1425704400,
            'upload_date': '20150307',
-            'uploader': 'CBCC-NEW',
+            'thumbnail': 'https://i.cbc.ca/ais/1.2985700,1717262248558/full/max/0/default.jpg',
            'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg',
            'chapters': [],
            'duration': 494.811,
-            'categories': ['AudioMobile/All in a Weekend Montreal'],
+            'categories': ['All in a Weekend Montreal'],
-            'tags': 'count:8',
+            'tags': 'count:11',
            'location': 'Quebec',
            'series': 'All in a Weekend Montreal',
            'season': 'Season 2015',
            'season_number': 2015,
            'media_type': 'Excerpt',
            'genres': ['Other'],
        },
    }, {
        'url': 'https://www.cbc.ca/player/play/1.1711287',
        'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',
        'info_dict': {
-            'id': '2164402062',
+            'id': '1.1711287',
            'ext': 'mp4',
            'title': 'Cancer survivor four times over',
            'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
            'timestamp': 1320410746,
            'upload_date': '20111104',
-            'uploader': 'CBCC-NEW',
+            'thumbnail': 'https://i.cbc.ca/ais/1.1711287,1717139372111/full/max/0/default.jpg',
            'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg',
            'chapters': [],
            'duration': 186.867,
            'series': 'CBC News: Windsor at 6:00',
-            'categories': ['News/Canada/Windsor'],
+            'categories': ['Windsor'],
            'location': 'Windsor',
-            'tags': ['cancer'],
+            'tags': ['Cancer', 'News/Canada/Windsor', 'Windsor'],
            'creators': ['Allison Johnson'],
            'media_type': 'Excerpt',
            'genres': ['News'],
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        # Has subtitles
        # These broadcasts expire after ~1 month; a new test URL can be found here:
        # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast
-        'url': 'https://www.cbc.ca/player/play/1.7159484',
+        'url': 'https://www.cbc.ca/player/play/video/9.6424403',
-        'md5': '6ed6cd0fc2ef568d2297ba68a763d455',
+        'md5': '8025909eaffcf0adf59922904def9a5e',
        'info_dict': {
-            'id': '2324213316001',
+            'id': '9.6424403',
            'ext': 'mp4',
-            'title': 'The National | School boards sue social media giants',
+            'title': 'The National | N.W.T. wildfire emergency',
-            'description': 'md5:4b4db69322fa32186c3ce426da07402c',
+            'description': 'md5:ada33d36d1df69347ed575905bfd496c',
-            'timestamp': 1711681200,
+            'timestamp': 1718589600,
-            'duration': 2743.400,
+            'duration': 2692.833,
-            'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
+            'subtitles': {
-            'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/607/559/thumbnail.jpeg',
+                'en-US': [{
-            'uploader': 'CBCC-NEW',
+                    'name': 'English Captions',
                    'url': 'https://cbchls.akamaized.net/delivery/news-shows/2024/06/17/NAT_JUN16-00-55-00/NAT_JUN16_cc.vtt',
                }],
            },
            'thumbnail': 'https://i.cbc.ca/ais/6272b5c6-5e78-4c05-915d-0e36672e33d1,1714756287822/full/max/0/default.jpg',
            'chapters': 'count:5',
-            'upload_date': '20240329',
+            'upload_date': '20240617',
-            'categories': 'count:4',
+            'categories': ['News', 'The National', 'The National Latest Broadcasts'],
            'series': 'The National - Full Show',
-            'tags': 'count:1',
+            'tags': ['The National'],
            'creators': ['News'],
            'location': 'Canada',
            'media_type': 'Full Program',
            'genres': ['News'],
        },
    }, {
        'url': 'https://www.cbc.ca/player/play/video/1.7194274',
        'md5': '188b96cf6bdcb2540e178a6caa957128',
        'info_dict': {
-            'id': '2334524995812',
+            'id': '1.7194274',
            'ext': 'mp4',
            'title': '#TheMoment a rare white spirit moose was spotted in Alberta',
            'description': 'md5:18ae269a2d0265c5b0bbe4b2e1ac61a3',
            'timestamp': 1714788791,
            'duration': 77.678,
            'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
-            'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/201/543/THE_MOMENT.jpg',
+            'thumbnail': 'https://i.cbc.ca/ais/1.7194274,1717224990425/full/max/0/default.jpg',
-            'uploader': 'CBCC-NEW',
+            'chapters': [],
            'chapters': 'count:0',
            'upload_date': '20240504',
            'categories': 'count:3',
            'series': 'The National',
-            'tags': 'count:15',
+            'tags': 'count:17',
            'creators': ['encoder'],
            'location': 'Canada',
            'media_type': 'Excerpt',
            'upload_date': '20240504',
            'genres': ['News'],
        },
    }, {
        'url': 'https://www.cbc.ca/player/play/video/9.6427282',
        'info_dict': {
            'id': '9.6427282',
            'ext': 'mp4',
            'title': 'Men\'s Soccer - Argentina vs Morocco',
            'description': 'Argentina faces Morocco on the football pitch at Saint Etienne Stadium.',
            'series': 'CBC Sports',
            'media_type': 'Event Coverage',
            'thumbnail': 'https://i.cbc.ca/ais/a4c5c0c2-99fa-4bd3-8061-5a63879c1b33,1718828053500/full/max/0/default.jpg',
            'timestamp': 1721825400.0,
            'upload_date': '20240724',
            'duration': 10568.0,
            'chapters': [],
            'genres': [],
            'tags': ['2024 Paris Olympic Games'],
            'categories': ['Olympics Summer Soccer', 'Summer Olympics Replays', 'Summer Olympics Soccer Replays'],
            'location': 'Canada',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://www.cbc.ca/player/play/video/9.6459530',
        'md5': '6c1bb76693ab321a2e99c347a1d5ecbc',
        'info_dict': {
            'id': '9.6459530',
            'ext': 'mp4',
            'title': 'Parts of Jasper incinerated as wildfire rages',
            'description': 'md5:6f1caa8d128ad3f629257ef5fecf0962',
            'series': 'The National',
            'media_type': 'Excerpt',
            'thumbnail': 'https://i.cbc.ca/ais/507c0086-31a2-494d-96e4-bffb1048d045,1721953984375/full/max/0/default.jpg',
            'timestamp': 1721964091.012,
            'upload_date': '20240726',
            'duration': 952.285,
            'chapters': [],
            'genres': [],
            'tags': 'count:23',
            'categories': ['News (FAST)', 'News', 'The National', 'TV News Shows', 'The National '],
        },
    }, {
        'url': 'https://www.cbc.ca/player/play/video/9.6420651',
        'md5': '71a850c2c6ee5e912de169f5311bb533',
        'info_dict': {
            'id': '9.6420651',
            'ext': 'mp4',
            'title': 'Is it a breath of fresh air? Measuring air quality in Edmonton',
            'description': 'md5:3922b92cc8b69212d739bd9dd095b1c3',
            'series': 'CBC News Edmonton',
            'media_type': 'Excerpt',
            'thumbnail': 'https://i.cbc.ca/ais/73c4ab9c-7ad4-46ee-bb9b-020fdc01c745,1718214547576/full/max/0/default.jpg',
            'timestamp': 1718220065.768,
            'upload_date': '20240612',
            'duration': 286.086,
            'chapters': [],
            'genres': ['News'],
            'categories': ['News', 'Edmonton'],
            'tags': 'count:7',
            'location': 'Edmonton',
        },
    }, {
        'url': 'cbcplayer:1.7159484',
@@ -307,23 +376,113 @@ class CBCPlayerIE(InfoExtractor):
        'only_matching': True,
    }]
    def _parse_param(self, asset_data, name):
        # Look up the 'value' of the entry in asset_data['params'] whose 'name' equals `name`.
        # NOTE(review): `{str}` and `any` are traverse_obj path steps -- presumably "keep only
        # str values" and "return the first match, else None"; confirm against traverse_obj docs.
        return traverse_obj(asset_data, ('params', lambda _, v: v['name'] == name, 'value', {str}, any))
    def _real_extract(self, url):
        # Extract a cbc.ca player clip: download the player page, read the clip object at
        # window.__INITIAL_STATE__['video']['currentClip'], then build formats, subtitles,
        # chapters and metadata from it.
        # NOTE(review): this text is a unified diff ('--- a/...' / '+++ b/...' header). Lines
        # starting with '-' are removed by the commit, lines starting with '+' are added; the
        # markers are not Python. Some added lines appear without a marker -- verify against the
        # real file before relying on which version a given line belongs to.
        video_id = self._match_id(url)
-        if '.' in video_id:
+        webpage = self._download_webpage(f'https://www.cbc.ca/player/play/{video_id}', video_id)
-            webpage = self._download_webpage(f'https://www.cbc.ca/player/play/{video_id}', video_id)
+        data = self._search_json(
-            video_id = self._search_json(
+            r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)['video']['currentClip']
-                r'window\.__INITIAL_STATE__\s*=', webpage,
+        assets = traverse_obj(
-                'initial state', video_id)['video']['currentClip']['mediaId']
+            data, ('media', 'assets', lambda _, v: url_or_none(v['key']) and v['type']))
        # Fallback: no usable assets were listed but a string mediaId exists, so delegate to the
        # ThePlatform extractor through a url_transparent result.
        if not assets and (media_id := traverse_obj(data, ('mediaId', {str}))):
            # XXX: Deprecated; CBC is migrating off of ThePlatform
            return {
                '_type': 'url_transparent',
                'ie_key': 'ThePlatform',
                'url': smuggle_url(
                    f'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/{media_id}?mbr=true&formats=MPEG4,FLV,MP3', {
                        'force_smil_url': True,
                    }),
                'id': media_id,
                '_format_sort_fields': ('res', 'proto'),  # Prioritize direct http formats over HLS
            }
        # Treated as live only when media.streamType is exactly the string 'Live'
        is_live = traverse_obj(data, ('media', 'streamType', {str})) == 'Live'
        formats, subtitles = [], {}
        # Collect text tracks that have a valid 'src' URL, grouped by language ('und' when absent)
        for sub in traverse_obj(data, ('media', 'textTracks', lambda _, v: url_or_none(v['src']))):
            subtitles.setdefault(sub.get('language') or 'und', []).append({
                'url': sub['src'],
                'name': sub.get('label'),
            })
        for asset in assets:
            asset_key = asset['key']
            asset_type = asset['type']
            # Only 'medianet' assets are handled; any other type is reported and skipped
            if asset_type != 'medianet':
                self.report_warning(f'Skipping unsupported asset type "{asset_type}": {asset_key}')
                continue
            # asset_key passed the url_or_none filter above; it is fetched as a JSON document
            # that carries the media 'url' and a 'params' list (see _parse_param)
            asset_data = self._download_json(asset_key, video_id, f'Downloading {asset_type} JSON')
            ext = mimetype2ext(self._parse_param(asset_data, 'contentType'))
            if ext == 'm3u8':
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    asset_data['url'], video_id, 'mp4', m3u8_id='hls', live=is_live)
                formats.extend(fmts)
                # Avoid slow/error-prone webvtt-over-m3u8 if direct https vtt is available
                if not subtitles:
                    self._merge_subtitles(subs, target=subtitles)
                # The direct-mp4 probe below is skipped for live streams and empty manifests
                if is_live or not fmts:
                    continue
                # Check for direct https mp4 format
                # Candidate is the video format (vcodec != 'none', truthy 'tbr') with the highest
                # 'tbr': the list is sorted ascending by tbr and the last element is taken
                best_video_fmt = traverse_obj(fmts, (
                    lambda _, v: v.get('vcodec') != 'none' and v['tbr'], all,
                    {functools.partial(sorted, key=lambda x: x['tbr'])}, -1, {dict})) or {}
                # Base URL is everything up to (and including) the '/' that precedes 'hdntl='
                base_url = self._search_regex(
                    r'(https?://[^?#]+?/)hdntl=', best_video_fmt.get('url'), 'base url', default=None)
                if not base_url or '/live/' in base_url:
                    continue
                mp4_url = base_url + replace_extension(url_basename(best_video_fmt['url']), 'mp4')
                # The guessed mp4 URL is only added if a HEAD request to it succeeds
                # NOTE(review): presumably _request_webpage returns a falsy value on failure
                # because fatal=False -- confirm
                if self._request_webpage(
                        HEADRequest(mp4_url), video_id, 'Checking for https format',
                        errnote=False, fatal=False):
                    formats.append({
                        **best_video_fmt,
                        'url': mp4_url,
                        'format_id': 'https-mp4',
                        'protocol': 'https',
                        'manifest_url': None,
                        'acodec': None,
                    })
            else:
                # Non-HLS asset: use the asset URL directly; flagged as audio-only (vcodec 'none')
                # when the 'mediaType' param equals 'audio'
                formats.append({
                    'url': asset_data['url'],
                    'ext': ext,
                    'vcodec': 'none' if self._parse_param(asset_data, 'mediaType') == 'audio' else None,
                })
        # startTime/endTime are divided by 1000 (scale=1000) -- presumably milliseconds to seconds.
        # NOTE(review): float(...) never returns None, so the filter presumably relies on
        # traverse_obj discarding entries whose lambda raises (missing or non-numeric
        # startTime) -- TODO confirm
        chapters = traverse_obj(data, (
            'media', 'chapters', lambda _, v: float(v['startTime']) is not None, {
                'start_time': ('startTime', {functools.partial(float_or_none, scale=1000)}),
                'end_time': ('endTime', {functools.partial(float_or_none, scale=1000)}),
                'title': ('name', {str}),
            }))
        # Filter out pointless single chapters with start_time==0 and no end_time
        if len(chapters) == 1 and not (chapters[0].get('start_time') or chapters[0].get('end_time')):
            chapters = []
        return {
-            '_type': 'url_transparent',
+            **traverse_obj(data, {
-            'ie_key': 'ThePlatform',
+                'title': ('title', {str}),
-            'url': smuggle_url(
+                'description': ('description', {str.strip}),
-                f'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/{video_id}?mbr=true&formats=MPEG4,FLV,MP3', {
+                'thumbnail': ('image', 'url', {url_or_none}, {functools.partial(update_url, query=None)}),
-                    'force_smil_url': True,
+                'timestamp': ('publishedAt', {functools.partial(float_or_none, scale=1000)}),
-                }),
+                'media_type': ('media', 'clipType', {str}),
                'series': ('showName', {str}),
                'season_number': ('media', 'season', {int_or_none}),
                # Duration is replaced with None for live streams
                'duration': ('media', 'duration', {float_or_none}, {lambda x: None if is_live else x}),
                'location': ('media', 'region', {str}),
                'tags': ('tags', ..., 'name', {str}),
                'genres': ('media', 'genre', all),
                'categories': ('categories', ..., 'name', {str}),
            }),
            'id': video_id,
-            '_format_sort_fields': ('res', 'proto'),  # Prioritize direct http formats over HLS
+            'formats': formats,
            'subtitles': subtitles,
            'chapters': chapters,
            'is_live': is_live,
        }