ITV BTCC new pages' URL update (articles instead of races)

These are not my changes; they come from @franhp and didn't get merged into yt-dl in time. They add support for the new BTCC page schema used from 2019 onward (/articles/ instead of /races/).
2024-11-26 02:55:17 +00:00 · 2020-11-08 14:07:12 +00:00 · 2020-11-08 14:07:12 +00:00 · 6857df609b
commit 6857df609b
parent 651bae3d23
1 changed files with 16 additions and 5 deletions
--- a/youtube_dlc/extractor/itv.py
+++ b/youtube_dlc/extractor/itv.py
@ -20,6 +20,7 @@
    merge_dicts,
    parse_duration,
    smuggle_url,
+    try_get,
    url_or_none,
    xpath_with_ns,
    xpath_element,
@ -280,12 +281,12 @@ def extract_subtitle(sub_url):
 class ITVBTCCIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _TEST = {
-        'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch',
+        'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action',
        'info_dict': {
-            'id': 'btcc-2018-all-the-action-from-brands-hatch',
-            'title': 'BTCC 2018: All the action from Brands Hatch',
+            'id': 'btcc-2019-brands-hatch-gp-race-action',
+            'title': 'BTCC 2019: Brands Hatch GP race action',
        },
-        'playlist_mincount': 9,
+        'playlist_mincount': 12,
    }
    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s'

@ -294,6 +295,16 @@ def _real_extract(self, url):

        webpage = self._download_webpage(url, playlist_id)

+        json_map = try_get(self._parse_json(self._html_search_regex(
+            '(?s)<script[^>]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)</script>', webpage, 'json_map'), playlist_id),
+            lambda x: x['props']['pageProps']['article']['body']['content']) or []
+
+        # Discard empty objects
+        video_ids = []
+        for video in json_map:
+            if video['data'].get('id'):
+                video_ids.append(video['data']['id'])
+
        entries = [
            self.url_result(
                smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {
@ -305,7 +316,7 @@ def _real_extract(self, url):
                    'referrer': url,
                }),
                ie=BrightcoveNewIE.ie_key(), video_id=video_id)
-            for video_id in re.findall(r'data-video-id=["\'](\d+)', webpage)]
+            for video_id in video_ids]

        title = self._og_search_title(webpage, fatal=False)