[nbc] improve extraction (closes #12364)

2024-11-18 01:35:12 +00:00 · 2017-05-07 08:58:34 +01:00 · 2017-05-07 08:58:34 +01:00 · 2eeb588efe
commit 2eeb588efe
parent 4ac0f573ef
1 changed file with 31 additions and 67 deletions
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@ -17,7 +17,7 @@
 class NBCIE(AdobePassIE):
-    _VALID_URL = r'https?://(?:www\.)?nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'
+    _VALID_URL = r'https?://(?:www\.)?nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+)'
    _TESTS = [
        {
@ -36,16 +36,6 @@ class NBCIE(AdobePassIE):
                'skip_download': True,
            },
        },
        {
            'url': 'http://www.nbc.com/the-tonight-show/episodes/176',
            'info_dict': {
                'id': '176',
                'ext': 'flv',
                'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen',
                'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.',
            },
            'skip': '404 Not Found',
        },
        {
            'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821',
            'info_dict': {
@ -63,11 +53,6 @@ class NBCIE(AdobePassIE):
            },
            'skip': 'Only works from US',
        },
        {
            # This video has expired but with an escaped embedURL
            'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
            'only_matching': True,
        },
        {
            # HLS streams require the 'hdnea3' cookie
            'url': 'http://www.nbc.com/Kings/video/goliath/n1806',
@ -89,58 +74,37 @@ class NBCIE(AdobePassIE):
    def _real_extract(self, url):
        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        video_data = self._download_json(
-        info = {
+            'https://api.nbc.com/v3/videos', video_id, query={
-            '_type': 'url_transparent',
+                'filter[permalink]': url,
-            'ie_key': 'ThePlatform',
+            })['data'][0]['attributes']
-            'id': video_id,
+        query = {
            'mbr': 'true',
            'manifest': 'm3u',
        }
        video_id = video_data['guid']
        title = video_data['title']
        if video_data.get('entitlement') == 'auth':
            resource = self._get_mvpd_resource(
                'nbcentertainment', title, video_id,
                video_data.get('vChipRating'))
            query['auth'] = self._extract_mvpd_auth(
                url, video_id, 'nbcentertainment', resource)
        theplatform_url = smuggle_url(update_url_query(
            'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id,
            query), {'force_smil_url': True})
        return {
            '_type': 'url_transparent',
            'id': video_id,
            'title': title,
            'url': theplatform_url,
            'description': video_data.get('description'),
            'keywords': video_data.get('keywords'),
            'season_number': int_or_none(video_data.get('seasonNumber')),
            'episode_number': int_or_none(video_data.get('episodeNumber')),
            'series': video_data.get('showName'),
            'ie_key': 'ThePlatform',
        }
        video_data = None
        preload = self._search_regex(
            r'PRELOAD\s*=\s*({.+})', webpage, 'preload data', default=None)
        if preload:
            preload_data = self._parse_json(preload, video_id)
            path = compat_urllib_parse_urlparse(url).path.rstrip('/')
            entity_id = preload_data.get('xref', {}).get(path)
            video_data = preload_data.get('entities', {}).get(entity_id)
        if video_data:
            query = {
                'mbr': 'true',
                'manifest': 'm3u',
            }
            video_id = video_data['guid']
            title = video_data['title']
            if video_data.get('entitlement') == 'auth':
                resource = self._get_mvpd_resource(
                    'nbcentertainment', title, video_id,
                    video_data.get('vChipRating'))
                query['auth'] = self._extract_mvpd_auth(
                    url, video_id, 'nbcentertainment', resource)
            theplatform_url = smuggle_url(update_url_query(
                'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id,
                query), {'force_smil_url': True})
            info.update({
                'id': video_id,
                'title': title,
                'url': theplatform_url,
                'description': video_data.get('description'),
                'keywords': video_data.get('keywords'),
                'season_number': int_or_none(video_data.get('seasonNumber')),
                'episode_number': int_or_none(video_data.get('episodeNumber')),
                'series': video_data.get('showName'),
            })
        else:
            theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex(
                [
                    r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
                    r'<iframe[^>]+src="((?:https?:)?//player\.theplatform\.com/[^"]+)"',
                    r'"embedURL"\s*:\s*"([^"]+)"'
                ],
                webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))
            if theplatform_url.startswith('//'):
                theplatform_url = 'http:' + theplatform_url
            info['url'] = smuggle_url(theplatform_url, {'source_url': url})
        return info
 class NBCSportsVPlayerIE(InfoExtractor):