[extractor/npr] Use stream url from json-ld (#3455)

Closes #1934. Authored by: r5d
2022-06-02 20:51:11 -04:00 · 2022-06-02 20:51:11 -04:00 · e50c3500b4
parent 09d02ea429
commit e50c3500b4
2 changed files with 20 additions and 1 deletions
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1461,7 +1461,7 @@ class InfoExtractor:
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
-                'url': url_or_none(e.get('contentUrl')),
+                'url': traverse_obj(e, 'contentUrl', 'embedUrl', expected_type=url_or_none),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': url}
@@ -1529,6 +1529,8 @@ class InfoExtractor:
                    })
                    if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
                        extract_video_object(e['video'][0])
+                    elif traverse_obj(e, ('subjectOf', 0, '@type')) == 'VideoObject':
+                        extract_video_object(e['subjectOf'][0])
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
--- a/yt_dlp/extractor/npr.py
+++ b/yt_dlp/extractor/npr.py
@@ -51,6 +51,15 @@ class NprIE(InfoExtractor):
        # multimedia, no formats, stream
        'url': 'https://www.npr.org/2020/02/14/805476846/laura-stevenson-tiny-desk-concert',
        'only_matching': True,
+    }, {
+        'url': 'https://www.npr.org/2022/03/15/1084896560/bonobo-tiny-desk-home-concert',
+        'info_dict': {
+            'id': '1086468851',
+            'ext': 'mp4',
+            'title': 'Bonobo: Tiny Desk (Home) Concert',
+            'duration': 1061,
+            'thumbnail': r're:^https?://media.npr.org/assets/img/.*\.jpg$',
+        },
    }]

    def _real_extract(self, url):
@@ -65,6 +74,10 @@ class NprIE(InfoExtractor):
            })['list']['story'][0]
        playlist_title = story.get('title', {}).get('$text')

+        # Fetch the JSON-LD from the npr page.
+        json_ld = self._search_json_ld(
+            self._download_webpage(url, playlist_id), playlist_id, 'NewsArticle', fatal=False)
+
        KNOWN_FORMATS = ('threegp', 'm3u8', 'smil', 'mp4', 'mp3')
        quality = qualities(KNOWN_FORMATS)

@@ -110,6 +123,10 @@ class NprIE(InfoExtractor):
                formats.extend(self._extract_m3u8_formats(
                    stream_url, stream_id, 'mp4', 'm3u8_native',
                    m3u8_id='hls', fatal=False))
+
+            if not formats and json_ld.get('url'):
+                formats.extend(self._extract_m3u8_formats(json_ld['url'], media_id, 'mp4', m3u8_id='hls', fatal=False))
+
            self._sort_formats(formats)

            entries.append({