From 0a1a8e3005f66c44bf67633dccd4df19c3fccd1a Mon Sep 17 00:00:00 2001 From: rrgomes Date: Sat, 11 May 2024 12:38:41 -0400 Subject: [PATCH] [ie/nfb] Fix extractors (#9650) Authored by: rrgomes --- yt_dlp/extractor/nfb.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/yt_dlp/extractor/nfb.py b/yt_dlp/extractor/nfb.py index 6f7872825..968c9728b 100644 --- a/yt_dlp/extractor/nfb.py +++ b/yt_dlp/extractor/nfb.py @@ -5,7 +5,6 @@ merge_dicts, parse_count, url_or_none, - urljoin, ) from ..utils.traversal import traverse_obj @@ -16,8 +15,7 @@ class NFBBaseIE(InfoExtractor): def _extract_ep_data(self, webpage, video_id, fatal=False): return self._search_json( - r'const\s+episodesData\s*=', webpage, 'episode data', video_id, - contains_pattern=r'\[\s*{(?s:.+)}\s*\]', fatal=fatal) or [] + r'episodesData\s*:', webpage, 'episode data', video_id, fatal=fatal) or {} def _extract_ep_info(self, data, video_id, slug=None): info = traverse_obj(data, (lambda _, v: video_id in v['embed_url'], { @@ -224,18 +222,14 @@ def _real_extract(self, url): # type_ can change from film to serie(s) after redirect; new slug may have episode number type_, slug = self._match_valid_url(urlh.url).group('type', 'id') - embed_url = urljoin(f'https://www.{site}.ca', self._html_search_regex( - r'<[^>]+\bid=["\']player-iframe["\'][^>]*\bsrc=["\']([^"\']+)', webpage, 'embed url')) - video_id = self._match_id(embed_url) # embed url has unique slug - player = self._download_webpage(embed_url, video_id, 'Downloading player page') - if 'MESSAGE_GEOBLOCKED' in player: - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + player_data = self._search_json( + r'window\.PLAYER_OPTIONS\[[^\]]+\]\s*=', webpage, 'player data', slug) + video_id = self._match_id(player_data['overlay']['url']) # overlay url always has unique slug formats, subtitles = self._extract_m3u8_formats_and_subtitles( - self._html_search_regex(r'source:\s*\'([^\']+)', player, 'm3u8 url'), - video_id, 'mp4', m3u8_id='hls') + player_data['source'], video_id, 'mp4', m3u8_id='hls') - if dv_source := self._html_search_regex(r'dvSource:\s*\'([^\']+)', player, 'dv', default=None): + if dv_source := url_or_none(player_data.get('dvSource')): fmts, subs = self._extract_m3u8_formats_and_subtitles( dv_source, video_id, 'mp4', m3u8_id='dv', preference=-2, fatal=False) for fmt in fmts: @@ -246,17 +240,16 @@ def _real_extract(self, url): info = { 'id': video_id, 'title': self._html_search_regex( - r'<[^>]+\bid=["\']titleHeader["\'][^>]*>\s*]*>\s*([^<]+?)\s*', + r'["\']nfb_version_title["\']\s*:\s*["\']([^"\']+)', webpage, 'title', default=None), 'description': self._html_search_regex( r'<[^>]+\bid=["\']tabSynopsis["\'][^>]*>\s*]*>\s*([^<]+)', webpage, 'description', default=None), - 'thumbnail': self._html_search_regex( - r'poster:\s*\'([^\']+)', player, 'thumbnail', default=None), + 'thumbnail': url_or_none(player_data.get('poster')), 'uploader': self._html_search_regex( - r'<[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', webpage, 'uploader', default=None), + r'<[^>]+\bitemprop=["\']director["\'][^>]*>([^<]+)', webpage, 'uploader', default=None), 'release_year': int_or_none(self._html_search_regex( - r'<[^>]+\bitemprop=["\']datePublished["\'][^>]*>([^<]+)', + r'["\']nfb_version_year["\']\s*:\s*["\']([^"\']+)', webpage, 'release_year', default=None)), } if type_ == 'film' else self._extract_ep_info(self._extract_ep_data(webpage, video_id, slug), video_id)