Mirror of https://github.com/yt-dlp/yt-dlp.git (synced 2024-12-22 06:00:00 +00:00)
[ie/NHKRadiru] Fix extractor (#10106)
Closes #10105
Authored by: garret1317
parent e53e56b735 · commit b8e2a5e0e1
1 changed file with 171 additions and 69 deletions
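The commit reworks NhkRadiruIE (in yt_dlp/extractor/nhk.py) to read regular programmes from NHK's new radio-api ondemand endpoint while keeping the old JSON API for news, and replaces the extended-description helper with a full extended-metadata helper. To try the fixed extractor, here is a minimal sketch using yt-dlp's Python API; the URL is one of the test URLs added in this diff (episodes expire, so substitute a currently listed one if it is gone), and the printed fields are just a small subset of what the extractor returns:

import yt_dlp

# One of the new _TESTS URLs from this commit; NHK on-demand episodes expire,
# so swap in any live Radiru URL if this one no longer resolves.
URL = 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_4003239'

with yt_dlp.YoutubeDL({'skip_download': True}) as ydl:
    info = ydl.extract_info(URL, download=False)  # metadata only, no download
    print(info.get('title'), info.get('series_id'), info.get('channel'))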
yt_dlp/extractor/nhk.py

@@ -4,6 +4,7 @@
 from ..utils import (
     ExtractorError,
     clean_html,
+    filter_dict,
     get_element_by_class,
     int_or_none,
     join_nonempty,
@@ -590,21 +591,22 @@ class NhkRadiruIE(InfoExtractor):
     IE_DESC = 'NHK らじる (Radiru/Rajiru)'
     _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?'
     _TESTS = [{
-        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3926210',
-        'skip': 'Episode expired on 2024-02-24',
+        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_4003239',
+        'skip': 'Episode expired on 2024-06-09',
         'info_dict': {
-            'title': 'ジャズ・トゥナイト シリーズJAZZジャイアンツ 56 ジョニー・ホッジス',
-            'id': '0449_01_3926210',
+            'title': 'ジャズ・トゥナイト ジャズ「Night and Day」特集',
+            'id': '0449_01_4003239',
             'ext': 'm4a',
+            'uploader': 'NHK FM 東京',
+            'description': 'md5:ad05f3c3f3f6e99b2e69f9b5e49551dc',
             'series': 'ジャズ・トゥナイト',
-            'uploader': 'NHK-FM',
-            'channel': 'NHK-FM',
+            'channel': 'NHK FM 東京',
             'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg',
-            'release_date': '20240217',
-            'description': 'md5:a456ee8e5e59e6dd2a7d32e62386e811',
-            'timestamp': 1708185600,
-            'release_timestamp': 1708178400,
-            'upload_date': '20240217',
+            'upload_date': '20240601',
+            'series_id': '0449_01',
+            'release_date': '20240601',
+            'timestamp': 1717257600,
+            'release_timestamp': 1717250400,
         },
     }, {
         # playlist, airs every weekday so it should _hopefully_ be okay forever
@@ -613,71 +615,145 @@ class NhkRadiruIE(InfoExtractor):
             'id': '0458_01',
             'title': 'ベストオブクラシック',
             'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。',
-            'channel': 'NHK-FM',
-            'uploader': 'NHK-FM',
             'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg',
+            'series_id': '0458_01',
+            'uploader': 'NHK FM',
+            'channel': 'NHK FM',
+            'series': 'ベストオブクラシック',
         },
         'playlist_mincount': 3,
     }, {
         # one with letters in the id
-        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F300_06_3738470',
-        'note': 'Expires on 2024-03-31',
+        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F683_01_3910688',
+        'note': 'Expires on 2025-03-31',
         'info_dict': {
-            'id': 'F300_06_3738470',
+            'id': 'F683_01_3910688',
             'ext': 'm4a',
-            'title': '有島武郎「一房のぶどう」',
-            'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n(2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より)',
-            'channel': 'NHKラジオ第1、NHK-FM',
-            'uploader': 'NHKラジオ第1、NHK-FM',
-            'timestamp': 1635757200,
-            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg',
-            'release_date': '20161207',
-            'series': 'らじる文庫 by ラジオ深夜便 ',
-            'release_timestamp': 1481126700,
-            'upload_date': '20211101',
+            'title': '夏目漱石「文鳥」第1回',
+            'series': '【らじる文庫】夏目漱石「文鳥」(全4回)',
+            'series_id': 'F683_01',
+            'description': '朗読:浅井理アナウンサー',
+            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F683/img/roudoku_05_rod_640.jpg',
+            'upload_date': '20240106',
+            'release_date': '20240106',
+            'uploader': 'NHK R1',
+            'release_timestamp': 1704511800,
+            'channel': 'NHK R1',
+            'timestamp': 1704512700,
         },
-        'expected_warnings': ['Unable to download JSON metadata', 'Failed to get extended description'],
+        'expected_warnings': ['Unable to download JSON metadata',
+                              'Failed to get extended metadata. API returned Error 1: Invalid parameters'],
     }, {
         # news
-        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109',
-        'skip': 'Expires on 2023-04-17',
+        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_4012173',
         'info_dict': {
-            'id': 'F261_01_3855109',
+            'id': 'F261_01_4012173',
             'ext': 'm4a',
             'channel': 'NHKラジオ第1',
             'uploader': 'NHKラジオ第1',
-            'timestamp': 1681635900,
-            'release_date': '20230416',
             'series': 'NHKラジオニュース',
-            'title': '午後6時のNHKニュース',
+            'title': '午前0時のNHKニュース',
             'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
-            'upload_date': '20230416',
-            'release_timestamp': 1681635600,
+            'release_timestamp': 1718290800,
+            'release_date': '20240613',
+            'timestamp': 1718291400,
+            'upload_date': '20240613',
         },
+    }, {
+        # fallback when extended metadata fails
+        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=2834_01_4009298',
+        'skip': 'Expires on 2024-06-07',
+        'info_dict': {
+            'id': '2834_01_4009298',
+            'title': 'まち☆キラ!開成町特集',
+            'ext': 'm4a',
+            'release_date': '20240531',
+            'upload_date': '20240531',
+            'series': 'はま☆キラ!',
+            'thumbnail': 'https://www.nhk.or.jp/prog/img/2834/g2834.jpg',
+            'channel': 'NHK R1,FM',
+            'description': '',
+            'timestamp': 1717123800,
+            'uploader': 'NHK R1,FM',
+            'release_timestamp': 1717120800,
+            'series_id': '2834_01',
+        },
+        'expected_warnings': ['Failed to get extended metadata. API returned empty list.'],
     }]
 
     _API_URL_TMPL = None
 
-    def _extract_extended_description(self, episode_id, episode):
-        service, _, area = traverse_obj(episode, ('aa_vinfo2', {str}, {lambda x: (x or '').partition(',')}))
-        aa_vinfo3 = traverse_obj(episode, ('aa_vinfo3', {str}))
+    def _extract_extended_metadata(self, episode_id, aa_vinfo):
+        service, _, area = traverse_obj(aa_vinfo, (2, {str}, {lambda x: (x or '').partition(',')}))
         detail_url = try_call(
-            lambda: self._API_URL_TMPL.format(service=service, area=area, dateid=aa_vinfo3))
+            lambda: self._API_URL_TMPL.format(area=area, service=service, dateid=aa_vinfo[3]))
         if not detail_url:
-            return
+            return {}
 
-        full_meta = traverse_obj(
-            self._download_json(detail_url, episode_id, 'Downloading extended metadata', fatal=False),
-            ('list', service, 0, {dict})) or {}
-        return join_nonempty('subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta)
+        response = self._download_json(
+            detail_url, episode_id, 'Downloading extended metadata',
+            'Failed to download extended metadata', fatal=False, expected_status=400)
+        if not response:
+            return {}
+
+        if error := traverse_obj(response, ('error', {dict})):
+            self.report_warning(
+                'Failed to get extended metadata. API returned '
+                f'Error {join_nonempty("code", "message", from_dict=error, delim=": ")}')
+            return {}
+
+        full_meta = traverse_obj(response, ('list', service, 0, {dict}))
+        if not full_meta:
+            self.report_warning('Failed to get extended metadata. API returned empty list.')
+            return {}
+
+        station = ' '.join(traverse_obj(full_meta, (('service', 'area'), 'name', {str}))) or None
+        thumbnails = [{
+            'id': str(id_),
+            'preference': 1 if id_.startswith('thumbnail') else -2 if id_.startswith('logo') else -1,
+            **traverse_obj(thumb, {
+                'url': 'url',
+                'width': ('width', {int_or_none}),
+                'height': ('height', {int_or_none}),
+            }),
+        } for id_, thumb in traverse_obj(full_meta, ('images', {dict.items}, lambda _, v: v[1]['url']))]
+
+        return filter_dict({
+            'channel': station,
+            'uploader': station,
+            'description': join_nonempty(
+                'subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta),
+            'thumbnails': thumbnails,
+            **traverse_obj(full_meta, {
+                'title': ('title', {str}),
+                'timestamp': ('end_time', {unified_timestamp}),
+                'release_timestamp': ('start_time', {unified_timestamp}),
+            }),
+        })
 
-    def _extract_episode_info(self, headline, programme_id, series_meta):
+    def _extract_episode_info(self, episode, programme_id, series_meta):
+        episode_id = f'{programme_id}_{episode["id"]}'
+        aa_vinfo = traverse_obj(episode, ('aa_contents_id', {lambda x: x.split(';')}))
+        extended_metadata = self._extract_extended_metadata(episode_id, aa_vinfo)
+        fallback_start_time, _, fallback_end_time = traverse_obj(
+            aa_vinfo, (4, {str}, {lambda x: (x or '').partition('_')}))
+
+        return {
+            **series_meta,
+            'id': episode_id,
+            'formats': self._extract_m3u8_formats(episode.get('stream_url'), episode_id, fatal=False),
+            'container': 'm4a_dash',  # force fixup, AAC-only HLS
+            'was_live': True,
+            'title': episode.get('program_title'),
+            'description': episode.get('program_sub_title'),  # fallback
+            'timestamp': unified_timestamp(fallback_end_time),
+            'release_timestamp': unified_timestamp(fallback_start_time),
+            **extended_metadata,
+        }
+
+    def _extract_news_info(self, headline, programme_id, series_meta):
         episode_id = f'{programme_id}_{headline["headline_id"]}'
         episode = traverse_obj(headline, ('file_list', 0, {dict}))
-        description = self._extract_extended_description(episode_id, episode)
-        if not description:
-            self.report_warning('Failed to get extended description, falling back to summary')
-            description = traverse_obj(episode, ('file_title_sub', {str}))
 
         return {
             **series_meta,
@@ -687,9 +763,9 @@ def _extract_episode_info(self, headline, programme_id, series_meta):
             'was_live': True,
             'series': series_meta.get('title'),
             'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'),
-            'description': description,
             **traverse_obj(episode, {
-                'title': 'file_title',
+                'title': ('file_title', {str}),
+                'description': ('file_title_sub', {str}),
                 'timestamp': ('open_time', {unified_timestamp}),
                 'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}),
             }),
@@ -706,32 +782,58 @@ def _real_extract(self, url):
         site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline')
         programme_id = f'{site_id}_{corner_id}'
 
-        if site_id == 'F261':
-            json_url = 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json'
-        else:
-            json_url = f'https://www.nhk.or.jp/radioondemand/json/{site_id}/bangumi_{programme_id}.json'
-
-        meta = self._download_json(json_url, programme_id)['main']
-
-        series_meta = traverse_obj(meta, {
-            'title': 'program_name',
-            'channel': 'media_name',
-            'uploader': 'media_name',
-            'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}),
-        }, get_all=False)
-
-        if headline_id:
-            return self._extract_episode_info(
-                traverse_obj(meta, (
-                    'detail_list', lambda _, v: v['headline_id'] == headline_id), get_all=False),
-                programme_id, series_meta)
-
-        def entries():
-            for headline in traverse_obj(meta, ('detail_list', ..., {dict})):
-                yield self._extract_episode_info(headline, programme_id, series_meta)
-
-        return self.playlist_result(
-            entries(), programme_id, playlist_description=meta.get('site_detail'), **series_meta)
+        if site_id == 'F261':  # XXX: News programmes use old API (for now?)
+            meta = self._download_json(
+                'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json', programme_id)['main']
+            series_meta = traverse_obj(meta, {
+                'title': ('program_name', {str}),
+                'channel': ('media_name', {str}),
+                'uploader': ('media_name', {str}),
+                'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}),
+            }, get_all=False)
+
+            if headline_id:
+                headline = traverse_obj(
+                    meta, ('detail_list', lambda _, v: v['headline_id'] == headline_id, any))
+                if not headline:
+                    raise ExtractorError('Content not found; it has most likely expired', expected=True)
+                return self._extract_news_info(headline, programme_id, series_meta)
+
+            def news_entries():
+                for headline in traverse_obj(meta, ('detail_list', ..., {dict})):
+                    yield self._extract_news_info(headline, programme_id, series_meta)
+
+            return self.playlist_result(
+                news_entries(), programme_id, description=meta.get('site_detail'), **series_meta)
+
+        meta = self._download_json(
+            'https://www.nhk.or.jp/radio-api/app/v1/web/ondemand/series', programme_id, query={
+                'site_id': site_id,
+                'corner_site_id': corner_id,
+            })
+
+        fallback_station = join_nonempty('NHK', traverse_obj(meta, ('radio_broadcast', {str})), delim=' ')
+        series_meta = {
+            'series': join_nonempty('title', 'corner_name', delim=' ', from_dict=meta),
+            'series_id': programme_id,
+            'thumbnail': traverse_obj(meta, ('thumbnail_url', {url_or_none})),
+            'channel': fallback_station,
+            'uploader': fallback_station,
+        }
+
+        if headline_id:
+            episode = traverse_obj(meta, ('episodes', lambda _, v: v['id'] == int(headline_id), any))
+            if not episode:
+                raise ExtractorError('Content not found; it has most likely expired', expected=True)
+            return self._extract_episode_info(episode, programme_id, series_meta)
+
+        def entries():
+            for episode in traverse_obj(meta, ('episodes', ..., {dict})):
+                yield self._extract_episode_info(episode, programme_id, series_meta)
+
+        return self.playlist_result(
+            entries(), programme_id, title=series_meta.get('series'),
+            description=meta.get('series_description'), **series_meta)
 
 
 class NhkRadioNewsPageIE(InfoExtractor):
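For reference, the non-news path now builds its metadata from the radio-api series endpoint shown above rather than the static radioondemand JSON files. A rough standalone sketch of that request follows, assuming the endpoint accepts the same site_id/corner_site_id query parameters the extractor sends and returns the title, radio_broadcast and episodes fields the new code traverses; the extractor itself goes through yt-dlp's JSON downloader rather than requests:

import requests

# Query the series endpoint used by the updated extractor. The parameter names come
# straight from the diff; the response field names are inferred from its traverse_obj calls.
resp = requests.get(
    'https://www.nhk.or.jp/radio-api/app/v1/web/ondemand/series',
    params={'site_id': '0449', 'corner_site_id': '01'},  # site/corner IDs as parsed from the player URL
    timeout=10,
)
resp.raise_for_status()
series = resp.json()

print(series.get('title'), series.get('radio_broadcast'))
for episode in series.get('episodes', []):
    # each episode carries an HLS stream_url plus an aa_contents_id used for extended metadata
    print(episode.get('id'), episode.get('program_title'), episode.get('stream_url'))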