[extractor/noice] Add NoicePodcast extractor (#5621)

Authored by: HobbyistDev
This commit is contained in:
HobbyistDev 2022-12-08 22:58:36 +09:00 committed by GitHub
parent dfc186d422
commit 28b8f57b4b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 117 additions and 0 deletions

View File

@@ -1211,6 +1211,7 @@ from .nintendo import NintendoIE
from .nitter import NitterIE
from .njpwworld import NJPWWorldIE
from .nobelprize import NobelPrizeIE
from .noice import NoicePodcastIE
from .nonktube import NonkTubeIE
from .noodlemagazine import NoodleMagazineIE
from .noovo import NoovoIE

116
yt_dlp/extractor/noice.py Normal file
View File

@@ -0,0 +1,116 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
determine_ext,
int_or_none,
parse_iso8601,
traverse_obj,
variadic,
)
class NoicePodcastIE(InfoExtractor):
    """Extractor for episode pages under ``open.noice.id/content/<id>``.

    Metadata is read from the Next.js data embedded in the page
    (``props.pageProps.contentDetails``); a few fields fall back to HTML
    meta tags when the embedded data lacks them.
    """

    # The captured id is made of hex digits and dashes.
    _VALID_URL = r'https?://open\.noice\.id/content/(?P<id>[a-fA-F0-9-]+)'
    _TESTS = [{
        'url': 'https://open.noice.id/content/7694bb04-ff0f-40fa-a60b-5b39f29584b2',
        'info_dict': {
            'id': '7694bb04-ff0f-40fa-a60b-5b39f29584b2',
            'ext': 'm4a',
            'season': 'Season 1',
            'description': 'md5:58d1274e6857b6fbbecf47075885380d',
            'release_date': '20221115',
            'timestamp': 1668496642,
            'season_number': 1,
            'upload_date': '20221115',
            'release_timestamp': 1668496642,
            'title': 'Eps 1. Belajar dari Wishnutama: Kreatif Bukan Followers! (bersama Wishnutama)',
            'modified_date': '20221121',
            'categories': ['Bisnis dan Keuangan'],
            'duration': 3567,
            'modified_timestamp': 1669030647,
            'thumbnail': 'https://images.noiceid.cc/catalog/content-1668496302560',
            'channel_id': '9dab1024-5b92-4265-ae1c-63da87359832',
            'like_count': int,
            'channel': 'Noice Space Talks',
            'comment_count': int,
            'dislike_count': int,
            'channel_follower_count': int,
        }
    }, {
        'url': 'https://open.noice.id/content/222134e4-99f2-456f-b8a2-b8be404bf063',
        'info_dict': {
            'id': '222134e4-99f2-456f-b8a2-b8be404bf063',
            'ext': 'm4a',
            'release_timestamp': 1653488220,
            'description': 'md5:35074f6190cef52b05dd133bb2ef460e',
            'upload_date': '20220525',
            'timestamp': 1653460637,
            'release_date': '20220525',
            'thumbnail': 'https://images.noiceid.cc/catalog/content-1653460337625',
            'title': 'Eps 1: Dijodohin Sama Anak Pak RT',
            'modified_timestamp': 1669030647,
            'season_number': 1,
            'modified_date': '20221121',
            'categories': ['Cerita dan Drama'],
            'duration': 1830,
            'season': 'Season 1',
            'channel_id': '60193f6b-d24d-4b23-913b-ceed5a731e74',
            'dislike_count': int,
            'like_count': int,
            'comment_count': int,
            'channel': 'Dear Jerome',
            'channel_follower_count': int,
        }
    }]

    def _get_formats_and_subtitles(self, media_url, video_id):
        """Collect formats and subtitles for the given media URL(s).

        ``media_url`` is passed through ``variadic`` before iteration
        (presumably so that a single URL and a collection of URLs are both
        accepted -- confirm against ``variadic``). URLs whose detected
        extension is ``m3u8`` are expanded via the m3u8 helper; every other
        URL becomes a single audio-only format entry.

        Returns a ``(formats, subtitles)`` tuple.
        """
        formats, subtitles = [], {}
        for url in variadic(media_url):
            ext = determine_ext(url)
            if ext == 'm3u8':
                fmts, subs = self._extract_m3u8_formats_and_subtitles(url, video_id)
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)
            else:
                # NOTE(review): 'ext' and 'acodec' are hard-coded to mp3 here
                # regardless of the extension detected above, while the test
                # cases expect an m4a download -- confirm what the site serves.
                formats.append({
                    'url': url,
                    'ext': 'mp3',
                    'vcodec': 'none',
                    'acodec': 'mp3',
                })
        return formats, subtitles

    def _real_extract(self, url):
        """Download the episode page and build the info dict from its data."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        # Strict indexing on purpose: without contentDetails nothing below
        # can be extracted, so a missing key should surface as an error.
        nextjs_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['contentDetails']

        # The inner tuple branches over both keys, so every media URL that is
        # present ends up in the resulting list.
        media_url_list = traverse_obj(nextjs_data, (('rawContentUrl', 'url'), ))
        formats, subtitles = self._get_formats_and_subtitles(media_url_list, display_id)

        return {
            'id': nextjs_data.get('id') or display_id,
            'title': nextjs_data.get('title') or self._html_search_meta('og:title', webpage),
            'formats': formats,
            'subtitles': subtitles,
            'description': (nextjs_data.get('description') or clean_html(nextjs_data.get('htmlDescription'))
                            or self._html_search_meta(['description', 'og:description'], webpage)),
            'thumbnail': nextjs_data.get('image') or self._html_search_meta('og:image', webpage),
            'timestamp': parse_iso8601(nextjs_data.get('createdAt')),
            'release_timestamp': parse_iso8601(nextjs_data.get('publishedAt')),
            'modified_timestamp': parse_iso8601(
                nextjs_data.get('updatedAt') or self._html_search_meta('og:updated_time', webpage)),
            'duration': int_or_none(nextjs_data.get('duration')),
            'categories': traverse_obj(nextjs_data, ('genres', ..., 'name')),
            'season': nextjs_data.get('seasonName'),
            'season_number': int_or_none(nextjs_data.get('seasonNumber')),
            'channel': traverse_obj(nextjs_data, ('catalog', 'title')),
            'channel_id': traverse_obj(nextjs_data, ('catalog', 'id'), 'catalogId'),
            # The counters are optional. traverse_obj may yield None when the
            # 'meta'/'aggregations' data is absent (behaviour depends on the
            # yt-dlp version); `**None` would raise TypeError, so fall back to
            # an empty mapping instead of failing the whole extraction.
            **(traverse_obj(nextjs_data, ('meta', 'aggregations', {
                'like_count': 'likes',
                'dislike_count': 'dislikes',
                'comment_count': 'comments',
                'channel_follower_count': 'followers',
            })) or {}),
        }