0
0
Fork 0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2024-11-29 03:23:02 +00:00

[ie/BiliBiliBangumi] Fix extractors (#7337)

- Overhaul BiliBiliBangumi extractor for the site's new API
- Add BiliBiliBangumiSeason extractor
- Refactor BiliBiliBangumiMedia extractor

Closes #6701, Closes #7400
Authored by: GD-Slime
This commit is contained in:
GD-Slime 2023-07-09 06:26:03 +08:00 committed by GitHub
parent 92315c0377
commit bdd0b75e3f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 85 additions and 45 deletions

View file

@ -214,6 +214,7 @@
from .bilibili import ( from .bilibili import (
BiliBiliIE, BiliBiliIE,
BiliBiliBangumiIE, BiliBiliBangumiIE,
BiliBiliBangumiSeasonIE,
BiliBiliBangumiMediaIE, BiliBiliBangumiMediaIE,
BiliBiliSearchIE, BiliBiliSearchIE,
BilibiliCategoryIE, BilibiliCategoryIE,

View file

@ -18,6 +18,7 @@
float_or_none, float_or_none,
format_field, format_field,
int_or_none, int_or_none,
join_nonempty,
make_archive_id, make_archive_id,
merge_dicts, merge_dicts,
mimetype2ext, mimetype2ext,
@ -135,6 +136,17 @@ def _get_all_children(self, reply):
for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))): for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))):
yield from children yield from children
def _get_episodes_from_season(self, ss_id, url):
season_info = self._download_json(
'https://api.bilibili.com/pgc/web/season/section', ss_id,
note='Downloading season info', query={'season_id': ss_id},
headers={'Referer': url, **self.geo_verification_headers()})
for entry in traverse_obj(season_info, (
'result', 'main_section', 'episodes',
lambda _, v: url_or_none(v['share_url']) and v['id'])):
yield self.url_result(entry['share_url'], BiliBiliBangumiIE, f'ep{entry["id"]}')
class BiliBiliIE(BilibiliBaseIE): class BiliBiliIE(BilibiliBaseIE):
_VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)' _VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)'
@ -403,76 +415,93 @@ def _real_extract(self, url):
class BiliBiliBangumiIE(BilibiliBaseIE): class BiliBiliBangumiIE(BilibiliBaseIE):
_VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/(?P<id>(?:ss|ep)\d+)' _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/play/(?P<id>ep\d+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.bilibili.com/bangumi/play/ss897', 'url': 'https://www.bilibili.com/bangumi/play/ep267851',
'info_dict': { 'info_dict': {
'id': 'ss897', 'id': '267851',
'ext': 'mp4', 'ext': 'mp4',
'series': '神的记事本', 'series': '鬼灭之刃',
'season': '神的记事本', 'series_id': '4358',
'season_id': 897, 'season': '鬼灭之刃',
'season_id': '26801',
'season_number': 1, 'season_number': 1,
'episode': '你与旅行包', 'episode': '残酷',
'episode_number': 2, 'episode_id': '267851',
'title': '神的记事本第2话 你与旅行包', 'episode_number': 1,
'duration': 1428.487, 'title': '1 残酷',
'timestamp': 1310809380, 'duration': 1425.256,
'upload_date': '20110716', 'timestamp': 1554566400,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'upload_date': '20190406',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$'
}, },
}, { 'skip': 'According to the copyright owner\'s request, you may only watch the video after you are premium member.'
'url': 'https://www.bilibili.com/bangumi/play/ep508406',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
episode_id = video_id[2:]
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
if '您所在的地区无法观看本片' in webpage: if '您所在的地区无法观看本片' in webpage:
raise GeoRestrictedError('This video is restricted') raise GeoRestrictedError('This video is restricted')
elif ('开通大会员观看' in webpage and '__playinfo__' not in webpage elif '正在观看预览,大会员免费看全片' in webpage:
or '正在观看预览,大会员免费看全片' in webpage):
self.raise_login_required('This video is for premium members only') self.raise_login_required('This video is for premium members only')
play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] headers = {'Referer': url, **self.geo_verification_headers()}
play_info = self._download_json(
'https://api.bilibili.com/pgc/player/web/v2/playurl', video_id,
'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id},
headers=headers)
premium_only = play_info.get('code') == -10403
play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {}
formats = self.extract_formats(play_info) formats = self.extract_formats(play_info)
if (not formats and '成为大会员抢先看' in webpage if not formats and (premium_only or '成为大会员抢先看' in webpage or '开通大会员观看' in webpage):
and play_info.get('durl') and not play_info.get('dash')):
self.raise_login_required('This video is for premium members only') self.raise_login_required('This video is for premium members only')
initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) bangumi_info = self._download_json(
'https://api.bilibili.com/pgc/view/web/season', video_id, 'Get episode details',
query={'ep_id': episode_id}, headers=headers)['result']
season_id = traverse_obj(initial_state, ('mediaInfo', 'season_id')) episode_number, episode_info = next((
(idx, ep) for idx, ep in enumerate(traverse_obj(
bangumi_info, ('episodes', ..., {dict})), 1)
if str_or_none(ep.get('id')) == episode_id), (1, {}))
season_id = bangumi_info.get('season_id')
season_number = season_id and next(( season_number = season_id and next((
idx + 1 for idx, e in enumerate( idx + 1 for idx, e in enumerate(
traverse_obj(initial_state, ('mediaInfo', 'seasons', ...))) traverse_obj(bangumi_info, ('seasons', ...)))
if e.get('season_id') == season_id if e.get('season_id') == season_id
), None) ), None)
aid = episode_info.get('aid')
return { return {
'id': video_id, 'id': video_id,
'formats': formats, 'formats': formats,
'title': traverse_obj(initial_state, 'h1Title'), **traverse_obj(bangumi_info, {
'episode': traverse_obj(initial_state, ('epInfo', 'long_title')), 'series': ('series', 'series_title', {str}),
'episode_number': int_or_none(traverse_obj(initial_state, ('epInfo', 'title'))), 'series_id': ('series', 'series_id', {str_or_none}),
'series': traverse_obj(initial_state, ('mediaInfo', 'series')), 'thumbnail': ('square_cover', {url_or_none}),
'season': traverse_obj(initial_state, ('mediaInfo', 'season_title')), }),
'season_id': season_id, 'title': join_nonempty('title', 'long_title', delim=' ', from_dict=episode_info),
'episode': episode_info.get('long_title'),
'episode_id': episode_id,
'episode_number': int_or_none(episode_info.get('title')) or episode_number,
'season_id': str_or_none(season_id),
'season_number': season_number, 'season_number': season_number,
'thumbnail': traverse_obj(initial_state, ('epInfo', 'cover')), 'timestamp': int_or_none(episode_info.get('pub_time')),
'timestamp': traverse_obj(initial_state, ('epInfo', 'pub_time')),
'duration': float_or_none(play_info.get('timelength'), scale=1000), 'duration': float_or_none(play_info.get('timelength'), scale=1000),
'subtitles': self.extract_subtitles( 'subtitles': self.extract_subtitles(video_id, aid, episode_info.get('cid')),
video_id, initial_state, traverse_obj(initial_state, ('epInfo', 'cid'))), '__post_extractor': self.extract_comments(aid),
'__post_extractor': self.extract_comments(traverse_obj(initial_state, ('epInfo', 'aid'))), 'http_headers': headers,
'http_headers': {'Referer': url, **self.geo_verification_headers()},
} }
class BiliBiliBangumiMediaIE(InfoExtractor): class BiliBiliBangumiMediaIE(BilibiliBaseIE):
_VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)' _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.bilibili.com/bangumi/media/md24097891', 'url': 'https://www.bilibili.com/bangumi/media/md24097891',
@ -485,16 +514,26 @@ class BiliBiliBangumiMediaIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
media_id = self._match_id(url) media_id = self._match_id(url)
webpage = self._download_webpage(url, media_id) webpage = self._download_webpage(url, media_id)
ss_id = self._search_json(
r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)['mediaInfo']['season_id']
initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id) return self.playlist_result(self._get_episodes_from_season(ss_id, url), media_id)
episode_list = self._download_json(
'https://api.bilibili.com/pgc/web/season/section', media_id,
query={'season_id': initial_state['mediaInfo']['season_id']},
note='Downloading season info')['result']['main_section']['episodes']
return self.playlist_result((
self.url_result(entry['share_url'], BiliBiliBangumiIE, entry['aid']) class BiliBiliBangumiSeasonIE(BilibiliBaseIE):
for entry in episode_list), media_id) _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.bilibili.com/bangumi/play/ss26801',
'info_dict': {
'id': '26801'
},
'playlist_mincount': 26
}]
def _real_extract(self, url):
ss_id = self._match_id(url)
return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id)
class BilibiliSpaceBaseIE(InfoExtractor): class BilibiliSpaceBaseIE(InfoExtractor):