From 82e3f6ebda56c84166494e157e0f856467ca5581 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 31 Jan 2021 13:18:06 +0530 Subject: [PATCH] [youtube_live_chat] Fix `parse_yt_initial_data` and add `fragment_retries` :ci skip dl --- youtube_dlc/downloader/youtube_live_chat.py | 102 +++++++++++--------- youtube_dlc/extractor/youtube.py | 11 +-- 2 files changed, 57 insertions(+), 56 deletions(-) diff --git a/youtube_dlc/downloader/youtube_live_chat.py b/youtube_dlc/downloader/youtube_live_chat.py index 223b4b81c..f162aff9c 100644 --- a/youtube_dlc/downloader/youtube_live_chat.py +++ b/youtube_dlc/downloader/youtube_live_chat.py @@ -4,6 +4,9 @@ import json from .fragment import FragmentFD +from ..compat import compat_urllib_error +from ..utils import try_get +from ..extractor.youtube import YoutubeBaseInfoExtractor as YT_BaseIE class YoutubeLiveChatReplayFD(FragmentFD): @@ -15,6 +18,7 @@ def real_download(self, filename, info_dict): video_id = info_dict['video_id'] self.to_screen('[%s] Downloading live chat' % self.FD_NAME) + fragment_retries = self.params.get('fragment_retries', 0) test = self.params.get('test', False) ctx = { @@ -28,15 +32,52 @@ def dl_fragment(url): return self._download_fragment(ctx, url, info_dict, headers) def parse_yt_initial_data(data): - window_patt = b'window\\["ytInitialData"\\]\\s*=\\s*(.*?)(?<=});' - var_patt = b'var\\s+ytInitialData\\s*=\\s*(.*?)(?<=});' - for patt in window_patt, var_patt: + patterns = ( + r'%s\\s*%s' % (YT_BaseIE._YT_INITIAL_DATA_RE, YT_BaseIE._YT_INITIAL_BOUNDARY_RE), + r'%s' % YT_BaseIE._YT_INITIAL_DATA_RE) + data = data.decode('utf-8', 'replace') + for patt in patterns: try: raw_json = re.search(patt, data).group(1) return json.loads(raw_json) except AttributeError: continue + def download_and_parse_fragment(url, frag_index): + count = 0 + while count <= fragment_retries: + try: + success, raw_fragment = dl_fragment(url) + if not success: + return False, None, None + data = parse_yt_initial_data(raw_fragment) or json.loads(raw_fragment)['response'] + + live_chat_continuation = try_get( + data, + lambda x: x['continuationContents']['liveChatContinuation'], dict) or {} + offset = continuation_id = None + processed_fragment = bytearray() + for action in live_chat_continuation.get('actions', []): + if 'replayChatItemAction' in action: + replay_chat_item_action = action['replayChatItemAction'] + offset = int(replay_chat_item_action['videoOffsetTimeMsec']) + processed_fragment.extend( + json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n') + if offset is not None: + continuation_id = try_get( + live_chat_continuation, + lambda x: x['continuations'][0]['liveChatReplayContinuationData']['continuation']) + self._append_fragment(ctx, processed_fragment) + + return True, continuation_id, offset + except compat_urllib_error.HTTPError as err: + count += 1 + if count <= fragment_retries: + self.report_retry_fragment(err, frag_index, count, fragment_retries) + if count > fragment_retries: + self.report_error('giving up after %s fragment retries' % fragment_retries) + return False, None, None + self._prepare_and_start_frag_download(ctx) success, raw_fragment = dl_fragment( @@ -44,54 +85,23 @@ def parse_yt_initial_data(data): if not success: return False data = parse_yt_initial_data(raw_fragment) - continuation_id = data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation'] + continuation_id = try_get( + data, + lambda x: x['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']) # no data yet but required to call _append_fragment self._append_fragment(ctx, b'') - first = True - offset = None + frag_index = offset = 0 while continuation_id is not None: - data = None - if first: - url = 'https://www.youtube.com/live_chat_replay?continuation={}'.format(continuation_id) - success, raw_fragment = dl_fragment(url) - if not success: - return False - data = parse_yt_initial_data(raw_fragment) - else: - url = ('https://www.youtube.com/live_chat_replay/get_live_chat_replay' - + '?continuation={}'.format(continuation_id) - + '&playerOffsetMs={}'.format(max(offset - 5000, 0)) - + '&hidden=false' - + '&pbj=1') - success, raw_fragment = dl_fragment(url) - if not success: - return False - data = json.loads(raw_fragment)['response'] - - first = False - continuation_id = None - - live_chat_continuation = data['continuationContents']['liveChatContinuation'] - offset = None - processed_fragment = bytearray() - if 'actions' in live_chat_continuation: - for action in live_chat_continuation['actions']: - if 'replayChatItemAction' in action: - replay_chat_item_action = action['replayChatItemAction'] - offset = int(replay_chat_item_action['videoOffsetTimeMsec']) - processed_fragment.extend( - json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n') - try: - continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation'] - except KeyError: - continuation_id = None - - self._append_fragment(ctx, processed_fragment) - - if test or offset is None: + frag_index += 1 + url = 'https://www.youtube.com/live_chat_replay?continuation=%s' % continuation_id + if frag_index > 1: + url += '&playerOffsetMs=%d&hidden=false&pbj=1' % max(offset - 5000, 0) + success, continuation_id, offset = download_and_parse_fragment(url, frag_index) + if not success: + return False + if test: break self._finish_frag_download(ctx) - return True diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 7c32d3200..0ba6a299e 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -277,15 +277,6 @@ def _download_webpage_handle(self, *args, **kwargs): return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) - def _get_yt_initial_data(self, video_id, webpage): - config = self._search_regex( - (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});', - r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'), - webpage, 'ytInitialData', default=None) - if config: - return self._parse_json( - uppercase_escape(config), video_id, fatal=False) - def _real_initialize(self): if self._downloader is None: return @@ -1943,7 +1934,7 @@ def feed_entry(name): has_live_chat_replay = False if not is_live: - yt_initial_data = self._get_yt_initial_data(video_id, video_webpage) + yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage) try: yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation'] has_live_chat_replay = True