From 6014355c6142f68e20c8374e3787e5b5820f19e2 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 29 Jul 2023 18:37:06 -0500 Subject: [PATCH] [ie/twitter] Add fallback, improve error handling (#7621) Closes #7579, Closes #7625 Authored by: bashonly --- yt_dlp/extractor/twitter.py | 81 +++++++++++++++++++++++++++---------- 1 file changed, 60 insertions(+), 21 deletions(-) diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 9d87dbc4b..34b8625c3 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -1,3 +1,4 @@ +import functools import json import re @@ -279,6 +280,12 @@ def input_dict(subtask_id, text): 'Submitting confirmation code', headers, data=build_login_json(input_dict( next_subtask, self._get_tfa_info('confirmation code sent to your email or phone')))) + elif next_subtask == 'ArkoseLogin': + self.raise_login_required('Twitter is requiring captcha for this login attempt', method='cookies') + + elif next_subtask == 'DenyLoginSubtask': + self.raise_login_required('Twitter rejected this login attempt as suspicious', method='cookies') + elif next_subtask == 'LoginSuccessSubtask': raise ExtractorError('Twitter API did not grant auth token cookie') @@ -304,8 +311,9 @@ def _call_api(self, path, video_id, query={}, graphql=False): if result.get('errors'): errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str})))) - raise ExtractorError( - f'Error(s) while querying API: {errors or "Unknown error"}', expected=True) + if errors and 'not authorized' in errors: + self.raise_login_required(remove_end(errors, '.')) + raise ExtractorError(f'Error(s) while querying API: {errors or "Unknown error"}') return result @@ -607,7 +615,7 @@ class TwitterIE(TwitterBaseIE): # has mp4 formats via mobile API 'url': 'https://twitter.com/news_al3alm/status/852138619213144067', 'info_dict': { - 'id': '852138619213144067', + 'id': '852077943283097602', 'ext': 'mp4', 'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة', 'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN', @@ -616,8 +624,16 @@ class TwitterIE(TwitterBaseIE): 'duration': 277.4, 'timestamp': 1492000653, 'upload_date': '20170412', + 'display_id': '852138619213144067', + 'age_limit': 0, + 'uploader_url': 'https://twitter.com/news_al3alm', + 'thumbnail': r're:^https?://.*\.jpg', + 'tags': [], + 'repost_count': int, + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, - 'skip': 'Account suspended', }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', 'info_dict': { @@ -675,15 +691,15 @@ class TwitterIE(TwitterBaseIE): 'id': '1087791272830607360', 'display_id': '1087791357756956680', 'ext': 'mp4', - 'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!', + 'title': 'X - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!', 'thumbnail': r're:^https?://.*\.jpg', 'description': 'md5:6dfd341a3310fb97d80d2bf7145df976', - 'uploader': 'Twitter', - 'uploader_id': 'Twitter', + 'uploader': 'X', + 'uploader_id': 'X', 'duration': 61.567, 'timestamp': 1548184644, 'upload_date': '20190122', - 'uploader_url': 'https://twitter.com/Twitter', + 'uploader_url': 'https://twitter.com/X', 'comment_count': int, 'repost_count': int, 'like_count': int, @@ -991,10 +1007,10 @@ class TwitterIE(TwitterBaseIE): 'view_count': int, 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig', 'age_limit': 0, - 'uploader': 'Mün The Shinobi', + 'uploader': 'Mün The Friend Of YWAP', 'repost_count': int, 'upload_date': '20221206', - 'title': 'Mün The Shinobi - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525', + 'title': 'Mün The Friend Of YWAP - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525', 'comment_count': int, 'like_count': int, 'tags': [], @@ -1024,6 +1040,7 @@ class TwitterIE(TwitterBaseIE): 'repost_count': int, }, 'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}}, + 'skip': 'Protected tweet', }, { # orig tweet w/ graphql 'url': 'https://twitter.com/liberdalau/status/1623739803874349067', @@ -1047,6 +1064,7 @@ class TwitterIE(TwitterBaseIE): 'repost_count': int, 'comment_count': int, }, + 'skip': 'Protected tweet', }, { # onion route 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', @@ -1103,6 +1121,8 @@ def _graphql_to_legacy(self, data, twid): reason = result.get('reason') if reason == 'NsfwLoggedOut': self.raise_login_required('NSFW tweet requires authentication') + elif reason == 'Protected': + self.raise_login_required('You are not authorized to view this protected tweet') raise ExtractorError(reason or 'Requested tweet is unavailable', expected=True) status = result.get('legacy', {}) @@ -1187,22 +1207,38 @@ def _build_graphql_query(self, media_id): } } - def _real_extract(self, url): - twid, selected_index = self._match_valid_url(url).group('id', 'index') - if not self.is_logged_in and self._configuration_arg('legacy_api'): - status = traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, { + def _extract_status(self, twid): + if self.is_logged_in: + return self._graphql_to_legacy( + self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid) + + try: + if not self._configuration_arg('legacy_api'): + return self._graphql_to_legacy( + self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid) + return traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, { 'cards_platform': 'Web-12', 'include_cards': 1, 'include_reply_count': 1, 'include_user_entities': 0, 'tweet_mode': 'extended', }), 'retweeted_status', None) - elif not self.is_logged_in: - status = self._graphql_to_legacy( - self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid) - else: - status = self._graphql_to_legacy( - self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid) + + except ExtractorError as e: + if e.expected: + raise + self.report_warning( + f'{e.orig_msg}. Falling back to syndication endpoint; some metadata may be missing', twid) + + status = self._download_json( + 'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON', + headers={'User-Agent': 'Googlebot'}, query={'id': twid}) + status['extended_entities'] = {'media': status.get('mediaDetails')} + return status + + def _real_extract(self, url): + twid, selected_index = self._match_valid_url(url).group('id', 'index') + status = self._extract_status(twid) title = description = traverse_obj( status, (('full_text', 'text'), {lambda x: x.replace('\n', ' ')}), get_all=False) or '' @@ -1230,7 +1266,10 @@ def _real_extract(self, url): } def extract_from_video_info(media): - media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none) + media_id = traverse_obj(media, 'id_str', 'id', ( + 'video_info', 'variants', ..., 'url', + {functools.partial(re.search, r'_video/(\d+)/')}, 1 + ), get_all=False, expected_type=str_or_none) or twid self.write_debug(f'Extracting from video info: {media_id}') formats = []