[facebook] Improve title and uploader extraction

Closes #1943, closes #795
This commit is contained in:
pukkandan 2022-01-11 22:09:49 +05:30
parent fabb27fcea
commit 80fa6e5327
No known key found for this signature in database
GPG Key ID: 0F00D95A001F4698
1 changed file with 27 additions and 24 deletions

View File

@ -20,13 +20,13 @@
get_element_by_id, get_element_by_id,
int_or_none, int_or_none,
js_to_json, js_to_json,
limit_length,
merge_dicts, merge_dicts,
network_exceptions, network_exceptions,
parse_count, parse_count,
parse_qs, parse_qs,
qualities, qualities,
sanitized_Request, sanitized_Request,
traverse_obj,
try_get, try_get,
url_or_none, url_or_none,
urlencode_postdata, urlencode_postdata,
@ -398,28 +398,31 @@ def _extract_from_url(self, url, video_id):
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id) url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
def extract_metadata(webpage): def extract_metadata(webpage):
video_title = self._html_search_regex( media_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)]
'title', default=None) media = traverse_obj(media_data, (
if not video_title: ..., 'require', ..., ..., ..., '__bbox', 'result', 'data', 'attachments', ..., 'media'), expected_type=dict)
video_title = self._html_search_regex( media = [m for m in media if str(m.get('id')) == video_id and m.get('__typename') == 'Video']
r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
webpage, 'alternative title', default=None) video_title = traverse_obj(media, (..., 'title', 'text'), get_all=False)
if not video_title: description = traverse_obj(media, (
video_title = self._html_search_meta( ..., 'creation_story', 'comet_sections', 'message', 'story', 'message', 'text'), get_all=False)
['og:title', 'twitter:title', 'description'], uploader = traverse_obj(media, (..., 'owner', 'name'), get_all=False)
webpage, 'title', default=None) uploader_id = traverse_obj(media, (..., 'owner', 'id'), get_all=False)
if video_title:
video_title = limit_length(video_title, 80) video_title = video_title or self._html_search_regex((
else: r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
video_title = 'Facebook video #%s' % video_id r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
description = self._html_search_meta( self._meta_regex('og:title'), self._meta_regex('twitter:title'), self._meta_regex('description'),
), webpage, 'title', default=None, group='content')
description = description or self._html_search_meta(
['description', 'og:description', 'twitter:description'], ['description', 'og:description', 'twitter:description'],
webpage, 'description', default=None) webpage, 'description', default=None)
uploader = clean_html(get_element_by_id( uploader = uploader or (
'fbPhotoPageAuthorName', webpage)) or self._search_regex( clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', or self._search_regex(
default=None) or self._og_search_title(webpage, fatal=False) (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False))
timestamp = int_or_none(self._search_regex( timestamp = int_or_none(self._search_regex(
r'<abbr[^>]+data-utime=["\'](\d+)', webpage, r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
'timestamp', default=None)) 'timestamp', default=None))
@ -434,17 +437,17 @@ def extract_metadata(webpage):
r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
default=None)) default=None))
info_dict = { info_dict = {
'title': video_title, 'title': video_title or description.replace('\n', ' ') or f'Facebook video #{video_id}',
'description': description, 'description': description,
'uploader': uploader, 'uploader': uploader,
'uploader_id': uploader_id,
'timestamp': timestamp, 'timestamp': timestamp,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'view_count': view_count, 'view_count': view_count,
} }
info_json_ld = self._search_json_ld(webpage, video_id, default={}) info_json_ld = self._search_json_ld(webpage, video_id, default={})
if info_json_ld.get('title'): if info_json_ld.get('title'):
info_json_ld['title'] = limit_length( info_json_ld['title'] = re.sub(r'\s*\|\s*Facebook$', '', info_json_ld['title'])
re.sub(r'\s*\|\s*Facebook$', '', info_json_ld['title']), 80)
return merge_dicts(info_json_ld, info_dict) return merge_dicts(info_json_ld, info_dict)
video_data = None video_data = None