0
0
Fork 0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2024-12-22 06:00:00 +00:00

[cleanup] Use _html_extract_title

This commit is contained in:
pukkandan 2022-04-04 13:57:35 +05:30
parent 85e801a9db
commit 04f3fd2c89
No known key found for this signature in database
GPG key ID: 7EEE9E1E817D0A39
38 changed files with 51 additions and 80 deletions

View file

@ -534,13 +534,13 @@ #### Example
Correct:
```python
title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
title = self._html_search_regex(r'<h1>([^<]+)</h1>', webpage, 'title')
```
Incorrect:
```python
TITLE_RE = r'<title>([^<]+)</title>'
TITLE_RE = r'<h1>([^<]+)</h1>'
# ...some lines of code...
title = self._html_search_regex(TITLE_RE, webpage, 'title')
```

View file

@ -14,7 +14,7 @@ class AdobeConnectIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
title = self._html_extract_title(webpage)
qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1])
is_live = qs.get('isLive', ['false'])[0] == 'true'
formats = []

View file

@ -7,6 +7,7 @@
int_or_none,
qualities,
remove_end,
strip_or_none,
try_get,
unified_timestamp,
url_basename,
@ -102,10 +103,7 @@ def _real_extract(self, url):
video_id = display_id
media_data = self._download_json(
'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id)
title = remove_end(
self._html_search_regex(
r'(?s)<title>(.+?)</title>', webpage, 'title').strip(),
' - AlloCiné')
title = remove_end(strip_or_none(self._html_extract_title(webpage)), ' - AlloCiné')
for key, value in media_data['video'].items():
if not key.endswith('Path'):
continue

View file

@ -483,8 +483,7 @@ def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
regex), webpage, name, default='{}'), video_id, fatal=False)
def _extract_webpage_title(self, webpage):
page_title = self._html_search_regex(
r'<title>([^<]*)</title>', webpage, 'title', default='')
page_title = self._html_extract_title(webpage, default='')
# YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
return self._html_search_regex(
r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)',

View file

@ -181,8 +181,7 @@ def _real_extract(self, url):
'title', default=None) or self._og_search_title(
webpage, default=None) or self._html_search_meta(
'twitter:title', webpage, 'title',
default=None) or self._search_regex(
r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
default=None) or self._html_extract_title(webpage)
if title:
title = re.sub(r'\s*\|\s*.+?$', '', title)

View file

@ -906,9 +906,8 @@ def _real_extract(self, url):
playlist_title = json_ld_info.get('title')
if not playlist_title:
playlist_title = self._og_search_title(
webpage, default=None) or self._html_search_regex(
r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
playlist_title = (self._og_search_title(webpage, default=None)
or self._html_extract_title(webpage, 'playlist title', default=None))
if playlist_title:
playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

View file

@ -29,9 +29,8 @@ def _real_extract(self, url):
self._sort_formats(formats)
return {
'id': video_id,
'title': self._og_search_title(
webpage, default=None) or self._html_search_regex(
r'(?s)<title>(.*?)</title>', webpage, 'video title'),
'title': (self._og_search_title(webpage, default=None)
or self._html_extract_title(webpage, 'video title')),
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'age_limit': self._rta_search(webpage),

View file

@ -54,7 +54,7 @@ def _real_extract(self, url):
id = episode['id']
title = (episode.get('title')
or self._og_search_title(webpage, fatal=False)
or self._html_search_regex('<title>(.*?)</title>', webpage, 'title'))
or self._html_extract_title(webpage))
url = episode['m3u8']
formats = self._extract_m3u8_formats(url, display_id, ext='ts')
self._sort_formats(formats)

View file

@ -127,9 +127,9 @@ def _extract_player_init(self, player_init, display_id):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
title = self._og_search_title(webpage, default=None) or self._html_search_meta(
'twitter:title', webpage, 'title', default=None) or self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
title = (self._og_search_title(webpage, default=None)
or self._html_search_meta('twitter:title', webpage, 'title', default=None)
or self._html_extract_title(webpage))
entries = [
self._extract_player_init(player_init, display_id)
for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]

View file

@ -54,8 +54,7 @@ def _real_extract(self, url):
r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
webpage, 'kaltura partner_id')
title = self._search_regex(
r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title')
title = self._html_extract_title(webpage, 'video title')
select = self._search_regex(
r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',

View file

@ -1329,9 +1329,8 @@ def _og_search_thumbnail(self, html, **kargs):
def _og_search_description(self, html, **kargs):
return self._og_search_property('description', html, fatal=False, **kargs)
def _og_search_title(self, html, **kargs):
kargs.setdefault('fatal', False)
return self._og_search_property('title', html, **kargs)
def _og_search_title(self, html, *, fatal=False, **kargs):
return self._og_search_property('title', html, fatal=fatal, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
regexes = self._og_regexes('video') + self._og_regexes('video:url')
@ -1342,9 +1341,8 @@ def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
def _og_search_url(self, html, **kargs):
return self._og_search_property('url', html, **kargs)
def _html_extract_title(self, html, name, **kwargs):
return self._html_search_regex(
r'(?s)<title>(.*?)</title>', html, name, **kwargs)
def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
return self._html_search_regex(r'(?s)<title>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
name = variadic(name)

View file

@ -278,7 +278,7 @@ def _real_extract(self, url):
video_id, transform_source=js_to_json)
title = (self._og_search_title(webpage, default=None)
or self._html_search_regex(r'(?s)<title>(.*?)</title>', webpage, 'video title'))
or self._html_extract_title(webpage, 'video title'))
description = (self._og_search_description(webpage, default=None)
or self._html_search_meta('description', webpage, 'description', default=None))

View file

@ -75,8 +75,7 @@ def _real_extract(self, url):
r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'],
webpage, 'video url')
title = self._og_search_title(webpage, default=None) or self._search_regex(
r'<title>([^<]+)</title>', webpage, 'title')
title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage)
duration = int_or_none(self._og_search_property(
'video:duration', webpage, 'duration', default=None))

View file

@ -29,8 +29,7 @@ def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = remove_end(self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title'), ' - Foxgay.com')
title = remove_end(self._html_extract_title(webpage), ' - Foxgay.com')
description = get_element_by_id('inf_tit', webpage)
# The default user-agent with foxgay cookies leads to pages without videos

View file

@ -2873,10 +2873,8 @@ def _real_extract(self, url):
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
video_title = self._og_search_title(
webpage, default=None) or self._html_search_regex(
r'(?s)<title>(.*?)</title>', webpage, 'video title',
default='video')
video_title = (self._og_search_title(webpage, default=None)
or self._html_extract_title(webpage, 'video title', default='video'))
# Try to detect age limit automatically
age_limit = self._rta_search(webpage)

View file

@ -23,9 +23,7 @@ def _real_extract(self, url):
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(
r'<title>(.+?)</title>', webpage,
'title', default=None) or self._og_search_title(webpage)
title = self._html_extract_title(webpage, default=None) or self._og_search_title(webpage)
video_url = self._proto_relative_url(self._search_regex(
r'<source[^>]+src=(["\'])(?P<url>.+?)\1',
webpage, 'video URL', default=None,

View file

@ -38,8 +38,7 @@ def _real_extract(self, url):
webpage = self._download_webpage(url, display_id)
title = remove_end(self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title'), ' - Hell Porno')
title = remove_end(self._html_extract_title(webpage), ' - Hell Porno')
info = self._parse_html5_media_entries(url, webpage, display_id)[0]
self._sort_formats(info['formats'])

View file

@ -66,8 +66,7 @@ def _real_extract(self, url):
room_info = try_get(stream_data, lambda x: x['data'][0]['gameLiveInfo'])
if not room_info:
raise ExtractorError('Can not extract the room info', expected=True)
title = room_info.get('roomName') or room_info.get('introduction') or self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title')
title = room_info.get('roomName') or room_info.get('introduction') or self._html_extract_title(webpage)
screen_type = room_info.get('screenType')
live_source_type = room_info.get('liveSourceType')
stream_info_list = stream_data['data'][0]['gameStreamInfoList']

View file

@ -68,7 +68,7 @@ def _real_extract(self, url):
video_info = traverse_obj(info, ('props', 'pageProps', 'videoPlaybackData', 'video'), default={})
title = (traverse_obj(video_info, ('name', 'value'), ('primaryTitle', 'titleText', 'text'))
or self._html_search_meta(('og:title', 'twitter:title'), webpage, default=None)
or self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title'))
or self._html_extract_title(webpage))
data = video_info.get('playbackURLs') or try_get(self._download_json(
'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id,
query={

View file

@ -115,7 +115,7 @@ def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title')
video_title = self._html_extract_title(webpage)
video_description = self._html_search_meta('description', webpage, 'description')
if '/cn/' in url:

View file

@ -76,8 +76,7 @@ def _real_extract(self, url):
'age_limit': age_limit,
}
title = remove_end(self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title'), ' | Iwara')
title = remove_end(self._html_extract_title(webpage), ' | Iwara')
thumbnail = self._html_search_regex(
r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None)

View file

@ -102,7 +102,7 @@ def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
title = self._html_extract_title(webpage)
description = clean_html(get_element_by_class('share-update-card__update-text', webpage))
like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage))
creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage)))

View file

@ -24,8 +24,7 @@ def _real_extract(self, url):
webpage = self._download_webpage(
url, video_id, headers={'User-Agent': self._USER_AGENT_IPAD})
title = self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title')
title = self._html_extract_title(webpage)
thumbnail = self._html_search_regex(
r'<div[^>]+class=(?P<q1>[\'"]).*\bvideo_img\b.*(?P=q1)[^>]+data-url=(?P<q2>[\'"])(?P<url>[^\'"]+)(?P=q2)',
webpage, 'thumbnail', fatal=False, group='url')

View file

@ -38,8 +38,7 @@ def _real_extract(self, url):
r'<errordesc>([^<]*)</errordesc>', playerapi, 'error description', fatal=False)
raise ExtractorError('%s said: %s' % (self.IE_NAME, error_desc), expected=True)
title = self._html_search_regex(
r'<title>([^<]+)</title>', playerapi, 'title')
title = self._html_extract_title(playerapi)
video_url = self._html_search_regex(
r'<file>([^<]+)</file>', playerapi, 'video URL')
thumbnail = self._html_search_regex(

View file

@ -106,8 +106,7 @@ def _real_extract(self, url):
uploader = None
webpage = self._download_webpage(url, media_id)
title = self._html_search_regex(
r'<title>(.+?)</title>', webpage, 'title')
title = self._html_extract_title(webpage)
media_url_string = self._search_regex(
r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None)
@ -219,8 +218,7 @@ def _real_extract(self, url):
webpage = self._download_webpage(url, playlist_id)
title = self._search_regex(
r'<title>([^>]+)</title>', webpage, 'title', default=None)
title = self._html_extract_title(webpage, default=None)
# cut left menu
webpage = self._search_regex(

View file

@ -309,7 +309,9 @@ def _real_extract(self, url):
webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)
title = self._og_search_title(webpage, fatal=False) or self._html_extract_title(webpage, fatal=False) or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False)
title = (self._og_search_title(webpage)
or self._html_extract_title(webpage)
or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False))
title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None
description = self._html_search_regex(
r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',

View file

@ -85,8 +85,7 @@ def _real_extract(self, url):
# Extract title - should be in the flashvars; if not, look elsewhere
if video_title is None:
video_title = self._html_search_regex(
r'<title>(.*?)</title', webpage, 'title')
video_title = self._html_extract_title(webpage)
return {
'id': video_id,

View file

@ -49,7 +49,7 @@ def _real_extract(self, url):
'quality': quality,
})
title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
title = self._html_extract_title(webpage)
thumbnail = self._html_search_regex(r'preview_url:\s+\'([^\']+)\'', webpage, 'thumbnail', default=None)
duration = self._html_search_regex(r'"icon-clock"></i>\s+<span>((?:\d+:?)+)', webpage, 'duration', default=None)

View file

@ -112,7 +112,7 @@ def _real_extract(self, url):
if smuggled_data.get('force_title'):
title = smuggled_data['force_title']
else:
title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id)
title = self._html_extract_title(webpage)
poster = qs.get('poster')
thumbnail = poster[0] if poster else None

View file

@ -36,8 +36,7 @@ def _real_extract(self, url):
webpage = self._download_webpage(
'http://www.sunporno.com/videos/%s' % video_id, video_id)
title = self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title')
title = self._html_extract_title(webpage)
description = self._html_search_meta(
'description', webpage, 'description')
thumbnail = self._html_search_regex(

View file

@ -37,9 +37,7 @@ def _real_extract(self, url):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
title = remove_end(self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title'),
' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站')
title = remove_end(self._html_extract_title(webpage), ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站')
video_url = self._html_search_regex(
r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None)
if video_url:

View file

@ -24,8 +24,7 @@ def _real_extract(self, url):
name = mobj.group('movie') + '/' + mobj.group('trailer_name')
webpage = self._download_webpage(url, name)
title = self._search_regex(r'<title>(.+?)</title>',
webpage, 'video title').replace(' - Trailer Addict', '')
title = self._html_extract_title(webpage, 'video title').replace(' - Trailer Addict', '')
view_count_str = self._search_regex(
r'<span class="views_n">([0-9,.]+)</span>',
webpage, 'view count', fatal=False)

View file

@ -42,8 +42,7 @@ def _real_extract(self, url):
video_url = self._search_regex(
r'<source[^>]+src="([^"]+)"', webpage, 'video url')
title = remove_start(self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title'), 'ویدیو ورزش 3 | ')
title = remove_start(self._html_extract_title(webpage), 'ویدیو ورزش 3 | ')
description = self._html_search_regex(
r'(?s)<div class="matn">(.+?)</div>',

View file

@ -50,8 +50,7 @@ def _real_extract(self, url):
'https://vshare.io/v/%s/width-650/height-430/1' % video_id,
video_id, headers={'Referer': url})
title = self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title')
title = self._html_extract_title(webpage)
title = title.split(' - ')[0]
error = self._html_search_regex(

View file

@ -28,7 +28,7 @@ def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
title = self._html_extract_title(webpage)
video_json = self._parse_json(self._html_search_regex(r'sources:\s*(.+?]),', webpage, 'video'), video_id, transform_source=js_to_json)
formats = []
for source in video_json:

View file

@ -73,8 +73,7 @@ def _real_extract(self, url):
webpage = self._download_webpage(
url, video_id, note='Revisiting webpage')
title = self._html_search_regex(
r'<title>(.+?)</title>', webpage, 'title')
title = self._html_extract_title(webpage)
video_formats = compat_parse_qs(self._search_regex(
r'video-sources=\\\"(.+?)\"', webpage, 'video_sources'))

View file

@ -533,7 +533,7 @@ def _real_extract(self, url):
title = self._html_search_meta(
['og:title', 'twitter:title'], webpage, 'title', default=None
) or self._html_search_regex('<title>([^<]+)</title>', webpage, 'title')
) or self._html_extract_title(webpage)
if display_id == host:
# Headline page (w/ multiple BC playlists) ('news.yahoo.co.jp', 'headlines.yahoo.co.jp/videonews/', ...)

View file

@ -36,8 +36,7 @@ def _real_extract(self, url):
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(
r'<title>(.+?)</title>', webpage, 'title')
title = self._html_extract_title(webpage)
formats = []