mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-25 02:45:12 +00:00
[cleanup] Use _html_extract_title
This commit is contained in:
parent
85e801a9db
commit
04f3fd2c89
38 changed files with 51 additions and 80 deletions
|
@ -534,13 +534,13 @@ #### Example
|
||||||
Correct:
|
Correct:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
|
title = self._html_search_regex(r'<h1>([^<]+)</h1>', webpage, 'title')
|
||||||
```
|
```
|
||||||
|
|
||||||
Incorrect:
|
Incorrect:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
TITLE_RE = r'<title>([^<]+)</title>'
|
TITLE_RE = r'<h1>([^<]+)</h1>'
|
||||||
# ...some lines of code...
|
# ...some lines of code...
|
||||||
title = self._html_search_regex(TITLE_RE, webpage, 'title')
|
title = self._html_search_regex(TITLE_RE, webpage, 'title')
|
||||||
```
|
```
|
||||||
|
|
|
@ -14,7 +14,7 @@ class AdobeConnectIE(InfoExtractor):
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
video_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
|
title = self._html_extract_title(webpage)
|
||||||
qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1])
|
qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1])
|
||||||
is_live = qs.get('isLive', ['false'])[0] == 'true'
|
is_live = qs.get('isLive', ['false'])[0] == 'true'
|
||||||
formats = []
|
formats = []
|
||||||
|
|
|
@ -7,6 +7,7 @@
|
||||||
int_or_none,
|
int_or_none,
|
||||||
qualities,
|
qualities,
|
||||||
remove_end,
|
remove_end,
|
||||||
|
strip_or_none,
|
||||||
try_get,
|
try_get,
|
||||||
unified_timestamp,
|
unified_timestamp,
|
||||||
url_basename,
|
url_basename,
|
||||||
|
@ -102,10 +103,7 @@ def _real_extract(self, url):
|
||||||
video_id = display_id
|
video_id = display_id
|
||||||
media_data = self._download_json(
|
media_data = self._download_json(
|
||||||
'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id)
|
'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id)
|
||||||
title = remove_end(
|
title = remove_end(strip_or_none(self._html_extract_title(webpage), ' - AlloCiné'))
|
||||||
self._html_search_regex(
|
|
||||||
r'(?s)<title>(.+?)</title>', webpage, 'title').strip(),
|
|
||||||
' - AlloCiné')
|
|
||||||
for key, value in media_data['video'].items():
|
for key, value in media_data['video'].items():
|
||||||
if not key.endswith('Path'):
|
if not key.endswith('Path'):
|
||||||
continue
|
continue
|
||||||
|
|
|
@ -483,8 +483,7 @@ def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
|
||||||
regex), webpage, name, default='{}'), video_id, fatal=False)
|
regex), webpage, name, default='{}'), video_id, fatal=False)
|
||||||
|
|
||||||
def _extract_webpage_title(self, webpage):
|
def _extract_webpage_title(self, webpage):
|
||||||
page_title = self._html_search_regex(
|
page_title = self._html_extract_title(webpage, default='')
|
||||||
r'<title>([^<]*)</title>', webpage, 'title', default='')
|
|
||||||
# YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
|
# YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
|
||||||
return self._html_search_regex(
|
return self._html_search_regex(
|
||||||
r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)',
|
r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)',
|
||||||
|
|
|
@ -181,8 +181,7 @@ def _real_extract(self, url):
|
||||||
'title', default=None) or self._og_search_title(
|
'title', default=None) or self._og_search_title(
|
||||||
webpage, default=None) or self._html_search_meta(
|
webpage, default=None) or self._html_search_meta(
|
||||||
'twitter:title', webpage, 'title',
|
'twitter:title', webpage, 'title',
|
||||||
default=None) or self._search_regex(
|
default=None) or self._html_extract_title(webpage)
|
||||||
r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
|
|
||||||
if title:
|
if title:
|
||||||
title = re.sub(r'\s*\|\s*.+?$', '', title)
|
title = re.sub(r'\s*\|\s*.+?$', '', title)
|
||||||
|
|
||||||
|
|
|
@ -906,9 +906,8 @@ def _real_extract(self, url):
|
||||||
|
|
||||||
playlist_title = json_ld_info.get('title')
|
playlist_title = json_ld_info.get('title')
|
||||||
if not playlist_title:
|
if not playlist_title:
|
||||||
playlist_title = self._og_search_title(
|
playlist_title = (self._og_search_title(webpage, default=None)
|
||||||
webpage, default=None) or self._html_search_regex(
|
or self._html_extract_title(webpage, 'playlist title', default=None))
|
||||||
r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
|
|
||||||
if playlist_title:
|
if playlist_title:
|
||||||
playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
|
playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
|
||||||
|
|
||||||
|
|
|
@ -29,9 +29,8 @@ def _real_extract(self, url):
|
||||||
self._sort_formats(formats)
|
self._sort_formats(formats)
|
||||||
return {
|
return {
|
||||||
'id': video_id,
|
'id': video_id,
|
||||||
'title': self._og_search_title(
|
'title': (self._og_search_title(webpage, default=None)
|
||||||
webpage, default=None) or self._html_search_regex(
|
or self._html_extract_title(webpage, 'video title')),
|
||||||
r'(?s)<title>(.*?)</title>', webpage, 'video title'),
|
|
||||||
'description': self._og_search_description(webpage),
|
'description': self._og_search_description(webpage),
|
||||||
'thumbnail': self._og_search_thumbnail(webpage),
|
'thumbnail': self._og_search_thumbnail(webpage),
|
||||||
'age_limit': self._rta_search(webpage),
|
'age_limit': self._rta_search(webpage),
|
||||||
|
|
|
@ -54,7 +54,7 @@ def _real_extract(self, url):
|
||||||
id = episode['id']
|
id = episode['id']
|
||||||
title = (episode.get('title')
|
title = (episode.get('title')
|
||||||
or self._og_search_title(webpage, fatal=False)
|
or self._og_search_title(webpage, fatal=False)
|
||||||
or self._html_search_regex('<title>(.*?)</title>', webpage, 'title'))
|
or self._html_extract_title(webpage))
|
||||||
url = episode['m3u8']
|
url = episode['m3u8']
|
||||||
formats = self._extract_m3u8_formats(url, display_id, ext='ts')
|
formats = self._extract_m3u8_formats(url, display_id, ext='ts')
|
||||||
self._sort_formats(formats)
|
self._sort_formats(formats)
|
||||||
|
|
|
@ -127,9 +127,9 @@ def _extract_player_init(self, player_init, display_id):
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
display_id = self._match_id(url)
|
display_id = self._match_id(url)
|
||||||
webpage = self._download_webpage(url, display_id)
|
webpage = self._download_webpage(url, display_id)
|
||||||
title = self._og_search_title(webpage, default=None) or self._html_search_meta(
|
title = (self._og_search_title(webpage, default=None)
|
||||||
'twitter:title', webpage, 'title', default=None) or self._html_search_regex(
|
or self._html_search_meta('twitter:title', webpage, 'title', default=None)
|
||||||
r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
|
or self._html_extract_title(webpage))
|
||||||
entries = [
|
entries = [
|
||||||
self._extract_player_init(player_init, display_id)
|
self._extract_player_init(player_init, display_id)
|
||||||
for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
|
for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
|
||||||
|
|
|
@ -54,8 +54,7 @@ def _real_extract(self, url):
|
||||||
r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
|
r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
|
||||||
webpage, 'kaltura partner_id')
|
webpage, 'kaltura partner_id')
|
||||||
|
|
||||||
title = self._search_regex(
|
title = self._html_extract_title(webpage, 'video title')
|
||||||
r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title')
|
|
||||||
|
|
||||||
select = self._search_regex(
|
select = self._search_regex(
|
||||||
r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
|
r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
|
||||||
|
|
|
@ -1329,9 +1329,8 @@ def _og_search_thumbnail(self, html, **kargs):
|
||||||
def _og_search_description(self, html, **kargs):
|
def _og_search_description(self, html, **kargs):
|
||||||
return self._og_search_property('description', html, fatal=False, **kargs)
|
return self._og_search_property('description', html, fatal=False, **kargs)
|
||||||
|
|
||||||
def _og_search_title(self, html, **kargs):
|
def _og_search_title(self, html, *, fatal=False, **kargs):
|
||||||
kargs.setdefault('fatal', False)
|
return self._og_search_property('title', html, fatal=fatal, **kargs)
|
||||||
return self._og_search_property('title', html, **kargs)
|
|
||||||
|
|
||||||
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
|
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
|
||||||
regexes = self._og_regexes('video') + self._og_regexes('video:url')
|
regexes = self._og_regexes('video') + self._og_regexes('video:url')
|
||||||
|
@ -1342,9 +1341,8 @@ def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
|
||||||
def _og_search_url(self, html, **kargs):
|
def _og_search_url(self, html, **kargs):
|
||||||
return self._og_search_property('url', html, **kargs)
|
return self._og_search_property('url', html, **kargs)
|
||||||
|
|
||||||
def _html_extract_title(self, html, name, **kwargs):
|
def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
|
||||||
return self._html_search_regex(
|
return self._html_search_regex(r'(?s)<title>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
|
||||||
r'(?s)<title>(.*?)</title>', html, name, **kwargs)
|
|
||||||
|
|
||||||
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
|
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
|
||||||
name = variadic(name)
|
name = variadic(name)
|
||||||
|
|
|
@ -278,7 +278,7 @@ def _real_extract(self, url):
|
||||||
video_id, transform_source=js_to_json)
|
video_id, transform_source=js_to_json)
|
||||||
|
|
||||||
title = (self._og_search_title(webpage, default=None)
|
title = (self._og_search_title(webpage, default=None)
|
||||||
or self._html_search_regex(r'(?s)<title>(.*?)</title>', webpage, 'video title'))
|
or self._html_extract_title(webpage, 'video title'))
|
||||||
description = (self._og_search_description(webpage, default=None)
|
description = (self._og_search_description(webpage, default=None)
|
||||||
or self._html_search_meta('description', webpage, 'description', default=None))
|
or self._html_search_meta('description', webpage, 'description', default=None))
|
||||||
|
|
||||||
|
|
|
@ -75,8 +75,7 @@ def _real_extract(self, url):
|
||||||
r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'],
|
r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'],
|
||||||
webpage, 'video url')
|
webpage, 'video url')
|
||||||
|
|
||||||
title = self._og_search_title(webpage, default=None) or self._search_regex(
|
title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage)
|
||||||
r'<title>([^<]+)</title>', webpage, 'title')
|
|
||||||
duration = int_or_none(self._og_search_property(
|
duration = int_or_none(self._og_search_property(
|
||||||
'video:duration', webpage, 'duration', default=None))
|
'video:duration', webpage, 'duration', default=None))
|
||||||
|
|
||||||
|
|
|
@ -29,8 +29,7 @@ def _real_extract(self, url):
|
||||||
video_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
|
|
||||||
title = remove_end(self._html_search_regex(
|
title = remove_end(self._html_extract_title(webpage), ' - Foxgay.com')
|
||||||
r'<title>([^<]+)</title>', webpage, 'title'), ' - Foxgay.com')
|
|
||||||
description = get_element_by_id('inf_tit', webpage)
|
description = get_element_by_id('inf_tit', webpage)
|
||||||
|
|
||||||
# The default user-agent with foxgay cookies leads to pages without videos
|
# The default user-agent with foxgay cookies leads to pages without videos
|
||||||
|
|
|
@ -2873,10 +2873,8 @@ def _real_extract(self, url):
|
||||||
# Site Name | Video Title
|
# Site Name | Video Title
|
||||||
# Video Title - Tagline | Site Name
|
# Video Title - Tagline | Site Name
|
||||||
# and so on and so forth; it's just not practical
|
# and so on and so forth; it's just not practical
|
||||||
video_title = self._og_search_title(
|
video_title = (self._og_search_title(webpage, default=None)
|
||||||
webpage, default=None) or self._html_search_regex(
|
or self._html_extract_title(webpage, 'video title', default='video'))
|
||||||
r'(?s)<title>(.*?)</title>', webpage, 'video title',
|
|
||||||
default='video')
|
|
||||||
|
|
||||||
# Try to detect age limit automatically
|
# Try to detect age limit automatically
|
||||||
age_limit = self._rta_search(webpage)
|
age_limit = self._rta_search(webpage)
|
||||||
|
|
|
@ -23,9 +23,7 @@ def _real_extract(self, url):
|
||||||
|
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
|
|
||||||
title = self._html_search_regex(
|
title = self._html_extract_title(webpage, default=None) or self._og_search_title(webpage)
|
||||||
r'<title>(.+?)</title>', webpage,
|
|
||||||
'title', default=None) or self._og_search_title(webpage)
|
|
||||||
video_url = self._proto_relative_url(self._search_regex(
|
video_url = self._proto_relative_url(self._search_regex(
|
||||||
r'<source[^>]+src=(["\'])(?P<url>.+?)\1',
|
r'<source[^>]+src=(["\'])(?P<url>.+?)\1',
|
||||||
webpage, 'video URL', default=None,
|
webpage, 'video URL', default=None,
|
||||||
|
|
|
@ -38,8 +38,7 @@ def _real_extract(self, url):
|
||||||
|
|
||||||
webpage = self._download_webpage(url, display_id)
|
webpage = self._download_webpage(url, display_id)
|
||||||
|
|
||||||
title = remove_end(self._html_search_regex(
|
title = remove_end(self._html_extract_title(webpage), ' - Hell Porno')
|
||||||
r'<title>([^<]+)</title>', webpage, 'title'), ' - Hell Porno')
|
|
||||||
|
|
||||||
info = self._parse_html5_media_entries(url, webpage, display_id)[0]
|
info = self._parse_html5_media_entries(url, webpage, display_id)[0]
|
||||||
self._sort_formats(info['formats'])
|
self._sort_formats(info['formats'])
|
||||||
|
|
|
@ -66,8 +66,7 @@ def _real_extract(self, url):
|
||||||
room_info = try_get(stream_data, lambda x: x['data'][0]['gameLiveInfo'])
|
room_info = try_get(stream_data, lambda x: x['data'][0]['gameLiveInfo'])
|
||||||
if not room_info:
|
if not room_info:
|
||||||
raise ExtractorError('Can not extract the room info', expected=True)
|
raise ExtractorError('Can not extract the room info', expected=True)
|
||||||
title = room_info.get('roomName') or room_info.get('introduction') or self._html_search_regex(
|
title = room_info.get('roomName') or room_info.get('introduction') or self._html_extract_title(webpage)
|
||||||
r'<title>([^<]+)</title>', webpage, 'title')
|
|
||||||
screen_type = room_info.get('screenType')
|
screen_type = room_info.get('screenType')
|
||||||
live_source_type = room_info.get('liveSourceType')
|
live_source_type = room_info.get('liveSourceType')
|
||||||
stream_info_list = stream_data['data'][0]['gameStreamInfoList']
|
stream_info_list = stream_data['data'][0]['gameStreamInfoList']
|
||||||
|
|
|
@ -68,7 +68,7 @@ def _real_extract(self, url):
|
||||||
video_info = traverse_obj(info, ('props', 'pageProps', 'videoPlaybackData', 'video'), default={})
|
video_info = traverse_obj(info, ('props', 'pageProps', 'videoPlaybackData', 'video'), default={})
|
||||||
title = (traverse_obj(video_info, ('name', 'value'), ('primaryTitle', 'titleText', 'text'))
|
title = (traverse_obj(video_info, ('name', 'value'), ('primaryTitle', 'titleText', 'text'))
|
||||||
or self._html_search_meta(('og:title', 'twitter:title'), webpage, default=None)
|
or self._html_search_meta(('og:title', 'twitter:title'), webpage, default=None)
|
||||||
or self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title'))
|
or self._html_extract_title(webpage))
|
||||||
data = video_info.get('playbackURLs') or try_get(self._download_json(
|
data = video_info.get('playbackURLs') or try_get(self._download_json(
|
||||||
'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id,
|
'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id,
|
||||||
query={
|
query={
|
||||||
|
|
|
@ -115,7 +115,7 @@ def _real_extract(self, url):
|
||||||
video_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
|
|
||||||
video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title')
|
video_title = self._html_extract_title(webpage)
|
||||||
video_description = self._html_search_meta('description', webpage, 'description')
|
video_description = self._html_search_meta('description', webpage, 'description')
|
||||||
|
|
||||||
if '/cn/' in url:
|
if '/cn/' in url:
|
||||||
|
|
|
@ -76,8 +76,7 @@ def _real_extract(self, url):
|
||||||
'age_limit': age_limit,
|
'age_limit': age_limit,
|
||||||
}
|
}
|
||||||
|
|
||||||
title = remove_end(self._html_search_regex(
|
title = remove_end(self._html_extract_title(webpage), ' | Iwara')
|
||||||
r'<title>([^<]+)</title>', webpage, 'title'), ' | Iwara')
|
|
||||||
|
|
||||||
thumbnail = self._html_search_regex(
|
thumbnail = self._html_search_regex(
|
||||||
r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None)
|
r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None)
|
||||||
|
|
|
@ -102,7 +102,7 @@ def _real_extract(self, url):
|
||||||
video_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
|
|
||||||
title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
|
title = self._html_extract_title(webpage)
|
||||||
description = clean_html(get_element_by_class('share-update-card__update-text', webpage))
|
description = clean_html(get_element_by_class('share-update-card__update-text', webpage))
|
||||||
like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage))
|
like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage))
|
||||||
creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage)))
|
creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage)))
|
||||||
|
|
|
@ -24,8 +24,7 @@ def _real_extract(self, url):
|
||||||
webpage = self._download_webpage(
|
webpage = self._download_webpage(
|
||||||
url, video_id, headers={'User-Agent': self._USER_AGENT_IPAD})
|
url, video_id, headers={'User-Agent': self._USER_AGENT_IPAD})
|
||||||
|
|
||||||
title = self._html_search_regex(
|
title = self._html_extract_title(webpage)
|
||||||
r'<title>([^<]+)</title>', webpage, 'title')
|
|
||||||
thumbnail = self._html_search_regex(
|
thumbnail = self._html_search_regex(
|
||||||
r'<div[^>]+class=(?P<q1>[\'"]).*\bvideo_img\b.*(?P=q1)[^>]+data-url=(?P<q2>[\'"])(?P<url>[^\'"]+)(?P=q2)',
|
r'<div[^>]+class=(?P<q1>[\'"]).*\bvideo_img\b.*(?P=q1)[^>]+data-url=(?P<q2>[\'"])(?P<url>[^\'"]+)(?P=q2)',
|
||||||
webpage, 'thumbnail', fatal=False, group='url')
|
webpage, 'thumbnail', fatal=False, group='url')
|
||||||
|
|
|
@ -38,8 +38,7 @@ def _real_extract(self, url):
|
||||||
r'<errordesc>([^<]*)</errordesc>', playerapi, 'error description', fatal=False)
|
r'<errordesc>([^<]*)</errordesc>', playerapi, 'error description', fatal=False)
|
||||||
raise ExtractorError('%s said: %s' % (self.IE_NAME, error_desc), expected=True)
|
raise ExtractorError('%s said: %s' % (self.IE_NAME, error_desc), expected=True)
|
||||||
|
|
||||||
title = self._html_search_regex(
|
title = self._html_extract_title(playerapi)
|
||||||
r'<title>([^<]+)</title>', playerapi, 'title')
|
|
||||||
video_url = self._html_search_regex(
|
video_url = self._html_search_regex(
|
||||||
r'<file>([^<]+)</file>', playerapi, 'video URL')
|
r'<file>([^<]+)</file>', playerapi, 'video URL')
|
||||||
thumbnail = self._html_search_regex(
|
thumbnail = self._html_search_regex(
|
||||||
|
|
|
@ -106,8 +106,7 @@ def _real_extract(self, url):
|
||||||
uploader = None
|
uploader = None
|
||||||
webpage = self._download_webpage(url, media_id)
|
webpage = self._download_webpage(url, media_id)
|
||||||
|
|
||||||
title = self._html_search_regex(
|
title = self._html_extract_title(webpage)
|
||||||
r'<title>(.+?)</title>', webpage, 'title')
|
|
||||||
|
|
||||||
media_url_string = self._search_regex(
|
media_url_string = self._search_regex(
|
||||||
r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None)
|
r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None)
|
||||||
|
@ -219,8 +218,7 @@ def _real_extract(self, url):
|
||||||
|
|
||||||
webpage = self._download_webpage(url, playlist_id)
|
webpage = self._download_webpage(url, playlist_id)
|
||||||
|
|
||||||
title = self._search_regex(
|
title = self._html_extract_title(webpage, default=None)
|
||||||
r'<title>([^>]+)</title>', webpage, 'title', default=None)
|
|
||||||
|
|
||||||
# cut left menu
|
# cut left menu
|
||||||
webpage = self._search_regex(
|
webpage = self._search_regex(
|
||||||
|
|
|
@ -309,7 +309,9 @@ def _real_extract(self, url):
|
||||||
|
|
||||||
webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)
|
webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)
|
||||||
|
|
||||||
title = self._og_search_title(webpage, fatal=False) or self._html_extract_title(webpage, fatal=False) or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False)
|
title = (self._og_search_title(webpage)
|
||||||
|
or self._html_extract_title(webpage)
|
||||||
|
or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False))
|
||||||
title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None
|
title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None
|
||||||
description = self._html_search_regex(
|
description = self._html_search_regex(
|
||||||
r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',
|
r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',
|
||||||
|
|
|
@ -85,8 +85,7 @@ def _real_extract(self, url):
|
||||||
|
|
||||||
# Extract title - should be in the flashvars; if not, look elsewhere
|
# Extract title - should be in the flashvars; if not, look elsewhere
|
||||||
if video_title is None:
|
if video_title is None:
|
||||||
video_title = self._html_search_regex(
|
video_title = self._html_extract_title(webpage)
|
||||||
r'<title>(.*?)</title', webpage, 'title')
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'id': video_id,
|
'id': video_id,
|
||||||
|
|
|
@ -49,7 +49,7 @@ def _real_extract(self, url):
|
||||||
'quality': quality,
|
'quality': quality,
|
||||||
})
|
})
|
||||||
|
|
||||||
title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
|
title = self._html_extract_title(webpage)
|
||||||
thumbnail = self._html_search_regex(r'preview_url:\s+\'([^\']+)\'', webpage, 'thumbnail', default=None)
|
thumbnail = self._html_search_regex(r'preview_url:\s+\'([^\']+)\'', webpage, 'thumbnail', default=None)
|
||||||
duration = self._html_search_regex(r'"icon-clock"></i>\s+<span>((?:\d+:?)+)', webpage, 'duration', default=None)
|
duration = self._html_search_regex(r'"icon-clock"></i>\s+<span>((?:\d+:?)+)', webpage, 'duration', default=None)
|
||||||
|
|
||||||
|
|
|
@ -112,7 +112,7 @@ def _real_extract(self, url):
|
||||||
if smuggled_data.get('force_title'):
|
if smuggled_data.get('force_title'):
|
||||||
title = smuggled_data['force_title']
|
title = smuggled_data['force_title']
|
||||||
else:
|
else:
|
||||||
title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id)
|
title = self._html_extract_title(webpage)
|
||||||
poster = qs.get('poster')
|
poster = qs.get('poster')
|
||||||
thumbnail = poster[0] if poster else None
|
thumbnail = poster[0] if poster else None
|
||||||
|
|
||||||
|
|
|
@ -36,8 +36,7 @@ def _real_extract(self, url):
|
||||||
webpage = self._download_webpage(
|
webpage = self._download_webpage(
|
||||||
'http://www.sunporno.com/videos/%s' % video_id, video_id)
|
'http://www.sunporno.com/videos/%s' % video_id, video_id)
|
||||||
|
|
||||||
title = self._html_search_regex(
|
title = self._html_extract_title(webpage)
|
||||||
r'<title>([^<]+)</title>', webpage, 'title')
|
|
||||||
description = self._html_search_meta(
|
description = self._html_search_meta(
|
||||||
'description', webpage, 'description')
|
'description', webpage, 'description')
|
||||||
thumbnail = self._html_search_regex(
|
thumbnail = self._html_search_regex(
|
||||||
|
|
|
@ -37,9 +37,7 @@ def _real_extract(self, url):
|
||||||
|
|
||||||
video_id = mobj.group('id')
|
video_id = mobj.group('id')
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
title = remove_end(self._html_search_regex(
|
title = remove_end(self._html_extract_title(webpage), ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站')
|
||||||
r'<title>([^<]+)</title>', webpage, 'title'),
|
|
||||||
' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站')
|
|
||||||
video_url = self._html_search_regex(
|
video_url = self._html_search_regex(
|
||||||
r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None)
|
r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None)
|
||||||
if video_url:
|
if video_url:
|
||||||
|
|
|
@ -24,8 +24,7 @@ def _real_extract(self, url):
|
||||||
name = mobj.group('movie') + '/' + mobj.group('trailer_name')
|
name = mobj.group('movie') + '/' + mobj.group('trailer_name')
|
||||||
webpage = self._download_webpage(url, name)
|
webpage = self._download_webpage(url, name)
|
||||||
|
|
||||||
title = self._search_regex(r'<title>(.+?)</title>',
|
title = self._html_extract_title(webpage, 'video title').replace(' - Trailer Addict', '')
|
||||||
webpage, 'video title').replace(' - Trailer Addict', '')
|
|
||||||
view_count_str = self._search_regex(
|
view_count_str = self._search_regex(
|
||||||
r'<span class="views_n">([0-9,.]+)</span>',
|
r'<span class="views_n">([0-9,.]+)</span>',
|
||||||
webpage, 'view count', fatal=False)
|
webpage, 'view count', fatal=False)
|
||||||
|
|
|
@ -42,8 +42,7 @@ def _real_extract(self, url):
|
||||||
video_url = self._search_regex(
|
video_url = self._search_regex(
|
||||||
r'<source[^>]+src="([^"]+)"', webpage, 'video url')
|
r'<source[^>]+src="([^"]+)"', webpage, 'video url')
|
||||||
|
|
||||||
title = remove_start(self._html_search_regex(
|
title = remove_start(self._html_extract_title(webpage), 'ویدیو ورزش 3 | ')
|
||||||
r'<title>([^<]+)</title>', webpage, 'title'), 'ویدیو ورزش 3 | ')
|
|
||||||
|
|
||||||
description = self._html_search_regex(
|
description = self._html_search_regex(
|
||||||
r'(?s)<div class="matn">(.+?)</div>',
|
r'(?s)<div class="matn">(.+?)</div>',
|
||||||
|
|
|
@ -50,8 +50,7 @@ def _real_extract(self, url):
|
||||||
'https://vshare.io/v/%s/width-650/height-430/1' % video_id,
|
'https://vshare.io/v/%s/width-650/height-430/1' % video_id,
|
||||||
video_id, headers={'Referer': url})
|
video_id, headers={'Referer': url})
|
||||||
|
|
||||||
title = self._html_search_regex(
|
title = self._html_extract_title(webpage)
|
||||||
r'<title>([^<]+)</title>', webpage, 'title')
|
|
||||||
title = title.split(' - ')[0]
|
title = title.split(' - ')[0]
|
||||||
|
|
||||||
error = self._html_search_regex(
|
error = self._html_search_regex(
|
||||||
|
|
|
@ -28,7 +28,7 @@ def _real_extract(self, url):
|
||||||
video_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
|
|
||||||
title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
|
title = self._html_extract_title(webpage)
|
||||||
video_json = self._parse_json(self._html_search_regex(r'sources:\s*(.+?]),', webpage, 'video'), video_id, transform_source=js_to_json)
|
video_json = self._parse_json(self._html_search_regex(r'sources:\s*(.+?]),', webpage, 'video'), video_id, transform_source=js_to_json)
|
||||||
formats = []
|
formats = []
|
||||||
for source in video_json:
|
for source in video_json:
|
||||||
|
|
|
@ -73,8 +73,7 @@ def _real_extract(self, url):
|
||||||
webpage = self._download_webpage(
|
webpage = self._download_webpage(
|
||||||
url, video_id, note='Revisiting webpage')
|
url, video_id, note='Revisiting webpage')
|
||||||
|
|
||||||
title = self._html_search_regex(
|
title = self._html_extract_title(webpage)
|
||||||
r'<title>(.+?)</title>', webpage, 'title')
|
|
||||||
|
|
||||||
video_formats = compat_parse_qs(self._search_regex(
|
video_formats = compat_parse_qs(self._search_regex(
|
||||||
r'video-sources=\\\"(.+?)\"', webpage, 'video_sources'))
|
r'video-sources=\\\"(.+?)\"', webpage, 'video_sources'))
|
||||||
|
|
|
@ -533,7 +533,7 @@ def _real_extract(self, url):
|
||||||
|
|
||||||
title = self._html_search_meta(
|
title = self._html_search_meta(
|
||||||
['og:title', 'twitter:title'], webpage, 'title', default=None
|
['og:title', 'twitter:title'], webpage, 'title', default=None
|
||||||
) or self._html_search_regex('<title>([^<]+)</title>', webpage, 'title')
|
) or self._html_extract_title(webpage)
|
||||||
|
|
||||||
if display_id == host:
|
if display_id == host:
|
||||||
# Headline page (w/ multiple BC playlists) ('news.yahoo.co.jp', 'headlines.yahoo.co.jp/videonews/', ...)
|
# Headline page (w/ multiple BC playlists) ('news.yahoo.co.jp', 'headlines.yahoo.co.jp/videonews/', ...)
|
||||||
|
|
|
@ -36,8 +36,7 @@ def _real_extract(self, url):
|
||||||
|
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
|
|
||||||
title = self._html_search_regex(
|
title = self._html_extract_title(webpage)
|
||||||
r'<title>(.+?)</title>', webpage, 'title')
|
|
||||||
|
|
||||||
formats = []
|
formats = []
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue