Mirror of https://github.com/yt-dlp/yt-dlp.git (synced 2024-11-25 02:45:12 +00:00)

[extractor] Framework for embed detection (#4307)

parent 47304e07dc
commit 8f97a15d1c

8 changed files with 149 additions and 77 deletions
devscripts/lazy_load_template.py
@@ -9,11 +9,13 @@
     write_string,
 )
 
+# These bloat the lazy_extractors, so allow them to passthrough silently
+ALLOWED_CLASSMETHODS = {'get_testcases', 'extract_from_webpage'}
+
 
 class LazyLoadMetaClass(type):
     def __getattr__(cls, name):
-        # "_TESTS" bloat the lazy_extractors
-        if '_real_class' not in cls.__dict__ and name != 'get_testcases':
+        if '_real_class' not in cls.__dict__ and name not in ALLOWED_CLASSMETHODS:
             write_string(
                 'WARNING: Falling back to normal extractor since lazy extractor '
                 f'{cls.__name__} does not have attribute {name}{bug_reports_message()}\n')
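A quick illustration of what the allow-list changes, as a hedged sketch (the access patterns are inferred from the metaclass logic; the generated module lives at yt_dlp/extractor/lazy_extractors.py):

    from yt_dlp.extractor.lazy_extractors import YoutubeIE  # lazy stub

    YoutubeIE.ie_key()              # baked into the stub; the real class is never imported
    YoutubeIE.get_testcases         # allow-listed: imports the real class silently
    YoutubeIE.extract_from_webpage  # also allow-listed as of this commit
    YoutubeIE._TESTS                # not allow-listed: prints the WARNING, then imports the real class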
devscripts/make_lazy_extractors.py
@@ -11,7 +11,7 @@
 from inspect import getsource
 
 NO_ATTR = object()
-STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_WORKING', '_NETRC_MACHINE', 'age_limit']
+STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_VALID_URL', '_WORKING', '_NETRC_MACHINE', 'age_limit']
 CLASS_METHODS = [
     'ie_key', 'working', 'description', 'suitable', '_match_valid_url', '_match_id', 'get_temp_id', 'is_suitable'
 ]
@@ -116,11 +116,6 @@ def build_lazy_ie(ie, name, attr_base):
     }.get(base.__name__, base.__name__) for base in ie.__bases__)
 
     s = IE_TEMPLATE.format(name=name, module=ie.__module__, bases=bases)
-    valid_url = getattr(ie, '_VALID_URL', None)
-    if not valid_url and hasattr(ie, '_make_valid_url'):
-        valid_url = ie._make_valid_url()
-    if valid_url:
-        s += f'    _VALID_URL = {valid_url!r}\n'
     return s + '\n'.join(extra_ie_code(ie, attr_base))
 
 
yt_dlp/YoutubeDL.py
@@ -1566,7 +1566,8 @@ def process_ie_result(self, ie_result, download=True, extra_info=None):
         result_type = ie_result.get('_type', 'video')
 
         if result_type in ('url', 'url_transparent'):
-            ie_result['url'] = sanitize_url(ie_result['url'])
+            ie_result['url'] = sanitize_url(
+                ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https')
             if ie_result.get('original_url'):
                 extra_info.setdefault('original_url', ie_result['original_url'])
yt_dlp/extractor/brightcove.py
@@ -402,11 +402,11 @@ class BrightcoveNewIE(AdobePassIE):
 
     @staticmethod
     def _extract_url(ie, webpage):
-        urls = BrightcoveNewIE._extract_urls(ie, webpage)
+        urls = BrightcoveNewIE._extract_brightcove_urls(ie, webpage)
         return urls[0] if urls else None
 
     @staticmethod
-    def _extract_urls(ie, webpage):
+    def _extract_brightcove_urls(ie, webpage):
         # Reference:
         # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
         # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag
yt_dlp/extractor/common.py
@@ -14,6 +14,7 @@
 import re
 import sys
 import time
+import types
 import urllib.parse
 import urllib.request
 import xml.etree.ElementTree
@@ -23,6 +24,7 @@
 from ..downloader import FileDownloader
 from ..downloader.f4m import get_base_url, remove_encrypted_media
 from ..utils import (
+    IDENTITY,
     JSON_LD_RE,
     NO_DEFAULT,
     ExtractorError,
@@ -59,6 +61,7 @@
     parse_m3u8_attributes,
     parse_resolution,
     sanitize_filename,
+    sanitize_url,
     sanitized_Request,
     str_or_none,
     str_to_int,
@@ -431,14 +434,26 @@ class InfoExtractor:
     title, description etc.
 
 
-    Subclasses of this should define a _VALID_URL regexp and, re-define the
-    _real_extract() and (optionally) _real_initialize() methods.
-    Probably, they should also be added to the list of extractors.
+    Subclasses of this should also be added to the list of extractors and
+    should define a _VALID_URL regexp and, re-define the _real_extract() and
+    (optionally) _real_initialize() methods.
 
     Subclasses may also override suitable() if necessary, but ensure the function
     signature is preserved and that this function imports everything it needs
     (except other extractors), so that lazy_extractors works correctly.
 
+    Subclasses can define a list of _EMBED_REGEX, which will be searched for in
+    the HTML of Generic webpages. It may also override _extract_embed_urls
+    or _extract_from_webpage as necessary. While these are normally classmethods,
+    _extract_from_webpage is allowed to be an instance method.
+
+    _extract_from_webpage may raise self.StopExtraction() to stop further
+    processing of the webpage and obtain exclusive rights to it. This is useful
+    when the extractor cannot reliably be matched using just the URL.
+    Eg: invidious/peertube instances
+
+    Embed-only extractors can be defined by setting _VALID_URL = False.
+
     To support username + password (or netrc) login, the extractor must define a
     _NETRC_MACHINE and re-define _perform_login(username, password) and
     (optionally) _initialize_pre_login() methods. The _perform_login method will
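The docstring addition above is the whole public surface of the new framework. A minimal sketch of an embed-capable extractor under this API; the class name, domain and patterns are hypothetical, only the attribute names come from the diff:

    class ExamplePlayerIE(InfoExtractor):
        _VALID_URL = r'https?://player\.example\.com/embed/(?P<id>\w+)'
        # Searched in the HTML of pages that fall through to the generic
        # extractor; each pattern must contain exactly one (?P<url>...) group
        _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://player\.example\.com/embed/[^"\']+)']

        def _real_extract(self, url):
            video_id = self._match_id(url)
            ...

Setting _VALID_URL = False instead would make this embed-only: the extractor would never match an input URL directly, but its embeds would still be detected.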
@@ -476,6 +491,8 @@ class InfoExtractor:
     _NETRC_MACHINE = None
     IE_DESC = None
     SEARCH_KEY = None
+    _VALID_URL = None
+    _EMBED_REGEX = []
 
     def _login_hint(self, method=NO_DEFAULT, netrc=None):
         password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
@@ -499,12 +516,12 @@ def __init__(self, downloader=None):
 
     @classmethod
     def _match_valid_url(cls, url):
+        if cls._VALID_URL is False:
+            return None
         # This does not use has/getattr intentionally - we want to know whether
         # we have cached the regexp for *this* class, whereas getattr would also
         # match the superclass
         if '_VALID_URL_RE' not in cls.__dict__:
-            if '_VALID_URL' not in cls.__dict__:
-                cls._VALID_URL = cls._make_valid_url()
             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
         return cls._VALID_URL_RE.match(url)
 
@@ -1143,10 +1160,12 @@ def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent
             'url': url,
         }
 
-    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
-        urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
-                for m in orderedSet(map(getter, matches) if getter else matches))
-        return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
+    @classmethod
+    def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
+                              getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
+        return cls.playlist_result(
+            (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
+            playlist_id, playlist_title, **kwargs)
 
     @staticmethod
     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
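Making playlist_from_matches a classmethod lets the new embed machinery call it without an instance, and the IDENTITY default for getter removes the old `if getter else matches` branch. Hedged usage sketch (names are illustrative):

    # inside some _real_extract(), turning matched embed URLs into a playlist
    urls = ExamplePlayerIE._extract_embed_urls(url, webpage)
    return self.playlist_from_matches(urls, video_id, video_title, ie=ExamplePlayerIE.ie_key())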
@@ -1353,12 +1372,20 @@ def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs
     def _dc_search_uploader(self, html):
         return self._html_search_meta('dc.creator', html, 'uploader')
 
-    def _rta_search(self, html):
+    @staticmethod
+    def _rta_search(html):
         # See http://www.rtalabel.org/index.php?content=howtofaq#single
         if re.search(r'(?ix)<meta\s+name="rating"\s+'
                      r'     content="RTA-5042-1996-1400-1577-RTA"',
                      html):
             return 18
+
+        # And then there are the jokers who advertise that they use RTA, but actually don't.
+        AGE_LIMIT_MARKERS = [
+            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
+        ]
+        if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
+            return 18
         return 0
 
     def _media_rating_search(self, html):
@@ -1965,14 +1992,9 @@ def http_scheme(self):
                 else 'https:')
 
     def _proto_relative_url(self, url, scheme=None):
-        if url is None:
-            return url
-        if url.startswith('//'):
-            if scheme is None:
-                scheme = self.http_scheme()
-            return scheme + url
-        else:
-            return url
+        scheme = scheme or self.http_scheme()
+        assert scheme.endswith(':')
+        return sanitize_url(url, scheme=scheme[:-1])
 
     def _sleep(self, timeout, video_id, msg_template=None):
         if msg_template is None:
@@ -3767,10 +3789,12 @@ def geo_verification_headers(self):
             headers['Ytdl-request-proxy'] = geo_verification_proxy
         return headers
 
-    def _generic_id(self, url):
+    @staticmethod
+    def _generic_id(url):
         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
 
-    def _generic_title(self, url):
+    @staticmethod
+    def _generic_title(url):
         return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
 
     @staticmethod
@@ -3816,6 +3840,37 @@ def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_l
         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
         return True
 
+    @classmethod
+    def extract_from_webpage(cls, ydl, url, webpage):
+        ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
+              else ydl.get_info_extractor(cls.ie_key()))
+        yield from ie._extract_from_webpage(url, webpage) or []
+
+    @classmethod
+    def _extract_from_webpage(cls, url, webpage):
+        for embed_url in orderedSet(
+                cls._extract_embed_urls(url, webpage) or [], lazy=True):
+            yield cls.url_result(embed_url, cls)
+
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        """@returns all the embed urls on the webpage"""
+        if '_EMBED_URL_RE' not in cls.__dict__:
+            assert isinstance(cls._EMBED_REGEX, (list, tuple))
+            for idx, regex in enumerate(cls._EMBED_REGEX):
+                assert regex.count('(?P<url>') == 1, \
+                    f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
+            cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
+
+        for regex in cls._EMBED_URL_RE:
+            for mobj in regex.finditer(webpage):
+                embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
+                if cls._VALID_URL is False or cls.suitable(embed_url):
+                    yield embed_url
+
+    class StopExtraction(Exception):
+        pass
+
+
 class SearchInfoExtractor(InfoExtractor):
     """
@@ -3826,8 +3881,8 @@ class SearchInfoExtractor(InfoExtractor):
 
     _MAX_RESULTS = float('inf')
 
-    @classmethod
-    def _make_valid_url(cls):
+    @classproperty
+    def _VALID_URL(cls):
         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 
     def _real_extract(self, query):
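For sites that cannot be recognized from the URL alone (the invidious/peertube case mentioned in the docstring), an extractor can override _extract_from_webpage and claim the page outright. A hedged sketch; the class and the fingerprint check are invented for illustration:

    class ExampleTubeIE(InfoExtractor):
        _VALID_URL = False  # embed-only: never matched directly against input URLs

        @classmethod
        def _extract_from_webpage(cls, url, webpage):
            # a real extractor would use a reliable page fingerprint here
            if '<meta name="generator" content="ExampleTube"' in webpage:
                yield cls.url_result(url, cls)
                # claim exclusive rights: GenericIE discards all other embeds
                raise cls.StopExtraction()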
yt_dlp/extractor/generic.py
@@ -3,6 +3,8 @@
 import urllib.parse
 import xml.etree.ElementTree
 
+from . import gen_extractor_classes
+from .common import InfoExtractor  # isort: split
 from .ant1newsgr import Ant1NewsGrEmbedIE
 from .anvato import AnvatoIE
 from .apa import APAIE
@@ -14,7 +16,6 @@
 from .brightcove import BrightcoveLegacyIE, BrightcoveNewIE
 from .channel9 import Channel9IE
 from .cloudflarestream import CloudflareStreamIE
-from .common import InfoExtractor
 from .commonprotocols import RtmpIE
 from .condenast import CondeNastIE
 from .dailymail import DailyMailIE
@@ -115,6 +116,7 @@
     determine_ext,
     dict_get,
     float_or_none,
+    format_field,
     int_or_none,
     is_html,
     js_to_json,
@@ -2641,8 +2643,15 @@ def report_following_redirect(self, new_url):
         """Report information extraction."""
         self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
 
-    def report_detected(self, name):
-        self._downloader.write_debug(f'Identified a {name}')
+    def report_detected(self, name, num=1, note=None):
+        if num > 1:
+            name += 's'
+        elif not num:
+            return
+        else:
+            num = 'a'
+
+        self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}')
 
     def _extract_rss(self, url, video_id, doc):
         NS_MAP = {
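Roughly, the reworked report_detected behaves as follows (illustrative calls; output goes through write_debug):

    self.report_detected('Camtasia video')      # Identified a Camtasia video
    self.report_detected('video embed', 3)      # Identified 3 video embeds
    self.report_detected('embed', 0)            # no output
    self.report_detected('exclusive embed', 2, 'discarding other embeds')
                                                # Identified 2 exclusive embeds; discarding other embeds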
@@ -2854,8 +2863,7 @@ def _real_extract(self, url):
 
         if not self.get_param('test', False) and not is_intentional:
             force = self.get_param('force_generic_extractor', False)
-            self.report_warning(
-                '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
+            self.report_warning('%s generic information extractor' % ('Forcing' if force else 'Falling back on'))
 
         first_bytes = full_response.read(512)
 
@@ -2933,6 +2941,22 @@ def _real_extract(self, url):
             self.report_detected('Camtasia video')
             return camtasia_res
 
+        info_dict.update({
+            # it's tempting to parse this further, but you would
+            # have to take into account all the variations like
+            #   Video Title - Site Name
+            #   Site Name | Video Title
+            #   Video Title - Tagline | Site Name
+            # and so on and so forth; it's just not practical
+            'title': (self._og_search_title(webpage, default=None)
+                      or self._html_extract_title(webpage, 'video title', default='video')),
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'age_limit': self._rta_search(webpage),
+        })
+
+        domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
+
         # Sometimes embedded video player is hidden behind percent encoding
         # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
         # Unescaping the whole page allows to handle those cases in a generic way
@@ -2946,40 +2970,12 @@ def _real_extract(self, url):
             r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',
             lambda x: unescapeHTML(x.group(0)), webpage)
 
-        # it's tempting to parse this further, but you would
-        # have to take into account all the variations like
-        #   Video Title - Site Name
-        #   Site Name | Video Title
-        #   Video Title - Tagline | Site Name
-        # and so on and so forth; it's just not practical
-        video_title = (self._og_search_title(webpage, default=None)
-                       or self._html_extract_title(webpage, 'video title', default='video'))
+        # TODO: Remove
+        video_title, video_description, video_thumbnail, age_limit, video_uploader = \
+            info_dict['title'], info_dict['description'], info_dict['thumbnail'], info_dict['age_limit'], domain_name
 
-        # Try to detect age limit automatically
-        age_limit = self._rta_search(webpage)
-        # And then there are the jokers who advertise that they use RTA,
-        # but actually don't.
-        AGE_LIMIT_MARKERS = [
-            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
-        ]
-        if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
-            age_limit = 18
-
-        # video uploader is domain name
-        video_uploader = self._search_regex(
-            r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
-
-        video_description = self._og_search_description(webpage, default=None)
-        video_thumbnail = self._og_search_thumbnail(webpage, default=None)
-
-        info_dict.update({
-            'title': video_title,
-            'description': video_description,
-            'thumbnail': video_thumbnail,
-            'age_limit': age_limit,
-        })
-
-        self._downloader.write_debug('Looking for video embeds')
+        # TODO: Move Embeds
+        self._downloader.write_debug('Looking for single embeds')
 
         # Look for Brightcove Legacy Studio embeds
         bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
@@ -2998,7 +2994,7 @@ def _real_extract(self, url):
         }
 
         # Look for Brightcove New Studio embeds
-        bc_urls = BrightcoveNewIE._extract_urls(self, webpage)
+        bc_urls = BrightcoveNewIE._extract_brightcove_urls(self, webpage)
         if bc_urls:
             return self.playlist_from_matches(
                 bc_urls, video_id, video_title,
@@ -3246,7 +3242,7 @@ def _real_extract(self, url):
             return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key())
 
         # Look for embedded Spotify player
-        spotify_urls = SpotifyBaseIE._extract_embed_urls(webpage)
+        spotify_urls = SpotifyBaseIE._extract_urls(webpage)
         if spotify_urls:
             return self.playlist_from_matches(spotify_urls, video_id, video_title)
 
@@ -3837,6 +3833,30 @@ def _real_extract(self, url):
         tiktok_urls = TikTokIE._extract_urls(webpage)
         if tiktok_urls:
             return self.playlist_from_matches(tiktok_urls, video_id, video_title)
+        # TODO: END: Move Embeds
+
+        self._downloader.write_debug('Looking for embeds')
+        embeds = []
+        for ie in gen_extractor_classes():
+            gen = ie.extract_from_webpage(self._downloader, url, webpage)
+            current_embeds = []
+            try:
+                while True:
+                    current_embeds.append(next(gen))
+            except self.StopExtraction:
+                self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds),
+                                     embeds and 'discarding other embeds')
+                embeds = current_embeds
+                break
+            except StopIteration:
+                self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds))
+                embeds.extend(current_embeds)
+
+        del current_embeds
+        if len(embeds) == 1:
+            return {**info_dict, **embeds[0]}
+        elif embeds:
+            return self.playlist_result(embeds, **info_dict)
+
         # Look for HTML5 media
         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
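The manual next() loop is what makes StopExtraction work: a plain for loop would swallow normal generator exhaustion, while driving the generator by hand distinguishes exhaustion from an extractor claiming the page, keeping the partially collected results in both cases. The same pattern in isolation, as a self-contained sketch with toy generators instead of real extractors:

    class StopExtraction(Exception):
        pass

    def shy():  # yields, then finishes normally
        yield 'https://a.example/1'
        yield 'https://a.example/2'

    def exclusive():  # yields, then claims the page
        yield 'https://b.example/only'
        raise StopExtraction()

    embeds = []
    for gen in (shy(), exclusive()):
        current = []
        try:
            while True:
                current.append(next(gen))  # StopIteration ends this; anything else propagates
        except StopExtraction:
            embeds = current   # discard everything collected from other generators
            break
        except StopIteration:
            embeds.extend(current)  # normal exhaustion: keep and continue

    print(embeds)  # ['https://b.example/only']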
@@ -4119,7 +4139,6 @@ def filter_video(urls):
                 entries.append(self.url_result(video_url, 'Youtube'))
                 continue
 
-            # here's a fun little line of code for you:
             video_id = os.path.splitext(video_id)[0]
             headers = {
                 'referer': full_response.geturl()
yt_dlp/extractor/spotify.py
@@ -98,7 +98,7 @@ def _extract_episode(self, episode, series):
         }
 
     @classmethod
-    def _extract_embed_urls(cls, webpage):
+    def _extract_urls(cls, webpage):
         return re.findall(
             r'<iframe[^>]+src="(https?://open\.spotify.com/embed/[^"]+)"',
             webpage)
yt_dlp/utils.py
@@ -705,13 +705,13 @@ def sanitize_path(s, force=False):
     return os.path.join(*sanitized_path)
 
 
-def sanitize_url(url):
+def sanitize_url(url, *, scheme='http'):
     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
     # the number of unwanted failures due to missing protocol
     if url is None:
         return
     elif url.startswith('//'):
-        return 'http:%s' % url
+        return f'{scheme}:{url}'
     # Fix some common typos seen so far
     COMMON_TYPOS = (
         # https://github.com/ytdl-org/youtube-dl/issues/15649
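With the new keyword-only scheme parameter, protocol-relative URLs can be upgraded to https; both _proto_relative_url and process_ie_result above now route through this. Expected behaviour, as a hedged sketch:

    from yt_dlp.utils import sanitize_url

    sanitize_url('//cdn.example.com/v.mp4')                  # 'http://cdn.example.com/v.mp4' (default unchanged)
    sanitize_url('//cdn.example.com/v.mp4', scheme='https')  # 'https://cdn.example.com/v.mp4'
    sanitize_url('https://example.com/v.mp4')                # scheme already present: only typo fixes apply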