[extractor] Use classmethod/property where possible

and refactor lazy extractors accordingly. This reduces the need to create extractor instances
2024-11-29 03:23:02 +00:00 · 2022-05-11 21:24:44 +05:30 · 2022-05-11 21:24:44 +05:30 · 82d020804d
commit 82d020804d
parent 7ddbf09c25
11 changed files with 188 additions and 167 deletions
--- a/devscripts/lazy_load_template.py
+++ b/devscripts/lazy_load_template.py
@@ -1,30 +1,28 @@
+import importlib
+import random
 import re

-from ..utils import bug_reports_message, write_string
+from ..utils import bug_reports_message, classproperty, write_string


 class LazyLoadMetaClass(type):
    def __getattr__(cls, name):
-        if '_real_class' not in cls.__dict__:
+        # "is_suitable" requires "_TESTS". However, they bloat the lazy_extractors
+        if '_real_class' not in cls.__dict__ and name not in ('is_suitable', 'get_testcases'):
            write_string(
                'WARNING: Falling back to normal extractor since lazy extractor '
-                f'{cls.__name__} does not have attribute {name}{bug_reports_message()}')
-        return getattr(cls._get_real_class(), name)
+                f'{cls.__name__} does not have attribute {name}{bug_reports_message()}\n')
+        return getattr(cls.real_class, name)


 class LazyLoadExtractor(metaclass=LazyLoadMetaClass):
-    _module = None
-    _WORKING = True
-
-    @classmethod
-    def _get_real_class(cls):
+    @classproperty
+    def real_class(cls):
        if '_real_class' not in cls.__dict__:
-            mod = __import__(cls._module, fromlist=(cls.__name__,))
-            cls._real_class = getattr(mod, cls.__name__)
+            cls._real_class = getattr(importlib.import_module(cls._module), cls.__name__)
        return cls._real_class

    def __new__(cls, *args, **kwargs):
-        real_cls = cls._get_real_class()
-        instance = real_cls.__new__(real_cls)
+        instance = cls.real_class.__new__(cls.real_class)
        instance.__init__(*args, **kwargs)
        return instance
--- a/devscripts/make_lazy_extractors.py
+++ b/devscripts/make_lazy_extractors.py
@@ -1,101 +1,125 @@
 #!/usr/bin/env python3
 import os
+import optparse
 import sys
 from inspect import getsource

 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

-lazy_extractors_filename = sys.argv[1] if len(sys.argv) > 1 else 'yt_dlp/extractor/lazy_extractors.py'
-if os.path.exists(lazy_extractors_filename):
+
+NO_ATTR = object()
+STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_WORKING', '_NETRC_MACHINE']
+CLASS_METHODS = [
+    'ie_key', 'working', 'description', 'suitable', '_match_valid_url', '_match_id', 'get_temp_id',
+]
+IE_TEMPLATE = '''
+class {name}({bases}):
+    _module = {module!r}
+'''
+with open('devscripts/lazy_load_template.py', encoding='utf-8') as f:
+    MODULE_TEMPLATE = f.read()
+
+
+def main():
+    parser = optparse.OptionParser(usage='%prog [OUTFILE.py]')
+    args = parser.parse_args()[1] or ['yt_dlp/extractor/lazy_extractors.py']
+    if len(args) != 1:
+        parser.error('Expected only an output filename')
+
+    lazy_extractors_filename = args[0]
+    if os.path.exists(lazy_extractors_filename):
        os.remove(lazy_extractors_filename)

-# Block plugins from loading
-plugins_dirname = 'ytdlp_plugins'
-plugins_blocked_dirname = 'ytdlp_plugins_blocked'
-if os.path.exists(plugins_dirname):
-    os.rename(plugins_dirname, plugins_blocked_dirname)
+    _ALL_CLASSES = get_all_ies()  # Must be before import

-from yt_dlp.extractor import _ALL_CLASSES
-from yt_dlp.extractor.common import InfoExtractor, SearchInfoExtractor
+    from yt_dlp.extractor.common import InfoExtractor, SearchInfoExtractor

-if os.path.exists(plugins_blocked_dirname):
-    os.rename(plugins_blocked_dirname, plugins_dirname)
+    DummyInfoExtractor = type('InfoExtractor', (InfoExtractor,), {'IE_NAME': NO_ATTR})
+    module_src = '\n'.join((
+        MODULE_TEMPLATE,
+        '    _module = None',
+        *extra_ie_code(DummyInfoExtractor),
+        '\nclass LazyLoadSearchExtractor(LazyLoadExtractor):\n    pass\n',
+        *build_ies(_ALL_CLASSES, (InfoExtractor, SearchInfoExtractor), DummyInfoExtractor),
+    ))

-with open('devscripts/lazy_load_template.py', encoding='utf-8') as f:
-    module_template = f.read()
-
-CLASS_PROPERTIES = ['ie_key', 'working', '_match_valid_url', 'suitable', '_match_id', 'get_temp_id']
-module_contents = [
-    module_template,
-    *[getsource(getattr(InfoExtractor, k)) for k in CLASS_PROPERTIES],
-    '\nclass LazyLoadSearchExtractor(LazyLoadExtractor):\n    pass\n']
-
-ie_template = '''
-class {name}({bases}):
-    _module = '{module}'
-'''
+    with open(lazy_extractors_filename, 'wt', encoding='utf-8') as f:
+        f.write(f'{module_src}\n')


-def get_base_name(base):
-    if base is InfoExtractor:
-        return 'LazyLoadExtractor'
-    elif base is SearchInfoExtractor:
-        return 'LazyLoadSearchExtractor'
-    else:
-        return base.__name__
+def get_all_ies():
+    PLUGINS_DIRNAME = 'ytdlp_plugins'
+    BLOCKED_DIRNAME = f'{PLUGINS_DIRNAME}_blocked'
+    if os.path.exists(PLUGINS_DIRNAME):
+        os.rename(PLUGINS_DIRNAME, BLOCKED_DIRNAME)
+    try:
+        from yt_dlp.extractor import _ALL_CLASSES
+    finally:
+        if os.path.exists(BLOCKED_DIRNAME):
+            os.rename(BLOCKED_DIRNAME, PLUGINS_DIRNAME)
+    return _ALL_CLASSES


-def build_lazy_ie(ie, name):
-    s = ie_template.format(
-        name=name,
-        bases=', '.join(map(get_base_name, ie.__bases__)),
-        module=ie.__module__)
+def extra_ie_code(ie, base=None):
+    for var in STATIC_CLASS_PROPERTIES:
+        val = getattr(ie, var)
+        if val != (getattr(base, var) if base else NO_ATTR):
+            yield f'    {var} = {val!r}'
+    yield ''
+
+    for name in CLASS_METHODS:
+        f = getattr(ie, name)
+        if not base or f.__func__ != getattr(base, name).__func__:
+            yield getsource(f)
+
+
+def build_ies(ies, bases, attr_base):
+    names = []
+    for ie in sort_ies(ies, bases):
+        yield build_lazy_ie(ie, ie.__name__, attr_base)
+        if ie in ies:
+            names.append(ie.__name__)
+
+    yield f'\n_ALL_CLASSES = [{", ".join(names)}]'
+
+
+def sort_ies(ies, ignored_bases):
+    """find the correct sorting and add the required base classes so that subclasses can be correctly created"""
+    classes, returned_classes = ies[:-1], set()
+    assert ies[-1].__name__ == 'GenericIE', 'Last IE must be GenericIE'
+    while classes:
+        for c in classes[:]:
+            bases = set(c.__bases__) - {object, *ignored_bases}
+            restart = False
+            for b in bases:
+                if b not in classes and b not in returned_classes:
+                    assert b.__name__ != 'GenericIE', 'Cannot inherit from GenericIE'
+                    classes.insert(0, b)
+                    restart = True
+            if restart:
+                break
+            if bases <= returned_classes:
+                yield c
+                returned_classes.add(c)
+                classes.remove(c)
+                break
+    yield ies[-1]
+
+
+def build_lazy_ie(ie, name, attr_base):
+    bases = ', '.join({
+        'InfoExtractor': 'LazyLoadExtractor',
+        'SearchInfoExtractor': 'LazyLoadSearchExtractor',
+    }.get(base.__name__, base.__name__) for base in ie.__bases__)
+
+    s = IE_TEMPLATE.format(name=name, module=ie.__module__, bases=bases)
    valid_url = getattr(ie, '_VALID_URL', None)
    if not valid_url and hasattr(ie, '_make_valid_url'):
        valid_url = ie._make_valid_url()
    if valid_url:
        s += f'    _VALID_URL = {valid_url!r}\n'
-    if not ie._WORKING:
-        s += '    _WORKING = False\n'
-    if ie.suitable.__func__ is not InfoExtractor.suitable.__func__:
-        s += f'\n{getsource(ie.suitable)}'
-    return s
+    return s + '\n'.join(extra_ie_code(ie, attr_base))


-# find the correct sorting and add the required base classes so that subclasses
-# can be correctly created
-classes = _ALL_CLASSES[:-1]
-ordered_cls = []
-while classes:
-    for c in classes[:]:
-        bases = set(c.__bases__) - {object, InfoExtractor, SearchInfoExtractor}
-        stop = False
-        for b in bases:
-            if b not in classes and b not in ordered_cls:
-                if b.__name__ == 'GenericIE':
-                    exit()
-                classes.insert(0, b)
-                stop = True
-        if stop:
-            break
-        if all(b in ordered_cls for b in bases):
-            ordered_cls.append(c)
-            classes.remove(c)
-            break
-ordered_cls.append(_ALL_CLASSES[-1])
-
-names = []
-for ie in ordered_cls:
-    name = ie.__name__
-    src = build_lazy_ie(ie, name)
-    module_contents.append(src)
-    if ie in _ALL_CLASSES:
-        names.append(name)
-
-module_contents.append(
-    '\n_ALL_CLASSES = [{}]'.format(', '.join(names)))
-
-module_src = '\n'.join(module_contents) + '\n'
-
-with open(lazy_extractors_filename, 'wt', encoding='utf-8') as f:
-    f.write(module_src)
+if __name__ == '__main__':
+    main()
--- a/devscripts/make_supportedsites.py
+++ b/devscripts/make_supportedsites.py
@@ -5,7 +5,7 @@

 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

-from yt_dlp.extractor import list_extractors
+from yt_dlp.extractor import list_extractor_classes


 def main():
@@ -14,7 +14,7 @@ def main():
    if len(args) != 1:
        parser.error('Expected an output filename')

-    out = '\n'.join(ie.description() for ie in list_extractors(None) if ie.IE_DESC is not False)
+    out = '\n'.join(ie.description() for ie in list_extractor_classes() if ie.IE_DESC is not False)

    with open(args[0], 'w', encoding='utf-8') as outf:
        outf.write(f'# Supported sites\n{out}\n')
--- a/supportedsites.md
+++ b/supportedsites.md
@@ -431,7 +431,6 @@ # Supported sites
 - **gem.cbc.ca**: [<abbr title="netrc machine"><em>cbcgem</em></abbr>]
 - **gem.cbc.ca:live**
 - **gem.cbc.ca:playlist**
- - **generic**: Generic downloader that works on some sites
 - **Gettr**
 - **GettrStreaming**
 - **Gfycat**
@@ -1553,3 +1552,4 @@ # Supported sites
 - **zingmp3:album**
 - **zoom**
 - **Zype**
+ - **generic**: Generic downloader that works on some sites
--- a/yt_dlp/__init__.py
+++ b/yt_dlp/__init__.py
@@ -11,7 +11,7 @@
 from .compat import compat_getpass, compat_os_name, compat_shlex_quote
 from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS
 from .downloader import FileDownloader
-from .extractor import list_extractors
+from .extractor import GenericIE, list_extractor_classes
 from .extractor.adobepass import MSO_INFO
 from .extractor.common import InfoExtractor
 from .options import parseOpts
@@ -76,14 +76,20 @@ def get_urls(urls, batchfile, verbose):
 def print_extractor_information(opts, urls):
    out = ''
    if opts.list_extractors:
-        for ie in list_extractors(opts.age_limit):
+        urls = dict.fromkeys(urls, False)
+        for ie in list_extractor_classes(opts.age_limit):
            out += ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie.working() else '') + '\n'
-            out += ''.join(f'  {url}\n' for url in filter(ie.suitable, urls))
+            if ie == GenericIE:
+                matched_urls = [url for url, matched in urls.items() if not matched]
+            else:
+                matched_urls = tuple(filter(ie.suitable, urls.keys()))
+                urls.update(dict.fromkeys(matched_urls, True))
+            out += ''.join(f'  {url}\n' for url in matched_urls)
    elif opts.list_extractor_descriptions:
        _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow')
        out = '\n'.join(
            ie.description(markdown=False, search_examples=_SEARCHES)
-            for ie in list_extractors(opts.age_limit) if ie.working() and ie.IE_DESC is not False) + '\n'
+            for ie in list_extractor_classes(opts.age_limit) if ie.working() and ie.IE_DESC is not False)
    elif opts.ap_list_mso:
        out = 'Supported TV Providers:\n%s\n' % render_table(
            ['mso', 'mso name'],
@@ -862,7 +868,7 @@ def main(argv=None):
        sys.exit(f'\nERROR: {e}')


-from .extractor import gen_extractors
+from .extractor import gen_extractors, list_extractors
 __all__ = [
    'main',
    'YoutubeDL',
--- a/yt_dlp/extractor/__init__.py
+++ b/yt_dlp/extractor/__init__.py
@@ -37,11 +37,17 @@ def gen_extractors():
    return [klass() for klass in gen_extractor_classes()]


-def list_extractors(age_limit):
+def list_extractor_classes(age_limit=None):
    """Return a list of extractors that are suitable for the given age, sorted by extractor name"""
-    return sorted(filter(
-        lambda ie: ie.is_suitable(age_limit),
-        gen_extractors()), key=lambda ie: ie.IE_NAME.lower())
+    yield from sorted(filter(
+        lambda ie: ie.is_suitable(age_limit) and ie != GenericIE,  # noqa: F405
+        gen_extractor_classes()), key=lambda ie: ie.IE_NAME.lower())
+    yield GenericIE  # noqa: F405
+
+
+def list_extractors(age_limit=None):
+    """Return a list of extractor instances that are suitable for the given age, sorted by extractor name"""
+    return [ie() for ie in list_extractor_classes(age_limit)]


 def get_info_extractor(ie_name):
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -40,6 +40,7 @@
    age_restricted,
    base_url,
    bug_reports_message,
+    classproperty,
    clean_html,
    determine_ext,
    determine_protocol,
@@ -710,9 +711,9 @@ def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

-    @property
-    def IE_NAME(self):
-        return type(self).__name__[:-2]
+    @classproperty
+    def IE_NAME(cls):
+        return cls.__name__[:-2]

    @staticmethod
    def __can_accept_status_code(err, expected_status):
@@ -3624,56 +3625,57 @@ def _apply_first_set_cookie_header(self, url_handle, cookie):
                self._set_cookie(domain, cookie, value)
                break

-    def get_testcases(self, include_onlymatching=False):
-        t = getattr(self, '_TEST', None)
+    @classmethod
+    def get_testcases(cls, include_onlymatching=False):
+        t = getattr(cls, '_TEST', None)
        if t:
-            assert not hasattr(self, '_TESTS'), \
-                '%s has _TEST and _TESTS' % type(self).__name__
+            assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
            tests = [t]
        else:
-            tests = getattr(self, '_TESTS', [])
+            tests = getattr(cls, '_TESTS', [])
        for t in tests:
            if not include_onlymatching and t.get('only_matching', False):
                continue
-            t['name'] = type(self).__name__[:-len('IE')]
+            t['name'] = cls.ie_key()
            yield t

-    def is_suitable(self, age_limit):
+    @classmethod
+    def is_suitable(cls, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """

        any_restricted = False
-        for tc in self.get_testcases(include_onlymatching=False):
+        for tc in cls.get_testcases(include_onlymatching=False):
            if tc.get('playlist', []):
                tc = tc['playlist'][0]
-            is_restricted = age_restricted(
-                tc.get('info_dict', {}).get('age_limit'), age_limit)
+            is_restricted = age_restricted(tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
                return True
            any_restricted = any_restricted or is_restricted
        return not any_restricted

-    def description(self, *, markdown=True, search_examples=None):
+    @classmethod
+    def description(cls, *, markdown=True, search_examples=None):
        """Description of the extractor"""
        desc = ''
-        if self._NETRC_MACHINE:
+        if cls._NETRC_MACHINE:
            if markdown:
-                desc += f' [<abbr title="netrc machine"><em>{self._NETRC_MACHINE}</em></abbr>]'
+                desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
            else:
-                desc += f' [{self._NETRC_MACHINE}]'
-        if self.IE_DESC is False:
+                desc += f' [{cls._NETRC_MACHINE}]'
+        if cls.IE_DESC is False:
            desc += ' [HIDDEN]'
-        elif self.IE_DESC:
-            desc += f' {self.IE_DESC}'
-        if self.SEARCH_KEY:
-            desc += f'; "{self.SEARCH_KEY}:" prefix'
+        elif cls.IE_DESC:
+            desc += f' {cls.IE_DESC}'
+        if cls.SEARCH_KEY:
+            desc += f'; "{cls.SEARCH_KEY}:" prefix'
            if search_examples:
                _COUNTS = ('', '5', '10', 'all')
-                desc += f' (Example: "{self.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
-        if not self.working():
+                desc += f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
+        if not cls.working():
            desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'

-        name = f' - **{self.IE_NAME}**' if markdown else self.IE_NAME
+        name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
        return f'{name}:{desc}' if desc else name

    def extract_subtitles(self, *args, **kwargs):
@@ -3849,6 +3851,6 @@ def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

-    @property
-    def SEARCH_KEY(self):
-        return self._SEARCH_KEY
+    @classproperty
+    def SEARCH_KEY(cls):
+        return cls._SEARCH_KEY
--- a/yt_dlp/extractor/drtv.py
+++ b/yt_dlp/extractor/drtv.py
@@ -18,6 +18,7 @@
    url_or_none,
 )

+
 class DRTVIE(InfoExtractor):
    _VALID_URL = r'''(?x)
                    https?://
--- a/yt_dlp/extractor/testurl.py
+++ b/yt_dlp/extractor/testurl.py
@@ -8,55 +8,36 @@ class TestURLIE(InfoExtractor):
    """ Allows addressing of the test cases as test:yout.*be_1 """

    IE_DESC = False  # Do not list
-    _VALID_URL = r'test(?:url)?:(?P<id>(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?)$'
+    _VALID_URL = r'test(?:url)?:(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?$'

    def _real_extract(self, url):
-        from ..extractor import gen_extractors
+        from ..extractor import gen_extractor_classes

-        mobj = self._match_valid_url(url)
-        video_id = mobj.group('id')
-        extractor_id = mobj.group('extractor')
-        all_extractors = gen_extractors()
+        extractor_id, num = self._match_valid_url(url).group('extractor', 'num')

        rex = re.compile(extractor_id, flags=re.IGNORECASE)
-        matching_extractors = [
-            e for e in all_extractors if rex.search(e.IE_NAME)]
+        matching_extractors = [e for e in gen_extractor_classes() if rex.search(e.IE_NAME)]

        if len(matching_extractors) == 0:
-            raise ExtractorError(
-                'No extractors matching %r found' % extractor_id,
-                expected=True)
+            raise ExtractorError(f'No extractors matching {extractor_id!r} found', expected=True)
        elif len(matching_extractors) > 1:
-            # Is it obvious which one to pick?
-            try:
+            try:  # Check for exact match
                extractor = next(
                    ie for ie in matching_extractors
                    if ie.IE_NAME.lower() == extractor_id.lower())
            except StopIteration:
                raise ExtractorError(
-                    ('Found multiple matching extractors: %s' %
-                        ' '.join(ie.IE_NAME for ie in matching_extractors)),
+                    'Found multiple matching extractors: %s' % ' '.join(ie.IE_NAME for ie in matching_extractors),
                    expected=True)
        else:
            extractor = matching_extractors[0]

-        num_str = mobj.group('num')
-        num = int(num_str) if num_str else 0
-
-        testcases = []
-        t = getattr(extractor, '_TEST', None)
-        if t:
-            testcases.append(t)
-        testcases.extend(getattr(extractor, '_TESTS', []))
-
+        testcases = tuple(extractor.get_testcases(True))
        try:
-            tc = testcases[num]
+            tc = testcases[int(num or 0)]
        except IndexError:
            raise ExtractorError(
-                ('Test case %d not found, got only %d tests' %
-                    (num, len(testcases))),
-                expected=True)
+                f'Test case {num or 0} not found, got only {len(testcases)} tests', expected=True)

-        self.to_screen('Test URL: %s' % tc['url'])
-
-        return self.url_result(tc['url'], video_id=video_id)
+        self.to_screen(f'Test URL: {tc["url"]}')
+        return self.url_result(tc['url'])
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -31,6 +31,7 @@
    NO_DEFAULT,
    ExtractorError,
    bug_reports_message,
+    classproperty,
    clean_html,
    datetime_from_str,
    dict_get,
@@ -5781,16 +5782,17 @@ def _real_extract(self, url):
 class YoutubeFeedsInfoExtractor(InfoExtractor):
    """
    Base class for feed extractors
-    Subclasses must define the _FEED_NAME property.
+    Subclasses must re-define the _FEED_NAME property.
    """
    _LOGIN_REQUIRED = True
+    _FEED_NAME = 'feeds'

    def _real_initialize(self):
        YoutubeBaseInfoExtractor._check_login_required(self)

-    @property
+    @classproperty
    def IE_NAME(self):
-        return 'youtube:%s' % self._FEED_NAME
+        return f'youtube:{self._FEED_NAME}'

    def _real_extract(self, url):
        return self.url_result(
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -5321,6 +5321,7 @@ def merge_headers(*dicts):

 class classproperty:
    def __init__(self, f):
+        functools.update_wrapper(self, f)
        self.f = f

    def __get__(self, _, cls):