[utils] Improve get_elements_text_and_html_by_attribute regex (#2280)

Authored by: zmousm, pukkandan
2024-12-22 06:00:00 +00:00 · 2022-01-09 20:14:56 +02:00 · 2022-01-09 20:14:56 +02:00 · 0254f16274
commit 0254f16274
parent a70b71e85a
2 changed files with 15 additions and 16 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1659,10 +1659,10 @@ def test_get_elements_text_and_html_by_attribute(self):
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING

        self.assertEqual(
-            get_elements_text_and_html_by_attribute('class', 'foo bar', html),
+            list(get_elements_text_and_html_by_attribute('class', 'foo bar', html)),
            list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES)))
-        self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), [])
-        self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), [])
+        self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'foo', html)), [])
+        self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html)), [])

    GET_ELEMENT_BY_TAG_TEST_STRING = '''
    random text lorem ipsum</p>
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -473,24 +473,23 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value
    attribute in the passed HTML document
    """

+    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
+
    value = re.escape(value) if escape_value else value

-    retlist = []
-    for m in re.finditer(r'''(?xs)
+    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
-         (?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*?
-         \s+%(attribute)s(?:=%(value)s|\s*=\s*(?P<_q>['"]?)%(value)s(?P=_q))
-         (?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*?
-        \s*>
-    ''' % {'attribute': re.escape(attribute), 'value': value}, html):
+         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
+         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
+        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}
+
+    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

-        retlist.append((
-            unescapeHTML(re.sub(r'(?s)^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content)),
-            whole,
-        ))
-
-    return retlist
+        yield (
+            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
+            whole
+        )


 class HTMLBreakOnClosingTagParser(compat_HTMLParser):