mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-10 00:25:06 +00:00
Do not override stdlib html parser 'locatestarttagend' regex (fixes #4081)
'<a href="foo" ><img src="bar" / ></a>' wouldn't be parsed correctly (the problem was '/ >'; '/>' worked fine). We still need to override the regex on Python 2.6 (otherwise, for example, the description of YouTube videos wouldn't be extracted).
This commit is contained in:
parent
ac35c26686
commit
4f195f55f0
1 changed files with 3 additions and 1 deletions
|
@ -152,7 +152,9 @@ def xpath_text(node, xpath, name=None, fatal=False):
|
||||||
return n.text
|
return n.text
|
||||||
|
|
||||||
|
|
||||||
|
if sys.version_info < (2, 7):
|
||||||
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
|
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
|
||||||
|
|
||||||
class BaseHTMLParser(compat_html_parser.HTMLParser):
|
class BaseHTMLParser(compat_html_parser.HTMLParser):
|
||||||
def __init(self):
|
def __init(self):
|
||||||
compat_html_parser.HTMLParser.__init__(self)
|
compat_html_parser.HTMLParser.__init__(self)
|
||||||
|
|
Loading…
Reference in a new issue