From 4bf912282a34b58b6b35d8f7e6be535770c89c76 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 29 Jul 2023 10:40:20 +1200 Subject: [PATCH] [networking] Remove dot segments during URL normalization (#7662) This implements RFC3986 5.2.4 remove_dot_segments during the URL normalization process. Closes #3355, #6526 Authored by: coletdjnz --- test/test_networking.py | 21 ++++++++++++++++++ test/test_utils.py | 42 ++++++++++++++++++++++++++++-------- yt_dlp/cookies.py | 6 +++--- yt_dlp/networking/_urllib.py | 7 +++--- yt_dlp/networking/common.py | 5 ++--- yt_dlp/utils/_legacy.py | 4 +++- yt_dlp/utils/_utils.py | 17 --------------- yt_dlp/utils/networking.py | 38 ++++++++++++++++++++++++++++++++ 8 files changed, 104 insertions(+), 36 deletions(-) diff --git a/test/test_networking.py b/test/test_networking.py index f0938ab91..684bf5f96 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -173,6 +173,12 @@ def do_GET(self): self.send_header('Location', self.path) self.send_header('Content-Length', '0') self.end_headers() + elif self.path == '/redirect_dotsegments': + self.send_response(301) + # redirect to /headers but with dot segments before + self.send_header('Location', '/a/b/./../../headers') + self.send_header('Content-Length', '0') + self.end_headers() elif self.path.startswith('/redirect_'): self._redirect() elif self.path.startswith('/method'): @@ -355,6 +361,21 @@ def test_percent_encode(self, handler): assert res.status == 200 res.close() + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_remove_dot_segments(self, handler): + with handler() as rh: + # This isn't a comprehensive test, + # but it should be enough to check whether the handler is removing dot segments + res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/a/b/./../../headers')) + assert res.status == 200 + assert res.url == f'http://127.0.0.1:{self.http_port}/headers' + res.close() + + res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_dotsegments')) + assert res.status == 200 + assert res.url == f'http://127.0.0.1:{self.http_port}/headers' + res.close() + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) def test_unicode_path_redirection(self, handler): with handler() as rh: diff --git a/test/test_utils.py b/test/test_utils.py index b36bc04c2..453a01a1c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -47,8 +47,6 @@ encode_base_n, encode_compat_str, encodeFilename, - escape_rfc3986, - escape_url, expand_path, extract_attributes, extract_basic_auth, @@ -132,7 +130,12 @@ xpath_text, xpath_with_ns, ) -from yt_dlp.utils.networking import HTTPHeaderDict +from yt_dlp.utils.networking import ( + HTTPHeaderDict, + escape_rfc3986, + normalize_url, + remove_dot_segments, +) class TestUtil(unittest.TestCase): @@ -933,24 +936,45 @@ def test_escape_rfc3986(self): self.assertEqual(escape_rfc3986('foo bar'), 'foo%20bar') self.assertEqual(escape_rfc3986('foo%20bar'), 'foo%20bar') - def test_escape_url(self): + def test_normalize_url(self): self.assertEqual( - escape_url('http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavré_FD.mp4'), + normalize_url('http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavré_FD.mp4'), 'http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavre%CC%81_FD.mp4' ) self.assertEqual( - escape_url('http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erklärt/Das-Erste/Video?documentId=22673108&bcastId=5290'), + normalize_url('http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erklärt/Das-Erste/Video?documentId=22673108&bcastId=5290'), 'http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erkl%C3%A4rt/Das-Erste/Video?documentId=22673108&bcastId=5290' ) self.assertEqual( - escape_url('http://тест.рф/фрагмент'), + normalize_url('http://тест.рф/фрагмент'), 'http://xn--e1aybc.xn--p1ai/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82' ) self.assertEqual( - escape_url('http://тест.рф/абв?абв=абв#абв'), + normalize_url('http://тест.рф/абв?абв=абв#абв'), 'http://xn--e1aybc.xn--p1ai/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2' ) - self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0') + self.assertEqual(normalize_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0') + + self.assertEqual(normalize_url('http://www.example.com/../a/b/../c/./d.html'), 'http://www.example.com/a/c/d.html') + + def test_remove_dot_segments(self): + self.assertEqual(remove_dot_segments('/a/b/c/./../../g'), '/a/g') + self.assertEqual(remove_dot_segments('mid/content=5/../6'), 'mid/6') + self.assertEqual(remove_dot_segments('/ad/../cd'), '/cd') + self.assertEqual(remove_dot_segments('/ad/../cd/'), '/cd/') + self.assertEqual(remove_dot_segments('/..'), '/') + self.assertEqual(remove_dot_segments('/./'), '/') + self.assertEqual(remove_dot_segments('/./a'), '/a') + self.assertEqual(remove_dot_segments('/abc/./.././d/././e/.././f/./../../ghi'), '/ghi') + self.assertEqual(remove_dot_segments('/'), '/') + self.assertEqual(remove_dot_segments('/t'), '/t') + self.assertEqual(remove_dot_segments('t'), 't') + self.assertEqual(remove_dot_segments(''), '') + self.assertEqual(remove_dot_segments('/../a/b/c'), '/a/b/c') + self.assertEqual(remove_dot_segments('../a'), 'a') + self.assertEqual(remove_dot_segments('./a'), 'a') + self.assertEqual(remove_dot_segments('.'), '') + self.assertEqual(remove_dot_segments('////'), '////') def test_js_to_json_vars_strings(self): self.assertDictEqual( diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 80428c747..157f5b0c2 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -33,7 +33,6 @@ from .utils import ( Popen, error_to_str, - escape_url, expand_path, is_path_like, sanitize_url, @@ -42,6 +41,7 @@ write_string, ) from .utils._utils import _YDLLogger +from .utils.networking import normalize_url CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'} @@ -1308,7 +1308,7 @@ def prepare_line(line): def get_cookie_header(self, url): """Generate a Cookie HTTP header for a given url""" - cookie_req = urllib.request.Request(escape_url(sanitize_url(url))) + cookie_req = urllib.request.Request(normalize_url(sanitize_url(url))) self.add_cookie_header(cookie_req) return cookie_req.get_header('Cookie') @@ -1317,7 +1317,7 @@ def get_cookies_for_url(self, url): # Policy `_now` attribute must be set before calling `_cookies_for_request` # Ref: https://github.com/python/cpython/blob/3.7/Lib/http/cookiejar.py#L1360 self._policy._now = self._now = int(time.time()) - return self._cookies_for_request(urllib.request.Request(escape_url(sanitize_url(url)))) + return self._cookies_for_request(urllib.request.Request(normalize_url(sanitize_url(url)))) def clear(self, *args, **kwargs): with contextlib.suppress(KeyError): diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index 3fe5fa52e..0c4794954 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -41,7 +41,8 @@ from ..dependencies import brotli from ..socks import ProxyError as SocksProxyError from ..socks import sockssocket -from ..utils import escape_url, update_url_query +from ..utils import update_url_query +from ..utils.networking import normalize_url SUPPORTED_ENCODINGS = ['gzip', 'deflate'] CONTENT_DECODE_ERRORS = [zlib.error, OSError] @@ -179,7 +180,7 @@ def http_request(self, req): # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09) # the code of this workaround has been moved here from YoutubeDL.urlopen() url = req.get_full_url() - url_escaped = escape_url(url) + url_escaped = normalize_url(url) # Substitute URL if any change after escaping if url != url_escaped: @@ -212,7 +213,7 @@ def http_response(self, req, resp): if location: # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 location = location.encode('iso-8859-1').decode() - location_escaped = escape_url(location) + location_escaped = normalize_url(location) if location != location_escaped: del resp.headers['Location'] resp.headers['Location'] = location_escaped diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index 3164df49b..792e062fd 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -27,10 +27,9 @@ classproperty, deprecation_warning, error_to_str, - escape_url, update_url_query, ) -from ..utils.networking import HTTPHeaderDict +from ..utils.networking import HTTPHeaderDict, normalize_url if typing.TYPE_CHECKING: RequestData = bytes | Iterable[bytes] | typing.IO | None @@ -372,7 +371,7 @@ def url(self, url): raise TypeError('url must be a string') elif url.startswith('//'): url = 'http:' + url - self._url = escape_url(url) + self._url = normalize_url(url) @property def method(self): diff --git a/yt_dlp/utils/_legacy.py b/yt_dlp/utils/_legacy.py index 077000971..dde02092c 100644 --- a/yt_dlp/utils/_legacy.py +++ b/yt_dlp/utils/_legacy.py @@ -8,6 +8,8 @@ import zlib from ._utils import Popen, decode_base_n, preferredencoding +from .networking import escape_rfc3986 # noqa: F401 +from .networking import normalize_url as escape_url # noqa: F401 from .traversal import traverse_obj from ..dependencies import certifi, websockets from ..networking._helper import make_ssl_context @@ -197,7 +199,7 @@ def request_to_url(req): def sanitized_Request(url, *args, **kwargs): - from ..utils import escape_url, extract_basic_auth, sanitize_url + from ..utils import extract_basic_auth, sanitize_url url, auth_header = extract_basic_auth(escape_url(sanitize_url(url))) if auth_header is not None: headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {}) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index abae0f17e..f5552ce80 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -2464,23 +2464,6 @@ def lowercase_escape(s): s) -def escape_rfc3986(s): - """Escape non-ASCII characters as suggested by RFC 3986""" - return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]") - - -def escape_url(url): - """Escape URL as suggested by RFC 3986""" - url_parsed = urllib.parse.urlparse(url) - return url_parsed._replace( - netloc=url_parsed.netloc.encode('idna').decode('ascii'), - path=escape_rfc3986(url_parsed.path), - params=escape_rfc3986(url_parsed.params), - query=escape_rfc3986(url_parsed.query), - fragment=escape_rfc3986(url_parsed.fragment) - ).geturl() - - def parse_qs(url, **kwargs): return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs) diff --git a/yt_dlp/utils/networking.py b/yt_dlp/utils/networking.py index e6515ec8e..bbcea84d2 100644 --- a/yt_dlp/utils/networking.py +++ b/yt_dlp/utils/networking.py @@ -121,3 +121,41 @@ def clean_headers(headers: HTTPHeaderDict): if 'Youtubedl-No-Compression' in headers: # compat del headers['Youtubedl-No-Compression'] headers['Accept-Encoding'] = 'identity' + + +def remove_dot_segments(path): + # Implements RFC3986 5.2.4 remote_dot_segments + # Pseudo-code: https://tools.ietf.org/html/rfc3986#section-5.2.4 + # https://github.com/urllib3/urllib3/blob/ba49f5c4e19e6bca6827282feb77a3c9f937e64b/src/urllib3/util/url.py#L263 + output = [] + segments = path.split('/') + for s in segments: + if s == '.': + continue + elif s == '..': + if output: + output.pop() + else: + output.append(s) + if not segments[0] and (not output or output[0]): + output.insert(0, '') + if segments[-1] in ('.', '..'): + output.append('') + return '/'.join(output) + + +def escape_rfc3986(s): + """Escape non-ASCII characters as suggested by RFC 3986""" + return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]") + + +def normalize_url(url): + """Normalize URL as suggested by RFC 3986""" + url_parsed = urllib.parse.urlparse(url) + return url_parsed._replace( + netloc=url_parsed.netloc.encode('idna').decode('ascii'), + path=escape_rfc3986(remove_dot_segments(url_parsed.path)), + params=escape_rfc3986(url_parsed.params), + query=escape_rfc3986(url_parsed.query), + fragment=escape_rfc3986(url_parsed.fragment) + ).geturl()