[redditr] Fix extraction for URLs with query (closes #14495)

2024-11-26 02:55:17 +00:00 · 2017-10-15 03:38:34 +07:00 · 2017-10-15 03:38:34 +07:00 · 9bb2c7673e
commit 9bb2c7673e
parent 715534083d
1 changed files with 7 additions and 2 deletions
--- a/youtube_dl/extractor/reddit.py
+++ b/youtube_dl/extractor/reddit.py
@@ -1,5 +1,7 @@
 from __future__ import unicode_literals

+import re
+
 from .common import InfoExtractor
 from ..utils import (
    ExtractorError,
@@ -45,7 +47,7 @@ def _real_extract(self, url):


 class RedditRIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/]+)'
+    _VALID_URL = r'(?P<url>https?://(?:www\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/?#&]+))'
    _TESTS = [{
        'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/',
        'info_dict': {
@@ -83,10 +85,13 @@ class RedditRIE(InfoExtractor):
    }]

    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        url, video_id = mobj.group('url', 'id')
+
        video_id = self._match_id(url)

        data = self._download_json(
-            url + '.json', video_id)[0]['data']['children'][0]['data']
+            url + '/.json', video_id)[0]['data']['children'][0]['data']

        video_url = data['url']