From c2d2ee40eb168ef9f433b645271a55d821c327e5 Mon Sep 17 00:00:00 2001 From: "Lesmiscore (Naoya Ozaki)" Date: Wed, 23 Mar 2022 15:28:53 +0900 Subject: [PATCH] [generic] Extract subtitles from video.js (#3156) Authored by: Lesmiscore --- yt_dlp/extractor/generic.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 97e34808f..4a2e30158 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -17,6 +17,7 @@ ) from ..utils import ( determine_ext, + dict_get, ExtractorError, float_or_none, HEADRequest, @@ -31,6 +32,7 @@ parse_resolution, sanitized_Request, smuggle_url, + str_or_none, unescapeHTML, unified_timestamp, unsmuggle_url, @@ -3778,11 +3780,12 @@ def _real_extract(self, url): # Video.js embed mobj = re.search( - r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;', + r'(?s)\bvideojs\s*\(.+?([a-zA-Z0-9_$]+)\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;', webpage) if mobj is not None: + varname = mobj.group(1) sources = self._parse_json( - mobj.group(1), video_id, transform_source=js_to_json, + mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or [] if not isinstance(sources, list): sources = [sources] @@ -3819,6 +3822,21 @@ def _real_extract(self, url): 'Referer': full_response.geturl(), }, }) + # https://docs.videojs.com/player#addRemoteTextTrack + # https://html.spec.whatwg.org/multipage/media.html#htmltrackelement + for sub_match in re.finditer(rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage): + sub = self._parse_json( + sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {} + src = str_or_none(sub.get('src')) + if not src: + continue + subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({ + 'url': compat_urlparse.urljoin(url, src), + 'name': sub.get('label'), + 'http_headers': { + 'Referer': full_response.geturl(), + }, + }) if formats or subtitles: self.report_detected('video.js embed') self._sort_formats(formats)