Improve --download-sections

* Support negative time-ranges * Add `*from-url` to obey time-ranges in URL Closes #7248
2024-12-22 06:00:00 +00:00 · 2023-06-22 04:54:39 +05:30 · 2023-06-22 04:54:39 +05:30 · b4e0d75848
commit b4e0d75848
parent 71dc18fa29
5 changed files with 74 additions and 32 deletions
--- a/README.md
+++ b/README.md
@ -610,12 +610,14 @@ ## Download Options:
    --no-hls-use-mpegts             Do not use the mpegts container for HLS
                                    videos. This is default when not downloading
                                    live streams
-    --download-sections REGEX       Download only chapters whose title matches
-                                    the given regular expression. Time ranges
-                                    prefixed by a "*" can also be used in place
-                                    of chapters to download the specified range.
-                                    Needs ffmpeg. This option can be used
-                                    multiple times to download multiple
+    --download-sections REGEX       Download only chapters that match the
+                                    regular expression. A "*" prefix denotes
+                                    time-range instead of chapter. Negative
+                                    timestamps are calculated from the end.
+                                    "*from-url" can be used to download between
+                                    the "start_time" and "end_time" extracted
+                                    from the URL. Needs ffmpeg. This option can
+                                    be used multiple times to download multiple
                                    sections, e.g. --download-sections
                                    "*10:15-inf" --download-sections "intro"
    --downloader [PROTO:]NAME       Name or path of the external downloader to
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@ -2806,11 +2806,13 @@ def to_screen(*msg):
                new_info.update(fmt)
                offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf')
                end_time = offset + min(chapter.get('end_time', duration), duration)
+                # duration may not be accurate. So allow deviations <1sec
+                if end_time == float('inf') or end_time > offset + duration + 1:
+                    end_time = None
                if chapter or offset:
                    new_info.update({
                        'section_start': offset + chapter.get('start_time', 0),
-                        # duration may not be accurate. So allow deviations <1sec
-                        'section_end': end_time if end_time <= offset + duration + 1 else None,
+                        'section_end': end_time,
                        'section_title': chapter.get('title'),
                        'section_number': chapter.get('index'),
                    })
--- a/yt_dlp/init.py
+++ b/yt_dlp/init.py
@ -320,26 +320,49 @@ def validate_outtmpl(tmpl, msg):
        opts.skip_download = None
        del opts.outtmpl['default']

-    def parse_chapters(name, value):
-        chapters, ranges = [], []
+    def parse_chapters(name, value, advanced=False):
        parse_timestamp = lambda x: float('inf') if x in ('inf', 'infinite') else parse_duration(x)
+        TIMESTAMP_RE = r'''(?x)(?:
+            (?P<start_sign>-?)(?P<start>[^-]+)
+        )?\s*-\s*(?:
+            (?P<end_sign>-?)(?P<end>[^-]+)
+        )?'''
+
+        chapters, ranges, from_url = [], [], False
        for regex in value or []:
-            if regex.startswith('*'):
-                for range_ in map(str.strip, regex[1:].split(',')):
-                    mobj = range_ != '-' and re.fullmatch(r'([^-]+)?\s*-\s*([^-]+)?', range_)
-                    dur = mobj and (parse_timestamp(mobj.group(1) or '0'), parse_timestamp(mobj.group(2) or 'inf'))
-                    if None in (dur or [None]):
-                        raise ValueError(f'invalid {name} time range "{regex}". Must be of the form "*start-end"')
-                    ranges.append(dur)
+            if advanced and regex == '*from-url':
+                from_url = True
                continue
+            elif not regex.startswith('*'):
                try:
                    chapters.append(re.compile(regex))
                except re.error as err:
                    raise ValueError(f'invalid {name} regex "{regex}" - {err}')
-        return chapters, ranges
+                continue

-    opts.remove_chapters, opts.remove_ranges = parse_chapters('--remove-chapters', opts.remove_chapters)
-    opts.download_ranges = download_range_func(*parse_chapters('--download-sections', opts.download_ranges))
+            for range_ in map(str.strip, regex[1:].split(',')):
+                mobj = range_ != '-' and re.fullmatch(TIMESTAMP_RE, range_)
+                dur = mobj and [parse_timestamp(mobj.group('start') or '0'), parse_timestamp(mobj.group('end') or 'inf')]
+                signs = mobj and (mobj.group('start_sign'), mobj.group('end_sign'))
+
+                err = None
+                if None in (dur or [None]):
+                    err = 'Must be of the form "*start-end"'
+                elif not advanced and any(signs):
+                    err = 'Negative timestamps are not allowed'
+                else:
+                    dur[0] *= -1 if signs[0] else 1
+                    dur[1] *= -1 if signs[1] else 1
+                    if dur[1] == float('-inf'):
+                        err = '"-inf" is not a valid end'
+                if err:
+                    raise ValueError(f'invalid {name} time range "{regex}". {err}')
+                ranges.append(dur)
+
+        return chapters, ranges, from_url
+
+    opts.remove_chapters, opts.remove_ranges, _ = parse_chapters('--remove-chapters', opts.remove_chapters)
+    opts.download_ranges = download_range_func(*parse_chapters('--download-sections', opts.download_ranges, True))

    # Cookies from browser
    if opts.cookiesfrombrowser:
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@ -1012,8 +1012,9 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs):
        '--download-sections',
        metavar='REGEX', dest='download_ranges', action='append',
        help=(
-            'Download only chapters whose title matches the given regular expression. '
-            'Time ranges prefixed by a "*" can also be used in place of chapters to download the specified range. '
+            'Download only chapters that match the regular expression. '
+            'A "*" prefix denotes time-range instead of chapter. Negative timestamps are calculated from the end. '
+            '"*from-url" can be used to download between the "start_time" and "end_time" extracted from the URL. '
            'Needs ffmpeg. This option can be used multiple times to download multiple sections, '
            'e.g. --download-sections "*10:15-inf" --download-sections "intro"'))
    downloader.add_option(
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@ -3753,11 +3753,11 @@ def _match_func(info_dict, incomplete=False):


 class download_range_func:
-    def __init__(self, chapters, ranges):
-        self.chapters, self.ranges = chapters, ranges
+    def __init__(self, chapters, ranges, from_info=False):
+        self.chapters, self.ranges, self.from_info = chapters, ranges, from_info

    def __call__(self, info_dict, ydl):
-        if not self.ranges and not self.chapters:
+        if not any((self.ranges, self.chapters, self.from_info)):
            yield {}

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
@ -3770,7 +3770,21 @@ def __call__(self, info_dict, ydl):
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

-        yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
+        for start, end in self.ranges or []:
+            yield {
+                'start_time': self._handle_negative_timestamp(start, info_dict),
+                'end_time': self._handle_negative_timestamp(end, info_dict),
+            }
+
+        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
+            yield {
+                'start_time': info_dict.get('start_time'),
+                'end_time': info_dict.get('end_time'),
+            }
+
+    @staticmethod
+    def _handle_negative_timestamp(time, info):
+        return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time

    def __eq__(self, other):
        return (isinstance(other, download_range_func)