mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-12-22 06:00:00 +00:00
Add pre-processor stage after_filter
* Move `_match_entry` and `post_extract` to `process_video_result`. They are also left in `process_info` for API compatibility
* `--list-...` options and `--force-write-archive` now obey filtering options
* Move `SponsorBlockPP` to `after_filter`. Closes https://github.com/yt-dlp/yt-dlp/issues/2536
* Reverts 4ec82a72bb
since this commit addresses the issue it was solving
This commit is contained in:
parent
1108613f02
commit
09b49e1f68
6 changed files with 39 additions and 58 deletions
20
README.md
20
README.md
|
@ -982,15 +982,17 @@ ## Post-Processing Options:
|
||||||
semicolon ";" delimited list of NAME=VALUE.
|
semicolon ";" delimited list of NAME=VALUE.
|
||||||
The "when" argument determines when the
|
The "when" argument determines when the
|
||||||
postprocessor is invoked. It can be one of
|
postprocessor is invoked. It can be one of
|
||||||
"pre_process" (after extraction),
|
"pre_process" (after video extraction),
|
||||||
"before_dl" (before video download),
|
"after_filter" (after video passes filter),
|
||||||
"post_process" (after video download;
|
"before_dl" (before each video download),
|
||||||
default), "after_move" (after moving file
|
"post_process" (after each video download;
|
||||||
to their final locations), "after_video"
|
default), "after_move" (after moving video
|
||||||
(after downloading and processing all
|
file to its final location),
|
||||||
formats of a video), or "playlist" (end of
|
"after_video" (after downloading and
|
||||||
playlist). This option can be used multiple
|
processing all formats of a video), or
|
||||||
times to add different postprocessors
|
"playlist" (at end of playlist). This
|
||||||
|
option can be used multiple times to add
|
||||||
|
different postprocessors
|
||||||
|
|
||||||
## SponsorBlock Options:
|
## SponsorBlock Options:
|
||||||
Make chapter entries for, or remove various segments (sponsor,
|
Make chapter entries for, or remove various segments (sponsor,
|
||||||
|
|
|
@ -30,9 +30,7 @@ def __init__(self, *args, **kwargs):
|
||||||
self.msgs = []
|
self.msgs = []
|
||||||
|
|
||||||
def process_info(self, info_dict):
|
def process_info(self, info_dict):
|
||||||
info_dict = info_dict.copy()
|
self.downloaded_info_dicts.append(info_dict.copy())
|
||||||
info_dict.pop('__original_infodict', None)
|
|
||||||
self.downloaded_info_dicts.append(info_dict)
|
|
||||||
|
|
||||||
def to_screen(self, msg):
|
def to_screen(self, msg):
|
||||||
self.msgs.append(msg)
|
self.msgs.append(msg)
|
||||||
|
@ -898,20 +896,6 @@ def run(self, info):
|
||||||
os.unlink(filename)
|
os.unlink(filename)
|
||||||
|
|
||||||
def test_match_filter(self):
|
def test_match_filter(self):
|
||||||
class FilterYDL(YDL):
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
super(FilterYDL, self).__init__(*args, **kwargs)
|
|
||||||
self.params['simulate'] = True
|
|
||||||
|
|
||||||
def process_info(self, info_dict):
|
|
||||||
super(YDL, self).process_info(info_dict)
|
|
||||||
|
|
||||||
def _match_entry(self, info_dict, incomplete=False):
|
|
||||||
res = super(FilterYDL, self)._match_entry(info_dict, incomplete)
|
|
||||||
if res is None:
|
|
||||||
self.downloaded_info_dicts.append(info_dict.copy())
|
|
||||||
return res
|
|
||||||
|
|
||||||
first = {
|
first = {
|
||||||
'id': '1',
|
'id': '1',
|
||||||
'url': TEST_URL,
|
'url': TEST_URL,
|
||||||
|
@ -939,7 +923,7 @@ def _match_entry(self, info_dict, incomplete=False):
|
||||||
videos = [first, second]
|
videos = [first, second]
|
||||||
|
|
||||||
def get_videos(filter_=None):
|
def get_videos(filter_=None):
|
||||||
ydl = FilterYDL({'match_filter': filter_})
|
ydl = YDL({'match_filter': filter_, 'simulate': True})
|
||||||
for v in videos:
|
for v in videos:
|
||||||
ydl.process_ie_result(v, download=True)
|
ydl.process_ie_result(v, download=True)
|
||||||
return [v['id'] for v in ydl.downloaded_info_dicts]
|
return [v['id'] for v in ydl.downloaded_info_dicts]
|
||||||
|
|
|
@ -1037,8 +1037,7 @@ def validate_outtmpl(cls, outtmpl):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _copy_infodict(info_dict):
|
def _copy_infodict(info_dict):
|
||||||
info_dict = dict(info_dict)
|
info_dict = dict(info_dict)
|
||||||
for key in ('__original_infodict', '__postprocessors'):
|
info_dict.pop('__postprocessors', None)
|
||||||
info_dict.pop(key, None)
|
|
||||||
return info_dict
|
return info_dict
|
||||||
|
|
||||||
def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
|
def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
|
||||||
|
@ -2512,8 +2511,6 @@ def is_wellformed(f):
|
||||||
if '__x_forwarded_for_ip' in info_dict:
|
if '__x_forwarded_for_ip' in info_dict:
|
||||||
del info_dict['__x_forwarded_for_ip']
|
del info_dict['__x_forwarded_for_ip']
|
||||||
|
|
||||||
# TODO Central sorting goes here
|
|
||||||
|
|
||||||
if self.params.get('check_formats') is True:
|
if self.params.get('check_formats') is True:
|
||||||
formats = LazyList(self._check_formats(formats[::-1]), reverse=True)
|
formats = LazyList(self._check_formats(formats[::-1]), reverse=True)
|
||||||
|
|
||||||
|
@ -2526,6 +2523,12 @@ def is_wellformed(f):
|
||||||
|
|
||||||
info_dict, _ = self.pre_process(info_dict)
|
info_dict, _ = self.pre_process(info_dict)
|
||||||
|
|
||||||
|
if self._match_entry(info_dict) is not None:
|
||||||
|
return info_dict
|
||||||
|
|
||||||
|
self.post_extract(info_dict)
|
||||||
|
info_dict, _ = self.pre_process(info_dict, 'after_filter')
|
||||||
|
|
||||||
# The pre-processors may have modified the formats
|
# The pre-processors may have modified the formats
|
||||||
formats = info_dict.get('formats', [info_dict])
|
formats = info_dict.get('formats', [info_dict])
|
||||||
|
|
||||||
|
@ -2610,15 +2613,12 @@ def is_wellformed(f):
|
||||||
+ ', '.join([f['format_id'] for f in formats_to_download]))
|
+ ', '.join([f['format_id'] for f in formats_to_download]))
|
||||||
max_downloads_reached = False
|
max_downloads_reached = False
|
||||||
for i, fmt in enumerate(formats_to_download):
|
for i, fmt in enumerate(formats_to_download):
|
||||||
formats_to_download[i] = new_info = dict(info_dict)
|
formats_to_download[i] = new_info = self._copy_infodict(info_dict)
|
||||||
# Save a reference to the original info_dict so that it can be modified in process_info if needed
|
|
||||||
new_info.update(fmt)
|
new_info.update(fmt)
|
||||||
new_info['__original_infodict'] = info_dict
|
|
||||||
try:
|
try:
|
||||||
self.process_info(new_info)
|
self.process_info(new_info)
|
||||||
except MaxDownloadsReached:
|
except MaxDownloadsReached:
|
||||||
max_downloads_reached = True
|
max_downloads_reached = True
|
||||||
new_info.pop('__original_infodict')
|
|
||||||
# Remove copied info
|
# Remove copied info
|
||||||
for key, val in tuple(new_info.items()):
|
for key, val in tuple(new_info.items()):
|
||||||
if info_dict.get(key) == val:
|
if info_dict.get(key) == val:
|
||||||
|
@ -2826,7 +2826,7 @@ def existing_file(self, filepaths, *, default_overwrite=True):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def process_info(self, info_dict):
|
def process_info(self, info_dict):
|
||||||
"""Process a single resolved IE result. (Modified it in-place)"""
|
"""Process a single resolved IE result. (Modifies it in-place)"""
|
||||||
|
|
||||||
assert info_dict.get('_type', 'video') == 'video'
|
assert info_dict.get('_type', 'video') == 'video'
|
||||||
original_infodict = info_dict
|
original_infodict = info_dict
|
||||||
|
@ -2834,18 +2834,22 @@ def process_info(self, info_dict):
|
||||||
if 'format' not in info_dict and 'ext' in info_dict:
|
if 'format' not in info_dict and 'ext' in info_dict:
|
||||||
info_dict['format'] = info_dict['ext']
|
info_dict['format'] = info_dict['ext']
|
||||||
|
|
||||||
|
# This is mostly just for backward compatibility of process_info
|
||||||
|
# As a side-effect, this allows for format-specific filters
|
||||||
if self._match_entry(info_dict) is not None:
|
if self._match_entry(info_dict) is not None:
|
||||||
info_dict['__write_download_archive'] = 'ignore'
|
info_dict['__write_download_archive'] = 'ignore'
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Does nothing under normal operation - for backward compatibility of process_info
|
||||||
self.post_extract(info_dict)
|
self.post_extract(info_dict)
|
||||||
self._num_downloads += 1
|
|
||||||
|
|
||||||
# info_dict['_filename'] needs to be set for backward compatibility
|
# info_dict['_filename'] needs to be set for backward compatibility
|
||||||
info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
|
info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
|
||||||
temp_filename = self.prepare_filename(info_dict, 'temp')
|
temp_filename = self.prepare_filename(info_dict, 'temp')
|
||||||
files_to_move = {}
|
files_to_move = {}
|
||||||
|
|
||||||
|
self._num_downloads += 1
|
||||||
|
|
||||||
# Forced printings
|
# Forced printings
|
||||||
self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
|
self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
|
||||||
|
|
||||||
|
@ -3259,17 +3263,14 @@ def sanitize_info(info_dict, remove_private_keys=False):
|
||||||
return info_dict
|
return info_dict
|
||||||
info_dict.setdefault('epoch', int(time.time()))
|
info_dict.setdefault('epoch', int(time.time()))
|
||||||
info_dict.setdefault('_type', 'video')
|
info_dict.setdefault('_type', 'video')
|
||||||
remove_keys = {'__original_infodict'} # Always remove this since this may contain a copy of the entire dict
|
|
||||||
keep_keys = ['_type'] # Always keep this to facilitate load-info-json
|
|
||||||
if remove_private_keys:
|
if remove_private_keys:
|
||||||
remove_keys |= {
|
reject = lambda k, v: v is None or (k.startswith('_') and k != '_type') or k in {
|
||||||
'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries',
|
'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries',
|
||||||
'entries', 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber',
|
'entries', 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber',
|
||||||
}
|
}
|
||||||
reject = lambda k, v: k not in keep_keys and (
|
|
||||||
k.startswith('_') or k in remove_keys or v is None)
|
|
||||||
else:
|
else:
|
||||||
reject = lambda k, v: k in remove_keys
|
reject = lambda k, v: False
|
||||||
|
|
||||||
def filter_fn(obj):
|
def filter_fn(obj):
|
||||||
if isinstance(obj, dict):
|
if isinstance(obj, dict):
|
||||||
|
@ -3296,14 +3297,8 @@ def actual_post_extract(info_dict):
|
||||||
actual_post_extract(video_dict or {})
|
actual_post_extract(video_dict or {})
|
||||||
return
|
return
|
||||||
|
|
||||||
post_extractor = info_dict.get('__post_extractor') or (lambda: {})
|
post_extractor = info_dict.pop('__post_extractor', None) or (lambda: {})
|
||||||
extra = post_extractor().items()
|
info_dict.update(post_extractor())
|
||||||
info_dict.update(extra)
|
|
||||||
info_dict.pop('__post_extractor', None)
|
|
||||||
|
|
||||||
original_infodict = info_dict.get('__original_infodict') or {}
|
|
||||||
original_infodict.update(extra)
|
|
||||||
original_infodict.pop('__post_extractor', None)
|
|
||||||
|
|
||||||
actual_post_extract(info_dict or {})
|
actual_post_extract(info_dict or {})
|
||||||
|
|
||||||
|
|
|
@ -474,8 +474,8 @@ def report_unplayable_conflict(opt_name, arg, default=False, allowed=None):
|
||||||
'key': 'SponsorBlock',
|
'key': 'SponsorBlock',
|
||||||
'categories': sponsorblock_query,
|
'categories': sponsorblock_query,
|
||||||
'api': opts.sponsorblock_api,
|
'api': opts.sponsorblock_api,
|
||||||
# Run this immediately after extraction is complete
|
# Run this after filtering videos
|
||||||
'when': 'pre_process'
|
'when': 'after_filter'
|
||||||
})
|
})
|
||||||
if opts.parse_metadata:
|
if opts.parse_metadata:
|
||||||
postprocessors.append({
|
postprocessors.append({
|
||||||
|
|
|
@ -1550,11 +1550,11 @@ def _dict_from_options_callback(
|
||||||
'and (optionally) arguments to be passed to it, separated by a colon ":". '
|
'and (optionally) arguments to be passed to it, separated by a colon ":". '
|
||||||
'ARGS are a semicolon ";" delimited list of NAME=VALUE. '
|
'ARGS are a semicolon ";" delimited list of NAME=VALUE. '
|
||||||
'The "when" argument determines when the postprocessor is invoked. '
|
'The "when" argument determines when the postprocessor is invoked. '
|
||||||
'It can be one of "pre_process" (after extraction), '
|
'It can be one of "pre_process" (after video extraction), "after_filter" (after video passes filter), '
|
||||||
'"before_dl" (before video download), "post_process" (after video download; default), '
|
'"before_dl" (before each video download), "post_process" (after each video download; default), '
|
||||||
'"after_move" (after moving file to their final locations), '
|
'"after_move" (after moving video file to it\'s final locations), '
|
||||||
'"after_video" (after downloading and processing all formats of a video), '
|
'"after_video" (after downloading and processing all formats of a video), '
|
||||||
'or "playlist" (end of playlist). '
|
'or "playlist" (at end of playlist). '
|
||||||
'This option can be used multiple times to add different postprocessors'))
|
'This option can be used multiple times to add different postprocessors'))
|
||||||
|
|
||||||
sponsorblock = optparse.OptionGroup(parser, 'SponsorBlock Options', description=(
|
sponsorblock = optparse.OptionGroup(parser, 'SponsorBlock Options', description=(
|
||||||
|
|
|
@ -3166,7 +3166,7 @@ def q(qid):
|
||||||
return q
|
return q
|
||||||
|
|
||||||
|
|
||||||
POSTPROCESS_WHEN = {'pre_process', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
|
POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_OUTTMPL = {
|
DEFAULT_OUTTMPL = {
|
||||||
|
|
Loading…
Reference in a new issue