diff --git a/README.md b/README.md index 98c737118..c57cabf6b 100644 --- a/README.md +++ b/README.md @@ -1085,7 +1085,7 @@ # OUTPUT TEMPLATE 1. **Default**: A literal default value can be specified for when the field is empty using a `|` separator. This overrides `--output-na-template`. Eg: `%(uploader|Unknown)s` -1. **More Conversions**: In addition to the normal format types `diouxXeEfFgGcrs`, `B`, `j`, `l`, `q` can be used for converting to **B**ytes, **j**son (flag `#` for pretty-printing), a comma separated **l**ist (flag `#` for `\n` newline-separated) and a string **q**uoted for the terminal (flag `#` to split a list into different arguments), respectively +1. **More Conversions**: In addition to the normal format types `diouxXeEfFgGcrs`, `B`, `j`, `l`, `q`, `D`, `F` can be used for converting to **B**ytes, **j**son (flag `#` for pretty-printing), a comma separated **l**ist (flag `#` for `\n` newline-separated), a string **q**uoted for the terminal (flag `#` to split a list into different arguments), to add **D**ecimal suffixes (Eg: 10M), and to sanitize as **F**ilename (flag `#` for restricted), respectively 1. **Unicode normalization**: The format type `U` can be used for NFC [unicode normalization](https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize). The alternate form flag (`#`) changes the normalization to NFD and the conversion flag `+` can be used for NFKC/NFKD compatibility equivalence normalization. 
Eg: `%(title)+.100U` is NFKC diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 6c2530046..39d7e1ec5 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -717,6 +717,7 @@ def test(tmpl, expected, *, info=None, **params): test('%(id)s', '.abcd', info={'id': '.abcd'}) test('%(id)s', 'ab__cd', info={'id': 'ab__cd'}) test('%(id)s', ('ab:cd', 'ab -cd'), info={'id': 'ab:cd'}) + test('%(id.0)s', '-', info={'id': '--'}) # Invalid templates self.assertTrue(isinstance(YoutubeDL.validate_outtmpl('%(title)'), ValueError)) @@ -777,6 +778,10 @@ def expect_same_infodict(out): test('%(title5)#U', 'a\u0301e\u0301i\u0301 𝐀') test('%(title5)+U', 'áéí A') test('%(title5)+#U', 'a\u0301e\u0301i\u0301 A') + test('%(height)D', '1K') + test('%(height)5.2D', ' 1.08K') + test('%(title4).10F', ('foo \'bar\' ', 'foo \'bar\'#')) + test('%(title4)#F', 'foo_bar_test') if compat_os_name == 'nt': test('%(title4)q', ('"foo \\"bar\\" test"', "'foo _'bar_' test'")) test('%(formats.:.id)#q', ('"id 1" "id 2" "id 3"', "'id 1' 'id 2' 'id 3'")) @@ -808,6 +813,11 @@ def expect_same_infodict(out): test('%(width-100,height+width|def)s', 'def') test('%(timestamp-x>%H\\,%M\\,%S,timestamp>%H\\,%M\\,%S)s', '12,00,00') + # Replacement + test('%(id&foo)s.bar', 'foo.bar') + test('%(title&foo)s.bar', 'NA.bar') + test('%(title&foo|baz)s.bar', 'baz.bar') + # Laziness def gen(): yield from range(5) @@ -836,11 +846,6 @@ def gen(): test('%(title3)s', ('foo/bar\\test', 'foo_bar_test')) test('folder/%(title3)s', ('folder/foo/bar\\test', 'folder%sfoo_bar_test' % os.path.sep)) - # Replacement - test('%(id&foo)s.bar', 'foo.bar') - test('%(title&foo)s.bar', 'NA.bar') - test('%(title&foo|baz)s.bar', 'baz.bar') - def test_format_note(self): ydl = YoutubeDL() self.assertEqual(ydl._format_note({}), '') diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index be0a9c43d..277b24a47 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -67,6 +67,7 @@ float_or_none, format_bytes, format_field, 
+ format_decimal_suffix, formatSeconds, GeoRestrictedError, get_domain, @@ -1005,7 +1006,7 @@ def escape_outtmpl(outtmpl): def validate_outtmpl(cls, outtmpl): ''' @return None or Exception object ''' outtmpl = re.sub( - STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBU]'), + STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBUDF]'), lambda mobj: f'{mobj.group(0)[:-1]}s', cls._outtmpl_expandpath(outtmpl)) try: @@ -1021,8 +1022,12 @@ def _copy_infodict(info_dict): info_dict.pop(key, None) return info_dict - def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None): - """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict """ + def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False): + """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict + @param sanitize Whether to sanitize the output as a filename. + For backward compatibility, a function can also be passed + """ + info_dict.setdefault('epoch', int(time.time())) # keep epoch consistent once set info_dict = self._copy_infodict(info_dict) @@ -1043,7 +1048,7 @@ def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None): } TMPL_DICT = {} - EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBU]')) + EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBUDF]')) MATH_FUNCTIONS = { '+': float.__add__, '-': float.__sub__, @@ -1051,7 +1056,7 @@ def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None): # Field is of the form key1.key2... 
# where keys (except first) can be string, int or slice FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)') - MATH_FIELD_RE = r'''{field}|{num}'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?') + MATH_FIELD_RE = r'''(?:{field}|{num})'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?') MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys())) INTERNAL_FORMAT_RE = re.compile(r'''(?x) (?P-)? @@ -1107,6 +1112,13 @@ def get_value(mdict): na = self.params.get('outtmpl_na_placeholder', 'NA') + def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')): + return sanitize_filename(str(value), restricted=restricted, + is_id=re.search(r'(^|[_.])id(\.|$)', key)) + + sanitizer = sanitize if callable(sanitize) else filename_sanitizer + sanitize = bool(sanitize) + def _dumpjson_default(obj): if isinstance(obj, (set, LazyList)): return list(obj) @@ -1117,7 +1129,7 @@ def create_key(outer_mobj): return outer_mobj.group(0) key = outer_mobj.group('key') mobj = re.match(INTERNAL_FORMAT_RE, key) - initial_field = mobj.group('fields').split('.')[-1] if mobj else '' + initial_field = mobj.group('fields') if mobj else '' value, replacement, default = None, None, na while mobj: mobj = mobj.groupdict() @@ -1153,6 +1165,10 @@ def create_key(outer_mobj): # "+" = compatibility equivalence, "#" = NFD 'NF%s%s' % ('K' if '+' in flags else '', 'D' if '#' in flags else 'C'), value), str_fmt + elif fmt[-1] == 'D': # decimal suffix + value, fmt = format_decimal_suffix(value, f'%{fmt[:-1]}f%s' if fmt[:-1] else '%d%s'), 's' + elif fmt[-1] == 'F': # filename sanitization + value, fmt = filename_sanitizer(initial_field, value, restricted='#' in flags), str_fmt elif fmt[-1] == 'c': if value: value = str(value)[0] @@ -1169,7 +1185,7 @@ def create_key(outer_mobj): # So we convert it to repr first value, fmt = repr(value), str_fmt if fmt[-1] in 'csr': - value = sanitize(initial_field, value) + value = sanitizer(initial_field, 
value) key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format')) TMPL_DICT[key] = value @@ -1183,12 +1199,8 @@ def evaluate_outtmpl(self, outtmpl, info_dict, *args, **kwargs): def _prepare_filename(self, info_dict, tmpl_type='default'): try: - sanitize = lambda k, v: sanitize_filename( - compat_str(v), - restricted=self.params.get('restrictfilenames'), - is_id=(k == 'id' or k.endswith('_id'))) outtmpl = self._outtmpl_expandpath(self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])) - filename = self.evaluate_outtmpl(outtmpl, info_dict, sanitize) + filename = self.evaluate_outtmpl(outtmpl, info_dict, True) force_ext = OUTTMPL_TYPES.get(tmpl_type) if filename and force_ext is not None: diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 2919324c6..b1929f4db 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -2110,18 +2110,23 @@ def unsmuggle_url(smug_url, default=None): return url, data +def format_decimal_suffix(num, fmt='%d%s', *, factor=1000): + """ Formats numbers with decimal suffixes like K, M, etc. With factor=1024 the binary forms Ki, Mi, etc are used. Returns None if num is not a number """ + num, factor = float_or_none(num), float(factor) + if num is None: + return None + suffixes = ['', *'KMGTPEZY'] + # Clamp the exponent so that |num| < 1 cannot index the table from the end and huge values cannot run past it + exponent = 0 if num == 0 else max(0, min(int(math.log(abs(num), factor)), len(suffixes) - 1)) + suffix = suffixes[exponent] + if factor == 1024 and suffix: # unscaled values take no prefix, so plain bytes stay "B" and not "iB" + suffix += 'i' + converted = num / (factor ** exponent) + return fmt % (converted, suffix) + + def format_bytes(bytes): - if bytes is None: - return 'N/A' - if type(bytes) is str: - bytes = float(bytes) - if bytes == 0.0: - exponent = 0 - else: - exponent = int(math.log(bytes, 1024.0)) - suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent] - converted = float(bytes) / float(1024 ** exponent) - return '%.2f%s' % (converted, suffix) + return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A' def lookup_unit_table(unit_table, s):