From 71dc18fa29263a1ff0472c23d81bfc8dd4422d48 Mon Sep 17 00:00:00 2001
From: Berkan Teber
Date: Thu, 22 Jun 2023 10:27:54 +0300
Subject: [PATCH] [extractor/youtube] Improve description parsing performance (#7315)

* The parsing is skipped when not needed
* The regex is improved by simulating atomic groups with lookaheads

Authored by: pukkandan, berkanteber
---
 yt_dlp/extractor/youtube.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index a90118680..ef9f1f11c 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -4346,15 +4346,21 @@ def process_language(container, base_url, lang_code, sub_name, query):
                     info[d_k] = parse_duration(query[k][0])
 
         # Youtube Music Auto-generated description
-        if video_description:
+        if (video_description or '').strip().endswith('\nAuto-generated by YouTube.'):
+            # XXX: Causes catastrophic backtracking if description has "·"
+            # E.g. https://www.youtube.com/watch?v=DoPaAxMQoiI
+            # Simulating atomic groups: (?P<a>[^xy]+)x => (?=(?P<a>[^xy]+))(?P=a)x
+            # reduces it, but does not fully fix it. https://regex101.com/r/8Ssf2h/2
             mobj = re.search(
                 r'''(?xs)
-                    (?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+
-                    (?P<album>[^\n]+)
+                    (?=(?P<track>[^\n·]+))(?P=track)·
+                    (?=(?P<artist>[^\n]+))(?P=artist)\n+
+                    (?=(?P<album>[^\n]+))(?P=album)\n
                     (?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?
                     (?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?
-                    (.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?
-                    .+\nAuto-generated\ by\ YouTube\.\s*$
+                    (.+?\nArtist\s*:\s*
+                        (?=(?P<clean_artist>[^\n]+))(?P=clean_artist)\n
+                    )?.+\nAuto-generated\ by\ YouTube\.\s*$
                 ''', video_description)
             if mobj:
                 release_year = mobj.group('release_year')