[youtube:tab] Extract more playlist metadata (#2069)

* Add fields modified_date, modified_timestamp * Add field playlist_count * [youtube:tab] Extract view_count, playlist_count, modified_date Authored by: coletdjnz, pukkandan
2024-11-22 02:15:12 +00:00 · 2022-01-07 11:03:02 +00:00 · 2022-01-07 11:03:02 +00:00 · f0d785d3ed
commit f0d785d3ed
parent 97a6b117d9
4 changed files with 65 additions and 26 deletions
--- a/README.md
+++ b/README.md
@ -1120,8 +1120,10 @@ # OUTPUT TEMPLATE
 - `creator` (string): The creator of the video
 - `timestamp` (numeric): UNIX timestamp of the moment the video became available
 - `upload_date` (string): Video upload date (YYYYMMDD)
- - `release_date` (string): The date (YYYYMMDD) when the video was released
 - `release_timestamp` (numeric): UNIX timestamp of the moment the video was released
+ - `release_date` (string): The date (YYYYMMDD) when the video was released
+ - `modified_timestamp` (numeric): UNIX timestamp of the moment the video was last modified
+ - `modified_date` (string): The date (YYYYMMDD) when the video was last modified
 - `uploader_id` (string): Nickname or id of the video uploader
 - `channel` (string): Full name of the channel the video is uploaded on
 - `channel_id` (string): Id of the channel
@ -1167,6 +1169,7 @@ # OUTPUT TEMPLATE
 - `video_autonumber` (numeric): Number that will be increased with each video
 - `n_entries` (numeric): Total number of extracted items in the playlist
 - `playlist` (string): Name or id of the playlist that contains the video
+ - `playlist_count` (numeric): Total number of items in the playlist. May not be known if entire playlist is not extracted
 - `playlist_index` (numeric): Index of the video in the playlist padded with leading zeros according the final index
 - `playlist_autonumber` (numeric): Position of the video in the playlist download queue padded with leading zeros according to the total length of the playlist
 - `playlist_id` (string): Playlist identifier
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@ -1636,14 +1636,15 @@ def iter_playlistitems(format):
            playlistitems = orderedSet(iter_playlistitems(playlistitems_str))

        ie_entries = ie_result['entries']
-        msg = (
-            'Downloading %d videos' if not isinstance(ie_entries, list)
-            else 'Collected %d videos; downloading %%d of them' % len(ie_entries))
-
        if isinstance(ie_entries, list):
+            playlist_count = len(ie_entries)
+            msg = f'Collected {playlist_count} videos; downloading %d of them'
+            ie_result['playlist_count'] = ie_result.get('playlist_count') or playlist_count
+
            def get_entry(i):
                return ie_entries[i - 1]
        else:
+            msg = 'Downloading %d videos'
            if not isinstance(ie_entries, (PagedList, LazyList)):
                ie_entries = LazyList(ie_entries)

@ -1652,7 +1653,7 @@ def get_entry(i):
                    lambda self, i: ie_entries[i - 1]
                )(self, i)

-        entries = []
+        entries, broken = [], False
        items = playlistitems if playlistitems is not None else itertools.count(playliststart)
        for i in items:
            if i == 0:
@ -1674,6 +1675,7 @@ def get_entry(i):
                if entry is not None:
                    self._match_entry(entry, incomplete=True, silent=True)
            except (ExistingVideoReached, RejectedVideoReached):
+                broken = True
                break
        ie_result['entries'] = entries

@ -1684,6 +1686,9 @@ def get_entry(i):
            if entry is not None]
        n_entries = len(entries)

+        if not (ie_result.get('playlist_count') or broken or playlistitems or playlistend):
+            ie_result['playlist_count'] = n_entries
+
        if not playlistitems and (playliststart != 1 or playlistend):
            playlistitems = list(range(playliststart, playliststart + n_entries))
        ie_result['requested_entries'] = playlistitems
@ -1733,6 +1738,7 @@ def get_entry(i):
            extra = {
                'n_entries': n_entries,
                '_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
+                'playlist_count': ie_result.get('playlist_count'),
                'playlist_index': playlist_index,
                'playlist_autonumber': i,
                'playlist': playlist,
@ -2331,6 +2337,7 @@ def sanitize_numeric_fields(info):
        for ts_key, date_key in (
                ('timestamp', 'upload_date'),
                ('release_timestamp', 'release_date'),
+                ('modified_timestamp', 'modified_date'),
        ):
            if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
                # Working around out-of-range timestamp values (e.g. negative ones on Windows,
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@ -243,11 +243,16 @@ class InfoExtractor(object):
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
-    release_timestamp: UNIX timestamp of the moment the video was released.
-    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date (YYYYMMDD).
-                    If not explicitly set, calculated from timestamp.
+                    If not explicitly set, calculated from timestamp
+    release_timestamp: UNIX timestamp of the moment the video was released.
+                    If it is not clear whether to use timestamp or this, use the former
+    release_date:   The date (YYYYMMDD) when the video was released.
+                    If not explicitly set, calculated from release_timestamp
+    modified_timestamp: UNIX timestamp of the moment the video was last modified.
+    modified_date:   The date (YYYYMMDD) when the video was last modified.
+                    If not explicitly set, calculated from modified_timestamp
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
@ -383,6 +388,11 @@ class InfoExtractor(object):
    Additionally, playlists can have "id", "title", and any other relevent
    attributes with the same semantics as videos (see above).

+    It can also have the following optional fields:
+
+    playlist_count: The total number of videos in a playlist. If not given,
+                    YoutubeDL tries to calculate it from "entries"
+

    _type "multi_video" indicates that there are multiple videos that
    form a single show, for examples multiple acts of an opera or TV episode.
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@ -62,6 +62,7 @@
    try_get,
    unescapeHTML,
    unified_strdate,
+    unified_timestamp,
    unsmuggle_url,
    update_url_query,
    url_or_none,
@ -667,6 +668,14 @@ def _get_text(data, *path_list, max_runs=None):
                if text:
                    return text

+    def _get_count(self, data, *path_list):
+        count_text = self._get_text(data, *path_list) or ''
+        count = parse_count(count_text)
+        if count is None:
+            count = str_to_int(
+                self._search_regex(r'^([\d,]+)', re.sub(r'\s', '', count_text), 'count', default=None))
+        return count
+
    @staticmethod
    def _extract_thumbnails(data, *path_list):
        """
@ -695,12 +704,15 @@ def _extract_thumbnails(data, *path_list):
    def extract_relative_time(relative_time_text):
        """
        Extracts a relative time from string and converts to dt object
-        e.g. 'streamed 6 days ago', '5 seconds ago (edited)'
+        e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today'
        """
-        mobj = re.search(r'(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text)
+        mobj = re.search(r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text)
        if mobj:
+            start = mobj.group('start')
+            if start:
+                return datetime_from_str(start)
            try:
-                return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit')), precision='auto')
+                return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit')))
            except ValueError:
                return None

@ -710,6 +722,13 @@ def _extract_time_text(self, renderer, *path_list):
        timestamp = None
        if isinstance(dt, datetime.datetime):
            timestamp = calendar.timegm(dt.timetuple())
+
+        if timestamp is None:
+            timestamp = (
+                unified_timestamp(text) or unified_timestamp(
+                    self._search_regex(
+                        (r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*on)?\s*(.+\d)', r'\w+[\s,\.-]*\w+[\s,\.-]+20\d{2}'), text.lower(), 'time text', default=None)))
+
        if text and timestamp is None:
            self.report_warning('Cannot parse localized time text' + bug_reports_message(), only_once=True)
        return timestamp, text
@ -794,10 +813,7 @@ def _extract_video(self, renderer):
        description = self._get_text(renderer, 'descriptionSnippet')
        duration = parse_duration(self._get_text(
            renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
-        view_count_text = self._get_text(renderer, 'viewCountText') or ''
-        view_count = str_to_int(self._search_regex(
-            r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
-            'view count', default=None))
+        view_count = self._get_count(renderer, 'viewCountText')

        uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
        channel_id = traverse_obj(
@ -2317,8 +2333,8 @@ def extract_header(contents):
            _continuation = None
            for content in contents:
                comments_header_renderer = traverse_obj(content, 'commentsHeaderRenderer')
-                expected_comment_count = parse_count(self._get_text(
-                    comments_header_renderer, 'countText', 'commentsCount', max_runs=1))
+                expected_comment_count = self._get_count(
+                    comments_header_renderer, 'countText', 'commentsCount')

                if expected_comment_count:
                    tracker['est_total'] = expected_comment_count
@ -3603,6 +3619,7 @@ def _extract_from_tabs(self, item_id, ytcfg, data, tabs):
        tags = []

        selected_tab = self._extract_selected_tab(tabs)
+        primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer')
        renderer = try_get(
            data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
        if renderer:
@ -3622,17 +3639,18 @@ def _extract_from_tabs(self, item_id, ytcfg, data, tabs):
        thumbnails = (
            self._extract_thumbnails(renderer, 'avatar')
            or self._extract_thumbnails(
-                self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'),
-                ('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail')))
+                primary_sidebar_renderer, ('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail')))

        if playlist_id is None:
            playlist_id = item_id
+
+        playlist_stats = traverse_obj(primary_sidebar_renderer, 'stats')
+        last_updated_unix, _ = self._extract_time_text(playlist_stats, 2)
        if title is None:
-            title = (
-                try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
-                or playlist_id)
+            title = self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) or playlist_id
        title += format_field(selected_tab, 'title', ' - %s')
        title += format_field(selected_tab, 'expandedText', ' - %s')
+
        metadata = {
            'playlist_id': playlist_id,
            'playlist_title': title,
@ -3642,10 +3660,11 @@ def _extract_from_tabs(self, item_id, ytcfg, data, tabs):
            'uploader_url': channel_url,
            'thumbnails': thumbnails,
            'tags': tags,
+            'view_count': self._get_count(playlist_stats, 1),
+            'availability': self._extract_availability(data),
+            'modified_date': strftime_or_none(last_updated_unix, '%Y%m%d'),
+            'playlist_count': self._get_count(playlist_stats, 0)
        }
-        availability = self._extract_availability(data)
-        if availability:
-            metadata['availability'] = availability
        if not channel_id:
            metadata.update(self._extract_uploader(data))
        metadata.update({