Add infrastructure for paged lists

This commit allows downloading playlist pages as needed instead of all at once.
Before this commit,
    youtube-dl http://www.youtube.com/user/ANNnewsCH/videos --playlist-end 2 --skip-download
took quite some time - now it's almost instantaneous.
As an example, the youtube:user extractor has been converted.
Fixes #2175
Philipp Hagemeister 2014-01-20 11:36:47 +01:00
parent c91778f8c0
commit b7ab059084
4 changed files with 92 additions and 25 deletions
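
The mechanism in a nutshell: an extractor hands the downloader a PagedList (a page-fetching callback plus a page size) instead of a fully materialized list, and the downloader slices it, fetching only the pages the slice touches. A minimal sketch of that usage (the three-ids-per-page scheme is made up for illustration):

    from youtube_dl.utils import PagedList

    def get_page(pagenum):
        # Pretend each backend API page holds exactly 3 ids.
        for i in range(pagenum * 3, pagenum * 3 + 3):
            yield 'id-%d' % i

    videos = PagedList(get_page, 3)
    # Only pages 0 and 1 are ever fetched, however long the virtual list is.
    print(videos.getslice(0, 5))  # ['id-0', 'id-1', 'id-2', 'id-3', 'id-4']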

test/test_utils.py

@@ -18,6 +18,7 @@
     find_xpath_attr,
     get_meta_content,
     orderedSet,
+    PagedList,
     parse_duration,
     sanitize_filename,
     shell_quote,
@@ -200,5 +201,26 @@ def test_parse_duration(self):
         self.assertEqual(parse_duration('9:12:43'), 33163)
         self.assertEqual(parse_duration('x:y'), None)
 
+    def test_paged_list(self):
+        def testPL(size, pagesize, sliceargs, expected):
+            def get_page(pagenum):
+                firstid = pagenum * pagesize
+                upto = min(size, pagenum * pagesize + pagesize)
+                for i in range(firstid, upto):
+                    yield i
+
+            pl = PagedList(get_page, pagesize)
+            got = pl.getslice(*sliceargs)
+            self.assertEqual(got, expected)
+
+        testPL(5, 2, (), [0, 1, 2, 3, 4])
+        testPL(5, 2, (1,), [1, 2, 3, 4])
+        testPL(5, 2, (2,), [2, 3, 4])
+        testPL(5, 2, (4,), [4])
+        testPL(5, 2, (0, 3), [0, 1, 2])
+        testPL(5, 2, (1, 4), [1, 2, 3])
+        testPL(5, 2, (2, 99), [2, 3, 4])
+        testPL(5, 2, (20, 99), [])
+
 
 if __name__ == '__main__':
     unittest.main()
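
For readers of the test above (an equivalence note, not part of the commit): getslice(start, end) mirrors Python slicing over the virtual full list, with out-of-range ends clamped.

    full = list(range(5))           # the virtual list in testPL(5, 2, ...)
    assert full[1:4] == [1, 2, 3]   # matches testPL(5, 2, (1, 4), [1, 2, 3])
    assert full[2:99] == [2, 3, 4]  # matches testPL(5, 2, (2, 99), [2, 3, 4])
    assert full[20:99] == []        # matches testPL(5, 2, (20, 99), [])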

youtube_dl/YoutubeDL.py

@@ -39,6 +39,7 @@
     locked_file,
     make_HTTPS_handler,
     MaxDownloadsReached,
+    PagedList,
     PostProcessingError,
     platform_name,
     preferredencoding,
@@ -575,19 +576,27 @@ def make_result(embedded_info):
             playlist_results = []
 
-            n_all_entries = len(ie_result['entries'])
             playliststart = self.params.get('playliststart', 1) - 1
             playlistend = self.params.get('playlistend', None)
             # For backwards compatibility, interpret -1 as whole list
             if playlistend == -1:
                 playlistend = None
 
-            entries = ie_result['entries'][playliststart:playlistend]
-            n_entries = len(entries)
-
-            self.to_screen(
-                "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
-                (ie_result['extractor'], playlist, n_all_entries, n_entries))
+            if isinstance(ie_result['entries'], list):
+                n_all_entries = len(ie_result['entries'])
+                entries = ie_result['entries'][playliststart:playlistend]
+                n_entries = len(entries)
+                self.to_screen(
+                    "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
+                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
+            else:
+                assert isinstance(ie_result['entries'], PagedList)
+                entries = ie_result['entries'].getslice(
+                    playliststart, playlistend)
+                n_entries = len(entries)
+                self.to_screen(
+                    "[%s] playlist %s: Downloading %d videos" %
+                    (ie_result['extractor'], playlist, n_entries))
 
             for i, entry in enumerate(entries, 1):
                 self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
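
The lazy branch prints "Downloading %d videos" rather than "Collected %d video ids" because the total playlist length is unknown without fetching every page. A condensed sketch of the dispatch above (entries_field is a hypothetical local standing in for ie_result['entries']; the surrounding method is elided):

    if isinstance(entries_field, list):
        # Eager path: every id is already in memory, so slicing is free.
        entries = entries_field[playliststart:playlistend]
    else:
        # Lazy path: only the pages covered by the slice are fetched.
        assert isinstance(entries_field, PagedList)
        entries = entries_field.getslice(playliststart, playlistend)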

youtube_dl/extractor/youtube.py

@@ -27,6 +27,7 @@
     get_element_by_id,
     get_element_by_attribute,
     ExtractorError,
+    PagedList,
     RegexNotFoundError,
     unescapeHTML,
     unified_strdate,
@@ -1580,44 +1581,35 @@ def _real_extract(self, url):
         # page by page until there are no video ids - it means we got
         # all of them.
 
-        url_results = []
-
-        for pagenum in itertools.count(0):
+        def download_page(pagenum):
             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
 
             gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
-            page = self._download_webpage(gdata_url, username,
-                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
+            page = self._download_webpage(
+                gdata_url, username,
+                u'Downloading video ids from %d to %d' % (
+                    start_index, start_index + self._GDATA_PAGE_SIZE))
 
             try:
                 response = json.loads(page)
             except ValueError as err:
                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
             if 'entry' not in response['feed']:
-                # Number of videos is a multiple of self._MAX_RESULTS
-                break
+                return
 
             # Extract video identifiers
             entries = response['feed']['entry']
             for entry in entries:
                 title = entry['title']['$t']
                 video_id = entry['id']['$t'].split('/')[-1]
-                url_results.append({
+                yield {
                     '_type': 'url',
                     'url': video_id,
                     'ie_key': 'Youtube',
                     'id': 'video_id',
                     'title': title,
-                })
-
-            # A little optimization - if current page is not
-            # "full", ie. does not contain PAGE_SIZE video ids then
-            # we can assume that this page is the last one - there
-            # are no more ids on further pages - no need to query
-            # again.
-            if len(entries) < self._GDATA_PAGE_SIZE:
-                break
+                }
+
+        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
 
         return self.playlist_result(url_results, playlist_title=username)
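
Extractor-side, the contract is small: the page function takes a zero-based page number and yields that page's entries; a short (or empty) page marks the end of the feed. A self-contained sketch with a fabricated two-page feed (FAKE_FEED and its contents are invented for illustration):

    from youtube_dl.utils import PagedList

    FAKE_FEED = [['a1', 'b2'], ['c3']]   # two hypothetical API pages, page size 2

    def download_page(pagenum):
        for video_id in FAKE_FEED[pagenum]:
            yield {'_type': 'url', 'url': video_id, 'ie_key': 'Youtube'}

    results = PagedList(download_page, 2)
    assert len(results.getslice()) == 3  # page 1 is short, so no third request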

youtube_dl/utils.py

@@ -6,6 +6,7 @@
 import email.utils
 import errno
 import gzip
+import itertools
 import io
 import json
 import locale
@@ -1161,3 +1162,46 @@ def check_executable(exe, args=[]):
     except OSError:
         return False
     return exe
+
+
+class PagedList(object):
+    def __init__(self, pagefunc, pagesize):
+        self._pagefunc = pagefunc
+        self._pagesize = pagesize
+
+    def getslice(self, start=0, end=None):
+        res = []
+        for pagenum in itertools.count(start // self._pagesize):
+            firstid = pagenum * self._pagesize
+            nextfirstid = pagenum * self._pagesize + self._pagesize
+            if start >= nextfirstid:
+                continue
+
+            page_results = list(self._pagefunc(pagenum))
+
+            startv = (
+                start % self._pagesize
+                if firstid <= start < nextfirstid
+                else 0)
+
+            endv = (
+                ((end - 1) % self._pagesize) + 1
+                if (end is not None and firstid <= end <= nextfirstid)
+                else None)
+
+            if startv != 0 or endv is not None:
+                page_results = page_results[startv:endv]
+            res.extend(page_results)
+
+            # A little optimization - if current page is not "full", ie. does
+            # not contain page_size videos then we can assume that this page
+            # is the last one - there are no more ids on further pages -
+            # i.e. no need to query again.
+            if len(page_results) + startv < self._pagesize:
+                break
+
+            # If we got the whole page, but the next page is not interesting,
+            # break out early as well
+            if end == nextfirstid:
+                break
+        return res
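
A worked trace of the index arithmetic above (annotation only, assuming pagesize 2 as in the tests): getslice(1, 4) starts at page 1 // 2 = 0 and stops without ever requesting page 2.

    # page 0: firstid=0, nextfirstid=2 -> startv=1, endv=None -> keep [1]
    # page 1: firstid=2, nextfirstid=4 -> startv=0, endv=((4-1) % 2)+1 = 2 -> keep [2, 3]
    # end == nextfirstid (4) -> early break; page 2 is never fetched
    # result: [1, 2, 3], matching testPL(5, 2, (1, 4), [1, 2, 3])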