Add infrastructure for paged lists

This commit allows downloading playlist pages as needed instead of all at once.
Before this commit,
    youtube-dl http://www.youtube.com/user/ANNnewsCH/videos --playlist-end 2 --skip-download
took quite some time - now it's almost instantaneous.
As an example, the youtube:user extractor has been converted.
Fixes #2175
Philipp Hagemeister 2014-01-20 11:36:47 +01:00
parent c91778f8c0
commit b7ab059084
4 changed files with 92 additions and 25 deletions
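
The mechanism in a nutshell: an extractor hands the downloader a PagedList (a page-fetching callback plus a page size) instead of a fully materialized list, and the downloader slices it, fetching only the pages the slice touches. A minimal sketch of that usage (the three-ids-per-page scheme is made up for illustration):

    from youtube_dl.utils import PagedList

    def get_page(pagenum):
        # Pretend each backend API page holds exactly 3 ids.
        for i in range(pagenum * 3, pagenum * 3 + 3):
            yield 'id-%d' % i

    videos = PagedList(get_page, 3)
    # Only pages 0 and 1 are ever fetched, however long the virtual list is.
    print(videos.getslice(0, 5))  # ['id-0', 'id-1', 'id-2', 'id-3', 'id-4']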

test/test_utils.py

@@ -18,6 +18,7 @@
     find_xpath_attr,
     get_meta_content,
     orderedSet,
+    PagedList,
     parse_duration,
     sanitize_filename,
     shell_quote,
@@ -200,5 +201,26 @@ def test_parse_duration(self):
         self.assertEqual(parse_duration('9:12:43'), 33163)
         self.assertEqual(parse_duration('x:y'), None)
 
+    def test_paged_list(self):
+        def testPL(size, pagesize, sliceargs, expected):
+            def get_page(pagenum):
+                firstid = pagenum * pagesize
+                upto = min(size, pagenum * pagesize + pagesize)
+                for i in range(firstid, upto):
+                    yield i
+
+            pl = PagedList(get_page, pagesize)
+            got = pl.getslice(*sliceargs)
+            self.assertEqual(got, expected)
+
+        testPL(5, 2, (), [0, 1, 2, 3, 4])
+        testPL(5, 2, (1,), [1, 2, 3, 4])
+        testPL(5, 2, (2,), [2, 3, 4])
+        testPL(5, 2, (4,), [4])
+        testPL(5, 2, (0, 3), [0, 1, 2])
+        testPL(5, 2, (1, 4), [1, 2, 3])
+        testPL(5, 2, (2, 99), [2, 3, 4])
+        testPL(5, 2, (20, 99), [])
+
 
 if __name__ == '__main__':
     unittest.main()
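
For readers of the test above (an equivalence note, not part of the commit): getslice(start, end) mirrors Python slicing over the virtual full list, with out-of-range ends clamped.

    full = list(range(5))           # the virtual list in testPL(5, 2, ...)
    assert full[1:4] == [1, 2, 3]   # matches testPL(5, 2, (1, 4), [1, 2, 3])
    assert full[2:99] == [2, 3, 4]  # matches testPL(5, 2, (2, 99), [2, 3, 4])
    assert full[20:99] == []        # matches testPL(5, 2, (20, 99), [])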

youtube_dl/YoutubeDL.py

@@ -39,6 +39,7 @@
     locked_file,
     make_HTTPS_handler,
     MaxDownloadsReached,
+    PagedList,
     PostProcessingError,
     platform_name,
     preferredencoding,
@@ -575,19 +576,27 @@ def make_result(embedded_info):
             playlist_results = []
 
-            n_all_entries = len(ie_result['entries'])
             playliststart = self.params.get('playliststart', 1) - 1
             playlistend = self.params.get('playlistend', None)
             # For backwards compatibility, interpret -1 as whole list
             if playlistend == -1:
                 playlistend = None
 
-            entries = ie_result['entries'][playliststart:playlistend]
-            n_entries = len(entries)
-
-            self.to_screen(
-                "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
-                (ie_result['extractor'], playlist, n_all_entries, n_entries))
+            if isinstance(ie_result['entries'], list):
+                n_all_entries = len(ie_result['entries'])
+                entries = ie_result['entries'][playliststart:playlistend]
+                n_entries = len(entries)
+                self.to_screen(
+                    "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
+                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
+            else:
+                assert isinstance(ie_result['entries'], PagedList)
+                entries = ie_result['entries'].getslice(
+                    playliststart, playlistend)
+                n_entries = len(entries)
+                self.to_screen(
+                    "[%s] playlist %s: Downloading %d videos" %
+                    (ie_result['extractor'], playlist, n_entries))
 
             for i, entry in enumerate(entries, 1):
                 self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
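
The lazy branch prints "Downloading %d videos" rather than "Collected %d video ids" because the total playlist length is unknown without fetching every page. A condensed sketch of the dispatch above (entries_field is a hypothetical local standing in for ie_result['entries']; the surrounding method is elided):

    if isinstance(entries_field, list):
        # Eager path: every id is already in memory, so slicing is free.
        entries = entries_field[playliststart:playlistend]
    else:
        # Lazy path: only the pages covered by the slice are fetched.
        assert isinstance(entries_field, PagedList)
        entries = entries_field.getslice(playliststart, playlistend)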

youtube_dl/extractor/youtube.py

@@ -27,6 +27,7 @@
     get_element_by_id,
     get_element_by_attribute,
     ExtractorError,
+    PagedList,
     RegexNotFoundError,
     unescapeHTML,
     unified_strdate,
@@ -1580,44 +1581,35 @@ def _real_extract(self, url):
         # page by page until there are no video ids - it means we got
         # all of them.
 
-        url_results = []
-
-        for pagenum in itertools.count(0):
+        def download_page(pagenum):
             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
 
             gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
-            page = self._download_webpage(gdata_url, username,
-                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
+            page = self._download_webpage(
+                gdata_url, username,
+                u'Downloading video ids from %d to %d' % (
+                    start_index, start_index + self._GDATA_PAGE_SIZE))
 
             try:
                 response = json.loads(page)
             except ValueError as err:
                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
             if 'entry' not in response['feed']:
-                # Number of videos is a multiple of self._MAX_RESULTS
-                break
+                return
 
             # Extract video identifiers
             entries = response['feed']['entry']
             for entry in entries:
                 title = entry['title']['$t']
                 video_id = entry['id']['$t'].split('/')[-1]
-                url_results.append({
+                yield {
                     '_type': 'url',
                     'url': video_id,
                     'ie_key': 'Youtube',
                     'id': 'video_id',
                     'title': title,
-                })
-
-            # A little optimization - if current page is not
-            # "full", ie. does not contain PAGE_SIZE video ids then
-            # we can assume that this page is the last one - there
-            # are no more ids on further pages - no need to query
-            # again.
-            if len(entries) < self._GDATA_PAGE_SIZE:
-                break
+                }
+
+        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
 
         return self.playlist_result(url_results, playlist_title=username)
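
Extractor-side, the contract is small: the page function takes a zero-based page number and yields that page's entries; a short (or empty) page marks the end of the feed. A self-contained sketch with a fabricated two-page feed (FAKE_FEED and its contents are invented for illustration):

    from youtube_dl.utils import PagedList

    FAKE_FEED = [['a1', 'b2'], ['c3']]   # two hypothetical API pages, page size 2

    def download_page(pagenum):
        for video_id in FAKE_FEED[pagenum]:
            yield {'_type': 'url', 'url': video_id, 'ie_key': 'Youtube'}

    results = PagedList(download_page, 2)
    assert len(results.getslice()) == 3  # page 1 is short, so no third request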

youtube_dl/utils.py

@@ -6,6 +6,7 @@
 import email.utils
 import errno
 import gzip
+import itertools
 import io
 import json
 import locale
@@ -1161,3 +1162,46 @@ def check_executable(exe, args=[]):
     except OSError:
         return False
     return exe
+
+
+class PagedList(object):
+    def __init__(self, pagefunc, pagesize):
+        self._pagefunc = pagefunc
+        self._pagesize = pagesize
+
+    def getslice(self, start=0, end=None):
+        res = []
+        for pagenum in itertools.count(start // self._pagesize):
+            firstid = pagenum * self._pagesize
+            nextfirstid = pagenum * self._pagesize + self._pagesize
+            if start >= nextfirstid:
+                continue
+
+            page_results = list(self._pagefunc(pagenum))
+
+            startv = (
+                start % self._pagesize
+                if firstid <= start < nextfirstid
+                else 0)
+
+            endv = (
+                ((end - 1) % self._pagesize) + 1
+                if (end is not None and firstid <= end <= nextfirstid)
+                else None)
+
+            if startv != 0 or endv is not None:
+                page_results = page_results[startv:endv]
+            res.extend(page_results)
+
+            # A little optimization - if current page is not "full", ie. does
+            # not contain page_size videos then we can assume that this page
+            # is the last one - there are no more ids on further pages -
+            # i.e. no need to query again.
+            if len(page_results) + startv < self._pagesize:
+                break
+
+            # If we got the whole page, but the next page is not interesting,
+            # break out early as well
+            if end == nextfirstid:
+                break
+        return res
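
A worked trace of the index arithmetic above (annotation only, assuming pagesize 2 as in the tests): getslice(1, 4) starts at page 1 // 2 = 0 and stops without ever requesting page 2.

    # page 0: firstid=0, nextfirstid=2 -> startv=1, endv=None -> keep [1]
    # page 1: firstid=2, nextfirstid=4 -> startv=0, endv=((4-1) % 2)+1 = 2 -> keep [2, 3]
    # end == nextfirstid (4) -> early break; page 2 is never fetched
    # result: [1, 2, 3], matching testPL(5, 2, (1, 4), [1, 2, 3])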