From 469d4c89686afca46333d85442bb770e6010518c Mon Sep 17 00:00:00 2001
From: Will Sewell
Date: Mon, 17 Nov 2014 17:52:00 -0500
Subject: [PATCH] [vk] Added a new information extractor for pages that are a
 list of a user's videos on vk.com. It works in the same way as playlist-style
 pages for the YT information extractors.

---
Review notes (ignored by git am / git apply):
* The (?P<oid>...), (?P<id>...) and (?P<videoid>...) group names in
  VKIE._VALID_URL were missing from the copy under review and are
  reconstructed here; confirm them against vk.py before applying.
* The blob hashes on the "index" lines predate the review changes made to
  VKUserVideosIE (set-based de-duplication, docstrings, trailing newline).

 youtube_dl/extractor/__init__.py |  5 ++++-
 youtube_dl/extractor/vk.py       | 44 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index f45ce05ab..b687a56b4 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -452,7 +452,10 @@
     VineUserIE,
 )
 from .viki import VikiIE
-from .vk import VKIE
+from .vk import (
+    VKIE,
+    VKUserVideosIE,
+)
 from .vodlocker import VodlockerIE
 from .vporn import VpornIE
 from .vrt import VRTIE
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index 36cd7e52e..5223e5e2c 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -16,7 +16,7 @@
 
 class VKIE(InfoExtractor):
     IE_NAME = 'vk.com'
-    _VALID_URL = r'https?://(?:m\.)?vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:.+?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))'
+    _VALID_URL = r'https?://(?:m\.)?vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:.+?\?.*?z=)?video(?P<videoid>[^s].*?)(?:\?|%2F|$))'
     _NETRC_MACHINE = 'vk'
 
     _TESTS = [
@@ -185,3 +185,45 @@ def _real_extract(self, url):
             'uploader': data.get('md_author'),
             'duration': data.get('duration')
         }
+
+
+class VKUserVideosIE(InfoExtractor):
+    """Playlist extractor for vk.com "videos<id>" pages listing a user's videos."""
+    IE_NAME = 'vk.com:user-videos'
+    IE_DESC = 'All of a user\'s videos'
+    _VALID_URL = r'https?://(?:m\.)?vk\.com/videos([0-9]+)'
+    _TEMPLATE_URL = 'https://vk.com/videos'
+    _TEST = {
+        'url': 'http://vk.com/videos205387401',
+        'playlist_mincount': 4,
+    }
+
+    def extract_videos_from_page(self, page):
+        """Return the unique video ids linked from page, in first-seen order."""
+        ids_in_page = []
+        seen_ids = set()
+        for mobj in re.finditer(r'href="/video([0-9_]+)"', page):
+            video_id = mobj.group(1)
+            # Set lookup keeps de-duplication linear in the number of links.
+            if video_id not in seen_ids:
+                seen_ids.add(video_id)
+                ids_in_page.append(video_id)
+        return ids_in_page
+
+    def _real_extract(self, url):
+        """Return a playlist with one url_result entry per video linked from the page."""
+        # Extract page id
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError('Invalid URL: %s' % url)
+
+        # Download page and get video ids
+        page_id = mobj.group(1)
+        page = self._download_webpage(url, page_id)
+        video_ids = self.extract_videos_from_page(page)
+
+        self._downloader.to_screen('[vk] User videos %s: Found %i videos' % (page_id, len(video_ids)))
+
+        url_entries = [self.url_result('http://vk.com/video' + video_id, 'VK', video_id=video_id)
+                       for video_id in video_ids]
+        return self.playlist_result(url_entries, page_id)