From 984e4d487520bd2a860b31b3165416c879b28096 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 24 Jun 2015 01:13:23 +0100 Subject: [PATCH 01/10] [googledrive] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/googledrive.py | 106 ++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 youtube_dl/extractor/googledrive.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3cfa804ec..6655d7eb5 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -209,6 +209,7 @@ from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE +from .googledrive import GoogleDriveIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py new file mode 100644 index 000000000..8c611fa47 --- /dev/null +++ b/youtube_dl/extractor/googledrive.py @@ -0,0 +1,106 @@ +from .common import InfoExtractor +from ..utils import RegexNotFoundError + +class GoogleDriveIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/))(?P<id>.+?)(?:&|/|$)' + _TEST = { + 'url': 'https://drive.google.com/file/d/0BzpExh0WzJF0NlR5WUlxdEVsY0U/edit?pli=1', + 'info_dict': { + 'id': '0BzpExh0WzJF0NlR5WUlxdEVsY0U', + 'ext': 'mp4', + 'title': '[AHSH] Fairy Tail S2 - 01 [720p].mp4', + } + } + _formats = { + '5': {'ext': 'flv'}, + '6': {'ext': 'flv'}, + '13': {'ext': '3gp'}, + '17': {'ext': '3gp'}, + '18': {'ext': 'mp4'}, + '22': {'ext': 'mp4'}, + '34': {'ext': 'flv'}, + '35': {'ext': 'flv'}, + '36': {'ext': '3gp'}, + '37': {'ext': 'mp4'}, + '38': {'ext': 'mp4'}, + '43': {'ext': 'webm'}, + '44': {'ext': 'webm'}, + '45': {'ext': 'webm'}, + '46': {'ext': 'webm'}, + '59': {'ext': 'mp4'} + } + + def _real_extract(self, url): +
video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://docs.google.com/file/d/'+video_id, video_id, encoding='unicode_escape' + ) + try: + title = self._html_search_regex( + r'"title","(?P<title>.*?)"', + webpage, + 'title', + group='title' + ) + fmt_stream_map = self._html_search_regex( + r'"fmt_stream_map","(?P<fmt_stream_map>.*?)"', + webpage, + 'fmt_stream_map', + group='fmt_stream_map' + ) + fmt_list = self._html_search_regex( + r'"fmt_list","(?P<fmt_list>.*?)"', + webpage, + 'fmt_list', + group='fmt_list' + ) +# timestamp = self._html_search_regex( +# r'"timestamp","(?P<timestamp>.*?)"', +# webpage, +# 'timestamp', +# group='timestamp' +# ) + length_seconds = self._html_search_regex( + r'"length_seconds","(?P<length_seconds>.*?)"', + webpage, + 'length_seconds', + group='length_seconds' + ) + except RegexNotFoundError: + try: + reason = self._html_search_regex( + r'"reason","(?P<reason>.*?)"', + webpage, + 'reason', + group='reason' + ) + self.report_warning(reason) + return + except RegexNotFoundError: + self.report_warning('not a video') + return + + fmt_stream_map = fmt_stream_map.split(',') + fmt_list = fmt_list.split(',') + formats = [] + for i in range(len(fmt_stream_map)): + fmt_id, fmt_url = fmt_stream_map[i].split('|') + resolution = fmt_list[i].split('/')[1] + width, height = resolution.split('x') + formats.append({ + 'url': fmt_url, + 'format_id': fmt_id, + 'resolution': resolution, + 'width': int(width), + 'height': int(height), + 'ext': self._formats[fmt_id]['ext'] + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, +# 'timestamp': int(timestamp), + 'duration': int(length_seconds), + 'formats': formats + } From f120a7ab5e9c560a8114f9662e2f213243a945b0 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 24 Jun 2015 14:56:19 +0100 Subject: [PATCH 02/10] change the _TEST info --- youtube_dl/extractor/googledrive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff 
--git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 8c611fa47..e3d5c3418 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -4,11 +4,11 @@ class GoogleDriveIE(InfoExtractor): _VALID_URL = r'(?:https?://)?(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/))(?P<id>.+?)(?:&|/|$)' _TEST = { - 'url': 'https://drive.google.com/file/d/0BzpExh0WzJF0NlR5WUlxdEVsY0U/edit?pli=1', + 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'info_dict': { - 'id': '0BzpExh0WzJF0NlR5WUlxdEVsY0U', + 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'ext': 'mp4', - 'title': '[AHSH] Fairy Tail S2 - 01 [720p].mp4', + 'title': 'Big Buck Bunny.mp4', } } _formats = { From 3e5f3df1729846a33631dd38a887cd1d81a727c1 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 29 Jun 2015 07:53:21 +0100 Subject: [PATCH 03/10] move the embed to a separate class --- youtube_dl/extractor/googledrive.py | 31 ++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index e3d5c3418..ac891b275 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -1,8 +1,37 @@ +import re + from .common import InfoExtractor from ..utils import RegexNotFoundError +class GoogleDriveEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9-]{28})(?:/preview)' + _TEST = { + 'url': 'https://docs.google.com/file/d/0B8KB9DRosYGKMXNoeWxqa3JYclE/preview', + 'info_dict': { + 'id': '0B8KB9DRosYGKMXNoeWxqa3JYclE', + 'ext': 'mp4', + 'title': 'Jimmy Fallon Sings Since You\'ve Been Gone.wmv', + } + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe 
src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9-]{28})(?:/preview)', + webpage) + if mobj: + return 'https://drive.google.com/file/d/%s' % mobj.group('id') + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url', + 'ie-key': 'GoogleDrive', + 'url': 'https://drive.google.com/file/d/%s' % video_id + } + class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'(?:https?://)?(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/))(?P<id>.+?)(?:&|/|$)' + _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9-]{28})' _TEST = { 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'info_dict': { From 2d651a2d02885cddf1752b45497e9113d3a3d403 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 29 Jun 2015 07:55:44 +0100 Subject: [PATCH 04/10] import google drive embed class --- youtube_dl/extractor/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6655d7eb5..02e18a0da 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -209,7 +209,10 @@ from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE -from .googledrive import GoogleDriveIE +from .googledrive import ( + GoogleDriveEmbedIE, + GoogleDriveIE, +) from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE From 653789afc72d1a225b971541fb633dd768d58942 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 29 Jun 2015 08:01:30 +0100 Subject: [PATCH 05/10] add google drive embeds --- youtube_dl/extractor/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py 
index 6d2efb22e..3f7b094db 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -48,6 +48,7 @@ from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE from .snagfilms import SnagFilmsEmbedIE +from .googledrive import GoogleDriveEmbedIE class GenericIE(InfoExtractor): @@ -1599,6 +1600,11 @@ def _playlist_from_matches(matches, getter=None, ie=None): if nbc_sports_url: return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') + # Look for Google Drive embeds + google_drive_url = GoogleDriveEmbedIE._extract_url(webpage) + if google_drive_url: + return self.url_result(google_drive_url, 'GoogleDrive') + # Look for UDN embeds mobj = re.search( r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage) From 3b3d531965f0f36c20f5fa8557481c144170653f Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 17 Jul 2015 14:17:19 +0100 Subject: [PATCH 06/10] fix embed regex --- youtube_dl/extractor/googledrive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index ac891b275..c82c9037f 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -4,7 +4,7 @@ from ..utils import RegexNotFoundError class GoogleDriveEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9-]{28})(?:/preview)' + _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})' _TEST = { 'url': 'https://docs.google.com/file/d/0B8KB9DRosYGKMXNoeWxqa3JYclE/preview', 'info_dict': { @@ -17,7 +17,7 @@ class GoogleDriveEmbedIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe 
src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9-]{28})(?:/preview)', + r'<iframe src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})', webpage) if mobj: return 'https://drive.google.com/file/d/%s' % mobj.group('id') @@ -31,7 +31,7 @@ def _real_extract(self, url): } class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9-]{28})' + _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9_-]{28})' _TEST = { 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'info_dict': { From d1cc05e17eccccb7ee6473574c6a4f887104baeb Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 17 Jul 2015 14:37:21 +0100 Subject: [PATCH 07/10] remove unnecessary regex group names --- youtube_dl/extractor/googledrive.py | 32 ++++++++++++----------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index c82c9037f..6d9bcfefd 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -62,46 +62,40 @@ class GoogleDriveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://docs.google.com/file/d/'+video_id, video_id, encoding='unicode_escape' + 'http://docs.google.com/file/d/' + video_id, video_id, encoding='unicode_escape' ) try: title = self._html_search_regex( - r'"title","(?P<title>.*?)"', + r'"title"\s+,\s+"[^"]+', webpage, - 'title', - group='title' + 'title' ) fmt_stream_map = self._html_search_regex( - r'"fmt_stream_map","(?P<fmt_stream_map>.*?)"', + r'"fmt_stream_map"\s+,\s+"[^"]+', webpage, - 'fmt_stream_map', - group='fmt_stream_map' + 'fmt_stream_map' ) fmt_list = self._html_search_regex( - 
r'"fmt_list","(?P<fmt_list>.*?)"', + r'"fmt_list"\s+,\s+"[^"]+', webpage, - 'fmt_list', - group='fmt_list' + 'fmt_list' ) # timestamp = self._html_search_regex( -# r'"timestamp","(?P<timestamp>.*?)"', +# r'"timestamp"\s+,\s+"[^"]+', # webpage, -# 'timestamp', -# group='timestamp' +# 'timestamp' # ) length_seconds = self._html_search_regex( - r'"length_seconds","(?P<length_seconds>.*?)"', + r'"length_seconds"\s+,\s+"[^"]+', webpage, - 'length_seconds', - group='length_seconds' + 'length_seconds' ) except RegexNotFoundError: try: reason = self._html_search_regex( - r'"reason","(?P<reason>.*?)"', + r'"reason","[^"]+', webpage, - 'reason', - group='reason' + 'reason' ) self.report_warning(reason) return From 36dbca87848fc5698d3e0b89380c7bcec741ceaf Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 17 Jul 2015 14:52:01 +0100 Subject: [PATCH 08/10] fix recursive error --- youtube_dl/extractor/googledrive.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 6d9bcfefd..a3d9b4450 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -26,7 +26,7 @@ def _real_extract(self, url): video_id = self._match_id(url) return { '_type': 'url', - 'ie-key': 'GoogleDrive', + 'ie_key': 'GoogleDrive', 'url': 'https://drive.google.com/file/d/%s' % video_id } @@ -66,34 +66,34 @@ def _real_extract(self, url): ) try: title = self._html_search_regex( - r'"title"\s+,\s+"[^"]+', + r'"title"\s*,\s*"([^"]+)', webpage, 'title' ) fmt_stream_map = self._html_search_regex( - r'"fmt_stream_map"\s+,\s+"[^"]+', + r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt_stream_map' ) fmt_list = self._html_search_regex( - r'"fmt_list"\s+,\s+"[^"]+', + r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list' ) # timestamp = self._html_search_regex( -# r'"timestamp"\s+,\s+"[^"]+', +# r'"timestamp"\s*,\s*"([^"]+)', # webpage, # 'timestamp' # ) 
length_seconds = self._html_search_regex( - r'"length_seconds"\s+,\s+"[^"]+', + r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length_seconds' ) except RegexNotFoundError: try: reason = self._html_search_regex( - r'"reason","[^"]+', + r'"reason","([^"]+)', webpage, 'reason' ) From 8e92d21ebf6f17e14c9e916f22e49f27529556af Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 18 Jul 2015 23:31:14 +0100 Subject: [PATCH 09/10] [googledrive] raise ExtractorError instead of warning --- youtube_dl/extractor/googledrive.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index a3d9b4450..7bc7b7a0d 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -1,7 +1,10 @@ import re from .common import InfoExtractor -from ..utils import RegexNotFoundError +from ..utils import ( + RegexNotFoundError, + ExtractorError, +) class GoogleDriveEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})' @@ -97,10 +100,10 @@ def _real_extract(self, url): webpage, 'reason' ) - self.report_warning(reason) + raise ExtractorError(reason) return except RegexNotFoundError: - self.report_warning('not a video') + raise ExtractorError('not a video') return fmt_stream_map = fmt_stream_map.split(',') From 5b251628e9f45c89c1becb3f62c4212874eb74ea Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 21 Dec 2015 03:05:34 +0100 Subject: [PATCH 10/10] [googledrive] Modernize --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/generic.py | 4 +- youtube_dl/extractor/googledrive.py | 146 ++++++++++------------------ 3 files changed, 54 insertions(+), 101 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 02e18a0da..6655d7eb5 100644 --- a/youtube_dl/extractor/__init__.py +++ 
b/youtube_dl/extractor/__init__.py @@ -209,10 +209,7 @@ from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE -from .googledrive import ( - GoogleDriveEmbedIE, - GoogleDriveIE, -) +from .googledrive import GoogleDriveIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3f7b094db..abd98e500 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -48,7 +48,7 @@ from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE from .snagfilms import SnagFilmsEmbedIE -from .googledrive import GoogleDriveEmbedIE +from .googledrive import GoogleDriveIE class GenericIE(InfoExtractor): @@ -1601,7 +1601,7 @@ def _playlist_from_matches(matches, getter=None, ie=None): return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') # Look for Google Drive embeds - google_drive_url = GoogleDriveEmbedIE._extract_url(webpage) + google_drive_url = GoogleDriveIE._extract_url(webpage) if google_drive_url: return self.url_result(google_drive_url, 'GoogleDrive') diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 7bc7b7a0d..f354c9c7a 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -1,132 +1,88 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor from ..utils import ( - RegexNotFoundError, ExtractorError, + int_or_none, ) -class GoogleDriveEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})' + +class GoogleDriveIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28})' _TEST = { - 'url': 
'https://docs.google.com/file/d/0B8KB9DRosYGKMXNoeWxqa3JYclE/preview', + 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', + 'md5': '881f7700aec4f538571fa1e0eed4a7b6', 'info_dict': { - 'id': '0B8KB9DRosYGKMXNoeWxqa3JYclE', + 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'ext': 'mp4', - 'title': 'Jimmy Fallon Sings Since You\'ve Been Gone.wmv', + 'title': 'Big Buck Bunny.mp4', + 'duration': 46, } } + _FORMATS_EXT = { + '5': 'flv', + '6': 'flv', + '13': '3gp', + '17': '3gp', + '18': 'mp4', + '22': 'mp4', + '34': 'flv', + '35': 'flv', + '36': '3gp', + '37': 'mp4', + '38': 'mp4', + '43': 'webm', + '44': 'webm', + '45': 'webm', + '46': 'webm', + '59': 'mp4', + } @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})', + r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})', webpage) if mobj: return 'https://drive.google.com/file/d/%s' % mobj.group('id') - def _real_extract(self, url): - video_id = self._match_id(url) - return { - '_type': 'url', - 'ie_key': 'GoogleDrive', - 'url': 'https://drive.google.com/file/d/%s' % video_id - } - -class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9_-]{28})' - _TEST = { - 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', - 'info_dict': { - 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', - 'ext': 'mp4', - 'title': 'Big Buck Bunny.mp4', - } - } - _formats = { - '5': {'ext': 'flv'}, - '6': {'ext': 'flv'}, - '13': {'ext': '3gp'}, - '17': {'ext': '3gp'}, - '18': {'ext': 'mp4'}, - '22': {'ext': 'mp4'}, - '34': {'ext': 'flv'}, - '35': {'ext': 'flv'}, - '36': {'ext': '3gp'}, - '37': {'ext': 'mp4'}, - '38': {'ext': 'mp4'}, - '43': {'ext': 'webm'}, - '44': {'ext': 'webm'}, - '45': {'ext': 'webm'}, - '46': 
{'ext': 'webm'}, - '59': {'ext': 'mp4'} - } - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://docs.google.com/file/d/' + video_id, video_id, encoding='unicode_escape' - ) - try: - title = self._html_search_regex( - r'"title"\s*,\s*"([^"]+)', - webpage, - 'title' - ) - fmt_stream_map = self._html_search_regex( - r'"fmt_stream_map"\s*,\s*"([^"]+)', - webpage, - 'fmt_stream_map' - ) - fmt_list = self._html_search_regex( - r'"fmt_list"\s*,\s*"([^"]+)', - webpage, - 'fmt_list' - ) -# timestamp = self._html_search_regex( -# r'"timestamp"\s*,\s*"([^"]+)', -# webpage, -# 'timestamp' -# ) - length_seconds = self._html_search_regex( - r'"length_seconds"\s*,\s*"([^"]+)', - webpage, - 'length_seconds' - ) - except RegexNotFoundError: - try: - reason = self._html_search_regex( - r'"reason","([^"]+)', - webpage, - 'reason' - ) - raise ExtractorError(reason) - return - except RegexNotFoundError: - raise ExtractorError('not a video') - return + 'http://docs.google.com/file/d/%s' % video_id, video_id, encoding='unicode_escape') + + reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) + if reason: + raise ExtractorError(reason) + + title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title') + duration = int_or_none(self._search_regex( + r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None)) + fmt_stream_map = self._search_regex( + r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt stream map').split(',') + fmt_list = self._search_regex(r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',') - fmt_stream_map = fmt_stream_map.split(',') - fmt_list = fmt_list.split(',') formats = [] - for i in range(len(fmt_stream_map)): - fmt_id, fmt_url = fmt_stream_map[i].split('|') - resolution = fmt_list[i].split('/')[1] + for fmt, fmt_stream in zip(fmt_list, fmt_stream_map): + fmt_id, fmt_url = fmt_stream.split('|') + resolution = fmt.split('/')[1] width, height = 
resolution.split('x') formats.append({ 'url': fmt_url, 'format_id': fmt_id, 'resolution': resolution, - 'width': int(width), - 'height': int(height), - 'ext': self._formats[fmt_id]['ext'] + 'width': int_or_none(width), + 'height': int_or_none(height), + 'ext': self._FORMATS_EXT[fmt_id], }) self._sort_formats(formats) return { 'id': video_id, 'title': title, -# 'timestamp': int(timestamp), - 'duration': int(length_seconds), - 'formats': formats + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': duration, + 'formats': formats, }