From 6a5af6acb9131d702b0d206242053b202440dbb9 Mon Sep 17 00:00:00 2001
From: Mats
Date: Thu, 25 Sep 2014 16:25:53 +0200
Subject: [PATCH] [golem] Add new extractor

---
 youtube_dl/extractor/__init__.py |   1 +
 youtube_dl/extractor/golem.py    | 127 +++++++++++++++++++++++++++++++
 2 files changed, 128 insertions(+)
 create mode 100644 youtube_dl/extractor/golem.py

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 1f1fc0eb2..71fe38ca0 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -135,6 +135,7 @@
 from .gdcvault import GDCVaultIE
 from .generic import GenericIE
 from .godtube import GodTubeIE
+from .golem import GolemIE
 from .googleplus import GooglePlusIE
 from .googlesearch import GoogleSearchIE
 from .gorillavid import GorillaVidIE
diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py
new file mode 100644
index 000000000..afb620b1c
--- /dev/null
+++ b/youtube_dl/extractor/golem.py
@@ -0,0 +1,127 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import compat_urlparse
+
+
+class GolemIE(InfoExtractor):
+    """Extractor for videos hosted on video.golem.de."""
+
+    _VALID_URL = r'^https?://video\.golem\.de/.+?/(?P<id>.+?)/'
+    _TEST = {
+        'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html',
+        'md5': 'c1a2c0a3c863319651c7c992c5ee29bf',
+        'info_dict': {
+            'id': '14095',
+            'format_id': 'high',
+            'ext': 'mp4',
+            'title': 'iPhone 6 und 6 Plus - Test',
+            'duration': 300,
+            'filesize': 65309548,
+        }
+    }
+
+    # Explicit field indices: auto-numbered '{}' fields need Python 2.7+.
+    _CONFIG = 'https://video.golem.de/xml/{0}.xml'
+    _PREFIX = 'http://video.golem.de'
+
+    def _warn(self, fmt, *args):
+        """Report fmt.format(*args) as a warning for the current video."""
+        self.report_warning(fmt.format(*args), self._id)
+
+    def _set_int(self, target, key, value, label):
+        """Store int(value) in target[key], warning if it is not an integer.
+
+        A value of None (missing element or attribute) is skipped silently.
+        """
+        if value is None:
+            return
+        try:
+            target[key] = int(value)
+        except ValueError as e:
+            self._warn('{0}: {1}', label, e)
+
+    def _extract_format(self, elem):
+        """Build a format dict from a config element that has a url child.
+
+        Returns None (after a warning) when the URL is empty.
+        """
+        format_id = elem.tag
+
+        url = elem.findtext('./url')
+        if not url:
+            self._warn('{0}: url: empty, skipping', format_id)
+            return None
+
+        fmt = {
+            'format_id': format_id,
+            'url': compat_urlparse.urljoin(self._PREFIX, url),
+        }
+
+        try:
+            _, ext = elem.findtext('./filename', '').rsplit('.', 1)
+        except ValueError:
+            self._warn('{0}: ext: missing extension', format_id)
+        else:
+            fmt['ext'] = ext
+
+        self._set_int(
+            fmt, 'filesize', elem.findtext('./filesize'),
+            format_id + ': filesize')
+        self._set_int(fmt, 'width', elem.get('width'), format_id + ': width')
+        self._set_int(fmt, 'height', elem.get('height'), format_id + ': height')
+
+        return fmt
+
+    def _extract_thumbnail(self, elem):
+        """Build a thumbnail dict from a teaser element, or None if no URL."""
+        url = elem.findtext('./url')
+        if not url:
+            return None
+
+        thumb = {
+            'url': compat_urlparse.urljoin(self._PREFIX, url),
+        }
+        self._set_int(thumb, 'width', elem.get('width'), 'thumbnail: width')
+        self._set_int(thumb, 'height', elem.get('height'), 'thumbnail: height')
+        return thumb
+
+    def _real_extract(self, url):
+        """Fetch the video's XML config and build the info dict from it."""
+        mobj = re.match(self._VALID_URL, url)
+        self._id = mobj.group('id')
+
+        config = self._download_xml(self._CONFIG.format(self._id), self._id)
+
+        info = {
+            'id': self._id,
+            'title': config.findtext('./title', 'golem'),
+        }
+
+        formats = []
+        for e in config.findall('./*[url]'):
+            fmt = self._extract_format(e)
+            if fmt is not None:
+                formats.append(fmt)
+        self._sort_formats(formats)
+        info['formats'] = formats
+
+        thumbnails = []
+        for e in config.findall('.//teaser[url]'):
+            thumb = self._extract_thumbnail(e)
+            if thumb is not None:
+                thumbnails.append(thumb)
+        info['thumbnails'] = thumbnails
+
+        playtime = config.findtext('./playtime')
+        if playtime is not None:
+            try:
+                # round() returns a float on Python 2; keep duration integral.
+                info['duration'] = int(round(float(playtime)))
+            except (ValueError, OverflowError) as e:
+                self._warn('duration: {0}', e)
+
+        return info