From 1a117a77287e7dbd4d92f29062dabcf4efb86cb5 Mon Sep 17 00:00:00 2001
From: remitamine
Date: Fri, 24 Jul 2015 12:00:20 +0100
Subject: [PATCH 1/2] [clipfish] extract mp4 video link

---
 youtube_dl/extractor/clipfish.py | 37 ++++++++++++++++----------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
index a5c3cb7c6..09dfaac60 100644
--- a/youtube_dl/extractor/clipfish.py
+++ b/youtube_dl/extractor/clipfish.py
@@ -1,13 +1,11 @@
 from __future__ import unicode_literals
 
-import re
-import time
-import xml.etree.ElementTree
-
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
-    parse_duration,
+    int_or_none,
+    js_to_json,
+    determine_ext,
 )
 
 
@@ -17,37 +15,40 @@ class ClipfishIE(InfoExtractor):
     _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
     _TEST = {
         'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
-        'md5': '2521cd644e862936cf2e698206e47385',
+        'md5': '79bc922f3e8a9097b3d68a93780fd475',
         'info_dict': {
             'id': '3966754',
             'ext': 'mp4',
             'title': 'FIFA 14 - E3 2013 Trailer',
             'duration': 82,
-        },
-        'skip': 'Blocked in the US'
+        }
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group(1)
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        video_info = self._parse_json(
+            js_to_json(self._html_search_regex('var videoObject = ({[^}]+?})', webpage, 'videoObject')),
+            video_id
+        )
+        info_url = self._parse_json(
+            js_to_json(self._html_search_regex('var globalFlashvars = ({[^}]+?})', webpage, 'globalFlashvars')),
+            video_id
+        )['data']
 
-        info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' %
-                    (video_id, int(time.time())))
         doc = self._download_xml(
             info_url, video_id, note='Downloading info page')
         title = doc.find('title').text
         video_url = doc.find('filename').text
-        if video_url is None:
-            xml_bytes = xml.etree.ElementTree.tostring(doc)
-            raise ExtractorError('Cannot find video URL in document %r' %
-                                 xml_bytes)
         thumbnail = doc.find('imageurl').text
-        duration = parse_duration(doc.find('duration').text)
+        duration = int_or_none(video_info['length'])
+        formats = [{'url': video_info['videourl']},{'url': video_url}]
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
             'title': title,
-            'url': video_url,
+            'formats': formats,
             'thumbnail': thumbnail,
             'duration': duration,
         }

From fd5d8270dcd6d8baada3390a4a1cae5bdbcb6da4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Sat, 8 Aug 2015 01:10:41 +0600
Subject: [PATCH 2/2] [clipfish] Fix extraction, minimize requests, get rid of drm hds, extract m3u8 and more metadata

---
 youtube_dl/extractor/clipfish.py | 56 ++++++++++++++++++++------------
 1 file changed, 35 insertions(+), 21 deletions(-)

diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
index 09dfaac60..7af903571 100644
--- a/youtube_dl/extractor/clipfish.py
+++ b/youtube_dl/extractor/clipfish.py
@@ -1,18 +1,19 @@
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..utils import (
-    ExtractorError,
+    determine_ext,
     int_or_none,
     js_to_json,
-    determine_ext,
+    parse_iso8601,
+    remove_end,
 )
 
 
 class ClipfishIE(InfoExtractor):
-    IE_NAME = 'clipfish'
-
-    _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
+    _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'
     _TEST = {
         'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
         'md5': '79bc922f3e8a9097b3d68a93780fd475',
@@ -20,35 +21,48 @@ class ClipfishIE(InfoExtractor):
             'id': '3966754',
             'ext': 'mp4',
             'title': 'FIFA 14 - E3 2013 Trailer',
+            'timestamp': 1370938118,
+            'upload_date': '20130611',
             'duration': 82,
         }
     }
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        video_info = self._parse_json(
-            js_to_json(self._html_search_regex('var videoObject = ({[^}]+?})', webpage, 'videoObject')),
-            video_id
-        )
-        info_url = self._parse_json(
-            js_to_json(self._html_search_regex('var globalFlashvars = ({[^}]+?})', webpage, 'globalFlashvars')),
-            video_id
-        )['data']
 
-        doc = self._download_xml(
-            info_url, video_id, note='Downloading info page')
-        title = doc.find('title').text
-        video_url = doc.find('filename').text
-        thumbnail = doc.find('imageurl').text
-        duration = int_or_none(video_info['length'])
-        formats = [{'url': video_info['videourl']},{'url': video_url}]
+        webpage = self._download_webpage(url, video_id)
+
+        video_info = self._parse_json(
+            js_to_json(self._html_search_regex(
+                '(?s)videoObject\s*=\s*({.+?});', webpage, 'video object')),
+            video_id)
+
+        formats = []
+        for video_url in re.findall(r'var\s+videourl\s*=\s*"([^"]+)"', webpage):
+            ext = determine_ext(video_url)
+            if ext == 'm3u8':
+                formats.append({
+                    'url': video_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'),
+                    'ext': 'mp4',
+                    'format_id': 'hls',
+                })
+            else:
+                formats.append({
+                    'url': video_url,
+                    'format_id': ext,
+                })
         self._sort_formats(formats)
 
+        title = remove_end(self._og_search_title(webpage), ' - Video')
+        thumbnail = self._og_search_thumbnail(webpage)
+        duration = int_or_none(video_info.get('length'))
+        timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage, 'upload date'))
+
         return {
             'id': video_id,
             'title': title,
             'formats': formats,
             'thumbnail': thumbnail,
             'duration': duration,
+            'timestamp': timestamp,
         }