[wistia] Add extractor

This commit is contained in:
Philipp Hagemeister 2013-12-06 09:15:04 +01:00
parent 72135030d1
commit ef4fd84857
4 changed files with 80 additions and 10 deletions

View File

@ -488,7 +488,8 @@ def make_result(embedded_info):
new_result = ie_result.copy() new_result = ie_result.copy()
for f in ('_type', 'url', 'ext', 'player_url', 'formats', for f in ('_type', 'url', 'ext', 'player_url', 'formats',
'entries', 'urlhandle', 'ie_key', 'duration', 'entries', 'urlhandle', 'ie_key', 'duration',
'subtitles', 'annotations', 'format'): 'subtitles', 'annotations', 'format',
'thumbnail', 'thumbnails'):
if f in new_result: if f in new_result:
del new_result[f] del new_result[f]
if f in embedded_info: if f in embedded_info:

View File

@ -178,6 +178,7 @@
from .websurg import WeBSurgIE from .websurg import WeBSurgIE
from .weibo import WeiboIE from .weibo import WeiboIE
from .wimp import WimpIE from .wimp import WimpIE
from .wistia import WistiaIE
from .worldstarhiphop import WorldStarHipHopIE from .worldstarhiphop import WorldStarHipHopIE
from .xhamster import XHamsterIE from .xhamster import XHamsterIE
from .xnxx import XNXXIE from .xnxx import XNXXIE

View File

@ -169,8 +169,13 @@ def _real_extract(self, url):
# Site Name | Video Title # Site Name | Video Title
# Video Title - Tagline | Site Name # Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical # and so on and so forth; it's just not practical
video_title = self._html_search_regex(r'<title>(.*)</title>', video_title = self._html_search_regex(
webpage, u'video title', default=u'video', flags=re.DOTALL) r'(?s)<title>(.*?)</title>', webpage, u'video title',
default=u'video')
# video uploader is domain name
video_uploader = self._search_regex(
r'^(?:https?://)?([^/]*)/.*', url, u'video uploader')
# Look for BrightCove: # Look for BrightCove:
bc_url = BrightcoveIE._extract_brightcove_url(webpage) bc_url = BrightcoveIE._extract_brightcove_url(webpage)
@ -188,7 +193,7 @@ def _real_extract(self, url):
# Look for embedded YouTube player # Look for embedded YouTube player
matches = re.findall( matches = re.findall(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage) r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage)
if matches: if matches:
urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube') urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
for tuppl in matches] for tuppl in matches]
@ -197,13 +202,26 @@ def _real_extract(self, url):
# Look for embedded Dailymotion player # Look for embedded Dailymotion player
matches = re.findall( matches = re.findall(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion.com/embed/video/.+?)\1', webpage) r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
if matches: if matches:
urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion') urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')
for tuppl in matches] for tuppl in matches]
return self.playlist_result( return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title) urlrs, playlist_id=video_id, playlist_title=video_title)
# Look for embedded Wistia player
match = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
if match:
return {
'_type': 'url_transparent',
'url': unescapeHTML(match.group('url')),
'ie_key': 'Wistia',
'uploader': video_uploader,
'title': video_title,
'id': video_id,
}
# Look for Bandcamp pages with custom domain # Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
if mobj is not None: if mobj is not None:
@ -247,14 +265,9 @@ def _real_extract(self, url):
# here's a fun little line of code for you: # here's a fun little line of code for you:
video_id = os.path.splitext(video_id)[0] video_id = os.path.splitext(video_id)[0]
# video uploader is domain name
video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
url, u'video uploader')
return { return {
'id': video_id, 'id': video_id,
'url': video_url, 'url': video_url,
'uploader': video_uploader, 'uploader': video_uploader,
'upload_date': None,
'title': video_title, 'title': video_title,
} }

View File

@ -0,0 +1,55 @@
import json
import re
from .common import InfoExtractor
class WistiaIE(InfoExtractor):
    """Extractor for Wistia iframe embed pages (wistia.net/embed/iframe/<id>)."""

    # The named group ``id`` captures the lowercase alphanumeric video hash.
    _VALID_URL = r'^https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)'

    _TEST = {
        u"url": u"http://fast.wistia.net/embed/iframe/sh7fpupwlt",
        u"file": u"sh7fpupwlt.mov",
        u"md5": u"cafeb56ec0c53c18c97405eecb3133df",
        u"info_dict": {
            u"title": u"cfh_resourceful_zdkh_final_1"
        },
    }

    def _real_extract(self, url):
        """Download the embed page and build the info dict from its JSON.

        The page is searched for a ``Wistia.iframeInit(<json>, {});`` call;
        the first argument is parsed as JSON.  Its ``assets`` mapping is
        split into thumbnails (the ``still`` asset) and downloadable
        formats (everything else except ``preview``, which is ignored).

        Returns a dict with the keys ``id``, ``title``, ``formats`` and
        ``thumbnails``.  Formats are ordered by ascending file size.

        Raises KeyError if the JSON lacks ``assets``/``name`` or if an
        asset lacks one of the fields read below.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        # The dot is escaped so that only the literal call
        # "Wistia.iframeInit(" is matched, not "Wistia<any char>iframeInit(".
        data_json = self._html_search_regex(
            r'Wistia\.iframeInit\((.*?), {}\);', webpage, u'video data')
        data = json.loads(data_json)

        formats = []
        thumbnails = []
        for atype, a in data['assets'].items():
            if atype == 'still':
                # Still images are exposed as thumbnails, not as formats.
                thumbnails.append({
                    'url': a['url'],
                    'resolution': '%dx%d' % (a['width'], a['height']),
                })
                continue
            if atype == 'preview':
                # Preview assets are deliberately not offered as a format.
                continue
            formats.append({
                'format_id': atype,
                'url': a['url'],
                'width': a['width'],
                'height': a['height'],
                'filesize': a['size'],
                'ext': a['ext'],
            })
        # Smallest file first, largest last.
        formats.sort(key=lambda a: a['filesize'])

        return {
            'id': video_id,
            'title': data['name'],
            'formats': formats,
            'thumbnails': thumbnails,
        }