0
0
Fork 0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2024-12-22 06:00:00 +00:00

Add an extractor for orf.at (closes #1346)

Make find_xpath_attr also accept numbers in the value
This commit is contained in:
Jaime Marquínez Ferrándiz 2013-08-29 19:16:07 +02:00
parent 54fda45bac
commit 545434670b
3 changed files with 67 additions and 1 deletions

View file

@ -59,6 +59,7 @@
from .nba import NBAIE
from .nbc import NBCNewsIE
from .ooyala import OoyalaIE
from .orf import ORFIE
from .pbs import PBSIE
from .photobucket import PhotobucketIE
from .pornotube import PornotubeIE

View file

@ -0,0 +1,65 @@
import re
import xml.etree.ElementTree
import json
from .common import InfoExtractor
from ..utils import (
compat_urlparse,
ExtractorError,
find_xpath_attr,
)
class ORFIE(InfoExtractor):
_VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'
_TEST = {
u'url': u'http://tvthek.orf.at/programs/1171769-Wetter-ZIB/episodes/6557323-Wetter',
u'file': u'6566957.flv',
u'info_dict': {
u'title': u'Wetter',
u'description': u'Christa Kummer, Marcus Wadsak und Kollegen präsentieren abwechselnd ihre täglichen Wetterprognosen für Österreich.\r \r Mehr Wetter unter wetter.ORF.at',
},
u'params': {
# It uses rtmp
u'skip_download': True,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
playlist_id = mobj.group('id')
webpage = self._download_webpage(url, playlist_id)
flash_xml = self._search_regex('ORF.flashXML = \'(.+?)\'', webpage, u'flash xml')
flash_xml = compat_urlparse.parse_qs('xml='+flash_xml)['xml'][0]
flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8'))
playlist_json = self._search_regex(r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"','"')
playlist = json.loads(playlist_json)
videos = []
ns = '{http://tempuri.org/XMLSchema.xsd}'
xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns}
webpage_description = self._og_search_description(webpage)
for (i, (item, info)) in enumerate(zip(flash_config.findall(xpath), playlist), 1):
# Get best quality url
rtmp_url = None
for q in ['Q6A', 'Q4A', 'Q1A']:
video_url = find_xpath_attr(item, '%sVideoUrl' % ns, 'quality', q)
if video_url is not None:
rtmp_url = video_url.text
break
if rtmp_url is None:
raise ExtractorError(u'Couldn\'t get video url: %s' % info['id'])
description = self._html_search_regex(
r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage,
u'description', default=webpage_description, flags=re.DOTALL)
videos.append({
'_type': 'video',
'id': info['id'],
'title': info['title'],
'url': rtmp_url,
'ext': 'flv',
'description': description,
})
return videos

View file

@ -213,7 +213,7 @@ def write_json_file(obj, fn):
def find_xpath_attr(node, xpath, key, val):
""" Find the xpath xpath[@key=val] """
assert re.match(r'^[a-zA-Z]+$', key)
assert re.match(r'^[a-zA-Z@\s]*$', val)
assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
expr = xpath + u"[@%s='%s']" % (key, val)
return node.find(expr)
else: