[vevo] Some improvements (fixes #1580)

Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id}. Some videos don't have a SMIL manifest, so extract the video URLs directly from the JSON and use the last (highest-numbered) version of the video. Extract all the available formats and set the 'formats' field of the result.
2024-12-22 06:00:00 +00:00 · 2013-10-08 21:23:55 +02:00 · 2013-10-08 21:23:55 +02:00 · 88bd97e34c
commit 88bd97e34c
parent 2ae3edb1cf
1 changed files with 46 additions and 22 deletions
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@ -1,11 +1,15 @@
 import re
 import json
 import xml.etree.ElementTree
 import datetime
 from .common import InfoExtractor
 from ..utils import (
    determine_ext,
    ExtractorError,
 )
 class VevoIE(InfoExtractor):
    """
    Accepts urls from vevo.com or in the format 'vevo:{id}'
@ -15,11 +19,11 @@ class VevoIE(InfoExtractor):
    _TEST = {
        u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
        u'file': u'GB1101300280.mp4',
        u'md5': u'06bea460acb744eab74a9d7dcb4bfd61',
        u'info_dict': {
            u"upload_date": u"20130624",
            u"uploader": u"Hurts",
-            u"title": u"Somebody to Die For"
+            u"title": u"Somebody to Die For",
            u'duration': 230,
        }
    }
@ -27,27 +31,47 @@ def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
-        json_url = 'http://www.vevo.com/data/video/%s' % video_id
+        json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
        base_url = 'http://smil.lvl3.vevo.com'
        videos_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (base_url, video_id, video_id.lower())
        info_json = self._download_webpage(json_url, video_id, u'Downloading json info')
        links_webpage = self._download_webpage(videos_url, video_id, u'Downloading videos urls')
        self.report_extraction(video_id)
-        video_info = json.loads(info_json)
+        video_info = json.loads(info_json)['video']
-        m_urls = list(re.finditer(r'<video src="(?P<ext>.*?):/?(?P<url>.*?)"', links_webpage))
+        last_version = {'version': -1}
-        if m_urls is None or len(m_urls) == 0:
+        for version in video_info['videoVersions']:
-            raise ExtractorError(u'Unable to extract video url')
+            # These are the HTTP downloads, other types are for different manifests
-        # They are sorted from worst to best quality
+            if version['sourceType'] == 2:
-        m_url = m_urls[-1]
+                if version['version'] > last_version['version']:
-        video_url = base_url + '/' + m_url.group('url')
+                    last_version = version
-        ext = m_url.group('ext')
+        if last_version['version'] == -1:
            raise ExtractorError(u'Unable to extract last version of the video')
-        return {'url': video_url,
+        renditions = xml.etree.ElementTree.fromstring(last_version['data'])
-                'ext': ext,
+        formats = []
        # Already sorted from worst to best quality
        for rend in renditions.findall('rendition'):
            attr = rend.attrib
            f_url = attr['url']
            formats.append({
                'url': f_url,
                'ext': determine_ext(f_url),
                'height': int(attr['frameheight']),
                'width': int(attr['frameWidth']),
            })
        date_epoch = int(self._search_regex(
            r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date'))/1000
        upload_date = datetime.datetime.fromtimestamp(date_epoch)
        info = {
            'id': video_id,
            'title': video_info['title'],
-                'thumbnail': video_info['img'],
+            'formats': formats,
-                'upload_date': video_info['launchDate'].replace('/',''),
+            'thumbnail': video_info['imageUrl'],
-                'uploader': video_info['Artists'][0]['title'],
+            'upload_date': upload_date.strftime('%Y%m%d'),
            'uploader': video_info['mainArtists'][0]['artistName'],
            'duration': video_info['duration'],
        }
        # TODO: Remove when #980 has been merged
        info.update(formats[-1])
        return info