[extractor/common] Expose fragments interface for dashsegments formats

2024-11-27 03:03:01 +00:00 · 2016-09-06 01:21:57 +07:00 · 2016-09-06 01:21:57 +07:00 · b4c1d6e800
commit b4c1d6e800
parent a0d5077c8d
1 changed files with 99 additions and 43 deletions
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -1551,21 +1551,12 @@ def is_drm_protected(element):

        def extract_multisegment_info(element, ms_parent_info):
            ms_info = ms_parent_info.copy()
-            segment_list = element.find(_add_ns('SegmentList'))
-            if segment_list is not None:
-                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
-                if segment_urls_e:
-                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
-                initialization = segment_list.find(_add_ns('Initialization'))
-                if initialization is not None:
-                    ms_info['initialization_url'] = initialization.attrib['sourceURL']
-            else:
-                segment_template = element.find(_add_ns('SegmentTemplate'))
-                if segment_template is not None:
-                    start_number = segment_template.get('startNumber')
-                    if start_number:
-                        ms_info['start_number'] = int(start_number)
-                    segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
+
+            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
+            # common attributes and elements.  We will only extract the ones
+            # relevant for us.
+            def extract_common(source):
+                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    if s_e:
@ -1580,13 +1571,32 @@ def extract_multisegment_info(element, ms_parent_info):
                                'd': int(s.attrib['d']),
                                'r': r,
                            })
-                    else:
-                        timescale = segment_template.get('timescale')
+                start_number = source.get('startNumber')
+                if start_number:
+                    ms_info['start_number'] = int(start_number)
+                timescale = source.get('timescale')
                if timescale:
                    ms_info['timescale'] = int(timescale)
-                        segment_duration = segment_template.get('duration')
+                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = int(segment_duration)
+
+            # Store the sourceURL attribute of source's Initialization child
+            # element, when there is one, as ms_info['initialization_url'].
+            def extract_Initialization(source):
+                initialization = source.find(_add_ns('Initialization'))
+                if initialization is not None:
+                    ms_info['initialization_url'] = initialization.attrib['sourceURL']
+
+            segment_list = element.find(_add_ns('SegmentList'))
+            if segment_list is not None:
+                extract_common(segment_list)
+                extract_Initialization(segment_list)
+                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
+                if segment_urls_e:
+                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
+            else:
+                segment_template = element.find(_add_ns('SegmentTemplate'))
+                if segment_template is not None:
+                    extract_common(segment_template)
                    media_template = segment_template.get('media')
                    if media_template:
                        ms_info['media_template'] = media_template
@ -1594,11 +1604,14 @@ def extract_multisegment_info(element, ms_parent_info):
                    if initialization:
                        ms_info['initialization_url'] = initialization
                    else:
-                        initialization = segment_template.find(_add_ns('Initialization'))
-                        if initialization is not None:
-                            ms_info['initialization_url'] = initialization.attrib['sourceURL']
+                        extract_Initialization(segment_template)
            return ms_info

+        # Return target_url unchanged when it starts with http:// or https://;
+        # otherwise join it onto base_url, inserting a '/' unless base_url
+        # already ends with one.
+        def combine_url(base_url, target_url):
+            if re.match(r'^https?://', target_url):
+                return target_url
+            return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url)
+
        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        formats = []
        for period in mpd_doc.findall(_add_ns('Period')):
@ -1655,9 +1668,7 @@ def extract_multisegment_info(element, ms_parent_info):
                        }
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
                        if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
-                            if 'total_number' not in representation_ms_info and 'segment_duration':
-                                segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
-                                representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
+
                            media_template = representation_ms_info['media_template']
                            media_template = media_template.replace('$RepresentationID$', representation_id)
                            media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
@ -1666,7 +1677,11 @@ def extract_multisegment_info(element, ms_parent_info):

                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
-                            if '%(Number' in media_template:
+                            if '%(Number' in media_template and 's' not in representation_ms_info:
+                                segment_duration = None
+                                # Derive total_number from the period duration only when
+                                # a per-segment duration has actually been extracted
+                                if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
+                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
+                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['segment_urls'] = [
                                    media_template % {
                                        'Number': segment_number,
@ -1675,28 +1690,65 @@ def extract_multisegment_info(element, ms_parent_info):
                                    for segment_number in range(
                                        representation_ms_info['start_number'],
                                        representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+                                representation_ms_info['fragments'] = [{
+                                    'url': media_template % {
+                                        'Number': segment_number,
+                                        'Bandwidth': representation_attrib.get('bandwidth'),
+                                    },
+                                    'duration': segment_duration,
+                                } for segment_number in range(
+                                    representation_ms_info['start_number'],
+                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                            else:
+                                # $Number*$ or $Time$ in media template with S list available
+                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
+                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                                representation_ms_info['segment_urls'] = []
+                                representation_ms_info['fragments'] = []
                                segment_time = 0
+                                segment_d = None
+                                segment_number = representation_ms_info['start_number']

                                def add_segment_url():
-                                    representation_ms_info['segment_urls'].append(
-                                        media_template % {
+                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': representation_attrib.get('bandwidth'),
+                                        'Number': segment_number,
                                    }
-                                    )
+                                    representation_ms_info['segment_urls'].append(segment_url)
+                                    representation_ms_info['fragments'].append({
+                                        'url': segment_url,
+                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),
+                                    })

                                for num, s in enumerate(representation_ms_info['s']):
                                    segment_time = s.get('t') or segment_time
+                                    segment_d = s['d']
                                    add_segment_url()
+                                    segment_number += 1
                                    for r in range(s.get('r', 0)):
-                                        segment_time += s['d']
+                                        segment_time += segment_d
                                        add_segment_url()
-                                    segment_time += s['d']
+                                        segment_number += 1
+                                    segment_time += segment_d
+                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
+                            # No media template
+                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
+                            # or any YouTube dashsegments video
+                            segment_urls = representation_ms_info['segment_urls']
+                            timescale = representation_ms_info['timescale']
+                            # Expand the S list into one duration per segment: every
+                            # S entry describes r + 1 consecutive segments of length d
+                            segment_durations = []
+                            for s in representation_ms_info['s']:
+                                segment_durations.extend(
+                                    [float_or_none(s['d'], timescale)] * (s.get('r', 0) + 1))
+                            # Pair each segment URL with its own duration; URLs the
+                            # timeline does not cover get an unknown (None) duration
+                            representation_ms_info['fragments'] = [{
+                                'url': segment_url,
+                                'duration': segment_durations[idx] if idx < len(segment_durations) else None,
+                            } for idx, segment_url in enumerate(segment_urls)]
+                        elif 'segment_urls' in representation_ms_info:
+                            # Segment URLs without an S list: no per-segment duration
+                            # is known, but 'fragments' must still be set because it
+                            # is read below whenever 'segment_urls' is present
+                            representation_ms_info['fragments'] = [
+                                {'url': segment_url, 'duration': None}
+                                for segment_url in representation_ms_info['segment_urls']]
                        if 'segment_urls' in representation_ms_info:
                            f.update({
                                'segment_urls': representation_ms_info['segment_urls'],
+                                'fragments': [],
                                'protocol': 'http_dash_segments',
                            })
                            if 'initialization_url' in representation_ms_info:
@ -1706,6 +1758,10 @@ def add_segment_url():
                                })
                                if not f.get('url'):
                                    f['url'] = initialization_url
+                                f['fragments'].append({'url': initialization_url})
+                            f['fragments'].extend(representation_ms_info['fragments'])
+                            for fragment in f['fragments']:
+                                fragment['url'] = combine_url(base_url, fragment['url'])
                        try:
                            existing_format = next(
                                fo for fo in formats