[bbccouk] Convert to new subtitles system

I haven't found any video available outside the UK, so I haven't added tests. I have updated how the srt file is build, because (at least for www.bbc.co.uk/programmes/p02j9b69) the subtitles is inside 'span' elements.
2024-11-16 01:25:06 +00:00 · 2015-02-19 16:46:41 +01:00 · 2015-02-19 16:46:41 +01:00 · f13b1e7d7f
commit f13b1e7d7f
parent 8807f1277f
1 changed files with 22 additions and 11 deletions
--- a/youtube_dl/extractor/bbccouk.py
+++ b/youtube_dl/extractor/bbccouk.py
@ -2,12 +2,12 @@

 import xml.etree.ElementTree

-from .subtitles import SubtitlesInfoExtractor
+from .common import InfoExtractor
 from ..utils import ExtractorError
 from ..compat import compat_HTTPError


-class BBCCoUkIE(SubtitlesInfoExtractor):
+class BBCCoUkIE(InfoExtractor):
    IE_NAME = 'bbc.co.uk'
    IE_DESC = 'BBC iPlayer'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
@ -215,17 +215,32 @@ def _extract_audio(self, media, programme_id):
            formats.extend(conn_formats)
        return formats

-    def _extract_captions(self, media, programme_id):
+    def _get_subtitles(self, media, programme_id):
        subtitles = {}
        for connection in self._extract_connections(media):
            captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
            lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
            ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
            srt = ''
+
+            def _extract_text(p):
+                if p.text is not None:
+                    stripped_text = p.text.strip()
+                    if stripped_text:
+                        return stripped_text
+                return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span'))
            for pos, p in enumerate(ps):
-                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'),
-                                                          p.text.strip() if p.text is not None else '')
-            subtitles[lang] = srt
+                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
+            subtitles[lang] = [
+                {
+                    'url': connection.get('href'),
+                    'ext': 'ttml',
+                },
+                {
+                    'data': srt,
+                    'ext': 'srt',
+                },
+            ]
        return subtitles

    def _download_media_selector(self, programme_id):
@ -249,7 +264,7 @@ def _download_media_selector(self, programme_id):
            elif kind == 'video':
                formats.extend(self._extract_video(media, programme_id))
            elif kind == 'captions':
-                subtitles = self._extract_captions(media, programme_id)
+                subtitles = self.extract_subtitles(media, programme_id)

        return formats, subtitles

@ -324,10 +339,6 @@ def _real_extract(self, url):
        else:
            programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)

-        if self._downloader.params.get('listsubtitles', False):
-            self._list_available_subtitles(programme_id, subtitles)
-            return
-
        self._sort_formats(formats)

        return {