# yt-dlp/yt_dlp/extractor/rtp.py

from .common import InfoExtractor
from ..utils import js_to_json
import re
import json
import urllib.parse
import base64


class RTPIE(InfoExtractor):
    """Extractor for programme/episode pages under ``rtp.pt/play``."""

    _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?'
    _TESTS = [{
        'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas',
        'md5': 'e736ce0c665e459ddb818546220b4ef8',
        'info_dict': {
            'id': 'e174042',
            'ext': 'mp3',
            'title': 'Paixões Cruzadas',
            'description': 'As paixões musicais de António Cartaxo e António Macedo',
            'thumbnail': r're:^https?://.*\.jpg',
        },
    }, {
        'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',
        'only_matching': True,
    }]

    # Matches `atob(decodeURIComponent([...].join("")))` expressions; group 1
    # is the array literal of (single- or double-quoted) string fragments.
    _RX_OBFUSCATION = re.compile(r'''(?xs)
        atob\s*\(\s*decodeURIComponent\s*\(\s*
            (\[[0-9A-Za-z%,'"]*\])
        \s*\.\s*join\(\s*(?:""|'')\s*\)\s*\)\s*\)
    ''')

    def __unobfuscate(self, data, *, video_id):
        """Turn a JavaScript value from the player page into JSON text.

        When *data* is an object literal, every obfuscated
        ``atob(decodeURIComponent([...].join("")))`` expression in it is
        replaced by the JSON string literal of its decoded value. The result
        is then passed through ``js_to_json``.

        :param data: JavaScript source of the value (string or object literal).
        :param video_id: identifier forwarded to ``_parse_json`` for error
            reporting.
        :return: the transformed source as returned by ``js_to_json``.
        """
        if data.startswith('{'):
            data = self._RX_OBFUSCATION.sub(
                lambda m: json.dumps(
                    base64.b64decode(urllib.parse.unquote(
                        # The pattern admits single-quoted fragments, which
                        # strict JSON rejects; normalise the literal first.
                        ''.join(self._parse_json(
                            m.group(1), video_id, transform_source=js_to_json))
                    )).decode('iso-8859-1')),
                data)
        return js_to_json(data)

    def _real_extract(self, url):
        """Build the info dict for the RTP Play page at *url*.

        Reads the inline ``var f = ...`` media descriptor and the
        ``RTPPlayer`` config object from the page, then derives formats,
        subtitles and basic metadata from them.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)
        title = self._html_search_meta(
            'twitter:title', webpage, display_name='title', fatal=True)

        # `f` is either a quoted string or an object literal; `config` is the
        # object passed to `new RTPPlayer(...)`. The `*/` lookaheads reject
        # matches that are terminated by a JavaScript block-comment end.
        f, config = self._search_regex(
            r'''(?sx)
                var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*
                var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/)
            ''', webpage,
            'player config', group=('f', 'config'))

        f = self._parse_json(
            f, video_id,
            lambda data: self.__unobfuscate(data, video_id=video_id))
        config = self._parse_json(
            config, video_id,
            lambda data: self.__unobfuscate(data, video_id=video_id))

        formats = []
        if isinstance(f, dict):
            # Object form: manifest URLs keyed by protocol. Skip missing or
            # empty entries instead of handing them to the manifest parsers.
            hls_url = f.get('hls')
            if hls_url:
                formats.extend(self._extract_m3u8_formats(
                    hls_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'))

            dash_url = f.get('dash')
            if dash_url:
                formats.extend(self._extract_mpd_formats(dash_url, video_id, mpd_id='dash'))
        else:
            # Non-dict form: `f` itself is used as the media URL.
            formats.append({
                'format_id': 'f',
                'url': f,
                'vcodec': 'none' if config.get('mediaType') == 'audio' else None,
            })

        subtitles = {}
        # Each `vtt` entry unpacks as (language code, display name, URL); a
        # distinct loop name keeps the `url` argument intact.
        for lcode, lname, sub_url in config.get('vtt') or []:
            subtitles.setdefault(lcode, []).append({
                'name': lname,
                'url': sub_url,
            })

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': self._html_search_meta(['description', 'twitter:description'], webpage),
            'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage),
            'subtitles': subtitles,
        }
[rtp] Add new extractor (Closes #4382) 2014-12-12 18:22:24 +00:00			`from .common import InfoExtractor`
[RTP] Fix extraction and add subtitles (#497) Authored by: fstirlitz 2021-07-13 23:36:18 +00:00			`from ..utils import js_to_json`
			`import re`
			`import json`
			`import urllib.parse`
			`import base64`
[rtp] Add new extractor (Closes #4382) 2014-12-12 18:22:24 +00:00

			`class RTPIE(InfoExtractor):`
[rtp] Also match e-id-less URLs (#4382) 2014-12-13 23:13:07 +00:00			`_VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?'`
			`_TESTS = [{`
[rtp] Add new extractor (Closes #4382) 2014-12-12 18:22:24 +00:00			`'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas',`
[rtp] Construct regular HTTP download URLs (#4882) 2015-02-06 20:59:17 +00:00			`'md5': 'e736ce0c665e459ddb818546220b4ef8',`
[rtp] Add new extractor (Closes #4382) 2014-12-12 18:22:24 +00:00			`'info_dict': {`
[rtp] Fix test's id field 2014-12-21 14:28:40 +00:00			`'id': 'e174042',`
[rtp] Add new extractor (Closes #4382) 2014-12-12 18:22:24 +00:00			`'ext': 'mp3',`
			`'title': 'Paixões Cruzadas',`
			`'description': 'As paixões musicais de António Cartaxo e António Macedo',`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 12:08:07 +00:00			`'thumbnail': r're:^https?://.*\.jpg',`
[rtp] Add new extractor (Closes #4382) 2014-12-12 18:22:24 +00:00			`},`
[rtp] Also match e-id-less URLs (#4382) 2014-12-13 23:13:07 +00:00			`}, {`
			`'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',`
			`'only_matching': True,`
			`}]`
[rtp] Add new extractor (Closes #4382) 2014-12-12 18:22:24 +00:00
[RTP] Fix extraction and add subtitles (#497) Authored by: fstirlitz 2021-07-13 23:36:18 +00:00			`_RX_OBFUSCATION = re.compile(r'''(?xs)`
			`atob\s*\(\s*decodeURIComponent\s*\(\s*`
			`(\[[0-9A-Za-z%,'"]*\])`
			`\s*\.\s*join\(\s*(?:""|'')\s*\)\s*\)\s*\)`
			`''')`

			`def __unobfuscate(self, data, *, video_id):`
			`if data.startswith('{'):`
			`data = self._RX_OBFUSCATION.sub(`
			`lambda m: json.dumps(`
			`base64.b64decode(urllib.parse.unquote(`
			`''.join(self._parse_json(m.group(1), video_id))`
			`)).decode('iso-8859-1')),`
			`data)`
			`return js_to_json(data)`

[rtp] Add new extractor (Closes #4382) 2014-12-12 18:22:24 +00:00			`def _real_extract(self, url):`
			`video_id = self._match_id(url)`

			`webpage = self._download_webpage(url, video_id)`
			`title = self._html_search_meta(`
			`'twitter:title', webpage, display_name='title', fatal=True)`
[rtp] Construct regular HTTP download URLs (#4882) 2015-02-06 20:59:17 +00:00
[RTP] Fix extraction and add subtitles (#497) Authored by: fstirlitz 2021-07-13 23:36:18 +00:00			`f, config = self._search_regex(`
			`r'''(?sx)`
			`var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*`
			`var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/)`
			`''', webpage,`
			`'player config', group=('f', 'config'))`

			`f = self._parse_json(`
			`f, video_id,`
			`lambda data: self.__unobfuscate(data, video_id=video_id))`
			`config = self._parse_json(`
			`config, video_id,`
			`lambda data: self.__unobfuscate(data, video_id=video_id))`

			`formats = []`
			`if isinstance(f, dict):`
			`f_hls = f.get('hls')`
			`if f_hls is not None:`
			`formats.extend(self._extract_m3u8_formats(`
			`f_hls, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'))`

			`f_dash = f.get('dash')`
			`if f_dash is not None:`
			`formats.extend(self._extract_mpd_formats(f_dash, video_id, mpd_id='dash'))`
[rtp] fix extraction(closes #15099) 2019-05-28 03:58:12 +00:00			`else:`
[RTP] Fix extraction and add subtitles (#497) Authored by: fstirlitz 2021-07-13 23:36:18 +00:00			`formats.append({`
			`'format_id': 'f',`
			`'url': f,`
			`'vcodec': 'none' if config.get('mediaType') == 'audio' else None,`
			`})`

			`subtitles = {}`

			`vtt = config.get('vtt')`
			`if vtt is not None:`
			`for lcode, lname, url in vtt:`
			`subtitles.setdefault(lcode, []).append({`
			`'name': lname,`
			`'url': url,`
			`})`
[rtp] Construct regular HTTP download URLs (#4882) 2015-02-06 20:59:17 +00:00
[rtp] Add new extractor (Closes #4382) 2014-12-12 18:22:24 +00:00			`return {`
			`'id': video_id,`
			`'title': title,`
			`'formats': formats,`
[rtp] fix extraction(closes #15099) 2019-05-28 03:58:12 +00:00			`'description': self._html_search_meta(['description', 'twitter:description'], webpage),`
			`'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage),`
[RTP] Fix extraction and add subtitles (#497) Authored by: fstirlitz 2021-07-13 23:36:18 +00:00			`'subtitles': subtitles,`
[rtp] Add new extractor (Closes #4382) 2014-12-12 18:22:24 +00:00			`}`