yt-dlp/yt_dlp/extractor/eagleplatform.py

import functools
import re

from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import (
    ExtractorError,
    int_or_none,
    smuggle_url,
    unsmuggle_url,
    url_or_none,
)


class EaglePlatformIE(InfoExtractor):
    """Extractor for Eagle Platform hosted players.

    Accepts direct ``*.media.eagleplatform.com/index/player`` URLs as well as
    the internal ``eagleplatform:<host>:<id>`` form that the embed detection
    in this module produces.
    """

    _VALID_URL = r'''(?x)
                    (?:
                        eagleplatform:(?P<custom_host>[^/]+):|
                        https?://(?P<host>.+?\.media\.eagleplatform\.com)/index/player\?.*\brecord_id=
                    )
                    (?P<id>\d+)
                '''
    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1']
    _TESTS = [{
        # http://lenta.ru/news/2015/03/06/navalny/
        'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201',
        # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
        'info_dict': {
            'id': '227304',
            'ext': 'mp4',
            'title': 'Навальный вышел на свободу',
            'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 87,
            'view_count': int,
            'age_limit': 0,
        },
    }, {
        # http://muz-tv.ru/play/7129/
        # http://media.clipyou.ru/index/player?record_id=12820&width=730&height=415&autoplay=true
        'url': 'eagleplatform:media.clipyou.ru:12820',
        'md5': '358597369cf8ba56675c1df15e7af624',
        'info_dict': {
            'id': '12820',
            'ext': 'mp4',
            'title': "'O Sole Mio",
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 216,
            'view_count': int,
        },
        'skip': 'Georestricted',
    }, {
        # referrer protected video (https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/)
        'url': 'eagleplatform:tvrainru.media.eagleplatform.com:582306',
        'only_matching': True,
    }]

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Return the Eagle Platform embed URLs found in *webpage*.

        Every returned URL is smuggled with ``{'referrer': url}`` so that
        ``_real_extract`` can forward the embedding page as referrer.

        Iframe embeds (matched through ``_EMBED_REGEX`` by the parent
        implementation) take precedence; otherwise the two ``player.js``
        based embedding styles are tried in turn. An empty list is returned
        when nothing matches.
        """
        add_referer = functools.partial(smuggle_url, data={'referrer': url})

        # Materialize the parent's result so that emptiness can be tested
        res = tuple(super()._extract_embed_urls(url, webpage))
        if res:
            return map(add_referer, res)

        # Shared prefix of both patterns below: the player.js <script> tag,
        # whose src provides the host of the player
        PLAYER_JS_RE = r'''
                        <script[^>]+
                            src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs)
                        .+?
                    '''
        # "Basic usage" embedding (see http://dultonmedia.github.io/eplayer/)
        mobj = re.search(
            r'''(?xs)
                    %s
                    <div[^>]+
                        class=(?P<qclass>["\'])eagleplayer(?P=qclass)[^>]+
                        data-id=["\'](?P<id>\d+)
            ''' % PLAYER_JS_RE, webpage)
        if mobj is not None:
            return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())]
        # Generalization of "Javascript code usage", "Combined usage" and
        # "Usage without attaching to DOM" embeddings (see
        # http://dultonmedia.github.io/eplayer/)
        mobj = re.search(
            r'''(?xs)
                    %s
                    <script>
                    .+?
                    new\s+EaglePlayer\(
                        (?:[^,]+\s*,\s*)?
                        {
                            .+?
                            \bid\s*:\s*["\']?(?P<id>\d+)
                            .+?
                        }
                    \s*\)
                    .+?
                    </script>
            ''' % PLAYER_JS_RE, webpage)
        if mobj is not None:
            return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())]

        # Explicit empty result instead of an implicit None
        return []

    @staticmethod
    def _handle_error(response):
        """Raise an expected ``ExtractorError`` if *response* reports a failure.

        *response* is a parsed API payload. A payload whose ``status`` is
        missing is treated as successful (status 200). Anything that is not a
        dict carries no status information and is ignored.

        Raises:
            ExtractorError: when ``status`` is not 200; the message is built
                from the payload's ``errors`` entry when it is present.
        """
        if not isinstance(response, dict):
            return
        status = int_or_none(response.get('status', 200))
        if status == 200:
            return
        # Do not assume 'errors' is present or is a list of strings: a
        # KeyError/TypeError raised here would hide the actual API failure
        errors = response.get('errors') or []
        if not isinstance(errors, (list, tuple)):
            errors = [errors]
        message = ' '.join(map(str, errors))
        if not message:
            message = 'Eagle Platform API error (status %s)' % response.get('status')
        raise ExtractorError(message, expected=True)

    def _download_json(self, url_or_request, video_id, *args, **kwargs):
        """Download JSON like the parent method, surfacing API error messages.

        When the parent raises because of an HTTP error, the error body is
        parsed as JSON and handed to ``_handle_error`` so that the error text
        from the payload is raised instead. If the body yields no such error,
        the original exception is re-raised.
        """
        try:
            return super()._download_json(
                url_or_request, video_id, *args, **kwargs)
        except ExtractorError as ee:
            if isinstance(ee.cause, compat_HTTPError):
                # Non-fatal parsing: an error body that is not JSON must not
                # replace the original HTTP error with a JSON parse error
                response = self._parse_json(
                    ee.cause.read().decode('utf-8'), video_id, fatal=False)
                self._handle_error(response)
            raise

    def _get_video_url(self, url_or_request, video_id, note='Downloading JSON metadata'):
        """Download a JSON document and return the first item of its ``data`` entry."""
        return self._download_json(url_or_request, video_id, note)['data'][0]

    def _real_extract(self, url):
        """Build the info dict (metadata plus HLS and direct HTTP formats) for *url*."""
        url, smuggled_data = unsmuggle_url(url, {})

        mobj = self._match_valid_url(url)
        host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id')

        headers = {}
        query = {
            'id': video_id,
        }

        # The referrer smuggled in by the embed detection is sent both as an
        # HTTP header and as a query parameter
        referrer = smuggled_data.get('referrer')
        if referrer:
            headers['Referer'] = referrer
            query['referrer'] = referrer

        player_data = self._download_json(
            'http://%s/api/player_data' % host, video_id,
            headers=headers, query=query)

        media = player_data['data']['playlist']['viewports'][0]['medialist'][0]

        title = media['title']
        description = media.get('description')
        thumbnail = self._proto_relative_url(media.get('snapshot'), 'http:')
        duration = int_or_none(media.get('duration'))
        view_count = int_or_none(media.get('views'))

        # Any restriction other than 'allow_all' is reported as 18+
        age_restriction = media.get('age_restriction')
        age_limit = None
        if age_restriction:
            age_limit = 0 if age_restriction == 'allow_all' else 18

        secure_m3u8 = self._proto_relative_url(media['sources']['secure_m3u8']['auto'], 'http:')

        formats = []

        m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON')
        m3u8_formats = self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
            m3u8_id='hls', fatal=False)
        formats.extend(m3u8_formats)

        # HLS formats indexed by height, used below as templates for the
        # direct HTTP formats of the same height (the last one wins on ties)
        m3u8_formats_dict = {
            f['height']: f for f in m3u8_formats if f.get('height') is not None}

        mp4_data = self._download_json(
            # Secure mp4 URL is constructed according to Player.prototype.mp4 from
            # http://lentaru.media.eagleplatform.com/player/player.js
            re.sub(r'm3u8|hlsvod|hls|f4m', 'mp4s', secure_m3u8),
            video_id, 'Downloading mp4 JSON', fatal=False)
        # This request is best-effort (fatal=False), so an unexpected payload
        # shape must not abort the extraction either
        mp4_urls = mp4_data.get('data') if isinstance(mp4_data, dict) else None
        if isinstance(mp4_urls, dict):
            for format_id, format_url in mp4_urls.items():
                if not url_or_none(format_url):
                    continue
                height = int_or_none(format_id)
                if height is not None and m3u8_formats_dict.get(height):
                    # Reuse the metadata of the HLS format with the same height
                    f = m3u8_formats_dict[height].copy()
                    f.update({
                        'format_id': f['format_id'].replace('hls', 'http'),
                        'protocol': 'http',
                    })
                else:
                    f = {
                        'format_id': 'http-%s' % format_id,
                        'height': height,
                    }
                f['url'] = format_url
                formats.append(f)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'age_limit': age_limit,
            'formats': formats,
        }


class ClipYouEmbedIE(InfoExtractor):
    """Embed detector for ClipYou player iframes.

    The URLs it yields use the ``eagleplatform:<host>:<id>`` form.
    """

    # NOTE(review): presumably False keeps this class from matching URLs
    # directly, leaving only embed detection - confirm against InfoExtractor
    _VALID_URL = False

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Yield an ``eagleplatform:`` URL for every ClipYou iframe in *webpage*.

        Each URL is smuggled with ``{'referrer': url}``.
        """
        # [^"]* keeps the match inside the src attribute; a greedy '.*' could
        # run past its closing quote and capture a record_id from unrelated
        # markup later on the same line
        for mobj in re.finditer(
                r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?[^"]*\brecord_id=(?P<id>\d+)[^"]*"',
                webpage):
            yield smuggle_url('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), {'referrer': url})
[extractors] Use new framework for existing embeds (#4307) `Brightcove` is difficult to migrate because it's subclasses may depend on the signature of the current functions. So it is left as-is for now Note: Tests have not been migrated 2022-08-01 01:23:25 +00:00			`import functools`
[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00			`import re`

			`from .common import InfoExtractor`
Improve URL extraction 2018-07-21 12:08:28 +00:00			`from ..compat import compat_HTTPError`
[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00			`from ..utils import (`
			`ExtractorError,`
			`int_or_none,`
[extractors] Use new framework for existing embeds (#4307) `Brightcove` is difficult to migrate because it's subclasses may depend on the signature of the current functions. So it is left as-is for now Note: Tests have not been migrated 2022-08-01 01:23:25 +00:00			`smuggle_url,`
[eagleplatform] Add support for referrer protected videos (closes #13557) 2017-07-09 08:57:33 +00:00			`unsmuggle_url,`
Improve URL extraction 2018-07-21 12:08:28 +00:00			`url_or_none,`
[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00			`)`


			`class EaglePlatformIE(InfoExtractor):`
			`_VALID_URL = r'''(?x)`
			`(?:`
			`eagleplatform:(?P<custom_host>[^/]+):\|`
			`https?://(?P<host>.+?\.media\.eagleplatform\.com)/index/player\?.*\brecord_id=`
			`)`
			`(?P<id>\d+)`
			`'''`
[extractors] Use new framework for existing embeds (#4307) `Brightcove` is difficult to migrate because it's subclasses may depend on the signature of the current functions. So it is left as-is for now Note: Tests have not been migrated 2022-08-01 01:23:25 +00:00			`_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1']`
[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00			`_TESTS = [{`
			`# http://lenta.ru/news/2015/03/06/navalny/`
			`'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201',`
[eagleplatform] Checking direct HTTP links Sometimes they fail with 404 2016-04-25 14:48:17 +00:00			`# Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used`
[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00			`'info_dict': {`
			`'id': '227304',`
			`'ext': 'mp4',`
			`'title': 'Навальный вышел на свободу',`
			`'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 12:08:07 +00:00			`'thumbnail': r're:^https?://.*\.jpg$',`
[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00			`'duration': 87,`
			`'view_count': int,`
			`'age_limit': 0,`
			`},`
			`}, {`
			`# http://muz-tv.ru/play/7129/`
			`# http://media.clipyou.ru/index/player?record_id=12820&width=730&height=415&autoplay=true`
			`'url': 'eagleplatform:media.clipyou.ru:12820',`
[eagleplatform] extract all http formats 2016-04-22 13:32:38 +00:00			`'md5': '358597369cf8ba56675c1df15e7af624',`
[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00			`'info_dict': {`
			`'id': '12820',`
			`'ext': 'mp4',`
			`'title': "'O Sole Mio",`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 12:08:07 +00:00			`'thumbnail': r're:^https?://.*\.jpg$',`
[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00			`'duration': 216,`
			`'view_count': int,`
			`},`
[eagleplatform] Skip georestricted test 2015-04-04 17:36:45 +00:00			`'skip': 'Georestricted',`
[eagleplatform] Add support for referrer protected videos (closes #13557) 2017-07-09 08:57:33 +00:00			`}, {`
			`# referrer protected video (https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/)`
[eagleplatform] Fix test 2017-07-09 17:14:41 +00:00			`'url': 'eagleplatform:tvrainru.media.eagleplatform.com:582306',`
[eagleplatform] Add support for referrer protected videos (closes #13557) 2017-07-09 08:57:33 +00:00			`'only_matching': True,`
[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00			`}]`

[extractors] Use new framework for existing embeds (#4307) `Brightcove` is difficult to migrate because it's subclasses may depend on the signature of the current functions. So it is left as-is for now Note: Tests have not been migrated 2022-08-01 01:23:25 +00:00			`@classmethod`
			`def _extract_embed_urls(cls, url, webpage):`
			`add_referer = functools.partial(smuggle_url, data={'referrer': url})`

			`res = tuple(super()._extract_embed_urls(url, webpage))`
			`if res:`
			`return map(add_referer, res)`

[eagleplatform] Add support for another embed pattern (#13557) 2017-07-09 08:55:04 +00:00			`PLAYER_JS_RE = r'''`
			`<script[^>]+`
			`src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs)`
			`.+?`
			`'''`
			`# "Basic usage" embedding (see http://dultonmedia.github.io/eplayer/)`
[eagleplatform] Improve detection of embedded videos (Closes #10409) 2016-08-23 00:22:14 +00:00			`mobj = re.search(`
			`r'''(?xs)`
[eagleplatform] Add support for another embed pattern (#13557) 2017-07-09 08:55:04 +00:00			`%s`
[eagleplatform] Improve detection of embedded videos (Closes #10409) 2016-08-23 00:22:14 +00:00			`<div[^>]+`
[eagleplatform] Add support for another embed pattern (#13557) 2017-07-09 08:55:04 +00:00			`class=(?P<qclass>["\'])eagleplayer(?P=qclass)[^>]+`
[eagleplatform] Improve detection of embedded videos (Closes #10409) 2016-08-23 00:22:14 +00:00			`data-id=["\'](?P<id>\d+)`
[eagleplatform] Add support for another embed pattern (#13557) 2017-07-09 08:55:04 +00:00			`''' % PLAYER_JS_RE, webpage)`
			`if mobj is not None:`
[extractors] Use new framework for existing embeds (#4307) `Brightcove` is difficult to migrate because it's subclasses may depend on the signature of the current functions. So it is left as-is for now Note: Tests have not been migrated 2022-08-01 01:23:25 +00:00			`return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())]`
[eagleplatform] Add support for another embed pattern (#13557) 2017-07-09 08:55:04 +00:00			`# Generalization of "Javascript code usage", "Combined usage" and`
			`# "Usage without attaching to DOM" embeddings (see`
			`# http://dultonmedia.github.io/eplayer/)`
			`mobj = re.search(`
			`r'''(?xs)`
			`%s`
			`<script>`
			`.+?`
			`new\s+EaglePlayer\(`
			`(?:[^,]+\s,\s)?`
			`{`
			`.+?`
			`\bid\s:\s["\']?(?P<id>\d+)`
			`.+?`
			`}`
			`\s*\)`
			`.+?`
			`</script>`
			`''' % PLAYER_JS_RE, webpage)`
[eagleplatform] Improve detection of embedded videos (Closes #10409) 2016-08-23 00:22:14 +00:00			`if mobj is not None:`
[extractors] Use new framework for existing embeds (#4307) `Brightcove` is difficult to migrate because it's subclasses may depend on the signature of the current functions. So it is left as-is for now Note: Tests have not been migrated 2022-08-01 01:23:25 +00:00			`return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())]`
[eagleplatform] Improve embed detection and extract in separate routine (Closes #9926) 2016-06-29 16:01:34 +00:00
[eagleplatform] Make _handle_error staticmethod 2015-09-26 19:12:46 +00:00			`@staticmethod`
			`def _handle_error(response):`
[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00			`status = int_or_none(response.get('status', 200))`
			`if status != 200:`
			`raise ExtractorError(' '.join(response['errors']), expected=True)`

[eagleplatform] Add support for referrer protected videos (closes #13557) 2017-07-09 08:57:33 +00:00			`def _download_json(self, url_or_request, video_id, args, *kwargs):`
[eagleplatform] Fix error handling 2016-04-16 08:47:16 +00:00			`try:`
[eagleplatform] Add support for referrer protected videos (closes #13557) 2017-07-09 08:57:33 +00:00			`response = super(EaglePlatformIE, self)._download_json(`
			`url_or_request, video_id, args, *kwargs)`
[eagleplatform] Fix error handling 2016-04-16 08:47:16 +00:00			`except ExtractorError as ee:`
			`if isinstance(ee.cause, compat_HTTPError):`
			`response = self._parse_json(ee.cause.read().decode('utf-8'), video_id)`
			`self._handle_error(response)`
			`raise`
[eagleplatform] return the code to handle errors in all _download_json requests 2015-09-26 16:37:30 +00:00			`return response`

			`def _get_video_url(self, url_or_request, video_id, note='Downloading JSON metadata'):`
			`return self._download_json(url_or_request, video_id, note)['data'][0]`
[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00
			`def _real_extract(self, url):`
[eagleplatform] Add support for referrer protected videos (closes #13557) 2017-07-09 08:57:33 +00:00			`url, smuggled_data = unsmuggle_url(url, {})`

[extractor] Common function `_match_valid_url` 2021-08-19 01:41:24 +00:00			`mobj = self._match_valid_url(url)`
[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00			`host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id')`

[eagleplatform] Add support for referrer protected videos (closes #13557) 2017-07-09 08:57:33 +00:00			`headers = {}`
			`query = {`
			`'id': video_id,`
			`}`

			`referrer = smuggled_data.get('referrer')`
			`if referrer:`
			`headers['Referer'] = referrer`
			`query['referrer'] = referrer`

[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00			`player_data = self._download_json(`
[eagleplatform] Add support for referrer protected videos (closes #13557) 2017-07-09 08:57:33 +00:00			`'http://%s/api/player_data' % host, video_id,`
			`headers=headers, query=query)`
[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00
			`media = player_data['data']['playlist']['viewports'][0]['medialist'][0]`

			`title = media['title']`
			`description = media.get('description')`
[eagleplatform] Use http scheme for thumbnail 2015-09-26 19:17:44 +00:00			`thumbnail = self._proto_relative_url(media.get('snapshot'), 'http:')`
[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00			`duration = int_or_none(media.get('duration'))`
			`view_count = int_or_none(media.get('views'))`

			`age_restriction = media.get('age_restriction')`
			`age_limit = None`
			`if age_restriction:`
			`age_limit = 0 if age_restriction == 'allow_all' else 18`

[eagleplatform] use http urls explicitly 2015-09-26 17:53:57 +00:00			`secure_m3u8 = self._proto_relative_url(media['sources']['secure_m3u8']['auto'], 'http:')`
[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00
[eagleplatform] extract all http formats 2016-04-22 13:32:38 +00:00			`formats = []`

[eagleplatform] extract mp4 url and fix thumbnail url 2015-09-26 14:30:02 +00:00			`m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON')`
[eagleplatform] extract all http formats 2016-04-22 13:32:38 +00:00			`m3u8_formats = self._extract_m3u8_formats(`
[eagleplatform] Fix extraction (closes #11160) 2016-11-10 20:26:29 +00:00			`m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',`
			`m3u8_id='hls', fatal=False)`
[eagleplatform] extract all http formats 2016-04-22 13:32:38 +00:00			`formats.extend(m3u8_formats)`
[eagleplatform] extract mp4 url and fix thumbnail url 2015-09-26 14:30:02 +00:00
[eagleplatform] Fix extraction (closes #11160) 2016-11-10 20:26:29 +00:00			`m3u8_formats_dict = {}`
			`for f in m3u8_formats:`
			`if f.get('height') is not None:`
			`m3u8_formats_dict[f['height']] = f`

			`mp4_data = self._download_json(`
[eagleplatform] Simplify secure mp4 construction and clarify rationale 2015-09-26 19:10:39 +00:00			`# Secure mp4 URL is constructed according to Player.prototype.mp4 from`
			`# http://lentaru.media.eagleplatform.com/player/player.js`
[eagleplatform] Fix extraction (closes #11160) 2016-11-10 20:26:29 +00:00			`re.sub(r'm3u8\|hlsvod\|hls\|f4m', 'mp4s', secure_m3u8),`
			`video_id, 'Downloading mp4 JSON', fatal=False)`
			`if mp4_data:`
			`for format_id, format_url in mp4_data.get('data', {}).items():`
Improve URL extraction 2018-07-21 12:08:28 +00:00			`if not url_or_none(format_url):`
[eagleplatform] Checking direct HTTP links Sometimes they fail with 404 2016-04-25 14:48:17 +00:00			`continue`
[eagleplatform] Fix extraction (closes #11160) 2016-11-10 20:26:29 +00:00			`height = int_or_none(format_id)`
			`if height is not None and m3u8_formats_dict.get(height):`
			`f = m3u8_formats_dict[height].copy()`
			`f.update({`
			`'format_id': f['format_id'].replace('hls', 'http'),`
			`'protocol': 'http',`
			`})`
			`else:`
			`f = {`
			`'format_id': 'http-%s' % format_id,`
			`'height': int_or_none(format_id),`
			`}`
			`f['url'] = format_url`
			`formats.append(f)`
[eagleplatform] extract mp4 url and fix thumbnail url 2015-09-26 14:30:02 +00:00
[eagleplatform] Add extractor 2015-03-07 16:16:23 +00:00			`return {`
			`'id': video_id,`
			`'title': title,`
			`'description': description,`
			`'thumbnail': thumbnail,`
			`'duration': duration,`
			`'view_count': view_count,`
			`'age_limit': age_limit,`
			`'formats': formats,`
			`}`
[extractors] Use new framework for existing embeds (#4307) `Brightcove` is difficult to migrate because it's subclasses may depend on the signature of the current functions. So it is left as-is for now Note: Tests have not been migrated 2022-08-01 01:23:25 +00:00

			`class ClipYouEmbedIE(InfoExtractor):`
			`_VALID_URL = False`

			`@classmethod`
			`def _extract_embed_urls(cls, url, webpage):`
			`mobj = re.search(`
			`r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.\brecord_id=(?P<id>\d+)."', webpage)`
			`if mobj is not None:`
			`yield smuggle_url('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), {'referrer': url})`