# yt-dlp/yt_dlp/extractor/crackle.py

# coding: utf-8
from __future__ import unicode_literals, division

import hashlib
import hmac
import re
import time

from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import (
    determine_ext,
    float_or_none,
    int_or_none,
    orderedSet,
    parse_age_limit,
    parse_duration,
    url_or_none,
    ExtractorError
)


class CrackleIE(InfoExtractor):
    """Extractor for media hosted on crackle.com / sonycrackle.com.

    Media details are requested from the ``web-api-us.crackle.com`` service
    once per candidate country code until a response carrying a list of
    ``MediaURLs`` is obtained. Every API request is signed through the
    overridden :meth:`_download_json`.
    """

    _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)'
    _TESTS = [{
        # geo restricted to CA
        'url': 'https://www.crackle.com/andromeda/2502343',
        'info_dict': {
            'id': '2502343',
            'ext': 'mp4',
            'title': 'Under The Night',
            'description': 'md5:d2b8ca816579ae8a7bf28bfff8cefc8a',
            'duration': 2583,
            'view_count': int,
            'average_rating': 0,
            'age_limit': 14,
            'genre': 'Action, Sci-Fi',
            'creator': 'Allan Kroeker',
            'artist': 'Keith Hamilton Cobb, Kevin Sorbo, Lisa Ryder, Lexa Doig, Robert Hewitt Wolfe',
            'release_year': 2000,
            'series': 'Andromeda',
            'episode': 'Under The Night',
            'season_number': 1,
            'episode_number': 1,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        }
    }, {
        'url': 'https://www.sonycrackle.com/andromeda/2502343',
        'only_matching': True,
    }]

    # Frame size for each progressive-download slot, keyed by the value of the
    # 'Type' field of a MediaURLs entry. Entries with an unknown type are
    # skipped during format extraction.
    _MEDIA_FILE_SLOTS = {
        '360p.mp4': {
            'width': 640,
            'height': 360,
        },
        '480p.mp4': {
            'width': 768,
            'height': 432,
        },
        '480p_1mbps.mp4': {
            'width': 852,
            'height': 480,
        },
    }

    def _download_json(self, url, *args, **kwargs):
        """Download JSON from ``url`` with crackle's Authorization header.

        The header value is ``<HMAC>|<timestamp>|117|1`` where ``<HMAC>`` is
        the upper-cased hex HMAC-SHA1 of ``"<url>|<timestamp>"`` and
        ``<timestamp>`` is the current UTC time formatted as ``%Y%m%d%H%M``.

        All positional and keyword arguments are forwarded unchanged to
        ``InfoExtractor._download_json``. A ``headers`` keyword argument, if
        given, is merged with the generated headers; the generated 'Accept'
        and 'Authorization' values take precedence.
        """
        # Authorization generation algorithm is reverse engineered from:
        # https://www.sonycrackle.com/static/js/main.ea93451f.chunk.js
        timestamp = time.strftime('%Y%m%d%H%M', time.gmtime())
        h = hmac.new(b'IGSLUQCBDFHEOIFM', '|'.join([url, timestamp]).encode(), hashlib.sha1).hexdigest().upper()
        # Pop any caller-supplied headers so that `headers` is not passed
        # twice (which would raise TypeError), then layer ours on top.
        headers = dict(kwargs.pop('headers', None) or {})
        headers.update({
            'Accept': 'application/json',
            'Authorization': '|'.join([h, timestamp, '117', '1']),
        })
        return InfoExtractor._download_json(self, url, *args, headers=headers, **kwargs)

    def _real_extract(self, url):
        """Extract the info dict for the media id matched from ``url``.

        Raises ExtractorError when the API reports a non-'0' status code,
        when no country yields usable media details, or when every format is
        DRM protected (the last two unless ``ignore_no_formats_error`` is
        set).
        """
        video_id = self._match_id(url)

        geo_bypass_country = self.get_param('geo_bypass_country', None)
        # Candidate order: user-supplied country (may be None), the hard-coded
        # list, and finally an empty placeholder that is replaced with the
        # result of crackle's geo-location service.
        countries = orderedSet((geo_bypass_country, 'US', 'AU', 'CA', 'AS', 'FM', 'GU', 'MP', 'PR', 'PW', 'MH', 'VI', ''))
        num_countries = len(countries) - 1

        media = {}
        for num, country in enumerate(countries):
            if num == 1:  # start hard-coded list
                self.report_warning('%s. Trying with a list of known countries' % (
                    'Unable to obtain video formats from %s API' % geo_bypass_country if geo_bypass_country
                    else 'No country code was given using --geo-bypass-country'))
            elif num == num_countries:  # end of list
                geo_info = self._download_json(
                    'https://web-api-us.crackle.com/Service.svc/geo/country',
                    video_id, fatal=False, note='Downloading geo-location information from crackle API',
                    errnote='Unable to fetch geo-location information from crackle') or {}
                country = geo_info.get('CountryCode')
                if country is None:
                    continue
                self.to_screen('%s identified country as %s' % (self.IE_NAME, country))
                if country in countries:
                    self.to_screen('Downloading from %s API was already attempted. Skipping...' % country)
                    continue

            # No --geo-bypass-country given: nothing to try for this slot
            if country is None:
                continue
            try:
                media = self._download_json(
                    'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s?disableProtocols=true' % (video_id, country),
                    video_id, note='Downloading media JSON from %s API' % country,
                    errnote='Unable to download media JSON')
            except ExtractorError as e:
                # 401 means geo restriction, trying next country
                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                    continue
                raise

            # A missing or malformed status block is handled like any other
            # non-'0' status instead of crashing with AttributeError.
            status = media.get('status')
            if not isinstance(status, dict):
                status = {}
            if status.get('messageCode') != '0':
                raise ExtractorError(
                    '%s said: %s %s - %s' % (
                        self.IE_NAME, status.get('messageCodeDescription'), status.get('messageCode'), status.get('message')),
                    expected=True)

            # Found video formats
            if isinstance(media.get('MediaURLs'), list):
                break

        ignore_no_formats = self.get_param('ignore_no_formats_error')
        allow_unplayable_formats = self.get_param('allow_unplayable_formats')

        if not media or (not media.get('MediaURLs') and not ignore_no_formats):
            raise ExtractorError(
                'Unable to access the crackle API. Try passing your country code '
                'to --geo-bypass-country. If it still does not work and the '
                'video is available in your country')
        title = media['Title']

        formats, subtitles = [], {}
        has_drm = False
        for e in media.get('MediaURLs') or []:
            if not isinstance(e, dict):
                continue
            if e.get('UseDRM'):
                has_drm = True
                if not allow_unplayable_formats:
                    continue
            format_url = url_or_none(e.get('Path'))
            if not format_url:
                continue
            ext = determine_ext(format_url)
            if ext == 'm3u8':
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False)
                formats.extend(fmts)
                subtitles = self._merge_subtitles(subtitles, subs)
            elif ext == 'mpd':
                fmts, subs = self._extract_mpd_formats_and_subtitles(
                    format_url, video_id, mpd_id='dash', fatal=False)
                formats.extend(fmts)
                subtitles = self._merge_subtitles(subtitles, subs)
            elif format_url.endswith('.ism/Manifest'):
                fmts, subs = self._extract_ism_formats_and_subtitles(
                    format_url, video_id, ism_id='mss', fatal=False)
                formats.extend(fmts)
                subtitles = self._merge_subtitles(subtitles, subs)
            else:
                # Direct file: only the slots listed in _MEDIA_FILE_SLOTS
                # are accepted
                mfs_path = e.get('Type')
                mfs_info = self._MEDIA_FILE_SLOTS.get(mfs_path)
                if not mfs_info:
                    continue
                formats.append({
                    'url': format_url,
                    'format_id': 'http-' + mfs_path.split('.')[0],
                    'width': mfs_info['width'],
                    'height': mfs_info['height'],
                })
        if not formats and has_drm and not ignore_no_formats:
            raise ExtractorError('The video is DRM protected', expected=True)
        self._sort_formats(formats)

        description = media.get('Description')
        duration = int_or_none(media.get(
            'DurationInSeconds')) or parse_duration(media.get('Duration'))
        view_count = int_or_none(media.get('CountViews'))
        average_rating = float_or_none(media.get('UserRating'))
        age_limit = parse_age_limit(media.get('Rating'))
        genre = media.get('Genre')
        release_year = int_or_none(media.get('ReleaseYear'))
        creator = media.get('Directors')
        artist = media.get('Cast')

        # Series metadata is only filled in for full episodes
        if media.get('MediaTypeDisplayValue') == 'Full Episode':
            series = media.get('ShowName')
            episode = title
            season_number = int_or_none(media.get('Season'))
            episode_number = int_or_none(media.get('Episode'))
        else:
            series = episode = season_number = episode_number = None

        cc_files = media.get('ClosedCaptionFiles')
        if isinstance(cc_files, list):
            for cc_file in cc_files:
                if not isinstance(cc_file, dict):
                    continue
                cc_url = url_or_none(cc_file.get('Path'))
                if not cc_url:
                    continue
                lang = cc_file.get('Locale') or 'en'
                subtitles.setdefault(lang, []).append({'url': cc_url})

        thumbnails = []
        images = media.get('Images')
        # Iterated with .items() below, so this must be a mapping; the loop
        # reads the frame size from keys matching 'Img_<width>x<height>'.
        if isinstance(images, dict):
            for image_key, image_url in images.items():
                mobj = re.search(r'Img_(\d+)[xX](\d+)', image_key)
                if not mobj:
                    continue
                image_url = url_or_none(image_url)
                if not image_url:
                    continue
                thumbnails.append({
                    'url': image_url,
                    'width': int(mobj.group(1)),
                    'height': int(mobj.group(2)),
                })

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'view_count': view_count,
            'average_rating': average_rating,
            'age_limit': age_limit,
            'genre': genre,
            'creator': creator,
            'artist': artist,
            'release_year': release_year,
            'series': series,
            'episode': episode,
            'season_number': season_number,
            'episode_number': episode_number,
            'thumbnails': thumbnails,
            'subtitles': subtitles,
            'formats': formats,
        }
[crackle] Add new extractor 2016-02-10 21:16:21 +00:00			`# coding: utf-8`
[crackle] Fix extraction and update _TESTS (closes #10333) 2016-08-25 14:22:31 +00:00			`from __future__ import unicode_literals, division`

[crackle] authorize media detail request(closes #16931) 2019-01-30 13:43:44 +00:00			`import hashlib`
			`import hmac`
[crackle] Fix extraction (closes #15969) 2018-03-23 16:53:18 +00:00			`import re`
[crackle] authorize media detail request(closes #16931) 2019-01-30 13:43:44 +00:00			`import time`
[crackle] Fix extraction (closes #15969) 2018-03-23 16:53:18 +00:00
[crackle] Add new extractor 2016-02-10 21:16:21 +00:00			`from .common import InfoExtractor`
Improve URL extraction 2018-07-21 12:08:28 +00:00			`from ..compat import compat_HTTPError`
[crackle] Fix extraction (closes #15969) 2018-03-23 16:53:18 +00:00			`from ..utils import (`
			`determine_ext,`
			`float_or_none,`
			`int_or_none,`
[crackle] Improve extraction (See desc) Closes #282 * Refactor authorization as an extension to `_download_json` * Better error messages and warnings * Respect `--ignore-no-formats-error` * Extract subtitles from manifests * Try with crackle's geo-location service if all hard-coded countries fail 2021-04-28 23:49:06 +00:00			`orderedSet,`
[crackle] Fix extraction (closes #15969) 2018-03-23 16:53:18 +00:00			`parse_age_limit,`
			`parse_duration,`
Improve URL extraction 2018-07-21 12:08:28 +00:00			`url_or_none,`
[crackle] Bypass geo restriction 2018-03-23 18:49:50 +00:00			`ExtractorError`
[crackle] Fix extraction (closes #15969) 2018-03-23 16:53:18 +00:00			`)`
[crackle] Add new extractor 2016-02-10 21:16:21 +00:00

			`class CrackleIE(InfoExtractor):`
[crackle] Add support for sonycrackle.com (closes #16698) 2018-06-11 19:06:30 +00:00			`_VALID_URL = r'(?:crackle:\|https?://(?:(?:www\|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/\|(?:[^/]+/)+))(?P<id>\d+)'`
			`_TESTS = [{`
[crackle] Bypass geo restriction 2018-03-23 18:49:50 +00:00			`# geo restricted to CA`
[crackle] Fix extraction (closes #15969) 2018-03-23 16:53:18 +00:00			`'url': 'https://www.crackle.com/andromeda/2502343',`
[crackle] Add new extractor 2016-02-10 21:16:21 +00:00			`'info_dict': {`
[crackle] Fix extraction (closes #15969) 2018-03-23 16:53:18 +00:00			`'id': '2502343',`
[crackle] Add new extractor 2016-02-10 21:16:21 +00:00			`'ext': 'mp4',`
[crackle] Fix extraction (closes #15969) 2018-03-23 16:53:18 +00:00			`'title': 'Under The Night',`
			`'description': 'md5:d2b8ca816579ae8a7bf28bfff8cefc8a',`
			`'duration': 2583,`
			`'view_count': int,`
			`'average_rating': 0,`
			`'age_limit': 14,`
			`'genre': 'Action, Sci-Fi',`
			`'creator': 'Allan Kroeker',`
			`'artist': 'Keith Hamilton Cobb, Kevin Sorbo, Lisa Ryder, Lexa Doig, Robert Hewitt Wolfe',`
			`'release_year': 2000,`
			`'series': 'Andromeda',`
			`'episode': 'Under The Night',`
			`'season_number': 1,`
			`'episode_number': 1,`
[crackle] Add new extractor 2016-02-10 21:16:21 +00:00			`},`
			`'params': {`
			`# m3u8 download`
			`'skip_download': True,`
			`}`
[crackle] Add support for sonycrackle.com (closes #16698) 2018-06-11 19:06:30 +00:00			`}, {`
			`'url': 'https://www.sonycrackle.com/andromeda/2502343',`
			`'only_matching': True,`
			`}]`
[crackle] Add new extractor 2016-02-10 21:16:21 +00:00
[crackle] extract ism and http formats 2018-12-19 21:07:37 +00:00			`_MEDIA_FILE_SLOTS = {`
			`'360p.mp4': {`
			`'width': 640,`
			`'height': 360,`
			`},`
			`'480p.mp4': {`
			`'width': 768,`
			`'height': 432,`
			`},`
			`'480p_1mbps.mp4': {`
			`'width': 852,`
			`'height': 480,`
			`},`
			`}`

[crackle] Improve extraction (See desc) Closes #282 * Refactor authorization as an extension to `_download_json` * Better error messages and warnings * Respect `--ignore-no-formats-error` * Extract subtitles from manifests * Try with crackle's geo-location service if all hard-coded countries fail 2021-04-28 23:49:06 +00:00			`def _download_json(self, url, *args, **kwargs):`
			`# Authorization generation algorithm is reverse engineered from:`
			`# https://www.sonycrackle.com/static/js/main.ea93451f.chunk.js`
			`timestamp = time.strftime('%Y%m%d%H%M', time.gmtime())`
			`h = hmac.new(b'IGSLUQCBDFHEOIFM', '\|'.join([url, timestamp]).encode(), hashlib.sha1).hexdigest().upper()`
			`headers = {`
			`'Accept': 'application/json',`
			`'Authorization': '\|'.join([h, timestamp, '117', '1']),`
			`}`
			`return InfoExtractor._download_json(self, url, *args, headers=headers, **kwargs)`

[crackle] Add new extractor 2016-02-10 21:16:21 +00:00			`def _real_extract(self, url):`
			`video_id = self._match_id(url)`
[crackle] Fix extraction and update _TESTS (closes #10333) 2016-08-25 14:22:31 +00:00
[extractor] Add `write_debug` and `get_param` 2021-05-17 12:23:08 +00:00			`geo_bypass_country = self.get_param('geo_bypass_country', None)`
[crackle] Improve extraction (See desc) Closes #282 * Refactor authorization as an extension to `_download_json` * Better error messages and warnings * Respect `--ignore-no-formats-error` * Extract subtitles from manifests * Try with crackle's geo-location service if all hard-coded countries fail 2021-04-28 23:49:06 +00:00			`countries = orderedSet((geo_bypass_country, 'US', 'AU', 'CA', 'AS', 'FM', 'GU', 'MP', 'PR', 'PW', 'MH', 'VI', ''))`
			`num_countries, num = len(countries) - 1, 0`

			`media = {}`
			`for num, country in enumerate(countries):`
			`if num == 1: # start hard-coded list`
			`self.report_warning('%s. Trying with a list of known countries' % (`
			`'Unable to obtain video formats from %s API' % geo_bypass_country if geo_bypass_country`
			`else 'No country code was given using --geo-bypass-country'))`
			`elif num == num_countries: # end of list`
			`geo_info = self._download_json(`
			`'https://web-api-us.crackle.com/Service.svc/geo/country',`
			`video_id, fatal=False, note='Downloading geo-location information from crackle API',`
			`errnote='Unable to fetch geo-location information from crackle') or {}`
			`country = geo_info.get('CountryCode')`
			`if country is None:`
			`continue`
			`self.to_screen('%s identified country as %s' % (self.IE_NAME, country))`
			`if country in countries:`
			`self.to_screen('Downloading from %s API was already attempted. Skipping...' % country)`
			`continue`
[crackle] Fix extraction (closes #15969) 2018-03-23 16:53:18 +00:00
[crackle] Improve extraction (See desc) Closes #282 * Refactor authorization as an extension to `_download_json` * Better error messages and warnings * Respect `--ignore-no-formats-error` * Extract subtitles from manifests * Try with crackle's geo-location service if all hard-coded countries fail 2021-04-28 23:49:06 +00:00			`if country is None:`
			`continue`
[crackle] Bypass geo restriction 2018-03-23 18:49:50 +00:00			`try:`
			`media = self._download_json(`
[crackle] Improve extraction (See desc) Closes #282 * Refactor authorization as an extension to `_download_json` * Better error messages and warnings * Respect `--ignore-no-formats-error` * Extract subtitles from manifests * Try with crackle's geo-location service if all hard-coded countries fail 2021-04-28 23:49:06 +00:00			`'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s?disableProtocols=true' % (video_id, country),`
			`video_id, note='Downloading media JSON from %s API' % country,`
			`errnote='Unable to download media JSON')`
[crackle] Bypass geo restriction 2018-03-23 18:49:50 +00:00			`except ExtractorError as e:`
			`# 401 means geo restriction, trying next country`
			`if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:`
[crackle] Fix extraction (closes #15969) 2018-03-23 16:53:18 +00:00			`continue`
[crackle] Bypass geo restriction 2018-03-23 18:49:50 +00:00			`raise`

[crackle] Improve extraction (See desc) Closes #282 * Refactor authorization as an extension to `_download_json` * Better error messages and warnings * Respect `--ignore-no-formats-error` * Extract subtitles from manifests * Try with crackle's geo-location service if all hard-coded countries fail 2021-04-28 23:49:06 +00:00			`status = media.get('status')`
			`if status.get('messageCode') != '0':`
			`raise ExtractorError(`
			`'%s said: %s %s - %s' % (`
			`self.IE_NAME, status.get('messageCodeDescription'), status.get('messageCode'), status.get('message')),`
			`expected=True)`

			`# Found video formats`
			`if isinstance(media.get('MediaURLs'), list):`
			`break`

[extractor] Add `write_debug` and `get_param` 2021-05-17 12:23:08 +00:00			`ignore_no_formats = self.get_param('ignore_no_formats_error')`
			`allow_unplayable_formats = self.get_param('allow_unplayable_formats')`
[crackle] Improve extraction (See desc) Closes #282 * Refactor authorization as an extension to `_download_json` * Better error messages and warnings * Respect `--ignore-no-formats-error` * Extract subtitles from manifests * Try with crackle's geo-location service if all hard-coded countries fail 2021-04-28 23:49:06 +00:00
			`if not media or (not media.get('MediaURLs') and not ignore_no_formats):`
			`raise ExtractorError(`
			`'Unable to access the crackle API. Try passing your country code '`
			`'to --geo-bypass-country. If it still does not work and the '`
			`'video is available in your country')`
			`title = media['Title']`

			`formats, subtitles = [], {}`
			`has_drm = False`
			`for e in media.get('MediaURLs') or []:`
			`if e.get('UseDRM'):`
			`has_drm = True`
			`if not allow_unplayable_formats:`
			`continue`
			`format_url = url_or_none(e.get('Path'))`
			`if not format_url:`
[crackle] Bypass geo restriction 2018-03-23 18:49:50 +00:00			`continue`
[crackle] Improve extraction (See desc) Closes #282 * Refactor authorization as an extension to `_download_json` * Better error messages and warnings * Respect `--ignore-no-formats-error` * Extract subtitles from manifests * Try with crackle's geo-location service if all hard-coded countries fail 2021-04-28 23:49:06 +00:00			`ext = determine_ext(format_url)`
			`if ext == 'm3u8':`
			`fmts, subs = self._extract_m3u8_formats_and_subtitles(`
			`format_url, video_id, 'mp4', entry_protocol='m3u8_native',`
			`m3u8_id='hls', fatal=False)`
			`formats.extend(fmts)`
			`subtitles = self._merge_subtitles(subtitles, subs)`
			`elif ext == 'mpd':`
			`fmts, subs = self._extract_mpd_formats_and_subtitles(`
			`format_url, video_id, mpd_id='dash', fatal=False)`
			`formats.extend(fmts)`
			`subtitles = self._merge_subtitles(subtitles, subs)`
			`elif format_url.endswith('.ism/Manifest'):`
			`fmts, subs = self._extract_ism_formats_and_subtitles(`
			`format_url, video_id, ism_id='mss', fatal=False)`
			`formats.extend(fmts)`
			`subtitles = self._merge_subtitles(subtitles, subs)`
			`else:`
			`mfs_path = e.get('Type')`
			`mfs_info = self._MEDIA_FILE_SLOTS.get(mfs_path)`
			`if not mfs_info:`
[crackle] Fix extraction (closes #15969) 2018-03-23 16:53:18 +00:00			`continue`
[crackle] Improve extraction (See desc) Closes #282 * Refactor authorization as an extension to `_download_json` * Better error messages and warnings * Respect `--ignore-no-formats-error` * Extract subtitles from manifests * Try with crackle's geo-location service if all hard-coded countries fail 2021-04-28 23:49:06 +00:00			`formats.append({`
			`'url': format_url,`
			`'format_id': 'http-' + mfs_path.split('.')[0],`
			`'width': mfs_info['width'],`
			`'height': mfs_info['height'],`
			`})`
			`if not formats and has_drm and not ignore_no_formats:`
			`raise ExtractorError('The video is DRM protected', expected=True)`
			`self._sort_formats(formats)`

			`description = media.get('Description')`
			`duration = int_or_none(media.get(`
			`'DurationInSeconds')) or parse_duration(media.get('Duration'))`
			`view_count = int_or_none(media.get('CountViews'))`
			`average_rating = float_or_none(media.get('UserRating'))`
			`age_limit = parse_age_limit(media.get('Rating'))`
			`genre = media.get('Genre')`
			`release_year = int_or_none(media.get('ReleaseYear'))`
			`creator = media.get('Directors')`
			`artist = media.get('Cast')`

			`if media.get('MediaTypeDisplayValue') == 'Full Episode':`
			`series = media.get('ShowName')`
			`episode = title`
			`season_number = int_or_none(media.get('Season'))`
			`episode_number = int_or_none(media.get('Episode'))`
			`else:`
			`series = episode = season_number = episode_number = None`

			`cc_files = media.get('ClosedCaptionFiles')`
			`if isinstance(cc_files, list):`
			`for cc_file in cc_files:`
			`if not isinstance(cc_file, dict):`
[crackle] Fix extraction (closes #15969) 2018-03-23 16:53:18 +00:00			`continue`
[crackle] Improve extraction (See desc) Closes #282 * Refactor authorization as an extension to `_download_json` * Better error messages and warnings * Respect `--ignore-no-formats-error` * Extract subtitles from manifests * Try with crackle's geo-location service if all hard-coded countries fail 2021-04-28 23:49:06 +00:00			`cc_url = url_or_none(cc_file.get('Path'))`
			`if not cc_url:`
			`continue`
			`lang = cc_file.get('Locale') or 'en'`
			`subtitles.setdefault(lang, []).append({'url': cc_url})`

			`thumbnails = []`
			`images = media.get('Images')`
			`if isinstance(images, list):`
			`for image_key, image_url in images.items():`
			`mobj = re.search(r'Img_(\d+)[xX](\d+)', image_key)`
			`if not mobj:`
			`continue`
			`thumbnails.append({`
			`'url': image_url,`
			`'width': int(mobj.group(1)),`
			`'height': int(mobj.group(2)),`
			`})`

			`return {`
			`'id': video_id,`
			`'title': title,`
			`'description': description,`
			`'duration': duration,`
			`'view_count': view_count,`
			`'average_rating': average_rating,`
			`'age_limit': age_limit,`
			`'genre': genre,`
			`'creator': creator,`
			`'artist': artist,`
			`'release_year': release_year,`
			`'series': series,`
			`'episode': episode,`
			`'season_number': season_number,`
			`'episode_number': episode_number,`
			`'thumbnails': thumbnails,`
			`'subtitles': subtitles,`
			`'formats': formats,`
			`}`