yt-dlp/yt_dlp/extractor/arcpublishing.py

import re

from .common import InfoExtractor
from ..utils import (
    extract_attributes,
    int_or_none,
    join_nonempty,
    parse_iso8601,
    try_get,
)


class ArcPublishingIE(InfoExtractor):
    """Extractor for internal ``arcpublishing:<org>:<uuid>`` URLs.

    ``_extract_embed_urls`` produces such URLs from POWA player ``<div>``
    elements found in a webpage; ``_real_extract`` resolves one of them
    through the organisation's ``findByUuid`` video API endpoint.
    """

    _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}'
    _VALID_URL = rf'arcpublishing:(?P<org>[a-z]+):(?P<id>{_UUID_REGEX})'
    _TESTS = [{
        # https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/
        'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab',
        'only_matching': True,
    }, {
        # https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/
        'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1',
        'only_matching': True,
    }, {
        # https://www.actionnewsjax.com/video/live-stream/
        'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a',
        'only_matching': True,
    }, {
        # https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/
        'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3',
        'only_matching': True,
    }, {
        # https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/
        'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe',
        'only_matching': True,
    }, {
        # https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/
        'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e',
        'only_matching': True,
    }, {
        # https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/
        'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143',
        'only_matching': True,
    }, {
        # https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/
        'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055',
        'only_matching': True,
    }, {
        # https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/
        'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d',
        'only_matching': True,
    }, {
        # https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/
        'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7',
        'only_matching': True,
    }, {
        # https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/
        'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b',
        'only_matching': True,
    }, {
        # https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html
        'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685',
        'only_matching': True,
    }]
    # (org names, API host template) pairs; ``%s`` is filled with the org name.
    # Orgs not listed here use the fallback template in ``_real_extract``.
    _POWA_DEFAULTS = [
        (['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'),
        ([
            'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo',
            'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom',
            'spectator', 'tbt', 'tgam', 'tronc', 'wapo', 'wweek',
        ], 'video-api-cdn.%s.arcpublishing.com/api'),
    ]

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Collect ``arcpublishing:<org>:<uuid>`` URLs for POWA embeds in *webpage*.

        A ``<div>`` is considered when its ``class`` attribute contains the
        word ``powa`` and a ``data-uuid`` attribute matching ``_UUID_REGEX``
        follows it in the tag. Elements lacking ``data-org`` are skipped.
        *url* is not used.
        """
        # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview
        # Use ``cls`` rather than the concrete class name so the pattern
        # follows the class the method is actually invoked on.
        powa_re = rf'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="{cls._UUID_REGEX}"[^>]*>)'
        entries = []
        for powa_el in re.findall(powa_re, webpage):
            powa = extract_attributes(powa_el) or {}
            org = powa.get('data-org')
            uuid = powa.get('data-uuid')
            if org and uuid:
                entries.append(f'arcpublishing:{org}:{uuid}')
        return entries

    def _real_extract(self, url):
        """Resolve an ``arcpublishing:<org>:<uuid>`` URL into an info dict.

        Downloads the ``findByUuid`` JSON for the video, uses its first
        element, and builds formats from each entry of its ``streams`` list
        (SMIL, HLS/TS, or a direct URL) plus subtitles from
        ``subtitles.urls``. A missing ``headlines.basic`` title raises
        ``KeyError``.
        """
        org, uuid = self._match_valid_url(url).groups()
        # The template is chosen using the org name as given in the URL,
        # i.e. before the 'wapo' -> 'washpost' rename below.
        base_api_tmpl = next(
            (tmpl for orgs, tmpl in self._POWA_DEFAULTS if org in orgs),
            '%s-prod-cdn.video-api.arcpublishing.com/api')
        if org == 'wapo':
            org = 'washpost'
        video = self._download_json(
            'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org),
            uuid, query={'uuid': uuid})[0]
        title = video['headlines']['basic']
        is_live = video.get('status') == 'live'

        urls = []  # stream URLs already handled, to skip duplicates
        formats = []
        # Guard against a missing, null or non-list 'streams' value, the same
        # way the subtitles list is guarded below.
        for s in try_get(video, lambda x: x['streams'], list) or []:
            s_url = s.get('url')
            if not s_url or s_url in urls:
                continue
            urls.append(s_url)
            stream_type = s.get('stream_type')
            if stream_type == 'smil':
                smil_formats = self._extract_smil_formats(
                    s_url, uuid, fatal=False)
                for f in smil_formats:
                    # Only formats whose URL ends in '/cfx/st' are patched up.
                    if f['url'].endswith('/cfx/st'):
                        f['app'] = 'cfx/st'
                        if not f['play_path'].startswith('mp4:'):
                            f['play_path'] = 'mp4:' + f['play_path']
                        if isinstance(f['tbr'], float):
                            # Replace 'tbr' with a 'vbr' 1000 times larger and
                            # rebuild the format id from that value.
                            f['vbr'] = f['tbr'] * 1000
                            del f['tbr']
                            f['format_id'] = 'rtmp-%d' % f['vbr']
                formats.extend(smil_formats)
            elif stream_type in ('ts', 'hls'):
                m3u8_formats = self._extract_m3u8_formats(
                    s_url, uuid, 'mp4', live=is_live, m3u8_id='hls', fatal=False)
                # Drop the playlist when every variant reports no audio codec
                # (this is also true for an empty result).
                if all(f.get('acodec') == 'none' for f in m3u8_formats):
                    continue
                for f in m3u8_formats:
                    height = f.get('height')
                    if not height:
                        continue
                    # Look in the variant URL for '_' or 'x', the height,
                    # '_' or '-', then digits that are taken as the vbr.
                    vbr = self._search_regex(
                        r'[_x]%d[_-](\d+)' % height, f['url'], 'vbr', default=None)
                    if vbr:
                        f['vbr'] = int(vbr)
                formats.extend(m3u8_formats)
            else:
                # Any other stream type is treated as a direct media URL.
                vbr = int_or_none(s.get('bitrate'))
                formats.append({
                    'format_id': join_nonempty(stream_type, vbr),
                    'vbr': vbr,
                    'width': int_or_none(s.get('width')),
                    'height': int_or_none(s.get('height')),
                    'filesize': int_or_none(s.get('filesize')),
                    'url': s_url,
                    'quality': -10,
                })

        subtitles = {}
        for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []):
            subtitle_url = subtitle.get('url')
            if subtitle_url:
                # Every subtitle track is filed under 'en'.
                subtitles.setdefault('en', []).append({'url': subtitle_url})

        return {
            'id': uuid,
            'title': title,
            'thumbnail': try_get(video, lambda x: x['promo_image']['url']),
            'description': try_get(video, lambda x: x['subheadlines']['basic']),
            'formats': formats,
            # NOTE(review): the second argument (100) is presumably a scale
            # divisor; confirm the unit of the API's 'duration' field.
            'duration': int_or_none(video.get('duration'), 100),
            'timestamp': parse_iso8601(video.get('created_date')),
            'subtitles': subtitles,
            'is_live': is_live,
        }
Update to ytdl-2021.01.03 2021-01-01 12:26:37 +00:00			`import re`

			`from .common import InfoExtractor`
			`from ..utils import (`
			`extract_attributes,`
			`int_or_none,`
[cleanup] Misc (#10075) Closes #10303 Authored by: bashonly, seproDev, jucor, c-basalt Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> Co-authored-by: Julien Cornebise <julien@cornebise.com> Co-authored-by: c-basalt <117849907+c-basalt@users.noreply.github.com> 2024-07-01 22:51:27 +00:00			`join_nonempty,`
Update to ytdl-2021.01.03 2021-01-01 12:26:37 +00:00			`parse_iso8601,`
			`try_get,`
			`)`


			`class ArcPublishingIE(InfoExtractor):`
			`_UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}'`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 23:09:58 +00:00			`_VALID_URL = rf'arcpublishing:(?P<org>[a-z]+):(?P<id>{_UUID_REGEX})'`
Update to ytdl-2021.01.03 2021-01-01 12:26:37 +00:00			`_TESTS = [{`
			`# https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/`
			`'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab',`
			`'only_matching': True,`
			`}, {`
			`# https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/`
			`'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1',`
			`'only_matching': True,`
			`}, {`
			`# https://www.actionnewsjax.com/video/live-stream/`
			`'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a',`
			`'only_matching': True,`
			`}, {`
			`# https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/`
			`'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3',`
			`'only_matching': True,`
			`}, {`
			`# https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/`
			`'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe',`
			`'only_matching': True,`
			`}, {`
			`# https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/`
			`'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e',`
			`'only_matching': True,`
			`}, {`
			`# https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/`
			`'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143',`
			`'only_matching': True,`
			`}, {`
			`# https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/`
			`'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055',`
			`'only_matching': True,`
			`}, {`
			`# https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/`
			`'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d',`
			`'only_matching': True,`
			`}, {`
			`# https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/`
			`'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7',`
			`'only_matching': True,`
			`}, {`
			`# https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/`
			`'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b',`
			`'only_matching': True,`
			`}, {`
			`# https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html`
			`'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685',`
			`'only_matching': True,`
			`}]`
			`_POWA_DEFAULTS = [`
			`(['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'),`
			`([`
			`'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo',`
			`'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom',`
			`'spectator', 'tbt', 'tgam', 'tronc', 'wapo', 'wweek',`
			`], 'video-api-cdn.%s.arcpublishing.com/api'),`
			`]`

[extractors] Use new framework for existing embeds (#4307). `Brightcove` is difficult to migrate because its subclasses may depend on the signature of the current functions, so it is left as-is for now. Note: Tests have not been migrated. 2022-08-01 01:23:25 +00:00			`@classmethod`
			`def _extract_embed_urls(cls, url, webpage):`
Update to ytdl-2021.01.03 2021-01-01 12:26:37 +00:00			`entries = []`
			`# https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 23:09:58 +00:00			`for powa_el in re.findall(rf'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="{ArcPublishingIE._UUID_REGEX}"[^>]*>)', webpage):`
Update to ytdl-2021.01.03 2021-01-01 12:26:37 +00:00			`powa = extract_attributes(powa_el) or {}`
			`org = powa.get('data-org')`
			`uuid = powa.get('data-uuid')`
			`if org and uuid:`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 23:09:58 +00:00			`entries.append(f'arcpublishing:{org}:{uuid}')`
Update to ytdl-2021.01.03 2021-01-01 12:26:37 +00:00			`return entries`

			`def _real_extract(self, url):`
[extractor] Common function `_match_valid_url` 2021-08-19 01:41:24 +00:00			`org, uuid = self._match_valid_url(url).groups()`
Update to ytdl-2021.01.03 2021-01-01 12:26:37 +00:00			`for orgs, tmpl in self._POWA_DEFAULTS:`
			`if org in orgs:`
			`base_api_tmpl = tmpl`
			`break`
			`else:`
			`base_api_tmpl = '%s-prod-cdn.video-api.arcpublishing.com/api'`
			`if org == 'wapo':`
			`org = 'washpost'`
			`video = self._download_json(`
			`'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org),`
			`uuid, query={'uuid': uuid})[0]`
			`title = video['headlines']['basic']`
			`is_live = video.get('status') == 'live'`

			`urls = []`
			`formats = []`
			`for s in video.get('streams', []):`
			`s_url = s.get('url')`
			`if not s_url or s_url in urls:`
			`continue`
			`urls.append(s_url)`
			`stream_type = s.get('stream_type')`
			`if stream_type == 'smil':`
			`smil_formats = self._extract_smil_formats(`
			`s_url, uuid, fatal=False)`
			`for f in smil_formats:`
			`if f['url'].endswith('/cfx/st'):`
			`f['app'] = 'cfx/st'`
			`if not f['play_path'].startswith('mp4:'):`
			`f['play_path'] = 'mp4:' + f['play_path']`
			`if isinstance(f['tbr'], float):`
			`f['vbr'] = f['tbr'] * 1000`
			`del f['tbr']`
			`f['format_id'] = 'rtmp-%d' % f['vbr']`
			`formats.extend(smil_formats)`
			`elif stream_type in ('ts', 'hls'):`
			`m3u8_formats = self._extract_m3u8_formats(`
[cleanup] Don't pass protocol to `_extract_m3u8_formats` for live videos `live` parameter already handles changing the protocol 2022-03-02 17:29:01 +00:00			`s_url, uuid, 'mp4', live=is_live, m3u8_id='hls', fatal=False)`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 23:09:58 +00:00			`if all(f.get('acodec') == 'none' for f in m3u8_formats):`
Update to ytdl-2021.01.03 2021-01-01 12:26:37 +00:00			`continue`
			`for f in m3u8_formats:`
			`height = f.get('height')`
			`if not height:`
			`continue`
			`vbr = self._search_regex(`
			`r'[_x]%d[_-](\d+)' % height, f['url'], 'vbr', default=None)`
			`if vbr:`
			`f['vbr'] = int(vbr)`
			`formats.extend(m3u8_formats)`
			`else:`
			`vbr = int_or_none(s.get('bitrate'))`
			`formats.append({`
[cleanup] Misc (#10075) Closes #10303 Authored by: bashonly, seproDev, jucor, c-basalt Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> Co-authored-by: Julien Cornebise <julien@cornebise.com> Co-authored-by: c-basalt <117849907+c-basalt@users.noreply.github.com> 2024-07-01 22:51:27 +00:00			`'format_id': join_nonempty(stream_type, vbr),`
Update to ytdl-2021.01.03 2021-01-01 12:26:37 +00:00			`'vbr': vbr,`
			`'width': int_or_none(s.get('width')),`
			`'height': int_or_none(s.get('height')),`
			`'filesize': int_or_none(s.get('filesize')),`
			`'url': s_url,`
[formatsort] Remove misuse of 'preference'. 'preference' is to be used only when the format is better than ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this. In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent, so these disparities don't really matter there. Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`. 2021-02-18 22:03:16 +00:00			`'quality': -10,`
Update to ytdl-2021.01.03 2021-01-01 12:26:37 +00:00			`})`

			`subtitles = {}`
			`for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []):`
			`subtitle_url = subtitle.get('url')`
			`if subtitle_url:`
			`subtitles.setdefault('en', []).append({'url': subtitle_url})`

			`return {`
			`'id': uuid,`
[extractor] Standardize `_live_title` 2021-12-15 16:00:46 +00:00			`'title': title,`
Update to ytdl-2021.01.03 2021-01-01 12:26:37 +00:00			`'thumbnail': try_get(video, lambda x: x['promo_image']['url']),`
			`'description': try_get(video, lambda x: x['subheadlines']['basic']),`
			`'formats': formats,`
			`'duration': int_or_none(video.get('duration'), 100),`
			`'timestamp': parse_iso8601(video.get('created_date')),`
			`'subtitles': subtitles,`
			`'is_live': is_live,`
			`}`