yt-dlp/yt_dlp/extractor/cspan.py

import re

from .common import InfoExtractor
from ..compat import compat_HTMLParseError
from ..utils import (
    determine_ext,
    ExtractorError,
    extract_attributes,
    find_xpath_attr,
    get_element_by_attribute,
    get_element_by_class,
    int_or_none,
    join_nonempty,
    js_to_json,
    merge_dicts,
    parse_iso8601,
    parse_qs,
    smuggle_url,
    str_to_int,
    unescapeHTML,
)
from .senategov import SenateISVPIE
from .ustream import UstreamIE


class CSpanIE(InfoExtractor):
    """Extractor for ``c-span.org/video/?<id>`` pages.

    ``_real_extract`` tries the following sources in order and returns on the
    first one that yields a result:

    1. an embedded Ustream player (delegated to ``UstreamIE``);
    2. a Brightcove player embed (skipped when the URL contains ``&vod``);
    3. the inline ``jwsetup`` player configuration;
    4. the legacy ``ajax-player.php`` / ``flashXml.php`` endpoints, with a
       Senate ISVP embed checked while looking for the legacy video id.
    """

    _VALID_URL = r'https?://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)'
    IE_DESC = 'C-SPAN'
    _TESTS = [{
        'url': 'http://www.c-span.org/video/?313572-1/HolderonV',
        'md5': '94b29a4f131ff03d23471dd6f60b6a1d',
        'info_dict': {
            'id': '315139',
            'title': 'Attorney General Eric Holder on Voting Rights Act Decision',
        },
        'playlist_mincount': 2,
        'skip': 'Regularly fails on travis, for unknown reasons',
    }, {
        'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models',
        # md5 is unstable
        'info_dict': {
            'id': 'c4486943',
            'ext': 'mp4',
            'title': 'CSPAN - International Health Care Models',
            'description': 'md5:7a985a2d595dba00af3d9c9f0783c967',
        }
    }, {
        'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall',
        'info_dict': {
            'id': '342759',
            'title': 'General Motors Ignition Switch Recall',
        },
        'playlist_mincount': 6,
    }, {
        # Video from senate.gov
        'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers',
        'info_dict': {
            'id': 'judiciary031715',
            'ext': 'mp4',
            'title': 'Immigration Reforms Needed to Protect Skilled American Workers',
        },
        'params': {
            'skip_download': True,  # m3u8 downloads
        }
    }, {
        # Ustream embedded video
        'url': 'https://www.c-span.org/video/?114917-1/armed-services',
        'info_dict': {
            'id': '58428542',
            'ext': 'flv',
            'title': 'USHR07 Armed Services Committee',
            'description': 'hsas00-2118-20150204-1000et-07\n\n\nUSHR07 Armed Services Committee',
            'timestamp': 1423060374,
            'upload_date': '20150204',
            'uploader': 'HouseCommittee',
            'uploader_id': '12987475',
        },
    }, {
        # Audio Only
        'url': 'https://www.c-span.org/video/?437336-1/judiciary-antitrust-competition-policy-consumer-rights',
        'only_matching': True,
    }]
    # Filled with (account id, player id, embed name, video id)
    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'

    def _real_extract(self, url):
        """Return the info dict, playlist or delegating URL result for ``url``.

        Raises ExtractorError when no video id/type can be located on the
        page (using the page's own error message when one is present) or when
        the legacy player endpoint reports a non-success status.
        """
        video_id = self._match_id(url)
        video_type = None
        webpage = self._download_webpage(url, video_id)

        ustream_url = UstreamIE._extract_url(webpage)
        if ustream_url:
            return self.url_result(ustream_url, UstreamIE.ie_key())

        if '&vod' not in url:
            bc = self._search_regex(
                r"(<[^>]+id='brightcove-player-embed'[^>]+>)",
                webpage, 'brightcove embed', default=None)
            if bc:
                bc_attr = extract_attributes(bc)
                bc_id = bc_attr.get('data-bcid')
                # An embed without a video id cannot be turned into a player
                # URL; fall through to the other extraction paths instead of
                # failing with a KeyError.
                if bc_id:
                    bc_url = self.BRIGHTCOVE_URL_TEMPLATE % (
                        bc_attr.get('data-bcaccountid', '3162030207001'),
                        bc_attr.get('data-noprebcplayerid', 'SyGGpuJy3g'),
                        bc_attr.get('data-newbcplayerid', 'default'),
                        bc_id)
                    return self.url_result(smuggle_url(bc_url, {'source_url': url}))

        def add_referer(formats):
            # Attach the page URL as Referer to every format's request headers
            for f in formats:
                f.setdefault('http_headers', {})['Referer'] = url

        # As of 01.12.2020 this path looks to cover all cases making the rest
        # of the code unnecessary
        jwsetup = self._parse_json(
            self._search_regex(
                r'(?s)jwsetup\s*=\s*({.+?})\s*;', webpage, 'jwsetup',
                default='{}'),
            video_id, transform_source=js_to_json, fatal=False)
        if jwsetup:
            info = self._parse_jwplayer_data(
                jwsetup, video_id, require_title=False, m3u8_id='hls',
                base_url=url)
            add_referer(info['formats'])
            for subtitles in info['subtitles'].values():
                for subtitle in subtitles:
                    ext = determine_ext(subtitle['url'])
                    # Subtitle URLs ending in .php are labelled as WebVTT
                    if ext == 'php':
                        ext = 'vtt'
                    subtitle['ext'] = ext
            ld_info = self._search_json_ld(webpage, video_id, default={})
            try:
                title = get_element_by_class('video-page-title', webpage)
            except compat_HTMLParseError:
                title = None
            if title is None:
                title = self._og_search_title(webpage)
            description = get_element_by_attribute('itemprop', 'description', webpage) or \
                self._html_search_meta(['og:description', 'description'], webpage)
            return merge_dicts(info, ld_info, {
                'title': title,
                'thumbnail': get_element_by_attribute('itemprop', 'thumbnailUrl', webpage),
                'description': description,
                'timestamp': parse_iso8601(get_element_by_attribute('itemprop', 'uploadDate', webpage)),
                'location': get_element_by_attribute('itemprop', 'contentLocation', webpage),
                'duration': int_or_none(self._search_regex(
                    r'jwsetup\.seclength\s*=\s*(\d+);',
                    webpage, 'duration', fatal=False)),
                'view_count': str_to_int(self._search_regex(
                    r"<span[^>]+class='views'[^>]*>([\d,]+)\s+Views</span>",
                    webpage, 'views', fatal=False)),
            })

        # Obsolete
        # We first look for clipid, because clipprog always appears before
        patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')]
        matches = next(filter(None, (re.search(p, webpage) for p in patterns)), None)
        if matches:
            video_type, video_id = matches.groups()
            video_type = 'clip' if video_type == 'id' else 'program'
        else:
            m = re.search(r'data-(?P<type>clip|prog)id=["\'](?P<id>\d+)', webpage)
            if m:
                video_id = m.group('id')
                video_type = 'program' if m.group('type') == 'prog' else 'clip'
            else:
                senate_isvp_url = SenateISVPIE._extract_url(webpage)
                if senate_isvp_url:
                    title = self._og_search_title(webpage)
                    surl = smuggle_url(senate_isvp_url, {'force_title': title})
                    return self.url_result(surl, 'SenateISVP', video_id, title)
                video_id = self._search_regex(
                    r'jwsetup\.clipprog\s*=\s*(\d+);',
                    webpage, 'jwsetup program id', default=None)
                if video_id:
                    video_type = 'program'
        if video_type is None or video_id is None:
            error_message = get_element_by_class('VLplayer-error-message', webpage)
            if error_message:
                # The message comes from the page itself, so flag it as an
                # expected failure, like the player API error below.
                raise ExtractorError(error_message, expected=True)
            raise ExtractorError('unable to find video id and type')

        def get_text_attr(d, attr):
            # Fields are read as {'#text': value}; tolerate a missing or null field
            return (d.get(attr) or {}).get('#text')

        data = self._download_json(
            'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id),
            video_id)['video']
        if data['@status'] != 'Success':
            raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True)

        doc = self._download_xml(
            'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id),
            video_id)

        def xml_string(name):
            # Text of <string name="..."> in the XML, or None if the node is absent
            node = find_xpath_attr(doc, './/string', 'name', name)
            return node.text if node is not None else None

        description = self._html_search_meta('description', webpage)

        # Fall back to the page's og:title if the XML carries no title
        title = xml_string('title') or self._og_search_title(webpage)
        thumbnail = xml_string('poster')

        files = data['files']
        capfile = get_text_attr(data, 'capfile')
        # Clip ids are reported with a 'c' prefix
        result_id = 'c' + video_id if video_type == 'clip' else video_id

        entries = []
        for partnum, f in enumerate(files):
            formats = [{
                'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')),
                'url': unescapeHTML(get_text_attr(quality, 'file')),
                'height': int_or_none(get_text_attr(quality, 'height')),
                'tbr': int_or_none(get_text_attr(quality, 'bitrate')),
            } for quality in f.get('qualities', [])]
            if not formats:
                # No per-quality list: use the part's single 'path' instead
                path = unescapeHTML(get_text_attr(f, 'path'))
                if not path:
                    continue
                if determine_ext(path) == 'm3u8':
                    formats = self._extract_m3u8_formats(
                        path, video_id, 'mp4', entry_protocol='m3u8_native',
                        m3u8_id='hls')
                else:
                    formats = [{'url': path}]
            add_referer(formats)
            self._sort_formats(formats)
            entries.append({
                'id': '%s_%d' % (video_id, partnum + 1),
                'title': (
                    title if len(files) == 1 else
                    '%s part %d' % (title, partnum + 1)),
                'formats': formats,
                'description': description,
                'thumbnail': thumbnail,
                'duration': int_or_none(get_text_attr(f, 'length')),
                'subtitles': {
                    'en': [{
                        'url': capfile,
                        'ext': determine_ext(capfile, 'dfxp')
                    }],
                } if capfile else None,
            })

        if len(entries) == 1:
            # A single part is returned as a plain video under the result id
            return {**entries[0], 'id': result_id}
        return {
            '_type': 'playlist',
            'entries': entries,
            'title': title,
            'id': result_id,
        }


class CSpanCongressIE(InfoExtractor):
    """Extractor for ``c-span.org/congress/`` pages.

    The video id is built from the ``chamber`` query parameter (``senate``
    when absent) and the ``date`` query parameter; when the URL has no date,
    the ``jwsetup.clipprogdate`` value found in the page is appended instead.
    Formats come from the inline ``jwsetup`` player configuration.
    """

    _VALID_URL = r'https?://(?:www\.)?c-span\.org/congress/'
    _TESTS = [{
        'url': 'https://www.c-span.org/congress/?chamber=house&date=2017-12-13&t=1513208380',
        'info_dict': {
            'id': 'house_2017-12-13',
            'title': 'Congressional Chronicle - Members of Congress, Hearings and More',
            'description': 'md5:54c264b7a8f219937987610243305a84',
            'thumbnail': r're:https://ximage.c-spanvideo.org/.+',
            'ext': 'mp4'
        }
    }]

    def _real_extract(self, url):
        """Return the info dict for the congress page at ``url``."""
        query = parse_qs(url)
        video_date = query.get('date', [None])[0]
        video_id = join_nonempty(query.get('chamber', ['senate'])[0], video_date, delim='_')
        webpage = self._download_webpage(url, video_id)
        if not video_date:
            # Literal dot and flexible whitespace, matching the other
            # jwsetup.* patterns used in this file
            jwp_date = re.search(
                r'jwsetup\.clipprogdate\s*=\s*\'(?P<date>\d{4}-\d{2}-\d{2})\';', webpage)
            if jwp_date:
                video_id = f'{video_id}_{jwp_date.group("date")}'
        jwplayer_data = self._parse_json(
            self._search_regex(r'jwsetup\s*=\s*({(?:.|\n)[^;]+});', webpage, 'player config'),
            video_id, transform_source=js_to_json)

        title = (self._og_search_title(webpage, default=None)
                 or self._html_extract_title(webpage, 'video title'))
        if title:
            # Keep only the part before the first '|' and collapse whitespace
            title = re.sub(r'\s+', ' ', title.split('|')[0]).strip()
        description = (self._og_search_description(webpage, default=None)
                       or self._html_search_meta('description', webpage, 'description', default=None))

        return {
            **self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False),
            # None when no title was found, rather than failing on .split()
            'title': title or None,
            'description': description,
            'http_headers': {'Referer': 'https://www.c-span.org/'},
        }
Add CSpanIE (closes #312) 2013-06-26 15:55:54 +00:00			`import re`

			`from .common import InfoExtractor`
[extractor/cspan] Support of C-Span congress videos (#2295) Authored by: Grabien 2022-02-16 19:21:05 +00:00			`from ..compat import compat_HTMLParseError`
Add CSpanIE (closes #312) 2013-06-26 15:55:54 +00:00			`from ..utils import (`
[cspan] Extract subtitles 2015-04-24 15:46:51 +00:00			`determine_ext,`
[cspan] handle error massages and extract qualities 2015-10-17 20:30:38 +00:00			`ExtractorError,`
[cspan] add support for brightcove live embeds(closes #13028) 2017-05-08 23:47:37 +00:00			`extract_attributes,`
[cspan] add support for audio only pages and catch page errors(closes #14995) 2017-12-17 18:13:25 +00:00			`find_xpath_attr,`
Update to ytdl-2021.01.16 2021-01-16 12:40:15 +00:00			`get_element_by_attribute,`
[cspan] add support for audio only pages and catch page errors(closes #14995) 2017-12-17 18:13:25 +00:00			`get_element_by_class,`
			`int_or_none,`
[extractor/cspan] Support of C-Span congress videos (#2295) Authored by: Grabien 2022-02-16 19:21:05 +00:00			`join_nonempty,`
Update to ytdl-2021.01.03 2021-01-01 12:26:37 +00:00			`js_to_json,`
			`merge_dicts,`
Update to ytdl-2021.01.16 2021-01-16 12:40:15 +00:00			`parse_iso8601,`
[extractor/cspan] Support of C-Span congress videos (#2295) Authored by: Grabien 2022-02-16 19:21:05 +00:00			`parse_qs,`
[cspan] add support for audio only pages and catch page errors(closes #14995) 2017-12-17 18:13:25 +00:00			`smuggle_url,`
Update to ytdl-2021.01.16 2021-01-16 12:40:15 +00:00			`str_to_int,`
[cspan] add support for audio only pages and catch page errors(closes #14995) 2017-12-17 18:13:25 +00:00			`unescapeHTML,`
Add CSpanIE (closes #312) 2013-06-26 15:55:54 +00:00			`)`
[Senate.gov] Add SenateGovIE and fix SenateISVPIE (#1435) Authored by: Grabien, pukkandan 2021-11-27 10:37:45 +00:00			`from .senategov import SenateISVPIE`
[cspan] Support Ustream embedded videos Closes #11547 2017-01-20 14:11:43 +00:00			`from .ustream import UstreamIE`
Add CSpanIE (closes #312) 2013-06-26 15:55:54 +00:00
[cspan] Use HTTP download (Fixes #2098) 2014-01-05 03:30:00 +00:00
Add CSpanIE (closes #312) 2013-06-26 15:55:54 +00:00			`class CSpanIE(InfoExtractor):`
Add support for https for all extractors as preventive and future-proof measure 2016-03-21 15:36:32 +00:00			`_VALID_URL = r'https?://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)'`
[cspan] Use HTTP download (Fixes #2098) 2014-01-05 03:30:00 +00:00			`IE_DESC = 'C-SPAN'`
[cspan] Add support for newer videos (Fixes #2577) 2014-03-21 01:10:24 +00:00			`_TESTS = [{`
[cspan] Fix extraction (fixes #2291) The webpage urls have changed. The title and thumbnail are now extracted from an xml. 2014-02-02 17:24:20 +00:00			`'url': 'http://www.c-span.org/video/?313572-1/HolderonV',`
[cspan] handle error massages and extract qualities 2015-10-17 20:30:38 +00:00			`'md5': '94b29a4f131ff03d23471dd6f60b6a1d',`
[cspan] Use HTTP download (Fixes #2098) 2014-01-05 03:30:00 +00:00			`'info_dict': {`
[cspan] Fix extraction (fixes #2291) The webpage urls have changed. The title and thumbnail are now extracted from an xml. 2014-02-02 17:24:20 +00:00			`'id': '315139',`
[cspan] Use HTTP download (Fixes #2098) 2014-01-05 03:30:00 +00:00			`'title': 'Attorney General Eric Holder on Voting Rights Act Decision',`
Move tests to the IE definitions 2013-06-27 18:46:46 +00:00			`},`
[cspan] Fix _TESTS 2017-01-20 14:25:20 +00:00			`'playlist_mincount': 2,`
[cspan] Disable test It works fine from all my machines, no matter where, but from travis, we get lots of 403s. Maybe another project is scraping CSPAN from travis and they're blocking the travis machines? 2014-01-22 14:10:00 +00:00			`'skip': 'Regularly fails on travis, for unknown reasons',`
[cspan] Add support for newer videos (Fixes #2577) 2014-03-21 01:10:24 +00:00			`}, {`
			`'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models',`
[cspan] Fix _TESTS 2017-01-20 14:25:20 +00:00			`# md5 is unstable`
[cspan] Add support for newer videos (Fixes #2577) 2014-03-21 01:10:24 +00:00			`'info_dict': {`
[cspan] correct the clip info extraction 2015-10-03 18:28:48 +00:00			`'id': 'c4486943',`
[cspan] Add support for newer videos (Fixes #2577) 2014-03-21 01:10:24 +00:00			`'ext': 'mp4',`
[cspan] correct the clip info extraction 2015-10-03 18:28:48 +00:00			`'title': 'CSPAN - International Health Care Models',`
[cspan] Add support for newer videos (Fixes #2577) 2014-03-21 01:10:24 +00:00			`'description': 'md5:7a985a2d595dba00af3d9c9f0783c967',`
			`}`
Move playlist tests to extractors. From now on, test_download will run these tests. That means we benefit not only from the networking setup in there, but also from the other tests (for example test_all_urls to find problems with _VALID_URLs). 2014-08-27 22:58:24 +00:00			`}, {`
			`'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall',`
			`'info_dict': {`
			`'id': '342759',`
			`'title': 'General Motors Ignition Switch Recall',`
			`},`
[cspan] Fix _TESTS 2017-01-20 14:25:20 +00:00			`'playlist_mincount': 6,`
[CSpan] Add detection for Senate ISVP. Closes #5302 2015-04-20 19:18:38 +00:00			`}, {`
			`# Video from senate.gov`
			`'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers',`
			`'info_dict': {`
			`'id': 'judiciary031715',`
Fix unit tests for m3u8 and RTSP extractors that require ffmpeg or mplayer 2016-07-07 21:39:39 +00:00			`'ext': 'mp4',`
[CSpan] Add detection for Senate ISVP. Closes #5302 2015-04-20 19:18:38 +00:00			`'title': 'Immigration Reforms Needed to Protect Skilled American Workers',`
Fix unit tests for m3u8 and RTSP extractors that require ffmpeg or mplayer 2016-07-07 21:39:39 +00:00			`},`
			`'params': {`
			`'skip_download': True, # m3u8 downloads`
[CSpan] Add detection for Senate ISVP. Closes #5302 2015-04-20 19:18:38 +00:00			`}`
[cspan] Support Ustream embedded videos Closes #11547 2017-01-20 14:11:43 +00:00			`}, {`
			`# Ustream embedded video`
			`'url': 'https://www.c-span.org/video/?114917-1/armed-services',`
			`'info_dict': {`
			`'id': '58428542',`
			`'ext': 'flv',`
			`'title': 'USHR07 Armed Services Committee',`
			`'description': 'hsas00-2118-20150204-1000et-07\n\n\nUSHR07 Armed Services Committee',`
			`'timestamp': 1423060374,`
			`'upload_date': '20150204',`
			`'uploader': 'HouseCommittee',`
			`'uploader_id': '12987475',`
			`},`
[cspan] add support for audio only pages and catch page errors(closes #14995) 2017-12-17 18:13:25 +00:00			`}, {`
			`# Audio Only`
			`'url': 'https://www.c-span.org/video/?437336-1/judiciary-antitrust-competition-policy-consumer-rights',`
			`'only_matching': True,`
[cspan] Add support for newer videos (Fixes #2577) 2014-03-21 01:10:24 +00:00			`}]`
[cspan] add support for brightcove live embeds(closes #13028) 2017-05-08 23:47:37 +00:00			`BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'`
Add CSpanIE (closes #312) 2013-06-26 15:55:54 +00:00
			`def _real_extract(self, url):`
[cspan] correct the clip info extraction 2015-10-03 18:28:48 +00:00			`video_id = self._match_id(url)`
[cspan] Initialize 'video_type' to avoid 'UnboundLocalError' exceptions (#8032) 2015-12-28 12:06:30 +00:00			`video_type = None`
[cspan] correct the clip info extraction 2015-10-03 18:28:48 +00:00			`webpage = self._download_webpage(url, video_id)`
[cspan] Support Ustream embedded videos Closes #11547 2017-01-20 14:11:43 +00:00
			`ustream_url = UstreamIE._extract_url(webpage)`
			`if ustream_url:`
			`return self.url_result(ustream_url, UstreamIE.ie_key())`

[cspan] add support for brightcove live embeds(closes #13028) 2017-05-08 23:47:37 +00:00			`if '&vod' not in url:`
			`bc = self._search_regex(`
			`r"(<[^>]+id='brightcove-player-embed'[^>]+>)",`
			`webpage, 'brightcove embed', default=None)`
			`if bc:`
			`bc_attr = extract_attributes(bc)`
			`bc_url = self.BRIGHTCOVE_URL_TEMPLATE % (`
			`bc_attr.get('data-bcaccountid', '3162030207001'),`
			`bc_attr.get('data-noprebcplayerid', 'SyGGpuJy3g'),`
			`bc_attr.get('data-newbcplayerid', 'default'),`
			`bc_attr['data-bcid'])`
			`return self.url_result(smuggle_url(bc_url, {'source_url': url}))`

Update to ytdl-2021.01.03 2021-01-01 12:26:37 +00:00			`def add_referer(formats):`
			`for f in formats:`
			`f.setdefault('http_headers', {})['Referer'] = url`

			`# As of 01.12.2020 this path looks to cover all cases making the rest`
			`# of the code unnecessary`
			`jwsetup = self._parse_json(`
			`self._search_regex(`
			`r'(?s)jwsetup\s=\s({.+?})\s*;', webpage, 'jwsetup',`
			`default='{}'),`
			`video_id, transform_source=js_to_json, fatal=False)`
			`if jwsetup:`
			`info = self._parse_jwplayer_data(`
			`jwsetup, video_id, require_title=False, m3u8_id='hls',`
			`base_url=url)`
			`add_referer(info['formats'])`
Update to ytdl-2021.01.16 2021-01-16 12:40:15 +00:00			`for subtitles in info['subtitles'].values():`
			`for subtitle in subtitles:`
			`ext = determine_ext(subtitle['url'])`
			`if ext == 'php':`
			`ext = 'vtt'`
			`subtitle['ext'] = ext`
Update to ytdl-2021.01.03 2021-01-01 12:26:37 +00:00			`ld_info = self._search_json_ld(webpage, video_id, default={})`
[extractor/cspan] Support of C-Span congress videos (#2295) Authored by: Grabien 2022-02-16 19:21:05 +00:00			`try:`
			`title = get_element_by_class('video-page-title', webpage)`
			`except compat_HTMLParseError:`
			`title = None`
			`if title is None:`
			`title = self._og_search_title(webpage)`
Update to ytdl-2021.01.16 2021-01-16 12:40:15 +00:00			`description = get_element_by_attribute('itemprop', 'description', webpage) or \`
			`self._html_search_meta(['og:description', 'description'], webpage)`
			`return merge_dicts(info, ld_info, {`
			`'title': title,`
			`'thumbnail': get_element_by_attribute('itemprop', 'thumbnailUrl', webpage),`
			`'description': description,`
			`'timestamp': parse_iso8601(get_element_by_attribute('itemprop', 'uploadDate', webpage)),`
			`'location': get_element_by_attribute('itemprop', 'contentLocation', webpage),`
			`'duration': int_or_none(self._search_regex(`
			`r'jwsetup\.seclength\s=\s(\d+);',`
			`webpage, 'duration', fatal=False)),`
			`'view_count': str_to_int(self._search_regex(`
			`r"<span[^>]+class='views'[^>]*>([\d,]+)\s+Views</span>",`
			`webpage, 'views', fatal=False)),`
			`})`
Update to ytdl-2021.01.03 2021-01-01 12:26:37 +00:00
			`# Obsolete`
[cspan] Fix extraction (fixes #8032) 2015-12-28 12:48:10 +00:00			`# We first look for clipid, because clipprog always appears before`
			`patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')]`
			`results = list(filter(None, (re.search(p, webpage) for p in patterns)))`
			`if results:`
			`matches = results[0]`
[cspan] correct the clip info extraction 2015-10-03 18:28:48 +00:00			`video_type, video_id = matches.groups()`
[cspan] Fix extraction (fixes #8032) 2015-12-28 12:48:10 +00:00			`video_type = 'clip' if video_type == 'id' else 'program'`
[cspan] correct the clip info extraction 2015-10-03 18:28:48 +00:00			`else:`
[cspan] Fix clip/prog id extraction (#8317) 2016-01-26 14:42:20 +00:00			`m = re.search(r'data-(?P<type>clip\|prog)id=["\'](?P<id>\d+)', webpage)`
			`if m:`
			`video_id = m.group('id')`
			`video_type = 'program' if m.group('type') == 'prog' else 'clip'`
			`else:`
[extractors] Use new framework for existing embeds (#4307) `Brightcove` is difficult to migrate because it's subclasses may depend on the signature of the current functions. So it is left as-is for now Note: Tests have not been migrated 2022-08-01 01:23:25 +00:00			`senate_isvp_url = SenateISVPIE._extract_url(webpage)`
[cspan] Fix clip/prog id extraction (#8317) 2016-01-26 14:42:20 +00:00			`if senate_isvp_url:`
			`title = self._og_search_title(webpage)`
			`surl = smuggle_url(senate_isvp_url, {'force_title': title})`
			`return self.url_result(surl, 'SenateISVP', video_id, title)`
[cspan] add support for audio only pages and catch page errors(closes #14995) 2017-12-17 18:13:25 +00:00			`video_id = self._search_regex(`
			`r'jwsetup\.clipprog\s=\s(\d+);',`
			`webpage, 'jwsetup program id', default=None)`
			`if video_id:`
			`video_type = 'program'`
[cspan] Initialize 'video_type' to avoid 'UnboundLocalError' exceptions (#8032) 2015-12-28 12:06:30 +00:00			`if video_type is None or video_id is None:`
[cspan] add support for audio only pages and catch page errors(closes #14995) 2017-12-17 18:13:25 +00:00			`error_message = get_element_by_class('VLplayer-error-message', webpage)`
			`if error_message:`
			`raise ExtractorError(error_message)`
[cspan] Initialize 'video_type' to avoid 'UnboundLocalError' exceptions (#8032) 2015-12-28 12:06:30 +00:00			`raise ExtractorError('unable to find video id and type')`
[cspan] Use HTTP download (Fixes #2098) 2014-01-05 03:30:00 +00:00
[cspan] change into a function 2015-11-28 19:22:31 +00:00			`def get_text_attr(d, attr):`
			`return d.get(attr, {}).get('#text')`

[cspan] correct the clip info extraction 2015-10-03 18:28:48 +00:00			`data = self._download_json(`
[cspan] handle error massages and extract qualities 2015-10-17 20:30:38 +00:00			`'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id),`
			`video_id)['video']`
			`if data['@status'] != 'Success':`
[cspan] change into a function 2015-11-28 19:22:31 +00:00			`raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True)`
[cspan] Use HTTP download (Fixes #2098) 2014-01-05 03:30:00 +00:00
[cspan] Support multiple segments (Fixes #2674) 2014-04-03 03:56:28 +00:00			`doc = self._download_xml(`
[cspan] correct the clip info extraction 2015-10-03 18:28:48 +00:00			`'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id),`
[cspan] Fix extraction (fixes #2291) The webpage urls have changed. The title and thumbnail are now extracted from an xml. 2014-02-02 17:24:20 +00:00			`video_id)`

[cspan] correct the clip info extraction 2015-10-03 18:28:48 +00:00			`description = self._html_search_meta('description', webpage)`

[cspan] Support multiple segments (Fixes #2674) 2014-04-03 03:56:28 +00:00			`title = find_xpath_attr(doc, './/string', 'name', 'title').text`
			`thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text`

[cspan] handle error massages and extract qualities 2015-10-17 20:30:38 +00:00			`files = data['files']`
[cspan] change into a function 2015-11-28 19:22:31 +00:00			`capfile = get_text_attr(data, 'capfile')`
[cspan] Support multiple segments (Fixes #2674) 2014-04-03 03:56:28 +00:00
[cspan] handle error massages and extract qualities 2015-10-17 20:30:38 +00:00			`entries = []`
			`for partnum, f in enumerate(files):`
			`formats = []`
[cspan] add support for audio only pages and catch page errors(closes #14995) 2017-12-17 18:13:25 +00:00			`for quality in f.get('qualities', []):`
[cspan] handle error massages and extract qualities 2015-10-17 20:30:38 +00:00			`formats.append({`
[cspan] change into a function 2015-11-28 19:22:31 +00:00			`'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')),`
			`'url': unescapeHTML(get_text_attr(quality, 'file')),`
			`'height': int_or_none(get_text_attr(quality, 'height')),`
			`'tbr': int_or_none(get_text_attr(quality, 'bitrate')),`
[cspan] handle error massages and extract qualities 2015-10-17 20:30:38 +00:00			`})`
[cspan] Extract from path when no qualities (Closes #8317) 2016-01-26 15:29:42 +00:00			`if not formats:`
[cspan] Unescape path (Closes #8365) 2016-01-29 18:26:33 +00:00			`path = unescapeHTML(get_text_attr(f, 'path'))`
[cspan] Extract from path when no qualities (Closes #8317) 2016-01-26 15:29:42 +00:00			`if not path:`
			`continue`
			`formats = self._extract_m3u8_formats(`
			`path, video_id, 'mp4', entry_protocol='m3u8_native',`
			`m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }]`
Update to ytdl-2021.01.03 2021-01-01 12:26:37 +00:00			`add_referer(formats)`
[cspan] handle error massages and extract qualities 2015-10-17 20:30:38 +00:00			`self._sort_formats(formats)`
			`entries.append({`
			`'id': '%s_%d' % (video_id, partnum + 1),`
			`'title': (`
			`title if len(files) == 1 else`
			`'%s part %d' % (title, partnum + 1)),`
			`'formats': formats,`
			`'description': description,`
			`'thumbnail': thumbnail,`
[cspan] change into a function 2015-11-28 19:22:31 +00:00			`'duration': int_or_none(get_text_attr(f, 'length')),`
[cspan] handle error massages and extract qualities 2015-10-17 20:30:38 +00:00			`'subtitles': {`
			`'en': [{`
			`'url': capfile,`
			`'ext': determine_ext(capfile, 'dfxp')`
			`}],`
			`} if capfile else None,`
			`})`
[cspan] Fix extraction (fixes #2291) The webpage urls have changed. The title and thumbnail are now extracted from an xml. 2014-02-02 17:24:20 +00:00
[CSpan] Fix test cases CSpan_1 and CSpan_2 2015-04-20 19:30:54 +00:00			`if len(entries) == 1:`
			`entry = dict(entries[0])`
[cspan] correct the clip info extraction 2015-10-03 18:28:48 +00:00			`entry['id'] = 'c' + video_id if video_type == 'clip' else video_id`
[CSpan] Fix test cases CSpan_1 and CSpan_2 2015-04-20 19:30:54 +00:00			`return entry`
			`else:`
			`return {`
			`'_type': 'playlist',`
			`'entries': entries,`
			`'title': title,`
[cspan] correct the clip info extraction 2015-10-03 18:28:48 +00:00			`'id': 'c' + video_id if video_type == 'clip' else video_id,`
[CSpan] Fix test cases CSpan_1 and CSpan_2 2015-04-20 19:30:54 +00:00			`}`
[extractor/cspan] Support of C-Span congress videos (#2295) Authored by: Grabien 2022-02-16 19:21:05 +00:00

			`class CSpanCongressIE(InfoExtractor):`
			`_VALID_URL = r'https?://(?:www\.)?c-span\.org/congress/'`
			`_TESTS = [{`
			`'url': 'https://www.c-span.org/congress/?chamber=house&date=2017-12-13&t=1513208380',`
			`'info_dict': {`
			`'id': 'house_2017-12-13',`
			`'title': 'Congressional Chronicle - Members of Congress, Hearings and More',`
			`'description': 'md5:54c264b7a8f219937987610243305a84',`
			`'thumbnail': r're:https://ximage.c-spanvideo.org/.+',`
			`'ext': 'mp4'`
			`}`
			`}]`

			`def _real_extract(self, url):`
			`query = parse_qs(url)`
			`video_date = query.get('date', [None])[0]`
			`video_id = join_nonempty(query.get('chamber', ['senate'])[0], video_date, delim='_')`
			`webpage = self._download_webpage(url, video_id)`
			`if not video_date:`
			`jwp_date = re.search(r'jwsetup.clipprogdate = \'(?P<date>\d{4}-\d{2}-\d{2})\';', webpage)`
			`if jwp_date:`
			`video_id = f'{video_id}_{jwp_date.group("date")}'`
			`jwplayer_data = self._parse_json(`
			`self._search_regex(r'jwsetup\s=\s({(?:.\|\n)[^;]+});', webpage, 'player config'),`
			`video_id, transform_source=js_to_json)`

			`title = (self._og_search_title(webpage, default=None)`
[cleanup] Use `_html_extract_title` 2022-04-04 08:27:35 +00:00			`or self._html_extract_title(webpage, 'video title'))`
[extractor/cspan] Support of C-Span congress videos (#2295) Authored by: Grabien 2022-02-16 19:21:05 +00:00			`description = (self._og_search_description(webpage, default=None)`
			`or self._html_search_meta('description', webpage, 'description', default=None))`

			`return {`
			`**self._parse_jwplayer_data(jwplayer_data, video_id, False),`
			`'title': re.sub(r'\s+', ' ', title.split('\|')[0]).strip(),`
			`'description': description,`
			`'http_headers': {'Referer': 'https://www.c-span.org/'},`
			`}`