yt-dlp/yt_dlp/extractor/ora.py

import re
import urllib.parse

from .common import InfoExtractor
from ..utils import (
    get_element_by_attribute,
    qualities,
    unescapeHTML,
)


class OraTVIE(InfoExtractor):
    """Extractor for video pages on ora.tv and unsafespeech.com.

    The page embeds a JS object under a ``"video"`` or ``"current"`` key. When
    that object carries an ``hls_stream`` URL, HLS formats are extracted and
    direct HTTP URLs are derived from the stream path on a best-effort basis;
    otherwise extraction is delegated to the YouTube extractor using the
    page's ``youtube_id``.
    """

    _VALID_URL = r'https?://(?:www\.)?(?:ora\.tv|unsafespeech\.com)/([^/]+/)*(?P<id>[^/\?#]+)'
    _TESTS = [{
        'url': 'https://www.ora.tv/larrykingnow/2015/12/16/vine-youtube-stars-zach-king-king-bach-on-their-viral-videos-0_36jupg6090pq',
        'md5': 'fa33717591c631ec93b04b0e330df786',
        'info_dict': {
            'id': '50178',
            'ext': 'mp4',
            'title': 'Vine & YouTube Stars Zach King & King Bach On Their Viral Videos!',
            'description': 'md5:ebbc5b1424dd5dba7be7538148287ac1',
        },
    }, {
        'url': 'http://www.unsafespeech.com/video/2016/5/10/student-self-censorship-and-the-thought-police-on-university-campuses-0_6622bnkppw4d',
        'only_matching': True,
    }]

    def _extract_http_formats(self, m3u8_url):
        """Derive direct HTTP formats from the HLS master playlist URL.

        The playlist path is expected to contain a comma-separated list of
        quality names (matched by ``QUALITIES_RE``); one format is built per
        listed quality by substituting that name into the path.

        This is best-effort: an empty list is returned when the path does not
        have the expected layout, so that HLS formats extracted by the caller
        are not discarded because of an unrelated failure here.
        """
        # similar to GameSpotIE
        m3u8_path = urllib.parse.urlparse(m3u8_url).path
        QUALITIES_RE = r'((,[a-z]+\d+)+,?)'
        quality_list = self._search_regex(
            QUALITIES_RE, m3u8_path, 'qualities', default=None)
        # Drop the leading slash and the first path component
        _, separator, http_path = m3u8_path[1:].partition('/')
        if not quality_list or not separator:
            return []

        # Escape literal '%' (e.g. percent-encoded characters) so that the
        # %-formatting below only ever fills the quality placeholder
        http_template, placeholder_count = re.subn(
            QUALITIES_RE, r'%s', http_path.replace('%', '%%'))
        if placeholder_count != 1:
            # Zero or several placeholders cannot be filled with one quality
            return []
        http_template = http_template.replace('.csmil/master.m3u8', '')
        http_template = urllib.parse.urljoin(
            'http://videocdn-pmd.ora.tv/', http_template)

        preference = qualities(
            ['mobile400', 'basic400', 'basic600', 'sd900', 'sd1200', 'sd1500', 'hd720', 'hd1080'])
        return [{
            'url': http_template % q,
            'format_id': q,
            'quality': preference(q),
        } for q in quality_list.strip(',').split(',')]

    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        video_data = self._search_regex(
            r'"(?:video|current)"\s*:\s*({[^}]+?})', webpage, 'current video')
        m3u8_url = self._search_regex(
            r'hls_stream"?\s*:\s*"([^"]+)', video_data, 'm3u8 url', default=None)
        if not m3u8_url:
            # No HLS stream on the page: hand over to the YouTube extractor
            return self.url_result(self._search_regex(
                r'"youtube_id"\s*:\s*"([^"]+)', webpage, 'youtube id'), 'Youtube')

        formats = self._extract_m3u8_formats(
            m3u8_url, display_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
        formats.extend(self._extract_http_formats(m3u8_url))

        return {
            # Fall back to the URL slug when the page data has no numeric id
            'id': self._search_regex(
                r'"id"\s*:\s*(\d+)', video_data, 'video id', default=display_id),
            'display_id': display_id,
            'title': unescapeHTML(self._og_search_title(webpage)),
            # 'video_txt_decription' (sic) is the class name looked up in the page
            'description': get_element_by_attribute(
                'class', 'video_txt_decription', webpage),
            'thumbnail': self._proto_relative_url(self._search_regex(
                r'"thumb"\s*:\s*"([^"]+)', video_data, 'thumbnail', default=None)),
            'formats': formats,
        }
[ora] Add new extractor(closes #7732) 2015-12-31 15:35:51 +00:00			`import re`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 23:09:58 +00:00			`import urllib.parse`
[misc] Add `hatch`, `ruff`, `pre-commit` and improve dev docs (#7409) Authored by: bashonly, seproDev, Grub4K Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> 2024-05-26 19:27:21 +00:00
[ora] Add new extractor(closes #7732) 2015-12-31 15:35:51 +00:00			`from .common import InfoExtractor`
			`from ..utils import (`
			`get_element_by_attribute,`
			`qualities,`
			`unescapeHTML,`
			`)`


			`class OraTVIE(InfoExtractor):`
[ora] Revert extraction to regexes It's less fragile than using js_to_json with ora js 2016-05-14 14:45:18 +00:00			`_VALID_URL = r'https?://(?:www\.)?(?:ora\.tv\|unsafespeech\.com)/([^/]+/)*(?P<id>[^/\?#]+)'`
			`_TESTS = [{`
[ora] Add new extractor(closes #7732) 2015-12-31 15:35:51 +00:00			`'url': 'https://www.ora.tv/larrykingnow/2015/12/16/vine-youtube-stars-zach-king-king-bach-on-their-viral-videos-0_36jupg6090pq',`
			`'md5': 'fa33717591c631ec93b04b0e330df786',`
			`'info_dict': {`
			`'id': '50178',`
			`'ext': 'mp4',`
			`'title': 'Vine & YouTube Stars Zach King & King Bach On Their Viral Videos!',`
			`'description': 'md5:ebbc5b1424dd5dba7be7538148287ac1',`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 23:09:58 +00:00			`},`
[ora] Revert extraction to regexes It's less fragile than using js_to_json with ora js 2016-05-14 14:45:18 +00:00			`}, {`
			`'url': 'http://www.unsafespeech.com/video/2016/5/10/student-self-censorship-and-the-thought-police-on-university-campuses-0_6622bnkppw4d',`
			`'only_matching': True,`
			`}]`
[ora] Add new extractor(closes #7732) 2015-12-31 15:35:51 +00:00
			`def _real_extract(self, url):`
			`display_id = self._match_id(url)`
			`webpage = self._download_webpage(url, display_id)`

[ora] Revert extraction to regexes It's less fragile than using js_to_json with ora js 2016-05-14 14:45:18 +00:00			`video_data = self._search_regex(`
			`r'"(?:video\|current)"\s:\s({[^}]+?})', webpage, 'current video')`
			`m3u8_url = self._search_regex(`
			`r'hls_stream"?\s:\s"([^"]+)', video_data, 'm3u8 url', None)`
[ora] Add new extractor(closes #7732) 2015-12-31 15:35:51 +00:00			`if m3u8_url:`
			`formats = self._extract_m3u8_formats(`
			`m3u8_url, display_id, 'mp4', 'm3u8_native',`
			`m3u8_id='hls', fatal=False)`
Fix typos Closes #8200. 2016-01-10 15:17:47 +00:00			`# similar to GameSpotIE`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 23:09:58 +00:00			`m3u8_path = urllib.parse.urlparse(m3u8_url).path`
[ora] Add new extractor(closes #7732) 2015-12-31 15:35:51 +00:00			`QUALITIES_RE = r'((,[a-z]+\d+)+,?)'`
			`available_qualities = self._search_regex(`
			`QUALITIES_RE, m3u8_path, 'qualities').strip(',').split(',')`
			`http_path = m3u8_path[1:].split('/', 1)[1]`
			`http_template = re.sub(QUALITIES_RE, r'%s', http_path)`
			`http_template = http_template.replace('.csmil/master.m3u8', '')`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 23:09:58 +00:00			`http_template = urllib.parse.urljoin(`
[ora] Add new extractor(closes #7732) 2015-12-31 15:35:51 +00:00			`'http://videocdn-pmd.ora.tv/', http_template)`
			`preference = qualities(`
			`['mobile400', 'basic400', 'basic600', 'sd900', 'sd1200', 'sd1500', 'hd720', 'hd1080'])`
			`for q in available_qualities:`
			`formats.append({`
			`'url': http_template % q,`
			`'format_id': q,`
[formatsort] Remove misuse of 'preference' 'preference' is to be used only when the format is better that ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities doesn't really matter there Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality` 2021-02-18 22:03:16 +00:00			`'quality': preference(q),`
[ora] Add new extractor(closes #7732) 2015-12-31 15:35:51 +00:00			`})`
			`else:`
			`return self.url_result(self._search_regex(`
			`r'"youtube_id"\s:\s"([^"]+)', webpage, 'youtube id'), 'Youtube')`

			`return {`
[ora] Revert extraction to regexes It's less fragile than using js_to_json with ora js 2016-05-14 14:45:18 +00:00			`'id': self._search_regex(`
			`r'"id"\s:\s(\d+)', video_data, 'video id', default=display_id),`
[ora] Add new extractor(closes #7732) 2015-12-31 15:35:51 +00:00			`'display_id': display_id,`
			`'title': unescapeHTML(self._og_search_title(webpage)),`
			`'description': get_element_by_attribute(`
			`'class', 'video_txt_decription', webpage),`
[ora] Revert extraction to regexes It's less fragile than using js_to_json with ora js 2016-05-14 14:45:18 +00:00			`'thumbnail': self._proto_relative_url(self._search_regex(`
			`r'"thumb"\s:\s"([^"]+)', video_data, 'thumbnail', None)),`
[ora] Add new extractor(closes #7732) 2015-12-31 15:35:51 +00:00			`'formats': formats,`
			`}`