mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-01-01 02:21:06 +00:00
[extractors/rtvc] Add extractors (#6578)
* Add `RTVCPlay` extractor * Add `RTVCPlayEmbed` extractor * Add `RTVCKaltura` extractor * Add `SenalColombiaLive` extractor Closes #6457 Authored by: elyse0
This commit is contained in:
parent
21b5ec86c2
commit
9b30cd3dfc
3 changed files with 322 additions and 0 deletions
|
@ -1621,6 +1621,11 @@
|
|||
from .rtp import RTPIE
|
||||
from .rtrfm import RTRFMIE
|
||||
from .rts import RTSIE
|
||||
from .rtvcplay import (
|
||||
RTVCPlayIE,
|
||||
RTVCPlayEmbedIE,
|
||||
RTVCKalturaIE,
|
||||
)
|
||||
from .rtve import (
|
||||
RTVEALaCartaIE,
|
||||
RTVEAudioIE,
|
||||
|
@ -1690,6 +1695,7 @@
|
|||
)
|
||||
from .scrolller import ScrolllerIE
|
||||
from .seeker import SeekerIE
|
||||
from .senalcolombia import SenalColombiaLiveIE
|
||||
from .senategov import SenateISVPIE, SenateGovIE
|
||||
from .sendtonews import SendtoNewsIE
|
||||
from .servus import ServusIE
|
||||
|
|
285
yt_dlp/extractor/rtvcplay.py
Normal file
285
yt_dlp/extractor/rtvcplay.py
Normal file
|
@ -0,0 +1,285 @@
|
|||
import re
|
||||
|
||||
from .common import InfoExtractor, ExtractorError
|
||||
from ..utils import (
|
||||
clean_html,
|
||||
determine_ext,
|
||||
int_or_none,
|
||||
float_or_none,
|
||||
js_to_json,
|
||||
mimetype2ext,
|
||||
traverse_obj,
|
||||
urljoin,
|
||||
url_or_none,
|
||||
)
|
||||
|
||||
|
||||
class RTVCPlayBaseIE(InfoExtractor):
|
||||
_BASE_VALID_URL = r'https?://(?:www\.)?rtvcplay\.co'
|
||||
|
||||
def _extract_player_config(self, webpage, video_id):
|
||||
return self._search_json(
|
||||
r'<script\b[^>]*>[^<]*(?:var|let|const)\s+config\s*=', re.sub(r'"\s*\+\s*"', '', webpage),
|
||||
'player_config', video_id, transform_source=js_to_json)
|
||||
|
||||
def _extract_formats_and_subtitles_player_config(self, player_config, video_id):
|
||||
formats, subtitles = [], {}
|
||||
for source in traverse_obj(player_config, ('sources', ..., lambda _, v: url_or_none(v['url']))):
|
||||
ext = mimetype2ext(source.get('mimetype'), default=determine_ext(source['url']))
|
||||
if ext == 'm3u8':
|
||||
fmts, subs = self._extract_m3u8_formats_and_subtitles(
|
||||
source['url'], video_id, 'mp4', fatal=False)
|
||||
formats.extend(fmts)
|
||||
self._merge_subtitles(subs, target=subtitles)
|
||||
else:
|
||||
formats.append({
|
||||
'url': source['url'],
|
||||
'ext': ext,
|
||||
})
|
||||
|
||||
return formats, subtitles
|
||||
|
||||
|
||||
class RTVCPlayIE(RTVCPlayBaseIE):
|
||||
_VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/(?P<category>(?!embed)[^/]+)/(?:[^?#]+/)?(?P<id>[\w-]+)'
|
||||
|
||||
_TESTS = [{
|
||||
'url': 'https://www.rtvcplay.co/en-vivo/canal-institucional',
|
||||
'info_dict': {
|
||||
'id': 'canal-institucional',
|
||||
'title': r're:^Canal Institucional',
|
||||
'description': 'md5:eff9e548394175928059320c006031ea',
|
||||
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
|
||||
'live_status': 'is_live',
|
||||
'ext': 'mp4',
|
||||
},
|
||||
'params': {
|
||||
'skip_download': 'Livestream',
|
||||
},
|
||||
}, {
|
||||
'url': 'https://www.rtvcplay.co/en-vivo/senal-colombia',
|
||||
'info_dict': {
|
||||
'id': 'senal-colombia',
|
||||
'title': r're:^Señal Colombia',
|
||||
'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b',
|
||||
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
|
||||
'live_status': 'is_live',
|
||||
'ext': 'mp4',
|
||||
},
|
||||
'params': {
|
||||
'skip_download': 'Livestream',
|
||||
},
|
||||
}, {
|
||||
'url': 'https://www.rtvcplay.co/en-vivo/radio-nacional',
|
||||
'info_dict': {
|
||||
'id': 'radio-nacional',
|
||||
'title': r're:^Radio Nacional',
|
||||
'description': 'md5:5de009bc6a9fa79d2a6cf0b73f977d53',
|
||||
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
|
||||
'live_status': 'is_live',
|
||||
'ext': 'mp4',
|
||||
},
|
||||
'params': {
|
||||
'skip_download': 'Livestream',
|
||||
},
|
||||
}, {
|
||||
'url': 'https://www.rtvcplay.co/peliculas-ficcion/senoritas',
|
||||
'md5': '1288ee6f6d1330d880f98bff2ed710a3',
|
||||
'info_dict': {
|
||||
'id': 'senoritas',
|
||||
'title': 'Señoritas',
|
||||
'description': 'md5:f095a2bb52cb6cf279daf6302f86fb32',
|
||||
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
|
||||
'ext': 'mp4',
|
||||
},
|
||||
}, {
|
||||
'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa/james-regresa-clases-28022022',
|
||||
'md5': 'f040a7380a269ad633cf837384d5e9fc',
|
||||
'info_dict': {
|
||||
'id': 'james-regresa-clases-28022022',
|
||||
'title': 'James regresa a clases - 28/02/2022',
|
||||
'description': 'md5:c5dcdf757c7ab29305e8763c6007e675',
|
||||
'ext': 'mp4',
|
||||
},
|
||||
}, {
|
||||
'url': 'https://www.rtvcplay.co/peliculas-documentales/llinas-el-cerebro-y-el-universo',
|
||||
'info_dict': {
|
||||
'id': 'llinas-el-cerebro-y-el-universo',
|
||||
'title': 'Llinás, el cerebro y el universo',
|
||||
'description': 'md5:add875bf2309bb52b3e8b9b06116d9b0',
|
||||
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
|
||||
},
|
||||
'playlist_mincount': 3,
|
||||
}, {
|
||||
'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa',
|
||||
'info_dict': {
|
||||
'id': 'profe-en-tu-casa',
|
||||
'title': 'Profe en tu casa',
|
||||
'description': 'md5:47dbe20e263194413b1db2a2805a4f2e',
|
||||
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
|
||||
},
|
||||
'playlist_mincount': 537,
|
||||
}, {
|
||||
'url': 'https://www.rtvcplay.co/series-al-oido/relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura',
|
||||
'info_dict': {
|
||||
'id': 'relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura',
|
||||
'title': 'Relato de un náufrago: una travesía del periodismo a la literatura',
|
||||
'description': 'md5:6da28fdca4a5a568ea47ef65ef775603',
|
||||
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
|
||||
},
|
||||
'playlist_mincount': 5,
|
||||
}, {
|
||||
'url': 'https://www.rtvcplay.co/series-al-oido/diez-versiones',
|
||||
'info_dict': {
|
||||
'id': 'diez-versiones',
|
||||
'title': 'Diez versiones',
|
||||
'description': 'md5:997471ed971cb3fd8e41969457675306',
|
||||
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
|
||||
},
|
||||
'playlist_mincount': 20,
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
video_id, category = self._match_valid_url(url).group('id', 'category')
|
||||
webpage = self._download_webpage(url, video_id)
|
||||
|
||||
hydration = self._search_json(
|
||||
r'window\.__RTVCPLAY_STATE__\s*=', webpage, 'hydration',
|
||||
video_id, transform_source=js_to_json)['content']['currentContent']
|
||||
|
||||
asset_id = traverse_obj(hydration, ('video', 'assetid'))
|
||||
if asset_id:
|
||||
hls_url = hydration['base_url_hls'].replace('[node:field_asset_id]', asset_id)
|
||||
else:
|
||||
hls_url = traverse_obj(hydration, ('channel', 'hls'))
|
||||
|
||||
metadata = traverse_obj(hydration, {
|
||||
'title': 'title',
|
||||
'description': 'description',
|
||||
'thumbnail': ((('channel', 'image', 'logo'), ('resource', 'image', 'cover_desktop')), 'path'),
|
||||
}, get_all=False)
|
||||
|
||||
# Probably it's a program's page
|
||||
if not hls_url:
|
||||
seasons = traverse_obj(
|
||||
hydration, ('widgets', lambda _, y: y['type'] == 'seasonList', 'contents'),
|
||||
get_all=False)
|
||||
if not seasons:
|
||||
podcast_episodes = hydration.get('audios')
|
||||
if not podcast_episodes:
|
||||
raise ExtractorError('Could not find asset_id nor program playlist nor podcast episodes')
|
||||
|
||||
return self.playlist_result([
|
||||
self.url_result(episode['file'], url_transparent=True, **traverse_obj(episode, {
|
||||
'title': 'title',
|
||||
'description': ('description', {clean_html}),
|
||||
'episode_number': ('chapter_number', {float_or_none}, {int_or_none}),
|
||||
'season_number': ('season', {int_or_none}),
|
||||
})) for episode in podcast_episodes], video_id, **metadata)
|
||||
|
||||
entries = [self.url_result(
|
||||
urljoin(url, episode['slug']), url_transparent=True,
|
||||
**traverse_obj(season, {
|
||||
'season': 'title',
|
||||
'season_number': ('season', {int_or_none}),
|
||||
}), **traverse_obj(episode, {
|
||||
'title': 'title',
|
||||
'thumbnail': ('image', 'cover', 'path'),
|
||||
'episode_number': ('chapter_number', {int_or_none}),
|
||||
})) for season in seasons for episode in traverse_obj(season, ('contents', ...))]
|
||||
|
||||
return self.playlist_result(entries, video_id, **metadata)
|
||||
|
||||
formats, subtitles = self._extract_m3u8_formats_and_subtitles(hls_url, video_id, 'mp4')
|
||||
|
||||
return {
|
||||
'id': video_id,
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
'is_live': category == 'en-vivo',
|
||||
**metadata,
|
||||
}
|
||||
|
||||
|
||||
class RTVCPlayEmbedIE(RTVCPlayBaseIE):
|
||||
_VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/embed/(?P<id>[\w-]+)'
|
||||
|
||||
_TESTS = [{
|
||||
'url': 'https://www.rtvcplay.co/embed/72b0e699-248b-4929-a4a8-3782702fa7f9',
|
||||
'md5': 'ed529aeaee7aa2a72afe91ac7d1177a8',
|
||||
'info_dict': {
|
||||
'id': '72b0e699-248b-4929-a4a8-3782702fa7f9',
|
||||
'title': 'Tráiler: Señoritas',
|
||||
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
|
||||
'ext': 'mp4',
|
||||
}
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
video_id = self._match_id(url)
|
||||
webpage = self._download_webpage(url, video_id)
|
||||
|
||||
player_config = self._extract_player_config(webpage, video_id)
|
||||
formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id)
|
||||
|
||||
asset_id = traverse_obj(player_config, ('rtvcplay', 'assetid'))
|
||||
metadata = {} if not asset_id else self._download_json(
|
||||
f'https://cms.rtvcplay.co/api/v1/video/asset-id/{asset_id}', video_id, fatal=False)
|
||||
|
||||
return {
|
||||
'id': video_id,
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
**traverse_obj(metadata, {
|
||||
'title': 'title',
|
||||
'description': 'description',
|
||||
'thumbnail': ('image', ..., 'thumbnail', 'path'),
|
||||
}, get_all=False)
|
||||
}
|
||||
|
||||
|
||||
class RTVCKalturaIE(RTVCPlayBaseIE):
|
||||
_VALID_URL = r'https?://media\.rtvc\.gov\.co/kalturartvc/(?P<id>[\w-]+)'
|
||||
|
||||
_TESTS = [{
|
||||
'url': 'https://media.rtvc.gov.co/kalturartvc/indexSC.html',
|
||||
'info_dict': {
|
||||
'id': 'indexSC',
|
||||
'title': r're:^Señal Colombia',
|
||||
'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b',
|
||||
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
|
||||
'live_status': 'is_live',
|
||||
'ext': 'mp4',
|
||||
},
|
||||
'params': {
|
||||
'skip_download': 'Livestream',
|
||||
},
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
video_id = self._match_id(url)
|
||||
webpage = self._download_webpage(url, video_id)
|
||||
|
||||
player_config = self._extract_player_config(webpage, video_id)
|
||||
formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id)
|
||||
|
||||
channel_id = traverse_obj(player_config, ('rtvcplay', 'channelId'))
|
||||
metadata = {} if not channel_id else self._download_json(
|
||||
f'https://cms.rtvcplay.co/api/v1/taxonomy_term/streaming/{channel_id}', video_id, fatal=False)
|
||||
|
||||
fmts, subs = self._extract_m3u8_formats_and_subtitles(
|
||||
traverse_obj(metadata, ('channel', 'hls')), video_id, 'mp4', fatal=False)
|
||||
formats.extend(fmts)
|
||||
self._merge_subtitles(subs, target=subtitles)
|
||||
|
||||
return {
|
||||
'id': video_id,
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
'is_live': True,
|
||||
**traverse_obj(metadata, {
|
||||
'title': 'title',
|
||||
'description': 'description',
|
||||
'thumbnail': ('channel', 'image', 'logo', 'path'),
|
||||
})
|
||||
}
|
31
yt_dlp/extractor/senalcolombia.py
Normal file
31
yt_dlp/extractor/senalcolombia.py
Normal file
|
@ -0,0 +1,31 @@
|
|||
from .common import InfoExtractor
|
||||
from .rtvcplay import RTVCKalturaIE
|
||||
|
||||
|
||||
class SenalColombiaLiveIE(InfoExtractor):
|
||||
_VALID_URL = r'https?://(?:www\.)?senalcolombia\.tv/(?P<id>senal-en-vivo)'
|
||||
|
||||
_TESTS = [{
|
||||
'url': 'https://www.senalcolombia.tv/senal-en-vivo',
|
||||
'info_dict': {
|
||||
'id': 'indexSC',
|
||||
'title': 're:^Señal Colombia',
|
||||
'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b',
|
||||
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
|
||||
'live_status': 'is_live',
|
||||
'ext': 'mp4',
|
||||
},
|
||||
'params': {
|
||||
'skip_download': 'Livestream',
|
||||
},
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
display_id = self._match_id(url)
|
||||
webpage = self._download_webpage(url, display_id)
|
||||
|
||||
hydration = self._search_json(
|
||||
r'<script\b[^>]*data-drupal-selector\s*=\s*"[^"]*drupal-settings-json[^"]*"[^>]*>',
|
||||
webpage, 'hydration', display_id)
|
||||
|
||||
return self.url_result(hydration['envivosrc'], RTVCKalturaIE, display_id)
|
Loading…
Reference in a new issue