Compare commits

...

4 Commits

Author SHA1 Message Date
Nicolai Weitkemper 640911563d
Merge 481b97c394 into 89f535e265 2024-04-25 21:06:54 +02:00
bashonly 89f535e265
[ci] Fix `curl-cffi` installation (Bugfix for 02483bea1c)
Authored by: bashonly
2024-04-22 20:36:01 +00:00
bashonly ff38a011d5
[ie/crunchyroll] Fix auth and remove cookies support (#9749)
Closes #9745
Authored by: bashonly
2024-04-21 22:41:40 +00:00
Nicolai Weitkemper 481b97c394 [WIP] add extractor for RTL+ podcasts
Does not work yet!
2024-01-16 17:37:28 +01:00
4 changed files with 178 additions and 63 deletions

View File

@ -53,7 +53,7 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
- name: Install test requirements
run: python3 ./devscripts/install_deps.py --include dev --include curl_cffi
run: python3 ./devscripts/install_deps.py --include dev --include curl-cffi
- name: Run tests
continue-on-error: False
run: |

View File

@ -1653,6 +1653,7 @@ from .rtlnl import (
RTLLuRadioIE,
)
from .rtl2 import RTL2IE
from .rtlplus_podcast import RtlPlusPodcastExtractorIE
from .rtnews import (
RTNewsIE,
RTDocumentryIE,

View File

@ -24,11 +24,15 @@ class CrunchyrollBaseIE(InfoExtractor):
_BASE_URL = 'https://www.crunchyroll.com'
_API_BASE = 'https://api.crunchyroll.com'
_NETRC_MACHINE = 'crunchyroll'
_REFRESH_TOKEN = None
_AUTH_HEADERS = None
_AUTH_EXPIRY = None
_API_ENDPOINT = None
_BASIC_AUTH = None
_BASIC_AUTH = 'Basic ' + base64.b64encode(':'.join((
't-kdgp2h8c3jub8fn0fq',
'yfLDfMfrYvKXh4JXS1LEI2cCqu1v5Wan',
)).encode()).decode()
_IS_PREMIUM = None
_CLIENT_ID = ('cr_web', 'noaihdevm_6iyg0a8l0q')
_LOCALE_LOOKUP = {
'ar': 'ar-SA',
'de': 'de-DE',
@ -43,69 +47,74 @@ class CrunchyrollBaseIE(InfoExtractor):
'hi': 'hi-IN',
}
@property
def is_logged_in(self):
return bool(self._get_cookies(self._BASE_URL).get('etp_rt'))
def _set_auth_info(self, response):
CrunchyrollBaseIE._IS_PREMIUM = 'cr_premium' in traverse_obj(response, ('access_token', {jwt_decode_hs256}, 'benefits', ...))
CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': response['token_type'] + ' ' + response['access_token']}
CrunchyrollBaseIE._AUTH_EXPIRY = time_seconds(seconds=traverse_obj(response, ('expires_in', {float_or_none}), default=300) - 10)
def _request_token(self, headers, data, note='Requesting token', errnote='Failed to request token'):
try: # TODO: Add impersonation support here
return self._download_json(
f'{self._BASE_URL}/auth/v1/token', None, note=note, errnote=errnote,
headers=headers, data=urlencode_postdata(data))
except ExtractorError as error:
if not isinstance(error.cause, HTTPError) or error.cause.status != 403:
raise
raise ExtractorError(
'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, '
'then pass the fresh cookies (with --cookies-from-browser or --cookies) '
'and your browser\'s User-Agent (with --user-agent)', expected=True)
def _perform_login(self, username, password):
if self.is_logged_in:
if not CrunchyrollBaseIE._REFRESH_TOKEN:
CrunchyrollBaseIE._REFRESH_TOKEN = self.cache.load(self._NETRC_MACHINE, username)
if CrunchyrollBaseIE._REFRESH_TOKEN:
return
upsell_response = self._download_json(
f'{self._API_BASE}/get_upsell_data.0.json', None, 'Getting session id',
query={
'sess_id': 1,
'device_id': 'whatvalueshouldbeforweb',
'device_type': 'com.crunchyroll.static',
'access_token': 'giKq5eY27ny3cqz',
'referer': f'{self._BASE_URL}/welcome/login'
})
if upsell_response['code'] != 'ok':
raise ExtractorError('Could not get session id')
session_id = upsell_response['data']['session_id']
login_response = self._download_json(
f'{self._API_BASE}/login.1.json', None, 'Logging in',
data=urlencode_postdata({
'account': username,
'password': password,
'session_id': session_id
}))
if login_response['code'] != 'ok':
raise ExtractorError('Login failed. Server message: %s' % login_response['message'], expected=True)
if not self.is_logged_in:
raise ExtractorError('Login succeeded but did not set etp_rt cookie')
def _update_auth(self):
if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_REFRESH > time_seconds():
return
if not CrunchyrollBaseIE._BASIC_AUTH:
cx_api_param = self._CLIENT_ID[self.is_logged_in]
self.write_debug(f'Using cxApiParam={cx_api_param}')
CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode()
auth_headers = {'Authorization': CrunchyrollBaseIE._BASIC_AUTH}
if self.is_logged_in:
grant_type = 'etp_rt_cookie'
else:
grant_type = 'client_id'
auth_headers['ETP-Anonymous-ID'] = uuid.uuid4()
try:
auth_response = self._download_json(
f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}',
headers=auth_headers, data=f'grant_type={grant_type}'.encode())
login_response = self._request_token(
headers={'Authorization': self._BASIC_AUTH}, data={
'username': username,
'password': password,
'grant_type': 'password',
'scope': 'offline_access',
}, note='Logging in', errnote='Failed to log in')
except ExtractorError as error:
if isinstance(error.cause, HTTPError) and error.cause.status == 403:
raise ExtractorError(
'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, '
'then pass the fresh cookies (with --cookies-from-browser or --cookies) '
'and your browser\'s User-Agent (with --user-agent)', expected=True)
if isinstance(error.cause, HTTPError) and error.cause.status == 401:
raise ExtractorError('Invalid username and/or password', expected=True)
raise
CrunchyrollBaseIE._IS_PREMIUM = 'cr_premium' in traverse_obj(auth_response, ('access_token', {jwt_decode_hs256}, 'benefits', ...))
CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']}
CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10)
CrunchyrollBaseIE._REFRESH_TOKEN = login_response['refresh_token']
self.cache.store(self._NETRC_MACHINE, username, CrunchyrollBaseIE._REFRESH_TOKEN)
self._set_auth_info(login_response)
def _update_auth(self):
if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_EXPIRY > time_seconds():
return
auth_headers = {'Authorization': self._BASIC_AUTH}
if CrunchyrollBaseIE._REFRESH_TOKEN:
data = {
'refresh_token': CrunchyrollBaseIE._REFRESH_TOKEN,
'grant_type': 'refresh_token',
'scope': 'offline_access',
}
else:
data = {'grant_type': 'client_id'}
auth_headers['ETP-Anonymous-ID'] = uuid.uuid4()
try:
auth_response = self._request_token(auth_headers, data)
except ExtractorError as error:
username, password = self._get_login_info()
if not username or not isinstance(error.cause, HTTPError) or error.cause.status != 400:
raise
self.to_screen('Refresh token has expired. Re-logging in')
CrunchyrollBaseIE._REFRESH_TOKEN = None
self.cache.store(self._NETRC_MACHINE, username, None)
self._perform_login(username, password)
return
self._set_auth_info(auth_response)
def _locale_from_language(self, language):
config_locale = self._configuration_arg('metadata', ie_key=CrunchyrollBetaIE, casesense=True)
@ -168,7 +177,8 @@ class CrunchyrollBaseIE(InfoExtractor):
self._update_auth()
stream_response = self._download_json(
f'https://cr-play-service.prd.crunchyrollsvc.com/v1/{identifier}/console/switch/play',
display_id, note='Downloading stream info', headers=CrunchyrollBaseIE._AUTH_HEADERS)
display_id, note='Downloading stream info', errnote='Failed to download stream info',
headers=CrunchyrollBaseIE._AUTH_HEADERS)
available_formats = {'': ('', '', stream_response['url'])}
for hardsub_lang, stream in traverse_obj(stream_response, ('hardSubs', {dict.items}, lambda _, v: v[1]['url'])):
@ -383,9 +393,9 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
if not self._IS_PREMIUM and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')):
message = f'This {object_type} is for premium members only'
if self.is_logged_in:
if CrunchyrollBaseIE._REFRESH_TOKEN:
raise ExtractorError(message, expected=True)
self.raise_login_required(message)
self.raise_login_required(message, method='password')
result['formats'], result['subtitles'] = self._extract_stream(internal_id)
@ -575,9 +585,9 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
if not self._IS_PREMIUM and response.get('isPremiumOnly'):
message = f'This {response.get("type") or "media"} is for premium members only'
if self.is_logged_in:
if CrunchyrollBaseIE._REFRESH_TOKEN:
raise ExtractorError(message, expected=True)
self.raise_login_required(message)
self.raise_login_required(message, method='password')
result = self._transform_music_response(response)
result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id)

View File

@ -0,0 +1,104 @@
from .common import InfoExtractor
class RtlPlusPodcastExtractorIE(InfoExtractor):
# _VALID_URL = r'https?://(?:www\.)?plus\.rtl\.de/podcast/(?P<podcastid>\w+)/(?P<id>\w+)'
_VALID_URL = r'https?://(?:www\.)?plus\.rtl\.de/podcast/it-gefluester-swkanad6uxwwh/(?P<id>\w+)'
_TESTS = [{
'url': 'https://plus.rtl.de/podcast/it-gefluester-swkanad6uxwwh/revolution-im-business-kuenstliche-intelligenz-im-fokus-vpyvzbrxkkx39',
'md5': '0051094e27f498c655cf0747f10995f2',
'info_dict': {
# For videos, only the 'id' and 'ext' fields are required to RUN the test:
'id': 'rrn:podcast:external:episode:vpyvzbrxkkx39',
'ext': 'mp3',
# Then if the test run fails, it will output the missing/incorrect fields.
# Properties can be added as:
# * A value, e.g.
'title': 'Revolution im Business: Künstliche Intelligenz im Fokus',
# * MD5 checksum; start the string with 'md5:', e.g.
'description': 'md5:c6f32c36570c3a0a776bfd5ae3ed0e88',
# * A regular expression; start the string with 're:', e.g.
# 'thumbnail': r're:^https?://.*\.jpg$',
# * A count of elements in a list; start the string with 'count:', e.g.
# 'tags': 'count:10',
# * Any Python type, e.g.
# 'view_count': int,
}
}]
def _real_extract(self, url):
video_slug = self._match_id(url)
# TODO: a token can be obtained from https://auth.rtl.de/auth/realms/rtlplus/protocol/openid-connect/token
TOKEN = "eyJhbGciOiJSUzI1NiIsInR5cCIgOiAiSldUIiwia2lkIiA6ICJ4N1RJT2o1bXd3T0daLS1fOVdjcmhDbzdHemVCTDgwOWQxZlByN29wUThBIn0.eyJleHAiOjE3MDU0MzQ5MjksImlhdCI6MTcwNTQyMDUyOSwianRpIjoiZDQ5ZTkyZjgtZWRiZi00NmU4LWIxNTctYTUyOGZlYjU5ZjgyIiwiaXNzIjoiaHR0cHM6Ly9hdXRoLnJ0bC5kZS9hdXRoL3JlYWxtcy9ydGxwbHVzIiwic3ViIjoiNWYyODFmOTAtOWM5OS00MzcwLWFmZDYtMTM1N2ZlMDc2N2YxIiwidHlwIjoiQmVhcmVyIiwiYXpwIjoiYW5vbnltb3VzLXVzZXIiLCJhbGxvd2VkLW9yaWdpbnMiOlsiKiJdLCJzY29wZSI6IiIsImNsaWVudEhvc3QiOiI5MS4xNjQuNS4xOTkiLCJjbGllbnRJZCI6ImFub255bW91cy11c2VyIiwiaXNHdWVzdCI6dHJ1ZSwicGVybWlzc2lvbnMiOnsiZ2VuZXJhbCI6eyJwb3J0YWJpbGl0eSI6ZmFsc2UsImFscGhhViI6dHJ1ZSwibWF4QW1vdW50T2ZQcm9maWxlcyI6NCwibWF4TXBhUHJvZmlsZXMiOjQsInNldFBpbiI6ZmFsc2UsIm1heERvd25sb2FkRGV2aWNlcyI6MCwiYWNjZXNzUHJlU2FsZSI6ZmFsc2V9LCJzdHJlYW1pbmciOnsidm9kQWNjZXNzVG9GcmVlQ29udGVudCI6dHJ1ZSwidm9kQWNjZXNzVG9QYXlDb250ZW50IjpmYWxzZSwibGl2ZXN0cmVhbUFjY2Vzc1RvRnJlZVR2IjpmYWxzZSwibGl2ZXN0cmVhbUFjY2Vzc1RvUGF5VHYiOmZhbHNlLCJsaXZlc3RyZWFtQWNjZXNzVG9GYXN0Ijp0cnVlLCJ2b2RRdWFsaXR5IjoiTE9XIiwibGl2ZVF1YWxpdHkiOiJMT1ciLCJmYXN0UXVhbGl0eSI6IkxPVyIsIm1heFBhcmFsbGVsU3RyZWFtcyI6MSwibGl2ZWV2ZW50QWNjZXNzVG9GcmVlVHYiOnRydWUsImxpdmVldmVudEFjY2Vzc1RvUGF5VHYiOmZhbHNlfSwid2F0Y2hGZWF0dXJlcyI6eyJjb250ZW50RG93bmxvYWQiOmZhbHNlLCJvcmlnaW5hbFZlcnNpb24iOmZhbHNlLCJjb250aW51ZVdhdGNoaW5nIjpmYWxzZSwic2tpcEFkIjpmYWxzZSwiZG9sYnkiOmZhbHNlLCJib29rbWFya1dhdGNoIjpmYWxzZX0sImFkdmVydGlzaW5nIjp7Im1heFByZVJvbGxzIjozLCJtaWRSb2xscyI6dHJ1ZSwicG9zdFJvbGxzIjp0cnVlLCJjaGFwdGVycyI6dHJ1ZSwic3BlY2lhbEFkcyI6ZmFsc2UsImJyZWFrQWRzIjpmYWxzZSwiYWRTY2hlbWUiOiJhZGFfZnJlZSIsInRlZFBheUFkdmVydGlzZW1lbnQiOmZhbHNlfSwibXVzaWMiOnsiYWNjZXNzTXVzaWNDb250ZW50IjpmYWxzZSwiYWNjZXNzTXVzaWNDb250ZW50T3RoZXJQcm9maWxlcyI6ZmFsc2UsImRlZXplck9mZmVyQ29kZSI6LTEsImRlZXplclRyaWFsT2ZmZXJDb2RlIjotMSwiZGVlemVyTWF4UGFyYWxsZWxTdHJlYW1zIjowLCJ2aWV3TXVzaWNDb250ZW50Ijp0cnVlLCJtYXhEZWV6ZXJEb3dubG9hZERldmljZXMiOjAsIm1heERlZXplckRvd25sb2FkRGV2aWNlc090aGVyUHJvZmlsZXMiOjB9LCJwb2RjYXN0cyI6eyJib29rbWFya1BvZGNhc3RzIjpmYWxzZSwiYWNjZXNzRnJlZVBvZGNhc3RzIjp0cnVlLCJhY2Nlc3NQcmVtaXVtUG9kY2FzdHMiOmZhbHNlLCJmb2xsb3dQb2RjYXN0cyI6ZmFsc2UsImRvd25sb2FkUG9kY2FzdHMiOmZhbHNlLCJjb250aW51ZUxpc3RlbmluZ1BvZGNhc3RzIjpmYWxzZX0sInJhZGlvIjp7ImFjY2Vzc1JhZGlvQ29udGVudCI6dHJ1ZX0sIm1hZ2F6aW5lIjp7ImFydGljbGVDcmVkaXRzIjowLCJhY2Nlc3NNYWdhemluZUFydGljbGVzIjpmYWxzZSwiYnJhbmRTdWJzY3JpcHRpb25TbG90cyI6MCwiYm9va21hcmtNYWdhemluZSI6ZmFsc2V9LCJhdWRpb2Jvb2tzIjp7ImNhblJlZGVlbUNyZWRpdCI6ZmFsc2UsImNhblJlZGVlbUNyZWRpdE90aGVyUHJvZmlsZXMiOmZhbHNlLCJhY2Nlc3NEZWV6ZXJBdWRpb2Jvb2tzIjpmYWxzZSwiYWNjZXNzRGVlemVyQXVkaW9ib29rc090aGVyUHJvZmlsZXMiOmZhbHNlLCJhY2Nlc3NQcmhBdWRpb2Jvb2tzIjpmYWxzZSwiYWNjZXNzUHJoQXVkaW9ib29rc090aGVyUHJvZmlsZXMiOmZhbHNlLCJhY2Nlc3NCb3VnaHRQcmhBdWRpb2Jvb2tzIjpmYWxzZSwiYWNjZXNzQm91Z2h0UHJoQXVkaW9ib29rc090aGVyUHJvZmlsZXMiOmZhbHNlLCJwcmhDcmVkaXRzIjowLCJwcmhNYXhQYXJhbGxlbFN0cmVhbXMiOjB9LCJ0b2dnbyI6eyJza2lwQWR2ZXJ0aXNpbmciOmZhbHNlfX0sImNsaWVudEFkZHJlc3MiOiI5MS4xNjQuNS4xOTkifQ.mZTkPjVYFreAK79jqX5jv6Yujh7bPt-nYNRGJpJHyFRUhAn0cywEIvnjXNvHlx2fCE0aGmG4H9tvPX-kyittyi_wANkOEs4DNSI_IwQrCiyXC1kQnQscVkbnXTly1AGHhEtMeCNlf16k8v7CyF-cDTet_1FmKOXdPCMnH3wppJoLjPvP0tadwbF0sFOUuaIn4bnZEkDoF-7S9B-jcHHQ-Z4ZaElqkf4gJ4qZNEuHiYdbw3fPOn6LQHbxSPKIl9rZnUzzOQr-3EkrTFgAdVCPcIqfQR0qRIILw9odxsYRwLAcdy85bbokOEhZ-yrQFkGWccZ_sCeAK96H6LVpWkS5YA"
HEADERS = {
'rtlplus-client-Id': 'rci:rtlplus:web',
'rtlplus-client-Version': '2024.1.16.2',
'Authorization': f'Bearer {TOKEN}'
}
# TODO: get the URL from the webpage
URL = 'https://cdn.gateway.now-plus-prod.aws-cbc.cloud/graphql?operationName=PodcastEpisode&variables=%7B%22id%22:%22vpyvzbrxkkx39%22,%22take%22:1%7D&extensions=%7B%22persistedQuery%22:%7B%22version%22:1,%22sha256Hash%22:%222693e24ad538a69c8698cf1fcbf984cfa49c7592cf5404cb4369167eab694ee0%22%7D%7D'
# https://cdn.gateway.now-plus-prod.aws-cbc.cloud/graphql?operationName=PodcastEpisode&variables=
# {"id":"vpyvzbrxkkx39","take":1}&extensions={"persistedQuery":{"version":1,"sha256Hash":"2693e24ad538a69c8698cf1fcbf984cfa49c7592cf5404cb4369167eab694ee0"}}
webpage = self._download_webpage(URL, video_slug, headers=HEADERS)
# print(webpage)
json = self._parse_json(webpage, video_slug)
assert json
data = json['data']
return {
# TODO: It looks like even though the correct URL is returned, yt-dlp messes up the download.
# It works with wget, so it's not an auth problem.
'url': data['podcastEpisode']['url'],
'id': data['podcastEpisode']['id'],
'title': data['podcastEpisode']['title'],
'description': data['podcastEpisode']['description'],
# TODO more properties (see yt_dlp/extractor/common.py)
}
# example API answer:
# {
# "data": {
# "podcastEpisode": {
# "__typename": "PodcastEpisode",
# "id": "rrn:podcast:external:episode:vpyvzbrxkkx39",
# "title": "Revolution im Business: Künstliche Intelligenz im Fokus",
# "description": "In dieser Folge tauchen wir in die Welt der künstlichen Intelligenz im Unternehmensumfeld ein. Gemeinsam mit SYNAXON-Vorstand Frank Roebers werfen wir einen Blick auf die vielfältigen Anwendungsgebiete von Künstlicher Intelligenz in Unternehmen und diskutieren, wie diese Technologien transformative Veränderungen bewirken können. Die Folge bietet nicht nur einen Überblick über aktuelle KI-Trends, sondern beleuchtet auch die Herausforderungen und ethischen Aspekte, die mit dem verstärkten Einsatz von KI verbunden sind. Frank teilt seine Expertise und Erfahrungen, um uns einen umfassenden Einblick in die Zukunft der Unternehmenswelt durch Künstliche Intelligenz zu geben. Begleitet uns auf dieser spannenden Reise in die Welt der Technologie!",
# "type": "PODCAST_EPISODE",
# "episodeType": "FULL",
# "url": "https://it-gefluester.podcaster.de/it-gefluester/media/Podcast_Frank_Roebers_KI_Im_Unternehmen_122023_-_28-12-23-_22-14.mp3",
# "duration": 2413,
# "releaseDate": "2024-01-15T05:00:00.000Z",
# "episodePosition": "FIRST",
# "mediaTier": "FREE",
# "image": {
# "imageFormat": {
# "url": "https://media.plus.rtl.de/podcast/revolution-im-business-kuenstliche-intelligenz-im-fokus-b2adud99hw2so.png",
# "__typename": "ImageFormat"
# },
# "small": {
# "width": 500,
# "height": 500,
# "url": "https://media.plus.rtl.de/podcast/revolution-im-business-kuenstliche-intelligenz-im-fokus-b2adud99hw2so.png?tr=w-500,h-500",
# "__typename": "ImageFormat"
# },
# "__typename": "UploadFile"
# },
# "seo": {
# "index": true,
# "title": "Revolution im Business: Künstliche Intelligenz im Fokus | RTL+",
# "description": "Revolution im Business: Künstliche Intelligenz im Fokus aus IT-Geflüster ► Alle Folgen auf RTL+ Podcast!",
# "__typename": "Seo"
# },
# "seasonNumber": 1,
# "episodeNumber": 1,
# "canonicalPath": "/podcast/it-gefluester-swkanad6uxwwh/revolution-im-business-kuenstliche-intelligenz-im-fokus-vpyvzbrxkkx39",
# "canonicalUrl": "https://plus.rtl.de/podcast/it-gefluester-swkanad6uxwwh/revolution-im-business-kuenstliche-intelligenz-im-fokus-vpyvzbrxkkx39"
# }
# },
# "extensions": {
# "traceId": "64e4c007ebac8d583fef9f3c3e162921"
# }
# }