From 5fec75c81cf1a83fac9b35af08dc6750dba0e07b Mon Sep 17 00:00:00 2001
From: nixxo <c.nixxo@gmail.com>
Date: Mon, 26 Oct 2020 13:36:29 +0100
Subject: [PATCH 001/124] [mtv] Fix a missing match_id

Fix a problem introduced in 320724f964f09a5e1f08edd246464db4f0d297f9,
which extracts the ID from the URL with self._match_id. The problem is
that the ID is not always present in the URL passed in, so the title
should be extracted as proposed by this fix (just as it is done in
_real_extract, see line 337).
---
 youtube_dlc/extractor/mtv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dlc/extractor/mtv.py b/youtube_dlc/extractor/mtv.py
index 6b3658397..eaf43429f 100644
--- a/youtube_dlc/extractor/mtv.py
+++ b/youtube_dlc/extractor/mtv.py
@@ -300,7 +300,7 @@ def _extract_mgid(self, webpage, url, data_zone=None):
         except RegexNotFoundError:
             mgid = None
 
-        title = self._match_id(url)
+        title = url_basename(url)
 
         try:
             window_data = self._parse_json(self._search_regex(

From 9754a441e39208b2453631de6e7b60bedd971384 Mon Sep 17 00:00:00 2001
From: bopol
Date: Tue, 27 Oct 2020 12:10:52 +0100
Subject: [PATCH 002/124] use webarchive to fix a dead link in README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9d40d2631..5532cd720 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
 
 youtube-dlc - download videos from youtube.com or other video platforms.
 
-youtube-dlc is a fork of youtube-dl with the intention of getting features tested by the community merged in the tool faster, since youtube-dl's development seems to be slowing down. (https://github.com/ytdl-org/youtube-dl/issues/26462)
+youtube-dlc is a fork of youtube-dl with the intention of getting features tested by the community merged in the tool faster, since youtube-dl's development seems to be slowing down. (https://web.archive.org/web/20201014194602/https://github.com/ytdl-org/youtube-dl/issues/26462)
 
 - [INSTALLATION](#installation)
 - [DESCRIPTION](#description)

From 87ab4fb11a70c61e46cd7ee642f830b475b89c93 Mon Sep 17 00:00:00 2001
From: Dan Walker
Date: Tue, 27 Oct 2020 06:33:47 -0700
Subject: [PATCH 003/124] Added DRM logic

When there are no available sources because every source is
DRM-controlled, return a DRM error and don't proceed with trying.

#28 reports that an ExtractorError "No video formats found" is raised.
That is true, because the formats list is empty; however, it is empty
because all the sources are locked. This change gives the end user a
more informative message.

# TESTING

Tried the URL provided in #28 and confirmed a DRM message is returned.
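For reference, the detection reduces to this standalone sketch (a
minimal illustration; "json_data" stands for the Playback API response
and the helper name is made up here):

    def all_sources_drm(json_data):
        # A source that carries a key_systems object is DRM-protected;
        # if every source does, there is nothing left to download.
        sources = json_data.get('sources') or []
        drm_count = sum(1 for source in sources if source.get('key_systems'))
        return bool(sources) and drm_count == len(sources)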
---
 youtube_dlc/extractor/brightcove.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/youtube_dlc/extractor/brightcove.py b/youtube_dlc/extractor/brightcove.py
index 2aa9f4782..638673c31 100644
--- a/youtube_dlc/extractor/brightcove.py
+++ b/youtube_dlc/extractor/brightcove.py
@@ -471,12 +471,17 @@ def _parse_brightcove_metadata(self, json_data, video_id, headers={}):
         title = json_data['name'].strip()
 
         formats = []
+        sources_num = len(json_data.get('sources'))
+        key_systems_present = 0
         for source in json_data.get('sources', []):
             container = source.get('container')
             ext = mimetype2ext(source.get('type'))
             src = source.get('src')
-            # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
-            if ext == 'ism' or container == 'WVM' or source.get('key_systems'):
+            # https://apis.support.brightcove.com/playback/references/playback-api-video-fields-reference.html
+            if source.get('key_systems'):
+                key_systems_present += 1
+                continue
+            elif ext == 'ism' or container == 'WVM':
                 continue
             elif ext == 'm3u8' or container == 'M2TS':
                 if not src:
@@ -533,6 +538,10 @@ def build_format_id(kind):
                     'format_id': build_format_id('rtmp'),
                 })
             formats.append(f)
+
+        if sources_num == key_systems_present:
+            raise ExtractorError('This video is DRM protected', expected=True)
+
         if not formats:
             # for sonyliv.com DRM protected videos
             s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl')

From fb9c36c45f629d4b76a609f05a19a5bef1c7fbad Mon Sep 17 00:00:00 2001
From: amigatomte <48889381+amigatomte@users.noreply.github.com>
Date: Tue, 27 Oct 2020 15:27:51 +0100
Subject: [PATCH 004/124] Update to reflect website changes.

---
 youtube_dlc/extractor/urplay.py | 49 +++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/youtube_dlc/extractor/urplay.py b/youtube_dlc/extractor/urplay.py
index 6030b7cb5..4bc2b78fb 100644
--- a/youtube_dlc/extractor/urplay.py
+++ b/youtube_dlc/extractor/urplay.py
@@ -3,6 +3,7 @@
 
 from .common import InfoExtractor
 from ..utils import unified_timestamp
+import re
 
 
 class URPlayIE(InfoExtractor):
@@ -13,10 +14,10 @@ class URPlayIE(InfoExtractor):
         'info_dict': {
             'id': '203704',
             'ext': 'mp4',
-            'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd',
+            'title': 'Om vetenskap, kritiskt tänkande och motstånd',
             'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a',
-            'timestamp': 1513512768,
-            'upload_date': '20171217',
+            'timestamp': 1513292400,
+            'upload_date': '20171214',
         },
     }, {
         'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde',
@@ -37,35 +38,41 @@ def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        urplayer_data = self._parse_json(self._search_regex(
-            r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id)
+        urplayer_data = re.sub("&quot;", "\"", self._search_regex(
+            r'components\/Player\/Player\" data-react-props=\"({.+?})\"',
+            webpage, 'urplayer data'))
+        urplayer_data = self._parse_json(urplayer_data, video_id)
+        for i in range(len(urplayer_data['accessibleEpisodes'])):
+            if urplayer_data.get('accessibleEpisodes', {})[i].get('id') == int(video_id):
+                urplayer_data = urplayer_data['accessibleEpisodes'][i]
+                break
+
         host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect']
 
-        formats = []
-        for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)):
-            file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr)
-            if file_http:
+        urplayer_streams = urplayer_data.get("streamingInfo")
+        for quality in ('sd'), ('hd'):
+            location = (urplayer_streams.get("raw", {}).get(quality, {}).get("location")
+                        or urplayer_streams.get("sweComplete", {}).get(quality, {}).get("location"))
+            if location:
                 formats.extend(self._extract_wowza_formats(
-                    'http://%s/%splaylist.m3u8' % (host, file_http), video_id, skip_protocols=['rtmp', 'rtsp']))
+                    'http://%s/%s/playlist.m3u8' % (host, location), video_id,
+                    skip_protocols=['f4m', 'rtmp', 'rtsp']))
         self._sort_formats(formats)
 
-        subtitles = {}
-        for subtitle in urplayer_data.get('subtitles', []):
-            subtitle_url = subtitle.get('file')
-            kind = subtitle.get('kind')
-            if not subtitle_url or (kind and kind != 'captions'):
-                continue
-            subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({
-                'url': subtitle_url,
+        subs = urplayer_streams.get("sweComplete", {}).get("tt", {}).get("location")
+        if subs:
+            subtitles.setdefault('Svenska', []).append({
+                'url': subs,
             })
 
         return {
             'id': video_id,
             'title': urplayer_data['title'],
             'description': self._og_search_description(webpage),
-            'thumbnail': urplayer_data.get('image'),
-            'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'), webpage, 'timestamp')),
-            'series': urplayer_data.get('series_title'),
+            'thumbnail': urplayer_data.get('image', {}).get('1280x720'),
+            'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'),
+                                           webpage, 'timestamp')),
+            'series': urplayer_data.get('seriesTitle'),
             'subtitles': subtitles,
             'formats': formats,
         }

From 6f8557ec4db627bdd2fda4f47bc2492a04ce5d0d Mon Sep 17 00:00:00 2001
From: Unknown <blackjack4494@web.de>
Date: Tue, 27 Oct 2020 16:49:42 +0100
Subject: [PATCH 005/124] [skip travis] add note to remove tvland.

---
 youtube_dlc/extractor/tvland.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/youtube_dlc/extractor/tvland.py b/youtube_dlc/extractor/tvland.py
index 791144128..225b6b078 100644
--- a/youtube_dlc/extractor/tvland.py
+++ b/youtube_dlc/extractor/tvland.py
@@ -3,6 +3,8 @@
 
 from .spike import ParamountNetworkIE
 
+# TODO: Remove - Reason not used anymore - Service moved to youtube
+
 
 class TVLandIE(ParamountNetworkIE):
     IE_NAME = 'tvland.com'

From 67b19799a5cc8ab24aa48de66cea4e2ad41315a8 Mon Sep 17 00:00:00 2001
From: Peter Oettig
Date: Tue, 27 Oct 2020 20:39:49 +0100
Subject: [PATCH 006/124] Fixed problem with JS player URL

The JS player URL could not be found anymore, possibly because of a
change on YouTube's side.
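For reference, the widened pattern accepts both the old "assets" shape
and the new "jsUrl" shape. A quick standalone check (the sample JSON
fragments are hand-written for illustration, not captured from a real
watch page):

    import re

    ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))'

    old_page = '"assets":{"css":"/player.css","js":"/s/player/base.js"}'
    new_page = '"jsUrl":"/s/player/base.js"'

    for page in (old_page, new_page):
        match = re.search(ASSETS_RE, page)
        # _search_regex picks the first capture group that is not None
        print(match.group(1) or match.group(2))  # '"/s/player/base.js"' both times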
---
 youtube_dlc/extractor/youtube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py
index 4fb49b864..ccfaa733d 100644
--- a/youtube_dlc/extractor/youtube.py
+++ b/youtube_dlc/extractor/youtube.py
@@ -2051,7 +2051,7 @@ def _extract_filesize(media_url):
 
             if cipher:
                 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
-                    ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
+                    ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))'
                     jsplayer_url_json = self._search_regex(
                         ASSETS_RE,
                         embed_webpage if age_gate else video_webpage,

From 48aac9fc867087095079dee966a1511730d01f6d Mon Sep 17 00:00:00 2001
From: insaneracist <insaneracist@cyberdude.com>
Date: Tue, 27 Oct 2020 19:21:34 -0700
Subject: [PATCH 007/124] [bandcamp] restore album downloads

flake8 conform

---
 youtube_dlc/extractor/bandcamp.py   | 183 +++++++++++++++-------------
 youtube_dlc/extractor/extractors.py |   2 +-
 2 files changed, 102 insertions(+), 83 deletions(-)

diff --git a/youtube_dlc/extractor/bandcamp.py b/youtube_dlc/extractor/bandcamp.py
index 9dbafe86d..8a37e1b94 100644
--- a/youtube_dlc/extractor/bandcamp.py
+++ b/youtube_dlc/extractor/bandcamp.py
@@ -25,7 +25,45 @@
 )
 
 
-class BandcampIE(InfoExtractor):
+class BandcampBaseIE(InfoExtractor):
+    """Provide base functions for Bandcamp extractors"""
+
+    def _extract_json_from_html_data_attribute(self, webpage, suffix, video_id):
+        json_string = self._html_search_regex(
+            r' data-%s="([^"]*)' % suffix,
+            webpage, '%s json' % suffix, default='{}')
+
+        return self._parse_json(json_string, video_id)
+
+    def _parse_json_track(self, json):
+        formats = []
+        file_ = json.get('file')
+        if isinstance(file_, dict):
+            for format_id, format_url in file_.items():
+                if not url_or_none(format_url):
+                    continue
+                ext, abr_str = format_id.split('-', 1)
+                formats.append({
+                    'format_id': format_id,
+                    'url': self._proto_relative_url(format_url, 'http:'),
+                    'ext': ext,
+                    'vcodec': 'none',
+                    'acodec': ext,
+                    'abr': int_or_none(abr_str),
+                })
+
+        return {
+            'duration': float_or_none(json.get('duration')),
+            'id': str_or_none(json.get('track_id') or json.get('id')),
+            'title': json.get('title'),
+            'title_link': json.get('title_link'),
+            'number': int_or_none(json.get('track_num')),
+            'formats': formats
+        }
+
+
+class BandcampIE(BandcampBaseIE):
+    IE_NAME = "Bandcamp:track"
     _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
     _TESTS = [{
         'url': 'http://youtube-dlc.bandcamp.com/track/youtube-dlc-test-song',
@@ -85,52 +123,32 @@ class BandcampIE(InfoExtractor):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         title = mobj.group('title')
+        url_track_title = title
         webpage = self._download_webpage(url, title)
         thumbnail = self._html_search_meta('og:image', webpage, default=None)
 
-        track_id = None
-        track = None
-        track_number = None
-        duration = None
+        json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", url_track_title)
+        json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", url_track_title)
 
-        formats = []
-        trackinfo_block = self._html_search_regex(
-            r'trackinfo(?:["\']|&quot;):\[\s*({.+?})\s*\],(?:["\']|&quot;)',
-            webpage, 'track info', default='{}')
+        json_tracks = json_tralbum.get('trackinfo')
+        if not json_tracks:
+            raise ExtractorError('Could not extract track')
 
-        track_info = self._parse_json(trackinfo_block, title)
-        if track_info:
-            file_ = track_info.get('file')
-            if isinstance(file_, dict):
-                for format_id, format_url in file_.items():
-                    if not url_or_none(format_url):
-                        continue
-                    ext, abr_str = format_id.split('-', 1)
-                    formats.append({
-                        'format_id': format_id,
-                        'url': self._proto_relative_url(format_url, 'http:'),
-                        'ext': ext,
-                        'vcodec': 'none',
-                        'acodec': ext,
-                        'abr': int_or_none(abr_str),
-                    })
+        track = self._parse_json_track(json_tracks[0])
+        artist = json_tralbum.get('artist')
+        album_title = json_embed.get('album_title')
 
-        track_id = str_or_none(track_info.get('track_id') or track_info.get('id'))
-        track_number = int_or_none(track_info.get('track_num'))
-        duration = float_or_none(track_info.get('duration'))
+        json_album = json_tralbum.get('packages')
+        if json_album:
+            json_album = json_album[0]
+            album_publish_date = json_album.get('album_publish_date')
+            album_release_date = json_album.get('album_release_date')
+        else:
+            album_publish_date = None
+            album_release_date = json_tralbum.get('album_release_date')
 
-        def extract(key):
-            data = self._html_search_regex(
-                r',(["\']|&quot;)%s\1:\1(?P<value>(?:\\\1|((?!\1).))+)\1' % key,
-                webpage, key, default=None, group='value')
-            return data.replace(r'\"', '"').replace('\\\\', '\\') if data else data
-
-        track = extract('title')
-        artist = extract('artist')
-        album = extract('album_title')
-        timestamp = unified_timestamp(
-            extract('publish_date') or extract('album_publish_date'))
-        release_date = unified_strdate(extract('album_release_date'))
+        timestamp = unified_timestamp(json_tralbum.get('current', {}).get('publish_date') or album_publish_date)
+        release_date = unified_strdate(album_release_date)
 
         download_link = self._search_regex(
             r'freeDownloadPage(?:["\']|&quot;):\s*(["\']|&quot;)(?P<url>(?:(?!\1).)+)\1', webpage,
@@ -155,8 +173,6 @@ def extract(key):
         if info:
             downloads = info.get('downloads')
             if isinstance(downloads, dict):
-                if not track:
-                    track = info.get('title')
                 if not artist:
                     artist = info.get('artist')
                 if not thumbnail:
@@ -190,7 +206,7 @@ def extract(key):
                     retry_url = url_or_none(stat.get('retry_url'))
                     if not retry_url:
                         continue
-                    formats.append({
+                    track['formats'].append({
                         'url': self._proto_relative_url(retry_url, 'http:'),
                         'ext': download_formats.get(format_id),
                         'format_id': format_id,
@@ -199,32 +215,37 @@ def extract(key):
                         'vcodec': 'none',
                     })
 
-        self._sort_formats(formats)
+        self._sort_formats(track['formats'])
 
-        title = '%s - %s' % (artist, track) if artist else track
-
-        if not duration:
-            duration = float_or_none(self._html_search_meta(
-                'duration', webpage, default=None))
+        title = '%s - %s' % (artist, track.get('title')) if artist else track.get('title')
 
         return {
-            'id': track_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'uploader': artist,
-            'timestamp': timestamp,
-            'release_date': release_date,
-            'duration': duration,
-            'track': track,
-            'track_number': track_number,
-            'track_id': track_id,
+            'album': album_title,
             'artist': artist,
-            'album': album,
-            'formats': formats,
+            'duration': track['duration'],
+            'formats': track['formats'],
+            'id': track['id'],
+            'release_date': release_date,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'title': title,
+            'track': track['title'],
+            'track_id': track['id'],
+            'track_number': track['number'],
+            'uploader': artist
        }


-class BandcampAlbumIE(InfoExtractor):
+class BandcampAlbumTrackIE(BandcampIE):
+    IE_NAME = "Bandcamp:albumtrack"
+    """Hack class to force album downloads to have prefixed track numbers by default"""
+    def _real_extract(self, url):
+        data = super()._real_extract(url)
+        data['title'] = '{:02d} - {} - {}'.format(data['track_number'], data['artist'], data['track'])
+        return data
+
+
+class BandcampAlbumIE(BandcampBaseIE):
     IE_NAME = 'Bandcamp:album'
     _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
 
@@ -305,34 +326,32 @@ def _real_extract(self, url):
         album_id = mobj.group('album_id')
         playlist_id = album_id or uploader_id
         webpage = self._download_webpage(url, playlist_id)
-        track_elements = re.findall(
-            r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage)
-        if not track_elements:
-            raise ExtractorError('The page doesn\'t contain any tracks')
+
+        json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", playlist_id)
+        json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", playlist_id)
+
+        json_tracks = json_tralbum.get('trackinfo')
+        if not json_tracks:
+            raise ExtractorError('Could not extract album tracks')
+
+        album_title = json_embed.get('album_title')
 
         # Only tracks with duration info have songs
+        tracks = [self._parse_json_track(track) for track in json_tracks]
         entries = [
             self.url_result(
-                compat_urlparse.urljoin(url, t_path),
-                ie=BandcampIE.ie_key(),
-                video_title=self._search_regex(
-                    r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
-                    elem_content, 'track title', fatal=False))
-            for elem_content, t_path in track_elements
-            if self._html_search_meta('duration', elem_content, default=None)]
-
-        title = self._html_search_regex(
-            r'album_title\s*(?:&quot;|["\']):\s*(&quot;|["\'])(?P<album>(?:\\\1|((?!\1).))+)\1',
-            webpage, 'title', fatal=False, group='album')
-
-        if title:
-            title = title.replace(r'\"', '"')
+                compat_urlparse.urljoin(url, track['title_link']),
+                ie=BandcampAlbumTrackIE.ie_key(),
+                video_title=track['title'])
+            for track in tracks
+            if track.get('duration')]
 
         return {
             '_type': 'playlist',
             'uploader_id': uploader_id,
             'id': playlist_id,
-            'title': title,
-            'entries': entries,
+            'title': album_title,
+            'entries': entries
         }

diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py
index d31edd7c8..fbd4ed1e3 100644
--- a/youtube_dlc/extractor/extractors.py
+++ b/youtube_dlc/extractor/extractors.py
@@ -84,7 +84,7 @@
 )
 from .azmedien import AZMedienIE
 from .baidu import BaiduVideoIE
-from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE
+from .bandcamp import BandcampIE, BandcampAlbumTrackIE, BandcampAlbumIE, BandcampWeeklyIE
 from .bbc import (
     BBCCoUkIE,
     BBCCoUkArticleIE,

From 3467b3e28f30bf53ca8355361806bbc74ccf2435 Mon Sep 17 00:00:00 2001
From: Unknown <blackjack4494@web.de>
Date: Wed, 28 Oct 2020 12:18:04 +0100
Subject: [PATCH 008/124] [skip travis][bandcamp] fix minor typo in tests

---
 youtube_dlc/extractor/bandcamp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dlc/extractor/bandcamp.py b/youtube_dlc/extractor/bandcamp.py
index 8a37e1b94..dbf96f33d 100644
--- a/youtube_dlc/extractor/bandcamp.py
+++ b/youtube_dlc/extractor/bandcamp.py
@@ -66,7 +66,7 @@ class BandcampIE(BandcampBaseIE):
     IE_NAME = "Bandcamp:track"
     _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
     _TESTS = [{
-        'url': 'http://youtube-dlc.bandcamp.com/track/youtube-dlc-test-song',
+        'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
         'md5': 'c557841d5e50261777a6585648adf439',
         'info_dict': {
             'id': '1812978515',

From 079a941282648b0278933e57adc4a77c3e95b86e Mon Sep 17 00:00:00 2001
From: Unknown <blackjack4494@web.de>
Date: Wed, 28 Oct 2020 12:25:49 +0100
Subject: [PATCH 009/124] [mtv] add match_id reminder

---
 youtube_dlc/extractor/mtv.py | 1 +
 1 file 
changed, 1 insertion(+) diff --git a/youtube_dlc/extractor/mtv.py b/youtube_dlc/extractor/mtv.py index eaf43429f..feb442377 100644 --- a/youtube_dlc/extractor/mtv.py +++ b/youtube_dlc/extractor/mtv.py @@ -300,6 +300,7 @@ def _extract_mgid(self, webpage, url, data_zone=None): except RegexNotFoundError: mgid = None + # TODO: ideally use self._match_id(url) title = url_basename(url) try: From 6c6ee4905f386f4280d9c07de50eafd3a797c306 Mon Sep 17 00:00:00 2001 From: nixxo <c.nixxo@gmail.com> Date: Wed, 28 Oct 2020 13:48:22 +0100 Subject: [PATCH 010/124] [mtv] proposed fix --- youtube_dlc/extractor/mtv.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dlc/extractor/mtv.py b/youtube_dlc/extractor/mtv.py index feb442377..04cc95b6a 100644 --- a/youtube_dlc/extractor/mtv.py +++ b/youtube_dlc/extractor/mtv.py @@ -289,7 +289,7 @@ def _extract_new_triforce_mgid(self, webpage, url='', video_id=None): return mgid - def _extract_mgid(self, webpage, url, data_zone=None): + def _extract_mgid(self, webpage, url, title=None, data_zone=None): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf # or http://media.mtvnservices.com/{mgid} @@ -300,8 +300,8 @@ def _extract_mgid(self, webpage, url, data_zone=None): except RegexNotFoundError: mgid = None - # TODO: ideally use self._match_id(url) - title = url_basename(url) + if not title: + title = url_basename(url) try: window_data = self._parse_json(self._search_regex( @@ -337,7 +337,7 @@ def _extract_mgid(self, webpage, url, data_zone=None): def _real_extract(self, url): title = url_basename(url) webpage = self._download_webpage(url, title) - mgid = self._extract_mgid(webpage, url) + mgid = self._extract_mgid(webpage, url, title=title) videos_info = self._get_videos_info(mgid, url=url) return videos_info From cf553deceb0e4d8f1fa6c66a7eabfe9a0f04343c Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel <github@tom-oliver.eu> Date: Wed, 28 Oct 2020 15:18:13 +0100 Subject: [PATCH 011/124] [skip travis] update travis badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5532cd720..08bddaa18 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![Build Status](https://travis-ci.com/blackjack4494/youtube-dlc.svg?branch=master)](https://travis-ci.com/blackjack4494/youtube-dlc) +[![Build Status](https://travis-ci.com/blackjack4494/yt-dlc.svg?branch=master)](https://travis-ci.com/blackjack4494/yt-dlc) [![PyPi](https://img.shields.io/pypi/v/youtube-dlc.svg)](https://pypi.org/project/youtube-dlc) [![Downloads](https://pepy.tech/badge/youtube-dlc)](https://pepy.tech/project/youtube-dlc) From ccec6955f34ce9933e48562e8538bbf98247c050 Mon Sep 17 00:00:00 2001 From: insaneracist <insaneracist@cyberdude.com> Date: Wed, 28 Oct 2020 08:12:58 -0700 Subject: [PATCH 012/124] [bandcamp] fix failing test. 
remove subclass hack --- youtube_dlc/extractor/bandcamp.py | 11 +---------- youtube_dlc/extractor/extractors.py | 2 +- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/youtube_dlc/extractor/bandcamp.py b/youtube_dlc/extractor/bandcamp.py index dbf96f33d..7d29481c0 100644 --- a/youtube_dlc/extractor/bandcamp.py +++ b/youtube_dlc/extractor/bandcamp.py @@ -236,15 +236,6 @@ def _real_extract(self, url): } -class BandcampAlbumTrackIE(BandcampIE): - IE_NAME = "Bandcamp:albumtrack" - """Hack class to force album downloads to have prefixed track numbers by default""" - def _real_extract(self, url): - data = super()._real_extract(url) - data['title'] = '{:02d} - {} - {}'.format(data['track_number'], data['artist'], data['track']) - return data - - class BandcampAlbumIE(BandcampBaseIE): IE_NAME = 'Bandcamp:album' _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' @@ -341,7 +332,7 @@ def _real_extract(self, url): entries = [ self.url_result( compat_urlparse.urljoin(url, track['title_link']), - ie=BandcampAlbumTrackIE.ie_key(), + ie=BandcampIE.ie_key(), video_title=track['title']) for track in tracks if track.get('duration')] diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index fbd4ed1e3..d31edd7c8 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -84,7 +84,7 @@ ) from .azmedien import AZMedienIE from .baidu import BaiduVideoIE -from .bandcamp import BandcampIE, BandcampAlbumTrackIE, BandcampAlbumIE, BandcampWeeklyIE +from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE from .bbc import ( BBCCoUkIE, BBCCoUkArticleIE, From c3c18d7b8a035c0099499147be5fcfe5f603e072 Mon Sep 17 00:00:00 2001 From: nixxo <c.nixxo@gmail.com> Date: Wed, 28 Oct 2020 16:55:58 +0100 Subject: [PATCH 013/124] [skyitalia] Add new extractor --- youtube_dlc/extractor/extractors.py | 4 + youtube_dlc/extractor/skyitalia.py | 119 ++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 youtube_dlc/extractor/skyitalia.py diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index d31edd7c8..a0c7d0f42 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -1037,6 +1037,10 @@ SkyNewsIE, SkySportsIE, ) +from .skyitalia import ( + SkyArteItaliaIE, + SkyItaliaIE, +) from .slideshare import SlideshareIE from .slideslive import SlidesLiveIE from .slutload import SlutloadIE diff --git a/youtube_dlc/extractor/skyitalia.py b/youtube_dlc/extractor/skyitalia.py new file mode 100644 index 000000000..d9c35c3a1 --- /dev/null +++ b/youtube_dlc/extractor/skyitalia.py @@ -0,0 +1,119 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class SkyItaliaBaseIE(InfoExtractor): + _GET_VIDEO_DATA = 'https://apid.sky.it/vdp/v1/getVideoData?token={token}&caller=sky&rendition=web&id={id}' + _TOKEN = 'F96WlOd8yoFmLQgiqv6fNQRvHZcsWk5jDaYnDvhbiJk' + _RES = { + 'low': [426, 240], + 'med': [640, 360], + 'high': [854, 480], + 'hd': [1280, 720] + } + + def _extract_video_id(self, url): + webpage = self._download_webpage(url, 'skyitalia') + video_id = self._html_search_regex( + [r'data-videoid=\"(\d+)\"', + r'http://player\.sky\.it/social\?id=(\d+)\&'], + webpage, 'video_id') + if video_id: + return video_id + raise ExtractorError('Video ID not found.') + + def _get_formats(self, video_id, token=_TOKEN): + data_url = 
self._GET_VIDEO_DATA.replace('{id}', video_id) + data_url = data_url.replace('{token}', token) + video_data = self._parse_json( + self._download_webpage(data_url, video_id), + video_id) + + formats = [] + for q, r in self._RES.items(): + key = 'web_' + q + '_url' + if key not in video_data: + continue + formats.append({ + 'url': video_data[key], + 'format_id': q, + 'width': r[0], + 'height': r[1] + }) + + self._sort_formats(formats) + title = video_data.get('title') + thumb = video_data.get('thumb') + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumb, + 'formats': formats + } + + def _real_extract(self, url): + video_id = self._match_id(url) + if video_id == 'None': + video_id = self._extract_video_id(url) + return self._get_formats(video_id, self._TOKEN) + + +class SkyItaliaIE(SkyItaliaBaseIE): + IE_NAME = 'sky.it' + _VALID_URL = r'''(?x)https?:// + (?P<ie>sport|tg24|video) + \.sky\.it/(?:.+?) + (?P<id>[0-9]{6})? + (?:$|\?)''' + + _TESTS = [{ + 'url': 'https://video.sky.it/sport/motogp/video/motogp-gp-emilia-romagna-highlights-prove-libere-616162', + 'md5': '9c03b590b06e5952d8051f0e02b0feca', + 'info_dict': { + 'id': '616162', + 'ext': 'mp4', + 'title': 'MotoGP, GP Emilia Romagna: gli highlights delle prove libere', + 'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/18/1600441214452_hl-libere-motogp-misano2_5602634_thumbnail_1.jpg', + } + }, { + 'url': 'https://sport.sky.it/motogp/2020/09/18/motogp-gp-emilia-romagna-misano-2020-prove-libere-diretta', + 'md5': '9c03b590b06e5952d8051f0e02b0feca', + 'info_dict': { + 'id': '616162', + 'ext': 'mp4', + 'title': 'MotoGP, GP Emilia Romagna: gli highlights delle prove libere', + 'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/18/1600441214452_hl-libere-motogp-misano2_5602634_thumbnail_1.jpg', + } + }, { + 'url': 'https://tg24.sky.it/salute-e-benessere/2020/09/18/coronavirus-vaccino-ue-sanofi', + 'md5': 'caa25e62dadb529bc5e0b078da99f854', + 'info_dict': { + 'id': '615904', + 'ext': 'mp4', + 'title': 'Covid-19, al Buzzi di Milano tamponi drive-in per studenti', + 'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/17/1600351405841_error-coronavirus-al-buzzi-di-milano-tamponi_thumbnail_1.jpg', + } + }, { + 'url': 'https://video.sky.it/sport/motogp/video/motogp-gp-emilia-romagna-highlights-prove-libere-616162?itm_source=parsely-api', + 'only_matching': True, + }] + + +class SkyArteItaliaIE(SkyItaliaBaseIE): + IE_NAME = 'arte.sky.it' + _VALID_URL = r'https?://arte\.sky\.it/video/.+?(?P<id>[0-9]{6})?$' + _TEST = { + 'url': 'https://arte.sky.it/video/federico-fellini-maestri-cinema/', + 'md5': '2f22513a89f45142f2746f878d690647', + 'info_dict': { + 'id': '612888', + 'ext': 'mp4', + 'title': 'I maestri del cinema Federico Felini', + 'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/03/1599146747305_i-maestri-del-cinema-federico-felini_thumbnail_1.jpg', + } + } + _TOKEN = 'LWk29hfiU39NNdq87ePeRach3nzTSV20o0lTv2001Cd' From 0b72c2bc31a3400134182154f23fdd494c838a5a Mon Sep 17 00:00:00 2001 From: nixxo <c.nixxo@gmail.com> Date: Wed, 28 Oct 2020 17:04:36 +0100 Subject: [PATCH 014/124] [skyitalia] removed arbitrary parameter --- youtube_dlc/extractor/skyitalia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/skyitalia.py b/youtube_dlc/extractor/skyitalia.py index d9c35c3a1..262701c6b 100644 --- a/youtube_dlc/extractor/skyitalia.py +++ b/youtube_dlc/extractor/skyitalia.py @@ -25,7 +25,7 @@ def _extract_video_id(self, url): return video_id raise 
ExtractorError('Video ID not found.') - def _get_formats(self, video_id, token=_TOKEN): + def _get_formats(self, video_id, token): data_url = self._GET_VIDEO_DATA.replace('{id}', video_id) data_url = data_url.replace('{token}', token) video_data = self._parse_json( From 81a20463a44d1039729ed5611d39d0bcb4abeb73 Mon Sep 17 00:00:00 2001 From: nixxo <c.nixxo@gmail.com> Date: Wed, 28 Oct 2020 17:06:49 +0100 Subject: [PATCH 015/124] [skyitalia] moved token --- youtube_dlc/extractor/skyitalia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/skyitalia.py b/youtube_dlc/extractor/skyitalia.py index 262701c6b..a4e894044 100644 --- a/youtube_dlc/extractor/skyitalia.py +++ b/youtube_dlc/extractor/skyitalia.py @@ -7,7 +7,6 @@ class SkyItaliaBaseIE(InfoExtractor): _GET_VIDEO_DATA = 'https://apid.sky.it/vdp/v1/getVideoData?token={token}&caller=sky&rendition=web&id={id}' - _TOKEN = 'F96WlOd8yoFmLQgiqv6fNQRvHZcsWk5jDaYnDvhbiJk' _RES = { 'low': [426, 240], 'med': [640, 360], @@ -101,6 +100,7 @@ class SkyItaliaIE(SkyItaliaBaseIE): 'url': 'https://video.sky.it/sport/motogp/video/motogp-gp-emilia-romagna-highlights-prove-libere-616162?itm_source=parsely-api', 'only_matching': True, }] + _TOKEN = 'F96WlOd8yoFmLQgiqv6fNQRvHZcsWk5jDaYnDvhbiJk' class SkyArteItaliaIE(SkyItaliaBaseIE): From a85e131b48ac618c9b5bd82a0ed5e288d095fb47 Mon Sep 17 00:00:00 2001 From: nixxo <c.nixxo@gmail.com> Date: Wed, 28 Oct 2020 20:32:28 +0100 Subject: [PATCH 016/124] [rcs] Add new extractor --- youtube_dlc/extractor/extractors.py | 6 + youtube_dlc/extractor/generic.py | 7 + youtube_dlc/extractor/rcs.py | 408 ++++++++++++++++++++++++++++ 3 files changed, 421 insertions(+) create mode 100644 youtube_dlc/extractor/rcs.py diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index d31edd7c8..c3b76f039 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -936,6 +936,12 @@ RayWenderlichCourseIE, ) from .rbmaradio import RBMARadioIE +from .rcs import ( + CorriereIE, + GazzettaIE, + RCSEmbedsIE, + RCSVariousIE, +) from .rds import RDSIE from .redbulltv import ( RedBullTVIE, diff --git a/youtube_dlc/extractor/generic.py b/youtube_dlc/extractor/generic.py index aba06b328..1641934f4 100644 --- a/youtube_dlc/extractor/generic.py +++ b/youtube_dlc/extractor/generic.py @@ -119,6 +119,7 @@ from .zype import ZypeIE from .odnoklassniki import OdnoklassnikiIE from .kinja import KinjaEmbedIE +from .rcs import RCSEmbedsIE class GenericIE(InfoExtractor): @@ -3213,6 +3214,12 @@ def _real_extract(self, url): return self.playlist_from_matches( zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) + # Look for RCS media group embeds + rcs_urls = RCSEmbedsIE._extract_urls(webpage) + if rcs_urls: + return self.playlist_from_matches( + rcs_urls, video_id, video_title, ie=RCSEmbedsIE.ie_key()) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: diff --git a/youtube_dlc/extractor/rcs.py b/youtube_dlc/extractor/rcs.py new file mode 100644 index 000000000..183c14d64 --- /dev/null +++ b/youtube_dlc/extractor/rcs.py @@ -0,0 +1,408 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + ExtractorError, + js_to_json, + base_url, + url_basename, + urljoin, +) + + +class RCSIE(InfoExtractor): + _ALL_REPLACE = { + 'media2vam.corriere.it.edgesuite.net': + 'media2vam-corriere-it.akamaized.net', + 
'media.youreporter.it.edgesuite.net': + 'media-youreporter-it.akamaized.net', + 'corrierepmd.corriere.it.edgesuite.net': + 'corrierepmd-corriere-it.akamaized.net', + 'media2vam-corriere-it.akamaized.net/fcs.quotidiani/vr/videos/': + 'video.corriere.it/vr360/videos/', + '.net//': '.net/', + } + _MP4_REPLACE = { + 'media2vam.corbologna.corriere.it.edgesuite.net': + 'media2vam-bologna-corriere-it.akamaized.net', + 'media2vam.corfiorentino.corriere.it.edgesuite.net': + 'media2vam-fiorentino-corriere-it.akamaized.net', + 'media2vam.cormezzogiorno.corriere.it.edgesuite.net': + 'media2vam-mezzogiorno-corriere-it.akamaized.net', + 'media2vam.corveneto.corriere.it.edgesuite.net': + 'media2vam-veneto-corriere-it.akamaized.net', + 'media2.oggi.it.edgesuite.net': + 'media2-oggi-it.akamaized.net', + 'media2.quimamme.it.edgesuite.net': + 'media2-quimamme-it.akamaized.net', + 'media2.amica.it.edgesuite.net': + 'media2-amica-it.akamaized.net', + 'media2.living.corriere.it.edgesuite.net': + 'media2-living-corriere-it.akamaized.net', + 'media2.style.corriere.it.edgesuite.net': + 'media2-style-corriere-it.akamaized.net', + 'media2.iodonna.it.edgesuite.net': + 'media2-iodonna-it.akamaized.net', + 'media2.leitv.it.edgesuite.net': + 'media2-leitv-it.akamaized.net', + } + _MIGRATION_MAP = { + 'videoamica-vh.akamaihd': 'amica', + 'media2-amica-it.akamaized': 'amica', + 'corrierevam-vh.akamaihd': 'corriere', + 'media2vam-corriere-it.akamaized': 'corriere', + 'cormezzogiorno-vh.akamaihd': 'corrieredelmezzogiorno', + 'media2vam-mezzogiorno-corriere-it.akamaized': 'corrieredelmezzogiorno', + 'corveneto-vh.akamaihd': 'corrieredelveneto', + 'media2vam-veneto-corriere-it.akamaized': 'corrieredelveneto', + 'corbologna-vh.akamaihd': 'corrieredibologna', + 'media2vam-bologna-corriere-it.akamaized': 'corrieredibologna', + 'corfiorentino-vh.akamaihd': 'corrierefiorentino', + 'media2vam-fiorentino-corriere-it.akamaized': 'corrierefiorentino', + 'corinnovazione-vh.akamaihd': 'corriereinnovazione', + 'media2-gazzanet-gazzetta-it.akamaized': 'gazzanet', + 'videogazzanet-vh.akamaihd': 'gazzanet', + 'videogazzaworld-vh.akamaihd': 'gazzaworld', + 'gazzettavam-vh.akamaihd': 'gazzetta', + 'media2vam-gazzetta-it.akamaized': 'gazzetta', + 'videoiodonna-vh.akamaihd': 'iodonna', + 'media2-leitv-it.akamaized': 'leitv', + 'videoleitv-vh.akamaihd': 'leitv', + 'videoliving-vh.akamaihd': 'living', + 'media2-living-corriere-it.akamaized': 'living', + 'media2-oggi-it.akamaized': 'oggi', + 'videooggi-vh.akamaihd': 'oggi', + 'media2-quimamme-it.akamaized': 'quimamme', + 'quimamme-vh.akamaihd': 'quimamme', + 'videorunning-vh.akamaihd': 'running', + 'media2-style-corriere-it.akamaized': 'style', + 'style-vh.akamaihd': 'style', + 'videostyle-vh.akamaihd': 'style', + 'media2-stylepiccoli-it.akamaized': 'stylepiccoli', + 'stylepiccoli-vh.akamaihd': 'stylepiccoli', + 'doveviaggi-vh.akamaihd': 'viaggi', + 'media2-doveviaggi-it.akamaized': 'viaggi', + 'media2-vivimilano-corriere-it.akamaized': 'vivimilano', + 'vivimilano-vh.akamaihd': 'vivimilano', + 'media2-youreporter-it.akamaized': 'youreporter' + } + _MIGRATION_MEDIA = { + 'advrcs-vh.akamaihd': '', + 'corriere-f.akamaihd': '', + 'corrierepmd-corriere-it.akamaized': '', + 'corrprotetto-vh.akamaihd': '', + 'gazzetta-f.akamaihd': '', + 'gazzettapmd-gazzetta-it.akamaized': '', + 'gazzprotetto-vh.akamaihd': '', + 'periodici-f.akamaihd': '', + 'periodicisecure-vh.akamaihd': '', + 'videocoracademy-vh.akamaihd': '' + } + + def _get_video_src(self, video): + mediaFiles = video['mediaProfile']['mediaFile'] + 
src = {} + # audio + if video['mediaType'] == 'AUDIO': + for aud in mediaFiles: + # todo: check + src['mp3'] = aud['value'] + # video + else: + for vid in mediaFiles: + if vid['mimeType'] == 'application/vnd.apple.mpegurl': + src['m3u8'] = vid['value'] + if vid['mimeType'] == 'video/mp4': + src['mp4'] = vid['value'] + + # replace host + for t in src: + for s, r in self._ALL_REPLACE.items(): + src[t] = src[t].replace(s, r) + for s, r in self._MP4_REPLACE.items(): + src[t] = src[t].replace(s, r) + + # switch cdn + if 'mp4' in src and 'm3u8' in src: + if '-lh.akamaihd' not in src['m3u8'] and 'akamai' in src['mp4']: + if 'm3u8' in src: + matches = re.search(r'(?:https*:)?\/\/(?P<host>.*)\.net\/i(?P<path>.*)$', src['m3u8']) + src['m3u8'] = 'https://vod.rcsobjects.it/hls/%s%s' % ( + self._MIGRATION_MAP[matches.group('host')], + matches.group('path').replace( + '///', '/').replace( + '//', '/').replace( + '.csmil', '.urlset' + ) + ) + if 'mp4' in src: + matches = re.search(r'(?:https*:)?\/\/(?P<host>.*)\.net\/i(?P<path>.*)$', src['mp4']) + if matches: + if matches.group('host') in self._MIGRATION_MEDIA: + vh_stream = 'https://media2.corriereobjects.it' + if src['mp4'].find('fcs.quotidiani_!'): + vh_stream = 'https://media2-it.corriereobjects.it' + src['mp4'] = '%s%s' % ( + vh_stream, + matches.group('path').replace( + '///', '/').replace( + '//', '/').replace( + '/fcs.quotidiani/mediacenter', '').replace( + '/fcs.quotidiani_!/mediacenter', '').replace( + 'corriere/content/mediacenter/', '').replace( + 'gazzetta/content/mediacenter/', '') + ) + else: + src['mp4'] = 'https://vod.rcsobjects.it/%s%s' % ( + self._MIGRATION_MAP[matches.group('host')], + matches.group('path').replace('///', '/').replace('//', '/') + ) + + if 'mp3' in src: + src['mp3'] = src['mp3'].replace( + 'media2vam-corriere-it.akamaized.net', + 'vod.rcsobjects.it/corriere') + if 'mp4' in src: + if src['mp4'].find('fcs.quotidiani_!'): + src['mp4'] = src['mp4'].replace('vod.rcsobjects', 'vod-it.rcsobjects') + if 'm3u8' in src: + if src['m3u8'].find('fcs.quotidiani_!'): + src['m3u8'] = src['m3u8'].replace('vod.rcsobjects', 'vod-it.rcsobjects') + + if 'geoblocking' in video['mediaProfile']: + if 'm3u8' in src: + src['m3u8'] = src['m3u8'].replace('vod.rcsobjects', 'vod-it.rcsobjects') + if 'mp4' in src: + src['mp4'] = src['mp4'].replace('vod.rcsobjects', 'vod-it.rcsobjects') + if 'm3u8' in src: + if src['m3u8'].find('csmil') and src['m3u8'].find('vod'): + src['m3u8'] = src['m3u8'].replace('.csmil', '.urlset') + + return src + + def _create_formats(self, urls, video_id): + formats = [] + formats = self._extract_m3u8_formats( + urls['m3u8'], video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + + if not formats: + formats.append({ + 'format_id': 'http-mp4', + 'url': urls['mp4'] + }) + self._sort_formats(formats) + return formats + + def _real_extract(self, url): + video_id = self._match_id(url) + mobj = re.search(self._VALID_URL, url).groupdict() + + if not mobj['cdn']: + raise ExtractorError('CDN not found in url: %s' % url) + + # for leitv/youreporter/viaggi don't use the embed page + if (mobj['cdn'] not in ['leitv.it', 'youreporter.it']) and (mobj['vid'] == 'video'): + url = 'https://video.%s/video-embed/%s' % (mobj['cdn'], video_id) + + page = self._download_webpage(url, video_id) + + video_data = None + # look for json video data url + json = self._search_regex( + r'''var url\s*=\s*["']((?:https?:)?//video\.rcs\.it/fragment-includes/video-includes/.+?\.json)["'];''', + page, video_id, default=None) + if json: 
+ if json.startswith('//'): + json = 'https:%s' % json + video_data = self._download_json(json, video_id) + + # if url not found, look for json video data directly in the page + else: + json = self._search_regex( + r'[\s;]video\s*=\s*({[\s\S]+?})(?:;|,playlist=)', + page, video_id, default=None) + if json: + video_data = self._parse_json( + json, video_id, transform_source=js_to_json) + else: + # if no video data found try search for iframes + emb = RCSEmbedsIE._extract_url(page) + if emb: + return self._real_extract(emb) + + if not video_data: + raise ExtractorError('Video data not found in the page') + + formats = self._create_formats( + self._get_video_src(video_data), video_id) + + return { + 'id': video_id, + 'title': video_data['title'], + 'description': video_data['description'] or clean_html(video_data['htmlDescription']), + 'uploader': video_data['provider'] if video_data['provider'] else mobj['cdn'], + 'formats': formats + } + + +class RCSEmbedsIE(RCSIE): + IE_NAME = 'rcs:rcs' + _VALID_URL = r'''(?x) + https?://(?P<vid>video)\. + (?P<cdn> + (?: + rcs| + (?:corriere\w+\.)?corriere| + (?:gazzanet\.)?gazzetta + )\.it) + /video-embed/(?P<id>[^/=&\?]+?)(?:$|\?)''' + _TESTS = [{ + 'url': 'https://video.rcs.it/video-embed/iodonna-0001585037', + 'md5': '623ecc8ffe7299b2d0c1046d8331a9df', + 'info_dict': { + 'id': 'iodonna-0001585037', + 'ext': 'mp4', + 'title': 'Sky Arte racconta Madonna nella serie "Artist to icon"', + 'description': 'md5:65b09633df9ffee57f48b39e34c9e067', + 'uploader': 'rcs.it', + } + }, { + 'url': 'https://video.corriere.it/video-embed/b727632a-f9d0-11ea-91b0-38d50a849abb?player', + 'match_only': True + }, { + 'url': 'https://video.gazzetta.it/video-embed/49612410-00ca-11eb-bcd8-30d4253e0140', + 'match_only': True + }] + + @staticmethod + def _sanitize_urls(urls): + # add protocol if missing + for i, e in enumerate(urls): + if e.startswith('//'): + urls[i] = 'https:%s' % e + # clean iframes urls + for i, e in enumerate(urls): + urls[i] = urljoin(base_url(e), url_basename(e)) + return urls + + @staticmethod + def _extract_urls(webpage): + entries = [ + mobj.group('url') + for mobj in re.finditer(r'''(?x) + (?: + data-frame-src=| + <iframe[^\n]+src= + ) + (["']) + (?P<url>(?:https?:)?//video\. + (?: + rcs| + (?:corriere\w+\.)?corriere| + (?:gazzanet\.)?gazzetta + ) + \.it/video-embed/.+?) + \1''', webpage)] + return RCSEmbedsIE._sanitize_urls(entries) + + @staticmethod + def _extract_url(webpage): + urls = RCSEmbedsIE._extract_urls(webpage) + return urls[0] if urls else None + + +class CorriereIE(RCSIE): + IE_NAME = 'rcs:corriere' + _VALID_URL = r'''(?x)https?://(?P<vid>video|viaggi)\. + (?P<cdn> + (?: + corrieredelmezzogiorno\.| + corrieredelveneto\.| + corrieredibologna\.| + corrierefiorentino\. + )? 
+ corriere\.it)/.+?/(?P<id>[^/]+)(?=\?|/$|$)''' + _TESTS = [{ + 'url': 'https://video.corriere.it/sport/formula-1/vettel-guida-ferrari-sf90-mugello-suo-fianco-c-elecrerc-bendato-video-esilarante/b727632a-f9d0-11ea-91b0-38d50a849abb', + 'md5': '0f4ededc202b0f00b6e509d831e2dcda', + 'info_dict': { + 'id': 'b727632a-f9d0-11ea-91b0-38d50a849abb', + 'ext': 'mp4', + 'title': 'Vettel guida la Ferrari SF90 al Mugello e al suo fianco c\'è Leclerc (bendato): il video è esilarante', + 'description': 'md5:93b51c9161ac8a64fb2f997b054d0152', + 'uploader': 'Corriere Tv', + } + }, { + 'url': 'https://viaggi.corriere.it/video/norvegia-il-nuovo-ponte-spettacolare-sopra-la-cascata-di-voringsfossen/', + 'md5': 'da378e4918d2afbf7d61c35abb948d4c', + 'info_dict': { + 'id': '5b7cd134-e2c1-11ea-89b3-b56dd0df2aa2', + 'ext': 'mp4', + 'title': 'La nuova spettacolare attrazione in Norvegia: il ponte sopra Vøringsfossen', + 'description': 'md5:18b35a291f6746c0c8dacd16e5f5f4f8', + 'uploader': 'DOVE Viaggi', + } + }, { + 'url': 'https://video.corriere.it/video-embed/b727632a-f9d0-11ea-91b0-38d50a849abb?player', + 'match_only': True + }, { + 'url': 'https://video.corriere.it/video-360/metro-copenaghen-tutta-italiana/a248a7f0-e2db-11e9-9830-af2de6b1f945', + 'match_only': True + }] + + +class GazzettaIE(RCSIE): + IE_NAME = 'rcs:gazzetta' + _VALID_URL = r'https?://(?P<vid>video)\.(?P<cdn>(?:gazzanet\.)?gazzetta\.it)/.+?/(?P<id>[^/]+?)(?:$|\?)' + _TESTS = [{ + 'url': 'https://video.gazzetta.it/video-motogp-catalogna-cadute-dovizioso-vale-rossi/49612410-00ca-11eb-bcd8-30d4253e0140?vclk=Videobar', + 'md5': 'eedc1b5defd18e67383afef51ff7bdf9', + 'info_dict': { + 'id': '49612410-00ca-11eb-bcd8-30d4253e0140', + 'ext': 'mp4', + 'title': 'Dovizioso, il contatto con Zarco e la caduta. E anche Vale finisce a terra', + 'description': 'md5:8c6e905dc3b9413218beca11ebd69778', + 'uploader': 'AMorici', + } + }, { + 'url': 'https://video.gazzetta.it/video-embed/49612410-00ca-11eb-bcd8-30d4253e0140', + 'match_only': True + }, { + 'url': 'https://video.gazzanet.gazzetta.it/video-embed/gazzanet-mo05-0000260789', + 'match_only': True + }] + + +class RCSVariousIE(RCSIE): + IE_NAME = 'rcs:various' + _VALID_URL = r'''(?x)https?://www\. 
+ (?P<cdn> + leitv\.it| + youreporter\.it + )/(?:video/)?(?P<id>[^/]+?)(?:$|\?|/)''' + _TESTS = [{ + 'url': 'https://www.leitv.it/video/marmellata-di-ciliegie-fatta-in-casa/', + 'md5': '618aaabac32152199c1af86784d4d554', + 'info_dict': { + 'id': 'marmellata-di-ciliegie-fatta-in-casa', + 'ext': 'mp4', + 'title': 'Marmellata di ciliegie fatta in casa', + 'description': 'md5:89133864d6aad456dbcf6e7a29f86263', + 'uploader': 'leitv.it', + } + }, { + 'url': 'https://www.youreporter.it/fiume-sesia-3-ottobre-2020/', + 'md5': '8dccd436b47a830bab5b4a88232f391a', + 'info_dict': { + 'id': 'fiume-sesia-3-ottobre-2020', + 'ext': 'mp4', + 'title': 'Fiume Sesia 3 ottobre 2020', + 'description': 'md5:0070eef1cc884d13c970a4125063de55', + 'uploader': 'youreporter.it', + } + }] From 576d233fe67641da24b15751fc2a0e5c1e787034 Mon Sep 17 00:00:00 2001 From: insaneracist <insaneracist@cyberdude.com> Date: Wed, 28 Oct 2020 13:48:55 -0700 Subject: [PATCH 017/124] [xtube] fix extractor (#17) --- youtube_dlc/extractor/xtube.py | 47 +++++++++++++++------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/youtube_dlc/extractor/xtube.py b/youtube_dlc/extractor/xtube.py index 01b253dcb..081c5e2e7 100644 --- a/youtube_dlc/extractor/xtube.py +++ b/youtube_dlc/extractor/xtube.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, js_to_json, orderedSet, @@ -33,27 +34,11 @@ class XTubeIE(InfoExtractor): 'title': 'strange erotica', 'description': 'contains:an ET kind of thing', 'uploader': 'greenshowers', - 'duration': 450, + 'duration': 449, 'view_count': int, 'comment_count': int, 'age_limit': 18, } - }, { - # FLV videos with duplicated formats - 'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752', - 'md5': 'a406963eb349dd43692ec54631efd88b', - 'info_dict': { - 'id': '9299752', - 'display_id': 'A-Super-Run-Part-1-YT', - 'ext': 'flv', - 'title': 'A Super Run - Part 1 (YT)', - 'description': 'md5:4cc3af1aa1b0413289babc88f0d4f616', - 'uploader': 'tshirtguy59', - 'duration': 579, - 'view_count': int, - 'comment_count': int, - 'age_limit': 18, - }, }, { # new URL schema 'url': 'http://www.xtube.com/video-watch/strange-erotica-625837', @@ -89,16 +74,24 @@ def _real_extract(self, url): title, thumbnail, duration = [None] * 3 - config = self._parse_json(self._search_regex( - r'playerConf\s*=\s*({.+?})\s*,\s*\n', webpage, 'config', - default='{}'), video_id, transform_source=js_to_json, fatal=False) - if config: - config = config.get('mainRoll') - if isinstance(config, dict): - title = config.get('title') - thumbnail = config.get('poster') - duration = int_or_none(config.get('duration')) - sources = config.get('sources') or config.get('format') + json_config_string = self._search_regex( + r'playerConf=({.+?}),loaderConf', + webpage, 'config', default=None) + if not json_config_string: + raise ExtractorError("Could not extract video player data") + + json_config_string = json_config_string.replace("!0", "true").replace("!1", "false") + + config = self._parse_json(json_config_string, video_id, transform_source=js_to_json, fatal=False) + if not config: + raise ExtractorError("Could not extract video player data") + + config = config.get('mainRoll') + if isinstance(config, dict): + title = config.get('title') + thumbnail = config.get('poster') + duration = int_or_none(config.get('duration')) + sources = config.get('sources') or config.get('format') if not isinstance(sources, dict): sources = self._parse_json(self._search_regex( From 
139e10ad9815f134588d89d27f570a3ee54f79dc Mon Sep 17 00:00:00 2001
From: insaneracist <insaneracist@cyberdude.com>
Date: Wed, 28 Oct 2020 16:55:47 -0700
Subject: [PATCH 018/124] [newgrounds] fix: video download

---
 youtube_dlc/extractor/newgrounds.py | 103 +++++++++++++++++++---------
 1 file changed, 69 insertions(+), 34 deletions(-)

diff --git a/youtube_dlc/extractor/newgrounds.py b/youtube_dlc/extractor/newgrounds.py
index 82e7cf522..b9f01235f 100644
--- a/youtube_dlc/extractor/newgrounds.py
+++ b/youtube_dlc/extractor/newgrounds.py
@@ -4,6 +4,7 @@
 
 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError,
     extract_attributes,
     int_or_none,
     parse_duration,
@@ -20,22 +21,22 @@ class NewgroundsIE(InfoExtractor):
         'info_dict': {
             'id': '549479',
             'ext': 'mp3',
-            'title': 'B7 - BusMode',
+            'title': 'Burn7 - B7 - BusMode',
             'uploader': 'Burn7',
             'timestamp': 1378878540,
             'upload_date': '20130911',
             'duration': 143,
         },
     }, {
-        'url': 'https://www.newgrounds.com/portal/view/673111',
-        'md5': '3394735822aab2478c31b1004fe5e5bc',
+        'url': 'https://www.newgrounds.com/portal/view/1',
+        'md5': 'fbfb40e2dc765a7e830cb251d370d981',
         'info_dict': {
-            'id': '673111',
+            'id': '1',
             'ext': 'mp4',
-            'title': 'Dancin',
-            'uploader': 'Squirrelman82',
-            'timestamp': 1460256780,
-            'upload_date': '20160410',
+            'title': 'Brian-Beaton - Scrotum 1',
+            'uploader': 'Brian-Beaton',
+            'timestamp': 955064100,
+            'upload_date': '20000406',
         },
     }, {
         # source format unavailable, additional mp4 formats
@@ -43,7 +44,7 @@ class NewgroundsIE(InfoExtractor):
         'info_dict': {
             'id': '689400',
             'ext': 'mp4',
-            'title': 'ZTV News Episode 8',
+            'title': 'Bennettthesage - ZTV News Episode 8',
             'uploader': 'BennettTheSage',
             'timestamp': 1487965140,
             'upload_date': '20170224',
@@ -55,42 +56,73 @@ def _real_extract(self, url):
 
     def _real_extract(self, url):
         media_id = self._match_id(url)
-
+        formats = []
+        uploader = None
         webpage = self._download_webpage(url, media_id)
 
         title = self._html_search_regex(
             r'<title>([^>]+)</title>', webpage, 'title')
 
-        media_url = self._parse_json(self._search_regex(
-            r'"url"\s*:\s*("[^"]+"),', webpage, ''), media_id)
+        media_url_string = self._search_regex(
+            r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None, fatal=False)
 
-        formats = [{
-            'url': media_url,
-            'format_id': 'source',
-            'quality': 1,
-        }]
+        if media_url_string:
+            media_url = self._parse_json(media_url_string, media_id)
+            formats = [{
+                'url': media_url,
+                'format_id': 'source',
+                'quality': 1,
+            }]
 
-        max_resolution = int_or_none(self._search_regex(
-            r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution',
-            default=None))
-        if max_resolution:
-            url_base = media_url.rpartition('.')[0]
-            for resolution in (360, 720, 1080):
-                if resolution > max_resolution:
-                    break
-                formats.append({
-                    'url': '%s.%dp.mp4' % (url_base, resolution),
-                    'format_id': '%dp' % resolution,
-                    'height': resolution,
-                })
+            max_resolution = int_or_none(self._search_regex(
+                r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution',
+                default=None))
+            if max_resolution:
+                url_base = media_url.rpartition('.')[0]
+                for resolution in (360, 720, 1080):
+                    if resolution > max_resolution:
+                        break
+                    formats.append({
+                        'url': '%s.%dp.mp4' % (url_base, resolution),
+                        'format_id': '%dp' % resolution,
+                        'height': resolution,
+                    })
+        else:
+            video_id = int_or_none(self._search_regex(
+                r'data-movie-id=\\"([0-9]+)\\"', webpage, ''))
+            if not video_id:
+                raise ExtractorError('Could not extract media data')
+
+            url_video_data = 'https://www.newgrounds.com/portal/video/%s' % video_id
+            headers = {
+                'Accept': 'application/json',
+                'Referer': url,
+                'X-Requested-With': 'XMLHttpRequest'
+            }
+            json_video = self._download_json(url_video_data, video_id, headers=headers, fatal=False)
+            if not json_video:
+                raise ExtractorError('Could not fetch media data')
+
+            uploader = json_video.get('author')
+            title = json_video.get('title')
+            media_formats = json_video.get('sources', [])
+            for media_format in media_formats:
+                media_sources = media_formats[media_format]
+                for source in media_sources:
+                    formats.append({
+                        'format_id': media_format,
+                        'quality': int_or_none(media_format[:-1]),
+                        'url': source.get('src')
+                    })
 
         self._check_formats(formats, media_id)
         self._sort_formats(formats)
 
-        uploader = self._html_search_regex(
-            (r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*Author\s*</em>',
-             r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader',
-            fatal=False)
+        if not uploader:
+            uploader = self._html_search_regex(
+                (r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*(?:Author|Artist)\s*</em>',
+                 r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader',
+                fatal=False)
 
         timestamp = unified_timestamp(self._html_search_regex(
             (r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)',
@@ -109,6 +141,9 @@ def _real_extract(self, url):
         if '<dd>Song' in webpage:
             formats[0]['vcodec'] = 'none'
 
+        if uploader:
+            title = "%s - %s" % (uploader, title)
+
         return {
             'id': media_id,
             'title': title,

From 0704d2224b328caeafbce6a029904472628d12bd Mon Sep 17 00:00:00 2001
From: Unknown <blackjack4494@web.de>
Date: Thu, 29 Oct 2020 01:56:55 +0100
Subject: [PATCH 019/124] [core] be able to hand over id and title using url_result

---
 youtube_dlc/YoutubeDL.py          | 9 +++++++--
 youtube_dlc/extractor/bandcamp.py | 2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py
index fc351db0d..f959a4e47 100644
--- a/youtube_dlc/YoutubeDL.py
+++ b/youtube_dlc/YoutubeDL.py
@@ -801,7 +801,7 @@ def add_extra_info(info_dict, extra_info):
         for key, value in extra_info.items():
             info_dict.setdefault(key, value)
 
-    def extract_info(self, url, download=True, ie_key=None, extra_info={},
+    def extract_info(self, url, download=True, ie_key=None, info_dict=None, extra_info={},
                      process=True, force_generic_extractor=False):
         '''
         Returns a list with a dictionary for each video we find.
@@ -836,6 +836,11 @@ def extract_info(self, url, download=True, ie_key=None, extra_info={},
                         '_type': 'compat_list',
                         'entries': ie_result,
                     }
+                if info_dict:
+                    if info_dict.get('id'):
+                        ie_result['id'] = info_dict['id']
+                    if info_dict.get('title'):
+                        ie_result['title'] = info_dict['title']
                 self.add_default_extra_info(ie_result, ie, url)
                 if process:
                     return self.process_ie_result(ie_result, download, extra_info)
@@ -898,7 +903,7 @@ def process_ie_result(self, ie_result, download=True, extra_info={}):
             # We have to add extra_info to the results because it may be
             # contained in a playlist
             return self.extract_info(ie_result['url'],
-                                     download,
+                                     download, info_dict=ie_result,
                                      ie_key=ie_result.get('ie_key'),
                                      extra_info=extra_info)
         elif result_type == 'url_transparent':

diff --git a/youtube_dlc/extractor/bandcamp.py b/youtube_dlc/extractor/bandcamp.py
index 7d29481c0..0e7492764 100644
--- a/youtube_dlc/extractor/bandcamp.py
+++ b/youtube_dlc/extractor/bandcamp.py
@@ -332,7 +332,7 @@ def _real_extract(self, url):
         entries = [
             self.url_result(
                 compat_urlparse.urljoin(url, track['title_link']),
-                ie=BandcampIE.ie_key(),
+                ie=BandcampIE.ie_key(), video_id=track['id'],
                 video_title=track['title'])
             for track in tracks
             if track.get('duration')]

From 4932ba4aecf653166f04211680a48624b48f030f Mon Sep 17 00:00:00 2001
From: Unknown <blackjack4494@web.de>
Date: Thu, 29 Oct 2020 02:57:43 +0100
Subject: [PATCH 020/124] [yt_live_chat] deactivate for now.

--- youtube_dlc/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index ccfaa733d..5fd22081a 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -1366,14 +1366,14 @@ def _get_subtitles(self, video_id, webpage, has_live_chat_replay): 'ext': ext, }) sub_lang_list[lang] = sub_formats - if has_live_chat_replay: + """ if has_live_chat_replay: sub_lang_list['live_chat'] = [ { 'video_id': video_id, 'ext': 'json', 'protocol': 'youtube_live_chat_replay', }, - ] + ] """ if not sub_lang_list: self._downloader.report_warning('video doesn\'t have subtitles') return {} From bb8a73a0e2b5e4118a32dee9e3c30737107ed90b Mon Sep 17 00:00:00 2001 From: bopol Date: Wed, 28 Oct 2020 21:57:58 +0100 Subject: [PATCH 021/124] [nitter] Add new extractor --- docs/supportedsites.md | 1 + youtube_dlc/extractor/extractors.py | 1 + youtube_dlc/extractor/nitter.py | 167 ++++++++++++++++++++++++++++ 3 files changed, 169 insertions(+) create mode 100644 youtube_dlc/extractor/nitter.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c46d122ff..3b98e7a12 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -582,6 +582,7 @@ # Supported sites - **niconico**: ニコニコ動画 - **NiconicoPlaylist** - **Nintendo** + - **Nitter** - **njoy**: N-JOY - **njoy:embed** - **NJPWWorld**: 新日本プロレスワールド diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index d31edd7c8..1dc2ab34c 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -751,6 +751,7 @@ from .ninegag import NineGagIE from .ninenow import NineNowIE from .nintendo import NintendoIE +from .nitter import NitterIE from .njpwworld import NJPWWorldIE from .nobelprize import NobelPrizeIE from .noco import NocoIE diff --git a/youtube_dlc/extractor/nitter.py b/youtube_dlc/extractor/nitter.py new file mode 100644 index 000000000..3191543ed --- /dev/null +++ b/youtube_dlc/extractor/nitter.py @@ -0,0 +1,167 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + parse_count, + unified_strdate, + unified_timestamp, + remove_end, + determine_ext, +) +import re + + +class NitterIE(InfoExtractor): + # Taken from https://github.com/zedeus/nitter/wiki/Instances + INSTANCES = ('nitter.net', + 'nitter.snopyta.org', + 'nitter.42l.fr', + 'nitter.nixnet.services', + 'nitter.13ad.de', + 'nitter.pussthecat.org', + 'nitter.mastodont.cat', + 'nitter.dark.fail', + 'nitter.tedomum.net', + 'nitter.cattube.org', + 'nitter.fdn.fr', + 'nitter.1d4.us', + 'nitter.kavin.rocks', + 'tweet.lambda.dance', + 'nitter.cc', + 'nitter.weaponizedhumiliation.com', + '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion', + 'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion', + 'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion') + + _INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')' + _VALID_URL = r'https?://%(instance)s/(?P.+)/status/(?P[0-9]+)(#.)?' 
+    current_instance = INSTANCES[0]  # the test and official instance
+    _TESTS = [
+        {
+            # GIF (wrapped in mp4)
+            'url': 'https://' + current_instance + '/firefox/status/1314279897502629888#m',
+            'info_dict': {
+                'id': '1314279897502629888',
+                'ext': 'mp4',
+                'title': 'Firefox 🔥 - You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg #UnfckTheInternet',
+                'description': 'You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg #UnfckTheInternet',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'uploader': 'Firefox 🔥',
+                'uploader_id': 'firefox',
+                'uploader_url': 'https://' + current_instance + '/firefox',
+                'upload_date': '20201008',
+                'timestamp': 1602183720,
+            },
+        }, {  # normal video
+            'url': 'https://' + current_instance + '/Le___Doc/status/1299715685392756737#m',
+            'info_dict': {
+                'id': '1299715685392756737',
+                'ext': 'mp4',
+                'title': 'Le Doc - "Je ne prédis jamais rien" D Raoult, Août 2020...',
+                'description': '"Je ne prédis jamais rien" D Raoult, Août 2020...',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'uploader': 'Le Doc',
+                'uploader_id': 'Le___Doc',
+                'uploader_url': 'https://' + current_instance + '/Le___Doc',
+                'upload_date': '20200829',
+                'timestamp': 1598711341,
+                'view_count': int,
+                'like_count': int,
+                'repost_count': int,
+                'comment_count': int,
+            },
+        }, {  # video embed in a "Streaming Political Ads" box
+            'url': 'https://' + current_instance + '/mozilla/status/1321147074491092994#m',
+            'info_dict': {
+                'id': '1321147074491092994',
+                'ext': 'mp4',
+                'title': "Mozilla - Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows? This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. Learn more ➡️ https://mzl.la/StreamingAds",
+                'description': "Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows? This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. Learn more ➡️ https://mzl.la/StreamingAds",
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'uploader': 'Mozilla',
+                'uploader_id': 'mozilla',
+                'uploader_url': 'https://' + current_instance + '/mozilla',
+                'upload_date': '20201027',
+                'timestamp': 1603820982
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        parsed_url = compat_urlparse.urlparse(url)
+        base_url = parsed_url.scheme + '://' + parsed_url.netloc
+
+        self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = base_url + self._html_search_regex(r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')
+        ext = determine_ext(video_url)
+
+        if ext == 'unknown_video':
+            formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
+        else:
+            formats = [{
+                'url': video_url,
+                'ext': ext
+            }]
+
+        title = (
+            self._og_search_description(webpage).replace('\n', ' ')
+            or self._html_search_regex(r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title'))
+        description = title
+
+        mobj = re.match(self._VALID_URL, url)
+        uploader_id = (
+            mobj.group('uploader_id')
+            or self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False))
+
+        if uploader_id:
+            uploader_url = base_url + '/' + uploader_id
+
+        uploader = self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
+
+        if uploader:
+            title = uploader + ' - ' + title
+
+        view_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-play[^>]*></span>\s([^<]+)', webpage, 'view count', fatal=False))
+        like_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-heart[^>]*></span>\s([^<]+)', webpage, 'like count', fatal=False))
+        repost_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-retweet[^>]*></span>\s([^<]+)', webpage, 'repost count', fatal=False))
+        comment_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-comment[^>]*></span>\s([^<]+)', webpage, 'repost count', fatal=False))
+
+        thumbnail = base_url + (self._html_search_meta('og:image', webpage, 'thumbnail url')
+                                or self._html_search_regex(r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False))
+
+        thumbnail = remove_end(thumbnail, '%3Asmall')  # if parsed with regex, it should contain this
+
+        thumbnails = []
+        thumbnail_ids = ('thumb', 'small', 'large', 'medium', 'orig')
+        for id in thumbnail_ids:
+            thumbnails.append({
+                'id': id,
+                'url': thumbnail + '%3A' + id,
+            })
+
+        date = self._html_search_regex(r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"', webpage, 'upload date', fatal=False)
+        upload_date = unified_strdate(date)
+        timestamp = unified_timestamp(date)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'uploader': uploader,
+            'timestamp': timestamp,
+            'uploader_id': uploader_id,
+            'uploader_url': uploader_url,
+            'view_count': view_count,
+            'like_count': like_count,
+            'repost_count': repost_count,
+            'comment_count': comment_count,
+            'formats': formats,
+            'thumbnails': thumbnails,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+        }

From 508649e6f5f4d1153fe20fd5b9d327c881604bc4 Mon Sep 17 00:00:00 2001
From: nixxo
Date: Thu, 29 Oct 2020 13:31:12 +0100
Subject: [PATCH 022/124] [rcs] fixed coding conventions

---
 youtube_dlc/extractor/rcs.py | 72 ++++++++++++++++++++----------------
 1 file changed, 40 insertions(+), 32 deletions(-)

diff --git a/youtube_dlc/extractor/rcs.py b/youtube_dlc/extractor/rcs.py
index 183c14d64..8dbd9913b 100644
--- a/youtube_dlc/extractor/rcs.py
+++ b/youtube_dlc/extractor/rcs.py
@@ -104,20 +104,20 @@ class RCSIE(InfoExtractor):
     }

     def _get_video_src(self, video):
-        mediaFiles = video['mediaProfile']['mediaFile']
+        mediaFiles = video.get('mediaProfile').get('mediaFile')
         src = {}
         # audio
-        if video['mediaType'] == 'AUDIO':
+        if video.get('mediaType') == 'AUDIO':
             for aud in mediaFiles:
                 # todo: check
-                src['mp3'] = aud['value']
+                src['mp3'] = aud.get('value')
         # video
         else:
             for vid in mediaFiles:
-                if vid['mimeType'] == 'application/vnd.apple.mpegurl':
-                    src['m3u8'] = vid['value']
-                if vid['mimeType'] == 'video/mp4':
-                    src['mp4'] = vid['value']
+                if vid.get('mimeType') == 'application/vnd.apple.mpegurl':
+                    src['m3u8'] = vid.get('value')
+                if vid.get('mimeType') == 'video/mp4':
+                    src['mp4'] = vid.get('value')

         # replace host
         for t in src:
@@ -128,9 +128,10 @@

         # switch cdn
         if 'mp4' in src and 'm3u8' in src:
-            if '-lh.akamaihd' not in src['m3u8'] and 'akamai' in src['mp4']:
+            if ('-lh.akamaihd' not in src.get('m3u8')
+                    and 'akamai' in src.get('mp4')):
                 if 'm3u8' in src:
-                    matches = re.search(r'(?:https*:)?\/\/(?P<host>.*)\.net\/i(?P<path>.*)$', src['m3u8'])
+                    matches = re.search(r'(?:https*:)?\/\/(?P<host>.*)\.net\/i(?P<path>.*)$', src.get('m3u8'))
                     src['m3u8'] = 'https://vod.rcsobjects.it/hls/%s%s' % (
                         self._MIGRATION_MAP[matches.group('host')],
                         matches.group('path').replace(
                             'media2vam',
                             'corriere'
                         )
                     )
                 if 'mp4' in src:
-                    matches = re.search(r'(?:https*:)?\/\/(?P<host>.*)\.net\/i(?P<path>.*)$', src['mp4'])
+                    matches = re.search(r'(?:https*:)?\/\/(?P<host>.*)\.net\/i(?P<path>.*)$', src.get('mp4'))
                     if
matches: if matches.group('host') in self._MIGRATION_MEDIA: vh_stream = 'https://media2.corriereobjects.it' - if src['mp4'].find('fcs.quotidiani_!'): + if src.get('mp4').find('fcs.quotidiani_!'): vh_stream = 'https://media2-it.corriereobjects.it' src['mp4'] = '%s%s' % ( vh_stream, @@ -163,65 +164,68 @@ def _get_video_src(self, video): ) if 'mp3' in src: - src['mp3'] = src['mp3'].replace( + src['mp3'] = src.get('mp3').replace( 'media2vam-corriere-it.akamaized.net', 'vod.rcsobjects.it/corriere') if 'mp4' in src: - if src['mp4'].find('fcs.quotidiani_!'): - src['mp4'] = src['mp4'].replace('vod.rcsobjects', 'vod-it.rcsobjects') + if src.get('mp4').find('fcs.quotidiani_!'): + src['mp4'] = src.get('mp4').replace('vod.rcsobjects', 'vod-it.rcsobjects') if 'm3u8' in src: - if src['m3u8'].find('fcs.quotidiani_!'): - src['m3u8'] = src['m3u8'].replace('vod.rcsobjects', 'vod-it.rcsobjects') + if src.get('m3u8').find('fcs.quotidiani_!'): + src['m3u8'] = src.get('m3u8').replace('vod.rcsobjects', 'vod-it.rcsobjects') - if 'geoblocking' in video['mediaProfile']: + if 'geoblocking' in video.get('mediaProfile'): if 'm3u8' in src: - src['m3u8'] = src['m3u8'].replace('vod.rcsobjects', 'vod-it.rcsobjects') + src['m3u8'] = src.get('m3u8').replace('vod.rcsobjects', 'vod-it.rcsobjects') if 'mp4' in src: - src['mp4'] = src['mp4'].replace('vod.rcsobjects', 'vod-it.rcsobjects') + src['mp4'] = src.get('mp4').replace('vod.rcsobjects', 'vod-it.rcsobjects') if 'm3u8' in src: - if src['m3u8'].find('csmil') and src['m3u8'].find('vod'): - src['m3u8'] = src['m3u8'].replace('.csmil', '.urlset') + if src.get('m3u8').find('csmil') and src.get('m3u8').find('vod'): + src['m3u8'] = src.get('m3u8').replace('.csmil', '.urlset') return src def _create_formats(self, urls, video_id): formats = [] formats = self._extract_m3u8_formats( - urls['m3u8'], video_id, 'mp4', entry_protocol='m3u8_native', + urls.get('m3u8'), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) if not formats: formats.append({ 'format_id': 'http-mp4', - 'url': urls['mp4'] + 'url': urls.get('mp4') }) self._sort_formats(formats) return formats def _real_extract(self, url): video_id = self._match_id(url) - mobj = re.search(self._VALID_URL, url).groupdict() + mobj = re.search(self._VALID_URL, url) - if not mobj['cdn']: + if 'cdn' not in mobj.groupdict(): raise ExtractorError('CDN not found in url: %s' % url) # for leitv/youreporter/viaggi don't use the embed page - if (mobj['cdn'] not in ['leitv.it', 'youreporter.it']) and (mobj['vid'] == 'video'): - url = 'https://video.%s/video-embed/%s' % (mobj['cdn'], video_id) + if ((mobj.group('cdn') not in ['leitv.it', 'youreporter.it']) + and (mobj.group('vid') == 'video')): + url = 'https://video.%s/video-embed/%s' % (mobj.group('cdn'), video_id) page = self._download_webpage(url, video_id) video_data = None # look for json video data url json = self._search_regex( - r'''var url\s*=\s*["']((?:https?:)?//video\.rcs\.it/fragment-includes/video-includes/.+?\.json)["'];''', + r'''(?x)var url\s*=\s*["']((?:https?:)? 
+                //video\.rcs\.it
+                /fragment-includes/video-includes/.+?\.json)["'];''',
             page, video_id, default=None)
         if json:
             if json.startswith('//'):
                 json = 'https:%s' % json
             video_data = self._download_json(json, video_id)
-        # if url not found, look for json video data directly in the page
+        # if json url not found, look for json video data directly in the page
         else:
             json = self._search_regex(
                 r'[\s;]video\s*=\s*({[\s\S]+?})(?:;|,playlist=)',
@@ -241,11 +245,15 @@ def _real_extract(self, url):
             formats = self._create_formats(
                 self._get_video_src(video_data), video_id)

+        description = (video_data.get('description')
+                       or clean_html(video_data.get('htmlDescription')))
+        uploader = video_data.get('provider') or mobj.group('cdn')
+
         return {
             'id': video_id,
-            'title': video_data['title'],
-            'description': video_data['description'] or clean_html(video_data['htmlDescription']),
-            'uploader': video_data['provider'] if video_data['provider'] else mobj['cdn'],
+            'title': video_data.get('title'),
+            'description': description,
+            'uploader': uploader,
             'formats': formats
         }

From 920ad13673b4f60274fe132bf17d8019011dfc9b Mon Sep 17 00:00:00 2001
From: nixxo
Date: Thu, 29 Oct 2020 13:37:07 +0100
Subject: [PATCH 023/124] [skyitalia] fixed coding conventions

---
 youtube_dlc/extractor/skyitalia.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/youtube_dlc/extractor/skyitalia.py b/youtube_dlc/extractor/skyitalia.py
index a4e894044..3c7bd465d 100644
--- a/youtube_dlc/extractor/skyitalia.py
+++ b/youtube_dlc/extractor/skyitalia.py
@@ -33,11 +33,11 @@ def _get_formats(self, video_id, token):

         formats = []
         for q, r in self._RES.items():
-            key = 'web_' + q + '_url'
+            key = 'web_%s_url' % q
             if key not in video_data:
                 continue
             formats.append({
-                'url': video_data[key],
+                'url': video_data.get(key),
                 'format_id': q,
                 'width': r[0],
                 'height': r[1]

From 9322f1162de5d5f4fc7e911acdce782ccb943712 Mon Sep 17 00:00:00 2001
From: insaneracist
Date: Thu, 29 Oct 2020 14:37:06 -0700
Subject: [PATCH 024/124] [youtube] fix: extract artist metadata from
 ytInitialData (#49)

---
 youtube_dlc/extractor/youtube.py | 46 ++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py
index 5fd22081a..c75a7edae 100644
--- a/youtube_dlc/extractor/youtube.py
+++ b/youtube_dlc/extractor/youtube.py
@@ -1406,6 +1406,44 @@ def _get_yt_initial_data(self, video_id, webpage):
         return self._parse_json(
             uppercase_escape(config), video_id, fatal=False)

+    def _get_music_metadata_from_yt_initial(self, yt_initial):
+        music_metadata = []
+        key_map = {
+            'Album': 'album',
+            'Artist': 'artist',
+            'Song': 'track'
+        }
+        contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'])
+        if type(contents) is list:
+            for content in contents:
+                music_track = {}
+                if type(content) is not dict:
+                    continue
+                videoSecondaryInfoRenderer = try_get(content, lambda x: x['videoSecondaryInfoRenderer'])
+                if type(videoSecondaryInfoRenderer) is not dict:
+                    continue
+                rows = try_get(videoSecondaryInfoRenderer, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'])
+                if type(rows) is not list:
+                    continue
+                for row in rows:
+                    metadataRowRenderer = try_get(row, lambda x: x['metadataRowRenderer'])
+                    if type(metadataRowRenderer) is not dict:
+                        continue
+                    key = try_get(metadataRowRenderer, lambda x: x['title']['simpleText'])
+                    value = try_get(metadataRowRenderer, lambda x: x['contents'][0]['simpleText']) or \
+
try_get(metadataRowRenderer, lambda x: x['contents'][0]['runs'][0]['text']) + if type(key) is not str or type(value) is not str: + continue + if key in key_map: + if key_map[key] in music_track: + # we've started on a new track + music_metadata.append(music_track) + music_track = {} + music_track[key_map[key]] = value + if len(music_track.keys()): + music_metadata.append(music_track) + return music_metadata + def _get_automatic_captions(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" @@ -2328,6 +2366,14 @@ def extract_meta(field): if release_year: release_year = int(release_year) + yt_initial = self._get_yt_initial_data(video_id, video_webpage) + if yt_initial: + music_metadata = self._get_music_metadata_from_yt_initial(yt_initial) + if len(music_metadata): + album = music_metadata[0].get('album') + artist = music_metadata[0].get('artist') + track = music_metadata[0].get('track') + m_episode = re.search( r']+id="watch7-headline"[^>]*>\s*]*>.*?>(?P[^<]+)\s*S(?P\d+)\s*•\s*E(?P\d+)', video_webpage) From 5b0a6a801084cced4b71c255270f53c881203ca8 Mon Sep 17 00:00:00 2001 From: insaneracist Date: Thu, 29 Oct 2020 16:11:14 -0700 Subject: [PATCH 025/124] [youtube] fix: extract mix playlist ids from ytInitialData (#33) --- youtube_dlc/extractor/youtube.py | 35 ++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 5fd22081a..0354866ef 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -279,6 +279,15 @@ def _download_webpage_handle(self, *args, **kwargs): return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) + def _get_yt_initial_data(self, video_id, webpage): + config = self._search_regex( + (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});', + r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'), + webpage, 'ytInitialData', default=None) + if config: + return self._parse_json( + uppercase_escape(config), video_id, fatal=False) + def _real_initialize(self): if self._downloader is None: return @@ -1397,15 +1406,6 @@ def _get_ytplayer_config(self, video_id, webpage): return self._parse_json( uppercase_escape(config), video_id, fatal=False) - def _get_yt_initial_data(self, video_id, webpage): - config = self._search_regex( - (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});', - r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'), - webpage, 'ytInitialData', default=None) - if config: - return self._parse_json( - uppercase_escape(config), video_id, fatal=False) - def _get_automatic_captions(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" @@ -2765,6 +2765,16 @@ def extract_videos_from_page(self, page): return zip(ids_in_page, titles_in_page) + def _extract_mix_ids_from_yt_initial(self, yt_initial): + ids = [] + playlist_contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents']) + if type(playlist_contents) is list: + for item in playlist_contents: + videoId = try_get(item, lambda x: x['playlistPanelVideoRenderer']['videoId']) + if type(videoId) is str: + ids.append(videoId) + return ids + def _extract_mix(self, playlist_id): # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id @@ -2778,6 +2788,13 @@ def _extract_mix(self, playlist_id): 
r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id),
                webpage))
+
+        # if no ids in html of page, try using embedded json
+        if (len(new_ids) == 0):
+            yt_initial = self._get_yt_initial_data(playlist_id, webpage)
+            if yt_initial:
+                new_ids = self._extract_mix_ids_from_yt_initial(yt_initial)
+
         # Fetch new pages until all the videos are repeated, it seems that
         # there are always 51 unique videos.
         new_ids = [_id for _id in new_ids if _id not in ids]

From 3086aa194fcacd4e12e9b266ec046c991bd07f2a Mon Sep 17 00:00:00 2001
From: Dan Walker
Date: Fri, 30 Oct 2020 08:31:34 -0700
Subject: [PATCH 026/124] Added Comcast_SSO fix

This fix had been proposed on yt-dl for a lengthy period of time but was
never merged. It has been thoroughly tested by a large section of the
community.
---
 youtube_dlc/extractor/adobepass.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/youtube_dlc/extractor/adobepass.py b/youtube_dlc/extractor/adobepass.py
index 38dca1b0a..649f9940f 100644
--- a/youtube_dlc/extractor/adobepass.py
+++ b/youtube_dlc/extractor/adobepass.py
@@ -1438,6 +1438,13 @@ def extract_redirect_url(html, url=None, fatal=False):
                     provider_redirect_page, 'oauth redirect')
                 self._download_webpage(
                     oauth_redirect_url, video_id, 'Confirming auto login')
+            elif 'automatically signed in with' in provider_redirect_page:
+                # Seems like comcast is rolling out a new way of automatically signing in customers
+                oauth_redirect_url = self._html_search_regex(
+                    r'continue:\s*"(https://oauth.xfinity.com/oauth/authorize\?.+)"', provider_redirect_page,
+                    'oauth redirect (signed)')
+                # Just need to process the request. No useful data comes back
+                self._download_webpage(oauth_redirect_url, video_id, 'Confirming auto login')
             else:
                 if '
Date: Fri, 30 Oct 2020 21:29:21 +0100
Subject: [PATCH 027/124] Only use video id to find metadata

---
 youtube_dlc/extractor/netzkino.py | 47 ++++++++++++++++---------------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/youtube_dlc/extractor/netzkino.py b/youtube_dlc/extractor/netzkino.py
index aec3026b1..3d1a06d0b 100644
--- a/youtube_dlc/extractor/netzkino.py
+++ b/youtube_dlc/extractor/netzkino.py
@@ -13,17 +13,16 @@


 class NetzkinoIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/(?P<category>[^/]+)/(?P<id>[^/]+)'
+    _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/[^/]+/(?P<id>[^/]+)'

-    _TEST = {
-        'url': 'http://www.netzkino.de/#!/scifikino/rakete-zum-mond',
+    _TESTS = [{
+        'url': 'https://www.netzkino.de/#!/scifikino/rakete-zum-mond',
         'md5': '92a3f8b76f8d7220acce5377ea5d4873',
         'info_dict': {
             'id': 'rakete-zum-mond',
             'ext': 'mp4',
-            'title': 'Rakete zum Mond (Endstation Mond, Destination Moon)',
-            'comments': 'mincount:3',
-            'description': 'md5:1eddeacc7e62d5a25a2d1a7290c64a28',
+            'title': 'Rakete zum Mond \u2013 Jules Verne',
+            'description': 'md5:f0a8024479618ddbfa450ff48ffa6c60',
             'upload_date': '20120813',
             'thumbnail': r're:https?://.*\.jpg$',
             'timestamp': 1344858571,
@@ -32,17 +31,30 @@ class NetzkinoIE(InfoExtractor):
         'params': {
             'skip_download': 'Download only works from Germany',
         }
-    }
+    }, {
+        'url': 'https://www.netzkino.de/#!/filme/dr-jekyll-mrs-hyde-2',
+        'md5': 'c7728b2dadd04ff6727814847a51ef03',
+        'info_dict': {
+            'id': 'dr-jekyll-mrs-hyde-2',
+            'ext': 'mp4',
+            'title': 'Dr. Jekyll & Mrs. Hyde 2',
+            'description': 'md5:c2e9626ebd02de0a794b95407045d186',
+            'upload_date': '20190130',
+            'thumbnail': r're:https?://.*\.jpg$',
+            'timestamp': 1548849437,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': 'Download only works from Germany',
+        }
+    }]

     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        category_id = mobj.group('category')
         video_id = mobj.group('id')

-        api_url = 'http://api.netzkino.de.simplecache.net/capi-2.0a/categories/%s.json?d=www' % category_id
-        api_info = self._download_json(api_url, video_id)
-        info = next(
-            p for p in api_info['posts'] if p['slug'] == video_id)
+        api_url = 'https://api.netzkino.de.simplecache.net/capi-2.0a/movies/%s.json?d=www' % video_id
+        info = self._download_json(api_url, video_id)
         custom_fields = info['custom_fields']

         production_js = self._download_webpage(
@@ -67,23 +79,12 @@ def _real_extract(self, url):
         } for key, tpl in templates.items()]
         self._sort_formats(formats)

-        comments = [{
-            'timestamp': parse_iso8601(c.get('date'), delimiter=' '),
-            'id': c['id'],
-            'author': c['name'],
-            'html': c['content'],
-            'parent': 'root' if c.get('parent', 0) == 0 else c['parent'],
-        } for c in info.get('comments', [])]
-
         return {
             'id': video_id,
             'formats': formats,
-            'comments': comments,
             'title': info['title'],
             'age_limit': int_or_none(custom_fields.get('FSK')[0]),
             'timestamp': parse_iso8601(info.get('date'), delimiter=' '),
             'description': clean_html(info.get('content')),
             'thumbnail': info.get('thumbnail'),
-            'playlist_title': api_info.get('title'),
-            'playlist_id': category_id,
         }

From 59c5fa91c167a8d011a4efa073ad6fd0027b2ed8 Mon Sep 17 00:00:00 2001
From: Peter Oettig
Date: Fri, 30 Oct 2020 23:24:55 +0100
Subject: [PATCH 028/124] Fixed problem with new youtube player, leading to
 "Unable to extract video data".
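
Newer watch pages stopped embedding `ytplayer.config`; the player data now
arrives as a bare JSON assignment, so a third pattern is added and the
consumers below learn to handle a config without an "args" wrapper. A
standalone sketch of the new extraction path (the sample `webpage` string
is fabricated for illustration; the regex is the one added in the diff):

    import json
    import re

    webpage = 'ytInitialPlayerResponse = {"videoDetails": {"videoId": "abc"}};var meta = null;'
    m = re.search(r'ytInitialPlayerResponse\s*=\s*({.+?});var meta', webpage)
    if m:
        player_response = json.loads(m.group(1))
        # unlike ytplayer.config, this object IS the player response,
        # hence the new `"args" in player_config` branches below
        print(player_response['videoDetails']['videoId'])  # -> abc
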
--- youtube_dlc/extractor/youtube.py | 123 ++++++++++++++++++------------- 1 file changed, 70 insertions(+), 53 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 5fd22081a..3e1adc554 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -1390,6 +1390,7 @@ def _get_ytplayer_config(self, video_id, webpage): # https://github.com/ytdl-org/youtube-dl/pull/7599) r';ytplayer\.config\s*=\s*({.+?});ytplayer', r';ytplayer\.config\s*=\s*({.+?});', + r'ytInitialPlayerResponse\s*=\s*({.+?});var meta' ) config = self._search_regex( patterns, webpage, 'ytplayer.config', default=None) @@ -1416,10 +1417,11 @@ def _get_automatic_captions(self, video_id, webpage): self._downloader.report_warning(err_msg) return {} try: - args = player_config['args'] - caption_url = args.get('ttsurl') - if caption_url: + if "args" in player_config and "ttsurl" in player_config["args"]: + args = player_config['args'] + caption_url = args['ttsurl'] timestamp = args['timestamp'] + # We get the available subtitles list_params = compat_urllib_parse_urlencode({ 'type': 'list', @@ -1475,40 +1477,50 @@ def make_captions(sub_url, sub_langs): return captions # New captions format as of 22.06.2017 - player_response = args.get('player_response') - if player_response and isinstance(player_response, compat_str): - player_response = self._parse_json( - player_response, video_id, fatal=False) - if player_response: - renderer = player_response['captions']['playerCaptionsTracklistRenderer'] - caption_tracks = renderer['captionTracks'] - for caption_track in caption_tracks: - if 'kind' not in caption_track: - # not an automatic transcription - continue - base_url = caption_track['baseUrl'] - sub_lang_list = [] - for lang in renderer['translationLanguages']: - lang_code = lang.get('languageCode') - if lang_code: - sub_lang_list.append(lang_code) - return make_captions(base_url, sub_lang_list) + if "args" in player_config: + player_response = player_config["args"].get('player_response') + else: + # New player system (ytInitialPlayerResponse) as of October 2020 + player_response = player_config - self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id) - return {} - # Some videos don't provide ttsurl but rather caption_tracks and - # caption_translation_languages (e.g. 
20LmZk1hakA) - # Does not used anymore as of 22.06.2017 - caption_tracks = args['caption_tracks'] - caption_translation_languages = args['caption_translation_languages'] - caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] - sub_lang_list = [] - for lang in caption_translation_languages.split(','): - lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) - sub_lang = lang_qs.get('lc', [None])[0] - if sub_lang: - sub_lang_list.append(sub_lang) - return make_captions(caption_url, sub_lang_list) + if player_response: + if isinstance(player_response, compat_str): + player_response = self._parse_json( + player_response, video_id, fatal=False) + + renderer = player_response['captions']['playerCaptionsTracklistRenderer'] + caption_tracks = renderer['captionTracks'] + for caption_track in caption_tracks: + if 'kind' not in caption_track: + # not an automatic transcription + continue + base_url = caption_track['baseUrl'] + sub_lang_list = [] + for lang in renderer['translationLanguages']: + lang_code = lang.get('languageCode') + if lang_code: + sub_lang_list.append(lang_code) + return make_captions(base_url, sub_lang_list) + + self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id) + return {} + + if "args" in player_config: + args = player_config["args"] + + # Some videos don't provide ttsurl but rather caption_tracks and + # caption_translation_languages (e.g. 20LmZk1hakA) + # Does not used anymore as of 22.06.2017 + caption_tracks = args['caption_tracks'] + caption_translation_languages = args['caption_translation_languages'] + caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] + sub_lang_list = [] + for lang in caption_translation_languages.split(','): + lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) + sub_lang = lang_qs.get('lc', [None])[0] + if sub_lang: + sub_lang_list.append(sub_lang) + return make_captions(caption_url, sub_lang_list) # An extractor error can be raise by the download process if there are # no automatic captions but there are subtitles except (KeyError, IndexError, ExtractorError): @@ -1784,21 +1796,24 @@ def extract_embedded_config(embed_webpage, video_id): # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: - args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): - # Convert to the same format returned by compat_parse_qs - video_info = dict((k, [v]) for k, v in args.items()) - add_dash_mpd(video_info) - # Rental video is not rented but preview is available (e.g. - # https://www.youtube.com/watch?v=yYr8q0y5Jfg, - # https://github.com/ytdl-org/youtube-dl/issues/10532) - if not video_info and args.get('ypc_vid'): - return self.url_result( - args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) - if args.get('livestream') == '1' or args.get('live_playback') == 1: - is_live = True - if not player_response: - player_response = extract_player_response(args.get('player_response'), video_id) + args = ytplayer_config.get("args") + if args is not None: + if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): + # Convert to the same format returned by compat_parse_qs + video_info = dict((k, [v]) for k, v in args.items()) + add_dash_mpd(video_info) + # Rental video is not rented but preview is available (e.g. 
+ # https://www.youtube.com/watch?v=yYr8q0y5Jfg, + # https://github.com/ytdl-org/youtube-dl/issues/10532) + if not video_info and args.get('ypc_vid'): + return self.url_result( + args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) + if args.get('livestream') == '1' or args.get('live_playback') == 1: + is_live = True + if not player_response: + player_response = extract_player_response(args.get('player_response'), video_id) + elif not player_response: + player_response = ytplayer_config if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): add_dash_mpd_pr(player_response) else: @@ -1828,8 +1843,8 @@ def extract_embedded_config(embed_webpage, video_id): age_gate = False # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) - if ytplayer_config: - args = ytplayer_config['args'] + args = ytplayer_config.get("args") + if args is not None: if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) @@ -1844,6 +1859,8 @@ def extract_embedded_config(embed_webpage, video_id): is_live = True if not player_response: player_response = extract_player_response(args.get('player_response'), video_id) + elif not player_response: + player_response = ytplayer_config if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): add_dash_mpd_pr(player_response) From 9f448fcb269d52e757999cba6dab3ff7046d2c19 Mon Sep 17 00:00:00 2001 From: Unknown Date: Sat, 31 Oct 2020 05:46:51 +0100 Subject: [PATCH 029/124] [core/yt_live_chat] live_chat is back. dl() new parameter --- youtube_dlc/YoutubeDL.py | 7 +++++-- youtube_dlc/downloader/common.py | 24 ++++++++++++++++-------- youtube_dlc/extractor/youtube.py | 4 ++-- youtube_dlc/options.py | 2 +- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index f959a4e47..360595918 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -1857,13 +1857,13 @@ def ensure_dir_exists(path): self.report_error('Cannot write annotations file: ' + annofn) return - def dl(name, info): + def dl(name, info, subtitle=False): fd = get_suitable_downloader(info, self.params)(self, self.params) for ph in self._progress_hooks: fd.add_progress_hook(ph) if self.params.get('verbose'): self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) - return fd.download(name, info) + return fd.download(name, info, subtitle) subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) @@ -1891,6 +1891,8 @@ def dl(name, info): return else: try: + dl(sub_filename, sub_info, subtitle=True) + ''' if self.params.get('sleep_interval_subtitles', False): dl(sub_filename, sub_info) else: @@ -1898,6 +1900,7 @@ def dl(name, info): sub_info['url'], info_dict['id'], note=False).read() with io.open(encodeFilename(sub_filename), 'wb') as subfile: subfile.write(sub_data) + ''' except (ExtractorError, IOError, OSError, ValueError, compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self.report_warning('Unable to download subtitle for "%s": %s' % (sub_lang, error_to_compat_str(err))) diff --git a/youtube_dlc/downloader/common.py b/youtube_dlc/downloader/common.py index 31c286458..14bd322b4 100644 --- a/youtube_dlc/downloader/common.py +++ b/youtube_dlc/downloader/common.py @@ -326,7 +326,7 @@ def 
report_unable_to_resume(self): """Report it was impossible to resume download.""" self.to_screen('[download] Unable to resume') - def download(self, filename, info_dict): + def download(self, filename, info_dict, subtitle=False): """Download to a filename using the info from info_dict Return True on success and False otherwise """ @@ -353,15 +353,23 @@ def download(self, filename, info_dict): }) return True - min_sleep_interval = self.params.get('sleep_interval') - if min_sleep_interval: - max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) - sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) + if subtitle is False: + min_sleep_interval = self.params.get('sleep_interval') + if min_sleep_interval: + max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) + sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) + self.to_screen( + '[download] Sleeping %s seconds...' % ( + int(sleep_interval) if sleep_interval.is_integer() + else '%.2f' % sleep_interval)) + time.sleep(sleep_interval) + else: + sleep_interval_sub = self.params.get('sleep_interval_subtitles') self.to_screen( '[download] Sleeping %s seconds...' % ( - int(sleep_interval) if sleep_interval.is_integer() - else '%.2f' % sleep_interval)) - time.sleep(sleep_interval) + int(sleep_interval_sub))) + time.sleep(sleep_interval_sub) + return self.real_download(filename, info_dict) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 5fd22081a..ccfaa733d 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -1366,14 +1366,14 @@ def _get_subtitles(self, video_id, webpage, has_live_chat_replay): 'ext': ext, }) sub_lang_list[lang] = sub_formats - """ if has_live_chat_replay: + if has_live_chat_replay: sub_lang_list['live_chat'] = [ { 'video_id': video_id, 'ext': 'json', 'protocol': 'youtube_live_chat_replay', }, - ] """ + ] if not sub_lang_list: self._downloader.report_warning('video doesn\'t have subtitles') return {} diff --git a/youtube_dlc/options.py b/youtube_dlc/options.py index 1d7a7fed2..66b45220c 100644 --- a/youtube_dlc/options.py +++ b/youtube_dlc/options.py @@ -582,7 +582,7 @@ def _comma_separated_values_options_callback(option, opt_str, value, parser): 'along with --min-sleep-interval.')) workarounds.add_option( '--sleep-subtitles', - dest='sleep_interval_subtitles', action='store_true', default=False, + dest='sleep_interval_subtitles', action='store_true', default=0, help='Enforce sleep interval on subtitles as well') verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') From 989188476804cccd2c4c858571c80f1e93a5afc3 Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel Date: Sat, 31 Oct 2020 07:21:58 +0100 Subject: [PATCH 030/124] [skip travis] half done workflow --- .github/workflows/build.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8db7e92f2..538740355 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -57,7 +57,7 @@ jobs: id: sha2_file env: SHA2: ${{ hashFiles('youtube-dlc') }} - run: echo "::set-output name=sha2_unix::${env:SHA2}" + run: echo "::set-output name=sha2_unix::$SHA2" - name: Install dependencies for pypi run: | python -m pip install --upgrade pip @@ -98,12 +98,12 @@ jobs: upload_url: ${{ needs.build_unix.outputs.upload_url }} asset_path: ./dist/youtube-dlc.exe asset_name: youtube-dlc.exe - asset_content_type: 
application/octet-stream + asset_content_type: application/vnd.microsoft.portable-executable - name: Get SHA2-256SUMS for youtube-dlc.exe id: sha2_file_win env: - SHA2: ${{ hashFiles('dist/youtube-dlc.exe') }} - run: echo "::set-output name=sha2_windows::${env:SHA2}" + SHA2_win: ${{ hashFiles('dist/youtube-dlc.exe') }} + run: echo "::set-output name=sha2_windows::$SHA2_win" build_windows32: @@ -133,12 +133,12 @@ jobs: upload_url: ${{ needs.build_unix.outputs.upload_url }} asset_path: ./dist/youtube-dlc_x86.exe asset_name: youtube-dlc_x86.exe - asset_content_type: application/octet-stream + asset_content_type: application/vnd.microsoft.portable-executable - name: Get SHA2-256SUMS for youtube-dlc_x86.exe id: sha2_file_win32 env: - SHA2: ${{ hashFiles('dist/youtube-dlc_x86.exe') }} - run: echo "::set-output name=sha2_windows32::${env:SHA2}" + SHA2_win32: ${{ hashFiles('dist/youtube-dlc_x86.exe') }} + run: echo "::set-output name=sha2_windows32::$SHA2_win32" - name: Make SHA2-256SUMS file env: SHA2_WINDOWS: ${{ needs.build_windows.outputs.sha2_windows }} From da6403d340bce709ca226fc3e195749032b02d88 Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel Date: Sat, 31 Oct 2020 08:36:14 +0100 Subject: [PATCH 031/124] [skip travis] finalised workflow --- .github/workflows/build.yml | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 538740355..f5d94dc49 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -146,6 +146,18 @@ jobs: SHA2_UNIX: ${{ needs.build_unix.outputs.sha2_unix }} YTDLC_VERSION: ${{ needs.build_unix.outputs.ytdlc_version }} run: | - echo "$SHA2_WINDOWS youtube-dlc.exe" > SHA2-256SUMS - echo "$SHA2_WINDOWS32 youtube-dlc32.exe" > SHA2-256SUMS - echo "$SHA2_UNIX youtube-dlc" >> SHA2-256SUMS + echo "version:$YTDLC_VERSION" >> SHA2-256SUMS + echo "youtube-dlc.exe:$SHA2_WINDOWS" >> SHA2-256SUMS + echo "youtube-dlc32.exe:$SHA2_WINDOWS32" >> SHA2-256SUMS + echo "youtube-dlc:$SHA2_UNIX" >> SHA2-256SUMS + + - name: Upload 256SUMS file + id: upload-sums + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.build_unix.outputs.upload_url }} + asset_path: ./SHA2-256SUMS + asset_name: SHA2-256SUMS + asset_content_type: text/plain From fa57af1ef333b11630ba6ae4353a94ea118883d4 Mon Sep 17 00:00:00 2001 From: Unknown Date: Sat, 31 Oct 2020 08:57:55 +0100 Subject: [PATCH 032/124] flake8. added sha256 check to updater. 
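
The updater now reports the SHA-256 of its own binary before checking for a
new version. A standalone sketch of the chunked digest added below (hashing
`sys.executable` here is illustrative; the frozen binary is hashed the same
way):

    import hashlib
    import sys

    def sha256sum(path):
        h = hashlib.sha256()
        b = bytearray(128 * 1024)  # reusable 128 KiB buffer
        mv = memoryview(b)
        with open(path, 'rb', buffering=0) as f:
            # readinto() refills the buffer in place, so even a large
            # executable is hashed without loading it fully into memory
            for n in iter(lambda: f.readinto(mv), 0):
                h.update(mv[:n])
        return h.hexdigest()

    print(sha256sum(sys.executable))
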
--- youtube_dlc/YoutubeDL.py | 2 +- youtube_dlc/downloader/common.py | 2 -- youtube_dlc/update.py | 20 ++++++++++++++++++++ 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index 360595918..dd55ba0f2 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -1872,7 +1872,7 @@ def dl(name, info, subtitle=False): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE subtitles = info_dict['requested_subtitles'] - ie = self.get_info_extractor(info_dict['extractor_key']) + # ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext')) diff --git a/youtube_dlc/downloader/common.py b/youtube_dlc/downloader/common.py index 14bd322b4..460364a0b 100644 --- a/youtube_dlc/downloader/common.py +++ b/youtube_dlc/downloader/common.py @@ -369,8 +369,6 @@ def download(self, filename, info_dict, subtitle=False): '[download] Sleeping %s seconds...' % ( int(sleep_interval_sub))) time.sleep(sleep_interval_sub) - - return self.real_download(filename, info_dict) def real_download(self, filename, info_dict): diff --git a/youtube_dlc/update.py b/youtube_dlc/update.py index e49e09c17..b358e902b 100644 --- a/youtube_dlc/update.py +++ b/youtube_dlc/update.py @@ -37,10 +37,26 @@ def update_self(to_screen, verbose, opener): JSON_URL = UPDATE_URL + 'versions.json' UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537) + def sha256sum(): + h = hashlib.sha256() + b = bytearray(128 * 1024) + mv = memoryview(b) + with open(os.path.realpath(sys.executable), 'rb', buffering=0) as f: + for n in iter(lambda: f.readinto(mv), 0): + h.update(mv[:n]) + return h.hexdigest() + + to_screen('Current Build Hash %s' % sha256sum()) + if not isinstance(globals().get('__loader__'), zipimporter) and not hasattr(sys, 'frozen'): to_screen('It looks like you installed youtube-dlc with a package manager, pip, setup.py or a tarball. Please use that to update.') return + # compiled file.exe can find itself by + # to_screen(os.path.basename(sys.executable)) + # and path to py or exe + # to_screen(os.path.realpath(sys.executable)) + # Check if there is a new version try: newversion = opener.open(VERSION_URL).read().decode('utf-8').strip() @@ -48,6 +64,7 @@ def update_self(to_screen, verbose, opener): if verbose: to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: can\'t find the current version. Please try again later.') + to_screen('Visit https://github.com/blackjack4494/yt-dlc/releases/latest') return if newversion == __version__: to_screen('youtube-dlc is up-to-date (' + __version__ + ')') @@ -61,6 +78,7 @@ def update_self(to_screen, verbose, opener): if verbose: to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: can\'t obtain versions info. Please try again later.') + to_screen('Visit https://github.com/blackjack4494/yt-dlc/releases/latest') return if 'signature' not in versions_info: to_screen('ERROR: the versions file is not signed or corrupted. 
Aborting.') @@ -109,6 +127,7 @@ def version_tuple(version_str): if verbose: to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: unable to download latest version') + to_screen('Visit https://github.com/blackjack4494/yt-dlc/releases/latest') return newcontent_hash = hashlib.sha256(newcontent).hexdigest() @@ -155,6 +174,7 @@ def version_tuple(version_str): if verbose: to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: unable to download latest version') + to_screen('Visit https://github.com/blackjack4494/yt-dlc/releases/latest') return newcontent_hash = hashlib.sha256(newcontent).hexdigest() From 7fb5f2f29d99fa269988c6586558c7e9d21e432d Mon Sep 17 00:00:00 2001 From: Unknown Date: Sat, 31 Oct 2020 09:26:04 +0100 Subject: [PATCH 033/124] [skip travis] templates and urls --- .github/ISSUE_TEMPLATE/1_broken_site.md | 10 +++++----- .github/ISSUE_TEMPLATE/2_site_support_request.md | 8 ++++---- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 6 +++--- .github/ISSUE_TEMPLATE/4_bug_report.md | 12 ++++++------ .github/ISSUE_TEMPLATE/5_feature_request.md | 6 +++--- .github/ISSUE_TEMPLATE/6_question.md | 6 +++--- .github/ISSUE_TEMPLATE_tmpl/1_broken_site.md | 6 +++--- .../ISSUE_TEMPLATE_tmpl/2_site_support_request.md | 6 +++--- .../ISSUE_TEMPLATE_tmpl/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE_tmpl/4_bug_report.md | 8 ++++---- .github/ISSUE_TEMPLATE_tmpl/5_feature_request.md | 4 ++-- youtube_dlc/utils.py | 4 ++-- 12 files changed, 40 insertions(+), 40 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index bf4251004..32c14aa85 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -21,15 +21,15 @@ ## Checklist - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dlc version **2020.10.26** +- [ ] I've verified that I'm running youtube-dlc version **2020.10.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -44,7 +44,7 @@ ## Verbose log [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dlc version 2020.10.26 + [debug] youtube-dlc version 2020.10.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 889005097..fe1aade05 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -21,15 +21,15 @@ ## Checklist - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dlcc version **2020.10.26** +- [ ] I've verified that I'm running youtube-dlcc version **2020.10.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index e5d714388..cddb81dda 
100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -21,13 +21,13 @@ ## Checklist - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dlc version **2020.10.26** +- [ ] I've verified that I'm running youtube-dlc version **2020.10.31** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 9de52f98c..920ae8dbc 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -21,16 +21,16 @@ ## Checklist - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dlc version **2020.10.26** +- [ ] I've verified that I'm running youtube-dlc version **2020.10.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -46,7 +46,7 @@ ## Verbose log [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dlc version 2020.10.26 + [debug] youtube-dlc version 2020.10.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 86fac96dd..7cc390f58 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -21,13 +21,13 @@ ## Checklist - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dlc version **2020.10.26** +- [ ] I've verified that I'm running youtube-dlc version **2020.10.31** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/6_question.md b/.github/ISSUE_TEMPLATE/6_question.md index 034a9c5ac..3c3ae0f3b 100644 --- a/.github/ISSUE_TEMPLATE/6_question.md +++ b/.github/ISSUE_TEMPLATE/6_question.md @@ -21,8 +21,8 @@ ## Checklist @@ -34,7 +34,7 @@ ## Checklist ## Question WRITE QUESTION HERE diff --git a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md index 8f9bb2c33..3fe4d6968 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md @@ -18,10 +18,10 @@ ## Checklist diff --git a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md index 9748afd4d..aad8fa054 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md @@ -19,10 +19,10 @@ ## Checklist diff --git a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md index f274e8aeb..2fb82f828 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md @@ -18,8 +18,8 @@ ## Checklist diff --git a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md index 788f1c9a1..b7bebf8ab 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md +++ 
b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md @@ -18,11 +18,11 @@ ## Checklist diff --git a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md index 9b3b8c3bf..99592f79d 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md @@ -19,8 +19,8 @@ ## Checklist diff --git a/youtube_dlc/utils.py b/youtube_dlc/utils.py index 54a4ea2aa..f5dc1bdaf 100644 --- a/youtube_dlc/utils.py +++ b/youtube_dlc/utils.py @@ -2320,8 +2320,8 @@ def bug_reports_message(): if ytdl_is_updateable(): update_cmd = 'type youtube-dlc -U to update' else: - update_cmd = 'see https://yt-dl.org/update on how to update' - msg = '; please report this issue on https://yt-dl.org/bug .' + update_cmd = 'see https://github.com/blackjack4494/yt-dlc on how to update' + msg = '; please report this issue on https://github.com/blackjack4494/yt-dlc .' msg += ' Make sure you are using the latest version; %s.' % update_cmd msg += ' Be sure to call youtube-dlc with the --verbose flag and include its complete output.' return msg From e61f360157dfa51f2fd1cbc089c0c9a0680428a1 Mon Sep 17 00:00:00 2001 From: nixxo Date: Sat, 31 Oct 2020 14:52:07 +0100 Subject: [PATCH 034/124] [skyitalia] added geoblock msg --- youtube_dlc/extractor/skyitalia.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dlc/extractor/skyitalia.py b/youtube_dlc/extractor/skyitalia.py index 3c7bd465d..22a6be2be 100644 --- a/youtube_dlc/extractor/skyitalia.py +++ b/youtube_dlc/extractor/skyitalia.py @@ -13,6 +13,7 @@ class SkyItaliaBaseIE(InfoExtractor): 'high': [854, 480], 'hd': [1280, 720] } + _GEO_BYPASS = False def _extract_video_id(self, url): webpage = self._download_webpage(url, 'skyitalia') @@ -43,6 +44,9 @@ def _get_formats(self, video_id, token): 'height': r[1] }) + if not formats and video_data.get('geob') == 1: + self.raise_geo_restricted(countries=['IT']) + self._sort_formats(formats) title = video_data.get('title') thumb = video_data.get('thumb') From ae306df7e0e20866e39cc4f817edb99fe47ddc4d Mon Sep 17 00:00:00 2001 From: Unknown Date: Sun, 1 Nov 2020 03:01:10 +0100 Subject: [PATCH 035/124] [viki] new way of obtaining subtitles. 
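
Subtitles no longer come from per-stream `subtitle_completions`; a second
video JSON exposes a direct VTT url per language under
streamSubtitles['dash']. A rough standalone sketch of that lookup (the
video id is hypothetical, and any response fields beyond
srclang/src/percentage are assumptions, not documented API):

    import json
    import urllib.request

    video_id = '1023585v'  # hypothetical
    req = urllib.request.Request(
        'https://www.viki.com/api/videos/%s' % video_id,
        headers={'x-viki-app-ver': '2.2.5.1428709186'})
    new_video = json.load(urllib.request.urlopen(req))

    subtitles = {}
    for sub in new_video.get('streamSubtitles', {}).get('dash', []):
        subtitles[sub.get('srclang')] = [{
            'ext': 'vtt',
            'url': sub.get('src'),
            'completion': sub.get('percentage'),
        }]
    print(subtitles)
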
--- youtube_dlc/extractor/viki.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/youtube_dlc/extractor/viki.py b/youtube_dlc/extractor/viki.py index f8e360338..0f188f84d 100644 --- a/youtube_dlc/extractor/viki.py +++ b/youtube_dlc/extractor/viki.py @@ -308,19 +308,17 @@ def _real_extract(self, url): 'url': thumbnail.get('url'), }) - stream_ids = [] - for f in formats: - s_id = f.get('stream_id') - if s_id is not None: - stream_ids.append(s_id) + new_video = self._download_json( + 'https://www.viki.com/api/videos/%s' % video_id, video_id, + 'Downloading new video JSON to get subtitles', headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404]) subtitles = {} - for subtitle_lang, _ in video.get('subtitle_completions', {}).items(): - subtitles[subtitle_lang] = [{ - 'ext': subtitles_format, - 'url': self._prepare_call( - 'videos/%s/subtitles/%s.%s?stream_id=%s' % (video_id, subtitle_lang, subtitles_format, stream_ids[0])), - } for subtitles_format in ('srt', 'vtt')] + for sub in new_video.get('streamSubtitles').get('dash'): + subtitles[sub.get('srclang')] = [{ + 'ext': 'vtt', + 'url': sub.get('src'), + 'completion': sub.get('percentage'), + }] result = { 'id': video_id, From 31108ce946eccbe765b12f0b8a9a47622af68c27 Mon Sep 17 00:00:00 2001 From: Unknown Date: Sun, 1 Nov 2020 03:36:16 +0100 Subject: [PATCH 036/124] [core] sleep-subtitles fix --- youtube_dlc/downloader/common.py | 11 ++++++----- youtube_dlc/options.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/youtube_dlc/downloader/common.py b/youtube_dlc/downloader/common.py index 460364a0b..c65500d61 100644 --- a/youtube_dlc/downloader/common.py +++ b/youtube_dlc/downloader/common.py @@ -364,11 +364,12 @@ def download(self, filename, info_dict, subtitle=False): else '%.2f' % sleep_interval)) time.sleep(sleep_interval) else: - sleep_interval_sub = self.params.get('sleep_interval_subtitles') - self.to_screen( - '[download] Sleeping %s seconds...' % ( - int(sleep_interval_sub))) - time.sleep(sleep_interval_sub) + if self.params.get('sleep_interval_subtitles') > 0: + sleep_interval_sub = self.params.get('sleep_interval_subtitles') + self.to_screen( + '[download] Sleeping %s seconds...' 
% ( + sleep_interval_sub)) + time.sleep(sleep_interval_sub) return self.real_download(filename, info_dict) def real_download(self, filename, info_dict): diff --git a/youtube_dlc/options.py b/youtube_dlc/options.py index 66b45220c..3c8a1305e 100644 --- a/youtube_dlc/options.py +++ b/youtube_dlc/options.py @@ -582,7 +582,7 @@ def _comma_separated_values_options_callback(option, opt_str, value, parser): 'along with --min-sleep-interval.')) workarounds.add_option( '--sleep-subtitles', - dest='sleep_interval_subtitles', action='store_true', default=0, + dest='sleep_interval_subtitles', default=0, type=int, help='Enforce sleep interval on subtitles as well') verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') From 764876a01f2f9c1eb59691678c5629fe283a39ce Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel Date: Sun, 1 Nov 2020 03:38:44 +0100 Subject: [PATCH 037/124] [skip travis] select python 3.8 in workflow file --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f5d94dc49..2bf54cd58 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,7 +20,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.x' + python-version: '3.8' - name: Install packages run: sudo apt-get -y install zip pandoc man - name: Bump version From 167c108f7072a8392c509e5e8b9f84c0e0c0bb28 Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel Date: Mon, 2 Nov 2020 08:52:55 +0100 Subject: [PATCH 038/124] [skip travis] --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 08bddaa18..5a26906ac 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,5 @@ [![Build Status](https://travis-ci.com/blackjack4494/yt-dlc.svg?branch=master)](https://travis-ci.com/blackjack4494/yt-dlc) [![PyPi](https://img.shields.io/pypi/v/youtube-dlc.svg)](https://pypi.org/project/youtube-dlc) -[![Downloads](https://pepy.tech/badge/youtube-dlc)](https://pepy.tech/project/youtube-dlc) [![Gitter chat](https://img.shields.io/gitter/room/youtube-dlc/community)](https://gitter.im/youtube-dlc) [![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](https://github.com/blackjack4494/youtube-dlc/blob/master/LICENSE) From 5c15c1a0d7c27d34e7d03161c5b27bf923e314cd Mon Sep 17 00:00:00 2001 From: insaneracist Date: Mon, 2 Nov 2020 14:54:47 -0800 Subject: [PATCH 039/124] python2: don't use str, use compat_str --- youtube_dlc/extractor/youtube.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index ad67fa410..d8f0dab1f 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -2813,11 +2813,11 @@ def extract_videos_from_page(self, page): def _extract_mix_ids_from_yt_initial(self, yt_initial): ids = [] - playlist_contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents']) - if type(playlist_contents) is list: + playlist_contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents'], list) + if playlist_contents: for item in playlist_contents: - videoId = try_get(item, lambda x: x['playlistPanelVideoRenderer']['videoId']) - if type(videoId) is str: + videoId = try_get(item, lambda x: x['playlistPanelVideoRenderer']['videoId'], compat_str) + if videoId: ids.append(videoId) return ids From 
0536e60b48041d9c7d9ce8bbbef0eb2131ce3919 Mon Sep 17 00:00:00 2001 From: exwm Date: Sun, 1 Nov 2020 14:18:27 -0500 Subject: [PATCH 040/124] [vlive] fix: extractor tests for VODs --- youtube_dlc/extractor/vlive.py | 79 +++++++++++++++++----------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index f79531e6f..cc1d20a3a 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -11,7 +11,6 @@ from ..utils import ( ExtractorError, merge_dicts, - remove_start, try_get, urlencode_postdata, ) @@ -97,49 +96,49 @@ def is_logged_in(): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - webpage = self._download_webpage( - 'https://www.vlive.tv/video/%s' % video_id, video_id) + PARAMS_RE = r'window\.__PRELOADED_STATE__\s*=\s*({.*});?\s*' + PARAMS_FIELD = 'params' - VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)' - VIDEO_PARAMS_FIELD = 'video params' + params = self._search_regex( + PARAMS_RE, webpage, PARAMS_FIELD, default='', flags=re.DOTALL) + params = self._parse_json(params, video_id, fatal=False) - params = self._parse_json(self._search_regex( - VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD, default=''), video_id, - transform_source=lambda s: '[' + s + ']', fatal=False) + video_params = params["postDetail"]["post"].get("officialVideo") + if video_params is None: + raise ExtractorError('Invalid key: Failed to extract video parameters.') - if not params or len(params) < 7: - params = self._search_regex( - VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD) - params = [p.strip(r'"') for p in re.split(r'\s*,\s*', params)] + long_video_id = video_params["vodId"] + video_type = video_params["type"] + KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id + key_json = self._download_json(KEY_ENDPOINT, video_id, + headers={"referer": "https://www.vlive.tv"}) + key = key_json["inkey"] - status, long_video_id, key = params[2], params[5], params[6] - status = remove_start(status, 'PRODUCT_') - - if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'): - return self._live(video_id, webpage) - elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'): - return self._replay(video_id, webpage, long_video_id, key) - - if status == 'LIVE_END': - raise ExtractorError('Uploading for replay. Please wait...', - expected=True) - elif status == 'COMING_SOON': - raise ExtractorError('Coming soon!', expected=True) - elif status == 'CANCELED': - raise ExtractorError('We are sorry, ' - 'but the live broadcast has been canceled.', - expected=True) - elif status == 'ONLY_APP': - raise ExtractorError('Unsupported video type', expected=True) + if video_type in ('VOD'): + encoding_status = video_params["encodingStatus"] + if encoding_status == 'COMPLETE': + return self._replay(video_id, webpage, long_video_id, key, params) + else: + raise ExtractorError('VOD encoding not yet complete. 
Please try again later.', + expected=True) + elif video_type in ('LIVE'): + video_status = video_params["status"] + if video_status == 'RESERVED': + raise ExtractorError('Coming soon!', expected=True) + else: + return self._live(video_id, webpage, params) else: - raise ExtractorError('Unknown status %s' % status) + raise ExtractorError('Unknown video type %s' % video_type) - def _get_common_fields(self, webpage): + def _get_common_fields(self, webpage, params): title = self._og_search_title(webpage) - creator = self._html_search_regex( - r']+class="info_area"[^>]*>\s*(?:]*>.*?\s*)?]*>([^<]+)', - webpage, 'creator', fatal=False) + description = self._html_search_meta( + ['og:description', 'description', 'twitter:description'], + webpage, 'description', default=None) + creator = (try_get(params, lambda x: x["channel"]["channel"]["channelName"], compat_str) + or self._search_regex(r'on (.*) channel', description or '', 'creator', fatal=False)) thumbnail = self._og_search_thumbnail(webpage) return { 'title': title, @@ -147,7 +146,7 @@ def _get_common_fields(self, webpage): 'thumbnail': thumbnail, } - def _live(self, video_id, webpage): + def _live(self, video_id, webpage, params): init_page = self._download_init_page(video_id) live_params = self._search_regex( @@ -164,7 +163,7 @@ def _live(self, video_id, webpage): fatal=False, live=True)) self._sort_formats(formats) - info = self._get_common_fields(webpage) + info = self._get_common_fields(webpage, params) info.update({ 'title': self._live_title(info['title']), 'id': video_id, @@ -173,7 +172,7 @@ def _live(self, video_id, webpage): }) return info - def _replay(self, video_id, webpage, long_video_id, key): + def _replay(self, video_id, webpage, long_video_id, key, params): if '' in (long_video_id, key): init_page = self._download_init_page(video_id) video_info = self._parse_json(self._search_regex( @@ -186,7 +185,7 @@ def _replay(self, video_id, webpage, long_video_id, key): long_video_id, key = video_info['vid'], video_info['inkey'] return merge_dicts( - self._get_common_fields(webpage), + self._get_common_fields(webpage, params), self._extract_video_info(video_id, long_video_id, key)) def _download_init_page(self, video_id): From 5dcfd2508add09ab46d730f4802ce6da73edafaf Mon Sep 17 00:00:00 2001 From: exwm Date: Sun, 1 Nov 2020 15:04:05 -0500 Subject: [PATCH 041/124] [vlive] add: support video post urls --- youtube_dlc/extractor/vlive.py | 41 +++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index cc1d20a3a..abbcfb32b 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -18,10 +18,10 @@ class VLiveIE(NaverBaseIE): IE_NAME = 'vlive' - _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P[0-9]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|post)/(?P(?:\d-)?[0-9]+)' _NETRC_MACHINE = 'vlive' _TESTS = [{ - 'url': 'http://www.vlive.tv/video/1326', + 'url': 'https://www.vlive.tv/video/1326', 'md5': 'cc7314812855ce56de70a06a27314983', 'info_dict': { 'id': '1326', @@ -31,8 +31,21 @@ class VLiveIE(NaverBaseIE): 'view_count': int, 'uploader_id': 'muploader_a', }, - }, { - 'url': 'http://www.vlive.tv/video/16937', + }, + { + 'url': 'https://vlive.tv/post/1-18244258', + 'md5': 'cc7314812855ce56de70a06a27314983', + 'info_dict': { + 'id': '1326', + 'ext': 'mp4', + 'title': "[V LIVE] Girl's Day's Broadcast", + 'creator': "Girl's Day", + 'view_count': int, + 'uploader_id': 'muploader_a', + }, + 
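The rewrite in these vlive commits hinges on one scraping pattern: the page embeds its state as a JSON object assigned to window.__PRELOADED_STATE__, and the extractor lifts it with a regex before drilling into postDetail.post.officialVideo. A minimal self-contained sketch of that pattern, using a hypothetical extract_preloaded_state helper and an inlined sample page rather than the real vlive markup:

import json
import re

def extract_preloaded_state(page):
    # Pull the object literal assigned to window.__PRELOADED_STATE__,
    # a close cousin of the PARAMS_RE used by the new vlive extractor.
    m = re.search(r'window\.__PRELOADED_STATE__\s*=\s*({.*?});', page, re.DOTALL)
    if not m:
        return None
    try:
        return json.loads(m.group(1))
    except ValueError:  # malformed JSON; mirror _parse_json(..., fatal=False)
        return None

page = ('<script>window.__PRELOADED_STATE__ = '
        '{"postDetail": {"post": {"officialVideo": {"type": "VOD"}}}};</script>')
state = extract_preloaded_state(page)
print(state['postDetail']['post']['officialVideo']['type'])  # -> VOD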
}, + { + 'url': 'https://www.vlive.tv/video/16937', 'info_dict': { 'id': '16937', 'ext': 'mp4', @@ -95,24 +108,30 @@ def is_logged_in(): raise ExtractorError('Unable to log in', expected=True) def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + # url may match on a post or a video url with a post_id potentially matching a video_id + working_id = self._match_id(url) + webpage = self._download_webpage(url, working_id) PARAMS_RE = r'window\.__PRELOADED_STATE__\s*=\s*({.*});?\s*' PARAMS_FIELD = 'params' params = self._search_regex( PARAMS_RE, webpage, PARAMS_FIELD, default='', flags=re.DOTALL) - params = self._parse_json(params, video_id, fatal=False) + params = self._parse_json(params, working_id, fatal=False) - video_params = params["postDetail"]["post"].get("officialVideo") + video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"]) if video_params is None: - raise ExtractorError('Invalid key: Failed to extract video parameters.') + if 'post' in url: + raise ExtractorError('Url does not appear to be a video post.') + else: + raise ExtractorError('Failed to extract video parameters.') + video_id = working_id if 'video' in url else str(video_params["videoSeq"]) long_video_id = video_params["vodId"] video_type = video_params["type"] - KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id - key_json = self._download_json(KEY_ENDPOINT, video_id, + + VOD_KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id + key_json = self._download_json(VOD_KEY_ENDPOINT, video_id, headers={"referer": "https://www.vlive.tv"}) key = key_json["inkey"] From 1923b146b378aed234f3cc91a61eb9c5aec2f684 Mon Sep 17 00:00:00 2001 From: exwm Date: Sun, 1 Nov 2020 15:40:47 -0500 Subject: [PATCH 042/124] [vlive] add: support new channel url format --- youtube_dlc/extractor/vlive.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index abbcfb32b..98c405f21 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -220,15 +220,22 @@ def _download_init_page(self, video_id): class VLiveChannelIE(InfoExtractor): IE_NAME = 'vlive:channel' - _VALID_URL = r'https?://channels\.vlive\.tv/(?P[0-9A-Z]+)' - _TEST = { - 'url': 'http://channels.vlive.tv/FCD4B', + _VALID_URL = r'https?://(?:(?:www|m)\.)?(?:channels\.vlive\.tv/|vlive\.tv/channels?/)(?P[0-9A-Z]+)' + _TESTS = [{ + 'url': 'https://channels.vlive.tv/FCD4B', 'info_dict': { 'id': 'FCD4B', 'title': 'MAMAMOO', }, 'playlist_mincount': 110 - } + }, { + 'url': 'https://www.vlive.tv/channel/FCD4B', + 'info_dict': { + 'id': 'FCD4B', + 'title': 'MAMAMOO', + }, + 'playlist_mincount': 110 + }] _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' def _real_extract(self, url): From 8ba3ad0a48bcc2e12f2ed82c0c5e0999e5e94281 Mon Sep 17 00:00:00 2001 From: exwm Date: Sun, 1 Nov 2020 21:15:45 -0500 Subject: [PATCH 043/124] [vlive] fix: fetching live video not yet uploaded for replay --- youtube_dlc/extractor/vlive.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index 98c405f21..70d5d8dfb 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -144,8 +144,10 @@ def _real_extract(self, url): expected=True) elif video_type in ('LIVE'): video_status = video_params["status"] - if video_status == 'RESERVED': + 
if video_status in ('RESERVED'): raise ExtractorError('Coming soon!', expected=True) + elif video_status in ('ENDED', 'END'): + raise ExtractorError('Uploading for replay. Please wait...', expected=True) else: return self._live(video_id, webpage, params) else: From 341736255610aea3920d9e8bf627705fdb6756b1 Mon Sep 17 00:00:00 2001 From: exwm Date: Sun, 1 Nov 2020 21:26:17 -0500 Subject: [PATCH 044/124] [vlive] fix: vod logic wrongly used for live video --- youtube_dlc/extractor/vlive.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index 70d5d8dfb..5c8988c92 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -127,18 +127,12 @@ def _real_extract(self, url): raise ExtractorError('Failed to extract video parameters.') video_id = working_id if 'video' in url else str(video_params["videoSeq"]) - long_video_id = video_params["vodId"] + video_type = video_params["type"] - - VOD_KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id - key_json = self._download_json(VOD_KEY_ENDPOINT, video_id, - headers={"referer": "https://www.vlive.tv"}) - key = key_json["inkey"] - if video_type in ('VOD'): encoding_status = video_params["encodingStatus"] if encoding_status == 'COMPLETE': - return self._replay(video_id, webpage, long_video_id, key, params) + return self._replay(video_id, webpage, params, video_params) else: raise ExtractorError('VOD encoding not yet complete. Please try again later.', expected=True) @@ -193,7 +187,13 @@ def _live(self, video_id, webpage, params): }) return info - def _replay(self, video_id, webpage, long_video_id, key, params): + def _replay(self, video_id, webpage, params, video_params): + VOD_KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id + key_json = self._download_json(VOD_KEY_ENDPOINT, video_id, + headers={"referer": "https://www.vlive.tv"}) + key = key_json["inkey"] + long_video_id = video_params["vodId"] + if '' in (long_video_id, key): init_page = self._download_init_page(video_id) video_info = self._parse_json(self._search_regex( From 73cc1b9125b5f2f80d777f746c16b5e73b92ddd5 Mon Sep 17 00:00:00 2001 From: exwm Date: Mon, 2 Nov 2020 12:19:16 -0500 Subject: [PATCH 045/124] [vlive] fix: live video extractor * use live video info endpoint from v3 api --- youtube_dlc/extractor/vlive.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index 5c8988c92..874f5203e 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -162,19 +162,16 @@ def _get_common_fields(self, webpage, params): } def _live(self, video_id, webpage, params): - init_page = self._download_init_page(video_id) + LIVE_INFO_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/old/v3/live/%s/playInfo' % video_id + play_info = self._download_json(LIVE_INFO_ENDPOINT, video_id, + headers={"referer": "https://www.vlive.tv"}) - live_params = self._search_regex( - r'"liveStreamInfo"\s*:\s*(".*"),', - init_page, 'live stream info') - live_params = self._parse_json(live_params, video_id) - live_params = self._parse_json(live_params, video_id) + streams = try_get(play_info, lambda x: x["result"]["streamList"]) or [] formats = [] - for vid in live_params.get('resolutions', []): + for stream in streams: formats.extend(self._extract_m3u8_formats( - vid['cdnUrl'], video_id, 'mp4', - 
m3u8_id=vid.get('name'), + stream['serviceUrl'], video_id, 'mp4', fatal=False, live=True)) self._sort_formats(formats) From 130599af9476284e7f0b3be4f68a0ff8346fb6ea Mon Sep 17 00:00:00 2001 From: exwm Date: Mon, 2 Nov 2020 18:34:54 -0500 Subject: [PATCH 046/124] [vlive] fix: raise login required error on vlive+ --- youtube_dlc/extractor/vlive.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index 874f5203e..38d78eda1 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -120,8 +120,15 @@ def _real_extract(self, url): params = self._parse_json(params, working_id, fatal=False) video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"]) + if video_params is None: - if 'post' in url: + error_data = try_get(params, lambda x: x["postDetail"]["error"]["data"]) + product_type = try_get(error_data, + [lambda x: x["officialVideo"]["productType"], + lambda x: x["board"]["boardType"]]) + if product_type in ('VLIVE_PLUS', 'VLIVE+'): + self.raise_login_required('This video is only available for VLIVE+ subscribers') + elif 'post' in url: raise ExtractorError('Url does not appear to be a video post.') else: raise ExtractorError('Failed to extract video parameters.') @@ -191,17 +198,6 @@ def _replay(self, video_id, webpage, params, video_params): key = key_json["inkey"] long_video_id = video_params["vodId"] - if '' in (long_video_id, key): - init_page = self._download_init_page(video_id) - video_info = self._parse_json(self._search_regex( - (r'(?s)oVideoStatus\s*=\s*({.+?})\s* Date: Sun, 12 Apr 2020 23:27:58 +0200 Subject: [PATCH 047/124] [zoomus] Add new extractor --- youtube_dlc/extractor/extractors.py | 1 + youtube_dlc/extractor/zoomus.py | 51 +++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 youtube_dlc/extractor/zoomus.py diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 666134d86..34a8cecd5 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -1544,4 +1544,5 @@ ) from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE +from .zoomus import ZoomUSIE from .zype import ZypeIE diff --git a/youtube_dlc/extractor/zoomus.py b/youtube_dlc/extractor/zoomus.py new file mode 100644 index 000000000..150dbced7 --- /dev/null +++ b/youtube_dlc/extractor/zoomus.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + try_get, + url_or_none, +) + + +class ZoomUSIE(InfoExtractor): + IE_NAME = 'zoom.us' + _VALID_URL = r'https://zoom.us/recording/play/(?P.*)' + + _TESTS = [{ + 'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK', + 'info_dict': { + 'ext': 'mp4', + 'topic': "GAZ Transformational Tuesdays W/ Landon & Stapes", + 'recordFileName': "Shared screen with speaker view", + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + #cookie = self._get_cookies(url)['_zm_ssid'] + + video_url = self._search_regex(r"viewMp4Url: \'(.*)\'", webpage, 'video url') + topic = self._search_regex(r"topic: \"(.*)\",", webpage, 'video url') + viewResolvtionsWidth = self._search_regex(r"viewResolvtionsWidth: (.*),", webpage, 'res width') + viewResolvtionsHeight = self._search_regex(r"viewResolvtionsHeight: (.*),", 
webpage, 'res width') + + formats = [] + formats.append({ + 'url': video_url, + 'width': int_or_none(viewResolvtionsWidth), + 'height': int_or_none(viewResolvtionsHeight), + 'http_headers': {'Accept': 'video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5', + 'Referer': 'https://zoom.us/', + } + }) + self._sort_formats(formats) + + return { + 'id': display_id, + 'title': topic, + 'formats': formats + } \ No newline at end of file From ef6be42014694bf67afb38b19e951180a5d0e9fb Mon Sep 17 00:00:00 2001 From: Roman Sebastian Karwacik Date: Sun, 12 Apr 2020 23:40:00 +0200 Subject: [PATCH 048/124] [zoomus] Allow for more urls --- youtube_dlc/extractor/zoomus.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/zoomus.py b/youtube_dlc/extractor/zoomus.py index 150dbced7..cdcf026e8 100644 --- a/youtube_dlc/extractor/zoomus.py +++ b/youtube_dlc/extractor/zoomus.py @@ -12,7 +12,7 @@ class ZoomUSIE(InfoExtractor): IE_NAME = 'zoom.us' - _VALID_URL = r'https://zoom.us/recording/play/(?P.*)' + _VALID_URL = r'https://(.*).?zoom.us/rec(ording)?/play/(?P.*)' _TESTS = [{ 'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK', @@ -26,7 +26,6 @@ class ZoomUSIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - #cookie = self._get_cookies(url)['_zm_ssid'] video_url = self._search_regex(r"viewMp4Url: \'(.*)\'", webpage, 'video url') topic = self._search_regex(r"topic: \"(.*)\",", webpage, 'video url') From 55cd2999edad0c9b148d5e9334a74be55bdb668c Mon Sep 17 00:00:00 2001 From: Roman Sebastian Karwacik Date: Mon, 13 Apr 2020 00:18:40 +0200 Subject: [PATCH 049/124] [zoomus] Cleanup --- youtube_dlc/extractor/zoomus.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/youtube_dlc/extractor/zoomus.py b/youtube_dlc/extractor/zoomus.py index cdcf026e8..a0e34801f 100644 --- a/youtube_dlc/extractor/zoomus.py +++ b/youtube_dlc/extractor/zoomus.py @@ -4,9 +4,6 @@ from .common import InfoExtractor from ..utils import ( int_or_none, - parse_iso8601, - try_get, - url_or_none, ) @@ -14,14 +11,15 @@ class ZoomUSIE(InfoExtractor): IE_NAME = 'zoom.us' _VALID_URL = r'https://(.*).?zoom.us/rec(ording)?/play/(?P.*)' - _TESTS = [{ + _TEST = { 'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK', 'info_dict': { - 'ext': 'mp4', - 'topic': "GAZ Transformational Tuesdays W/ Landon & Stapes", - 'recordFileName': "Shared screen with speaker view", + 'md5': '031a5b379f1547a8b29c5c4c837dccf2', + 'title': "GAZ Transformational Tuesdays W/ Landon & Stapes", + 'id': "SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK", + 'ext': "mp4", } - }] + } def _real_extract(self, url): display_id = self._match_id(url) @@ -37,9 +35,8 @@ def _real_extract(self, url): 'url': video_url, 'width': int_or_none(viewResolvtionsWidth), 'height': int_or_none(viewResolvtionsHeight), - 'http_headers': {'Accept': 'video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5', - 'Referer': 'https://zoom.us/', - } + 'http_headers': {'Accept': 'video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5', + 'Referer': 'https://zoom.us/'} }) self._sort_formats(formats) @@ -47,4 +44,4 @@ def _real_extract(self, url): 'id': display_id, 'title': topic, 'formats': formats - } \ No newline at end of file + } From 
abd273e17bb324296a81ea82be398e478ecdfa60 Mon Sep 17 00:00:00 2001 From: Roman Sebastian Karwacik Date: Mon, 13 Apr 2020 07:27:56 +0200 Subject: [PATCH 050/124] [zoomus] coding conventions --- youtube_dlc/extractor/zoomus.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/youtube_dlc/extractor/zoomus.py b/youtube_dlc/extractor/zoomus.py index a0e34801f..75a1b6375 100644 --- a/youtube_dlc/extractor/zoomus.py +++ b/youtube_dlc/extractor/zoomus.py @@ -4,12 +4,14 @@ from .common import InfoExtractor from ..utils import ( int_or_none, + url_or_none, + parse_filesize ) class ZoomUSIE(InfoExtractor): IE_NAME = 'zoom.us' - _VALID_URL = r'https://(.*).?zoom.us/rec(ording)?/play/(?P.*)' + _VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/play/(?P[^?&=]{64})' _TEST = { 'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK', @@ -17,31 +19,33 @@ class ZoomUSIE(InfoExtractor): 'md5': '031a5b379f1547a8b29c5c4c837dccf2', 'title': "GAZ Transformational Tuesdays W/ Landon & Stapes", 'id': "SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK", - 'ext': "mp4", + 'ext': "mp4" } } def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_url = self._search_regex(r"viewMp4Url: \'(.*)\'", webpage, 'video url') - topic = self._search_regex(r"topic: \"(.*)\",", webpage, 'video url') - viewResolvtionsWidth = self._search_regex(r"viewResolvtionsWidth: (.*),", webpage, 'res width') - viewResolvtionsHeight = self._search_regex(r"viewResolvtionsHeight: (.*),", webpage, 'res width') + title = self._html_search_regex([r"topic: \"(.*)\",", r"(.*) - Zoom"], webpage, 'title') + viewResolvtionsWidth = self._search_regex(r"viewResolvtionsWidth: (\d*)", webpage, 'res width', fatal=False) + viewResolvtionsHeight = self._search_regex(r"viewResolvtionsHeight: (\d*)", webpage, 'res height', fatal=False) + fileSize = parse_filesize(self._search_regex(r"fileSize: \'(.+)\'", webpage, 'fileSize', fatal=False)) formats = [] formats.append({ - 'url': video_url, + 'url': url_or_none(video_url), 'width': int_or_none(viewResolvtionsWidth), 'height': int_or_none(viewResolvtionsHeight), 'http_headers': {'Accept': 'video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5', - 'Referer': 'https://zoom.us/'} + 'Referer': 'https://zoom.us/'}, + 'ext': "mp4", + 'filesize_approx': int_or_none(fileSize) }) self._sort_formats(formats) return { 'id': display_id, - 'title': topic, + 'title': title, 'formats': formats } From 81acad1279c59edf63ceb3348437521715276210 Mon Sep 17 00:00:00 2001 From: Roman Sebastian Karwacik Date: Mon, 20 Apr 2020 16:20:54 +0200 Subject: [PATCH 051/124] [zoomus] Added support for password protected videos --- youtube_dlc/extractor/zoomus.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/zoomus.py b/youtube_dlc/extractor/zoomus.py index 75a1b6375..eb8b0fd0c 100644 --- a/youtube_dlc/extractor/zoomus.py +++ b/youtube_dlc/extractor/zoomus.py @@ -3,9 +3,11 @@ from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, url_or_none, - parse_filesize + parse_filesize, + urlencode_postdata ) @@ -26,6 +28,12 @@ class ZoomUSIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) + + password_protected = self._search_regex(r']+?id="(password_form)"', webpage, 'password 
field', fatal=False)
+        if password_protected is not None:
+            self._verify_video_password(url, display_id, webpage)
+            webpage = self._download_webpage(url, display_id)
+
         video_url = self._search_regex(r"viewMp4Url: \'(.*)\'", webpage, 'video url')
         title = self._html_search_regex([r"topic: \"(.*)\",", r"(.*) - Zoom"], webpage, 'title')
         viewResolvtionsWidth = self._search_regex(r"viewResolvtionsWidth: (\d*)", webpage, 'res width', fatal=False)
@@ -49,3 +57,24 @@ def _real_extract(self, url):
         'title': title,
         'formats': formats
     }
+
+    def _verify_video_password(self, url, video_id, webpage):
+        password = self._downloader.params.get('videopassword')
+        if password is None:
+            raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
+        meetId = self._search_regex(r']+?id="meetId" value="([^\"]+)"', webpage, 'meetId')
+        data = urlencode_postdata({
+            'id': meetId,
+            'passwd': password,
+            'action': "viewdetailedpage",
+            'recaptcha': ""
+        })
+        validation_url = url.split("zoom.us")[0]+"zoom.us/rec/validate_meet_passwd"
+        validation_response = self._download_json(
+            validation_url, video_id,
+            note='Validating Password...',
+            errnote='Wrong password?',
+            data=data)
+
+        if validation_response['errorCode'] != 0:
+            raise ExtractorError('Login failed, %s said: %r' % (self.IE_NAME, validation_response['errorMessage']))

From 81acad1279c59edf63ceb3348437521715276210 Mon Sep 17 00:00:00 2001
From: Roman Sebastian Karwacik <roman.karwacik@rwth-aachen.de>
Date: Tue, 21 Apr 2020 09:48:35 +0200
Subject: [PATCH 052/124] [zoomus] Adjusted referer header, fixed formatting for flake8

---
 youtube_dlc/extractor/zoomus.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/youtube_dlc/extractor/zoomus.py b/youtube_dlc/extractor/zoomus.py
index eb8b0fd0c..f61f35da8 100644
--- a/youtube_dlc/extractor/zoomus.py
+++ b/youtube_dlc/extractor/zoomus.py
@@ -40,13 +40,15 @@ def _real_extract(self, url):
         viewResolvtionsHeight = self._search_regex(r"viewResolvtionsHeight: (\d*)", webpage, 'res height', fatal=False)
         fileSize = parse_filesize(self._search_regex(r"fileSize: \'(.+)\'", webpage, 'fileSize', fatal=False))
 
+        urlprefix = url.split("zoom.us")[0] + "zoom.us/"
+
         formats = []
         formats.append({
             'url': url_or_none(video_url),
             'width': int_or_none(viewResolvtionsWidth),
             'height': int_or_none(viewResolvtionsHeight),
             'http_headers': {'Accept': 'video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5',
-                             'Referer': 'https://zoom.us/'},
+                             'Referer': urlprefix},
             'ext': "mp4",
             'filesize_approx': int_or_none(fileSize)
         })
@@ -69,7 +71,7 @@ def _verify_video_password(self, url, video_id, webpage):
             'action': "viewdetailedpage",
             'recaptcha': ""
         })
-        validation_url = url.split("zoom.us")[0]+"zoom.us/rec/validate_meet_passwd"
+        validation_url = url.split("zoom.us")[0] + "zoom.us/rec/validate_meet_passwd"
         validation_response = self._download_json(
             validation_url, video_id,
             note='Validating Password...',
             errnote='Wrong password?',
             data=data)

From b11a88fc243a078c2addbcf0d1377bd65495bc05 Mon Sep 17 00:00:00 2001
From: Roman Sebastian Karwacik <roman.karwacik@rwth-aachen.de>
Date: Tue, 2 Jun 2020 13:07:10 +0200
Subject: [PATCH 053/124] [zoomus] Adjusted url regex, now allowing for arbitrarily long ids, don't throw warning if password field not found

---
 youtube_dlc/extractor/zoomus.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/youtube_dlc/extractor/zoomus.py b/youtube_dlc/extractor/zoomus.py
index f61f35da8..9aae30d37 100644
--- a/youtube_dlc/extractor/zoomus.py
+++ b/youtube_dlc/extractor/zoomus.py
@@ -13,7 +13,7 @@ class 
ZoomUSIE(InfoExtractor): IE_NAME = 'zoom.us' - _VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/play/(?P[^?&=]{64})' + _VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/play/(?P[A-Za-z0-9\-_]+)' _TEST = { 'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK', @@ -29,7 +29,7 @@ def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - password_protected = self._search_regex(r']+?id="(password_form)"', webpage, 'password field', fatal=False) + password_protected = self._search_regex(r']+?id="(password_form)"', webpage, 'password field', fatal=False, default=None) if password_protected is not None: self._verify_video_password(url, display_id, webpage) webpage = self._download_webpage(url, display_id) From 471115dbeefb899ee036d3e769da1f90070664b6 Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel Date: Tue, 3 Nov 2020 10:31:31 +0100 Subject: [PATCH 054/124] [skip travis] add option to use pip to use master --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 5a26906ac..83e51f68b 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,10 @@ # INSTALLATION python -m pip install --upgrade youtube-dlc +If you want to install the current master branch + + python -m pip install git+https://github.com/blackjack4494/yt-dlc + **UNIX** (Linux, macOS, etc.) Using wget: From 15f6397c197af9ad464b2c385e3c8d4192aadb07 Mon Sep 17 00:00:00 2001 From: insaneracist Date: Tue, 3 Nov 2020 07:15:16 -0800 Subject: [PATCH 055/124] [youtube] get mix playlist title from ytInitialData --- youtube_dlc/extractor/youtube.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index d8f0dab1f..d736daa40 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -2825,6 +2825,7 @@ def _extract_mix(self, playlist_id): # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id ids = [] + yt_initial = None last_id = playlist_id[-11:] for n in itertools.count(1): url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) @@ -2858,6 +2859,9 @@ def _extract_mix(self, playlist_id): or search_title('title')) title = clean_html(title_span) + if not title: + title = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['title'], compat_str) + return self.playlist_result(url_results, playlist_id, title) def _extract_playlist(self, playlist_id): From be5d6c213cc68ab0ae3764db7c3fd9ed128b3ff3 Mon Sep 17 00:00:00 2001 From: exwm Date: Tue, 3 Nov 2020 20:59:23 -0500 Subject: [PATCH 056/124] [vlive] refactor: delete dead function code --- youtube_dlc/extractor/vlive.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index 38d78eda1..a205af921 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -202,16 +202,6 @@ def _replay(self, video_id, webpage, params, video_params): self._get_common_fields(webpage, params), self._extract_video_info(video_id, long_video_id, key)) - def _download_init_page(self, video_id): - return self._download_webpage( - 'https://www.vlive.tv/video/init/view', - video_id, note='Downloading live webpage', - data=urlencode_postdata({'videoSeq': video_id}), - headers={ - 'Referer': 'https://www.vlive.tv/video/%s' % video_id, - 'Content-Type': 
'application/x-www-form-urlencoded' - }) - class VLiveChannelIE(InfoExtractor): IE_NAME = 'vlive:channel' From c434e9f504ed93ae851ff6b6b46051c91b0ec213 Mon Sep 17 00:00:00 2001 From: exwm Date: Tue, 3 Nov 2020 21:05:19 -0500 Subject: [PATCH 057/124] [vlive] fix: missing expected types for try_get --- youtube_dlc/extractor/vlive.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index a205af921..fe9788d8f 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -119,13 +119,14 @@ def _real_extract(self, url): PARAMS_RE, webpage, PARAMS_FIELD, default='', flags=re.DOTALL) params = self._parse_json(params, working_id, fatal=False) - video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"]) + video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"], dict) if video_params is None: - error_data = try_get(params, lambda x: x["postDetail"]["error"]["data"]) + error_data = try_get(params, lambda x: x["postDetail"]["error"]["data"], dict) product_type = try_get(error_data, [lambda x: x["officialVideo"]["productType"], - lambda x: x["board"]["boardType"]]) + lambda x: x["board"]["boardType"]], + compat_str) if product_type in ('VLIVE_PLUS', 'VLIVE+'): self.raise_login_required('This video is only available for VLIVE+ subscribers') elif 'post' in url: @@ -173,7 +174,7 @@ def _live(self, video_id, webpage, params): play_info = self._download_json(LIVE_INFO_ENDPOINT, video_id, headers={"referer": "https://www.vlive.tv"}) - streams = try_get(play_info, lambda x: x["result"]["streamList"]) or [] + streams = try_get(play_info, lambda x: x["result"]["streamList"], list) or [] formats = [] for stream in streams: From 9c8bc84fd2000a90418aae17d89eb20f2418f54b Mon Sep 17 00:00:00 2001 From: exwm Date: Tue, 3 Nov 2020 21:27:49 -0500 Subject: [PATCH 058/124] [vlive] add: improved video extractor errors --- youtube_dlc/extractor/vlive.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index fe9788d8f..935560b57 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -122,15 +122,24 @@ def _real_extract(self, url): video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"], dict) if video_params is None: - error_data = try_get(params, lambda x: x["postDetail"]["error"]["data"], dict) + error = try_get(params, lambda x: x["postDetail"]["error"], dict) + error_data = try_get(error, lambda x: x["data"], dict) + error_video = try_get(error_data, lambda x: x["officialVideo"], dict) + error_msg = try_get(error, lambda x: x["message"], compat_str) product_type = try_get(error_data, [lambda x: x["officialVideo"]["productType"], lambda x: x["board"]["boardType"]], compat_str) - if product_type in ('VLIVE_PLUS', 'VLIVE+'): - self.raise_login_required('This video is only available for VLIVE+ subscribers') + + if error_video is not None: + if product_type in ('VLIVE_PLUS', 'VLIVE+'): + self.raise_login_required('This video is only available with V LIVE+.') + elif error_msg is not None: + raise ExtractorError('V LIVE reported the following error: %s' % error_msg) + else: + raise ExtractorError('Failed to extract video parameters.') elif 'post' in url: - raise ExtractorError('Url does not appear to be a video post.') + raise ExtractorError('Url does not appear to be a video post.', expected=True) else: raise 
ExtractorError('Failed to extract video parameters.') @@ -193,11 +202,12 @@ def _live(self, video_id, webpage, params): return info def _replay(self, video_id, webpage, params, video_params): + long_video_id = video_params["vodId"] + VOD_KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id key_json = self._download_json(VOD_KEY_ENDPOINT, video_id, headers={"referer": "https://www.vlive.tv"}) key = key_json["inkey"] - long_video_id = video_params["vodId"] return merge_dicts( self._get_common_fields(webpage, params), From ab36800b1fc7c17ab587bfe8015a0260db635efb Mon Sep 17 00:00:00 2001 From: nixxo Date: Wed, 4 Nov 2020 18:14:02 +0100 Subject: [PATCH 059/124] [la7] fix missing protocol --- youtube_dlc/extractor/la7.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dlc/extractor/la7.py b/youtube_dlc/extractor/la7.py index f5d4564fa..74b006fb5 100644 --- a/youtube_dlc/extractor/la7.py +++ b/youtube_dlc/extractor/la7.py @@ -36,6 +36,9 @@ class LA7IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + if not url.startswith('http'): + url = '%s//%s' % (self.http_scheme(), url) + webpage = self._download_webpage(url, video_id) player_data = self._search_regex( From 659ddd7f7055baa8742433c2b73f01b3a1e2505f Mon Sep 17 00:00:00 2001 From: insaneracist Date: Wed, 4 Nov 2020 10:06:53 -0800 Subject: [PATCH 060/124] [youtube] fix: Youtube Music playlists --- youtube_dlc/extractor/youtube.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 2e70ad6fa..d6550a776 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -2965,9 +2965,12 @@ def _real_extract(self, url): if video: return video + youtube_music_playlist_prefix = 'RDCLAK5uy_' if playlist_id.startswith(('RD', 'UL', 'PU')): - # Mixes require a custom extraction process - return self._extract_mix(playlist_id) + if not playlist_id.startswith(youtube_music_playlist_prefix): + # Mixes require a custom extraction process, + # Youtube Music playlists act like normal playlists (with randomized order) + return self._extract_mix(playlist_id) has_videos, playlist = self._extract_playlist(playlist_id) if has_videos or not video_id: From 7f4f0b21c26b59a1d621e6407ea2f4ed6c1a98be Mon Sep 17 00:00:00 2001 From: insaneracist Date: Wed, 4 Nov 2020 12:00:51 -0800 Subject: [PATCH 061/124] [youtube] added Youtube Music channel info --- youtube_dlc/extractor/youtube.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index d6550a776..cd4e844a0 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -2631,6 +2631,12 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P\d+))?(?:[^>]+>(?P[^<]+))?)?' 
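# The routing established by the two youtube-music commits above can be stated
# on its own: a list ID beginning with 'RD', 'UL' or 'PU' normally marks a mix
# that needs the custom mix-extraction loop, but the longer 'RDCLAK5uy_' prefix
# marks a YouTube Music playlist that behaves like an ordinary playlist.
# A minimal sketch of that dispatch, assuming hypothetical handler names
# rather than the real extractor methods:
def classify_list_id(playlist_id):
    if playlist_id.startswith('RDCLAK5uy_'):
        return 'playlist'  # YouTube Music: ordinary playlist extraction
    if playlist_id.startswith(('RD', 'UL', 'PU')):
        return 'mix'       # true mixes need the mix-specific loop
    return 'playlist'

assert classify_list_id('RDCLAK5uy_abc123') == 'playlist'
assert classify_list_id('RDJyE9OF03cao') == 'mix'
assert classify_list_id('PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc') == 'playlist'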
_VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})' IE_NAME = 'youtube:playlist' + _YTM_PLAYLIST_PREFIX = 'RDCLAK5uy_' + _YTM_CHANNEL_INFO = { + 'uploader': 'Youtube Music', + 'uploader_id': 'music', # or "UC-9-kyTW8ZkZNDHQJ6FgpwQ" + 'uploader_url': 'https://www.youtube.com/music' + } _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'info_dict': { @@ -2936,6 +2942,8 @@ def _extract_playlist(self, playlist_id): 'uploader_id': uploader_id, 'uploader_url': uploader_url, }) + if playlist_id.startswith(self._YTM_PLAYLIST_PREFIX): + playlist.update(self._YTM_CHANNEL_INFO) return has_videos, playlist @@ -2965,9 +2973,8 @@ def _real_extract(self, url): if video: return video - youtube_music_playlist_prefix = 'RDCLAK5uy_' if playlist_id.startswith(('RD', 'UL', 'PU')): - if not playlist_id.startswith(youtube_music_playlist_prefix): + if not playlist_id.startswith(self._YTM_PLAYLIST_PREFIX): # Mixes require a custom extraction process, # Youtube Music playlists act like normal playlists (with randomized order) return self._extract_mix(playlist_id) From 366a7a4753944802ed88638decd683f7472de53e Mon Sep 17 00:00:00 2001 From: insaneracist <insaneracist@cyberdude.com> Date: Wed, 4 Nov 2020 12:13:51 -0800 Subject: [PATCH 062/124] [zoom] rename extractor from zoomus --- youtube_dlc/extractor/extractors.py | 2 +- youtube_dlc/extractor/{zoomus.py => zoom.py} | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) rename youtube_dlc/extractor/{zoomus.py => zoom.py} (98%) diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 34a8cecd5..24c107598 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -1544,5 +1544,5 @@ ) from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE -from .zoomus import ZoomUSIE +from .zoom import ZoomIE from .zype import ZypeIE diff --git a/youtube_dlc/extractor/zoomus.py b/youtube_dlc/extractor/zoom.py similarity index 98% rename from youtube_dlc/extractor/zoomus.py rename to youtube_dlc/extractor/zoom.py index 9aae30d37..003e1f901 100644 --- a/youtube_dlc/extractor/zoomus.py +++ b/youtube_dlc/extractor/zoom.py @@ -11,8 +11,8 @@ ) -class ZoomUSIE(InfoExtractor): - IE_NAME = 'zoom.us' +class ZoomIE(InfoExtractor): + IE_NAME = 'zoom' _VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/play/(?P<id>[A-Za-z0-9\-_]+)' _TEST = { From 503d4a44f65146a63bf1bd5c04ac510a04fe0d33 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Thu, 5 Nov 2020 01:47:52 +0530 Subject: [PATCH 063/124] Don't try to embed/convert json subtitles generated by youtube livechat --- youtube_dlc/postprocessor/ffmpeg.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/postprocessor/ffmpeg.py b/youtube_dlc/postprocessor/ffmpeg.py index 5e85f4eeb..c38db3143 100644 --- a/youtube_dlc/postprocessor/ffmpeg.py +++ b/youtube_dlc/postprocessor/ffmpeg.py @@ -412,7 +412,9 @@ def run(self, information): for lang, sub_info in subtitles.items(): sub_ext = sub_info['ext'] - if ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': + if sub_ext == 'json': + self._downloader.to_screen('[ffmpeg] JSON subtitles cannot be embedded') + elif ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': sub_langs.append(lang) sub_filenames.append(subtitles_filename(filename, lang, sub_ext, ext)) else: @@ -643,13 +645,18 @@ def run(self, info): self._downloader.to_screen( '[ffmpeg] Subtitle file for %s is already in the requested format' % new_ext) 
continue + elif ext == 'json': + self._downloader.to_screen( + '[ffmpeg] You have requested to convert json subtitles into another format, ' + 'which is currently not possible') + continue old_file = subtitles_filename(filename, lang, ext, info.get('ext')) sub_filenames.append(old_file) new_file = subtitles_filename(filename, lang, new_ext, info.get('ext')) if ext in ('dfxp', 'ttml', 'tt'): self._downloader.report_warning( - 'You have requested to convert dfxp (TTML) subtitles into another format, ' + '[ffmpeg] You have requested to convert dfxp (TTML) subtitles into another format, ' 'which results in style information loss') dfxp_file = old_file From 8abd647c59c9eb8f0fefd2b329e62b2b32bac6ea Mon Sep 17 00:00:00 2001 From: nixxo <c.nixxo@gmail.com> Date: Thu, 5 Nov 2020 20:52:28 +0100 Subject: [PATCH 064/124] [mailru] removed escaped braces, use urljoin, added tests --- youtube_dlc/extractor/mailru.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/mailru.py b/youtube_dlc/extractor/mailru.py index 6fdf70aa6..5bfe40649 100644 --- a/youtube_dlc/extractor/mailru.py +++ b/youtube_dlc/extractor/mailru.py @@ -12,6 +12,7 @@ parse_duration, remove_end, try_get, + urljoin, ) @@ -93,6 +94,14 @@ class MailRuIE(InfoExtractor): { 'url': 'https://my.mail.ru//list//sinyutin10/video/_myvideo/4.html', 'only_matching': True, + }, + { + 'url': 'https://my.mail.ru/mail/cloud-strife/video/embed/Games/2009', + 'only_matching': True, + }, + { + 'url': 'https://videoapi.my.mail.ru/videos/embed/mail/cloud-strife/Games/2009.html', + 'only_matching': True, } ] @@ -110,7 +119,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) page_config = self._parse_json(self._search_regex([ r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>', - r'(?s)"video":\s*(\{.+?\}),'], + r'(?s)"video":\s*({.+?}),'], webpage, 'page config', default='{}'), video_id, fatal=False) if page_config: meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl') or page_config.get('metadataUrl') @@ -121,7 +130,7 @@ def _real_extract(self, url): # fix meta_url if missing the host address if re.match(r'^\/\+\/', meta_url): - meta_url = 'https://my.mail.ru' + meta_url + meta_url = urljoin('https://my.mail.ru', meta_url) if meta_url: video_data = self._download_json( From 987d2e079ad0fd45df19b6183d38f83bcd528e9d Mon Sep 17 00:00:00 2001 From: The Hatsune Daishi <nao20010128@gmail.com> Date: Fri, 6 Nov 2020 15:15:07 +0900 Subject: [PATCH 065/124] [instagram] Fix extractor --- youtube_dlc/extractor/instagram.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/youtube_dlc/extractor/instagram.py b/youtube_dlc/extractor/instagram.py index b061850a1..bbfe23c76 100644 --- a/youtube_dlc/extractor/instagram.py +++ b/youtube_dlc/extractor/instagram.py @@ -126,16 +126,23 @@ def _real_extract(self, url): uploader_id, like_count, comment_count, comments, height, width) = [None] * 11 - shared_data = self._parse_json( - self._search_regex( - r'window\._sharedData\s*=\s*({.+?});', - webpage, 'shared data', default='{}'), - video_id, fatal=False) + shared_data = try_get(webpage, + (lambda x: self._parse_json( + self._search_regex( + r'window\.__additionalDataLoaded\(\'/(?:p|tv)/(?:[^/?#&]+)/\',({.+?})\);', + x, 'additional data', default='{}'), + video_id, fatal=False), + lambda x: self._parse_json( + self._search_regex( + r'window\._sharedData\s*=\s*({.+?});', + x, 'shared data', default='{}'), + video_id, 
fatal=False)['entry_data']['PostPage'][0]), + None) if shared_data: media = try_get( shared_data, - (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], - lambda x: x['entry_data']['PostPage'][0]['media']), + (lambda x: x['graphql']['shortcode_media'], + lambda x: x['media']), dict) if media: video_url = media.get('video_url') From 5db4014b2367317fc6875aeb8fddc374b5225074 Mon Sep 17 00:00:00 2001 From: Unknown <blackjack4494@web.de> Date: Sat, 7 Nov 2020 15:05:05 +0100 Subject: [PATCH 066/124] [skip travis] readme and pypi update --- README.md | 3 ++- setup.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 83e51f68b..f884ad067 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,14 @@ [![PyPi](https://img.shields.io/pypi/v/youtube-dlc.svg)](https://pypi.org/project/youtube-dlc) [![Gitter chat](https://img.shields.io/gitter/room/youtube-dlc/community)](https://gitter.im/youtube-dlc) -[![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](https://github.com/blackjack4494/youtube-dlc/blob/master/LICENSE) +[![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](https://github.com/blackjack4494/yt-dlc/blob/master/LICENSE) youtube-dlc - download videos from youtube.com or other video platforms. youtube-dlc is a fork of youtube-dl with the intention of getting features tested by the community merged in the tool faster, since youtube-dl's development seems to be slowing down. (https://web.archive.org/web/20201014194602/https://github.com/ytdl-org/youtube-dl/issues/26462) - [INSTALLATION](#installation) +- [UPDATE](#update) - [DESCRIPTION](#description) - [OPTIONS](#options) - [Network Options:](#network-options) diff --git a/setup.py b/setup.py index a10ef0a77..6908f2404 100644 --- a/setup.py +++ b/setup.py @@ -66,7 +66,7 @@ def run(self): description=DESCRIPTION, long_description=LONG_DESCRIPTION, # long_description_content_type="text/markdown", - url="https://github.com/blackjack4494/youtube-dlc", + url="https://github.com/blackjack4494/yt-dlc", packages=find_packages(exclude=("youtube_dl","test",)), #packages=[ # 'youtube_dlc', From 5943bb6214eca0a4aebb223d5a5800e3a024ae35 Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel <github@tom-oliver.eu> Date: Sat, 7 Nov 2020 16:00:01 +0100 Subject: [PATCH 067/124] [skip travis] update workflow - sha file --- .github/workflows/build.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f5d94dc49..cc344f601 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -109,7 +109,7 @@ jobs: runs-on: windows-latest - needs: build_unix + needs: [build_unix, build_windows] steps: - uses: actions/checkout@v2 @@ -146,10 +146,10 @@ jobs: SHA2_UNIX: ${{ needs.build_unix.outputs.sha2_unix }} YTDLC_VERSION: ${{ needs.build_unix.outputs.ytdlc_version }} run: | - echo "version:$YTDLC_VERSION" >> SHA2-256SUMS - echo "youtube-dlc.exe:$SHA2_WINDOWS" >> SHA2-256SUMS - echo "youtube-dlc32.exe:$SHA2_WINDOWS32" >> SHA2-256SUMS - echo "youtube-dlc:$SHA2_UNIX" >> SHA2-256SUMS + echo "version:${env:YTDLC_VERSION}" >> SHA2-256SUMS + echo "youtube-dlc.exe:${env:SHA2_WINDOWS}" >> SHA2-256SUMS + echo "youtube-dlc_x86.exe:${env:SHA2_WINDOWS32}" >> SHA2-256SUMS + echo "youtube-dlc:${env:SHA2_UNIX}" >> SHA2-256SUMS - name: Upload 256SUMS file id: upload-sums From b860e4cc2f53c7858054f73928f51188ea6b49b8 Mon Sep 17 00:00:00 2001 From: Nicolas SAPA <nico@ByMe.at> 
Date: Sun, 8 Nov 2020 08:36:26 +0100
Subject: [PATCH 068/124] [common] Make sure self.params.get('sleep_interval_subtitles') is int

This can happen if other software is using yt-dlc's API (e.g. tubeup).

The stack trace would be:
$ tubeup 'https://youtube.com/watch?v=JyE9OF03cao'
[debug] Encodings: locale UTF-8, fs utf-8, out UTF-8, pref UTF-8
[debug] youtube-dlc version 2020.10.25
[debug] Python version 3.7.3 (CPython) - Linux-5.8.0-0.bpo.2-amd64-x86_64-with-debian-10.6
[debug] exe versions: ffmpeg 3.3.9, ffprobe 3.3.9
[debug] Proxy map: {}
There are no annotations to write.
ERROR: '>' not supported between instances of 'NoneType' and 'int'
Traceback (most recent call last):
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 846, in extract_info
    return self.process_ie_result(ie_result, download, extra_info)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 901, in process_ie_result
    return self.process_video_result(ie_result, download=download)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1696, in process_video_result
    self.process_info(new_info)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1894, in process_info
    dl(sub_filename, sub_info, subtitle=True)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1866, in dl
    return fd.download(name, info, subtitle)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/downloader/common.py", line 367, in download
    if self.params.get('sleep_interval_subtitles') > 0:
TypeError: '>' not supported between instances of 'NoneType' and 'int'
---
 youtube_dlc/downloader/common.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/youtube_dlc/downloader/common.py b/youtube_dlc/downloader/common.py
index c65500d61..7d303be1c 100644
--- a/youtube_dlc/downloader/common.py
+++ b/youtube_dlc/downloader/common.py
@@ -364,8 +364,10 @@ def download(self, filename, info_dict, subtitle=False):
                         else '%.2f' % sleep_interval))
                 time.sleep(sleep_interval)
         else:
-            if self.params.get('sleep_interval_subtitles') > 0:
+            sleep_interval_sub = 0
+            if type(self.params.get('sleep_interval_subtitles')) is int:
                 sleep_interval_sub = self.params.get('sleep_interval_subtitles')
+            if sleep_interval_sub > 0:
                 self.to_screen(
                     '[download] Sleeping %s seconds...' % (
                         sleep_interval_sub))

From 8263104fe4f7aed96a1cc92be6b58cc219de876e Mon Sep 17 00:00:00 2001
From: Nicolas SAPA <nico@ByMe.at>
Date: Sun, 8 Nov 2020 08:49:03 +0100
Subject: [PATCH 069/124] [youtube] Fix 'liveChatReplayContinuationData' missing 'continuation' key

live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation'] may not exist.
So catch the KeyError.

Traceback:
$ tubeup 'https://youtube.com/watch?v=JyE9OF03cao'
[debug] Encodings: locale UTF-8, fs utf-8, out UTF-8, pref UTF-8
[debug] youtube-dlc version 2020.10.25
[debug] Python version 3.7.3 (CPython) - Linux-5.8.0-0.bpo.2-amd64-x86_64-with-debian-10.6
[debug] exe versions: ffmpeg 3.3.9, ffprobe 3.3.9
[debug] Proxy map: {}
There are no annotations to write.
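The guard added in the sleep-interval patch above generalizes: when yt-dlc is driven as a library, an options key may be absent or explicitly None, and both cases should read as 0 rather than reach the None > 0 comparison. A minimal sketch of that defensive read, with get_subtitle_sleep as a hypothetical helper name rather than real downloader code:

def get_subtitle_sleep(params):
    # API callers may omit the key or pass None explicitly; both must act
    # like 0 instead of raising on a None > 0 comparison.
    value = params.get('sleep_interval_subtitles')
    return value if isinstance(value, int) else 0

assert get_subtitle_sleep({}) == 0
assert get_subtitle_sleep({'sleep_interval_subtitles': None}) == 0
assert get_subtitle_sleep({'sleep_interval_subtitles': 5}) == 5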
[download] 452.59KiB at 615.35KiB/s (00:01)ERROR: 'liveChatReplayContinuationData'
Traceback (most recent call last):
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 846, in extract_info
    return self.process_ie_result(ie_result, download, extra_info)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 901, in process_ie_result
    return self.process_video_result(ie_result, download=download)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1696, in process_video_result
    self.process_info(new_info)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1894, in process_info
    dl(sub_filename, sub_info, subtitle=True)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1866, in dl
    return fd.download(name, info, subtitle)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/downloader/common.py", line 375, in download
    return self.real_download(filename, info_dict)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/downloader/youtube_live_chat.py", line 85, in real_download
    continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation']
KeyError: 'liveChatReplayContinuationData'
---
 youtube_dlc/downloader/youtube_live_chat.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/youtube_dlc/downloader/youtube_live_chat.py b/youtube_dlc/downloader/youtube_live_chat.py
index 4932dd9c5..b333afa5b 100644
--- a/youtube_dlc/downloader/youtube_live_chat.py
+++ b/youtube_dlc/downloader/youtube_live_chat.py
@@ -82,7 +82,10 @@ def parse_yt_initial_data(data):
                     offset = int(replay_chat_item_action['videoOffsetTimeMsec'])
                 processed_fragment.extend(
                     json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n')
-            continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation']
+            try:
+                continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation']
+            except KeyError:
+                continuation_id = None
 
             self._append_fragment(ctx, processed_fragment)

From 6857df609b60859e2864aadc61a869689d5ad2d0 Mon Sep 17 00:00:00 2001
From: WolfganP <2248211+WolfganP@users.noreply.github.com>
Date: Sun, 8 Nov 2020 14:07:12 +0000
Subject: [PATCH 070/124] ITV BTCC new pages' URL update (articles instead of races)

Not my changes, but from @franhp; they didn't get merged into yt-dl in time.
It supports BTCC's new page schema from 2019 and on (/articles/ instead of /races/).
---
 youtube_dlc/extractor/itv.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/youtube_dlc/extractor/itv.py b/youtube_dlc/extractor/itv.py
index ad2f4eca5..9817745e8 100644
--- a/youtube_dlc/extractor/itv.py
+++ b/youtube_dlc/extractor/itv.py
@@ -20,6 +20,7 @@
     merge_dicts,
     parse_duration,
     smuggle_url,
+    try_get,
     url_or_none,
     xpath_with_ns,
     xpath_element,
@@ -280,12 +281,12 @@ def extract_subtitle(sub_url):
 class ITVBTCCIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)'
     _TEST = {
-        'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch',
+        'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action',
        'info_dict': {
'id': 'btcc-2019-brands-hatch-gp-race-action', + 'title': 'BTCC 2019: Brands Hatch GP race action', }, - 'playlist_mincount': 9, + 'playlist_mincount': 12, } BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s' @@ -294,6 +295,16 @@ def _real_extract(self, url): webpage = self._download_webpage(url, playlist_id) + json_map = try_get(self._parse_json(self._html_search_regex( + '(?s)<script[^>]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)</script>', webpage, 'json_map'), playlist_id), + lambda x: x['props']['pageProps']['article']['body']['content']) or [] + + # Discard empty objects + video_ids = [] + for video in json_map: + if video['data'].get('id'): + video_ids.append(video['data']['id']) + entries = [ self.url_result( smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, { @@ -305,7 +316,7 @@ def _real_extract(self, url): 'referrer': url, }), ie=BrightcoveNewIE.ie_key(), video_id=video_id) - for video_id in re.findall(r'data-video-id=["\'](\d+)', webpage)] + for video_id in video_ids] title = self._og_search_title(webpage, fatal=False) From 85da4055c06ee5a2cf3462b2aa8404bcf7197955 Mon Sep 17 00:00:00 2001 From: WolfganP <2248211+WolfganP@users.noreply.github.com> Date: Sun, 8 Nov 2020 19:35:54 +0000 Subject: [PATCH 071/124] ITV BTCC new pages' URL update, fix on items count Fixed playlist_count as the variable was renamed --- youtube_dlc/extractor/itv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/itv.py b/youtube_dlc/extractor/itv.py index 9817745e8..20144cd82 100644 --- a/youtube_dlc/extractor/itv.py +++ b/youtube_dlc/extractor/itv.py @@ -286,7 +286,7 @@ class ITVBTCCIE(InfoExtractor): 'id': 'btcc-2019-brands-hatch-gp-race-action', 'title': 'BTCC 2019: Brands Hatch GP race action', }, - 'playlist_mincount': 12, + 'playlist_count': 12, } BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s' From 5867a1678924ad25a4784abfa5dbd28b5b69eb67 Mon Sep 17 00:00:00 2001 From: nixxo <c.nixxo@gmail.com> Date: Mon, 9 Nov 2020 10:59:25 +0100 Subject: [PATCH 072/124] [rcs] fixed embeds detection, fixed tests --- youtube_dlc/extractor/extractors.py | 3 +- youtube_dlc/extractor/rcs.py | 59 ++++++++++++++--------------- 2 files changed, 29 insertions(+), 33 deletions(-) diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index c3b76f039..ecbe68ab0 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -937,8 +937,7 @@ ) from .rbmaradio import RBMARadioIE from .rcs import ( - CorriereIE, - GazzettaIE, + RCSIE, RCSEmbedsIE, RCSVariousIE, ) diff --git a/youtube_dlc/extractor/rcs.py b/youtube_dlc/extractor/rcs.py index 8dbd9913b..830182c6d 100644 --- a/youtube_dlc/extractor/rcs.py +++ b/youtube_dlc/extractor/rcs.py @@ -14,7 +14,7 @@ ) -class RCSIE(InfoExtractor): +class RCSBaseIE(InfoExtractor): _ALL_REPLACE = { 'media2vam.corriere.it.edgesuite.net': 'media2vam-corriere-it.akamaized.net', @@ -237,7 +237,11 @@ def _real_extract(self, url): # if no video data found try search for iframes emb = RCSEmbedsIE._extract_url(page) if emb: - return self._real_extract(emb) + return { + '_type': 'url_transparent', + 'url': emb, + 'ie_key': RCSEmbedsIE.ie_key() + } if not video_data: raise ExtractorError('Video data not found in the page') @@ -247,7 +251,7 @@ def _real_extract(self, url): description = (video_data.get('description') or clean_html(video_data.get('htmlDescription'))) - uploader = 
video_data.get('provider') or mobj.gruop('cdn') + uploader = video_data.get('provider') or mobj.group('cdn') return { 'id': video_id, @@ -258,8 +262,7 @@ def _real_extract(self, url): } -class RCSEmbedsIE(RCSIE): - IE_NAME = 'rcs:rcs' +class RCSEmbedsIE(RCSBaseIE): _VALID_URL = r'''(?x) https?://(?P<vid>video)\. (?P<cdn> @@ -279,6 +282,16 @@ class RCSEmbedsIE(RCSIE): 'description': 'md5:65b09633df9ffee57f48b39e34c9e067', 'uploader': 'rcs.it', } + }, { + 'url': 'https://video.gazzanet.gazzetta.it/video-embed/gazzanet-mo05-0000260789', + 'md5': 'a043e3fecbe4d9ed7fc5d888652a5440', + 'info_dict': { + 'id': 'gazzanet-mo05-0000260789', + 'ext': 'mp4', + 'title': 'Valentino Rossi e papà Graziano si divertono col drifting', + 'description': 'md5:a8bf90d6adafd9815f70fc74c0fc370a', + 'uploader': 'rcd', + } }, { 'url': 'https://video.corriere.it/video-embed/b727632a-f9d0-11ea-91b0-38d50a849abb?player', 'match_only': True @@ -324,17 +337,17 @@ def _extract_url(webpage): return urls[0] if urls else None -class CorriereIE(RCSIE): - IE_NAME = 'rcs:corriere' +class RCSIE(RCSBaseIE): _VALID_URL = r'''(?x)https?://(?P<vid>video|viaggi)\. (?P<cdn> (?: - corrieredelmezzogiorno\.| - corrieredelveneto\.| - corrieredibologna\.| - corrierefiorentino\. - )? - corriere\.it)/.+?/(?P<id>[^/]+)(?=\?|/$|$)''' + corrieredelmezzogiorno\. + |corrieredelveneto\. + |corrieredibologna\. + |corrierefiorentino\. + )?corriere\.it + |(?:gazzanet\.)?gazzetta\.it) + /(?!video-embed/).+?/(?P<id>[^/\?]+)(?=\?|/$|$)''' _TESTS = [{ 'url': 'https://video.corriere.it/sport/formula-1/vettel-guida-ferrari-sf90-mugello-suo-fianco-c-elecrerc-bendato-video-esilarante/b727632a-f9d0-11ea-91b0-38d50a849abb', 'md5': '0f4ededc202b0f00b6e509d831e2dcda', @@ -356,18 +369,6 @@ class CorriereIE(RCSIE): 'uploader': 'DOVE Viaggi', } }, { - 'url': 'https://video.corriere.it/video-embed/b727632a-f9d0-11ea-91b0-38d50a849abb?player', - 'match_only': True - }, { - 'url': 'https://video.corriere.it/video-360/metro-copenaghen-tutta-italiana/a248a7f0-e2db-11e9-9830-af2de6b1f945', - 'match_only': True - }] - - -class GazzettaIE(RCSIE): - IE_NAME = 'rcs:gazzetta' - _VALID_URL = r'https?://(?P<vid>video)\.(?P<cdn>(?:gazzanet\.)?gazzetta\.it)/.+?/(?P<id>[^/]+?)(?:$|\?)' - _TESTS = [{ 'url': 'https://video.gazzetta.it/video-motogp-catalogna-cadute-dovizioso-vale-rossi/49612410-00ca-11eb-bcd8-30d4253e0140?vclk=Videobar', 'md5': 'eedc1b5defd18e67383afef51ff7bdf9', 'info_dict': { @@ -378,16 +379,12 @@ class GazzettaIE(RCSIE): 'uploader': 'AMorici', } }, { - 'url': 'https://video.gazzetta.it/video-embed/49612410-00ca-11eb-bcd8-30d4253e0140', - 'match_only': True - }, { - 'url': 'https://video.gazzanet.gazzetta.it/video-embed/gazzanet-mo05-0000260789', + 'url': 'https://video.corriere.it/video-360/metro-copenaghen-tutta-italiana/a248a7f0-e2db-11e9-9830-af2de6b1f945', 'match_only': True }] -class RCSVariousIE(RCSIE): - IE_NAME = 'rcs:various' +class RCSVariousIE(RCSBaseIE): _VALID_URL = r'''(?x)https?://www\. 
(?P<cdn> leitv\.it| From 876f1c17fff194cbed3595bb2a8497ea9e479bf7 Mon Sep 17 00:00:00 2001 From: Ali Sherief <alihsherief@linuxmail.org> Date: Mon, 9 Nov 2020 16:06:48 +0000 Subject: [PATCH 073/124] Fix #93 YoutubePlaylistsIE --- youtube_dlc/extractor/youtube.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 3ec2581dc..35ac67b49 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -300,11 +300,12 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): # Extract entries from page with "Load more" button def _entries(self, page, playlist_id): more_widget_html = content_html = page + mobj_reg = r'(?:(?:data-uix-load-more-href="[^"]+?;continuation=)|(?:"continuation":"))(?P<more>[^"]+)"' for page_num in itertools.count(1): for entry in self._process_page(content_html): yield entry - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) + mobj = re.search(mobj_reg, more_widget_html) if not mobj: break @@ -315,7 +316,7 @@ def _entries(self, page, playlist_id): # Downloading page may result in intermittent 5xx HTTP error # that is usually worked around with a retry more = self._download_json( - 'https://www.youtube.com/%s' % mobj.group('more'), playlist_id, + 'https://www.youtube.com/browse_ajax?ctoken=%s' % mobj.group('more'), playlist_id, 'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), transform_source=uppercase_escape, @@ -372,7 +373,7 @@ def extract_videos_from_page(self, page): class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): def _process_page(self, content): for playlist_id in orderedSet(re.findall( - r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', + r'"/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)): yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') From 142f2c8e99e61054d3354bd915a9e46cbd80c8ea Mon Sep 17 00:00:00 2001 From: Robin Dunn <> Date: Mon, 9 Nov 2020 15:24:42 -0800 Subject: [PATCH 074/124] fall-back to the old way to fetch subtitles, if needed --- youtube_dlc/extractor/viki.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/youtube_dlc/extractor/viki.py b/youtube_dlc/extractor/viki.py index 0f188f84d..6bddf8be9 100644 --- a/youtube_dlc/extractor/viki.py +++ b/youtube_dlc/extractor/viki.py @@ -308,17 +308,26 @@ def _real_extract(self, url): 'url': thumbnail.get('url'), }) - new_video = self._download_json( - 'https://www.viki.com/api/videos/%s' % video_id, video_id, - 'Downloading new video JSON to get subtitles', headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404]) - subtitles = {} - for sub in new_video.get('streamSubtitles').get('dash'): - subtitles[sub.get('srclang')] = [{ - 'ext': 'vtt', - 'url': sub.get('src'), - 'completion': sub.get('percentage'), - }] + try: + # New way to fetch subtitles + new_video = self._download_json( + 'https://www.viki.com/api/videos/%s' % video_id, video_id, + 'Downloading new video JSON to get subtitles', headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404]) + for sub in new_video.get('streamSubtitles').get('dash'): + subtitles[sub.get('srclang')] = [{ + 'ext': 'vtt', + 'url': sub.get('src'), + 'completion': sub.get('percentage'), + }] + except AttributeError: + # fall-back to the old way if there isn't a 
streamSubtitles attribute + for subtitle_lang, _ in video.get('subtitle_completions', {}).items(): + subtitles[subtitle_lang] = [{ + 'ext': subtitles_format, + 'url': self._prepare_call( + 'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)), + } for subtitles_format in ('srt', 'vtt')] result = { 'id': video_id, From da8fb75df5aa3a6bdda2afbe7bec7da905f0618a Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel <github@tom-oliver.eu> Date: Tue, 10 Nov 2020 01:19:33 +0100 Subject: [PATCH 075/124] [skip travis] adjust python versions --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 74b50ecca..4920a30b8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -82,7 +82,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.x' + python-version: '3.8' - name: Install Requirements run: pip install pyinstaller - name: Bump version @@ -116,7 +116,7 @@ jobs: - name: Set up Python 3.5.4 32-Bit uses: actions/setup-python@v2 with: - python-version: '3.5.4' + python-version: '3.4.4' architecture: 'x86' - name: Install Requirements for 32 Bit run: pip install pyinstaller==3.5 From 9833e7a015ca788a4f881c8ee945967b5f3d71bc Mon Sep 17 00:00:00 2001 From: Luc Ritchie <luc.ritchie@gmail.com> Date: Tue, 10 Nov 2020 03:38:26 -0500 Subject: [PATCH 076/124] fix: youtube: Polymer UI and JSON endpoints for playlists We already had a few copies of Polymer-style pagination handling logic for certain circumstances, but now we're forced into using it for all playlists since we can no longer disable Polymer. Refactor the logic to move it to the parent class for all entry lists (including e.g. search results, feeds, and list of playlists), and generify a bit to cover the child classes' use cases. 
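A rough sketch (not part of the diff that follows) of the recursive scan this commit introduces: entry objects and the continuation command are located wherever they appear in the parsed ytInitialData, instead of at fixed positions. The helper names and toy input below are illustrative only, not the patch's actual code:

    def find_entries(node, is_entry, entries, state):
        # Depth-first walk over the parsed JSON tree.
        if isinstance(node, list):
            for item in node:
                find_entries(item, is_entry, entries, state)
        elif isinstance(node, dict):
            if is_entry(node):
                entries.append(node)  # e.g. an object carrying 'videoId'
            elif 'continuationCommand' in node:
                state['continuation'] = node  # wraps the next-page token
            else:
                for value in node.values():
                    find_entries(value, is_entry, entries, state)

    data = {'contents': [{'videoId': 'abc'}, {'continuationCommand': {'token': 'xyz'}}]}
    entries, state = [], {}
    find_entries(data, lambda obj: 'videoId' in obj, entries, state)
    # entries == [{'videoId': 'abc'}]
    # state['continuation']['continuationCommand']['token'] == 'xyz'

The same walk serves videos, playlists, feeds and search results; only the is_entry predicate differs per subclass.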
--- youtube_dlc/extractor/youtube.py | 280 ++++++++++++++----------------- 1 file changed, 126 insertions(+), 154 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 3ec2581dc..273d823c2 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -36,6 +36,7 @@ get_element_by_attribute, get_element_by_id, int_or_none, + js_to_json, mimetype2ext, orderedSet, parse_codecs, @@ -70,6 +71,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _LOGIN_REQUIRED = False _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}' + _INITIAL_DATA_RE = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' + _YTCFG_DATA_RE = r"ytcfg.set\(({.*?})\)" _YOUTUBE_CLIENT_HEADERS = { 'x-youtube-client-name': '1', @@ -274,7 +277,6 @@ def warn(message): def _download_webpage_handle(self, *args, **kwargs): query = kwargs.get('query', {}).copy() - query['disable_polymer'] = 'true' kwargs['query'] = query return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) @@ -297,15 +299,60 @@ def _real_initialize(self): class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): - # Extract entries from page with "Load more" button + def _find_entries_in_json(self, extracted): + entries = [] + c = {} + + def _real_find(obj): + if obj is None or isinstance(obj, str): + return + + if type(obj) is list: + for elem in obj: + _real_find(elem) + + if type(obj) is dict: + if self._is_entry(obj): + entries.append(obj) + return + + if 'continuationCommand' in obj: + c['continuation'] = obj + return + + for _, o in obj.items(): + _real_find(o) + + _real_find(extracted) + + return entries, try_get(c, lambda x: x["continuation"]) + def _entries(self, page, playlist_id): - more_widget_html = content_html = page + seen = [] + + yt_conf = {} + for m in re.finditer(self._YTCFG_DATA_RE, page): + parsed = self._parse_json(m.group(1), playlist_id, + transform_source=js_to_json, fatal=False) + if parsed: + yt_conf.update(parsed) + + data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None) + for page_num in itertools.count(1): - for entry in self._process_page(content_html): + entries, continuation = self._find_entries_in_json(data_json) + processed = self._process_entries(entries, seen) + + if not processed: + break + for entry in processed: yield entry - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: + if not continuation or not yt_conf: + break + continuation_token = try_get(continuation, lambda x: x['continuationCommand']['token']) + continuation_url = try_get(continuation, lambda x: x['commandMetadata']['webCommandMetadata']['apiUrl']) + if not continuation_token or not continuation_url: break count = 0 @@ -314,12 +361,22 @@ def _entries(self, page, playlist_id): try: # Downloading page may result in intermittent 5xx HTTP error # that is usually worked around with a retry - more = self._download_json( - 'https://www.youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s%s' - % (page_num, ' (retry #%d)' % count if count else ''), + data_json = self._download_json( + 'https://www.youtube.com%s' % continuation_url, + playlist_id, + 'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), transform_source=uppercase_escape, - headers=self._YOUTUBE_CLIENT_HEADERS) + query={ + 'key': try_get(yt_conf, lambda x: x['INNERTUBE_API_KEY']) + }, + 
data=bytes(json.dumps({ + 'context': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT']), + 'continuation': continuation_token + }), encoding='utf-8'), + headers={ + 'Content-Type': 'application/json' + } + ) break except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): @@ -328,31 +385,30 @@ def _entries(self, page, playlist_id): continue raise - content_html = more['content_html'] - if not content_html.strip(): - # Some webpages show a "Load more" button but they don't - # have more videos - break - more_widget_html = more['load_more_widget_html'] + def _extract_title(self, renderer): + title = try_get(renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + if title: + return title + return try_get(renderer, lambda x: x['title']['simpleText'], compat_str) class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _process_page(self, content): - for video_id, video_title in self.extract_videos_from_page(content): - yield self.url_result(video_id, 'Youtube', video_id, video_title) + def _is_entry(self, obj): + return 'videoId' in obj - def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page): - for mobj in re.finditer(video_re, page): - # The link with index 0 is not the first video of the playlist (not sure if still actual) - if 'index' in mobj.groupdict() and mobj.group('id') == '0': + def _process_entries(self, entries, seen): + ids_in_page = [] + titles_in_page = [] + for renderer in entries: + video_id = try_get(renderer, lambda x: x['videoId']) + video_title = self._extract_title(renderer) + + if video_id is None or video_title is None: + # we do not have a videoRenderer or title extraction broke continue - video_id = mobj.group('id') - video_title = unescapeHTML( - mobj.group('title')) if 'title' in mobj.groupdict() else None - if video_title: - video_title = video_title.strip() - if video_title == '► Play all': - video_title = None + + video_title = video_title.strip() + try: idx = ids_in_page.index(video_id) if video_title and not titles_in_page[idx]: @@ -361,19 +417,16 @@ def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_p ids_in_page.append(video_id) titles_in_page.append(video_title) - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - self.extract_videos_from_page_impl( - self._VIDEO_RE, page, ids_in_page, titles_in_page) - return zip(ids_in_page, titles_in_page) + for video_id, video_title in zip(ids_in_page, titles_in_page): + yield self.url_result(video_id, 'Youtube', video_id, video_title) class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _process_page(self, content): - for playlist_id in orderedSet(re.findall( - r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', - content)): + def _is_entry(self, obj): + return 'playlistId' in obj + + def _process_entries(self, entries, seen): + for playlist_id in orderedSet(try_get(r, lambda x: x['playlistId']) for r in entries): yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') @@ -3240,11 +3293,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): }] -class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' 
- - -class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): +class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com searches' # there doesn't appear to be a real limit, for example if you search for # 'python' you get more than 8.000.000 results @@ -3341,11 +3390,10 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _SEARCH_PARAMS = 'CAI%3D' -class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): +class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' - _SEARCH_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -3357,28 +3405,14 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): 'only_matching': True, }] - def _find_videos_in_json(self, extracted): - videos = [] + def _process_json_dict(self, obj, videos, c): + if "videoId" in obj: + videos.append(obj) + return - def _real_find(obj): - if obj is None or isinstance(obj, str): - return - - if type(obj) is list: - for elem in obj: - _real_find(elem) - - if type(obj) is dict: - if "videoId" in obj: - videos.append(obj) - return - - for _, o in obj.items(): - _real_find(o) - - _real_find(extracted) - - return videos + if "nextContinuationData" in obj: + c["continuation"] = obj["nextContinuationData"] + return def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None) @@ -3413,7 +3447,8 @@ def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) webpage = self._download_webpage(url, query) - return self.playlist_result(self._process_page(webpage), playlist_title=query) + data_json = self._process_initial_data(webpage) + return self.playlist_result(self._process_data(data_json), playlist_title=query) class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): @@ -3435,14 +3470,12 @@ def _real_extract(self, url): 'https://www.youtube.com/show/%s/playlists' % playlist_id) -class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): +class YoutubeFeedsInfoExtractor(YoutubePlaylistBaseInfoExtractor): """ Base class for feed extractors Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. 
""" _LOGIN_REQUIRED = True - _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' - _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" @property def IE_NAME(self): @@ -3451,96 +3484,35 @@ def IE_NAME(self): def _real_initialize(self): self._login() - def _find_videos_in_json(self, extracted): - videos = [] - c = {} + def _process_entries(self, entries, seen): + new_info = [] + for v in entries: + v_id = try_get(v, lambda x: x['videoId']) + if not v_id: + continue - def _real_find(obj): - if obj is None or isinstance(obj, str): - return + have_video = False + for old in seen: + if old['videoId'] == v_id: + have_video = True + break - if type(obj) is list: - for elem in obj: - _real_find(elem) + if not have_video: + new_info.append(v) - if type(obj) is dict: - if "videoId" in obj: - videos.append(obj) - return + if not new_info: + return - if "nextContinuationData" in obj: - c["continuation"] = obj["nextContinuationData"] - return - - for _, o in obj.items(): - _real_find(o) - - _real_find(extracted) - - return videos, try_get(c, lambda x: x["continuation"]) - - def _entries(self, page): - info = [] - - yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set', default="null"), None, fatal=False) - - search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) - - for page_num in itertools.count(1): - video_info, continuation = self._find_videos_in_json(search_response) - - new_info = [] - - for v in video_info: - v_id = try_get(v, lambda x: x['videoId']) - if not v_id: - continue - - have_video = False - for old in info: - if old['videoId'] == v_id: - have_video = True - break - - if not have_video: - new_info.append(v) - - if not new_info: - break - - info.extend(new_info) - - for video in new_info: - yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text']) or try_get(video, lambda x: x['title']['simpleText'])) - - if not continuation or not yt_conf: - break - - search_response = self._download_json( - 'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape, - query={ - "ctoken": try_get(continuation, lambda x: x["continuation"]), - "continuation": try_get(continuation, lambda x: x["continuation"]), - "itct": try_get(continuation, lambda x: x["clickTrackingParams"]) - }, - headers={ - "X-YouTube-Client-Name": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_NAME"]), - "X-YouTube-Client-Version": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_VERSION"]), - "X-Youtube-Identity-Token": try_get(yt_conf, lambda x: x["ID_TOKEN"]), - "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]), - "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]), - "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]), - "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]), - }) + seen.extend(new_info) + for video in new_info: + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=self._extract_title(video)) def _real_extract(self, url): page = self._download_webpage( 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE) - return self.playlist_result( - self._entries(page), playlist_title=self._PLAYLIST_TITLE) + return self.playlist_result(self._entries(page, self._PLAYLIST_TITLE), + playlist_title=self._PLAYLIST_TITLE) class 
YoutubeWatchLaterIE(YoutubePlaylistIE): From 8f109ad4ad6bc734f817ccf3daefb9ed603d7480 Mon Sep 17 00:00:00 2001 From: Roman Karwacik <roman.karwacik@rwth-aachen.de> Date: Tue, 10 Nov 2020 10:39:57 +0100 Subject: [PATCH 077/124] [zoom] Fix URL parsing for URLs containing /share/ and dots --- youtube_dlc/extractor/zoom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/zoom.py b/youtube_dlc/extractor/zoom.py index 003e1f901..038a90297 100644 --- a/youtube_dlc/extractor/zoom.py +++ b/youtube_dlc/extractor/zoom.py @@ -13,7 +13,7 @@ class ZoomIE(InfoExtractor): IE_NAME = 'zoom' - _VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/play/(?P<id>[A-Za-z0-9\-_]+)' + _VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/(play|share)/(?P<id>[A-Za-z0-9\-_.]+)' _TEST = { 'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK', From 002ea8fe172c0bf234fd15d3775a527706843fc3 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Tue, 27 Oct 2020 16:48:23 +0530 Subject: [PATCH 078/124] Fix external downloader when there is no http_header --- youtube_dlc/downloader/external.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/youtube_dlc/downloader/external.py b/youtube_dlc/downloader/external.py index c31f8910a..d2f8f271d 100644 --- a/youtube_dlc/downloader/external.py +++ b/youtube_dlc/downloader/external.py @@ -115,8 +115,10 @@ class CurlFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '--location', '-o', tmpfilename] - for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', '%s: %s' % (key, val)] + cmd += self._bool_option('--continue-at', 'continuedl', '-', '0') cmd += self._valueless_option('--silent', 'noprogress') cmd += self._valueless_option('--verbose', 'verbose') @@ -150,8 +152,9 @@ class AxelFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-o', tmpfilename] - for key, val in info_dict['http_headers'].items(): - cmd += ['-H', '%s: %s' % (key, val)] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['-H', '%s: %s' % (key, val)] cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd @@ -162,8 +165,9 @@ class WgetFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] - for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', '%s: %s' % (key, val)] cmd += self._option('--limit-rate', 'ratelimit') retry = self._option('--tries', 'retries') if len(retry) == 2: @@ -189,8 +193,9 @@ def _make_cmd(self, tmpfilename, info_dict): if dn: cmd += ['--dir', dn] cmd += ['--out', os.path.basename(tmpfilename)] - for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', '%s: %s' % (key, val)] cmd += self._option('--interface', 'source_address') cmd += self._option('--all-proxy', 'proxy') cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') @@
-206,8 +211,10 @@ def available(cls): def _make_cmd(self, tmpfilename, info_dict): cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']] - for key, val in info_dict['http_headers'].items(): - cmd += ['%s:%s' % (key, val)] + + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['%s:%s' % (key, val)] return cmd @@ -253,7 +260,7 @@ def _call_downloader(self, tmpfilename, info_dict): # if end_time: # args += ['-t', compat_str(end_time - start_time)] - if info_dict['http_headers'] and re.match(r'^https?://', url): + if info_dict.get('http_headers') is not None and re.match(r'^https?://', url): # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. headers = handle_youtubedl_headers(info_dict['http_headers']) From d7aec208f2a2ef883c7ffb14c0c4ceb4c9c9ddfa Mon Sep 17 00:00:00 2001 From: rigstot <rigstot@users.noreply.github.com> Date: Sun, 19 Jul 2020 15:07:29 +0200 Subject: [PATCH 079/124] implement ThisVid extractor deobfuscates the video URL using a reverse engineered version of KVS player's algorithm. This was tested against version 4.0.4, 5.0.1, 5.1.1.4 and 5.2.0.4 of the player and a warning will be issued if the major version changes. --- youtube_dlc/extractor/extractors.py | 1 + youtube_dlc/extractor/thisvid.py | 97 +++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 youtube_dlc/extractor/thisvid.py diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 666134d86..ee404f78d 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -1175,6 +1175,7 @@ from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE +from .thisvid import ThisVidIE from .threeqsdn import ThreeQSDNIE from .tiktok import TikTokIE from .tinypic import TinyPicIE diff --git a/youtube_dlc/extractor/thisvid.py b/youtube_dlc/extractor/thisvid.py new file mode 100644 index 000000000..f507e1b06 --- /dev/null +++ b/youtube_dlc/extractor/thisvid.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re + +from .common import InfoExtractor + + +class ThisVidIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+/?)' + _TESTS = [{ + 'url': 'https://thisvid.com/videos/french-boy-pantsed/', + 'md5': '3397979512c682f6b85b3b04989df224', + 'info_dict': { + 'id': '2400174', + 'ext': 'mp4', + 'title': 'French Boy Pantsed', + 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', + 'age_limit': 18, + } + }, { + 'url': 'https://thisvid.com/embed/2400174/', + 'md5': '3397979512c682f6b85b3b04989df224', + 'info_dict': { + 'id': '2400174', + 'ext': 'mp4', + 'title': 'French Boy Pantsed', + 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', + 'age_limit': 18, + } + }] + + def _real_extract(self, url): + main_id = self._match_id(url) + webpage = self._download_webpage(url, main_id) + + # URL decryptor was reversed from version 4.0.4, later verified working with 5.2.0 and may change in the future. 
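+        # (getrealurl() below un-shuffles a 32-character segment of the obfuscated video URL, swapping characters at positions derived from the 16-character license_code via getlicensetoken())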
+ kvs_version = self._html_search_regex(r'<script [^>]+?src="https://thisvid\.com/player/kt_player\.js\?v=(\d+(\.\d+)+)">', webpage, 'kvs_version', fatal=False) + if not kvs_version.startswith("5."): + self.report_warning("Major version change (" + kvs_version + ") in player engine--Download may fail.") + + title = self._html_search_regex(r'<title>(?:Video: )?(.+?)(?: - (?:\w+ porn at )?ThisVid(?:.com| tube))?</title>', webpage, 'title') + # video_id, video_url and license_code from the 'flashvars' JSON object: + video_id = self._html_search_regex(r"video_id: '([0-9]+)',", webpage, 'video_id') + video_url = self._html_search_regex(r"video_url: '(function/0/.+?)',", webpage, 'video_url') + license_code = self._html_search_regex(r"license_code: '([0-9$]{16})',", webpage, 'license_code') + thumbnail = self._html_search_regex(r"preview_url: '((?:https?:)?//media.thisvid.com/.+?.jpg)',", webpage, 'thumbnail', fatal=False) + if thumbnail.startswith("//"): + thumbnail = "https:" + thumbnail + if (re.match(self._VALID_URL, url).group('type') == "videos"): + display_id = main_id + else: + display_id = self._search_regex(r'<link rel="canonical" href="' + self._VALID_URL + r'">', webpage, 'display_id', fatal=False), + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'url': getrealurl(video_url, license_code), + 'thumbnail': thumbnail, + 'age_limit': 18, + } + + +def getrealurl(video_url, license_code): + urlparts = video_url.split('/')[2:] + license = getlicensetoken(license_code) + newmagic = urlparts[5][:32] + + for o in range(len(newmagic) - 1, -1, -1): + new = "" + l = (o + sum([int(n) for n in license[o:]])) % 32 + + for i in range(0, len(newmagic)): + if i == o: + new += newmagic[l] + elif i == l: + new += newmagic[o] + else: + new += newmagic[i] + newmagic = new + + urlparts[5] = newmagic + urlparts[5][32:] + return "/".join(urlparts) + + +def getlicensetoken(license): + modlicense = license.replace("$", "").replace("0", "1") + center = int(len(modlicense) / 2) + fronthalf = int(modlicense[:center + 1]) + backhalf = int(modlicense[center:]) + + modlicense = str(4 * abs(fronthalf - backhalf)) + retval = "" + for o in range(0, center + 1): + for i in range(1, 5): + retval += str((int(license[o + i]) + int(modlicense[o])) % 10) + return retval From 0f8566e90bee77775be133d551045698a84a2bdd Mon Sep 17 00:00:00 2001 From: Unknown Date: Tue, 10 Nov 2020 23:20:52 +0100 Subject: [PATCH 080/124] manually set limit for youtubesearchurl --- youtube_dlc/extractor/youtube.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 273d823c2..0dbb3531c 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -327,7 +327,7 @@ def _real_find(obj): return entries, try_get(c, lambda x: x["continuation"]) - def _entries(self, page, playlist_id): + def _entries(self, page, playlist_id, n=1): seen = [] yt_conf = {} @@ -339,7 +339,8 @@ def _entries(self, page, playlist_id): data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None) - for page_num in itertools.count(1): + # for page_num in itertools.count(1): + for page_num in range(n): entries, continuation = self._find_entries_in_json(data_json) processed = self._process_entries(entries, seen) @@ -3447,8 +3448,8 @@ def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) webpage = self._download_webpage(url, query) - data_json = self._process_initial_data(webpage)
- return self.playlist_result(self._process_data(data_json), playlist_title=query) + # data_json = self._process_initial_data(webpage) + return self.playlist_result(self._entries(webpage, query, n=5), playlist_title=query) class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): From 73ac85678588b1c2997a94c0069ac0a9309adf19 Mon Sep 17 00:00:00 2001 From: Luc Ritchie Date: Tue, 10 Nov 2020 17:47:40 -0500 Subject: [PATCH 081/124] [youtube] max_pages=5 for search, unlimited for everything else Also drop a few leftover methods in search that are no longer used. --- youtube_dlc/extractor/youtube.py | 39 ++++---------------------------- 1 file changed, 4 insertions(+), 35 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index d8d12a721..2fea11070 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -328,7 +328,7 @@ def _real_find(obj): return entries, try_get(c, lambda x: x["continuation"]) - def _entries(self, page, playlist_id, n=1): + def _entries(self, page, playlist_id, max_pages=None): seen = [] yt_conf = {} @@ -340,8 +340,7 @@ def _entries(self, page, playlist_id, n=1): data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None) - # for page_num in itertools.count(1): - for page_num in range(n): + for page_num in range(1, max_pages + 1) if max_pages is not None else itertools.count(1): entries, continuation = self._find_entries_in_json(data_json) processed = self._process_entries(entries, seen) @@ -366,7 +365,7 @@ def _entries(self, page, playlist_id, n=1): data_json = self._download_json( 'https://www.youtube.com%s' % continuation_url, playlist_id, - 'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), + 'Downloading continuation page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), transform_source=uppercase_escape, query={ @@ -3418,41 +3417,11 @@ def _process_json_dict(self, obj, videos, c): c["continuation"] = obj["nextContinuationData"] return - def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): - search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None) - - result_items = self._find_videos_in_json(search_response) - - for renderer in result_items: - video_id = try_get(renderer, lambda x: x['videoId']) - video_title = try_get(renderer, lambda x: x['title']['runs'][0]['text']) or try_get(renderer, lambda x: x['title']['simpleText']) - - if video_id is None or video_title is None: - # we do not have a videoRenderer or title extraction broke - continue - - video_title = video_title.strip() - - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page) - return zip(ids_in_page, titles_in_page) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) webpage = self._download_webpage(url, query) - # data_json = self._process_initial_data(webpage) - return self.playlist_result(self._entries(webpage, query, n=5), playlist_title=query) + return self.playlist_result(self._entries(webpage, query, max_pages=0), playlist_title=query) class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): 
From 104bfdd24de9dd5f636887afd8b263a4c53673a7 Mon Sep 17 00:00:00 2001 From: Unknown Date: Wed, 11 Nov 2020 00:00:27 +0100 Subject: [PATCH 082/124] ytsearchurl 5 pages for around 100 results --- youtube_dlc/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 2fea11070..d5d25859d 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -3421,7 +3421,7 @@ def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) webpage = self._download_webpage(url, query) - return self.playlist_result(self._entries(webpage, query, max_pages=0), playlist_title=query) + return self.playlist_result(self._entries(webpage, query, max_pages=5), playlist_title=query) class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): From b28e751688a71f37ef6e468faf940bccb311afa9 Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel Date: Wed, 11 Nov 2020 00:40:43 +0100 Subject: [PATCH 083/124] [skip travis] --- youtube_dlc/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dlc/version.py b/youtube_dlc/version.py index 440d8e488..3c68ae5eb 100644 --- a/youtube_dlc/version.py +++ b/youtube_dlc/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.10.25' +__version__ = '2020.11.11-1' From a1d6041497c50d59c6d275125d21cd3b613f6a1c Mon Sep 17 00:00:00 2001 From: nao20010128nao Date: Wed, 11 Nov 2020 08:59:09 +0000 Subject: [PATCH 084/124] [instagram] fix thumbnail URL extraction --- youtube_dlc/extractor/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/instagram.py b/youtube_dlc/extractor/instagram.py index bbfe23c76..c3eba0114 100644 --- a/youtube_dlc/extractor/instagram.py +++ b/youtube_dlc/extractor/instagram.py @@ -151,7 +151,7 @@ def _real_extract(self, url): description = try_get( media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], compat_str) or media.get('caption') - thumbnail = media.get('display_src') + thumbnail = media.get('display_src') or media.get('thumbnail_src') timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) uploader = media.get('owner', {}).get('full_name') uploader_id = media.get('owner', {}).get('username') From 6bd79800c3c5d3a91561ee34a87dbaa9e8319ae9 Mon Sep 17 00:00:00 2001 From: Unknown Date: Wed, 11 Nov 2020 15:05:18 +0100 Subject: [PATCH 085/124] [youtube] python2 fix #168 proposed fix by awei78 --- youtube_dlc/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index d5d25859d..629a82c97 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -371,10 +371,10 @@ def _entries(self, page, playlist_id, max_pages=None): query={ 'key': try_get(yt_conf, lambda x: x['INNERTUBE_API_KEY']) }, - data=bytes(json.dumps({ + data= str(json.dumps({ 'context': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT']), 'continuation': continuation_token - }), encoding='utf-8'), + })).encode(encoding='UTF-8',errors='strict'), headers={ 'Content-Type': 'application/json' } From c297a6c6619989f15b41935e49addff1d27e4e41 Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel Date: Wed, 11 Nov 2020 15:08:12 +0100 Subject: [PATCH 086/124] [skip travis] --- youtube_dlc/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/youtube_dlc/version.py b/youtube_dlc/version.py index 3c68ae5eb..201a981cf 100644 --- a/youtube_dlc/version.py +++ b/youtube_dlc/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.11.11-1' +__version__ = '2020.11.11-2' From 5e6cdcecdd1ac74592f27766ef38a3ae059d4ae7 Mon Sep 17 00:00:00 2001 From: Unknown Date: Wed, 11 Nov 2020 15:15:24 +0100 Subject: [PATCH 087/124] flake8 yt py2 fix --- youtube_dlc/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 629a82c97..97cc793f9 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -371,10 +371,10 @@ def _entries(self, page, playlist_id, max_pages=None): query={ 'key': try_get(yt_conf, lambda x: x['INNERTUBE_API_KEY']) }, - data= str(json.dumps({ + data=str(json.dumps({ 'context': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT']), 'continuation': continuation_token - })).encode(encoding='UTF-8',errors='strict'), + })).encode(encoding='UTF-8', errors='strict'), headers={ 'Content-Type': 'application/json' } From d052b9a112fb7ae749a829dceba6e3289663a303 Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel Date: Wed, 11 Nov 2020 15:39:00 +0100 Subject: [PATCH 088/124] [skip travis] typo --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4920a30b8..dd6a95256 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -113,7 +113,7 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Set up Python 3.5.4 32-Bit + - name: Set up Python 3.4.4 32-Bit uses: actions/setup-python@v2 with: python-version: '3.4.4' From 9a68de12179c92b578fd00e16ff3ca63aab94c94 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 13 Nov 2020 02:40:51 +0530 Subject: [PATCH 089/124] Pre-check video IDs in the archive before downloading --- youtube_dlc/YoutubeDL.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index dd55ba0f2..373e83715 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -821,12 +821,22 @@ def extract_info(self, url, download=True, ie_key=None, info_dict=None, extra_in if not ie.suitable(url): continue - ie = self.get_info_extractor(ie.ie_key()) + ie_key = ie.ie_key() + ie = self.get_info_extractor(ie_key) if not ie.working(): self.report_warning('The program functionality for this site has been marked as broken, ' 'and will probably not work.') try: + try: + temp_id = ie.extract_id(url) if callable(getattr(ie, 'extract_id', None)) else ie._match_id(url) + except AssertionError: + temp_id = None + if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): + self.to_screen("[download] [%s] %s has already been recorded in archive" % ( + ie_key, temp_id)) + break + ie_result = ie.extract(url) if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) break From fe5caa2a7c0bb6f17c6833b540691b4df4cbde90 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 13 Nov 2020 03:05:29 +0530 Subject: [PATCH 090/124] Handle IndexError --- youtube_dlc/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index 373e83715..c85cbd88f 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ 
-830,10 +830,10 @@ def extract_info(self, url, download=True, ie_key=None, info_dict=None, extra_in try: try: temp_id = ie.extract_id(url) if callable(getattr(ie, 'extract_id', None)) else ie._match_id(url) - except AssertionError: + except (AssertionError, IndexError): temp_id = None if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): - self.to_screen("[download] [%s] %s has already been recorded in archive" % ( + self.to_screen("[%s] %s: has already been recorded in archive" % ( ie_key, temp_id)) break From 63c00011d4ad59b37b08929ce413eb9506ac7150 Mon Sep 17 00:00:00 2001 From: Jody Bruchon Date: Thu, 12 Nov 2020 17:03:39 -0500 Subject: [PATCH 091/124] make_win.bat: don't use UPX to pack vcruntime140.dll If UPX is available in the PATH, then without this option, make_win.bat will corrupt the DLL and the built executable will be unusable. --- make_win.bat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/make_win.bat b/make_win.bat index 891d517b3..c35d9937e 100644 --- a/make_win.bat +++ b/make_win.bat @@ -1 +1 @@ -py -m PyInstaller youtube_dlc\__main__.py --onefile --name youtube-dlc --version-file win\ver.txt --icon win\icon\cloud.ico \ No newline at end of file +py -m PyInstaller youtube_dlc\__main__.py --onefile --name youtube-dlc --version-file win\ver.txt --icon win\icon\cloud.ico --upx-exclude=vcruntime140.dll \ No newline at end of file From ea6e0c2b0d3e83f2f8a7766e07a57a2b5495afcc Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 21 Sep 2020 20:56:18 +0530 Subject: [PATCH 092/124] Add --break-on-existing by @gergesh Authored-by: Yoav Shai --- README.md | 2 ++ youtube_dlc/YoutubeDL.py | 10 ++++++++-- youtube_dlc/__init__.py | 1 + youtube_dlc/options.py | 4 ++++ 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f884ad067..170c85c48 100644 --- a/README.md +++ b/README.md @@ -217,6 +217,8 @@ ## Video Selection: --download-archive FILE Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. + --break-on-existing Stop the download process after attempting + to download a file that's in the archive. --include-ads Download advertisements as well (experimental) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index dd55ba0f2..1cb1b421a 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -210,6 +210,8 @@ class YoutubeDL(object): download_archive: File name of a file where all downloads are recorded. Videos already present in the file are not downloaded again. + break_on_existing: Stop the download process after attempting to download a file that's + in the archive. cookiefile: File name where cookies should be read from and dumped to. nocheckcertificate:Do not verify SSL certificates prefer_insecure: Use HTTP instead of HTTPS to retrieve information. 
@@ -1038,8 +1040,12 @@ def report_download(num_entries): reason = self._match_entry(entry, incomplete=True) if reason is not None: - self.to_screen('[download] ' + reason) - continue + if reason.endswith('has already been recorded in the archive') and self.params.get('break_on_existing'): + print('[download] tried downloading a file that\'s already in the archive, stopping since --break-on-existing is set.') + break + else: + self.to_screen('[download] ' + reason) + continue entry_result = self.process_ie_result(entry, download=download, diff --git a/youtube_dlc/__init__.py b/youtube_dlc/__init__.py index 105786bc0..7d72ab985 100644 --- a/youtube_dlc/__init__.py +++ b/youtube_dlc/__init__.py @@ -405,6 +405,7 @@ def parse_retries(retries): 'youtube_print_sig_code': opts.youtube_print_sig_code, 'age_limit': opts.age_limit, 'download_archive': download_archive_fn, + 'break_on_existing': opts.break_on_existing, 'cookiefile': opts.cookiefile, 'nocheckcertificate': opts.no_check_certificate, 'prefer_insecure': opts.prefer_insecure, diff --git a/youtube_dlc/options.py b/youtube_dlc/options.py index 3c8a1305e..9ad8a6ddd 100644 --- a/youtube_dlc/options.py +++ b/youtube_dlc/options.py @@ -344,6 +344,10 @@ def _comma_separated_values_options_callback(option, opt_str, value, parser): '--download-archive', metavar='FILE', dest='download_archive', help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.') + selection.add_option( + '--break-on-existing', + action='store_true', dest='break_on_existing', default=False, + help="Stop the download process after attempting to download a file that's in the archive.") selection.add_option( '--include-ads', dest='include_ads', action='store_true', From 0366ae875692bbe38867761952db70a62e32fd53 Mon Sep 17 00:00:00 2001 From: Matthew Date: Sun, 15 Nov 2020 09:03:40 +1300 Subject: [PATCH 093/124] Fix search to not depend on index position for videoRenderer and token items. --- youtube_dlc/extractor/youtube.py | 39 ++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 97cc793f9..76c98ba36 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -3336,10 +3336,33 @@ def _entries(self, query, n): list) if not slr_contents: break - isr_contents = try_get( - slr_contents, - lambda x: x[0]['itemSectionRenderer']['contents'], - list) + + isr_contents = [] + continuation_token = None + # Youtube sometimes adds promoted content to searches, + # changing the index location of videos and token. + # So we search through all entries till we find them. 
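+ # (an "entry" here is an itemSectionRenderer containing videoRenderer objects; the next-page token sits in a continuationItemRenderer, which may appear at any index alongside promoted sections)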
+ for index, isr in enumerate(slr_contents): + if len(isr_contents) == 0: + isr_contents = try_get( + slr_contents, + (lambda x: x[index]['itemSectionRenderer']['contents']), + list) + for content in isr_contents: + if content.get('videoRenderer') is not None: + break + else: + isr_contents = [] + + if continuation_token is None: + continuation_token = try_get( + slr_contents, + lambda x: x[index]['continuationItemRenderer']['continuationEndpoint']['continuationCommand'][ + 'token'], + compat_str) + if continuation_token is not None and isr_contents != []: + break + if not isr_contents: break for content in isr_contents: @@ -3373,13 +3396,9 @@ def _entries(self, query, n): } if total == n: return - token = try_get( - slr_contents, - lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'], - compat_str) - if not token: + if not continuation_token: break - data['continuation'] = token + data['continuation'] = continuation_token def _get_n_results(self, query, n): """Get a specified number of results for a query""" From 55faba7ed77abad9dfe00bf850b9f8c4b04b036d Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 15 Nov 2020 01:42:07 +0530 Subject: [PATCH 094/124] Fix for os.rename error when embedding thumbnail to video in a different drive --- youtube_dlc/postprocessor/embedthumbnail.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/postprocessor/embedthumbnail.py b/youtube_dlc/postprocessor/embedthumbnail.py index 4a0d02fc4..a7d53d7f5 100644 --- a/youtube_dlc/postprocessor/embedthumbnail.py +++ b/youtube_dlc/postprocessor/embedthumbnail.py @@ -89,9 +89,10 @@ def is_webp(path): os.rename(encodeFilename(temp_filename), encodeFilename(filename)) elif info['ext'] == 'mkv': - os.rename(encodeFilename(thumbnail_filename), encodeFilename('cover.jpg')) old_thumbnail_filename = thumbnail_filename - thumbnail_filename = 'cover.jpg' + thumbnail_filename = os.path.join(os.path.dirname(old_thumbnail_filename), 'cover.jpg') + os.remove(encodeFilename(thumbnail_filename)) + os.rename(encodeFilename(old_thumbnail_filename), encodeFilename(thumbnail_filename)) options = [ '-c', 'copy', '-attach', thumbnail_filename, '-metadata:s:t', 'mimetype=image/jpeg'] From 958804ad4e019ce59c6b5d72918dff846839220c Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 15 Nov 2020 01:38:54 +0530 Subject: [PATCH 095/124] Ensure all streams are copied when using ffmpeg --- youtube_dlc/postprocessor/embedthumbnail.py | 5 +++-- youtube_dlc/postprocessor/ffmpeg.py | 11 +++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dlc/postprocessor/embedthumbnail.py b/youtube_dlc/postprocessor/embedthumbnail.py index 4a0d02fc4..7ca0ce6e5 100644 --- a/youtube_dlc/postprocessor/embedthumbnail.py +++ b/youtube_dlc/postprocessor/embedthumbnail.py @@ -94,7 +94,8 @@ def is_webp(path): thumbnail_filename = 'cover.jpg' options = [ - '-c', 'copy', '-attach', thumbnail_filename, '-metadata:s:t', 'mimetype=image/jpeg'] + '-c', 'copy', '-map', '0', + '-attach', thumbnail_filename, '-metadata:s:t', 'mimetype=image/jpeg'] self._downloader.to_screen('[ffmpeg] Adding thumbnail to "%s"' % filename) @@ -140,6 +141,6 @@ def is_webp(path): os.remove(encodeFilename(filename)) os.rename(encodeFilename(temp_filename), encodeFilename(filename)) else: - raise EmbedThumbnailPPError('Only mp3 and m4a/mp4 are supported for thumbnail embedding for now.') + raise EmbedThumbnailPPError('Only mp3, mkv, m4a and mp4 are supported for thumbnail embedding for now.') 
return [], info diff --git a/youtube_dlc/postprocessor/ffmpeg.py b/youtube_dlc/postprocessor/ffmpeg.py index c38db3143..c7071d73d 100644 --- a/youtube_dlc/postprocessor/ffmpeg.py +++ b/youtube_dlc/postprocessor/ffmpeg.py @@ -359,7 +359,7 @@ def run(self, information): if information['ext'] == self._preferedformat: self._downloader.to_screen('[ffmpeg] Not remuxing video file %s - already is in target format %s' % (path, self._preferedformat)) return [], information - options = ['-c', 'copy'] + options = ['-c', 'copy', '-map', '0'] prefix, sep, ext = path.rpartition('.') outpath = prefix + sep + self._preferedformat self._downloader.to_screen('[' + 'ffmpeg' + '] Remuxing video from %s to %s, Destination: ' % (information['ext'], self._preferedformat) + outpath) @@ -428,8 +428,7 @@ def run(self, information): input_files = [filename] + sub_filenames opts = [ - '-map', '0', - '-c', 'copy', + '-c', 'copy', '-map', '0', # Don't copy the existing subtitles, we may be running the # postprocessor a second time '-map', '-0:s', @@ -579,7 +578,7 @@ def run(self, info): filename = info['filepath'] temp_filename = prepend_extension(filename, 'temp') - options = ['-c', 'copy', '-aspect', '%f' % stretched_ratio] + options = ['-c', 'copy', '-map', '0', '-aspect', '%f' % stretched_ratio] self._downloader.to_screen('[ffmpeg] Fixing aspect ratio in "%s"' % filename) self.run_ffmpeg(filename, temp_filename, options) @@ -597,7 +596,7 @@ def run(self, info): filename = info['filepath'] temp_filename = prepend_extension(filename, 'temp') - options = ['-c', 'copy', '-f', 'mp4'] + options = ['-c', 'copy', '-map', '0', '-f', 'mp4'] self._downloader.to_screen('[ffmpeg] Correcting container in "%s"' % filename) self.run_ffmpeg(filename, temp_filename, options) @@ -613,7 +612,7 @@ def run(self, info): if self.get_audio_codec(filename) == 'aac': temp_filename = prepend_extension(filename, 'temp') - options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] + options = ['-c', 'copy', '-map', '0', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] self._downloader.to_screen('[ffmpeg] Fixing malformed AAC bitstream in "%s"' % filename) self.run_ffmpeg(filename, temp_filename, options) From 9da76d30decd079dbd3ca3d708e475a6201754e4 Mon Sep 17 00:00:00 2001 From: Matthew Date: Sun, 15 Nov 2020 09:34:59 +1300 Subject: [PATCH 096/124] code consistency --- youtube_dlc/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 76c98ba36..a9b591125 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -3343,7 +3343,7 @@ def _entries(self, query, n): # changing the index location of videos and token. # So we search through all entries till we find them. 
for index, isr in enumerate(slr_contents): - if len(isr_contents) == 0: + if not isr_contents: isr_contents = try_get( slr_contents, (lambda x: x[index]['itemSectionRenderer']['contents']), @@ -3360,7 +3360,7 @@ def _entries(self, query, n): lambda x: x[index]['continuationItemRenderer']['continuationEndpoint']['continuationCommand'][ 'token'], compat_str) - if continuation_token is not None and isr_contents != []: + if continuation_token is not None and isr_contents: break if not isr_contents: From 711bd5d362a1a7bec312e23a0f39deff2b3bf8f1 Mon Sep 17 00:00:00 2001 From: renalid Date: Sat, 14 Nov 2020 22:49:36 +0100 Subject: [PATCH 097/124] Update on france.tv extractor to fix thumbnail URL Fix the thumbnail URL extraction --- youtube_dlc/extractor/francetv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/francetv.py b/youtube_dlc/extractor/francetv.py index e340cddba..910a8a329 100644 --- a/youtube_dlc/extractor/francetv.py +++ b/youtube_dlc/extractor/francetv.py @@ -186,7 +186,7 @@ def sign(manifest_url, manifest_id): 'id': video_id, 'title': self._live_title(title) if is_live else title, 'description': clean_html(info['synopsis']), - 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']), + 'thumbnail': compat_urlparse.urljoin('https://sivideo.webservices.francetelevisions.fr', info['image']), 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']), 'timestamp': int_or_none(info['diffusion']['timestamp']), 'is_live': is_live, From ec57f903c907bf8c48c9cd3eea75e6dadb855595 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 15 Nov 2020 04:18:39 +0530 Subject: [PATCH 098/124] Don't try to delete file if it doesn't exist --- youtube_dlc/postprocessor/embedthumbnail.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dlc/postprocessor/embedthumbnail.py b/youtube_dlc/postprocessor/embedthumbnail.py index a7d53d7f5..2ff3cff69 100644 --- a/youtube_dlc/postprocessor/embedthumbnail.py +++ b/youtube_dlc/postprocessor/embedthumbnail.py @@ -91,7 +91,8 @@ def is_webp(path): elif info['ext'] == 'mkv': old_thumbnail_filename = thumbnail_filename thumbnail_filename = os.path.join(os.path.dirname(old_thumbnail_filename), 'cover.jpg') - os.remove(encodeFilename(thumbnail_filename)) + if os.path.exists(thumbnail_filename): + os.remove(encodeFilename(thumbnail_filename)) os.rename(encodeFilename(old_thumbnail_filename), encodeFilename(thumbnail_filename)) options = [ From 2b547dd782bb31104085eef067d71ea7144b70ba Mon Sep 17 00:00:00 2001 From: lorpus Date: Sat, 14 Nov 2020 19:55:50 -0500 Subject: [PATCH 099/124] [bitwave.tv] new extractor --- docs/supportedsites.md | 1 + youtube_dlc/extractor/bitwave.py | 51 +++++++++++++++++++++++++++++ youtube_dlc/extractor/extractors.py | 4 +++ 3 files changed, 56 insertions(+) create mode 100644 youtube_dlc/extractor/bitwave.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3b98e7a12..968593cd9 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -104,6 +104,7 @@ # Supported sites - **BIQLE** - **BitChute** - **BitChuteChannel** + - **bitwave.tv** - **BleacherReport** - **BleacherReportCMS** - **blinkx** diff --git a/youtube_dlc/extractor/bitwave.py b/youtube_dlc/extractor/bitwave.py new file mode 100644 index 000000000..6fe02c8c2 --- /dev/null +++ b/youtube_dlc/extractor/bitwave.py @@ -0,0 +1,51 @@ +from .common import InfoExtractor + + +class BitwaveReplayIE(InfoExtractor): + IE_NAME = 'bitwave:replay' + _VALID_URL 
= r'https?://(?:www\.)?bitwave\.tv/(?P<user>\w+)/replay/(?P<id>\w+)/?$' + + def _real_extract(self, url): + replay_id = self._match_id(url) + replay = self._download_json( + 'https://api.bitwave.tv/v1/replays/' + replay_id, + replay_id + ) + + return { + 'id': replay_id, + 'title': replay['data']['title'], + 'uploader': replay['data']['name'], + 'uploader_id': replay['data']['name'], + 'url': replay['data']['url'], + 'thumbnails': [ + {'url': x} for x in replay['data']['thumbnails'] + ], + } + + +class BitwaveStreamIE(InfoExtractor): + IE_NAME = 'bitwave:stream' + _VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<id>\w+)/?$' + + def _real_extract(self, url): + username = self._match_id(url) + channel = self._download_json( + 'https://api.bitwave.tv/v1/channels/' + username, + username) + + formats = self._extract_m3u8_formats( + channel['data']['url'], username, + 'mp4') + self._sort_formats(formats) + + return { + 'id': username, + 'title': self._live_title(channel['data']['title']), + 'uploader': username, + 'uploader_id': username, + 'formats': formats, + 'thumbnail': channel['data']['thumbnail'], + 'is_live': True, + 'view_count': channel['data']['viewCount'] + } diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index c77ca12cc..90232c2a7 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -116,6 +116,10 @@ BitChuteIE, BitChuteChannelIE, ) +from .bitwave import ( + BitwaveReplayIE, + BitwaveStreamIE, +) from .biqle import BIQLEIE from .bleacherreport import ( BleacherReportIE, From d9c2b0a6de70a0bd610332202467eceb97bf1285 Mon Sep 17 00:00:00 2001 From: lorpus Date: Sat, 14 Nov 2020 20:18:30 -0500 Subject: [PATCH 100/124] [bitwave.tv] fix build --- youtube_dlc/extractor/bitwave.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dlc/extractor/bitwave.py b/youtube_dlc/extractor/bitwave.py index 6fe02c8c2..9aa210510 100644 --- a/youtube_dlc/extractor/bitwave.py +++ b/youtube_dlc/extractor/bitwave.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + from .common import InfoExtractor From d02f12107f3e0c640b942dafbf9d3e26f81e6473 Mon Sep 17 00:00:00 2001 From: Kyu Yeun Kim Date: Mon, 16 Nov 2020 22:03:48 +0900 Subject: [PATCH 101/124] [Vlive] Fix playlist handling when downloading a channel --- youtube_dlc/extractor/vlive.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index 935560b57..ce6549d11 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -300,13 +300,34 @@ def _real_extract(self, url): for video in videos: video_id = video.get('videoSeq') - if not video_id: + video_type = video.get('videoType') + + if not video_id or not video_type: continue + video_id = compat_str(video_id) - entries.append( - self.url_result( - 'http://www.vlive.tv/video/%s' % video_id, - ie=VLiveIE.ie_key(), video_id=video_id)) + + if video_type in ('PLAYLIST'): + playlist_videos = try_get( + video, + lambda x: x['videoPlaylist']['videoList'], list) + if not playlist_videos: + continue + + for playlist_video in playlist_videos: + playlist_video_id = playlist_video.get('videoSeq') + if not playlist_video_id: + continue + playlist_video_id = compat_str(playlist_video_id) + + entries.append( + self.url_result( + 'http://www.vlive.tv/video/%s' % playlist_video_id, + ie=VLiveIE.ie_key(), video_id=playlist_video_id)) + else: + entries.append( + self.url_result( +
'http://www.vlive.tv/video/%s' % video_id, + ie=VLiveIE.ie_key(), video_id=video_id)) return self.playlist_result( entries, channel_code, channel_name) From 8bdd16b4993b8d546b4cbbdbe4710db0bc2f971b Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 20 Nov 2020 00:52:59 +0530 Subject: [PATCH 102/124] Merge 'ytdl-org/youtube-dl/master' release 2020.11.19 Old Extractors left behind: VLivePlaylistIE YoutubeSearchURLIE YoutubeShowIE YoutubeFavouritesIE If removing old extractors, make corresponding changes in docs/supportedsites.md youtube_dlc/extractor/extractors.py Not merged: .github/ISSUE_TEMPLATE/1_broken_site.md .github/ISSUE_TEMPLATE/2_site_support_request.md .github/ISSUE_TEMPLATE/3_site_feature_request.md .github/ISSUE_TEMPLATE/4_bug_report.md .github/ISSUE_TEMPLATE/5_feature_request.md test/test_all_urls.py youtube_dlc/version.py Changelog --- devscripts/make_lazy_extractors.py | 2 +- docs/supportedsites.md | 14 +- test/test_all_urls.py | 26 +- test/test_utils.py | 28 + youtube_dlc/extractor/afreecatv.py | 2 +- youtube_dlc/extractor/arte.py | 167 ++- youtube_dlc/extractor/bandcamp.py | 262 ++-- youtube_dlc/extractor/cnbc.py | 19 +- youtube_dlc/extractor/common.py | 5 +- youtube_dlc/extractor/condenast.py | 27 +- youtube_dlc/extractor/extractors.py | 11 +- youtube_dlc/extractor/francetv.py | 47 +- youtube_dlc/extractor/generic.py | 9 +- youtube_dlc/extractor/iqiyi.py | 2 +- youtube_dlc/extractor/lbry.py | 88 ++ youtube_dlc/extractor/lrt.py | 91 +- youtube_dlc/extractor/malltv.py | 60 +- youtube_dlc/extractor/mgtv.py | 10 +- youtube_dlc/extractor/mtv.py | 12 + youtube_dlc/extractor/nbc.py | 5 +- youtube_dlc/extractor/ndr.py | 38 + youtube_dlc/extractor/rai.py | 157 +- youtube_dlc/extractor/servus.py | 111 +- youtube_dlc/extractor/spiegel.py | 161 +- youtube_dlc/extractor/twentythreevideo.py | 11 +- youtube_dlc/extractor/urplay.py | 77 +- youtube_dlc/extractor/usanetwork.py | 82 +- youtube_dlc/extractor/ustream.py | 7 +- youtube_dlc/extractor/vimeo.py | 11 +- youtube_dlc/extractor/vlive.py | 277 ++-- youtube_dlc/extractor/xtube.py | 31 +- youtube_dlc/extractor/youporn.py | 7 +- youtube_dlc/extractor/youtube.py | 1649 ++++++++++----------- youtube_dlc/utils.py | 17 +- 34 files changed, 1828 insertions(+), 1695 deletions(-) create mode 100644 youtube_dlc/extractor/lbry.py diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index e6de72b33..c27ef9781 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -61,7 +61,7 @@ def build_lazy_ie(ie, name): return s -# find the correct sorting and add the required base classes so that sublcasses +# find the correct sorting and add the required base classes so that subclasses # can be correctly created classes = _ALL_CLASSES[:-1] ordered_cls = [] diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3b98e7a12..0481f7db9 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -59,9 +59,9 @@ # Supported sites - **ARD:mediathek** - **ARDBetaMediathek** - **Arkena** - - **arte.tv:+7** - - **arte.tv:embed** - - **arte.tv:playlist** + - **ArteTV** + - **ArteTVEmbed** + - **ArteTVPlaylist** - **AsianCrush** - **AsianCrushPlaylist** - **AtresPlayer** @@ -424,6 +424,7 @@ # Supported sites - **la7.it** - **laola1tv** - **laola1tv:embed** + - **lbry.tv** - **LCI** - **Lcp** - **LcpPlay** @@ -835,8 +836,6 @@ # Supported sites - **SpankBangPlaylist** - **Spankwire** - **Spiegel** - - **Spiegel:Article**: Articles on spiegel.de - - **Spiegeltv** - 
**sport.francetvinfo.fr** - **Sport5** - **SportBox** @@ -1147,19 +1146,18 @@ # Supported sites - **YourPorn** - **YourUpload** - **youtube**: YouTube.com - - **youtube:channel**: YouTube.com channels - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication) - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) - **youtube:live**: YouTube.com live streams - **youtube:playlist**: YouTube.com playlists - - **youtube:playlists**: YouTube.com user/channel playlists - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) - **youtube:search**: YouTube.com searches - **youtube:search:date**: YouTube.com searches, newest videos first - **youtube:search_url**: YouTube.com search URLs - **youtube:show**: YouTube.com (multi-season) shows - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) + - **YoutubeYtUser**: YouTube.com user videos (URL or "ytuser" keyword) + - **youtube:tab**: YouTube.com tab - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) - **Zapiks** - **Zaq1** diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 548bc6750..a44cf7549 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -31,15 +31,17 @@ def assertMatch(self, url, ie_list): def test_youtube_playlist_matching(self): assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist']) + assertTab = lambda url: self.assertMatch(url, ['youtube:tab']) assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585 - assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') + assertPlaylist('PL63F0C78739B09958') + assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') - assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') - assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668 + assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') + assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668 self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M')) # Top tracks - assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101') + assertTab('https://www.youtube.com/playlist?list=MCUS.20142101') def test_youtube_matching(self): self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M')) @@ -50,26 +52,22 @@ def test_youtube_matching(self): self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube']) def test_youtube_channel_matching(self): - assertChannel = lambda url: self.assertMatch(url, ['youtube:channel']) + assertChannel = lambda url: self.assertMatch(url, ['youtube:tab']) assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM') assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec') assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos') - def test_youtube_user_matching(self): - self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:user']) + # def test_youtube_user_matching(self): + # self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', 
['youtube:tab']) def test_youtube_feeds(self): self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions']) self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended']) - self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites']) - def test_youtube_show_matching(self): - self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show']) - - def test_youtube_search_matching(self): - self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) - self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) + # def test_youtube_search_matching(self): + # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) + # self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) def test_youtube_extract(self): assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) diff --git a/test/test_utils.py b/test/test_utils.py index 95231200b..16ad40831 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -937,6 +937,28 @@ def test_js_to_json_edgecases(self): self.assertEqual(d['x'], 1) self.assertEqual(d['y'], 'a') + # Just drop ! prefix for now though this results in a wrong value + on = js_to_json('''{ + a: !0, + b: !1, + c: !!0, + d: !!42.42, + e: !!![], + f: !"abc", + g: !"", + !42: 42 + }''') + self.assertEqual(json.loads(on), { + 'a': 0, + 'b': 1, + 'c': 0, + 'd': 42.42, + 'e': [], + 'f': "abc", + 'g': "", + '42': 42 + }) + on = js_to_json('["abc", "def",]') self.assertEqual(json.loads(on), ['abc', 'def']) @@ -994,6 +1016,12 @@ def test_js_to_json_edgecases(self): on = js_to_json('{42:4.2e1}') self.assertEqual(json.loads(on), {'42': 42.0}) + on = js_to_json('{ "0x40": "0x40" }') + self.assertEqual(json.loads(on), {'0x40': '0x40'}) + + on = js_to_json('{ "040": "040" }') + self.assertEqual(json.loads(on), {'040': '040'}) + def test_js_to_json_malformed(self): self.assertEqual(js_to_json('42a1'), '42"a1"') self.assertEqual(js_to_json('42a-1'), '42"a"-1') diff --git a/youtube_dlc/extractor/afreecatv.py b/youtube_dlc/extractor/afreecatv.py index 6275e5209..b56abb1e6 100644 --- a/youtube_dlc/extractor/afreecatv.py +++ b/youtube_dlc/extractor/afreecatv.py @@ -275,7 +275,7 @@ def _real_extract(self, url): video_element = video_xml.findall(compat_xpath('./track/video'))[-1] if video_element is None or video_element.text is None: raise ExtractorError( - 'Video %s video does not exist' % video_id, expected=True) + 'Video %s does not exist' % video_id, expected=True) video_url = video_element.text.strip() diff --git a/youtube_dlc/extractor/arte.py b/youtube_dlc/extractor/arte.py index 2bd3bfe8a..03abdbfaf 100644 --- a/youtube_dlc/extractor/arte.py +++ b/youtube_dlc/extractor/arte.py @@ -4,23 +4,57 @@ import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( ExtractorError, int_or_none, qualities, try_get, unified_strdate, + url_or_none, ) -# There are different sources of video in arte.tv, the extraction process -# is different for each one. The videos usually expire in 7 days, so we can't -# add tests. 
-
 class ArteTVBaseIE(InfoExtractor):
-    def _extract_from_json_url(self, json_url, video_id, lang, title=None):
-        info = self._download_json(json_url, video_id)
+    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
+    _API_BASE = 'https://api.arte.tv/api/player/v1'
+
+
+class ArteTVIE(ArteTVBaseIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
+                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
+                        )
+                        /(?P<id>\d{6}-\d{3}-[AF])
+                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
+    _TESTS = [{
+        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
+        'info_dict': {
+            'id': '088501-000-A',
+            'ext': 'mp4',
+            'title': 'Mexico: Stealing Petrol to Survive',
+            'upload_date': '20190628',
+        },
+    }, {
+        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
+        'only_matching': True,
+    }, {
+        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        lang = mobj.group('lang') or mobj.group('lang_2')
+
+        info = self._download_json(
+            '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id)
         player_info = info['videoJsonPlayer']
 
         vsr = try_get(player_info, lambda x: x['VSR'], dict)
@@ -37,18 +71,11 @@ def _extract_from_json_url(self, json_url, video_id, lang, title=None):
         if not upload_date_str:
             upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
 
-        title = (player_info.get('VTI') or title or player_info['VID']).strip()
+        title = (player_info.get('VTI') or player_info['VID']).strip()
         subtitle = player_info.get('VSU', '').strip()
         if subtitle:
             title += ' - %s' % subtitle
 
-        info_dict = {
-            'id': player_info['VID'],
-            'title': title,
-            'description': player_info.get('VDE'),
-            'upload_date': unified_strdate(upload_date_str),
-            'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
-        }
         qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])
 
         LANGS = {
@@ -65,6 +92,10 @@ def _extract_from_json_url(self, json_url, video_id, lang, title=None):
         formats = []
         for format_id, format_dict in vsr.items():
             f = dict(format_dict)
+            format_url = url_or_none(f.get('url'))
+            streamer = f.get('streamer')
+            if not format_url and not streamer:
+                continue
             versionCode = f.get('versionCode')
             l = re.escape(langcode)
 
@@ -107,6 +138,16 @@ def _extract_from_json_url(self, json_url, video_id, lang, title=None):
             else:
                 lang_pref = -1
 
+            media_type = f.get('mediaType')
+            if media_type == 'hls':
+                m3u8_formats = self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id=format_id, fatal=False)
+                for m3u8_format in m3u8_formats:
+                    m3u8_format['language_preference'] = lang_pref
+                formats.extend(m3u8_formats)
+                continue
+
             format = {
                 'format_id': format_id,
                 'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
@@ -118,7 +159,7 @@ def _extract_from_json_url(self, json_url, video_id, lang, title=None):
                 'quality': qfunc(f.get('quality')),
             }
 
-            if f.get('mediaType') == 'rtmp':
+            if media_type == 'rtmp':
                 format['url'] = f['streamer']
                 format['play_path'] = 'mp4:' + f['url']
                 format['ext'] = 'flv'
@@ -127,56 +168,50 @@ def _extract_from_json_url(self, json_url, video_id, lang, title=None):
 
             formats.append(format)
 
-        self._check_formats(formats, video_id)
         self._sort_formats(formats)
 
-        info_dict['formats'] = formats
-        return info_dict
+        return {
+            'id': player_info.get('VID') or video_id,
+            'title': title,
+            'description':
player_info.get('VDE'),
+            'upload_date': unified_strdate(upload_date_str),
+            'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
+            'formats': formats,
+        }
 
 
-class ArteTVPlus7IE(ArteTVBaseIE):
-    IE_NAME = 'arte.tv:+7'
-    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])'
-
+class ArteTVEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
     _TESTS = [{
-        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
+        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
         'info_dict': {
-            'id': '088501-000-A',
+            'id': '100605-013-A',
             'ext': 'mp4',
-            'title': 'Mexico: Stealing Petrol to Survive',
-            'upload_date': '20190628',
+            'title': 'United we Stream November Lockdown Edition #13',
+            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
+            'upload_date': '20201116',
         },
+    }, {
+        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
+        'only_matching': True,
     }]
 
-    def _real_extract(self, url):
-        lang, video_id = re.match(self._VALID_URL, url).groups()
-        return self._extract_from_json_url(
-            'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id),
-            video_id, lang)
-
-
-class ArteTVEmbedIE(ArteTVPlus7IE):
-    IE_NAME = 'arte.tv:embed'
-    _VALID_URL = r'''(?x)
-        https://www\.arte\.tv
-        /player/v3/index\.php\?json_url=
-        (?P<json_url>
-            https?://api\.arte\.tv/api/player/v1/config/
-            (?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF])
-        )
-    '''
-
-    _TESTS = []
+    @staticmethod
+    def _extract_urls(webpage):
+        return [url for _, url in re.findall(
+            r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
+            webpage)]
 
     def _real_extract(self, url):
-        json_url, lang, video_id = re.match(self._VALID_URL, url).groups()
-        return self._extract_from_json_url(json_url, video_id, lang)
+        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        json_url = qs['json_url'][0]
+        video_id = ArteTVIE._match_id(json_url)
+        return self.url_result(
+            json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
 
 
 class ArteTVPlaylistIE(ArteTVBaseIE):
-    IE_NAME = 'arte.tv:playlist'
-    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})'
-
+    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
     _TESTS = [{
         'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
         'info_dict': {
@@ -185,17 +220,35 @@
             'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
         },
         'playlist_mincount': 6,
+    }, {
+        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         lang, playlist_id = re.match(self._VALID_URL, url).groups()
         collection = self._download_json(
-            'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
-            % (lang, playlist_id), playlist_id)
+            '%s/collectionData/%s/%s?source=videos'
+            % (self._API_BASE, lang, playlist_id), playlist_id)
+        entries = []
+        for video in collection['videos']:
+            if not isinstance(video, dict):
+                continue
+            video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl'))
+            if not video_url:
+                continue
+            video_id = video.get('programId')
+            entries.append({
+                '_type': 'url_transparent',
+                'url': video_url,
+                'id':
video_id, + 'title': video.get('title'), + 'alt_title': video.get('subtitle'), + 'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)), + 'duration': int_or_none(video.get('durationSeconds')), + 'view_count': int_or_none(video.get('views')), + 'ie_key': ArteTVIE.ie_key(), + }) title = collection.get('title') description = collection.get('shortDescription') or collection.get('teaserText') - entries = [ - self._extract_from_json_url( - video['jsonUrl'], video.get('programId') or playlist_id, lang) - for video in collection['videos'] if video.get('jsonUrl')] return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dlc/extractor/bandcamp.py b/youtube_dlc/extractor/bandcamp.py index 0e7492764..69e673a26 100644 --- a/youtube_dlc/extractor/bandcamp.py +++ b/youtube_dlc/extractor/bandcamp.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import random @@ -5,10 +6,7 @@ import time from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, float_or_none, @@ -17,71 +15,32 @@ parse_filesize, str_or_none, try_get, - unescapeHTML, update_url_query, unified_strdate, unified_timestamp, url_or_none, + urljoin, ) -class BandcampBaseIE(InfoExtractor): - """Provide base functions for Bandcamp extractors""" - - def _extract_json_from_html_data_attribute(self, webpage, suffix, video_id): - json_string = self._html_search_regex( - r' data-%s="([^"]*)' % suffix, - webpage, '%s json' % suffix, default='{}') - - return self._parse_json(json_string, video_id) - - def _parse_json_track(self, json): - formats = [] - file_ = json.get('file') - if isinstance(file_, dict): - for format_id, format_url in file_.items(): - if not url_or_none(format_url): - continue - ext, abr_str = format_id.split('-', 1) - formats.append({ - 'format_id': format_id, - 'url': self._proto_relative_url(format_url, 'http:'), - 'ext': ext, - 'vcodec': 'none', - 'acodec': ext, - 'abr': int_or_none(abr_str), - }) - - return { - 'duration': float_or_none(json.get('duration')), - 'id': str_or_none(json.get('track_id') or json.get('id')), - 'title': json.get('title'), - 'title_link': json.get('title_link'), - 'number': int_or_none(json.get('track_num')), - 'formats': formats - } - - -class BandcampIE(BandcampBaseIE): - IE_NAME = "Bandcamp:track" - _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P[^/?#&]+)' +class BandcampIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭", 'duration': 9.8485, - 'uploader': "youtube-dl \"'/\\\u00e4\u21ad", - 'timestamp': 1354224127, + 'uploader': 'youtube-dl "\'/\\ä↭', 'upload_date': '20121129', + 'timestamp': 1354224127, }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { # free download 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '5d92af55811e47f38962a54c30b07ef0', 'info_dict': { 'id': '2650410135', 'ext': 'aiff', @@ -120,52 +79,59 @@ class BandcampIE(BandcampBaseIE): }, }] + def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True): + 
return self._parse_json(self._html_search_regex( + r'data-%s=(["\'])({.+?})\1' % attr, webpage, + attr + ' data', group=2), video_id, fatal=fatal) + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') - url_track_title = title + title = self._match_id(url) webpage = self._download_webpage(url, title) - thumbnail = self._html_search_meta('og:image', webpage, default=None) + tralbum = self._extract_data_attr(webpage, title) + thumbnail = self._og_search_thumbnail(webpage) - json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", url_track_title) - json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", url_track_title) + track_id = None + track = None + track_number = None + duration = None - json_tracks = json_tralbum.get('trackinfo') - if not json_tracks: - raise ExtractorError('Could not extract track') + formats = [] + track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict) + if track_info: + file_ = track_info.get('file') + if isinstance(file_, dict): + for format_id, format_url in file_.items(): + if not url_or_none(format_url): + continue + ext, abr_str = format_id.split('-', 1) + formats.append({ + 'format_id': format_id, + 'url': self._proto_relative_url(format_url, 'http:'), + 'ext': ext, + 'vcodec': 'none', + 'acodec': ext, + 'abr': int_or_none(abr_str), + }) + track = track_info.get('title') + track_id = str_or_none( + track_info.get('track_id') or track_info.get('id')) + track_number = int_or_none(track_info.get('track_num')) + duration = float_or_none(track_info.get('duration')) - track = self._parse_json_track(json_tracks[0]) - artist = json_tralbum.get('artist') - album_title = json_embed.get('album_title') + embed = self._extract_data_attr(webpage, title, 'embed', False) + current = tralbum.get('current') or {} + artist = embed.get('artist') or current.get('artist') or tralbum.get('artist') + timestamp = unified_timestamp( + current.get('publish_date') or tralbum.get('album_publish_date')) - json_album = json_tralbum.get('packages') - if json_album: - json_album = json_album[0] - album_publish_date = json_album.get('album_publish_date') - album_release_date = json_album.get('album_release_date') - else: - album_publish_date = None - album_release_date = json_tralbum.get('album_release_date') - - timestamp = unified_timestamp(json_tralbum.get('current', {}).get('publish_date') or album_publish_date) - release_date = unified_strdate(album_release_date) - - download_link = self._search_regex( - r'freeDownloadPage(?:["\']|"):\s*(["\']|")(?P<url>(?:(?!\1).)+)\1', webpage, - 'download link', default=None, group='url') + download_link = tralbum.get('freeDownloadPage') if download_link: - track_id = self._search_regex( - r'\?id=(?P<id>\d+)&', - download_link, 'track id') + track_id = compat_str(tralbum['id']) download_webpage = self._download_webpage( download_link, track_id, 'Downloading free downloads page') - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage, - 'blob', group='blob'), - track_id, transform_source=unescapeHTML) + blob = self._extract_data_attr(download_webpage, track_id, 'blob') info = try_get( blob, (lambda x: x['digital_items'][0], @@ -173,6 +139,8 @@ def _real_extract(self, url): if info: downloads = info.get('downloads') if isinstance(downloads, dict): + if not track: + track = info.get('title') if not artist: artist = info.get('artist') if not thumbnail: @@ -206,7 +174,7 @@ def _real_extract(self, url): 
retry_url = url_or_none(stat.get('retry_url')) if not retry_url: continue - track['formats'].append({ + formats.append({ 'url': self._proto_relative_url(retry_url, 'http:'), 'ext': download_formats.get(format_id), 'format_id': format_id, @@ -215,30 +183,34 @@ def _real_extract(self, url): 'vcodec': 'none', }) - self._sort_formats(track['formats']) + self._sort_formats(formats) - title = '%s - %s' % (artist, track.get('title')) if artist else track.get('title') + title = '%s - %s' % (artist, track) if artist else track + + if not duration: + duration = float_or_none(self._html_search_meta( + 'duration', webpage, default=None)) return { - 'album': album_title, - 'artist': artist, - 'duration': track['duration'], - 'formats': track['formats'], - 'id': track['id'], - 'release_date': release_date, - 'thumbnail': thumbnail, - 'timestamp': timestamp, + 'id': track_id, 'title': title, - 'track': track['title'], - 'track_id': track['id'], - 'track_number': track['number'], - 'uploader': artist + 'thumbnail': thumbnail, + 'uploader': artist, + 'timestamp': timestamp, + 'release_date': unified_strdate(tralbum.get('album_release_date')), + 'duration': duration, + 'track': track, + 'track_number': track_number, + 'track_id': track_id, + 'artist': artist, + 'album': embed.get('album_title'), + 'formats': formats, } -class BandcampAlbumIE(BandcampBaseIE): +class BandcampAlbumIE(BandcampIE): IE_NAME = 'Bandcamp:album' - _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' + _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?' _TESTS = [{ 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -248,7 +220,10 @@ class BandcampAlbumIE(BandcampBaseIE): 'info_dict': { 'id': '1353101989', 'ext': 'mp3', - 'title': 'Intro', + 'title': 'Blazo - Intro', + 'timestamp': 1311756226, + 'upload_date': '20110727', + 'uploader': 'Blazo', } }, { @@ -256,7 +231,10 @@ class BandcampAlbumIE(BandcampBaseIE): 'info_dict': { 'id': '38097443', 'ext': 'mp3', - 'title': 'Kero One - Keep It Alive (Blazo remix)', + 'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)', + 'timestamp': 1311757238, + 'upload_date': '20110727', + 'uploader': 'Blazo', } }, ], @@ -292,6 +270,7 @@ class BandcampAlbumIE(BandcampBaseIE): 'title': '"Entropy" EP', 'uploader_id': 'jstrecords', 'id': 'entropy-ep', + 'description': 'md5:0ff22959c943622972596062f2f366a5', }, 'playlist_mincount': 3, }, { @@ -301,6 +280,7 @@ class BandcampAlbumIE(BandcampBaseIE): 'id': 'we-are-the-plague', 'title': 'WE ARE THE PLAGUE', 'uploader_id': 'insulters', + 'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f', }, 'playlist_count': 2, }] @@ -312,41 +292,34 @@ def suitable(cls, url): else super(BandcampAlbumIE, cls).suitable(url)) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - uploader_id = mobj.group('subdomain') - album_id = mobj.group('album_id') + uploader_id, album_id = re.match(self._VALID_URL, url).groups() playlist_id = album_id or uploader_id webpage = self._download_webpage(url, playlist_id) - - json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", playlist_id) - json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", playlist_id) - - json_tracks = json_tralbum.get('trackinfo') - if not json_tracks: - raise ExtractorError('Could not extract album tracks') - - album_title = json_embed.get('album_title') - + tralbum = self._extract_data_attr(webpage, playlist_id) + track_info = 
tralbum.get('trackinfo') + if not track_info: + raise ExtractorError('The page doesn\'t contain any tracks') # Only tracks with duration info have songs - tracks = [self._parse_json_track(track) for track in json_tracks] entries = [ self.url_result( - compat_urlparse.urljoin(url, track['title_link']), - ie=BandcampIE.ie_key(), video_id=track['id'], - video_title=track['title']) - for track in tracks - if track.get('duration')] + urljoin(url, t['title_link']), BandcampIE.ie_key(), + str_or_none(t.get('track_id') or t.get('id')), t.get('title')) + for t in track_info + if t.get('duration')] + + current = tralbum.get('current') or {} return { '_type': 'playlist', 'uploader_id': uploader_id, 'id': playlist_id, - 'title': album_title, - 'entries': entries + 'title': current.get('title'), + 'description': current.get('about'), + 'entries': entries, } -class BandcampWeeklyIE(InfoExtractor): +class BandcampWeeklyIE(BandcampIE): IE_NAME = 'Bandcamp:weekly' _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' _TESTS = [{ @@ -361,29 +334,23 @@ class BandcampWeeklyIE(InfoExtractor): 'release_date': '20170404', 'series': 'Bandcamp Weekly', 'episode': 'Magic Moments', - 'episode_number': 208, 'episode_id': '224', - } + }, + 'params': { + 'format': 'opus-lo', + }, }, { 'url': 'https://bandcamp.com/?blah/blah@&show=228', 'only_matching': True }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage, - 'blob', group='blob'), - video_id, transform_source=unescapeHTML) + blob = self._extract_data_attr(webpage, show_id, 'blob') - show = blob['bcw_show'] - - # This is desired because any invalid show id redirects to `bandcamp.com` - # which happens to expose the latest Bandcamp Weekly episode. 
- show_id = int_or_none(show.get('show_id')) or int_or_none(video_id) + show = blob['bcw_data'][show_id] formats = [] for format_id, format_url in show['audio_stream'].items(): @@ -408,20 +375,8 @@ def _real_extract(self, url): if subtitle: title += ' - %s' % subtitle - episode_number = None - seq = blob.get('bcw_seq') - - if seq and isinstance(seq, list): - try: - episode_number = next( - int_or_none(e.get('episode_number')) - for e in seq - if isinstance(e, dict) and int_or_none(e.get('id')) == show_id) - except StopIteration: - pass - return { - 'id': video_id, + 'id': show_id, 'title': title, 'description': show.get('desc') or show.get('short_desc'), 'duration': float_or_none(show.get('audio_duration')), @@ -429,7 +384,6 @@ def _real_extract(self, url): 'release_date': unified_strdate(show.get('published_date')), 'series': 'Bandcamp Weekly', 'episode': show.get('subtitle'), - 'episode_number': episode_number, - 'episode_id': compat_str(video_id), + 'episode_id': show_id, 'formats': formats } diff --git a/youtube_dlc/extractor/cnbc.py b/youtube_dlc/extractor/cnbc.py index 6889b0f40..7b9f4536a 100644 --- a/youtube_dlc/extractor/cnbc.py +++ b/youtube_dlc/extractor/cnbc.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import re from .common import InfoExtractor from ..utils import smuggle_url @@ -38,7 +39,7 @@ def _real_extract(self, url): class CNBCVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)' + _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)' _TEST = { 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', 'info_dict': { @@ -56,11 +57,15 @@ class CNBCVideoIE(InfoExtractor): } def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id, - 'video id') + path, display_id = re.match(self._VALID_URL, url).groups() + video_id = self._download_json( + 'https://webql-redesign.cnbcfm.com/graphql', display_id, query={ + 'query': '''{ + page(path: "%s") { + vcpsId + } +}''' % path, + })['data']['page']['vcpsId'] return self.url_result( - 'http://video.cnbc.com/gallery/?video=%s' % video_id, + 'http://video.cnbc.com/gallery/?video=%d' % video_id, CNBCIE.ie_key()) diff --git a/youtube_dlc/extractor/common.py b/youtube_dlc/extractor/common.py index 4b42d699f..f90cf36ed 100644 --- a/youtube_dlc/extractor/common.py +++ b/youtube_dlc/extractor/common.py @@ -1456,9 +1456,10 @@ def _is_valid_url(self, url, video_id, item='video', headers={}): try: self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) return True - except ExtractorError: + except ExtractorError as e: self.to_screen( - '%s: %s URL is invalid, skipping' % (video_id, item)) + '%s: %s URL is invalid, skipping: %s' + % (video_id, item, error_to_compat_str(e.cause))) return False def http_scheme(self): diff --git a/youtube_dlc/extractor/condenast.py b/youtube_dlc/extractor/condenast.py index ed278fefc..d5e77af32 100644 --- a/youtube_dlc/extractor/condenast.py +++ b/youtube_dlc/extractor/condenast.py @@ -16,6 +16,8 @@ mimetype2ext, orderedSet, parse_iso8601, + strip_or_none, + try_get, ) @@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor): 'uploader': 'gq', 'upload_date': '20170321', 'timestamp': 1490126427, + 'description': 'How much grimmer would things be if these people were competent?', }, }, 
{ # JS embed @@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor): 'title': '3D printed TSA Travel Sentry keys really do open TSA locks', 'uploader': 'arstechnica', 'upload_date': '20150916', - 'timestamp': 1442434955, + 'timestamp': 1442434920, } }, { 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', @@ -196,6 +199,13 @@ def _extract_video(self, params): }) self._sort_formats(formats) + subtitles = {} + for t, caption in video_info.get('captions', {}).items(): + caption_url = caption.get('src') + if not (t in ('vtt', 'srt', 'tml') and caption_url): + continue + subtitles.setdefault('en', []).append({'url': caption_url}) + return { 'id': video_id, 'formats': formats, @@ -208,6 +218,7 @@ def _extract_video(self, params): 'season': video_info.get('season_title'), 'timestamp': parse_iso8601(video_info.get('premiere_date')), 'categories': video_info.get('categories'), + 'subtitles': subtitles, } def _real_extract(self, url): @@ -225,8 +236,16 @@ def _real_extract(self, url): if url_type == 'series': return self._extract_series(url, webpage) else: - params = self._extract_video_params(webpage, display_id) - info = self._search_json_ld( - webpage, display_id, fatal=False) + video = try_get(self._parse_json(self._search_regex( + r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage, + 'preload state', '{}'), display_id), + lambda x: x['transformed']['video']) + if video: + params = {'videoId': video['id']} + info = {'description': strip_or_none(video.get('description'))} + else: + params = self._extract_video_params(webpage, display_id) + info = self._search_json_ld( + webpage, display_id, fatal=False) info.update(self._extract_video(params)) return info diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index c77ca12cc..9e832450a 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -62,7 +62,7 @@ ARDMediathekIE, ) from .arte import ( - ArteTVPlus7IE, + ArteTVIE, ArteTVEmbedIE, ArteTVPlaylistIE, ) @@ -542,6 +542,7 @@ EHFTVIE, ITTFIE, ) +from .lbry import LBRYIE from .lci import LCIIE from .lcp import ( LcpPlayIE, @@ -1079,8 +1080,7 @@ SpankBangPlaylistIE, ) from .spankwire import SpankwireIE -from .spiegel import SpiegelIE, SpiegelArticleIE -from .spiegeltv import SpiegeltvIE +from .spiegel import SpiegelIE from .spike import ( BellatorIE, ParamountNetworkIE, @@ -1505,12 +1505,11 @@ from .yourupload import YourUploadIE from .youtube import ( YoutubeIE, - YoutubeChannelIE, YoutubeFavouritesIE, YoutubeHistoryIE, YoutubeLiveIE, + YoutubeTabIE, YoutubePlaylistIE, - YoutubePlaylistsIE, YoutubeRecommendedIE, YoutubeSearchDateIE, YoutubeSearchIE, @@ -1519,7 +1518,7 @@ YoutubeSubscriptionsIE, YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, - YoutubeUserIE, + YoutubeYtUserIE, YoutubeWatchLaterIE, ) from .zapiks import ZapiksIE diff --git a/youtube_dlc/extractor/francetv.py b/youtube_dlc/extractor/francetv.py index e340cddba..dbedfc091 100644 --- a/youtube_dlc/extractor/francetv.py +++ b/youtube_dlc/extractor/francetv.py @@ -17,6 +17,7 @@ parse_duration, try_get, url_or_none, + urljoin, ) from .dailymotion import DailymotionIE @@ -128,18 +129,38 @@ def sign(manifest_url, manifest_id): is_live = None - formats = [] - for video in info['videos']: - if video['statut'] != 'ONLINE': + videos = [] + + for video in (info.get('videos') or []): + if video.get('statut') != 'ONLINE': continue - video_url = video['url'] + if not video.get('url'): + continue + videos.append(video) + + if not videos: + for 
device_type in ['desktop', 'mobile']: + fallback_info = self._download_json( + 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, + video_id, 'Downloading fallback %s video JSON' % device_type, query={ + 'device_type': device_type, + 'browser': 'chrome', + }, fatal=False) + + if fallback_info and fallback_info.get('video'): + videos.append(fallback_info['video']) + + formats = [] + for video in videos: + video_url = video.get('url') if not video_url: continue if is_live is None: is_live = (try_get( - video, lambda x: x['plages_ouverture'][0]['direct'], - bool) is True) or '/live.francetv.fr/' in video_url - format_id = video['format'] + video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True + or video.get('is_live') is True + or '/live.francetv.fr/' in video_url) + format_id = video.get('format') ext = determine_ext(video_url) if ext == 'f4m': if georestricted: @@ -154,6 +175,9 @@ def sign(manifest_url, manifest_id): sign(video_url, format_id), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False)) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, @@ -166,6 +190,7 @@ def sign(manifest_url, manifest_id): 'url': video_url, 'format_id': format_id, }) + self._sort_formats(formats) title = info['titre'] @@ -185,10 +210,10 @@ def sign(manifest_url, manifest_id): return { 'id': video_id, 'title': self._live_title(title) if is_live else title, - 'description': clean_html(info['synopsis']), - 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']), - 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']), - 'timestamp': int_or_none(info['diffusion']['timestamp']), + 'description': clean_html(info.get('synopsis')), + 'thumbnail': urljoin('http://pluzz.francetv.fr', info.get('image')), + 'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')), + 'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])), 'is_live': is_live, 'formats': formats, 'subtitles': subtitles, diff --git a/youtube_dlc/extractor/generic.py b/youtube_dlc/extractor/generic.py index aba06b328..ce8cac5c1 100644 --- a/youtube_dlc/extractor/generic.py +++ b/youtube_dlc/extractor/generic.py @@ -91,6 +91,7 @@ from .videa import VideaIE from .twentymin import TwentyMinutenIE from .ustream import UstreamIE +from .arte import ArteTVEmbedIE from .videopress import VideoPressIE from .rutube import RutubeIE from .limelight import LimelightBaseIE @@ -2760,11 +2761,9 @@ def _real_extract(self, url): return self.url_result(ustream_url, UstreamIE.ie_key()) # Look for embedded arte.tv player - mobj = re.search( - r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'ArteTVEmbed') + arte_urls = ArteTVEmbedIE._extract_urls(webpage) + if arte_urls: + return self.playlist_from_matches(arte_urls, video_id, video_title) # Look for embedded francetv player mobj = re.search( diff --git a/youtube_dlc/extractor/iqiyi.py b/youtube_dlc/extractor/iqiyi.py index cd11aa70f..5df674daf 100644 --- a/youtube_dlc/extractor/iqiyi.py +++ b/youtube_dlc/extractor/iqiyi.py @@ -150,7 +150,7 @@ def run(self, target, ip, timestamp): elif function in other_functions: other_functions[function]() else: - raise ExtractorError('Unknown 
funcion %s' % function) + raise ExtractorError('Unknown function %s' % function) return sdk.target diff --git a/youtube_dlc/extractor/lbry.py b/youtube_dlc/extractor/lbry.py new file mode 100644 index 000000000..0a7ee919c --- /dev/null +++ b/youtube_dlc/extractor/lbry.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + mimetype2ext, + try_get, +) + + +class LBRYIE(InfoExtractor): + IE_NAME = 'lbry.tv' + _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[0-9a-zA-Z-]+:[0-9a-z]+/[0-9a-zA-Z().-]+:[0-9a-z])' + _TESTS = [{ + # Video + 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', + 'md5': '65bd7ec1f6744ada55da8e4c48a2edf9', + 'info_dict': { + 'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d', + 'ext': 'mp4', + 'title': 'First day in LBRY? Start HERE!', + 'description': 'md5:f6cb5c704b332d37f5119313c2c98f51', + 'timestamp': 1595694354, + 'upload_date': '20200725', + } + }, { + # Audio + 'url': 'https://lbry.tv/@LBRYFoundation:0/Episode-1:e', + 'md5': 'c94017d3eba9b49ce085a8fad6b98d00', + 'info_dict': { + 'id': 'e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', + 'ext': 'mp3', + 'title': 'The LBRY Foundation Community Podcast Episode 1 - Introduction, Streaming on LBRY, Transcoding', + 'description': 'md5:661ac4f1db09f31728931d7b88807a61', + 'timestamp': 1591312601, + 'upload_date': '20200604', + } + }, { + 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', + 'only_matching': True, + }] + + def _call_api_proxy(self, method, display_id, params): + return self._download_json( + 'https://api.lbry.tv/api/v1/proxy', display_id, + headers={'Content-Type': 'application/json-rpc'}, + data=json.dumps({ + 'method': method, + 'params': params, + }).encode())['result'] + + def _real_extract(self, url): + display_id = self._match_id(url).replace(':', '#') + uri = 'lbry://' + display_id + result = self._call_api_proxy( + 'resolve', display_id, {'urls': [uri]})[uri] + result_value = result['value'] + if result_value.get('stream_type') not in ('video', 'audio'): + raise ExtractorError('Unsupported URL', expected=True) + streaming_url = self._call_api_proxy( + 'get', display_id, {'uri': uri})['streaming_url'] + source = result_value.get('source') or {} + media = result_value.get('video') or result_value.get('audio') or {} + signing_channel = result_value.get('signing_channel') or {} + + return { + 'id': result['claim_id'], + 'title': result_value['title'], + 'thumbnail': try_get(result_value, lambda x: x['thumbnail']['url'], compat_str), + 'description': result_value.get('description'), + 'license': result_value.get('license'), + 'timestamp': int_or_none(result.get('timestamp')), + 'tags': result_value.get('tags'), + 'width': int_or_none(media.get('width')), + 'height': int_or_none(media.get('height')), + 'duration': int_or_none(media.get('duration')), + 'channel': signing_channel.get('name'), + 'channel_id': signing_channel.get('claim_id'), + 'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')), + 'filesize': int_or_none(source.get('size')), + 'url': streaming_url, + } diff --git a/youtube_dlc/extractor/lrt.py b/youtube_dlc/extractor/lrt.py index f5c997ef4..89d549858 100644 --- a/youtube_dlc/extractor/lrt.py +++ b/youtube_dlc/extractor/lrt.py @@ -5,28 +5,26 @@ from .common import InfoExtractor from ..utils import ( - determine_ext, - int_or_none, 
- parse_duration, - remove_end, + clean_html, + merge_dicts, ) class LRTIE(InfoExtractor): IE_NAME = 'lrt.lt' - _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P<path>/mediateka/irasas/(?P<id>[0-9]+))' _TESTS = [{ # m3u8 download - 'url': 'http://www.lrt.lt/mediateka/irasas/54391/', - 'md5': 'fe44cf7e4ab3198055f2c598fc175cb0', + 'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene', + 'md5': '85cb2bb530f31d91a9c65b479516ade4', 'info_dict': { - 'id': '54391', + 'id': '2000127261', 'ext': 'mp4', - 'title': 'Septynios Kauno dienos', - 'description': 'md5:24d84534c7dc76581e59f5689462411a', - 'duration': 1783, - 'view_count': int, - 'like_count': int, + 'title': 'Greita ir gardu: Sicilijos įkvėpta klasikinių makaronų su baklažanais vakarienė', + 'description': 'md5:ad7d985f51b0dc1489ba2d76d7ed47fa', + 'duration': 3035, + 'timestamp': 1604079000, + 'upload_date': '20201030', }, }, { # direct mp3 download @@ -43,52 +41,35 @@ class LRTIE(InfoExtractor): }, }] + def _extract_js_var(self, webpage, var_name, default): + return self._search_regex( + r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name, + webpage, var_name.replace('_', ' '), default, group=2) + def _real_extract(self, url): - video_id = self._match_id(url) + path, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, video_id) - title = remove_end(self._og_search_title(webpage), ' - LRT') + media_url = self._extract_js_var(webpage, 'main_url', path) + media = self._download_json(self._extract_js_var( + webpage, 'media_info_url', + 'https://www.lrt.lt/servisai/stream_url/vod/media_info/'), + video_id, query={'url': media_url}) + jw_data = self._parse_jwplayer_data( + media['playlist_item'], video_id, base_url=url) - formats = [] - for _, file_url in re.findall( - r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): - ext = determine_ext(file_url) - if ext not in ('m3u8', 'mp3'): + json_ld_data = self._search_json_ld(webpage, video_id) + + tags = [] + for tag in (media.get('tags') or []): + tag_name = tag.get('name') + if not tag_name: continue - # mp3 served as m3u8 produces stuttered media file - if ext == 'm3u8' and '.mp3' in file_url: - continue - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - file_url, video_id, 'mp4', entry_protocol='m3u8_native', - fatal=False)) - elif ext == 'mp3': - formats.append({ - 'url': file_url, - 'vcodec': 'none', - }) - self._sort_formats(formats) + tags.append(tag_name) - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description(webpage) - duration = parse_duration(self._search_regex( - r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1', - webpage, 'duration', default=None, group='duration')) - - view_count = int_or_none(self._html_search_regex( - r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>', - webpage, 'view count', fatal=False, group='count')) - like_count = int_or_none(self._search_regex( - r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<', - webpage, 'like count', fatal=False, group='count')) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, + clean_info = { + 'description': clean_html(media.get('content')), + 'tags': tags, } + + return 
merge_dicts(clean_info, jw_data, json_ld_data) diff --git a/youtube_dlc/extractor/malltv.py b/youtube_dlc/extractor/malltv.py index 6f4fd927f..fadfd9338 100644 --- a/youtube_dlc/extractor/malltv.py +++ b/youtube_dlc/extractor/malltv.py @@ -1,10 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import merge_dicts +from ..utils import ( + clean_html, + dict_get, + float_or_none, + int_or_none, + merge_dicts, + parse_duration, + try_get, +) class MallTVIE(InfoExtractor): @@ -17,7 +23,7 @@ class MallTVIE(InfoExtractor): 'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', 'ext': 'mp4', 'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?', - 'description': 'md5:25fc0ec42a72ba602b602c683fa29deb', + 'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35', 'duration': 216, 'timestamp': 1538870400, 'upload_date': '20181007', @@ -37,20 +43,46 @@ def _real_extract(self, url): webpage = self._download_webpage( url, display_id, headers=self.geo_verification_headers()) - SOURCE_RE = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b' + video = self._parse_json(self._search_regex( + r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);', + webpage, 'video object'), display_id) + video_source = video['VideoSource'] video_id = self._search_regex( - SOURCE_RE, webpage, 'video id', group='id') + r'/([\da-z]+)/index\b', video_source, 'video id') - media = self._parse_html5_media_entries( - url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id, - m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0] + formats = self._extract_m3u8_formats( + video_source + '.m3u8', video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + + subtitles = {} + for s in (video.get('Subtitles') or {}): + s_url = s.get('Url') + if not s_url: + continue + subtitles.setdefault(s.get('Language') or 'cz', []).append({ + 'url': s_url, + }) + + entity_counts = video.get('EntityCounts') or {} + + def get_count(k): + v = entity_counts.get(k + 's') or {} + return int_or_none(dict_get(v, ('Count', 'StrCount'))) info = self._search_json_ld(webpage, video_id, default={}) - return merge_dicts(media, info, { + return merge_dicts({ 'id': video_id, 'display_id': display_id, - 'title': self._og_search_title(webpage, default=None) or display_id, - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - }) + 'title': video.get('Title'), + 'description': clean_html(video.get('Description')), + 'thumbnail': video.get('ThumbnailUrl'), + 'formats': formats, + 'subtitles': subtitles, + 'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')), + 'view_count': get_count('View'), + 'like_count': get_count('Like'), + 'dislike_count': get_count('Dislike'), + 'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])), + 'comment_count': get_count('Comment'), + }, info) diff --git a/youtube_dlc/extractor/mgtv.py b/youtube_dlc/extractor/mgtv.py index 71fc3ec56..cab3aa045 100644 --- a/youtube_dlc/extractor/mgtv.py +++ b/youtube_dlc/extractor/mgtv.py @@ -17,9 +17,8 @@ class MGTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html' + _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html' IE_DESC = '芒果TV' - _GEO_COUNTRIES = ['CN'] _TESTS = [{ 'url': 
'http://www.mgtv.com/v/1/290525/f/3116640.html', @@ -34,14 +33,18 @@ class MGTVIE(InfoExtractor): }, { 'url': 'http://www.mgtv.com/b/301817/3826653.html', 'only_matching': True, + }, { + 'url': 'https://w.mgtv.com/b/301817/3826653.html', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + tk2 = base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1] try: api_data = self._download_json( 'https://pcweb.api.mgtv.com/player/video', video_id, query={ - 'tk2': base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1], + 'tk2': tk2, 'video_id': video_id, }, headers=self.geo_verification_headers())['data'] except ExtractorError as e: @@ -56,6 +59,7 @@ def _real_extract(self, url): stream_data = self._download_json( 'https://pcweb.api.mgtv.com/player/getSource', video_id, query={ 'pm2': api_data['atc']['pm2'], + 'tk2': tk2, 'video_id': video_id, }, headers=self.geo_verification_headers())['data'] stream_domain = stream_data['stream_domain'][0] diff --git a/youtube_dlc/extractor/mtv.py b/youtube_dlc/extractor/mtv.py index 04cc95b6a..d31f53137 100644 --- a/youtube_dlc/extractor/mtv.py +++ b/youtube_dlc/extractor/mtv.py @@ -403,6 +403,18 @@ class MTVIE(MTVServicesInfoExtractor): 'only_matching': True, }] + @staticmethod + def extract_child_with_type(parent, t): + children = parent['children'] + return next(c for c in children if c.get('type') == t) + + def _extract_mgid(self, webpage): + data = self._parse_json(self._search_regex( + r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) + main_container = self.extract_child_with_type(data, 'MainContainer') + video_player = self.extract_child_with_type(main_container, 'VideoPlayer') + return video_player['props']['media']['video']['config']['uri'] + class MTVJapanIE(MTVServicesInfoExtractor): IE_NAME = 'mtvjapan' diff --git a/youtube_dlc/extractor/nbc.py b/youtube_dlc/extractor/nbc.py index 6f3cb3003..ea5f5a315 100644 --- a/youtube_dlc/extractor/nbc.py +++ b/youtube_dlc/extractor/nbc.py @@ -10,7 +10,6 @@ from ..compat import compat_urllib_parse_unquote from ..utils import ( int_or_none, - js_to_json, parse_duration, smuggle_url, try_get, @@ -394,8 +393,8 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) data = self._parse_json(self._search_regex( - r'window\.__data\s*=\s*({.+});', webpage, - 'bootstrap json'), video_id, js_to_json) + r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', + webpage, 'bootstrap json'), video_id)['props']['initialState'] video_data = try_get(data, lambda x: x['video']['current'], dict) if not video_data: video_data = data['article']['content'][0]['primaryMedia']['video'] diff --git a/youtube_dlc/extractor/ndr.py b/youtube_dlc/extractor/ndr.py index f3897c71b..81abb3120 100644 --- a/youtube_dlc/extractor/ndr.py +++ b/youtube_dlc/extractor/ndr.py @@ -82,6 +82,29 @@ class NDRIE(NDRBaseIE): 'params': { 'skip_download': True, }, + }, { + # with subtitles + 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html', + 'info_dict': { + 'id': 'extra18674', + 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring', + 'ext': 'mp4', + 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', + 'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6', + 'uploader': 'ndrtv', + 'upload_date': '20201113', + 'duration': 1749, + 'subtitles': { + 'de': [{ + 'ext': 'ttml', + 'url': 
r're:^https://www\.ndr\.de.+', + }], + }, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html', 'only_matching': True, @@ -242,6 +265,20 @@ def _real_extract(self, url): 'preference': quality_key(thumbnail.get('quality')), }) + subtitles = {} + tracks = config.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if not isinstance(track, dict): + continue + track_url = urljoin(url, track.get('src')) + if not track_url: + continue + subtitles.setdefault(track.get('srclang') or 'de', []).append({ + 'url': track_url, + 'ext': 'ttml', + }) + return { 'id': video_id, 'title': title, @@ -251,6 +288,7 @@ def _real_extract(self, url): 'duration': duration, 'thumbnails': thumbnails, 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dlc/extractor/rai.py b/youtube_dlc/extractor/rai.py index 51a310f5c..a0836bf58 100644 --- a/youtube_dlc/extractor/rai.py +++ b/youtube_dlc/extractor/rai.py @@ -17,7 +17,7 @@ int_or_none, parse_duration, strip_or_none, - try_get, + unescapeHTML, unified_strdate, unified_timestamp, update_url_query, @@ -30,7 +30,6 @@ class RaiBaseIE(InfoExtractor): _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' _GEO_COUNTRIES = ['IT'] _GEO_BYPASS = False - _BASE_URL = 'https://www.raiplay.it' def _extract_relinker_info(self, relinker_url, video_id): if not re.match(r'https?://', relinker_url): @@ -123,19 +122,40 @@ def _extract_subtitles(url, subtitle_url): class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'(?P<url>(?P<base>https?://(?:www\.)?raiplay\.it/.+?-)(?P<id>%s)(?P<ext>\.(?:html|json)))' % RaiBaseIE._UUID_RE + _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.(?:html|json))' % RaiBaseIE._UUID_RE _TESTS = [{ + 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', + 'md5': '340aa3b7afb54bfd14a8c11786450d76', + 'info_dict': { + 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', + 'ext': 'mp4', + 'title': 'La Casa Bianca', + 'alt_title': 'S2016 - Puntata del 23/10/2016', + 'description': 'md5:a09d45890850458077d1f68bb036e0a5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai 3', + 'creator': 'Rai 3', + 'duration': 3278, + 'timestamp': 1477764300, + 'upload_date': '20161029', + 'series': 'La Casa Bianca', + 'season': '2016', + }, + 'skip': 'This content is not available', + }, { 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', 'ext': 'mp4', 'title': 'Report del 07/04/2014', - 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014 ', + 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014', 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Rai Gulp', 'duration': 6160, + 'series': 'Report', + 'season': '2013/14', }, 'params': { 'skip_download': True, @@ -146,11 +166,10 @@ class RaiPlayIE(RaiBaseIE): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - url, base, video_id, ext = mobj.group('url', 'base', 'id', 'ext') + url, video_id = re.match(self._VALID_URL, url).groups() media = self._download_json( - '%s%s.json' % (base, video_id), video_id, 'Downloading video JSON') + url.replace('.html', '.json'), video_id, 'Downloading 
video JSON') title = media['name'] video = media['video'] @@ -159,34 +178,38 @@ def _real_extract(self, url): self._sort_formats(relinker_info['formats']) thumbnails = [] - if 'images' in media: - for _, value in media.get('images').items(): - if value: - thumbnails.append({ - 'url': urljoin(RaiBaseIE._BASE_URL, value.replace('[RESOLUTION]', '600x400')) - }) + for _, value in media.get('images', {}).items(): + if value: + thumbnails.append({ + 'url': urljoin(url, value), + }) - timestamp = unified_timestamp(try_get( - media, lambda x: x['availabilities'][0]['start'], compat_str)) + date_published = media.get('date_published') + time_published = media.get('time_published') + if date_published and time_published: + date_published += ' ' + time_published subtitles = self._extract_subtitles(url, video.get('subtitles')) + program_info = media.get('program_info') or {} + season = media.get('season') + info = { 'id': video_id, 'title': self._live_title(title) if relinker_info.get( 'is_live') else title, - 'alt_title': media.get('subtitle'), + 'alt_title': strip_or_none(media.get('subtitle')), 'description': media.get('description'), 'uploader': strip_or_none(media.get('channel')), - 'creator': strip_or_none(media.get('editor')), + 'creator': strip_or_none(media.get('editor') or None), 'duration': parse_duration(video.get('duration')), - 'timestamp': timestamp, + 'timestamp': unified_timestamp(date_published), 'thumbnails': thumbnails, - 'series': try_get( - media, lambda x: x['isPartOf']['name'], compat_str), - 'season_number': int_or_none(try_get( - media, lambda x: x['isPartOf']['numeroStagioni'])), - 'season': media.get('stagione') or None, + 'series': program_info.get('name'), + 'season_number': int_or_none(season), + 'season': season if (season and not season.isdigit()) else None, + 'episode': media.get('episode_title'), + 'episode_number': int_or_none(media.get('episode')), 'subtitles': subtitles, } @@ -203,7 +226,7 @@ class RaiPlayLiveIE(RaiBaseIE): 'display_id': 'rainews24', 'ext': 'mp4', 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497', + 'description': 'md5:6eca31500550f9376819f174e5644754', 'uploader': 'Rai News 24', 'creator': 'Rai News 24', 'is_live': True, @@ -216,32 +239,20 @@ class RaiPlayLiveIE(RaiBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - media = self._download_json( - '%s.json' % urljoin(RaiBaseIE._BASE_URL, 'dirette/' + display_id), - display_id, 'Downloading channel JSON') + webpage = self._download_webpage(url, display_id) - title = media['name'] - video = media['video'] - video_id = media['id'].replace('ContentItem-', '') + video_id = self._search_regex( + r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE, + webpage, 'content id') - relinker_info = self._extract_relinker_info(video['content_url'], video_id) - self._sort_formats(relinker_info['formats']) - - info = { + return { + '_type': 'url_transparent', + 'ie_key': RaiPlayIE.ie_key(), + 'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, 'id': video_id, 'display_id': display_id, - 'title': self._live_title(title) if relinker_info.get( - 'is_live') else title, - 'alt_title': media.get('subtitle'), - 'description': media.get('description'), - 'uploader': strip_or_none(media.get('channel')), - 'creator': strip_or_none(media.get('editor')), - 'duration': parse_duration(video.get('duration')), } - info.update(relinker_info) - return info - class 
RaiPlayPlaylistIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)' @@ -250,7 +261,7 @@ class RaiPlayPlaylistIE(InfoExtractor): 'info_dict': { 'id': 'nondirloalmiocapo', 'title': 'Non dirlo al mio capo', - 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', + 'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86', }, 'playlist_mincount': 12, }] @@ -258,25 +269,21 @@ class RaiPlayPlaylistIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - media = self._download_json( - '%s.json' % urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id), - playlist_id, 'Downloading program JSON') + webpage = self._download_webpage(url, playlist_id) - title = media['name'] - description = media['program_info']['description'] - - content_sets = [s['id'] for b in media['blocks'] for s in b['sets']] + title = self._html_search_meta( + ('programma', 'nomeProgramma'), webpage, 'title') + description = unescapeHTML(self._html_search_meta( + ('description', 'og:description'), webpage, 'description')) entries = [] - for cs in content_sets: - medias = self._download_json( - '%s/%s.json' % (urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id), cs), - cs, 'Downloading content set JSON') - for m in medias['items']: - video_url = urljoin(url, m['path_id']) - entries.append(self.url_result( - video_url, ie=RaiPlayIE.ie_key(), - video_id=RaiPlayIE._match_id(video_url))) + for mobj in re.finditer( + r'<a\b[^>]+\bhref=(["\'])(?P<path>/raiplay/video/.+?)\1', + webpage): + video_url = urljoin(url, mobj.group('path')) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) return self.playlist_result(entries, playlist_id, title, description) @@ -294,7 +301,8 @@ class RaiIE(RaiBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1758, 'upload_date': '20140612', - } + }, + 'skip': 'This content is available only in Italy', }, { # with ContentItem in many metas 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', @@ -320,6 +328,19 @@ class RaiIE(RaiBaseIE): 'duration': 2214, 'upload_date': '20161103', } + }, { + # drawMediaRaiTV(...) + 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', + 'md5': '2dd727e61114e1ee9c47f0da6914e178', + 'info_dict': { + 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', + 'ext': 'mp4', + 'title': 'Il pacco', + 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20141221', + }, + 'skip': 'This content is not available', }, { # initEdizione('ContentItem-...' 
'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', @@ -331,6 +352,18 @@ class RaiIE(RaiBaseIE): 'upload_date': '20170401', }, 'skip': 'Changes daily', + }, { + # HDS live stream with only relinker URL + 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', + 'info_dict': { + 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', + 'ext': 'flv', + 'title': 'EuroNews', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This content is available only in Italy', }, { # HLS live stream with ContentItem in og:url 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', diff --git a/youtube_dlc/extractor/servus.py b/youtube_dlc/extractor/servus.py index 9401bf2cf..1610ddc2c 100644 --- a/youtube_dlc/extractor/servus.py +++ b/youtube_dlc/extractor/servus.py @@ -1,9 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + unified_timestamp, + urlencode_postdata, + url_or_none, +) class ServusIE(InfoExtractor): @@ -12,20 +18,29 @@ class ServusIE(InfoExtractor): (?:www\.)? (?: servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| - servustv\.com/videos + (?:servustv|pm-wissen)\.com/videos ) /(?P<id>[aA]{2}-\w+|\d+-\d+) ''' _TESTS = [{ # new URL schema 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', - 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', + 'md5': '60474d4c21f3eb148838f215c37f02b9', 'info_dict': { 'id': 'AA-1T6VBU5PW1W12', 'ext': 'mp4', 'title': 'Die Grünen aus Sicht des Volkes', + 'alt_title': 'Talk im Hangar-7 Voxpops Gruene', 'description': 'md5:1247204d85783afe3682644398ff2ec4', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 62.442, + 'timestamp': 1605193976, + 'upload_date': '20201112', + 'series': 'Talk im Hangar-7', + 'season': 'Season 9', + 'season_number': 9, + 'episode': 'Episode 31 - September 14', + 'episode_number': 31, } }, { # old URL schema @@ -40,30 +55,94 @@ class ServusIE(InfoExtractor): }, { 'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/', 'only_matching': True, + }, { + 'url': 'https://www.pm-wissen.com/videos/aa-24mus4g2w2112/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url).upper() - webpage = self._download_webpage(url, video_id) - title = self._search_regex( - (r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', - r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'), - webpage, 'title', default=None, - group='title') or self._og_search_title(webpage) - title = re.sub(r'\s*-\s*Servus TV\s*$', '', title) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) + token = self._download_json( + 'https://auth.redbullmediahouse.com/token', video_id, + 'Downloading token', data=urlencode_postdata({ + 'grant_type': 'client_credentials', + }), headers={ + 'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==', + }) + access_token = token['access_token'] + token_type = token.get('token_type', 'Bearer') - formats = self._extract_m3u8_formats( - 'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id, - video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + video = self._download_json( + 'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id, + video_id, 
'Downloading video JSON', headers={ + 'Authorization': '%s %s' % (token_type, access_token), + }) + + formats = [] + thumbnail = None + for resource in video['resources']: + if not isinstance(resource, dict): + continue + format_url = url_or_none(resource.get('url')) + if not format_url: + continue + extension = resource.get('extension') + type_ = resource.get('type') + if extension == 'jpg' or type_ == 'reference_keyframe': + thumbnail = format_url + continue + ext = determine_ext(format_url) + if type_ == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + elif type_ == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif extension == 'mp4' or ext == 'mp4': + formats.append({ + 'url': format_url, + 'format_id': type_, + 'width': int_or_none(resource.get('width')), + 'height': int_or_none(resource.get('height')), + }) self._sort_formats(formats) + attrs = {} + for attribute in video['attributes']: + if not isinstance(attribute, dict): + continue + key = attribute.get('fieldKey') + value = attribute.get('fieldValue') + if not key or not value: + continue + attrs[key] = value + + title = attrs.get('title_stv') or video_id + alt_title = attrs.get('title') + description = attrs.get('long_description') or attrs.get('short_description') + series = attrs.get('label') + season = attrs.get('season') + episode = attrs.get('chapter') + duration = float_or_none(attrs.get('duration'), scale=1000) + season_number = int_or_none(self._search_regex( + r'Season (\d+)', season or '', 'season number', default=None)) + episode_number = int_or_none(self._search_regex( + r'Episode (\d+)', episode or '', 'episode number', default=None)) + return { 'id': video_id, 'title': title, + 'alt_title': alt_title, 'description': description, 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': unified_timestamp(video.get('lastPublished')), + 'series': series, + 'season': season, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, 'formats': formats, } diff --git a/youtube_dlc/extractor/spiegel.py b/youtube_dlc/extractor/spiegel.py index 4df7f4ddc..2da32b9b2 100644 --- a/youtube_dlc/extractor/spiegel.py +++ b/youtube_dlc/extractor/spiegel.py @@ -1,159 +1,54 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .spiegeltv import SpiegeltvIE -from ..compat import compat_urlparse -from ..utils import ( - parse_duration, - strip_or_none, - unified_timestamp, -) +from .jwplatform import JWPlatformIE class SpiegelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', - 'md5': 'b57399839d055fccfeb9a0455c439868', + 'md5': '50c7948883ec85a3e431a0a44b7ad1d6', 'info_dict': { - 'id': '563747', + 'id': 'II0BUyxY', + 'display_id': '1259285', 'ext': 'mp4', - 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', + 'title': 'Vulkan Tungurahua in Ecuador ist wieder aktiv 
- DER SPIEGEL - Wissenschaft', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', - 'duration': 49, + 'duration': 48.0, 'upload_date': '20130311', - 'timestamp': 1362994320, + 'timestamp': 1362997920, }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', - 'md5': '5b6c2f4add9d62912ed5fc78a1faed80', - 'info_dict': { - 'id': '580988', - 'ext': 'mp4', - 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', - 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', - 'duration': 983, - 'upload_date': '20131115', - 'timestamp': 1384546642, - }, - }, { - 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', - 'md5': '97b91083a672d72976faa8433430afb9', - 'info_dict': { - 'id': '601883', - 'ext': 'mp4', - 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', - 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', - 'upload_date': '20140904', - 'timestamp': 1409834160, - } - }, { - 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', 'only_matching': True, }, { - # nexx video + 'url': 'https://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html', + 'only_matching': True, + }, { + 'url': 'https://www.spiegel.de/panorama/urteile-im-goldmuenzenprozess-haftstrafen-fuer-clanmitglieder-a-aae8df48-43c1-4c61-867d-23f0a2d254b7', + 'only_matching': True, + }, { 'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html', 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id - handle = self._request_webpage(metadata_url, video_id) - - # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html - if SpiegeltvIE.suitable(handle.geturl()): - return self.url_result(handle.geturl(), 'Spiegeltv') - - video_data = self._parse_json(self._webpage_read_content( - handle, metadata_url, video_id), video_id) - title = video_data['title'] - nexx_id = video_data['nexxOmniaId'] - domain_id = video_data.get('nexxOmniaDomain') or '748' - - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': 'nexx:%s:%s' % (domain_id, nexx_id), - 'title': title, - 'description': strip_or_none(video_data.get('teaser')), - 'duration': parse_duration(video_data.get('duration')), - 'timestamp': unified_timestamp(video_data.get('datum')), - 'ie_key': NexxIE.ie_key(), - } - - -class SpiegelArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html' - IE_NAME = 'Spiegel:Article' - IE_DESC = 'Articles on spiegel.de' - _TESTS = [{ + }, { 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html', - 'info_dict': { - 'id': '1516455', - 'ext': 'mp4', - 'title': 'Faszination Badminton: Nennt es bloß nicht Federball', - 'description': 're:^Patrick Kämnitz gehört.{100,}', - 'upload_date': '20140825', - }, - }, { - 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', - 'info_dict': { - - }, 
- 'playlist_count': 6, - }, { - # Nexx iFrame embed - 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html', - 'info_dict': { - 'id': '161464', - 'ext': 'mp4', - 'title': 'Nervenkitzel Achterbahn', - 'alt_title': 'Karussellbauer in Deutschland', - 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', - 'release_year': 2005, - 'creator': 'SPIEGEL TV', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2761, - 'timestamp': 1394021479, - 'upload_date': '20140305', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - - # Single video on top of the page - video_link = self._search_regex( - r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage, - 'video page URL', default=None) - if video_link: - video_url = compat_urlparse.urljoin( - self.http_scheme() + '//spiegel.de/', video_link) - return self.url_result(video_url) - - # Multiple embedded videos - embeds = re.findall( - r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"', - webpage) - entries = [ - self.url_result(compat_urlparse.urljoin( - self.http_scheme() + '//spiegel.de/', embed_path)) - for embed_path in embeds] - if embeds: - return self.playlist_result(entries) - - return self.playlist_from_matches( - NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key()) + media_id = self._html_search_regex( + r'("|["\'])mediaId\1\s*:\s*("|["\'])(?P<id>(?:(?!\2).)+)\2', + webpage, 'media id', group='id') + return { + '_type': 'url_transparent', + 'id': video_id, + 'display_id': video_id, + 'url': 'jwplatform:%s' % media_id, + 'title': self._og_search_title(webpage, default=None), + 'ie_key': JWPlatformIE.ie_key(), + } diff --git a/youtube_dlc/extractor/twentythreevideo.py b/youtube_dlc/extractor/twentythreevideo.py index aa0c6e90f..dc5609192 100644 --- a/youtube_dlc/extractor/twentythreevideo.py +++ b/youtube_dlc/extractor/twentythreevideo.py @@ -8,8 +8,8 @@ class TwentyThreeVideoIE(InfoExtractor): IE_NAME = '23video' - _VALID_URL = r'https?://video\.(?P<domain>twentythree\.net|23video\.com|filmweb\.no)/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)' - _TEST = { + _VALID_URL = r'https?://(?P<domain>[^.]+\.(?:twentythree\.net|23video\.com|filmweb\.no))/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)' + _TESTS = [{ 'url': 'https://video.twentythree.net/v.ihtml/player.html?showDescriptions=0&source=site&photo%5fid=20448876&autoPlay=1', 'md5': '75fcf216303eb1dae9920d651f85ced4', 'info_dict': { @@ -21,11 +21,14 @@ class TwentyThreeVideoIE(InfoExtractor): 'uploader_id': '12258964', 'uploader': 'Rasmus Bysted', } - } + }, { + 'url': 'https://bonnier-publications-danmark.23video.com/v.ihtml/player.html?token=f0dc46476e06e13afd5a1f84a29e31e8&source=embed&photo%5fid=36137620', + 'only_matching': True, + }] def _real_extract(self, url): domain, query, photo_id = re.match(self._VALID_URL, url).groups() - base_url = 'https://video.%s' % domain + base_url = 'https://%s' % domain photo_data = self._download_json( base_url + '/api/photo/list?' 
+ query, photo_id, query={ 'format': 'json', diff --git a/youtube_dlc/extractor/urplay.py b/youtube_dlc/extractor/urplay.py index 4bc2b78fb..2c41f78bd 100644 --- a/youtube_dlc/extractor/urplay.py +++ b/youtube_dlc/extractor/urplay.py @@ -2,8 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_timestamp -import re +from ..utils import ( + dict_get, + int_or_none, + unified_timestamp, +) class URPlayIE(InfoExtractor): @@ -14,7 +17,7 @@ class URPlayIE(InfoExtractor): 'info_dict': { 'id': '203704', 'ext': 'mp4', - 'title': 'Om vetenskap, kritiskt tänkande och motstånd', + 'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd', 'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a', 'timestamp': 1513292400, 'upload_date': '20171214', @@ -26,7 +29,7 @@ class URPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'Tripp, Trapp, Träd : Sovkudde', 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', - 'timestamp': 1440093600, + 'timestamp': 1440086400, 'upload_date': '20150820', }, }, { @@ -36,28 +39,27 @@ class URPlayIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - + url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) - urplayer_data = re.sub(""", "\"", self._search_regex( - r'components\/Player\/Player\" data-react-props=\"({.+?})\"', - webpage, 'urplayer data')) - urplayer_data = self._parse_json(urplayer_data, video_id) - for i in range(len(urplayer_data['accessibleEpisodes'])): - if urplayer_data.get('accessibleEpisodes', {})[i].get('id') == int(video_id): - urplayer_data = urplayer_data['accessibleEpisodes'][i] - break + urplayer_data = self._parse_json(self._html_search_regex( + r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['currentProduct'] + episode = urplayer_data['title'] host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] formats = [] - urplayer_streams = urplayer_data.get("streamingInfo") - for quality in ('sd'), ('hd'): - location = (urplayer_streams.get("raw", {}).get(quality, {}).get("location") - or urplayer_streams.get("sweComplete", {}).get(quality, {}).get("location")) - if location: + urplayer_streams = urplayer_data.get('streamingInfo', {}) + + for k, v in urplayer_streams.get('raw', {}).items(): + if not (k in ('sd', 'hd') and isinstance(v, dict)): + continue + file_http = v.get('location') + if file_http: formats.extend(self._extract_wowza_formats( - 'http://%s/%s/playlist.m3u8' % (host, location), video_id, - skip_protocols=['f4m', 'rtmp', 'rtsp'])) + 'http://%s/%splaylist.m3u8' % (host, file_http), + video_id, skip_protocols=['f4m', 'rtmp', 'rtsp'])) self._sort_formats(formats) + subtitles = {} subs = urplayer_streams.get("sweComplete", {}).get("tt", {}).get("location") if subs: @@ -65,14 +67,37 @@ def _real_extract(self, url): 'url': subs, }) + image = urplayer_data.get('image') or {} + thumbnails = [] + for k, v in image.items(): + t = { + 'id': k, + 'url': v, + } + wh = k.split('x') + if len(wh) == 2: + t.update({ + 'width': int_or_none(wh[0]), + 'height': int_or_none(wh[1]), + }) + thumbnails.append(t) + + series = urplayer_data.get('series') or {} + series_title = dict_get(series, ('seriesTitle', 'title')) or dict_get(urplayer_data, ('seriesTitle', 'mainTitle')) + return { 'id': video_id, - 'title': urplayer_data['title'], - 'description': 
self._og_search_description(webpage), - 'thumbnail': urplayer_data.get('image', {}).get('1280x720'), - 'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'), - webpage, 'timestamp')), - 'series': urplayer_data.get('seriesTitle'), 'subtitles': subtitles, + 'title': '%s : %s' % (series_title, episode) if series_title else episode, + 'description': urplayer_data.get('description'), + 'thumbnails': thumbnails, + 'timestamp': unified_timestamp(urplayer_data.get('publishedAt')), + 'series': series_title, 'formats': formats, + 'duration': int_or_none(urplayer_data.get('duration')), + 'categories': urplayer_data.get('categories'), + 'tags': urplayer_data.get('keywords'), + 'season': series.get('label'), + 'episode': episode, + 'episode_number': int_or_none(urplayer_data.get('episodeNumber')), } diff --git a/youtube_dlc/extractor/usanetwork.py b/youtube_dlc/extractor/usanetwork.py index 54c7495cc..d953e460b 100644 --- a/youtube_dlc/extractor/usanetwork.py +++ b/youtube_dlc/extractor/usanetwork.py @@ -1,74 +1,24 @@ # coding: utf-8 from __future__ import unicode_literals -from .adobepass import AdobePassIE -from ..utils import ( - NO_DEFAULT, - smuggle_url, - update_url_query, -) +from .nbc import NBCIE -class USANetworkIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity', - 'md5': '33c0d2ba381571b414024440d08d57fd', +class USANetworkIE(NBCIE): + _VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/(?:[^/]+/videos?|movies?)/(?:[^/]+/)?(?P<id>\d+))' + _TESTS = [{ + 'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302', 'info_dict': { - 'id': '3086229', + 'id': '4185302', 'ext': 'mp4', - 'title': 'HPE Cybersecurity', - 'description': 'The more we digitize our world, the more vulnerable we are.', - 'upload_date': '20160818', - 'timestamp': 1471535460, - 'uploader': 'NBCU-USA', + 'title': 'Intelligence (Trailer)', + 'description': 'A maverick NSA agent enlists the help of a junior systems analyst in a workplace power grab.', + 'upload_date': '20200715', + 'timestamp': 1594785600, + 'uploader': 'NBCU-MPAT', }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - def _x(name, default=NO_DEFAULT): - return self._search_regex( - r'data-%s\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % name, - webpage, name, default=default, group='value') - - video_id = _x('mpx-guid') - title = _x('episode-title') - mpx_account_id = _x('mpx-account-id', '2304992029') - - query = { - 'mbr': 'true', - } - if _x('is-full-episode', None) == '1': - query['manifest'] = 'm3u' - - if _x('is-entitlement', None) == '1': - adobe_pass = {} - drupal_settings = self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings', fatal=False) - if drupal_settings: - drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False) - if drupal_settings: - adobe_pass = drupal_settings.get('adobePass', {}) - resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId', 'usa'), - title, video_id, _x('episode-rating', 'TV-14')) - query['auth'] = self._extract_mvpd_auth( - url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource) - - info = self._search_json_ld(webpage, video_id, default={}) - info.update({ - '_type': 'url_transparent', - 'url': smuggle_url(update_url_query( - 
'http://link.theplatform.com/s/HNK2IC/media/guid/%s/%s' % (mpx_account_id, video_id), - query), {'force_smil_url': True}), - 'id': video_id, - 'title': title, - 'series': _x('show-title', None), - 'episode': title, - 'ie_key': 'ThePlatform', - }) - return info + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] diff --git a/youtube_dlc/extractor/ustream.py b/youtube_dlc/extractor/ustream.py index 582090d0d..9e860aeb7 100644 --- a/youtube_dlc/extractor/ustream.py +++ b/youtube_dlc/extractor/ustream.py @@ -19,7 +19,7 @@ class UstreamIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)' IE_NAME = 'ustream' _TESTS = [{ 'url': 'http://www.ustream.tv/recorded/20274954', @@ -67,12 +67,15 @@ class UstreamIE(InfoExtractor): 'params': { 'skip_download': True, # m3u8 download }, + }, { + 'url': 'https://video.ibm.com/embed/recorded/128240221?&autoplay=true&controls=true&volume=100', + 'only_matching': True, }] @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage) + r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage) if mobj is not None: return mobj.group('url') diff --git a/youtube_dlc/extractor/vimeo.py b/youtube_dlc/extractor/vimeo.py index 9839657ca..a0662a369 100644 --- a/youtube_dlc/extractor/vimeo.py +++ b/youtube_dlc/extractor/vimeo.py @@ -946,10 +946,13 @@ def _fetch_page(self, album_id, authorizaion, hashed_pass, page): def _real_extract(self, url): album_id = self._match_id(url) - webpage = self._download_webpage(url, album_id) - viewer = self._parse_json(self._search_regex( - r'bootstrap_data\s*=\s*({.+?})</script>', - webpage, 'bootstrap data'), album_id)['viewer'] + viewer = self._download_json( + 'https://vimeo.com/_rv/viewer', album_id, fatal=False) + if not viewer: + webpage = self._download_webpage(url, album_id) + viewer = self._parse_json(self._search_regex( + r'bootstrap_data\s*=\s*({.+?})</script>', + webpage, 'bootstrap data'), album_id)['viewer'] jwt = viewer['jwt'] album = self._download_json( 'https://api.vimeo.com/albums/' + album_id, diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index 935560b57..577e33f13 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -4,52 +4,48 @@ import re import time import itertools +import json from .common import InfoExtractor from .naver import NaverBaseIE -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( ExtractorError, + int_or_none, merge_dicts, try_get, urlencode_postdata, ) -class VLiveIE(NaverBaseIE): +class VLiveBaseIE(NaverBaseIE): + _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + + +class VLiveIE(VLiveBaseIE): IE_NAME = 'vlive' - _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|post)/(?P<id>(?:\d-)?[0-9]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)' _NETRC_MACHINE = 'vlive' _TESTS = [{ - 'url': 'https://www.vlive.tv/video/1326', + 'url': 'http://www.vlive.tv/video/1326', 'md5': 'cc7314812855ce56de70a06a27314983', 'info_dict': { 'id': '1326', 'ext': 'mp4', - 'title': "[V LIVE] Girl's Day's Broadcast", + 'title': "Girl's Day's Broadcast", 'creator': "Girl's Day", 'view_count': int, 'uploader_id': 'muploader_a', }, - }, - { 
- 'url': 'https://vlive.tv/post/1-18244258', - 'md5': 'cc7314812855ce56de70a06a27314983', - 'info_dict': { - 'id': '1326', - 'ext': 'mp4', - 'title': "[V LIVE] Girl's Day's Broadcast", - 'creator': "Girl's Day", - 'view_count': int, - 'uploader_id': 'muploader_a', - }, - }, - { - 'url': 'https://www.vlive.tv/video/16937', + }, { + 'url': 'http://www.vlive.tv/video/16937', 'info_dict': { 'id': '16937', 'ext': 'mp4', - 'title': '[V LIVE] 첸백시 걍방', + 'title': '첸백시 걍방', 'creator': 'EXO', 'view_count': int, 'subtitles': 'mincount:12', @@ -70,12 +66,11 @@ class VLiveIE(NaverBaseIE): 'subtitles': 'mincount:10', }, 'skip': 'This video is only available for CH+ subscribers', + }, { + 'url': 'https://www.vlive.tv/embed/1326', + 'only_matching': True, }] - @classmethod - def suitable(cls, url): - return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url) - def _real_initialize(self): self._login() @@ -107,118 +102,82 @@ def is_logged_in(): if not is_logged_in(): raise ExtractorError('Unable to log in', expected=True) + def _call_api(self, path_template, video_id, fields=None): + query = {'appId': self._APP_ID} + if fields: + query['fields'] = fields + return self._download_json( + 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id, + 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0], + headers={'Referer': 'https://www.vlive.tv/'}, query=query) + def _real_extract(self, url): - # url may match on a post or a video url with a post_id potentially matching a video_id - working_id = self._match_id(url) - webpage = self._download_webpage(url, working_id) + video_id = self._match_id(url) - PARAMS_RE = r'window\.__PRELOADED_STATE__\s*=\s*({.*});?\s*</script>' - PARAMS_FIELD = 'params' + try: + post = self._call_api( + 'post/v1.0/officialVideoPost-%s', video_id, + 'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId}') + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_login_required(json.loads(e.cause.read().decode())['message']) + raise - params = self._search_regex( - PARAMS_RE, webpage, PARAMS_FIELD, default='', flags=re.DOTALL) - params = self._parse_json(params, working_id, fatal=False) + video = post['officialVideo'] - video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"], dict) + def get_common_fields(): + channel = post.get('channel') or {} + return { + 'title': video.get('title'), + 'creator': post.get('author', {}).get('nickname'), + 'channel': channel.get('channelName'), + 'channel_id': channel.get('channelCode'), + 'duration': int_or_none(video.get('playTime')), + 'view_count': int_or_none(video.get('playCount')), + 'like_count': int_or_none(video.get('likeCount')), + 'comment_count': int_or_none(video.get('commentCount')), + } - if video_params is None: - error = try_get(params, lambda x: x["postDetail"]["error"], dict) - error_data = try_get(error, lambda x: x["data"], dict) - error_video = try_get(error_data, lambda x: x["officialVideo"], dict) - error_msg = try_get(error, lambda x: x["message"], compat_str) - product_type = try_get(error_data, - [lambda x: x["officialVideo"]["productType"], - lambda x: x["board"]["boardType"]], - compat_str) - - if error_video is not None: - if product_type in ('VLIVE_PLUS', 'VLIVE+'): - self.raise_login_required('This video is only available with V LIVE+.') - elif error_msg is not None: - raise 
ExtractorError('V LIVE reported the following error: %s' % error_msg) - else: - raise ExtractorError('Failed to extract video parameters.') - elif 'post' in url: - raise ExtractorError('Url does not appear to be a video post.', expected=True) - else: - raise ExtractorError('Failed to extract video parameters.') - - video_id = working_id if 'video' in url else str(video_params["videoSeq"]) - - video_type = video_params["type"] - if video_type in ('VOD'): - encoding_status = video_params["encodingStatus"] - if encoding_status == 'COMPLETE': - return self._replay(video_id, webpage, params, video_params) - else: - raise ExtractorError('VOD encoding not yet complete. Please try again later.', - expected=True) - elif video_type in ('LIVE'): - video_status = video_params["status"] - if video_status in ('RESERVED'): + video_type = video.get('type') + if video_type == 'VOD': + inkey = self._call_api('video/v1.0/vod/%s/inkey', video_id)['inkey'] + vod_id = video['vodId'] + return merge_dicts( + get_common_fields(), + self._extract_video_info(video_id, vod_id, inkey)) + elif video_type == 'LIVE': + status = video.get('status') + if status == 'ON_AIR': + stream_url = self._call_api( + 'old/v3/live/%s/playInfo', + video_id)['result']['adaptiveStreamUrl'] + formats = self._extract_m3u8_formats(stream_url, video_id, 'mp4') + info = get_common_fields() + info.update({ + 'title': self._live_title(video['title']), + 'id': video_id, + 'formats': formats, + 'is_live': True, + }) + return info + elif status == 'ENDED': + raise ExtractorError( + 'Uploading for replay. Please wait...', expected=True) + elif status == 'RESERVED': raise ExtractorError('Coming soon!', expected=True) - elif video_status in ('ENDED', 'END'): - raise ExtractorError('Uploading for replay. Please wait...', expected=True) + elif video.get('exposeStatus') == 'CANCEL': + raise ExtractorError( + 'We are sorry, but the live broadcast has been canceled.', + expected=True) else: - return self._live(video_id, webpage, params) - else: - raise ExtractorError('Unknown video type %s' % video_type) - - def _get_common_fields(self, webpage, params): - title = self._og_search_title(webpage) - description = self._html_search_meta( - ['og:description', 'description', 'twitter:description'], - webpage, 'description', default=None) - creator = (try_get(params, lambda x: x["channel"]["channel"]["channelName"], compat_str) - or self._search_regex(r'on (.*) channel', description or '', 'creator', fatal=False)) - thumbnail = self._og_search_thumbnail(webpage) - return { - 'title': title, - 'creator': creator, - 'thumbnail': thumbnail, - } - - def _live(self, video_id, webpage, params): - LIVE_INFO_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/old/v3/live/%s/playInfo' % video_id - play_info = self._download_json(LIVE_INFO_ENDPOINT, video_id, - headers={"referer": "https://www.vlive.tv"}) - - streams = try_get(play_info, lambda x: x["result"]["streamList"], list) or [] - - formats = [] - for stream in streams: - formats.extend(self._extract_m3u8_formats( - stream['serviceUrl'], video_id, 'mp4', - fatal=False, live=True)) - self._sort_formats(formats) - - info = self._get_common_fields(webpage, params) - info.update({ - 'title': self._live_title(info['title']), - 'id': video_id, - 'formats': formats, - 'is_live': True, - }) - return info - - def _replay(self, video_id, webpage, params, video_params): - long_video_id = video_params["vodId"] - - VOD_KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id - key_json = 
self._download_json(VOD_KEY_ENDPOINT, video_id, - headers={"referer": "https://www.vlive.tv"}) - key = key_json["inkey"] - - return merge_dicts( - self._get_common_fields(webpage, params), - self._extract_video_info(video_id, long_video_id, key)) + raise ExtractorError('Unknown status ' + status) -class VLiveChannelIE(InfoExtractor): +class VLiveChannelIE(VLiveBaseIE): IE_NAME = 'vlive:channel' - _VALID_URL = r'https?://(?:(?:www|m)\.)?(?:channels\.vlive\.tv/|vlive\.tv/channels?/)(?P<id>[0-9A-Z]+)' + _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)' _TESTS = [{ - 'url': 'https://channels.vlive.tv/FCD4B', + 'url': 'http://channels.vlive.tv/FCD4B', 'info_dict': { 'id': 'FCD4B', 'title': 'MAMAMOO', @@ -226,63 +185,39 @@ class VLiveChannelIE(InfoExtractor): 'playlist_mincount': 110 }, { 'url': 'https://www.vlive.tv/channel/FCD4B', - 'info_dict': { - 'id': 'FCD4B', - 'title': 'MAMAMOO', - }, - 'playlist_mincount': 110 + 'only_matching': True, }] - _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + + def _call_api(self, path, channel_key_suffix, channel_value, note, query): + q = { + 'app_id': self._APP_ID, + 'channel' + channel_key_suffix: channel_value, + } + q.update(query) + return self._download_json( + 'http://api.vfan.vlive.tv/vproxy/channelplus/' + path, + channel_value, note='Downloading ' + note, query=q)['result'] def _real_extract(self, url): channel_code = self._match_id(url) - webpage = self._download_webpage( - 'http://channels.vlive.tv/%s/video' % channel_code, channel_code) + channel_seq = self._call_api( + 'decodeChannelCode', 'Code', channel_code, + 'decode channel code', {})['channelSeq'] - app_id = None - - app_js_url = self._search_regex( - r'<script[^>]+src=(["\'])(?P<url>http.+?/app\.js.*?)\1', - webpage, 'app js', default=None, group='url') - - if app_js_url: - app_js = self._download_webpage( - app_js_url, channel_code, 'Downloading app JS', fatal=False) - if app_js: - app_id = self._search_regex( - r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]', - app_js, 'app id', default=None) - - app_id = app_id or self._APP_ID - - channel_info = self._download_json( - 'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode', - channel_code, note='Downloading decode channel code', - query={ - 'app_id': app_id, - 'channelCode': channel_code, - '_': int(time.time()) - }) - - channel_seq = channel_info['result']['channelSeq'] channel_name = None entries = [] for page_num in itertools.count(1): - video_list = self._download_json( - 'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList', - channel_code, note='Downloading channel list page #%d' % page_num, - query={ - 'app_id': app_id, - 'channelSeq': channel_seq, + video_list = self._call_api( + 'getChannelVideoList', 'Seq', channel_seq, + 'channel list page #%d' % page_num, { # Large values of maxNumOfRows (~300 or above) may cause # empty responses (see [1]), e.g. this happens for [2] that # has more than 300 videos. # 1. https://github.com/ytdl-org/youtube-dl/issues/13830 # 2. http://channels.vlive.tv/EDBF. 
'maxNumOfRows': 100, - '_': int(time.time()), 'pageNo': page_num } ) @@ -290,11 +225,11 @@ def _real_extract(self, url): if not channel_name: channel_name = try_get( video_list, - lambda x: x['result']['channelInfo']['channelName'], + lambda x: x['channelInfo']['channelName'], compat_str) videos = try_get( - video_list, lambda x: x['result']['videoList'], list) + video_list, lambda x: x['videoList'], list) if not videos: break @@ -312,7 +247,9 @@ def _real_extract(self, url): entries, channel_code, channel_name) -class VLivePlaylistIE(InfoExtractor): +# old extractor. Rewrite? + +class VLivePlaylistIE(VLiveBaseIE): IE_NAME = 'vlive:playlist' _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)' _VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s' diff --git a/youtube_dlc/extractor/xtube.py b/youtube_dlc/extractor/xtube.py index 081c5e2e7..98d2adb99 100644 --- a/youtube_dlc/extractor/xtube.py +++ b/youtube_dlc/extractor/xtube.py @@ -5,7 +5,6 @@ from .common import InfoExtractor from ..utils import ( - ExtractorError, int_or_none, js_to_json, orderedSet, @@ -34,7 +33,7 @@ class XTubeIE(InfoExtractor): 'title': 'strange erotica', 'description': 'contains:an ET kind of thing', 'uploader': 'greenshowers', - 'duration': 449, + 'duration': 450, 'view_count': int, 'comment_count': int, 'age_limit': 18, @@ -74,24 +73,16 @@ def _real_extract(self, url): title, thumbnail, duration = [None] * 3 - json_config_string = self._search_regex( - r'playerConf=({.+?}),loaderConf', - webpage, 'config', default=None) - if not json_config_string: - raise ExtractorError("Could not extract video player data") - - json_config_string = json_config_string.replace("!0", "true").replace("!1", "false") - - config = self._parse_json(json_config_string, video_id, transform_source=js_to_json, fatal=False) - if not config: - raise ExtractorError("Could not extract video player data") - - config = config.get('mainRoll') - if isinstance(config, dict): - title = config.get('title') - thumbnail = config.get('poster') - duration = int_or_none(config.get('duration')) - sources = config.get('sources') or config.get('format') + config = self._parse_json(self._search_regex( + r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config', + default='{}'), video_id, transform_source=js_to_json, fatal=False) + if config: + config = config.get('mainRoll') + if isinstance(config, dict): + title = config.get('title') + thumbnail = config.get('poster') + duration = int_or_none(config.get('duration')) + sources = config.get('sources') or config.get('format') if not isinstance(sources, dict): sources = self._parse_json(self._search_regex( diff --git a/youtube_dlc/extractor/youporn.py b/youtube_dlc/extractor/youporn.py index e7fca22de..7b9feafeb 100644 --- a/youtube_dlc/extractor/youporn.py +++ b/youtube_dlc/extractor/youporn.py @@ -29,7 +29,6 @@ class YouPornIE(InfoExtractor): 'upload_date': '20101217', 'average_rating': int, 'view_count': int, - 'comment_count': int, 'categories': list, 'tags': list, 'age_limit': 18, @@ -48,7 +47,6 @@ class YouPornIE(InfoExtractor): 'upload_date': '20110418', 'average_rating': int, 'view_count': int, - 'comment_count': int, 'categories': list, 'tags': list, 'age_limit': 18, @@ -156,7 +154,8 @@ def _real_extract(self, url): r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._html_search_regex( - [r'Date\s+[Aa]dded:\s*<span>([^<]+)', + [r'UPLOADED:\s*<span>([^<]+)', + 
r'Date\s+[Aa]dded:\s*<span>([^<]+)', r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'], webpage, 'upload date', fatal=False)) @@ -171,7 +170,7 @@ def _real_extract(self, url): webpage, 'view count', fatal=False, group='count')) comment_count = str_to_int(self._search_regex( r'>All [Cc]omments? \(([\d,.]+)\)', - webpage, 'comment count', fatal=False)) + webpage, 'comment count', default=None)) def extract_tag_box(regex, title): tag_box = self._search_regex(regex, webpage, title, default=None) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 97cc793f9..bbd9b2c4c 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -16,7 +16,6 @@ from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, - compat_HTTPError, compat_kwargs, compat_parse_qs, compat_urllib_parse_unquote, @@ -30,13 +29,10 @@ bool_or_none, clean_html, error_to_compat_str, - extract_attributes, ExtractorError, float_or_none, - get_element_by_attribute, get_element_by_id, int_or_none, - js_to_json, mimetype2ext, orderedSet, parse_codecs, @@ -51,9 +47,11 @@ unescapeHTML, unified_strdate, unsmuggle_url, + update_url_query, uppercase_escape, url_or_none, urlencode_postdata, + urljoin, ) @@ -70,9 +68,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}' - _INITIAL_DATA_RE = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' - _YTCFG_DATA_RE = r"ytcfg.set\(({.*?})\)" + _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' _YOUTUBE_CLIENT_HEADERS = { 'x-youtube-client-name': '1', @@ -297,147 +293,34 @@ def _real_initialize(self): if not self._login(): return + _DEFAULT_API_DATA = { + 'context': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20201021.03.00', + } + }, + } -class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): + def _call_api(self, ep, query, video_id): + data = self._DEFAULT_API_DATA.copy() + data.update(query) - def _find_entries_in_json(self, extracted): - entries = [] - c = {} + response = self._download_json( + 'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id, + note='Downloading API JSON', errnote='Unable to download API page', + data=json.dumps(data).encode('utf8'), + headers={'content-type': 'application/json'}, + query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'}) - def _real_find(obj): - if obj is None or isinstance(obj, str): - return + return response - if type(obj) is list: - for elem in obj: - _real_find(elem) - - if type(obj) is dict: - if self._is_entry(obj): - entries.append(obj) - return - - if 'continuationCommand' in obj: - c['continuation'] = obj - return - - for _, o in obj.items(): - _real_find(o) - - _real_find(extracted) - - return entries, try_get(c, lambda x: x["continuation"]) - - def _entries(self, page, playlist_id, max_pages=None): - seen = [] - - yt_conf = {} - for m in re.finditer(self._YTCFG_DATA_RE, page): - parsed = self._parse_json(m.group(1), playlist_id, - transform_source=js_to_json, fatal=False) - if parsed: - yt_conf.update(parsed) - - data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None) - - for page_num in range(1, max_pages + 1) if max_pages is not None else itertools.count(1): - entries, continuation = self._find_entries_in_json(data_json) - processed = 
self._process_entries(entries, seen) - - if not processed: - break - for entry in processed: - yield entry - - if not continuation or not yt_conf: - break - continuation_token = try_get(continuation, lambda x: x['continuationCommand']['token']) - continuation_url = try_get(continuation, lambda x: x['commandMetadata']['webCommandMetadata']['apiUrl']) - if not continuation_token or not continuation_url: - break - - count = 0 - retries = 3 - while count <= retries: - try: - # Downloading page may result in intermittent 5xx HTTP error - # that is usually worked around with a retry - data_json = self._download_json( - 'https://www.youtube.com%s' % continuation_url, - playlist_id, - 'Downloading continuation page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), - - transform_source=uppercase_escape, - query={ - 'key': try_get(yt_conf, lambda x: x['INNERTUBE_API_KEY']) - }, - data=str(json.dumps({ - 'context': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT']), - 'continuation': continuation_token - })).encode(encoding='UTF-8', errors='strict'), - headers={ - 'Content-Type': 'application/json' - } - ) - break - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): - count += 1 - if count <= retries: - continue - raise - - def _extract_title(self, renderer): - title = try_get(renderer, lambda x: x['title']['runs'][0]['text'], compat_str) - if title: - return title - return try_get(renderer, lambda x: x['title']['simpleText'], compat_str) - - -class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _is_entry(self, obj): - return 'videoId' in obj - - def _process_entries(self, entries, seen): - ids_in_page = [] - titles_in_page = [] - for renderer in entries: - video_id = try_get(renderer, lambda x: x['videoId']) - video_title = self._extract_title(renderer) - - if video_id is None or video_title is None: - # we do not have a videoRenderer or title extraction broke - continue - - video_title = video_title.strip() - - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - for video_id, video_title in zip(ids_in_page, titles_in_page): - yield self.url_result(video_id, 'Youtube', video_id, video_title) - - -class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _is_entry(self, obj): - return 'playlistId' in obj - - def _process_entries(self, entries, seen): - for playlist_id in orderedSet(try_get(r, lambda x: x['playlistId']) for r in entries): - - yield self.url_result( - 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - title = self._og_search_title(webpage, fatal=False) - return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title) + def _extract_yt_initial_data(self, video_id, webpage): + return self._parse_json( + self._search_regex( + r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;', + webpage, 'yt initial data'), + video_id) class YoutubeIE(YoutubeBaseInfoExtractor): @@ -498,7 +381,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) )? # all until now is optional -> you can pass the naked ID - ([0-9A-Za-z_-]{11}) # here is it! 
the YouTube video ID + (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID (?!.*?\blist= (?: %(playlist_id)s| # combined list/video URLs are handled by the playlist IE @@ -662,7 +545,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } }, { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ', 'note': 'Use the first video ID in the URL', 'info_dict': { 'id': 'BaW_jenozKc', @@ -703,6 +586,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, 'skip': 'format 141 not served anymore', }, + # DASH manifest with encrypted signature + { + 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA', + 'info_dict': { + 'id': 'IB3lcPjvWLA', + 'ext': 'm4a', + 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', + 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', + 'duration': 244, + 'uploader': 'AfrojackVEVO', + 'uploader_id': 'AfrojackVEVO', + 'upload_date': '20131011', + }, + 'params': { + 'youtube_include_dash_manifest': True, + 'format': '141/bestaudio[ext=m4a]', + }, + }, # Controversy video { 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', @@ -734,6 +635,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'age_limit': 18, }, }, + # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) + # YouTube Red ad is not captured for creator + { + 'url': '__2ABJjxzNo', + 'info_dict': { + 'id': '__2ABJjxzNo', + 'ext': 'mp4', + 'duration': 266, + 'upload_date': '20100430', + 'uploader_id': 'deadmau5', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', + 'creator': 'Dada Life, deadmau5', + 'description': 'md5:12c56784b8032162bb936a5f76d55360', + 'uploader': 'deadmau5', + 'title': 'Deadmau5 - Some Chords (HD)', + 'alt_title': 'This Machine Kills Some Chords', + }, + 'expected_warnings': [ + 'DASH manifest missing', + ] + }, # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) { 'url': 'lqQg6PlCWgI', @@ -1072,10 +994,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'sJL6WA-aGkQ', 'only_matching': True, }, - { - 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', - 'only_matching': True, - }, { 'url': 'https://invidio.us/watch?v=BaW_jenozKc', 'only_matching': True, @@ -1127,73 +1045,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, - { - # Youtube Music Auto-generated description - # Retrieve 'artist' field from 'Artist:' in video description - # when it is present on youtube music video - 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY', - 'info_dict': { - 'id': 'k0jLE7tTwjY', - 'ext': 'mp4', - 'title': 'Latch Feat. Sam Smith', - 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335', - 'upload_date': '20150110', - 'uploader': 'Various Artists - Topic', - 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w', - 'artist': 'Disclosure', - 'track': 'Latch Feat. 
Sam Smith', - 'album': 'Latch Featuring Sam Smith', - 'release_date': '20121008', - 'release_year': 2012, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # handle multiple artists on youtube music video - 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA', - 'info_dict': { - 'id': '74qn0eJSjpA', - 'ext': 'mp4', - 'title': 'Eastside', - 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2', - 'upload_date': '20180710', - 'uploader': 'Benny Blanco - Topic', - 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A', - 'artist': 'benny blanco, Halsey, Khalid', - 'track': 'Eastside', - 'album': 'Eastside', - 'release_date': '20180713', - 'release_year': 2018, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # handle youtube music video with release_year and no release_date - 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M', - 'info_dict': { - 'id': '-hcAI0g-f5M', - 'ext': 'mp4', - 'title': 'Put It On Me', - 'description': 'md5:f6422397c07c4c907c6638e1fee380a5', - 'upload_date': '20180426', - 'uploader': 'Matt Maeson - Topic', - 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', - 'artist': 'Matt Maeson', - 'track': 'Put It On Me', - 'album': 'The Hearse', - 'release_date': None, - 'release_year': 2018, - }, - 'params': { - 'skip_download': True, - }, - }, { 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q', 'only_matching': True, @@ -1455,7 +1306,7 @@ def _get_ytplayer_config(self, video_id, webpage): # https://github.com/ytdl-org/youtube-dl/pull/7599) r';ytplayer\.config\s*=\s*({.+?});ytplayer', r';ytplayer\.config\s*=\s*({.+?});', - r'ytInitialPlayerResponse\s*=\s*({.+?});var meta' + r'ytInitialPlayerResponse\s*=\s*({.+?});var meta' # Needed??? ) config = self._search_regex( patterns, webpage, 'ytplayer.config', default=None) @@ -1511,11 +1362,10 @@ def _get_automatic_captions(self, video_id, webpage): self._downloader.report_warning(err_msg) return {} try: - if "args" in player_config and "ttsurl" in player_config["args"]: - args = player_config['args'] - caption_url = args['ttsurl'] + args = player_config['args'] + caption_url = args.get('ttsurl') + if caption_url: timestamp = args['timestamp'] - # We get the available subtitles list_params = compat_urllib_parse_urlencode({ 'type': 'list', @@ -1571,24 +1421,13 @@ def make_captions(sub_url, sub_langs): return captions # New captions format as of 22.06.2017 - if "args" in player_config: - player_response = player_config["args"].get('player_response') - else: - # New player system (ytInitialPlayerResponse) as of October 2020 - player_response = player_config - - if player_response: - if isinstance(player_response, compat_str): - player_response = self._parse_json( - player_response, video_id, fatal=False) - - renderer = player_response['captions']['playerCaptionsTracklistRenderer'] - caption_tracks = renderer['captionTracks'] - for caption_track in caption_tracks: - if 'kind' not in caption_track: - # not an automatic transcription - continue - base_url = caption_track['baseUrl'] + player_response = args.get('player_response') + if player_response and isinstance(player_response, compat_str): + player_response = self._parse_json( + player_response, video_id, fatal=False) + if player_response: + renderer = player_response['captions']['playerCaptionsTracklistRenderer'] + base_url = renderer['captionTracks'][0]['baseUrl'] sub_lang_list = [] for lang in renderer['translationLanguages']: lang_code = lang.get('languageCode') @@ -1596,25 +1435,19 @@ 
def make_captions(sub_url, sub_langs): sub_lang_list.append(lang_code) return make_captions(base_url, sub_lang_list) - self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id) - return {} - - if "args" in player_config: - args = player_config["args"] - - # Some videos don't provide ttsurl but rather caption_tracks and - # caption_translation_languages (e.g. 20LmZk1hakA) - # Does not used anymore as of 22.06.2017 - caption_tracks = args['caption_tracks'] - caption_translation_languages = args['caption_translation_languages'] - caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] - sub_lang_list = [] - for lang in caption_translation_languages.split(','): - lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) - sub_lang = lang_qs.get('lc', [None])[0] - if sub_lang: - sub_lang_list.append(sub_lang) - return make_captions(caption_url, sub_lang_list) + # Some videos don't provide ttsurl but rather caption_tracks and + # caption_translation_languages (e.g. 20LmZk1hakA) + # Does not used anymore as of 22.06.2017 + caption_tracks = args['caption_tracks'] + caption_translation_languages = args['caption_translation_languages'] + caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] + sub_lang_list = [] + for lang in caption_translation_languages.split(','): + lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) + sub_lang = lang_qs.get('lc', [None])[0] + if sub_lang: + sub_lang_list.append(sub_lang) + return make_captions(caption_url, sub_lang_list) # An extractor error can be raise by the download process if there are # no automatic captions but there are subtitles except (KeyError, IndexError, ExtractorError): @@ -1695,15 +1528,11 @@ def extract_id(cls, url): def _extract_chapters_from_json(self, webpage, video_id, duration): if not webpage: return - initial_data = self._parse_json( - self._search_regex( - r'window\["ytInitialData"\] = (.+);\n', webpage, - 'player args', default='{}'), - video_id, fatal=False) - if not initial_data or not isinstance(initial_data, dict): + data = self._extract_yt_initial_data(video_id, webpage) + if not data or not isinstance(data, dict): return chapters_list = try_get( - initial_data, + data, lambda x: x['playerOverlays'] ['playerOverlayRenderer'] ['decoratedPlayerBarRenderer'] @@ -1937,8 +1766,8 @@ def extract_embedded_config(embed_webpage, video_id): age_gate = False # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) - args = ytplayer_config.get("args") - if args is not None: + if ytplayer_config: + args = ytplayer_config.get('args', {}) if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) @@ -1953,11 +1782,16 @@ def extract_embedded_config(embed_webpage, video_id): is_live = True if not player_response: player_response = extract_player_response(args.get('player_response'), video_id) - elif not player_response: - player_response = ytplayer_config if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): add_dash_mpd_pr(player_response) + if not video_info and not player_response: + player_response = extract_player_response( + self._search_regex( + r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage, + 'initial player response', default='{}'), + video_id) + def extract_unavailable_message(): messages = [] for tag, kind in (('h1', 'message'), ('div', 
'submessage')): @@ -2162,7 +1996,10 @@ def _extract_filesize(media_url): if cipher: if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): - ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))' + ASSETS_RE = ( + r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base', + r'"jsUrl"\s*:\s*("[^"]+")', + r'"assets":.+?"js":\s*("[^"]+")') jsplayer_url_json = self._search_regex( ASSETS_RE, embed_webpage if age_gate else video_webpage, @@ -2478,7 +2315,7 @@ def extract_meta(field): def _extract_count(count_name): return str_to_int(self._search_regex( - r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}' + r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name), video_webpage, count_name, default=None)) @@ -2656,44 +2493,43 @@ def decrypt_sig(mobj): } -class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): - IE_DESC = 'YouTube.com playlists' - _VALID_URL = r"""(?x)(?: - (?:https?://)? - (?:\w+\.)? - (?: - (?: - youtube(?:kids)?\.com| - invidio\.us - ) - / - (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) - \? (?:.*?[&;])*? (?:p|a|list)= - | p/ - )| - youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= - ) - ( - (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,} - # Top tracks, they can also include dots - |(?:MC)[\w\.]* - ) - .* - | - (%(playlist_id)s) - )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?' - _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})' - IE_NAME = 'youtube:playlist' - _YTM_PLAYLIST_PREFIX = 'RDCLAK5uy_' - _YTM_CHANNEL_INFO = { - 'uploader': 'Youtube Music', - 'uploader_id': 'music', # or "UC-9-kyTW8ZkZNDHQJ6FgpwQ" - 'uploader_url': 'https://www.youtube.com/music' - } +class YoutubeTabIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube.com tab' + _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|(?:playlist|watch)\?.*?\blist=)(?P<id>[^/?#&]+)' + IE_NAME = 'youtube:tab' + _TESTS = [{ + # playlists, multipage + 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', + 'playlist_mincount': 94, + 'info_dict': { + 'id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'title': 'Игорь Клейнер - Playlists', + 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', + }, + }, { + # playlists, multipage, different order + 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', + 'playlist_mincount': 94, + 'info_dict': { + 'id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'title': 'Игорь Клейнер - Playlists', + 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', + }, + }, { + # playlists, singlepage + 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', + 'playlist_mincount': 4, + 'info_dict': { + 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', + 'title': 'ThirstForScience - Playlists', + 'description': 'md5:609399d937ea957b0f53cbffb747a14c', + } + }, { + 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', + 'only_matching': True, + }, { + # basic, single video playlist 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'info_dict': { 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', @@ -2703,6 +2539,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): }, 'playlist_count': 1, }, { + # empty playlist 'url': 
'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'info_dict': { 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', @@ -2711,6 +2548,69 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'title': 'youtube-dl empty playlist', }, 'playlist_count': 0, + }, { + # Home tab + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Home', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + }, + 'playlist_mincount': 2, + }, { + # Videos tab + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + }, + 'playlist_mincount': 975, + }, { + # Videos tab, sorted by popular + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + }, + 'playlist_mincount': 199, + }, { + # Playlists tab + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Playlists', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + }, + 'playlist_mincount': 17, + }, { + # Community tab + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Community', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + }, + 'playlist_mincount': 18, + }, { + # Channels tab + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Channels', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + }, + 'playlist_mincount': 138, + }, { + 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', + 'only_matching': True, + }, { + 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA', + 'only_matching': True, + }, { + 'url': 'https://music.youtube.com/channel/UCT-K0qO8z6NzWrywqefBPBQ', + 'only_matching': True, }, { 'note': 'Playlist with deleted videos (#651). 
As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', @@ -2718,19 +2618,9 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'title': '29C3: Not my department', 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'uploader': 'Christiaan008', - 'uploader_id': 'ChRiStIaAn008', + 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg', }, 'playlist_count': 96, - }, { - 'note': 'issue #673', - 'url': 'PLBB231211A4F62143', - 'info_dict': { - 'title': '[OLD]Team Fortress 2 (Class-based LP)', - 'id': 'PLBB231211A4F62143', - 'uploader': 'Wickydoo', - 'uploader_id': 'Wickydoo', - }, - 'playlist_mincount': 26, }, { 'note': 'Large playlist', 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', @@ -2738,45 +2628,13 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'title': 'Uploads from Cauchemar', 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', 'uploader': 'Cauchemar', - 'uploader_id': 'Cauchemar89', + 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', }, - 'playlist_mincount': 799, + 'playlist_mincount': 1123, }, { - 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', - 'info_dict': { - 'title': 'YDL_safe_search', - 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', - }, - 'playlist_count': 2, - 'skip': 'This playlist is private', - }, { - 'note': 'embedded', - 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - 'playlist_count': 4, - 'info_dict': { - 'title': 'JODA15', - 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - 'uploader': 'milan', - 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw', - } - }, { - 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'playlist_mincount': 485, - 'info_dict': { - 'title': '2018 Chinese New Singles (11/6 updated)', - 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'uploader': 'LBK', - 'uploader_id': 'sdragonfang', - } - }, { - 'note': 'Embedded SWF player', - 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', - 'playlist_count': 4, - 'info_dict': { - 'title': 'JODA7', - 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ', - }, - 'skip': 'This playlist does not exist', + # even larger playlist, 8832 videos + 'url': 'http://www.youtube.com/user/NASAgovVideo/videos', + 'only_matching': True, }, { 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', @@ -2784,9 +2642,22 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'title': 'Uploads from Interstellar Movie', 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', 'uploader': 'Interstellar Movie', - 'uploader_id': 'InterstellarMovie1', + 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA', }, 'playlist_mincount': 21, + }, { + # https://github.com/ytdl-org/youtube-dl/issues/21844 + 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'info_dict': { + 'title': 'Data Analysis with Dr Mike Pound', + 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', + 'uploader': 'Computerphile', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', + 'only_matching': True, }, { # Playlist URL that does not actually serve a playlist 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', @@ -2811,6 +2682,475 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): }, 'skip': 'This video is not 
available.', 'add_ie': [YoutubeIE.ie_key()], + }, { + 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if YoutubeLiveIE.suitable(url) else super( + YoutubeTabIE, cls).suitable(url) + + def _extract_channel_id(self, webpage): + channel_id = self._html_search_meta( + 'channelId', webpage, 'channel id', default=None) + if channel_id: + return channel_id + channel_url = self._html_search_meta( + ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url', + 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad', + 'twitter:app:url:googleplay'), webpage, 'channel url') + return self._search_regex( + r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+', + channel_url, 'channel id') + + @staticmethod + def _extract_grid_item_renderer(item): + for item_kind in ('Playlist', 'Video', 'Channel'): + renderer = item.get('grid%sRenderer' % item_kind) + if renderer: + return renderer + + def _extract_video(self, renderer): + video_id = renderer.get('videoId') + title = try_get( + renderer, + (lambda x: x['title']['runs'][0]['text'], + lambda x: x['title']['simpleText']), compat_str) + description = try_get( + renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'], + compat_str) + duration = parse_duration(try_get( + renderer, lambda x: x['lengthText']['simpleText'], compat_str)) + view_count_text = try_get( + renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or '' + view_count = str_to_int(self._search_regex( + r'^([\d,]+)', re.sub(r'\s', '', view_count_text), + 'view count', default=None)) + uploader = try_get( + renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str) + return { + '_type': 'url_transparent', + 'ie_key': YoutubeIE.ie_key(), + 'id': video_id, + 'url': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'uploader': uploader, + } + + def _grid_entries(self, grid_renderer): + for item in grid_renderer['items']: + if not isinstance(item, dict): + continue + renderer = self._extract_grid_item_renderer(item) + if not isinstance(renderer, dict): + continue + title = try_get( + renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + # playlist + playlist_id = renderer.get('playlistId') + if playlist_id: + yield self.url_result( + 'https://www.youtube.com/playlist?list=%s' % playlist_id, + ie=YoutubeTabIE.ie_key(), video_id=playlist_id, + video_title=title) + # video + video_id = renderer.get('videoId') + if video_id: + yield self._extract_video(renderer) + # channel + channel_id = renderer.get('channelId') + if channel_id: + title = try_get( + renderer, lambda x: x['title']['simpleText'], compat_str) + yield self.url_result( + 'https://www.youtube.com/channel/%s' % channel_id, + ie=YoutubeTabIE.ie_key(), video_title=title) + + def _shelf_entries_trimmed(self, shelf_renderer): + renderer = try_get( + shelf_renderer, lambda x: x['content']['horizontalListRenderer'], dict) + if not renderer: + return + # TODO: add support for nested playlists so each shelf is processed + # as separate playlist + # TODO: this includes only first N items + for entry in self._grid_entries(renderer): + yield entry + + def _shelf_entries(self, shelf_renderer): + ep = try_get( + shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], + compat_str) + shelf_url = 
urljoin('https://www.youtube.com', ep) + if not shelf_url: + return + title = try_get( + shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + yield self.url_result(shelf_url, video_title=title) + + def _playlist_entries(self, video_list_renderer): + for content in video_list_renderer['contents']: + if not isinstance(content, dict): + continue + renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer') + if not isinstance(renderer, dict): + continue + video_id = renderer.get('videoId') + if not video_id: + continue + yield self._extract_video(renderer) + + def _video_entry(self, video_renderer): + video_id = video_renderer.get('videoId') + if video_id: + return self._extract_video(video_renderer) + + def _post_thread_entries(self, post_thread_renderer): + post_renderer = try_get( + post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict) + if not post_renderer: + return + # video attachment + video_renderer = try_get( + post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) + video_id = None + if video_renderer: + entry = self._video_entry(video_renderer) + if entry: + yield entry + # inline video links + runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or [] + for run in runs: + if not isinstance(run, dict): + continue + ep_url = try_get( + run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str) + if not ep_url: + continue + if not YoutubeIE.suitable(ep_url): + continue + ep_video_id = YoutubeIE._match_id(ep_url) + if video_id == ep_video_id: + continue + yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id) + + def _post_thread_continuation_entries(self, post_thread_continuation): + contents = post_thread_continuation.get('contents') + if not isinstance(contents, list): + return + for content in contents: + renderer = content.get('backstagePostThreadRenderer') + if not isinstance(renderer, dict): + continue + for entry in self._post_thread_entries(renderer): + yield entry + + @staticmethod + def _extract_next_continuation_data(renderer): + next_continuation = try_get( + renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict) + if not next_continuation: + return + continuation = next_continuation.get('continuation') + if not continuation: + return + ctp = next_continuation.get('clickTrackingParams') + return { + 'ctoken': continuation, + 'continuation': continuation, + 'itct': ctp, + } + + @classmethod + def _extract_continuation(cls, renderer): + next_continuation = cls._extract_next_continuation_data(renderer) + if next_continuation: + return next_continuation + contents = renderer.get('contents') + if not isinstance(contents, list): + return + for content in contents: + if not isinstance(content, dict): + continue + continuation_ep = try_get( + content, lambda x: x['continuationItemRenderer']['continuationEndpoint'], + dict) + if not continuation_ep: + continue + continuation = try_get( + continuation_ep, lambda x: x['continuationCommand']['token'], compat_str) + if not continuation: + continue + ctp = continuation_ep.get('clickTrackingParams') + if not ctp: + continue + return { + 'ctoken': continuation, + 'continuation': continuation, + 'itct': ctp, + } + + def _entries(self, tab, identity_token): + continuation = None + slr_contents = try_get(tab, lambda x: x['sectionListRenderer']['contents'], list) or [] + for slr_content in slr_contents: + if not isinstance(slr_content, dict): + continue + is_renderer = 
try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) + if not is_renderer: + continue + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] + for isr_content in isr_contents: + if not isinstance(isr_content, dict): + continue + renderer = isr_content.get('playlistVideoListRenderer') + if renderer: + for entry in self._playlist_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('gridRenderer') + if renderer: + for entry in self._grid_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('shelfRenderer') + if renderer: + for entry in self._shelf_entries(renderer): + yield entry + continue + renderer = isr_content.get('backstagePostThreadRenderer') + if renderer: + for entry in self._post_thread_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('videoRenderer') + if renderer: + entry = self._video_entry(renderer) + if entry: + yield entry + + if not continuation: + continuation = self._extract_continuation(is_renderer) + + headers = { + 'x-youtube-client-name': '1', + 'x-youtube-client-version': '2.20201112.04.01', + } + if identity_token: + headers['x-youtube-identity-token'] = identity_token + + for page_num in itertools.count(1): + if not continuation: + break + browse = self._download_json( + 'https://www.youtube.com/browse_ajax', None, + 'Downloading page %d' % page_num, + headers=headers, query=continuation, fatal=False) + if not browse: + break + response = try_get(browse, lambda x: x[1]['response'], dict) + if not response: + break + + continuation_contents = try_get( + response, lambda x: x['continuationContents'], dict) + if continuation_contents: + continuation_renderer = continuation_contents.get('playlistVideoListContinuation') + if continuation_renderer: + for entry in self._playlist_entries(continuation_renderer): + yield entry + continuation = self._extract_continuation(continuation_renderer) + continue + continuation_renderer = continuation_contents.get('gridContinuation') + if continuation_renderer: + for entry in self._grid_entries(continuation_renderer): + yield entry + continuation = self._extract_continuation(continuation_renderer) + continue + continuation_renderer = continuation_contents.get('itemSectionContinuation') + if continuation_renderer: + for entry in self._post_thread_continuation_entries(continuation_renderer): + yield entry + continuation = self._extract_continuation(continuation_renderer) + continue + + continuation_items = try_get( + response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list) + if continuation_items: + continuation_item = continuation_items[0] + if not isinstance(continuation_item, dict): + continue + renderer = continuation_item.get('playlistVideoRenderer') + if renderer: + video_list_renderer = {'contents': continuation_items} + for entry in self._playlist_entries(video_list_renderer): + yield entry + continuation = self._extract_continuation(video_list_renderer) + continue + + break + + @staticmethod + def _extract_selected_tab(tabs): + for tab in tabs: + if try_get(tab, lambda x: x['tabRenderer']['selected'], bool): + return tab['tabRenderer'] + else: + raise ExtractorError('Unable to find selected tab') + + @staticmethod + def _extract_uploader(data): + uploader = {} + sidebar_renderer = try_get( + data, lambda x: 
x['sidebar']['playlistSidebarRenderer']['items'], list) + if sidebar_renderer: + for item in sidebar_renderer: + if not isinstance(item, dict): + continue + renderer = item.get('playlistSidebarSecondaryInfoRenderer') + if not isinstance(renderer, dict): + continue + owner = try_get( + renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict) + if owner: + uploader['uploader'] = owner.get('text') + uploader['uploader_id'] = try_get( + owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str) + uploader['uploader_url'] = urljoin( + 'https://www.youtube.com/', + try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)) + return uploader + + def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): + selected_tab = self._extract_selected_tab(tabs) + renderer = try_get( + data, lambda x: x['metadata']['channelMetadataRenderer'], dict) + if renderer: + channel_title = renderer.get('title') or item_id + tab_title = selected_tab.get('title') + title = channel_title or item_id + if tab_title: + title += ' - %s' % tab_title + description = renderer.get('description') + playlist_id = renderer.get('externalId') + renderer = try_get( + data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) + if renderer: + title = renderer.get('title') + description = None + playlist_id = item_id + playlist = self.playlist_result( + self._entries(selected_tab['content'], identity_token), + playlist_id=playlist_id, playlist_title=title, + playlist_description=description) + playlist.update(self._extract_uploader(data)) + return playlist + + def _extract_from_playlist(self, item_id, data, playlist): + title = playlist.get('title') or try_get( + data, lambda x: x['titleText']['simpleText'], compat_str) + playlist_id = playlist.get('playlistId') or item_id + return self.playlist_result( + self._playlist_entries(playlist), playlist_id=playlist_id, + playlist_title=title) + + def _real_extract(self, url): + item_id = self._match_id(url) + url = compat_urlparse.urlunparse( + compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) + # Handle both video/playlist URLs + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + video_id = qs.get('v', [None])[0] + playlist_id = qs.get('list', [None])[0] + if video_id and playlist_id: + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + webpage = self._download_webpage(url, item_id) + identity_token = self._search_regex( + r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, + 'identity token', default=None) + data = self._extract_yt_initial_data(item_id, webpage) + tabs = try_get( + data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) + if tabs: + return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token) + playlist = try_get( + data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) + if playlist: + return self._extract_from_playlist(item_id, data, playlist) + # Fallback to video extraction if no playlist alike page is recognized + if video_id: + return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) + # Failed to recognize + raise ExtractorError('Unable to recognize tab 
page') + + +class YoutubePlaylistIE(InfoExtractor): + IE_DESC = 'YouTube.com playlists' + _VALID_URL = r'''(?x)(?: + (?:https?://)? + (?:\w+\.)? + (?: + (?: + youtube(?:kids)?\.com| + invidio\.us| + youtu\.be + ) + /.*?\?.*?\blist= + )? + (?P<id>%(playlist_id)s) + )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} + IE_NAME = 'youtube:playlist' + _TESTS = [{ + 'note': 'issue #673', + 'url': 'PLBB231211A4F62143', + 'info_dict': { + 'title': '[OLD]Team Fortress 2 (Class-based LP)', + 'id': 'PLBB231211A4F62143', + 'uploader': 'Wickydoo', + 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', + }, + 'playlist_mincount': 29, + }, { + 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', + 'info_dict': { + 'title': 'YDL_safe_search', + 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', + }, + 'playlist_count': 2, + 'skip': 'This playlist is private', + }, { + 'note': 'embedded', + 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', + 'playlist_count': 4, + 'info_dict': { + 'title': 'JODA15', + 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', + 'uploader': 'milan', + 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw', + } + }, { + 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', + 'playlist_mincount': 982, + 'info_dict': { + 'title': '2018 Chinese New Singles (11/6 updated)', + 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', + 'uploader': 'LBK', + 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA', + } }, { 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', 'info_dict': { @@ -2831,16 +3171,6 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'noplaylist': True, 'skip_download': True, }, - }, { - # https://github.com/ytdl-org/youtube-dl/issues/21844 - 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'info_dict': { - 'title': 'Data Analysis with Dr Mike Pound', - 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'uploader_id': 'Computerphile', - 'uploader': 'Computerphile', - }, - 'playlist_mincount': 11, }, { 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', 'only_matching': True, @@ -2851,363 +3181,35 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): # music album playlist 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', 'only_matching': True, - }, { - 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', - 'only_matching': True, - }, { - 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', - 'only_matching': True, - }] - - def _real_initialize(self): - self._login() - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - - for item in re.findall( - r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page): - attrs = extract_attributes(item) - video_id = attrs['data-video-id'] - video_title = unescapeHTML(attrs.get('data-title')) - if video_title: - video_title = video_title.strip() - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - # Fallback with old _VIDEO_RE - self.extract_videos_from_page_impl( - self._VIDEO_RE, page, ids_in_page, titles_in_page) - - # Relaxed fallbacks - self.extract_videos_from_page_impl( - r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page, - ids_in_page, titles_in_page) - self.extract_videos_from_page_impl( - r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page, - ids_in_page, titles_in_page) - - return zip(ids_in_page, titles_in_page) - - def _extract_mix_ids_from_yt_initial(self, 
yt_initial): - ids = [] - playlist_contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents'], list) - if playlist_contents: - for item in playlist_contents: - videoId = try_get(item, lambda x: x['playlistPanelVideoRenderer']['videoId'], compat_str) - if videoId: - ids.append(videoId) - return ids - - def _extract_mix(self, playlist_id): - # The mixes are generated from a single video - # the id of the playlist is just 'RD' + video_id - ids = [] - yt_initial = None - last_id = playlist_id[-11:] - for n in itertools.count(1): - url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) - webpage = self._download_webpage( - url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) - new_ids = orderedSet(re.findall( - r'''(?xs)data-video-username=".*?".*? - href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), - webpage)) - - # if no ids in html of page, try using embedded json - if (len(new_ids) == 0): - yt_initial = self._get_yt_initial_data(playlist_id, webpage) - if yt_initial: - new_ids = self._extract_mix_ids_from_yt_initial(yt_initial) - - # Fetch new pages until all the videos are repeated, it seems that - # there are always 51 unique videos. - new_ids = [_id for _id in new_ids if _id not in ids] - if not new_ids: - break - ids.extend(new_ids) - last_id = ids[-1] - - url_results = self._ids_to_results(ids) - - search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) - title_span = ( - search_title('playlist-title') - or search_title('title long-title') - or search_title('title')) - title = clean_html(title_span) - - if not title: - title = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['title'], compat_str) - - return self.playlist_result(url_results, playlist_id, title) - - def _extract_playlist(self, playlist_id): - url = self._TEMPLATE_URL % playlist_id - page = self._download_webpage(url, playlist_id) - - # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604) - for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page): - match = match.strip() - # Check if the playlist exists or is private - mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match) - if mobj: - reason = mobj.group('reason') - message = 'This playlist %s' % reason - if 'private' in reason: - message += ', use --username or --netrc to access it' - message += '.' - raise ExtractorError(message, expected=True) - elif re.match(r'[^<]*Invalid parameters[^<]*', match): - raise ExtractorError( - 'Invalid parameters. 
Maybe URL is incorrect.', - expected=True) - elif re.match(r'[^<]*Choose your language[^<]*', match): - continue - else: - self.report_warning('Youtube gives an alert message: ' + match) - - playlist_title = self._html_search_regex( - r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>', - page, 'title', default=None) - - _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref=' - uploader = self._html_search_regex( - r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, - page, 'uploader', default=None) - mobj = re.search( - r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE, - page) - if mobj: - uploader_id = mobj.group('uploader_id') - uploader_url = compat_urlparse.urljoin(url, mobj.group('path')) - else: - uploader_id = uploader_url = None - - has_videos = True - - if not playlist_title: - try: - # Some playlist URLs don't actually serve a playlist (e.g. - # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4) - next(self._entries(page, playlist_id)) - except StopIteration: - has_videos = False - - playlist = self.playlist_result( - self._entries(page, playlist_id), playlist_id, playlist_title) - playlist.update({ - 'uploader': uploader, - 'uploader_id': uploader_id, - 'uploader_url': uploader_url, - }) - if playlist_id.startswith(self._YTM_PLAYLIST_PREFIX): - playlist.update(self._YTM_CHANNEL_INFO) - - return has_videos, playlist - - def _check_download_just_video(self, url, playlist_id): - # Check if it's a video-specific URL - query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = query_dict.get('v', [None])[0] or self._search_regex( - r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url, - 'video id', default=None) - if video_id: - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return video_id, self.url_result(video_id, 'Youtube', video_id=video_id) - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - return video_id, None - return None, None - - def _real_extract(self, url): - # Extract playlist id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - playlist_id = mobj.group(1) or mobj.group(2) - - video_id, video = self._check_download_just_video(url, playlist_id) - if video: - return video - - if playlist_id.startswith(('RD', 'UL', 'PU')): - if not playlist_id.startswith(self._YTM_PLAYLIST_PREFIX): - # Mixes require a custom extraction process, - # Youtube Music playlists act like normal playlists (with randomized order) - return self._extract_mix(playlist_id) - - has_videos, playlist = self._extract_playlist(playlist_id) - if has_videos or not video_id: - return playlist - - # Some playlist URLs don't actually serve a playlist (see - # https://github.com/ytdl-org/youtube-dl/issues/10537). - # Fallback to plain video extraction if there is a video id - # along with playlist id. 
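
# A stand-alone sketch of the video-vs-playlist disambiguation that both the
# removed _check_download_just_video above and the new YoutubeTabIE
# _real_extract implement: a watch URL can carry a video id ("v") and a
# playlist id ("list") at the same time, and one of them has to win. Standard
# library only; pick_target is an illustrative name, not a youtube-dlc API.
from urllib.parse import parse_qs, urlparse

def pick_target(url, noplaylist=False):
    qs = parse_qs(urlparse(url).query)
    video_id = qs.get('v', [None])[0]
    playlist_id = qs.get('list', [None])[0]
    if video_id and (noplaylist or not playlist_id):
        return 'video', video_id       # fall back to plain video extraction
    if playlist_id:
        return 'playlist', playlist_id
    return None, None

# pick_target('https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4')
#   -> ('playlist', 'PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4')
# the same call with noplaylist=True -> ('video', 'FqZTN594JQw')
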
- return self.url_result(video_id, 'Youtube', video_id=video_id) - - -class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): - IE_DESC = 'YouTube.com channels' - _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' - _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' - IE_NAME = 'youtube:channel' - _TESTS = [{ - 'note': 'paginated channel', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'playlist_mincount': 91, - 'info_dict': { - 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'Uploads from lex will', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - } - }, { - 'note': 'Age restricted channel', - # from https://www.youtube.com/user/DeusExOfficial - 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w', - 'playlist_mincount': 64, - 'info_dict': { - 'id': 'UUs0ifCMCm1icqRbqhUINa0w', - 'title': 'Uploads from Deus Ex', - 'uploader': 'Deus Ex', - 'uploader_id': 'DeusExOfficial', - }, - }, { - 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', - 'only_matching': True, - }, { - 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA', - 'only_matching': True, }] @classmethod def suitable(cls, url): - return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) - else super(YoutubeChannelIE, cls).suitable(url)) - - def _build_template_url(self, url, channel_id): - return self._TEMPLATE_URL % channel_id + return False if YoutubeTabIE.suitable(url) else super( + YoutubePlaylistIE, cls).suitable(url) def _real_extract(self, url): - channel_id = self._match_id(url) - - url = self._build_template_url(url, channel_id) - - # Channel by page listing is restricted to 35 pages of 30 items, i.e. 
1050 videos total (see #5778) - # Workaround by extracting as a playlist if managed to obtain channel playlist URL - # otherwise fallback on channel by page extraction - channel_page = self._download_webpage( - url + '?view=57', channel_id, - 'Downloading channel page', fatal=False) - if channel_page is False: - channel_playlist_id = False - else: - channel_playlist_id = self._html_search_meta( - 'channelId', channel_page, 'channel id', default=None) - if not channel_playlist_id: - channel_url = self._html_search_meta( - ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'), - channel_page, 'channel url', default=None) - if channel_url: - channel_playlist_id = self._search_regex( - r'vnd\.youtube://user/([0-9A-Za-z_-]+)', - channel_url, 'channel id', default=None) - if channel_playlist_id and channel_playlist_id.startswith('UC'): - playlist_id = 'UU' + channel_playlist_id[2:] - return self.url_result( - compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist') - - channel_page = self._download_webpage(url, channel_id, 'Downloading page #1') - autogenerated = re.search(r'''(?x) - class="[^"]*?(?: - channel-header-autogenerated-label| - yt-channel-title-autogenerated - )[^"]*"''', channel_page) is not None - - if autogenerated: - # The videos are contained in a single page - # the ajax pages can't be used, they are empty - entries = [ - self.url_result( - video_id, 'Youtube', video_id=video_id, - video_title=video_title) - for video_id, video_title in self.extract_videos_from_page(channel_page)] - return self.playlist_result(entries, channel_id) - - try: - next(self._entries(channel_page, channel_id)) - except StopIteration: - alert_message = self._html_search_regex( - r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>', - channel_page, 'alert', default=None, group='alert') - if alert_message: - raise ExtractorError('Youtube said: %s' % alert_message, expected=True) - - return self.playlist_result(self._entries(channel_page, channel_id), channel_id) + playlist_id = self._match_id(url) + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + if not qs: + qs = {'list': playlist_id} + return self.url_result( + update_url_query('https://www.youtube.com/playlist', qs), + ie=YoutubeTabIE.ie_key(), video_id=playlist_id) -class YoutubeUserIE(YoutubeChannelIE): - IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' - IE_NAME = 'youtube:user' - +class YoutubeYtUserIE(InfoExtractor): + _VALID_URL = r'ytuser:(?P<id>.+)' _TESTS = [{ - 'url': 'https://www.youtube.com/user/TheLinuxFoundation', - 'playlist_mincount': 320, - 'info_dict': { - 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', - 'title': 'Uploads from The Linux Foundation', - 'uploader': 'The Linux Foundation', - 'uploader_id': 'TheLinuxFoundation', - } - }, { - # Only available via https://www.youtube.com/c/12minuteathlete/videos - # but not https://www.youtube.com/user/12minuteathlete/videos - 'url': 'https://www.youtube.com/c/12minuteathlete/videos', - 'playlist_mincount': 249, - 'info_dict': { - 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', - 'title': 'Uploads from 12 Minute Athlete', - 'uploader': '12 Minute Athlete', - 'uploader_id': 'the12minuteathlete', - } - }, { 'url': 'ytuser:phihag', 'only_matching': True, - }, { - 'url': 
'https://www.youtube.com/c/gametrailers', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/gametrailers', - 'only_matching': True, - }, { - # This channel is not available, geo restricted to JP - 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', - 'only_matching': True, }] - @classmethod - def suitable(cls, url): - # Don't return True if the url can be extracted with other youtube - # extractor, the regex would is too permissive and it would match. - other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls) - if any(ie.suitable(url) for ie in other_yt_ies): - return False - else: - return super(YoutubeUserIE, cls).suitable(url) - - def _build_template_url(self, url, channel_id): - mobj = re.match(self._VALID_URL, url) - return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id')) + def _real_extract(self, url): + user_id = self._match_id(url) + return self.url_result( + 'https://www.youtube.com/user/%s' % user_id, + ie=YoutubeTabIE.ie_key(), video_id=user_id) class YoutubeLiveIE(YoutubeBaseInfoExtractor): @@ -3262,41 +3264,7 @@ def _real_extract(self, url): return self.url_result(base_url) -class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com user/channel playlists' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists' - IE_NAME = 'youtube:playlists' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', - 'playlist_mincount': 4, - 'info_dict': { - 'id': 'ThirstForScience', - 'title': 'ThirstForScience', - }, - }, { - # with "Load more" button - 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', - 'playlist_mincount': 70, - 'info_dict': { - 'id': 'igorkle1', - 'title': 'Игорь Клейнер', - }, - }, { - 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists', - 'playlist_mincount': 17, - 'info_dict': { - 'id': 'UCiU1dHvZObB2iP6xkJ__Icw', - 'title': 'Chem Player', - }, - 'skip': 'Blocked', - }, { - 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', - 'only_matching': True, - }] - - -class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistBaseInfoExtractor): +class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com searches' # there doesn't appear to be a real limit, for example if you search for # 'python' you get more than 8.000.000 results @@ -3393,57 +3361,7 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _SEARCH_PARAMS = 'CAI%3D' -class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor): - IE_DESC = 'YouTube.com search URLs' - IE_NAME = 'youtube:search_url' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' - _TESTS = [{ - 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', - 'playlist_mincount': 5, - 'info_dict': { - 'title': 'youtube-dl test video', - } - }, { - 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', - 'only_matching': True, - }] - - def _process_json_dict(self, obj, videos, c): - if "videoId" in obj: - videos.append(obj) - return - - if "nextContinuationData" in obj: - c["continuation"] = obj["nextContinuationData"] - return - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - query = 
compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - return self.playlist_result(self._entries(webpage, query, max_pages=5), playlist_title=query) - - -class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com (multi-season) shows' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)' - IE_NAME = 'youtube:show' - _TESTS = [{ - 'url': 'https://www.youtube.com/show/airdisasters', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'airdisasters', - 'title': 'Air Disasters', - } - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - return super(YoutubeShowIE, self)._real_extract( - 'https://www.youtube.com/show/%s/playlists' % playlist_id) - - -class YoutubeFeedsInfoExtractor(YoutubePlaylistBaseInfoExtractor): +class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ Base class for feed extractors Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. @@ -3457,51 +3375,62 @@ def IE_NAME(self): def _real_initialize(self): self._login() - def _process_entries(self, entries, seen): - new_info = [] - for v in entries: - v_id = try_get(v, lambda x: x['videoId']) - if not v_id: - continue + def _entries(self, page): + # The extraction process is the same as for playlists, but the regex + # for the video ids doesn't contain an index + ids = [] + more_widget_html = content_html = page + for page_num in itertools.count(1): + matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) - have_video = False - for old in seen: - if old['videoId'] == v_id: - have_video = True - break + # 'recommended' feed has infinite 'load more' and each new portion spins + # the same videos in (sometimes) slightly different order, so we'll check + # for unicity and break when portion has no new videos + new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) + if not new_ids: + break - if not have_video: - new_info.append(v) + ids.extend(new_ids) - if not new_info: - return + for entry in self._ids_to_results(new_ids): + yield entry - seen.extend(new_info) - for video in new_info: - yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=self._extract_title(video)) + mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) + if not mobj: + break + + more = self._download_json( + 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, + 'Downloading page #%s' % page_num, + transform_source=uppercase_escape, + headers=self._YOUTUBE_CLIENT_HEADERS) + content_html = more['content_html'] + more_widget_html = more['load_more_widget_html'] def _real_extract(self, url): page = self._download_webpage( 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE) - return self.playlist_result(self._entries(page, self._PLAYLIST_TITLE), - playlist_title=self._PLAYLIST_TITLE) + return self.playlist_result( + self._entries(page), playlist_title=self._PLAYLIST_TITLE) -class YoutubeWatchLaterIE(YoutubePlaylistIE): +class YoutubeWatchLaterIE(InfoExtractor): IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater' _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=WL', + 'url': 
'https://www.youtube.com/feed/watch_later', 'only_matching': True, }, { - 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL', + 'url': ':ytwatchlater', 'only_matching': True, }] def _real_extract(self, url): + return self.url_result( + 'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key()) _, video = self._check_download_just_video(url, 'WL') if video: return video @@ -3509,18 +3438,6 @@ def _real_extract(self, url): return playlist -class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): - IE_NAME = 'youtube:favorites' - IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' - _LOGIN_REQUIRED = True - - def _real_extract(self, url): - webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') - playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id') - return self.url_result(playlist_id, 'YoutubePlaylist') - - class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?' @@ -3606,3 +3523,67 @@ def _real_extract(self, url): raise ExtractorError( 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url), expected=True) + + +# Old extractors. Are these cases handled elsewhere? + +class YoutubeSearchURLIE(YoutubeSearchIE): + IE_DESC = 'YouTube.com search URLs' + IE_NAME = 'youtube:search_url' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' + _TESTS = [{ + 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', + 'playlist_mincount': 5, + 'info_dict': { + 'title': 'youtube-dl test video', + } + }, { + 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', + 'only_matching': True, + }] + + def _process_json_dict(self, obj, videos, c): + if "videoId" in obj: + videos.append(obj) + return + + if "nextContinuationData" in obj: + c["continuation"] = obj["nextContinuationData"] + return + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + query = compat_urllib_parse_unquote_plus(mobj.group('query')) + webpage = self._download_webpage(url, query) + return self.playlist_result(self._entries(webpage, query, max_pages=5), playlist_title=query) + + +class YoutubeShowIE(InfoExtractor): + IE_DESC = 'YouTube.com (multi-season) shows' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)' + IE_NAME = 'youtube:show' + _TESTS = [{ + 'url': 'https://www.youtube.com/show/airdisasters', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'airdisasters', + 'title': 'Air Disasters', + } + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + return super(YoutubeShowIE, self)._real_extract( + 'https://www.youtube.com/show/%s/playlists' % playlist_id) + + +class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): + IE_NAME = 'youtube:favorites' + IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' 
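
# A sketch of the "load more" loop used by _entries above: the recommended
# feed re-serves the same videos in shuffled order, so each portion is
# deduplicated against everything seen so far and the loop stops as soon as a
# portion adds nothing new. fetch_page is a hypothetical stand-in for the
# _download_json call against the load_more_widget_html endpoint.
import itertools

def paginate(fetch_page):
    seen = []
    for page_num in itertools.count(1):
        page_ids = fetch_page(page_num)        # ids scraped from one portion
        new_ids = [i for i in page_ids if i not in seen]
        if not new_ids:                        # exhausted, or only repeats
            break
        seen.extend(new_ids)
        for video_id in new_ids:
            yield video_id

# list(paginate(lambda n: ['a', 'b'] if n == 1 else ['b', 'a'])) == ['a', 'b']
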
+ _LOGIN_REQUIRED = True + + def _real_extract(self, url): + webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') + playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id') + return self.url_result(playlist_id, 'YoutubePlaylist') diff --git a/youtube_dlc/utils.py b/youtube_dlc/utils.py index f5dc1bdaf..975b741c5 100644 --- a/youtube_dlc/utils.py +++ b/youtube_dlc/utils.py @@ -4085,7 +4085,7 @@ def fix_kv(m): v = m.group(0) if v in ('true', 'false', 'null'): return v - elif v.startswith('/*') or v.startswith('//') or v == ',': + elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',': return "" if v[0] in ("'", '"'): @@ -4095,12 +4095,12 @@ def fix_kv(m): '\\\n': '', '\\x': '\\u00', }.get(m.group(0), m.group(0)), v[1:-1]) - - for regex, base in INTEGER_TABLE: - im = re.match(regex, v) - if im: - i = int(im.group(1), base) - return '"%d":' % i if v.endswith(':') else '%d' % i + else: + for regex, base in INTEGER_TABLE: + im = re.match(regex, v) + if im: + i = int(im.group(1), base) + return '"%d":' % i if v.endswith(':') else '%d' % i return '"%s"' % v @@ -4110,7 +4110,8 @@ def fix_kv(m): {comment}|,(?={skip}[\]}}])| (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*| \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| - [0-9]+(?={skip}:) + [0-9]+(?={skip}:)| + !+ '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) From 097f1663a90b6db14d31102c690cc33448a47cf9 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Fri, 20 Nov 2020 14:26:24 +0530 Subject: [PATCH 103/124] Detect embedded bitchute videos --- youtube_dlc/extractor/bitchute.py | 8 ++++++++ youtube_dlc/extractor/generic.py | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/youtube_dlc/extractor/bitchute.py b/youtube_dlc/extractor/bitchute.py index 92fc70b5a..94219a138 100644 --- a/youtube_dlc/extractor/bitchute.py +++ b/youtube_dlc/extractor/bitchute.py @@ -36,6 +36,14 @@ class BitChuteIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>%s)' % BitChuteIE._VALID_URL, + webpage)] + def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dlc/extractor/generic.py b/youtube_dlc/extractor/generic.py index aba06b328..2ea46da30 100644 --- a/youtube_dlc/extractor/generic.py +++ b/youtube_dlc/extractor/generic.py @@ -119,6 +119,7 @@ from .zype import ZypeIE from .odnoklassniki import OdnoklassnikiIE from .kinja import KinjaEmbedIE +from .bitchute import BitChuteIE class GenericIE(InfoExtractor): @@ -3213,6 +3214,11 @@ def _real_extract(self, url): return self.playlist_from_matches( zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) + bitchute_urls = BitChuteIE._extract_urls(webpage) + if bitchute_urls: + return self.playlist_from_matches( + bitchute_urls, video_id, video_title, ie=BitChuteIE.ie_key()) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: From d3260f40cb08f7c89613d2938e0e7f0f074767b0 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Sun, 22 Nov 2020 03:46:38 +0530 Subject: [PATCH 104/124] Revert changes to vlive --- youtube_dlc/extractor/vlive.py | 279 ++++++++++++++++++++------------- 1 file changed, 171 insertions(+), 108 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index 577e33f13..935560b57 100644 --- 
a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -4,48 +4,52 @@ import re import time import itertools -import json from .common import InfoExtractor from .naver import NaverBaseIE -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_str from ..utils import ( ExtractorError, - int_or_none, merge_dicts, try_get, urlencode_postdata, ) -class VLiveBaseIE(NaverBaseIE): - _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' - - -class VLiveIE(VLiveBaseIE): +class VLiveIE(NaverBaseIE): IE_NAME = 'vlive' - _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|post)/(?P<id>(?:\d-)?[0-9]+)' _NETRC_MACHINE = 'vlive' _TESTS = [{ - 'url': 'http://www.vlive.tv/video/1326', + 'url': 'https://www.vlive.tv/video/1326', 'md5': 'cc7314812855ce56de70a06a27314983', 'info_dict': { 'id': '1326', 'ext': 'mp4', - 'title': "Girl's Day's Broadcast", + 'title': "[V LIVE] Girl's Day's Broadcast", 'creator': "Girl's Day", 'view_count': int, 'uploader_id': 'muploader_a', }, - }, { - 'url': 'http://www.vlive.tv/video/16937', + }, + { + 'url': 'https://vlive.tv/post/1-18244258', + 'md5': 'cc7314812855ce56de70a06a27314983', + 'info_dict': { + 'id': '1326', + 'ext': 'mp4', + 'title': "[V LIVE] Girl's Day's Broadcast", + 'creator': "Girl's Day", + 'view_count': int, + 'uploader_id': 'muploader_a', + }, + }, + { + 'url': 'https://www.vlive.tv/video/16937', 'info_dict': { 'id': '16937', 'ext': 'mp4', - 'title': '첸백시 걍방', + 'title': '[V LIVE] 첸백시 걍방', 'creator': 'EXO', 'view_count': int, 'subtitles': 'mincount:12', @@ -66,11 +70,12 @@ class VLiveIE(VLiveBaseIE): 'subtitles': 'mincount:10', }, 'skip': 'This video is only available for CH+ subscribers', - }, { - 'url': 'https://www.vlive.tv/embed/1326', - 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url) + def _real_initialize(self): self._login() @@ -102,82 +107,118 @@ def is_logged_in(): if not is_logged_in(): raise ExtractorError('Unable to log in', expected=True) - def _call_api(self, path_template, video_id, fields=None): - query = {'appId': self._APP_ID} - if fields: - query['fields'] = fields - return self._download_json( - 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id, - 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0], - headers={'Referer': 'https://www.vlive.tv/'}, query=query) - def _real_extract(self, url): - video_id = self._match_id(url) + # url may match on a post or a video url with a post_id potentially matching a video_id + working_id = self._match_id(url) + webpage = self._download_webpage(url, working_id) - try: - post = self._call_api( - 'post/v1.0/officialVideoPost-%s', video_id, - 'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId}') - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self.raise_login_required(json.loads(e.cause.read().decode())['message']) - raise + PARAMS_RE = r'window\.__PRELOADED_STATE__\s*=\s*({.*});?\s*</script>' + PARAMS_FIELD = 'params' - video = post['officialVideo'] + params = self._search_regex( + PARAMS_RE, webpage, PARAMS_FIELD, default='', flags=re.DOTALL) + params = self._parse_json(params, working_id, fatal=False) - def get_common_fields(): - channel = post.get('channel') or {} - 
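
# A minimal model of the PARAMS_RE extraction re-introduced above: the page
# metadata lives in a window.__PRELOADED_STATE__ JSON blob that is regexed out
# and json-parsed before being walked with try_get. The HTML snippet below is
# fabricated test input, not real vlive markup.
import json
import re

PARAMS_RE = r'window\.__PRELOADED_STATE__\s*=\s*({.*});?\s*</script>'

def preloaded_state(html):
    m = re.search(PARAMS_RE, html, flags=re.DOTALL)
    return json.loads(m.group(1)) if m else None

html = '<script>window.__PRELOADED_STATE__={"postDetail": {"post": {}}};</script>'
assert preloaded_state(html) == {'postDetail': {'post': {}}}
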
return { - 'title': video.get('title'), - 'creator': post.get('author', {}).get('nickname'), - 'channel': channel.get('channelName'), - 'channel_id': channel.get('channelCode'), - 'duration': int_or_none(video.get('playTime')), - 'view_count': int_or_none(video.get('playCount')), - 'like_count': int_or_none(video.get('likeCount')), - 'comment_count': int_or_none(video.get('commentCount')), - } + video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"], dict) - video_type = video.get('type') - if video_type == 'VOD': - inkey = self._call_api('video/v1.0/vod/%s/inkey', video_id)['inkey'] - vod_id = video['vodId'] - return merge_dicts( - get_common_fields(), - self._extract_video_info(video_id, vod_id, inkey)) - elif video_type == 'LIVE': - status = video.get('status') - if status == 'ON_AIR': - stream_url = self._call_api( - 'old/v3/live/%s/playInfo', - video_id)['result']['adaptiveStreamUrl'] - formats = self._extract_m3u8_formats(stream_url, video_id, 'mp4') - info = get_common_fields() - info.update({ - 'title': self._live_title(video['title']), - 'id': video_id, - 'formats': formats, - 'is_live': True, - }) - return info - elif status == 'ENDED': - raise ExtractorError( - 'Uploading for replay. Please wait...', expected=True) - elif status == 'RESERVED': - raise ExtractorError('Coming soon!', expected=True) - elif video.get('exposeStatus') == 'CANCEL': - raise ExtractorError( - 'We are sorry, but the live broadcast has been canceled.', - expected=True) + if video_params is None: + error = try_get(params, lambda x: x["postDetail"]["error"], dict) + error_data = try_get(error, lambda x: x["data"], dict) + error_video = try_get(error_data, lambda x: x["officialVideo"], dict) + error_msg = try_get(error, lambda x: x["message"], compat_str) + product_type = try_get(error_data, + [lambda x: x["officialVideo"]["productType"], + lambda x: x["board"]["boardType"]], + compat_str) + + if error_video is not None: + if product_type in ('VLIVE_PLUS', 'VLIVE+'): + self.raise_login_required('This video is only available with V LIVE+.') + elif error_msg is not None: + raise ExtractorError('V LIVE reported the following error: %s' % error_msg) + else: + raise ExtractorError('Failed to extract video parameters.') + elif 'post' in url: + raise ExtractorError('Url does not appear to be a video post.', expected=True) else: - raise ExtractorError('Unknown status ' + status) + raise ExtractorError('Failed to extract video parameters.') + + video_id = working_id if 'video' in url else str(video_params["videoSeq"]) + + video_type = video_params["type"] + if video_type in ('VOD'): + encoding_status = video_params["encodingStatus"] + if encoding_status == 'COMPLETE': + return self._replay(video_id, webpage, params, video_params) + else: + raise ExtractorError('VOD encoding not yet complete. Please try again later.', + expected=True) + elif video_type in ('LIVE'): + video_status = video_params["status"] + if video_status in ('RESERVED'): + raise ExtractorError('Coming soon!', expected=True) + elif video_status in ('ENDED', 'END'): + raise ExtractorError('Uploading for replay. 
Please wait...', expected=True) + else: + return self._live(video_id, webpage, params) + else: + raise ExtractorError('Unknown video type %s' % video_type) + + def _get_common_fields(self, webpage, params): + title = self._og_search_title(webpage) + description = self._html_search_meta( + ['og:description', 'description', 'twitter:description'], + webpage, 'description', default=None) + creator = (try_get(params, lambda x: x["channel"]["channel"]["channelName"], compat_str) + or self._search_regex(r'on (.*) channel', description or '', 'creator', fatal=False)) + thumbnail = self._og_search_thumbnail(webpage) + return { + 'title': title, + 'creator': creator, + 'thumbnail': thumbnail, + } + + def _live(self, video_id, webpage, params): + LIVE_INFO_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/old/v3/live/%s/playInfo' % video_id + play_info = self._download_json(LIVE_INFO_ENDPOINT, video_id, + headers={"referer": "https://www.vlive.tv"}) + + streams = try_get(play_info, lambda x: x["result"]["streamList"], list) or [] + + formats = [] + for stream in streams: + formats.extend(self._extract_m3u8_formats( + stream['serviceUrl'], video_id, 'mp4', + fatal=False, live=True)) + self._sort_formats(formats) + + info = self._get_common_fields(webpage, params) + info.update({ + 'title': self._live_title(info['title']), + 'id': video_id, + 'formats': formats, + 'is_live': True, + }) + return info + + def _replay(self, video_id, webpage, params, video_params): + long_video_id = video_params["vodId"] + + VOD_KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id + key_json = self._download_json(VOD_KEY_ENDPOINT, video_id, + headers={"referer": "https://www.vlive.tv"}) + key = key_json["inkey"] + + return merge_dicts( + self._get_common_fields(webpage, params), + self._extract_video_info(video_id, long_video_id, key)) -class VLiveChannelIE(VLiveBaseIE): +class VLiveChannelIE(InfoExtractor): IE_NAME = 'vlive:channel' - _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?(?:channels\.vlive\.tv/|vlive\.tv/channels?/)(?P<id>[0-9A-Z]+)' _TESTS = [{ - 'url': 'http://channels.vlive.tv/FCD4B', + 'url': 'https://channels.vlive.tv/FCD4B', 'info_dict': { 'id': 'FCD4B', 'title': 'MAMAMOO', @@ -185,39 +226,63 @@ class VLiveChannelIE(VLiveBaseIE): 'playlist_mincount': 110 }, { 'url': 'https://www.vlive.tv/channel/FCD4B', - 'only_matching': True, + 'info_dict': { + 'id': 'FCD4B', + 'title': 'MAMAMOO', + }, + 'playlist_mincount': 110 }] - - def _call_api(self, path, channel_key_suffix, channel_value, note, query): - q = { - 'app_id': self._APP_ID, - 'channel' + channel_key_suffix: channel_value, - } - q.update(query) - return self._download_json( - 'http://api.vfan.vlive.tv/vproxy/channelplus/' + path, - channel_value, note='Downloading ' + note, query=q)['result'] + _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' def _real_extract(self, url): channel_code = self._match_id(url) - channel_seq = self._call_api( - 'decodeChannelCode', 'Code', channel_code, - 'decode channel code', {})['channelSeq'] + webpage = self._download_webpage( + 'http://channels.vlive.tv/%s/video' % channel_code, channel_code) + app_id = None + + app_js_url = self._search_regex( + r'<script[^>]+src=(["\'])(?P<url>http.+?/app\.js.*?)\1', + webpage, 'app js', default=None, group='url') + + if app_js_url: + app_js = self._download_webpage( + app_js_url, channel_code, 'Downloading app JS', fatal=False) + if app_js: + app_id 
= self._search_regex( + r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]', + app_js, 'app id', default=None) + + app_id = app_id or self._APP_ID + + channel_info = self._download_json( + 'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode', + channel_code, note='Downloading decode channel code', + query={ + 'app_id': app_id, + 'channelCode': channel_code, + '_': int(time.time()) + }) + + channel_seq = channel_info['result']['channelSeq'] channel_name = None entries = [] for page_num in itertools.count(1): - video_list = self._call_api( - 'getChannelVideoList', 'Seq', channel_seq, - 'channel list page #%d' % page_num, { + video_list = self._download_json( + 'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList', + channel_code, note='Downloading channel list page #%d' % page_num, + query={ + 'app_id': app_id, + 'channelSeq': channel_seq, # Large values of maxNumOfRows (~300 or above) may cause # empty responses (see [1]), e.g. this happens for [2] that # has more than 300 videos. # 1. https://github.com/ytdl-org/youtube-dl/issues/13830 # 2. http://channels.vlive.tv/EDBF. 'maxNumOfRows': 100, + '_': int(time.time()), 'pageNo': page_num } ) @@ -225,11 +290,11 @@ def _real_extract(self, url): if not channel_name: channel_name = try_get( video_list, - lambda x: x['channelInfo']['channelName'], + lambda x: x['result']['channelInfo']['channelName'], compat_str) videos = try_get( - video_list, lambda x: x['videoList'], list) + video_list, lambda x: x['result']['videoList'], list) if not videos: break @@ -247,9 +312,7 @@ def _real_extract(self, url): entries, channel_code, channel_name) -# old extractor. Rewrite? - -class VLivePlaylistIE(VLiveBaseIE): +class VLivePlaylistIE(InfoExtractor): IE_NAME = 'vlive:playlist' _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)' _VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s' From 3462ffa8929d2a40588669578ca912d57a0da1bb Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Sun, 22 Nov 2020 03:51:09 +0530 Subject: [PATCH 105/124] Implemented all Youtube Feeds (ytfav, ytwatchlater, ytsubs, ythistory, ytrec) and SearchURL --- docs/supportedsites.md | 5 +- test/test_all_urls.py | 5 +- youtube_dlc/YoutubeDL.py | 2 +- youtube_dlc/extractor/extractors.py | 1 - youtube_dlc/extractor/youtube.py | 338 ++++++++++++++++------------ 5 files changed, 202 insertions(+), 149 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0481f7db9..860766f20 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1146,7 +1146,7 @@ # Supported sites - **YourPorn** - **YourUpload** - **youtube**: YouTube.com - - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication) + - **youtube:favorites**: YouTube.com liked videos, ":ytfav" or "LL" for short (requires authentication) - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) - **youtube:live**: YouTube.com live streams - **youtube:playlist**: YouTube.com playlists @@ -1154,11 +1154,10 @@ # Supported sites - **youtube:search**: YouTube.com searches - **youtube:search:date**: YouTube.com searches, newest videos first - **youtube:search_url**: YouTube.com search URLs - - **youtube:show**: YouTube.com (multi-season) shows - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - **YoutubeYtUser**: YouTube.com user videos (URL or "ytuser" keyword) - **youtube:tab**: YouTube.com tab - - 
**youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" or "WL" for short (requires authentication) - **Zapiks** - **Zaq1** - **Zattoo** diff --git a/test/test_all_urls.py b/test/test_all_urls.py index a44cf7549..4784c633f 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -35,6 +35,9 @@ def test_youtube_playlist_matching(self): assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585 assertPlaylist('PL63F0C78739B09958') + assertTab('https://www.youtube.com/AsapSCIENCE') + assertTab('https://www.youtube.com/embedded') + assertTab('https://www.youtube.com/feed') # Own channel's home page assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') @@ -47,7 +50,7 @@ def test_youtube_matching(self): self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M')) self.assertFalse(YoutubeIE.suitable('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) # 668 self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube']) - self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) + # self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) # /v/ is no longer valid self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube']) self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube']) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index bf02192eb..3c2970d9f 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -832,7 +832,7 @@ def extract_info(self, url, download=True, ie_key=None, info_dict=None, extra_in try: try: temp_id = ie.extract_id(url) if callable(getattr(ie, 'extract_id', None)) else ie._match_id(url) - except (AssertionError, IndexError): + except (AssertionError, IndexError, AttributeError): temp_id = None if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): self.to_screen("[%s] %s: has already been recorded in archive" % ( diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 9e832450a..ee52492dc 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -1514,7 +1514,6 @@ YoutubeSearchDateIE, YoutubeSearchIE, YoutubeSearchURLIE, - YoutubeShowIE, YoutubeSubscriptionsIE, YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index bbd9b2c4c..3f3f9c58b 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -34,7 +34,6 @@ get_element_by_id, int_or_none, mimetype2ext, - orderedSet, parse_codecs, parse_count, parse_duration, @@ -64,11 +63,16 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' + _RESERVED_NAMES = ( + r'course|embed|watch|w|results|storefront|' + r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|' + r'feed/(watch_later|history|subscriptions|library|trending|recommended)') + _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - _PLAYLIST_ID_RE = 
r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' + _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|LL|WL)' _YOUTUBE_CLIENT_HEADERS = { 'x-youtube-client-name': '1', @@ -2495,7 +2499,13 @@ def decrypt_sig(mobj): class YoutubeTabIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com tab' - _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|(?:playlist|watch)\?.*?\blist=)(?P<id>[^/?#&]+)' + # (?x)^ will cause warning in LiveIE. So I cant split this into multiple lines using ''' + _VALID_URL = ( + r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/' + r'(?:(?!(%s)([/#?]|$))|' + r'(?:channel|c|user)/|' + r'(?:playlist|watch)\?.*?\blist=)' + r'(?P<id>[^/?#&]+)') % YoutubeBaseInfoExtractor._RESERVED_NAMES IE_NAME = 'youtube:tab' _TESTS = [{ @@ -2692,8 +2702,10 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): @classmethod def suitable(cls, url): - return False if YoutubeLiveIE.suitable(url) else super( - YoutubeTabIE, cls).suitable(url) + IGNORE = (YoutubeLiveIE,) + return ( + False if any(ie.suitable(url) for ie in IGNORE) + else super(YoutubeTabIE, cls).suitable(url)) def _extract_channel_id(self, webpage): channel_id = self._html_search_meta( @@ -2808,6 +2820,26 @@ def _playlist_entries(self, video_list_renderer): continue yield self._extract_video(renderer) + def _itemSection_entries(self, item_sect_renderer): + for content in item_sect_renderer['contents']: + if not isinstance(content, dict): + continue + renderer = content.get('videoRenderer', {}) + if not isinstance(renderer, dict): + continue + video_id = renderer.get('videoId') + if not video_id: + continue + yield self._extract_video(renderer) + + def _rich_entries(self, rich_grid_renderer): + renderer = try_get( + rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) + video_id = renderer.get('videoId') + if not video_id: + return + yield self._extract_video(renderer) + def _video_entry(self, video_renderer): video_id = video_renderer.get('videoId') if video_id: @@ -2899,49 +2931,67 @@ def _extract_continuation(cls, renderer): } def _entries(self, tab, identity_token): - continuation = None - slr_contents = try_get(tab, lambda x: x['sectionListRenderer']['contents'], list) or [] - for slr_content in slr_contents: - if not isinstance(slr_content, dict): - continue - is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) - if not is_renderer: - continue - isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] - for isr_content in isr_contents: - if not isinstance(isr_content, dict): - continue - renderer = isr_content.get('playlistVideoListRenderer') - if renderer: - for entry in self._playlist_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('gridRenderer') - if renderer: - for entry in self._grid_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('shelfRenderer') - if renderer: - for entry in self._shelf_entries(renderer): - yield entry - continue - renderer = isr_content.get('backstagePostThreadRenderer') - if renderer: - for entry in self._post_thread_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('videoRenderer') - if renderer: - entry = self._video_entry(renderer) - if entry: - yield entry - if not continuation: - continuation = 
self._extract_continuation(is_renderer) + def extract_entries(parent_renderer): + slr_contents = try_get(parent_renderer, lambda x: x['contents'], list) or [] + for slr_content in slr_contents: + if not isinstance(slr_content, dict): + continue + is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) + if not is_renderer: + renderer = slr_content.get('richItemRenderer') + if renderer: + for entry in self._rich_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(parent_renderer) + continue + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] + for isr_content in isr_contents: + if not isinstance(isr_content, dict): + continue + renderer = isr_content.get('playlistVideoListRenderer') + if renderer: + for entry in self._playlist_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(renderer) + continue + renderer = isr_content.get('gridRenderer') + if renderer: + for entry in self._grid_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(renderer) + continue + renderer = isr_content.get('shelfRenderer') + if renderer: + for entry in self._shelf_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(parent_renderer) + continue + renderer = isr_content.get('backstagePostThreadRenderer') + if renderer: + for entry in self._post_thread_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(renderer) + continue + renderer = isr_content.get('videoRenderer') + if renderer: + entry = self._video_entry(renderer) + if entry: + yield entry + if not continuation_list[0]: + continuation_list[0] = self._extract_continuation(is_renderer) + if not continuation_list[0]: + continuation_list[0] = self._extract_continuation(parent_renderer) + + continuation_list = [None] # Python 2 doesnot support nonlocal + parent_renderer = ( + try_get(tab, lambda x: x['sectionListRenderer'], dict) + or try_get(tab, lambda x: x['richGridRenderer'], dict) or {}) + if parent_renderer: + for entry in extract_entries(parent_renderer): + yield entry + + continuation = continuation_list[0] headers = { 'x-youtube-client-name': '1', @@ -2953,6 +3003,8 @@ def _entries(self, tab, identity_token): for page_num in itertools.count(1): if not continuation: break + if hasattr(self, '_MAX_PAGES') and page_num > self._MAX_PAGES: + break browse = self._download_json( 'https://www.youtube.com/browse_ajax', None, 'Downloading page %d' % page_num, @@ -2984,6 +3036,13 @@ def _entries(self, tab, identity_token): yield entry continuation = self._extract_continuation(continuation_renderer) continue + continuation_renderer = continuation_contents.get('sectionListContinuation') + if continuation_renderer: + continuation_list = [None] + for entry in extract_entries(continuation_renderer): + yield entry + continuation = continuation_list[0] + continue continuation_items = try_get( response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list) @@ -2998,7 +3057,12 @@ def _entries(self, tab, identity_token): yield entry continuation = self._extract_continuation(video_list_renderer) continue - + renderer = continuation_item.get('itemSectionRenderer') + if renderer: + for entry in self._itemSection_entries(renderer): + yield entry + continuation = self._extract_continuation({'contents': continuation_items}) + continue break @staticmethod @@ -3036,6 +3100,7 @@ def _extract_from_tabs(self, item_id, webpage, data, 
tabs, identity_token): selected_tab = self._extract_selected_tab(tabs) renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) + playlist_id = None if renderer: channel_title = renderer.get('title') or item_id tab_title = selected_tab.get('title') @@ -3050,6 +3115,8 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): title = renderer.get('title') description = None playlist_id = item_id + if playlist_id is None: + return None playlist = self.playlist_result( self._entries(selected_tab['content'], identity_token), playlist_id=playlist_id, playlist_title=title, @@ -3214,7 +3281,7 @@ def _real_extract(self, url): class YoutubeLiveIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com live streams' - _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live' + _VALID_URL = r'(?P<base_url>%s)/live' % YoutubeTabIE._VALID_URL IE_NAME = 'youtube:live' _TESTS = [{ @@ -3361,12 +3428,42 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _SEARCH_PARAMS = 'CAI%3D' -class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): +class YoutubeSearchURLIE(InfoExtractor): + IE_DESC = 'YouTube.com search URLs' + IE_NAME = 'youtube:search_url' + _PARAM_REGEX = r'' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/results/?(?:\?|\?[^#]*?&)(?:sp=(?P<param1>[^&#]+)&(?:[^#]*&)?)?(?:q|search_query)=(?P<query>[^#&]+)(?:[^#]*?&sp=(?P<param2>[^#&]+))?' + _MAX_RESULTS = 100 + _TESTS = [{ + 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', + 'playlist_mincount': 5, + 'info_dict': { + 'title': 'youtube-dl test video', + } + }, { + 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + query = compat_urllib_parse_unquote_plus(mobj.group('query')) + IE = YoutubeSearchIE(self._downloader) + IE._SEARCH_PARAMS = mobj.group('param1') or mobj.group('param2') + self._downloader.to_screen(IE._SEARCH_PARAMS) + IE._MAX_RESULTS = self._MAX_RESULTS + return IE._get_n_results(query, self._MAX_RESULTS) + + +class YoutubeFeedsInfoExtractor(YoutubeTabIE): """ Base class for feed extractors Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. 
""" _LOGIN_REQUIRED = True + _TESTS = [] + + # _MAX_PAGES = 5 @property def IE_NAME(self): @@ -3375,50 +3472,39 @@ def IE_NAME(self): def _real_initialize(self): self._login() - def _entries(self, page): - # The extraction process is the same as for playlists, but the regex - # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page - for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) + def _shelf_entries(self, shelf_renderer): + renderer = try_get(shelf_renderer, lambda x: x['content']['gridRenderer'], dict) + if not renderer: + return + for entry in self._grid_entries(renderer): + yield entry - # 'recommended' feed has infinite 'load more' and each new portion spins - # the same videos in (sometimes) slightly different order, so we'll check - # for unicity and break when portion has no new videos - new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) - if not new_ids: - break - - ids.extend(new_ids) - - for entry in self._ids_to_results(new_ids): - yield entry - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape, - headers=self._YOUTUBE_CLIENT_HEADERS) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] + def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): + selected_tab = self._extract_selected_tab(tabs) + return self.playlist_result( + self._entries(selected_tab['content'], identity_token), + playlist_title=self._PLAYLIST_TITLE) def _real_extract(self, url): - page = self._download_webpage( - 'https://www.youtube.com/feed/%s' % self._FEED_NAME, - self._PLAYLIST_TITLE) - return self.playlist_result( - self._entries(page), playlist_title=self._PLAYLIST_TITLE) + item_id = self._FEED_NAME + url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME + webpage = self._download_webpage(url, item_id) + identity_token = self._search_regex( + r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, + 'identity token', default=None) + data = self._extract_yt_initial_data(item_id, webpage) + tabs = try_get( + data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) + if tabs: + return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token) + # Failed to recognize + raise ExtractorError('Unable to recognize feed page') -class YoutubeWatchLaterIE(InfoExtractor): - IE_NAME = 'youtube:watchlater' - IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' +class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'Youtube watch later list, ":ytwatchlater" or "WL" for short (requires authentication)' _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater' + _FEED_NAME = 'watchlater' _TESTS = [{ 'url': 'https://www.youtube.com/feed/watch_later', @@ -3429,25 +3515,33 @@ class YoutubeWatchLaterIE(InfoExtractor): }] def _real_extract(self, url): - return self.url_result( - 'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key()) - _, video = self._check_download_just_video(url, 'WL') - if video: - return video - _, playlist = self._extract_playlist('WL') - return playlist + return self.url_result('WL', ie=YoutubePlaylistIE.ie_key()) + + +class 
YoutubeFavouritesIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'YouTube.com liked videos, ":ytfav" or "LL" for short (requires authentication)' + _VALID_URL = r':ytfav(?:ou?rite)s?' + _FEED_NAME = 'favourites' + + _TESTS = [{ + 'url': ':ytfav', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result('LL', ie=YoutubePlaylistIE.ie_key()) class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?' + _VALID_URL = r'https?://(?:www\.)?youtube\.com(?:/feed/recommended|/?[?#]|/?$)|:ytrec(?:ommended)?' _FEED_NAME = 'recommended' _PLAYLIST_TITLE = 'Youtube Recommended videos' class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsub(?:scription)?s?' _FEED_NAME = 'subscriptions' _PLAYLIST_TITLE = 'Youtube Subscriptions' @@ -3525,40 +3619,9 @@ def _real_extract(self, url): expected=True) -# Old extractors. Are these cases handled elsewhere? - -class YoutubeSearchURLIE(YoutubeSearchIE): - IE_DESC = 'YouTube.com search URLs' - IE_NAME = 'youtube:search_url' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' - _TESTS = [{ - 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', - 'playlist_mincount': 5, - 'info_dict': { - 'title': 'youtube-dl test video', - } - }, { - 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', - 'only_matching': True, - }] - - def _process_json_dict(self, obj, videos, c): - if "videoId" in obj: - videos.append(obj) - return - - if "nextContinuationData" in obj: - c["continuation"] = obj["nextContinuationData"] - return - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - return self.playlist_result(self._entries(webpage, query, max_pages=5), playlist_title=query) - - -class YoutubeShowIE(InfoExtractor): +# Do Youtube show urls even exist anymore? I couldn't find any +r''' +class YoutubeShowIE(YoutubeTabIE): IE_DESC = 'YouTube.com (multi-season) shows' _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)' IE_NAME = 'youtube:show' @@ -3575,15 +3638,4 @@ def _real_extract(self, url): playlist_id = self._match_id(url) return super(YoutubeShowIE, self)._real_extract( 'https://www.youtube.com/show/%s/playlists' % playlist_id) - - -class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): - IE_NAME = 'youtube:favorites' - IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' 
- _LOGIN_REQUIRED = True - - def _real_extract(self, url): - webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') - playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id') - return self.url_result(playlist_id, 'YoutubePlaylist') +''' From a0566bbf5c3dea282447efb2926d71bafe1b7720 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Sat, 21 Nov 2020 20:20:42 +0530 Subject: [PATCH 106/124] Updated to release 2020.11.21.1 --- test/parameters.json | 2 +- test/test_YoutubeDL.py | 70 +++++++++ youtube_dlc/YoutubeDL.py | 79 ++++++---- youtube_dlc/compat.py | 2 +- youtube_dlc/downloader/http.py | 4 +- youtube_dlc/extractor/amara.py | 103 +++++++++++++ youtube_dlc/extractor/brightcove.py | 8 +- youtube_dlc/extractor/common.py | 2 +- youtube_dlc/extractor/discoverynetworks.py | 5 +- youtube_dlc/extractor/europa.py | 4 +- youtube_dlc/extractor/extractors.py | 2 +- youtube_dlc/extractor/francetv.py | 2 +- youtube_dlc/extractor/generic.py | 2 +- youtube_dlc/extractor/googledrive.py | 58 +++---- youtube_dlc/extractor/infoq.py | 7 +- youtube_dlc/extractor/kusi.py | 4 +- youtube_dlc/extractor/npr.py | 2 +- youtube_dlc/extractor/pbs.py | 2 +- youtube_dlc/extractor/rai.py | 132 +++++----------- youtube_dlc/extractor/soundcloud.py | 2 +- youtube_dlc/extractor/svt.py | 48 ++++-- youtube_dlc/extractor/tagesschau.py | 2 +- youtube_dlc/extractor/theplatform.py | 2 +- youtube_dlc/extractor/turner.py | 6 +- youtube_dlc/extractor/viki.py | 171 ++++++++++++--------- youtube_dlc/extractor/vimeo.py | 4 +- youtube_dlc/extractor/xiami.py | 8 +- youtube_dlc/extractor/youtube.py | 170 +++++++++++--------- youtube_dlc/utils.py | 16 +- 29 files changed, 559 insertions(+), 360 deletions(-) create mode 100644 youtube_dlc/extractor/amara.py diff --git a/test/parameters.json b/test/parameters.json index 7bf59c25f..65fd54428 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -37,7 +37,7 @@ "writeinfojson": true, "writesubtitles": false, "allsubtitles": false, - "listssubtitles": false, + "listsubtitles": false, "socket_timeout": 20, "fixup": "never" } diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 6d02c2a54..a9e649191 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -919,6 +919,76 @@ def _real_extract(self, url): self.assertEqual(downloaded['extractor'], 'testex') self.assertEqual(downloaded['extractor_key'], 'TestEx') + # Test case for https://github.com/ytdl-org/youtube-dl/issues/27064 + def test_ignoreerrors_for_playlist_with_url_transparent_iterable_entries(self): + + class _YDL(YDL): + def __init__(self, *args, **kwargs): + super(_YDL, self).__init__(*args, **kwargs) + + def trouble(self, s, tb=None): + pass + + ydl = _YDL({ + 'format': 'extra', + 'ignoreerrors': True, + }) + + class VideoIE(InfoExtractor): + _VALID_URL = r'video:(?P<id>\d+)' + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = [{ + 'format_id': 'default', + 'url': 'url:', + }] + if video_id == '0': + raise ExtractorError('foo') + if video_id == '2': + formats.append({ + 'format_id': 'extra', + 'url': TEST_URL, + }) + return { + 'id': video_id, + 'title': 'Video %s' % video_id, + 'formats': formats, + } + + class PlaylistIE(InfoExtractor): + _VALID_URL = r'playlist:' + + def _entries(self): + for n in range(3): + video_id = compat_str(n) + yield { + '_type': 'url_transparent', + 'ie_key': VideoIE.ie_key(), + 'id': video_id, + 'url': 'video:%s' % video_id, + 'title': 'Video Transparent 
%s' % video_id, + } + + def _real_extract(self, url): + return self.playlist_result(self._entries()) + + ydl.add_info_extractor(VideoIE(ydl)) + ydl.add_info_extractor(PlaylistIE(ydl)) + info = ydl.extract_info('playlist:') + entries = info['entries'] + self.assertEqual(len(entries), 3) + self.assertTrue(entries[0] is None) + self.assertTrue(entries[1] is None) + self.assertEqual(len(ydl.downloaded_info_dicts), 1) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(entries[2], downloaded) + self.assertEqual(downloaded['url'], TEST_URL) + self.assertEqual(downloaded['title'], 'Video Transparent 2') + self.assertEqual(downloaded['id'], '2') + self.assertEqual(downloaded['extractor'], 'Video') + self.assertEqual(downloaded['extractor_key'], 'Video') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index 3c2970d9f..ef6fe0a78 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -830,34 +830,23 @@ def extract_info(self, url, download=True, ie_key=None, info_dict=None, extra_in 'and will probably not work.') try: - try: - temp_id = ie.extract_id(url) if callable(getattr(ie, 'extract_id', None)) else ie._match_id(url) - except (AssertionError, IndexError, AttributeError): - temp_id = None - if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): - self.to_screen("[%s] %s: has already been recorded in archive" % ( - ie_key, temp_id)) - break + temp_id = ie.extract_id(url) if callable(getattr(ie, 'extract_id', None)) else ie._match_id(url) + except (AssertionError, IndexError, AttributeError): + temp_id = None + if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): + self.to_screen("[%s] %s: has already been recorded in archive" % ( + ie_key, temp_id)) + break - ie_result = ie.extract(url) - if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) - break - if isinstance(ie_result, list): - # Backwards compatibility: old IE result format - ie_result = { - '_type': 'compat_list', - 'entries': ie_result, - } - if info_dict: - if info_dict.get('id'): - ie_result['id'] = info_dict['id'] - if info_dict.get('title'): - ie_result['title'] = info_dict['title'] - self.add_default_extra_info(ie_result, ie, url) - if process: - return self.process_ie_result(ie_result, download, extra_info) - else: - return ie_result + return self.__extract_info(url, ie, download, extra_info, process, info_dict) + + else: + self.report_error('no suitable InfoExtractor for URL %s' % url) + + def __handle_extraction_exceptions(func): + def wrapper(self, *args, **kwargs): + try: + return func(self, *args, **kwargs) except GeoRestrictedError as e: msg = e.msg if e.countries: @@ -865,20 +854,38 @@ def extract_info(self, url, download=True, ie_key=None, info_dict=None, extra_in map(ISO3166Utils.short2full, e.countries)) msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' 
self.report_error(msg) - break except ExtractorError as e: # An error we somewhat expected self.report_error(compat_str(e), e.format_traceback()) - break except MaxDownloadsReached: raise except Exception as e: if self.params.get('ignoreerrors', False): self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) - break else: raise + return wrapper + + @__handle_extraction_exceptions + def __extract_info(self, url, ie, download, extra_info, process, info_dict): + ie_result = ie.extract(url) + if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) + return + if isinstance(ie_result, list): + # Backwards compatibility: old IE result format + ie_result = { + '_type': 'compat_list', + 'entries': ie_result, + } + if info_dict: + if info_dict.get('id'): + ie_result['id'] = info_dict['id'] + if info_dict.get('title'): + ie_result['title'] = info_dict['title'] + self.add_default_extra_info(ie_result, ie, url) + if process: + return self.process_ie_result(ie_result, download, extra_info) else: - self.report_error('no suitable InfoExtractor for URL %s' % url) + return ie_result def add_default_extra_info(self, ie_result, ie, url): self.add_extra_info(ie_result, { @@ -1057,9 +1064,8 @@ def report_download(num_entries): self.to_screen('[download] ' + reason) continue - entry_result = self.process_ie_result(entry, - download=download, - extra_info=extra) + entry_result = self.__process_iterable_entry(entry, download, extra) + # TODO: skip failed (empty) entries? playlist_results.append(entry_result) ie_result['entries'] = playlist_results self.to_screen('[download] Finished downloading playlist: %s' % playlist) @@ -1088,6 +1094,11 @@ def _fixup(r): else: raise Exception('Invalid result type: %s' % result_type) + @__handle_extraction_exceptions + def __process_iterable_entry(self, entry, download, extra_info): + return self.process_ie_result( + entry, download=download, extra_info=extra_info) + def _build_format_filter(self, filter_spec): " Returns a function to filter the formats according to the filter_spec " diff --git a/youtube_dlc/compat.py b/youtube_dlc/compat.py index 1cf7efed6..ac889ddd7 100644 --- a/youtube_dlc/compat.py +++ b/youtube_dlc/compat.py @@ -2345,7 +2345,7 @@ def __init__(self, version, name, value, *args, **kwargs): # HTMLParseError has been deprecated in Python 3.3 and removed in # Python 3.5. Introducing dummy exception for Python >3.5 for compatible - # and uniform cross-version exceptiong handling + # and uniform cross-version exception handling class compat_HTMLParseError(Exception): pass diff --git a/youtube_dlc/downloader/http.py b/youtube_dlc/downloader/http.py index 96379caf1..d8ac41dcc 100644 --- a/youtube_dlc/downloader/http.py +++ b/youtube_dlc/downloader/http.py @@ -109,7 +109,9 @@ def establish_connection(): try: ctx.data = self.ydl.urlopen(request) except (compat_urllib_error.URLError, ) as err: - if isinstance(err.reason, socket.timeout): + # reason may not be available, e.g. 
for urllib2.HTTPError on python 2.6 + reason = getattr(err, 'reason', None) + if isinstance(reason, socket.timeout): raise RetryDownload(err) raise err # When trying to resume, Content-Range HTTP header of response has to be checked diff --git a/youtube_dlc/extractor/amara.py b/youtube_dlc/extractor/amara.py new file mode 100644 index 000000000..61d469574 --- /dev/null +++ b/youtube_dlc/extractor/amara.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .youtube import YoutubeIE +from .vimeo import VimeoIE +from ..utils import ( + int_or_none, + parse_iso8601, + update_url_query, +) + + +class AmaraIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)' + _TESTS = [{ + # Youtube + 'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video', + 'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae', + 'info_dict': { + 'id': 'h6ZuVdvYnfE', + 'ext': 'mp4', + 'title': 'Why jury trials are becoming less common', + 'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20160813', + 'uploader': 'PBS NewsHour', + 'uploader_id': 'PBSNewsHour', + 'timestamp': 1549639570, + } + }, { + # Vimeo + 'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011', + 'md5': '99392c75fa05d432a8f11df03612195e', + 'info_dict': { + 'id': '18622084', + 'ext': 'mov', + 'title': 'Vimeo at CES 2011!', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'timestamp': 1294763658, + 'upload_date': '20110111', + 'uploader': 'Sam Morrill', + 'uploader_id': 'sammorrill' + } + }, { + # Direct Link + 'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/', + 'md5': 'd3970f08512738ee60c5807311ff5d3f', + 'info_dict': { + 'id': 's8KL7I3jLmh6', + 'ext': 'mp4', + 'title': 'The danger of a single story', + 'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20091007', + 'timestamp': 1254942511, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._download_json( + 'https://amara.org/api/videos/%s/' % video_id, + video_id, query={'format': 'json'}) + title = meta['title'] + video_url = meta['all_urls'][0] + + subtitles = {} + for language in (meta.get('languages') or []): + subtitles_uri = language.get('subtitles_uri') + if not (subtitles_uri and language.get('published')): + continue + subtitle = subtitles.setdefault(language.get('code') or 'en', []) + for f in ('json', 'srt', 'vtt'): + subtitle.append({ + 'ext': f, + 'url': update_url_query(subtitles_uri, {'format': f}), + }) + + info = { + 'url': video_url, + 'id': video_id, + 'subtitles': subtitles, + 'title': title, + 'description': meta.get('description'), + 'thumbnail': meta.get('thumbnail'), + 'duration': int_or_none(meta.get('duration')), + 'timestamp': parse_iso8601(meta.get('created')), + } + + for ie in (YoutubeIE, VimeoIE): + if ie.suitable(video_url): + info.update({ + '_type': 'url_transparent', + 'ie_key': ie.ie_key(), + }) + break + + return info diff --git a/youtube_dlc/extractor/brightcove.py b/youtube_dlc/extractor/brightcove.py index 638673c31..c6ca939dd 100644 --- a/youtube_dlc/extractor/brightcove.py +++ b/youtube_dlc/extractor/brightcove.py @@ -147,7 +147,7 @@ class BrightcoveLegacyIE(InfoExtractor): ] @classmethod - 
def _build_brighcove_url(cls, object_str): + def _build_brightcove_url(cls, object_str): """ Build a Brightcove url from a xml string containing <object class="BrightcoveExperience">{params}</object> @@ -217,7 +217,7 @@ def find_param(name): return cls._make_brightcove_url(params) @classmethod - def _build_brighcove_url_from_js(cls, object_js): + def _build_brightcove_url_from_js(cls, object_js): # The layout of JS is as follows: # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) { # // build Brightcove <object /> XML @@ -272,12 +272,12 @@ def _extract_brightcove_urls(cls, webpage): ).+?>\s*</object>''', webpage) if matches: - return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) + return list(filter(None, [cls._build_brightcove_url(m) for m in matches])) matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage) if matches: return list(filter(None, [ - cls._build_brighcove_url_from_js(custom_bc) + cls._build_brightcove_url_from_js(custom_bc) for custom_bc in matches])) return [src for _, src in re.findall( r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)] diff --git a/youtube_dlc/extractor/common.py b/youtube_dlc/extractor/common.py index f90cf36ed..2bc94acdd 100644 --- a/youtube_dlc/extractor/common.py +++ b/youtube_dlc/extractor/common.py @@ -1664,7 +1664,7 @@ def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None, # just the media without qualities renditions. # Fortunately, master playlist can be easily distinguished from media # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4] - # master playlist tags MUST NOT appear in a media playist and vice versa. + # master playlist tags MUST NOT appear in a media playlist and vice versa. # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every # media playlist and MUST NOT appear in master playlist thus we can # clearly detect media playlist with this criterion. 
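The typo fix in the hunk above also restates the criterion these extractors rely on for telling an HLS master playlist apart from a media playlist. As a minimal standalone sketch of that criterion (a hypothetical helper, not the code path actually used in common.py), the whole check reduces to looking for the one tag that the spec cited there as [1, 4.3.3.1] requires in every media playlist:

    def is_media_playlist(m3u8_doc):
        # Hypothetical sketch: #EXT-X-TARGETDURATION is REQUIRED in every
        # media playlist and MUST NOT appear in a master playlist, so its
        # presence alone marks a media playlist.
        return any(line.startswith('#EXT-X-TARGETDURATION')
                   for line in m3u8_doc.splitlines())
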
diff --git a/youtube_dlc/extractor/discoverynetworks.py b/youtube_dlc/extractor/discoverynetworks.py index 607a54948..c512b95d0 100644 --- a/youtube_dlc/extractor/discoverynetworks.py +++ b/youtube_dlc/extractor/discoverynetworks.py @@ -7,7 +7,7 @@ class DiscoveryNetworksDeIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show)/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)' _TESTS = [{ 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', @@ -29,6 +29,9 @@ class DiscoveryNetworksDeIE(DPlayIE): }, { 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', 'only_matching': True, + }, { + 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dlc/extractor/europa.py b/youtube_dlc/extractor/europa.py index 1efc0b2ec..2c1c747a1 100644 --- a/youtube_dlc/extractor/europa.py +++ b/youtube_dlc/extractor/europa.py @@ -60,7 +60,7 @@ def get_item(type_, preference): title = get_item('title', preferred_langs) or video_id description = get_item('description', preferred_langs) - thumbnmail = xpath_text(playlist, './info/thumburl', 'thumbnail') + thumbnail = xpath_text(playlist, './info/thumburl', 'thumbnail') upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date')) duration = parse_duration(xpath_text(playlist, './info/duration', 'duration')) view_count = int_or_none(xpath_text(playlist, './info/views', 'views')) @@ -85,7 +85,7 @@ def get_item(type_, preference): 'id': video_id, 'title': title, 'description': description, - 'thumbnail': thumbnmail, + 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index ee52492dc..15522f942 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -36,6 +36,7 @@ from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE +from .amara import AmaraIE from .alura import ( AluraIE, AluraCourseIE @@ -1507,7 +1508,6 @@ YoutubeIE, YoutubeFavouritesIE, YoutubeHistoryIE, - YoutubeLiveIE, YoutubeTabIE, YoutubePlaylistIE, YoutubeRecommendedIE, diff --git a/youtube_dlc/extractor/francetv.py b/youtube_dlc/extractor/francetv.py index dbedfc091..ab0df1bed 100644 --- a/youtube_dlc/extractor/francetv.py +++ b/youtube_dlc/extractor/francetv.py @@ -211,7 +211,7 @@ def sign(manifest_url, manifest_id): 'id': video_id, 'title': self._live_title(title) if is_live else title, 'description': clean_html(info.get('synopsis')), - 'thumbnail': urljoin('http://pluzz.francetv.fr', info.get('image')), + 'thumbnail': urljoin('https://sivideo.webservices.francetelevisions.fr', info.get('image')), 'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')), 'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])), 'is_live': is_live, diff --git a/youtube_dlc/extractor/generic.py b/youtube_dlc/extractor/generic.py index ce8cac5c1..db4d3a933 100644 --- a/youtube_dlc/extractor/generic.py +++ b/youtube_dlc/extractor/generic.py @@ -842,7 +842,7 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, - # MTVSercices embed 
+ # MTVServices embed { 'url': 'http://www.vulture.com/2016/06/new-key-peele-sketches-released.html', 'md5': 'ca1aef97695ef2c1d6973256a57e5252', diff --git a/youtube_dlc/extractor/googledrive.py b/youtube_dlc/extractor/googledrive.py index ec0d58a57..fdb15795a 100644 --- a/youtube_dlc/extractor/googledrive.py +++ b/youtube_dlc/extractor/googledrive.py @@ -3,11 +3,13 @@ import re from .common import InfoExtractor +from ..compat import compat_parse_qs from ..utils import ( determine_ext, ExtractorError, int_or_none, lowercase_escape, + try_get, update_url_query, ) @@ -38,21 +40,10 @@ class GoogleDriveIE(InfoExtractor): # video can't be watched anonymously due to view count limit reached, # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046) 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view', - 'md5': 'bfbd670d03a470bb1e6d4a257adec12e', - 'info_dict': { - 'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ', - 'ext': 'mp4', - 'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4', - } + 'only_matching': True, }, { # video id is longer than 28 characters 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', - 'info_dict': { - 'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ', - 'ext': 'mp4', - 'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4', - 'duration': 189, - }, 'only_matching': True, }, { 'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28', @@ -171,23 +162,21 @@ def _get_automatic_captions(self, video_id, subtitles_id, hl): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://docs.google.com/file/d/%s' % video_id, video_id) + video_info = compat_parse_qs(self._download_webpage( + 'https://drive.google.com/get_video_info', + video_id, query={'docid': video_id})) - title = self._search_regex( - r'"title"\s*,\s*"([^"]+)', webpage, 'title', - default=None) or self._og_search_title(webpage) - duration = int_or_none(self._search_regex( - r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', - default=None)) + def get_value(key): + return try_get(video_info, lambda x: x[key][0]) + + reason = get_value('reason') + title = get_value('title') + if not title and reason: + raise ExtractorError(reason, expected=True) formats = [] - fmt_stream_map = self._search_regex( - r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, - 'fmt stream map', default='').split(',') - fmt_list = self._search_regex( - r'"fmt_list"\s*,\s*"([^"]+)', webpage, - 'fmt_list', default='').split(',') + fmt_stream_map = (get_value('fmt_stream_map') or '').split(',') + fmt_list = (get_value('fmt_list') or '').split(',') if fmt_stream_map and fmt_list: resolutions = {} for fmt in fmt_list: @@ -257,19 +246,14 @@ def add_source_format(urlh): if urlh and urlh.headers.get('Content-Disposition'): add_source_format(urlh) - if not formats: - reason = self._search_regex( - r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) - if reason: - raise ExtractorError(reason, expected=True) + if not formats and reason: + raise ExtractorError(reason, expected=True) self._sort_formats(formats) - hl = self._search_regex( - r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None) + hl = get_value('hl') subtitles_id = None - ttsurl = self._search_regex( - r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None) + ttsurl = get_value('ttsurl') if ttsurl: # the video Id for subtitles will be the last value in the ttsurl # query string @@ -281,8 +265,8 @@ def add_source_format(urlh): return { 'id': 
video_id, 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'duration': duration, + 'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id, + 'duration': int_or_none(get_value('length_seconds')), 'formats': formats, 'subtitles': self.extract_subtitles(video_id, subtitles_id, hl), 'automatic_captions': self.extract_automatic_captions( diff --git a/youtube_dlc/extractor/infoq.py b/youtube_dlc/extractor/infoq.py index 18249cf9b..0a70a1fb4 100644 --- a/youtube_dlc/extractor/infoq.py +++ b/youtube_dlc/extractor/infoq.py @@ -54,7 +54,7 @@ class InfoQIE(BokeCCBaseIE): def _extract_rtmp_video(self, webpage): # The server URL is hardcoded - video_url = 'rtmpe://video.infoq.com/cfx/st/' + video_url = 'rtmpe://videof.infoq.com/cfx/st/' # Extract video URL encoded_id = self._search_regex( @@ -86,17 +86,18 @@ def _extract_http_video(self, webpage): return [{ 'format_id': 'http_video', 'url': http_video_url, + 'http_headers': {'Referer': 'https://www.infoq.com/'}, }] def _extract_http_audio(self, webpage, video_id): - fields = self._hidden_inputs(webpage) + fields = self._form_hidden_inputs('mp3Form', webpage) http_audio_url = fields.get('filename') if not http_audio_url: return [] # base URL is found in the Location header in the response returned by # GET https://www.infoq.com/mp3download.action?filename=... when logged in. - http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url) + http_audio_url = compat_urlparse.urljoin('http://ress.infoq.com/downloads/mp3downloads/', http_audio_url) http_audio_url = update_url_query(http_audio_url, self._extract_cf_auth(webpage)) # audio file seem to be missing some times even if there is a download link diff --git a/youtube_dlc/extractor/kusi.py b/youtube_dlc/extractor/kusi.py index 6a7e3baa7..9833d35eb 100644 --- a/youtube_dlc/extractor/kusi.py +++ b/youtube_dlc/extractor/kusi.py @@ -64,7 +64,7 @@ def _real_extract(self, url): duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000) description = xpath_text(doc, 'ABSTRACT') thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME') - createtion_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) + creation_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content') formats = [] @@ -84,5 +84,5 @@ def _real_extract(self, url): 'duration': duration, 'formats': formats, 'thumbnail': thumbnail, - 'timestamp': createtion_time, + 'timestamp': creation_time, } diff --git a/youtube_dlc/extractor/npr.py b/youtube_dlc/extractor/npr.py index 53acc6e57..9d1122f0c 100644 --- a/youtube_dlc/extractor/npr.py +++ b/youtube_dlc/extractor/npr.py @@ -33,7 +33,7 @@ class NprIE(InfoExtractor): }, }], }, { - # mutlimedia, not media title + # multimedia, not media title 'url': 'https://www.npr.org/2017/06/19/533198237/tigers-jaw-tiny-desk-concert', 'info_dict': { 'id': '533198237', diff --git a/youtube_dlc/extractor/pbs.py b/youtube_dlc/extractor/pbs.py index 4dbe661be..d4baa16ee 100644 --- a/youtube_dlc/extractor/pbs.py +++ b/youtube_dlc/extractor/pbs.py @@ -477,7 +477,7 @@ def _extract_webpage(self, url): if media_id: return media_id, presumptive_id, upload_date, description - # Fronline video embedded via flp + # Frontline video embedded via flp video_id = self._search_regex( r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None) if video_id: diff --git a/youtube_dlc/extractor/rai.py 
b/youtube_dlc/extractor/rai.py index a0836bf58..5eef7c633 100644 --- a/youtube_dlc/extractor/rai.py +++ b/youtube_dlc/extractor/rai.py @@ -16,8 +16,9 @@ GeoRestrictedError, int_or_none, parse_duration, + remove_start, strip_or_none, - unescapeHTML, + try_get, unified_strdate, unified_timestamp, update_url_query, @@ -67,7 +68,7 @@ def _extract_relinker_info(self, relinker_url, video_id): # This does not imply geo restriction (e.g. # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) - if media_url == 'http://download.rai.it/video_no_available.mp4': + if '/video_no_available.mp4' in media_url: continue ext = determine_ext(media_url) @@ -122,27 +123,8 @@ def _extract_subtitles(url, subtitle_url): class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.(?:html|json))' % RaiBaseIE._UUID_RE + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s))\.(?:html|json)' % RaiBaseIE._UUID_RE _TESTS = [{ - 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', - 'md5': '340aa3b7afb54bfd14a8c11786450d76', - 'info_dict': { - 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', - 'ext': 'mp4', - 'title': 'La Casa Bianca', - 'alt_title': 'S2016 - Puntata del 23/10/2016', - 'description': 'md5:a09d45890850458077d1f68bb036e0a5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 3', - 'creator': 'Rai 3', - 'duration': 3278, - 'timestamp': 1477764300, - 'upload_date': '20161029', - 'series': 'La Casa Bianca', - 'season': '2016', - }, - 'skip': 'This content is not available', - }, { 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { @@ -166,10 +148,10 @@ class RaiPlayIE(RaiBaseIE): }] def _real_extract(self, url): - url, video_id = re.match(self._VALID_URL, url).groups() + base, video_id = re.match(self._VALID_URL, url).groups() media = self._download_json( - url.replace('.html', '.json'), video_id, 'Downloading video JSON') + base + '.json', video_id, 'Downloading video JSON') title = media['name'] video = media['video'] @@ -195,7 +177,8 @@ def _real_extract(self, url): season = media.get('season') info = { - 'id': video_id, + 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, + 'display_id': video_id, 'title': self._live_title(title) if relinker_info.get( 'is_live') else title, 'alt_title': strip_or_none(media.get('subtitle')), @@ -217,16 +200,16 @@ def _real_extract(self, url): return info -class RaiPlayLiveIE(RaiBaseIE): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)' - _TEST = { +class RaiPlayLiveIE(RaiPlayIE): + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))' + _TESTS = [{ 'url': 'http://www.raiplay.it/dirette/rainews24', 'info_dict': { 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', 'display_id': 'rainews24', 'ext': 'mp4', 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:6eca31500550f9376819f174e5644754', + 'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497', 'uploader': 'Rai News 24', 'creator': 'Rai News 24', 'is_live': True, @@ -234,58 +217,50 @@ class RaiPlayLiveIE(RaiBaseIE): 'params': { 'skip_download': True, }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - video_id = self._search_regex( - 
r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE, - webpage, 'content id') - - return { - '_type': 'url_transparent', - 'ie_key': RaiPlayIE.ie_key(), - 'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, - 'id': video_id, - 'display_id': display_id, - } + }] class RaiPlayPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)' + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', 'info_dict': { 'id': 'nondirloalmiocapo', 'title': 'Non dirlo al mio capo', - 'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86', + 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', }, 'playlist_mincount': 12, }] def _real_extract(self, url): - playlist_id = self._match_id(url) + base, playlist_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, playlist_id) - - title = self._html_search_meta( - ('programma', 'nomeProgramma'), webpage, 'title') - description = unescapeHTML(self._html_search_meta( - ('description', 'og:description'), webpage, 'description')) + program = self._download_json( + base + '.json', playlist_id, 'Downloading program JSON') entries = [] - for mobj in re.finditer( - r'<a\b[^>]+\bhref=(["\'])(?P<path>/raiplay/video/.+?)\1', - webpage): - video_url = urljoin(url, mobj.group('path')) - entries.append(self.url_result( - video_url, ie=RaiPlayIE.ie_key(), - video_id=RaiPlayIE._match_id(video_url))) + for b in (program.get('blocks') or []): + for s in (b.get('sets') or []): + s_id = s.get('id') + if not s_id: + continue + medias = self._download_json( + '%s/%s.json' % (base, s_id), s_id, + 'Downloading content set JSON', fatal=False) + if not medias: + continue + for m in (medias.get('items') or []): + path_id = m.get('path_id') + if not path_id: + continue + video_url = urljoin(url, path_id) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) - return self.playlist_result(entries, playlist_id, title, description) + return self.playlist_result( + entries, playlist_id, program.get('name'), + try_get(program, lambda x: x['program_info']['description'])) class RaiIE(RaiBaseIE): @@ -328,19 +303,6 @@ class RaiIE(RaiBaseIE): 'duration': 2214, 'upload_date': '20161103', } - }, { - # drawMediaRaiTV(...) - 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': '2dd727e61114e1ee9c47f0da6914e178', - 'info_dict': { - 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'mp4', - 'title': 'Il pacco', - 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20141221', - }, - 'skip': 'This content is not available', }, { # initEdizione('ContentItem-...' 
'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', @@ -352,18 +314,6 @@ class RaiIE(RaiBaseIE): 'upload_date': '20170401', }, 'skip': 'Changes daily', - }, { - # HDS live stream with only relinker URL - 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', - 'info_dict': { - 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', - 'ext': 'flv', - 'title': 'EuroNews', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This content is available only in Italy', }, { # HLS live stream with ContentItem in og:url 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', @@ -473,7 +423,7 @@ def _real_extract(self, url): except ExtractorError: pass - relinker_url = self._search_regex( + relinker_url = self._proto_relative_url(self._search_regex( r'''(?x) (?: var\s+videoURL| @@ -485,7 +435,7 @@ def _real_extract(self, url): //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 ''', - webpage, 'relinker URL', group='url') + webpage, 'relinker URL', group='url')) relinker_info = self._extract_relinker_info( urljoin(url, relinker_url), video_id) diff --git a/youtube_dlc/extractor/soundcloud.py b/youtube_dlc/extractor/soundcloud.py index ed70b7169..47f68bf19 100644 --- a/youtube_dlc/extractor/soundcloud.py +++ b/youtube_dlc/extractor/soundcloud.py @@ -649,7 +649,7 @@ def _real_extract(self, url): class SoundcloudPagedPlaylistBaseIE(SoundcloudIE): def _extract_playlist(self, base_url, playlist_id, playlist_title): - # Per the SoundCloud documentation, the maximum limit for a linked partioning query is 200. + # Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200. 
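# For context, "linked partitioning" is cursor-based pagination (see the blog
# post linked below): each response carries a 'collection' of items plus a
# 'next_href' pointing at the next page. A minimal standalone sketch of that
# loop, where fetch_json is a hypothetical JSON-over-HTTP helper rather than
# the extractor's own _download_json:
def _walk_linked_partitions(fetch_json, base_url):
    url = base_url + '?linked_partitioning=1&limit=200'  # 200 is the documented cap
    items = []
    while url:
        page = fetch_json(url)
        items.extend(page.get('collection') or [])
        url = page.get('next_href')  # absent/None once the last page is reached
    return items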
# https://developers.soundcloud.com/blog/offset-pagination-deprecated COMMON_QUERY = { 'limit': 200, diff --git a/youtube_dlc/extractor/svt.py b/youtube_dlc/extractor/svt.py index 2f6887d86..a0b6ef4db 100644 --- a/youtube_dlc/extractor/svt.py +++ b/youtube_dlc/extractor/svt.py @@ -9,6 +9,7 @@ determine_ext, dict_get, int_or_none, + unified_timestamp, str_or_none, strip_or_none, try_get, @@ -44,7 +45,8 @@ def _extract_video(self, video_info, video_id): 'format_id': player_type, 'url': vurl, }) - if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): + rights = try_get(video_info, lambda x: x['rights'], dict) or {} + if not formats and rights.get('geoBlockedSweden'): self.raise_geo_restricted( 'This video is only available in Sweden', countries=self._GEO_COUNTRIES) @@ -70,6 +72,7 @@ def _extract_video(self, video_info, video_id): episode = video_info.get('episodeTitle') episode_number = int_or_none(video_info.get('episodeNumber')) + timestamp = unified_timestamp(rights.get('validFrom')) duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration'))) age_limit = None adult = dict_get( @@ -84,6 +87,7 @@ def _extract_video(self, video_info, video_id): 'formats': formats, 'subtitles': subtitles, 'duration': duration, + 'timestamp': timestamp, 'age_limit': age_limit, 'series': series, 'season_number': season_number, @@ -136,26 +140,39 @@ class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' _VALID_URL = r'''(?x) (?: - svt:(?P<svt_id>[^/?#&]+)| + (?: + svt:| + https?://(?:www\.)?svt\.se/barnkanalen/barnplay/[^/]+/ + ) + (?P<svt_id>[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+) ) ''' _TESTS = [{ - 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', - 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', + 'url': 'https://www.svtplay.se/video/26194546/det-har-ar-himlen', + 'md5': '2382036fd6f8c994856c323fe51c426e', 'info_dict': { - 'id': '5996901', + 'id': 'jNwpV9P', 'ext': 'mp4', - 'title': 'Flygplan till Haile Selassie', - 'duration': 3527, - 'thumbnail': r're:^https?://.*[\.-]jpg$', + 'title': 'Det här är himlen', + 'timestamp': 1586044800, + 'upload_date': '20200405', + 'duration': 3515, + 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', 'age_limit': 0, 'subtitles': { 'sv': [{ - 'ext': 'wsrt', + 'ext': 'vtt', }] }, }, + 'params': { + 'format': 'bestvideo', + # skip for now due to download test asserts that segment is > 10000 bytes and svt uses + # init segments that are smaller + # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B + 'skip_download': True, + }, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -172,6 +189,12 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'svt:14278044', 'only_matching': True, + }, { + 'url': 'https://www.svt.se/barnkanalen/barnplay/kar/eWv5MLX/', + 'only_matching': True, + }, { + 'url': 'svt:eWv5MLX', + 'only_matching': True, }] def _adjust_title(self, info): @@ -236,7 +259,10 @@ def _real_extract(self, url): r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'), webpage, 'video id') - return self._extract_by_video_id(svt_id, webpage) + info_dict = self._extract_by_video_id(svt_id, webpage) + info_dict['thumbnail'] = thumbnail + + return info_dict class SVTSeriesIE(SVTPlayBaseIE): @@ -360,7 +386,7 @@ class SVTPageIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if SVTIE.suitable(url) else 
super(SVTPageIE, cls).suitable(url) + return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPageIE, cls).suitable(url) def _real_extract(self, url): path, display_id = re.match(self._VALID_URL, url).groups() diff --git a/youtube_dlc/extractor/tagesschau.py b/youtube_dlc/extractor/tagesschau.py index c351b7545..8ceab7e35 100644 --- a/youtube_dlc/extractor/tagesschau.py +++ b/youtube_dlc/extractor/tagesschau.py @@ -86,7 +86,7 @@ def _real_extract(self, url): # return self._extract_via_api(kind, video_id) # JSON api does not provide some audio formats (e.g. ogg) thus - # extractiong audio via webpage + # extracting audio via webpage webpage = self._download_webpage(url, video_id) diff --git a/youtube_dlc/extractor/theplatform.py b/youtube_dlc/extractor/theplatform.py index 07055513a..41bfbe80f 100644 --- a/youtube_dlc/extractor/theplatform.py +++ b/youtube_dlc/extractor/theplatform.py @@ -208,7 +208,7 @@ def _extract_urls(cls, webpage): if m: return [m.group('url')] - # Are whitesapces ignored in URLs? + # Are whitespaces ignored in URLs? # https://github.com/ytdl-org/youtube-dl/issues/12044 matches = re.findall( r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage) diff --git a/youtube_dlc/extractor/turner.py b/youtube_dlc/extractor/turner.py index 4a6cbfbb8..2964504a2 100644 --- a/youtube_dlc/extractor/turner.py +++ b/youtube_dlc/extractor/turner.py @@ -56,9 +56,9 @@ def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}): content_id = xpath_text(video_data, 'contentId') or video_id # rtmp_src = xpath_text(video_data, 'akamai/src') # if rtmp_src: - # splited_rtmp_src = rtmp_src.split(',') - # if len(splited_rtmp_src) == 2: - # rtmp_src = splited_rtmp_src[1] + # split_rtmp_src = rtmp_src.split(',') + # if len(split_rtmp_src) == 2: + # rtmp_src = split_rtmp_src[1] # aifp = xpath_text(video_data, 'akamai/aifp', default='') urls = [] diff --git a/youtube_dlc/extractor/viki.py b/youtube_dlc/extractor/viki.py index 6bddf8be9..2e3794344 100644 --- a/youtube_dlc/extractor/viki.py +++ b/youtube_dlc/extractor/viki.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import hashlib import hmac import itertools @@ -9,6 +10,10 @@ import time from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( ExtractorError, int_or_none, @@ -166,19 +171,20 @@ class VikiIE(VikiBaseIE): }, { # episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '5fa476a902e902783ac7a4d615cdbc7a', + 'md5': '94e0e34fd58f169f40c184f232356cfe', 'info_dict': { 'id': '44699v', 'ext': 'mp4', 'title': 'Boys Over Flowers - Episode 1', 'description': 'md5:b89cf50038b480b88b5b3c93589a9076', - 'duration': 4204, + 'duration': 4172, 'timestamp': 1270496524, 'upload_date': '20100405', 'uploader': 'group8', 'like_count': int, 'age_limit': 13, - } + }, + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { # youtube external 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', @@ -195,14 +201,15 @@ class VikiIE(VikiBaseIE): 'uploader_id': 'ad14065n', 'like_count': int, 'age_limit': 13, - } + }, + 'skip': 'Page not found!', }, { 'url': 'http://www.viki.com/player/44699v', 'only_matching': True, }, { # non-English description 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': '1713ae35df5a521b31f6dc40730e7c9c', + 'md5': 'adf9e321a0ae5d0aace349efaaff7691', 
'info_dict': { 'id': '158036v', 'ext': 'mp4', @@ -218,71 +225,11 @@ class VikiIE(VikiBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - video = self._call_api( - 'videos/%s.json' % video_id, video_id, 'Downloading video JSON') - - streams = self._call_api( - 'videos/%s/streams.json' % video_id, video_id, - 'Downloading video streams JSON') - - formats = [] - for format_id, stream_dict in streams.items(): - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)) - for protocol, format_dict in stream_dict.items(): - # rtmps URLs does not seem to work - if protocol == 'rtmps': - continue - format_url = format_dict.get('url') - format_drms = format_dict.get('drms') - format_stream_id = format_dict.get('id') - if format_id == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', - m3u8_id='m3u8-%s' % protocol, fatal=False) - # Despite CODECS metadata in m3u8 all video-only formats - # are actually video+audio - for f in m3u8_formats: - if f.get('acodec') == 'none' and f.get('vcodec') != 'none': - f['acodec'] = None - formats.extend(m3u8_formats) - elif format_id == 'mpd': - mpd_formats = self._extract_mpd_formats( - format_url, video_id, - mpd_id='mpd-%s' % protocol, fatal=False) - formats.extend(mpd_formats) - elif format_id == 'mpd': - - formats.extend(mpd_formats) - elif format_url.startswith('rtmp'): - mobj = re.search( - r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$', - format_url) - if not mobj: - continue - formats.append({ - 'format_id': 'rtmp-%s' % format_id, - 'ext': 'flv', - 'url': mobj.group('url'), - 'play_path': mobj.group('playpath'), - 'app': mobj.group('app'), - 'page_url': url, - 'drms': format_drms, - 'stream_id': format_stream_id, - }) - else: - urlh = self._request_webpage( - HEADRequest(format_url), video_id, 'Checking file size', fatal=False) - formats.append({ - 'url': format_url, - 'format_id': '%s-%s' % (format_id, protocol), - 'height': height, - 'drms': format_drms, - 'stream_id': format_stream_id, - 'filesize': int_or_none(urlh.headers.get('Content-Length')), - }) - self._sort_formats(formats) + resp = self._download_json( + 'https://www.viki.com/api/videos/' + video_id, + video_id, 'Downloading video JSON', + headers={'x-viki-app-ver': '4.0.57'}) + video = resp['video'] self._check_errors(video) @@ -342,12 +289,84 @@ def _real_extract(self, url): 'subtitles': subtitles, } - if 'external' in streams: - result.update({ - '_type': 'url_transparent', - 'url': streams['external']['url'], - }) - return result + formats = [] + + def add_format(format_id, format_dict, protocol='http'): + # rtmps URLs does not seem to work + if protocol == 'rtmps': + return + format_url = format_dict.get('url') + if not format_url: + return + format_drms = format_dict.get('drms') + format_stream_id = format_dict.get('id') + qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query) + stream = qs.get('stream', [None])[0] + if stream: + format_url = base64.b64decode(stream).decode() + if format_id in ('m3u8', 'hls'): + m3u8_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', + m3u8_id='m3u8-%s' % protocol, fatal=False) + # Despite CODECS metadata in m3u8 all video-only formats + # are actually video+audio + for f in m3u8_formats: + if '_drm/index_' in f['url']: + continue + if f.get('acodec') == 'none' and f.get('vcodec') != 'none': + f['acodec'] = None + formats.append(f) + elif format_id in ('mpd', 
'dash'): + formats.extend(self._extract_mpd_formats( + format_url, video_id, 'mpd-%s' % protocol, fatal=False)) + elif format_url.startswith('rtmp'): + mobj = re.search( + r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$', + format_url) + if not mobj: + return + formats.append({ + 'format_id': 'rtmp-%s' % format_id, + 'ext': 'flv', + 'url': mobj.group('url'), + 'play_path': mobj.group('playpath'), + 'app': mobj.group('app'), + 'page_url': url, + 'drms': format_drms, + 'stream_id': format_stream_id, + }) + else: + urlh = self._request_webpage( + HEADRequest(format_url), video_id, 'Checking file size', fatal=False) + formats.append({ + 'url': format_url, + 'format_id': '%s-%s' % (format_id, protocol), + 'height': int_or_none(self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None)), + 'drms': format_drms, + 'stream_id': format_stream_id, + 'filesize': int_or_none(urlh.headers.get('Content-Length')), + }) + + for format_id, format_dict in (resp.get('streams') or {}).items(): + add_format(format_id, format_dict) + if not formats: + streams = self._call_api( + 'videos/%s/streams.json' % video_id, video_id, + 'Downloading video streams JSON') + + if 'external' in streams: + result.update({ + '_type': 'url_transparent', + 'url': streams['external']['url'], + }) + return result + + for format_id, stream_dict in streams.items(): + for protocol, format_dict in stream_dict.items(): + add_format(format_id, format_dict, protocol) + self._sort_formats(formats) result['formats'] = formats return result diff --git a/youtube_dlc/extractor/vimeo.py b/youtube_dlc/extractor/vimeo.py index a0662a369..51a0ab2fa 100644 --- a/youtube_dlc/extractor/vimeo.py +++ b/youtube_dlc/extractor/vimeo.py @@ -922,7 +922,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): }] _PAGE_SIZE = 100 - def _fetch_page(self, album_id, authorizaion, hashed_pass, page): + def _fetch_page(self, album_id, authorization, hashed_pass, page): api_page = page + 1 query = { 'fields': 'link,uri', @@ -934,7 +934,7 @@ def _fetch_page(self, album_id, authorizaion, hashed_pass, page): videos = self._download_json( 'https://api.vimeo.com/albums/%s/videos' % album_id, album_id, 'Downloading page %d' % api_page, query=query, headers={ - 'Authorization': 'jwt ' + authorizaion, + 'Authorization': 'jwt ' + authorization, })['data'] for video in videos: link = video.get('link') diff --git a/youtube_dlc/extractor/xiami.py b/youtube_dlc/extractor/xiami.py index 618da8382..769aab331 100644 --- a/youtube_dlc/extractor/xiami.py +++ b/youtube_dlc/extractor/xiami.py @@ -54,17 +54,17 @@ def _extract_tracks(self, item_id, referer, typ=None): def _decrypt(origin): n = int(origin[0]) origin = origin[1:] - short_lenth = len(origin) // n - long_num = len(origin) - short_lenth * n + short_length = len(origin) // n + long_num = len(origin) - short_length * n l = tuple() for i in range(0, n): - length = short_lenth + length = short_length if i < long_num: length += 1 l += (origin[0:length], ) origin = origin[length:] ans = '' - for i in range(0, short_lenth + 1): + for i in range(0, short_length + 1): for j in range(0, n): if len(l[j]) > i: ans += l[j][i] diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 3f3f9c58b..fb2702d68 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -306,6 +306,8 @@ def _real_initialize(self): }, } + _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' + def _call_api(self, ep, query, video_id): data 
= self._DEFAULT_API_DATA.copy() data.update(query) @@ -322,8 +324,8 @@ def _call_api(self, ep, query, video_id): def _extract_yt_initial_data(self, video_id, webpage): return self._parse_json( self._search_regex( - r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;', - webpage, 'yt initial data'), + (r'%s\s*\n' % self._YT_INITIAL_DATA_RE, + self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'), video_id) @@ -1089,6 +1091,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + # with '};' inside yt initial data (see https://github.com/ytdl-org/youtube-dl/issues/27093) + 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no', + 'info_dict': { + 'id': 'CHqg6qOn4no', + 'ext': 'mp4', + 'title': 'Part 77 Sort a list of simple types in c#', + 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc', + 'upload_date': '20130831', + 'uploader_id': 'kudvenkat', + 'uploader': 'kudvenkat', + }, + 'params': { + 'skip_download': True, + }, + }, ] def __init__(self, *args, **kwargs): @@ -2138,6 +2156,21 @@ def _extract_filesize(media_url): formats.append(a_format) else: error_message = extract_unavailable_message() + if not error_message: + reason_list = try_get( + player_response, + lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'], + list) or [] + for reason in reason_list: + if not isinstance(reason, dict): + continue + reason_text = try_get(reason, lambda x: x['text'], compat_str) + if reason_text: + if not error_message: + error_message = '' + error_message += reason_text + if error_message: + error_message = clean_html(error_message) if not error_message: error_message = clean_html(try_get( player_response, lambda x: x['playabilityStatus']['reason'], @@ -2319,8 +2352,8 @@ def extract_meta(field): def _extract_count(count_name): return str_to_int(self._search_regex( - r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' - % re.escape(count_name), + (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name), + r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)), video_webpage, count_name, default=None)) like_count = _extract_count('like') @@ -2613,13 +2646,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, 'playlist_mincount': 138, }, { - 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', + 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, }, { - 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA', + 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, }, { - 'url': 'https://music.youtube.com/channel/UCT-K0qO8z6NzWrywqefBPBQ', + 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, }, { 'note': 'Playlist with deleted videos (#651). 
As a bonus, the video #51 is also twice in this list.', @@ -2666,7 +2699,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, 'playlist_mincount': 11, }, { - 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', + 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'only_matching': True, }, { # Playlist URL that does not actually serve a playlist @@ -2698,14 +2731,59 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - IGNORE = (YoutubeLiveIE,) - return ( - False if any(ie.suitable(url) for ie in IGNORE) - else super(YoutubeTabIE, cls).suitable(url)) + }, { + 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', + 'info_dict': { + 'id': '9Auq9mYxFEE', + 'ext': 'mp4', + 'title': 'Watch Sky News live', + 'uploader': 'Sky News', + 'uploader_id': 'skynews', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews', + 'upload_date': '20191102', + 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662', + 'categories': ['News & Politics'], + 'tags': list, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.youtube.com/user/TheYoungTurks/live', + 'info_dict': { + 'id': 'a48o2S1cPoo', + 'ext': 'mp4', + 'title': 'The Young Turks - Live Main Show', + 'uploader': 'The Young Turks', + 'uploader_id': 'TheYoungTurks', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', + 'upload_date': '20150715', + 'license': 'Standard YouTube License', + 'description': 'md5:438179573adcdff3c97ebb1ee632b891', + 'categories': ['News & Politics'], + 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', + 'only_matching': True, + }, + # TODO + # { + # 'url': 'https://www.youtube.com/TheYoungTurks/live', + # 'only_matching': True, + # } + ] def _extract_channel_id(self, webpage): channel_id = self._html_search_meta( @@ -3147,7 +3225,7 @@ def _real_extract(self, url): self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) webpage = self._download_webpage(url, item_id) identity_token = self._search_regex( - r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, + r'\bID_TOKEN["\']\s*:\s/l*["\'](.+?)["\']', webpage, 'identity token', default=None) data = self._extract_yt_initial_data(item_id, webpage) tabs = try_get( @@ -3158,7 +3236,11 @@ def _real_extract(self, url): data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) if playlist: return self._extract_from_playlist(item_id, data, playlist) - # Fallback to video extraction if no playlist alike page is recognized + # Fallback to video extraction if no playlist alike page is recognized. + # First check for the current video then try the v attribute of URL query. 
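# try_get (from youtube_dlc.utils) is what keeps the nested lookup below safe:
# it returns None instead of raising when any key along the path is missing.
# A simplified single-getter re-implementation, for illustration only:
def _try_get_sketch(src, getter, expected_type=None):
    try:
        v = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    return v if expected_type is None or isinstance(v, expected_type) else None

# hypothetical ytInitialData fragment:
_data = {'currentVideoEndpoint': {'watchEndpoint': {'videoId': 'abc123DEF45'}}}
assert _try_get_sketch(_data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'], str) == 'abc123DEF45'
assert _try_get_sketch({}, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'], str) is None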
+ video_id = try_get( + data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'], + compat_str) or video_id if video_id: return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) # Failed to recognize @@ -3279,58 +3361,6 @@ def _real_extract(self, url): ie=YoutubeTabIE.ie_key(), video_id=user_id) -class YoutubeLiveIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com live streams' - _VALID_URL = r'(?P<base_url>%s)/live' % YoutubeTabIE._VALID_URL - IE_NAME = 'youtube:live' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/TheYoungTurks/live', - 'info_dict': { - 'id': 'a48o2S1cPoo', - 'ext': 'mp4', - 'title': 'The Young Turks - Live Main Show', - 'uploader': 'The Young Turks', - 'uploader_id': 'TheYoungTurks', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', - 'upload_date': '20150715', - 'license': 'Standard YouTube License', - 'description': 'md5:438179573adcdff3c97ebb1ee632b891', - 'categories': ['News & Politics'], - 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/TheYoungTurks/live', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - base_url = mobj.group('base_url') - webpage = self._download_webpage(url, channel_id, fatal=False) - if webpage: - page_type = self._og_search_property( - 'type', webpage, 'page type', default='') - video_id = self._html_search_meta( - 'videoId', webpage, 'video id', default=None) - if page_type.startswith('video') and video_id and re.match( - r'^[0-9A-Za-z_-]{11}$', video_id): - return self.url_result(video_id, YoutubeIE.ie_key()) - return self.url_result(base_url) - - class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com searches' # there doesn't appear to be a real limit, for example if you search for diff --git a/youtube_dlc/utils.py b/youtube_dlc/utils.py index 975b741c5..68b4ca944 100644 --- a/youtube_dlc/utils.py +++ b/youtube_dlc/utils.py @@ -2460,7 +2460,7 @@ def __init__(self, code=None, msg='Unknown error'): # Parsing code and msg if (self.code in (errno.ENOSPC, errno.EDQUOT) - or 'No space left' in self.msg or 'Disk quota excedded' in self.msg): + or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg): self.reason = 'NO_SPACE' elif self.code == errno.E2BIG or 'Argument list too long' in self.msg: self.reason = 'VALUE_TOO_LONG' @@ -4215,10 +4215,10 @@ def parse_codecs(codecs_str): # http://tools.ietf.org/html/rfc6381 if not codecs_str: return {} - splited_codecs = list(filter(None, map( + split_codecs = list(filter(None, map( lambda str: str.strip(), codecs_str.strip().strip(',').split(',')))) vcodec, acodec = None, None - for full_codec in splited_codecs: + for full_codec in split_codecs: codec = full_codec.split('.')[0] if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'): if not vcodec: @@ -4229,10 +4229,10 @@ def parse_codecs(codecs_str): else: write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr) if not vcodec and not acodec: - if len(splited_codecs) == 2: + if 
len(split_codecs) == 2: return { - 'vcodec': splited_codecs[0], - 'acodec': splited_codecs[1], + 'vcodec': split_codecs[0], + 'acodec': split_codecs[1], } else: return { @@ -5471,7 +5471,7 @@ def encode_base_n(num, n, table=None): def decode_packed_codes(code): mobj = re.search(PACKED_CODES_RE, code) - obfucasted_code, base, count, symbols = mobj.groups() + obfuscated_code, base, count, symbols = mobj.groups() base = int(base) count = int(count) symbols = symbols.split('|') @@ -5484,7 +5484,7 @@ def decode_packed_codes(code): return re.sub( r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)], - obfucasted_code) + obfuscated_code) def caesar(s, alphabet, shift): From ef2f3c7f58f7409131c47e9ce56f1265547c43a8 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Sun, 22 Nov 2020 04:16:05 +0530 Subject: [PATCH 107/124] Minor fixes --- docs/supportedsites.md | 9 +++---- youtube_dlc/extractor/youtube.py | 43 ++++++++++++++++---------------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 860766f20..99bb500b6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1148,16 +1148,15 @@ # Supported sites - **youtube**: YouTube.com - **youtube:favorites**: YouTube.com liked videos, ":ytfav" or "LL" for short (requires authentication) - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) - - **youtube:live**: YouTube.com live streams - **youtube:playlist**: YouTube.com playlists - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) - - **youtube:search**: YouTube.com searches - - **youtube:search:date**: YouTube.com searches, newest videos first + - **youtube:search**: YouTube.com searches, "ytsearch" keyword + - **youtube:search:date**: YouTube.com searches, newest videos first, "ytsearchdate" keyword - **youtube:search_url**: YouTube.com search URLs - - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - - **YoutubeYtUser**: YouTube.com user videos (URL or "ytuser" keyword) + - **youtube:subscriptions**: YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication) - **youtube:tab**: YouTube.com tab - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" or "WL" for short (requires authentication) + - **YoutubeYtUser**: YouTube.com user videos, URL or "ytuser" keyword - **Zapiks** - **Zaq1** - **Zattoo** diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index fb2702d68..1b4be6075 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -72,7 +72,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|LL|WL)' + _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,})' _YOUTUBE_CLIENT_HEADERS = { 'x-youtube-client-name': '1', @@ -2532,7 +2532,6 @@ def decrypt_sig(mobj): class YoutubeTabIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com tab' - # (?x)^ will cause warning in LiveIE. 
So I cant split this into multiple lines using ''' _VALID_URL = ( r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/' r'(?:(?!(%s)([/#?]|$))|' @@ -2778,11 +2777,11 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', 'only_matching': True, }, - # TODO - # { - # 'url': 'https://www.youtube.com/TheYoungTurks/live', - # 'only_matching': True, - # } + # TODO + # { + # 'url': 'https://www.youtube.com/TheYoungTurks/live', + # 'only_matching': True, + # } ] def _extract_channel_id(self, webpage): @@ -3461,9 +3460,8 @@ class YoutubeSearchDateIE(YoutubeSearchIE): class YoutubeSearchURLIE(InfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' - _PARAM_REGEX = r'' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results/?(?:\?|\?[^#]*?&)(?:sp=(?P<param1>[^&#]+)&(?:[^#]*&)?)?(?:q|search_query)=(?P<query>[^#&]+)(?:[^#]*?&sp=(?P<param2>[^#&]+))?' - _MAX_RESULTS = 100 + # _MAX_RESULTS = 100 _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -3480,9 +3478,9 @@ def _real_extract(self, url): query = compat_urllib_parse_unquote_plus(mobj.group('query')) IE = YoutubeSearchIE(self._downloader) IE._SEARCH_PARAMS = mobj.group('param1') or mobj.group('param2') - self._downloader.to_screen(IE._SEARCH_PARAMS) - IE._MAX_RESULTS = self._MAX_RESULTS - return IE._get_n_results(query, self._MAX_RESULTS) + if hasattr(self, '_MAX_RESULTS'): + IE._MAX_RESULTS = self._MAX_RESULTS + return IE._get_n_results(query, IE._MAX_RESULTS) class YoutubeFeedsInfoExtractor(YoutubeTabIE): @@ -3491,9 +3489,8 @@ class YoutubeFeedsInfoExtractor(YoutubeTabIE): Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True - _TESTS = [] - # _MAX_PAGES = 5 + _TESTS = [] @property def IE_NAME(self): @@ -3531,10 +3528,10 @@ def _real_extract(self, url): raise ExtractorError('Unable to recognize feed page') -class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): +class YoutubeWatchLaterIE(InfoExtractor): + IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" or "WL" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater' - _FEED_NAME = 'watchlater' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater|WL' _TESTS = [{ 'url': 'https://www.youtube.com/feed/watch_later', @@ -3545,13 +3542,14 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): }] def _real_extract(self, url): - return self.url_result('WL', ie=YoutubePlaylistIE.ie_key()) + return self.url_result( + 'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key()) -class YoutubeFavouritesIE(YoutubeFeedsInfoExtractor): +class YoutubeFavouritesIE(InfoExtractor): + IE_NAME = 'youtube:favourites' IE_DESC = 'YouTube.com liked videos, ":ytfav" or "LL" for short (requires authentication)' - _VALID_URL = r':ytfav(?:ou?rite)s?' 
- _FEED_NAME = 'favourites' + _VALID_URL = r':ytfav(?:ou?rite)?s?|LL' _TESTS = [{ 'url': ':ytfav', @@ -3559,7 +3557,8 @@ class YoutubeFavouritesIE(YoutubeFeedsInfoExtractor): }] def _real_extract(self, url): - return self.url_result('LL', ie=YoutubePlaylistIE.ie_key()) + return self.url_result( + 'https://www.youtube.com/playlist?list=LL', ie=YoutubeTabIE.ie_key()) class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): From 7bd4a9b6110260f9ca7dcd0a55bd77a007c4748b Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Sun, 22 Nov 2020 15:50:16 +0530 Subject: [PATCH 108/124] Added RDMM back Eg: https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM --- youtube_dlc/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 1b4be6075..f273f4d66 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -72,7 +72,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,})' + _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' _YOUTUBE_CLIENT_HEADERS = { 'x-youtube-client-name': '1', From 386e1dd908c652bf9796a29e53219bb2fdfa960d Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Sun, 22 Nov 2020 18:39:09 +0530 Subject: [PATCH 109/124] Better implementation of YoutubeSearchURLIE --- youtube_dlc/extractor/youtube.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index f273f4d66..e24b9f3a4 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -3457,10 +3457,10 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _SEARCH_PARAMS = 'CAI%3D' -class YoutubeSearchURLIE(InfoExtractor): +class YoutubeSearchURLIE(YoutubeSearchIE): IE_DESC = 'YouTube.com search URLs' - IE_NAME = 'youtube:search_url' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/results/?(?:\?|\?[^#]*?&)(?:sp=(?P<param1>[^&#]+)&(?:[^#]*&)?)?(?:q|search_query)=(?P<query>[^#&]+)(?:[^#]*?&sp=(?P<param2>[^#&]+))?' 
+ IE_NAME = YoutubeSearchIE.IE_NAME + '_url' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)' # _MAX_RESULTS = 100 _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', @@ -3473,14 +3473,15 @@ class YoutubeSearchURLIE(InfoExtractor): 'only_matching': True, }] + @classmethod + def _make_valid_url(cls): + return cls._VALID_URL + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - query = compat_urllib_parse_unquote_plus(mobj.group('query')) - IE = YoutubeSearchIE(self._downloader) - IE._SEARCH_PARAMS = mobj.group('param1') or mobj.group('param2') - if hasattr(self, '_MAX_RESULTS'): - IE._MAX_RESULTS = self._MAX_RESULTS - return IE._get_n_results(query, IE._MAX_RESULTS) + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + query = (qs.get('search_query') or qs.get('q'))[0] + self._SEARCH_PARAMS = qs.get('sp', ('',))[0] + return self._get_n_results(query, self._MAX_RESULTS) class YoutubeFeedsInfoExtractor(YoutubeTabIE): From 434406a9820961433a530844fc2e5e75c1983e0a Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Mon, 23 Nov 2020 01:55:14 +0530 Subject: [PATCH 110/124] Cleanup YoutubeTabIE _VALID_URL regex --- youtube_dlc/extractor/youtube.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index e24b9f3a4..30f1a024e 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -64,7 +64,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' _RESERVED_NAMES = ( - r'course|embed|watch|w|results|storefront|' + r'course|embed|playlist|watch|w|results|storefront|' r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|' r'feed/(watch_later|history|subscriptions|library|trending|recommended)') @@ -2532,12 +2532,11 @@ def decrypt_sig(mobj): class YoutubeTabIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com tab' - _VALID_URL = ( - r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/' - r'(?:(?!(%s)([/#?]|$))|' - r'(?:channel|c|user)/|' - r'(?:playlist|watch)\?.*?\blist=)' - r'(?P<id>[^/?#&]+)') % YoutubeBaseInfoExtractor._RESERVED_NAMES + _VALID_URL = (r'''(?x) + https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?: + (?!(%s)([/#?]|$))|channel/|c/|user/| + (?P<not_channel>playlist|watch)/?\?.*?\blist=) + (?P<id>[^/?#&]+)''') % YoutubeBaseInfoExtractor._RESERVED_NAMES IE_NAME = 'youtube:tab' _TESTS = [{ From 036fcf3aa1fbd484c6629ab754eae715d21439a5 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Mon, 23 Nov 2020 01:56:08 +0530 Subject: [PATCH 111/124] Redirect channel home to /video --- youtube_dlc/extractor/youtube.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 30f1a024e..6fb18558d 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -3212,6 +3212,15 @@ def _real_extract(self, url): item_id = self._match_id(url) url = compat_urlparse.urlunparse( compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) + is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url) + if is_home: + self._downloader.to_screen('%s\n%s' % (is_home, is_home.group('not_channel'))) + if is_home is not None and is_home.group('not_channel') is None: + 
self._downloader.report_warning( + 'A channel/user page was given. All the channel\'s videos will be downloaded. ' + 'To download only the videos in the home page, add a "/home" to the URL') + url = '%s/videos%s' % (is_home.group('pre'), is_home.group('post') or '') + # Handle both video/playlist URLs qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) video_id = qs.get('v', [None])[0] From f8fb3b8a7862ccf2a9347989013407b40d092cda Mon Sep 17 00:00:00 2001 From: xypwn <54681180+xypwn@users.noreply.github.com> Date: Mon, 23 Nov 2020 16:49:39 +0100 Subject: [PATCH 112/124] [SouthparkDe] Support for English URLs Allow downloading English South Park episodes from southpark.de --- youtube_dlc/extractor/southpark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/southpark.py b/youtube_dlc/extractor/southpark.py index 20ae7c5e7..95e6d2890 100644 --- a/youtube_dlc/extractor/southpark.py +++ b/youtube_dlc/extractor/southpark.py @@ -44,7 +44,7 @@ class SouthParkEsIE(SouthParkIE): class SouthParkDeIE(SouthParkIE): IE_NAME = 'southpark.de' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:videoclip|collections|folgen)/(?P<id>(?P<unique_id>.+?)/.+?)(?:\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:(en/(videoclip|collections|episodes))|(videoclip|collections|folgen))/(?P<id>(?P<unique_id>.+?)/.+?)(?:\?|#|$))' # _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' _TESTS = [{ From a93f71ee5e1f8e68bacf7492e52e398578a98d50 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Mon, 23 Nov 2020 12:09:03 +0530 Subject: [PATCH 113/124] Minor fixes --- youtube_dlc/extractor/youtube.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 6fb18558d..0f15b0189 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -64,7 +64,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' _RESERVED_NAMES = ( - r'course|embed|playlist|watch|w|results|storefront|' + r'course|embed|channel|c|user|playlist|watch|w|results|storefront|' r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|' r'feed/(watch_later|history|subscriptions|library|trending|recommended)') @@ -3066,7 +3066,6 @@ def extract_entries(parent_renderer): if parent_renderer: for entry in extract_entries(parent_renderer): yield entry - continuation = continuation_list[0] headers = { @@ -3213,8 +3212,6 @@ def _real_extract(self, url): url = compat_urlparse.urlunparse( compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url) - if is_home: - self._downloader.to_screen('%s\n%s' % (is_home, is_home.group('not_channel'))) if is_home is not None and is_home.group('not_channel') is None: self._downloader.report_warning( 'A channel/user page was given. All the channel\'s videos will be downloaded. 
' @@ -3232,7 +3229,7 @@ def _real_extract(self, url): self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) webpage = self._download_webpage(url, item_id) identity_token = self._search_regex( - r'\bID_TOKEN["\']\s*:\s/l*["\'](.+?)["\']', webpage, + r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, 'identity token', default=None) data = self._extract_yt_initial_data(item_id, webpage) tabs = try_get( From 70c5802b5d651f840b6e94fb0cdc1105d5e048e6 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Tue, 24 Nov 2020 02:33:08 +0530 Subject: [PATCH 114/124] Update to release 2020.11.24 except youtube and skyit extractors --- docs/supportedsites.md | 7 + youtube_dlc/extractor/box.py | 98 +++++++++++++ youtube_dlc/extractor/common.py | 27 ++++ youtube_dlc/extractor/extractors.py | 7 + youtube_dlc/extractor/franceinter.py | 3 + youtube_dlc/extractor/lbry.py | 5 +- youtube_dlc/extractor/nytimes.py | 38 +++++ youtube_dlc/extractor/pinterest.py | 201 +++++++++++++++++++++++++++ youtube_dlc/extractor/rumble.py | 67 +++++++++ 9 files changed, 452 insertions(+), 1 deletion(-) create mode 100644 youtube_dlc/extractor/box.py create mode 100644 youtube_dlc/extractor/pinterest.py create mode 100644 youtube_dlc/extractor/rumble.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 99bb500b6..45a546650 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -110,6 +110,7 @@ # Supported sites - **Bloomberg** - **BokeCC** - **BostonGlobe** + - **Box** - **Bpb**: Bundeszentrale für politische Bildung - **BR**: Bayerischer Rundfunk - **BravoTV** @@ -157,6 +158,7 @@ # Supported sites - **Chilloutzone** - **chirbit** - **chirbit:profile** + - **cielotv.it** - **Cinchcast** - **Cinemax** - **CiscoLiveSearch** @@ -618,6 +620,7 @@ # Supported sites - **Nuvid** - **NYTimes** - **NYTimesArticle** + - **NYTimesCooking** - **NZZ** - **ocw.mit.edu** - **OdaTV** @@ -670,6 +673,8 @@ # Supported sites - **PicartoVod** - **Piksel** - **Pinkbike** + - **Pinterest** + - **PinterestCollection** - **Pladform** - **Platzi** - **PlatziCourse** @@ -766,6 +771,7 @@ # Supported sites - **RTVNH** - **RTVS** - **RUHD** + - **RumbleEmbed** - **rutube**: Rutube videos - **rutube:channel**: Rutube channels - **rutube:embed**: Rutube embedded videos @@ -943,6 +949,7 @@ # Supported sites - **TV2DKBornholmPlay** - **TV4**: tv4.se and tv4play.se - **TV5MondePlus**: TV5MONDE+ + - **tv8.it** - **TVA** - **TVANouvelles** - **TVANouvellesArticle** diff --git a/youtube_dlc/extractor/box.py b/youtube_dlc/extractor/box.py new file mode 100644 index 000000000..aae82d1af --- /dev/null +++ b/youtube_dlc/extractor/box.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + parse_iso8601, + # try_get, + update_url_query, +) + + +class BoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/]+)/file/(?P<id>\d+)' + _TEST = { + 'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538', + 'md5': '1f81b2fd3960f38a40a3b8823e5fcd43', + 'info_dict': { + 'id': '510727257538', + 'ext': 'mp4', + 'title': 'Garber St. 
Louis will be 28th MLS team +scarving.mp4', + 'uploader': 'MLS Video', + 'timestamp': 1566320259, + 'upload_date': '20190820', + 'uploader_id': '235196876', + } + } + + def _real_extract(self, url): + shared_name, file_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, file_id) + request_token = self._parse_json(self._search_regex( + r'Box\.config\s*=\s*({.+?});', webpage, + 'Box config'), file_id)['requestToken'] + access_token = self._download_json( + 'https://app.box.com/app-api/enduserapp/elements/tokens', file_id, + 'Downloading token JSON metadata', + data=json.dumps({'fileIDs': [file_id]}).encode(), headers={ + 'Content-Type': 'application/json', + 'X-Request-Token': request_token, + 'X-Box-EndUser-API': 'sharedName=' + shared_name, + })[file_id]['read'] + shared_link = 'https://app.box.com/s/' + shared_name + f = self._download_json( + 'https://api.box.com/2.0/files/' + file_id, file_id, + 'Downloading file JSON metadata', headers={ + 'Authorization': 'Bearer ' + access_token, + 'BoxApi': 'shared_link=' + shared_link, + 'X-Rep-Hints': '[dash]', # TODO: extract `hls` formats + }, query={ + 'fields': 'authenticated_download_url,created_at,created_by,description,extension,is_download_available,name,representations,size' + }) + title = f['name'] + + query = { + 'access_token': access_token, + 'shared_link': shared_link + } + + formats = [] + + # for entry in (try_get(f, lambda x: x['representations']['entries'], list) or []): + # entry_url_template = try_get( + # entry, lambda x: x['content']['url_template']) + # if not entry_url_template: + # continue + # representation = entry.get('representation') + # if representation == 'dash': + # TODO: append query to every fragment URL + # formats.extend(self._extract_mpd_formats( + # entry_url_template.replace('{+asset_path}', 'manifest.mpd'), + # file_id, query=query)) + + authenticated_download_url = f.get('authenticated_download_url') + if authenticated_download_url and f.get('is_download_available'): + formats.append({ + 'ext': f.get('extension') or determine_ext(title), + 'filesize': f.get('size'), + 'format_id': 'download', + 'url': update_url_query(authenticated_download_url, query), + }) + + self._sort_formats(formats) + + creator = f.get('created_by') or {} + + return { + 'id': file_id, + 'title': title, + 'formats': formats, + 'description': f.get('description') or None, + 'uploader': creator.get('name'), + 'timestamp': parse_iso8601(f.get('created_at')), + 'uploader_id': creator.get('id'), + } diff --git a/youtube_dlc/extractor/common.py b/youtube_dlc/extractor/common.py index 2bc94acdd..aacdf06fe 100644 --- a/youtube_dlc/extractor/common.py +++ b/youtube_dlc/extractor/common.py @@ -2597,6 +2597,7 @@ def _media_formats(src, cur_media_type, type_info={}): def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): formats = [] + hdcore_sign = 'hdcore=3.7.0' f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') hds_host = hosts.get('hds') @@ -2609,6 +2610,7 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): for entry in f4m_formats: entry.update({'extra_param_to_segment_url': hdcore_sign}) formats.extend(f4m_formats) + m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') hls_host = hosts.get('hls') if hls_host: @@ -2616,6 +2618,31 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 
'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + http_host = hosts.get('http') + if http_host and 'hdnea=' not in manifest_url: + REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+' + qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',') + qualities_length = len(qualities) + if len(formats) in (qualities_length + 1, qualities_length * 2 + 1): + i = 0 + http_formats = [] + for f in formats: + if f['protocol'] == 'm3u8_native' and f['vcodec'] != 'none': + for protocol in ('http', 'https'): + http_f = f.copy() + del http_f['manifest_url'] + http_url = re.sub( + REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url']) + http_f.update({ + 'format_id': http_f['format_id'].replace('hls-', protocol + '-'), + 'url': http_url, + 'protocol': protocol, + }) + http_formats.append(http_f) + i += 1 + formats.extend(http_formats) + return formats def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 15522f942..c50bdbb79 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -126,6 +126,7 @@ from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bostonglobe import BostonGlobeIE +from .box import BoxIE from .bpb import BpbIE from .br import ( BRIE, @@ -801,6 +802,7 @@ from .nytimes import ( NYTimesIE, NYTimesArticleIE, + NYTimesCookingIE, ) from .nuvid import NuvidIE from .nzz import NZZIE @@ -863,6 +865,10 @@ ) from .piksel import PikselIE from .pinkbike import PinkbikeIE +from .pinterest import ( + PinterestIE, + PinterestCollectionIE, +) from .pladform import PladformIE from .platzi import ( PlatziIE, @@ -981,6 +987,7 @@ from .rtvnh import RTVNHIE from .rtvs import RTVSIE from .ruhd import RUHDIE +from .rumble import RumbleEmbedIE from .rutube import ( RutubeIE, RutubeChannelIE, diff --git a/youtube_dlc/extractor/franceinter.py b/youtube_dlc/extractor/franceinter.py index 05806895c..ae822a50e 100644 --- a/youtube_dlc/extractor/franceinter.py +++ b/youtube_dlc/extractor/franceinter.py @@ -16,6 +16,7 @@ class FranceInterIE(InfoExtractor): 'ext': 'mp3', 'title': 'Affaire Cahuzac : le contentieux du compte en Suisse', 'description': 'md5:401969c5d318c061f86bda1fa359292b', + 'thumbnail': r're:^https?://.*\.jpg', 'upload_date': '20160907', }, } @@ -31,6 +32,7 @@ def _real_extract(self, url): title = self._og_search_title(webpage) description = self._og_search_description(webpage) + thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) upload_date_str = self._search_regex( r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<', @@ -48,6 +50,7 @@ def _real_extract(self, url): 'id': video_id, 'title': title, 'description': description, + 'thumbnail': thumbnail, 'upload_date': upload_date, 'formats': [{ 'url': video_url, diff --git a/youtube_dlc/extractor/lbry.py b/youtube_dlc/extractor/lbry.py index 0a7ee919c..6177297ab 100644 --- a/youtube_dlc/extractor/lbry.py +++ b/youtube_dlc/extractor/lbry.py @@ -16,7 +16,7 @@ class LBRYIE(InfoExtractor): IE_NAME = 'lbry.tv' - _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[0-9a-zA-Z-]+:[0-9a-z]+/[0-9a-zA-Z().-]+:[0-9a-z])' + _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[^:]+:[0-9a-z]+/[^:]+:[0-9a-z])' _TESTS = [{ # Video 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', @@ -44,6 +44,9 @@ class LBRYIE(InfoExtractor): }, { 'url': 
'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', 'only_matching': True, + }, { + 'url': "https://odysee.com/@ScammerRevolts:b0/I-SYSKEY'D-THE-SAME-SCAMMERS-3-TIMES!:b", + 'only_matching': True, }] def _call_api_proxy(self, method, display_id, params): diff --git a/youtube_dlc/extractor/nytimes.py b/youtube_dlc/extractor/nytimes.py index fc78ca56c..976b1c694 100644 --- a/youtube_dlc/extractor/nytimes.py +++ b/youtube_dlc/extractor/nytimes.py @@ -221,3 +221,41 @@ def _real_extract(self, url): r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'), webpage, 'podcast data') return self._extract_podcast_from_json(podcast_data, page_id, webpage) + + +class NYTimesCookingIE(NYTimesBaseIE): + _VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart', + 'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3', + 'info_dict': { + 'id': '100000004756089', + 'ext': 'mov', + 'timestamp': 1479383008, + 'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON', + 'title': 'Cranberry Tart', + 'upload_date': '20161117', + 'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.', + }, + }, { + 'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey', + 'md5': '4b2e8c70530a89b8d905a2b572316eb8', + 'info_dict': { + 'id': '100000003951728', + 'ext': 'mov', + 'timestamp': 1445509539, + 'description': 'Turkey guide', + 'upload_date': '20151022', + 'title': 'Turkey', + } + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + + webpage = self._download_webpage(url, page_id) + + video_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'video id') + + return self._extract_video_from_id(video_id) diff --git a/youtube_dlc/extractor/pinterest.py b/youtube_dlc/extractor/pinterest.py new file mode 100644 index 000000000..b249c9eda --- /dev/null +++ b/youtube_dlc/extractor/pinterest.py @@ -0,0 +1,201 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +class PinterestBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' + + def _call_api(self, resource, video_id, options): + return self._download_json( + 'https://www.pinterest.com/resource/%sResource/get/' % resource, + video_id, 'Download %s JSON metadata' % resource, query={ + 'data': json.dumps({'options': options}) + })['resource_response'] + + def _extract_video(self, data, extract_formats=True): + video_id = data['id'] + + title = (data.get('title') or data.get('grid_title') or video_id).strip() + + formats = [] + duration = None + if extract_formats: + for format_id, format_dict in data['videos']['video_list'].items(): + if not isinstance(format_dict, dict): + continue + format_url = url_or_none(format_dict.get('url')) + if not format_url: + continue + duration = float_or_none(format_dict.get('duration'), scale=1000) + ext = determine_ext(format_url) + if 'hls' in format_id.lower() or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', 
entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'width': int_or_none(format_dict.get('width')), + 'height': int_or_none(format_dict.get('height')), + 'duration': duration, + }) + self._sort_formats( + formats, field_preference=('height', 'width', 'tbr', 'format_id')) + + description = data.get('description') or data.get('description_html') or data.get('seo_description') + timestamp = unified_timestamp(data.get('created_at')) + + def _u(field): + return try_get(data, lambda x: x['closeup_attribution'][field], compat_str) + + uploader = _u('full_name') + uploader_id = _u('id') + + repost_count = int_or_none(data.get('repin_count')) + comment_count = int_or_none(data.get('comment_count')) + categories = try_get(data, lambda x: x['pin_join']['visual_annotation'], list) + tags = data.get('hashtags') + + thumbnails = [] + images = data.get('images') + if isinstance(images, dict): + for thumbnail_id, thumbnail in images.items(): + if not isinstance(thumbnail, dict): + continue + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'repost_count': repost_count, + 'comment_count': comment_count, + 'categories': categories, + 'tags': tags, + 'formats': formats, + 'extractor_key': PinterestIE.ie_key(), + } + + +class PinterestIE(PinterestBaseIE): + _VALID_URL = r'%s/pin/(?P<id>\d+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.com/pin/664281013778109217/', + 'md5': '6550c2af85d6d9f3fe3b88954d1577fc', + 'info_dict': { + 'id': '664281013778109217', + 'ext': 'mp4', + 'title': 'Origami', + 'description': 'md5:b9d90ddf7848e897882de9e73344f7dd', + 'duration': 57.7, + 'timestamp': 1593073622, + 'upload_date': '20200625', + 'uploader': 'Love origami -I am Dafei', + 'uploader_id': '586523688879454212', + 'repost_count': 50, + 'comment_count': 0, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://co.pinterest.com/pin/824721750502199491/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api( + 'Pin', video_id, { + 'field_set_key': 'unauth_react_main_pin', + 'id': video_id, + })['data'] + return self._extract_video(data) + + +class PinterestCollectionIE(PinterestBaseIE): + _VALID_URL = r'%s/(?P<username>[^/]+)/(?P<id>[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/', + 'info_dict': { + 'id': '585890301462791043', + 'title': 'cool diys', + }, + 'playlist_count': 8, + }, { + 'url': 'https://www.pinterest.ca/fudohub/videos/', + 'info_dict': { + 'id': '682858430939307450', + 'title': 'VIDEOS', + }, + 'playlist_mincount': 365, + 'skip': 'Test with extract_formats=False', + }] + + @classmethod + def suitable(cls, url): + return False if PinterestIE.suitable(url) else super( + PinterestCollectionIE, cls).suitable(url) + + def _real_extract(self, url): + username, slug = re.match(self._VALID_URL, url).groups() + board = self._call_api( + 'Board', slug, { + 'slug': slug, + 'username': username + })['data'] + board_id = board['id'] + options = { + 'board_id': 
board_id, + 'page_size': 250, + } + bookmark = None + entries = [] + while True: + if bookmark: + options['bookmarks'] = [bookmark] + board_feed = self._call_api('BoardFeed', board_id, options) + for item in (board_feed.get('data') or []): + if not isinstance(item, dict) or item.get('type') != 'pin': + continue + video_id = item.get('id') + if video_id: + # Some pins may not be available anonymously via pin URL + # video = self._extract_video(item, extract_formats=False) + # video.update({ + # '_type': 'url_transparent', + # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, + # }) + # entries.append(video) + entries.append(self._extract_video(item)) + bookmark = board_feed.get('bookmark') + if not bookmark: + break + return self.playlist_result( + entries, playlist_id=board_id, playlist_title=board.get('name')) diff --git a/youtube_dlc/extractor/rumble.py b/youtube_dlc/extractor/rumble.py new file mode 100644 index 000000000..4a0225109 --- /dev/null +++ b/youtube_dlc/extractor/rumble.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + parse_iso8601, + try_get, +) + + +class RumbleEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)' + _TESTS = [{ + 'url': 'https://rumble.com/embed/v5pv5f', + 'md5': '36a18a049856720189f30977ccbb2c34', + 'info_dict': { + 'id': 'v5pv5f', + 'ext': 'mp4', + 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', + 'timestamp': 1571611968, + 'upload_date': '20191020', + } + }, { + 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._download_json( + 'https://rumble.com/embedJS/', video_id, + query={'request': 'video', 'v': video_id}) + title = video['title'] + + formats = [] + for height, ua in (video.get('ua') or {}).items(): + for i in range(2): + f_url = try_get(ua, lambda x: x[i], compat_str) + if f_url: + ext = determine_ext(f_url) + f = { + 'ext': ext, + 'format_id': '%s-%sp' % (ext, height), + 'height': int_or_none(height), + 'url': f_url, + } + bitrate = try_get(ua, lambda x: x[i + 2]['bitrate']) + if bitrate: + f['tbr'] = int_or_none(bitrate) + formats.append(f) + self._sort_formats(formats) + + author = video.get('author') or {} + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': video.get('i'), + 'timestamp': parse_iso8601(video.get('pubDate')), + 'channel': author.get('name'), + 'channel_url': author.get('url'), + 'duration': int_or_none(video.get('duration')), + } From 70d5c17b0894642069a67a0659e16a9a814df38a Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Tue, 24 Nov 2020 03:17:42 +0530 Subject: [PATCH 115/124] Minor changes to make it easier to merge --- docs/supportedsites.md | 4 +- youtube_dlc/extractor/youtube.py | 108 +++++++++++++++++-------------- 2 files changed, 60 insertions(+), 52 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 45a546650..db2295572 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1153,7 +1153,7 @@ # Supported sites - **YourPorn** - **YourUpload** - **youtube**: YouTube.com - - **youtube:favorites**: YouTube.com liked videos, ":ytfav" or "LL" for short (requires authentication) + - **youtube:favorites**: YouTube.com liked videos, ":ytfav" for short (requires authentication) - 
**youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) - **youtube:playlist**: YouTube.com playlists - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) @@ -1162,7 +1162,7 @@ # Supported sites - **youtube:search_url**: YouTube.com search URLs - **youtube:subscriptions**: YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication) - **youtube:tab**: YouTube.com tab - - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" or "WL" for short (requires authentication) + - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) - **YoutubeYtUser**: YouTube.com user videos, URL or "ytuser" keyword - **Zapiks** - **Zaq1** diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 0f15b0189..e46614e4e 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -72,7 +72,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' + _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)' _YOUTUBE_CLIENT_HEADERS = { 'x-youtube-client-name': '1', @@ -1328,7 +1328,6 @@ def _get_ytplayer_config(self, video_id, webpage): # https://github.com/ytdl-org/youtube-dl/pull/7599) r';ytplayer\.config\s*=\s*({.+?});ytplayer', r';ytplayer\.config\s*=\s*({.+?});', - r'ytInitialPlayerResponse\s*=\s*({.+?});var meta' # Needed??? ) config = self._search_regex( patterns, webpage, 'ytplayer.config', default=None) @@ -2532,11 +2531,22 @@ def decrypt_sig(mobj): class YoutubeTabIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com tab' - _VALID_URL = (r'''(?x) - https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?: - (?!(%s)([/#?]|$))|channel/|c/|user/| - (?P<not_channel>playlist|watch)/?\?.*?\blist=) - (?P<id>[^/?#&]+)''') % YoutubeBaseInfoExtractor._RESERVED_NAMES + _VALID_URL = r'''(?x) + https?:// + (?:\w+\.)? 
+ (?: + youtube(?:kids)?\.com| + invidio\.us + )/ + (?: + (?:channel|c|user)/| + (?P<not_channel> + (?:playlist|watch)\?.*?\blist= + )| + (?!(%s)([/#?]|$)) # Direct URLs + ) + (?P<id>[^/?\#&]+) + ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES IE_NAME = 'youtube:tab' _TESTS = [{ @@ -2910,7 +2920,7 @@ def _itemSection_entries(self, item_sect_renderer): def _rich_entries(self, rich_grid_renderer): renderer = try_get( - rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) + rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {} video_id = renderer.get('videoId') if not video_id: return @@ -3008,14 +3018,14 @@ def _extract_continuation(cls, renderer): def _entries(self, tab, identity_token): - def extract_entries(parent_renderer): - slr_contents = try_get(parent_renderer, lambda x: x['contents'], list) or [] - for slr_content in slr_contents: - if not isinstance(slr_content, dict): + def extract_entries(parent_renderer): # this needs to called again for continuation to work with feeds + contents = try_get(parent_renderer, lambda x: x['contents'], list) or [] + for content in contents: + if not isinstance(content, dict): continue - is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) + is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict) if not is_renderer: - renderer = slr_content.get('richItemRenderer') + renderer = content.get('richItemRenderer') if renderer: for entry in self._rich_entries(renderer): yield entry @@ -3041,7 +3051,6 @@ def extract_entries(parent_renderer): if renderer: for entry in self._shelf_entries(renderer): yield entry - continuation_list[0] = self._extract_continuation(parent_renderer) continue renderer = isr_content.get('backstagePostThreadRenderer') if renderer: @@ -3054,18 +3063,19 @@ def extract_entries(parent_renderer): entry = self._video_entry(renderer) if entry: yield entry + if not continuation_list[0]: continuation_list[0] = self._extract_continuation(is_renderer) - if not continuation_list[0]: - continuation_list[0] = self._extract_continuation(parent_renderer) + + if not continuation_list[0]: + continuation_list[0] = self._extract_continuation(parent_renderer) continuation_list = [None] # Python 2 doesnot support nonlocal parent_renderer = ( try_get(tab, lambda x: x['sectionListRenderer'], dict) or try_get(tab, lambda x: x['richGridRenderer'], dict) or {}) - if parent_renderer: - for entry in extract_entries(parent_renderer): - yield entry + for entry in extract_entries(parent_renderer): + yield entry continuation = continuation_list[0] headers = { @@ -3078,8 +3088,6 @@ def extract_entries(parent_renderer): for page_num in itertools.count(1): if not continuation: break - if hasattr(self, '_MAX_PAGES') and page_num > self._MAX_PAGES: - break browse = self._download_json( 'https://www.youtube.com/browse_ajax', None, 'Downloading page %d' % page_num, @@ -3111,7 +3119,7 @@ def extract_entries(parent_renderer): yield entry continuation = self._extract_continuation(continuation_renderer) continue - continuation_renderer = continuation_contents.get('sectionListContinuation') + continuation_renderer = continuation_contents.get('sectionListContinuation') # for feeds if continuation_renderer: continuation_list = [None] for entry in extract_entries(continuation_renderer): @@ -3125,19 +3133,13 @@ def extract_entries(parent_renderer): continuation_item = continuation_items[0] if not isinstance(continuation_item, dict): continue - renderer = continuation_item.get('playlistVideoRenderer') + renderer 
= continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer') if renderer: video_list_renderer = {'contents': continuation_items} for entry in self._playlist_entries(video_list_renderer): yield entry continuation = self._extract_continuation(video_list_renderer) continue - renderer = continuation_item.get('itemSectionRenderer') - if renderer: - for entry in self._itemSection_entries(renderer): - yield entry - continuation = self._extract_continuation({'contents': continuation_items}) - continue break @staticmethod @@ -3175,7 +3177,7 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): selected_tab = self._extract_selected_tab(tabs) renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) - playlist_id = None + playlist_id = title = description = None if renderer: channel_title = renderer.get('title') or item_id tab_title = selected_tab.get('title') @@ -3191,7 +3193,9 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): description = None playlist_id = item_id if playlist_id is None: - return None + playlist_id = item_id + if title is None: + title = "Youtube " + playlist_id.title() playlist = self.playlist_result( self._entries(selected_tab['content'], identity_token), playlist_id=playlist_id, playlist_title=title, @@ -3212,7 +3216,7 @@ def _real_extract(self, url): url = compat_urlparse.urlunparse( compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url) - if is_home is not None and is_home.group('not_channel') is None: + if is_home is not None and is_home.group('not_channel') is None and item_id != 'feed': self._downloader.report_warning( 'A channel/user page was given. All the channel\'s videos will be downloaded. ' 'To download only the videos in the home page, add a "/home" to the URL') @@ -3365,6 +3369,25 @@ def _real_extract(self, url): ie=YoutubeTabIE.ie_key(), video_id=user_id) +class YoutubeFavouritesIE(InfoExtractor): + IE_NAME = 'youtube:favorites' + IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)' + _VALID_URL = r':ytfav(?:ou?rite)?s?' 
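# A quick, hypothetical sanity check of the pseudo-URL above (not part of
# the patch): ':ytfav', ':ytfavs', ':ytfavorite(s)' and ':ytfavourite(s)'
# all match, since _VALID_URL patterns are applied with re.match:
import re
for shortcut in (':ytfav', ':ytfavorites', ':ytfavourites'):
    assert re.match(r':ytfav(?:ou?rite)?s?', shortcut)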
+ _LOGIN_REQUIRED = True + _TESTS = [{ + 'url': ':ytfav', + 'only_matching': True, + }, { + 'url': ':ytfavorites', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result( + 'https://www.youtube.com/playlist?list=LL', + ie=YoutubeTabIE.ie_key()) + + class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com searches' # there doesn't appear to be a real limit, for example if you search for @@ -3536,9 +3559,9 @@ def _real_extract(self, url): class YoutubeWatchLaterIE(InfoExtractor): IE_NAME = 'youtube:watchlater' - IE_DESC = 'Youtube watch later list, ":ytwatchlater" or "WL" for short (requires authentication)' _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater|WL' + IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' _TESTS = [{ 'url': 'https://www.youtube.com/feed/watch_later', 'only_matching': True, @@ -3552,21 +3575,6 @@ def _real_extract(self, url): 'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key()) -class YoutubeFavouritesIE(InfoExtractor): - IE_NAME = 'youtube:favourites' - IE_DESC = 'YouTube.com liked videos, ":ytfav" or "LL" for short (requires authentication)' - _VALID_URL = r':ytfav(?:ou?rite)?s?|LL' - - _TESTS = [{ - 'url': ':ytfav', - 'only_matching': True, - }] - - def _real_extract(self, url): - return self.url_result( - 'https://www.youtube.com/playlist?list=LL', ie=YoutubeTabIE.ie_key()) - - class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' _VALID_URL = r'https?://(?:www\.)?youtube\.com(?:/feed/recommended|/?[?#]|/?$)|:ytrec(?:ommended)?' @@ -3575,8 +3583,8 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsub(?:scription)?s?' 
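# Note how YoutubeWatchLaterIE and YoutubeFavouritesIE above do no
# extraction of their own: they delegate through InfoExtractor.url_result(),
# which only builds a transparent redirect entry that the core then hands
# to YoutubeTabIE. Roughly (a simplified sketch, not the real helper):
def url_result_sketch(url, ie=None):
    # minimal shape of the dict url_result() returns
    return {'_type': 'url', 'url': url, 'ie_key': ie}
# e.g. url_result_sketch('https://www.youtube.com/playlist?list=WL', ie='YoutubeTab')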
+ IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)' _FEED_NAME = 'subscriptions' _PLAYLIST_TITLE = 'Youtube Subscriptions' From 3d3dddc94882c50f1c3ad15663bbd43cae7b0bea Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Tue, 24 Nov 2020 03:29:10 +0530 Subject: [PATCH 116/124] Update youtube extractor to 2020.11.24 --- test/test_all_urls.py | 7 +- youtube_dlc/extractor/youtube.py | 132 ++++++++++++++++++------------- 2 files changed, 82 insertions(+), 57 deletions(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 4784c633f..8dcdc4e58 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -64,9 +64,10 @@ def test_youtube_channel_matching(self): # self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab']) def test_youtube_feeds(self): - self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) - self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions']) - self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended']) + self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/history', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab']) # def test_youtube_search_matching(self): # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index e46614e4e..fd15d3865 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -2541,6 +2541,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): (?: (?:channel|c|user)/| (?P<not_channel> + feed/| (?:playlist|watch)\?.*?\blist= )| (?!(%s)([/#?]|$)) # Direct URLs @@ -2785,7 +2786,30 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', 'only_matching': True, - }, + }, { + 'url': 'https://www.youtube.com/feed/trending', + 'only_matching': True, + }, { + # needs auth + 'url': 'https://www.youtube.com/feed/library', + 'only_matching': True, + }, { + # needs auth + 'url': 'https://www.youtube.com/feed/history', + 'only_matching': True, + }, { + # needs auth + 'url': 'https://www.youtube.com/feed/subscriptions', + 'only_matching': True, + }, { + # needs auth + 'url': 'https://www.youtube.com/feed/watch_later', + 'only_matching': True, + }, { + # no longer available? 
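        # The /feed/ tests in this block are 'only_matching' on purpose: most
        # of these pages require authentication, so the tests only verify that
        # feed URLs are now routed to youtube:tab (via the 'feed/' alternative
        # added to the <not_channel> group in _VALID_URL above).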
+ 'url': 'https://www.youtube.com/feed/recommended', + 'only_matching': True, + } # TODO # { # 'url': 'https://www.youtube.com/TheYoungTurks/live', @@ -2872,27 +2896,34 @@ def _grid_entries(self, grid_renderer): 'https://www.youtube.com/channel/%s' % channel_id, ie=YoutubeTabIE.ie_key(), video_title=title) - def _shelf_entries_trimmed(self, shelf_renderer): - renderer = try_get( - shelf_renderer, lambda x: x['content']['horizontalListRenderer'], dict) - if not renderer: + def _shelf_entries_from_content(self, shelf_renderer): + content = shelf_renderer.get('content') + if not isinstance(content, dict): return - # TODO: add support for nested playlists so each shelf is processed - # as separate playlist - # TODO: this includes only first N items - for entry in self._grid_entries(renderer): - yield entry + renderer = content.get('gridRenderer') + if renderer: + # TODO: add support for nested playlists so each shelf is processed + # as separate playlist + # TODO: this includes only first N items + for entry in self._grid_entries(renderer): + yield entry + renderer = content.get('horizontalListRenderer') + if renderer: + # TODO + pass def _shelf_entries(self, shelf_renderer): ep = try_get( shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], compat_str) shelf_url = urljoin('https://www.youtube.com', ep) - if not shelf_url: - return - title = try_get( - shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str) - yield self.url_result(shelf_url, video_title=title) + if shelf_url: + title = try_get( + shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + yield self.url_result(shelf_url, video_title=title) + # Shelf may not contain shelf URL, fallback to extraction from content + for entry in self._shelf_entries_from_content(shelf_renderer): + yield entry def _playlist_entries(self, video_list_renderer): for content in video_list_renderer['contents']: @@ -2906,6 +2937,7 @@ def _playlist_entries(self, video_list_renderer): continue yield self._extract_video(renderer) + r""" # Not needed in the new implementation def _itemSection_entries(self, item_sect_renderer): for content in item_sect_renderer['contents']: if not isinstance(content, dict): @@ -2917,6 +2949,7 @@ def _itemSection_entries(self, item_sect_renderer): if not video_id: continue yield self._extract_video(renderer) + """ def _rich_entries(self, rich_grid_renderer): renderer = try_get( @@ -3369,7 +3402,7 @@ def _real_extract(self, url): ie=YoutubeTabIE.ie_key(), video_id=user_id) -class YoutubeFavouritesIE(InfoExtractor): +class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = 'youtube:favorites' IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)' _VALID_URL = r':ytfav(?:ou?rite)?s?' @@ -3515,7 +3548,7 @@ def _real_extract(self, url): class YoutubeFeedsInfoExtractor(YoutubeTabIE): """ Base class for feed extractors - Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. + Subclasses must define the _FEED_NAME property. 
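
    A hypothetical subclass needs little more than the two attributes, e.g.:

        class YoutubeExampleFeedIE(YoutubeFeedsInfoExtractor):
            _VALID_URL = r':ytexample'
            _FEED_NAME = 'example'

    _real_extract() then redirects the match to
    https://www.youtube.com/feed/example, which YoutubeTabIE handles.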
""" _LOGIN_REQUIRED = True # _MAX_PAGES = 5 @@ -3528,44 +3561,17 @@ def IE_NAME(self): def _real_initialize(self): self._login() - def _shelf_entries(self, shelf_renderer): - renderer = try_get(shelf_renderer, lambda x: x['content']['gridRenderer'], dict) - if not renderer: - return - for entry in self._grid_entries(renderer): - yield entry - - def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): - selected_tab = self._extract_selected_tab(tabs) - return self.playlist_result( - self._entries(selected_tab['content'], identity_token), - playlist_title=self._PLAYLIST_TITLE) - def _real_extract(self, url): - item_id = self._FEED_NAME - url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME - webpage = self._download_webpage(url, item_id) - identity_token = self._search_regex( - r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, - 'identity token', default=None) - data = self._extract_yt_initial_data(item_id, webpage) - tabs = try_get( - data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) - if tabs: - return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token) - # Failed to recognize - raise ExtractorError('Unable to recognize feed page') + return self.url_result( + 'https://www.youtube.com/feed/%s' % self._FEED_NAME, + ie=YoutubeTabIE.ie_key()) class YoutubeWatchLaterIE(InfoExtractor): IE_NAME = 'youtube:watchlater' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater|WL' - IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' + _VALID_URL = r':ytwatchlater' _TESTS = [{ - 'url': 'https://www.youtube.com/feed/watch_later', - 'only_matching': True, - }, { 'url': ':ytwatchlater', 'only_matching': True, }] @@ -3577,23 +3583,41 @@ def _real_extract(self, url): class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com(?:/feed/recommended|/?[?#]|/?$)|:ytrec(?:ommended)?' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?' _FEED_NAME = 'recommended' - _PLAYLIST_TITLE = 'Youtube Recommended videos' + _TESTS = [{ + 'url': ':ytrec', + 'only_matching': True, + }, { + 'url': ':ytrecommended', + 'only_matching': True, + }, { + 'url': 'https://youtube.com', + 'only_matching': True, + }] class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsub(?:scription)?s?' IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)' + _VALID_URL = r':ytsub(?:scription)?s?' 
_FEED_NAME = 'subscriptions' - _PLAYLIST_TITLE = 'Youtube Subscriptions' + _TESTS = [{ + 'url': ':ytsubs', + 'only_matching': True, + }, { + 'url': ':ytsubscriptions', + 'only_matching': True, + }] class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' + _VALID_URL = r':ythistory' _FEED_NAME = 'history' - _PLAYLIST_TITLE = 'Youtube History' + _TESTS = [{ + 'url': ':ythistory', + 'only_matching': True, + }] class YoutubeTruncatedURLIE(InfoExtractor): From 02ced43cbf763260ac35d4c92fa6fd5f89f69f72 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Wed, 25 Nov 2020 19:14:49 +0530 Subject: [PATCH 117/124] Print youtube's warning message (Closes #256) --- youtube_dlc/extractor/youtube.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index fd15d3865..540f35337 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -3244,6 +3244,21 @@ def _extract_from_playlist(self, item_id, data, playlist): self._playlist_entries(playlist), playlist_id=playlist_id, playlist_title=title) + def _extract_alerts(self, data): + for alert_dict in try_get(data, lambda x: x['alerts'], list) or []: + for renderer in alert_dict: + alert = alert_dict[renderer] + alert_type = alert.get('type') + if not alert_type: + continue + message = try_get(alert, lambda x: x['text']['simpleText'], compat_str) + if message: + yield alert_type, message + for run in try_get(alert, lambda x: x['text']['runs'], list) or []: + message = try_get(run, lambda x: x['text'], compat_str) + if message: + yield alert_type, message + def _real_extract(self, url): item_id = self._match_id(url) url = compat_urlparse.urlunparse( @@ -3269,6 +3284,8 @@ def _real_extract(self, url): r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, 'identity token', default=None) data = self._extract_yt_initial_data(item_id, webpage) + for alert_type, alert_message in self._extract_alerts(data): + self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message)) tabs = try_get( data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) if tabs: From 38d7028407b5db50e4d3c712d52b294ec1100c1f Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Thu, 26 Nov 2020 22:57:34 +0530 Subject: [PATCH 118/124] Updated to release 2020.11.26 --- docs/supportedsites.md | 7 +- youtube_dlc/downloader/fragment.py | 14 +- youtube_dlc/extractor/bbc.py | 57 +++- youtube_dlc/extractor/cda.py | 35 ++- youtube_dlc/extractor/extractors.py | 9 +- youtube_dlc/extractor/medaltv.py | 131 +++++++++ youtube_dlc/extractor/nrk.py | 442 ++++++++++++++++------------ youtube_dlc/extractor/spreaker.py | 176 +++++++++++ youtube_dlc/extractor/viki.py | 7 +- youtube_dlc/extractor/vlive.py | 437 ++++++++++++--------------- youtube_dlc/extractor/youtube.py | 74 ++--- 11 files changed, 896 insertions(+), 493 deletions(-) create mode 100644 youtube_dlc/extractor/medaltv.py create mode 100644 youtube_dlc/extractor/spreaker.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index db2295572..ad11521f7 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -477,6 +477,7 @@ # Supported sites - **massengeschmack.tv** - **MatchTV** - **MDR**: MDR.DE and KiKA + - **MedalTV** - **media.ccc.de** - **media.ccc.de:lists** - **Medialaan** @@ -846,6 +847,10 
@@ # Supported sites - **Sport5** - **SportBox** - **SportDeutschland** + - **Spreaker** + - **SpreakerPage** + - **SpreakerShow** + - **SpreakerShowPage** - **SpringboardPlatform** - **Sprout** - **sr:mediathek**: Saarländischer Rundfunk @@ -1064,7 +1069,7 @@ # Supported sites - **vk:wallpost** - **vlive** - **vlive:channel** - - **vlive:playlist** + - **vlive:post** - **Vodlocker** - **VODPl** - **VODPlatform** diff --git a/youtube_dlc/downloader/fragment.py b/youtube_dlc/downloader/fragment.py index 9339b3a62..cf4fd41da 100644 --- a/youtube_dlc/downloader/fragment.py +++ b/youtube_dlc/downloader/fragment.py @@ -97,12 +97,15 @@ def _write_ytdl_file(self, ctx): def _download_fragment(self, ctx, frag_url, info_dict, headers=None): fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index']) - success = ctx['dl'].download(fragment_filename, { + fragment_info_dict = { 'url': frag_url, 'http_headers': headers or info_dict.get('http_headers'), - }) + } + success = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False, None + if fragment_info_dict.get('filetime'): + ctx['fragment_filetime'] = fragment_info_dict.get('filetime') down, frag_sanitized = sanitize_open(fragment_filename, 'rb') ctx['fragment_filename_sanitized'] = frag_sanitized frag_content = down.read() @@ -258,6 +261,13 @@ def _finish_frag_download(self, ctx): downloaded_bytes = ctx['complete_frags_downloaded_bytes'] else: self.try_rename(ctx['tmpfilename'], ctx['filename']) + if self.params.get('updatetime', True): + filetime = ctx.get('fragment_filetime') + if filetime: + try: + os.utime(ctx['filename'], (time.time(), filetime)) + except Exception: + pass downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) self._hook_progress({ diff --git a/youtube_dlc/extractor/bbc.py b/youtube_dlc/extractor/bbc.py index 002c39c39..54cbcdc8e 100644 --- a/youtube_dlc/extractor/bbc.py +++ b/youtube_dlc/extractor/bbc.py @@ -981,7 +981,7 @@ def _real_extract(self, url): group_id = self._search_regex( r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, webpage, 'group id', default=None) - if playlist_id: + if group_id: return self.url_result( 'https://www.bbc.co.uk/programmes/%s' % group_id, ie=BBCCoUkIE.ie_key()) @@ -1092,10 +1092,26 @@ def _real_extract(self, url): self._search_regex( r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, 'bbcthree config', default='{}'), - playlist_id, transform_source=js_to_json, fatal=False) - if bbc3_config: + playlist_id, transform_source=js_to_json, fatal=False) or {} + payload = bbc3_config.get('payload') or {} + if payload: + clip = payload.get('currentClip') or {} + clip_vpid = clip.get('vpid') + clip_title = clip.get('title') + if clip_vpid and clip_title: + formats, subtitles = self._download_media_selector(clip_vpid) + self._sort_formats(formats) + return { + 'id': clip_vpid, + 'title': clip_title, + 'thumbnail': dict_get(clip, ('poster', 'imageUrl')), + 'description': clip.get('description'), + 'duration': parse_duration(clip.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + } bbc3_playlist = try_get( - bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'], + payload, lambda x: x['content']['bbcMedia']['playlist'], dict) if bbc3_playlist: playlist_title = bbc3_playlist.get('title') or playlist_title @@ -1118,6 +1134,39 @@ def _real_extract(self, url): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) + initial_data = 
self._parse_json(self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), playlist_id, fatal=False) + if initial_data: + def parse_media(media): + if not media: + return + for item in (try_get(media, lambda x: x['media']['items'], list) or []): + item_id = item.get('id') + item_title = item.get('title') + if not (item_id and item_title): + continue + formats, subtitles = self._download_media_selector(item_id) + self._sort_formats(formats) + entries.append({ + 'id': item_id, + 'title': item_title, + 'thumbnail': item.get('holdingImageUrl'), + 'formats': formats, + 'subtitles': subtitles, + }) + for resp in (initial_data.get('data') or {}).values(): + name = resp.get('name') + if name == 'media-experience': + parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) + elif name == 'article': + for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): + if block.get('type') != 'media': + continue + parse_media(block.get('model')) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), diff --git a/youtube_dlc/extractor/cda.py b/youtube_dlc/extractor/cda.py index 0c3af23d5..d67900e62 100644 --- a/youtube_dlc/extractor/cda.py +++ b/youtube_dlc/extractor/cda.py @@ -5,10 +5,16 @@ import re from .common import InfoExtractor +from ..compat import ( + compat_chr, + compat_ord, + compat_urllib_parse_unquote, +) from ..utils import ( ExtractorError, float_or_none, int_or_none, + merge_dicts, multipart_encode, parse_duration, random_birthday, @@ -107,8 +113,9 @@ def _real_extract(self, url): r'Odsłony:(?:\s| )*([0-9]+)', webpage, 'view_count', default=None) average_rating = self._search_regex( - r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)', - webpage, 'rating', fatal=False, group='rating_value') + (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)', + r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False, + group='rating_value') info_dict = { 'id': video_id, @@ -123,6 +130,24 @@ def _real_extract(self, url): 'age_limit': 18 if need_confirm_age else 0, } + # Source: https://www.cda.pl/js/player.js?t=1606154898 + def decrypt_file(a): + for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'): + a = a.replace(p, '') + a = compat_urllib_parse_unquote(a) + b = [] + for c in a: + f = compat_ord(c) + b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f and 126 >= f else compat_chr(f)) + a = ''.join(b) + a = a.replace('.cda.mp4', '') + for p in ('.2cda.pl', '.3cda.pl'): + a = a.replace(p, '.cda.pl') + if '/upstream' in a: + a = a.replace('/upstream', '.mp4/upstream') + return 'https://' + a + return 'https://' + a + '.mp4' + def extract_format(page, version): json_str = self._html_search_regex( r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page, @@ -141,6 +166,8 @@ def extract_format(page, version): video['file'] = codecs.decode(video['file'], 'rot_13') if video['file'].endswith('adc.mp4'): video['file'] = video['file'].replace('adc.mp4', '.mp4') + elif not video['file'].startswith('http'): + video['file'] = decrypt_file(video['file']) f = { 'url': video['file'], } @@ -179,4 +206,6 @@ def extract_format(page, version): self._sort_formats(formats) - return info_dict + info = self._search_json_ld(webpage, video_id, default={}) + + return 
merge_dicts(info_dict, info) diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index c50bdbb79..9fe458038 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -620,6 +620,7 @@ from .massengeschmacktv import MassengeschmackTVIE from .matchtv import MatchTVIE from .mdr import MDRIE +from .medaltv import MedalTVIE from .mediaset import MediasetIE from .mediasite import ( MediasiteIE, @@ -1102,6 +1103,12 @@ from .sport5 import Sport5IE from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE +from .spreaker import ( + SpreakerIE, + SpreakerPageIE, + SpreakerShowIE, + SpreakerShowPageIE, +) from .springboardplatform import SpringboardPlatformIE from .sprout import SproutIE from .srgssr import ( @@ -1395,8 +1402,8 @@ ) from .vlive import ( VLiveIE, + VLivePostIE, VLiveChannelIE, - VLivePlaylistIE ) from .vodlocker import VodlockerIE from .vodpl import VODPlIE diff --git a/youtube_dlc/extractor/medaltv.py b/youtube_dlc/extractor/medaltv.py new file mode 100644 index 000000000..1603b55f6 --- /dev/null +++ b/youtube_dlc/extractor/medaltv.py @@ -0,0 +1,131 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + str_or_none, + try_get, +) + + +class MedalTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://medal.tv/clips/34934644/3Is9zyGMoBMr', + 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', + 'info_dict': { + 'id': '34934644', + 'ext': 'mp4', + 'title': 'Quad Cold', + 'description': 'Medal,https://medal.tv/desktop/', + 'uploader': 'MowgliSB', + 'timestamp': 1603165266, + 'upload_date': '20201020', + 'uploader_id': 10619174, + } + }, { + 'url': 'https://medal.tv/clips/36787208', + 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', + 'info_dict': { + 'id': '36787208', + 'ext': 'mp4', + 'title': 'u tk me i tk u bigger', + 'description': 'Medal,https://medal.tv/desktop/', + 'uploader': 'Mimicc', + 'timestamp': 1605580939, + 'upload_date': '20201117', + 'uploader_id': 5156321, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + hydration_data = self._parse_json(self._search_regex( + r'<script[^>]*>\s*(?:var\s*)?hydrationData\s*=\s*({.+?})\s*</script>', + webpage, 'hydration data', default='{}'), video_id) + + clip = try_get( + hydration_data, lambda x: x['clips'][video_id], dict) or {} + if not clip: + raise ExtractorError( + 'Could not find video information.', video_id=video_id) + + title = clip['contentTitle'] + + source_width = int_or_none(clip.get('sourceWidth')) + source_height = int_or_none(clip.get('sourceHeight')) + + aspect_ratio = source_width / source_height if source_width and source_height else 16 / 9 + + def add_item(container, item_url, height, id_key='format_id', item_id=None): + item_id = item_id or '%dp' % height + if item_id not in item_url: + return + width = int(round(aspect_ratio * height)) + container.append({ + 'url': item_url, + id_key: item_id, + 'width': width, + 'height': height + }) + + formats = [] + thumbnails = [] + for k, v in clip.items(): + if not (v and isinstance(v, compat_str)): + continue + mobj = re.match(r'(contentUrl|thumbnail)(?:(\d+)p)?$', k) + if not mobj: + continue + prefix = mobj.group(1) + height = int_or_none(mobj.group(2)) + if prefix == 'contentUrl': + 
add_item( + formats, v, height or source_height, + item_id=None if height else 'source') + elif prefix == 'thumbnail': + add_item(thumbnails, v, height, 'id') + + error = clip.get('error') + if not formats and error: + if error == 404: + raise ExtractorError( + 'That clip does not exist.', + expected=True, video_id=video_id) + else: + raise ExtractorError( + 'An unknown error occurred ({0}).'.format(error), + video_id=video_id) + + self._sort_formats(formats) + + # Necessary because the id of the author is not known in advance. + # Won't raise an issue if no profile can be found as this is optional. + author = try_get( + hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {} + author_id = str_or_none(author.get('id')) + author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': clip.get('contentDescription'), + 'uploader': author.get('displayName'), + 'timestamp': float_or_none(clip.get('created'), 1000), + 'uploader_id': author_id, + 'uploader_url': author_url, + 'duration': int_or_none(clip.get('videoLengthSeconds')), + 'view_count': int_or_none(clip.get('views')), + 'like_count': int_or_none(clip.get('likes')), + 'comment_count': int_or_none(clip.get('comments')), + } diff --git a/youtube_dlc/extractor/nrk.py b/youtube_dlc/extractor/nrk.py index 84aacbcda..4a395546f 100644 --- a/youtube_dlc/extractor/nrk.py +++ b/youtube_dlc/extractor/nrk.py @@ -9,6 +9,7 @@ compat_urllib_parse_unquote, ) from ..utils import ( + determine_ext, ExtractorError, int_or_none, js_to_json, @@ -16,17 +17,269 @@ parse_age_limit, parse_duration, try_get, + url_or_none, ) class NRKBaseIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] - _api_host = None + +class NRKIE(NRKBaseIE): + _VALID_URL = r'''(?x) + (?: + nrk:| + https?:// + (?: + (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| + v8[-.]psapi\.nrk\.no/mediaelement/ + ) + ) + (?P<id>[^?\#&]+) + ''' + + _TESTS = [{ + # video + 'url': 'http://www.nrk.no/video/PS*150533', + 'md5': '706f34cdf1322577589e369e522b50ef', + 'info_dict': { + 'id': '150533', + 'ext': 'mp4', + 'title': 'Dompap og andre fugler i Piip-Show', + 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', + 'duration': 262, + } + }, { + # audio + 'url': 'http://www.nrk.no/video/PS*154915', + # MD5 is unstable + 'info_dict': { + 'id': '154915', + 'ext': 'flv', + 'title': 'Slik høres internett ut når du er blind', + 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', + 'duration': 20, + } + }, { + 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', + 'only_matching': True, + }, { + 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', + 'only_matching': True, + }] + + def _extract_from_playback(self, video_id): + manifest = self._download_json( + 'http://psapi.nrk.no/playback/manifest/%s' % video_id, + video_id, 'Downloading manifest JSON') + + playable = manifest['playable'] + + formats = [] + for asset in playable['assets']: + if not isinstance(asset, dict): + continue + if asset.get('encrypted'): + continue + format_url = url_or_none(asset.get('url')) + if not format_url: + continue + if 
asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + data = self._download_json( + 'http://psapi.nrk.no/playback/metadata/%s' % video_id, + video_id, 'Downloading metadata JSON') + + preplay = data['preplay'] + titles = preplay['titles'] + title = titles['title'] + alt_title = titles.get('subtitle') + + description = preplay.get('description') + duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) + + thumbnails = [] + for image in try_get( + preplay, lambda x: x['poster']['images'], list) or []: + if not isinstance(image, dict): + continue + image_url = url_or_none(image.get('url')) + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('pixelWidth')), + 'height': int_or_none(image.get('pixelHeight')), + }) + + return { + 'id': video_id, + 'title': title, + 'alt_title': alt_title, + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats, + } def _real_extract(self, url): video_id = self._match_id(url) + return self._extract_from_playback(video_id) + +class NRKTVIE(NRKBaseIE): + IE_DESC = 'NRK TV and NRK Radio' + _EPISODE_RE = r'(?P<id>[a-zA-Z]{4}\d{8})' + _VALID_URL = r'''(?x) + https?:// + (?:tv|radio)\.nrk(?:super)?\.no/ + (?:serie(?:/[^/]+){1,2}|program)/ + (?![Ee]pisodes)%s + (?:/\d{2}-\d{2}-\d{4})? + (?:\#del=(?P<part_id>\d+))? + ''' % _EPISODE_RE + _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') + _TESTS = [{ + 'url': 'https://tv.nrk.no/program/MDDP12000117', + 'md5': '8270824df46ec629b66aeaa5796b36fb', + 'info_dict': { + 'id': 'MDDP12000117AA', + 'ext': 'mp4', + 'title': 'Alarm Trolltunga', + 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', + 'duration': 2223, + 'age_limit': 6, + }, + }, { + 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', + 'md5': '9a167e54d04671eb6317a37b7bc8a280', + 'info_dict': { + 'id': 'MUHH48000314AA', + 'ext': 'mp4', + 'title': '20 spørsmål 23.05.2014', + 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', + 'duration': 1741, + 'series': '20 spørsmål', + 'episode': '23.05.2014', + }, + 'skip': 'NoProgramRights', + }, { + 'url': 'https://tv.nrk.no/program/mdfp15000514', + 'info_dict': { + 'id': 'MDFP15000514CA', + 'ext': 'mp4', + 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', + 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', + 'duration': 4605, + 'series': 'Kunnskapskanalen', + 'episode': '24.05.2014', + }, + 'params': { + 'skip_download': True, + }, + }, { + # single playlist video + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', + 'info_dict': { + 'id': 'MSPO40010515-part2', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Video is geo restricted'], + 'skip': 'particular part is not supported currently', + }, { + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', + 'playlist': [{ + 'info_dict': { + 'id': 'MSPO40010515AH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + 'duration': 772, + 'series': 'Tour de Ski', + 'episode': '06.01.2015', + }, + 
'params': { + 'skip_download': True, + }, + }, { + 'info_dict': { + 'id': 'MSPO40010515BH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + 'duration': 6175, + 'series': 'Tour de Ski', + 'episode': '06.01.2015', + }, + 'params': { + 'skip_download': True, + }, + }], + 'info_dict': { + 'id': 'MSPO40010515', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + }, + 'expected_warnings': ['Video is geo restricted'], + }, { + 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', + 'info_dict': { + 'id': 'KMTE50001317AA', + 'ext': 'mp4', + 'title': 'Anno 13:30', + 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', + 'duration': 2340, + 'series': 'Anno', + 'episode': '13:30', + 'season_number': 3, + 'episode_number': 13, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', + 'info_dict': { + 'id': 'MUHH46000317AA', + 'ext': 'mp4', + 'title': 'Nytt på Nytt 27.01.2017', + 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', + 'duration': 1796, + 'series': 'Nytt på nytt', + 'episode': '27.01.2017', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', + 'only_matching': True, + }] + + _api_host = None + + def _extract_from_mediaelement(self, video_id): api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS for api_host in api_hosts: @@ -195,190 +448,9 @@ def video_id_and_title(idx): return self.playlist_result(entries, video_id, title, description) - -class NRKIE(NRKBaseIE): - _VALID_URL = r'''(?x) - (?: - nrk:| - https?:// - (?: - (?:www\.)?nrk\.no/video/PS\*| - v8[-.]psapi\.nrk\.no/mediaelement/ - ) - ) - (?P<id>[^?#&]+) - ''' - _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no') - _TESTS = [{ - # video - 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': '706f34cdf1322577589e369e522b50ef', - 'info_dict': { - 'id': '150533', - 'ext': 'mp4', - 'title': 'Dompap og andre fugler i Piip-Show', - 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 262, - } - }, { - # audio - 'url': 'http://www.nrk.no/video/PS*154915', - # MD5 is unstable - 'info_dict': { - 'id': '154915', - 'ext': 'flv', - 'title': 'Slik høres internett ut når du er blind', - 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', - 'duration': 20, - } - }, { - 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', - 'only_matching': True, - }, { - 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }] - - -class NRKTVIE(NRKBaseIE): - IE_DESC = 'NRK TV and NRK Radio' - _EPISODE_RE = r'(?P<id>[a-zA-Z]{4}\d{8})' - _VALID_URL = r'''(?x) - https?:// - (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie(?:/[^/]+){1,2}|program)/ - (?![Ee]pisodes)%s - (?:/\d{2}-\d{2}-\d{4})? - (?:\#del=(?P<part_id>\d+))? 
- ''' % _EPISODE_RE - _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') - _TESTS = [{ - 'url': 'https://tv.nrk.no/program/MDDP12000117', - 'md5': '8270824df46ec629b66aeaa5796b36fb', - 'info_dict': { - 'id': 'MDDP12000117AA', - 'ext': 'mp4', - 'title': 'Alarm Trolltunga', - 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', - 'duration': 2223, - 'age_limit': 6, - }, - }, { - 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': '9a167e54d04671eb6317a37b7bc8a280', - 'info_dict': { - 'id': 'MUHH48000314AA', - 'ext': 'mp4', - 'title': '20 spørsmål 23.05.2014', - 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', - 'duration': 1741, - 'series': '20 spørsmål', - 'episode': '23.05.2014', - }, - 'skip': 'NoProgramRights', - }, { - 'url': 'https://tv.nrk.no/program/mdfp15000514', - 'info_dict': { - 'id': 'MDFP15000514CA', - 'ext': 'mp4', - 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', - 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', - 'duration': 4605, - 'series': 'Kunnskapskanalen', - 'episode': '24.05.2014', - }, - 'params': { - 'skip_download': True, - }, - }, { - # single playlist video - 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', - 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Video is geo restricted'], - 'skip': 'particular part is not supported currently', - }, { - 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', - 'playlist': [{ - 'info_dict': { - 'id': 'MSPO40010515AH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 772, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'info_dict': { - 'id': 'MSPO40010515BH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 6175, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }], - 'info_dict': { - 'id': 'MSPO40010515', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - }, - 'expected_warnings': ['Video is geo restricted'], - }, { - 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', - 'info_dict': { - 'id': 'KMTE50001317AA', - 'ext': 'mp4', - 'title': 'Anno 13:30', - 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', - 'duration': 2340, - 'series': 'Anno', - 'episode': '13:30', - 'season_number': 3, - 'episode_number': 13, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', - 'info_dict': { - 'id': 'MUHH46000317AA', - 'ext': 'mp4', - 'title': 'Nytt på Nytt 27.01.2017', - 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', - 'duration': 1796, - 'series': 'Nytt på nytt', - 'episode': '27.01.2017', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', - 'only_matching': True, - }] + def _real_extract(self, url): + video_id = 
self._match_id(url) + return self._extract_from_mediaelement(video_id) class NRKTVEpisodeIE(InfoExtractor): diff --git a/youtube_dlc/extractor/spreaker.py b/youtube_dlc/extractor/spreaker.py new file mode 100644 index 000000000..beee6670c --- /dev/null +++ b/youtube_dlc/extractor/spreaker.py @@ -0,0 +1,176 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + int_or_none, + str_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +def _extract_episode(data, episode_id=None): + title = data['title'] + download_url = data['download_url'] + + series = try_get(data, lambda x: x['show']['title'], compat_str) + uploader = try_get(data, lambda x: x['author']['fullname'], compat_str) + + thumbnails = [] + for image in ('image_original', 'image_medium', 'image'): + image_url = url_or_none(data.get('%s_url' % image)) + if image_url: + thumbnails.append({'url': image_url}) + + def stats(key): + return int_or_none(try_get( + data, + (lambda x: x['%ss_count' % key], + lambda x: x['stats']['%ss' % key]))) + + def duration(key): + return float_or_none(data.get(key), scale=1000) + + return { + 'id': compat_str(episode_id or data['episode_id']), + 'url': download_url, + 'display_id': data.get('permalink'), + 'title': title, + 'description': data.get('description'), + 'timestamp': unified_timestamp(data.get('published_at')), + 'uploader': uploader, + 'uploader_id': str_or_none(data.get('author_id')), + 'creator': uploader, + 'duration': duration('duration') or duration('length'), + 'view_count': stats('play'), + 'like_count': stats('like'), + 'comment_count': stats('message'), + 'format': 'MPEG Layer 3', + 'format_id': 'mp3', + 'container': 'mp3', + 'ext': 'mp3', + 'thumbnails': thumbnails, + 'series': series, + 'extractor_key': SpreakerIE.ie_key(), + } + + +class SpreakerIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + api\.spreaker\.com/ + (?: + (?:download/)?episode| + v2/episodes + )/ + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'https://api.spreaker.com/episode/12534508', + 'info_dict': { + 'id': '12534508', + 'display_id': 'swm-ep15-how-to-market-your-music-part-2', + 'ext': 'mp3', + 'title': 'EP:15 | Music Marketing (Likes) - Part 2', + 'description': 'md5:0588c43e27be46423e183076fa071177', + 'timestamp': 1502250336, + 'upload_date': '20170809', + 'uploader': 'SWM', + 'uploader_id': '9780658', + 'duration': 1063.42, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'series': 'Success With Music (SWM)', + }, + }, { + 'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3', + 'only_matching': True, + }, { + 'url': 'https://api.spreaker.com/v2/episodes/12534508?export=episode_segments', + 'only_matching': True, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + data = self._download_json( + 'https://api.spreaker.com/v2/episodes/%s' % episode_id, + episode_id)['response']['episode'] + return _extract_episode(data, episode_id) + + +class SpreakerPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spreaker\.com/user/[^/]+/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/user/9780658/swm-ep15-how-to-market-your-music-part-2', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + episode_id = self._search_regex( + 
(r'data-episode_id=["\'](?P<id>\d+)', + r'episode_id\s*:\s*(?P<id>\d+)'), webpage, 'episode id') + return self.url_result( + 'https://api.spreaker.com/episode/%s' % episode_id, + ie=SpreakerIE.ie_key(), video_id=episode_id) + + +class SpreakerShowIE(InfoExtractor): + _VALID_URL = r'https?://api\.spreaker\.com/show/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/show/3-ninjas-podcast', + 'info_dict': { + 'id': '4652058', + }, + 'playlist_mincount': 118, + }] + + def _entries(self, show_id): + for page_num in itertools.count(1): + episodes = self._download_json( + 'https://api.spreaker.com/show/%s/episodes' % show_id, + show_id, note='Downloading JSON page %d' % page_num, query={ + 'page': page_num, + 'max_per_page': 100, + }) + pager = try_get(episodes, lambda x: x['response']['pager'], dict) + if not pager: + break + results = pager.get('results') + if not results or not isinstance(results, list): + break + for result in results: + if not isinstance(result, dict): + continue + yield _extract_episode(result) + if page_num == pager.get('last_page'): + break + + def _real_extract(self, url): + show_id = self._match_id(url) + return self.playlist_result(self._entries(show_id), playlist_id=show_id) + + +class SpreakerShowPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spreaker\.com/show/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/show/success-with-music', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + show_id = self._search_regex( + r'show_id\s*:\s*(?P<id>\d+)', webpage, 'show id') + return self.url_result( + 'https://api.spreaker.com/show/%s' % show_id, + ie=SpreakerShowIE.ie_key(), video_id=show_id) diff --git a/youtube_dlc/extractor/viki.py b/youtube_dlc/extractor/viki.py index 2e3794344..09da4338d 100644 --- a/youtube_dlc/extractor/viki.py +++ b/youtube_dlc/extractor/viki.py @@ -21,6 +21,7 @@ parse_age_limit, parse_iso8601, sanitized_Request, + std_headers, ) @@ -227,8 +228,10 @@ def _real_extract(self, url): resp = self._download_json( 'https://www.viki.com/api/videos/' + video_id, - video_id, 'Downloading video JSON', - headers={'x-viki-app-ver': '4.0.57'}) + video_id, 'Downloading video JSON', headers={ + 'x-client-user-agent': std_headers['User-Agent'], + 'x-viki-app-ver': '4.0.57', + }) video = resp['video'] self._check_errors(video) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index 935560b57..223709b1e 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -1,55 +1,50 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import time import itertools +import json -from .common import InfoExtractor from .naver import NaverBaseIE -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( ExtractorError, + int_or_none, merge_dicts, + str_or_none, + strip_or_none, try_get, urlencode_postdata, ) -class VLiveIE(NaverBaseIE): +class VLiveBaseIE(NaverBaseIE): + _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + + +class VLiveIE(VLiveBaseIE): IE_NAME = 'vlive' - _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|post)/(?P<id>(?:\d-)?[0-9]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)' _NETRC_MACHINE = 'vlive' _TESTS = [{ - 'url': 'https://www.vlive.tv/video/1326', + 'url': 'http://www.vlive.tv/video/1326', 'md5': 'cc7314812855ce56de70a06a27314983', 'info_dict': { 
'id': '1326', 'ext': 'mp4', - 'title': "[V LIVE] Girl's Day's Broadcast", + 'title': "Girl's Day's Broadcast", 'creator': "Girl's Day", 'view_count': int, 'uploader_id': 'muploader_a', }, - }, - { - 'url': 'https://vlive.tv/post/1-18244258', - 'md5': 'cc7314812855ce56de70a06a27314983', - 'info_dict': { - 'id': '1326', - 'ext': 'mp4', - 'title': "[V LIVE] Girl's Day's Broadcast", - 'creator': "Girl's Day", - 'view_count': int, - 'uploader_id': 'muploader_a', - }, - }, - { - 'url': 'https://www.vlive.tv/video/16937', + }, { + 'url': 'http://www.vlive.tv/video/16937', 'info_dict': { 'id': '16937', 'ext': 'mp4', - 'title': '[V LIVE] 첸백시 걍방', + 'title': '첸백시 걍방', 'creator': 'EXO', 'view_count': int, 'subtitles': 'mincount:12', @@ -70,12 +65,15 @@ class VLiveIE(NaverBaseIE): 'subtitles': 'mincount:10', }, 'skip': 'This video is only available for CH+ subscribers', + }, { + 'url': 'https://www.vlive.tv/embed/1326', + 'only_matching': True, + }, { + # works only with gcc=KR + 'url': 'https://www.vlive.tv/video/225019', + 'only_matching': True, }] - @classmethod - def suitable(cls, url): - return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url) - def _real_initialize(self): self._login() @@ -107,118 +105,159 @@ def is_logged_in(): if not is_logged_in(): raise ExtractorError('Unable to log in', expected=True) + def _call_api(self, path_template, video_id, fields=None): + query = {'appId': self._APP_ID, 'gcc': 'KR'} + if fields: + query['fields'] = fields + try: + return self._download_json( + 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id, + 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0], + headers={'Referer': 'https://www.vlive.tv/'}, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_login_required(json.loads(e.cause.read().decode())['message']) + raise + def _real_extract(self, url): - # url may match on a post or a video url with a post_id potentially matching a video_id - working_id = self._match_id(url) - webpage = self._download_webpage(url, working_id) + video_id = self._match_id(url) - PARAMS_RE = r'window\.__PRELOADED_STATE__\s*=\s*({.*});?\s*</script>' - PARAMS_FIELD = 'params' + post = self._call_api( + 'post/v1.0/officialVideoPost-%s', video_id, + 'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId}') - params = self._search_regex( - PARAMS_RE, webpage, PARAMS_FIELD, default='', flags=re.DOTALL) - params = self._parse_json(params, working_id, fatal=False) + video = post['officialVideo'] - video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"], dict) + def get_common_fields(): + channel = post.get('channel') or {} + return { + 'title': video.get('title'), + 'creator': post.get('author', {}).get('nickname'), + 'channel': channel.get('channelName'), + 'channel_id': channel.get('channelCode'), + 'duration': int_or_none(video.get('playTime')), + 'view_count': int_or_none(video.get('playCount')), + 'like_count': int_or_none(video.get('likeCount')), + 'comment_count': int_or_none(video.get('commentCount')), + } - if video_params is None: - error = try_get(params, lambda x: x["postDetail"]["error"], dict) - error_data = try_get(error, lambda x: x["data"], dict) - error_video = try_get(error_data, lambda x: x["officialVideo"], dict) - error_msg = try_get(error, lambda x: x["message"], compat_str) - product_type = 
try_get(error_data, - [lambda x: x["officialVideo"]["productType"], - lambda x: x["board"]["boardType"]], - compat_str) - - if error_video is not None: - if product_type in ('VLIVE_PLUS', 'VLIVE+'): - self.raise_login_required('This video is only available with V LIVE+.') - elif error_msg is not None: - raise ExtractorError('V LIVE reported the following error: %s' % error_msg) - else: - raise ExtractorError('Failed to extract video parameters.') - elif 'post' in url: - raise ExtractorError('Url does not appear to be a video post.', expected=True) - else: - raise ExtractorError('Failed to extract video parameters.') - - video_id = working_id if 'video' in url else str(video_params["videoSeq"]) - - video_type = video_params["type"] - if video_type in ('VOD'): - encoding_status = video_params["encodingStatus"] - if encoding_status == 'COMPLETE': - return self._replay(video_id, webpage, params, video_params) - else: - raise ExtractorError('VOD encoding not yet complete. Please try again later.', - expected=True) - elif video_type in ('LIVE'): - video_status = video_params["status"] - if video_status in ('RESERVED'): + video_type = video.get('type') + if video_type == 'VOD': + inkey = self._call_api('video/v1.0/vod/%s/inkey', video_id)['inkey'] + vod_id = video['vodId'] + return merge_dicts( + get_common_fields(), + self._extract_video_info(video_id, vod_id, inkey)) + elif video_type == 'LIVE': + status = video.get('status') + if status == 'ON_AIR': + stream_url = self._call_api( + 'old/v3/live/%s/playInfo', + video_id)['result']['adaptiveStreamUrl'] + formats = self._extract_m3u8_formats(stream_url, video_id, 'mp4') + info = get_common_fields() + info.update({ + 'title': self._live_title(video['title']), + 'id': video_id, + 'formats': formats, + 'is_live': True, + }) + return info + elif status == 'ENDED': + raise ExtractorError( + 'Uploading for replay. Please wait...', expected=True) + elif status == 'RESERVED': raise ExtractorError('Coming soon!', expected=True) - elif video_status in ('ENDED', 'END'): - raise ExtractorError('Uploading for replay. 
Please wait...', expected=True) + elif video.get('exposeStatus') == 'CANCEL': + raise ExtractorError( + 'We are sorry, but the live broadcast has been canceled.', + expected=True) else: - return self._live(video_id, webpage, params) - else: - raise ExtractorError('Unknown video type %s' % video_type) - - def _get_common_fields(self, webpage, params): - title = self._og_search_title(webpage) - description = self._html_search_meta( - ['og:description', 'description', 'twitter:description'], - webpage, 'description', default=None) - creator = (try_get(params, lambda x: x["channel"]["channel"]["channelName"], compat_str) - or self._search_regex(r'on (.*) channel', description or '', 'creator', fatal=False)) - thumbnail = self._og_search_thumbnail(webpage) - return { - 'title': title, - 'creator': creator, - 'thumbnail': thumbnail, - } - - def _live(self, video_id, webpage, params): - LIVE_INFO_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/old/v3/live/%s/playInfo' % video_id - play_info = self._download_json(LIVE_INFO_ENDPOINT, video_id, - headers={"referer": "https://www.vlive.tv"}) - - streams = try_get(play_info, lambda x: x["result"]["streamList"], list) or [] - - formats = [] - for stream in streams: - formats.extend(self._extract_m3u8_formats( - stream['serviceUrl'], video_id, 'mp4', - fatal=False, live=True)) - self._sort_formats(formats) - - info = self._get_common_fields(webpage, params) - info.update({ - 'title': self._live_title(info['title']), - 'id': video_id, - 'formats': formats, - 'is_live': True, - }) - return info - - def _replay(self, video_id, webpage, params, video_params): - long_video_id = video_params["vodId"] - - VOD_KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id - key_json = self._download_json(VOD_KEY_ENDPOINT, video_id, - headers={"referer": "https://www.vlive.tv"}) - key = key_json["inkey"] - - return merge_dicts( - self._get_common_fields(webpage, params), - self._extract_video_info(video_id, long_video_id, key)) + raise ExtractorError('Unknown status ' + status) -class VLiveChannelIE(InfoExtractor): - IE_NAME = 'vlive:channel' - _VALID_URL = r'https?://(?:(?:www|m)\.)?(?:channels\.vlive\.tv/|vlive\.tv/channels?/)(?P<id>[0-9A-Z]+)' +class VLivePostIE(VLiveIE): + IE_NAME = 'vlive:post' + _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/post/(?P<id>\d-\d+)' _TESTS = [{ - 'url': 'https://channels.vlive.tv/FCD4B', + # uploadType = SOS + 'url': 'https://www.vlive.tv/post/1-20088044', + 'info_dict': { + 'id': '1-20088044', + 'title': 'Hola estrellitas la tierra les dice hola (si era así no?) 
Ha...', + 'description': 'md5:fab8a1e50e6e51608907f46c7fa4b407', + }, + 'playlist_count': 3, + }, { + # uploadType = V + 'url': 'https://www.vlive.tv/post/1-20087926', + 'info_dict': { + 'id': '1-20087926', + 'title': 'James Corden: And so, the baby becamos the Papa💜😭💪😭', + }, + 'playlist_count': 1, + }] + _FVIDEO_TMPL = 'fvideo/v1.0/fvideo-%%s/%s' + _SOS_TMPL = _FVIDEO_TMPL % 'sosPlayInfo' + _INKEY_TMPL = _FVIDEO_TMPL % 'inKey' + + def _real_extract(self, url): + post_id = self._match_id(url) + + post = self._call_api( + 'post/v1.0/post-%s', post_id, + 'attachments{video},officialVideo{videoSeq},plainBody,title') + + video_seq = str_or_none(try_get( + post, lambda x: x['officialVideo']['videoSeq'])) + if video_seq: + return self.url_result( + 'http://www.vlive.tv/video/' + video_seq, + VLiveIE.ie_key(), video_seq) + + title = post['title'] + entries = [] + for idx, video in enumerate(post['attachments']['video'].values()): + video_id = video.get('videoId') + if not video_id: + continue + upload_type = video.get('uploadType') + upload_info = video.get('uploadInfo') or {} + entry = None + if upload_type == 'SOS': + download = self._call_api( + self._SOS_TMPL, video_id)['videoUrl']['download'] + formats = [] + for f_id, f_url in download.items(): + formats.append({ + 'format_id': f_id, + 'url': f_url, + 'height': int_or_none(f_id[:-1]), + }) + self._sort_formats(formats) + entry = { + 'formats': formats, + 'id': video_id, + 'thumbnail': upload_info.get('imageUrl'), + } + elif upload_type == 'V': + vod_id = upload_info.get('videoId') + if not vod_id: + continue + inkey = self._call_api(self._INKEY_TMPL, video_id)['inKey'] + entry = self._extract_video_info(video_id, vod_id, inkey) + if entry: + entry['title'] = '%s_part%s' % (title, idx) + entries.append(entry) + return self.playlist_result( + entries, post_id, title, strip_or_none(post.get('plainBody'))) + + +class VLiveChannelIE(VLiveBaseIE): + IE_NAME = 'vlive:channel' + _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)' + _TESTS = [{ + 'url': 'http://channels.vlive.tv/FCD4B', 'info_dict': { 'id': 'FCD4B', 'title': 'MAMAMOO', @@ -226,63 +265,39 @@ class VLiveChannelIE(InfoExtractor): 'playlist_mincount': 110 }, { 'url': 'https://www.vlive.tv/channel/FCD4B', - 'info_dict': { - 'id': 'FCD4B', - 'title': 'MAMAMOO', - }, - 'playlist_mincount': 110 + 'only_matching': True, }] - _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + + def _call_api(self, path, channel_key_suffix, channel_value, note, query): + q = { + 'app_id': self._APP_ID, + 'channel' + channel_key_suffix: channel_value, + } + q.update(query) + return self._download_json( + 'http://api.vfan.vlive.tv/vproxy/channelplus/' + path, + channel_value, note='Downloading ' + note, query=q)['result'] def _real_extract(self, url): channel_code = self._match_id(url) - webpage = self._download_webpage( - 'http://channels.vlive.tv/%s/video' % channel_code, channel_code) + channel_seq = self._call_api( + 'decodeChannelCode', 'Code', channel_code, + 'decode channel code', {})['channelSeq'] - app_id = None - - app_js_url = self._search_regex( - r'<script[^>]+src=(["\'])(?P<url>http.+?/app\.js.*?)\1', - webpage, 'app js', default=None, group='url') - - if app_js_url: - app_js = self._download_webpage( - app_js_url, channel_code, 'Downloading app JS', fatal=False) - if app_js: - app_id = self._search_regex( - r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]', - app_js, 'app id', default=None) - - app_id = app_id or self._APP_ID - - channel_info = 
self._download_json( - 'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode', - channel_code, note='Downloading decode channel code', - query={ - 'app_id': app_id, - 'channelCode': channel_code, - '_': int(time.time()) - }) - - channel_seq = channel_info['result']['channelSeq'] channel_name = None entries = [] for page_num in itertools.count(1): - video_list = self._download_json( - 'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList', - channel_code, note='Downloading channel list page #%d' % page_num, - query={ - 'app_id': app_id, - 'channelSeq': channel_seq, + video_list = self._call_api( + 'getChannelVideoList', 'Seq', channel_seq, + 'channel list page #%d' % page_num, { # Large values of maxNumOfRows (~300 or above) may cause # empty responses (see [1]), e.g. this happens for [2] that # has more than 300 videos. # 1. https://github.com/ytdl-org/youtube-dl/issues/13830 # 2. http://channels.vlive.tv/EDBF. 'maxNumOfRows': 100, - '_': int(time.time()), 'pageNo': page_num } ) @@ -290,11 +305,11 @@ def _real_extract(self, url): if not channel_name: channel_name = try_get( video_list, - lambda x: x['result']['channelInfo']['channelName'], + lambda x: x['channelInfo']['channelName'], compat_str) videos = try_get( - video_list, lambda x: x['result']['videoList'], list) + video_list, lambda x: x['videoList'], list) if not videos: break @@ -310,79 +325,3 @@ def _real_extract(self, url): return self.playlist_result( entries, channel_code, channel_name) - - -class VLivePlaylistIE(InfoExtractor): - IE_NAME = 'vlive:playlist' - _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)' - _VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s' - _TESTS = [{ - # regular working playlist - 'url': 'https://www.vlive.tv/video/117956/playlist/117963', - 'info_dict': { - 'id': '117963', - 'title': '아이돌룸(IDOL ROOM) 41회 - (여자)아이들' - }, - 'playlist_mincount': 10 - }, { - # playlist with no playlistVideoSeqs - 'url': 'http://www.vlive.tv/video/22867/playlist/22912', - 'info_dict': { - 'id': '22867', - 'ext': 'mp4', - 'title': '[V LIVE] Valentine Day Message from MINA', - 'creator': 'TWICE', - 'view_count': int - }, - 'params': { - 'skip_download': True, - } - }] - - def _build_video_result(self, video_id, message): - self.to_screen(message) - return self.url_result( - self._VIDEO_URL_TEMPLATE % video_id, - ie=VLiveIE.ie_key(), video_id=video_id) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, playlist_id = mobj.group('video_id', 'id') - - if self._downloader.params.get('noplaylist'): - return self._build_video_result( - video_id, - 'Downloading just video %s because of --no-playlist' - % video_id) - - self.to_screen( - 'Downloading playlist %s - add --no-playlist to just download video' - % playlist_id) - - webpage = self._download_webpage( - 'http://www.vlive.tv/video/%s/playlist/%s' - % (video_id, playlist_id), playlist_id) - - raw_item_ids = self._search_regex( - r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage, - 'playlist video seqs', default=None, fatal=False) - - if not raw_item_ids: - return self._build_video_result( - video_id, - 'Downloading just video %s because no playlist was found' - % video_id) - - item_ids = self._parse_json(raw_item_ids, playlist_id) - - entries = [ - self.url_result( - self._VIDEO_URL_TEMPLATE % item_id, ie=VLiveIE.ie_key(), - video_id=compat_str(item_id)) - for item_id in item_ids] - - playlist_name = self._html_search_regex( - 
r'<div[^>]+class="[^"]*multicam_playlist[^>]*>\s*<h3[^>]+>([^<]+)', - webpage, 'playlist title', fatal=False) - - return self.playlist_result(entries, playlist_id, playlist_name) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 540f35337..72bc5a0da 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -1335,44 +1335,6 @@ def _get_ytplayer_config(self, video_id, webpage): return self._parse_json( uppercase_escape(config), video_id, fatal=False) - def _get_music_metadata_from_yt_initial(self, yt_initial): - music_metadata = [] - key_map = { - 'Album': 'album', - 'Artist': 'artist', - 'Song': 'track' - } - contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents']) - if type(contents) is list: - for content in contents: - music_track = {} - if type(content) is not dict: - continue - videoSecondaryInfoRenderer = try_get(content, lambda x: x['videoSecondaryInfoRenderer']) - if type(videoSecondaryInfoRenderer) is not dict: - continue - rows = try_get(videoSecondaryInfoRenderer, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows']) - if type(rows) is not list: - continue - for row in rows: - metadataRowRenderer = try_get(row, lambda x: x['metadataRowRenderer']) - if type(metadataRowRenderer) is not dict: - continue - key = try_get(metadataRowRenderer, lambda x: x['title']['simpleText']) - value = try_get(metadataRowRenderer, lambda x: x['contents'][0]['simpleText']) or \ - try_get(metadataRowRenderer, lambda x: x['contents'][0]['runs'][0]['text']) - if type(key) is not str or type(value) is not str: - continue - if key in key_map: - if key_map[key] in music_track: - # we've started on a new track - music_metadata.append(music_track) - music_track = {} - music_track[key_map[key]] = value - if len(music_track.keys()): - music_metadata.append(music_track) - return music_metadata - def _get_automatic_captions(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" @@ -2295,7 +2257,7 @@ def extract_meta(field): # Youtube Music Auto-generated description release_date = release_year = None if video_description: - mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description) + mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description) if mobj: if not track: track = mobj.group('track').strip() @@ -2312,13 +2274,33 @@ def extract_meta(field): if release_year: release_year = int(release_year) - yt_initial = self._get_yt_initial_data(video_id, video_webpage) - if yt_initial: - music_metadata = self._get_music_metadata_from_yt_initial(yt_initial) - if len(music_metadata): - album = music_metadata[0].get('album') - artist = music_metadata[0].get('artist') - track = music_metadata[0].get('track') + yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage) + contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or [] + for content in contents: + rows = 
try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or []
+            multiple_songs = False
+            for row in rows:
+                if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
+                    multiple_songs = True
+                    break
+            for row in rows:
+                mrr = row.get('metadataRowRenderer') or {}
+                mrr_title = try_get(
+                    mrr, lambda x: x['title']['simpleText'], compat_str)
+                mrr_contents = try_get(
+                    mrr, lambda x: x['contents'][0], dict) or {}
+                mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str)
+                if not (mrr_title and mrr_contents_text):
+                    continue
+                if mrr_title == 'License':
+                    video_license = mrr_contents_text
+                elif not multiple_songs:
+                    if mrr_title == 'Album':
+                        album = mrr_contents_text
+                    elif mrr_title == 'Artist':
+                        artist = mrr_contents_text
+                    elif mrr_title == 'Song':
+                        track = mrr_contents_text
 
         m_episode = re.search(
             r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',

From a62cf342988d80148fcff608d3ff828238e9573b Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan@gmail.com>
Date: Thu, 26 Nov 2020 23:40:40 +0530
Subject: [PATCH 119/124] [spreaker] fix SpreakerShowIE test URL

---
 youtube_dlc/extractor/spreaker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dlc/extractor/spreaker.py b/youtube_dlc/extractor/spreaker.py
index beee6670c..6c7e40ae4 100644
--- a/youtube_dlc/extractor/spreaker.py
+++ b/youtube_dlc/extractor/spreaker.py
@@ -126,7 +126,7 @@ def _real_extract(self, url):
 class SpreakerShowIE(InfoExtractor):
     _VALID_URL = r'https?://api\.spreaker\.com/show/(?P<id>\d+)'
     _TESTS = [{
-        'url': 'https://www.spreaker.com/show/3-ninjas-podcast',
+        'url': 'https://api.spreaker.com/show/4652058',
         'info_dict': {
             'id': '4652058',
         },

From f0c532a430a07f7965b68b22f9ceb90542b848f6 Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan@gmail.com>
Date: Fri, 27 Nov 2020 00:27:53 +0530
Subject: [PATCH 120/124] Fix some improper Youtube URLs

E.g. https://www.youtube.com/watch?list=UUXIkr0SRTnZO4_QpZozvCCA
(a watch URL that carries a playlist ID but no video ID)

---
 youtube_dlc/extractor/youtube.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py
index 72bc5a0da..3570bce71 100644
--- a/youtube_dlc/extractor/youtube.py
+++ b/youtube_dlc/extractor/youtube.py
@@ -3256,11 +3256,20 @@ def _real_extract(self, url):
         qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
         video_id = qs.get('v', [None])[0]
         playlist_id = qs.get('list', [None])[0]
+
+        if is_home.group('not_channel').startswith('watch') and not video_id:
+            if playlist_id:
+                self._downloader.report_warning('%s is not a valid Youtube URL. 
Trying to download playlist %s' % (url, playlist_id)) + url = 'https://www.youtube.com/playlist?list=%s' % playlist_id + # return self.url_result(playlist_id, ie=YoutubePlaylistIE.ie_key()) + else: + raise ExtractorError('Unable to recognize tab page') if video_id and playlist_id: if self._downloader.params.get('noplaylist'): self.to_screen('Downloading just video %s because of --no-playlist' % video_id) return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + webpage = self._download_webpage(url, item_id) identity_token = self._search_regex( r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, From 2fa90513e569d401143d27d6fd333331a74d10f3 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Fri, 27 Nov 2020 00:41:52 +0530 Subject: [PATCH 121/124] flake8 --- youtube_dlc/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 3570bce71..d23c503ad 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -3269,7 +3269,7 @@ def _real_extract(self, url): self.to_screen('Downloading just video %s because of --no-playlist' % video_id) return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - + webpage = self._download_webpage(url, item_id) identity_token = self._search_regex( r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, From c78b936af4366259605e3e706bdeb5e173bf3d9b Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Fri, 27 Nov 2020 01:16:02 +0530 Subject: [PATCH 122/124] bug fix --- youtube_dlc/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index d23c503ad..ad56b9b01 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -64,7 +64,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' _RESERVED_NAMES = ( - r'course|embed|channel|c|user|playlist|watch|w|results|storefront|' + r'course|embed|channel|c|user|playlist|watch|w|results|storefront|oops|' r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|' r'feed/(watch_later|history|subscriptions|library|trending|recommended)') @@ -3257,7 +3257,7 @@ def _real_extract(self, url): video_id = qs.get('v', [None])[0] playlist_id = qs.get('list', [None])[0] - if is_home.group('not_channel').startswith('watch') and not video_id: + if is_home.group('not_channel') is not None and is_home.group('not_channel').startswith('watch') and not video_id: if playlist_id: self._downloader.report_warning('%s is not a valid Youtube URL. 
Trying to download playlist %s' % (url, playlist_id)) url = 'https://www.youtube.com/playlist?list=%s' % playlist_id From ae7c01431db6853bf39600d8d862806511fe4f36 Mon Sep 17 00:00:00 2001 From: lorpus <ligma@poggers.me> Date: Fri, 27 Nov 2020 00:23:13 -0500 Subject: [PATCH 123/124] [bitwave.tv] add test --- youtube_dlc/extractor/bitwave.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dlc/extractor/bitwave.py b/youtube_dlc/extractor/bitwave.py index 9aa210510..eb16c469d 100644 --- a/youtube_dlc/extractor/bitwave.py +++ b/youtube_dlc/extractor/bitwave.py @@ -6,6 +6,10 @@ class BitwaveReplayIE(InfoExtractor): IE_NAME = 'bitwave:replay' _VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<user>\w+)/replay/(?P<id>\w+)/?$' + _TEST = { + 'url': 'https://bitwave.tv/RhythmicCarnage/replay/z4P6eq5L7WDrM85UCrVr', + 'only_matching': True + } def _real_extract(self, url): replay_id = self._match_id(url) @@ -29,6 +33,10 @@ def _real_extract(self, url): class BitwaveStreamIE(InfoExtractor): IE_NAME = 'bitwave:stream' _VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<id>\w+)/?$' + _TEST = { + 'url': 'https://bitwave.tv/doomtube', + 'only_matching': True + } def _real_extract(self, url): username = self._match_id(url) From 9b664dc4202f07f2d8f2bb47260131bc8246b906 Mon Sep 17 00:00:00 2001 From: bopol <bopol@e.email> Date: Fri, 27 Nov 2020 23:51:33 +0100 Subject: [PATCH 124/124] [ina] support mobile links --- youtube_dlc/extractor/ina.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/ina.py b/youtube_dlc/extractor/ina.py index 12695af27..b3b2683cb 100644 --- a/youtube_dlc/extractor/ina.py +++ b/youtube_dlc/extractor/ina.py @@ -12,7 +12,7 @@ class InaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ina\.fr/(?:video|audio)/(?P<id>[A-Z0-9_]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?ina\.fr/(?:video|audio)/(?P<id>[A-Z0-9_]+)' _TESTS = [{ 'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', 'md5': 'a667021bf2b41f8dc6049479d9bb38a3', @@ -31,6 +31,9 @@ class InaIE(InfoExtractor): }, { 'url': 'https://www.ina.fr/video/P16173408-video.html', 'only_matching': True, + }, { + 'url': 'http://m.ina.fr/video/I12055569', + 'only_matching': True, }] def _real_extract(self, url):