0
0
Fork 0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2024-11-24 02:35:12 +00:00

[extractor/bitchute] Improve BitChuteChannelIE (#5066)

Authored by: flashdagger, pukkandan
This commit is contained in:
MMM 2022-11-09 04:30:15 +01:00 committed by GitHub
parent 8fddc232bf
commit c61473c1d6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 99 additions and 41 deletions

View file

@@ -1,14 +1,18 @@
import itertools import functools
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
HEADRequest, HEADRequest,
OnDemandPagedList,
clean_html, clean_html,
get_element_by_class, get_element_by_class,
get_elements_html_by_class,
int_or_none, int_or_none,
orderedSet, orderedSet,
parse_count,
parse_duration,
traverse_obj, traverse_obj,
unified_strdate, unified_strdate,
urlencode_postdata, urlencode_postdata,
@@ -109,51 +113,103 @@ def _real_extract(self, url):
class BitChuteChannelIE(InfoExtractor): class BitChuteChannelIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)' _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?P<type>channel|playlist)/(?P<id>[^/?#&]+)'
_TEST = { _TESTS = [{
'url': 'https://www.bitchute.com/channel/victoriaxrave/', 'url': 'https://www.bitchute.com/channel/bitchute/',
'playlist_mincount': 185,
'info_dict': { 'info_dict': {
'id': 'victoriaxrave', 'id': 'bitchute',
'title': 'BitChute',
'description': 'md5:5329fb3866125afa9446835594a9b138',
}, },
} 'playlist': [
{
'md5': '7e427d7ed7af5a75b5855705ec750e2b',
'info_dict': {
'id': 'UGlrF9o9b-Q',
'ext': 'mp4',
'filesize': None,
'title': 'This is the first video on #BitChute !',
'description': 'md5:a0337e7b1fe39e32336974af8173a034',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'BitChute',
'upload_date': '20170103',
'duration': 16,
'view_count': int,
},
}
],
'params': {
'skip_download': True,
'playlist_items': '-1',
},
}, {
'url': 'https://www.bitchute.com/playlist/wV9Imujxasw9/',
'playlist_mincount': 20,
'info_dict': {
'id': 'wV9Imujxasw9',
'title': 'Bruce MacDonald and "The Light of Darkness"',
'description': 'md5:04913227d2714af1d36d804aa2ab6b1e',
}
}]
_TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'
PAGE_SIZE = 25
HTML_CLASS_NAMES = {
'channel': {
'container': 'channel-videos-container',
'title': 'channel-videos-title',
'description': 'channel-videos-text',
},
'playlist': {
'container': 'playlist-video',
'title': 'title',
'description': 'description',
}
def _entries(self, channel_id): }
channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id
offset = 0 @staticmethod
for page_num in itertools.count(1): def _make_url(playlist_id, playlist_type):
data = self._download_json( return f'https://www.bitchute.com/{playlist_type}/{playlist_id}/'
'%sextend/' % channel_url, channel_id,
'Downloading channel page %d' % page_num, def _fetch_page(self, playlist_id, playlist_type, page_num):
data=urlencode_postdata({ playlist_url = self._make_url(playlist_id, playlist_type)
'csrfmiddlewaretoken': self._TOKEN, data = self._download_json(
'name': '', f'{playlist_url}extend/', playlist_id, f'Downloading page {page_num}',
'offset': offset, data=urlencode_postdata({
}), headers={ 'csrfmiddlewaretoken': self._TOKEN,
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'name': '',
'Referer': channel_url, 'offset': page_num * self.PAGE_SIZE,
'X-Requested-With': 'XMLHttpRequest', }), headers={
'Cookie': 'csrftoken=%s' % self._TOKEN, 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
}) 'Referer': playlist_url,
if data.get('success') is False: 'X-Requested-With': 'XMLHttpRequest',
break 'Cookie': f'csrftoken={self._TOKEN}',
html = data.get('html') })
if not html: if not data.get('success'):
break return
video_ids = re.findall( classes = self.HTML_CLASS_NAMES[playlist_type]
r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)', for video_html in get_elements_html_by_class(classes['container'], data.get('html')):
html) video_id = self._search_regex(
if not video_ids: r'<a\s[^>]*\bhref=["\']/video/([^"\'/]+)', video_html, 'video id', default=None)
break if not video_id:
offset += len(video_ids) continue
for video_id in video_ids: yield self.url_result(
yield self.url_result( f'https://www.bitchute.com/video/{video_id}', BitChuteIE, video_id, url_transparent=True,
'https://www.bitchute.com/video/%s' % video_id, title=clean_html(get_element_by_class(classes['title'], video_html)),
ie=BitChuteIE.ie_key(), video_id=video_id) description=clean_html(get_element_by_class(classes['description'], video_html)),
duration=parse_duration(get_element_by_class('video-duration', video_html)),
view_count=parse_count(clean_html(get_element_by_class('video-views', video_html))))
def _real_extract(self, url): def _real_extract(self, url):
channel_id = self._match_id(url) playlist_type, playlist_id = self._match_valid_url(url).group('type', 'id')
webpage = self._download_webpage(self._make_url(playlist_id, playlist_type), playlist_id)
page_func = functools.partial(self._fetch_page, playlist_id, playlist_type)
return self.playlist_result( return self.playlist_result(
self._entries(channel_id), playlist_id=channel_id) OnDemandPagedList(page_func, self.PAGE_SIZE), playlist_id,
title=self._html_extract_title(webpage, default=None),
description=self._html_search_meta(
('description', 'og:description', 'twitter:description'), webpage, default=None),
playlist_count=int_or_none(self._html_search_regex(
r'<span>(\d+)\s+videos?</span>', webpage, 'playlist count', default=None)))

View file

@@ -418,6 +418,8 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w
Return the text (content) and the html (whole) of the tag with the specified Return the text (content) and the html (whole) of the tag with the specified
attribute in the passed HTML document attribute in the passed HTML document
""" """
if not value:
return
quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?' quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'