[extractor/wordpress:mb.miniAudioPlayer] Add embed extractor (#5087)

Closes https://github.com/yt-dlp/yt-dlp/issues/4994

Authored by: coletdjnz
This commit is contained in:
Matthew 2022-10-09 18:55:26 +13:00 committed by GitHub
parent 1d55ebabc9
commit 4c9a1a3ba5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 94 additions and 3 deletions

View File

@ -1679,6 +1679,9 @@ def test_get_elements_text_and_html_by_attribute(self):
self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'foo', html)), []) self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'foo', html)), [])
self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html)), []) self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html)), [])
self.assertEqual(list(get_elements_text_and_html_by_attribute(
'class', 'foo', '<a class="foo">nice</a><span class="foo">nice</span>', tag='a')), [('nice', '<a class="foo">nice</a>')])
GET_ELEMENT_BY_TAG_TEST_STRING = ''' GET_ELEMENT_BY_TAG_TEST_STRING = '''
random text lorem ipsum</p> random text lorem ipsum</p>
<div> <div>

View File

@ -2165,7 +2165,10 @@
WistiaPlaylistIE, WistiaPlaylistIE,
WistiaChannelIE, WistiaChannelIE,
) )
from .wordpress import WordpressPlaylistEmbedIE from .wordpress import (
WordpressPlaylistEmbedIE,
WordpressMiniAudioPlayerEmbedIE,
)
from .worldstarhiphop import WorldStarHipHopIE from .worldstarhiphop import WorldStarHipHopIE
from .wppilot import ( from .wppilot import (
WPPilotIE, WPPilotIE,

View File

@ -1,6 +1,10 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
extract_attributes,
get_elements_by_class, get_elements_by_class,
get_elements_text_and_html_by_attribute,
int_or_none, int_or_none,
parse_duration, parse_duration,
traverse_obj, traverse_obj,
@ -67,3 +71,84 @@ def _extract_from_webpage(self, url, webpage):
'width': int_or_none(traverse_obj(track, ('dimensions', 'original', 'width'))), 'width': int_or_none(traverse_obj(track, ('dimensions', 'original', 'width'))),
} for track in traverse_obj(playlist_json, ('tracks', ...), expected_type=dict)] } for track in traverse_obj(playlist_json, ('tracks', ...), expected_type=dict)]
yield self.playlist_result(entries, self._generic_id(url) + f'-wp-playlist-{i+1}', 'Wordpress Playlist') yield self.playlist_result(entries, self._generic_id(url) + f'-wp-playlist-{i+1}', 'Wordpress Playlist')
class WordpressMiniAudioPlayerEmbedIE(InfoExtractor):
    # WordPress MB Mini Player Plugin
    # https://wordpress.org/plugins/wp-miniaudioplayer/
    # Note: This is for the WordPress plugin version only.
    #
    # Embed-only extractor: _VALID_URL is False, extraction happens solely
    # through _extract_from_webpage() on an already-downloaded page.
    _VALID_URL = False
    IE_NAME = 'wordpress:mb.miniAudioPlayer'
    _WEBPAGE_TESTS = [{
        # Version 1.8.10: https://plugins.trac.wordpress.org/browser/wp-miniaudioplayer/tags/1.8.10
        'url': 'https://news.samsung.com/global/over-the-horizon-the-evolution-of-the-samsung-galaxy-brand-sound',
        'info_dict': {
            'id': 'over-the-horizon-the-evolution-of-the-samsung-galaxy-brand-sound',
            'title': 'Over the Horizon: The Evolution of the Samsung Galaxy Brand Sound',
            'age_limit': 0,
            'thumbnail': 'https://img.global.news.samsung.com/global/wp-content/uploads/2015/04/OTH_Main_Title-e1429612467870.jpg',
            'description': 'md5:bc3dd738d1f11d9232e94e6629983bf7',
        },
        'playlist': [{
            'info_dict': {
                'id': 'over_the_horizon_2013',
                'ext': 'mp3',
                'title': 'Over the Horizon 2013',
                'url': 'http://news.samsung.com/global/wp-content/uploads/ringtones/over_the_horizon_2013.mp3'
            }
        }],
        'playlist_count': 6,
        'params': {'skip_download': True}
    }, {
        # Version 1.9.3: https://plugins.trac.wordpress.org/browser/wp-miniaudioplayer/tags/1.9.3
        'url': 'https://www.booksontape.com/collections/audiobooks-with-teacher-guides/',
        'info_dict': {
            'id': 'audiobooks-with-teacher-guides',
            'title': 'Audiobooks with Teacher Guides | Books on Tape',
            'age_limit': 0,
            'thumbnail': 'https://www.booksontape.com/wp-content/uploads/2016/09/bot-logo-1200x630.jpg',
        },
        'playlist_mincount': 12
    }, {
        # Version 1.9.7: https://plugins.trac.wordpress.org/browser/wp-miniaudioplayer/tags/1.9.7
        # But has spaces around href filter
        'url': 'https://www.estudiords.com.br/temas/',
        'info_dict': {
            'id': 'temas',
            'title': 'Temas Variados',
            'age_limit': 0,
            'timestamp': float,
            'upload_date': str,
            'thumbnail': 'https://www.estudiords.com.br/wp-content/uploads/2021/03/LOGO-TEMAS.png',
            'description': 'md5:ab24d6a7ed0312ad2d466e721679f5a0',
        },
        'playlist_mincount': 30
    }]

    def _extract_from_webpage(self, url, webpage):
        """Yield one entry per <a> element that the page's miniAudioPlayer setup targets.

        The jQuery selector chain passed to ``.mb_miniPlayer`` inside the page's
        ``initializeMiniAudioPlayer`` function is parsed to learn which file
        extensions are selected and which classes are excluded via ``.not(".cls")``.
        Nothing is yielded when that function or any extension filter is absent.

        @param url      URL of the page being processed (not used for matching)
        @param webpage  HTML content of the page
        @returns        generator of dicts with 'id', 'title' and 'url' keys
        """
        # Common function for the WordPress plugin version only.
        mb_player_params = self._search_regex(
            r'function\s*initializeMiniAudioPlayer\(\){[^}]+jQuery([^;]+)\.mb_miniPlayer',
            webpage, 'mb player params', default=None)
        if not mb_player_params:
            return
        # v1.55 - 1.9.3 has "a[href*='.mp3'] ,a[href*='.m4a']"
        # v1.9.4+ has "a[href*='.mp3']" only
        file_exts = re.findall(r'a\[href\s*\*=\s*\'\.([a-zA-Z\d]+)\'', mb_player_params)
        if not file_exts:
            return

        # The exclusion list depends only on the player params, so compute it
        # once instead of once per candidate element.
        # XXX: not tested - have not found any example of it being used
        excluded_classes = set(re.findall(r'\.not\("\.([^"]+)', mb_player_params))

        candidates = get_elements_text_and_html_by_attribute(
            'href', rf'(?:[^\"\']+\.(?:{"|".join(file_exts)}))', webpage, escape_value=False, tag='a')

        for title, html in candidates:
            attrs = extract_attributes(html)
            # Compare whole class tokens: a class selector such as ".foo" must
            # not exclude an element whose class is merely "foobar".
            element_classes = (attrs.get('class') or '').split()
            if not excluded_classes.isdisjoint(element_classes):
                continue
            href = attrs['href']
            yield {
                'id': self._generic_id(href),
                'title': title or self._generic_title(href),
                'url': href,
            }

View File

@ -408,7 +408,7 @@ def get_elements_html_by_attribute(*args, **kwargs):
return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)] return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True): def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
""" """
Return the text (content) and the html (whole) of the tag with the specified Return the text (content) and the html (whole) of the tag with the specified
attribute in the passed HTML document attribute in the passed HTML document
@ -419,7 +419,7 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value
value = re.escape(value) if escape_value else value value = re.escape(value) if escape_value else value
partial_element_re = rf'''(?x) partial_element_re = rf'''(?x)
<(?P<tag>[a-zA-Z0-9:._-]+) <(?P<tag>{tag})
(?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
\s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q) \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
''' '''