[extractor/harpodeon] Add extractor (#4540)

Closes #4450
Authored by: eren-kemer
This commit is contained in:
Eren Kemer 2022-08-08 23:09:37 +02:00 committed by GitHub
parent f0ad6f8c51
commit e251986cbe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 71 additions and 0 deletions

View File

@ -631,6 +631,7 @@
GronkhVodsIE GronkhVodsIE
) )
from .groupon import GrouponIE from .groupon import GrouponIE
from .harpodeon import HarpodeonIE
from .hbo import HBOIE from .hbo import HBOIE
from .hearthisat import HearThisAtIE from .hearthisat import HearThisAtIE
from .heise import HeiseIE from .heise import HeiseIE

View File

@ -0,0 +1,70 @@
from .common import InfoExtractor
from ..utils import unified_strdate
class HarpodeonIE(InfoExtractor):
    """Extractor for harpodeon.com ``/video/`` and ``/preview/`` pages.

    The numeric last path segment of the URL is used as the video id.
    """

    _VALID_URL = r'https?://(?:www\.)?harpodeon\.com/(?:video|preview)/\w+/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.harpodeon.com/video/The_Smoking_Out_of_Bella_Butts/268068288',
        'md5': '727371564a6a9ebccef2073535b5b6bd',
        'skip': 'Free video could become unavailable',
        'info_dict': {
            'id': '268068288',
            'ext': 'mp4',
            'title': 'The Smoking Out of Bella Butts',
            'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77',
            'creator': 'Vitagraph Company of America',
            'release_date': '19150101',
        },
    }, {
        'url': 'https://www.harpodeon.com/preview/The_Smoking_Out_of_Bella_Butts/268068288',
        'md5': '6dfea5412845f690c7331be703f884db',
        'info_dict': {
            'id': '268068288',
            'ext': 'mp4',
            'title': 'The Smoking Out of Bella Butts',
            'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77',
            'creator': 'Vitagraph Company of America',
            'release_date': '19150101',
        },
    }, {
        'url': 'https://www.harpodeon.com/preview/Behind_the_Screen/421838710',
        'md5': '7979df9ca04637282cb7d172ab3a9c3b',
        'info_dict': {
            'id': '421838710',
            'ext': 'mp4',
            'title': 'Behind the Screen',
            'description': 'md5:008972a3dc51fba3965ee517d2ba9155',
            'creator': 'Lone Star Corporation',
            'release_date': '19160101',
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for a single Harpodeon page.

        The media URL is assembled from three values scraped out of the
        page's inline script calls: the ``hpBase(...)`` argument and the
        two ``hpInjectVideo(...)`` arguments (video name and resolution).
        Title, creator and release year come from the ``videoInfo`` markup
        and are all optional; the script values are required.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Non-fatal: a page without the expected videoInfo markup yields a
        # falsy result, which is replaced by a triple of Nones so that the
        # unpacking below still works.
        title, creator, release_year = self._search_regex(
            r'''(?x)
                <div[^>]+videoInfo[^<]*<h2[^>]*>(?P<title>[^>]+)</h2>
                (?:\s*<p[^>]*>\((?P<creator>.+),\s*)?(?P<release_year>\d{4})?''',
            webpage, 'title', group=('title', 'creator', 'release_year'),
            fatal=False) or (None, None, None)

        hp_base = self._html_search_regex(
            r'hpBase\(\s*["\']([^"\']+)', webpage, 'hp_base')

        # Whitespace after the comma is tolerated, mirroring the leniency
        # of the hpBase pattern above.
        hp_inject_video, hp_resolution = self._search_regex(
            r'''(?x)
                hpInjectVideo\([\'\"](?P<hp_inject_video>\w+)[\'\"],\s*
                [\'\"](?P<hp_resolution>\d+)[\'\"]''',
            webpage, 'hp_inject_video', group=['hp_inject_video', 'hp_resolution'])

        # Only the year is available, so it is pinned to January 1st.
        # Skip the conversion entirely when no year was captured; otherwise
        # the f-string would feed the literal text 'None0101' to the parser.
        release_date = unified_strdate(f'{release_year}0101') if release_year else None

        return {
            'id': video_id,
            'title': title,
            'url': f'{hp_base}{hp_inject_video}_{hp_resolution}.mp4',
            'http_headers': {'Referer': url},
            'description': self._html_search_meta('description', webpage, fatal=False),
            'creator': creator,
            'release_date': release_date,
        }