yt-dlp/yt_dlp/extractor/movingimage.py

from .common import InfoExtractor
from ..utils import (
    parse_duration,
    unescapeHTML,
)


class MovingImageIE(InfoExtractor):
    """Extractor for film pages matching ``movingimage.nls.uk/film/<id>``."""

    _VALID_URL = r'https?://movingimage\.nls\.uk/film/(?P<id>\d+)'
    _TEST = {
        'url': 'http://movingimage.nls.uk/film/3561',
        'md5': '4caa05c2b38453e6f862197571a7be2f',
        'info_dict': {
            'id': '3561',
            'ext': 'mp4',
            'title': 'SHETLAND WOOL',
            'description': 'md5:c5afca6871ad59b4271e7704fe50ab04',
            'duration': 900,
            'thumbnail': r're:^https?://.*\.jpg$',
        },
    }

    def _real_extract(self, url):
        """Build the info dict for a single film page.

        The page is downloaded once; the HLS manifest URL, the labelled
        metadata fields (title, description, running time) and the thumbnail
        URL are all scraped from that same HTML.

        Only the title lookup is requested with ``fatal=True``; description,
        running time and thumbnail are looked up non-fatally.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The player setup embeds the manifest as `file: "<url>"`.
        manifest_url = self._html_search_regex(
            r'file\s*:\s*"([^"]+)"', webpage, 'm3u8 manifest URL')
        formats = self._extract_m3u8_formats(
            manifest_url, video_id, ext='mp4', entry_protocol='m3u8_native')

        def search_field(field_name, fatal=False):
            """Return the text of the metadata row labelled ``field_name``."""
            # Use the field's own name as the display label so that a failed
            # lookup is reported against the right field; previously every
            # lookup was labelled 'title'.
            return self._search_regex(
                rf'<span\s+class="field_title">{field_name}:</span>\s*<span\s+class="field_content">([^<]+)</span>',
                webpage, field_name.lower(), fatal=fatal)

        # Titles may be wrapped in brackets/parentheses on the page; strip them.
        title = unescapeHTML(search_field('Title', fatal=True)).strip('()[]')
        description = unescapeHTML(search_field('Description'))
        duration = parse_duration(search_field('Running time'))
        # The player setup embeds the poster image as `image: '<url>'`.
        thumbnail = self._search_regex(
            r"image\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False)

        return {
            'id': video_id,
            'formats': formats,
            'title': title,
            'description': description,
            'duration': duration,
            'thumbnail': thumbnail,
        }
[ssa] Add extractor (Closes #5169) 2015-03-11 15:15:36 +00:00			`from .common import InfoExtractor`
			`from ..utils import (`
			`parse_duration,`
[misc] Add `hatch`, `ruff`, `pre-commit` and improve dev docs (#7409) Authored by: bashonly, seproDev, Grub4K Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> 2024-05-26 19:27:21 +00:00			`unescapeHTML,`
[ssa] Add extractor (Closes #5169) 2015-03-11 15:15:36 +00:00			`)`


[movingimage] Adapt to the new domain name and fix extraction Closes #10466 2016-09-01 08:58:16 +00:00			`class MovingImageIE(InfoExtractor):`
			`_VALID_URL = r'https?://movingimage\.nls\.uk/film/(?P<id>\d+)'`
[ssa] Add extractor (Closes #5169) 2015-03-11 15:15:36 +00:00			`_TEST = {`
[movingimage] Adapt to the new domain name and fix extraction Closes #10466 2016-09-01 08:58:16 +00:00			`'url': 'http://movingimage.nls.uk/film/3561',`
			`'md5': '4caa05c2b38453e6f862197571a7be2f',`
[ssa] Add extractor (Closes #5169) 2015-03-11 15:15:36 +00:00			`'info_dict': {`
			`'id': '3561',`
[movingimage] Adapt to the new domain name and fix extraction Closes #10466 2016-09-01 08:58:16 +00:00			`'ext': 'mp4',`
[ssa] Add extractor (Closes #5169) 2015-03-11 15:15:36 +00:00			`'title': 'SHETLAND WOOL',`
			`'description': 'md5:c5afca6871ad59b4271e7704fe50ab04',`
			`'duration': 900,`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 12:08:07 +00:00			`'thumbnail': r're:^https?://.*\.jpg$',`
[ssa] Add extractor (Closes #5169) 2015-03-11 15:15:36 +00:00			`},`
			`}`

			`def _real_extract(self, url):`
			`video_id = self._match_id(url)`

			`webpage = self._download_webpage(url, video_id)`

[movingimage] Adapt to the new domain name and fix extraction Closes #10466 2016-09-01 08:58:16 +00:00			`formats = self._extract_m3u8_formats(`
			`self._html_search_regex(r'file\s*:\s*"([^"]+)"', webpage, 'm3u8 manifest URL'),`
			`video_id, ext='mp4', entry_protocol='m3u8_native')`
[ssa] Add extractor (Closes #5169) 2015-03-11 15:15:36 +00:00
			`def search_field(field_name, fatal=False):`
			`return self._search_regex(`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 23:09:58 +00:00			`rf'<span\s+class="field_title">{field_name}:</span>\s*<span\s+class="field_content">([^<]+)</span>',`
[ssa] Add extractor (Closes #5169) 2015-03-11 15:15:36 +00:00			`webpage, 'title', fatal=fatal)`

			`title = unescapeHTML(search_field('Title', fatal=True)).strip('()[]')`
			`description = unescapeHTML(search_field('Description'))`
			`duration = parse_duration(search_field('Running time'))`
			`thumbnail = self._search_regex(`
[movingimage] Adapt to the new domain name and fix extraction Closes #10466 2016-09-01 08:58:16 +00:00			`r"image\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False)`
[ssa] Add extractor (Closes #5169) 2015-03-11 15:15:36 +00:00
			`return {`
			`'id': video_id,`
[movingimage] Adapt to the new domain name and fix extraction Closes #10466 2016-09-01 08:58:16 +00:00			`'formats': formats,`
[ssa] Add extractor (Closes #5169) 2015-03-11 15:15:36 +00:00			`'title': title,`
			`'description': description,`
			`'duration': duration,`
			`'thumbnail': thumbnail,`
			`}`