yt-dlp/yt_dlp/extractor/pinkbike.py

import re

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    remove_end,
    remove_start,
    str_to_int,
    unified_strdate,
)


class PinkbikeIE(InfoExtractor):
    """Information extractor for Pinkbike videos.

    Matches pinkbike.com video pages as well as es.pinkbike.org
    ``kvid-y5.swf`` player URLs.  The numeric id captured from either form is
    used to request the www.pinkbike.com video page, which is then scraped
    for formats and metadata.
    """

    _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://www.pinkbike.com/video/402811/',
        'md5': '4814b8ca7651034cd87e3361d5c2155a',
        'info_dict': {
            'id': '402811',
            'ext': 'mp4',
            'title': 'Brandon Semenuk - RAW 100',
            'description': 'Official release: www.redbull.ca/rupertwalker',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 100,
            'upload_date': '20150406',
            'uploader': 'revelco',
            'location': 'Victoria, British Columbia, Canada',
            'view_count': int,
            'comment_count': int,
        }
    }, {
        'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Fetch the video page for *url* and return its info dict.

        The result carries ``id``, ``title``, ``description``, ``thumbnail``,
        ``duration``, ``upload_date``, ``uploader``, ``location``,
        ``view_count``, ``comment_count`` and ``formats``.  Uploader, upload
        date, location and the two counters are looked up with
        ``fatal=False``; the on-page description is looked up with
        ``default=None`` and falls back to the OpenGraph description.
        """
        video_id = self._match_id(url)

        # Always scrape the canonical page, whichever URL form was matched.
        page_url = 'http://www.pinkbike.com/video/%s' % video_id
        webpage = self._download_webpage(page_url, video_id)

        # One format per tag that has a data-quality attribute followed by a
        # src attribute.  The opening delimiter (a quote, optionally preceded
        # by a backslash) is captured as group 1 so the very same delimiter
        # must close the quality value and wrap the src value.
        # Height is only filled in for quality labels shaped like "720p".
        formats = [{
            'url': source_url,
            'format_id': quality,
            'height': int_or_none(self._search_regex(
                r'^(\d+)[pP]$', quality, 'height', default=None)),
        } for _delim, quality, source_url in re.findall(
            r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage)]

        og_title = self._og_search_title(webpage)
        title = remove_end(og_title, ' Video - Pinkbike')

        # Prefer the description shown on the page itself; otherwise use the
        # OpenGraph one with the leading "<title>. " stripped off.
        description = self._html_search_regex(
            r'(?s)id="media-description"[^>]*>(.+?)<',
            webpage, 'description', default=None)
        if not description:
            description = remove_start(
                self._og_search_description(webpage), title + '. ')

        thumbnail = self._og_search_thumbnail(webpage)

        raw_duration = self._html_search_meta(
            'video:duration', webpage, 'duration')
        duration = int_or_none(raw_duration)

        uploader = self._search_regex(
            r'<a[^>]+\brel=["\']author[^>]+>([^<]+)', webpage,
            'uploader', fatal=False)

        raw_upload_date = self._search_regex(
            r'class="fullTime"[^>]+title="([^"]+)"',
            webpage, 'upload date', fatal=False)
        upload_date = unified_strdate(raw_upload_date)

        location = self._html_search_regex(
            r'(?s)<dt>Location</dt>\s*<dd>(.+?)<',
            webpage, 'location', fatal=False)

        def stat_count(label):
            # The number sits in a "stat-num" span immediately followed by a
            # "stat-label" span whose text starts with the given label.
            raw_count = self._search_regex(
                r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s' % label,
                webpage, label, fatal=False)
            return str_to_int(raw_count)

        view_count = stat_count('Views')
        comment_count = stat_count('Comments')

        info = {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'upload_date': upload_date,
            'uploader': uploader,
            'location': location,
            'view_count': view_count,
            'comment_count': comment_count,
            'formats': formats
        }
        return info
[pinkbike] new extractor 2015-05-24 20:26:59 +00:00			`import re`

			`from .common import InfoExtractor`
[pinkbike] used proper conversion methods 2015-05-24 21:45:10 +00:00			`from ..utils import (`
			`int_or_none,`
			`remove_end,`
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`remove_start,`
			`str_to_int,`
			`unified_strdate,`
[pinkbike] used proper conversion methods 2015-05-24 21:45:10 +00:00			`)`
[pinkbike] new extractor 2015-05-24 20:26:59 +00:00

			`class PinkbikeIE(InfoExtractor):`
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`_VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/\|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)'`
[pinkbike] new extractor 2015-05-24 20:26:59 +00:00			`_TESTS = [{`
			`'url': 'http://www.pinkbike.com/video/402811/',`
			`'md5': '4814b8ca7651034cd87e3361d5c2155a',`
			`'info_dict': {`
			`'id': '402811',`
			`'ext': 'mp4',`
			`'title': 'Brandon Semenuk - RAW 100',`
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`'description': 'Official release: www.redbull.ca/rupertwalker',`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 12:08:07 +00:00			`'thumbnail': r're:^https?://.*\.jpg$',`
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`'duration': 100,`
[pinkbike] new extractor 2015-05-24 20:26:59 +00:00			`'upload_date': '20150406',`
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`'uploader': 'revelco',`
			`'location': 'Victoria, British Columbia, Canada',`
			`'view_count': int,`
			`'comment_count': int,`
[pinkbike] new extractor 2015-05-24 20:26:59 +00:00			`}`
			`}, {`
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629',`
			`'only_matching': True,`
[pinkbike] new extractor 2015-05-24 20:26:59 +00:00			`}]`

			`def _real_extract(self, url):`
			`video_id = self._match_id(url)`

[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`webpage = self._download_webpage(`
			`'http://www.pinkbike.com/video/%s' % video_id, video_id)`
[pinkbike] new extractor 2015-05-24 20:26:59 +00:00
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`formats = []`
			`for _, format_id, src in re.findall(`
[pinkbike] PEP8 2015-06-21 10:22:19 +00:00			`r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage):`
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`height = int_or_none(self._search_regex(`
			`r'^(\d+)[pP]$', format_id, 'height', default=None))`
			`formats.append({`
			`'url': src,`
			`'format_id': format_id,`
			`'height': height,`
			`})`
[pinkbike] used proper conversion methods 2015-05-24 21:45:10 +00:00
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike')`
			`description = self._html_search_regex(`
			`r'(?s)id="media-description"[^>]*>(.+?)<',`
			`webpage, 'description', default=None) or remove_start(`
			`self._og_search_description(webpage), title + '. ')`
			`thumbnail = self._og_search_thumbnail(webpage)`
[pinkbike] used proper conversion methods 2015-05-24 21:45:10 +00:00			`duration = int_or_none(self._html_search_meta(`
			`'video:duration', webpage, 'duration'))`
[pinkbike] new extractor 2015-05-24 20:26:59 +00:00
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`uploader = self._search_regex(`
[pinkbike] Fix uploader extraction (closes #12054) 2017-02-14 19:08:32 +00:00			`r'<a[^>]+\brel=["\']author[^>]+>([^<]+)', webpage,`
			`'uploader', fatal=False)`
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`upload_date = unified_strdate(self._search_regex(`
			`r'class="fullTime"[^>]+title="([^"]+)"',`
			`webpage, 'upload date', fatal=False))`
[pinkbike] new extractor 2015-05-24 20:26:59 +00:00
			`location = self._html_search_regex(`
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`r'(?s)<dt>Location</dt>\s*<dd>(.+?)<',`
			`webpage, 'location', fatal=False)`
[pinkbike] new extractor 2015-05-24 20:26:59 +00:00
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`def extract_count(webpage, label):`
			`return str_to_int(self._search_regex(`
			`r'<span[^>]+class="stat-num"[^>]>([\d,.]+)</span>\s<span[^>]+class="stat-label"[^>]*>%s' % label,`
			`webpage, label, fatal=False))`
[pinkbike] new extractor 2015-05-24 20:26:59 +00:00
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`view_count = extract_count(webpage, 'Views')`
			`comment_count = extract_count(webpage, 'Comments')`
[pinkbike] new extractor 2015-05-24 20:26:59 +00:00
			`return {`
			`'id': video_id,`
			`'title': title,`
			`'description': description,`
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`'thumbnail': thumbnail,`
[pinkbike] used proper conversion methods 2015-05-24 21:45:10 +00:00			`'duration': duration,`
[pinkbike] new extractor 2015-05-24 20:26:59 +00:00			`'upload_date': upload_date,`
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`'uploader': uploader,`
[pinkbike] new extractor 2015-05-24 20:26:59 +00:00			`'location': location,`
[pinkbike] Improve and simplify 2015-06-19 18:10:08 +00:00			`'view_count': view_count,`
			`'comment_count': comment_count,`
[pinkbike] new extractor 2015-05-24 20:26:59 +00:00			`'formats': formats`
			`}`