yt-dlp/youtube_dl/extractor/teamcoco.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
)


class TeamcocoIE(InfoExtractor):
    """Information extractor for video pages on teamcoco.com.

    ``_VALID_URL`` accepts two URL shapes: ``/video/<numeric id>/<title>``
    and ``/video/<title>`` (no numeric id in the URL). In the second case
    the id is read from the downloaded page instead.
    """

    _VALID_URL = r'http://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<url_title>.*)'
    _TESTS = [
        {
            'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant',
            'file': '80187.mp4',
            'md5': '3f7746aa0dc86de18df7539903d399ea',
            'info_dict': {
                'title': 'Conan Becomes A Mary Kay Beauty Consultant',
                'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.'
            }
        },
        {
            'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush',
            'file': '19705.mp4',
            'md5': 'cde9ba0fa3506f5f017ce11ead928f9a',
            'info_dict': {
                "description": "Louis C.K. got starstruck by George W. Bush, so what? Part one.",
                "title": "Louis C.K. Interview Pt. 1 11/3/11"
            }
        }
    ]

    def _real_extract(self, url):
        """Build the info dict for a single teamcoco.com video URL.

        Downloads the video page, resolves the numeric video id (from the
        URL when present, otherwise from the page markup), downloads the
        per-video XML document and turns each ``files/file`` element into a
        format entry.

        Returns a dict with the keys ``id``, ``formats``, ``title``,
        ``thumbnail`` and ``description``.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        video_id = mobj.group("video_id")
        # The id group is optional: when it does not take part in the match,
        # group() yields None rather than '', so test for any falsy value.
        if not video_id:
            video_id = self._html_search_regex(
                r'<article class="video" data-id="(\d+?)"',
                webpage, 'video id')

        self.report_extraction(video_id)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_xml(data_url, video_id, 'Downloading data webpage')

        # Known format ids, ordered from least to most preferred; the list
        # index is used as the 'quality' value of each format.
        qualities = ['500k', '480p', '1000k', '720p', '1080p']
        formats = []
        for filed in data.findall('files/file'):
            if filed.attrib.get('playmode') == 'all':
                # it just duplicates one of the entries
                # NOTE(review): `break` also drops every entry listed after
                # this one -- confirm that is intended.
                break
            file_url = filed.text
            bitrate = filed.attrib['bitrate']

            # Prefer the "<digits>k" / "<digits>p" token embedded in the file
            # name; fall back to the raw bitrate attribute otherwise.
            m_format = re.search(r'(\d+(k|p))\.mp4', file_url)
            if m_format is not None:
                format_id = m_format.group(1)
            else:
                format_id = bitrate
            tbr = int(bitrate) if bitrate.isdigit() else None

            try:
                quality = qualities.index(format_id)
            except ValueError:
                # Format ids missing from the list rank below all known ones.
                quality = -1
            formats.append({
                'url': file_url,
                'ext': 'mp4',
                'tbr': tbr,
                'format_id': format_id,
                'quality': quality,
            })

        self._sort_formats(formats)

        return {
            'id': video_id,
            'formats': formats,
            'title': self._og_search_title(webpage),
            'thumbnail': self._og_search_thumbnail(webpage),
            'description': self._og_search_description(webpage),
        }
[teamcoco] Use unicode_literals 2014-01-17 02:15:09 +00:00			`from __future__ import unicode_literals`

[Teamcoco] Move into own file 2013-06-23 20:31:50 +00:00			`import re`

			`from .common import InfoExtractor`
			`from ..utils import (`
			`ExtractorError,`
			`)`


			`class TeamcocoIE(InfoExtractor):`
[teamcoco] Fix regex in 2.6 (#2700) The re engine does not want to repeat an empty string, for fear that something like (.) could be matching the tokens ... "" "" "" "" "" "" Of course, that's harmless with a question mark, although still somewhat strange. 2014-04-04 20:46:44 +00:00			`_VALID_URL = r'http://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<url_title>.*)'`
Add a test for the new URL pages Add a test for the pages with the video_id in the URL. 2014-04-04 17:52:35 +00:00			`_TESTS = [`
			`{`
			`'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant',`
			`'file': '80187.mp4',`
			`'md5': '3f7746aa0dc86de18df7539903d399ea',`
			`'info_dict': {`
			`'title': 'Conan Becomes A Mary Kay Beauty Consultant',`
			`'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.'`
			`}`
			`},`
			`{`
[teamcoco] Use unicode_literals 2014-01-17 02:15:09 +00:00			`'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush',`
			`'file': '19705.mp4',`
			`'md5': 'cde9ba0fa3506f5f017ce11ead928f9a',`
			`'info_dict': {`
			`"description": "Louis C.K. got starstruck by George W. Bush, so what? Part one.",`
			`"title": "Louis C.K. Interview Pt. 1 11/3/11"`
Move tests to the IE definitions 2013-06-27 18:46:46 +00:00			`}`
			`}`
Add a test for the new URL pages Add a test for the pages with the video_id in the URL. 2014-04-04 17:52:35 +00:00			`]`
[Teamcoco] Move into own file 2013-06-23 20:31:50 +00:00
			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
			`if mobj is None:`
[teamcoco] Use unicode_literals 2014-01-17 02:15:09 +00:00			`raise ExtractorError('Invalid URL: %s' % url)`
Revert "Workaround for regex engine limitation" This reverts commit 6d0d573ecaf763ce2b043ef7f83a743170eab16d. 2014-04-04 19:37:49 +00:00			`url_title = mobj.group('url_title')`
[Teamcoco] Move into own file 2013-06-23 20:31:50 +00:00			`webpage = self._download_webpage(url, url_title)`
Support TeamCoco URLs with video_id in the title If the URL has the video_id in it, use that since the current method of finding the id breaks on those pages. Fixes 2698. 2014-04-04 17:42:34 +00:00
Revert "Workaround for regex engine limitation" This reverts commit 6d0d573ecaf763ce2b043ef7f83a743170eab16d. 2014-04-04 19:37:49 +00:00			`video_id = mobj.group("video_id")`
			`if video_id == '':`
Support TeamCoco URLs with video_id in the title If the URL has the video_id in it, use that since the current method of finding the id breaks on those pages. Fixes 2698. 2014-04-04 17:42:34 +00:00			`video_id = self._html_search_regex(`
			`r'<article class="video" data-id="(\d+?)"',`
			`webpage, 'video id')`

[Teamcoco] Move into own file 2013-06-23 20:31:50 +00:00			`self.report_extraction(video_id)`

			`data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id`
Use the new '_download_xml' helper in more extractors 2013-11-26 17:48:52 +00:00			`data = self._download_xml(data_url, video_id, 'Downloading data webpage')`
[Teamcoco] Move into own file 2013-06-23 20:31:50 +00:00
[teamcoco] Parse the xml file and extract all the formats 2013-11-03 16:48:12 +00:00			`qualities = ['500k', '480p', '1000k', '720p', '1080p']`
			`formats = []`
[teamcoco] Use centralized sorting 2014-01-17 02:22:02 +00:00			`for filed in data.findall('files/file'):`
			`if filed.attrib.get('playmode') == 'all':`
[teamcoco] Parse the xml file and extract all the formats 2013-11-03 16:48:12 +00:00			`# it just duplicates one of the entries`
			`break`
[teamcoco] Use centralized sorting 2014-01-17 02:22:02 +00:00			`file_url = filed.text`
[teamcoco] Parse the xml file and extract all the formats 2013-11-03 16:48:12 +00:00			`m_format = re.search(r'(\d+(k|p))\.mp4', file_url)`
			`if m_format is not None:`
			`format_id = m_format.group(1)`
			`else:`
[teamcoco] Use centralized sorting 2014-01-17 02:22:02 +00:00			`format_id = filed.attrib['bitrate']`
			`tbr = (`
			`int(filed.attrib['bitrate'])`
			`if filed.attrib['bitrate'].isdigit()`
			`else None)`

			`try:`
			`quality = qualities.index(format_id)`
			`except ValueError:`
			`quality = -1`
[teamcoco] Parse the xml file and extract all the formats 2013-11-03 16:48:12 +00:00			`formats.append({`
			`'url': file_url,`
			`'ext': 'mp4',`
[teamcoco] Use centralized sorting 2014-01-17 02:22:02 +00:00			`'tbr': tbr,`
[teamcoco] Parse the xml file and extract all the formats 2013-11-03 16:48:12 +00:00			`'format_id': format_id,`
[teamcoco] Use centralized sorting 2014-01-17 02:22:02 +00:00			`'quality': quality,`
[teamcoco] Parse the xml file and extract all the formats 2013-11-03 16:48:12 +00:00			`})`
[teamcoco] Use centralized sorting 2014-01-17 02:22:02 +00:00
			`self._sort_formats(formats)`
[Teamcoco] Move into own file 2013-06-23 20:31:50 +00:00
[teamcoco] Parse the xml file and extract all the formats 2013-11-03 16:48:12 +00:00			`return {`
[teamcoco] Use unicode_literals 2014-01-17 02:15:09 +00:00			`'id': video_id,`
[teamcoco] Parse the xml file and extract all the formats 2013-11-03 16:48:12 +00:00			`'formats': formats,`
[teamcoco] Use unicode_literals 2014-01-17 02:15:09 +00:00			`'title': self._og_search_title(webpage),`
			`'thumbnail': self._og_search_thumbnail(webpage),`
InfoExtractor: add some helper methods to extract OpenGraph info 2013-07-12 17:00:19 +00:00			`'description': self._og_search_description(webpage),`
[teamcoco] Parse the xml file and extract all the formats 2013-11-03 16:48:12 +00:00			`}`