Document and test categories (#2923)

This commit is contained in:
Philipp Hagemeister 2014-05-15 12:41:42 +02:00
parent 5afa7f8bee
commit ad3bc6acd5
2 changed files with 11 additions and 7 deletions

View File

@ -113,6 +113,8 @@ class InfoExtractor(object):
webpage_url: The URL of the video webpage; if given to youtube-dl, it webpage_url: The URL of the video webpage; if given to youtube-dl, it
should make it possible to get the same result again. (It will be set should make it possible to get the same result again. (It will be set
by YoutubeDL if it's missing.) by YoutubeDL if it's missing.)
categories: A list of categories that the video falls in, for example
["Sports", "Berlin"]
Unless mentioned otherwise, the fields should be Unicode strings. Unless mentioned otherwise, the fields should be Unicode strings.

View File

@ -242,7 +242,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
u"uploader": u"Philipp Hagemeister", u"uploader": u"Philipp Hagemeister",
u"uploader_id": u"phihag", u"uploader_id": u"phihag",
u"upload_date": u"20121002", u"upload_date": u"20121002",
u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ." u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
u"categories": [u'Science & Technology'],
} }
}, },
{ {
@ -1136,18 +1137,19 @@ def _real_extract(self, url):
# upload date # upload date
upload_date = None upload_date = None
mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL) mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
if mobj is not None: if mobj is not None:
upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
upload_date = unified_strdate(upload_date) upload_date = unified_strdate(upload_date)
video_categories = []
# categories
m_cat_container = get_element_by_id("eow-category", video_webpage) m_cat_container = get_element_by_id("eow-category", video_webpage)
if m_cat_container: if m_cat_container:
video_categories = re.findall(r'<a[^<]+>(.*?)</a>', category = self._html_search_regex(
m_cat_container, re.DOTALL) r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'cateory',
default=None)
video_categories = None if category is None else [category]
else:
video_categories = None
# description # description
video_description = get_element_by_id("eow-description", video_webpage) video_description = get_element_by_id("eow-description", video_webpage)