From d6a96153471ae7e93693cb4dee46cbec1492af7b Mon Sep 17 00:00:00 2001
From: Filippo Valsorda - Campagna
Date: Tue, 10 Apr 2012 16:31:46 +0200
Subject: [PATCH 1/4] standardized the use of unescapeHTML; added clean_html()

---
 youtube_dl/__init__.py | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index 5f874b72f..3fd5cadfd 100755
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -242,6 +242,18 @@ def htmlentity_transform(matchobj):
 	return (u'&%s;' % entity)
 
 
+def clean_html(html):
+	"""Clean an HTML snippet into a readable string"""
+	# Newline vs <br />
+	html = html.replace('\n', ' ')
+	html = re.sub('<\s*br\s*/?\s*>', '\n', html)
+	# Strip html tags
+	html = re.sub('<.*?>', '', html)
+	# Replace html entities
+	html = re.sub(ur'(?u)&(.+?);', htmlentity_transform, html)
+	return html
+
+
 def sanitize_title(utitle):
 	"""Sanitizes a video title so it could be used as part of a filename."""
 	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
@@ -3343,8 +3355,6 @@ def report_config_download(self, showName):
 		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
 	def _real_extract(self, url):
-		htmlParser = HTMLParser.HTMLParser()
-
 		mobj = re.match(self._VALID_URL, url)
 		if mobj is None:
 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 			return
@@ -3360,11 +3370,11 @@ def _real_extract(self, url):
 			return
 
 		descMatch = re.search('
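For orientation, here is a minimal standalone sketch of the three steps the new clean_html() performs on a snippet. It is illustration only, not patch code: _entity() below is a simplified stand-in for the module's htmlentity_transform(), and the sample string is invented.

    import re
    import htmlentitydefs

    def _entity(m):
        # simplified stand-in: resolve named entities, leave anything else untouched
        name = m.group(1)
        if name in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[name])
        return u'&%s;' % name

    snippet = u'A first line<br/>A &quot;second&quot; one with a <a href="#">link</a>\nsame paragraph'
    text = snippet.replace('\n', ' ')              # literal newlines become spaces
    text = re.sub('<\s*br\s*/?\s*>', '\n', text)   # <br> tags become newlines
    text = re.sub('<.*?>', '', text)               # strip the remaining tags
    text = re.sub(ur'(?u)&(.+?);', _entity, text)  # unescape entities
    print text.encode('utf-8')
    # A first line
    # A "second" one with a link same paragraph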
Date: Tue, 10 Apr 2012 18:21:00 +0200
Subject: [PATCH 2/4] removed dependency on lxml: added IDParser

---
 youtube-dl             | 131 ++++++++++++++++++++++++++---------------
 youtube_dl/__init__.py | 100 +++++++++++++++++++++----------
 2 files changed, 155 insertions(+), 76 deletions(-)

diff --git a/youtube-dl b/youtube-dl
index 5224611d2..752d762eb 100755
--- a/youtube-dl
+++ b/youtube-dl
@@ -15,6 +15,7 @@ __authors__ = (
 	'Kevin Ngo',
 	'Ori Avtalion',
 	'shizeeg',
+	'Filippo Valsorda',
 	)
 
 __license__ = 'Public Domain'
@@ -66,11 +67,6 @@ try:
 except ImportError:
 	from cgi import parse_qs
 
-try:
-	import lxml.etree
-except ImportError:
-	pass # Handled below
-
 try:
 	import xml.etree.ElementTree
 except ImportError: # Python<2.5: Not officially supported, but let it slip
@@ -197,6 +193,69 @@ except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/tr
 		raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
 	return res
 
+
+class IDParser(HTMLParser.HTMLParser):
+	"""Modified HTMLParser that isolates a tag with the specified id"""
+	def __init__(self, id):
+		self.id = id
+		self.result = None
+		self.started = False
+		self.depth = {}
+		self.html = None
+		self.watch_startpos = False
+		HTMLParser.HTMLParser.__init__(self)
+
+	def loads(self, html):
+		self.html = html
+		self.feed(html)
+		self.close()
+
+	def handle_starttag(self, tag, attrs):
+		attrs = dict(attrs)
+		if self.started:
+			self.find_startpos(None)
+		if 'id' in attrs and attrs['id'] == self.id:
+			self.result = [tag]
+			self.started = True
+			self.watch_startpos = True
+		if self.started:
+			if not tag in self.depth: self.depth[tag] = 0
+			self.depth[tag] += 1
+
+	def handle_endtag(self, tag):
+		if self.started:
+			if tag in self.depth: self.depth[tag] -= 1
+			if self.depth[self.result[0]] == 0:
+				self.started = False
+				self.result.append(self.getpos())
+
+	def find_startpos(self, x):
+		"""Needed to put the start position of the result (self.result[1])
+		after the opening tag with the requested id"""
+		if self.watch_startpos:
+			self.watch_startpos = False
+			self.result.append(self.getpos())
+	handle_entityref = handle_charref = handle_data = handle_comment = \
+	handle_decl = handle_pi = unknown_decl = find_startpos
+
+	def get_result(self):
+		if self.result == None: return None
+		if len(self.result) != 3: return None
+		lines = self.html.split('\n')
+		lines = lines[self.result[1][0]-1:self.result[2][0]]
+		lines[0] = lines[0][self.result[1][1]:]
+		if len(lines) == 1:
+			lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
+		lines[-1] = lines[-1][:self.result[2][1]]
+		return '\n'.join(lines).strip()
+
+def get_element_by_id(id, html):
+	"""Return the content of the tag with the
+	specified id in the passed HTML document"""
+	parser = IDParser(id)
+	parser.loads(html)
+	return parser.get_result()
+
+
 def preferredencoding():
 	"""Get preferred encoding.
 
@@ -241,6 +300,18 @@ def htmlentity_transform(matchobj):
 	return (u'&%s;' % entity)
 
 
+def clean_html(html):
+	"""Clean an HTML snippet into a readable string"""
+	# Newline vs <br />
+	html = html.replace('\n', ' ')
+	html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
+	# Strip html tags
+	html = re.sub('<.*?>', '', html)
+	# Replace html entities
+	html = re.sub(ur'(?u)&(.+?);', htmlentity_transform, html)
+	return html
+
+
 def sanitize_title(utitle):
 	"""Sanitizes a video title so it could be used as part of a filename."""
 	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
@@ -1419,18 +1490,9 @@ class YoutubeIE(InfoExtractor):
 			pass
 
 		# description
-		try:
-			lxml.etree
-		except NameError:
-			video_description = u'No description available.'
-			mobj = re.search(r'', video_webpage)
-			if mobj is not None:
-				video_description = mobj.group(1).decode('utf-8')
-		else:
-			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
-			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
-			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
-			# TODO use another parser
+		video_description = get_element_by_id("eow-description", video_webpage)
+		if video_description: video_description = clean_html(video_description.decode('utf8'))
+		else: video_description = ''
 
 		# closed captions
 		video_subtitles = None
@@ -2164,18 +2226,9 @@ class VimeoIE(InfoExtractor):
 		video_thumbnail = config["video"]["thumbnail"]
 
 		# Extract video description
-		try:
-			lxml.etree
-		except NameError:
-			video_description = u'No description available.'
-			mobj = re.search(r'', webpage, re.MULTILINE)
-			if mobj is not None:
-				video_description = mobj.group(1)
-		else:
-			html_parser = lxml.etree.HTMLParser()
-			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
-			video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
-			# TODO use another parser
+		video_description = get_element_by_id("description", webpage)
+		if video_description: video_description = clean_html(video_description.decode('utf8'))
+		else: video_description = ''
 
 		# Extract upload date
 		video_upload_date = u'NA'
@@ -3342,8 +3395,6 @@ class EscapistIE(InfoExtractor):
 		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
 	def _real_extract(self, url):
-		htmlParser = HTMLParser.HTMLParser()
-
 		mobj = re.match(self._VALID_URL, url)
 		if mobj is None:
 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 			return
@@ -3359,11 +3410,11 @@ class EscapistIE(InfoExtractor):
 			return
 
 		descMatch = re.search('
 	html = html.replace('\n', ' ')
-	html = re.sub('<\s*br\s*/?\s*>', '\n', html)
+	html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 	# Strip html tags
 	html = re.sub('<.*?>', '', html)
 	# Replace html entities
@@ -1432,18 +1490,9 @@ def _real_extract(self, url):
 			pass
 
 		# description
-		try:
-			lxml.etree
-		except NameError:
-			video_description = u'No description available.'
-			mobj = re.search(r'', video_webpage)
-			if mobj is not None:
-				video_description = mobj.group(1).decode('utf-8')
-		else:
-			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
-			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
-			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
-			# TODO use another parser
+		video_description = get_element_by_id("eow-description", video_webpage)
+		if video_description: video_description = clean_html(video_description.decode('utf8'))
+		else: video_description = ''
 
 		# closed captions
 		video_subtitles = None
@@ -2177,18 +2226,9 @@ def _real_extract(self, url, new_video=True):
 		video_thumbnail = config["video"]["thumbnail"]
 
 		# Extract video description
-		try:
-			lxml.etree
-		except NameError:
-			video_description = u'No description available.'
-			mobj = re.search(r'', webpage, re.MULTILINE)
-			if mobj is not None:
-				video_description = mobj.group(1)
-		else:
-			html_parser = lxml.etree.HTMLParser()
-			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
-			video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
-			# TODO use another parser
+		video_description = get_element_by_id("description", webpage)
+		if video_description: video_description = clean_html(video_description.decode('utf8'))
+		else: video_description = ''
 
 		# Extract upload date
 		video_upload_date = u'NA'
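A rough usage sketch of the new helpers, mirroring how the YouTube and Vimeo extractors now build the description. Assumptions: the patched youtube_dl package (with IDParser, get_element_by_id and clean_html as added above) is importable, and the webpage string is made up.

    from youtube_dl import get_element_by_id, clean_html

    webpage = ('<html><body>'
               '<p id="eow-description">First line<br>Second &amp; last line</p>'
               '</body></html>')

    video_description = get_element_by_id("eow-description", webpage)
    if video_description:
        # same call chain as the extractors: raw inner HTML, decoded, then cleaned
        video_description = clean_html(video_description.decode('utf8'))
    else:
        video_description = ''

    print repr(video_description)
    # u'First line\nSecond & last line'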
From 781cc523af69a98efbd1b93cc89cec76145b8d14 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda - Campagna
Date: Tue, 10 Apr 2012 18:54:40 +0200
Subject: [PATCH 3/4] removed the undocumented HTMLParser.unescape, replaced
 with _unescapeHTML; fixed a bug in the use of _unescapeHTML (missing _, from
 d6a96153471ae7e93693cb4dee46cbec1492af7b)

---
 youtube-dl             | 28 ++++++++++++++--------------
 youtube_dl/__init__.py | 28 ++++++++++++++--------------
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/youtube-dl b/youtube-dl
index 752d762eb..78fb07ea1 100755
--- a/youtube-dl
+++ b/youtube-dl
@@ -308,13 +308,13 @@ def clean_html(html):
 	# Strip html tags
 	html = re.sub('<.*?>', '', html)
 	# Replace html entities
-	html = re.sub(ur'(?u)&(.+?);', htmlentity_transform, html)
+	html = _unescapeHTML(html)
 	return html
 
 
 def sanitize_title(utitle):
 	"""Sanitizes a video title so it could be used as part of a filename."""
-	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
+	utitle = _unescapeHTML(utitle)
 	return utitle.replace(unicode(os.sep), u'%')
 
 
@@ -371,8 +371,8 @@ def _unescapeHTML(s):
 	"""
 	assert type(s) == type(u'')
 
-	htmlParser = HTMLParser.HTMLParser()
-	return htmlParser.unescape(s)
+	result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
+	return result
 
 def _encodeFilename(s):
 	"""
@@ -1324,8 +1324,8 @@ class YoutubeIE(InfoExtractor):
 				end = start + float(dur)
 				start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 				end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
-				caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
-				caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, intentional
+				caption = _unescapeHTML(caption)
+				caption = _unescapeHTML(caption) # double cycle, intentional
 				srt += str(n) + '\n'
 				srt += start + ' --> ' + end + '\n'
 				srt += caption + '\n\n'
@@ -2143,7 +2143,7 @@ class YahooIE(InfoExtractor):
 			self._downloader.trouble(u'ERROR: Unable to extract media URL')
 			return
 		video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
-		video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
+		video_url = _unescapeHTML(video_url)
 
 		try:
 			# Process video information
@@ -3410,11 +3410,11 @@ class EscapistIE(InfoExtractor):
 			return
 
 		descMatch = re.search('([^<]+)', coursepage)
 		if m:
-			info['title'] = unescapeHTML(m.group(1))
+			info['title'] = _unescapeHTML(m.group(1))
 		else:
 			info['title'] = info['id']
 		info['stitle'] = _simplify_title(info['title'])
 
 		m = re.search('([^<]+)', coursepage)
 		if m:
-			info['description'] = unescapeHTML(m.group(1))
+			info['description'] = _unescapeHTML(m.group(1))
 
 		links = _orderedSet(re.findall('', coursepage))
 		info['list'] = [
 			{
 				'type': 'reference',
-				'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
+				'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(vpage),
 			}
 				for vpage in links]
 
@@ -4007,7 +4007,7 @@ class StanfordOpenClassroomIE(InfoExtractor):
 		info['list'] = [
 			{
 				'type': 'reference',
-				'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
+				'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(cpage),
 			}
 				for cpage in links]
 
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index 752d762eb..78fb07ea1 100755
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -308,13 +308,13 @@ def clean_html(html):
 	# Strip html tags
 	html = re.sub('<.*?>', '', html)
 	# Replace html entities
-	html = re.sub(ur'(?u)&(.+?);', htmlentity_transform, html)
+	html = _unescapeHTML(html)
 	return html
 
 
 def sanitize_title(utitle):
 	"""Sanitizes a video title so it could be used as part of a filename."""
-	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
+	utitle = _unescapeHTML(utitle)
 	return utitle.replace(unicode(os.sep), u'%')
 
 
@@ -371,8 +371,8 @@ def _unescapeHTML(s):
 	"""
 	assert type(s) == type(u'')
 
-	htmlParser = HTMLParser.HTMLParser()
-	return htmlParser.unescape(s)
+	result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
+	return result
 
 def _encodeFilename(s):
 	"""
@@ -1324,8 +1324,8 @@ def _closed_captions_xml_to_srt(self, xml_string):
 				end = start + float(dur)
 				start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 				end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
-				caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
-				caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, intentional
+				caption = _unescapeHTML(caption)
+				caption = _unescapeHTML(caption) # double cycle, intentional
 				srt += str(n) + '\n'
 				srt += start + ' --> ' + end + '\n'
 				srt += caption + '\n\n'
@@ -2143,7 +2143,7 @@ def _real_extract(self, url, new_video=True):
 			self._downloader.trouble(u'ERROR: Unable to extract media URL')
 			return
 		video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
-		video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
+		video_url = _unescapeHTML(video_url)
 
 		try:
 			# Process video information
@@ -3410,11 +3410,11 @@ def _real_extract(self, url):
 			return
 
 		descMatch = re.search('([^<]+)', coursepage)
 		if m:
-			info['title'] = unescapeHTML(m.group(1))
+			info['title'] = _unescapeHTML(m.group(1))
 		else:
 			info['title'] = info['id']
 		info['stitle'] = _simplify_title(info['title'])
 
 		m = re.search('([^<]+)', coursepage)
 		if m:
-			info['description'] = unescapeHTML(m.group(1))
+			info['description'] = _unescapeHTML(m.group(1))
 
 		links = _orderedSet(re.findall('', coursepage))
 		info['list'] = [
 			{
 				'type': 'reference',
-				'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
+				'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(vpage),
 			}
 				for vpage in links]
 
@@ -4007,7 +4007,7 @@ def _real_extract(self, url):
 		info['list'] = [
 			{
 				'type': 'reference',
-				'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
+				'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(cpage),
 			}
 				for cpage in links]
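As a side note on the regex-based replacement for HTMLParser.unescape: below is a rough, self-contained approximation of what it handles. The _transform() function is only a stand-in for the module's htmlentity_transform() (named, decimal and hexadecimal references), and the inputs are invented. It also shows why the subtitle code runs the substitution twice: a second pass collapses doubly escaped input.

    import re
    import htmlentitydefs

    def _transform(m):
        # simplified stand-in for htmlentity_transform()
        entity = m.group(1)
        if entity in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[entity])
        if entity.startswith('#x') or entity.startswith('#X'):
            return unichr(int(entity[2:], 16))   # hexadecimal character reference
        if entity.startswith('#'):
            return unichr(int(entity[1:]))       # decimal character reference
        return u'&%s;' % entity                  # unknown references are kept as-is

    unescape = lambda s: re.sub(ur'(?u)&(.+?);', _transform, s)

    print unescape(u'Tom &amp; Jerry at the caf&#233;, 100&#x25; true').encode('utf-8')
    # Tom & Jerry at the café, 100% true

    print unescape(u'&amp;amp;').encode('utf-8')            # one pass:  &amp;
    print unescape(unescape(u'&amp;amp;')).encode('utf-8')  # two passes: &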
From 7a8501e307ec1283aeacb03b471b5509b8c92854 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda
Date: Tue, 10 Apr 2012 23:08:53 +0200
Subject: [PATCH 4/4] ignore parsing errors in get_element_by_id()

---
 youtube-dl             | 5 ++++-
 youtube_dl/__init__.py | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/youtube-dl b/youtube-dl
index 78fb07ea1..dc7ec136f 100755
--- a/youtube-dl
+++ b/youtube-dl
@@ -252,7 +252,10 @@ class IDParser(HTMLParser.HTMLParser):
 def get_element_by_id(id, html):
 	"""Return the content of the tag with the
 	specified id in the passed HTML document"""
 	parser = IDParser(id)
-	parser.loads(html)
+	try:
+		parser.loads(html)
+	except HTMLParser.HTMLParseError:
+		pass
 	return parser.get_result()
 
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index 78fb07ea1..dc7ec136f 100755
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -252,7 +252,10 @@ def get_result(self):
 def get_element_by_id(id, html):
 	"""Return the content of the tag with the
 	specified id in the passed HTML document"""
 	parser = IDParser(id)
-	parser.loads(html)
+	try:
+		parser.loads(html)
+	except HTMLParser.HTMLParseError:
+		pass
 	return parser.get_result()
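To close, a small end-to-end sketch of the helper this series converges on. It assumes the patched youtube_dl package is importable and uses an invented page. It illustrates that the depth bookkeeping in IDParser returns the whole element even when the same tag name is nested inside it; with the try/except added here, a page that makes the parser raise HTMLParser.HTMLParseError simply yields whatever was matched so far (possibly None) instead of propagating the exception out of get_element_by_id().

    from youtube_dl import get_element_by_id, clean_html

    page = ('<div id="outer">before '
            '<div class="inner">nested &amp; kept</div>'
            ' after</div>')

    raw = get_element_by_id("outer", page)
    print raw
    # before <div class="inner">nested &amp; kept</div> after

    print clean_html(raw.decode('utf8')).encode('utf-8')
    # before nested & kept after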