Use clean_html() from ..utils instead of the local _clearn_html() helper

2024-11-26 02:55:17 +00:00 · 2013-08-03 10:29:58 +08:00 · 2013-08-03 10:29:58 +08:00 · 4ec929dc9b
commit 4ec929dc9b
parent 6624a2b07d
1 changed files with 6 additions and 13 deletions
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -7,7 +7,7 @@
 import urllib2
 from .common import InfoExtractor
-from ..utils import compat_urllib_request
+from ..utils import compat_urllib_request, clean_html
 class SohuIE(InfoExtractor):
@@ -22,16 +22,6 @@ class SohuIE(InfoExtractor):
        },
    }
    def _clearn_html(self, string):
        """Strip tag-like substrings from *string* and tidy whitespace.

        Every substring matching ``<.+?>`` is replaced by a single space,
        whitespace runs are then replaced by single spaces in two passes,
        and the result is returned with leading and trailing whitespace
        stripped.
        """
        # No regex flags are passed, so `.` does not match a newline: a
        # tag that spans several lines is not found and is left in place.
        tags = re.findall(r'<.+?>', string)
        for t in tags:
            # str.replace substitutes every occurrence of this exact tag text.
            string = string.replace(t, ' ')
        # The collapse runs twice. NOTE(review): presumably because
        # replacing a short run everywhere can leave a longer run only
        # partly collapsed after one pass -- confirm; nothing here shows
        # that two passes are enough for every input.
        for i in range(2):
            spaces = re.findall(r'\s+', string)
            for s in spaces:
                string = string.replace(s, ' ')
        string = string.strip()
        return string
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
@@ -40,7 +30,7 @@ def _real_extract(self, url):
        pattern = r'<h1 id="video-title">\n*?(.+?)\n*?</h1>'
        compiled = re.compile(pattern, re.DOTALL)
        title = self._search_regex(compiled, webpage, u'video title').strip('\t\n')
-        title = self._clearn_html(title)
+        title = clean_html(title)
        pattern = re.compile(r'var vid="(\d+)"')
        result = re.search(pattern, webpage)
        if not result:
@@ -93,5 +83,8 @@ def _real_extract(self, url):
            }
            files_info.append(info)
            time.sleep(1)
-
+        if num_of_parts == 1:
            info =  files_info[0]
            info['id'] = video_id
            return info
        return files_info