Mirror of https://github.com/yt-dlp/yt-dlp.git (synced 2024-11-26 02:55:17 +00:00)
[utils] Support TTML without default namespace
Strictly speaking, TTML that does not declare the default TTML namespace is invalid, but Yahoo serves subtitles in this form, so it is accepted anyway.
This commit is contained in:
parent
2aa64b89b3
commit
1b0427e6c4
2 changed files with 21 additions and 3 deletions
|
@ -621,6 +621,21 @@ def test_dfxp2srt(self):
|
||||||
'''
|
'''
|
||||||
self.assertEqual(dfxp2srt(dfxp_data), srt_data)
|
self.assertEqual(dfxp2srt(dfxp_data), srt_data)
|
||||||
|
|
||||||
|
dfxp_data_no_default_namespace = '''<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<tt xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
|
||||||
|
<body>
|
||||||
|
<div xml:lang="en">
|
||||||
|
<p begin="0" end="1">The first line</p>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</tt>'''
|
||||||
|
srt_data = '''1
|
||||||
|
00:00:00,000 --> 00:00:01,000
|
||||||
|
The first line
|
||||||
|
|
||||||
|
'''
|
||||||
|
self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
|
@ -1848,9 +1848,9 @@ def parse_node(node):
|
||||||
out = str_or_empty(node.text)
|
out = str_or_empty(node.text)
|
||||||
|
|
||||||
for child in node:
|
for child in node:
|
||||||
if child.tag == _x('ttml:br'):
|
if child.tag in (_x('ttml:br'), 'br'):
|
||||||
out += '\n' + str_or_empty(child.tail)
|
out += '\n' + str_or_empty(child.tail)
|
||||||
elif child.tag == _x('ttml:span'):
|
elif child.tag in (_x('ttml:span'), 'span'):
|
||||||
out += str_or_empty(parse_node(child))
|
out += str_or_empty(parse_node(child))
|
||||||
else:
|
else:
|
||||||
out += str_or_empty(xml.etree.ElementTree.tostring(child))
|
out += str_or_empty(xml.etree.ElementTree.tostring(child))
|
||||||
|
@ -1859,7 +1859,10 @@ def parse_node(node):
|
||||||
|
|
||||||
dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
|
dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
|
||||||
out = []
|
out = []
|
||||||
paras = dfxp.findall(_x('.//ttml:p'))
|
paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
|
||||||
|
|
||||||
|
if not paras:
|
||||||
|
raise ValueError('Invalid dfxp/TTML subtitle')
|
||||||
|
|
||||||
for para, index in zip(paras, itertools.count(1)):
|
for para, index in zip(paras, itertools.count(1)):
|
||||||
begin_time = parse_dfxp_time_expr(para.attrib['begin'])
|
begin_time = parse_dfxp_time_expr(para.attrib['begin'])
|
||||||
|
|
Loading…
Reference in a new issue