Improve HTML::toPlaintext
- Ignore text nodes that are empty after trimming - Ignore anchor links - Ignore blank nodes and avoid adding a doctype to transitional DOM objects
This commit is contained in:
parent
d5fc4a268d
commit
a0f77e1800
1 changed file with 10 additions and 4 deletions
|
@ -56,6 +56,7 @@ class HTML
|
|||
|
||||
$xpath = new DOMXPath($doc);
|
||||
|
||||
/** @var \DOMNode[] $list */
|
||||
$list = $xpath->query("//" . $tag);
|
||||
foreach ($list as $node) {
|
||||
$attr = [];
|
||||
|
@ -98,9 +99,12 @@ class HTML
|
|||
$node->parentNode->insertBefore($StartCode, $node);
|
||||
|
||||
if ($node->hasChildNodes()) {
|
||||
/** @var \DOMNode $child */
|
||||
foreach ($node->childNodes as $child) {
|
||||
$newNode = $child->cloneNode(true);
|
||||
$node->parentNode->insertBefore($newNode, $node);
|
||||
if (trim($child->nodeValue)) {
|
||||
$newNode = $child->cloneNode(true);
|
||||
$node->parentNode->insertBefore($newNode, $node);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -559,6 +563,8 @@ class HTML
|
|||
$ignore = false;
|
||||
}
|
||||
|
||||
$ignore = $ignore || strpos($treffer[1], '#') === 0;
|
||||
|
||||
if (!$ignore) {
|
||||
$urls[$treffer[1]] = $treffer[1];
|
||||
}
|
||||
|
@ -582,7 +588,7 @@ class HTML
|
|||
|
||||
$message = mb_convert_encoding($message, 'HTML-ENTITIES', "UTF-8");
|
||||
|
||||
@$doc->loadHTML($message);
|
||||
@$doc->loadHTML($message, LIBXML_HTML_NODEFDTD | LIBXML_NOBLANKS);
|
||||
|
||||
$message = $doc->saveHTML();
|
||||
// Remove eventual UTF-8 BOM
|
||||
|
@ -591,7 +597,7 @@ class HTML
|
|||
// Collecting all links
|
||||
$urls = self::collectURLs($message);
|
||||
|
||||
@$doc->loadHTML($message);
|
||||
@$doc->loadHTML($message, LIBXML_HTML_NODEFDTD | LIBXML_NOBLANKS);
|
||||
|
||||
self::tagToBBCode($doc, 'html', [], '', '');
|
||||
self::tagToBBCode($doc, 'body', [], '', '');
|
||||
|
|
Loading…
Reference in a new issue