Don't guess the site info / restrict the description length

This commit is contained in:
Michael Vogel 2019-11-15 14:28:42 +01:00
parent a8fa7e5187
commit a3b7f08f78

View file

@ -17,6 +17,16 @@ use Friendica\Database\DBA;
*/ */
class ParseUrl class ParseUrl
{ {
/**
* Maximum number of characters for the description
*/
const MAX_DESC_COUNT = 250;
/**
* Minimum number of characters for the description
*/
const MIN_DESC_COUNT = 100;
/** /**
* @brief Search for cached embeddable data of a URL, otherwise fetch it * @brief Search for cached embeddable data of a URL, otherwise fetch it
* *
@ -336,36 +346,7 @@ class ParseUrl
$siteinfo['type'] = 'link'; $siteinfo['type'] = 'link';
} }
if (empty($siteinfo['image']) && !$no_guessing) { if (!empty($siteinfo['image'])) {
$list = $xpath->query('//img[@src]');
foreach ($list as $node) {
$img_tag = [];
if ($node->attributes->length) {
foreach ($node->attributes as $attribute) {
$img_tag[$attribute->name] = $attribute->value;
}
}
$src = self::completeUrl($img_tag['src'], $url);
$photodata = Images::getInfoFromURLCached($src);
if (($photodata) && ($photodata[0] > 150) && ($photodata[1] > 150)) {
if ($photodata[0] > 300) {
$photodata[1] = round($photodata[1] * (300 / $photodata[0]));
$photodata[0] = 300;
}
if ($photodata[1] > 300) {
$photodata[0] = round($photodata[0] * (300 / $photodata[1]));
$photodata[1] = 300;
}
$siteinfo['images'][] = [
'src' => $src,
'width' => $photodata[0],
'height' => $photodata[1]
];
}
}
} elseif (!empty($siteinfo['image'])) {
$src = self::completeUrl($siteinfo['image'], $url); $src = self::completeUrl($siteinfo['image'], $url);
unset($siteinfo['image']); unset($siteinfo['image']);
@ -379,47 +360,15 @@ class ParseUrl
} }
} }
if ((@$siteinfo['text'] == '') && (@$siteinfo['title'] != '') && !$no_guessing) { if (!empty($siteinfo['text']) && mb_strlen($siteinfo['text']) > self::MAX_DESC_COUNT) {
$text = ''; $siteinfo['text'] = mb_substr($siteinfo['text'], 0, self::MAX_DESC_COUNT) . '…';
$pos = mb_strrpos($siteinfo['text'], '.');
$list = $xpath->query('//div[@class="article"]'); if ($pos > self::MIN_DESC_COUNT) {
foreach ($list as $node) { $siteinfo['text'] = mb_substr($siteinfo['text'], 0, $pos + 1);
if (strlen($node->nodeValue) > 40) {
$text .= ' ' . trim($node->nodeValue);
} }
} }
if ($text == '') { Logger::info('Siteinfo fetched', ['url' => $url, 'siteinfo' => $siteinfo]);
$list = $xpath->query('//div[@class="content"]');
foreach ($list as $node) {
if (strlen($node->nodeValue) > 40) {
$text .= ' ' . trim($node->nodeValue);
}
}
}
// If no text was found then take the paragraph content
if ($text == '') {
$list = $xpath->query('//p');
foreach ($list as $node) {
if (strlen($node->nodeValue) > 40) {
$text .= ' ' . trim($node->nodeValue);
}
}
}
if ($text != '') {
$text = trim(str_replace(["\n", "\r"], [' ', ' '], $text));
while (strpos($text, ' ')) {
$text = trim(str_replace(' ', ' ', $text));
}
$siteinfo['text'] = trim(html_entity_decode(substr($text, 0, 350), ENT_QUOTES, 'UTF-8') . '...');
}
}
Logger::log('Siteinfo for ' . $url . ' ' . print_r($siteinfo, true), Logger::DEBUG);
Hook::callAll('getsiteinfo', $siteinfo); Hook::callAll('getsiteinfo', $siteinfo);