From 008c8dbf36deb3e3c3f3120801c99a7e86f3b906 Mon Sep 17 00:00:00 2001 From: Michael Date: Mon, 2 Oct 2023 20:37:16 +0000 Subject: [PATCH 1/2] More languages / use profile text as fallback --- doc/Addons.md | 1 + doc/de/Addons.md | 1 + src/Content/Text/BBCode.php | 64 ++++++++++++++++++++++++++++++++++--- src/Core/L10n.php | 13 ++++++++ src/Model/Item.php | 30 ++++++++--------- 5 files changed, 90 insertions(+), 19 deletions(-) diff --git a/doc/Addons.md b/doc/Addons.md index a0b122679..8ad1f89eb 100644 --- a/doc/Addons.md +++ b/doc/Addons.md @@ -227,6 +227,7 @@ Called after the language detection. This can be used for alternative language d - **text**: The text that is analyzed. - **detected**: (input/output) Array of language codes detected in the related text. +- **uri-id**: The Uri-Id of the item. ### addon_settings Called when generating the HTML for the addon settings page. diff --git a/doc/de/Addons.md b/doc/de/Addons.md index bd13f6334..99639e2bd 100644 --- a/doc/de/Addons.md +++ b/doc/de/Addons.md @@ -109,6 +109,7 @@ Dieser Hook kann dafür verwendet werden, alternative Erkennungsfunktionen einzu `$data` ist ein Array: 'text' => Der analysierte Text. 'detected' => (Eingabe/Ausgabe) Das Array mit den erkannten Sprachen. + 'uri-id' => Die Uri-Id des Beitrags **'addon_settings'** - wird aufgerufen, wenn die HTML-Ausgabe der Addon-Einstellungsseite generiert wird. $b ist die HTML-Ausgabe (String) der Addon-Einstellungsseite vor dem finalen ""-Tag. 
diff --git a/src/Content/Text/BBCode.php b/src/Content/Text/BBCode.php index ab7300da1..36355f996 100644 --- a/src/Content/Text/BBCode.php +++ b/src/Content/Text/BBCode.php @@ -230,18 +230,73 @@ class BBCode { DI::profiler()->startRecording('rendering'); // Remove pictures in advance to avoid unneeded proxy calls + $text = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", ' ', $text); $text = preg_replace("/\[img\=(.*?)\](.*?)\[\/img\]/ism", ' $2 ', $text); $text = preg_replace("/\[img.*?\[\/img\]/ism", ' ', $text); // Remove attachment $text = self::replaceAttachment($text); - $naked_text = HTML::toPlaintext(self::convert($text, false, BBCode::EXTERNAL, true), 0, !$keep_urls); + $naked_text = HTML::toPlaintext(self::convert($text, false, self::EXTERNAL, true), 0, !$keep_urls); DI::profiler()->stopRecording(); return $naked_text; } + /** + * Converts text into a format that can be used for the channel search and the language detection. + * + * @param string $text BBCode text to convert + * @param integer $uri_id Uri-Id of the item; when not empty, its image attachments are added so that alt texts are included + * @return string Plain text with attachments, pictures, mentions and links removed + */ + public static function toSearchText(string $text, int $uri_id): string + { + // Removes attachments + $text = self::removeAttachment($text); + + // Add images because of possible alt texts + if (!empty($uri_id)) { + $text = Post\Media::addAttachmentsToBody($uri_id, $text, [Post\Media::IMAGE]); + } + + if (empty($text)) { + return ''; + } + + // Remove links without a link description (bracket-free matching keeps each match inside a single [url] tag) + $text = preg_replace("~\[url\=[^\[\]]*\]https?:[^\[\]]*\[\/url\]~", ' ', $text); + + // Remove pictures + $text = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", ' ', $text); + + // Replace picture with the alt description + $text = preg_replace("/\[img\=.*?\](.*?)\[\/img\]/ism", ' $1 ', $text); + + // Remove the other pictures + $text = preg_replace("/\[img.*?\[\/img\]/ism", ' ', $text); + + // Removes mentions, remove links from hashtags + $text = preg_replace('/[@!]\[url\=.*?\].*?\[\/url\]/ism', ' ', $text); + $text = 
preg_replace('/[#]\[url\=.*?\](.*?)\[\/url\]/ism', ' #$1 ', $text); + $text = preg_replace('/[@!#]?\[url.*?\[\/url\]/ism', ' ', $text); + $text = preg_replace("/\[url=[^\[\]]*\](.*)\[\/url\]/Usi", ' $1 ', $text); + + // Convert it to plain text + $text = self::toPlaintext($text, false); + + // Remove possibly remaining links + $text = preg_replace(Strings::autoLinkRegEx(), '', $text); + + // Remove all unneeded white space + do { + $oldtext = $text; + $text = str_replace([' ', "\n", "\r", '"', '_'], ' ', $text); + } while ($oldtext !== $text); + + return trim($text); + } + private static function proxyUrl(string $image, int $simplehtml = self::INTERNAL, int $uriid = 0, string $size = ''): string { // Only send proxied pictures to API and for internal display @@ -931,7 +986,7 @@ class BBCode $network = $contact['network'] ?? Protocol::PHANTOM; $tpl = Renderer::getMarkupTemplate('shared_content.tpl'); - $text .= BBCode::SHARED_ANCHOR . Renderer::replaceMacros($tpl, [ + $text .= self::SHARED_ANCHOR . 
Renderer::replaceMacros($tpl, [ '$profile' => $attributes['profile'], '$avatar' => $attributes['avatar'], '$author' => $attributes['author'], @@ -1112,6 +1167,7 @@ class BBCode public static function removeLinks(string $bbcode): string { DI::profiler()->startRecording('rendering'); + $bbcode = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", ' ', $bbcode); $bbcode = preg_replace("/\[img\=(.*?)\](.*?)\[\/img\]/ism", ' $1 ', $bbcode); $bbcode = preg_replace("/\[img.*?\[\/img\]/ism", ' ', $bbcode); @@ -1996,7 +2052,7 @@ class BBCode { DI::profiler()->startRecording('rendering'); - $text = BBCode::performWithEscapedTags($text, ['code', 'noparse', 'nobb', 'pre'], function ($text) { + $text = self::performWithEscapedTags($text, ['code', 'noparse', 'nobb', 'pre'], function ($text) { $text = preg_replace("/[\s|\n]*\[abstract\].*?\[\/abstract\][\s|\n]*/ism", ' ', $text); $text = preg_replace("/[\s|\n]*\[abstract=.*?\].*?\[\/abstract][\s|\n]*/ism", ' ', $text); return $text; @@ -2018,7 +2074,7 @@ class BBCode DI::profiler()->startRecording('rendering'); $addon = strtolower($addon); - $abstract = BBCode::performWithEscapedTags($text, ['code', 'noparse', 'nobb', 'pre'], function ($text) use ($addon) { + $abstract = self::performWithEscapedTags($text, ['code', 'noparse', 'nobb', 'pre'], function ($text) use ($addon) { if ($addon && preg_match('#\[abstract=' . preg_quote($addon, '#') . '](.*?)\[/abstract]#ism', $text, $matches)) { return $matches[1]; } diff --git a/src/Core/L10n.php b/src/Core/L10n.php index 7fd7fc4e8..414d578fc 100644 --- a/src/Core/L10n.php +++ b/src/Core/L10n.php @@ -400,20 +400,33 @@ class L10n // Additionally some more languages are added to that list that are used in the Fediverse. 
$additional_langs = [ 'af' => 'Afrikaans', + 'az-Latn' => 'azərbaycan dili', + 'bs-Latn' => 'bosanski jezik', + 'be' => 'беларуская мова', + 'bn' => 'বাংলা', 'cy' => 'Cymraeg', 'el-monoton' => 'Ελληνικά', 'eu' => 'euskara', 'fa' => 'فارسی', + 'ga' => 'Gaeilge', 'gl' => 'Galego', + 'he' => 'עברית', 'hi' => 'हिन्दी', 'hr' => 'Hrvatski', + 'hy' => 'Հայերեն', 'id' => 'bahasa Indonesia', + 'jv' => 'Basa Jawa', + 'ka' => 'ქართული', 'ko' => '한국인', 'lt' => 'lietuvių', 'lv' => 'latviešu', + 'ms-Latn' => 'Bahasa Melayu', + 'sr-Cyrl' => 'српски језик', 'sk' => 'slovenský', 'sl' => 'Slovenščina', + 'sq' => 'Shqip', 'sw' => 'Kiswahili', + 'ta' => 'தமிழ்', 'th' => 'แบบไทย', 'tl' => 'Wikang Tagalog', 'tr' => 'Türkçe', diff --git a/src/Model/Item.php b/src/Model/Item.php index fbb608e5c..bed726704 100644 --- a/src/Model/Item.php +++ b/src/Model/Item.php @@ -1987,7 +1987,7 @@ class Item return ''; } - $languages = self::getLanguageArray(trim($item['title'] . "\n" . $item['body']), 3); + $languages = self::getLanguageArray(trim($item['title'] . "\n" . 
$item['body']), 3, $item['uri-id'], $item['author-id']); if (empty($languages)) { return ''; } @@ -2000,25 +2000,24 @@ class Item * * @param string $body * @param integer $count + * @param integer $uri_id + * @param integer $author_id * @return array */ - public static function getLanguageArray(string $body, int $count): array + public static function getLanguageArray(string $body, int $count, int $uri_id = 0, int $author_id = 0): array { - // Convert attachments to links - $naked_body = BBCode::removeAttachment($body); - if (empty($naked_body)) { - return []; + $naked_body = BBCode::toSearchText($body, $uri_id); + + if ((count(explode(' ', $naked_body)) < 10) && (mb_strlen($naked_body) < 30) && $author_id) { + $author = Contact::selectFirst(['about'], ['id' => $author_id]); + if (!empty($author['about'])) { + $about = BBCode::toSearchText($author['about'], 0); + $about = self::getDominantLanguage($about); + Logger::debug('About field added', ['author' => $author_id, 'body' => $naked_body, 'about' => $about]); + $naked_body .= ' ' . 
$about; + } } - // Remove links and pictures - $naked_body = BBCode::removeLinks($naked_body); - - // Convert the title and the body to plain text - $naked_body = BBCode::toPlaintext($naked_body); - - // Remove possibly remaining links - $naked_body = trim(preg_replace(Strings::autoLinkRegEx(), '', $naked_body)); - if (empty($naked_body)) { return []; } @@ -2034,6 +2033,7 @@ class Item $data = [ 'text' => $naked_body, 'detected' => $languages, + 'uri-id' => $uri_id, ]; Hook::callAll('detect_languages', $data); From 557ef9acc99502641d1d47f57b2d4ea8b84a59b2 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 3 Oct 2023 04:01:54 +0000 Subject: [PATCH 2/2] Improved documentation --- doc/Addons.md | 2 +- doc/de/Addons.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/Addons.md b/doc/Addons.md index 8ad1f89eb..bfccde5dd 100644 --- a/doc/Addons.md +++ b/doc/Addons.md @@ -226,7 +226,7 @@ Called after the language detection. This can be used for alternative language d `$data` is an array: - **text**: The text that is analyzed. -- **detected**: (input/output) Array of language codes detected in the related text. +- **detected**: (input/output) Array of language codes detected in the related text. The array key is the language code, the array value the probability. - **uri-id**: The Uri-Id of the item. ### addon_settings diff --git a/doc/de/Addons.md b/doc/de/Addons.md index 99639e2bd..c61b68b48 100644 --- a/doc/de/Addons.md +++ b/doc/de/Addons.md @@ -108,7 +108,7 @@ Wird nach der Sprachenerkennung aufgerufen. Dieser Hook kann dafür verwendet werden, alternative Erkennungsfunktionen einzubinden. `$data` ist ein Array: 'text' => Der analysierte Text. - 'detected' => (Eingabe/Ausgabe) Das Array mit den erkannten Sprachen. + 'detected' => (Eingabe/Ausgabe) Das Array mit den erkannten Sprachen. Der Sprachcode ist der Array-Schlüssel, der Array-Wert ist der dezimale Wert für die Wahrscheinlichkeit. 
'uri-id' => Die Uri-Id des Beitrags **'addon_settings'** - wird aufgerufen, wenn die HTML-Ausgabe der Addon-Einstellungsseite generiert wird.