From 3f2b0b9422915529a0ea585aa4325b6d2f2f65cd Mon Sep 17 00:00:00 2001 From: Michael Date: Thu, 2 Nov 2023 22:49:25 +0000 Subject: [PATCH] Use ISO-639-1 for the language detection --- src/Content/Conversation/Factory/Channel.php | 6 +- src/Core/L10n.php | 149 +++++++++---------- src/Model/Item.php | 35 ++++- src/Model/Post/Engagement.php | 3 - src/Model/User.php | 29 ++-- src/Module/Conversation/Timeline.php | 3 +- src/Module/Settings/Display.php | 2 +- 7 files changed, 113 insertions(+), 114 deletions(-) diff --git a/src/Content/Conversation/Factory/Channel.php b/src/Content/Conversation/Factory/Channel.php index f03b58925..0e44c7e59 100644 --- a/src/Content/Conversation/Factory/Channel.php +++ b/src/Content/Conversation/Factory/Channel.php @@ -35,13 +35,13 @@ final class Channel extends Timeline */ public function getTimelines(int $uid): Timelines { - $language = User::getLanguageCode($uid); - $languages = $this->l10n->getAvailableLanguages(true); + $iso639 = new \Matriphe\ISO639\ISO639; + $native = $iso639->nativeByCode1(User::getLanguageCode($uid)); $tabs = [ new ChannelEntity(ChannelEntity::FORYOU, $this->l10n->t('For you'), $this->l10n->t('Posts from contacts you interact with and who interact with you'), 'y'), new ChannelEntity(ChannelEntity::WHATSHOT, $this->l10n->t('What\'s Hot'), $this->l10n->t('Posts with a lot of interactions'), 'h'), - new ChannelEntity(ChannelEntity::LANGUAGE, $languages[$language], $this->l10n->t('Posts in %s', $languages[$language]), 'g'), + new ChannelEntity(ChannelEntity::LANGUAGE, $native, $this->l10n->t('Posts in %s', $native), 'g'), new ChannelEntity(ChannelEntity::FOLLOWERS, $this->l10n->t('Followers'), $this->l10n->t('Posts from your followers that you don\'t follow'), 'f'), new ChannelEntity(ChannelEntity::SHARERSOFSHARERS, $this->l10n->t('Sharers of sharers'), $this->l10n->t('Posts from accounts that are followed by accounts that you follow'), 'r'), new ChannelEntity(ChannelEntity::IMAGE, $this->l10n->t('Images'), 
$this->l10n->t('Posts with images'), 'i'), diff --git a/src/Core/L10n.php b/src/Core/L10n.php index 548aea1ac..87d11de6f 100644 --- a/src/Core/L10n.php +++ b/src/Core/L10n.php @@ -378,7 +378,7 @@ class L10n * * @return array */ - public function getAvailableLanguages(bool $additional = false): array + public function getAvailableLanguages(): array { $langs = []; $strings_file_paths = glob('view/lang/*/strings.php'); @@ -392,107 +392,94 @@ class L10n $path_array = explode('/', $strings_file_path); $langs[$path_array[2]] = self::LANG_NAMES[$path_array[2]] ?? $path_array[2]; } - - if ($additional) { - // See https://github.com/friendica/friendica/issues/10511 - // Persian is manually added to language detection until a persian translation is provided for the interface, at - // which point it will be automatically available through `getAvailableLanguages()` and this should be removed. - // Additionally some more languages are added to that list that are used in the Fediverse. - $additional_langs = [ - 'af' => 'Afrikaans', - 'az-Latn' => 'azərbaycan dili', - 'bs-Latn' => 'bosanski jezik', - 'be' => 'беларуская мова', - 'bn' => 'বাংলা', - 'cy' => 'Cymraeg', - 'el-monoton' => 'ελληνικά', - 'eu' => 'euskara, euskera', - 'fa' => 'فارسی', - 'ga' => 'Gaeilge', - 'gl' => 'galego', - 'he' => 'עברית', - 'hi' => 'हिन्दी, हिंदी', - 'hr' => 'hrvatski jezik', - 'hy' => 'Հայերեն', - 'id' => 'Bahasa Indonesia', - 'jv' => 'basa Jawa', - 'ka' => 'ქართული', - 'ko' => '한국어, 조선어', - 'lt' => 'lietuvių kalba', - 'lv' => 'latviešu valoda', - 'ms-Latn' => 'bahasa Melayu, بهاس ملايو‎', - 'sr-Cyrl' => 'српски језик', - 'sk' => 'slovenčina, slovenský jazyk', - 'sl' => 'slovenski jezik, slovenščina', - 'sq' => 'Shqip', - 'sw' => 'Kiswahili', - 'ta' => 'தமிழ்', - 'th' => 'ไทย', - 'tl' => 'Wikang Tagalog, ᜏᜒᜃᜅ᜔ ᜆᜄᜎᜓᜄ᜔', - 'tr' => 'Türkçe', - 'pt-PT' => 'português', - 'uk' => 'українська мова', - 'uz' => 'Oʻzbek, Ўзбек, أۇزبېك‎', - 'vi' => 'Việt Nam', - 'zh-hant' => '繁體', - ]; - $langs = 
array_merge($additional_langs, $langs); - ksort($langs); - } } return $langs; } /** - * The language detection routine uses some slightly different language codes. - * This function changes the language array accordingly. + * Get language codes that are detectable by our language detection routines. + * Languages that aren't used often and that tend to cause false detections are excluded. + * The listed codes are a collection of both the official ISO 639-1 codes and + * the codes that are used by our built-in language detection routine. + * When the detection is done, the result only consists of the official ISO 639-1 codes. * - * @param array $languages * @return array */ - public function convertForLanguageDetection(array $languages): array + public function getDetectableLanguages(): array { - foreach ($languages as $key => $language) { - $newkey = $this->convertCodeForLanguageDetection($key); - if ($newkey != $key) { - if (!isset($languages[$newkey])) { - $languages[$newkey] = $language; - } - unset($languages[$key]); - } + $additional_langs = [ + 'af', 'az', 'az-Cyrl', 'az-Latn', 'be', 'bn', 'bs', 'bs-Cyrl', 'bs-Latn', + 'cy', 'da', 'el', 'el-monoton', 'el-polyton', 'en', 'eu', + 'fa', 'fi', 'ga', 'gl', 'he', 'hi', 'hr', 'hy', 'id', 'in', 'iw', 'jv', 'jw', + 'ka', 'ko', 'lt', 'lv', 'mo', 'ms', 'ms-Arab', 'ms-Latn', 'nb', 'nn', 'no', + 'pt', 'pt-PT', 'pt-BR', 'ro', 'sa', 'sk', 'sl', 'sq', 'sr', 'sr-Cyrl', 'sr-Latn', 'sw', + 'ta', 'th', 'tl', 'tr', 'ug', 'uk', 'uz', 'vi', 'zh', 'zh-Hant', 'zh-Hans', + ]; + + if (in_array('cld2', get_loaded_extensions())) { + $additional_langs = array_merge($additional_langs, + ['sd', 'si', 'yi', 'km', 'iu', 'lo', 'dv', 'gu', 'kn', 'te', 'ml', 'or', 'pa', 'iu']); } - ksort($languages); + $langs = array_merge($additional_langs, array_keys($this->getAvailableLanguages())); + sort($langs); + return $langs; + } + + /** + * Return a list of supported languages with their two-letter language codes. 
+ * + * @param bool $international If set to true, additionally the international language name is returned as well. + * @return array + */ + public function getLanguageCodes(bool $international = false): array + { + $iso639 = new \Matriphe\ISO639\ISO639; + + $languages = []; + + foreach ($this->getDetectableLanguages() as $code) { + $code = $this->toISO6391($code); + $native = $iso639->nativeByCode1($code); + $language = $iso639->languageByCode1($code); + if ($native != $iso639->languageByCode1($code) && $international) { + $languages[$code] = $this->t('%s (%s)', $native, $language); + } else { + $languages[$code] = $native; + } + } return $languages; } /** - * The language detection routine uses some slightly different language codes. - * This function changes the language codes accordingly. + * Convert the language code to ISO639-1 + * It also converts old codes to their new counterparts. * - * @param string $language + * @param string $code * @return string */ - public function convertCodeForLanguageDetection(string $language): string + public function toISO6391(string $code): string { - switch ($language) { - case 'da-dk': - return 'da'; - case 'en-us': - case 'en-gb': - return 'en'; - case 'fi-fi': - return 'fi'; - case 'nb-no': - return 'nb'; - case 'pt-br': - return 'pt-BR'; - case 'zh-cn': - return 'zh-Hans'; - default: - return $language; + if ((strlen($code) > 2) && (substr($code, 2, 1) == '-')) { + $code = substr($code, 0, 2); } + if (in_array($code, ['nb', 'nn'])) { + $code = 'no'; + } + if ($code == 'in') { + $code = 'id'; + } + if ($code == 'iw') { + $code = 'he'; + } + if ($code == 'jw') { + $code = 'jv'; + } + if ($code == 'mo') { + $code = 'ro'; + } + return $code; } /** diff --git a/src/Model/Item.php b/src/Model/Item.php index 811eab50b..e505e76d9 100644 --- a/src/Model/Item.php +++ b/src/Model/Item.php @@ -2034,15 +2034,12 @@ class Item return []; } - $availableLanguages = DI::l10n()->getAvailableLanguages(true); - $availableLanguages = 
DI::l10n()->convertForLanguageDetection($availableLanguages); - - $ld = new Language(array_keys($availableLanguages)); + $ld = new Language(DI::l10n()->getDetectableLanguages()); $result = []; foreach (self::splitByBlocks($searchtext) as $block) { - $languages = $ld->detect($block)->limit(0, $count)->close() ?: []; + $languages = $ld->detect($block)->close() ?: []; $data = [ 'text' => $block, @@ -2057,10 +2054,32 @@ class Item } } - arsort($result); - $result = array_slice($result, 0, $count); + $result = self::compactLanguages($result); - return $result; + arsort($result); + return array_slice($result, 0, $count); + } + + /** + * Convert the language code in the detection result to ISO 639-1. + * On duplicates the system uses the higher quality value. + * + * @param array $result + * @return array + */ + private static function compactLanguages(array $result): array + { + $languages = []; + foreach ($result as $language => $quality) { + if ($quality == 0) { + continue; + } + $code = DI::l10n()->toISO6391($language); + if (empty($languages[$code]) || ($languages[$code] < $quality)) { + $languages[$code] = $quality; + } + } + return $languages; } /** diff --git a/src/Model/Post/Engagement.php b/src/Model/Post/Engagement.php index 017c34d19..74b479653 100644 --- a/src/Model/Post/Engagement.php +++ b/src/Model/Post/Engagement.php @@ -35,9 +35,6 @@ use Friendica\Model\Verb; use Friendica\Protocol\Activity; use Friendica\Protocol\Relay; use Friendica\Util\DateTimeFormat; -use Friendica\Util\Strings; - -// Channel class Engagement { diff --git a/src/Model/User.php b/src/Model/User.php index 698172c1d..24a75b5d3 100644 --- a/src/Model/User.php +++ b/src/Model/User.php @@ -127,7 +127,6 @@ class User case 'community': return User::ACCOUNT_TYPE_COMMUNITY; - } return null; } @@ -425,7 +424,7 @@ class User * @return array user * @throws Exception */ - public static function getFirstAdmin(array $fields = []) : array + public static function getFirstAdmin(array $fields = []): 
array { if (!empty(DI::config()->get('config', 'admin_nickname'))) { return self::getByNickname(DI::config()->get('config', 'admin_nickname'), $fields); @@ -560,22 +559,20 @@ class User return $default_circle; } -/** - * Fetch the language code from the given user. If the code is invalid, return the system language - * - * @param integer $uid User-Id - * @return string - */ + /** + * Fetch the language code from the given user. If the code is invalid, return the system language + * + * @param integer $uid User-Id + * @return string + */ public static function getLanguageCode(int $uid): string { - $owner = self::getOwnerDataById($uid); - $languages = DI::l10n()->getAvailableLanguages(true); - if (in_array($owner['language'], array_keys($languages))) { - $language = $owner['language']; - } else { - $language = DI::config()->get('system', 'language'); + $owner = self::getOwnerDataById($uid); + $language = DI::l10n()->toISO6391($owner['language']); + if (in_array($language, array_keys(DI::l10n()->getLanguageCodes()))) { + return $language; } - return $language; + return DI::l10n()->toISO6391(DI::config()->get('system', 'language')); } /** @@ -1480,7 +1477,7 @@ class User Photo::delete(['uid' => $register['uid']]); return DBA::delete('user', ['uid' => $register['uid']]) && - Register::deleteByHash($register['hash']); + Register::deleteByHash($register['hash']); } /** diff --git a/src/Module/Conversation/Timeline.php b/src/Module/Conversation/Timeline.php index d15fefe3e..725634eb0 100644 --- a/src/Module/Conversation/Timeline.php +++ b/src/Module/Conversation/Timeline.php @@ -304,7 +304,7 @@ class Timeline extends BaseModule } elseif ($this->selectedTab == ChannelEntity::AUDIO) { $condition = ["`media-type` & ?", 4]; } elseif ($this->selectedTab == ChannelEntity::LANGUAGE) { - $condition = ["JSON_EXTRACT(JSON_KEYS(language), '$[0]') = ?", $this->l10n->convertCodeForLanguageDetection(User::getLanguageCode($uid))]; + $condition = ["JSON_EXTRACT(JSON_KEYS(language), 
'$[0]') = ?", User::getLanguageCode($uid)]; } elseif (is_numeric($this->selectedTab)) { $condition = $this->getUserChannelConditions($this->selectedTab, $this->session->getLocalUserId()); } @@ -421,7 +421,6 @@ class Timeline extends BaseModule { $conditions = []; $languages = $this->pConfig->get($uid, 'channel', 'languages', [User::getLanguageCode($uid)]); - $languages = $this->l10n->convertForLanguageDetection($languages); foreach ($languages as $language) { $conditions[] = "JSON_EXTRACT(JSON_KEYS(language), '$[0]') = ?"; $condition[] = $language; diff --git a/src/Module/Settings/Display.php b/src/Module/Settings/Display.php index b5dbf01eb..ad36da2dd 100644 --- a/src/Module/Settings/Display.php +++ b/src/Module/Settings/Display.php @@ -260,7 +260,7 @@ class Display extends BaseSettings $bookmarked_timelines = $this->pConfig->get($uid, 'system', 'network_timelines', $this->getAvailableTimelines($uid, true)->column('code')); $enabled_timelines = $this->pConfig->get($uid, 'system', 'enabled_timelines', $this->getAvailableTimelines($uid, false)->column('code')); $channel_languages = $this->pConfig->get($uid, 'channel', 'languages', [User::getLanguageCode($uid)]); - $languages = $this->l10n->getAvailableLanguages(true); + $languages = $this->l10n->getLanguageCodes(true); $timelines = []; foreach ($this->getAvailableTimelines($uid) as $timeline) {