From 3fcafd1d5019251fed40a1ac4fa34402d22796dd Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Tue, 10 Jan 2023 01:08:34 -0500 Subject: [PATCH] Replace HTML regular expression by HTML::extractCharset in ParseUrl::getSiteInfo - Address https://github.com/friendica/friendica/issues/12488#issuecomment-1374537440 --- src/Util/ParseUrl.php | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index f21528353..9d19a4eba 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -24,6 +24,8 @@ namespace Friendica\Util; use DOMDocument; use DOMXPath; use Friendica\Content\OEmbed; +use Friendica\Content\Text\HTML; +use Friendica\Protocol\HTTP\MediaType; use Friendica\Core\Hook; use Friendica\Core\Logger; use Friendica\Database\Database; @@ -283,25 +285,13 @@ class ParseUrl } $charset = ''; - // Look for a charset, first in headers - // Expected form: Content-Type: text/html; charset=ISO-8859-4 - if (preg_match('/charset=([a-z0-9-_.\/]+)/i', $curlResult->getContentType(), $matches)) { - $charset = trim(trim(trim(array_pop($matches)), ';,')); - } else { - // Then in body that gets precedence - // Expected forms: - // - - // - - // - - // - - // We escape