Merge pull request #12645 from MrPetovan/bug/warnings

Replace HTML regular expression by HTML::extractCharset in ParseUrl::getSiteInfo
This commit is contained in:
Philipp 2023-01-10 19:23:52 +01:00 committed by GitHub
commit 1fc7d5ae85
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 1684 additions and 21 deletions

View file

@ -23,6 +23,7 @@ namespace Friendica\Content\Text;
use DOMDocument; use DOMDocument;
use DOMXPath; use DOMXPath;
use Friendica\Protocol\HTTP\MediaType;
use Friendica\Content\Widget\ContactBlock; use Friendica\Content\Widget\ContactBlock;
use Friendica\Core\Hook; use Friendica\Core\Hook;
use Friendica\Core\Renderer; use Friendica\Core\Renderer;
@ -1055,4 +1056,30 @@ class HTML
return $result !== false && $result->length > 0; return $result !== false && $result->length > 0;
} }
/**
 * Determines the character set declared by the meta tags of an HTML document.
 *
 * A <meta charset="..."> declaration is looked up first. When none is found, the charset
 * parameter of a <meta http-equiv="content-type" content="..."> declaration is used instead.
 *
 * @param DOMDocument $doc
 * @return string|null Lowercase charset, or null when neither declaration provides one
 */
public static function extractCharset(DOMDocument $doc): ?string
{
	$xpath = new DOMXPath($doc);

	$metaCharset = $xpath->evaluate("string(//meta[@charset]/@charset)");
	if ($metaCharset) {
		return strtolower($metaCharset);
	}

	// Selects the content attribute of a meta tag whose http-equiv attribute equals "content-type"
	// (compared case-insensitively) and whose content attribute mentions "charset"
	$expression = "string(//meta[@http-equiv][translate(@http-equiv, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz') = 'content-type'][contains(translate(@content, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'charset')]/@content)";

	try {
		$parameters = MediaType::fromContentType($xpath->evaluate($expression))->parameters;
	} catch (\InvalidArgumentException $e) {
		// Missing or malformed content attribute: best effort, report "no charset found"
		return null;
	}

	return isset($parameters['charset']) ? strtolower($parameters['charset']) : null;
}
} }

View file

@ -253,12 +253,12 @@ class System
$func['database'] = in_array($func['class'], ['Friendica\Database\DBA', 'Friendica\Database\Database']); $func['database'] = in_array($func['class'], ['Friendica\Database\DBA', 'Friendica\Database\Database']);
if (!$previous['database'] || !$func['database']) { if (!$previous['database'] || !$func['database']) {
$classparts = explode("\\", $func['class']); $classparts = explode("\\", $func['class']);
$callstack[] = array_pop($classparts).'::'.$func['function'] . '(' . $func['line'] . ')'; $callstack[] = array_pop($classparts).'::'.$func['function'] . (isset($func['line']) ? ' (' . $func['line'] . ')' : '');
$previous = $func; $previous = $func;
} }
} elseif (!in_array($func['function'], $ignore)) { } elseif (!in_array($func['function'], $ignore)) {
$func['database'] = ($func['function'] == 'q'); $func['database'] = ($func['function'] == 'q');
$callstack[] = $func['function'] . '(' . $func['line'] . ')'; $callstack[] = $func['function'] . (isset($func['line']) ? ' (' . $func['line'] . ')' : '');
$func['class'] = ''; $func['class'] = '';
$previous = $func; $previous = $func;
} }

View file

@ -0,0 +1,237 @@
<?php
/**
* @copyright Copyright (C) 2010-2023, the Friendica project
*
* @license GNU AGPL version 3 or any later version
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*/
namespace Friendica\Protocol\HTTP;
/**
 * Read-only representation of an HTTP media type: a type, a subtype and optional parameters.
 *
 * The private properties are exposed for reading through __get().
 *
 * @see https://httpwg.org/specs/rfc9110.html#media.type
 *
 * @property-read string   $type
 * @property-read string   $subType
 * @property-read string[] $parameters
 */
final class MediaType
{
	const DQUOTE = '"';
	const DIGIT = '0-9';
	const ALPHA = 'a-zA-Z';
	// @see https://www.charset.org/charsets/us-ascii
	const VCHAR = "\\x21-\\x7E";
	const SYMBOL_NO_DELIM = "!#$%&'*+-.^_`|~";
	const OBSTEXT = "\\x80-\\xFF";
	const QDTEXT = "\t \\x21\\x23-\\x5B\\x5D-\\x7E" . self::OBSTEXT;

	/**
	 * @var string
	 */
	private $type;

	/**
	 * @var string
	 */
	private $subType;

	/**
	 * Parameter values indexed by parameter name
	 *
	 * @var string[]
	 */
	private $parameters;

	/**
	 * @param string   $type       Must be a valid token
	 * @param string   $subType    Must be a valid token
	 * @param string[] $parameters Unquoted parameter values indexed by parameter name (a valid token)
	 * @throws \InvalidArgumentException If any of the arguments fails validation
	 */
	public function __construct(string $type, string $subType, array $parameters = [])
	{
		if (!self::isToken($type)) {
			throw new \InvalidArgumentException("Type isn't a valid token: " . $type);
		}

		if (!self::isToken($subType)) {
			throw new \InvalidArgumentException("Subtype isn't a valid token: " . $subType);
		}

		foreach ($parameters as $key => $value) {
			if (!self::isToken($key)) {
				throw new \InvalidArgumentException("Parameter key isn't a valid token: " . $key);
			}

			if (!self::isToken($value) && !self::isQuotableString($value)) {
				throw new \InvalidArgumentException("Parameter value isn't a valid token or a quotable string: " . $value);
			}
		}

		$this->type       = $type;
		$this->subType    = $subType;
		$this->parameters = $parameters;
	}

	/**
	 * Read-only access to the private properties.
	 *
	 * @param string $name
	 * @return mixed
	 * @throws \InvalidArgumentException If the property isn't set on this object
	 */
	public function __get(string $name)
	{
		if (!isset($this->$name)) {
			throw new \InvalidArgumentException('Unknown property ' . $name);
		}

		return $this->$name;
	}

	/**
	 * Parses a Content-Type header value, e.g. 'text/html; charset=utf-8'.
	 *
	 * Parameter names are lowercased, quoted-string parameter values are unquoted.
	 *
	 * @param string $contentType
	 * @return self
	 * @throws \InvalidArgumentException If the provided string can't be parsed as a media type
	 */
	public static function fromContentType(string $contentType): self
	{
		if (trim($contentType) === '') {
			throw new \InvalidArgumentException('Provided string is empty');
		}

		// A quoted-string parameter value may legally contain ";" and "=", so a plain explode() isn't enough
		$parts = self::splitOutsideQuotes($contentType, ';');

		$mimeTypeParts = explode('/', trim(array_shift($parts)));
		if (count($mimeTypeParts) !== 2) {
			throw new \InvalidArgumentException('Provided string doesn\'t look like a MIME type: ' . $contentType);
		}

		list($type, $subType) = $mimeTypeParts;

		$parameters = [];
		foreach ($parts as $parameterString) {
			$trimmedParameter = trim($parameterString);
			// Tolerates empty segments such as a trailing ";". Strict comparison so that "0" isn't silently dropped.
			if ($trimmedParameter === '') {
				continue;
			}

			$parameterParts = self::splitOutsideQuotes($trimmedParameter, '=');
			if (count($parameterParts) < 2) {
				throw new \InvalidArgumentException('Parameter lacks a value: ' . $parameterString);
			}

			if (count($parameterParts) > 2) {
				throw new \InvalidArgumentException('Parameter has too many values: ' . $parameterString);
			}

			list($key, $value) = $parameterParts;

			$isQuoted = self::isQuotedString($value);
			if (!$isQuoted && !self::isToken($value)) {
				throw new \InvalidArgumentException("Parameter value isn't a valid token or a quoted string: \"" . $value . '"');
			}

			if ($isQuoted) {
				$value = self::extractQuotedStringValue($value);
			}

			// Parameter keys are case-insensitive, values are not
			$parameters[strtolower($key)] = $value;
		}

		return new self($type, $subType, $parameters);
	}

	/**
	 * Serializes the media type, quoting the parameter values that aren't valid tokens.
	 *
	 * @return string
	 */
	public function __toString(): string
	{
		$string = $this->type . '/' . $this->subType;

		foreach ($this->parameters as $key => $value) {
			$string .= '; ' . $key . '=' . (self::isToken($value) ? $value : '"' . addcslashes($value, '"\\') . '"');
		}

		return $string;
	}

	/**
	 * Splits a string on a single-byte delimiter, ignoring the delimiters located inside a double-quoted section.
	 *
	 * Inside a quoted section, a backslash and the byte following it (quoted-pair) are kept verbatim, so an
	 * escaped double quote doesn't close the section. Behaves like explode() for strings without double quotes.
	 *
	 * @param string $string
	 * @param string $delimiter Single-byte delimiter
	 * @return string[]
	 */
	private static function splitOutsideQuotes(string $string, string $delimiter): array
	{
		$parts    = [];
		$current  = '';
		$inQuotes = false;
		$length   = strlen($string);

		for ($i = 0; $i < $length; $i++) {
			$char = $string[$i];

			if ($inQuotes && $char === '\\' && $i + 1 < $length) {
				$current .= $char . $string[++$i];
				continue;
			}

			if ($char === self::DQUOTE) {
				$inQuotes = !$inQuotes;
			} elseif ($char === $delimiter && !$inQuotes) {
				$parts[] = $current;
				$current = '';
				continue;
			}

			$current .= $char;
		}

		$parts[] = $current;

		return $parts;
	}

	/**
	 * token = 1*tchar
	 * tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*"
	 *       / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
	 *       / DIGIT / ALPHA
	 *       ; any VCHAR, except delimiters
	 *
	 * @see https://httpwg.org/specs/rfc9110.html#tokens
	 *
	 * @param string $string
	 * @return bool
	 */
	private static function isToken(string $string): bool
	{
		$symbol = preg_quote(self::SYMBOL_NO_DELIM, '/');
		$digit  = self::DIGIT;
		$alpha  = self::ALPHA;

		// D modifier: without it, "$" also matches right before a trailing newline
		$pattern = "/^[$symbol$digit$alpha]+$/D";

		return preg_match($pattern, $string) === 1;
	}

	/**
	 * quoted-string = DQUOTE *( qdtext / quoted-pair ) DQUOTE
	 * qdtext = HTAB / SP / %x21 / %x23-5B / %x5D-7E / obs-text
	 *
	 * @see https://httpwg.org/specs/rfc9110.html#quoted.strings
	 *
	 * @param string $string
	 * @return bool
	 */
	private static function isQuotedString(string $string): bool
	{
		$dquote  = self::DQUOTE;
		$vchar   = self::VCHAR;
		$obsText = self::OBSTEXT;

		$qdtext     = '[' . self::QDTEXT . ']';
		$quotedPair = "\\\\[\t $vchar$obsText]";

		// D modifier: without it, "$" also matches right before a trailing newline
		$pattern = "/^$dquote(?:$qdtext|$quotedPair)*$dquote$/D";

		return preg_match($pattern, $string) === 1;
	}

	/**
	 * Is the string an extracted quoted string value?
	 *
	 * @param string $string
	 * @return bool
	 */
	private static function isQuotableString(string $string): bool
	{
		$vchar   = self::VCHAR;
		$obsText = self::OBSTEXT;

		$qdtext       = '[' . self::QDTEXT . ']';
		$quotedSingle = "[\t $vchar$obsText]";

		// D modifier: without it, "$" also matches right before a trailing newline
		$pattern = "/^(?:$qdtext|$quotedSingle)*$/D";

		return preg_match($pattern, $string) === 1;
	}

	/**
	 * Extracts the value from a quoted-string, removing quoted pairs
	 *
	 * A string that isn't enclosed in double quotes is returned unchanged.
	 *
	 * @param string $value
	 * @return string
	 */
	private static function extractQuotedStringValue(string $value): string
	{
		return preg_replace_callback('/^"(.*)"$/D', static function (array $matches) {
			$vchar   = self::VCHAR;
			$obsText = self::OBSTEXT;

			// Each quoted-pair is replaced by the character following the backslash
			return preg_replace("/\\\\([\t $vchar$obsText])/", '$1', $matches[1]);
		}, $value);
	}
}

View file

@ -24,6 +24,8 @@ namespace Friendica\Util;
use DOMDocument; use DOMDocument;
use DOMXPath; use DOMXPath;
use Friendica\Content\OEmbed; use Friendica\Content\OEmbed;
use Friendica\Content\Text\HTML;
use Friendica\Protocol\HTTP\MediaType;
use Friendica\Core\Hook; use Friendica\Core\Hook;
use Friendica\Core\Logger; use Friendica\Core\Logger;
use Friendica\Database\Database; use Friendica\Database\Database;
@ -283,25 +285,13 @@ class ParseUrl
} }
$charset = ''; $charset = '';
// Look for a charset, first in headers try {
// Expected form: Content-Type: text/html; charset=ISO-8859-4 // Look for a charset, first in headers
if (preg_match('/charset=([a-z0-9-_.\/]+)/i', $curlResult->getContentType(), $matches)) { $mediaType = MediaType::fromContentType($curlResult->getContentType());
$charset = trim(trim(trim(array_pop($matches)), ';,')); if (isset($mediaType->parameters['charset'])) {
} else { $charset = $mediaType->parameters['charset'];
// Then in body that gets precedence }
// Expected forms: } catch(\InvalidArgumentException $e) {}
// - <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
// - <meta charset="utf-8">
// - <meta charset=utf-8>
// - <meta charSet="utf-8">
// We escape <style> and <script> tags since they can contain irrelevant charset information
// (see https://github.com/friendica/friendica/issues/9251#issuecomment-698636806)
Strings::performWithEscapedBlocks($body, '#<(?:style|script).*?</(?:style|script)>#ism', function ($body) use (&$charset) {
if (preg_match('/charset=["\']?([a-z0-9-_.\/]+)/i', $body, $matches)) {
$charset = trim(trim(trim(array_pop($matches)), ';,'));
}
});
}
$siteinfo['charset'] = $charset; $siteinfo['charset'] = $charset;
@ -322,6 +312,8 @@ class ParseUrl
$doc = new DOMDocument(); $doc = new DOMDocument();
@$doc->loadHTML($body); @$doc->loadHTML($body);
$siteinfo['charset'] = HTML::extractCharset($doc) ?? $siteinfo['charset'];
XML::deleteNode($doc, 'style'); XML::deleteNode($doc, 'style');
XML::deleteNode($doc, 'option'); XML::deleteNode($doc, 'option');
XML::deleteNode($doc, 'h1'); XML::deleteNode($doc, 'h1');

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,150 @@
<?php
/**
* @copyright Copyright (C) 2010-2023, the Friendica project
*
* @license GNU AGPL version 3 or any later version
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*/
namespace Friendica\Test\src\Protocol\HTTP;
use Friendica\Protocol\HTTP\MediaType;
/**
 * Covers the parsing, validation and serialization of MediaType.
 */
class MediaTypeTest extends \PHPUnit\Framework\TestCase
{
	/**
	 * Content-Type strings that must parse, paired with the MediaType they are expected to yield.
	 *
	 * @return array
	 */
	public static function dataValid(): array
	{
		return [
			'HTML UTF-8' => [
				'expected'     => new MediaType('text', 'html', ['charset' => 'utf-8']),
				'content-type' => 'text/html; charset=utf-8',
			],
			'HTML Northern Europe' => [
				'expected'     => new MediaType('text', 'html', ['charset' => 'ISO-8859-4']),
				'content-type' => 'text/html; charset=ISO-8859-4',
			],
			'multipart/form-data' => [
				'expected'     => new MediaType('multipart', 'form-data', ['boundary' => '---------------------------974767299852498929531610575']),
				'content-type' => 'multipart/form-data; boundary=---------------------------974767299852498929531610575',
			],
			'Multiple parameters' => [
				'expected'     => new MediaType('application', 'octet-stream', ['charset' => 'ISO-8859-4', 'another' => 'parameter']),
				'content-type' => 'application/octet-stream; charset=ISO-8859-4 ; another=parameter',
			],
			'No parameters' => [
				'expected'     => new MediaType('application', 'vnd.adobe.air-application-installer-package+zip'),
				'content-type' => 'application/vnd.adobe.air-application-installer-package+zip',
			],
			'No parameters colon' => [
				'expected'     => new MediaType('application', 'vnd.adobe.air-application-installer-package+zip'),
				'content-type' => 'application/vnd.adobe.air-application-installer-package+zip;',
			],
			'No parameters space colon' => [
				'expected'     => new MediaType('application', 'vnd.adobe.air-application-installer-package+zip'),
				'content-type' => 'application/vnd.adobe.air-application-installer-package+zip ;',
			],
			'No parameters space colon space' => [
				'expected'     => new MediaType('application', 'vnd.adobe.air-application-installer-package+zip'),
				'content-type' => 'application/vnd.adobe.air-application-installer-package+zip ; ',
			],
			'Parameter quoted string' => [
				'expected'     => new MediaType('text', 'html', ['parameter' => 'Quoted string with a space and a "double-quote"']),
				'content-type' => 'text/html; parameter="Quoted string with a space and a \"double-quote\""',
			],
			// Parameter names are case-insensitive and are stored lowercase, values keep their case
			'Uppercase parameter name' => [
				'expected'     => new MediaType('text', 'html', ['charset' => 'UTF-8']),
				'content-type' => 'text/html; Charset=UTF-8',
			],
		];
	}

	/**
	 * @dataProvider dataValid
	 *
	 * @param MediaType $expected
	 * @param string    $contentType
	 * @return void
	 */
	public function testValid(MediaType $expected, string $contentType): void
	{
		$this->assertEquals($expected, MediaType::fromContentType($contentType));
	}

	/**
	 * Content-Type strings that must be rejected.
	 *
	 * @return array
	 */
	public static function dataInvalid(): array
	{
		return [
			'empty string'              => [''],
			'no slash'                  => ['application'],
			'two slashes'               => ['application/octet/stream'],
			'parameter no value'        => ['application/octet-stream ; parameter'],
			'parameter too many values' => ['application/octet-stream ; parameter=value1=value2'],
			'type non token'            => ['appli"cation/octet-stream'],
			'subtype non token'         => ['application/octet\-stream'],
			'parameter name non token'  => ['application/octet-stream; para"meter=value'],
			'parameter value invalid'   => ['application/octet-stream; parameter="value"value'],
		];
	}

	/**
	 * @dataProvider dataInvalid
	 *
	 * @param string $contentType
	 * @return void
	 */
	public function testInvalid(string $contentType): void
	{
		$this->expectException(\InvalidArgumentException::class);

		MediaType::fromContentType($contentType);
	}

	/**
	 * MediaType objects paired with their expected string representation.
	 *
	 * @return array
	 */
	public static function dataToString(): array
	{
		return [
			'HTML UTF-8' => [
				'expected'  => 'text/html; charset=utf-8',
				'mediaType' => new MediaType('text', 'html', ['charset' => 'utf-8']),
			],
			'HTML Northern Europe' => [
				'expected'  => 'text/html; charset=ISO-8859-4',
				'mediaType' => new MediaType('text', 'html', ['charset' => 'ISO-8859-4']),
			],
			'multipart/form-data' => [
				'expected'  => 'multipart/form-data; boundary=---------------------------974767299852498929531610575',
				'mediaType' => new MediaType('multipart', 'form-data', ['boundary' => '---------------------------974767299852498929531610575']),
			],
			'Multiple parameters' => [
				'expected'  => 'application/octet-stream; charset=ISO-8859-4; another=parameter',
				'mediaType' => new MediaType('application', 'octet-stream', ['charset' => 'ISO-8859-4', 'another' => 'parameter']),
			],
			'No parameters' => [
				'expected'  => 'application/vnd.adobe.air-application-installer-package+zip',
				'mediaType' => new MediaType('application', 'vnd.adobe.air-application-installer-package+zip'),
			],
			'Parameter quoted string' => [
				'expected'  => 'text/html; parameter="Quoted string with a space and a \"double-quote\""',
				'mediaType' => new MediaType('text', 'html', ['parameter' => 'Quoted string with a space and a "double-quote"']),
			],
		];
	}

	/**
	 * @dataProvider dataToString
	 *
	 * @param string    $expected
	 * @param MediaType $mediaType
	 * @return void
	 */
	public function testToString(string $expected, MediaType $mediaType): void
	{
		// Strict comparison: the serialization must be exactly the expected string
		$this->assertSame($expected, (string) $mediaType);
	}
}