/**
 * Quotes an arbitrary string so it can be embedded as a literal in an XPath 1.0 expression.
 *
 * XPath 1.0 string literals have no escape character, so the delimiter is
 * chosen to avoid whichever quote character the value contains. When the
 * value contains both kinds of quotes, a concat() call is produced instead.
 *
 * @see https://stackoverflow.com/a/45228168
 * @param string $value
 * @return string An XPath expression evaluating to exactly $value
 */
public static function xpathQuote(string $value): string
{
	if (false === strpos($value, '"')) {
		return '"' . $value . '"';
	}

	if (false === strpos($value, "'")) {
		return "'" . $value . "'";
	}

	// The value contains both single and double quotes: split it on the
	// double quotes and glue the pieces back together with a literal '"', e.g.:
	//
	//     concat("'foo", '"', "bar")
	//
	// Pieces produced by explode('"', ...) can never contain a double quote,
	// so each one is safely wrapped in double quotes. A closure is used rather
	// than the ['self', 'xpathQuote'] callable, which is deprecated since PHP 8.2.
	$quotedParts = array_map(
		static function (string $part): string {
			return '"' . $part . '"';
		},
		explode('"', $value)
	);

	return 'concat(' . implode(', \'"\', ', $quotedParts) . ')';
}

/**
 * Checks if the provided URL is present in the DOM document in an element with the rel="me" attribute
 *
 * Any element type is considered (a, link, ...). The href attribute must be
 * strictly equal to the string form of the provided URL, no normalization is
 * applied to either side.
 *
 * XHTML Friends Network http://gmpg.org/xfn/
 *
 * @param \DOMDocument $doc
 * @param UriInterface $meUrl
 * @return bool True if at least one matching element exists, false otherwise or if the query fails
 */
public static function checkRelMeLink(\DOMDocument $doc, UriInterface $meUrl): bool
{
	$xpath = new \DOMXPath($doc);

	// This expression checks that "me" is among the space-delimited values of the "rel" attribute.
	// And that the href attribute contains exactly the provided URL.
	// The URI object is cast explicitly: xpathQuote() only accepts strings.
	$expression = "//*[contains(concat(' ', normalize-space(@rel), ' '), ' me ')][@href = " . self::xpathQuote((string)$meUrl) . "]";

	$result = $xpath->query($expression);

	return $result !== false && $result->length > 0;
}
/**
 * Values covering every combination of single and double quotes that
 * HTML::xpathQuote has to handle.
 *
 * @return array
 */
public function dataXpathQuote(): array
{
	return [
		'no quotes' => [
			'value' => "foo",
		],
		'double quotes only' => [
			'value' => "\"foo",
		],
		'single quotes only' => [
			'value' => "'foo",
		],
		'both; double quotes in mid-string' => [
			'value' => "'foo\"bar",
		],
		'multiple double quotes in mid-string' => [
			'value' => "'foo\"bar\"baz",
		],
		'string ends with double quotes' => [
			'value' => "'foo\"",
		],
		'string ends with run of double quotes' => [
			'value' => "'foo\"\"",
		],
		'string begins with double quotes' => [
			'value' => "\"'foo",
		],
		'string begins with run of double quotes' => [
			'value' => "\"\"'foo",
		],
		'run of double quotes in mid-string' => [
			'value' => "'foo\"\"bar",
		],
	];
}

/**
 * Builds a one-element document carrying the value as an attribute and
 * checks that an XPath query using the quoted value finds that element.
 *
 * @dataProvider dataXpathQuote
 * @param string $value
 * @return void
 * @throws \DOMException
 */
public function testXpathQuote(string $value)
{
	$dom = new \DOMDocument();
	$element = $dom->createElement('test');
	$attribute = $dom->createAttribute('value');
	$attribute->value = $value;
	$element->appendChild($attribute);
	$dom->appendChild($element);

	$xpath = new \DOMXPath($dom);

	$result = $xpath->query('//test[@value = ' . HTML::xpathQuote($value) . ']');

	self::assertInstanceOf(\DOMNodeList::class, $result);
	self::assertSame(1, $result->length);
}

/**
 * Loads one of the rel="me" fixture documents from tests/datasets/dom/relme.
 *
 * Fails loudly when the fixture is missing or unparsable: an empty document
 * would otherwise make the negative tests pass without testing anything.
 *
 * @param string $name Fixture file name without the .html extension
 * @return \DOMDocument
 * @throws \RuntimeException If the fixture cannot be loaded
 */
private static function loadRelMeFixture(string $name): \DOMDocument
{
	$doc = new \DOMDocument();
	if (!$doc->load(__DIR__ . '/../../../datasets/dom/relme/' . $name . '.html')) {
		throw new \RuntimeException('Unable to load rel="me" fixture: ' . $name);
	}

	return $doc;
}

/**
 * Documents that contain a rel="me" element pointing to the expected URL.
 *
 * @return array
 */
public function dataCheckRelMeLink(): array
{
	$meUrl = new Uri('https://example.com/profile/me');

	$fixtures = [
		'a-single-rel-value',
		'a-multiple-rel-value-start',
		'a-multiple-rel-value-middle',
		'a-multiple-rel-value-end',
		'link-single-rel-value',
	];

	$data = [];
	foreach ($fixtures as $fixture) {
		$data[$fixture] = [
			'doc'   => self::loadRelMeFixture($fixture),
			'meUrl' => $meUrl,
		];
	}

	return $data;
}

/**
 * @dataProvider dataCheckRelMeLink
 * @param \DOMDocument $doc
 * @param UriInterface $meUrl
 * @return void
 */
public function testCheckRelMeLink(\DOMDocument $doc, UriInterface $meUrl)
{
	self::assertTrue(HTML::checkRelMeLink($doc, $meUrl));
}

/**
 * Documents that do NOT contain a rel="me" element pointing to the expected URL.
 *
 * @return array
 */
public function dataCheckRelMeLinkFail(): array
{
	$meUrl = new Uri('https://example.com/profile/me');

	$fixtures = [
		'a-single-rel-value-fail',
		'link-single-rel-value-fail',
	];

	$data = [];
	foreach ($fixtures as $fixture) {
		$data[$fixture] = [
			'doc'   => self::loadRelMeFixture($fixture),
			'meUrl' => $meUrl,
		];
	}

	return $data;
}

/**
 * @dataProvider dataCheckRelMeLinkFail
 * @param \DOMDocument $doc
 * @param UriInterface $meUrl
 * @return void
 */
public function testCheckRelMeLinkFail(\DOMDocument $doc, UriInterface $meUrl)
{
	self::assertFalse(HTML::checkRelMeLink($doc, $meUrl));
}
* To do so, we look for rel="me" links in the given homepage, if one @@ -56,43 +56,37 @@ class CheckRelMeProfileLink { Logger::notice('Verifying the homepage', ['uid' => $uid]); Profile::update(['homepage_verified' => false], $uid); - $homepageUrlVerified = false; - $owner = User::getOwnerDataById($uid); - if (!empty($owner['homepage'])) { - $xrd_timeout = DI::config()->get('system', 'xrd_timeout'); - $curlResult = DI::httpClient()->get($owner['homepage'], $accept_content = HttpClientAccept::HTML, [HttpClientOptions::TIMEOUT => $xrd_timeout]); - if ($curlResult->isSuccess()) { - $content = $curlResult->getBody(); - if (!$content) { - Logger::notice('Empty body of the fetched homepage link). Cannot verify the relation to profile of UID %s.', ['uid' => $uid, 'owner homepage' => $owner['homepage']]); - } else { - $doc = new DOMDocument(); - @$doc->loadHTML($content); - if (!$doc) { - Logger::notice('Could not parse the content'); - } else { - foreach ($doc->getElementsByTagName('a') as $link) { - $rel = $link->getAttribute('rel'); - if ($rel == 'me') { - $href = $link->getAttribute('href'); - if (!$homepageUrlVerified && Network::isValidHttpUrl($href)) { - $homepageUrlVerified = Strings::compareLink($owner['url'], $href); - } - } - } - } - if ($homepageUrlVerified) { - Profile::update(['homepage_verified' => true], $uid); - Logger::notice('Homepage URL verified', ['uid' => $uid, 'owner homepage' => $owner['homepage']]); - } else { - Logger::notice('Homepage URL could not be verified', ['uid' => $uid, 'owner homepage' => $owner['homepage']]); - } - } - } else { - Logger::notice('Could not cURL the homepage URL', ['owner homepage' => $owner['homepage']]); - } - } else { + + $owner = User::getOwnerDataById($uid); + if (empty($owner['homepage'])) { Logger::notice('The user has no homepage link.', ['uid' => $uid]); + return; + } + + $xrd_timeout = DI::config()->get('system', 'xrd_timeout'); + $curlResult = DI::httpClient()->get($owner['homepage'], HttpClientAccept::HTML, 
[HttpClientOptions::TIMEOUT => $xrd_timeout]); + if (!$curlResult->isSuccess()) { + Logger::notice('Could not cURL the homepage URL', ['owner homepage' => $owner['homepage']]); + return; + } + + $content = $curlResult->getBody(); + if (!$content) { + Logger::notice('Empty body of the fetched homepage link). Cannot verify the relation to profile of UID %s.', ['uid' => $uid, 'owner homepage' => $owner['homepage']]); + return; + } + + $doc = new DOMDocument(); + if (!@$doc->loadHTML($content)) { + Logger::notice('Could not parse the content'); + return; + } + + if (HTML::checkRelMeLink($doc, new Uri($owner['url']))) { + Profile::update(['homepage_verified' => true], $uid); + Logger::notice('Homepage URL verified', ['uid' => $uid, 'owner homepage' => $owner['homepage']]); + } else { + Logger::notice('Homepage URL could not be verified', ['uid' => $uid, 'owner homepage' => $owner['homepage']]); } } }