From 432587464ce16dff513ed2de340fa3437dbe45aa Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Fri, 17 Feb 2017 22:35:46 -0500 Subject: [PATCH] Fix Diaspora link attachment probe - Move analytics param stripping out of original_url - Remove HEAD curl request in ParseUrl::getSiteInfo - Replace original_url with strip_tracking_query_params in ParseUrl::getSiteInfo to prevent massive curl fest in border cases --- include/ParseUrl.php | 26 ++--------------------- include/network.php | 49 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 40 insertions(+), 35 deletions(-) diff --git a/include/ParseUrl.php b/include/ParseUrl.php index b85175a25..3a2fe9d53 100644 --- a/include/ParseUrl.php +++ b/include/ParseUrl.php @@ -130,7 +130,7 @@ class ParseUrl { $url = trim($url, "'"); $url = trim($url, '"'); - $url = original_url($url); + $url = strip_tracking_query_params($url); $siteinfo["url"] = $url; $siteinfo["type"] = "link"; @@ -142,8 +142,7 @@ class ParseUrl { $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_HEADER, 1); - curl_setopt($ch, CURLOPT_NOBODY, 1); - curl_setopt($ch, CURLOPT_TIMEOUT, 3); + curl_setopt($ch, CURLOPT_TIMEOUT, 10); curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent()); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false)); @@ -151,7 +150,6 @@ class ParseUrl { $header = curl_exec($ch); $curl_info = @curl_getinfo($ch); - $http_code = $curl_info["http_code"]; curl_close($ch); $a->save_timestamp($stamp1, "network"); @@ -197,26 +195,6 @@ class ParseUrl { } } - $stamp1 = microtime(true); - - // Now fetch the body as well - $ch = curl_init(); - curl_setopt($ch, CURLOPT_URL, $url); - curl_setopt($ch, CURLOPT_HEADER, 1); - curl_setopt($ch, CURLOPT_NOBODY, 0); - curl_setopt($ch, CURLOPT_TIMEOUT, 10); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent()); - curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false)); - curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, (($check_cert) ? 2 : false)); - - $header = curl_exec($ch); - $curl_info = @curl_getinfo($ch); - $http_code = $curl_info["http_code"]; - curl_close($ch); - - $a->save_timestamp($stamp1, "network"); - // Fetch the first mentioned charset. Can be in body or header $charset = ""; if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches)) { diff --git a/include/network.php b/include/network.php index 7385c94a0..ecbe0e5c6 100644 --- a/include/network.php +++ b/include/network.php @@ -670,42 +670,69 @@ function fix_contact_ssl_policy(&$contact,$new_policy) { } } -function original_url($url, $depth=1, $fetchbody = false) { - - $a = get_app(); - - // Remove Analytics Data from Google and other tracking platforms +/** + * @brief Remove Google Analytics and other tracking platforms params from URL + * + * @param string $url + * @return string + */ +function strip_tracking_query_params($url) +{ $urldata = parse_url($url); if (is_string($urldata["query"])) { $query = $urldata["query"]; parse_str($query, $querydata); - if (is_array($querydata)) - foreach ($querydata AS $param=>$value) + if (is_array($querydata)) { + foreach ($querydata AS $param => $value) { if (in_array($param, array("utm_source", "utm_medium", "utm_term", "utm_content", "utm_campaign", "wt_mc", "pk_campaign", "pk_kwd", "mc_cid", "mc_eid", "fb_action_ids", "fb_action_types", "fb_ref", "awesm", "wtrid", "woo_campaign", "woo_source", "woo_medium", "woo_content", "woo_term"))) { - $pair = $param."=".urlencode($value); + $pair = $param . "=" . urlencode($value); $url = str_replace($pair, "", $url); // Second try: if the url isn't encoded completely - $pair = $param."=".str_replace(" ", "+", $value); + $pair = $param . "=" . str_replace(" ", "+", $value); $url = str_replace($pair, "", $url); // Third try: Maybey the url isn't encoded at all - $pair = $param."=".$value; + $pair = $param . "=" . $value; $url = str_replace($pair, "", $url); $url = str_replace(array("?&", "&&"), array("?", ""), $url); } + } + } - if (substr($url, -1, 1) == "?") + if (substr($url, -1, 1) == "?") { $url = substr($url, 0, -1); + } } + return $url; +} + +/** + * @brief Returns the original URL of the provided URL + * + * This function strips tracking query params and follows redirections, either + * through HTTP code or meta refresh tags. Stops after 10 redirections. + * + * @see ParseUrl::getSiteinfo + * + * @param string $url + * @param int $depth + * @param bool $fetchbody + * @return string + */ +function original_url($url, $depth = 1, $fetchbody = false) { + $a = get_app(); + + $url = strip_tracking_query_params($url); + if ($depth > 10) return($url);