Fix Diaspora link attachment probe
- Move analytics param stripping out of original_url
- Remove HEAD curl request in ParseUrl::getSiteInfo
- Replace original_url with strip_tracking_query_params in ParseUrl::getSiteInfo to prevent a flood of curl requests in edge cases
This commit is contained in:
parent
99cfae63d7
commit
432587464c
2 changed files with 40 additions and 35 deletions
|
@ -130,7 +130,7 @@ class ParseUrl {
|
||||||
$url = trim($url, "'");
|
$url = trim($url, "'");
|
||||||
$url = trim($url, '"');
|
$url = trim($url, '"');
|
||||||
|
|
||||||
$url = original_url($url);
|
$url = strip_tracking_query_params($url);
|
||||||
|
|
||||||
$siteinfo["url"] = $url;
|
$siteinfo["url"] = $url;
|
||||||
$siteinfo["type"] = "link";
|
$siteinfo["type"] = "link";
|
||||||
|
@ -142,8 +142,7 @@ class ParseUrl {
|
||||||
$ch = curl_init();
|
$ch = curl_init();
|
||||||
curl_setopt($ch, CURLOPT_URL, $url);
|
curl_setopt($ch, CURLOPT_URL, $url);
|
||||||
curl_setopt($ch, CURLOPT_HEADER, 1);
|
curl_setopt($ch, CURLOPT_HEADER, 1);
|
||||||
curl_setopt($ch, CURLOPT_NOBODY, 1);
|
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
|
||||||
curl_setopt($ch, CURLOPT_TIMEOUT, 3);
|
|
||||||
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
|
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
|
||||||
curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent());
|
curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent());
|
||||||
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false));
|
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false));
|
||||||
|
@ -151,7 +150,6 @@ class ParseUrl {
|
||||||
|
|
||||||
$header = curl_exec($ch);
|
$header = curl_exec($ch);
|
||||||
$curl_info = @curl_getinfo($ch);
|
$curl_info = @curl_getinfo($ch);
|
||||||
$http_code = $curl_info["http_code"];
|
|
||||||
curl_close($ch);
|
curl_close($ch);
|
||||||
|
|
||||||
$a->save_timestamp($stamp1, "network");
|
$a->save_timestamp($stamp1, "network");
|
||||||
|
@ -197,26 +195,6 @@ class ParseUrl {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
$stamp1 = microtime(true);
|
|
||||||
|
|
||||||
// Now fetch the body as well
|
|
||||||
$ch = curl_init();
|
|
||||||
curl_setopt($ch, CURLOPT_URL, $url);
|
|
||||||
curl_setopt($ch, CURLOPT_HEADER, 1);
|
|
||||||
curl_setopt($ch, CURLOPT_NOBODY, 0);
|
|
||||||
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
|
|
||||||
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
|
|
||||||
curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent());
|
|
||||||
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false));
|
|
||||||
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, (($check_cert) ? 2 : false));
|
|
||||||
|
|
||||||
$header = curl_exec($ch);
|
|
||||||
$curl_info = @curl_getinfo($ch);
|
|
||||||
$http_code = $curl_info["http_code"];
|
|
||||||
curl_close($ch);
|
|
||||||
|
|
||||||
$a->save_timestamp($stamp1, "network");
|
|
||||||
|
|
||||||
// Fetch the first mentioned charset. Can be in body or header
|
// Fetch the first mentioned charset. Can be in body or header
|
||||||
$charset = "";
|
$charset = "";
|
||||||
if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches)) {
|
if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches)) {
|
||||||
|
|
|
@ -670,42 +670,69 @@ function fix_contact_ssl_policy(&$contact,$new_policy) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function original_url($url, $depth=1, $fetchbody = false) {
|
/**
|
||||||
|
* @brief Remove Google Analytics and other tracking platforms params from URL
|
||||||
$a = get_app();
|
*
|
||||||
|
* @param string $url
|
||||||
// Remove Analytics Data from Google and other tracking platforms
|
* @return string
|
||||||
|
*/
|
||||||
|
function strip_tracking_query_params($url)
|
||||||
|
{
|
||||||
$urldata = parse_url($url);
|
$urldata = parse_url($url);
|
||||||
if (is_string($urldata["query"])) {
|
if (is_string($urldata["query"])) {
|
||||||
$query = $urldata["query"];
|
$query = $urldata["query"];
|
||||||
parse_str($query, $querydata);
|
parse_str($query, $querydata);
|
||||||
|
|
||||||
if (is_array($querydata))
|
if (is_array($querydata)) {
|
||||||
foreach ($querydata AS $param=>$value)
|
foreach ($querydata AS $param => $value) {
|
||||||
if (in_array($param, array("utm_source", "utm_medium", "utm_term", "utm_content", "utm_campaign",
|
if (in_array($param, array("utm_source", "utm_medium", "utm_term", "utm_content", "utm_campaign",
|
||||||
"wt_mc", "pk_campaign", "pk_kwd", "mc_cid", "mc_eid",
|
"wt_mc", "pk_campaign", "pk_kwd", "mc_cid", "mc_eid",
|
||||||
"fb_action_ids", "fb_action_types", "fb_ref",
|
"fb_action_ids", "fb_action_types", "fb_ref",
|
||||||
"awesm", "wtrid",
|
"awesm", "wtrid",
|
||||||
"woo_campaign", "woo_source", "woo_medium", "woo_content", "woo_term"))) {
|
"woo_campaign", "woo_source", "woo_medium", "woo_content", "woo_term"))) {
|
||||||
|
|
||||||
$pair = $param."=".urlencode($value);
|
$pair = $param . "=" . urlencode($value);
|
||||||
$url = str_replace($pair, "", $url);
|
$url = str_replace($pair, "", $url);
|
||||||
|
|
||||||
// Second try: if the url isn't encoded completely
|
// Second try: if the url isn't encoded completely
|
||||||
$pair = $param."=".str_replace(" ", "+", $value);
|
$pair = $param . "=" . str_replace(" ", "+", $value);
|
||||||
$url = str_replace($pair, "", $url);
|
$url = str_replace($pair, "", $url);
|
||||||
|
|
||||||
// Third try: Maybe the url isn't encoded at all
|
// Third try: Maybe the url isn't encoded at all
|
||||||
$pair = $param."=".$value;
|
$pair = $param . "=" . $value;
|
||||||
$url = str_replace($pair, "", $url);
|
$url = str_replace($pair, "", $url);
|
||||||
|
|
||||||
$url = str_replace(array("?&", "&&"), array("?", ""), $url);
|
$url = str_replace(array("?&", "&&"), array("?", ""), $url);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (substr($url, -1, 1) == "?")
|
if (substr($url, -1, 1) == "?") {
|
||||||
$url = substr($url, 0, -1);
|
$url = substr($url, 0, -1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return $url;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Returns the original URL of the provided URL
|
||||||
|
*
|
||||||
|
* This function strips tracking query params and follows redirections, either
|
||||||
|
* through HTTP code or meta refresh tags. Stops after 10 redirections.
|
||||||
|
*
|
||||||
|
* @see ParseUrl::getSiteinfo
|
||||||
|
*
|
||||||
|
* @param string $url
|
||||||
|
* @param int $depth
|
||||||
|
* @param bool $fetchbody
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
function original_url($url, $depth = 1, $fetchbody = false) {
|
||||||
|
$a = get_app();
|
||||||
|
|
||||||
|
$url = strip_tracking_query_params($url);
|
||||||
|
|
||||||
if ($depth > 10)
|
if ($depth > 10)
|
||||||
return($url);
|
return($url);
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue