Teknik is a suite of services with attractive and functional interfaces. https://www.teknik.io/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

UrlHelper.php 18KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508
  1. <?php
  2. /**
  3. * Piwik - free/libre analytics platform
  4. *
  5. * @link http://piwik.org
  6. * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
  7. *
  8. */
  9. namespace Piwik;
  /**
   * Contains less commonly needed URL helper methods.
   *
   * Every method is static; the class holds no state apart from the
   * per-request cache inside {@link getLossyUrl()}.
   */
  class UrlHelper
  {
      /**
       * Converts an array of query parameter name/value mappings into a query string.
       * Parameters that are in `$parametersToExclude` will not appear in the result.
       *
       * Names are lower-cased before being compared with the exclusion list, so the
       * list is expected to contain lower-case names. Values are concatenated as-is:
       * this method does not URL-encode them.
       *
       * @static
       * @param array $queryParameters Array of query parameters, eg, `array('site' => '0', 'date' => '2012-01-01')`.
       *                               A value of `false` emits the bare name (no `=`); an array value
       *                               emits one `name[]=value` pair per element.
       * @param array $parametersToExclude Array of query parameter names that shouldn't be
       *                                   in the result query string, eg, `array('date', 'period')`.
       * @return string A query string without a leading `?`, eg, `"site=0"`. Empty string if nothing is left.
       * @api
       */
      public static function getQueryStringWithExcludedParameters($queryParameters, $parametersToExclude)
      {
          // Compare names strictly as strings: PHP's loose comparison would treat
          // numeric-looking names such as '1e1' and '10' as equal.
          $excludedNames = array_map('strval', $parametersToExclude);

          $pairs = array();
          foreach ($queryParameters as $name => $value) {
              // decode encoded square brackets
              $name = str_replace(array('%5B', '%5D'), array('[', ']'), $name);
              if (in_array(strtolower($name), $excludedNames, true)) {
                  continue;
              }

              if (is_array($value)) {
                  foreach ($value as $param) {
                      $pairs[] = ($param === false) ? $name . '[]' : $name . '[]=' . $param;
                  }
              } elseif ($value === false) {
                  $pairs[] = $name;
              } else {
                  $pairs[] = $name . '=' . $value;
              }
          }

          // implode() always yields a string, even when no pair was emitted.
          return implode('&', $pairs);
      }

      /**
       * Reduce URL to more minimal form. Country codes (the keys returned by
       * `Common::getCountriesList(true)`) are replaced by '{}', while other
       * parts are simply removed.
       *
       * Examples:
       * www.example.com -> example.com
       * search.example.com -> example.com
       * m.example.com -> example.com
       * de.example.com -> {}.example.com
       * example.de -> example.{}
       * example.co.uk -> example.{}
       *
       * @param string $url
       * @return string
       */
      public static function getLossyUrl($url)
      {
          // Built once per request: alternation of all known country codes.
          static $countries;
          if (!isset($countries)) {
              $countries = implode('|', array_keys(Common::getCountriesList(true)));
          }

          $patterns = array(
              '/^(w+[0-9]*|search)\./',                                        // leading "www.", "www2.", "search."
              '/(^|\.)m\./',                                                   // mobile "m." label
              '/(\.(com|org|net|co|it|edu))?\.(' . $countries . ')(\/|$)/',    // country TLD, optionally after a second-level label
              '/(^|\.)(' . $countries . ')\./',                                // country code used as a sub-domain label
          );
          $replacements = array(
              '',
              '$1',
              '.{}$4',
              '$1{}.',
          );

          return preg_replace($patterns, $replacements, $url);
      }

      /**
       * Returns true if the string passed may be a URL ie. it starts with protocol://.
       * We don't need a precise test here because the value comes from the website
       * tracked source code and the URLs may look very strange.
       *
       * Note that the scheme group in the pattern is optional, so a string that starts
       * with a bare `://` followed by at least one character is accepted as well.
       *
       * @param string $url
       * @return bool
       */
      public static function isLookLikeUrl($url)
      {
          // preg_match() returns 1 on a match, 0 on no match and false on error;
          // only a real match may be inspected further.
          return preg_match('~^(ftp|news|http|https)?://(.*)$~D', $url, $matches) === 1
              && strlen($matches[2]) > 0;
      }

      /**
       * Returns a URL created from the result of the [parse_url](http://php.net/manual/en/function.parse-url.php)
       * function.
       *
       * Copied from the PHP comments at [http://php.net/parse_url](http://php.net/parse_url).
       *
       * Components whose value is the string `"0"` (eg. the query of `http://example.com/?0`)
       * are kept; only missing or empty components are skipped.
       *
       * @param array $parsed Result of [parse_url](http://php.net/manual/en/function.parse-url.php).
       * @return false|string The URL or `false` if `$parsed` isn't an array.
       * @api
       */
      public static function getParseUrlReverse($parsed)
      {
          if (!is_array($parsed)) {
              return false;
          }

          $uri = '';

          if (self::hasUrlPart($parsed, 'scheme')) {
              // "mailto:" is the only scheme here that is not followed by "//"
              $uri .= $parsed['scheme'] . ':' . (strcasecmp($parsed['scheme'], 'mailto') === 0 ? '' : '//');
          }

          if (self::hasUrlPart($parsed, 'user')) {
              $uri .= $parsed['user'];
              if (self::hasUrlPart($parsed, 'pass')) {
                  $uri .= ':' . $parsed['pass'];
              }
              $uri .= '@';
          }

          if (self::hasUrlPart($parsed, 'host')) {
              $uri .= $parsed['host'];
          }

          if (!empty($parsed['port'])) {
              $uri .= ':' . $parsed['port'];
          }

          if (self::hasUrlPart($parsed, 'path')) {
              // a relative path needs a separating slash once something precedes it
              if (strncmp($parsed['path'], '/', 1) !== 0 && $uri !== '') {
                  $uri .= '/';
              }
              $uri .= $parsed['path'];
          }

          if (self::hasUrlPart($parsed, 'query')) {
              $uri .= '?' . $parsed['query'];
          }

          if (self::hasUrlPart($parsed, 'fragment')) {
              $uri .= '#' . $parsed['fragment'];
          }

          return $uri;
      }

      /**
       * Returns a URL query string as an array.
       *
       * Names and non-empty values are passed through `Common::sanitizeInputValue()`.
       * A parameter without `=` gets the value `false`; a name ending in `[]`
       * (or the encoded `%5B%5D`) is collected into an array. Names that PHP's
       * `empty()` treats as empty (eg. `"0"`) are skipped.
       *
       * @param string $urlQuery The query string, eg, `'?param1=value1&param2=value2'`.
       * @return array eg, `array('param1' => 'value1', 'param2' => 'value2')`
       * @api
       */
      public static function getArrayFromQueryString($urlQuery)
      {
          if ($urlQuery === null || strlen($urlQuery) == 0) {
              return array();
          }
          if ($urlQuery[0] == '?') {
              $urlQuery = substr($urlQuery, 1);
          }

          // Only trailing whitespace is stripped; leading whitespace stays part of
          // the first parameter name.
          $pairs = explode('&', rtrim($urlQuery));

          $nameToValue = array();
          foreach ($pairs as $pair) {
              $pos = strpos($pair, '=');
              if ($pos !== false) {
                  $name = substr($pair, 0, $pos);
                  $value = substr($pair, $pos + 1);
                  if ($value === false) {
                      // older PHP versions return false for "name=" with nothing after it
                      $value = '';
                  }
              } else {
                  $name = $pair;
                  $value = false;
              }

              if (!empty($name)) {
                  $name = Common::sanitizeInputValue($name);
              }
              if (!empty($value)) {
                  $value = Common::sanitizeInputValue($value);
              }

              // if array without indexes
              $count = 0;
              $tmp = preg_replace('/(\[|%5b)(]|%5d)$/i', '', $name, -1, $count);
              if (!empty($tmp) && $count) {
                  $name = $tmp;
                  if (!isset($nameToValue[$name]) || !is_array($nameToValue[$name])) {
                      $nameToValue[$name] = array();
                  }
                  $nameToValue[$name][] = $value;
              } elseif (!empty($name)) {
                  $nameToValue[$name] = $value;
              }
          }
          return $nameToValue;
      }

      /**
       * Returns the value of a single query parameter from the supplied query string.
       *
       * The value comes straight from {@link getArrayFromQueryString()}, so it is
       * `false` for a parameter given without `=` and an array for `name[]` parameters.
       *
       * @param string $urlQuery The query string.
       * @param string $parameter The query parameter name to return.
       * @return string|null Parameter value if found (can be the empty string!), null if not found.
       * @api
       */
      public static function getParameterFromQueryString($urlQuery, $parameter)
      {
          $nameToValue = self::getArrayFromQueryString($urlQuery);

          return isset($nameToValue[$parameter]) ? $nameToValue[$parameter] : null;
      }

      /**
       * Returns the path and query string of a URL, without the path's leading slash.
       *
       * @param string $url The URL.
       * @return string eg, `test/index.php?module=CoreHome` if `$url` is `http://piwik.org/test/index.php?module=CoreHome`.
       * @api
       */
      public static function getPathAndQueryFromUrl($url)
      {
          $parsedUrl = parse_url($url);
          $result = '';

          if (isset($parsedUrl['path'])) {
              $path = $parsedUrl['path'];
              // Drop exactly one leading slash; a relative path must keep its first character.
              $result .= (strncmp($path, '/', 1) === 0) ? (string) substr($path, 1) : $path;
          }
          if (isset($parsedUrl['query'])) {
              $result .= '?' . $parsedUrl['query'];
          }
          return $result;
      }

      /**
       * Extracts a keyword from a raw not encoded URL.
       * Will only extract keyword if a known search engine has been detected.
       * Returns the keyword:
       * - in UTF8: automatically converted from other charsets when applicable
       * - strtolowered: "QUErY test!" will return "query test!"
       * - trimmed: extra spaces before and after are removed
       *
       * Lists of supported search engines can be found in /core/DataFiles/SearchEngines.php
       * The function returns false when a keyword couldn't be found.
       * eg. if the url is "http://www.google.com/partners.html" this will return false,
       * as the google keyword parameter couldn't be found.
       *
       * @see unit tests in /tests/core/Common.test.php
       * @param string $referrerUrl URL referrer URL, eg. $_SERVER['HTTP_REFERER']
       * @return array|bool false if a keyword couldn't be extracted,
       * or array(
       * 'name' => 'Google',
       * 'keywords' => 'my searched keywords')
       * `keywords` is `false` when the search engine matched but no keyword was provided.
       */
      public static function extractSearchEngineInformationFromUrl($referrerUrl)
      {
          $referrerParsed = @parse_url($referrerUrl);

          $referrerHost = isset($referrerParsed['host']) ? $referrerParsed['host'] : '';
          if (empty($referrerHost)) {
              return false;
          }

          // some search engines (eg. Bing Images) use the same domain
          // as an existing search engine (eg. Bing), we must also use the url path
          $referrerPath = isset($referrerParsed['path']) ? $referrerParsed['path'] : '';

          // no search query
          if (!isset($referrerParsed['query'])) {
              $referrerParsed['query'] = '';
          }
          $query = $referrerParsed['query'];

          // Google Referrers URLs sometimes have the fragment which contains the keyword
          if (!empty($referrerParsed['fragment'])) {
              $query .= '&' . $referrerParsed['fragment'];
          }

          $searchEngines = Common::getSearchEngineUrls();
          $hostPattern = self::getLossyUrl($referrerHost);

          /*
           * Try to get the best matching 'host' in definitions
           * 1. check if host + path matches an definition
           * 2. check if host only matches
           * 3. check if host pattern + path matches
           * 4. check if host pattern matches
           * 5. special handling
           */
          if (array_key_exists($referrerHost . $referrerPath, $searchEngines)) {
              $referrerHost = $referrerHost . $referrerPath;
          } elseif (array_key_exists($referrerHost, $searchEngines)) {
              // no need to change host
          } elseif (array_key_exists($hostPattern . $referrerPath, $searchEngines)) {
              $referrerHost = $hostPattern . $referrerPath;
          } elseif (array_key_exists($hostPattern, $searchEngines)) {
              $referrerHost = $hostPattern;
          } else {
              if (!strncmp($query, 'cx=partner-pub-', 15)) {
                  // Google custom search engine
                  $referrerHost = 'google.com/cse';
              } elseif (!strncmp($referrerPath, '/pemonitorhosted/ws/results/', 28)) {
                  // private-label search powered by InfoSpace Metasearch
                  $referrerHost = 'wsdsold.infospace.com';
              } elseif (strpos($referrerHost, '.images.search.yahoo.com') !== false) {
                  // Yahoo! Images
                  $referrerHost = 'images.search.yahoo.com';
              } elseif (strpos($referrerHost, '.search.yahoo.com') !== false) {
                  // Yahoo!
                  $referrerHost = 'search.yahoo.com';
              } else {
                  return false;
              }
          }

          // The special-handling hosts above are hard-coded; bail out cleanly if the
          // definitions do not actually contain the resolved host.
          if (!isset($searchEngines[$referrerHost][0])) {
              return false;
          }
          $searchEngineName = $searchEngines[$referrerHost][0];

          $variableNames = null;
          if (isset($searchEngines[$referrerHost][1])) {
              $variableNames = $searchEngines[$referrerHost][1];
          }
          if (!$variableNames) {
              // fall back to the variable names of the engine's main definition
              $searchEngineNames = Common::getSearchEngineNames();
              if (isset($searchEngineNames[$searchEngineName])) {
                  $mainUrl = $searchEngineNames[$searchEngineName];
                  if (isset($searchEngines[$mainUrl][1])) {
                      $variableNames = $searchEngines[$mainUrl][1];
                  }
              }
          }
          if (!is_array($variableNames)) {
              $variableNames = array($variableNames);
          }

          $key = null;
          if ($searchEngineName === 'Google Images'
              || ($searchEngineName === 'Google' && strpos($referrerUrl, '/imgres') !== false)
          ) {
              if (strpos($query, '&prev') !== false) {
                  // the real search query is nested, URL-encoded, inside the "prev" parameter
                  $query = urldecode(trim(self::getStringParameterFromQueryString($query, 'prev')));
                  $query = str_replace('&', '&amp;', (string) strstr($query, '?'));
              }
              $searchEngineName = 'Google Images';
          } elseif ($searchEngineName === 'Google'
              && (strpos($query, '&as_') !== false || strpos($query, 'as_') === 0)
          ) {
              // Google advanced search: rebuild one keyword string from the as_* parameters
              $keys = array();

              $allWords = self::getStringParameterFromQueryString($query, 'as_q');
              if (!empty($allWords)) {
                  $keys[] = $allWords;
              }
              $anyWords = self::getStringParameterFromQueryString($query, 'as_oq');
              if (!empty($anyWords)) {
                  $keys[] = str_replace('+', ' OR ', $anyWords);
              }
              $exactPhrase = self::getStringParameterFromQueryString($query, 'as_epq');
              if (!empty($exactPhrase)) {
                  $keys[] = "\"$exactPhrase\"";
              }
              $excludedWords = self::getStringParameterFromQueryString($query, 'as_eq');
              if (!empty($excludedWords)) {
                  $keys[] = "-$excludedWords";
              }

              $key = trim(urldecode(implode(' ', $keys)));
          }

          if ($searchEngineName === 'Google') {
              // top bar menu
              $tbm = self::getParameterFromQueryString($query, 'tbm');
              switch ($tbm) {
                  case 'isch':
                      $searchEngineName = 'Google Images';
                      break;
                  case 'vid':
                      $searchEngineName = 'Google Video';
                      break;
                  case 'shop':
                      $searchEngineName = 'Google Shopping';
                      break;
              }
          }

          if (empty($key)) {
              foreach ($variableNames as $variableName) {
                  $variableName = (string) $variableName;

                  if ($variableName !== '' && $variableName[0] == '/') {
                      // regular expression match
                      if (preg_match($variableName, $referrerUrl, $matches)) {
                          $key = trim(urldecode(isset($matches[1]) ? $matches[1] : ''));
                          break;
                      }
                      continue;
                  }

                  // search for keywords now &vname=keyword
                  $key = trim(urldecode(self::getStringParameterFromQueryString($query, $variableName)));

                  // Special cases: empty or no keywords
                  if (empty($key)) {
                      // Google search with no keyword
                      $isBareGoogle = $searchEngineName === 'Google'
                          && empty($query)
                          && (empty($referrerPath) || $referrerPath == '/')
                          && empty($referrerParsed['fragment']);
                      // Yahoo search with no keyword
                      $isYahooRedirect = $searchEngineName === 'Yahoo!'
                          && $referrerParsed['host'] == 'r.search.yahoo.com';
                      // empty keyword parameter
                      $hasEmptyParameter = strpos($query, sprintf('&%s=', $variableName)) !== false
                          || strpos($query, sprintf('?%s=', $variableName)) !== false;
                      // search engines with no keyword
                      $isKeywordlessEngine = $searchEngineName === 'Google Images'
                          || $searchEngineName === 'DuckDuckGo';

                      if ($isBareGoogle || $isYahooRedirect || $hasEmptyParameter || $isKeywordlessEngine) {
                          $key = false;
                      }
                  }

                  if (!empty($key) || $key === false) {
                      break;
                  }
              }
          }

          // $key === false is the special case "No keyword provided" which is a Search engine match
          if ($key === null || $key === '') {
              return false;
          }

          if (!empty($key)) {
              if (function_exists('iconv')
                  && isset($searchEngines[$referrerHost][3])
              ) {
                  // accepts string, array, or comma-separated list string in preferred order
                  $charsets = $searchEngines[$referrerHost][3];
                  if (!is_array($charsets)) {
                      $charsets = explode(',', $charsets);
                  }

                  if (!empty($charsets)) {
                      $charset = $charsets[0];
                      if (count($charsets) > 1 && function_exists('mb_detect_encoding')) {
                          $charset = mb_detect_encoding($key, $charsets);
                          if ($charset === false) {
                              $charset = $charsets[0];
                          }
                      }

                      // keep the original keyword if the conversion yields nothing usable
                      $newkey = @iconv($charset, 'UTF-8//IGNORE', $key);
                      if (!empty($newkey)) {
                          $key = $newkey;
                      }
                  }
              }
              $key = Common::mb_strtolower($key);
          }

          return array(
              'name'     => $searchEngineName,
              'keywords' => $key,
          );
      }

      /**
       * Returns the query part from any valid url and adds additional parameters to the query part if needed.
       *
       * @param string $url Any url eg `"http://example.com/piwik/?foo=bar"`
       * @param array $additionalParamsToAdd If not empty the given parameters will be added to the query.
       *
       * @return string eg. `"foo=bar&foo2=bar2"`
       * @api
       */
      public static function getQueryFromUrl($url, array $additionalParamsToAdd = array())
      {
          $parsedUrl = @parse_url($url);

          $query = '';
          if (!empty($parsedUrl['query'])) {
              $query .= $parsedUrl['query'];
          }

          if (!empty($additionalParamsToAdd)) {
              if (!empty($query)) {
                  $query .= '&';
              }
              $query .= Url::getQueryStringFromParameters($additionalParamsToAdd);
          }

          return $query;
      }

      /**
       * Returns the host part of a URL.
       *
       * If `$url` does not look like a URL according to {@link isLookLikeUrl()},
       * `http://` is prepended before parsing, so bare values such as
       * `example.com/path` work too.
       *
       * @param string $url The URL, or a host optionally followed by a path.
       * @return string|null|false The value of `parse_url($url, PHP_URL_HOST)`.
       */
      public static function getHostFromUrl($url)
      {
          if (!self::isLookLikeUrl($url)) {
              $url = "http://" . $url;
          }
          return parse_url($url, PHP_URL_HOST);
      }

      /**
       * Tells whether a parse_url() component is present and non-empty.
       * Unlike `empty()`, the string `"0"` counts as a value.
       *
       * @param array $parsed Result of parse_url().
       * @param string $part Component name, eg. `'query'`.
       * @return bool
       */
      private static function hasUrlPart(array $parsed, $part)
      {
          if (!isset($parsed[$part])) {
              return false;
          }
          $value = $parsed[$part];

          // non-string values keep the historical empty() semantics
          return is_string($value) ? $value !== '' : !empty($value);
      }

      /**
       * Same as {@link getParameterFromQueryString()} but always returns a string:
       * a missing parameter, a parameter without value and an array parameter
       * all yield the empty string.
       *
       * @param string $urlQuery The query string.
       * @param string $parameter The query parameter name to return.
       * @return string
       */
      private static function getStringParameterFromQueryString($urlQuery, $parameter)
      {
          $value = self::getParameterFromQueryString($urlQuery, $parameter);

          return (is_scalar($value) && $value !== false) ? (string) $value : '';
      }
  }