*/ /** This class analyze the provided robots.txt file content and allow to * get the configured data for DomSearch. * It allow to examine an URL against the robots.txt file and return if the URL * is allowed to be used or not * The definition of the format of robots.txt file is available here : * http://www.robotstxt.org/norobots-rfc.txt * https://en.wikipedia.org/wiki/Robots_exclusion_standard */ class robotstxt { // PROPERTIES // {{{ /** The allowed urls */ private $allow = array (); /** The disallow urls */ private $disallow = array (); /** The sitemaps URL defined in the file */ private $sitemaps = array (); /** The crawldelay defined in the file (3s if not defined) */ private $crawldelay = 3; /** The host can be specified as default website */ private $host = null; /** The rule number matching the URLAllow rule */ private $matchRule = null; // }}} // METHODS /** Get the robots.txt file content and do the analyze * @param string $content The robots.txt file content to analyze * @param string $crawlerName The crawler name to use in analyze * @return $this */ public function __construct ($content, $crawlerName) // {{{ { if (trim ($content) === "") { $this->allow = array ("/"); return $this; } // Look if the User-agent is available for the defined crawlerName. // If Not check if the User-agent axists for * // If not accept all $content = preg_split('/\r\n|\r|\n/', $content); $keys = preg_grep ("~User-agent:\s*$crawlerName\s*#?~i", $content); if (empty ($keys)) $keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content); if (empty ($keys)) { // No User-agent with crawlerName nor * : accept all $this->allow = array ("/"); return $this; } // Get the Allow and Disallow lines. The stop will arrive on first // User-Agent line arriving after a Allow/Disallow. // Comments and empty lines are removed for ($i = key ($keys) ; $i < count ($content) ; $i++) { $line = trim ($content[$i]); if (strtolower (substr ($line, 0, 6)) === "allow:") { $this->allow[] = $this->getValueFromLine ($line); } elseif (strtolower (substr ($line, 0, 9)) === "disallow:") { $this->disallow[] = $this->getValueFromLine ($line); } elseif (strtolower (substr ($line, 0, 12)) === "crawl-delay:") { $val = $this->getValueFromLine ($line); if ($val > 1 && $val < 60 && $this->crawldelay === 3) $this->crawldelay = intval ($val); } elseif (strtolower (substr ($line, 0, 11)) === "user-agent:" && (!empty ($this->allow) || ! empty ($this->disallow))) { // New user-agent line after valid allow/disallow : end of paragraph break; } else { // Comment, empty line, sitemap, host, not managed line... : SKIP } } $keys = preg_grep ("#^\s*Sitemap:\s*(?P\S+)\s*(\#)?#i", $content); foreach ($keys as $line) { $url = $this->getValueFromLine ($line); if (!! filter_var ($url, FILTER_VALIDATE_URL)) $this->sitemaps[] = $url; } $keys = preg_grep ("#^\s*Host:\s*(?P\S+)\s*(\#)?#i", $content); foreach ($keys as $line) { // Takes only the first one $this->host = $this->getValueFromLine ($line); break; } if (! in_array ("/", $this->disallow) && ! in_array ("/", $this->allow)) $this->allow[] = "/"; return $this; } // }}} /** Return true if the provided URL can be used against the robots.txt * definition or FALSE if it is not the case * @param string $url The URL to check * @return boolean The result of the test */ public function URLAllow ($url) // {{{ { $parse = parse_url ($url); $path = (isset ($parse["path"])) ? $parse["path"] : "/"; // Robots.txt files are always allowed if ($path === "/robots.txt") return true; $bestDisallow = -1; $bestAllow = -1; $allowRule = ""; $disallowRule = ""; foreach ($this->disallow as $partial) { if (strpos ($partial, "*") === false) { if (substr ($path, 0, strlen ($partial)) === $partial && $bestDisallow < strlen ($partial)) { $bestDisallow = strlen ($partial); $disallowRule = $partial; } } else { $partial = str_replace ("*", ".+", $partial); if (preg_match ("#$partial#", $path) === 1) { $bestDisallow = 255; $disallowRule = $partial; } } } foreach ($this->allow as $partial) { if (strpos ($partial, "*") === false) { if (substr ($path, 0, strlen ($partial)) === $partial && $bestAllow < strlen ($partial)) { $bestAllow = strlen ($partial); $allowRule = $partial; } } else { $partial = str_replace ("*", ".+", $partial); if (preg_match ("#$partial#", $path) === 1) { $bestAllow = 255; $allowRule = $partial; } } } if ($bestAllow > $bestDisallow) { $this->matchRule = $allowRule; return true; } $this->matchRule = $disallowRule; return false; } // }}} // GETTERS /** Return the allowed urls * @return array $allow The array of allow rules */ public function allow () // {{{ { return $this->allow; } // }}} /** Return the disallowed urls * @return array $disallow The array of disallow rules */ public function disallow () // {{{ { return $this->disallow; } // }}} /** Return the sitemaps url * @return array $sitemap The array of sitemaps URL */ public function sitemaps () // {{{ { return $this->sitemaps; } // }}} /** Return the crawldelay * @return integer $crawldelay The crawlDelay defined in robots.txt */ public function crawldelay () // {{{ { return $this->crawldelay; } // }}} /** Return the host * @return string $host The Host string defined in robots.txt */ public function host () // {{{ { return $this->host; } // }}} /** Return the matchRule * @return string $matchRule The matchRule matching the URLAllow test */ public function matchRule () // {{{ { return $this->matchRule; } // }}} // PRIVATE METHODS /** Get a line from robots.txt file and return the associated value. * Manage the evntual comments on the line * @param string $line the complete line from robots.txt file * @return string the value recorded on line */ private function getValueFromLine ($line) // {{{ { preg_match_all ("#^(?P\S+):\s*(?P\S*)\s*". "(\#\s*(?P.+)\s*)?\$#", $line, $matches); return $matches["value"][0]; } // }}} }