 * @license BSD */

/** This class analyzes the provided robots.txt file content and allows
 * getting the configured data for DomSearch.
 * It allows examining a URL against the robots.txt file and returns whether
 * the URL is allowed to be used or not.
 * The definition of the robots.txt file format is available here:
 * http://www.robotstxt.org/norobots-rfc.txt
 * https://en.wikipedia.org/wiki/Robots_exclusion_standard
 */
class robotstxt
{
  // PROPERTIES
  // {{{
  /** The allowed urls */
  private $allow = array ();
  /** The disallowed urls */
  private $disallow = array ();
  /** The sitemaps URL defined in the file */
  private $sitemaps = array ();
  /** The crawl delay defined in the file (3s if not defined) */
  private $crawldelay = 3;
  /** The host can be specified as the default website */
  private $host = null;
  /** The rule matching the last URLAllow test */
  private $matchRule = null;
  /** List of the errors found in the file content. The key is the line where
   * the fault is located */
  private $errors = array ();
  // }}}

  // METHODS
  /** Get the robots.txt file content and do the analysis
   * @param string $content The robots.txt file content to analyze
   * @param string $crawlerName The crawler name to use in the analysis
   * @return $this
   */
  public function __construct ($content, $crawlerName)
  // {{{
  {
    if (trim ($content) === "")
    {
      // Empty file : everything is allowed
      $this->allow = array ("/");
      return $this;
    }
    // Look if a User-agent block is available for the defined crawlerName.
    // If not, check if a User-agent block exists for *.
    // If not, accept all
    $content = preg_split ('/\r\n|\r|\n/', $content);
    $keys = preg_grep ("~User-agent:\s*$crawlerName\s*#?~i", $content);
    if (empty ($keys))
      $keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content);
    if (empty ($keys))
    {
      // No User-agent with crawlerName nor * : accept all
      $this->allow = array ("/");
      return $this;
    }
    // The sitemaps are not restricted to the user-agent
    foreach (preg_grep ("~Sitemap:\s+~i", $content) as $nb => $line)
    {
      $url = $this->getValueFromLine ($line);
      if (!! filter_var ($url, FILTER_VALIDATE_URL) &&
          (substr ($url, 0, 7) === "http://" ||
           substr ($url, 0, 8) === "https://"))
        $this->sitemaps[] = $url;
      else
        $this->errors[$nb] = dgettext ("domframework",
                                       "Sitemap : Invalid URL provided");
    }
    // Get the Allow and Disallow lines. The loop stops on the first
    // User-agent line arriving after an Allow/Disallow.
    // Comments and empty lines are skipped
    for ($i = key ($keys) ; $i < count ($content) ; $i++)
    {
      $line = trim ($content[$i]);
      if (stripos ($line, "Sitemap:") === 0)
      {
        // Already managed in the general parser. Not needed in the specific
        // user-agent parser. Must at least be caught to not generate an
        // error
      }
      elseif (stripos ($line, "Host:") === 0)
      {
        if ($this->host !== null)
          $this->errors[$i] = dgettext ("domframework", "Multiple Hosts set");
        else
          $this->host = $this->getValueFromLine ($line);
      }
      elseif ($line === "" || $line[0] === "#")
      {
        // Comment, empty line : SKIP
      }
      elseif (stripos ($line, "allow:") === 0)
      {
        $this->allow[] = $this->getValueFromLine ($line);
      }
      elseif (stripos ($line, "disallow:") === 0)
      {
        $this->disallow[] = $this->getValueFromLine ($line);
      }
      elseif (stripos ($line, "crawl-delay:") === 0)
      {
        // Only the first valid Crawl-delay value (1-60 seconds) is kept
        $val = $this->getValueFromLine ($line);
        if ($val >= 1 && $val <= 60 && $this->crawldelay === 3)
          $this->crawldelay = intval ($val);
        else
          $this->errors[$i] = dgettext ("domframework",
                                  "Crawldelay : value out of range (1-60)");
      }
      elseif (stripos ($line, "user-agent:") === 0)
      {
        if (! empty ($this->allow) || ! empty ($this->disallow))
        {
          // New user-agent line after valid allow/disallow : end of paragraph
          break;
        }
        else
        {
          // New user-agent. Do nothing
        }
      }
      else
      {
        // Not managed line : error
        $this->errors[$i] = sprintf (dgettext ("domframework",
                            "Invalid line : unknown command : '%s'"), $line);
      }
    }
    // If no rule covers the root path, allow it by default
    if (! in_array ("/", $this->disallow) && ! in_array ("/", $this->allow))
      $this->allow[] = "/";
    return $this;
  }
  // }}}

  /** Return TRUE if the provided URL can be used against the robots.txt
   * definition or FALSE if it is not the case
   * @param string $url The URL to check
   * @return boolean The result of the test
   */
  public function URLAllow ($url)
  // {{{
  {
    $parse = parse_url ($url);
    $path = (isset ($parse["path"])) ? $parse["path"] : "/";
    // Robots.txt files are always allowed
    if ($path === "/robots.txt")
      return true;
    $bestDisallow = -1;
    $bestAllow = -1;
    $allowRule = "";
    $disallowRule = "";
    foreach ($this->disallow as $partial)
    {
      if (strpos ($partial, "*") === false)
      {
        // Literal rule : the longest matching prefix wins
        if (substr ($path, 0, strlen ($partial)) === $partial &&
            $bestDisallow < strlen ($partial))
        {
          $bestDisallow = strlen ($partial);
          $disallowRule = $partial;
        }
      }
      else
      {
        // Wildcard rule : convert '*' to a regular expression
        $partial = str_replace ("*", ".+", $partial);
        if (preg_match ("#$partial#", $path) === 1)
        {
          $bestDisallow = 255;
          $disallowRule = $partial;
        }
      }
    }
    foreach ($this->allow as $partial)
    {
      if (strpos ($partial, "*") === false)
      {
        if (substr ($path, 0, strlen ($partial)) === $partial &&
            $bestAllow < strlen ($partial))
        {
          $bestAllow = strlen ($partial);
          $allowRule = $partial;
        }
      }
      else
      {
        $partial = str_replace ("*", ".+", $partial);
        if (preg_match ("#$partial#", $path) === 1)
        {
          $bestAllow = 255;
          $allowRule = $partial;
        }
      }
    }
    // The longest (or wildcard) Allow rule wins over the Disallow ones
    if ($bestAllow > $bestDisallow)
    {
      $this->matchRule = $allowRule;
      return true;
    }
    $this->matchRule = $disallowRule;
    return false;
  }
  // }}}

  // GETTERS
  /** Return the lines where an error occurred
   * The key of the array is the line number with the fault
   * @return array The errors
   */
  public function errors ()
  // {{{
  {
    return $this->errors;
  }
  // }}}

  /** Return the allowed urls
   * @return array $allow The array of allow rules
   */
  public function allow ()
  // {{{
  {
    return $this->allow;
  }
  // }}}

  /** Return the disallowed urls
   * @return array $disallow The array of disallow rules
   */
  public function disallow ()
  // {{{
  {
    return $this->disallow;
  }
  // }}}

  /** Return the sitemaps url
   * @return array $sitemap The array of sitemaps URL
   */
  public function sitemaps ()
  // {{{
  {
    return $this->sitemaps;
  }
  // }}}

  /** Return the crawldelay
   * @return integer $crawldelay The crawlDelay defined in robots.txt
   */
  public function crawldelay ()
  // {{{
  {
    return $this->crawldelay;
  }
  // }}}

  /** Return the host
   * @return string $host The Host string defined in robots.txt
   */
  public function host ()
  // {{{
  {
    return $this->host;
  }
  // }}}

  /** Return the matchRule
   * @return string $matchRule The rule matching the last URLAllow test
   */
  public function matchRule ()
  // {{{
  {
    return $this->matchRule;
  }
  // }}}

  // PRIVATE METHODS
  /** Get a line from the robots.txt file and return the associated value.
   * Handle any comment present on the line
   * @param string $line The complete line from the robots.txt file
   * @return string The value recorded on the line
   */
  private function getValueFromLine ($line)
  // {{{
  {
    preg_match_all ("#^(?P<key>\S+):\s*(?P<value>\S*)\s*".
                    "(\#\s*(?P<comment>.+)\s*)?\$#", $line, $matches);
    return $matches["value"][0];
  }
  // }}}
}
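
// Usage sketch (illustrative only, not part of the class) : assuming
// $content holds a robots.txt body fetched elsewhere and the crawler
// identifies itself as "DomSearch", the public API above could be driven
// like this :
//
//   $robots = new robotstxt ($content, "DomSearch");
//   if ($robots->URLAllow ("https://example.org/private/page.html"))
//     sleep ($robots->crawldelay ());   // honour the Crawl-delay before fetching
//   else
//     error_log ("Blocked by rule " . $robots->matchRule ());
//   foreach ($robots->sitemaps () as $url)
//     error_log ("Sitemap found : " . $url);
//   foreach ($robots->errors () as $lineNumber => $message)
//     error_log ("robots.txt line $lineNumber : $message");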