*/ namespace models;
/** This class analyzes the content of a robots.txt file and exposes the
 * data configured for the DomSearch crawler.
 * It can check a URL against the robots.txt rules and report whether that
 * URL is allowed to be crawled or not */
class robots
{
  // PROPERTIES // {{{
  /** The crawler name looked up in robots.txt */
  private $crawlerName = "DomSearch";
  /** The Allow rules */
  private $allow = array ();
  /** The Disallow rules */
  private $disallow = array ();
  /** The sitemap URLs defined in the file */
  private $sitemaps = array ();
  /** The crawl delay defined in the file (3 s when not defined) */
  private $crawldelay = 3;
  /** The Host directive, naming the default website when present */
  private $host = null;
  /** The rule that matched during the last URLAllow() call */
  private $matchRule = null;
  // }}}

  // METHODS
  /** Parse the provided robots.txt file content
   * @param string $content The robots.txt file content to analyze
   */
  public function __construct ($content) // {{{
  {
    if (trim ($content) === "") {
      // An empty file allows everything
      $this->allow = array ("/");
      return;
    }
    // Normalize line endings so the blank-line block split works on CRLF files
    $content = str_replace ("\r\n", "\n", $content);
    $crawler = "";
    $blocks = explode ("\n\n", $content);
    foreach ($blocks as $block) {
      preg_match_all ("#^User-agent:\s+(.+)\$#mi", $block, $useragents);
      if (empty ($useragents[1]))
        continue;
      if (! in_array ("*", $useragents[1]) &&
          ! in_array ($this->crawlerName, $useragents[1]))
        continue;
      if (in_array ("*", $useragents[1])) {
        // A section for the specific crawler takes precedence over the
        // wildcard one, so keep the rules already loaded for it
        if ($crawler == $this->crawlerName)
          continue;
        $crawler = "*";
        $this->allow = array ();
        $this->disallow = array ();
        $this->crawldelay = 3;
      }
      if (in_array ($this->crawlerName, $useragents[1])) {
        // If a section for the crawler was already seen, skip any later one
        if ($crawler == $this->crawlerName)
          continue;
        $crawler = $this->crawlerName;
        $this->allow = array ();
        $this->disallow = array ();
        $this->crawldelay = 3;
      }
      preg_match_all ("#^Allow:\s*(\S*)\s*\$#mUi", $block, $allows);
      preg_match_all ("#^Disallow:\s*(\S*)\s*\$#mUi", $block, $disallows);
      preg_match_all ("#^Crawl-delay:\s*(\d+)\s*#mi", $block, $crawldelay);
      if (isset ($allows[1]))
        $this->allow = $allows[1];
      if (isset ($disallows[1]))
        $this->disallow = $disallows[1];
      if (isset ($crawldelay[1][0]) && $crawldelay[1][0] > 0)
        $this->crawldelay = intval ($crawldelay[1][0]);
    }
    // Sitemap and Host apply to the whole file, not to a single section
    preg_match_all ("#^Sitemap:\s*(\S+)\s*\$#mi", $content, $sitemaps);
    if (isset ($sitemaps[1][0]))
      $this->sitemaps = $sitemaps[1];
    preg_match_all ("#^Host:\s*(\S+)\s*\$#mi", $content, $host);
    if (isset ($host[1][0]))
      $this->host = $host[1][0];
    // Default to allowing the root when no rule mentions it
    if (! in_array ("/", $this->disallow) && ! in_array ("/", $this->allow))
      $this->allow[] = "/";
  } // }}}
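  /* Illustrative sketch of what the constructor extracts (the rules and URL
   * below are made up for the example, not taken from a real site). Given
   * this content, the DomSearch section overrides the wildcard one, so
   * allow() returns array ("/"), disallow() returns array ("/private/"),
   * crawldelay() returns 10, sitemaps() returns
   * array ("https://example.com/sitemap.xml") and host() stays null:
   *
   *   User-agent: *
   *   Disallow: /
   *
   *   User-agent: DomSearch
   *   Allow: /
   *   Disallow: /private/
   *   Crawl-delay: 10
   *
   *   Sitemap: https://example.com/sitemap.xml
   *
   * Note that sections must be separated by a blank line, since blocks are
   * split on "\n\n".
   */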
  /** Return TRUE when the provided URL is allowed by the robots.txt rules,
   * FALSE when it is disallowed
   * @param string $url The URL to check
   * @return boolean The result of the test
   */
  public function URLAllow ($url) // {{{
  {
    $parse = parse_url ($url);
    $path = (isset ($parse["path"])) ? $parse["path"] : "/";
    $bestDisallow = -1;
    $bestAllow = -1;
    $allowRule = "";
    $disallowRule = "";
    foreach ($this->disallow as $partial) {
      if (strpos ($partial, "*") === false) {
        // Plain rule: the longest matching prefix wins
        if (substr ($path, 0, strlen ($partial)) === $partial &&
            $bestDisallow < strlen ($partial)) {
          $bestDisallow = strlen ($partial);
          $disallowRule = $partial;
        }
      } else {
        // Wildcard rule: "*" matches any sequence of characters; the rule is
        // quoted and anchored at the start of the path, and a wildcard match
        // is treated as the most specific one
        $pattern = "#^" . str_replace ("\\*", ".*", preg_quote ($partial, "#")) . "#";
        if (preg_match ($pattern, $path) === 1) {
          $bestDisallow = 255;
          $disallowRule = $partial;
        }
      }
    }
    foreach ($this->allow as $partial) {
      if (strpos ($partial, "*") === false) {
        if (substr ($path, 0, strlen ($partial)) === $partial &&
            $bestAllow < strlen ($partial)) {
          $bestAllow = strlen ($partial);
          $allowRule = $partial;
        }
      } else {
        $pattern = "#^" . str_replace ("\\*", ".*", preg_quote ($partial, "#")) . "#";
        if (preg_match ($pattern, $path) === 1) {
          $bestAllow = 255;
          $allowRule = $partial;
        }
      }
    }
    // The most specific rule wins; on a tie the Allow rule takes precedence
    if ($bestAllow < $bestDisallow) {
      $this->matchRule = $disallowRule;
      return false;
    }
    $this->matchRule = $allowRule;
    return true;
  } // }}}

  // GETTERS
  /** Return the Allow rules
   * @return array The array of allow rules
   */
  public function allow () // {{{
  {
    return $this->allow;
  } // }}}

  /** Return the Disallow rules
   * @return array The array of disallow rules
   */
  public function disallow () // {{{
  {
    return $this->disallow;
  } // }}}

  /** Return the sitemap URLs
   * @return array The array of sitemap URLs
   */
  public function sitemaps () // {{{
  {
    return $this->sitemaps;
  } // }}}

  /** Return the crawl delay
   * @return integer The crawl delay defined in robots.txt
   */
  public function crawldelay () // {{{
  {
    return $this->crawldelay;
  } // }}}

  /** Return the host
   * @return string The Host value defined in robots.txt
   */
  public function host () // {{{
  {
    return $this->host;
  } // }}}

  /** Return the rule matched by the last URLAllow() call
   * @return string The matching rule
   */
  public function matchRule () // {{{
  {
    return $this->matchRule;
  } // }}}
}
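/* Usage sketch (illustrative only; the URLs are made up). With the example
 * robots.txt shown above, "/private/page.html" is rejected because the
 * Disallow prefix "/private/" (length 9) is more specific than the Allow
 * rule "/" (length 1):
 *
 *   $robots = new \models\robots (file_get_contents ("https://example.com/robots.txt"));
 *   if ($robots->URLAllow ("https://example.com/private/page.html")) {
 *     // fetch the page, waiting $robots->crawldelay () seconds between requests
 *   } else {
 *     // skip it; $robots->matchRule () holds the Disallow rule that matched
 *   }
 */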