diff --git a/robotstxt.php b/robotstxt.php
new file mode 100644
index 0000000..cb2ad26
--- /dev/null
+++ b/robotstxt.php
@@ -0,0 +1,241 @@
+<?php
+/** robots.txt analysis for the DomSearch crawler
+ */
+
+namespace models;
+
+/** This class analyzes the provided robots.txt file content and gives access
+ * to the data configured for DomSearch.
+ * It can examine a URL against the robots.txt rules and tell whether the URL
+ * is allowed to be crawled or not
+ */
+class robots
+{
+  // PROPERTIES
+  // {{{
+  /** The crawler name wanted in robots.txt
+   */
+  private $crawlerName = "DomSearch";
+
+  /** The allowed URLs
+   */
+  private $allow = array ();
+
+  /** The disallowed URLs
+   */
+  private $disallow = array ();
+
+  /** The sitemap URLs defined in the file
+   */
+  private $sitemaps = array ();
+
+  /** The crawl delay defined in the file (3s if not defined)
+   */
+  private $crawldelay = 3;
+
+  /** The host that can be specified as the default website
+   */
+  private $host = null;
+
+  /** The rule that matched during the last URLAllow() call
+   */
+  private $matchRule = null;
+  // }}}
+
+  // METHODS
+  /** Take the robots.txt file content and analyze it
+   * @param string $content The robots.txt file content to analyze
+   * @return $this
+   */
+  public function __construct ($content)
+  // {{{
+  {
+    if (trim ($content) === "")
+    {
+      $this->allow = array ("/");
+      return $this;
+    }
+    $crawler = "";
+    $blocks = explode ("\n\n", $content);
+    foreach ($blocks as $block)
+    {
+      preg_match_all ("#User-agent:\s+(.+)\$#mi", $block, $useragents);
+      if (empty ($useragents[1]))
+        continue;
+      if (! in_array ("*", $useragents[1]) &&
+          ! in_array ($this->crawlerName, $useragents[1]))
+        continue;
+      if (in_array ("*", $useragents[1]))
+      {
+        // Rules for DomSearch take precedence over the generic block:
+        // skip it if they were already recorded
+        if ($crawler == $this->crawlerName)
+          continue;
+        $crawler = "*";
+        $this->allow = array ();
+        $this->disallow = array ();
+        $this->crawldelay = 3;
+      }
+      if (in_array ($this->crawlerName, $useragents[1]))
+      {
+        // If the information for DomSearch was already seen, skip this
+        // second block for the same crawler
+        if ($crawler == $this->crawlerName)
+          continue;
+        $crawler = $this->crawlerName;
+        $this->allow = array ();
+        $this->disallow = array ();
+        $this->crawldelay = 3;
+      }
+      preg_match_all ("#^Allow:\s*(\S*)\s*\$#mUi", $block, $allows);
+      preg_match_all ("#^Disallow:\s*(\S*)\s*\$#mUi", $block, $disallows);
+      preg_match_all ("#^Crawl-delay:\s*(\d+)\s*#mi", $block, $crawldelay);
+      if (isset ($allows[1]))
+        $this->allow = $allows[1];
+      if (isset ($disallows[1]))
+        $this->disallow = $disallows[1];
+      if (isset ($crawldelay[1][0]) && $crawldelay[1][0] > 0)
+        $this->crawldelay = intval ($crawldelay[1][0]);
+    }
+    preg_match_all ("#^Sitemap:\s*(\S+)\s*\$#mi", $content, $sitemaps);
+    if (isset ($sitemaps[1][0]))
+      $this->sitemaps = $sitemaps[1];
+    preg_match_all ("#^Host:\s*(\S+)\s*\$#mi", $content, $host);
+    if (isset ($host[1][0]))
+      $this->host = $host[1][0];
+    if (! in_array ("/", $this->disallow) &&
+        ! in_array ("/", $this->allow))
+      $this->allow[] = "/";
+    return $this;
+  }
+  // }}}
+
+  /** Return TRUE if the provided URL is allowed by the robots.txt
+   * definition, or FALSE if it is not the case
+   * @param string $url The URL to check
+   * @return boolean The result of the test
+   */
+  public function URLAllow ($url)
+  // {{{
+  {
+    $parse = parse_url ($url);
+    $path = (isset ($parse["path"])) ?
$parse["path"] : "/"; + $bestDisallow = -1; + $bestAllow = -1; + $allowRule = ""; + $disallowRule = ""; + foreach ($this->disallow as $partial) + { + if (strpos ($partial, "*") === false) + { + if (substr ($path, 0, strlen ($partial)) === $partial && + $bestDisallow < strlen ($partial)) + { + $bestDisallow = strlen ($partial); + $disallowRule = $partial; + } + } + else + { + $partial = str_replace ("*", ".+", $partial); + if (preg_match ("#$partial#", $path) === 1) + { + $bestDisallow = 255; + $disallowRule = $partial; + } + } + } + foreach ($this->allow as $partial) + { + if (strpos ($partial, "*") === false) + { + if (substr ($path, 0, strlen ($partial)) === $partial && + $bestAllow < strlen ($partial)) + { + $bestAllow = strlen ($partial); + $allowRule = $partial; + } + } + else + { + $partial = str_replace ("*", ".+", $partial); + if (preg_match ("#$partial#", $path) === 1) + { + $bestAllow = 255; + $allowRule = $partial; + } + } + } + if ($bestAllow < $bestDisallow) + { + $this->matchRule = $disallowRule; + return false; + } + $this->matchRule = $allowRule; + return true; + } + // }}} + + // GETTERS + /** Return the allowed urls + * @return array $allow The array of allow rules + */ + public function allow () + // {{{ + { + return $this->allow; + } + // }}} + + /** Return the disallowed urls + * @return array $disallow The array of disallow rules + */ + public function disallow () + // {{{ + { + return $this->disallow; + } + // }}} + + /** Return the sitemaps url + * @return array $sitemap The array of sitemaps URL + */ + public function sitemaps () + // {{{ + { + return $this->sitemaps; + } + // }}} + + /** Return the crawldelay + * @return integer $crawldelay The crawlDelay defined in robots.txt + */ + public function crawldelay () + // {{{ + { + return $this->crawldelay; + } + // }}} + + /** Return the host + * @return string $host The Host string defined in robots.txt + */ + public function host () + // {{{ + { + return $this->host; + } + // }}} + + /** Return the matchRule + * @return string $matchRule The matchRule matching the URLAllow test + */ + public function matchRule () + // {{{ + { + return $this->matchRule; + } + // }}} +}