diff --git a/Tests/robotstxtTest.php b/Tests/robotstxtTest.php index 133e010..ae46f91 100644 --- a/Tests/robotstxtTest.php +++ b/Tests/robotstxtTest.php @@ -1,166 +1,181 @@ allow (); + $robotstxt = new robotstxt (""); + $res = $robotstxt->allow (); $this->assertSame ($res, ["/"]); } public function test_Construct_2 () { - $robots = new \models\robots (""); - $res = $robots->disallow (); + $robotstxt = new robotstxt (""); + $res = $robotstxt->disallow (); $this->assertSame ($res, array ()); } public function test_Construct_3 () { - $robots = new \models\robots (""); - $res = $robots->sitemaps (); + $robotstxt = new robotstxt (""); + $res = $robotstxt->sitemaps (); $this->assertSame ($res, array ()); } public function test_Construct_4 () { - $robots = new \models\robots (""); - $res = $robots->crawldelay (); + $robotstxt = new robotstxt (""); + $res = $robotstxt->crawldelay (); $this->assertSame ($res, 3); } // Allow public function test_allow_1 () { - $robots = new \models\robots ( + $robotstxt = new robotstxt ( "User-Agent: *\nDisallow:\n"); - $res = $robots->allow (); + $res = $robotstxt->allow (); $this->assertSame ($res, ["/"]); } public function test_allow_2 () { - $robots = new \models\robots ( + $robotstxt = new robotstxt ( "User-Agent: *\nDisallow:\n\nUser-Agent: DomSearch\nDisallow:\n"); - $res = $robots->allow (); + $res = $robotstxt->allow (); $this->assertSame ($res, ["/"]); } public function test_allow_3 () { - $robots = new \models\robots ( + $robotstxt = new robotstxt ( "User-Agent: DomSearch\nDisallow:\n\nUser-Agent: *\nDisallow:\n"); - $res = $robots->allow (); + $res = $robotstxt->allow (); + $this->assertSame ($res, ["/"]); + } + public function test_allow_4 () + { + $robotstxt = new robotstxt ( + "User-Agent: DomSearch\n". + "User-Agent: User1\n". + "User-Agent: User2\n". + "Disallow:\n\n". + "User-Agent: *\n". 
+ "Disallow: /\n"); + $res = $robotstxt->allow (); $this->assertSame ($res, ["/"]); } // Disallow public function test_disallow_1 () { - $robots = new \models\robots ( + $robotstxt = new robotstxt ( "User-Agent: *\nDisallow: /\n"); - $res = $robots->disallow (); + $res = $robotstxt->disallow (); $this->assertSame ($res, ["/"]); } public function test_disallow_2 () { - $robots = new \models\robots ( + $robotstxt = new robotstxt ( "User-Agent: *\nDisallow: /\n\nUser-Agent: DomSearch\nDisallow: /\n"); - $res = $robots->disallow (); + $res = $robotstxt->disallow (); $this->assertSame ($res, ["/"]); } public function test_disallow_3 () { - $robots = new \models\robots ( + $robotstxt = new robotstxt ( "User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n"); - $res = $robots->disallow (); + $res = $robotstxt->disallow (); $this->assertSame ($res, ["/"]); } // Sitemaps public function test_sitemaps_1 () { - $robots = new \models\robots ( + $robotstxt = new robotstxt ( "User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n"); - $res = $robots->sitemaps (); + $res = $robotstxt->sitemaps (); $this->assertSame ($res, []); } public function test_sitemaps_2 () { - $robots = new \models\robots ( - "User-Agent: *\nDisallow: /\nSitemap: /sitemap.xml"); - $res = $robots->sitemaps (); - $this->assertSame ($res, ["/sitemap.xml"]); + $robotstxt = new robotstxt ( + "User-Agent: *\nDisallow: /\nSitemap: http://example.com/sitemap.xml"); + $res = $robotstxt->sitemaps (); + $this->assertSame ($res, ["http://example.com/sitemap.xml"]); } public function test_sitemaps_3 () { - $robots = new \models\robots ( - "User-Agent: *\nDisallow: /\nSitemap: /sitemap.xml\nSitemap: /SITEMAP.XML"); - $res = $robots->sitemaps (); - $this->assertSame ($res, ["/sitemap.xml", "/SITEMAP.XML"]); + $robotstxt = new robotstxt ( + "User-Agent: *\nDisallow: /\n". + "Sitemap: http://example.com/sitemap.xml\n". 
+ "Sitemap: http://example.com/SITEMAP.XML"); + $res = $robotstxt->sitemaps (); + $this->assertSame ($res, + ["http://example.com/sitemap.xml", "http://example.com/SITEMAP.XML"]); } // Host public function test_host_1 () { - $robots = new \models\robots ( + $robotstxt = new robotstxt ( "User-Agent: *\nDisallow: /\n"); - $res = $robots->host (); + $res = $robotstxt->host (); $this->assertSame ($res, null); } public function test_host_2 () { - $robots = new \models\robots ( + $robotstxt = new robotstxt ( "User-Agent: *\nDisallow: /\n\nHost: localhost"); - $res = $robots->host (); + $res = $robotstxt->host (); $this->assertSame ($res, "localhost"); } // URLAllow public function test_urlallow_1 () { - $robots = new \models\robots (""); - $res = $robots->URLAllow ("/"); + $robotstxt = new robotstxt (""); + $res = $robotstxt->URLAllow ("/"); $this->assertSame ($res, true); } public function test_urlallow_2 () { - $robots = new \models\robots ( + $robotstxt = new robotstxt ( "User-Agent: *\nDisallow: /"); - $res = $robots->URLAllow ("/"); + $res = $robotstxt->URLAllow ("/"); $this->assertSame ($res, false); } public function test_urlallow_3 () { - $robots = new \models\robots ( + $robotstxt = new robotstxt ( "User-Agent: *\nDisallow: /\nAllow: /allow/"); - $res = $robots->URLAllow ("/"); + $res = $robotstxt->URLAllow ("/"); $this->assertSame ($res, false); } public function test_urlallow_4 () { - $robots = new \models\robots ( + $robotstxt = new robotstxt ( "User-Agent: *\nDisallow: /\nAllow: /allow/"); - $res = $robots->URLAllow ("/allow/file"); + $res = $robotstxt->URLAllow ("/allow/file"); $this->assertSame ($res, true); } public function test_urlallow_5 () { - $robots = new \models\robots ( + $robotstxt = new robotstxt ( "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$"); - $res = $robots->URLAllow ("/allow/file.gif"); + $res = $robotstxt->URLAllow ("/allow/file.gif"); $this->assertSame ($res, true); } public function test_urlallow_6 () { - $robots = new 
\models\robots ( + $robotstxt = new robotstxt ( "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$"); - $res = $robots->URLAllow ("/allow/.gif"); + $res = $robotstxt->URLAllow ("/allow/.gif"); $this->assertSame ($res, false); } public function test_urlallow_7 () { - $robots = new \models\robots ( + $robotstxt = new robotstxt ( "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif\$"); - $res = $robots->URLAllow ("/allow/file.png"); + $res = $robotstxt->URLAllow ("/allow/file.png"); $this->assertSame ($res, false); } } diff --git a/robotstxt.php b/robotstxt.php index efc801c..e93e634 100644 --- a/robotstxt.php +++ b/robotstxt.php @@ -8,6 +8,9 @@ * get the configured data for DomSearch. * It allow to examine an URL against the robots.txt file and return if the URL * is allowed to be used or not + * The definition of the format of robots.txt file is available here : + * http://www.robotstxt.org/norobots-rfc.txt + * https://en.wikipedia.org/wiki/Robots_exclusion_standard */ class robotstxt { @@ -55,54 +58,64 @@ class robotstxt $this->allow = array ("/"); return $this; } - $crawler = ""; - $blocks = explode ("\n\n", $content); - foreach ($blocks as $block) + // Look if the User-agent is available for the defined crawlerName. + // If Not check if the User-agent exists for * + // If not accept all + $content = preg_split('/\r\n|\r|\n/', $content); + $keys = preg_grep ("~User-agent:\s*$this->crawlerName\s*#?~i", $content); + if (empty ($keys)) + $keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content); + if (empty ($keys)) { - preg_match_all ("#User-agent:\s+(.+)\$#mi", $block, $useragents); - if (!isset ($useragents[1])) - continue; - if (! in_array ("*", $useragents[1]) && - ! 
in_array ("DomSearch", $useragents[1])) - continue; - if (in_array ("*", $useragents[1])) - { - // Already set - if ($crawler == "DomSearch") - continue; - $crawler = "*"; - $this->allow = array (); - $this->disallow = array (); - $this->crawldelay = 3; - } - if (in_array ("DomSearch", $useragents[1])) - - { - // If the information for DomSearch where already seen, skip the second - // crawler information - if ($crawler == "DomSearch") - continue; - $crawler = "DomSearch"; - $this->allow = array (); - $this->disallow = array (); - $this->crawldelay = 3; - } - preg_match_all ("#^Allow:\s*(\S*)\s*\$#mUi", $block, $allows); - preg_match_all ("#^Disallow:\s*(\S*)\s*\$#mUi", $block, $disallows); - preg_match_all ("#^Crawl-delay:\s*(\d+)\s*#mi", $block, $crawldelay); - if (isset ($allows[1])) - $this->allow = $allows[1]; - if (isset ($disallows[1])) - $this->disallow = $disallows[1]; - if (isset ($crawldelay[1][0]) && $crawldelay[1][0] > 0) - $this->crawldelay = intval ($crawldelay[1][0]); + // No User-agent with crawlerName nor * : accept all + $this->allow = array ("/"); + return $this; + } + // Get the Allow and Disallow lines. The stop will arrive on first + // User-Agent line arriving after a Allow/Disallow. + // Comments and empty lines are removed + for ($i = key ($keys) ; $i < count ($content) ; $i++) + { + $line = trim ($content[$i]); + if (strtolower (substr ($line, 0, 6)) === "allow:") + { + $this->allow[] = $this->getValueFromLine ($line); + } + elseif (strtolower (substr ($line, 0, 9)) === "disallow:") + { + $this->disallow[] = $this->getValueFromLine ($line); + } + elseif (strtolower (substr ($line, 0, 12)) === "crawl-delay:") + { + $val = $this->getValueFromLine ($line); + if ($val > 1 && $val < 60 && $this->crawldelay === 3) + $this->crawldelay = intval ($val); + } + elseif (strtolower (substr ($line, 0, 11)) === "user-agent:" && + (!empty ($this->allow) || ! 
empty ($this->disallow))) + { + // New user-agent line after valid allow/disallow : end of paragraph + break; + } + else + { + // Comment, empty line, sitemap, host, not managed line... : SKIP + } + } + $keys = preg_grep ("#^\s*Sitemap:\s*(?P<url>\S+)\s*(\#)?#i", $content); + foreach ($keys as $line) + { + $url = $this->getValueFromLine ($line); + if (!! filter_var ($url, FILTER_VALIDATE_URL)) + $this->sitemaps[] = $url; + } + $keys = preg_grep ("#^\s*Host:\s*(?P<host>\S+)\s*(\#)?#i", $content); + foreach ($keys as $line) + { + // Takes only the first one + $this->host = $this->getValueFromLine ($line); + break; } - preg_match_all ("#^Sitemap:\s*(\S+)\s*\$#mi", $content, $sitemaps); - if (isset ($sitemaps[1][0])) - $this->sitemaps = $sitemaps[1]; - preg_match_all ("#^Host:\s*(\S+)\s*\$#mi", $content, $host); - if (isset ($host[1][0])) - $this->host = $host[1][0]; if (! in_array ("/", $this->disallow) && ! in_array ("/", $this->allow)) $this->allow[] = "/"; @@ -236,4 +249,19 @@ class robotstxt return $this->matchRule; } // }}} + + // PRIVATE METHODS + /** Get a line from robots.txt file and return the associated value. + * Manage the eventual comments on the line + * @param string $line the complete line from robots.txt file + * @return string the value recorded on line + */ + private function getValueFromLine ($line) + // {{{ + { + preg_match_all ("#^(?P<key>\S+):\s*(?P<value>\S*)\s*". + "(\#\s*(?P<comment>.+)\s*)?\$#", $line, $matches); + return $matches["value"][0]; + } + // }}} }