diff --git a/Tests/RobotstxtTest.php b/Tests/RobotstxtTest.php index 8b494a5..fe5341d 100644 --- a/Tests/RobotstxtTest.php +++ b/Tests/RobotstxtTest.php @@ -15,25 +15,25 @@ use Domframework\Robotstxt; class RobotstxtTest extends \PHPUnit_Framework_TestCase { // Empty Robots - public function test_Construct_1() + public function testConstruct1() { $robotstxt = new Robotstxt("", "domsearch"); $res = $robotstxt->allow(); $this->assertSame($res, ["/"]); } - public function test_Construct_2() + public function testConstruct2() { $robotstxt = new Robotstxt("", "domsearch"); $res = $robotstxt->disallow(); $this->assertSame($res, array ()); } - public function test_Construct_3() + public function testConstruct3() { $robotstxt = new Robotstxt("", "domsearch"); $res = $robotstxt->sitemaps(); $this->assertSame($res, array ()); } - public function test_Construct_4() + public function testConstruct4() { $robotstxt = new Robotstxt("", "domsearch"); $res = $robotstxt->crawldelay(); @@ -41,7 +41,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase } // Allow - public function test_allow_1() + public function testAllow1() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow:\n", @@ -50,7 +50,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase $res = $robotstxt->allow(); $this->assertSame($res, ["/"]); } - public function test_allow_2() + public function testAllow2() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow:\n\nUser-Agent: DomSearch\nDisallow:\n", @@ -59,7 +59,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase $res = $robotstxt->allow(); $this->assertSame($res, ["/"]); } - public function test_allow_3() + public function testAllow3() { $robotstxt = new Robotstxt( "User-Agent: DomSearch\nDisallow:\n\nUser-Agent: *\nDisallow:\n", @@ -68,7 +68,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase $res = $robotstxt->allow(); $this->assertSame($res, ["/"]); } - public function test_allow_4() + public function testAllow4() { $robotstxt = new Robotstxt( "User-Agent: DomSearch\n" . @@ -84,7 +84,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase } // Disallow - public function test_disallow_1() + public function testDisallow1() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow: /\n", @@ -93,7 +93,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase $res = $robotstxt->disallow(); $this->assertSame($res, ["/"]); } - public function test_disallow_2() + public function testDisallow2() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow: /\n\nUser-Agent: DomSearch\nDisallow: /\n", @@ -102,7 +102,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase $res = $robotstxt->disallow(); $this->assertSame($res, ["/"]); } - public function test_disallow_3() + public function testDisallow3() { $robotstxt = new Robotstxt( "User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n", @@ -113,7 +113,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase } // Sitemaps - public function test_sitemaps_1() + public function testSitemaps1() { $robotstxt = new Robotstxt( "User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n", @@ -122,7 +122,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase $res = $robotstxt->sitemaps(); $this->assertSame($res, []); } - public function test_sitemaps_2() + public function testSitemaps2() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow: /\nSitemap: http://example.com/sitemap.xml", @@ -131,7 +131,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase $res = $robotstxt->sitemaps(); $this->assertSame($res, ["http://example.com/sitemap.xml"]); } - public function test_sitemaps_3() + public function testSitemaps3() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow: /\n" . @@ -146,7 +146,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase ); } - public function test_sitemaps_error_1() + public function testSitemapsError1() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow: /\nSitemap: URL", @@ -157,7 +157,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase } // Host - public function test_host_1() + public function testHost1() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow: /\n", @@ -166,7 +166,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase $res = $robotstxt->host(); $this->assertSame($res, null); } - public function test_host_2() + public function testHost2() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow: /\n\nHost: localhost", @@ -175,7 +175,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase $res = $robotstxt->host(); $this->assertSame($res, "localhost"); } - public function test_host_error_1() + public function testHostError1() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow: /\n\nHost: localhost\nHoST: toto", @@ -184,7 +184,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase $res = $robotstxt->host(); $this->assertSame($res, "localhost"); } - public function test_host_error_2() + public function testHostError2() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow: /\n\nHost: localhost\nHoST: toto", @@ -195,13 +195,13 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase } // URLAllow - public function test_urlallow_1() + public function testUrlallow1() { $robotstxt = new Robotstxt("", "domsearch"); $res = $robotstxt->URLAllow("/"); $this->assertSame($res, true); } - public function test_urlallow_2() + public function testUrlallow2() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow: /", @@ -210,7 +210,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase $res = $robotstxt->URLAllow("/"); $this->assertSame($res, false); } - public function test_urlallow_3() + public function testUrlallow3() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow: /\nAllow: /allow/", @@ -219,7 +219,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase $res = $robotstxt->URLAllow("/"); $this->assertSame($res, false); } - public function test_urlallow_4() + public function testUrlallow4() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow: /\nAllow: /allow/", @@ -228,7 +228,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase $res = $robotstxt->URLAllow("/allow/file"); $this->assertSame($res, true); } - public function test_urlallow_5() + public function testUrlallow5() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$", @@ -237,7 +237,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase $res = $robotstxt->URLAllow("/allow/file.gif"); $this->assertSame($res, true); } - public function test_urlallow_6() + public function testUrlallow6() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$", @@ -246,7 +246,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase $res = $robotstxt->URLAllow("/allow/.gif"); $this->assertSame($res, false); } - public function test_urlallow_7() + public function testUrlallow7() { $robotstxt = new Robotstxt( "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif\$", @@ -257,7 +257,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase } // Tests like http://www.robotstxt.org/norobots-rfc.txt - public function test_rfc_unhipbot_1() + public function testRfcUnhipbot1() { // {{{ $robotstxt = new Robotstxt( @@ -285,7 +285,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase false ); } - public function test_rfc_unhipbot_2() + public function testRfcUnhipbot2() { // {{{ $robotstxt = new Robotstxt( @@ -313,7 +313,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase false ); } - public function test_rfc_unhipbot_3() + public function testRfcUnhipbot3() { // {{{ $robotstxt = new Robotstxt( @@ -341,7 +341,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_unhipbot_4() + public function testRfcUnhipbot4() { // {{{ $robotstxt = new Robotstxt( @@ -369,7 +369,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase false ); } - public function test_rfc_unhipbot_5() + public function testRfcUnhipbot5() { // {{{ $robotstxt = new Robotstxt( @@ -397,7 +397,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase false ); } - public function test_rfc_unhipbot_6() + public function testRfcUnhipbot6() { // {{{ $robotstxt = new Robotstxt( @@ -425,7 +425,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase false ); } - public function test_rfc_unhipbot_7() + public function testRfcUnhipbot7() { // {{{ $robotstxt = new Robotstxt( @@ -453,7 +453,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase false ); } - public function test_rfc_unhipbot_8() + public function testRfcUnhipbot8() { // {{{ $robotstxt = new Robotstxt( @@ -481,7 +481,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase false ); } - public function test_rfc_unhipbot_9() + public function testRfcUnhipbot9() { // {{{ $robotstxt = new Robotstxt( @@ -509,7 +509,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase false ); } - public function test_rfc_unhipbot_10() + public function testRfcUnhipbot10() { // {{{ $robotstxt = new Robotstxt( @@ -537,7 +537,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase false ); } - public function test_rfc_unhipbot_11() + public function testRfcUnhipbot11() { // {{{ $robotstxt = new Robotstxt( @@ -565,7 +565,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase false ); } - public function test_rfc_webcrawler_1() + public function testRfcWebcrawler1() { // {{{ $robotstxt = new Robotstxt( @@ -593,7 +593,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_webcrawler_2() + public function testRfcWebcrawler2() { // {{{ $robotstxt = new Robotstxt( @@ -621,7 +621,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_webcrawler_3() + public function testRfcWebcrawler3() { // {{{ $robotstxt = new Robotstxt( @@ -649,7 +649,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_webcrawler_4() + public function testRfcWebcrawler4() { // {{{ $robotstxt = new Robotstxt( @@ -677,7 +677,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_webcrawler_5() + public function testRfcWebcrawler5() { // {{{ $robotstxt = new Robotstxt( @@ -705,7 +705,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_webcrawler_6() + public function testRfcWebcrawler6() { // {{{ $robotstxt = new Robotstxt( @@ -733,7 +733,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_webcrawler_7() + public function testRfcWebcrawler7() { // {{{ $robotstxt = new Robotstxt( @@ -761,7 +761,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_webcrawler_8() + public function testRfcWebcrawler8() { // {{{ $robotstxt = new Robotstxt( @@ -789,7 +789,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_webcrawler_9() + public function testRfcWebcrawler9() { // {{{ $robotstxt = new Robotstxt( @@ -817,7 +817,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_webcrawler_10() + public function testRfcWebcrawler10() { // {{{ $robotstxt = new Robotstxt( @@ -845,7 +845,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_webcrawler_11() + public function testRfcWebcrawler11() { // {{{ $robotstxt = new Robotstxt( @@ -873,7 +873,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_excite_1() + public function testRfcExcite1() { // {{{ $robotstxt = new Robotstxt( @@ -901,7 +901,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_excite_2() + public function testRfcExcite2() { // {{{ $robotstxt = new Robotstxt( @@ -929,7 +929,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_excite_3() + public function testRfcExcite3() { // {{{ $robotstxt = new Robotstxt( @@ -957,7 +957,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_excite_4() + public function testRfcExcite4() { // {{{ $robotstxt = new Robotstxt( @@ -985,7 +985,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_excite_5() + public function testRfcExcite5() { // {{{ $robotstxt = new Robotstxt( @@ -1013,7 +1013,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_excite_6() + public function testRfcExcite6() { // {{{ $robotstxt = new Robotstxt( @@ -1041,7 +1041,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_excite_7() + public function testRfcExcite7() { // {{{ $robotstxt = new Robotstxt( @@ -1069,7 +1069,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_excite_8() + public function testRfcExcite8() { // {{{ $robotstxt = new Robotstxt( @@ -1097,7 +1097,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_excite_9() + public function testRfcExcite9() { // {{{ $robotstxt = new Robotstxt( @@ -1125,7 +1125,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_excite_10() + public function testRfcExcite10() { // {{{ $robotstxt = new Robotstxt( @@ -1153,7 +1153,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_excite_11() + public function testRfcExcite11() { // {{{ $robotstxt = new Robotstxt( @@ -1181,7 +1181,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_other_1() + public function testRfcOther1() { // {{{ $robotstxt = new Robotstxt( @@ -1206,7 +1206,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase // }}} $this->assertSame($robotstxt->URLAllow("http://www.fict.org/"), false); } - public function test_rfc_other_2() + public function testRfcOther2() { // {{{ $robotstxt = new Robotstxt( @@ -1234,7 +1234,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase false ); } - public function test_rfc_other_3() + public function testRfcOther3() { // {{{ $robotstxt = new Robotstxt( @@ -1262,7 +1262,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_other_4() + public function testRfcOther4() { // {{{ $robotstxt = new Robotstxt( @@ -1290,7 +1290,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_other_5() + public function testRfcOther5() { // {{{ $robotstxt = new Robotstxt( @@ -1318,7 +1318,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_other_6() + public function testRfcOther6() { // {{{ $robotstxt = new Robotstxt( @@ -1346,7 +1346,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_other_7() + public function testRfcOther7() { // {{{ $robotstxt = new Robotstxt( @@ -1374,7 +1374,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase false ); } - public function test_rfc_other_8() + public function testRfcOther8() { // {{{ $robotstxt = new Robotstxt( @@ -1402,7 +1402,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } - public function test_rfc_other_9() + public function testRfcOther9() { // {{{ $robotstxt = new Robotstxt( @@ -1430,7 +1430,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase false ); } - public function test_rfc_other_10() + public function testRfcOther10() { // {{{ $robotstxt = new Robotstxt( @@ -1458,7 +1458,7 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase false ); } - public function test_rfc_other_11() + public function testRfcOther11() { // {{{ $robotstxt = new Robotstxt( @@ -1486,4 +1486,49 @@ class RobotstxtTest extends \PHPUnit_Framework_TestCase true ); } + + // Allow/Disallow must start by slash or be empty + public function testAllowDisallowSlash1() + { + $robotstxt = new Robotstxt( + "User-Agent: *\n" . + "Disallow: INVALID\n\n", + "domsearch" + ); + $res = $robotstxt->errors(); + $this->assertSame($res, [1 => "Disallow : Line must start by slash"]); + } + + public function testAllowDisallowSlash2() + { + $robotstxt = new Robotstxt( + "User-Agent: *\n" . + "Disallow: \n\n", + "domsearch" + ); + $res = $robotstxt->errors(); + $this->assertSame($res, []); + } + + public function testAllowDisallowSlash3() + { + $robotstxt = new Robotstxt( + "User-Agent: *\n" . + "Allow: INVALID\n\n", + "domsearch" + ); + $res = $robotstxt->errors(); + $this->assertSame($res, [1 => "Allow : Line must start by slash"]); + } + + public function testAllowDisallowSlash4() + { + $robotstxt = new Robotstxt( + "User-Agent: *\n" . + "Allow: \n\n", + "domsearch" + ); + $res = $robotstxt->errors(); + $this->assertSame($res, []); + } } diff --git a/src/Robotstxt.php b/src/Robotstxt.php index c2606a9..2b4b1cd 100644 --- a/src/Robotstxt.php +++ b/src/Robotstxt.php @@ -13,6 +13,7 @@ namespace Domframework; * It allow to examine an URL against the robots.txt file and return if the URL * is allowed to be used or not * The definition of the format of robots.txt file is available here : + * https://www.rfc-editor.org/rfc/rfc9309.txt * http://www.robotstxt.org/norobots-rfc.txt * https://en.wikipedia.org/wiki/Robots_exclusion_standard */ @@ -94,15 +95,15 @@ class Robotstxt // Get the Allow and Disallow lines. The stop will arrive on first // User-Agent line arriving after a Allow/Disallow. // Comments and empty lines are removed - for ($i = key($keys); $i < count($content); $i++) { - $line = trim($content[$i]); + for ($nb = key($keys); $nb < count($content); $nb++) { + $line = trim($content[$nb]); if (stripos($line, "Sitemap:") === 0) { // Already managed in the general parser. Not needed in the specific // user-agent parser. Must at least be catched to not generate an // error } elseif (stripos($line, "Host:") === 0) { if ($this->host !== null) { - $this->errors[$i] = dgettext( + $this->errors[$nb] = dgettext( "domframework", "Multiple Hosts set" ); @@ -112,15 +113,31 @@ class Robotstxt } elseif ($line === "" || $line[0] === "#") { // Comment, empty line : SKIP } elseif (stripos($line, "allow:") === 0) { - $this->allow[] = $this->getValueFromLine($line); + $allow = $this->getValueFromLine($line); + if (trim($allow) === "" || $allow[0] === "/") { + $this->allow[] = $allow; + } else { + $this->errors[$nb] = dgettext( + "domframework", + "Allow : Line must start by slash" + ); + } } elseif (stripos($line, "disallow:") === 0) { - $this->disallow[] = $this->getValueFromLine($line); + $disallow = $this->getValueFromLine($line); + if (trim($disallow) === "" || $disallow[0] === "/") { + $this->disallow[] = $disallow; + } else { + $this->errors[$nb] = dgettext( + "domframework", + "Disallow : Line must start by slash" + ); + } } elseif (stripos($line, "crawl-delay:") === 0) { $val = $this->getValueFromLine($line); if ($val > 1 && $val < 60 && $this->crawldelay === null) { $this->crawldelay = intval($val); } else { - $this->errors[$i] = dgettext( + $this->errors[$nb] = dgettext( "domframework", "Crawldelay : value out of range (1-60)" ); @@ -134,7 +151,7 @@ class Robotstxt } } else { // Not managed line : error - $this->errors[$i] = sprintf(dgettext( + $this->errors[$nb] = sprintf(dgettext( "domframework", "Invalid line : unknown command : '%s'" ), $line);