From f574476427d6e49f4672c98d9e65f95554f78bdb Mon Sep 17 00:00:00 2001 From: Dominique Fournier Date: Tue, 11 Jun 2019 18:59:44 +0000 Subject: [PATCH] robotsTxt : update Tests git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5343 bf3deb0d-5f1a-0410-827f-c0cc1f45334c --- Tests/robotstxtTest.php | 1255 +++++++++++++++++++++++++++++++++++++-- robotstxt.php | 24 +- 2 files changed, 1226 insertions(+), 53 deletions(-) diff --git a/Tests/robotstxtTest.php b/Tests/robotstxtTest.php index ae46f91..4c62358 100644 --- a/Tests/robotstxtTest.php +++ b/Tests/robotstxtTest.php @@ -6,25 +6,29 @@ class test_model extends PHPUnit_Framework_TestCase // Empty Robots public function test_Construct_1 () { - $robotstxt = new robotstxt (""); + $robotstxt = new robotstxt (); + $robotstxt->analyze ("", "domsearch"); $res = $robotstxt->allow (); $this->assertSame ($res, ["/"]); } public function test_Construct_2 () { - $robotstxt = new robotstxt (""); + $robotstxt = new robotstxt (); + $robotstxt->analyze ("", "domsearch"); $res = $robotstxt->disallow (); $this->assertSame ($res, array ()); } public function test_Construct_3 () { - $robotstxt = new robotstxt (""); + $robotstxt = new robotstxt (); + $robotstxt->analyze ("", "domsearch"); $res = $robotstxt->sitemaps (); $this->assertSame ($res, array ()); } public function test_Construct_4 () { - $robotstxt = new robotstxt (""); + $robotstxt = new robotstxt (); + $robotstxt->analyze ("", "domsearch"); $res = $robotstxt->crawldelay (); $this->assertSame ($res, 3); } @@ -32,34 +36,40 @@ class test_model extends PHPUnit_Framework_TestCase // Allow public function test_allow_1 () { - $robotstxt = new robotstxt ( - "User-Agent: *\nDisallow:\n"); + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "User-Agent: *\nDisallow:\n", "domsearch"); $res = $robotstxt->allow (); $this->assertSame ($res, ["/"]); } public function test_allow_2 () { - $robotstxt = new robotstxt ( - "User-Agent: *\nDisallow:\n\nUser-Agent: DomSearch\nDisallow:\n"); 
+ $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "User-Agent: *\nDisallow:\n\nUser-Agent: DomSearch\nDisallow:\n", + "domsearch"); $res = $robotstxt->allow (); $this->assertSame ($res, ["/"]); } public function test_allow_3 () { - $robotstxt = new robotstxt ( - "User-Agent: DomSearch\nDisallow:\n\nUser-Agent: *\nDisallow:\n"); + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "User-Agent: DomSearch\nDisallow:\n\nUser-Agent: *\nDisallow:\n", + "domsearch"); $res = $robotstxt->allow (); $this->assertSame ($res, ["/"]); } public function test_allow_4 () { - $robotstxt = new robotstxt ( + $robotstxt = new robotstxt (); + $robotstxt->analyze ( "User-Agent: DomSearch\n". "User-Agent: User1\n". "User-Agent: User2\n". "Disallow:\n\n". "User-Agent: *\n". - "Disallow: /\n"); + "Disallow: /\n", "domsearch"); $res = $robotstxt->allow (); $this->assertSame ($res, ["/"]); } @@ -67,22 +77,27 @@ class test_model extends PHPUnit_Framework_TestCase // Disallow public function test_disallow_1 () { - $robotstxt = new robotstxt ( - "User-Agent: *\nDisallow: /\n"); + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "User-Agent: *\nDisallow: /\n", "domsearch"); $res = $robotstxt->disallow (); $this->assertSame ($res, ["/"]); } public function test_disallow_2 () { - $robotstxt = new robotstxt ( - "User-Agent: *\nDisallow: /\n\nUser-Agent: DomSearch\nDisallow: /\n"); + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "User-Agent: *\nDisallow: /\n\nUser-Agent: DomSearch\nDisallow: /\n", + "domsearch"); $res = $robotstxt->disallow (); $this->assertSame ($res, ["/"]); } public function test_disallow_3 () { - $robotstxt = new robotstxt ( - "User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n"); + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n", + "domsearch"); $res = $robotstxt->disallow (); $this->assertSame ($res, ["/"]); } @@ -90,24 +105,29 @@ class test_model 
extends PHPUnit_Framework_TestCase // Sitemaps public function test_sitemaps_1 () { - $robotstxt = new robotstxt ( - "User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n"); + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n", + "domsearch"); $res = $robotstxt->sitemaps (); $this->assertSame ($res, []); } public function test_sitemaps_2 () { - $robotstxt = new robotstxt ( - "User-Agent: *\nDisallow: /\nSitemap: http://example.com/sitemap.xml"); + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "User-Agent: *\nDisallow: /\nSitemap: http://example.com/sitemap.xml", + "domsearch"); $res = $robotstxt->sitemaps (); $this->assertSame ($res, ["http://example.com/sitemap.xml"]); } public function test_sitemaps_3 () { - $robotstxt = new robotstxt ( + $robotstxt = new robotstxt (); + $robotstxt->analyze ( "User-Agent: *\nDisallow: /\n". "Sitemap: http://example.com/sitemap.xml\n". - "Sitemap: http://example.com/SITEMAP.XML"); + "Sitemap: http://example.com/SITEMAP.XML", "domsearch"); $res = $robotstxt->sitemaps (); $this->assertSame ($res, ["http://example.com/sitemap.xml", "http://example.com/SITEMAP.XML"]); @@ -116,15 +136,17 @@ class test_model extends PHPUnit_Framework_TestCase // Host public function test_host_1 () { - $robotstxt = new robotstxt ( - "User-Agent: *\nDisallow: /\n"); + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "User-Agent: *\nDisallow: /\n", "domsearch"); $res = $robotstxt->host (); $this->assertSame ($res, null); } public function test_host_2 () { - $robotstxt = new robotstxt ( - "User-Agent: *\nDisallow: /\n\nHost: localhost"); + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "User-Agent: *\nDisallow: /\n\nHost: localhost", "domsearch"); $res = $robotstxt->host (); $this->assertSame ($res, "localhost"); } @@ -132,50 +154,1201 @@ class test_model extends PHPUnit_Framework_TestCase // URLAllow public function test_urlallow_1 () { - 
$robotstxt = new robotstxt (""); + $robotstxt = new robotstxt (); + $robotstxt->analyze ("", "domsearch"); $res = $robotstxt->URLAllow ("/"); $this->assertSame ($res, true); } public function test_urlallow_2 () { - $robotstxt = new robotstxt ( - "User-Agent: *\nDisallow: /"); + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "User-Agent: *\nDisallow: /", "domsearch"); $res = $robotstxt->URLAllow ("/"); $this->assertSame ($res, false); } public function test_urlallow_3 () { - $robotstxt = new robotstxt ( - "User-Agent: *\nDisallow: /\nAllow: /allow/"); + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "User-Agent: *\nDisallow: /\nAllow: /allow/", "domsearch"); $res = $robotstxt->URLAllow ("/"); $this->assertSame ($res, false); } public function test_urlallow_4 () { - $robotstxt = new robotstxt ( - "User-Agent: *\nDisallow: /\nAllow: /allow/"); + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "User-Agent: *\nDisallow: /\nAllow: /allow/", "domsearch"); $res = $robotstxt->URLAllow ("/allow/file"); $this->assertSame ($res, true); } public function test_urlallow_5 () { - $robotstxt = new robotstxt ( - "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$"); + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$", "domsearch"); $res = $robotstxt->URLAllow ("/allow/file.gif"); $this->assertSame ($res, true); } public function test_urlallow_6 () { - $robotstxt = new robotstxt ( - "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$"); + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$", "domsearch"); $res = $robotstxt->URLAllow ("/allow/.gif"); $this->assertSame ($res, false); } public function test_urlallow_7 () { - $robotstxt = new robotstxt ( - "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif\$"); + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif\$", "domsearch"); $res = 
$robotstxt->URLAllow ("/allow/file.png"); $this->assertSame ($res, false); } + + // Tests like http://www.robotstxt.org/norobots-rfc.txt + public function test_rfc_unhipbot_1 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "unhipbot" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/"), false); + } + public function test_rfc_unhipbot_2 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "unhipbot" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/index.html"), false); + } + public function test_rfc_unhipbot_3 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "unhipbot" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/robots.txt"), true); + } + public function test_rfc_unhipbot_4 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + 
Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "unhipbot" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/server.html"), false); + } + public function test_rfc_unhipbot_5 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "unhipbot" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/services/fast.html"), false); + } + public function test_rfc_unhipbot_6 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "unhipbot" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/services/slow.html"), false); + } + public function test_rfc_unhipbot_7 () + { + // {{{ + $robotstxt = new robotstxt (); $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "unhipbot" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/orgo.gif"), false); + } + public function test_rfc_unhipbot_8 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + 
User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "unhipbot" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/org/about.html"), false); + } + public function test_rfc_unhipbot_9 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "unhipbot" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/org/plans.html"), false); + } + public function test_rfc_unhipbot_10 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "unhipbot" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/~jim/jim.html"), false); + } + public function test_rfc_unhipbot_11 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "unhipbot" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/~mak/mak.html"), false); + } + public function test_rfc_webcrawler_1 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: 
/ + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "webcrawler" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/"), true); + } + public function test_rfc_webcrawler_2 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "webcrawler" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/index.html"), true); + } + public function test_rfc_webcrawler_3 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "webcrawler" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/robots.txt"), true); + } + public function test_rfc_webcrawler_4 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "webcrawler" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/server.html"), true); + } + public function test_rfc_webcrawler_5 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to 
webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "webcrawler" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/services/fast.html"), true); + } + public function test_rfc_webcrawler_6 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "webcrawler" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/services/slow.html"), true); + } + public function test_rfc_webcrawler_7 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "webcrawler" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/orgo.gif"), true); + } + public function test_rfc_webcrawler_8 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "webcrawler" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/org/about.html"), true); + } + public function test_rfc_webcrawler_9 () + { + // {{{ + $robotstxt = new robotstxt (); + 
$robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "webcrawler" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/org/plans.html"), true); + } + public function test_rfc_webcrawler_10 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "webcrawler" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/~jim/jim.html"), true); + } + public function test_rfc_webcrawler_11 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "webcrawler" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/~mak/mak.html"), true); + } + public function test_rfc_excite_1 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "excite" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/"), true); + } + public function test_rfc_excite_2 
() + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "excite" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/index.html"), true); + } + public function test_rfc_excite_3 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "excite" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/robots.txt"), true); + } + public function test_rfc_excite_4 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "excite" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/server.html"), true); + } + public function test_rfc_excite_5 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "excite" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/services/fast.html"), true); + 
} + public function test_rfc_excite_6 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "excite" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/services/slow.html"), true); + } + public function test_rfc_excite_7 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "excite" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/orgo.gif"), true); + } + public function test_rfc_excite_8 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "excite" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/org/about.html"), true); + } + public function test_rfc_excite_9 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "excite" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow 
("http://www.fict.org/org/plans.html"), true); + } + public function test_rfc_excite_10 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "excite" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/~jim/jim.html"), true); + } + public function test_rfc_excite_11 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "excite" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/~mak/mak.html"), true); + } + public function test_rfc_other_1 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "other" + ); + // }}} + $this->assertSame ($robotstxt->URLAllow ("http://www.fict.org/"), false); + } + public function test_rfc_other_2 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "other" + ); + // }}} + $this->assertSame 
( + $robotstxt->URLAllow ("http://www.fict.org/index.html"), false); + } + public function test_rfc_other_3 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "other" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/robots.txt"), true); + } + public function test_rfc_other_4 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "other" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/server.html"), true); + } + public function test_rfc_other_5 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "other" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/services/fast.html"), true); + } + public function test_rfc_other_6 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "other" + 
); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/services/slow.html"), true); + } + public function test_rfc_other_7 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "other" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/orgo.gif"), false); + } + public function test_rfc_other_8 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "other" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/org/about.html"), true); + } + public function test_rfc_other_9 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "other" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/org/plans.html"), false); + } + public function test_rfc_other_10 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + 
Allow: /~mak + Disallow: /", "other" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/~jim/jim.html"), false); + } + public function test_rfc_other_11 () + { + // {{{ + $robotstxt = new robotstxt (); + $robotstxt->analyze ( + "# /robots.txt for http://www.fict.org/ + # comments to webmaster@fict.org + + User-agent: unhipbot + Disallow: / + + User-agent: webcrawler + User-agent: excite + Disallow: + + User-agent: * + Disallow: /org/plans.html + Allow: /org/ + Allow: /serv + Allow: /~mak + Disallow: /", "other" + ); + // }}} + $this->assertSame ( + $robotstxt->URLAllow ("http://www.fict.org/~mak/mak.html"), true); + } } diff --git a/robotstxt.php b/robotstxt.php index e93e634..6cdd104 100644 --- a/robotstxt.php +++ b/robotstxt.php @@ -16,10 +16,6 @@ class robotstxt { // PROPERTIES // {{{ - /** The cralwer name wanted in robots.txt - */ - private $crawlerName = "DomSearch"; - /** The allowed urls */ private $allow = array (); @@ -40,7 +36,7 @@ class robotstxt */ private $host = null; - /** The rule matchine the URLAllow rule + /** The rule number matching the URLAllow rule */ private $matchRule = null; // }}} @@ -48,9 +44,10 @@ class robotstxt // METHODS /** Get the robots.txt file content and do the analyze * @param string $content The robots.txt file content to analyze + * @param string $crawlerName The crawler name to use in analyze * @return $this */ - public function __construct ($content) + public function analyze ($content, $crawlerName) // {{{ { if (trim ($content) === "") @@ -62,7 +59,7 @@ class robotstxt // If Not check if the User-agent axists for * // If not accept all $content = preg_split('/\r\n|\r|\n/', $content); - $keys = preg_grep ("~User-agent:\s*$this->crawlerName\s*#?~i", $content); + $keys = preg_grep ("~User-agent:\s*" . preg_quote ($crawlerName, "~") . "\s*#?~i", $content); if (empty ($keys)) $keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content); if (empty ($keys)) @@ -133,6 +130,9 @@ class robotstxt { $parse = parse_url ($url); $path
= (isset ($parse["path"])) ? $parse["path"] : "/"; + // Robots.txt files are always allowed + if ($path === "/robots.txt") + return true; $bestDisallow = -1; $bestAllow = -1; $allowRule = ""; @@ -179,13 +179,13 @@ class robotstxt } } } - if ($bestAllow < $bestDisallow) + if ($bestAllow > $bestDisallow) { - $this->matchRule = $disallowRule; - return false; + $this->matchRule = $allowRule; + return true; } - $this->matchRule = $allowRule; - return true; + $this->matchRule = $disallowRule; + return false; } // }}}