robotstxt : rewrite all the parser with the logic defined in RFC
git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5342 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
This commit is contained in:
@@ -1,166 +1,181 @@
|
|||||||
<?php
|
<?php
|
||||||
/** Test the models/robots.txt file
|
/** Test the robotstxt class
|
||||||
*/
|
*/
|
||||||
class test_model extends PHPUnit_Framework_TestCase
|
class test_model extends PHPUnit_Framework_TestCase
|
||||||
{
|
{
|
||||||
// Empty Robots
|
// Empty Robots
|
||||||
public function test_Construct_1 ()
|
public function test_Construct_1 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots ("");
|
$robotstxt = new robotstxt ("");
|
||||||
$res = $robots->allow ();
|
$res = $robotstxt->allow ();
|
||||||
$this->assertSame ($res, ["/"]);
|
$this->assertSame ($res, ["/"]);
|
||||||
}
|
}
|
||||||
public function test_Construct_2 ()
|
public function test_Construct_2 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots ("");
|
$robotstxt = new robotstxt ("");
|
||||||
$res = $robots->disallow ();
|
$res = $robotstxt->disallow ();
|
||||||
$this->assertSame ($res, array ());
|
$this->assertSame ($res, array ());
|
||||||
}
|
}
|
||||||
public function test_Construct_3 ()
|
public function test_Construct_3 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots ("");
|
$robotstxt = new robotstxt ("");
|
||||||
$res = $robots->sitemaps ();
|
$res = $robotstxt->sitemaps ();
|
||||||
$this->assertSame ($res, array ());
|
$this->assertSame ($res, array ());
|
||||||
}
|
}
|
||||||
public function test_Construct_4 ()
|
public function test_Construct_4 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots ("");
|
$robotstxt = new robotstxt ("");
|
||||||
$res = $robots->crawldelay ();
|
$res = $robotstxt->crawldelay ();
|
||||||
$this->assertSame ($res, 3);
|
$this->assertSame ($res, 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Allow
|
// Allow
|
||||||
public function test_allow_1 ()
|
public function test_allow_1 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: *\nDisallow:\n");
|
"User-Agent: *\nDisallow:\n");
|
||||||
$res = $robots->allow ();
|
$res = $robotstxt->allow ();
|
||||||
$this->assertSame ($res, ["/"]);
|
$this->assertSame ($res, ["/"]);
|
||||||
}
|
}
|
||||||
public function test_allow_2 ()
|
public function test_allow_2 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: *\nDisallow:\n\nUser-Agent: DomSearch\nDisallow:\n");
|
"User-Agent: *\nDisallow:\n\nUser-Agent: DomSearch\nDisallow:\n");
|
||||||
$res = $robots->allow ();
|
$res = $robotstxt->allow ();
|
||||||
$this->assertSame ($res, ["/"]);
|
$this->assertSame ($res, ["/"]);
|
||||||
}
|
}
|
||||||
public function test_allow_3 ()
|
public function test_allow_3 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: DomSearch\nDisallow:\n\nUser-Agent: *\nDisallow:\n");
|
"User-Agent: DomSearch\nDisallow:\n\nUser-Agent: *\nDisallow:\n");
|
||||||
$res = $robots->allow ();
|
$res = $robotstxt->allow ();
|
||||||
|
$this->assertSame ($res, ["/"]);
|
||||||
|
}
|
||||||
|
public function test_allow_4 ()
|
||||||
|
{
|
||||||
|
$robotstxt = new robotstxt (
|
||||||
|
"User-Agent: DomSearch\n".
|
||||||
|
"User-Agent: User1\n".
|
||||||
|
"User-Agent: User2\n".
|
||||||
|
"Disallow:\n\n".
|
||||||
|
"User-Agent: *\n".
|
||||||
|
"Disallow: /\n");
|
||||||
|
$res = $robotstxt->allow ();
|
||||||
$this->assertSame ($res, ["/"]);
|
$this->assertSame ($res, ["/"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Disallow
|
// Disallow
|
||||||
public function test_disallow_1 ()
|
public function test_disallow_1 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: *\nDisallow: /\n");
|
"User-Agent: *\nDisallow: /\n");
|
||||||
$res = $robots->disallow ();
|
$res = $robotstxt->disallow ();
|
||||||
$this->assertSame ($res, ["/"]);
|
$this->assertSame ($res, ["/"]);
|
||||||
}
|
}
|
||||||
public function test_disallow_2 ()
|
public function test_disallow_2 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: *\nDisallow: /\n\nUser-Agent: DomSearch\nDisallow: /\n");
|
"User-Agent: *\nDisallow: /\n\nUser-Agent: DomSearch\nDisallow: /\n");
|
||||||
$res = $robots->disallow ();
|
$res = $robotstxt->disallow ();
|
||||||
$this->assertSame ($res, ["/"]);
|
$this->assertSame ($res, ["/"]);
|
||||||
}
|
}
|
||||||
public function test_disallow_3 ()
|
public function test_disallow_3 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n");
|
"User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n");
|
||||||
$res = $robots->disallow ();
|
$res = $robotstxt->disallow ();
|
||||||
$this->assertSame ($res, ["/"]);
|
$this->assertSame ($res, ["/"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sitemaps
|
// Sitemaps
|
||||||
public function test_sitemaps_1 ()
|
public function test_sitemaps_1 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n");
|
"User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n");
|
||||||
$res = $robots->sitemaps ();
|
$res = $robotstxt->sitemaps ();
|
||||||
$this->assertSame ($res, []);
|
$this->assertSame ($res, []);
|
||||||
}
|
}
|
||||||
public function test_sitemaps_2 ()
|
public function test_sitemaps_2 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: *\nDisallow: /\nSitemap: /sitemap.xml");
|
"User-Agent: *\nDisallow: /\nSitemap: http://example.com/sitemap.xml");
|
||||||
$res = $robots->sitemaps ();
|
$res = $robotstxt->sitemaps ();
|
||||||
$this->assertSame ($res, ["/sitemap.xml"]);
|
$this->assertSame ($res, ["http://example.com/sitemap.xml"]);
|
||||||
}
|
}
|
||||||
public function test_sitemaps_3 ()
|
public function test_sitemaps_3 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: *\nDisallow: /\nSitemap: /sitemap.xml\nSitemap: /SITEMAP.XML");
|
"User-Agent: *\nDisallow: /\n".
|
||||||
$res = $robots->sitemaps ();
|
"Sitemap: http://example.com/sitemap.xml\n".
|
||||||
$this->assertSame ($res, ["/sitemap.xml", "/SITEMAP.XML"]);
|
"Sitemap: http://example.com/SITEMAP.XML");
|
||||||
|
$res = $robotstxt->sitemaps ();
|
||||||
|
$this->assertSame ($res,
|
||||||
|
["http://example.com/sitemap.xml", "http://example.com/SITEMAP.XML"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Host
|
// Host
|
||||||
public function test_host_1 ()
|
public function test_host_1 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: *\nDisallow: /\n");
|
"User-Agent: *\nDisallow: /\n");
|
||||||
$res = $robots->host ();
|
$res = $robotstxt->host ();
|
||||||
$this->assertSame ($res, null);
|
$this->assertSame ($res, null);
|
||||||
}
|
}
|
||||||
public function test_host_2 ()
|
public function test_host_2 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: *\nDisallow: /\n\nHost: localhost");
|
"User-Agent: *\nDisallow: /\n\nHost: localhost");
|
||||||
$res = $robots->host ();
|
$res = $robotstxt->host ();
|
||||||
$this->assertSame ($res, "localhost");
|
$this->assertSame ($res, "localhost");
|
||||||
}
|
}
|
||||||
|
|
||||||
// URLAllow
|
// URLAllow
|
||||||
public function test_urlallow_1 ()
|
public function test_urlallow_1 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots ("");
|
$robotstxt = new robotstxt ("");
|
||||||
$res = $robots->URLAllow ("/");
|
$res = $robotstxt->URLAllow ("/");
|
||||||
$this->assertSame ($res, true);
|
$this->assertSame ($res, true);
|
||||||
}
|
}
|
||||||
public function test_urlallow_2 ()
|
public function test_urlallow_2 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: *\nDisallow: /");
|
"User-Agent: *\nDisallow: /");
|
||||||
$res = $robots->URLAllow ("/");
|
$res = $robotstxt->URLAllow ("/");
|
||||||
$this->assertSame ($res, false);
|
$this->assertSame ($res, false);
|
||||||
}
|
}
|
||||||
public function test_urlallow_3 ()
|
public function test_urlallow_3 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: *\nDisallow: /\nAllow: /allow/");
|
"User-Agent: *\nDisallow: /\nAllow: /allow/");
|
||||||
$res = $robots->URLAllow ("/");
|
$res = $robotstxt->URLAllow ("/");
|
||||||
$this->assertSame ($res, false);
|
$this->assertSame ($res, false);
|
||||||
}
|
}
|
||||||
public function test_urlallow_4 ()
|
public function test_urlallow_4 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: *\nDisallow: /\nAllow: /allow/");
|
"User-Agent: *\nDisallow: /\nAllow: /allow/");
|
||||||
$res = $robots->URLAllow ("/allow/file");
|
$res = $robotstxt->URLAllow ("/allow/file");
|
||||||
$this->assertSame ($res, true);
|
$this->assertSame ($res, true);
|
||||||
}
|
}
|
||||||
public function test_urlallow_5 ()
|
public function test_urlallow_5 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$");
|
"User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$");
|
||||||
$res = $robots->URLAllow ("/allow/file.gif");
|
$res = $robotstxt->URLAllow ("/allow/file.gif");
|
||||||
$this->assertSame ($res, true);
|
$this->assertSame ($res, true);
|
||||||
}
|
}
|
||||||
public function test_urlallow_6 ()
|
public function test_urlallow_6 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$");
|
"User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$");
|
||||||
$res = $robots->URLAllow ("/allow/.gif");
|
$res = $robotstxt->URLAllow ("/allow/.gif");
|
||||||
$this->assertSame ($res, false);
|
$this->assertSame ($res, false);
|
||||||
}
|
}
|
||||||
public function test_urlallow_7 ()
|
public function test_urlallow_7 ()
|
||||||
{
|
{
|
||||||
$robots = new \models\robots (
|
$robotstxt = new robotstxt (
|
||||||
"User-Agent: *\nDisallow: /\nAllow: /allow/*.gif\$");
|
"User-Agent: *\nDisallow: /\nAllow: /allow/*.gif\$");
|
||||||
$res = $robots->URLAllow ("/allow/file.png");
|
$res = $robotstxt->URLAllow ("/allow/file.png");
|
||||||
$this->assertSame ($res, false);
|
$this->assertSame ($res, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
120
robotstxt.php
120
robotstxt.php
@@ -8,6 +8,9 @@
|
|||||||
* get the configured data for DomSearch.
|
* get the configured data for DomSearch.
|
||||||
* It allows checking a URL against the robots.txt file and tells whether the URL
|
* It allows checking a URL against the robots.txt file and tells whether the URL
|
||||||
* is allowed to be used or not
|
* is allowed to be used or not
|
||||||
|
* The definition of the format of robots.txt file is available here :
|
||||||
|
* http://www.robotstxt.org/norobots-rfc.txt
|
||||||
|
* https://en.wikipedia.org/wiki/Robots_exclusion_standard
|
||||||
*/
|
*/
|
||||||
class robotstxt
|
class robotstxt
|
||||||
{
|
{
|
||||||
@@ -55,54 +58,64 @@ class robotstxt
|
|||||||
$this->allow = array ("/");
|
$this->allow = array ("/");
|
||||||
return $this;
|
return $this;
|
||||||
}
|
}
|
||||||
$crawler = "";
|
// Look if the User-agent is available for the defined crawlerName.
|
||||||
$blocks = explode ("\n\n", $content);
|
// If not, check whether a User-agent entry exists for *
|
||||||
foreach ($blocks as $block)
|
// If not, accept all
|
||||||
|
$content = preg_split('/\r\n|\r|\n/', $content);
|
||||||
|
$keys = preg_grep ("~User-agent:\s*$this->crawlerName\s*#?~i", $content);
|
||||||
|
if (empty ($keys))
|
||||||
|
$keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content);
|
||||||
|
if (empty ($keys))
|
||||||
{
|
{
|
||||||
preg_match_all ("#User-agent:\s+(.+)\$#mi", $block, $useragents);
|
// No User-agent with crawlerName nor * : accept all
|
||||||
if (!isset ($useragents[1]))
|
$this->allow = array ("/");
|
||||||
continue;
|
return $this;
|
||||||
if (! in_array ("*", $useragents[1]) &&
|
}
|
||||||
! in_array ("DomSearch", $useragents[1]))
|
// Get the Allow and Disallow lines. Parsing stops at the first
|
||||||
continue;
|
// User-Agent line that follows an Allow/Disallow.
|
||||||
if (in_array ("*", $useragents[1]))
|
// Comments and empty lines are removed
|
||||||
{
|
for ($i = key ($keys) ; $i < count ($content) ; $i++)
|
||||||
// Already set
|
{
|
||||||
if ($crawler == "DomSearch")
|
$line = trim ($content[$i]);
|
||||||
continue;
|
if (strtolower (substr ($line, 0, 6)) === "allow:")
|
||||||
$crawler = "*";
|
{
|
||||||
$this->allow = array ();
|
$this->allow[] = $this->getValueFromLine ($line);
|
||||||
$this->disallow = array ();
|
}
|
||||||
$this->crawldelay = 3;
|
elseif (strtolower (substr ($line, 0, 9)) === "disallow:")
|
||||||
}
|
{
|
||||||
if (in_array ("DomSearch", $useragents[1]))
|
$this->disallow[] = $this->getValueFromLine ($line);
|
||||||
|
}
|
||||||
{
|
elseif (strtolower (substr ($line, 0, 12)) === "crawl-delay:")
|
||||||
// If the information for DomSearch was already seen, skip the second
|
{
|
||||||
// crawler information
|
$val = $this->getValueFromLine ($line);
|
||||||
if ($crawler == "DomSearch")
|
if ($val > 1 && $val < 60 && $this->crawldelay === 3)
|
||||||
continue;
|
$this->crawldelay = intval ($val);
|
||||||
$crawler = "DomSearch";
|
}
|
||||||
$this->allow = array ();
|
elseif (strtolower (substr ($line, 0, 11)) === "user-agent:" &&
|
||||||
$this->disallow = array ();
|
(!empty ($this->allow) || ! empty ($this->disallow)))
|
||||||
$this->crawldelay = 3;
|
{
|
||||||
}
|
// New user-agent line after valid allow/disallow : end of paragraph
|
||||||
preg_match_all ("#^Allow:\s*(\S*)\s*\$#mUi", $block, $allows);
|
break;
|
||||||
preg_match_all ("#^Disallow:\s*(\S*)\s*\$#mUi", $block, $disallows);
|
}
|
||||||
preg_match_all ("#^Crawl-delay:\s*(\d+)\s*#mi", $block, $crawldelay);
|
else
|
||||||
if (isset ($allows[1]))
|
{
|
||||||
$this->allow = $allows[1];
|
// Comment, empty line, sitemap, host, not managed line... : SKIP
|
||||||
if (isset ($disallows[1]))
|
}
|
||||||
$this->disallow = $disallows[1];
|
}
|
||||||
if (isset ($crawldelay[1][0]) && $crawldelay[1][0] > 0)
|
$keys = preg_grep ("#^\s*Sitemap:\s*(?P<value>\S+)\s*(\#)?#i", $content);
|
||||||
$this->crawldelay = intval ($crawldelay[1][0]);
|
foreach ($keys as $line)
|
||||||
|
{
|
||||||
|
$url = $this->getValueFromLine ($line);
|
||||||
|
if (!! filter_var ($url, FILTER_VALIDATE_URL))
|
||||||
|
$this->sitemaps[] = $url;
|
||||||
|
}
|
||||||
|
$keys = preg_grep ("#^\s*Host:\s*(?P<value>\S+)\s*(\#)?#i", $content);
|
||||||
|
foreach ($keys as $line)
|
||||||
|
{
|
||||||
|
// Takes only the first one
|
||||||
|
$this->host = $this->getValueFromLine ($line);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
preg_match_all ("#^Sitemap:\s*(\S+)\s*\$#mi", $content, $sitemaps);
|
|
||||||
if (isset ($sitemaps[1][0]))
|
|
||||||
$this->sitemaps = $sitemaps[1];
|
|
||||||
preg_match_all ("#^Host:\s*(\S+)\s*\$#mi", $content, $host);
|
|
||||||
if (isset ($host[1][0]))
|
|
||||||
$this->host = $host[1][0];
|
|
||||||
if (! in_array ("/", $this->disallow) &&
|
if (! in_array ("/", $this->disallow) &&
|
||||||
! in_array ("/", $this->allow))
|
! in_array ("/", $this->allow))
|
||||||
$this->allow[] = "/";
|
$this->allow[] = "/";
|
||||||
@@ -236,4 +249,19 @@ class robotstxt
|
|||||||
return $this->matchRule;
|
return $this->matchRule;
|
||||||
}
|
}
|
||||||
// }}}
|
// }}}
|
||||||
|
|
||||||
|
// PRIVATE METHODS
|
||||||
|
/** Get a line from robots.txt file and return the associated value.
 * The line is expected to look like "Field: value # optional comment".
 * Leading blanks before the field are accepted, and everything starting
 * at the first "#" is considered as a comment and is not returned, even if
 * it is glued to the value.
 * If the line doesn't have this form (no colon, value containing blanks...),
 * an empty string is returned instead of raising an undefined offset.
 * @param string $line the complete line from robots.txt file
 * @return string the value recorded on line, or "" if there is none
 */
private function getValueFromLine ($line)
// {{{
{
  // The field name can not contain a colon : without that, a line like
  // "Sitemap:http://example.com/" would be cut on the colon of the URL
  // instead of the colon ending the field name.
  $pattern = '~^\s*[^:\s]+:\s*(?P<value>[^\s#]*)\s*(?:#.*)?$~';
  if (preg_match ($pattern, (string) $line, $matches) !== 1)
    return "";
  return $matches["value"];
}
// }}}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user