robotstxt : rewrite all the parser with the logic defined in RFC
git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5342 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
This commit is contained in:
@@ -1,166 +1,181 @@
|
||||
<?php
|
||||
/** Test the models/robots.txt file
|
||||
/** Test the robotstxt.php file
|
||||
*/
|
||||
class test_model extends PHPUnit_Framework_TestCase
|
||||
{
|
||||
// Empty Robots
|
||||
public function test_Construct_1 ()
|
||||
{
|
||||
$robots = new \models\robots ("");
|
||||
$res = $robots->allow ();
|
||||
$robotstxt = new robotstxt ("");
|
||||
$res = $robotstxt->allow ();
|
||||
$this->assertSame ($res, ["/"]);
|
||||
}
|
||||
public function test_Construct_2 ()
|
||||
{
|
||||
$robots = new \models\robots ("");
|
||||
$res = $robots->disallow ();
|
||||
$robotstxt = new robotstxt ("");
|
||||
$res = $robotstxt->disallow ();
|
||||
$this->assertSame ($res, array ());
|
||||
}
|
||||
public function test_Construct_3 ()
|
||||
{
|
||||
$robots = new \models\robots ("");
|
||||
$res = $robots->sitemaps ();
|
||||
$robotstxt = new robotstxt ("");
|
||||
$res = $robotstxt->sitemaps ();
|
||||
$this->assertSame ($res, array ());
|
||||
}
|
||||
public function test_Construct_4 ()
|
||||
{
|
||||
$robots = new \models\robots ("");
|
||||
$res = $robots->crawldelay ();
|
||||
$robotstxt = new robotstxt ("");
|
||||
$res = $robotstxt->crawldelay ();
|
||||
$this->assertSame ($res, 3);
|
||||
}
|
||||
|
||||
// Allow
|
||||
public function test_allow_1 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: *\nDisallow:\n");
|
||||
$res = $robots->allow ();
|
||||
$res = $robotstxt->allow ();
|
||||
$this->assertSame ($res, ["/"]);
|
||||
}
|
||||
public function test_allow_2 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: *\nDisallow:\n\nUser-Agent: DomSearch\nDisallow:\n");
|
||||
$res = $robots->allow ();
|
||||
$res = $robotstxt->allow ();
|
||||
$this->assertSame ($res, ["/"]);
|
||||
}
|
||||
public function test_allow_3 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: DomSearch\nDisallow:\n\nUser-Agent: *\nDisallow:\n");
|
||||
$res = $robots->allow ();
|
||||
$res = $robotstxt->allow ();
|
||||
$this->assertSame ($res, ["/"]);
|
||||
}
|
||||
public function test_allow_4 ()
|
||||
{
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: DomSearch\n".
|
||||
"User-Agent: User1\n".
|
||||
"User-Agent: User2\n".
|
||||
"Disallow:\n\n".
|
||||
"User-Agent: *\n".
|
||||
"Disallow: /\n");
|
||||
$res = $robotstxt->allow ();
|
||||
$this->assertSame ($res, ["/"]);
|
||||
}
|
||||
|
||||
// Disallow
|
||||
public function test_disallow_1 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: *\nDisallow: /\n");
|
||||
$res = $robots->disallow ();
|
||||
$res = $robotstxt->disallow ();
|
||||
$this->assertSame ($res, ["/"]);
|
||||
}
|
||||
public function test_disallow_2 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: *\nDisallow: /\n\nUser-Agent: DomSearch\nDisallow: /\n");
|
||||
$res = $robots->disallow ();
|
||||
$res = $robotstxt->disallow ();
|
||||
$this->assertSame ($res, ["/"]);
|
||||
}
|
||||
public function test_disallow_3 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n");
|
||||
$res = $robots->disallow ();
|
||||
$res = $robotstxt->disallow ();
|
||||
$this->assertSame ($res, ["/"]);
|
||||
}
|
||||
|
||||
// Sitemaps
|
||||
public function test_sitemaps_1 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n");
|
||||
$res = $robots->sitemaps ();
|
||||
$res = $robotstxt->sitemaps ();
|
||||
$this->assertSame ($res, []);
|
||||
}
|
||||
public function test_sitemaps_2 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
"User-Agent: *\nDisallow: /\nSitemap: /sitemap.xml");
|
||||
$res = $robots->sitemaps ();
|
||||
$this->assertSame ($res, ["/sitemap.xml"]);
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: *\nDisallow: /\nSitemap: http://example.com/sitemap.xml");
|
||||
$res = $robotstxt->sitemaps ();
|
||||
$this->assertSame ($res, ["http://example.com/sitemap.xml"]);
|
||||
}
|
||||
public function test_sitemaps_3 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
"User-Agent: *\nDisallow: /\nSitemap: /sitemap.xml\nSitemap: /SITEMAP.XML");
|
||||
$res = $robots->sitemaps ();
|
||||
$this->assertSame ($res, ["/sitemap.xml", "/SITEMAP.XML"]);
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: *\nDisallow: /\n".
|
||||
"Sitemap: http://example.com/sitemap.xml\n".
|
||||
"Sitemap: http://example.com/SITEMAP.XML");
|
||||
$res = $robotstxt->sitemaps ();
|
||||
$this->assertSame ($res,
|
||||
["http://example.com/sitemap.xml", "http://example.com/SITEMAP.XML"]);
|
||||
}
|
||||
|
||||
// Host
|
||||
public function test_host_1 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: *\nDisallow: /\n");
|
||||
$res = $robots->host ();
|
||||
$res = $robotstxt->host ();
|
||||
$this->assertSame ($res, null);
|
||||
}
|
||||
public function test_host_2 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: *\nDisallow: /\n\nHost: localhost");
|
||||
$res = $robots->host ();
|
||||
$res = $robotstxt->host ();
|
||||
$this->assertSame ($res, "localhost");
|
||||
}
|
||||
|
||||
// URLAllow
|
||||
public function test_urlallow_1 ()
|
||||
{
|
||||
$robots = new \models\robots ("");
|
||||
$res = $robots->URLAllow ("/");
|
||||
$robotstxt = new robotstxt ("");
|
||||
$res = $robotstxt->URLAllow ("/");
|
||||
$this->assertSame ($res, true);
|
||||
}
|
||||
public function test_urlallow_2 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: *\nDisallow: /");
|
||||
$res = $robots->URLAllow ("/");
|
||||
$res = $robotstxt->URLAllow ("/");
|
||||
$this->assertSame ($res, false);
|
||||
}
|
||||
public function test_urlallow_3 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: *\nDisallow: /\nAllow: /allow/");
|
||||
$res = $robots->URLAllow ("/");
|
||||
$res = $robotstxt->URLAllow ("/");
|
||||
$this->assertSame ($res, false);
|
||||
}
|
||||
public function test_urlallow_4 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: *\nDisallow: /\nAllow: /allow/");
|
||||
$res = $robots->URLAllow ("/allow/file");
|
||||
$res = $robotstxt->URLAllow ("/allow/file");
|
||||
$this->assertSame ($res, true);
|
||||
}
|
||||
public function test_urlallow_5 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$");
|
||||
$res = $robots->URLAllow ("/allow/file.gif");
|
||||
$res = $robotstxt->URLAllow ("/allow/file.gif");
|
||||
$this->assertSame ($res, true);
|
||||
}
|
||||
public function test_urlallow_6 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$");
|
||||
$res = $robots->URLAllow ("/allow/.gif");
|
||||
$res = $robotstxt->URLAllow ("/allow/.gif");
|
||||
$this->assertSame ($res, false);
|
||||
}
|
||||
public function test_urlallow_7 ()
|
||||
{
|
||||
$robots = new \models\robots (
|
||||
$robotstxt = new robotstxt (
|
||||
"User-Agent: *\nDisallow: /\nAllow: /allow/*.gif\$");
|
||||
$res = $robots->URLAllow ("/allow/file.png");
|
||||
$res = $robotstxt->URLAllow ("/allow/file.png");
|
||||
$this->assertSame ($res, false);
|
||||
}
|
||||
}
|
||||
|
||||
120
robotstxt.php
120
robotstxt.php
@@ -8,6 +8,9 @@
|
||||
* get the configured data for DomSearch.
|
||||
* It allows examining a URL against the robots.txt file and returns whether the URL
|
||||
* is allowed to be used or not
|
||||
* The definition of the format of robots.txt file is available here :
|
||||
* http://www.robotstxt.org/norobots-rfc.txt
|
||||
* https://en.wikipedia.org/wiki/Robots_exclusion_standard
|
||||
*/
|
||||
class robotstxt
|
||||
{
|
||||
@@ -55,54 +58,64 @@ class robotstxt
|
||||
$this->allow = array ("/");
|
||||
return $this;
|
||||
}
|
||||
$crawler = "";
|
||||
$blocks = explode ("\n\n", $content);
|
||||
foreach ($blocks as $block)
|
||||
// Look if the User-agent is available for the defined crawlerName.
|
||||
// If not, check if the User-agent exists for *
|
||||
// If not accept all
|
||||
$content = preg_split('/\r\n|\r|\n/', $content);
|
||||
$keys = preg_grep ("~User-agent:\s*$this->crawlerName\s*#?~i", $content);
|
||||
if (empty ($keys))
|
||||
$keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content);
|
||||
if (empty ($keys))
|
||||
{
|
||||
preg_match_all ("#User-agent:\s+(.+)\$#mi", $block, $useragents);
|
||||
if (!isset ($useragents[1]))
|
||||
continue;
|
||||
if (! in_array ("*", $useragents[1]) &&
|
||||
! in_array ("DomSearch", $useragents[1]))
|
||||
continue;
|
||||
if (in_array ("*", $useragents[1]))
|
||||
{
|
||||
// Already set
|
||||
if ($crawler == "DomSearch")
|
||||
continue;
|
||||
$crawler = "*";
|
||||
$this->allow = array ();
|
||||
$this->disallow = array ();
|
||||
$this->crawldelay = 3;
|
||||
}
|
||||
if (in_array ("DomSearch", $useragents[1]))
|
||||
|
||||
{
|
||||
// If the information for DomSearch were already seen, skip the second
|
||||
// crawler information
|
||||
if ($crawler == "DomSearch")
|
||||
continue;
|
||||
$crawler = "DomSearch";
|
||||
$this->allow = array ();
|
||||
$this->disallow = array ();
|
||||
$this->crawldelay = 3;
|
||||
}
|
||||
preg_match_all ("#^Allow:\s*(\S*)\s*\$#mUi", $block, $allows);
|
||||
preg_match_all ("#^Disallow:\s*(\S*)\s*\$#mUi", $block, $disallows);
|
||||
preg_match_all ("#^Crawl-delay:\s*(\d+)\s*#mi", $block, $crawldelay);
|
||||
if (isset ($allows[1]))
|
||||
$this->allow = $allows[1];
|
||||
if (isset ($disallows[1]))
|
||||
$this->disallow = $disallows[1];
|
||||
if (isset ($crawldelay[1][0]) && $crawldelay[1][0] > 0)
|
||||
$this->crawldelay = intval ($crawldelay[1][0]);
|
||||
// No User-agent with crawlerName nor * : accept all
|
||||
$this->allow = array ("/");
|
||||
return $this;
|
||||
}
|
||||
// Get the Allow and Disallow lines. Parsing stops at the first
|
||||
// User-Agent line found after an Allow/Disallow line.
|
||||
// Comments and empty lines are removed
|
||||
for ($i = key ($keys) ; $i < count ($content) ; $i++)
|
||||
{
|
||||
$line = trim ($content[$i]);
|
||||
if (strtolower (substr ($line, 0, 6)) === "allow:")
|
||||
{
|
||||
$this->allow[] = $this->getValueFromLine ($line);
|
||||
}
|
||||
elseif (strtolower (substr ($line, 0, 9)) === "disallow:")
|
||||
{
|
||||
$this->disallow[] = $this->getValueFromLine ($line);
|
||||
}
|
||||
elseif (strtolower (substr ($line, 0, 12)) === "crawl-delay:")
|
||||
{
|
||||
$val = $this->getValueFromLine ($line);
|
||||
if ($val > 1 && $val < 60 && $this->crawldelay === 3)
|
||||
$this->crawldelay = intval ($val);
|
||||
}
|
||||
elseif (strtolower (substr ($line, 0, 11)) === "user-agent:" &&
|
||||
(!empty ($this->allow) || ! empty ($this->disallow)))
|
||||
{
|
||||
// New user-agent line after valid allow/disallow : end of paragraph
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Comment, empty line, sitemap, host, not managed line... : SKIP
|
||||
}
|
||||
}
|
||||
$keys = preg_grep ("#^\s*Sitemap:\s*(?P<value>\S+)\s*(\#)?#i", $content);
|
||||
foreach ($keys as $line)
|
||||
{
|
||||
$url = $this->getValueFromLine ($line);
|
||||
if (!! filter_var ($url, FILTER_VALIDATE_URL))
|
||||
$this->sitemaps[] = $url;
|
||||
}
|
||||
$keys = preg_grep ("#^\s*Host:\s*(?P<value>\S+)\s*(\#)?#i", $content);
|
||||
foreach ($keys as $line)
|
||||
{
|
||||
// Takes only the first one
|
||||
$this->host = $this->getValueFromLine ($line);
|
||||
break;
|
||||
}
|
||||
preg_match_all ("#^Sitemap:\s*(\S+)\s*\$#mi", $content, $sitemaps);
|
||||
if (isset ($sitemaps[1][0]))
|
||||
$this->sitemaps = $sitemaps[1];
|
||||
preg_match_all ("#^Host:\s*(\S+)\s*\$#mi", $content, $host);
|
||||
if (isset ($host[1][0]))
|
||||
$this->host = $host[1][0];
|
||||
if (! in_array ("/", $this->disallow) &&
|
||||
! in_array ("/", $this->allow))
|
||||
$this->allow[] = "/";
|
||||
@@ -236,4 +249,19 @@ class robotstxt
|
||||
return $this->matchRule;
|
||||
}
|
||||
// }}}
|
||||
|
||||
// PRIVATE METHODS
|
||||
/** Get a line from robots.txt file and return the associated value.
 * Manage the eventual comments on the line.
 * The line is expected to look like "Field: value # comment" :
 * - blanks around the line are ignored,
 * - the field name ends at the FIRST colon, so values containing colons
 *   (like "Sitemap:http://example.com/sitemap.xml") are kept complete,
 * - the value is the first token after the colon ; it stops at the first
 *   blank or at the "#" starting a comment.
 * @param string $line the complete line from robots.txt file
 * @return string the value recorded on line, or an empty string if the line
 * has no "Field:" prefix or no value
 */
private function getValueFromLine ($line)
// {{{
{
  // Leading/trailing blanks are not significant and would prevent the
  // anchored pattern from matching
  $line = trim ($line);
  // [^:\s]+ can not cross a colon : the field is delimited by the first one.
  // [^\s#]* stops the value before a blank or a comment marker.
  $pattern = '~^(?P<field>[^:\s]+):\s*(?P<value>[^\s#]*)~';
  if (preg_match ($pattern, $line, $matches) !== 1)
  {
    // Not a "Field: value" line (or PCRE error) : no value, and no access
    // to an undefined offset
    return "";
  }
  return $matches["value"];
}
// }}}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user