robotstxt: rewrite the whole parser with the logic defined in the RFC

git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5342 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
2019-06-11 09:08:53 +00:00
parent 2cfe3f4d17
commit ded1628c1f
2 changed files with 138 additions and 95 deletions
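
For orientation, a minimal usage sketch of the rewritten class, assuming only what the tests below exercise (the constructor takes the raw robots.txt content; the file name and the example.com URLs are hypothetical):

<?php
require_once "robotstxt.php"; // hypothetical file name

$robotstxt = new robotstxt (
  "User-Agent: *\n".
  "Disallow: /private/\n".
  "Crawl-delay: 10\n".
  "Sitemap: http://example.com/sitemap.xml\n");

var_dump ($robotstxt->allow ());      // ["/"] : added when "/" is not disallowed
var_dump ($robotstxt->disallow ());   // ["/private/"]
var_dump ($robotstxt->crawldelay ()); // 10 (the default is 3)
var_dump ($robotstxt->sitemaps ());   // ["http://example.com/sitemap.xml"]
var_dump ($robotstxt->URLAllow ("/private/page")); // expected false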


@@ -1,166 +1,181 @@
 <?php
-/** Test the models/robots.txt file
+/** Test the robotstxt file
 */
 class test_model extends PHPUnit_Framework_TestCase
 {
   // Empty Robots
   public function test_Construct_1 ()
   {
-    $robots = new \models\robots ("");
-    $res = $robots->allow ();
+    $robotstxt = new robotstxt ("");
+    $res = $robotstxt->allow ();
     $this->assertSame ($res, ["/"]);
   }
   public function test_Construct_2 ()
   {
-    $robots = new \models\robots ("");
-    $res = $robots->disallow ();
+    $robotstxt = new robotstxt ("");
+    $res = $robotstxt->disallow ();
     $this->assertSame ($res, array ());
   }
   public function test_Construct_3 ()
   {
-    $robots = new \models\robots ("");
-    $res = $robots->sitemaps ();
+    $robotstxt = new robotstxt ("");
+    $res = $robotstxt->sitemaps ();
     $this->assertSame ($res, array ());
   }
   public function test_Construct_4 ()
   {
-    $robots = new \models\robots ("");
-    $res = $robots->crawldelay ();
+    $robotstxt = new robotstxt ("");
+    $res = $robotstxt->crawldelay ();
     $this->assertSame ($res, 3);
   }
   // Allow
   public function test_allow_1 ()
   {
-    $robots = new \models\robots (
+    $robotstxt = new robotstxt (
       "User-Agent: *\nDisallow:\n");
-    $res = $robots->allow ();
+    $res = $robotstxt->allow ();
     $this->assertSame ($res, ["/"]);
   }
   public function test_allow_2 ()
   {
-    $robots = new \models\robots (
+    $robotstxt = new robotstxt (
       "User-Agent: *\nDisallow:\n\nUser-Agent: DomSearch\nDisallow:\n");
-    $res = $robots->allow ();
+    $res = $robotstxt->allow ();
     $this->assertSame ($res, ["/"]);
   }
   public function test_allow_3 ()
   {
-    $robots = new \models\robots (
+    $robotstxt = new robotstxt (
       "User-Agent: DomSearch\nDisallow:\n\nUser-Agent: *\nDisallow:\n");
-    $res = $robots->allow ();
+    $res = $robotstxt->allow ();
     $this->assertSame ($res, ["/"]);
   }
+  public function test_allow_4 ()
+  {
+    $robotstxt = new robotstxt (
+      "User-Agent: DomSearch\n".
+      "User-Agent: User1\n".
+      "User-Agent: User2\n".
+      "Disallow:\n\n".
+      "User-Agent: *\n".
+      "Disallow: /\n");
+    $res = $robotstxt->allow ();
+    $this->assertSame ($res, ["/"]);
+  }
   // Disallow
   public function test_disallow_1 ()
   {
-    $robots = new \models\robots (
+    $robotstxt = new robotstxt (
       "User-Agent: *\nDisallow: /\n");
-    $res = $robots->disallow ();
+    $res = $robotstxt->disallow ();
     $this->assertSame ($res, ["/"]);
   }
   public function test_disallow_2 ()
   {
-    $robots = new \models\robots (
+    $robotstxt = new robotstxt (
       "User-Agent: *\nDisallow: /\n\nUser-Agent: DomSearch\nDisallow: /\n");
-    $res = $robots->disallow ();
+    $res = $robotstxt->disallow ();
     $this->assertSame ($res, ["/"]);
   }
   public function test_disallow_3 ()
   {
-    $robots = new \models\robots (
+    $robotstxt = new robotstxt (
       "User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n");
-    $res = $robots->disallow ();
+    $res = $robotstxt->disallow ();
     $this->assertSame ($res, ["/"]);
   }
   // Sitemaps
   public function test_sitemaps_1 ()
   {
-    $robots = new \models\robots (
+    $robotstxt = new robotstxt (
       "User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n");
-    $res = $robots->sitemaps ();
+    $res = $robotstxt->sitemaps ();
     $this->assertSame ($res, []);
   }
   public function test_sitemaps_2 ()
   {
-    $robots = new \models\robots (
-      "User-Agent: *\nDisallow: /\nSitemap: /sitemap.xml");
-    $res = $robots->sitemaps ();
-    $this->assertSame ($res, ["/sitemap.xml"]);
+    $robotstxt = new robotstxt (
+      "User-Agent: *\nDisallow: /\nSitemap: http://example.com/sitemap.xml");
+    $res = $robotstxt->sitemaps ();
+    $this->assertSame ($res, ["http://example.com/sitemap.xml"]);
   }
   public function test_sitemaps_3 ()
   {
-    $robots = new \models\robots (
-      "User-Agent: *\nDisallow: /\nSitemap: /sitemap.xml\nSitemap: /SITEMAP.XML");
-    $res = $robots->sitemaps ();
-    $this->assertSame ($res, ["/sitemap.xml", "/SITEMAP.XML"]);
+    $robotstxt = new robotstxt (
+      "User-Agent: *\nDisallow: /\n".
+      "Sitemap: http://example.com/sitemap.xml\n".
+      "Sitemap: http://example.com/SITEMAP.XML");
+    $res = $robotstxt->sitemaps ();
+    $this->assertSame ($res,
+      ["http://example.com/sitemap.xml", "http://example.com/SITEMAP.XML"]);
   }
   // Host
   public function test_host_1 ()
   {
-    $robots = new \models\robots (
+    $robotstxt = new robotstxt (
       "User-Agent: *\nDisallow: /\n");
-    $res = $robots->host ();
+    $res = $robotstxt->host ();
     $this->assertSame ($res, null);
   }
   public function test_host_2 ()
   {
-    $robots = new \models\robots (
+    $robotstxt = new robotstxt (
       "User-Agent: *\nDisallow: /\n\nHost: localhost");
-    $res = $robots->host ();
+    $res = $robotstxt->host ();
     $this->assertSame ($res, "localhost");
   }
   // URLAllow
   public function test_urlallow_1 ()
   {
-    $robots = new \models\robots ("");
-    $res = $robots->URLAllow ("/");
+    $robotstxt = new robotstxt ("");
+    $res = $robotstxt->URLAllow ("/");
     $this->assertSame ($res, true);
   }
   public function test_urlallow_2 ()
   {
-    $robots = new \models\robots (
+    $robotstxt = new robotstxt (
       "User-Agent: *\nDisallow: /");
-    $res = $robots->URLAllow ("/");
+    $res = $robotstxt->URLAllow ("/");
     $this->assertSame ($res, false);
   }
   public function test_urlallow_3 ()
   {
-    $robots = new \models\robots (
+    $robotstxt = new robotstxt (
       "User-Agent: *\nDisallow: /\nAllow: /allow/");
-    $res = $robots->URLAllow ("/");
+    $res = $robotstxt->URLAllow ("/");
     $this->assertSame ($res, false);
   }
   public function test_urlallow_4 ()
   {
-    $robots = new \models\robots (
+    $robotstxt = new robotstxt (
       "User-Agent: *\nDisallow: /\nAllow: /allow/");
-    $res = $robots->URLAllow ("/allow/file");
+    $res = $robotstxt->URLAllow ("/allow/file");
     $this->assertSame ($res, true);
   }
   public function test_urlallow_5 ()
   {
-    $robots = new \models\robots (
+    $robotstxt = new robotstxt (
       "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$");
-    $res = $robots->URLAllow ("/allow/file.gif");
+    $res = $robotstxt->URLAllow ("/allow/file.gif");
     $this->assertSame ($res, true);
   }
   public function test_urlallow_6 ()
   {
-    $robots = new \models\robots (
+    $robotstxt = new robotstxt (
       "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$");
-    $res = $robots->URLAllow ("/allow/.gif");
+    $res = $robotstxt->URLAllow ("/allow/.gif");
     $this->assertSame ($res, false);
   }
   public function test_urlallow_7 ()
   {
-    $robots = new \models\robots (
+    $robotstxt = new robotstxt (
       "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif\$");
-    $res = $robots->URLAllow ("/allow/file.png");
+    $res = $robotstxt->URLAllow ("/allow/file.png");
     $this->assertSame ($res, false);
   }
 }
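
One behaviour of the rewritten parser (second file below) that the suite does not yet cover: a Crawl-delay is only retained when it lies strictly between 1 and 60 seconds, and only the first valid one counts. A hypothetical extra test, not part of this commit, would pin that down:

  public function test_crawldelay_bounds ()
  {
    // 120 is out of the accepted range : the default of 3 seconds is kept
    $robotstxt = new robotstxt (
      "User-Agent: *\nDisallow:\nCrawl-delay: 120\n");
    $res = $robotstxt->crawldelay ();
    $this->assertSame ($res, 3);
  }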


@@ -8,6 +8,9 @@
  * get the configured data for DomSearch.
  * It allows examining a URL against the robots.txt file and returns whether
  * the URL is allowed to be used or not.
+ * The definition of the robots.txt file format is available here:
+ * http://www.robotstxt.org/norobots-rfc.txt
+ * https://en.wikipedia.org/wiki/Robots_exclusion_standard
  */
 class robotstxt
 {
@@ -55,54 +58,64 @@ class robotstxt
       $this->allow = array ("/");
       return $this;
     }
-    $crawler = "";
-    $blocks = explode ("\n\n", $content);
-    foreach ($blocks as $block)
-    {
-      preg_match_all ("#User-agent:\s+(.+)\$#mi", $block, $useragents);
-      if (!isset ($useragents[1]))
-        continue;
-      if (! in_array ("*", $useragents[1]) &&
-          ! in_array ("DomSearch", $useragents[1]))
-        continue;
-      if (in_array ("*", $useragents[1]))
-      {
-        // Already set
-        if ($crawler == "DomSearch")
-          continue;
-        $crawler = "*";
-        $this->allow = array ();
-        $this->disallow = array ();
-        $this->crawldelay = 3;
-      }
-      if (in_array ("DomSearch", $useragents[1]))
-      {
-        // If the information for DomSearch where already seen, skip the second
-        // crawler information
-        if ($crawler == "DomSearch")
-          continue;
-        $crawler = "DomSearch";
-        $this->allow = array ();
-        $this->disallow = array ();
-        $this->crawldelay = 3;
-      }
-      preg_match_all ("#^Allow:\s*(\S*)\s*\$#mUi", $block, $allows);
-      preg_match_all ("#^Disallow:\s*(\S*)\s*\$#mUi", $block, $disallows);
-      preg_match_all ("#^Crawl-delay:\s*(\d+)\s*#mi", $block, $crawldelay);
-      if (isset ($allows[1]))
-        $this->allow = $allows[1];
-      if (isset ($disallows[1]))
-        $this->disallow = $disallows[1];
-      if (isset ($crawldelay[1][0]) && $crawldelay[1][0] > 0)
-        $this->crawldelay = intval ($crawldelay[1][0]);
-    }
-    preg_match_all ("#^Sitemap:\s*(\S+)\s*\$#mi", $content, $sitemaps);
-    if (isset ($sitemaps[1][0]))
-      $this->sitemaps = $sitemaps[1];
-    preg_match_all ("#^Host:\s*(\S+)\s*\$#mi", $content, $host);
-    if (isset ($host[1][0]))
-      $this->host = $host[1][0];
+    // Look if a User-agent is available for the defined crawlerName.
+    // If not, check if a User-agent exists for *.
+    // If not, accept all.
+    $content = preg_split ('/\r\n|\r|\n/', $content);
+    $keys = preg_grep ("~User-agent:\s*$this->crawlerName\s*#?~i", $content);
+    if (empty ($keys))
+      $keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content);
+    if (empty ($keys))
+    {
+      // No User-agent with crawlerName nor * : accept all
+      $this->allow = array ("/");
+      return $this;
+    }
+    // Get the Allow and Disallow lines. The stop will occur on the first
+    // User-Agent line coming after an Allow/Disallow.
+    // Comments and empty lines are skipped.
+    for ($i = key ($keys) ; $i < count ($content) ; $i++)
+    {
+      $line = trim ($content[$i]);
+      if (strtolower (substr ($line, 0, 6)) === "allow:")
+      {
+        $this->allow[] = $this->getValueFromLine ($line);
+      }
+      elseif (strtolower (substr ($line, 0, 9)) === "disallow:")
+      {
+        $this->disallow[] = $this->getValueFromLine ($line);
+      }
+      elseif (strtolower (substr ($line, 0, 12)) === "crawl-delay:")
+      {
+        $val = $this->getValueFromLine ($line);
+        if ($val > 1 && $val < 60 && $this->crawldelay === 3)
+          $this->crawldelay = intval ($val);
+      }
+      elseif (strtolower (substr ($line, 0, 11)) === "user-agent:" &&
+              (! empty ($this->allow) || ! empty ($this->disallow)))
+      {
+        // New user-agent line after valid allow/disallow : end of paragraph
+        break;
+      }
+      else
+      {
+        // Comment, empty line, sitemap, host, not managed line... : SKIP
+      }
+    }
+    $keys = preg_grep ("#^\s*Sitemap:\s*(?P<value>\S+)\s*(\#)?#i", $content);
+    foreach ($keys as $line)
+    {
+      $url = $this->getValueFromLine ($line);
+      if (!! filter_var ($url, FILTER_VALIDATE_URL))
+        $this->sitemaps[] = $url;
+    }
+    $keys = preg_grep ("#^\s*Host:\s*(?P<value>\S+)\s*(\#)?#i", $content);
+    foreach ($keys as $line)
+    {
+      // Takes only the first one
+      $this->host = $this->getValueFromLine ($line);
+      break;
+    }
     if (! in_array ("/", $this->disallow) &&
         ! in_array ("/", $this->allow))
       $this->allow[] = "/";
@@ -236,4 +249,19 @@ class robotstxt
     return $this->matchRule;
   }
   // }}}
+  // PRIVATE METHODS
+  /** Get a line from the robots.txt file and return the associated value.
+   * Manages the eventual comments on the line.
+   * @param string $line the complete line from the robots.txt file
+   * @return string the value recorded on the line
+   */
+  private function getValueFromLine ($line)
+  // {{{
+  {
+    preg_match_all ("#^(?P<field>\S+):\s*(?P<value>\S*)\s*".
+                    "(\#\s*(?P<comment>.+)\s*)?\$#", $line, $matches);
+    return $matches["value"][0];
+  }
+  // }}}
 }
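
The URLAllow () implementation itself sits outside this diff. As a reading aid, here is a sketch of wildcard-rule matching that reproduces the behaviour pinned by test_urlallow_5 to test_urlallow_7 above; it is an illustration under those assumptions, not the actual method:

<?php
// Sketch : "*" must match at least one character (test_urlallow_6 expects
// "/allow/.gif" to be rejected by "/allow/*.gif$") and a trailing "$"
// anchors the rule at the end of the URL; otherwise the rule is a prefix.
function ruleMatches ($rule, $url)
{
  $anchored = (substr ($rule, -1) === "$");
  if ($anchored)
    $rule = substr ($rule, 0, -1);
  $regex = "#^".str_replace ("\\*", ".+", preg_quote ($rule, "#")).
    ($anchored ? "\$#" : "#");
  return (bool) preg_match ($regex, $url);
}

var_dump (ruleMatches ("/allow/*.gif$", "/allow/file.gif")); // true
var_dump (ruleMatches ("/allow/*.gif$", "/allow/file.png")); // false
var_dump (ruleMatches ("/allow/*.gif$", "/allow/.gif"));     // false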