robotstxt: rewrite the whole parser following the logic defined in the RFC

git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5342 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
This commit is contained in:
2019-06-11 09:08:53 +00:00
parent 2cfe3f4d17
commit ded1628c1f
2 changed files with 138 additions and 95 deletions

View File

@@ -1,166 +1,181 @@
<?php <?php
/** Test the models/robots.txt file /** Test the mrobotstxt.txt file
*/ */
class test_model extends PHPUnit_Framework_TestCase class test_model extends PHPUnit_Framework_TestCase
{ {
// Empty Robots // Empty Robots
public function test_Construct_1 () public function test_Construct_1 ()
{ {
$robots = new \models\robots (""); $robotstxt = new robotstxt ("");
$res = $robots->allow (); $res = $robotstxt->allow ();
$this->assertSame ($res, ["/"]); $this->assertSame ($res, ["/"]);
} }
public function test_Construct_2 () public function test_Construct_2 ()
{ {
$robots = new \models\robots (""); $robotstxt = new robotstxt ("");
$res = $robots->disallow (); $res = $robotstxt->disallow ();
$this->assertSame ($res, array ()); $this->assertSame ($res, array ());
} }
public function test_Construct_3 () public function test_Construct_3 ()
{ {
$robots = new \models\robots (""); $robotstxt = new robotstxt ("");
$res = $robots->sitemaps (); $res = $robotstxt->sitemaps ();
$this->assertSame ($res, array ()); $this->assertSame ($res, array ());
} }
public function test_Construct_4 () public function test_Construct_4 ()
{ {
$robots = new \models\robots (""); $robotstxt = new robotstxt ("");
$res = $robots->crawldelay (); $res = $robotstxt->crawldelay ();
$this->assertSame ($res, 3); $this->assertSame ($res, 3);
} }
// Allow // Allow
public function test_allow_1 () public function test_allow_1 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: *\nDisallow:\n"); "User-Agent: *\nDisallow:\n");
$res = $robots->allow (); $res = $robotstxt->allow ();
$this->assertSame ($res, ["/"]); $this->assertSame ($res, ["/"]);
} }
public function test_allow_2 () public function test_allow_2 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: *\nDisallow:\n\nUser-Agent: DomSearch\nDisallow:\n"); "User-Agent: *\nDisallow:\n\nUser-Agent: DomSearch\nDisallow:\n");
$res = $robots->allow (); $res = $robotstxt->allow ();
$this->assertSame ($res, ["/"]); $this->assertSame ($res, ["/"]);
} }
public function test_allow_3 () public function test_allow_3 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: DomSearch\nDisallow:\n\nUser-Agent: *\nDisallow:\n"); "User-Agent: DomSearch\nDisallow:\n\nUser-Agent: *\nDisallow:\n");
$res = $robots->allow (); $res = $robotstxt->allow ();
$this->assertSame ($res, ["/"]);
}
public function test_allow_4 ()
{
$robotstxt = new robotstxt (
"User-Agent: DomSearch\n".
"User-Agent: User1\n".
"User-Agent: User2\n".
"Disallow:\n\n".
"User-Agent: *\n".
"Disallow: /\n");
$res = $robotstxt->allow ();
$this->assertSame ($res, ["/"]); $this->assertSame ($res, ["/"]);
} }
// Disallow // Disallow
public function test_disallow_1 () public function test_disallow_1 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: *\nDisallow: /\n"); "User-Agent: *\nDisallow: /\n");
$res = $robots->disallow (); $res = $robotstxt->disallow ();
$this->assertSame ($res, ["/"]); $this->assertSame ($res, ["/"]);
} }
public function test_disallow_2 () public function test_disallow_2 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: *\nDisallow: /\n\nUser-Agent: DomSearch\nDisallow: /\n"); "User-Agent: *\nDisallow: /\n\nUser-Agent: DomSearch\nDisallow: /\n");
$res = $robots->disallow (); $res = $robotstxt->disallow ();
$this->assertSame ($res, ["/"]); $this->assertSame ($res, ["/"]);
} }
public function test_disallow_3 () public function test_disallow_3 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n"); "User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n");
$res = $robots->disallow (); $res = $robotstxt->disallow ();
$this->assertSame ($res, ["/"]); $this->assertSame ($res, ["/"]);
} }
// Sitemaps // Sitemaps
public function test_sitemaps_1 () public function test_sitemaps_1 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n"); "User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n");
$res = $robots->sitemaps (); $res = $robotstxt->sitemaps ();
$this->assertSame ($res, []); $this->assertSame ($res, []);
} }
public function test_sitemaps_2 () public function test_sitemaps_2 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: *\nDisallow: /\nSitemap: /sitemap.xml"); "User-Agent: *\nDisallow: /\nSitemap: http://example.com/sitemap.xml");
$res = $robots->sitemaps (); $res = $robotstxt->sitemaps ();
$this->assertSame ($res, ["/sitemap.xml"]); $this->assertSame ($res, ["http://example.com/sitemap.xml"]);
} }
public function test_sitemaps_3 () public function test_sitemaps_3 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: *\nDisallow: /\nSitemap: /sitemap.xml\nSitemap: /SITEMAP.XML"); "User-Agent: *\nDisallow: /\n".
$res = $robots->sitemaps (); "Sitemap: http://example.com/sitemap.xml\n".
$this->assertSame ($res, ["/sitemap.xml", "/SITEMAP.XML"]); "Sitemap: http://example.com/SITEMAP.XML");
$res = $robotstxt->sitemaps ();
$this->assertSame ($res,
["http://example.com/sitemap.xml", "http://example.com/SITEMAP.XML"]);
} }
// Host // Host
public function test_host_1 () public function test_host_1 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: *\nDisallow: /\n"); "User-Agent: *\nDisallow: /\n");
$res = $robots->host (); $res = $robotstxt->host ();
$this->assertSame ($res, null); $this->assertSame ($res, null);
} }
public function test_host_2 () public function test_host_2 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: *\nDisallow: /\n\nHost: localhost"); "User-Agent: *\nDisallow: /\n\nHost: localhost");
$res = $robots->host (); $res = $robotstxt->host ();
$this->assertSame ($res, "localhost"); $this->assertSame ($res, "localhost");
} }
// URLAllow // URLAllow
public function test_urlallow_1 () public function test_urlallow_1 ()
{ {
$robots = new \models\robots (""); $robotstxt = new robotstxt ("");
$res = $robots->URLAllow ("/"); $res = $robotstxt->URLAllow ("/");
$this->assertSame ($res, true); $this->assertSame ($res, true);
} }
public function test_urlallow_2 () public function test_urlallow_2 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: *\nDisallow: /"); "User-Agent: *\nDisallow: /");
$res = $robots->URLAllow ("/"); $res = $robotstxt->URLAllow ("/");
$this->assertSame ($res, false); $this->assertSame ($res, false);
} }
public function test_urlallow_3 () public function test_urlallow_3 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: *\nDisallow: /\nAllow: /allow/"); "User-Agent: *\nDisallow: /\nAllow: /allow/");
$res = $robots->URLAllow ("/"); $res = $robotstxt->URLAllow ("/");
$this->assertSame ($res, false); $this->assertSame ($res, false);
} }
public function test_urlallow_4 () public function test_urlallow_4 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: *\nDisallow: /\nAllow: /allow/"); "User-Agent: *\nDisallow: /\nAllow: /allow/");
$res = $robots->URLAllow ("/allow/file"); $res = $robotstxt->URLAllow ("/allow/file");
$this->assertSame ($res, true); $this->assertSame ($res, true);
} }
public function test_urlallow_5 () public function test_urlallow_5 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$"); "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$");
$res = $robots->URLAllow ("/allow/file.gif"); $res = $robotstxt->URLAllow ("/allow/file.gif");
$this->assertSame ($res, true); $this->assertSame ($res, true);
} }
public function test_urlallow_6 () public function test_urlallow_6 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$"); "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif$");
$res = $robots->URLAllow ("/allow/.gif"); $res = $robotstxt->URLAllow ("/allow/.gif");
$this->assertSame ($res, false); $this->assertSame ($res, false);
} }
public function test_urlallow_7 () public function test_urlallow_7 ()
{ {
$robots = new \models\robots ( $robotstxt = new robotstxt (
"User-Agent: *\nDisallow: /\nAllow: /allow/*.gif\$"); "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif\$");
$res = $robots->URLAllow ("/allow/file.png"); $res = $robotstxt->URLAllow ("/allow/file.png");
$this->assertSame ($res, false); $this->assertSame ($res, false);
} }
} }

View File

@@ -8,6 +8,9 @@
* get the configured data for DomSearch. * get the configured data for DomSearch.
* It allow to examine an URL against the robots.txt file and return if the URL * It allow to examine an URL against the robots.txt file and return if the URL
* is allowed to be used or not * is allowed to be used or not
* The definition of the format of robots.txt file is available here :
* http://www.robotstxt.org/norobots-rfc.txt
* https://en.wikipedia.org/wiki/Robots_exclusion_standard
*/ */
class robotstxt class robotstxt
{ {
@@ -55,54 +58,64 @@ class robotstxt
$this->allow = array ("/"); $this->allow = array ("/");
return $this; return $this;
} }
$crawler = ""; // Look if the User-agent is available for the defined crawlerName.
$blocks = explode ("\n\n", $content); // If Not check if the User-agent axists for *
foreach ($blocks as $block) // If not accept all
$content = preg_split('/\r\n|\r|\n/', $content);
$keys = preg_grep ("~User-agent:\s*$this->crawlerName\s*#?~i", $content);
if (empty ($keys))
$keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content);
if (empty ($keys))
{ {
preg_match_all ("#User-agent:\s+(.+)\$#mi", $block, $useragents); // No User-agent with crawlerName nor * : accept all
if (!isset ($useragents[1])) $this->allow = array ("/");
continue; return $this;
if (! in_array ("*", $useragents[1]) && }
! in_array ("DomSearch", $useragents[1])) // Get the Allow and Disallow lines. The stop will arrive on first
continue; // User-Agent line arriving after a Allow/Disallow.
if (in_array ("*", $useragents[1])) // Comments and empty lines are removed
{ for ($i = key ($keys) ; $i < count ($content) ; $i++)
// Already set {
if ($crawler == "DomSearch") $line = trim ($content[$i]);
continue; if (strtolower (substr ($line, 0, 6)) === "allow:")
$crawler = "*"; {
$this->allow = array (); $this->allow[] = $this->getValueFromLine ($line);
$this->disallow = array (); }
$this->crawldelay = 3; elseif (strtolower (substr ($line, 0, 9)) === "disallow:")
} {
if (in_array ("DomSearch", $useragents[1])) $this->disallow[] = $this->getValueFromLine ($line);
}
{ elseif (strtolower (substr ($line, 0, 12)) === "crawl-delay:")
// If the information for DomSearch where already seen, skip the second {
// crawler information $val = $this->getValueFromLine ($line);
if ($crawler == "DomSearch") if ($val > 1 && $val < 60 && $this->crawldelay === 3)
continue; $this->crawldelay = intval ($val);
$crawler = "DomSearch"; }
$this->allow = array (); elseif (strtolower (substr ($line, 0, 11)) === "user-agent:" &&
$this->disallow = array (); (!empty ($this->allow) || ! empty ($this->disallow)))
$this->crawldelay = 3; {
} // New user-agent line after valid allow/disallow : end of paragraph
preg_match_all ("#^Allow:\s*(\S*)\s*\$#mUi", $block, $allows); break;
preg_match_all ("#^Disallow:\s*(\S*)\s*\$#mUi", $block, $disallows); }
preg_match_all ("#^Crawl-delay:\s*(\d+)\s*#mi", $block, $crawldelay); else
if (isset ($allows[1])) {
$this->allow = $allows[1]; // Comment, empty line, sitemap, host, not managed line... : SKIP
if (isset ($disallows[1])) }
$this->disallow = $disallows[1]; }
if (isset ($crawldelay[1][0]) && $crawldelay[1][0] > 0) $keys = preg_grep ("#^\s*Sitemap:\s*(?P<value>\S+)\s*(\#)?#i", $content);
$this->crawldelay = intval ($crawldelay[1][0]); foreach ($keys as $line)
{
$url = $this->getValueFromLine ($line);
if (!! filter_var ($url, FILTER_VALIDATE_URL))
$this->sitemaps[] = $url;
}
$keys = preg_grep ("#^\s*Host:\s*(?P<value>\S+)\s*(\#)?#i", $content);
foreach ($keys as $line)
{
// Takes only the first one
$this->host = $this->getValueFromLine ($line);
break;
} }
preg_match_all ("#^Sitemap:\s*(\S+)\s*\$#mi", $content, $sitemaps);
if (isset ($sitemaps[1][0]))
$this->sitemaps = $sitemaps[1];
preg_match_all ("#^Host:\s*(\S+)\s*\$#mi", $content, $host);
if (isset ($host[1][0]))
$this->host = $host[1][0];
if (! in_array ("/", $this->disallow) && if (! in_array ("/", $this->disallow) &&
! in_array ("/", $this->allow)) ! in_array ("/", $this->allow))
$this->allow[] = "/"; $this->allow[] = "/";
@@ -236,4 +249,19 @@ class robotstxt
return $this->matchRule; return $this->matchRule;
} }
// }}} // }}}
// PRIVATE METHODS
/** Get a line from robots.txt file and return the associated value.
* Manage the evntual comments on the line
* @param string $line the complete line from robots.txt file
* @return string the value recorded on line
*/
private function getValueFromLine ($line)
// {{{
{
preg_match_all ("#^(?P<field>\S+):\s*(?P<value>\S*)\s*".
"(\#\s*(?P<comment>.+)\s*)?\$#", $line, $matches);
return $matches["value"][0];
}
// }}}
} }