robotstxt: rewrite the whole parser with the logic defined in the RFC

git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5342 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
This commit is contained in:
2019-06-11 09:08:53 +00:00
parent 2cfe3f4d17
commit ded1628c1f
2 changed files with 138 additions and 95 deletions

View File

@@ -8,6 +8,9 @@
* get the configured data for DomSearch.
* It allows examining a URL against the robots.txt file and returns whether
* the URL is allowed to be used or not
* The definition of the format of robots.txt file is available here :
* http://www.robotstxt.org/norobots-rfc.txt
* https://en.wikipedia.org/wiki/Robots_exclusion_standard
*/
class robotstxt
{
@@ -55,54 +58,64 @@ class robotstxt
$this->allow = array ("/");
return $this;
}
$crawler = "";
$blocks = explode ("\n\n", $content);
foreach ($blocks as $block)
// Look if a User-agent entry is available for the defined crawlerName.
// If not, check if a User-agent entry exists for *
// If not, accept all
$content = preg_split('/\r\n|\r|\n/', $content);
$keys = preg_grep ("~User-agent:\s*$this->crawlerName\s*#?~i", $content);
if (empty ($keys))
$keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content);
if (empty ($keys))
{
preg_match_all ("#User-agent:\s+(.+)\$#mi", $block, $useragents);
if (!isset ($useragents[1]))
continue;
if (! in_array ("*", $useragents[1]) &&
! in_array ("DomSearch", $useragents[1]))
continue;
if (in_array ("*", $useragents[1]))
{
// Already set
if ($crawler == "DomSearch")
continue;
$crawler = "*";
$this->allow = array ();
$this->disallow = array ();
$this->crawldelay = 3;
}
if (in_array ("DomSearch", $useragents[1]))
{
// If the information for DomSearch where already seen, skip the second
// crawler information
if ($crawler == "DomSearch")
continue;
$crawler = "DomSearch";
$this->allow = array ();
$this->disallow = array ();
$this->crawldelay = 3;
}
preg_match_all ("#^Allow:\s*(\S*)\s*\$#mUi", $block, $allows);
preg_match_all ("#^Disallow:\s*(\S*)\s*\$#mUi", $block, $disallows);
preg_match_all ("#^Crawl-delay:\s*(\d+)\s*#mi", $block, $crawldelay);
if (isset ($allows[1]))
$this->allow = $allows[1];
if (isset ($disallows[1]))
$this->disallow = $disallows[1];
if (isset ($crawldelay[1][0]) && $crawldelay[1][0] > 0)
$this->crawldelay = intval ($crawldelay[1][0]);
// No User-agent with crawlerName nor * : accept all
$this->allow = array ("/");
return $this;
}
// Get the Allow and Disallow lines. The stop will arrive on first
// User-Agent line arriving after a Allow/Disallow.
// Comments and empty lines are removed
for ($i = key ($keys) ; $i < count ($content) ; $i++)
{
$line = trim ($content[$i]);
if (strtolower (substr ($line, 0, 6)) === "allow:")
{
$this->allow[] = $this->getValueFromLine ($line);
}
elseif (strtolower (substr ($line, 0, 9)) === "disallow:")
{
$this->disallow[] = $this->getValueFromLine ($line);
}
elseif (strtolower (substr ($line, 0, 12)) === "crawl-delay:")
{
$val = $this->getValueFromLine ($line);
if ($val > 1 && $val < 60 && $this->crawldelay === 3)
$this->crawldelay = intval ($val);
}
elseif (strtolower (substr ($line, 0, 11)) === "user-agent:" &&
(!empty ($this->allow) || ! empty ($this->disallow)))
{
// New user-agent line after valid allow/disallow : end of paragraph
break;
}
else
{
// Comment, empty line, sitemap, host, not managed line... : SKIP
}
}
$keys = preg_grep ("#^\s*Sitemap:\s*(?P<value>\S+)\s*(\#)?#i", $content);
foreach ($keys as $line)
{
$url = $this->getValueFromLine ($line);
if (!! filter_var ($url, FILTER_VALIDATE_URL))
$this->sitemaps[] = $url;
}
$keys = preg_grep ("#^\s*Host:\s*(?P<value>\S+)\s*(\#)?#i", $content);
foreach ($keys as $line)
{
// Takes only the first one
$this->host = $this->getValueFromLine ($line);
break;
}
preg_match_all ("#^Sitemap:\s*(\S+)\s*\$#mi", $content, $sitemaps);
if (isset ($sitemaps[1][0]))
$this->sitemaps = $sitemaps[1];
preg_match_all ("#^Host:\s*(\S+)\s*\$#mi", $content, $host);
if (isset ($host[1][0]))
$this->host = $host[1][0];
if (! in_array ("/", $this->disallow) &&
! in_array ("/", $this->allow))
$this->allow[] = "/";
@@ -236,4 +249,19 @@ class robotstxt
return $this->matchRule;
}
// }}}
// PRIVATE METHODS
/** Get a line from a robots.txt file and return the associated value.
 * Handles an eventual trailing comment on the line.
 * @param string $line the complete line from robots.txt file
 * @return string the value recorded on the line, or an empty string when
 *                the line does not match the "Field: value" format
 */
private function getValueFromLine ($line)
// {{{
{
  // Only the first (and single) match on the line is needed, so preg_match
  // is enough. Guard against malformed lines so that $matches["value"] is
  // never an undefined index (the previous preg_match_all version raised a
  // notice on non-matching lines).
  if (! preg_match ("#^(?P<field>\S+):\s*(?P<value>\S*)\s*".
                    "(\#\s*(?P<comment>.+)\s*)?\$#", $line, $matches))
    return "";
  return $matches["value"];
}
// }}}
}