<?php
/** DomFramework
 * @package domframework
 * @author Dominique Fournier <dominique@fournier38.fr>
 */
/** This class analyzes the provided robots.txt file content and gives access
 * to the data configured for DomSearch.
 * It can examine a URL against the robots.txt rules and return whether the
 * URL is allowed to be used or not.
 * The format of the robots.txt file is defined here:
 * http://www.robotstxt.org/norobots-rfc.txt
 * https://en.wikipedia.org/wiki/Robots_exclusion_standard
 */
class robotstxt
{
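  /* Minimal usage sketch (illustrative : the URL and the use of
   * file_get_contents() are only examples of how the class can be fed) :
   *   $robots = new robotstxt
   *             (file_get_contents ("http://example.com/robots.txt"));
   *   if ($robots->URLAllow ("http://example.com/page.html"))
   *     echo "Allowed by rule '" . $robots->matchRule () . "'\n";
   *   sleep ($robots->crawldelay ());
   */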
  // PROPERTIES
  // {{{
  /** The crawler name wanted in robots.txt
   */
  private $crawlerName = "DomSearch";
  /** The allowed URLs
   */
  private $allow = array ();
  /** The disallowed URLs
   */
  private $disallow = array ();
  /** The sitemap URLs defined in the file
   */
  private $sitemaps = array ();
  /** The crawl delay defined in the file (3 seconds if not defined)
   */
  private $crawldelay = 3;
  /** The host that can be specified as the default website
   */
  private $host = null;
  /** The rule that matched during the last URLAllow test
   */
  private $matchRule = null;
  // }}}
  // METHODS
  /** Get the robots.txt file content and analyze it
   * @param string $content The robots.txt file content to analyze
   * @return $this
   */
  public function __construct ($content)
  // {{{
  {
    if (trim ($content) === "")
    {
      $this->allow = array ("/");
      return $this;
    }
    // Look if a User-agent block is available for the defined crawlerName.
    // If not, check if a User-agent block exists for *
    // If not, accept all
    $content = preg_split ('/\r\n|\r|\n/', $content);
    $keys = preg_grep ("~User-agent:\s*$this->crawlerName\s*#?~i", $content);
    if (empty ($keys))
      $keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content);
    if (empty ($keys))
    {
      // No User-agent with crawlerName nor * : accept all
      $this->allow = array ("/");
      return $this;
    }
    // Get the Allow and Disallow lines. Parsing stops at the first
    // User-agent line that follows an Allow/Disallow line.
    // Comments and empty lines are skipped
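    // Illustrative paragraph (hypothetical robots.txt content) :
    //   User-agent: DomSearch
    //   Disallow: /private/
    //   Allow: /private/public.html
    //   Crawl-delay: 10
    //   User-agent: OtherBot    <- new paragraph : the scan stops here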
    // Start the scan at the first matching User-agent line
    for ($i = key ($keys) ; $i < count ($content) ; $i++)
    {
      $line = trim ($content[$i]);
      if (strtolower (substr ($line, 0, 6)) === "allow:")
      {
        $this->allow[] = $this->getValueFromLine ($line);
      }
      elseif (strtolower (substr ($line, 0, 9)) === "disallow:")
      {
        $this->disallow[] = $this->getValueFromLine ($line);
      }
      elseif (strtolower (substr ($line, 0, 12)) === "crawl-delay:")
      {
        // Keep only a sane delay (strictly between 1 and 60 seconds), and
        // only the first one found (while the default value is still in place)
        $val = $this->getValueFromLine ($line);
        if ($val > 1 && $val < 60 && $this->crawldelay === 3)
          $this->crawldelay = intval ($val);
      }
      elseif (strtolower (substr ($line, 0, 11)) === "user-agent:" &&
              (! empty ($this->allow) || ! empty ($this->disallow)))
      {
        // New user-agent line after valid allow/disallow : end of paragraph
        break;
      }
      else
      {
        // Comment, empty line, sitemap, host, unmanaged line... : SKIP
      }
    }
    $keys = preg_grep ("#^\s*Sitemap:\s*(?P<value>\S+)\s*(\#)?#i", $content);
    foreach ($keys as $line)
    {
      $url = $this->getValueFromLine ($line);
      if (filter_var ($url, FILTER_VALIDATE_URL) !== false)
        $this->sitemaps[] = $url;
    }
    $keys = preg_grep ("#^\s*Host:\s*(?P<value>\S+)\s*(\#)?#i", $content);
    foreach ($keys as $line)
    {
      // Take only the first one
      $this->host = $this->getValueFromLine ($line);
      break;
    }
    if (! in_array ("/", $this->disallow) &&
        ! in_array ("/", $this->allow))
      $this->allow[] = "/";
    return $this;
  }
  // }}}
  /** Return TRUE if the provided URL is allowed by the robots.txt rules,
   * or FALSE if it is not
   * @param string $url The URL to check
   * @return boolean The result of the test
   */
  public function URLAllow ($url)
  // {{{
  {
    $parse = parse_url ($url);
    $path = (isset ($parse["path"])) ? $parse["path"] : "/";
    $bestDisallow = -1;
    $bestAllow = -1;
    $allowRule = "";
    $disallowRule = "";
    foreach ($this->disallow as $partial)
    {
      if (strpos ($partial, "*") === false)
      {
        // Plain rule : keep the longest matching prefix
        if (substr ($path, 0, strlen ($partial)) === $partial &&
            $bestDisallow < strlen ($partial))
        {
          $bestDisallow = strlen ($partial);
          $disallowRule = $partial;
        }
      }
      else
      {
        // Wildcard rule : convert to an anchored regex. The rule is quoted
        // so its other characters stay literal, and "*" matches any
        // (possibly empty) sequence of characters
        $partial = str_replace ("\\*", ".*", preg_quote ($partial, "#"));
        if (preg_match ("#^$partial#", $path) === 1)
        {
          $bestDisallow = 255;
          $disallowRule = $partial;
        }
      }
    }
    foreach ($this->allow as $partial)
    {
      if (strpos ($partial, "*") === false)
      {
        if (substr ($path, 0, strlen ($partial)) === $partial &&
            $bestAllow < strlen ($partial))
        {
          $bestAllow = strlen ($partial);
          $allowRule = $partial;
        }
      }
      else
      {
        $partial = str_replace ("\\*", ".*", preg_quote ($partial, "#"));
        if (preg_match ("#^$partial#", $path) === 1)
        {
          $bestAllow = 255;
          $allowRule = $partial;
        }
      }
    }
    // The most specific (longest) rule wins; a tie is resolved as allowed
    if ($bestAllow < $bestDisallow)
    {
      $this->matchRule = $disallowRule;
      return false;
    }
    $this->matchRule = $allowRule;
    return true;
  }
  // }}}
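  /* Illustrative result : with the rules "Disallow: /private/" and
   * "Allow: /private/pub/", URLAllow ("http://example.com/private/pub/a.html")
   * returns TRUE because the Allow prefix (13 characters) is longer, so more
   * specific, than the Disallow prefix (9 characters).
   */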
  // GETTERS
  /** Return the allowed URLs
   * @return array $allow The array of allow rules
   */
  public function allow ()
  // {{{
  {
    return $this->allow;
  }
  // }}}
  /** Return the disallowed URLs
   * @return array $disallow The array of disallow rules
   */
  public function disallow ()
  // {{{
  {
    return $this->disallow;
  }
  // }}}
  /** Return the sitemap URLs
   * @return array $sitemaps The array of sitemap URLs
   */
  public function sitemaps ()
  // {{{
  {
    return $this->sitemaps;
  }
  // }}}
  /** Return the crawl delay
   * @return integer $crawldelay The crawlDelay defined in robots.txt
   */
  public function crawldelay ()
  // {{{
  {
    return $this->crawldelay;
  }
  // }}}
  /** Return the host
   * @return string $host The Host string defined in robots.txt
   */
  public function host ()
  // {{{
  {
    return $this->host;
  }
  // }}}
  /** Return the matchRule
   * @return string $matchRule The rule that matched in the last URLAllow test
   */
  public function matchRule ()
  // {{{
  {
    return $this->matchRule;
  }
  // }}}
  // PRIVATE METHODS
  /** Get a line from the robots.txt file and return the associated value.
   * Handles any comment present on the line
   * @param string $line The complete line from the robots.txt file
   * @return string The value recorded on the line
   */
  private function getValueFromLine ($line)
  // {{{
  {
    preg_match_all ("#^(?P<field>\S+):\s*(?P<value>\S*)\s*".
                    "(\#\s*(?P<comment>.+)\s*)?\$#", $line, $matches);
    return $matches["value"][0];
  }
  // }}}
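  /* Illustrative behaviour : getValueFromLine ("Disallow: /private/ # staff")
   * returns "/private/" : the field name and the trailing comment are dropped.
   */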
}