<?php
/** DomFramework
 * @package domframework
 * @author Dominique Fournier <dominique@fournier38.fr>
 * @license BSD
 */

namespace Domframework;

/** This class analyzes the provided robots.txt file content and allows to
 * get the configured data for DomSearch.
 * It allows to examine an URL against the robots.txt file and returns if the
 * URL is allowed to be used or not.
 * The definition of the format of robots.txt file is available here :
 * http://www.robotstxt.org/norobots-rfc.txt
 * https://en.wikipedia.org/wiki/Robots_exclusion_standard
 */
class robotstxt
{
  // PROPERTIES
  // {{{
  /** The allowed urls (plain path prefixes or patterns containing '*')
   */
  private $allow = array ();

  /** The disallowed urls (plain path prefixes or patterns containing '*')
   */
  private $disallow = array ();

  /** The sitemaps URL defined in the file
   */
  private $sitemaps = array ();

  /** The crawldelay defined in the file (3s if not defined)
   */
  private $crawldelay = 3;

  /** The host can be specified as default website
   */
  private $host = null;

  /** The rule matching the last URLAllow test (original rule text)
   */
  private $matchRule = null;

  /** List the errors read on the file content. The key is the line where the
   * defect is set (0-based index in the split content)
   */
  private $errors = array ();
  // }}}

  // METHODS
  /** Get the robots.txt file content and do the analyze
   * An empty content, or a content without a User-agent paragraph matching
   * $crawlerName or "*", allows everything
   * @param string $content The robots.txt file content to analyze
   * @param string $crawlerName The crawler name to use in analyze
   */
  public function __construct ($content, $crawlerName)
  // {{{
  {
    if (trim ($content) === "")
    {
      // An empty robots.txt allows everything
      $this->allow = array ("/");
      return;
    }
    // Look if the User-agent is available for the defined crawlerName.
    // If not, check if the User-agent exists for *
    // If not, accept all
    $content = preg_split ('/\r\n|\r|\n/', $content);
    // The crawler name may contain regex metacharacters : quote it before
    // building the pattern (avoids a pattern injection / invalid regex)
    $quotedName = preg_quote ($crawlerName, "~");
    $keys = preg_grep ("~User-agent:\s*$quotedName\s*#?~i", $content);
    if (empty ($keys))
      $keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content);
    if (empty ($keys))
    {
      // No User-agent with crawlerName nor * : accept all
      $this->allow = array ("/");
      return;
    }

    // The sitemaps are not restricted to the user-agent
    foreach (preg_grep ("~Sitemap:\s+~i", $content) as $nb => $line)
    {
      $url = $this->getValueFromLine ($line);
      // Only absolute http(s) URL are accepted
      if (!! filter_var ($url, FILTER_VALIDATE_URL) &&
          (substr ($url, 0, 7) === "http://" ||
           substr ($url, 0, 8) === "https://"))
        $this->sitemaps[] = $url;
      else
        $this->errors[$nb] = dgettext ("domframework",
                                       "Sitemap : Invalid URL provided");
    }

    // Get the Allow and Disallow lines. The stop will arrive on first
    // User-Agent line arriving after a Allow/Disallow.
    // Comments and empty lines are skipped
    for ($i = key ($keys) ; $i < count ($content) ; $i++)
    {
      $line = trim ($content[$i]);
      if (stripos ($line, "Sitemap:") === 0)
      {
        // Already managed in the general parser. Not needed in the specific
        // user-agent parser. Must at least be catched to not generate an
        // error
      }
      elseif (stripos ($line, "Host:") === 0)
      {
        if ($this->host !== null)
          $this->errors[$i] = dgettext ("domframework",
                                        "Multiple Hosts set");
        else
          $this->host = $this->getValueFromLine ($line);
      }
      elseif ($line === "" || $line[0] === "#")
      {
        // Comment, empty line : SKIP
      }
      elseif (stripos ($line, "allow:") === 0)
      {
        $this->allow[] = $this->getValueFromLine ($line);
      }
      elseif (stripos ($line, "disallow:") === 0)
      {
        $this->disallow[] = $this->getValueFromLine ($line);
      }
      elseif (stripos ($line, "crawl-delay:") === 0)
      {
        $val = $this->getValueFromLine ($line);
        // The bounds are included (1-60), as announced in the error message.
        // Only the first valid value is kept
        if ($val >= 1 && $val <= 60 && $this->crawldelay === 3)
          $this->crawldelay = intval ($val);
        else
          $this->errors[$i] = dgettext ("domframework",
                            "Crawldelay : value out of range (1-60)");
      }
      elseif (stripos ($line, "user-agent:") === 0)
      {
        if (! empty ($this->allow) || ! empty ($this->disallow))
        {
          // New user-agent line after valid allow/disallow : end of paragraph
          break;
        }
        // New user-agent before any rule : do nothing
      }
      else
      {
        // Not managed line : error
        $this->errors[$i] = sprintf (dgettext ("domframework",
                              "Invalid line : unknown command : '%s'"), $line);
      }
    }
    // If the root is not explicitly ruled, allow it by default
    if (! in_array ("/", $this->disallow) &&
        ! in_array ("/", $this->allow))
      $this->allow[] = "/";
  }
  // }}}

  /** Return true if the provided URL can be used against the robots.txt
   * definition or FALSE if it is not the case.
   * The most specific matching rule wins ; on a tie the URL is disallowed
   * @param string $url The URL to check
   * @return boolean The result of the test
   */
  public function URLAllow ($url)
  // {{{
  {
    $parse = parse_url ($url);
    $path = (isset ($parse["path"])) ? $parse["path"] : "/";
    // Robots.txt files are always allowed
    if ($path === "/robots.txt")
      return true;
    list ($bestDisallow, $disallowRule) = $this->bestMatch ($this->disallow,
                                                            $path);
    list ($bestAllow, $allowRule) = $this->bestMatch ($this->allow, $path);
    if ($bestAllow > $bestDisallow)
    {
      $this->matchRule = $allowRule;
      return true;
    }
    $this->matchRule = $disallowRule;
    return false;
  }
  // }}}

  // GETTERS
  /** Return the lines where an error occured
   * The key of the array is the line number with the defect
   * @return array The errors
   */
  public function errors ()
  // {{{
  {
    return $this->errors;
  }
  // }}}

  /** Return the allowed urls
   * @return array $allow The array of allow rules
   */
  public function allow ()
  // {{{
  {
    return $this->allow;
  }
  // }}}

  /** Return the disallowed urls
   * @return array $disallow The array of disallow rules
   */
  public function disallow ()
  // {{{
  {
    return $this->disallow;
  }
  // }}}

  /** Return the sitemaps url
   * @return array $sitemap The array of sitemaps URL
   */
  public function sitemaps ()
  // {{{
  {
    return $this->sitemaps;
  }
  // }}}

  /** Return the crawldelay
   * @return integer $crawldelay The crawlDelay defined in robots.txt
   */
  public function crawldelay ()
  // {{{
  {
    return $this->crawldelay;
  }
  // }}}

  /** Return the host
   * @return string $host The Host string defined in robots.txt
   */
  public function host ()
  // {{{
  {
    return $this->host;
  }
  // }}}

  /** Return the matchRule
   * @return string $matchRule The rule matching the last URLAllow test
   */
  public function matchRule ()
  // {{{
  {
    return $this->matchRule;
  }
  // }}}

  // PRIVATE METHODS
  /** Return the best (most specific) rule of $rules matching $path
   * @param array $rules The allow or disallow rules to examine
   * @param string $path The URL path to test
   * @return array array ($score, $rule) : $score is -1 when no rule matches,
   *         the prefix length for a plain rule, or 255 for a wildcard rule
   *         (a wildcard match is treated as the most specific one)
   */
  private function bestMatch ($rules, $path)
  // {{{
  {
    $best = -1;
    $bestRule = "";
    foreach ($rules as $partial)
    {
      if (strpos ($partial, "*") === false)
      {
        // Plain rule : prefix comparison, the longest matching prefix wins
        if (substr ($path, 0, strlen ($partial)) === $partial &&
            $best < strlen ($partial))
        {
          $best = strlen ($partial);
          $bestRule = $partial;
        }
      }
      else
      {
        // Wildcard rule : '*' matches any character sequence (even empty).
        // The other characters are regex-quoted and the pattern is anchored
        // at the start of the path, as required by the robots.txt semantics
        $regex = str_replace ("\\*", ".*", preg_quote ($partial, "#"));
        if (preg_match ("#^$regex#", $path) === 1)
        {
          $best = 255;
          $bestRule = $partial;
        }
      }
    }
    return array ($best, $bestRule);
  }
  // }}}

  /** Get a line from robots.txt file and return the associated value.
   * Manage the eventual comments on the line
   * @param string $line the complete line from robots.txt file
   * @return string the value recorded on line, or "" if the line is not a
   *         valid "Field: value" line
   */
  private function getValueFromLine ($line)
  // {{{
  {
    // "Field: value   # optional comment"
    if (preg_match ("#^(?P<field>\S+):\s*(?P<value>\S*)\s*".
                    "(\#\s*(?P<comment>.+)\s*)?\$#", $line, $matches) !== 1)
      return "";
    return $matches["value"];
  }
  // }}}
}
|