robotstxt: Add support for understanding robots.txt files
git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5339 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
241
robotstxt.php
Normal file
@@ -0,0 +1,241 @@
<?php
/** DomSearch
 * @package domsearch
 * @author Dominique Fournier <dominique@fournier38.fr>
 */

namespace models;

/** This class analyzes the provided robots.txt file content and allows to
 * get the configured data for DomSearch.
 * It allows examining a URL against the robots.txt rules and returns whether
 * the URL is allowed to be used or not.
 */
class robots
{
  // PROPERTIES
  // {{{
  /** The crawler name wanted in robots.txt
   */
  private $crawlerName = "DomSearch";

  /** The allowed URLs
   */
  private $allow = array ();

  /** The disallowed URLs
   */
  private $disallow = array ();

  /** The sitemap URLs defined in the file
   */
  private $sitemaps = array ();

  /** The crawl delay defined in the file (3s if not defined)
   */
  private $crawldelay = 3;

  /** The host which can be specified as the default website
   */
  private $host = null;

  /** The rule matching the last URLAllow test
   */
  private $matchRule = null;
  // }}}
  // METHODS
  /** Get the robots.txt file content and do the analysis
   * @param string $content The robots.txt file content to analyze
   * @return $this
   */
  public function __construct ($content)
  // {{{
  {
    if (trim ($content) === "")
    {
      $this->allow = array ("/");
      return $this;
    }
    $crawler = "";
    $blocks = explode ("\n\n", $content);
    foreach ($blocks as $block)
    {
      preg_match_all ("#User-agent:\s+(.+)\$#mi", $block, $useragents);
      if (! isset ($useragents[1]))
        continue;
      if (! in_array ("*", $useragents[1]) &&
          ! in_array ("DomSearch", $useragents[1]))
        continue;
      if (in_array ("*", $useragents[1]))
      {
        // The DomSearch specific rules were already set: keep them
        if ($crawler == "DomSearch")
          continue;
        $crawler = "*";
        $this->allow = array ();
        $this->disallow = array ();
        $this->crawldelay = 3;
      }
      if (in_array ("DomSearch", $useragents[1]))
      {
        // If the information for DomSearch was already seen, skip the second
        // crawler information
        if ($crawler == "DomSearch")
          continue;
        $crawler = "DomSearch";
        $this->allow = array ();
        $this->disallow = array ();
        $this->crawldelay = 3;
      }
      preg_match_all ("#^Allow:\s*(\S*)\s*\$#mUi", $block, $allows);
      preg_match_all ("#^Disallow:\s*(\S*)\s*\$#mUi", $block, $disallows);
      preg_match_all ("#^Crawl-delay:\s*(\d+)\s*#mi", $block, $crawldelay);
      if (isset ($allows[1]))
        $this->allow = $allows[1];
      if (isset ($disallows[1]))
        $this->disallow = $disallows[1];
      if (isset ($crawldelay[1][0]) && $crawldelay[1][0] > 0)
        $this->crawldelay = intval ($crawldelay[1][0]);
    }
    preg_match_all ("#^Sitemap:\s*(\S+)\s*\$#mi", $content, $sitemaps);
    if (isset ($sitemaps[1][0]))
      $this->sitemaps = $sitemaps[1];
    preg_match_all ("#^Host:\s*(\S+)\s*\$#mi", $content, $host);
    if (isset ($host[1][0]))
      $this->host = $host[1][0];
    if (! in_array ("/", $this->disallow) &&
        ! in_array ("/", $this->allow))
      $this->allow[] = "/";
    return $this;
  }
  // }}}
  /** Return TRUE if the provided URL is allowed by the robots.txt definition,
   * or FALSE otherwise
   * @param string $url The URL to check
   * @return boolean The result of the test
   */
  public function URLAllow ($url)
  // {{{
  {
    $parse = parse_url ($url);
    $path = (isset ($parse["path"])) ? $parse["path"] : "/";
    $bestDisallow = -1;
    $bestAllow = -1;
    $allowRule = "";
    $disallowRule = "";
    foreach ($this->disallow as $partial)
    {
      if (strpos ($partial, "*") === false)
      {
        // Plain prefix rule: keep the longest matching one
        if (substr ($path, 0, strlen ($partial)) === $partial &&
            $bestDisallow < strlen ($partial))
        {
          $bestDisallow = strlen ($partial);
          $disallowRule = $partial;
        }
      }
      else
      {
        // Wildcard rule: convert it to a regular expression
        $partial = str_replace ("*", ".+", $partial);
        if (preg_match ("#$partial#", $path) === 1)
        {
          $bestDisallow = 255;
          $disallowRule = $partial;
        }
      }
    }
    foreach ($this->allow as $partial)
    {
      if (strpos ($partial, "*") === false)
      {
        if (substr ($path, 0, strlen ($partial)) === $partial &&
            $bestAllow < strlen ($partial))
        {
          $bestAllow = strlen ($partial);
          $allowRule = $partial;
        }
      }
      else
      {
        $partial = str_replace ("*", ".+", $partial);
        if (preg_match ("#$partial#", $path) === 1)
        {
          $bestAllow = 255;
          $allowRule = $partial;
        }
      }
    }
    if ($bestAllow < $bestDisallow)
    {
      $this->matchRule = $disallowRule;
      return false;
    }
    $this->matchRule = $allowRule;
    return true;
  }
  // }}}
  // GETTERS
  /** Return the allowed URLs
   * @return array $allow The array of allow rules
   */
  public function allow ()
  // {{{
  {
    return $this->allow;
  }
  // }}}

  /** Return the disallowed URLs
   * @return array $disallow The array of disallow rules
   */
  public function disallow ()
  // {{{
  {
    return $this->disallow;
  }
  // }}}

  /** Return the sitemap URLs
   * @return array $sitemaps The array of sitemap URLs
   */
  public function sitemaps ()
  // {{{
  {
    return $this->sitemaps;
  }
  // }}}

  /** Return the crawl delay
   * @return integer $crawldelay The crawl delay defined in robots.txt
   */
  public function crawldelay ()
  // {{{
  {
    return $this->crawldelay;
  }
  // }}}

  /** Return the host
   * @return string $host The Host string defined in robots.txt
   */
  public function host ()
  // {{{
  {
    return $this->host;
  }
  // }}}

  /** Return the matchRule
   * @return string $matchRule The rule matching the last URLAllow test
   */
  public function matchRule ()
  // {{{
  {
    return $this->matchRule;
  }
  // }}}
}
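As a quick illustration of how the committed class can be exercised, here is a minimal usage sketch. It is not part of the committed file; the sample robots.txt content, the example.com URLs and the require of robotstxt.php are assumptions made only for this example.

<?php
use models\robots;

require_once "robotstxt.php";  // assumed include path for the class above

// A small robots.txt with one wildcard group, a crawl delay and a sitemap
$content = "User-agent: *\n" .
           "Disallow: /private/\n" .
           "Crawl-delay: 10\n" .
           "\n" .
           "Sitemap: https://www.example.com/sitemap.xml\n";

$robots = new robots ($content);

var_dump ($robots->URLAllow ("https://www.example.com/index.html"));     // bool(true), matched by the implicit "/" allow
var_dump ($robots->URLAllow ("https://www.example.com/private/a.html")); // bool(false), matched by "Disallow: /private/"
echo $robots->matchRule () . "\n";   // "/private/", the rule which decided the last test
echo $robots->crawldelay () . "\n";  // 10
print_r ($robots->sitemaps ());      // Array ( [0] => https://www.example.com/sitemap.xml )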