robotsTxt : update Tests
git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5343 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
@@ -16,10 +16,6 @@ class robotstxt
 {
 	// PROPERTIES
 	// {{{
-	/** The cralwer name wanted in robots.txt
-	*/
-	private $crawlerName = "DomSearch";
-
 	/** The allowed urls
 	*/
 	private $allow = array ();
@@ -40,7 +36,7 @@ class robotstxt
 	*/
 	private $host = null;
 
-	/** The rule matchine the URLAllow rule
+	/** The rule number matching the URLAllow rule
 	*/
 	private $matchRule = null;
 	// }}}
@@ -48,9 +44,10 @@ class robotstxt
 	// METHODS
 	/** Get the robots.txt file content and do the analyze
 	* @param string $content The robots.txt file content to analyze
+	* @param string $crawlerName The crawler name to use in analyze
 	* @return $this
 	*/
-	public function __construct ($content)
+	public function analyze ($content, $crawlerName)
 	// {{{
 	{
 		if (trim ($content) === "")
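With this hunk the robots.txt content and the crawler name both move out of construction time: the hard-coded $crawlerName property is gone and analyze () takes them as parameters. A minimal sketch of the resulting call pattern, assuming the class now falls back to PHP's implicit no-argument constructor and that analyze () returns $this as its docblock states:

<?php
// Sketch only: "DomSearch" (the old hard-coded default) is reused as an
// example crawler name; $content holds a robots.txt body.
$content = "User-agent: *\nDisallow: /private/";
$robots = new robotstxt ();
$robots->analyze ($content, "DomSearch");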
@@ -62,7 +59,7 @@ class robotstxt
 		// If Not check if the User-agent axists for *
 		// If not accept all
 		$content = preg_split('/\r\n|\r|\n/', $content);
-		$keys = preg_grep ("~User-agent:\s*$this->crawlerName\s*#?~i", $content);
+		$keys = preg_grep ("~User-agent:\s*$crawlerName\s*#?~i", $content);
 		if (empty ($keys))
 			$keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content);
 		if (empty ($keys))
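The group selection follows the comment above: look for a User-agent line naming the crawler, fall back to the * group, and accept everything if neither exists. A self-contained sketch of the two preg_grep calls with a hypothetical robots.txt body:

<?php
// The named group is absent here, so the lookup falls back to the * group.
$crawlerName = "DomSearch";
$content = preg_split ('/\r\n|\r|\n/', "User-agent: *\nDisallow: /private/");
$keys = preg_grep ("~User-agent:\s*$crawlerName\s*#?~i", $content); // no match
if (empty ($keys))
	$keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content); // matches "User-agent: *"
var_dump (empty ($keys)); // bool(false)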
@@ -133,6 +130,9 @@ class robotstxt
 	{
 		$parse = parse_url ($url);
 		$path = (isset ($parse["path"])) ? $parse["path"] : "/";
+		// Robots.txt files are always allowed
+		if ($path === "/robots.txt")
+			return true;
 		$bestDisallow = -1;
 		$bestAllow = -1;
 		$allowRule = "";
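These three added lines short-circuit the rule matching, so the robots.txt file itself stays fetchable even under a blanket Disallow. A hedged illustration; URLAllow () is the method named in the $matchRule docblock, and its exact signature is assumed here:

<?php
// Sketch: even with "Disallow: /", the robots.txt path is reported allowed.
$robots = new robotstxt ();
$robots->analyze ("User-agent: *\nDisallow: /", "DomSearch");
var_dump ($robots->URLAllow ("http://example.com/robots.txt")); // bool(true)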
@@ -179,13 +179,13 @@ class robotstxt
 				}
 			}
 		}
-		if ($bestAllow < $bestDisallow)
+		if ($bestAllow > $bestDisallow)
 		{
-			$this->matchRule = $disallowRule;
-			return false;
+			$this->matchRule = $allowRule;
+			return true;
 		}
-		$this->matchRule = $allowRule;
-		return true;
+		$this->matchRule = $disallowRule;
+		return false;
 	}
 	// }}}
 
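The last hunk flips the longest-match comparison: an Allow rule now wins only when its match is strictly longer than the best Disallow match, and the rule stored in $matchRule is swapped in each branch to agree with the returned value. A sketch of the intended behaviour with hypothetical rules, again assuming the URLAllow () signature above:

<?php
// The longer (more specific) rule decides; on a tie the Disallow branch
// is now the one taken.
$robots = new robotstxt ();
$robots->analyze ("User-agent: *\nAllow: /public/\nDisallow: /public/tmp", "DomSearch");
var_dump ($robots->URLAllow ("http://example.com/public/index.html")); // bool(true): only "/public/" matches
var_dump ($robots->URLAllow ("http://example.com/public/tmp/a.html")); // bool(false): "/public/tmp" is the longer match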