robotsTxt : update Tests

git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5343 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
commit f574476427
parent ded1628c1f
2019-06-11 18:59:44 +00:00

2 changed files with 1226 additions and 53 deletions


@@ -16,10 +16,6 @@ class robotstxt
 {
     // PROPERTIES
     // {{{
-    /** The cralwer name wanted in robots.txt
-     */
-    private $crawlerName = "DomSearch";
     /** The allowed urls
      */
     private $allow = array ();
@@ -40,7 +36,7 @@ class robotstxt
      */
     private $host = null;
-    /** The rule matchine the URLAllow rule
+    /** The rule number matching the URLAllow rule
      */
     private $matchRule = null;
     // }}}
@@ -48,9 +44,10 @@ class robotstxt
     // METHODS
     /** Get the robots.txt file content and do the analyze
      * @param string $content The robots.txt file content to analyze
+     * @param string $crawlerName The crawler name to use in analyze
      * @return $this
      */
-    public function __construct ($content)
+    public function analyze ($content, $crawlerName)
     // {{{
     {
         if (trim ($content) === "")
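
For context, the constructor no longer receives the file body; callers now pass both the content and the crawler name to analyze(), instead of relying on the previously hard-coded "DomSearch" property. A minimal usage sketch (the URL and file name are made up, and it assumes the class is left with an implicit default constructor):

    <?php
    require_once "robotstxt.php";

    // Hypothetical caller: fetch a robots.txt body, then analyze it
    // for an explicitly supplied crawler name.
    $content = file_get_contents ("https://example.com/robots.txt");
    $robots = new robotstxt ();
    $robots->analyze ($content, "DomSearch");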
@@ -62,7 +59,7 @@ class robotstxt
         // If Not check if the User-agent axists for *
         // If not accept all
         $content = preg_split('/\r\n|\r|\n/', $content);
-        $keys = preg_grep ("~User-agent:\s*$this->crawlerName\s*#?~i", $content);
+        $keys = preg_grep ("~User-agent:\s*$crawlerName\s*#?~i", $content);
         if (empty ($keys))
             $keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content);
         if (empty ($keys))
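
The User-agent lookup keeps its two-step fallback: first grep for a section dedicated to the crawler, then for the wildcard "*" group. A standalone sketch of that matching (the robots.txt body below is invented):

    <?php
    $crawlerName = "DomSearch";
    $lines = preg_split ('/\r\n|\r|\n/',
        "User-agent: DomSearch\nDisallow: /private/\n\nUser-agent: *\nDisallow: /tmp/");
    // Prefer the crawler's own section...
    $keys = preg_grep ("~User-agent:\s*$crawlerName\s*#?~i", $lines);
    // ...and fall back to the wildcard section when it is absent.
    if (empty ($keys))
        $keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $lines);
    var_dump ($keys); // array(1) { [0] => "User-agent: DomSearch" }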
@@ -133,6 +130,9 @@ class robotstxt
     {
         $parse = parse_url ($url);
         $path = (isset ($parse["path"])) ? $parse["path"] : "/";
+        // Robots.txt files are always allowed
+        if ($path === "/robots.txt")
+            return true;
         $bestDisallow = -1;
         $bestAllow = -1;
         $allowRule = "";
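
The new guard short-circuits rule matching for the robots.txt file itself, which is always crawlable whatever the rules say. A tiny illustration of the same check in isolation:

    <?php
    // parse_url() reduces the URL to its path before the comparison.
    $parse = parse_url ("https://example.com/robots.txt");
    $path = (isset ($parse["path"])) ? $parse["path"] : "/";
    var_dump ($path === "/robots.txt"); // bool(true): always allowed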
@@ -179,13 +179,13 @@ class robotstxt
                 }
             }
         }
-        if ($bestAllow < $bestDisallow)
+        if ($bestAllow > $bestDisallow)
         {
-            $this->matchRule = $disallowRule;
-            return false;
+            $this->matchRule = $allowRule;
+            return true;
         }
-        $this->matchRule = $allowRule;
-        return true;
+        $this->matchRule = $disallowRule;
+        return false;
     }
     // }}}
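
The closing comparison implements longest-match precedence: assuming, as the surrounding code suggests, that $bestAllow and $bestDisallow hold the length of the longest matching Allow and Disallow patterns (-1 when none matched), an Allow rule now only wins when it is strictly more specific than the best Disallow match; on a tie, or when no Allow rule applies, the URL is refused. A reduced sketch of that decision (resolve() is a made-up helper):

    <?php
    // Returns true when the most specific matching rule is an Allow rule.
    function resolve ($bestAllow, $bestDisallow)
    {
        if ($bestAllow > $bestDisallow)
            return true;   // Allow match is longer: crawl the URL
        return false;      // Disallow is at least as specific: skip it
    }

    var_dump (resolve (strlen ("/public/docs/"), strlen ("/public/"))); // bool(true)
    var_dump (resolve (strlen ("/a/"), strlen ("/a/b/")));              // bool(false)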