robotsTxt : update Tests

git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5343 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
2019-06-11 18:59:44 +00:00
parent ded1628c1f
commit f574476427
2 changed files with 1226 additions and 53 deletions

File diff suppressed because it is too large


@@ -16,10 +16,6 @@ class robotstxt
 {
 	// PROPERTIES
 	// {{{
-	/** The cralwer name wanted in robots.txt
-	 */
-	private $crawlerName = "DomSearch";
 	/** The allowed urls
 	 */
 	private $allow = array ();
@@ -40,7 +36,7 @@ class robotstxt
 	 */
 	private $host = null;
-	/** The rule matchine the URLAllow rule
+	/** The rule number matching the URLAllow rule
 	 */
 	private $matchRule = null;
 	// }}}
@@ -48,9 +44,10 @@ class robotstxt
 	// METHODS
 	/** Get the robots.txt file content and do the analyze
 	 * @param string $content The robots.txt file content to analyze
+	 * @param string $crawlerName The crawler name to use in analyze
 	 * @return $this
 	 */
-	public function __construct ($content)
+	public function analyze ($content, $crawlerName)
 	// {{{
 	{
 		if (trim ($content) === "")
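This hunk replaces the constructor with a public analyze() method and moves the crawler name from a hard-coded property to a parameter, so one robotstxt instance can evaluate the same file for different crawler names. A minimal usage sketch under that reading; urlAllow() is a guessed method name taken from the "URLAllow rule" comment above, not confirmed by this diff:

<?php
require_once "robotstxt.php";

// New API: the crawler name is passed to analyze() instead of being
// hard-coded as $crawlerName = "DomSearch" inside the class.
$robots = new robotstxt ();
$robots->analyze (file_get_contents ("https://example.com/robots.txt"),
	"DomSearch");

// urlAllow() is assumed from the $matchRule comment; adjust to the
// class's real checking method.
if ($robots->urlAllow ("https://example.com/some/page.html"))
	echo "allowed\n";
else
	echo "disallowed\n";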
@@ -62,7 +59,7 @@ class robotstxt
 		// If Not check if the User-agent axists for *
 		// If not accept all
 		$content = preg_split('/\r\n|\r|\n/', $content);
-		$keys = preg_grep ("~User-agent:\s*$this->crawlerName\s*#?~i", $content);
+		$keys = preg_grep ("~User-agent:\s*$crawlerName\s*#?~i", $content);
 		if (empty ($keys))
 			$keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content);
 		if (empty ($keys))
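The lookup now interpolates the $crawlerName parameter instead of the removed property: it first searches for a group addressed to this crawler, then falls back to the wildcard group, and a file with neither is treated as allowing everything. A self-contained sketch of that selection, reusing the diff's patterns on hypothetical input:

<?php
// Hypothetical input with both a wildcard group and a dedicated group.
$crawlerName = "DomSearch";
$content = "User-agent: *\nDisallow: /private/\n\nUser-agent: DomSearch\nAllow: /";
$lines = preg_split ('/\r\n|\r|\n/', $content);

// Prefer the group addressed to this crawler...
$keys = preg_grep ("~User-agent:\s*$crawlerName\s*#?~i", $lines);
// ...otherwise fall back to the wildcard group; if neither matches,
// the analyzer accepts everything.
if (empty ($keys))
	$keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $lines);

var_dump ($keys); // array(1) { [3]=> string(21) "User-agent: DomSearch" }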
@@ -133,6 +130,9 @@ class robotstxt
 	{
 		$parse = parse_url ($url);
 		$path = (isset ($parse["path"])) ? $parse["path"] : "/";
+		// Robots.txt files are always allowed
+		if ($path === "/robots.txt")
+			return true;
 		$bestDisallow = -1;
 		$bestAllow = -1;
 		$allowRule = "";
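The added guard short-circuits rule matching for the rules file itself, which matches the usual Robots Exclusion Protocol reading that /robots.txt must stay fetchable even under "Disallow: /". A quick check, again assuming the hypothetical urlAllow() entry point:

<?php
require_once "robotstxt.php";

$robots = new robotstxt ();
$robots->analyze ("User-agent: *\nDisallow: /", "DomSearch");

// Everything is disallowed except the rules file itself.
var_dump ($robots->urlAllow ("https://example.com/robots.txt")); // bool(true)
var_dump ($robots->urlAllow ("https://example.com/index.html")); // bool(false)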
@@ -179,13 +179,13 @@ class robotstxt
 				}
 			}
 		}
-		if ($bestAllow < $bestDisallow)
+		if ($bestAllow > $bestDisallow)
 		{
-			$this->matchRule = $disallowRule;
-			return false;
+			$this->matchRule = $allowRule;
+			return true;
 		}
-		$this->matchRule = $allowRule;
-		return true;
+		$this->matchRule = $disallowRule;
+		return false;
 	}
 	// }}}
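This last hunk fixes an inverted comparison in the longest-match precedence: $bestAllow and $bestDisallow hold the lengths of the best matching Allow and Disallow rules, the more specific (longer) rule should win, and Disallow now takes equal-length ties. Before the fix the shorter match won and $matchRule recorded the wrong rule. A sketch of the corrected behaviour, still assuming the hypothetical urlAllow():

<?php
require_once "robotstxt.php";

$robots = new robotstxt ();
$robots->analyze (
	"User-agent: *\nDisallow: /private/\nAllow: /private/public.html",
	"DomSearch");

// The Allow rule (20 characters) is more specific than the Disallow
// rule (9 characters), so the URL is allowed; the old "<" comparison
// returned the opposite result.
var_dump ($robots->urlAllow ("https://example.com/private/public.html")); // bool(true)
var_dump ($robots->urlAllow ("https://example.com/private/other.html")); // bool(false)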