Robots : values for Allow and Disallow must be empty or start with a slash
This commit is contained in:
@@ -13,6 +13,7 @@ namespace Domframework;
|
||||
* It allows examining a URL against the robots.txt file and returns whether the URL
|
||||
* is allowed to be used or not
|
||||
* The definition of the format of robots.txt file is available here :
|
||||
* https://www.rfc-editor.org/rfc/rfc9309.txt
|
||||
* http://www.robotstxt.org/norobots-rfc.txt
|
||||
* https://en.wikipedia.org/wiki/Robots_exclusion_standard
|
||||
*/
|
||||
@@ -94,15 +95,15 @@ class Robotstxt
|
||||
// Get the Allow and Disallow lines. Parsing stops at the first
|
||||
// User-Agent line found after an Allow/Disallow.
|
||||
// Comments and empty lines are removed
|
||||
for ($i = key($keys); $i < count($content); $i++) {
|
||||
$line = trim($content[$i]);
|
||||
for ($nb = key($keys); $nb < count($content); $nb++) {
|
||||
$line = trim($content[$nb]);
|
||||
if (stripos($line, "Sitemap:") === 0) {
|
||||
// Already managed in the general parser. Not needed in the specific
|
||||
// user-agent parser. Must at least be caught so that it does not generate an
|
||||
// error
|
||||
} elseif (stripos($line, "Host:") === 0) {
|
||||
if ($this->host !== null) {
|
||||
$this->errors[$i] = dgettext(
|
||||
$this->errors[$nb] = dgettext(
|
||||
"domframework",
|
||||
"Multiple Hosts set"
|
||||
);
|
||||
@@ -112,15 +113,31 @@ class Robotstxt
|
||||
} elseif ($line === "" || $line[0] === "#") {
|
||||
// Comment, empty line : SKIP
|
||||
} elseif (stripos($line, "allow:") === 0) {
|
||||
$this->allow[] = $this->getValueFromLine($line);
|
||||
$allow = $this->getValueFromLine($line);
|
||||
if (trim($allow) === "" || $allow[0] === "/") {
|
||||
$this->allow[] = $allow;
|
||||
} else {
|
||||
$this->errors[$nb] = dgettext(
|
||||
"domframework",
|
||||
"Allow : Line must start by slash"
|
||||
);
|
||||
}
|
||||
} elseif (stripos($line, "disallow:") === 0) {
|
||||
$this->disallow[] = $this->getValueFromLine($line);
|
||||
$disallow = $this->getValueFromLine($line);
|
||||
if (trim($disallow) === "" || $disallow[0] === "/") {
|
||||
$this->disallow[] = $disallow;
|
||||
} else {
|
||||
$this->errors[$nb] = dgettext(
|
||||
"domframework",
|
||||
"Disallow : Line must start by slash"
|
||||
);
|
||||
}
|
||||
} elseif (stripos($line, "crawl-delay:") === 0) {
|
||||
$val = $this->getValueFromLine($line);
|
||||
if ($val > 1 && $val < 60 && $this->crawldelay === null) {
|
||||
$this->crawldelay = intval($val);
|
||||
} else {
|
||||
$this->errors[$i] = dgettext(
|
||||
$this->errors[$nb] = dgettext(
|
||||
"domframework",
|
||||
"Crawldelay : value out of range (1-60)"
|
||||
);
|
||||
@@ -134,7 +151,7 @@ class Robotstxt
|
||||
}
|
||||
} else {
|
||||
// Not managed line : error
|
||||
$this->errors[$i] = sprintf(dgettext(
|
||||
$this->errors[$nb] = sprintf(dgettext(
|
||||
"domframework",
|
||||
"Invalid line : unknown command : '%s'"
|
||||
), $line);
|
||||
|
||||
Reference in New Issue
Block a user