Robots: values for Allow and Disallow must be empty or start with a slash

This commit is contained in:
2023-01-03 13:20:21 +01:00
parent a470048b4f
commit 535525db1d
2 changed files with 139 additions and 77 deletions

View File

@@ -13,6 +13,7 @@ namespace Domframework;
* It allows examining a URL against the robots.txt file and returns whether
* the URL is allowed to be used or not
* The definition of the format of robots.txt file is available here :
* https://www.rfc-editor.org/rfc/rfc9309.txt
* http://www.robotstxt.org/norobots-rfc.txt
* https://en.wikipedia.org/wiki/Robots_exclusion_standard
*/
@@ -94,15 +95,15 @@ class Robotstxt
// Get the Allow and Disallow lines. Parsing stops at the first
// User-Agent line that follows an Allow/Disallow line.
// Comments and empty lines are skipped
for ($i = key($keys); $i < count($content); $i++) {
$line = trim($content[$i]);
for ($nb = key($keys); $nb < count($content); $nb++) {
$line = trim($content[$nb]);
if (stripos($line, "Sitemap:") === 0) {
// Already managed in the general parser. Not needed in the specific
// user-agent parser. It must still be caught here so that it does not
// generate an error
} elseif (stripos($line, "Host:") === 0) {
if ($this->host !== null) {
$this->errors[$i] = dgettext(
$this->errors[$nb] = dgettext(
"domframework",
"Multiple Hosts set"
);
@@ -112,15 +113,31 @@ class Robotstxt
} elseif ($line === "" || $line[0] === "#") {
// Comment, empty line : SKIP
} elseif (stripos($line, "allow:") === 0) {
$this->allow[] = $this->getValueFromLine($line);
$allow = $this->getValueFromLine($line);
if (trim($allow) === "" || $allow[0] === "/") {
$this->allow[] = $allow;
} else {
$this->errors[$nb] = dgettext(
"domframework",
"Allow : Line must start by slash"
);
}
} elseif (stripos($line, "disallow:") === 0) {
$this->disallow[] = $this->getValueFromLine($line);
$disallow = $this->getValueFromLine($line);
if (trim($disallow) === "" || $disallow[0] === "/") {
$this->disallow[] = $disallow;
} else {
$this->errors[$nb] = dgettext(
"domframework",
"Disallow : Line must start by slash"
);
}
} elseif (stripos($line, "crawl-delay:") === 0) {
$val = $this->getValueFromLine($line);
if ($val > 1 && $val < 60 && $this->crawldelay === null) {
$this->crawldelay = intval($val);
} else {
$this->errors[$i] = dgettext(
$this->errors[$nb] = dgettext(
"domframework",
"Crawldelay : value out of range (1-60)"
);
@@ -134,7 +151,7 @@ class Robotstxt
}
} else {
// Not managed line : error
$this->errors[$i] = sprintf(dgettext(
$this->errors[$nb] = sprintf(dgettext(
"domframework",
"Invalid line : unknown command : '%s'"
), $line);