RobotsTXT : Manage the Sitemaps correctly

2021-05-06 13:42:41 +02:00
parent 5ac09bb36b
commit 5748fe7ae8


@@ -74,6 +74,20 @@ class robotstxt
$this->allow = array ("/");
return $this;
}
// Sitemap directives are global: they are not restricted to a User-agent
foreach (preg_grep ("~Sitemap:\s+~i", $content) as $nb => $line)
{
$url = $this->getValueFromLine ($line);
if (!! filter_var ($url, FILTER_VALIDATE_URL) &&
(substr ($url, 0, 7) === "http://" ||
substr ($url, 0, 8) === "https://"))
$this->sitemaps[] = $url;
else
$this->errors[$nb] = dgettext ("domframework",
"Sitemap : Invalid URL provided");
}
// Get the Allow and Disallow lines. Parsing stops at the first
// User-Agent line that appears after an Allow/Disallow.
// Comments and empty lines are removed
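
To illustrate the logic added in the hunk above, here is a minimal standalone sketch of the same Sitemap extraction and validation. The function name extractSitemaps () is hypothetical and not part of the domframework robotstxt class, and the error messages are plain strings instead of dgettext () calls.

<?php
// Illustrative sketch only: standalone rewrite of the validation added in
// the hunk above. extractSitemaps () is a hypothetical name, not the
// domframework API.
function extractSitemaps (array $content)
{
  $sitemaps = array ();
  $errors = array ();
  // Sitemap directives apply to the whole file, whatever the User-agent
  foreach (preg_grep ("~Sitemap:\s+~i", $content) as $nb => $line)
  {
    // Keep everything after the first ":" and trim the surrounding spaces
    $url = trim (substr ($line, strpos ($line, ":") + 1));
    // Only absolute http:// or https:// URLs are accepted
    if (!! filter_var ($url, FILTER_VALIDATE_URL) &&
        (substr ($url, 0, 7) === "http://" ||
         substr ($url, 0, 8) === "https://"))
      $sitemaps[] = $url;
    else
      $errors[$nb] = "Sitemap : Invalid URL provided";
  }
  return array ($sitemaps, $errors);
}
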
@@ -82,14 +96,9 @@ class robotstxt
$line = trim ($content[$i]);
if (stripos ($line, "Sitemap:") === 0)
{
$url = $this->getValueFromLine ($line);
if (!! filter_var ($url, FILTER_VALIDATE_URL) &&
(substr ($url, 0, 7) === "http://" ||
substr ($url, 0, 8) === "https://"))
$this->sitemaps[] = $url;
else
$this->errors[$i] = dgettext ("domframework",
"Sitemap : Invalid URL provided");
// Already managed in the general parser. Not needed in the
// user-agent-specific parser. Must at least be caught here so it
// does not generate an error
}
elseif (stripos ($line, "Host:") === 0)
{
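
A hypothetical usage of the sketch above, showing how a non-http(s) Sitemap URL is rejected while valid ones are collected regardless of the User-agent sections:

// Assumes the extractSitemaps () sketch above is already defined
$content = array (
  "User-agent: *",
  "Disallow: /private/",
  "Sitemap: https://example.com/sitemap.xml",
  "Sitemap: ftp://example.com/sitemap.xml",
);
list ($sitemaps, $errors) = extractSitemaps ($content);
// $sitemaps => array ("https://example.com/sitemap.xml")
// $errors   => array (3 => "Sitemap : Invalid URL provided")
//   (the ftp:// URL passes FILTER_VALIDATE_URL but fails the scheme check)
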