robotstxt : catch the errors with the associated lines.

robotstxt : more unit tests


git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5986 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
This commit is contained in:
2020-05-11 18:26:01 +00:00
parent 8f302cfede
commit e1f1ddaa24
2 changed files with 81 additions and 22 deletions

View File

@@ -119,6 +119,15 @@ class robotstxtTest extends PHPUnit_Framework_TestCase
["http://example.com/sitemap.xml", "http://example.com/SITEMAP.XML"]);
}
/** A Sitemap line whose value is not a valid URL must be reported by
 * errors (), keyed by the index of the offending line (here the third
 * line, index 2)
 */
public function test_sitemaps_error_1 ()
{
  $robotstxt = new robotstxt (
    "User-Agent: *\nDisallow: /\nSitemap: URL",
    "domsearch");
  $res = $robotstxt->errors ();
  // PHPUnit signature is assertSame ($expected, $actual) : keep this order
  // so the failure report does not invert the expected and actual values
  $this->assertSame ([2 => "Sitemap : Invalid URL provided"], $res);
}
// Host
public function test_host_1 ()
{
@@ -134,6 +143,20 @@ class robotstxtTest extends PHPUnit_Framework_TestCase
$res = $robotstxt->host ();
$this->assertSame ($res, "localhost");
}
/** When several Host lines are provided (whatever the case of the keyword),
 * host () must return the value of the first one
 */
public function test_host_error_1 ()
{
  $robotstxt = new robotstxt (
    "User-Agent: *\nDisallow: /\n\nHost: localhost\nHoST: toto", "domsearch");
  $res = $robotstxt->host ();
  // PHPUnit signature is assertSame ($expected, $actual) : keep this order
  // so the failure report does not invert the expected and actual values
  $this->assertSame ("localhost", $res);
}
/** When several Host lines are provided, errors () must report the second
 * one, keyed by the index of the offending line (here the fifth line,
 * index 4)
 */
public function test_host_error_2 ()
{
  $robotstxt = new robotstxt (
    "User-Agent: *\nDisallow: /\n\nHost: localhost\nHoST: toto", "domsearch");
  $res = $robotstxt->errors ();
  // PHPUnit signature is assertSame ($expected, $actual) : keep this order
  // so the failure report does not invert the expected and actual values
  $this->assertSame ([4 => "Multiple Hosts set"], $res);
}
// URLAllow
public function test_urlallow_1 ()

View File

@@ -39,6 +39,11 @@ class robotstxt
/** The rule number matching the URLAllow rule
*/
private $matchRule = null;
/** List of the errors found while reading the file content. The key is the
 * index of the line on which the error was detected, the value is the
 * error message
 */
private $errors = array ();
// }}}
// METHODS
@@ -74,45 +79,65 @@ class robotstxt
for ($i = key ($keys) ; $i < count ($content) ; $i++)
{
$line = trim ($content[$i]);
if (strtolower (substr ($line, 0, 6)) === "allow:")
if (stripos ($line, "Sitemap:") === 0)
{
$url = $this->getValueFromLine ($line);
if (!! filter_var ($url, FILTER_VALIDATE_URL) &&
(substr ($url, 0, 7) === "http://" ||
substr ($url, 0, 8) === "https://"))
$this->sitemaps[] = $url;
else
$this->errors[$i] = dgettext ("domframework",
"Sitemap : Invalid URL provided");
}
elseif (stripos ($line, "Host:") === 0)
{
if ($this->host !== null)
$this->errors[$i] = dgettext ("domframework",
"Multiple Hosts set");
else
$this->host = $this->getValueFromLine ($line);
}
elseif ($line === "" || $line[0] === "#")
{
// Comment, empty line : SKIP
}
elseif (stripos ($line, "allow:") === 0)
{
$this->allow[] = $this->getValueFromLine ($line);
}
elseif (strtolower (substr ($line, 0, 9)) === "disallow:")
elseif (stripos ($line, "disallow:") === 0)
{
$this->disallow[] = $this->getValueFromLine ($line);
}
elseif (strtolower (substr ($line, 0, 12)) === "crawl-delay:")
elseif (stripos ($line, "crawl-delay:") === 0)
{
$val = $this->getValueFromLine ($line);
if ($val > 1 && $val < 60 && $this->crawldelay === 3)
$this->crawldelay = intval ($val);
else
$this->errors[$i] = dgettext ("domframework",
"Crawldelay : value out of range (1-60)");
}
elseif (strtolower (substr ($line, 0, 11)) === "user-agent:" &&
(!empty ($this->allow) || ! empty ($this->disallow)))
elseif (stripos ($line, "user-agent:") === 0)
{
// New user-agent line after valid allow/disallow : end of paragraph
break;
if (! empty ($this->allow) || ! empty ($this->disallow))
{
// New user-agent line after valid allow/disallow : end of paragraph
break;
}
else
{
// New user-agent. Do nothing
}
}
else
{
// Comment, empty line, sitemap, host, not managed line... : SKIP
// Not managed line : error
$this->errors[$i] = sprintf (dgettext ("domframework",
"Invalid line : unknown command : '%s'"), $line);
}
}
$keys = preg_grep ("#^\s*Sitemap:\s*(?P<value>\S+)\s*(\#)?#i", $content);
foreach ($keys as $line)
{
$url = $this->getValueFromLine ($line);
if (!! filter_var ($url, FILTER_VALIDATE_URL))
$this->sitemaps[] = $url;
}
$keys = preg_grep ("#^\s*Host:\s*(?P<value>\S+)\s*(\#)?#i", $content);
foreach ($keys as $line)
{
// Takes only the first one
$this->host = $this->getValueFromLine ($line);
break;
}
if (! in_array ("/", $this->disallow) &&
! in_array ("/", $this->allow))
$this->allow[] = "/";
@@ -190,6 +215,17 @@ class robotstxt
// }}}
// GETTERS
/** Return the errors recorded for the file content
 * The key of the array is the index of the line holding the error, the
 * value is the error message
 * @return array The errors
 */
public function errors ()
// {{{
{
return $this->errors;
}
// }}}
/** Return the allowed urls
* @return array $allow The array of allow rules
*/