robotstxt : catch the errors with the associated lines.
robotstxt : more unit tests git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5986 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
This commit is contained in:
@@ -119,6 +119,15 @@ class robotstxtTest extends PHPUnit_Framework_TestCase
|
||||
["http://example.com/sitemap.xml", "http://example.com/SITEMAP.XML"]);
|
||||
}
|
||||
|
||||
/** A Sitemap line whose value is not a valid URL must be reported by
 * errors (), keyed by the zero-based index of the offending line
 */
public function test_sitemaps_error_1 ()
{
  $robotstxt = new robotstxt (
    "User-Agent: *\nDisallow: /\nSitemap: URL",
    "domsearch");
  $res = $robotstxt->errors ();
  // assertSame takes the expected value first, then the actual one, so a
  // failure message reports them the right way round
  $this->assertSame ([2 => "Sitemap : Invalid URL provided"], $res);
}
|
||||
|
||||
// Host
|
||||
public function test_host_1 ()
|
||||
{
|
||||
@@ -134,6 +143,20 @@ class robotstxtTest extends PHPUnit_Framework_TestCase
|
||||
$res = $robotstxt->host ();
|
||||
$this->assertSame ($res, "localhost");
|
||||
}
|
||||
/** When several Host lines are provided, host () must keep the value of
 * the first one and ignore the following ones
 */
public function test_host_error_1 ()
{
  $robotstxt = new robotstxt (
    "User-Agent: *\nDisallow: /\n\nHost: localhost\nHoST: toto", "domsearch");
  $res = $robotstxt->host ();
  // assertSame takes the expected value first, then the actual one, so a
  // failure message reports them the right way round
  $this->assertSame ("localhost", $res);
}
|
||||
/** When several Host lines are provided, the second one must be reported
 * by errors (), keyed by the zero-based index of that line (here 4)
 */
public function test_host_error_2 ()
{
  $robotstxt = new robotstxt (
    "User-Agent: *\nDisallow: /\n\nHost: localhost\nHoST: toto", "domsearch");
  $res = $robotstxt->errors ();
  // assertSame takes the expected value first, then the actual one, so a
  // failure message reports them the right way round
  $this->assertSame ([4 => "Multiple Hosts set"], $res);
}
|
||||
|
||||
// URLAllow
|
||||
public function test_urlallow_1 ()
|
||||
|
||||
@@ -39,6 +39,11 @@ class robotstxt
|
||||
/** The rule number matching the URLAllow rule
|
||||
*/
|
||||
private $matchRule = null;
|
||||
|
||||
/** List the errors read on the file content. The key is the line where the
|
||||
* error was detected
|
||||
*/
|
||||
private $errors = array ();
|
||||
// }}}
|
||||
|
||||
// METHODS
|
||||
@@ -74,44 +79,64 @@ class robotstxt
|
||||
for ($i = key ($keys) ; $i < count ($content) ; $i++)
|
||||
{
|
||||
$line = trim ($content[$i]);
|
||||
if (strtolower (substr ($line, 0, 6)) === "allow:")
|
||||
if (stripos ($line, "Sitemap:") === 0)
|
||||
{
|
||||
$url = $this->getValueFromLine ($line);
|
||||
if (!! filter_var ($url, FILTER_VALIDATE_URL) &&
|
||||
(substr ($url, 0, 7) === "http://" ||
|
||||
substr ($url, 0, 8) === "https://"))
|
||||
$this->sitemaps[] = $url;
|
||||
else
|
||||
$this->errors[$i] = dgettext ("domframework",
|
||||
"Sitemap : Invalid URL provided");
|
||||
}
|
||||
elseif (stripos ($line, "Host:") === 0)
|
||||
{
|
||||
if ($this->host !== null)
|
||||
$this->errors[$i] = dgettext ("domframework",
|
||||
"Multiple Hosts set");
|
||||
else
|
||||
$this->host = $this->getValueFromLine ($line);
|
||||
}
|
||||
elseif ($line === "" || $line[0] === "#")
|
||||
{
|
||||
// Comment, empty line : SKIP
|
||||
}
|
||||
elseif (stripos ($line, "allow:") === 0)
|
||||
{
|
||||
$this->allow[] = $this->getValueFromLine ($line);
|
||||
}
|
||||
elseif (strtolower (substr ($line, 0, 9)) === "disallow:")
|
||||
elseif (stripos ($line, "disallow:") === 0)
|
||||
{
|
||||
$this->disallow[] = $this->getValueFromLine ($line);
|
||||
}
|
||||
elseif (strtolower (substr ($line, 0, 12)) === "crawl-delay:")
|
||||
elseif (stripos ($line, "crawl-delay:") === 0)
|
||||
{
|
||||
$val = $this->getValueFromLine ($line);
|
||||
if ($val > 1 && $val < 60 && $this->crawldelay === 3)
|
||||
$this->crawldelay = intval ($val);
|
||||
else
|
||||
$this->errors[$i] = dgettext ("domframework",
|
||||
"Crawldelay : value out of range (1-60)");
|
||||
}
|
||||
elseif (strtolower (substr ($line, 0, 11)) === "user-agent:" &&
|
||||
(!empty ($this->allow) || ! empty ($this->disallow)))
|
||||
elseif (stripos ($line, "user-agent:") === 0)
|
||||
{
|
||||
if (! empty ($this->allow) || ! empty ($this->disallow))
|
||||
{
|
||||
// New user-agent line after valid allow/disallow : end of paragraph
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Comment, empty line, sitemap, host, not managed line... : SKIP
|
||||
// New user-agent. Do nothing
|
||||
}
|
||||
}
|
||||
$keys = preg_grep ("#^\s*Sitemap:\s*(?P<value>\S+)\s*(\#)?#i", $content);
|
||||
foreach ($keys as $line)
|
||||
else
|
||||
{
|
||||
$url = $this->getValueFromLine ($line);
|
||||
if (!! filter_var ($url, FILTER_VALIDATE_URL))
|
||||
$this->sitemaps[] = $url;
|
||||
// Not managed line : error
|
||||
$this->errors[$i] = sprintf (dgettext ("domframework",
|
||||
"Invalid line : unknown command : '%s'"), $line);
|
||||
}
|
||||
$keys = preg_grep ("#^\s*Host:\s*(?P<value>\S+)\s*(\#)?#i", $content);
|
||||
foreach ($keys as $line)
|
||||
{
|
||||
// Takes only the first one
|
||||
$this->host = $this->getValueFromLine ($line);
|
||||
break;
|
||||
}
|
||||
if (! in_array ("/", $this->disallow) &&
|
||||
! in_array ("/", $this->allow))
|
||||
@@ -190,6 +215,17 @@ class robotstxt
|
||||
// }}}
|
||||
|
||||
// GETTERS
|
||||
/** Give back every error recorded while the robots.txt content was parsed
 * Each entry is keyed by the zero-based index of the offending line and
 * holds the matching error message
 * @return array The recorded errors (empty array when none was found)
 */
public function errors ()
// {{{
{
  return $this->errors;
}
|
||||
// }}}
|
||||
|
||||
/** Return the allowed urls
|
||||
* @return array $allow The array of allow rules
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user