robotstxt : catch the errors with the associated lines.
robotstxt : more unit tests git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5986 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
This commit is contained in:
@@ -119,6 +119,15 @@ class robotstxtTest extends PHPUnit_Framework_TestCase
|
|||||||
["http://example.com/sitemap.xml", "http://example.com/SITEMAP.XML"]);
|
["http://example.com/sitemap.xml", "http://example.com/SITEMAP.XML"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function test_sitemaps_error_1 ()
|
||||||
|
{
|
||||||
|
$robotstxt = new robotstxt (
|
||||||
|
"User-Agent: *\nDisallow: /\nSitemap: URL",
|
||||||
|
"domsearch");
|
||||||
|
$res = $robotstxt->errors ();
|
||||||
|
$this->assertSame ($res, [2 => "Sitemap : Invalid URL provided"]);
|
||||||
|
}
|
||||||
|
|
||||||
// Host
|
// Host
|
||||||
public function test_host_1 ()
|
public function test_host_1 ()
|
||||||
{
|
{
|
||||||
@@ -134,6 +143,20 @@ class robotstxtTest extends PHPUnit_Framework_TestCase
|
|||||||
$res = $robotstxt->host ();
|
$res = $robotstxt->host ();
|
||||||
$this->assertSame ($res, "localhost");
|
$this->assertSame ($res, "localhost");
|
||||||
}
|
}
|
||||||
|
public function test_host_error_1 ()
|
||||||
|
{
|
||||||
|
$robotstxt = new robotstxt (
|
||||||
|
"User-Agent: *\nDisallow: /\n\nHost: localhost\nHoST: toto", "domsearch");
|
||||||
|
$res = $robotstxt->host ();
|
||||||
|
$this->assertSame ($res, "localhost");
|
||||||
|
}
|
||||||
|
public function test_host_error_2 ()
|
||||||
|
{
|
||||||
|
$robotstxt = new robotstxt (
|
||||||
|
"User-Agent: *\nDisallow: /\n\nHost: localhost\nHoST: toto", "domsearch");
|
||||||
|
$res = $robotstxt->errors ();
|
||||||
|
$this->assertSame ($res, [4 => "Multiple Hosts set"]);
|
||||||
|
}
|
||||||
|
|
||||||
// URLAllow
|
// URLAllow
|
||||||
public function test_urlallow_1 ()
|
public function test_urlallow_1 ()
|
||||||
|
|||||||
@@ -39,6 +39,11 @@ class robotstxt
|
|||||||
/** The rule number matching the URLAllow rule
|
/** The rule number matching the URLAllow rule
|
||||||
*/
|
*/
|
||||||
private $matchRule = null;
|
private $matchRule = null;
|
||||||
|
|
||||||
|
/** List of the errors found in the file content. The key is the line where the
|
||||||
|
* error was detected (zero-based line index)
|
||||||
|
*/
|
||||||
|
private $errors = array ();
|
||||||
// }}}
|
// }}}
|
||||||
|
|
||||||
// METHODS
|
// METHODS
|
||||||
@@ -74,45 +79,65 @@ class robotstxt
|
|||||||
for ($i = key ($keys) ; $i < count ($content) ; $i++)
|
for ($i = key ($keys) ; $i < count ($content) ; $i++)
|
||||||
{
|
{
|
||||||
$line = trim ($content[$i]);
|
$line = trim ($content[$i]);
|
||||||
if (strtolower (substr ($line, 0, 6)) === "allow:")
|
if (stripos ($line, "Sitemap:") === 0)
|
||||||
|
{
|
||||||
|
$url = $this->getValueFromLine ($line);
|
||||||
|
if (!! filter_var ($url, FILTER_VALIDATE_URL) &&
|
||||||
|
(substr ($url, 0, 7) === "http://" ||
|
||||||
|
substr ($url, 0, 8) === "https://"))
|
||||||
|
$this->sitemaps[] = $url;
|
||||||
|
else
|
||||||
|
$this->errors[$i] = dgettext ("domframework",
|
||||||
|
"Sitemap : Invalid URL provided");
|
||||||
|
}
|
||||||
|
elseif (stripos ($line, "Host:") === 0)
|
||||||
|
{
|
||||||
|
if ($this->host !== null)
|
||||||
|
$this->errors[$i] = dgettext ("domframework",
|
||||||
|
"Multiple Hosts set");
|
||||||
|
else
|
||||||
|
$this->host = $this->getValueFromLine ($line);
|
||||||
|
}
|
||||||
|
elseif ($line === "" || $line[0] === "#")
|
||||||
|
{
|
||||||
|
// Comment, empty line : SKIP
|
||||||
|
}
|
||||||
|
elseif (stripos ($line, "allow:") === 0)
|
||||||
{
|
{
|
||||||
$this->allow[] = $this->getValueFromLine ($line);
|
$this->allow[] = $this->getValueFromLine ($line);
|
||||||
}
|
}
|
||||||
elseif (strtolower (substr ($line, 0, 9)) === "disallow:")
|
elseif (stripos ($line, "disallow:") === 0)
|
||||||
{
|
{
|
||||||
$this->disallow[] = $this->getValueFromLine ($line);
|
$this->disallow[] = $this->getValueFromLine ($line);
|
||||||
}
|
}
|
||||||
elseif (strtolower (substr ($line, 0, 12)) === "crawl-delay:")
|
elseif (stripos ($line, "crawl-delay:") === 0)
|
||||||
{
|
{
|
||||||
$val = $this->getValueFromLine ($line);
|
$val = $this->getValueFromLine ($line);
|
||||||
if ($val > 1 && $val < 60 && $this->crawldelay === 3)
|
if ($val > 1 && $val < 60 && $this->crawldelay === 3)
|
||||||
$this->crawldelay = intval ($val);
|
$this->crawldelay = intval ($val);
|
||||||
|
else
|
||||||
|
$this->errors[$i] = dgettext ("domframework",
|
||||||
|
"Crawldelay : value out of range (1-60)");
|
||||||
}
|
}
|
||||||
elseif (strtolower (substr ($line, 0, 11)) === "user-agent:" &&
|
elseif (stripos ($line, "user-agent:") === 0)
|
||||||
(!empty ($this->allow) || ! empty ($this->disallow)))
|
|
||||||
{
|
{
|
||||||
// New user-agent line after valid allow/disallow : end of paragraph
|
if (! empty ($this->allow) || ! empty ($this->disallow))
|
||||||
break;
|
{
|
||||||
|
// New user-agent line after valid allow/disallow : end of paragraph
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// New user-agent. Do nothing
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Comment, empty line, sitemap, host, not managed line... : SKIP
|
// Unrecognized line : record an error
|
||||||
|
$this->errors[$i] = sprintf (dgettext ("domframework",
|
||||||
|
"Invalid line : unknown command : '%s'"), $line);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
$keys = preg_grep ("#^\s*Sitemap:\s*(?P<value>\S+)\s*(\#)?#i", $content);
|
|
||||||
foreach ($keys as $line)
|
|
||||||
{
|
|
||||||
$url = $this->getValueFromLine ($line);
|
|
||||||
if (!! filter_var ($url, FILTER_VALIDATE_URL))
|
|
||||||
$this->sitemaps[] = $url;
|
|
||||||
}
|
|
||||||
$keys = preg_grep ("#^\s*Host:\s*(?P<value>\S+)\s*(\#)?#i", $content);
|
|
||||||
foreach ($keys as $line)
|
|
||||||
{
|
|
||||||
// Takes only the first one
|
|
||||||
$this->host = $this->getValueFromLine ($line);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (! in_array ("/", $this->disallow) &&
|
if (! in_array ("/", $this->disallow) &&
|
||||||
! in_array ("/", $this->allow))
|
! in_array ("/", $this->allow))
|
||||||
$this->allow[] = "/";
|
$this->allow[] = "/";
|
||||||
@@ -190,6 +215,17 @@ class robotstxt
|
|||||||
// }}}
|
// }}}
|
||||||
|
|
||||||
// GETTERS
|
// GETTERS
|
||||||
|
/** Return the lines where an error occurred
|
||||||
|
* The key of the array is the zero-based index of the line containing the error
|
||||||
|
* @return array The errors
|
||||||
|
*/
|
||||||
|
public function errors ()
|
||||||
|
// {{{
|
||||||
|
{
|
||||||
|
return $this->errors;
|
||||||
|
}
|
||||||
|
// }}}
|
||||||
|
|
||||||
/** Return the allowed urls
|
/** Return the allowed urls
|
||||||
* @return array $allow The array of allow rules
|
* @return array $allow The array of allow rules
|
||||||
*/
|
*/
|
||||||
|
|||||||
Reference in New Issue
Block a user