From e1f1ddaa24b0db59e59767d012b1718f9fee0ece Mon Sep 17 00:00:00 2001 From: Dominique Fournier Date: Mon, 11 May 2020 18:26:01 +0000 Subject: [PATCH] robotstxt : catch the errors with the associated lines. robotstxt : more unit tests git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5986 bf3deb0d-5f1a-0410-827f-c0cc1f45334c --- Tests/robotstxtTest.php | 23 ++++++++++++ robotstxt.php | 80 +++++++++++++++++++++++++++++------------ 2 files changed, 81 insertions(+), 22 deletions(-) diff --git a/Tests/robotstxtTest.php b/Tests/robotstxtTest.php index f3efe82..1b8c826 100644 --- a/Tests/robotstxtTest.php +++ b/Tests/robotstxtTest.php @@ -119,6 +119,15 @@ class robotstxtTest extends PHPUnit_Framework_TestCase ["http://example.com/sitemap.xml", "http://example.com/SITEMAP.XML"]); } + public function test_sitemaps_error_1 () + { + $robotstxt = new robotstxt ( + "User-Agent: *\nDisallow: /\nSitemap: URL", + "domsearch"); + $res = $robotstxt->errors (); + $this->assertSame ($res, [2 => "Sitemap : Invalid URL provided"]); + } + // Host public function test_host_1 () { @@ -134,6 +143,20 @@ class robotstxtTest extends PHPUnit_Framework_TestCase $res = $robotstxt->host (); $this->assertSame ($res, "localhost"); } + public function test_host_error_1 () + { + $robotstxt = new robotstxt ( + "User-Agent: *\nDisallow: /\n\nHost: localhost\nHoST: toto", "domsearch"); + $res = $robotstxt->host (); + $this->assertSame ($res, "localhost"); + } + public function test_host_error_2 () + { + $robotstxt = new robotstxt ( + "User-Agent: *\nDisallow: /\n\nHost: localhost\nHoST: toto", "domsearch"); + $res = $robotstxt->errors (); + $this->assertSame ($res, [4 => "Multiple Hosts set"]); + } // URLAllow public function test_urlallow_1 () diff --git a/robotstxt.php b/robotstxt.php index cadcd13..b1dcfcf 100644 --- a/robotstxt.php +++ b/robotstxt.php @@ -39,6 +39,11 @@ class robotstxt /** The rule number matching the URLAllow rule */ private $matchRule = null; + + /** List the errors read on 
the file content. The key is the line where the + * error was found + */ + private $errors = array (); // }}} // METHODS @@ -74,45 +79,65 @@ class robotstxt for ($i = key ($keys) ; $i < count ($content) ; $i++) { $line = trim ($content[$i]); - if (strtolower (substr ($line, 0, 6)) === "allow:") + if (stripos ($line, "Sitemap:") === 0) + { + $url = $this->getValueFromLine ($line); + if (!! filter_var ($url, FILTER_VALIDATE_URL) && + (substr ($url, 0, 7) === "http://" || + substr ($url, 0, 8) === "https://")) + $this->sitemaps[] = $url; + else + $this->errors[$i] = dgettext ("domframework", + "Sitemap : Invalid URL provided"); + } + elseif (stripos ($line, "Host:") === 0) + { + if ($this->host !== null) + $this->errors[$i] = dgettext ("domframework", + "Multiple Hosts set"); + else + $this->host = $this->getValueFromLine ($line); + } + elseif ($line === "" || $line[0] === "#") + { + // Comment, empty line : SKIP + } + elseif (stripos ($line, "allow:") === 0) { $this->allow[] = $this->getValueFromLine ($line); } - elseif (strtolower (substr ($line, 0, 9)) === "disallow:") + elseif (stripos ($line, "disallow:") === 0) { $this->disallow[] = $this->getValueFromLine ($line); } - elseif (strtolower (substr ($line, 0, 12)) === "crawl-delay:") + elseif (stripos ($line, "crawl-delay:") === 0) { $val = $this->getValueFromLine ($line); if ($val > 1 && $val < 60 && $this->crawldelay === 3) $this->crawldelay = intval ($val); + else + $this->errors[$i] = dgettext ("domframework", + "Crawldelay : value out of range (1-60)"); } - elseif (strtolower (substr ($line, 0, 11)) === "user-agent:" && - (!empty ($this->allow) || ! empty ($this->disallow))) + elseif (stripos ($line, "user-agent:") === 0) { - // New user-agent line after valid allow/disallow : end of paragraph - break; + if (! empty ($this->allow) || ! empty ($this->disallow)) + { + // New user-agent line after valid allow/disallow : end of paragraph + break; + } + else + { + // New user-agent. 
Do nothing + } + } } else { - // Comment, empty line, sitemap, host, not managed line... : SKIP + // Not managed line : error + $this->errors[$i] = sprintf (dgettext ("domframework", + "Invalid line : unknown command : '%s'"), $line); } } - $keys = preg_grep ("#^\s*Sitemap:\s*(?P\S+)\s*(\#)?#i", $content); - foreach ($keys as $line) - { - $url = $this->getValueFromLine ($line); - if (!! filter_var ($url, FILTER_VALIDATE_URL)) - $this->sitemaps[] = $url; - } - $keys = preg_grep ("#^\s*Host:\s*(?P\S+)\s*(\#)?#i", $content); - foreach ($keys as $line) - { - // Takes only the first one - $this->host = $this->getValueFromLine ($line); - break; - } if (! in_array ("/", $this->disallow) && ! in_array ("/", $this->allow)) $this->allow[] = "/"; @@ -190,6 +215,17 @@ class robotstxt // }}} // GETTERS + /** Return the lines where an error occurred + * The key of the array is the number of the line containing the error + * @return array The errors + */ + public function errors () + // {{{ + { + return $this->errors; + } + // }}} + /** Return the allowed urls * @return array $allow The array of allow rules */