robotstxt: rewrite the whole parser with the logic defined in the RFC

git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5342 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
This commit is contained in:
2019-06-11 09:08:53 +00:00
parent 2cfe3f4d17
commit ded1628c1f
2 changed files with 138 additions and 95 deletions

View File

@@ -8,6 +8,9 @@
* get the configured data for DomSearch.
* It allows examining a URL against the robots.txt file and returns whether
* the URL is allowed to be used or not
* The definition of the format of robots.txt file is available here :
* http://www.robotstxt.org/norobots-rfc.txt
* https://en.wikipedia.org/wiki/Robots_exclusion_standard
*/
class robotstxt
{
@@ -55,54 +58,64 @@ class robotstxt
$this->allow = array ("/");
return $this;
}
$crawler = "";
$blocks = explode ("\n\n", $content);
foreach ($blocks as $block)
// Look if a User-agent entry is available for the defined crawlerName.
// If not, check if a User-agent entry exists for *
// If not, accept all
$content = preg_split('/\r\n|\r|\n/', $content);
$keys = preg_grep ("~User-agent:\s*$this->crawlerName\s*#?~i", $content);
if (empty ($keys))
$keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content);
if (empty ($keys))
{
preg_match_all ("#User-agent:\s+(.+)\$#mi", $block, $useragents);
if (!isset ($useragents[1]))
continue;
if (! in_array ("*", $useragents[1]) &&
! in_array ("DomSearch", $useragents[1]))
continue;
if (in_array ("*", $useragents[1]))
{
// Already set
if ($crawler == "DomSearch")
continue;
$crawler = "*";
$this->allow = array ();
$this->disallow = array ();
$this->crawldelay = 3;
}
if (in_array ("DomSearch", $useragents[1]))
{
// If the information for DomSearch where already seen, skip the second
// crawler information
if ($crawler == "DomSearch")
continue;
$crawler = "DomSearch";
$this->allow = array ();
$this->disallow = array ();
$this->crawldelay = 3;
}
preg_match_all ("#^Allow:\s*(\S*)\s*\$#mUi", $block, $allows);
preg_match_all ("#^Disallow:\s*(\S*)\s*\$#mUi", $block, $disallows);
preg_match_all ("#^Crawl-delay:\s*(\d+)\s*#mi", $block, $crawldelay);
if (isset ($allows[1]))
$this->allow = $allows[1];
if (isset ($disallows[1]))
$this->disallow = $disallows[1];
if (isset ($crawldelay[1][0]) && $crawldelay[1][0] > 0)
$this->crawldelay = intval ($crawldelay[1][0]);
// No User-agent with crawlerName nor * : accept all
$this->allow = array ("/");
return $this;
}
// Get the Allow and Disallow lines. The stop will arrive on first
// User-Agent line arriving after a Allow/Disallow.
// Comments and empty lines are removed
for ($i = key ($keys) ; $i < count ($content) ; $i++)
{
$line = trim ($content[$i]);
if (strtolower (substr ($line, 0, 6)) === "allow:")
{
$this->allow[] = $this->getValueFromLine ($line);
}
elseif (strtolower (substr ($line, 0, 9)) === "disallow:")
{
$this->disallow[] = $this->getValueFromLine ($line);
}
elseif (strtolower (substr ($line, 0, 12)) === "crawl-delay:")
{
$val = $this->getValueFromLine ($line);
if ($val > 1 && $val < 60 && $this->crawldelay === 3)
$this->crawldelay = intval ($val);
}
elseif (strtolower (substr ($line, 0, 11)) === "user-agent:" &&
(!empty ($this->allow) || ! empty ($this->disallow)))
{
// New user-agent line after valid allow/disallow : end of paragraph
break;
}
else
{
// Comment, empty line, sitemap, host, not managed line... : SKIP
}
}
$keys = preg_grep ("#^\s*Sitemap:\s*(?P<value>\S+)\s*(\#)?#i", $content);
foreach ($keys as $line)
{
$url = $this->getValueFromLine ($line);
if (!! filter_var ($url, FILTER_VALIDATE_URL))
$this->sitemaps[] = $url;
}
$keys = preg_grep ("#^\s*Host:\s*(?P<value>\S+)\s*(\#)?#i", $content);
foreach ($keys as $line)
{
// Takes only the first one
$this->host = $this->getValueFromLine ($line);
break;
}
preg_match_all ("#^Sitemap:\s*(\S+)\s*\$#mi", $content, $sitemaps);
if (isset ($sitemaps[1][0]))
$this->sitemaps = $sitemaps[1];
preg_match_all ("#^Host:\s*(\S+)\s*\$#mi", $content, $host);
if (isset ($host[1][0]))
$this->host = $host[1][0];
if (! in_array ("/", $this->disallow) &&
! in_array ("/", $this->allow))
$this->allow[] = "/";
@@ -236,4 +249,19 @@ class robotstxt
return $this->matchRule;
}
// }}}
// PRIVATE METHODS
/** Get a line from a robots.txt file and return the associated value.
 * Handles an eventual trailing comment on the line.
 * @param string $line the complete line from robots.txt file
 * @return string the value recorded on the line, or an empty string when
 *                the line does not match the "Field: value" format
 */
private function getValueFromLine ($line)
// {{{
{
  // Only the first (and single) match on the line is needed, so preg_match
  // is enough. Guard against malformed lines so that $matches["value"] is
  // never an undefined index (the previous preg_match_all version raised a
  // notice on non-matching lines).
  if (! preg_match ("#^(?P<field>\S+):\s*(?P<value>\S*)\s*".
                    "(\#\s*(?P<comment>.+)\s*)?\$#", $line, $matches))
    return "";
  return $matches["value"];
}
// }}}
}