robotstxt: Add support for understanding robots.txt files
git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5339 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
241
robotstxt.php
Normal file
@@ -0,0 +1,241 @@
<?php
/** DomSearch
 * @package domsearch
 * @author Dominique Fournier <dominique@fournier38.fr>
 */

namespace models;

/** This class analyzes the provided robots.txt file content and allows to
 * get the configured data for DomSearch.
 * It allows examining a URL against the robots.txt rules and returns whether
 * the URL is allowed to be used or not.
 */
class robots
{
  // PROPERTIES
  // {{{
  /** The crawler name wanted in robots.txt
   */
  private $crawlerName = "DomSearch";

  /** The allowed URLs
   */
  private $allow = array ();

  /** The disallowed URLs
   */
  private $disallow = array ();

  /** The sitemap URLs defined in the file
   */
  private $sitemaps = array ();

  /** The crawl delay defined in the file (3s if not defined)
   */
  private $crawldelay = 3;

  /** The host which can be specified as the default website
   */
  private $host = null;

  /** The rule matching the last URLAllow test
   */
  private $matchRule = null;
  // }}}
  // METHODS
  /** Get the robots.txt file content and do the analysis
   * @param string $content The robots.txt file content to analyze
   * @return $this
   */
  public function __construct ($content)
  // {{{
  {
    if (trim ($content) === "")
    {
      $this->allow = array ("/");
      return $this;
    }
    $crawler = "";
    $blocks = explode ("\n\n", $content);
    foreach ($blocks as $block)
    {
      preg_match_all ("#User-agent:\s+(.+)\$#mi", $block, $useragents);
      if (! isset ($useragents[1]))
        continue;
      if (! in_array ("*", $useragents[1]) &&
          ! in_array ("DomSearch", $useragents[1]))
        continue;
      if (in_array ("*", $useragents[1]))
      {
        // The DomSearch specific rules were already set: keep them
        if ($crawler == "DomSearch")
          continue;
        $crawler = "*";
        $this->allow = array ();
        $this->disallow = array ();
        $this->crawldelay = 3;
      }
      if (in_array ("DomSearch", $useragents[1]))
      {
        // If the information for DomSearch was already seen, skip the second
        // crawler information
        if ($crawler == "DomSearch")
          continue;
        $crawler = "DomSearch";
        $this->allow = array ();
        $this->disallow = array ();
        $this->crawldelay = 3;
      }
      preg_match_all ("#^Allow:\s*(\S*)\s*\$#mUi", $block, $allows);
      preg_match_all ("#^Disallow:\s*(\S*)\s*\$#mUi", $block, $disallows);
      preg_match_all ("#^Crawl-delay:\s*(\d+)\s*#mi", $block, $crawldelay);
      if (isset ($allows[1]))
        $this->allow = $allows[1];
      if (isset ($disallows[1]))
        $this->disallow = $disallows[1];
      if (isset ($crawldelay[1][0]) && $crawldelay[1][0] > 0)
        $this->crawldelay = intval ($crawldelay[1][0]);
    }
    preg_match_all ("#^Sitemap:\s*(\S+)\s*\$#mi", $content, $sitemaps);
    if (isset ($sitemaps[1][0]))
      $this->sitemaps = $sitemaps[1];
    preg_match_all ("#^Host:\s*(\S+)\s*\$#mi", $content, $host);
    if (isset ($host[1][0]))
      $this->host = $host[1][0];
    if (! in_array ("/", $this->disallow) &&
        ! in_array ("/", $this->allow))
      $this->allow[] = "/";
    return $this;
  }
  // }}}
  /** Return TRUE if the provided URL is allowed by the robots.txt definition,
   * or FALSE otherwise
   * @param string $url The URL to check
   * @return boolean The result of the test
   */
  public function URLAllow ($url)
  // {{{
  {
    $parse = parse_url ($url);
    $path = (isset ($parse["path"])) ? $parse["path"] : "/";
    $bestDisallow = -1;
    $bestAllow = -1;
    $allowRule = "";
    $disallowRule = "";
    foreach ($this->disallow as $partial)
    {
      if (strpos ($partial, "*") === false)
      {
        // Plain prefix rule: keep the longest matching one
        if (substr ($path, 0, strlen ($partial)) === $partial &&
            $bestDisallow < strlen ($partial))
        {
          $bestDisallow = strlen ($partial);
          $disallowRule = $partial;
        }
      }
      else
      {
        // Wildcard rule: convert it to a regular expression
        $partial = str_replace ("*", ".+", $partial);
        if (preg_match ("#$partial#", $path) === 1)
        {
          $bestDisallow = 255;
          $disallowRule = $partial;
        }
      }
    }
    foreach ($this->allow as $partial)
    {
      if (strpos ($partial, "*") === false)
      {
        if (substr ($path, 0, strlen ($partial)) === $partial &&
            $bestAllow < strlen ($partial))
        {
          $bestAllow = strlen ($partial);
          $allowRule = $partial;
        }
      }
      else
      {
        $partial = str_replace ("*", ".+", $partial);
        if (preg_match ("#$partial#", $path) === 1)
        {
          $bestAllow = 255;
          $allowRule = $partial;
        }
      }
    }
    if ($bestAllow < $bestDisallow)
    {
      $this->matchRule = $disallowRule;
      return false;
    }
    $this->matchRule = $allowRule;
    return true;
  }
  // }}}
  // GETTERS
  /** Return the allowed URLs
   * @return array $allow The array of allow rules
   */
  public function allow ()
  // {{{
  {
    return $this->allow;
  }
  // }}}

  /** Return the disallowed URLs
   * @return array $disallow The array of disallow rules
   */
  public function disallow ()
  // {{{
  {
    return $this->disallow;
  }
  // }}}

  /** Return the sitemap URLs
   * @return array $sitemaps The array of sitemap URLs
   */
  public function sitemaps ()
  // {{{
  {
    return $this->sitemaps;
  }
  // }}}

  /** Return the crawl delay
   * @return integer $crawldelay The crawl delay defined in robots.txt
   */
  public function crawldelay ()
  // {{{
  {
    return $this->crawldelay;
  }
  // }}}

  /** Return the host
   * @return string $host The Host string defined in robots.txt
   */
  public function host ()
  // {{{
  {
    return $this->host;
  }
  // }}}

  /** Return the matchRule
   * @return string $matchRule The rule matching the last URLAllow test
   */
  public function matchRule ()
  // {{{
  {
    return $this->matchRule;
  }
  // }}}
}
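As a quick illustration of how the committed class can be exercised, here is a minimal usage sketch. It is not part of the committed file; the sample robots.txt content, the example.com URLs and the require of robotstxt.php are assumptions made only for this example.

<?php
use models\robots;

require_once "robotstxt.php";  // assumed include path for the class above

// A small robots.txt with one wildcard group, a crawl delay and a sitemap
$content = "User-agent: *\n" .
           "Disallow: /private/\n" .
           "Crawl-delay: 10\n" .
           "\n" .
           "Sitemap: https://www.example.com/sitemap.xml\n";

$robots = new robots ($content);

var_dump ($robots->URLAllow ("https://www.example.com/index.html"));     // bool(true), matched by the implicit "/" allow
var_dump ($robots->URLAllow ("https://www.example.com/private/a.html")); // bool(false), matched by "Disallow: /private/"
echo $robots->matchRule () . "\n";   // "/private/", the rule which decided the last test
echo $robots->crawldelay () . "\n";  // 10
print_r ($robots->sitemaps ());      // Array ( [0] => https://www.example.com/sitemap.xml )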