DomFramework/robotstxt.php
<?php
/** DomFramework
* @package domframework
* @author Dominique Fournier <dominique@fournier38.fr>
* @license BSD
*/
namespace Domframework;
/** This class analyzes the provided robots.txt file content and allows
* getting the configured data for DomSearch.
* It allows examining a URL against the robots.txt rules and returns whether
* the URL is allowed to be used or not.
* The robots.txt file format is defined here :
* http://www.robotstxt.org/norobots-rfc.txt
* https://en.wikipedia.org/wiki/Robots_exclusion_standard
*/
class robotstxt
{
// PROPERTIES
// {{{
/** The allowed URLs
*/
private $allow = array ();
/** The disallowed URLs
*/
private $disallow = array ();
/** The sitemap URLs defined in the file
*/
private $sitemaps = array ();
/** The crawl delay in seconds defined in the file (3 if not defined)
*/
private $crawldelay = 3;
/** The host, which can be specified in the file as the default website
*/
private $host = null;
/** The rule that matched during the last URLAllow call
*/
private $matchRule = null;
/** The errors found while reading the file content. The key is the line
* number where the error was detected
*/
private $errors = array ();
// }}}
// METHODS
/** Take the robots.txt file content and analyze it
* @param string $content The robots.txt file content to analyze
* @param string $crawlerName The crawler name to match against User-agent lines
* @return $this
*/
public function __construct ($content, $crawlerName)
// {{{
{
if (trim ($content) === "")
{
$this->allow = array ("/");
return $this;
}
// Look if a User-agent section is available for the defined crawlerName.
// If not, check if a User-agent section exists for *
// If not, accept all
$content = preg_split('/\r\n|\r|\n/', $content);
$keys = preg_grep ("~User-agent:\s*$crawlerName\s*#?~i", $content);
if (empty ($keys))
$keys = preg_grep ("~User-agent:\s*\\*\s*#?~i", $content);
if (empty ($keys))
{
// No User-agent with crawlerName nor * : accept all
$this->allow = array ("/");
return $this;
}
// The sitemaps are not restricted to the user-agent
foreach (preg_grep ("~Sitemap:\s+~i", $content) as $nb => $line)
{
$url = $this->getValueFromLine ($line);
if (!! filter_var ($url, FILTER_VALIDATE_URL) &&
(substr ($url, 0, 7) === "http://" ||
substr ($url, 0, 8) === "https://"))
$this->sitemaps[] = $url;
else
$this->errors[$nb] = dgettext ("domframework",
"Sitemap : Invalid URL provided");
}
// Get the Allow and Disallow lines. Parsing stops at the first
// User-agent line found after an Allow/Disallow.
// Comments and empty lines are skipped
for ($i = key ($keys) ; $i < count ($content) ; $i++)
{
$line = trim ($content[$i]);
if (stripos ($line, "Sitemap:") === 0)
{
// Already managed by the general parser. Not needed in the specific
// user-agent parser. Must at least be caught to avoid generating an
// error
}
elseif (stripos ($line, "Host:") === 0)
{
if ($this->host !== null)
$this->errors[$i] = dgettext ("domframework",
"Multiple Hosts set");
else
$this->host = $this->getValueFromLine ($line);
}
elseif ($line === "" || $line[0] === "#")
{
// Comment, empty line : SKIP
}
elseif (stripos ($line, "allow:") === 0)
{
$this->allow[] = $this->getValueFromLine ($line);
}
elseif (stripos ($line, "disallow:") === 0)
{
$this->disallow[] = $this->getValueFromLine ($line);
}
elseif (stripos ($line, "crawl-delay:") === 0)
{
$val = $this->getValueFromLine ($line);
// Keep the first Crawl-delay value in the documented range (1-60 inclusive)
if ($val >= 1 && $val <= 60 && $this->crawldelay === 3)
$this->crawldelay = intval ($val);
else
$this->errors[$i] = dgettext ("domframework",
"Crawldelay : value out of range (1-60)");
}
elseif (stripos ($line, "user-agent:") === 0)
{
if (! empty ($this->allow) || ! empty ($this->disallow))
{
// New user-agent line after valid allow/disallow : end of paragraph
break;
}
else
{
// New user-agent. Do nothing
}
}
else
{
// Not managed line : error
$this->errors[$i] = sprintf (dgettext ("domframework",
"Invalid line : unknown command : '%s'"), $line);
}
}
if (! in_array ("/", $this->disallow) &&
! in_array ("/", $this->allow))
$this->allow[] = "/";
return $this;
}
// }}}
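// A minimal parsing sketch (the robots.txt content and the crawler name
// below are only illustrative, they are not part of this framework) :
//
//   $content = "User-agent: *\n".
//              "Disallow: /admin/\n".
//              "Crawl-delay: 10\n".
//              "Sitemap: http://example.com/sitemap.xml\n";
//   $robots = new \Domframework\robotstxt ($content, "DomSearch");
//   $robots->disallow ();   // array ("/admin/")
//   $robots->allow ();      // array ("/") : added because "/" is not listed
//   $robots->crawldelay (); // 10
//   $robots->sitemaps ();   // array ("http://example.com/sitemap.xml")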
/** Return TRUE if the provided URL is allowed by the robots.txt
* definition, or FALSE if it is not
* @param string $url The URL to check
* @return boolean The result of the test
*/
public function URLAllow ($url)
// {{{
{
$parse = parse_url ($url);
$path = (isset ($parse["path"])) ? $parse["path"] : "/";
// Robots.txt files are always allowed
if ($path === "/robots.txt")
return true;
$bestDisallow = -1;
$bestAllow = -1;
$allowRule = "";
$disallowRule = "";
foreach ($this->disallow as $partial)
{
if (strpos ($partial, "*") === false)
{
if (substr ($path, 0, strlen ($partial)) === $partial &&
$bestDisallow < strlen ($partial))
{
$bestDisallow = strlen ($partial);
$disallowRule = $partial;
}
}
else
{
// Wildcard rule : escape the regex metacharacters, map "*" to ".*" and
// anchor the pattern at the start of the path, as robots.txt rules are
// prefix matches
$regex = str_replace ("\\*", ".*", preg_quote ($partial, "#"));
if (preg_match ("#^$regex#", $path) === 1)
{
$bestDisallow = 255;
$disallowRule = $partial;
}
}
}
foreach ($this->allow as $partial)
{
if (strpos ($partial, "*") === false)
{
if (substr ($path, 0, strlen ($partial)) === $partial &&
$bestAllow < strlen ($partial))
{
$bestAllow = strlen ($partial);
$allowRule = $partial;
}
}
else
{
// Same wildcard handling as for the Disallow rules above
$regex = str_replace ("\\*", ".*", preg_quote ($partial, "#"));
if (preg_match ("#^$regex#", $path) === 1)
{
$bestAllow = 255;
$allowRule = $partial;
}
}
}
if ($bestAllow > $bestDisallow)
{
$this->matchRule = $allowRule;
return true;
}
$this->matchRule = $disallowRule;
return false;
}
// }}}
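// The precedence logic above keeps the longest matching rule on each side,
// so an Allow rule only wins when it is more specific than the best
// Disallow rule. A small sketch of the expected results, assuming the
// illustrative content below (not part of this file) :
//
//   $content = "User-agent: *\n".
//              "Disallow: /private/\n".
//              "Allow: /private/public/\n";
//   $robots = new \Domframework\robotstxt ($content, "DomSearch");
//   $robots->URLAllow ("http://example.com/private/secret");      // false
//   $robots->matchRule ();                                        // "/private/"
//   $robots->URLAllow ("http://example.com/private/public/page"); // true
//   $robots->matchRule ();                                        // "/private/public/"
//   $robots->URLAllow ("http://example.com/robots.txt");          // always true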
// GETTERS
/** Return the lines where an error occurred
* The key of the array is the line number where the error was detected
* @return array The errors
*/
public function errors ()
// {{{
{
return $this->errors;
}
// }}}
/** Return the allowed URLs
* @return array $allow The array of allow rules
*/
public function allow ()
// {{{
{
return $this->allow;
}
// }}}
/** Return the disallowed URLs
* @return array $disallow The array of disallow rules
*/
public function disallow ()
// {{{
{
return $this->disallow;
}
// }}}
/** Return the sitemap URLs
* @return array $sitemaps The array of sitemap URLs
*/
public function sitemaps ()
// {{{
{
return $this->sitemaps;
}
// }}}
/** Return the crawldelay
* @return integer $crawldelay The crawlDelay defined in robots.txt
*/
public function crawldelay ()
// {{{
{
return $this->crawldelay;
}
// }}}
/** Return the host
* @return string $host The Host string defined in robots.txt
*/
public function host ()
// {{{
{
return $this->host;
}
// }}}
/** Return the matchRule
* @return string $matchRule The rule that matched during the last URLAllow call
*/
public function matchRule ()
// {{{
{
return $this->matchRule;
}
// }}}
// PRIVATE METHODS
/** Get a line from the robots.txt file and return the associated value.
* Handle the possible comment at the end of the line
* @param string $line The complete line from the robots.txt file
* @return string The value recorded on the line
*/
private function getValueFromLine ($line)
// {{{
{
preg_match_all ("#^(?P<field>\S+):\s*(?P<value>\S*)\s*".
"(\#\s*(?P<comment>.+)\s*)?\$#", $line, $matches);
return $matches["value"][0];
}
// }}}
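// Illustrative examples of the values extracted by getValueFromLine ()
// (the lines below are not taken from a real robots.txt file) :
//   "Sitemap: http://example.com/sitemap.xml # main sitemap"
//                       returns "http://example.com/sitemap.xml"
//   "Crawl-delay: 10"   returns "10"
//   "Disallow:"         returns "" (in robots.txt an empty Disallow allows everything)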
}