*/ namespace models;
/** This class analyzes the content of a robots.txt file and exposes the
 * data configured for the DomSearch crawler.
 * It can check a URL against the robots.txt rules and report whether that
 * URL is allowed to be crawled or not */
class robots
{
  // PROPERTIES // {{{
  /** The crawler name looked up in robots.txt */
  private $crawlerName = "DomSearch";
  /** The Allow rules */
  private $allow = array ();
  /** The Disallow rules */
  private $disallow = array ();
  /** The sitemap URLs defined in the file */
  private $sitemaps = array ();
  /** The crawl delay defined in the file (3 s when not defined) */
  private $crawldelay = 3;
  /** The Host directive, naming the default website when present */
  private $host = null;
  /** The rule that matched during the last URLAllow() call */
  private $matchRule = null;
  // }}}

  // METHODS
  /** Parse the provided robots.txt file content
   * @param string $content The robots.txt file content to analyze
   */
  public function __construct ($content) // {{{
  {
    if (trim ($content) === "") {
      // An empty file allows everything
      $this->allow = array ("/");
      return;
    }
    // Normalize line endings so the blank-line block split works on CRLF files
    $content = str_replace ("\r\n", "\n", $content);
    $crawler = "";
    $blocks = explode ("\n\n", $content);
    foreach ($blocks as $block) {
      preg_match_all ("#^User-agent:\s+(.+)\$#mi", $block, $useragents);
      if (empty ($useragents[1]))
        continue;
      if (! in_array ("*", $useragents[1]) &&
          ! in_array ($this->crawlerName, $useragents[1]))
        continue;
      if (in_array ("*", $useragents[1])) {
        // A section for the specific crawler takes precedence over the
        // wildcard one, so keep the rules already loaded for it
        if ($crawler == $this->crawlerName)
          continue;
        $crawler = "*";
        $this->allow = array ();
        $this->disallow = array ();
        $this->crawldelay = 3;
      }
      if (in_array ($this->crawlerName, $useragents[1])) {
        // If a section for the crawler was already seen, skip any later one
        if ($crawler == $this->crawlerName)
          continue;
        $crawler = $this->crawlerName;
        $this->allow = array ();
        $this->disallow = array ();
        $this->crawldelay = 3;
      }
      preg_match_all ("#^Allow:\s*(\S*)\s*\$#mUi", $block, $allows);
      preg_match_all ("#^Disallow:\s*(\S*)\s*\$#mUi", $block, $disallows);
      preg_match_all ("#^Crawl-delay:\s*(\d+)\s*#mi", $block, $crawldelay);
      if (isset ($allows[1]))
        $this->allow = $allows[1];
      if (isset ($disallows[1]))
        $this->disallow = $disallows[1];
      if (isset ($crawldelay[1][0]) && $crawldelay[1][0] > 0)
        $this->crawldelay = intval ($crawldelay[1][0]);
    }
    // Sitemap and Host apply to the whole file, not to a single section
    preg_match_all ("#^Sitemap:\s*(\S+)\s*\$#mi", $content, $sitemaps);
    if (isset ($sitemaps[1][0]))
      $this->sitemaps = $sitemaps[1];
    preg_match_all ("#^Host:\s*(\S+)\s*\$#mi", $content, $host);
    if (isset ($host[1][0]))
      $this->host = $host[1][0];
    // Default to allowing the root when no rule mentions it
    if (! in_array ("/", $this->disallow) && ! in_array ("/", $this->allow))
      $this->allow[] = "/";
  } // }}}
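  /* Illustrative sketch of what the constructor extracts (the rules and URL
   * below are made up for the example, not taken from a real site). Given
   * this content, the DomSearch section overrides the wildcard one, so
   * allow() returns array ("/"), disallow() returns array ("/private/"),
   * crawldelay() returns 10, sitemaps() returns
   * array ("https://example.com/sitemap.xml") and host() stays null:
   *
   *   User-agent: *
   *   Disallow: /
   *
   *   User-agent: DomSearch
   *   Allow: /
   *   Disallow: /private/
   *   Crawl-delay: 10
   *
   *   Sitemap: https://example.com/sitemap.xml
   *
   * Note that sections must be separated by a blank line, since blocks are
   * split on "\n\n".
   */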
  /** Return TRUE when the provided URL is allowed by the robots.txt rules,
   * FALSE when it is disallowed
   * @param string $url The URL to check
   * @return boolean The result of the test
   */
  public function URLAllow ($url) // {{{
  {
    $parse = parse_url ($url);
    $path = (isset ($parse["path"])) ? $parse["path"] : "/";
    $bestDisallow = -1;
    $bestAllow = -1;
    $allowRule = "";
    $disallowRule = "";
    foreach ($this->disallow as $partial) {
      if (strpos ($partial, "*") === false) {
        // Plain rule: the longest matching prefix wins
        if (substr ($path, 0, strlen ($partial)) === $partial &&
            $bestDisallow < strlen ($partial)) {
          $bestDisallow = strlen ($partial);
          $disallowRule = $partial;
        }
      } else {
        // Wildcard rule: "*" matches any sequence of characters; the rule is
        // quoted and anchored at the start of the path, and a wildcard match
        // is treated as the most specific one
        $pattern = "#^" . str_replace ("\\*", ".*", preg_quote ($partial, "#")) . "#";
        if (preg_match ($pattern, $path) === 1) {
          $bestDisallow = 255;
          $disallowRule = $partial;
        }
      }
    }
    foreach ($this->allow as $partial) {
      if (strpos ($partial, "*") === false) {
        if (substr ($path, 0, strlen ($partial)) === $partial &&
            $bestAllow < strlen ($partial)) {
          $bestAllow = strlen ($partial);
          $allowRule = $partial;
        }
      } else {
        $pattern = "#^" . str_replace ("\\*", ".*", preg_quote ($partial, "#")) . "#";
        if (preg_match ($pattern, $path) === 1) {
          $bestAllow = 255;
          $allowRule = $partial;
        }
      }
    }
    // The most specific rule wins; on a tie the Allow rule takes precedence
    if ($bestAllow < $bestDisallow) {
      $this->matchRule = $disallowRule;
      return false;
    }
    $this->matchRule = $allowRule;
    return true;
  } // }}}

  // GETTERS
  /** Return the Allow rules
   * @return array The array of allow rules
   */
  public function allow () // {{{
  {
    return $this->allow;
  } // }}}

  /** Return the Disallow rules
   * @return array The array of disallow rules
   */
  public function disallow () // {{{
  {
    return $this->disallow;
  } // }}}

  /** Return the sitemap URLs
   * @return array The array of sitemap URLs
   */
  public function sitemaps () // {{{
  {
    return $this->sitemaps;
  } // }}}

  /** Return the crawl delay
   * @return integer The crawl delay defined in robots.txt
   */
  public function crawldelay () // {{{
  {
    return $this->crawldelay;
  } // }}}

  /** Return the host
   * @return string The Host value defined in robots.txt
   */
  public function host () // {{{
  {
    return $this->host;
  } // }}}

  /** Return the rule matched by the last URLAllow() call
   * @return string The matching rule
   */
  public function matchRule () // {{{
  {
    return $this->matchRule;
  } // }}}
}
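/* Usage sketch (illustrative only; the URLs are made up). With the example
 * robots.txt shown above, "/private/page.html" is rejected because the
 * Disallow prefix "/private/" (length 9) is more specific than the Allow
 * rule "/" (length 1):
 *
 *   $robots = new \models\robots (file_get_contents ("https://example.com/robots.txt"));
 *   if ($robots->URLAllow ("https://example.com/private/page.html")) {
 *     // fetch the page, waiting $robots->crawldelay () seconds between requests
 *   } else {
 *     // skip it; $robots->matchRule () holds the Disallow rule that matched
 *   }
 */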