*/

/** This class reads the sitemap files published by websites and returns the
 * URLs they list, together with the parameters attached to each of them.
 */
class sitemap
{
  /** Maximum size (in bytes) of a sitemap content accepted by the analyzers */
  const MAX_SIZE = 10000000;

  /** Detect the format of a sitemap (plain text or XML, optionally
   * gzip-compressed) and return its content as
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (
   *       "changefreq" => "daily",   // lowercase keyword, when valid
   *       "priority" => 0.5,         // float in [0, 1], when valid
   *       "lastmod" => XX),          // timestamp, when valid
   *   ),
   *   "sitemaps" => array (
   *     [http://domain.tld/sitemap2] => array ("lastmod" => XX)
   *   ));
   * The lastmod values are Unix timestamps (comparable with time())
   * @param string $content The content file to analyze
   * @param string $url The website URL (only used in messages)
   * @return array The content of the file if it is valid
   * @throws \Exception (code 406) if the gzip content can not be
   * uncompressed, if the detected type is neither text nor XML, or if the
   * XML analyzer rejects the content
   */
  public function analyze ($content, $url)
  // {{{
  {
    $finfo = new \finfo (FILEINFO_MIME_TYPE);
    $type = $finfo->buffer ($content);
    // Depending on the libmagic version, gzip is reported with either name
    if ($type === "application/x-gzip" || $type === "application/gzip")
    {
      // gzdecode parses the real gzip header (optional file name, comment...)
      // instead of assuming that it is always 10 bytes long
      $content = @gzdecode ($content);
      if ($content === false)
        throw new \Exception ("Sitemap '$url' can not be uncompressed", 406);
    }
    $content = trim ($content);
    // Detect again: the type is now the one of the uncompressed content
    $type = $finfo->buffer ($content);
    if ($type === "text/plain" || $type === "application/x-empty")
      return $this->analyzeText ($content, $url);
    if ($type === "application/xml" || $type === "text/xml")
      return $this->analyzeXML ($content, $url);
    throw new \Exception ("Type of sitemap '$type' unknown for '$url'", 406);
  }
  // }}}

  /** Analyze a sitemap in text format (one URL per line).
   * The text format only provides URLs, so each URL is associated with an
   * empty array, and no sub-sitemap can be defined:
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (),
   *   ),
   *   "sitemaps" => array ()
   * );
   * Blank lines are ignored and the surrounding spaces of each line are
   * removed.
   * @param string $content The content file to analyze
   * @param string $url The website URL (only used in messages)
   * @return array The content of the file if it is valid
   */
  public function analyzeText ($content, $url)
  // {{{
  {
    if (strlen ($content) > self::MAX_SIZE)
    {
      // NOTE(review): with the default PHP error handling, E_USER_ERROR stops
      // the script, so the return below is only reached when a custom error
      // handler is installed. Passing E_USER_ERROR to trigger_error is also
      // deprecated as of PHP 8.4: consider throwing like analyzeXML does.
      trigger_error ("Sitemap '$url' size is too big -> skip", E_USER_ERROR);
      return array ("urls" => array (), "sitemaps" => array ());
    }
    $urls = array ();
    foreach (preg_split ('/\r\n|\r|\n/', $content) as $line)
    {
      $line = trim ($line);
      // An empty line is not an URL: do not create a "" entry for it
      if ($line !== "")
        $urls[$line] = array ();
    }
    return array ("urls" => $urls, "sitemaps" => array ());
  }
  // }}}

  /** Analyze a sitemap in XML format (urlset or sitemapindex) and return
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (
   *       "changefreq" => "daily",   // lowercase keyword, when valid
   *       "priority" => 0.5,         // float in [0, 1], when valid
   *       "lastmod" => XX),          // timestamp, when valid
   *   ),
   *   "sitemaps" => array (
   *     [http://domain.tld/sitemap2] => array ("lastmod" => XX)
   *   ));
   * An entry without location is skipped with a warning. An invalid
   * changefreq, priority or lastmod raises a warning and the matching key is
   * not set, but the entry itself is kept.
   * @param string $content The content file to analyze
   * @param string $url The website URL (only used in messages)
   * @return array The content of the file if it is valid
   * @throws \Exception (code 406) if the content is too big or is not a
   * well-formed XML document
   */
  public function analyzeXML ($content, $url)
  // {{{
  {
    if (strlen ($content) > self::MAX_SIZE)
      throw new \Exception ("Sitemap '$url' size is too big -> skip", 406);
    // The libxml warnings are silenced: the failure is reported by exception
    $xml = @simplexml_load_string ($content);
    if ($xml === false)
      throw new \Exception ("Sitemap '$url' is not a valid XML document",
        406);
    $res = array ("urls" => array (), "sitemaps" => array ());

    // Sitemap index entries
    foreach ($xml->sitemap as $s)
    {
      // Pretty-printed XML can surround the value with spaces or newlines
      $loc = isset ($s->loc) ? trim ((string)$s->loc) : "";
      if ($loc === "")
      {
        trigger_error ("No location in sitemap '$url'", E_USER_WARNING);
        continue;
      }
      $tmp = array ();
      if (isset ($s->lastmod))
      {
        $lastmod = (string)$s->lastmod;
        $ts = $this->parseLastmod ($lastmod);
        if ($ts === false)
          trigger_error ("Invalid lastmod in '$url' ($lastmod)",
            E_USER_WARNING);
        else
          $tmp["lastmod"] = $ts;
      }
      $res["sitemaps"][$loc] = $tmp;
    }

    // URL entries
    $validFreqs = array ("always", "hourly", "daily", "weekly", "monthly",
      "yearly", "never");
    foreach ($xml->url as $u)
    {
      $loc = isset ($u->loc) ? trim ((string)$u->loc) : "";
      if ($loc === "")
      {
        trigger_error ("No location in sitemap '$url' for url",
          E_USER_WARNING);
        continue;
      }
      $tmp = array ();
      if (isset ($u->changefreq))
      {
        $changefreq = (string)$u->changefreq;
        $freq = strtolower (trim ($changefreq));
        if (in_array ($freq, $validFreqs, true))
          $tmp["changefreq"] = $freq;
        else
          trigger_error ("Invalid chagefreq in '$url' ($changefreq)",
            E_USER_WARNING);
      }
      if (isset ($u->priority))
      {
        $priority = trim ((string)$u->priority);
        // A non numeric value must not be silently converted to 0.0
        if (! is_numeric ($priority) ||
            (float)$priority < 0.0 || (float)$priority > 1.0)
          trigger_error ("Invalid priority in '$url' ($priority)",
            E_USER_WARNING);
        else
          $tmp["priority"] = (float)$priority;
      }
      if (isset ($u->lastmod))
      {
        $lastmod = (string)$u->lastmod;
        $ts = $this->parseLastmod ($lastmod);
        if ($ts === false)
          trigger_error ("Invalid lastmod in '$url' ($lastmod)",
            E_USER_WARNING);
        else
          $tmp["lastmod"] = $ts;
      }
      $res["urls"][$loc] = $tmp;
    }
    return $res;
  }
  // }}}

  /** Convert a lastmod value (W3C Datetime) into a Unix timestamp.
   * The UTC offset provided in the value is honored. A value without offset
   * is read as UTC. As a last resort, a value only starting by YYYY-MM-DD is
   * accepted and converted to this day at 00:00:00 UTC.
   * @param string $lastmod The lastmod value read in the sitemap
   * @return integer|false The timestamp, or false if the value is invalid
   */
  private function parseLastmod ($lastmod)
  // {{{
  {
    // The "!" resets the fields absent from the format (no current time
    // leaking in the result). The final "+" of the lenient format accepts
    // trailing data after the date.
    $formats = array (
      '!Y-m-d\TH:i:s.uP',
      '!Y-m-d\TH:i:sP',
      '!Y-m-d\TH:iP',
      '!Y-m-d\TH:i:s',
      '!Y-m-d\TH:i',
      '!Y-m-d',
      '!Y-m-d+',
      '!Y-m',
      '!Y',
    );
    $lastmod = trim ($lastmod);
    // Only used when the value does not carry its own UTC offset
    $utc = new \DateTimeZone ("UTC");
    foreach ($formats as $format)
    {
      $date = \DateTimeImmutable::createFromFormat ($format, $lastmod, $utc);
      if ($date !== false)
        return $date->getTimestamp ();
    }
    return false;
  }
  // }}}
}