* @license BSD
 */

namespace Domframework;

/**
 * This class reads the sitemap files published by websites and returns the
 * URLs (and their parameters) they contain.
 */
class Sitemap
{
    /**
     * Maximum size, in bytes, of an uncompressed sitemap accepted by the
     * analyzers.
     */
    const MAX_CONTENT_LENGTH = 10000000;

    /**
     * The changefreq keywords accepted in XML sitemaps (lowercase).
     * @var array
     */
    private static $changefreqs = [
        "always",
        "hourly",
        "daily",
        "weekly",
        "monthly",
        "yearly",
        "never",
    ];

    /**
     * Detect the format of a sitemap (plain text or XML, optionally
     * compressed with GZIP) and return the URLs it contains:
     * array (
     *   "urls" => array (
     *     [http://domain.tld/path...] => array (
     *       "changefreq" => "daily",   // lowercase keyword, if valid
     *       "priority" => 0.5,         // float between 0.0 and 1.0, if valid
     *       "lastmod" => 1103824815),  // Unix timestamp, if valid
     *   ),
     *   "sitemaps" => array (
     *     [http://domain.tld/sitemap2] => array ("lastmod" => 1103824815)
     *   ));
     * The lastmod is returned as a Unix timestamp (like time()), in UTC.
     * @param string $content The content file to analyze
     * @param string $url The website URL (used in the messages only)
     * @return array The content of the file if it is valid
     * @throws \Exception (code 406) if the content can not be uncompressed,
     * if the type is not supported or if the XML content is too big
     */
    public function analyze($content, $url)
    {
        $finfo = new \finfo(FILEINFO_MIME_TYPE);
        $type = $finfo->buffer($content);
        // Depending on the libmagic version, GZIP is reported with or
        // without the "x-" prefix.
        if ($type === "application/x-gzip" || $type === "application/gzip") {
            // gzdecode() parses the real GZIP header (which can be longer
            // than 10 bytes when optional fields are present) and the
            // trailer, instead of blindly skipping a fixed-size header.
            $content = gzdecode($content);
            if ($content === false) {
                throw new \Exception(
                    "Can not uncompress the sitemap '$url'",
                    406
                );
            }
        }
        $content = trim($content);
        $type = $finfo->buffer($content);
        if ($type === "text/plain" || $type === "application/x-empty") {
            return $this->analyzeText($content, $url);
        }
        if ($type === "application/xml" || $type === "text/xml") {
            return $this->analyzeXML($content, $url);
        }
        throw new \Exception(
            "Type of sitemap '$type' unknown for '$url'",
            406
        );
    }

    /**
     * Return an array containing the URLs of a text sitemap, each one
     * associated with an empty array, as the text format provides only the
     * URLs. New sitemaps files can not be defined in text format.
     * array (
     *   "urls" => array (
     *     [http://domain.tld/path...] => array (),
     *   ),
     *   "sitemaps" => array ()
     * );
     * Each line is trimmed and the empty lines are ignored.
     * If the content is too big, a warning is raised and an empty result is
     * returned.
     * @param string $content The content file to analyze
     * @param string $url The website URL (used in the messages only)
     * @return array The content of the file if it is valid
     */
    public function analyzeText($content, $url)
    {
        if (strlen($content) > self::MAX_CONTENT_LENGTH) {
            // A warning (and not E_USER_ERROR, which stops the script with
            // the default error handler) so the sitemap is really skipped.
            trigger_error(
                "Sitemap '$url' size is too big -> skip",
                E_USER_WARNING
            );
            return ["urls" => [], "sitemaps" => []];
        }
        $lines = preg_split('/\r\n|\r|\n/', $content);
        if ($lines === false) {
            $lines = [];
        }
        $urls = [];
        foreach ($lines as $line) {
            $line = trim($line);
            if ($line !== "") {
                $urls[$line] = [];
            }
        }
        return ["urls" => $urls, "sitemaps" => []];
    }

    /**
     * Return an array containing the URLs and the sub-sitemaps defined in a
     * XML sitemap, associated with their valid parameters:
     * array (
     *   "urls" => array (
     *     [http://domain.tld/path...] => array (
     *       "changefreq" => "daily",   // lowercase keyword
     *       "priority" => 0.5,         // float between 0.0 and 1.0
     *       "lastmod" => 1103824815),  // Unix timestamp (UTC)
     *   ),
     *   "sitemaps" => array (
     *     [http://domain.tld/sitemap2] => array ("lastmod" => 1103824815)
     *   ));
     * An invalid parameter raises a warning and is not returned. An entry
     * without location raises a warning and is skipped. If the XML can not
     * be parsed, a warning is raised and an empty result is returned.
     * @param string $content The content file to analyze
     * @param string $url The website URL (used in the messages only)
     * @return array The content of the file if it is valid
     * @throws \Exception (code 406) if the content is too big
     */
    public function analyzeXML($content, $url)
    {
        if (strlen($content) > self::MAX_CONTENT_LENGTH) {
            throw new \Exception("Sitemap '$url' size is too big -> skip", 406);
        }
        $res = ["urls" => [], "sitemaps" => []];

        // Collect the libxml errors internally instead of hiding them with
        // "@", and restore the previous setting afterwards.
        $previous = libxml_use_internal_errors(true);
        // LIBXML_NONET: the sitemap is untrusted, never fetch remote
        // resources while parsing it.
        $xml = simplexml_load_string($content, "SimpleXMLElement", LIBXML_NONET);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        if ($xml === false) {
            trigger_error("Invalid XML in sitemap '$url'", E_USER_WARNING);
            return $res;
        }

        foreach ($xml->sitemap as $s) {
            $loc = isset($s->loc) ? trim((string)$s->loc) : "";
            if ($loc === "") {
                trigger_error("No location in sitemap '$url'", E_USER_WARNING);
                continue;
            }
            $tmp = [];
            if (isset($s->lastmod)) {
                $lastmod = (string)$s->lastmod;
                $timestamp = $this->lastmodToTimestamp($lastmod);
                if ($timestamp === null) {
                    trigger_error(
                        "Invalid lastmod in '$url' ($lastmod)",
                        E_USER_WARNING
                    );
                } else {
                    $tmp["lastmod"] = $timestamp;
                }
            }
            $res["sitemaps"][$loc] = $tmp;
        }

        foreach ($xml->url as $u) {
            $loc = isset($u->loc) ? trim((string)$u->loc) : "";
            if ($loc === "") {
                trigger_error(
                    "No location in sitemap '$url' for url",
                    E_USER_WARNING
                );
                continue;
            }
            $tmp = [];
            if (isset($u->changefreq)) {
                $changefreq = (string)$u->changefreq;
                $keyword = strtolower(trim($changefreq));
                if (in_array($keyword, self::$changefreqs, true)) {
                    $tmp["changefreq"] = $keyword;
                } else {
                    trigger_error(
                        "Invalid changefreq in '$url' ($changefreq)",
                        E_USER_WARNING
                    );
                }
            }
            if (isset($u->priority)) {
                $priority = trim((string)$u->priority);
                // A non numeric value must not be silently read as 0.0
                if (
                    ! is_numeric($priority)
                    || (float)$priority < 0.0
                    || (float)$priority > 1.0
                ) {
                    trigger_error(
                        "Invalid priority in '$url' ($priority)",
                        E_USER_WARNING
                    );
                } else {
                    $tmp["priority"] = (float)$priority;
                }
            }
            if (isset($u->lastmod)) {
                $lastmod = (string)$u->lastmod;
                $timestamp = $this->lastmodToTimestamp($lastmod);
                if ($timestamp === null) {
                    trigger_error(
                        "Invalid lastmod in '$url' ($lastmod)",
                        E_USER_WARNING
                    );
                } else {
                    $tmp["lastmod"] = $timestamp;
                }
            }
            $res["urls"][$loc] = $tmp;
        }
        return $res;
    }

    /**
     * Convert a lastmod value to a Unix timestamp.
     * The complete date/time forms (with seconds and optional fraction, or
     * without seconds) must carry a timezone designator ("Z" or "+hh:mm"),
     * which is honored. If none of them matches, the first ten characters
     * are read as a "YYYY-MM-DD" date at midnight UTC.
     * @param string $lastmod The lastmod value read in the sitemap
     * @return integer|null The Unix timestamp, or null if the value is not
     * a valid date
     */
    private function lastmodToTimestamp($lastmod)
    {
        $lastmod = trim($lastmod);
        $utc = new \DateTimeZone("UTC");
        // The leading "!" resets every field that is not parsed, so the
        // result never depends on the current time.
        $candidates = [
            ['!Y-m-d\TH:i:s.uP', $lastmod],
            ['!Y-m-d\TH:i:sP', $lastmod],
            ['!Y-m-d\TH:iP', $lastmod],
            ['!Y-m-d', (string)substr($lastmod, 0, 10)],
        ];
        foreach ($candidates as $candidate) {
            list($format, $value) = $candidate;
            $date = \DateTimeImmutable::createFromFormat($format, $value, $utc);
            if ($date === false) {
                continue;
            }
            // Out of range values (by example the 31st of February) are
            // parsed with a warning and rolled over: reject them.
            $errors = \DateTimeImmutable::getLastErrors();
            if (
                is_array($errors)
                && ($errors["warning_count"] > 0 || $errors["error_count"] > 0)
            ) {
                continue;
            }
            return $date->getTimestamp();
        }
        return null;
    }
}