Files
DomFramework/src/Sitemap.php

204 lines
6.9 KiB
PHP

<?php
/**
* DomFramework
* @package domframework
* @author Dominique Fournier <dominique@fournier38.fr>
* @license BSD
*/
namespace Domframework;
/**
 * Read the sitemap files published by websites and return the URLs they list
 * together with the associated metadata (changefreq, priority, lastmod).
 */
class Sitemap
{
    /**
     * Maximum size, in bytes, of a (decompressed) sitemap that is analyzed
     */
    const MAX_SIZE = 10000000;

    /**
     * Detect the format of a sitemap (plain text or XML, optionally GZIP
     * compressed) and return its content as
     * array (
     *   "urls" => array (
     *     [http://domain.tld/path...] => array (
     *       "changefreq" => "daily",   // lowercase keyword, if valid
     *       "priority" => 0.5,         // float in [0.0, 1.0], if valid
     *       "lastmod" => 1577872800),  // Unix timestamp, if valid
     *   ),
     *   "sitemaps" => array (
     *     [http://domain.tld/sitemap2] => array ("lastmod" => XX)
     * ));
     * The lastmod is a Unix timestamp (UTC based, like time()).
     * A GZIP body that can not be uncompressed raises an E_USER_WARNING and
     * gives an empty result.
     * @param string $content The content file to analyze
     * @param string $url The website URL (used in the messages only)
     * @return array The content of the file if it is valid
     * @throws \Exception (code 406) if the type of the content is unknown or
     * if a XML sitemap is too big
     */
    public function analyze($content, $url)
    {
        $finfo = new \finfo(FILEINFO_MIME_TYPE);
        $type = $finfo->buffer($content);
        // Old libmagic releases report "application/x-gzip", recent ones
        // report "application/gzip"
        if ($type === "application/x-gzip" || $type === "application/gzip") {
            // gzdecode handles the optional GZIP header fields and the
            // trailer. The failure is reported below, so the native warning
            // is muted.
            $decoded = @gzdecode($content);
            if ($decoded === false) {
                trigger_error(
                    "Can not uncompress sitemap '$url'",
                    E_USER_WARNING
                );
                return $this->emptyResult();
            }
            $content = $decoded;
        }
        $content = trim($content);
        $type = $finfo->buffer($content);
        if ($type === "text/plain" || $type === "application/x-empty") {
            return $this->analyzeText($content, $url);
        }
        if ($type === "application/xml" || $type === "text/xml") {
            return $this->analyzeXML($content, $url);
        }
        throw new \Exception("Type of sitemap '$type' unknown for '$url'", 406);
    }

    /**
     * Analyze a sitemap in text format (one URL per line).
     * The text format provides only URLs, so each URL is associated with an
     * empty array. New sitemaps files can not be defined in text format.
     * array (
     *   "urls" => array (
     *     [http://domain.tld/path...] => array (),
     *   ),
     *   "sitemaps" => array ()
     * );
     * Blank lines are ignored and the spaces around each URL are removed.
     * A content bigger than MAX_SIZE raises an E_USER_WARNING and gives an
     * empty result.
     * @param string $content The content file to analyze
     * @param string $url The website URL (used in the messages only)
     * @return array The content of the file if it is valid
     */
    public function analyzeText($content, $url)
    {
        if (strlen($content) > self::MAX_SIZE) {
            // A warning (not E_USER_ERROR, which stops the script by
            // default) so the oversized sitemap is really skipped
            trigger_error(
                "Sitemap '$url' size is too big -> skip",
                E_USER_WARNING
            );
            return $this->emptyResult();
        }
        $lines = preg_split('/\r\n|\r|\n/', $content, -1, PREG_SPLIT_NO_EMPTY);
        if ($lines === false) {
            $lines = [];
        }
        $urls = [];
        foreach ($lines as $line) {
            $line = trim($line);
            if ($line !== "") {
                $urls[$line] = [];
            }
        }
        return ["urls" => $urls, "sitemaps" => []];
    }

    /**
     * Analyze a sitemap or a sitemap index in XML format.
     * array (
     *   "urls" => array (
     *     [http://domain.tld/path...] => array (
     *       "changefreq" => "daily",   // lowercase keyword, if valid
     *       "priority" => 0.5,         // float in [0.0, 1.0], if valid
     *       "lastmod" => 1577872800),  // Unix timestamp, if valid
     *   ),
     *   "sitemaps" => array (
     *     [http://domain.tld/sitemap2] => array ("lastmod" => XX)
     * ));
     * Each invalid entry or value raises an E_USER_WARNING and is skipped.
     * A content which is not a well-formed XML raises an E_USER_WARNING and
     * gives an empty result.
     * @param string $content The content file to analyze
     * @param string $url The website URL (used in the messages only)
     * @return array The content of the file if it is valid
     * @throws \Exception (code 406) if the content is bigger than MAX_SIZE
     */
    public function analyzeXML($content, $url)
    {
        if (strlen($content) > self::MAX_SIZE) {
            throw new \Exception("Sitemap '$url' size is too big -> skip", 406);
        }
        $res = $this->emptyResult();
        // Collect the libxml errors internally instead of muting them with
        // "@", then restore the previous libxml configuration
        $previous = libxml_use_internal_errors(true);
        $xml = simplexml_load_string($content);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        if ($xml === false) {
            trigger_error(
                "Sitemap '$url' is not a valid XML document",
                E_USER_WARNING
            );
            return $res;
        }
        // Sitemap index entries
        foreach ($xml->sitemap as $s) {
            $loc = trim((string)$s->loc);
            if ($loc === "") {
                trigger_error("No location in sitemap '$url'", E_USER_WARNING);
                continue;
            }
            $tmp = [];
            $lastmod = $this->readLastmod($s, $url);
            if ($lastmod !== null) {
                $tmp["lastmod"] = $lastmod;
            }
            $res["sitemaps"][$loc] = $tmp;
        }
        // URL entries
        $frequencies = [
            "always", "hourly", "daily", "weekly", "monthly", "yearly", "never",
        ];
        foreach ($xml->url as $u) {
            $loc = trim((string)$u->loc);
            if ($loc === "") {
                trigger_error(
                    "No location in sitemap '$url' for url",
                    E_USER_WARNING
                );
                continue;
            }
            $tmp = [];
            if (isset($u->changefreq)) {
                $changefreq = (string)$u->changefreq;
                $keyword = strtolower(trim($changefreq));
                if (in_array($keyword, $frequencies, true)) {
                    $tmp["changefreq"] = $keyword;
                } else {
                    trigger_error(
                        "Invalid chagefreq in '$url' ($changefreq)",
                        E_USER_WARNING
                    );
                }
            }
            if (isset($u->priority)) {
                $priority = (float)$u->priority;
                if ($priority < 0.0 || $priority > 1.0) {
                    trigger_error(
                        "Invalid priority in '$url' ($priority)",
                        E_USER_WARNING
                    );
                } else {
                    $tmp["priority"] = $priority;
                }
            }
            $lastmod = $this->readLastmod($u, $url);
            if ($lastmod !== null) {
                $tmp["lastmod"] = $lastmod;
            }
            $res["urls"][$loc] = $tmp;
        }
        return $res;
    }

    /**
     * Return the result used when a sitemap provides nothing
     * @return array
     */
    private function emptyResult()
    {
        return ["urls" => [], "sitemaps" => []];
    }

    /**
     * Read the lastmod child of a sitemap/url node. Raise an E_USER_WARNING
     * if the value is present but can not be parsed.
     * @param \SimpleXMLElement $node The sitemap or url node
     * @param string $url The website URL (used in the messages only)
     * @return integer|null The Unix timestamp, or null if absent or invalid
     */
    private function readLastmod($node, $url)
    {
        if (! isset($node->lastmod)) {
            return null;
        }
        $lastmod = trim((string)$node->lastmod);
        $timestamp = $this->parseLastmod($lastmod);
        if ($timestamp === null) {
            trigger_error(
                "Invalid lastmod in '$url' ($lastmod)",
                E_USER_WARNING
            );
        }
        return $timestamp;
    }

    /**
     * Convert a W3C datetime (the format used by the sitemaps) to a Unix
     * timestamp. The timezone designator is honored ; a value without
     * timezone is read as UTC.
     * @param string $lastmod The value to parse
     * @return integer|null The Unix timestamp, or null if the value is invalid
     */
    private function parseLastmod($lastmod)
    {
        $utc = new \DateTimeZone("UTC");
        $formats = [
            'Y-m-d\TH:i:s.uP',
            'Y-m-d\TH:i:sP',
            'Y-m-d\TH:iP',
            'Y-m-d',
            'Y-m',
            'Y',
        ];
        foreach ($formats as $format) {
            $timestamp = $this->parseDate($format, $lastmod, $utc);
            if ($timestamp !== null) {
                return $timestamp;
            }
        }
        // Tolerate non standard values starting by a valid date (like
        // "2020-01-01 10:00:00") : only the date part is kept
        if (preg_match('/^\d{4}-\d{2}-\d{2}/', $lastmod, $match) === 1) {
            return $this->parseDate('Y-m-d', $match[0], $utc);
        }
        return null;
    }

    /**
     * Parse a value against one strict date format
     * @param string $format The DateTime format (without the leading "!")
     * @param string $value The value to parse
     * @param \DateTimeZone $timezone The timezone used if the value has none
     * @return integer|null The Unix timestamp, or null if the value does not
     * match the format or is not a real date (like 2020-02-31)
     */
    private function parseDate($format, $value, $timezone)
    {
        // "!" resets the fields not provided by the format (time, day...)
        // instead of using the current date and time
        $date = \DateTimeImmutable::createFromFormat(
            "!" . $format,
            $value,
            $timezone
        );
        if ($date === false) {
            return null;
        }
        // Overflowing values (month 13, day 31 in February...) are accepted
        // by the parser but reported as warnings : reject them.
        // getLastErrors() returns false instead of an array when there is
        // nothing to report (PHP >= 8.2).
        $errors = \DateTimeImmutable::getLastErrors();
        if (
            is_array($errors)
            && ($errors["warning_count"] > 0 || $errors["error_count"] > 0)
        ) {
            return null;
        }
        return $date->getTimestamp();
    }
}