204 lines
6.9 KiB
PHP
204 lines
6.9 KiB
PHP
<?php
|
|
|
|
/**
|
|
* DomFramework
|
|
* @package domframework
|
|
* @author Dominique Fournier <dominique@fournier38.fr>
|
|
* @license BSD
|
|
*/
|
|
|
|
namespace Domframework;
|
|
|
|
/**
|
|
* This class reads the sitemap files published by websites and returns the
* URLs they list, together with the parameters attached to each URL
|
|
*/
|
|
class Sitemap
{
    /**
     * Maximum size (in bytes) of an uncompressed sitemap accepted by the
     * analyzers
     */
    const MAX_SIZE = 10000000;

    /**
     * Detect the format of a sitemap (plain text or XML, optionally GZIP
     * compressed) and return its content as an array:
     * array (
     *   "urls" => array (
     *     [http://domain.tld/path...] => array (
     *       "changefreq" => "daily",
     *       "priority" => 0.5,
     *       "lastmod" => XX),
     *   ),
     *   "sitemaps" => array (
     *     [http://domain.tld/sitemap2] => array ("lastmod" => XX)
     * ));
     * The changefreq is the lowercase keyword found in the file, the priority
     * is a float between 0.0 and 1.0 and the lastmod is a Unix timestamp
     * (UTC based, like time()). Each of these keys is set only when the
     * sitemap provides a valid value.
     * @param string $content The content file to analyze
     * @param string $url The website URL (used in the messages only)
     * @return array The content of the file if it is valid
     * @throws \Exception With code 406 if the content can not be uncompressed
     * or if its type is not a supported one
     */
    public function analyze($content, $url)
    {
        $finfo = new \finfo(FILEINFO_MIME_TYPE);
        $type = $finfo->buffer($content);
        // Depending on the libmagic version, GZIP is reported with one name
        // or the other
        if ($type === "application/gzip" || $type === "application/x-gzip") {
            // gzdecode parses the complete GZIP envelope (the header is not
            // always 10 bytes long: it can embed a file name or a comment).
            // The length limit avoids inflating data far beyond what the
            // analyzers accept anyway.
            $content = gzdecode($content, self::MAX_SIZE + 1);
            if ($content === false) {
                throw new \Exception(
                    "Can not uncompress the sitemap '$url'",
                    406
                );
            }
        }

        $content = trim($content);
        // The type must be detected again, as the first detection may have
        // been done on the compressed data
        $type = $finfo->buffer($content);
        if ($type === "text/plain" || $type === "application/x-empty") {
            return $this->analyzeText($content, $url);
        }
        if ($type === "application/xml" || $type === "text/xml") {
            return $this->analyzeXML($content, $url);
        }
        throw new \Exception("Type of sitemap '$type' unknown for '$url'", 406);
    }

    /**
     * Return an array containing the URL in sitemap associated with an empty
     * array, as the Text format provide only URL, so return empty array.
     * New sitemaps files can not be defined in text format.
     * array (
     *   "urls" => array (
     *     [http://domain.tld/path...] => array (),
     *   ),
     *   "sitemaps" => array ()
     * );
     * The lines are trimmed and the empty ones are ignored. If the content
     * is bigger than MAX_SIZE, a warning is raised and no URL is returned.
     * @param string $content The content file to analyze
     * @param string $url The website URL (used in the messages only)
     * @return array The content of the file if it is valid
     */
    public function analyzeText($content, $url)
    {
        if (strlen($content) > self::MAX_SIZE) {
            // A warning (and not a fatal E_USER_ERROR) is raised, so the
            // sitemap is really skipped and the caller can continue
            trigger_error(
                "Sitemap '$url' size is too big -> skip",
                E_USER_WARNING
            );
            return ["urls" => [], "sitemaps" => []];
        }
        $urls = [];
        foreach (preg_split('/\r\n|\r|\n/', $content) as $line) {
            $line = trim($line);
            if ($line !== "") {
                $urls[$line] = [];
            }
        }
        return ["urls" => $urls, "sitemaps" => []];
    }

    /**
     * Return an array containing the URL found in a XML sitemap (urlset) or
     * the sub-sitemaps found in a XML sitemap index (sitemapindex):
     * array (
     *   "urls" => array (
     *     [http://domain.tld/path...] => array (
     *       "changefreq" => "daily",
     *       "priority" => 0.5,
     *       "lastmod" => XX),
     *   ),
     *   "sitemaps" => array (
     *     [http://domain.tld/sitemap2] => array ("lastmod" => XX)
     * ));
     * The changefreq is the lowercase keyword found in the file, the priority
     * is a float between 0.0 and 1.0 and the lastmod is a Unix timestamp
     * (UTC based, like time()). An invalid value raises a warning and the
     * matching key is not set. An entry without location raises a warning
     * and is skipped.
     * @param string $content The content file to analyze
     * @param string $url The website URL (used in the messages only)
     * @return array The content of the file if it is valid
     * @throws \Exception With code 406 if the content is too big or is not a
     * well-formed XML document
     */
    public function analyzeXML($content, $url)
    {
        if (strlen($content) > self::MAX_SIZE) {
            throw new \Exception("Sitemap '$url' size is too big -> skip", 406);
        }
        // Collect the libxml errors internally instead of hiding them with
        // the @ operator, then restore the previous configuration
        $previous = libxml_use_internal_errors(true);
        $xml = simplexml_load_string($content, "SimpleXMLElement", LIBXML_NONET);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        if ($xml === false) {
            throw new \Exception(
                "Sitemap '$url' is not a valid XML document",
                406
            );
        }

        $res = ["urls" => [], "sitemaps" => []];
        foreach ($xml->sitemap as $s) {
            $loc = isset($s->loc) ? trim((string)$s->loc) : "";
            if ($loc === "") {
                trigger_error("No location in sitemap '$url'", E_USER_WARNING);
                continue;
            }
            $tmp = [];
            if (isset($s->lastmod)) {
                $lastmod = trim((string)$s->lastmod);
                $ts = $this->parseLastmod($lastmod);
                if ($ts === null) {
                    trigger_error(
                        "Invalid lastmod in '$url' ($lastmod)",
                        E_USER_WARNING
                    );
                } else {
                    $tmp["lastmod"] = $ts;
                }
            }
            $res["sitemaps"][$loc] = $tmp;
        }

        $changefreqs = ["always", "hourly", "daily", "weekly", "monthly",
            "yearly", "never"];
        foreach ($xml->url as $u) {
            $loc = isset($u->loc) ? trim((string)$u->loc) : "";
            if ($loc === "") {
                trigger_error(
                    "No location in sitemap '$url' for url",
                    E_USER_WARNING
                );
                continue;
            }
            $tmp = [];
            if (isset($u->changefreq)) {
                $changefreq = trim((string)$u->changefreq);
                if (in_array(strtolower($changefreq), $changefreqs, true)) {
                    $tmp["changefreq"] = strtolower($changefreq);
                } else {
                    trigger_error(
                        "Invalid changefreq in '$url' ($changefreq)",
                        E_USER_WARNING
                    );
                }
            }
            if (isset($u->priority)) {
                $priority = trim((string)$u->priority);
                // A non numeric value must not be silently converted to 0.0
                if (
                    ! is_numeric($priority)
                    || (float)$priority < 0.0
                    || (float)$priority > 1.0
                ) {
                    trigger_error(
                        "Invalid priority in '$url' ($priority)",
                        E_USER_WARNING
                    );
                } else {
                    $tmp["priority"] = (float)$priority;
                }
            }
            if (isset($u->lastmod)) {
                $lastmod = trim((string)$u->lastmod);
                $ts = $this->parseLastmod($lastmod);
                if ($ts === null) {
                    trigger_error(
                        "Invalid lastmod in '$url' ($lastmod)",
                        E_USER_WARNING
                    );
                } else {
                    $tmp["lastmod"] = $ts;
                }
            }
            $res["urls"][$loc] = $tmp;
        }
        return $res;
    }

    /**
     * Convert a lastmod value (W3C Datetime format) to a Unix timestamp.
     * The timezone designator is honored when there is one. The values
     * without timezone are read as UTC, and the missing parts (month, day,
     * time) are set to their lowest value.
     * @param string $lastmod The value to convert
     * @return integer|null The timestamp, or null if the value is invalid
     */
    private function parseLastmod($lastmod)
    {
        // The "!" resets the fields not provided by the format, instead of
        // using the current date and time for them
        $formats = [
            '!Y-m-d\TH:i:s.uP',
            '!Y-m-d\TH:i:sP',
            '!Y-m-d\TH:iP',
            '!Y-m-d\TH:i:s',
            '!Y-m-d',
            '!Y-m',
            '!Y',
        ];
        $utc = new \DateTimeZone("UTC");
        foreach ($formats as $format) {
            $date = \DateTimeImmutable::createFromFormat(
                $format,
                $lastmod,
                $utc
            );
            if ($date === false) {
                continue;
            }
            // An overflowing date (like 2020-02-31) is accepted with a
            // warning by createFromFormat: reject it.
            // getLastErrors returns false when there is nothing to report
            // on the recent PHP releases, and an array on the older ones.
            $errors = \DateTimeImmutable::getLastErrors();
            if (
                is_array($errors)
                && ($errors["warning_count"] > 0 || $errors["error_count"] > 0)
            ) {
                continue;
            }
            return $date->getTimestamp();
        }
        return null;
    }
}
|