191 lines
5.8 KiB
PHP
191 lines
5.8 KiB
PHP
<?php
|
|
/** DomFramework
|
|
* @package domframework
|
|
* @author Dominique Fournier <dominique@fournier38.fr>
|
|
* @license BSD
|
|
*/
|
|
|
|
namespace Domframework;
|
|
|
|
/** This class allows reading the sitemap files available on websites, and
 * gives the available URLs with their parameters
 */
class sitemap
{
  /** Maximum size (in bytes) of an uncompressed sitemap accepted by the
   * parsers
   */
  const MAX_SIZE = 10000000;

  /** Detect the format of a sitemap (optionally gzipped) and parse it.
   * Return an array containing the URLs found in the sitemap associated with
   * their optional information, and the sub-sitemaps referenced by an index
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (
   *       "changefreq" => "daily",   // lowercased sitemap keyword
   *       "priority" => 0.5,         // float between 0.0 and 1.0
   *       "lastmod" => XX),          // Unix timestamp (UTC)
   *     ),
   *   "sitemaps" => array (
   *     [http://domain.tld/sitemap2] => array ("lastmod" => XX)
   *   ));
   * Each information key is present only if it is provided and valid.
   * @param string $content The content file to analyze
   * @param string $url The website URL (used in the messages only)
   * @return array The content of the file if it is valid
   * @throws \Exception (code 406) if the content can not be uncompressed, if
   * its type is not supported, or if the XML parser rejects it
   */
  public function analyze ($content, $url)
  // {{{
  {
    $finfo = new \finfo (FILEINFO_MIME_TYPE);
    $type = $finfo->buffer ($content);
    // Depending on the libmagic release, gzip data is reported as
    // "application/x-gzip" or "application/gzip"
    if ($type === "application/x-gzip" || $type === "application/gzip")
    {
      // gzdecode handles the complete gzip container (variable length header
      // and trailer), instead of assuming a fixed 10 bytes header
      $content = gzdecode ($content);
      if ($content === false)
        throw new \Exception ("Sitemap '$url' can not be uncompressed", 406);
    }

    $content = trim ($content);
    $type = $finfo->buffer ($content);
    if ($type === "text/plain" || $type === "application/x-empty")
      return $this->analyzeText ($content, $url);
    if ($type === "application/xml" || $type === "text/xml")
      return $this->analyzeXML ($content, $url);
    throw new \Exception ("Type of sitemap '$type' unknown for '$url'", 406);
  }
  // }}}

  /** Return an array containing the URLs in sitemap associated with an empty
   * array, as the Text format provides only URLs (one per line).
   * New sitemaps files can not be defined in text format.
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (),
   *     ),
   *   "sitemaps" => array ()
   * );
   * Blank lines are ignored and the surrounding spaces of each line are
   * removed. If the content is bigger than MAX_SIZE, a warning is raised and
   * an empty result is returned.
   * @param string $content The content file to analyze
   * @param string $url The website URL (used in the messages only)
   * @return array The content of the file if it is valid
   */
  public function analyzeText ($content, $url)
  // {{{
  {
    if (strlen ($content) > self::MAX_SIZE)
    {
      // Raise a warning and not an E_USER_ERROR: the fatal level stops the
      // script by default, so the sitemap would never be really skipped
      trigger_error ("Sitemap '$url' size is too big -> skip", E_USER_WARNING);
      return array ("urls" => array (), "sitemaps" => array ());
    }
    $urls = array ();
    foreach (preg_split ('/\r\n|\r|\n/', $content) as $line)
    {
      $line = trim ($line);
      // A blank line is not an URL
      if ($line !== "")
        $urls[$line] = array ();
    }
    return array ("urls" => $urls, "sitemaps" => array ());
  }
  // }}}

  /** Return an array containing the URLs in a XML sitemap associated with
   * their optional information, and the sub-sitemaps of a sitemap index
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (
   *       "changefreq" => "daily",   // lowercased sitemap keyword
   *       "priority" => 0.5,         // float between 0.0 and 1.0
   *       "lastmod" => XX),          // Unix timestamp (UTC)
   *     ),
   *   "sitemaps" => array (
   *     [http://domain.tld/sitemap2] => array ("lastmod" => XX)
   *   ));
   * An invalid information raises a warning and is not stored. An entry
   * without location raises a warning and is skipped.
   * @param string $content The content file to analyze
   * @param string $url The website URL (used in the messages only)
   * @return array The content of the file if it is valid
   * @throws \Exception (code 406) if the content is too big or is not a
   * well-formed XML document
   */
  public function analyzeXML ($content, $url)
  // {{{
  {
    if (strlen ($content) > self::MAX_SIZE)
      throw new \Exception ("Sitemap '$url' size is too big -> skip", 406);
    // Collect the libxml errors internally instead of hiding them with @,
    // then restore the previous configuration
    $previous = libxml_use_internal_errors (true);
    $xml = simplexml_load_string ($content, "SimpleXMLElement", LIBXML_NONET);
    libxml_clear_errors ();
    libxml_use_internal_errors ($previous);
    if ($xml === false)
      throw new \Exception ("Sitemap '$url' is not a valid XML document",
        406);

    $res = array ("urls" => [], "sitemaps" => []);
    // Sitemap index entries
    foreach ($xml->sitemap as $s)
    {
      $tmp = array ();
      $loc = isset ($s->loc) ? trim ((string)$s->loc) : "";
      if ($loc === "")
      {
        trigger_error ("No location in sitemap '$url'", E_USER_WARNING);
        continue;
      }
      if (isset ($s->lastmod))
      {
        $ts = $this->lastmodToTimestamp ((string)$s->lastmod, $url);
        if ($ts !== null)
          $tmp["lastmod"] = $ts;
      }
      $res["sitemaps"][$loc] = $tmp;
    }

    // URL entries
    $frequencies = array ("always", "hourly", "daily", "weekly", "monthly",
      "yearly", "never");
    foreach ($xml->url as $u)
    {
      $tmp = array ();
      $loc = isset ($u->loc) ? trim ((string)$u->loc) : "";
      if ($loc === "")
      {
        trigger_error ("No location in sitemap '$url' for url",
          E_USER_WARNING);
        continue;
      }
      if (isset ($u->changefreq))
      {
        $changefreq = (string)$u->changefreq;
        $lowered = strtolower ($changefreq);
        if (in_array ($lowered, $frequencies, true))
          $tmp["changefreq"] = $lowered;
        else
          trigger_error ("Invalid chagefreq in '$url' ($changefreq)",
            E_USER_WARNING);
      }
      if (isset ($u->priority))
      {
        $priority = (float)$u->priority;
        if ($priority < 0.0 || $priority > 1.0)
          trigger_error ("Invalid priority in '$url' ($priority)",
            E_USER_WARNING);
        else
          $tmp["priority"] = $priority;
      }
      if (isset ($u->lastmod))
      {
        $ts = $this->lastmodToTimestamp ((string)$u->lastmod, $url);
        if ($ts !== null)
          $tmp["lastmod"] = $ts;
      }
      $res["urls"][$loc] = $tmp;
    }
    return $res;
  }
  // }}}

  /** Convert a lastmod value (W3C Datetime) to a Unix timestamp.
   * The timezone designator (offset or Z) is honored; a value without
   * timezone is read as UTC. Raise a warning if the value is not valid.
   * @param string $lastmod The lastmod value read in the sitemap
   * @param string $url The website URL (used in the warning message only)
   * @return integer|null The timestamp, or null if the value is invalid
   */
  private function lastmodToTimestamp ($lastmod, $url)
  // {{{
  {
    $value = trim ($lastmod);
    $utc = new \DateTimeZone ("UTC");
    // The leading "!" resets the fields which are not provided (so a
    // date-only value is at midnight), from the most to the less precise
    $formats = array (
      '!Y-m-d\TH:i:s.uP',
      '!Y-m-d\TH:i:sP',
      '!Y-m-d\TH:iP',
      '!Y-m-d\TH:i:s',
      '!Y-m-d',
      '!Y-m',
      '!Y',
    );
    foreach ($formats as $format)
    {
      $date = \DateTimeImmutable::createFromFormat ($format, $value, $utc);
      if ($date === false)
        continue;
      // Refuse the overflowing dates (like February 31st): they are parsed
      // but reported as warnings. getLastErrors returns false or an array
      // depending on the PHP release
      $errors = \DateTimeImmutable::getLastErrors ();
      if (is_array ($errors) &&
          ($errors["warning_count"] > 0 || $errors["error_count"] > 0))
        continue;
      return $date->getTimestamp ();
    }
    trigger_error ("Invalid lastmod in '$url' ($lastmod)", E_USER_WARNING);
    return null;
  }
  // }}}
}
|