Add sitemap support

git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5355 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
This commit is contained in:
2019-06-14 07:17:06 +00:00
parent 5c49e53251
commit 5db4b19abd
2 changed files with 279 additions and 0 deletions

187
sitemap.php Normal file
View File

@@ -0,0 +1,187 @@
<?php
/** DomFramework
* @package domframework
* @author Dominique Fournier <dominique@fournier38.fr>
*/
/** This class allows to read the sitemap files published by websites and
 * gives the URLs they list, with the associated parameters.
 * Three formats are supported: XML (urlset or sitemapindex), plain text (one
 * URL per line) and the gzip-compressed version of both.
 */
class sitemap
{
  /** The maximum size, in bytes, of an uncompressed sitemap. Bigger contents
   * are refused by the analyzers
   */
  const MAX_SIZE = 10000000;

  /** Detect the format of a sitemap (gzip, XML or text) and return an array
   * containing the URLs and the sub-sitemaps it defines:
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (
   *       "changefreq" => "daily",
   *       "priority" => 0.8,
   *       "lastmod" => XX),
   *   ),
   *   "sitemaps" => array (
   *     [http://domain.tld/sitemap2] => array ("lastmod" => XX)
   *   ));
   * Each parameter is only present if it is defined and valid in the file.
   * The changefreq is the lowercased keyword (always, hourly, daily, weekly,
   * monthly, yearly, never), the priority is a float between 0.0 and 1.0 and
   * the lastmod is a Unix timestamp (as returned by time()).
   * @param string $content The content file to analyze
   * @param string $url The website URL
   * @return array The content of the file if it is valid
   * @throws \Exception (code 406) if the content can not be uncompressed, if
   * its type is unknown, or if the XML analyzer refuses it
   */
  public function analyze ($content, $url)
  // {{{
  {
    $finfo = new \finfo (FILEINFO_MIME_TYPE);
    $type = $finfo->buffer ($content);
    // The MIME type reported for gzip depends on the libmagic version
    if ($type === "application/x-gzip" || $type === "application/gzip")
    {
      // gzdecode understands the complete gzip container (optional file
      // name, comment, extra fields and trailer). The length is bounded to
      // not exhaust the memory with a decompression bomb.
      $content = @gzdecode ($content, self::MAX_SIZE + 1);
      if ($content === false)
        throw new \Exception ("Can not uncompress the sitemap '$url' ".
                              "(corrupted or too big)", 406);
    }
    $content = trim ($content);
    if ($content === "")
      return $this->analyzeText ($content, $url);
    $type = $finfo->buffer ($content);
    // The MIME type reported for XML depends on the libmagic version too
    if ($type === "application/xml" || $type === "text/xml")
      return $this->analyzeXML ($content, $url);
    if ($type === "text/plain" || $type === "application/x-empty")
    {
      // A XML sitemap without XML declaration can be detected as plain text
      // A text sitemap can not start by one of these tags
      if (preg_match ('/^<(\?xml|urlset|sitemapindex)[\s>]/', $content) === 1)
        return $this->analyzeXML ($content, $url);
      return $this->analyzeText ($content, $url);
    }
    throw new \Exception ("Type of sitemap '$type' unknown for '$url'", 406);
  }
  // }}}

  /** Return an array containing the URLs of a text sitemap, each one
   * associated with an empty array, as the text format provides only URLs.
   * New sitemaps files can not be defined in text format.
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (),
   *   ),
   *   "sitemaps" => array ()
   * );
   * The empty lines are ignored and the spaces around the URLs are removed.
   * If the content is bigger than MAX_SIZE, a warning is raised and an empty
   * result is returned.
   * @param string $content The content file to analyze
   * @param string $url The website URL
   * @return array The content of the file if it is valid
   */
  public function analyzeText ($content, $url)
  // {{{
  {
    if (strlen ($content) > self::MAX_SIZE)
    {
      // Raise a warning and not a fatal error, so the empty result below is
      // really returned and the sitemap is skipped
      trigger_error ("Sitemap '$url' size is too big -> skip", E_USER_WARNING);
      return array ("urls" => array (), "sitemaps" => array ());
    }
    $urls = array ();
    foreach (preg_split ('/\r\n|\r|\n/', $content) as $line)
    {
      $line = trim ($line);
      if ($line !== "")
        $urls[$line] = array ();
    }
    return array ("urls" => $urls, "sitemaps" => array ());
  }
  // }}}

  /** Return an array containing the URLs and the sub-sitemaps defined in a
   * XML sitemap (urlset or sitemapindex):
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (
   *       "changefreq" => "daily",
   *       "priority" => 0.8,
   *       "lastmod" => XX),
   *   ),
   *   "sitemaps" => array (
   *     [http://domain.tld/sitemap2] => array ("lastmod" => XX)
   *   ));
   * Each parameter is only present if it is defined and valid in the file.
   * An invalid parameter raises a warning and is ignored. An entry without
   * location raises a warning and is skipped.
   * The changefreq is the lowercased keyword, the priority is a float between
   * 0.0 and 1.0 and the lastmod is a Unix timestamp.
   * @param string $content The content file to analyze
   * @param string $url The website URL
   * @return array The content of the file if it is valid
   * @throws \Exception (code 406) if the content is bigger than MAX_SIZE or
   * is not a well-formed XML document
   */
  public function analyzeXML ($content, $url)
  // {{{
  {
    if (strlen ($content) > self::MAX_SIZE)
      throw new \Exception ("Sitemap '$url' size is too big -> skip", 406);
    // Collect the libxml errors internally instead of displaying them, and
    // forbid the network access while parsing a remote document
    $previous = libxml_use_internal_errors (true);
    $xml = simplexml_load_string ($content, "SimpleXMLElement", LIBXML_NONET);
    libxml_clear_errors ();
    libxml_use_internal_errors ($previous);
    if ($xml === false)
      throw new \Exception ("Sitemap '$url' is not a valid XML document", 406);
    $frequencies = array ("always", "hourly", "daily", "weekly", "monthly",
                          "yearly", "never");
    $res = array ("urls" => array (), "sitemaps" => array ());
    foreach ($xml->sitemap as $s)
    {
      $loc = isset ($s->loc) ? trim ((string)$s->loc) : "";
      if ($loc === "")
      {
        trigger_error ("No location in sitemap '$url'", E_USER_WARNING);
        continue;
      }
      $tmp = array ();
      if (isset ($s->lastmod))
      {
        $lastmod = trim ((string)$s->lastmod);
        $ts = $this->parseLastmod ($lastmod);
        if ($ts === null)
          trigger_error ("Invalid lastmod in '$url' ($lastmod)",
                         E_USER_WARNING);
        else
          $tmp["lastmod"] = $ts;
      }
      $res["sitemaps"][$loc] = $tmp;
    }
    foreach ($xml->url as $u)
    {
      $loc = isset ($u->loc) ? trim ((string)$u->loc) : "";
      if ($loc === "")
      {
        trigger_error ("No location in sitemap '$url' for url",
                       E_USER_WARNING);
        continue;
      }
      $tmp = array ();
      if (isset ($u->changefreq))
      {
        $changefreq = trim ((string)$u->changefreq);
        $keyword = strtolower ($changefreq);
        if (in_array ($keyword, $frequencies, true))
          $tmp["changefreq"] = $keyword;
        else
          trigger_error ("Invalid changefreq in '$url' ($changefreq)",
                         E_USER_WARNING);
      }
      if (isset ($u->priority))
      {
        $priority = trim ((string)$u->priority);
        // A non numeric value must not be silently converted to 0.0
        if (! is_numeric ($priority) ||
            (float)$priority < 0.0 || (float)$priority > 1.0)
          trigger_error ("Invalid priority in '$url' ($priority)",
                         E_USER_WARNING);
        else
          $tmp["priority"] = (float)$priority;
      }
      if (isset ($u->lastmod))
      {
        $lastmod = trim ((string)$u->lastmod);
        $ts = $this->parseLastmod ($lastmod);
        if ($ts === null)
          trigger_error ("Invalid lastmod in '$url' ($lastmod)",
                         E_USER_WARNING);
        else
          $tmp["lastmod"] = $ts;
      }
      $res["urls"][$loc] = $tmp;
    }
    return $res;
  }
  // }}}

  /** Convert a lastmod value (W3C Datetime) to a Unix timestamp.
   * The offset provided in the value is applied. If there is no offset, the
   * value is interpreted as UTC. The missing parts (time, day, month) are
   * set to their first value.
   * @param string $lastmod The lastmod value read in the sitemap
   * @return integer|null The timestamp, or null if the value is not valid
   */
  private function parseLastmod ($lastmod)
  // {{{
  {
    // The "!" resets the fields not provided by the format, instead of
    // using the current date and time
    $formats = array (
      '!Y-m-d\TH:i:s.uP',
      '!Y-m-d\TH:i:sP',
      '!Y-m-d\TH:iP',
      '!Y-m-d\TH:i:s',
      '!Y-m-d',
      '!Y-m',
      '!Y',
    );
    $utc = new \DateTimeZone ("UTC");
    foreach ($formats as $format)
    {
      $date = \DateTime::createFromFormat ($format, $lastmod, $utc);
      if ($date === false)
        continue;
      // A warning is reported when the date overflowed (like February 31th).
      // getLastErrors returns false instead of an array on recent PHP
      // versions when there is nothing to report.
      $errors = \DateTime::getLastErrors ();
      if (is_array ($errors) &&
          ($errors["warning_count"] > 0 || $errors["error_count"] > 0))
        continue;
      return $date->getTimestamp ();
    }
    return null;
  }
  // }}}
}