Add sitemap support
git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5355 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
This commit is contained in:
187
sitemap.php
Normal file
187
sitemap.php
Normal file
@@ -0,0 +1,187 @@
|
||||
<?php
|
||||
/** DomFramework
|
||||
* @package domframework
|
||||
* @author Dominique Fournier <dominique@fournier38.fr>
|
||||
*/
|
||||
|
||||
/** This class allows reading the sitemap files published by websites, and
 * gives the URLs they list along with the parameters attached to each URL
 */
|
||||
class sitemap
{
  /** The maximum size (in bytes) accepted for an uncompressed sitemap */
  const MAXSIZE = 10000000;

  /** Detect the format of a sitemap (plain text or XML, optionally GZIP
   * compressed) and return its analyzed content:
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (
   *       "changefreq" => "daily",  // lowercase keyword, if valid
   *       "priority" => 0.5,        // float in 0.0-1.0, if valid
   *       "lastmod" => XX),         // UTC Unix timestamp, if valid
   *     ),
   *   "sitemaps" => array (
   *     [http://domain.tld/sitemap2] => array ("lastmod" => XX)
   *   ));
   * The lastmod is returned as a Unix timestamp (like time()), based on UTC.
   * The text format only provides URLs, so each one is associated with an
   * empty array.
   * @param string $content The content file to analyze
   * @param string $url The website URL (only used in the messages)
   * @return array The content of the file if it is valid
   * @throws \Exception (code 406) if the content can not be uncompressed,
   * if its type is not supported or if the XML is too big
   */
  public function analyze ($content, $url)
  // {{{
  {
    $finfo = new \finfo (FILEINFO_MIME_TYPE);
    $type = $finfo->buffer ($content);
    // Depending on the libmagic version, GZIP is reported with one of these
    // two MIME types
    if ($type === "application/x-gzip" || $type === "application/gzip")
    {
      // gzdecode handles the optional GZIP header fields and verifies the
      // trailer. The length is bounded so that a small compressed file can
      // not expand without limit in memory. The warning is suppressed as the
      // failure is reported by the exception below.
      $content = @gzdecode ($content, self::MAXSIZE + 1);
      if ($content === false)
        throw new \Exception ("Can not uncompress the sitemap '$url' ".
                              "(invalid or too big)", 406);
    }

    $content = trim ($content);
    $type = $finfo->buffer ($content);
    if ($type === "text/plain" || $type === "application/x-empty")
      return $this->analyzeText ($content, $url);
    // Depending on the libmagic version, XML is reported with one of these
    // two MIME types
    if ($type === "application/xml" || $type === "text/xml")
      return $this->analyzeXML ($content, $url);
    throw new \Exception ("Type of sitemap '$type' unknown for '$url'", 406);
  }
  // }}}

  /** Return an array containing the URLs of a text sitemap (one URL by
   * line). The text format only provides URLs, so each one is associated
   * with an empty array. New sitemaps files can not be defined in text
   * format.
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (),
   *     ),
   *   "sitemaps" => array ()
   * );
   * Blank lines are ignored and the spaces around each URL are removed.
   * If the content is bigger than MAXSIZE, a E_USER_WARNING is raised and
   * the result is empty.
   * @param string $content The content file to analyze
   * @param string $url The website URL (only used in the messages)
   * @return array The content of the file if it is valid
   */
  public function analyzeText ($content, $url)
  // {{{
  {
    if (strlen ($content) > self::MAXSIZE)
    {
      // A warning (and not E_USER_ERROR, which stops the script by default)
      // so the sitemap is really skipped and the empty result returned
      trigger_error ("Sitemap '$url' size is too big -> skip", E_USER_WARNING);
      return array ("urls" => array (), "sitemaps" => array ());
    }
    $urls = array ();
    foreach (preg_split ('/\r\n|\r|\n/', $content) as $line)
    {
      $line = trim ($line);
      if ($line !== "")
        $urls[$line] = array ();
    }
    return array ("urls" => $urls, "sitemaps" => array ());
  }
  // }}}

  /** Return an array containing the URLs and the sub-sitemaps defined in a
   * XML sitemap (urlset or sitemapindex)
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (
   *       "changefreq" => "daily",  // lowercase keyword, if valid
   *       "priority" => 0.5,        // float in 0.0-1.0, if valid
   *       "lastmod" => XX),         // UTC Unix timestamp, if valid
   *     ),
   *   "sitemaps" => array (
   *     [http://domain.tld/sitemap2] => array ("lastmod" => XX)
   *   ));
   * An invalid property raises a E_USER_WARNING and is not returned. An
   * entry without location raises a E_USER_WARNING and is skipped. If the
   * XML can not be parsed, a E_USER_WARNING is raised and the result is
   * empty.
   * @param string $content The content file to analyze
   * @param string $url The website URL (only used in the messages)
   * @return array The content of the file if it is valid
   * @throws \Exception (code 406) if the content is bigger than MAXSIZE
   */
  public function analyzeXML ($content, $url)
  // {{{
  {
    if (strlen ($content) > self::MAXSIZE)
      throw new \Exception ("Sitemap '$url' size is too big -> skip", 406);
    $res = array ("urls" => array (), "sitemaps" => array ());

    // Collect the libxml errors internally instead of hiding them with @,
    // then restore the previous setting. LIBXML_NONET forbids the network
    // access while parsing, as the content is not trusted.
    $previous = libxml_use_internal_errors (true);
    $xml = simplexml_load_string ($content, "SimpleXMLElement", LIBXML_NONET);
    libxml_clear_errors ();
    libxml_use_internal_errors ($previous);
    if ($xml === false)
    {
      trigger_error ("Invalid XML in sitemap '$url'", E_USER_WARNING);
      return $res;
    }

    // Sitemap index : list of the sub-sitemaps
    foreach ($xml->sitemap as $s)
    {
      $loc = isset ($s->loc) ? trim ((string)$s->loc) : "";
      if ($loc === "")
      {
        trigger_error ("No location in sitemap '$url'", E_USER_WARNING);
        continue;
      }
      $tmp = array ();
      if (isset ($s->lastmod))
      {
        $lastmod = trim ((string)$s->lastmod);
        $ts = $this->parseLastmod ($lastmod);
        if ($ts === false)
          trigger_error ("Invalid lastmod in '$url' ($lastmod)",
                         E_USER_WARNING);
        else
          $tmp["lastmod"] = $ts;
      }
      $res["sitemaps"][$loc] = $tmp;
    }

    // URL set : list of the pages
    $changefreqs = array ("always", "hourly", "daily", "weekly", "monthly",
                          "yearly", "never");
    foreach ($xml->url as $u)
    {
      $loc = isset ($u->loc) ? trim ((string)$u->loc) : "";
      if ($loc === "")
      {
        trigger_error ("No location in sitemap '$url' for url",
                       E_USER_WARNING);
        continue;
      }
      $tmp = array ();
      if (isset ($u->changefreq))
      {
        $changefreq = trim ((string)$u->changefreq);
        if (in_array (strtolower ($changefreq), $changefreqs, true))
          $tmp["changefreq"] = strtolower ($changefreq);
        else
          trigger_error ("Invalid changefreq in '$url' ($changefreq)",
                         E_USER_WARNING);
      }
      if (isset ($u->priority))
      {
        $priority = trim ((string)$u->priority);
        // A non numeric value must not be silently converted to 0.0
        if (! is_numeric ($priority) ||
            (float)$priority < 0.0 || (float)$priority > 1.0)
          trigger_error ("Invalid priority in '$url' ($priority)",
                         E_USER_WARNING);
        else
          $tmp["priority"] = (float)$priority;
      }
      if (isset ($u->lastmod))
      {
        $lastmod = trim ((string)$u->lastmod);
        $ts = $this->parseLastmod ($lastmod);
        if ($ts === false)
          trigger_error ("Invalid lastmod in '$url' ($lastmod)",
                         E_USER_WARNING);
        else
          $tmp["lastmod"] = $ts;
      }
      $res["urls"][$loc] = $tmp;
    }
    return $res;
  }
  // }}}

  /** Convert a lastmod value (W3C Datetime) to a UTC Unix timestamp.
   * The timezone offset provided in the value is honored. A value without
   * timezone is read as UTC. A value which only starts with a valid date
   * (YYYY-MM-DD) is read as this date at midnight UTC.
   * @param string $lastmod The value to convert
   * @return integer|false The timestamp, or false if the value is invalid
   */
  private function parseLastmod ($lastmod)
  // {{{
  {
    // The leading "!" resets the fields not provided by the value, so the
    // missing parts are not filled with the current date and time
    $formats = array ('!Y-m-d\TH:i:s.uP', '!Y-m-d\TH:i:sP', '!Y-m-d\TH:iP',
                      '!Y-m-d\TH:i:s', '!Y-m-d', '!Y-m', '!Y');
    $utc = new \DateTimeZone ("UTC");
    foreach ($formats as $format)
    {
      $date = \DateTimeImmutable::createFromFormat ($format, $lastmod, $utc);
      if ($date === false)
        continue;
      // An out of range field (like the 31st of February) is only reported
      // as a warning. getLastErrors returns false when there is nothing to
      // report (PHP >= 8.2), or an array with null counters.
      $errors = \DateTimeImmutable::getLastErrors ();
      if (is_array ($errors) &&
          ($errors["warning_count"] > 0 || $errors["error_count"] > 0))
        continue;
      return $date->getTimestamp ();
    }
    // Keep accepting the values which only start with a valid date
    if (strlen ($lastmod) > 10)
      return $this->parseLastmod (substr ($lastmod, 0, 10));
    return false;
  }
  // }}}
}
|
||||
Reference in New Issue
Block a user