Add sitemap support
git-svn-id: https://svn.fournier38.fr/svn/ProgSVN/trunk@5355 bf3deb0d-5f1a-0410-827f-c0cc1f45334c
This commit is contained in:
92
Tests/sitemapTest.php
Normal file
92
Tests/sitemapTest.php
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
<?php
|
||||||
|
/** Test the sitemap.php file
 * Each test feeds a sitemap content to sitemap::analyze () and compares the
 * returned array with the expected one. assertSame () is used, so the keys
 * order and the values types must match exactly.
 */
class test_sitemap extends PHPUnit_Framework_TestCase
{
  /** Empty sitemap: an empty string gives no URL and no sub-sitemap
   */
  public function test_EmptySitemap_1 ()
  {
    $sitemap = new sitemap ();
    $res = $sitemap->analyze ("", "http://example.com");
    // PHPUnit expects the expected value first, then the actual one
    $this->assertSame (["urls" => [], "sitemaps" => []], $res);
  }

  /** Blank sitemap: a content with only spaces is handled like an empty one
   */
  public function test_EmptySitemap_2 ()
  {
    $sitemap = new sitemap ();
    $res = $sitemap->analyze (" ", "http://example.com");
    $this->assertSame (["urls" => [], "sitemaps" => []], $res);
  }

  /** Textual sitemap with a single URL
   */
  public function test_TextualSitemap_1 ()
  {
    $sitemap = new sitemap ();
    $res = $sitemap->analyze ("http://example.com", "http://example.com");
    $this->assertSame (
      ["urls" => ["http://example.com" => []],
       "sitemaps" => []],
      $res);
  }

  /** Textual sitemap with two URLs and trailing empty lines
   */
  public function test_TextualSitemap_2 ()
  {
    $sitemap = new sitemap ();
    $res = $sitemap->analyze ("http://example.com\nhttps://www.example.com\n\n",
      "http://example.com");
    $this->assertSame (
      ["urls" => ["http://example.com" => [], "https://www.example.com" => []],
       "sitemaps" => []],
      $res);
  }

  /** XML sitemap (urlset) with one URL carrying lastmod, changefreq and
   * priority. The lastmod is expected as a UTC timestamp.
   */
  public function test_XMLSitemap_1 ()
  {
    $sitemap = new sitemap ();
    $res = $sitemap->analyze (
'<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
<url>
<loc>http://example.com/</loc>
<lastmod>2006-11-18</lastmod>
<changefreq>daily</changefreq>
<priority>0.8</priority>
</url>
</urlset>',
      "http://example.com");
    $this->assertSame (
      ["urls" => [
         "http://example.com/" => ["changefreq" => "daily",
                                   "priority" => 0.8,
                                   "lastmod" => 1163808000]
       ],
       "sitemaps" => []],
      $res);
  }

  /** XML sitemap index (sitemapindex) referencing two sub-sitemaps, one with
   * a full date-time lastmod and one with a date-only lastmod
   */
  public function test_XMLSitemap_2 ()
  {
    $sitemap = new sitemap ();
    $res = $sitemap->analyze (
'<?xml version="1.0" encoding="utf-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>http://www.example.com/sitemap1.xml.gz</loc>
<lastmod>2004-10-01T18:23:17+00:00</lastmod>
</sitemap>
<sitemap>
<loc>http://www.example.com/sitemap2.xml.gz</loc>
<lastmod>2005-01-01</lastmod>
</sitemap>
</sitemapindex>',
      "http://example.com");
    $this->assertSame (
      ["urls" => [],
       "sitemaps" => [
         "http://www.example.com/sitemap1.xml.gz" => [
           "lastmod" => 1096654997,],
         "http://www.example.com/sitemap2.xml.gz" => [
           "lastmod" => 1104537600, ],
       ]],
      $res);
  }
}
|
||||||
187
sitemap.php
Normal file
187
sitemap.php
Normal file
@@ -0,0 +1,187 @@
|
|||||||
|
<?php
|
||||||
|
/** DomFramework
|
||||||
|
* @package domframework
|
||||||
|
* @author Dominique Fournier <dominique@fournier38.fr>
|
||||||
|
*/
|
||||||
|
|
||||||
|
/** This class allows to read the sitemap files available in websites, and
 * gives the available URLs and their parameters
 */
class sitemap
{
  /** Maximum size (in bytes) of a sitemap content accepted by the analyzers
   */
  const MAX_SIZE = 10000000;

  /** The changefreq keywords accepted in a XML sitemap
   */
  private static $changefreqs = array ("always", "hourly", "daily", "weekly",
    "monthly", "yearly", "never");

  /** The lastmod formats accepted in a XML sitemap, tried in this order.
   * The leading "!" resets the fields absent from the format, so a
   * date-only value is read as midnight.
   */
  private static $lastmodFormats = array (
    "!Y-m-d\\TH:i:sP",
    "!Y-m-d\\TH:i:s.uP",
    "!Y-m-d\\TH:iP",
    "!Y-m-d",
    "!Y-m",
    "!Y",
  );

  /** Return an array containing the URL in sitemap associated with the
   * information of priority, changefreq and lastmod
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (
   *       "changefreq" => "daily",
   *       "priority" => 0.8,
   *       "lastmod" => XX),
   *     ),
   *   "sitemaps" => array (
   *     [http://domain.tld/sitemap2] => array ("lastmod" => XX)
   *   ));
   * The changefreq is the lowercase keyword found in the file, the priority
   * is a float between 0.0 and 1.0, and the lastmod is returned as a
   * time() value based on UTC. Each of them is optional.
   * A gzip content is uncompressed before the analysis. A blank content
   * gives an empty result.
   * @param string $content The content file to analyze
   * @param string $url The website URL
   * @return array The content of the file if it is valid
   * @throws \Exception With code 406 if the gzip content can not be
   * uncompressed or if the type of the content is not supported
   */
  public function analyze ($content, $url)
  // {{{
  {
    $empty = array ("urls" => array (), "sitemaps" => array ());
    // A blank content holds nothing: do not depend on the MIME type reported
    // by libmagic for an empty buffer
    if (trim ($content) === "")
      return $empty;

    $finfo = new \finfo (FILEINFO_MIME_TYPE);
    $type = $finfo->buffer ($content);
    // The name of the gzip MIME type depends on the libmagic version
    if ($type === "application/x-gzip" || $type === "application/gzip")
    {
      // gzdecode reads the gzip header itself (it is not always 10 bytes
      // long). The warning is replaced by the exception below.
      $inflated = @gzdecode ($content);
      if ($inflated === false)
        throw new \Exception ("Can not uncompress the sitemap '$url'", 406);
      $content = $inflated;
    }

    $content = trim ($content);
    if ($content === "")
      return $empty;
    $type = $finfo->buffer ($content);
    switch ($type)
    {
      case "text/plain":
      case "application/x-empty":
      case "inode/x-empty":
        return $this->analyzeText ($content, $url);
      // XML is reported as text/xml or application/xml depending on the
      // libmagic version
      case "application/xml":
      case "text/xml":
        return $this->analyzeXML ($content, $url);
    }
    throw new \Exception ("Type of sitemap '$type' unknown for '$url'", 406);
  }
  // }}}

  /** Return an array containing the URL in sitemap associated with an empty
   * array, as the Text format provide only URL, so return empty array.
   * New sitemaps files can not be defined in text format.
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (),
   *     ),
   *   "sitemaps" => array ()
   * );
   * The empty lines are ignored and the spaces around each URL are removed.
   * If the content is bigger than MAX_SIZE, a E_USER_WARNING is raised and
   * an empty result is returned.
   * @param string $content The content file to analyze
   * @param string $url The website URL
   * @return array The content of the file if it is valid
   */
  public function analyzeText ($content, $url)
  // {{{
  {
    if (strlen ($content) > self::MAX_SIZE)
    {
      // A warning and not an error: the sitemap is skipped, the caller must
      // be able to continue with the returned empty result
      trigger_error ("Sitemap '$url' size is too big -> skip", E_USER_WARNING);
      return array ("urls" => array (), "sitemaps" => array ());
    }
    $urls = array ();
    foreach (preg_split ('/\r\n|\r|\n/', $content) as $line)
    {
      $line = trim ($line);
      // Skip the blank lines, wherever they are in the file
      if ($line === "")
        continue;
      $urls[$line] = array ();
    }
    return array ("urls" => $urls, "sitemaps" => array ());
  }
  // }}}

  /** Return an array containing the URL in sitemap associated with the
   * information of priority, changefreq and lastmod
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (
   *       "changefreq" => "daily",
   *       "priority" => 0.8,
   *       "lastmod" => XX),
   *     ),
   *   "sitemaps" => array (
   *     [http://domain.tld/sitemap2] => array ("lastmod" => XX)
   *   ));
   * The entries without location are skipped with a E_USER_WARNING. The
   * invalid changefreq, priority or lastmod values raise a E_USER_WARNING
   * and are not stored in the result.
   * If the content is not a valid XML document, a E_USER_WARNING is raised
   * and an empty result is returned.
   * @param string $content The content file to analyze
   * @param string $url The website URL
   * @return array The content of the file if it is valid
   * @throws \Exception With code 406 if the content is bigger than MAX_SIZE
   */
  public function analyzeXML ($content, $url)
  // {{{
  {
    if (strlen ($content) > self::MAX_SIZE)
      throw new \Exception ("Sitemap '$url' size is too big -> skip", 406);
    $res = array ("urls" => array (), "sitemaps" => array ());

    // Collect the libxml errors internally instead of raising PHP warnings,
    // and never allow the parser to access the network
    $previous = libxml_use_internal_errors (true);
    $xml = simplexml_load_string ($content, "SimpleXMLElement", LIBXML_NONET);
    libxml_clear_errors ();
    libxml_use_internal_errors ($previous);
    if ($xml === false)
    {
      trigger_error ("Sitemap '$url' is not a valid XML document",
        E_USER_WARNING);
      return $res;
    }

    // Sitemap index: list of sub-sitemaps
    foreach ($xml->sitemap as $s)
    {
      $loc = isset ($s->loc) ? trim ((string)$s->loc) : "";
      if ($loc === "")
      {
        trigger_error ("No location in sitemap '$url'", E_USER_WARNING);
        continue;
      }
      $tmp = array ();
      if (isset ($s->lastmod))
      {
        $lastmod = (string)$s->lastmod;
        $ts = $this->parseLastmod ($lastmod);
        if ($ts === false)
          trigger_error ("Invalid lastmod in '$url' ($lastmod)",
            E_USER_WARNING);
        else
          $tmp["lastmod"] = $ts;
      }
      $res["sitemaps"][$loc] = $tmp;
    }

    // URL set: list of pages
    foreach ($xml->url as $u)
    {
      $loc = isset ($u->loc) ? trim ((string)$u->loc) : "";
      if ($loc === "")
      {
        trigger_error ("No location in sitemap '$url' for url",
          E_USER_WARNING);
        continue;
      }
      $tmp = array ();
      if (isset ($u->changefreq))
      {
        $changefreq = (string)$u->changefreq;
        $keyword = strtolower (trim ($changefreq));
        if (in_array ($keyword, self::$changefreqs, true))
          $tmp["changefreq"] = $keyword;
        else
          trigger_error ("Invalid changefreq in '$url' ($changefreq)",
            E_USER_WARNING);
      }
      if (isset ($u->priority))
      {
        $priority = trim ((string)$u->priority);
        // A non numeric value must not be silently read as 0.0
        if (! is_numeric ($priority) ||
            (float)$priority < 0.0 || (float)$priority > 1.0)
          trigger_error ("Invalid priority in '$url' ($priority)",
            E_USER_WARNING);
        else
          $tmp["priority"] = (float)$priority;
      }
      if (isset ($u->lastmod))
      {
        $lastmod = (string)$u->lastmod;
        $ts = $this->parseLastmod ($lastmod);
        if ($ts === false)
          trigger_error ("Invalid lastmod in '$url' ($lastmod)",
            E_USER_WARNING);
        else
          $tmp["lastmod"] = $ts;
      }
      $res["urls"][$loc] = $tmp;
    }
    return $res;
  }
  // }}}

  /** Convert a lastmod value to a time() value based on UTC.
   * The value is tested against each format of $lastmodFormats. The
   * timezone offset of the value is honored if it is provided, the value is
   * read as UTC otherwise.
   * @param string $lastmod The lastmod value read in the sitemap
   * @return integer|false The timestamp, or false if the value matches none
   * of the formats or is not a valid date
   */
  private function parseLastmod ($lastmod)
  // {{{
  {
    $lastmod = trim ($lastmod);
    $utc = new \DateTimeZone ("UTC");
    foreach (self::$lastmodFormats as $format)
    {
      $date = \DateTimeImmutable::createFromFormat ($format, $lastmod, $utc);
      if ($date === false)
        continue;
      // An out of range date (like 2006-02-30) is silently shifted by
      // createFromFormat and is only reported as a warning: reject it.
      // getLastErrors returns false in place of an array if there is nothing
      // to report (PHP 8.2 and later)
      $errors = \DateTimeImmutable::getLastErrors ();
      if (is_array ($errors) &&
          ($errors["warning_count"] > 0 || $errors["error_count"] > 0))
        continue;
      return $date->getTimestamp ();
    }
    return false;
  }
  // }}}
}
|
||||||
Reference in New Issue
Block a user