diff --git a/Tests/sitemapTest.php b/Tests/sitemapTest.php
new file mode 100644
index 0000000..76334a1
--- /dev/null
+++ b/Tests/sitemapTest.php
@@ -0,0 +1,92 @@
+analyze ("", "http://example.com");
+ $this->assertSame ($res, ["urls" => [], "sitemaps" => []]);
+ }
+
+ // Empty Sitemap
+ /** A sitemap containing only whitespace must be reported as empty: no URL
+  * and no nested sitemap.
+  */
+ public function test_EmptySitemap_2 ()
+ {
+   $sitemap = new sitemap ();
+   $res = $sitemap->analyze (" ", "http://example.com");
+   // PHPUnit expects the expected value first, then the actual one, so that
+   // failure messages are reported the right way round
+   $this->assertSame (["urls" => [], "sitemaps" => []], $res);
+ }
+
+ // Textual Sitemap
+ /** A textual sitemap made of a single URL returns that URL associated with
+  * an empty array of metadata, and no nested sitemap.
+  */
+ public function test_TextualSitemap_1 ()
+ {
+   $sitemap = new sitemap ();
+   $res = $sitemap->analyze ("http://example.com", "http://example.com");
+   // Expected value first, actual value second (PHPUnit convention)
+   $this->assertSame (
+     ["urls" => ["http://example.com" => []],
+      "sitemaps" => []],
+     $res);
+ }
+ /** A textual sitemap with several URLs, one per line, followed by trailing
+  * empty lines: each URL is returned once, the empty lines are ignored.
+  */
+ public function test_TextualSitemap_2 ()
+ {
+   $sitemap = new sitemap ();
+   $res = $sitemap->analyze ("http://example.com\nhttps://www.example.com\n\n",
+     "http://example.com");
+   // Expected value first, actual value second (PHPUnit convention)
+   $this->assertSame (
+     ["urls" => ["http://example.com" => [], "https://www.example.com" => []],
+      "sitemaps" => []],
+     $res);
+ }
+
+ // XML Sitemap
+ /** An XML sitemap (urlset) with one URL carrying lastmod, changefreq and
+  * priority: the three values must be returned, lastmod as a UTC timestamp.
+  */
+ public function test_XMLSitemap_1 ()
+ {
+   $sitemap = new sitemap ();
+   // The XML markup of this fixture is mandatory: without the tags the
+   // content would be detected as a textual sitemap
+   $res = $sitemap->analyze (
+'<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url>
+    <loc>http://example.com/</loc>
+    <lastmod>2006-11-18</lastmod>
+    <changefreq>daily</changefreq>
+    <priority>0.8</priority>
+  </url>
+</urlset>',
+"http://example.com");
+   // Expected value first, actual value second (PHPUnit convention).
+   // 1163808000 is 2006-11-18T00:00:00 UTC
+   $this->assertSame (
+     ["urls" => [
+        "http://example.com/" => ["changefreq" => "daily",
+                                  "priority" => 0.8,
+                                  "lastmod" => 1163808000]
+      ],
+      "sitemaps" => []],
+     $res);
+ }
+
+ /** An XML sitemap index (sitemapindex) listing two nested sitemaps: each
+  * one must be returned with its lastmod converted to a UTC timestamp,
+  * whether the date is a full W3C datetime or a plain date.
+  */
+ public function test_XMLSitemap_2 ()
+ {
+   $sitemap = new sitemap ();
+   // The XML markup of this fixture is mandatory: without the tags the
+   // content would be detected as a textual sitemap
+   $res = $sitemap->analyze (
+'<?xml version="1.0" encoding="UTF-8"?>
+<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <sitemap>
+    <loc>http://www.example.com/sitemap1.xml.gz</loc>
+    <lastmod>2004-10-01T18:23:17+00:00</lastmod>
+  </sitemap>
+  <sitemap>
+    <loc>http://www.example.com/sitemap2.xml.gz</loc>
+    <lastmod>2005-01-01</lastmod>
+  </sitemap>
+</sitemapindex>',
+"http://example.com");
+   // Expected value first, actual value second (PHPUnit convention).
+   // 1096654997 is 2004-10-01T18:23:17 UTC, 1104537600 is 2005-01-01 UTC
+   $this->assertSame (
+     ["urls" => [],
+      "sitemaps" => [
+        "http://www.example.com/sitemap1.xml.gz" => [
+          "lastmod" => 1096654997, ],
+        "http://www.example.com/sitemap2.xml.gz" => [
+          "lastmod" => 1104537600, ],
+      ]],
+     $res);
+ }
+}
diff --git a/sitemap.php b/sitemap.php
new file mode 100644
index 0000000..a81cf1d
--- /dev/null
+++ b/sitemap.php
@@ -0,0 +1,187 @@
+
+ */
+
+/** Read the sitemap files published by websites (plain text, XML urlset or
+ * XML sitemapindex, optionally gzip-compressed) and return the URLs they
+ * list, together with the optional metadata attached to each of them.
+ */
+class sitemap
+{
+  /** Maximum size in bytes of an (uncompressed) sitemap accepted for parsing
+   */
+  const MAX_SIZE = 10000000;
+
+  /** Detect the format of a sitemap and return its content as
+   * array (
+   *   "urls" => array (
+   *     [http://domain.tld/path...] => array (
+   *       "changefreq" => "always|hourly|daily|weekly|monthly|yearly|never",
+   *       "priority" => 0.0 to 1.0 (float),
+   *       "lastmod" => XX),
+   *   ),
+   *   "sitemaps" => array (
+   *     [http://domain.tld/sitemap2] => array ("lastmod" => XX)
+   *   ));
+   * Each metadata key is present only if the sitemap provides a valid value.
+   * The lastmod is returned as a Unix timestamp (like time()), based on UTC.
+   * @param string $content The content file to analyze
+   * @param string $url The website URL (only used in the messages)
+   * @return array The content of the file if it is valid
+   * @throws \Exception (code 406) if the content can not be uncompressed, or
+   * if its type is neither text nor XML
+   */
+  public function analyze ($content, $url)
+  // {{{
+  {
+    $content = (string) $content;
+    // A gzip stream always starts with the magic bytes 1F 8B (RFC 1952).
+    // Testing them directly avoids depending on the MIME name returned by
+    // libmagic, which is "application/x-gzip" or "application/gzip"
+    // depending on its version.
+    if (substr ($content, 0, 2) === "\x1f\x8b")
+    {
+      // gzdecode parses the complete gzip header (which can be longer than 10
+      // bytes when it embeds the file name) and checks the trailing CRC
+      $content = gzdecode ($content);
+      if ($content === false)
+        throw new \Exception ("Sitemap '$url' can not be uncompressed", 406);
+    }
+
+    $content = trim ($content);
+    // Nothing to detect in an empty file: it is an empty textual sitemap
+    if ($content === "")
+      return $this->analyzeText ($content, $url);
+
+    $finfo = new \finfo (FILEINFO_MIME_TYPE);
+    $type = $finfo->buffer ($content);
+    if (in_array ($type, array ("text/plain", "application/x-empty"), true))
+      return $this->analyzeText ($content, $url);
+    // Depending on the libmagic version, XML is reported as
+    // "application/xml" or as "text/xml"
+    if (in_array ($type, array ("application/xml", "text/xml"), true))
+      return $this->analyzeXML ($content, $url);
+    throw new \Exception ("Type of sitemap '$type' unknown for '$url'", 406);
+  }
+  // }}}
+
+  /** Return an array containing the URL in sitemap associated with an empty
+   * array, as the text format provides only URLs (one per line).
+   * New sitemaps files can not be defined in text format.
+   * array (
+   *   "urls" => array (
+   *     [http://domain.tld/path...] => array (),
+   *   ),
+   *   "sitemaps" => array ()
+   * );
+   * The lines are trimmed and the empty ones are ignored.
+   * If the content is bigger than MAX_SIZE, a warning is raised and an empty
+   * result is returned.
+   * @param string $content The content file to analyze
+   * @param string $url The website URL (only used in the messages)
+   * @return array The content of the file if it is valid
+   */
+  public function analyzeText ($content, $url)
+  // {{{
+  {
+    $res = array ("urls" => array (), "sitemaps" => array ());
+    if (strlen ($content) > self::MAX_SIZE)
+    {
+      // Only a warning: a fatal E_USER_ERROR would stop the script and never
+      // reach the "skip" below
+      trigger_error ("Sitemap '$url' size is too big -> skip", E_USER_WARNING);
+      return $res;
+    }
+    $lines = preg_split ('/\r\n|\r|\n/', $content);
+    if ($lines === false)
+      return $res;
+    foreach ($lines as $line)
+    {
+      $line = trim ($line);
+      // An empty line is not an URL
+      if ($line !== "")
+        $res["urls"][$line] = array ();
+    }
+    return $res;
+  }
+  // }}}
+
+  /** Return an array containing the URL in sitemap associated with the
+   * information of changefreq, priority and lastmod, and the nested sitemaps
+   * associated with their lastmod
+   * array (
+   *   "urls" => array (
+   *     [http://domain.tld/path...] => array (
+   *       "changefreq" => "always|hourly|daily|weekly|monthly|yearly|never",
+   *       "priority" => 0.0 to 1.0 (float),
+   *       "lastmod" => XX),
+   *   ),
+   *   "sitemaps" => array (
+   *     [http://domain.tld/sitemap2] => array ("lastmod" => XX)
+   *   ));
+   * An entry without location is skipped with a warning. An invalid
+   * changefreq, priority or lastmod raises a warning and the key is omitted.
+   * The lastmod is returned as a Unix timestamp, based on UTC.
+   * @param string $content The content file to analyze
+   * @param string $url The website URL (only used in the messages)
+   * @return array The content of the file if it is valid
+   * @throws \Exception (code 406) if the content is too big or if it is not
+   * a well-formed XML document
+   */
+  public function analyzeXML ($content, $url)
+  // {{{
+  {
+    if (strlen ($content) > self::MAX_SIZE)
+      throw new \Exception ("Sitemap '$url' size is too big -> skip", 406);
+    // Collect the libxml errors internally instead of hiding them with '@',
+    // and never fetch anything from the network while parsing
+    $previous = libxml_use_internal_errors (true);
+    $xml = simplexml_load_string ($content, "SimpleXMLElement", LIBXML_NONET);
+    libxml_clear_errors ();
+    libxml_use_internal_errors ($previous);
+    if ($xml === false)
+      throw new \Exception ("Sitemap '$url' is not a valid XML document", 406);
+
+    $changefreqs = array ("always", "hourly", "daily", "weekly", "monthly",
+                          "yearly", "never");
+    $res = array ("urls" => array (), "sitemaps" => array ());
+
+    // Sitemap index: list of nested sitemaps
+    foreach ($xml->sitemap as $s)
+    {
+      $loc = isset ($s->loc) ? trim ((string) $s->loc) : "";
+      if ($loc === "")
+      {
+        trigger_error ("No location in sitemap '$url'", E_USER_WARNING);
+        continue;
+      }
+      $tmp = array ();
+      if (isset ($s->lastmod))
+      {
+        $ts = $this->parseLastmod ((string) $s->lastmod, $url);
+        if ($ts !== null)
+          $tmp["lastmod"] = $ts;
+      }
+      $res["sitemaps"][$loc] = $tmp;
+    }
+
+    // URL set: list of pages. The keys are always added in the order
+    // changefreq, priority, lastmod
+    foreach ($xml->url as $u)
+    {
+      $loc = isset ($u->loc) ? trim ((string) $u->loc) : "";
+      if ($loc === "")
+      {
+        trigger_error ("No location in sitemap '$url' for url",
+          E_USER_WARNING);
+        continue;
+      }
+      $tmp = array ();
+      if (isset ($u->changefreq))
+      {
+        $changefreq = trim ((string) $u->changefreq);
+        $normalized = strtolower ($changefreq);
+        if (in_array ($normalized, $changefreqs, true))
+          $tmp["changefreq"] = $normalized;
+        else
+          trigger_error ("Invalid changefreq in '$url' ($changefreq)",
+            E_USER_WARNING);
+      }
+      if (isset ($u->priority))
+      {
+        $priority = trim ((string) $u->priority);
+        // A non numeric value would be silently cast to 0.0: reject it
+        if (! is_numeric ($priority) ||
+            (float) $priority < 0.0 || (float) $priority > 1.0)
+          trigger_error ("Invalid priority in '$url' ($priority)",
+            E_USER_WARNING);
+        else
+          $tmp["priority"] = (float) $priority;
+      }
+      if (isset ($u->lastmod))
+      {
+        $ts = $this->parseLastmod ((string) $u->lastmod, $url);
+        if ($ts !== null)
+          $tmp["lastmod"] = $ts;
+      }
+      $res["urls"][$loc] = $tmp;
+    }
+    return $res;
+  }
+  // }}}
+
+  /** Convert a lastmod value (W3C Datetime) to a Unix timestamp.
+   * The timezone offset provided in the value is honoured; a value without
+   * time or without offset is interpreted as UTC.
+   * Raise a warning and return null if the value is not a valid date.
+   * @param string $lastmod The lastmod value read in the sitemap
+   * @param string $url The website URL (only used in the messages)
+   * @return integer|null The timestamp, or null if the value is invalid
+   */
+  private function parseLastmod ($lastmod, $url)
+  // {{{
+  {
+    $lastmod = trim ($lastmod);
+    $utc = new \DateTimeZone ("UTC");
+    // W3C Datetime profiles, from the most to the least precise. The leading
+    // "!" resets the fields which are not in the format to the Unix epoch
+    // instead of the current time.
+    $formats = array ('!Y-m-d\TH:i:s.uP', '!Y-m-d\TH:i:sP', '!Y-m-d\TH:iP',
+                      '!Y-m-d', '!Y-m', '!Y');
+    foreach ($formats as $format)
+    {
+      $date = \DateTimeImmutable::createFromFormat ($format, $lastmod, $utc);
+      if ($date === false)
+        continue;
+      // An impossible date (like 2006-02-30) is parsed with a warning and
+      // rolled over to the next month: do not accept it
+      $errors = \DateTimeImmutable::getLastErrors ();
+      if (is_array ($errors) &&
+          ($errors["warning_count"] > 0 || $errors["error_count"] > 0))
+        continue;
+      return $date->getTimestamp ();
+    }
+    trigger_error ("Invalid lastmod in '$url' ($lastmod)", E_USER_WARNING);
+    return null;
+  }
+  // }}}
+}