diff --git a/Tests/sitemapTest.php b/Tests/sitemapTest.php new file mode 100644 index 0000000..76334a1 --- /dev/null +++ b/Tests/sitemapTest.php @@ -0,0 +1,92 @@ +analyze ("", "http://example.com"); + $this->assertSame ($res, ["urls" => [], "sitemaps" => []]); + } + + // Empty Sitemap + public function test_EmptySitemap_2 () + { + $sitemap = new sitemap (); + $res = $sitemap->analyze (" ", "http://example.com"); + $this->assertSame ($res, ["urls" => [], "sitemaps" => []]); + } + + // Textual Sitemap + public function test_TextualSitemap_1 () + { + $sitemap = new sitemap (); + $res = $sitemap->analyze ("http://example.com", "http://example.com"); + $this->assertSame ($res, + ["urls" => ["http://example.com" => []], + "sitemaps" => []]); + } + public function test_TextualSitemap_2 () + { + $sitemap = new sitemap (); + $res = $sitemap->analyze ("http://example.com\nhttps://www.example.com\n\n", + "http://example.com"); + $this->assertSame ($res, + ["urls" => ["http://example.com" => [], "https://www.example.com" => []], + "sitemaps" => []]); + } + + // XML Sitemap + public function test_XMLSitemap_1 () + { + $sitemap = new sitemap (); + $res = $sitemap->analyze ( +' + + + http://example.com/ + 2006-11-18 + daily + 0.8 + +', +"http://example.com"); + $this->assertSame ($res, + ["urls" => [ + "http://example.com/" => ["changefreq" => "daily", + "priority" => 0.8, + "lastmod" => 1163808000] + ], + "sitemaps" => []]); + } + + public function test_XMLSitemap_2 () + { + $sitemap = new sitemap (); + $res = $sitemap->analyze ( +' + + + http://www.example.com/sitemap1.xml.gz + 2004-10-01T18:23:17+00:00 + + + http://www.example.com/sitemap2.xml.gz + 2005-01-01 + +', +"http://example.com"); + $this->assertSame ($res, + ["urls" => [], + "sitemaps" => [ + "http://www.example.com/sitemap1.xml.gz" => [ + "lastmod" => 1096654997,], + "http://www.example.com/sitemap2.xml.gz" => [ + "lastmod" => 1104537600, ], + ]]); + } +} diff --git a/sitemap.php b/sitemap.php new file mode 100644 
/** This class reads the sitemap files published by websites and returns the
 * URLs they list, together with the optional parameters attached to them.
 * Plain text sitemaps, XML sitemaps (urlset), XML sitemap indexes
 * (sitemapindex) and gzip-compressed versions of them are supported.
 */
class sitemap
{
  /** Maximum size (in bytes) of an uncompressed sitemap accepted by the
   * analyzers
   */
  const MAX_SIZE = 10000000;

  /** Detect the format of the provided sitemap (optionally gzip-compressed)
   * and analyze it with the matching analyzer.
   * The result has the following structure:
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (
   *       "changefreq" => "daily",  // lowercase keyword, if valid
   *       "priority" => 0.8,        // float between 0.0 and 1.0, if valid
   *       "lastmod" => 1163808000), // Unix timestamp, if valid
   *     ),
   *   "sitemaps" => array (
   *     [http://domain.tld/sitemap2] => array ("lastmod" => 1104537600)
   *   ));
   * Each key is only present when the sitemap provides a valid value for it.
   * @param string $content The content file to analyze
   * @param string $url The website URL (only used in the messages)
   * @return array The content of the file if it is valid
   * @throws \Exception (code 406) if the content can not be uncompressed or
   * if its type is neither plain text nor XML
   */
  public function analyze ($content, $url)
  // {{{
  {
    $finfo = new \finfo (FILEINFO_MIME_TYPE);
    $type = $finfo->buffer ($content);
    // Both names are accepted for gzip; which one is reported depends on
    // the libmagic version.
    if ($type === "application/x-gzip" || $type === "application/gzip")
    {
      // gzdecode parses the complete gzip container, including the optional
      // header fields (stored file name, comment...) that a fixed 10 bytes
      // offset would not skip. The warning raised on corrupted data is
      // silenced as the failure is reported by the exception below.
      $inflated = @gzdecode ($content);
      if ($inflated === false)
        throw new \Exception ("Can not uncompress the sitemap of '$url'", 406);
      $content = $inflated;
    }

    $content = trim ($content);
    $type = $finfo->buffer ($content);
    if ($type === "text/plain" || $type === "application/x-empty")
      return $this->analyzeText ($content, $url);
    // Both names are accepted for XML; which one is reported depends on the
    // libmagic version.
    if ($type === "application/xml" || $type === "text/xml")
      return $this->analyzeXML ($content, $url);
    throw new \Exception ("Type of sitemap '$type' unknown for '$url'", 406);
  }
  // }}}

  /** Return an array containing the URLs found in a text sitemap (one URL
   * per line), each one associated with an empty array, as the text format
   * provides only the URLs.
   * New sitemaps files can not be defined in text format.
   * Blank lines are ignored and the spaces around each URL are removed.
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (),
   *     ),
   *   "sitemaps" => array ()
   * );
   * If the content is larger than MAX_SIZE, a warning is raised and an empty
   * result is returned.
   * @param string $content The content file to analyze
   * @param string $url The website URL (only used in the messages)
   * @return array The content of the file if it is valid
   */
  public function analyzeText ($content, $url)
  // {{{
  {
    if (strlen ($content) > self::MAX_SIZE)
    {
      // A warning (and not E_USER_ERROR, which stops the script by default
      // and is deprecated in PHP 8.4) so the oversized sitemap is really
      // skipped and the empty result below is returned.
      trigger_error ("Sitemap '$url' size is too big -> skip",
        E_USER_WARNING);
      return array ("urls" => array (), "sitemaps" => array ());
    }
    $urls = array ();
    foreach (preg_split ('/\r\n|\r|\n/', $content) as $line)
    {
      $line = trim ($line);
      // A blank line must not become an empty URL
      if ($line === "")
        continue;
      $urls[$line] = array ();
    }
    return array ("urls" => $urls, "sitemaps" => array ());
  }
  // }}}

  /** Return an array containing the URLs and the sub-sitemaps found in a XML
   * sitemap, associated with their valid optional parameters.
   * array (
   *   "urls" => array (
   *     [http://domain.tld/path...] => array (
   *       "changefreq" => "daily",  // lowercase keyword
   *       "priority" => 0.8,        // float between 0.0 and 1.0
   *       "lastmod" => 1163808000), // Unix timestamp
   *     ),
   *   "sitemaps" => array (
   *     [http://domain.tld/sitemap2] => array ("lastmod" => 1104537600)
   *   ));
   * An entry without location is skipped with a warning. An invalid
   * parameter raises a warning and is not stored in the result.
   * @param string $content The content file to analyze
   * @param string $url The website URL (only used in the messages)
   * @return array The content of the file if it is valid
   * @throws \Exception (code 406) if the content is larger than MAX_SIZE or
   * is not a well-formed XML document
   */
  public function analyzeXML ($content, $url)
  // {{{
  {
    if (strlen ($content) > self::MAX_SIZE)
      throw new \Exception ("Sitemap '$url' size is too big -> skip", 406);
    // Collect the parser errors internally instead of silencing them with @,
    // then restore the previous libxml setting for the other users of libxml.
    // LIBXML_NONET forbids network access while parsing untrusted content.
    $previous = libxml_use_internal_errors (true);
    $xml = simplexml_load_string ($content, "SimpleXMLElement", LIBXML_NONET);
    libxml_clear_errors ();
    libxml_use_internal_errors ($previous);
    if ($xml === false)
      throw new \Exception ("Sitemap '$url' is not a valid XML document", 406);

    $frequencies = array ("always", "hourly", "daily", "weekly", "monthly",
      "yearly", "never");
    $res = array ("urls" => array (), "sitemaps" => array ());
    foreach ($xml->sitemap as $s)
    {
      // A missing <loc> is casted to an empty string too
      $loc = trim ((string)$s->loc);
      if ($loc === "")
      {
        trigger_error ("No location in sitemap '$url'", E_USER_WARNING);
        continue;
      }
      $tmp = array ();
      if (isset ($s->lastmod))
      {
        $lastmod = trim ((string)$s->lastmod);
        $timestamp = $this->parseLastmod ($lastmod);
        if ($timestamp === null)
          trigger_error ("Invalid lastmod in '$url' ($lastmod)",
            E_USER_WARNING);
        else
          $tmp["lastmod"] = $timestamp;
      }
      $res["sitemaps"][$loc] = $tmp;
    }
    foreach ($xml->url as $u)
    {
      $loc = trim ((string)$u->loc);
      if ($loc === "")
      {
        trigger_error ("No location in sitemap '$url' for url",
          E_USER_WARNING);
        continue;
      }
      // The keys are always added in the same order: changefreq, priority,
      // lastmod
      $tmp = array ();
      if (isset ($u->changefreq))
      {
        $rawfreq = trim ((string)$u->changefreq);
        $changefreq = strtolower ($rawfreq);
        if (in_array ($changefreq, $frequencies, true))
          $tmp["changefreq"] = $changefreq;
        else
          trigger_error ("Invalid changefreq in '$url' ($rawfreq)",
            E_USER_WARNING);
      }
      if (isset ($u->priority))
      {
        $rawpriority = trim ((string)$u->priority);
        $priority = (float)$rawpriority;
        // is_numeric rejects the texts that the float cast would silently
        // convert to 0.0
        if (! is_numeric ($rawpriority) || $priority < 0.0 || $priority > 1.0)
          trigger_error ("Invalid priority in '$url' ($rawpriority)",
            E_USER_WARNING);
        else
          $tmp["priority"] = $priority;
      }
      if (isset ($u->lastmod))
      {
        $lastmod = trim ((string)$u->lastmod);
        $timestamp = $this->parseLastmod ($lastmod);
        if ($timestamp === null)
          trigger_error ("Invalid lastmod in '$url' ($lastmod)",
            E_USER_WARNING);
        else
          $tmp["lastmod"] = $timestamp;
      }
      $res["urls"][$loc] = $tmp;
    }
    return $res;
  }
  // }}}

  /** Convert a lastmod value to a Unix timestamp.
   * The supported formats are the W3C Datetime ones (YYYY, YYYY-MM,
   * YYYY-MM-DD, YYYY-MM-DDThh:mmTZD, YYYY-MM-DDThh:mm:ssTZD,
   * YYYY-MM-DDThh:mm:ss.sTZD) and the date with time without timezone.
   * The timezone offset is taken into account when it is provided. The
   * value is read as UTC otherwise.
   * @param string $lastmod The lastmod value to convert
   * @return integer|null The Unix timestamp, or null if the value is invalid
   */
  private function parseLastmod ($lastmod)
  // {{{
  {
    // The leading "!" resets the fields not provided by the format, so a
    // date without time is midnight instead of the current time
    $formats = array (
      '!Y-m-d\TH:i:s.uP',
      '!Y-m-d\TH:i:sP',
      '!Y-m-d\TH:iP',
      '!Y-m-d\TH:i:s',
      '!Y-m-d',
      '!Y-m',
      '!Y',
    );
    $utc = new \DateTimeZone ("UTC");
    foreach ($formats as $format)
    {
      $date = \DateTimeImmutable::createFromFormat ($format, $lastmod, $utc);
      if ($date === false)
        continue;
      // An out of range field (month 13, day 45...) is silently rolled over
      // and only reported as a warning: refuse these values.
      // getLastErrors returns false instead of an array when there is
      // nothing to report in PHP >= 8.2
      $errors = \DateTimeImmutable::getLastErrors ();
      if (is_array ($errors) &&
          ($errors["warning_count"] > 0 || $errors["error_count"] > 0))
        continue;
      return $date->getTimestamp ();
    }
    return null;
  }
  // }}}
}