1535 lines
34 KiB
PHP
1535 lines
34 KiB
PHP
<?php
|
|
|
|
/** DomFramework - Tests
|
|
* @package domframework
|
|
* @author Dominique Fournier <dominique@fournier38.fr>
|
|
* @license BSD
|
|
*/
|
|
|
|
namespace Domframework\Tests;
|
|
|
|
use Domframework\Robotstxt;
|
|
|
|
/**
 * Test the Robotstxt class.
 *
 * Each test builds a Robotstxt object from a robots.txt content and a crawler
 * name, then checks the value returned by one of its methods: allow(),
 * disallow(), sitemaps(), crawldelay(), host(), errors() or URLAllow().
 *
 * All the assertions use the PHPUnit order: expected value first, tested
 * value second, so the failure reports label the values correctly.
 */
class RobotstxtTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Create the Robotstxt object used by all the testRfc* tests.
     *
     * The content is the sample file of the tests "like
     * http://www.robotstxt.org/norobots-rfc.txt". It is defined once here
     * instead of being repeated in each test.
     *
     * NOTE(review): the lines are written without leading spaces and the last
     * line has no trailing newline, as displayed in the reviewed source.
     * Confirm against the original file if the parser is sensitive to the
     * leading whitespaces.
     *
     * @param string $crawler The crawler name provided to the constructor
     * @return Robotstxt The object built from the sample file
     */
    private function rfcRobotstxt($crawler)
    {
        return new Robotstxt(
            "# /robots.txt for http://www.fict.org/\n" .
            "# comments to webmaster@fict.org\n" .
            "\n" .
            "User-agent: unhipbot\n" .
            "Disallow: /\n" .
            "\n" .
            "User-agent: webcrawler\n" .
            "User-agent: excite\n" .
            "Disallow:\n" .
            "\n" .
            "User-agent: *\n" .
            "Disallow: /org/plans.html\n" .
            "Allow: /org/\n" .
            "Allow: /serv\n" .
            "Allow: /~mak\n" .
            "Disallow: /",
            $crawler
        );
    }

    /**
     * Assert the URLAllow() result of the RFC sample file for one crawler.
     *
     * @param boolean $expected The expected URLAllow() result
     * @param string $crawler The crawler name provided to the constructor
     * @param string $url The URL provided to URLAllow()
     */
    private function assertRfcUrlAllow($expected, $crawler, $url)
    {
        $this->assertSame(
            $expected,
            $this->rfcRobotstxt($crawler)->URLAllow($url),
            "URLAllow (\"$url\") for the crawler \"$crawler\""
        );
    }

    // Empty Robots
    /** An empty file allows "/" */
    public function testConstruct1()
    {
        $robotstxt = new Robotstxt("", "domsearch");
        $this->assertSame(["/"], $robotstxt->allow());
    }

    /** An empty file disallows nothing */
    public function testConstruct2()
    {
        $robotstxt = new Robotstxt("", "domsearch");
        $this->assertSame([], $robotstxt->disallow());
    }

    /** An empty file has no sitemap */
    public function testConstruct3()
    {
        $robotstxt = new Robotstxt("", "domsearch");
        $this->assertSame([], $robotstxt->sitemaps());
    }

    /** An empty file has a null crawldelay */
    public function testConstruct4()
    {
        $robotstxt = new Robotstxt("", "domsearch");
        $this->assertSame(null, $robotstxt->crawldelay());
    }

    // Allow
    /** An empty Disallow for "*" allows "/" */
    public function testAllow1()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow:\n",
            "domsearch"
        );
        $this->assertSame(["/"], $robotstxt->allow());
    }

    /** Empty Disallow for "*" then for DomSearch allows "/" */
    public function testAllow2()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow:\n\nUser-Agent: DomSearch\nDisallow:\n",
            "domsearch"
        );
        $this->assertSame(["/"], $robotstxt->allow());
    }

    /** Empty Disallow for DomSearch then for "*" allows "/" */
    public function testAllow3()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: DomSearch\nDisallow:\n\nUser-Agent: *\nDisallow:\n",
            "domsearch"
        );
        $this->assertSame(["/"], $robotstxt->allow());
    }

    /**
     * DomSearch is the first of three User-Agent lines sharing an empty
     * Disallow, and "*" disallows "/": "/" is allowed for domsearch
     */
    public function testAllow4()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: DomSearch\n" .
            "User-Agent: User1\n" .
            "User-Agent: User2\n" .
            "Disallow:\n\n" .
            "User-Agent: *\n" .
            "Disallow: /\n",
            "domsearch"
        );
        $this->assertSame(["/"], $robotstxt->allow());
    }

    // Disallow
    /** "Disallow: /" for "*" disallows "/" */
    public function testDisallow1()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow: /\n",
            "domsearch"
        );
        $this->assertSame(["/"], $robotstxt->disallow());
    }

    /** "Disallow: /" for "*" then for DomSearch disallows "/" */
    public function testDisallow2()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow: /\n\nUser-Agent: DomSearch\nDisallow: /\n",
            "domsearch"
        );
        $this->assertSame(["/"], $robotstxt->disallow());
    }

    /** "Disallow: /" for DomSearch then for "*" disallows "/" */
    public function testDisallow3()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n",
            "domsearch"
        );
        $this->assertSame(["/"], $robotstxt->disallow());
    }

    // Sitemaps
    /** No Sitemap line gives an empty sitemaps list */
    public function testSitemaps1()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: DomSearch\nDisallow: /\n\nUser-Agent: *\nDisallow: /\n",
            "domsearch"
        );
        $this->assertSame([], $robotstxt->sitemaps());
    }

    /** One Sitemap line is returned */
    public function testSitemaps2()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow: /\nSitemap: http://example.com/sitemap.xml",
            "domsearch"
        );
        $this->assertSame(
            ["http://example.com/sitemap.xml"],
            $robotstxt->sitemaps()
        );
    }

    /** Two Sitemap lines are returned in the file order, case preserved */
    public function testSitemaps3()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow: /\n" .
            "Sitemap: http://example.com/sitemap.xml\n" .
            "Sitemap: http://example.com/SITEMAP.XML",
            "domsearch"
        );
        $this->assertSame(
            ["http://example.com/sitemap.xml", "http://example.com/SITEMAP.XML"],
            $robotstxt->sitemaps()
        );
    }

    /** A Sitemap which is not an URL is reported in errors() with key 2 */
    public function testSitemapsError1()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow: /\nSitemap: URL",
            "domsearch"
        );
        $this->assertSame(
            [2 => "Sitemap : Invalid URL provided"],
            $robotstxt->errors()
        );
    }

    // Host
    /** No Host line gives a null host */
    public function testHost1()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow: /\n",
            "domsearch"
        );
        $this->assertSame(null, $robotstxt->host());
    }

    /** The Host line value is returned */
    public function testHost2()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow: /\n\nHost: localhost",
            "domsearch"
        );
        $this->assertSame("localhost", $robotstxt->host());
    }

    /** With two Host lines, the first value is returned */
    public function testHostError1()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow: /\n\nHost: localhost\nHoST: toto",
            "domsearch"
        );
        $this->assertSame("localhost", $robotstxt->host());
    }

    /** With two Host lines, the second is reported in errors() with key 4 */
    public function testHostError2()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow: /\n\nHost: localhost\nHoST: toto",
            "domsearch"
        );
        $this->assertSame([4 => "Multiple Hosts set"], $robotstxt->errors());
    }

    // URLAllow
    /** An empty file allows the "/" URL */
    public function testUrlallow1()
    {
        $robotstxt = new Robotstxt("", "domsearch");
        $this->assertSame(true, $robotstxt->URLAllow("/"));
    }

    /** "Disallow: /" refuses the "/" URL */
    public function testUrlallow2()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow: /",
            "domsearch"
        );
        $this->assertSame(false, $robotstxt->URLAllow("/"));
    }

    /** "Allow: /allow/" does not allow the "/" URL */
    public function testUrlallow3()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow: /\nAllow: /allow/",
            "domsearch"
        );
        $this->assertSame(false, $robotstxt->URLAllow("/"));
    }

    /** "Allow: /allow/" allows a file in /allow/ */
    public function testUrlallow4()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow: /\nAllow: /allow/",
            "domsearch"
        );
        $this->assertSame(true, $robotstxt->URLAllow("/allow/file"));
    }

    /** The "/allow/*.gif$" pattern allows /allow/file.gif */
    public function testUrlallow5()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif\$",
            "domsearch"
        );
        $this->assertSame(true, $robotstxt->URLAllow("/allow/file.gif"));
    }

    /** The "/allow/*.gif$" pattern does not allow /allow/.gif */
    public function testUrlallow6()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif\$",
            "domsearch"
        );
        $this->assertSame(false, $robotstxt->URLAllow("/allow/.gif"));
    }

    /** The "/allow/*.gif$" pattern does not allow /allow/file.png */
    public function testUrlallow7()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\nDisallow: /\nAllow: /allow/*.gif\$",
            "domsearch"
        );
        $this->assertSame(false, $robotstxt->URLAllow("/allow/file.png"));
    }

    // Tests like http://www.robotstxt.org/norobots-rfc.txt
    // Crawler "unhipbot": only robots.txt is expected to be allowed
    /** unhipbot: the root URL is refused */
    public function testRfcUnhipbot1()
    {
        $this->assertRfcUrlAllow(false, "unhipbot", "http://www.fict.org/");
    }

    /** unhipbot: /index.html is refused */
    public function testRfcUnhipbot2()
    {
        $this->assertRfcUrlAllow(
            false,
            "unhipbot",
            "http://www.fict.org/index.html"
        );
    }

    /** unhipbot: /robots.txt is allowed */
    public function testRfcUnhipbot3()
    {
        $this->assertRfcUrlAllow(
            true,
            "unhipbot",
            "http://www.fict.org/robots.txt"
        );
    }

    /** unhipbot: /server.html is refused */
    public function testRfcUnhipbot4()
    {
        $this->assertRfcUrlAllow(
            false,
            "unhipbot",
            "http://www.fict.org/server.html"
        );
    }

    /** unhipbot: /services/fast.html is refused */
    public function testRfcUnhipbot5()
    {
        $this->assertRfcUrlAllow(
            false,
            "unhipbot",
            "http://www.fict.org/services/fast.html"
        );
    }

    /** unhipbot: /services/slow.html is refused */
    public function testRfcUnhipbot6()
    {
        $this->assertRfcUrlAllow(
            false,
            "unhipbot",
            "http://www.fict.org/services/slow.html"
        );
    }

    /** unhipbot: /orgo.gif is refused */
    public function testRfcUnhipbot7()
    {
        $this->assertRfcUrlAllow(
            false,
            "unhipbot",
            "http://www.fict.org/orgo.gif"
        );
    }

    /** unhipbot: /org/about.html is refused */
    public function testRfcUnhipbot8()
    {
        $this->assertRfcUrlAllow(
            false,
            "unhipbot",
            "http://www.fict.org/org/about.html"
        );
    }

    /** unhipbot: /org/plans.html is refused */
    public function testRfcUnhipbot9()
    {
        $this->assertRfcUrlAllow(
            false,
            "unhipbot",
            "http://www.fict.org/org/plans.html"
        );
    }

    /** unhipbot: /~jim/jim.html is refused */
    public function testRfcUnhipbot10()
    {
        $this->assertRfcUrlAllow(
            false,
            "unhipbot",
            "http://www.fict.org/~jim/jim.html"
        );
    }

    /** unhipbot: /~mak/mak.html is refused */
    public function testRfcUnhipbot11()
    {
        $this->assertRfcUrlAllow(
            false,
            "unhipbot",
            "http://www.fict.org/~mak/mak.html"
        );
    }

    // Crawler "webcrawler": all the URLs are expected to be allowed
    /** webcrawler: the root URL is allowed */
    public function testRfcWebcrawler1()
    {
        $this->assertRfcUrlAllow(true, "webcrawler", "http://www.fict.org/");
    }

    /** webcrawler: /index.html is allowed */
    public function testRfcWebcrawler2()
    {
        $this->assertRfcUrlAllow(
            true,
            "webcrawler",
            "http://www.fict.org/index.html"
        );
    }

    /** webcrawler: /robots.txt is allowed */
    public function testRfcWebcrawler3()
    {
        $this->assertRfcUrlAllow(
            true,
            "webcrawler",
            "http://www.fict.org/robots.txt"
        );
    }

    /** webcrawler: /server.html is allowed */
    public function testRfcWebcrawler4()
    {
        $this->assertRfcUrlAllow(
            true,
            "webcrawler",
            "http://www.fict.org/server.html"
        );
    }

    /** webcrawler: /services/fast.html is allowed */
    public function testRfcWebcrawler5()
    {
        $this->assertRfcUrlAllow(
            true,
            "webcrawler",
            "http://www.fict.org/services/fast.html"
        );
    }

    /** webcrawler: /services/slow.html is allowed */
    public function testRfcWebcrawler6()
    {
        $this->assertRfcUrlAllow(
            true,
            "webcrawler",
            "http://www.fict.org/services/slow.html"
        );
    }

    /** webcrawler: /orgo.gif is allowed */
    public function testRfcWebcrawler7()
    {
        $this->assertRfcUrlAllow(
            true,
            "webcrawler",
            "http://www.fict.org/orgo.gif"
        );
    }

    /** webcrawler: /org/about.html is allowed */
    public function testRfcWebcrawler8()
    {
        $this->assertRfcUrlAllow(
            true,
            "webcrawler",
            "http://www.fict.org/org/about.html"
        );
    }

    /** webcrawler: /org/plans.html is allowed */
    public function testRfcWebcrawler9()
    {
        $this->assertRfcUrlAllow(
            true,
            "webcrawler",
            "http://www.fict.org/org/plans.html"
        );
    }

    /** webcrawler: /~jim/jim.html is allowed */
    public function testRfcWebcrawler10()
    {
        $this->assertRfcUrlAllow(
            true,
            "webcrawler",
            "http://www.fict.org/~jim/jim.html"
        );
    }

    /** webcrawler: /~mak/mak.html is allowed */
    public function testRfcWebcrawler11()
    {
        $this->assertRfcUrlAllow(
            true,
            "webcrawler",
            "http://www.fict.org/~mak/mak.html"
        );
    }

    // Crawler "excite": all the URLs are expected to be allowed
    /** excite: the root URL is allowed */
    public function testRfcExcite1()
    {
        $this->assertRfcUrlAllow(true, "excite", "http://www.fict.org/");
    }

    /** excite: /index.html is allowed */
    public function testRfcExcite2()
    {
        $this->assertRfcUrlAllow(
            true,
            "excite",
            "http://www.fict.org/index.html"
        );
    }

    /** excite: /robots.txt is allowed */
    public function testRfcExcite3()
    {
        $this->assertRfcUrlAllow(
            true,
            "excite",
            "http://www.fict.org/robots.txt"
        );
    }

    /** excite: /server.html is allowed */
    public function testRfcExcite4()
    {
        $this->assertRfcUrlAllow(
            true,
            "excite",
            "http://www.fict.org/server.html"
        );
    }

    /** excite: /services/fast.html is allowed */
    public function testRfcExcite5()
    {
        $this->assertRfcUrlAllow(
            true,
            "excite",
            "http://www.fict.org/services/fast.html"
        );
    }

    /** excite: /services/slow.html is allowed */
    public function testRfcExcite6()
    {
        $this->assertRfcUrlAllow(
            true,
            "excite",
            "http://www.fict.org/services/slow.html"
        );
    }

    /** excite: /orgo.gif is allowed */
    public function testRfcExcite7()
    {
        $this->assertRfcUrlAllow(
            true,
            "excite",
            "http://www.fict.org/orgo.gif"
        );
    }

    /** excite: /org/about.html is allowed */
    public function testRfcExcite8()
    {
        $this->assertRfcUrlAllow(
            true,
            "excite",
            "http://www.fict.org/org/about.html"
        );
    }

    /** excite: /org/plans.html is allowed */
    public function testRfcExcite9()
    {
        $this->assertRfcUrlAllow(
            true,
            "excite",
            "http://www.fict.org/org/plans.html"
        );
    }

    /** excite: /~jim/jim.html is allowed */
    public function testRfcExcite10()
    {
        $this->assertRfcUrlAllow(
            true,
            "excite",
            "http://www.fict.org/~jim/jim.html"
        );
    }

    /** excite: /~mak/mak.html is allowed */
    public function testRfcExcite11()
    {
        $this->assertRfcUrlAllow(
            true,
            "excite",
            "http://www.fict.org/~mak/mak.html"
        );
    }

    // Crawler "other": not named in the file, the "*" record is expected
    /** other: the root URL is refused */
    public function testRfcOther1()
    {
        $this->assertRfcUrlAllow(false, "other", "http://www.fict.org/");
    }

    /** other: /index.html is refused */
    public function testRfcOther2()
    {
        $this->assertRfcUrlAllow(
            false,
            "other",
            "http://www.fict.org/index.html"
        );
    }

    /** other: /robots.txt is allowed */
    public function testRfcOther3()
    {
        $this->assertRfcUrlAllow(
            true,
            "other",
            "http://www.fict.org/robots.txt"
        );
    }

    /** other: /server.html is allowed */
    public function testRfcOther4()
    {
        $this->assertRfcUrlAllow(
            true,
            "other",
            "http://www.fict.org/server.html"
        );
    }

    /** other: /services/fast.html is allowed */
    public function testRfcOther5()
    {
        $this->assertRfcUrlAllow(
            true,
            "other",
            "http://www.fict.org/services/fast.html"
        );
    }

    /** other: /services/slow.html is allowed */
    public function testRfcOther6()
    {
        $this->assertRfcUrlAllow(
            true,
            "other",
            "http://www.fict.org/services/slow.html"
        );
    }

    /** other: /orgo.gif is refused */
    public function testRfcOther7()
    {
        $this->assertRfcUrlAllow(
            false,
            "other",
            "http://www.fict.org/orgo.gif"
        );
    }

    /** other: /org/about.html is allowed */
    public function testRfcOther8()
    {
        $this->assertRfcUrlAllow(
            true,
            "other",
            "http://www.fict.org/org/about.html"
        );
    }

    /** other: /org/plans.html is refused */
    public function testRfcOther9()
    {
        $this->assertRfcUrlAllow(
            false,
            "other",
            "http://www.fict.org/org/plans.html"
        );
    }

    /** other: /~jim/jim.html is refused */
    public function testRfcOther10()
    {
        $this->assertRfcUrlAllow(
            false,
            "other",
            "http://www.fict.org/~jim/jim.html"
        );
    }

    /** other: /~mak/mak.html is allowed */
    public function testRfcOther11()
    {
        $this->assertRfcUrlAllow(
            true,
            "other",
            "http://www.fict.org/~mak/mak.html"
        );
    }

    // Allow/Disallow must start by slash or be empty
    /** A Disallow value without slash is reported in errors() with key 1 */
    public function testAllowDisallowSlash1()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\n" .
            "Disallow: INVALID\n\n",
            "domsearch"
        );
        $this->assertSame(
            [1 => "Disallow : Line must start by slash"],
            $robotstxt->errors()
        );
    }

    /** An empty Disallow value is not an error */
    public function testAllowDisallowSlash2()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\n" .
            "Disallow: \n\n",
            "domsearch"
        );
        $this->assertSame([], $robotstxt->errors());
    }

    /** An Allow value without slash is reported in errors() with key 1 */
    public function testAllowDisallowSlash3()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\n" .
            "Allow: INVALID\n\n",
            "domsearch"
        );
        $this->assertSame(
            [1 => "Allow : Line must start by slash"],
            $robotstxt->errors()
        );
    }

    /** An empty Allow value is not an error */
    public function testAllowDisallowSlash4()
    {
        $robotstxt = new Robotstxt(
            "User-Agent: *\n" .
            "Allow: \n\n",
            "domsearch"
        );
        $this->assertSame([], $robotstxt->errors());
    }
}
|