summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEvan Prodromou <evan@status.net>2010-01-31 10:12:26 -0500
committerEvan Prodromou <evan@status.net>2010-01-31 10:12:26 -0500
commitdc62246443e3584ef5267505275f618f6fa86bf7 (patch)
tree850c1f89d9e1df589bfe3eed47ce3b790407dc2b
parentfec8066bf76948142828b689708386861d089fb3 (diff)
Add a robots.txt URL to the site root
Adds a robots.txt file to the site root. Defaults defined by 'robotstxt' section of config. New events StartRobotsTxt and EndRobotsTxt to let plugins add information. Probably not useful if path is not /, but won't hurt anything, either.
-rw-r--r--EVENTS.txt6
-rw-r--r--README14
-rw-r--r--actions/robotstxt.php100
-rw-r--r--index.php5
-rw-r--r--lib/default.php4
-rw-r--r--lib/router.php2
6 files changed, 129 insertions, 2 deletions
diff --git a/EVENTS.txt b/EVENTS.txt
index 3317c80de..6bf12bf13 100644
--- a/EVENTS.txt
+++ b/EVENTS.txt
@@ -708,3 +708,9 @@ EndUserRegister: When a new user has been registered
- &$profile: new profile data
- &$user: new user account
+StartRobotsTxt: Before outputting the robots.txt page
+- &$action: RobotstxtAction being shown
+
+EndRobotsTxt: After the default robots.txt page (good place for customization)
+- &$action: RobotstxtAction being shown
+
diff --git a/README b/README
index da278f741..4e576dcdd 100644
--- a/README
+++ b/README
@@ -1496,6 +1496,20 @@ interface. It also makes the user's profile the root URL.
enabled: Whether to run in "single user mode". Default false.
nickname: nickname of the single user.
+robotstxt
+---------
+
+We serve a default robots.txt file to guide the behavior of
+Web crawlers. See http://www.robotstxt.org/ for more information
+on the format of this file.
+
+crawldelay: if non-empty, this value is provided as the Crawl-delay:
+            line in the robots.txt file. See http://ur1.ca/l5a0
+            for more information. Default is zero, meaning no explicit delay.
+disallow: Array of (virtual) directories to disallow. Default is 'main',
+ 'search', 'message', 'settings', 'admin'. Ignored when site
+ is private, in which case the entire site ('/') is disallowed.
+
Plugins
=======
diff --git a/actions/robotstxt.php b/actions/robotstxt.php
new file mode 100644
index 000000000..5131097c8
--- /dev/null
+++ b/actions/robotstxt.php
@@ -0,0 +1,100 @@
+<?php
+/**
+ * StatusNet - the distributed open-source microblogging tool
+ * Copyright (C) 2010, StatusNet, Inc.
+ *
+ * robots.txt generator
+ *
+ * PHP version 5
+ *
+ * @category Action
+ * @package StatusNet
+ * @author Evan Prodromou <evan@status.net>
+ * @license http://www.fsf.org/licensing/licenses/agpl.html AGPLv3
+ * @link http://status.net/
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+if (!defined('STATUSNET')) {
+ exit(1);
+}
+
+/**
+ * Prints out a static robots.txt
+ *
+ * @category Action
+ * @package StatusNet
+ * @author Evan Prodromou <evan@status.net>
+ * @license http://www.fsf.org/licensing/licenses/agpl.html AGPLv3
+ * @link http://status.net/
+ */
+
+class RobotstxtAction extends Action
+{
+ /**
+ * Emits the robots.txt document for this site.
+ *
+ * The output is mostly static, so there is no
+ * prepare() step; everything happens in handle().
+ *
+ * @param array $args GET, POST, and URL params; unused.
+ *
+ * @return void
+ */
+
+ function handle($args)
+ {
+ if (Event::handle('StartRobotsTxt', array($this))) {
+ // a StartRobotsTxt handler returning false suppresses all of the default output below
+ header('Content-Type: text/plain');
+
+ print "User-Agent: *\n";
+
+ if (common_config('site', 'private')) {
+ // private site: forbid crawling of everything
+ print "Disallow: /\n";
+
+ } else {
+
+ $disallow = common_config('robotstxt', 'disallow');
+ // public site: disallow only the configured (virtual) directories
+ foreach ($disallow as $dir) {
+ print "Disallow: /$dir/\n";
+ }
+
+ $crawldelay = common_config('robotstxt', 'crawldelay');
+ // crawldelay defaults to 0 (see lib/default.php), so no line is printed unless configured
+ if (!empty($crawldelay)) {
+ print "Crawl-delay: " . $crawldelay . "\n";
+ }
+ }
+
+ Event::handle('EndRobotsTxt', array($this));
+ }
+ }
+
+ /**
+ * This action never touches the database.
+ *
+ * @param array $args other arguments; unused
+ *
+ * @return boolean true, since this is a read-only action
+ */
+
+ function isReadOnly($args)
+ {
+ return true;
+ }
+}
diff --git a/index.php b/index.php
index 605b380bf..06ff9900f 100644
--- a/index.php
+++ b/index.php
@@ -285,8 +285,9 @@ function main()
if (!$user && common_config('site', 'private')
&& !isLoginAction($action)
&& !preg_match('/rss$/', $action)
- && !preg_match('/^Api/', $action)
- ) {
+ && $action != 'robotstxt'
+ && !preg_match('/^Api/', $action)) {
+
// set returnto
$rargs =& common_copy_args($args);
unset($rargs['action']);
diff --git a/lib/default.php b/lib/default.php
index 1337a9633..2bedc4bf0 100644
--- a/lib/default.php
+++ b/lib/default.php
@@ -270,4 +270,8 @@ $default =
'singleuser' =>
array('enabled' => false,
'nickname' => null),
+ 'robotstxt' =>
+ array('crawldelay' => 0,
+ 'disallow' => array('main', 'settings', 'admin', 'search', 'message')
+ ),
);
diff --git a/lib/router.php b/lib/router.php
index ca9f32812..4b5b8d0bb 100644
--- a/lib/router.php
+++ b/lib/router.php
@@ -73,6 +73,8 @@ class Router
if (Event::handle('StartInitializeRouter', array(&$m))) {
+ $m->connect('robots.txt', array('action' => 'robotstxt'));
+
$m->connect('opensearch/people', array('action' => 'opensearch',
'type' => 'people'));
$m->connect('opensearch/notice', array('action' => 'opensearch',