summaryrefslogtreecommitdiff
path: root/plugins/OStatus/lib/feeddiscovery.php
diff options
context:
space:
mode:
author Brion Vibber <brion@pobox.com> 2010-02-08 15:48:52 -0800
committer Brion Vibber <brion@pobox.com> 2010-02-08 15:48:52 -0800
commit b2e8d8407cc7f1abb5e8767cd3403ac356775eaa (patch)
tree 5cfaed5f1f21a697e147b41695c7e8c5282324a3 /plugins/OStatus/lib/feeddiscovery.php
parent 3833dc8c1f3f9ba5e3b12bf2715e4a4fb3adabf1 (diff)
parent 4e6f587f868d71f08c618d0dedf6ddf0331619c2 (diff)
Merge branch 'testing' of git@gitorious.org:statusnet/mainline into 0.9.x
Diffstat (limited to 'plugins/OStatus/lib/feeddiscovery.php')
-rw-r--r-- plugins/OStatus/lib/feeddiscovery.php 231
1 files changed, 231 insertions, 0 deletions
diff --git a/plugins/OStatus/lib/feeddiscovery.php b/plugins/OStatus/lib/feeddiscovery.php
new file mode 100644
index 000000000..39985fc90
--- /dev/null
+++ b/plugins/OStatus/lib/feeddiscovery.php
@@ -0,0 +1,231 @@
+<?php
+/*
+ * StatusNet - the distributed open-source microblogging tool
+ * Copyright (C) 2009, StatusNet, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @package FeedSubPlugin
+ * @maintainer Brion Vibber <brion@status.net>
+ */
+
+// Stop with exit status 1 unless STATUSNET or LACONICA is defined
+// (presumably by the application bootstrap, so that this file is not
+// executed on its own).
+if (!(defined('STATUSNET') || defined('LACONICA'))) {
+    exit(1);
+}
+
+/**
+ * Thrown by FeedDiscovery::discoverFromURL() when the HTTP request for the
+ * supplied URL fails with an HTTP_Request2_Exception.
+ */
+class FeedSubBadURLException extends FeedSubException
+{
+}
+
+/**
+ * Thrown by FeedDiscovery::initFromResponse() when the HTTP response is not
+ * OK; it is constructed with the response's status code.
+ */
+class FeedSubBadResponseException extends FeedSubException
+{
+}
+
+/**
+ * Thrown by FeedDiscovery::initFromResponse() when the response body is
+ * empty; it is constructed with the URL the response came from.
+ */
+class FeedSubEmptyException extends FeedSubException
+{
+}
+
+/**
+ * Thrown by FeedDiscovery::discoverFromHTML() when DOMDocument::loadHTML()
+ * fails to load the page body.
+ */
+class FeedSubBadHTMLException extends FeedSubException
+{
+}
+
+/**
+ * Thrown by FeedDiscovery::initFromResponse() when the response's
+ * Content-Type is not one of the accepted XML/RSS/Atom types; it is
+ * constructed with the offending Content-Type value.
+ */
+class FeedSubUnrecognizedTypeException extends FeedSubException
+{
+}
+
+/**
+ * Thrown by FeedDiscovery::discoverFromURL() when an HTML page contains no
+ * usable feed autodiscovery link; it is constructed with the requested URL.
+ */
+class FeedSubNoFeedException extends FeedSubException
+{
+}
+
+/**
+ * Given a web page or feed URL, discover the final location of the feed
+ * and fetch its current contents.
+ *
+ * After a successful discovery the feed's URL, Content-Type and raw body
+ * are available through the public $uri, $type and $body properties.
+ *
+ * @example
+ * $feed = new FeedDiscovery();
+ * if ($feed->discoverFromURL($url)) {
+ *     print $feed->uri;
+ *     print $feed->type;
+ *     processFeed($feed->body);
+ * }
+ */
+class FeedDiscovery
+{
+    /** @var string URL of the feed, as reported by the HTTP response */
+    public $uri;
+
+    /** @var string Content-Type header value of the feed response */
+    public $type;
+
+    /** @var string raw body of the feed response */
+    public $body;
+
+    /**
+     * Parse the stored feed body and wrap it in a FeedMunger.
+     *
+     * Reads $this->body and $this->uri, so it is only useful after
+     * discoverFromURL() or initFromResponse() has succeeded.
+     *
+     * @return FeedMunger
+     */
+    public function feedMunger()
+    {
+        require_once 'XML/Feed/Parser.php';
+        $feed = new XML_Feed_Parser($this->body, false, false, true); // @fixme
+        return new FeedMunger($feed, $this->uri);
+    }
+
+    /**
+     * Fetch a URL and load the feed found there, following an HTML page's
+     * autodiscovery link at most once.
+     *
+     * @param string $url
+     * @param bool $htmlOk pass false here if you don't want to follow web pages.
+     * @return bool true on success; every failure is reported by exception
+     * @throws FeedSubBadURLException
+     * @throws FeedSubBadResponseException
+     * @throws FeedSubBadHTMLException
+     * @throws FeedSubNoFeedException
+     * @throws FeedSubEmptyException
+     * @throws FeedSubUnrecognizedTypeException
+     */
+    public function discoverFromURL($url, $htmlOk=true)
+    {
+        try {
+            $client = new HTTPClient();
+            $response = $client->get($url);
+        } catch (HTTP_Request2_Exception $e) {
+            // Hand over the message text only. Passing the exception object
+            // would stringify it (stack trace included) into the message,
+            // assuming FeedSubException keeps Exception's constructor.
+            throw new FeedSubBadURLException($e->getMessage());
+        }
+
+        if ($htmlOk) {
+            // The header may be absent; normalise to a string for matching.
+            $type = (string)$response->getHeader('Content-Type');
+            if (preg_match('!^(text/html|application/xhtml\+xml)!i', $type)) {
+                $target = $this->discoverFromHTML($response->getUrl(), $response->getBody());
+                if (!$target) {
+                    throw new FeedSubNoFeedException($url);
+                }
+                // Second hop must be the feed itself: no further HTML following.
+                return $this->discoverFromURL($target, false);
+            }
+        }
+
+        return $this->initFromResponse($response);
+    }
+
+    /**
+     * Validate an HTTP response as a feed and store its details.
+     *
+     * @param object $response HTTP response offering isOk(), getCode(),
+     *                         getUrl(), getBody() and getHeader()
+     * @return bool true once $uri, $type and $body have been populated
+     * @throws FeedSubBadResponseException if the response is not OK
+     * @throws FeedSubEmptyException if the body is empty
+     * @throws FeedSubUnrecognizedTypeException if the Content-Type is not
+     *         an accepted XML/RSS/Atom type
+     */
+    public function initFromResponse($response)
+    {
+        if (!$response->isOk()) {
+            throw new FeedSubBadResponseException($response->getCode());
+        }
+
+        $sourceurl = $response->getUrl();
+        $body = $response->getBody();
+        if (!$body) {
+            throw new FeedSubEmptyException($sourceurl);
+        }
+
+        // The header may be absent; normalise to a string for matching.
+        $type = (string)$response->getHeader('Content-Type');
+        if (!preg_match('!^(text/xml|application/xml|application/(rss|atom)\+xml)!i', $type)) {
+            common_log(LOG_WARNING, "Unrecognized feed type $type for $sourceurl");
+            throw new FeedSubUnrecognizedTypeException($type);
+        }
+
+        $this->uri = $sourceurl;
+        $this->type = $type;
+        $this->body = $body;
+        return true;
+    }
+
+    /**
+     * Look for a feed autodiscovery <link> in an HTML page.
+     *
+     * Atom is preferred over RSS; within a type the first link wins.
+     *
+     * @param string $url source URL, used to resolve relative links
+     * @param string $body HTML body text
+     * @return mixed string with URL or false if no target found
+     * @throws FeedSubBadHTMLException if the body is empty or cannot be loaded
+     */
+    public function discoverFromHTML($url, $body)
+    {
+        // DOMDocument::loadHTML() complains at the PHP level about an empty
+        // source, so report that case as bad HTML up front.
+        if ((string)$body === '') {
+            throw new FeedSubBadHTMLException();
+        }
+
+        // Real-world HTML makes libxml emit warnings; collect them internally
+        // for the duration of the parse, then restore the previous setting.
+        $previous = libxml_use_internal_errors(true);
+        $dom = new DOMDocument();
+        $ok = $dom->loadHTML($body);
+        libxml_clear_errors();
+        libxml_use_internal_errors($previous);
+
+        if (!$ok) {
+            throw new FeedSubBadHTMLException();
+        }
+
+        // Autodiscovery links may be relative to the page's URL or <base href>.
+        // Only the first <base> carrying a usable href counts.
+        $base = $url;
+        foreach ($dom->getElementsByTagName('base') as $node) {
+            $baseHref = trim($node->getAttribute('href'));
+            if ($baseHref !== '') {
+                $base = $this->resolveURI($baseHref, $url);
+                break;
+            }
+        }
+
+        // Ok... now on to the links!
+        // Types listed in order of priority -- we'll prefer Atom if available.
+        // @fixme merge with the munger link checks
+        $feeds = array(
+            'application/atom+xml' => false,
+            'application/rss+xml' => false,
+        );
+
+        foreach ($dom->getElementsByTagName('link') as $node) {
+            if (!$node->hasAttribute('rel')
+                || !$node->hasAttribute('type')
+                || !$node->hasAttribute('href')) {
+                continue;
+            }
+
+            // rel is a space-separated, case-insensitive list of tokens, so
+            // rel="alternate feed" and rel="Alternate" both qualify.
+            $relTokens = preg_split(
+                '/\s+/',
+                strtolower(trim($node->getAttribute('rel'))),
+                -1,
+                PREG_SPLIT_NO_EMPTY
+            );
+            // MIME types are case-insensitive as well.
+            $type = strtolower(trim($node->getAttribute('type')));
+            $href = trim($node->getAttribute('href'));
+
+            if (in_array('alternate', $relTokens, true)
+                && array_key_exists($type, $feeds)
+                && empty($feeds[$type])) {
+                // Save the first feed found of each type...
+                $feeds[$type] = $this->resolveURI($href, $base);
+            }
+        }
+
+        // Return the highest-priority feed found
+        foreach ($feeds as $feedUrl) {
+            if ($feedUrl) {
+                return $feedUrl;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Resolve a possibly relative URL against some absolute base URL
+     *
+     * @param string $rel relative or absolute URL
+     * @param string $base absolute URL
+     * @return string absolute URL, or original URL if could not be resolved.
+     */
+    public function resolveURI($rel, $base)
+    {
+        require_once "Net/URL2.php";
+        try {
+            $relUrl = new Net_URL2($rel);
+            if ($relUrl->isAbsolute()) {
+                // Already absolute: hand it back untouched.
+                return $rel;
+            }
+            $baseUrl = new Net_URL2($base);
+            $absUrl = $baseUrl->resolve($relUrl);
+            return $absUrl->getURL();
+        } catch (Exception $e) {
+            // Best effort: log the failure and fall back to the input as given.
+            common_log(LOG_WARNING, 'Unable to resolve relative link "' .
+                $rel . '" against base "' . $base . '": ' . $e->getMessage());
+            return $rel;
+        }
+    }
+}