diff options
author | Brion Vibber <brion@pobox.com> | 2010-02-08 15:48:52 -0800 |
---|---|---|
committer | Brion Vibber <brion@pobox.com> | 2010-02-08 15:48:52 -0800 |
commit | b2e8d8407cc7f1abb5e8767cd3403ac356775eaa (patch) | |
tree | 5cfaed5f1f21a697e147b41695c7e8c5282324a3 /plugins/OStatus/lib/feeddiscovery.php | |
parent | 3833dc8c1f3f9ba5e3b12bf2715e4a4fb3adabf1 (diff) | |
parent | 4e6f587f868d71f08c618d0dedf6ddf0331619c2 (diff) |
Merge branch 'testing' of git@gitorious.org:statusnet/mainline into 0.9.x
Diffstat (limited to 'plugins/OStatus/lib/feeddiscovery.php')
-rw-r--r-- | plugins/OStatus/lib/feeddiscovery.php | 231 |
1 files changed, 231 insertions, 0 deletions
<?php
/*
 * StatusNet - the distributed open-source microblogging tool
 * Copyright (C) 2009, StatusNet, Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * @package FeedSubPlugin
 * @maintainer Brion Vibber <brion@status.net>
 */

if (!defined('STATUSNET') && !defined('LACONICA')) { exit(1); }

/**
 * Thrown by FeedDiscovery::discoverFromURL() when the HTTP client fails
 * to fetch the requested URL.
 */
class FeedSubBadURLException extends FeedSubException
{
}

/**
 * Thrown by FeedDiscovery::initFromResponse() when the HTTP response is
 * not OK; the message carries the response code.
 */
class FeedSubBadResponseException extends FeedSubException
{
}

/**
 * Thrown by FeedDiscovery::initFromResponse() when the response body is empty.
 */
class FeedSubEmptyException extends FeedSubException
{
}

/**
 * Thrown by FeedDiscovery::discoverFromHTML() when the page cannot be
 * parsed as HTML.
 */
class FeedSubBadHTMLException extends FeedSubException
{
}

/**
 * Thrown by FeedDiscovery::initFromResponse() when the Content-Type is not
 * one of the accepted XML/RSS/Atom types.
 */
class FeedSubUnrecognizedTypeException extends FeedSubException
{
}

/**
 * Thrown by FeedDiscovery::discoverFromURL() when an HTML page offers no
 * Atom or RSS autodiscovery link.
 */
class FeedSubNoFeedException extends FeedSubException
{
}

/**
 * Given a web page or feed URL, discover the final location of the feed
 * and return its current contents.
 *
 * @example
 *   $feed = new FeedDiscovery();
 *   if ($feed->discoverFromURL($url)) {
 *     print $feed->uri;
 *     print $feed->type;
 *     processFeed($feed->body);
 *   }
 */
class FeedDiscovery
{
    /** @var string URL the feed was finally fetched from */
    public $uri;

    /** @var string Content-Type header value of the feed response */
    public $type;

    /** @var string raw feed body */
    public $body;

    /**
     * Parse the discovered feed body and wrap it in a FeedMunger.
     *
     * Only meaningful after a successful discoverFromURL() call, since it
     * reads $this->body and $this->uri.
     *
     * @return FeedMunger
     */
    public function feedMunger()
    {
        require_once 'XML/Feed/Parser.php';
        $feed = new XML_Feed_Parser($this->body, false, false, true); // @fixme
        return new FeedMunger($feed, $this->uri);
    }

    /**
     * Fetch a URL and initialize this object from the feed found there.
     *
     * If the URL serves an HTML page (and $htmlOk is true), the page is
     * scanned for an autodiscovery link and that link is fetched instead;
     * the second fetch is not allowed to be HTML again.
     *
     * @param string $url
     * @param bool $htmlOk pass false here if you don't want to follow web pages.
     * @return bool true on success; $uri, $type and $body are then populated
     * @throws FeedSubBadURLException
     * @throws FeedSubBadResponseException
     * @throws FeedSubBadHTMLException
     * @throws FeedSubNoFeedException
     * @throws FeedSubEmptyException
     * @throws FeedSubUnrecognizedTypeException
     */
    public function discoverFromURL($url, $htmlOk=true)
    {
        try {
            $client = new HTTPClient();
            $response = $client->get($url);
        } catch (HTTP_Request2_Exception $e) {
            // Exception messages are strings; hand over the message text
            // rather than the exception object itself.
            throw new FeedSubBadURLException($e->getMessage());
        }

        if ($htmlOk) {
            $type = $response->getHeader('Content-Type');
            $isHtml = preg_match('!^(text/html|application/xhtml\+xml)!i', $type);
            if ($isHtml) {
                $target = $this->discoverFromHTML($response->getUrl(), $response->getBody());
                if (!$target) {
                    throw new FeedSubNoFeedException($url);
                }
                // Don't chase HTML pages more than one level deep.
                return $this->discoverFromURL($target, false);
            }
        }

        return $this->initFromResponse($response);
    }

    /**
     * Validate an HTTP response as a feed and store its details on this object.
     *
     * @param object $response HTTP response as returned by HTTPClient::get()
     * @return bool true on success
     * @throws FeedSubBadResponseException if the response is not OK
     * @throws FeedSubEmptyException if the body is empty
     * @throws FeedSubUnrecognizedTypeException if the Content-Type is not a known feed type
     */
    public function initFromResponse($response)
    {
        if (!$response->isOk()) {
            throw new FeedSubBadResponseException($response->getCode());
        }

        $sourceurl = $response->getUrl();
        $body = $response->getBody();
        if (!$body) {
            throw new FeedSubEmptyException($sourceurl);
        }

        $type = $response->getHeader('Content-Type');
        if (preg_match('!^(text/xml|application/xml|application/(rss|atom)\+xml)!i', $type)) {
            $this->uri = $sourceurl;
            $this->type = $type;
            $this->body = $body;
            return true;
        } else {
            common_log(LOG_WARNING, "Unrecognized feed type $type for $sourceurl");
            throw new FeedSubUnrecognizedTypeException($type);
        }
    }

    /**
     * Scan an HTML page for Atom/RSS autodiscovery <link> elements.
     *
     * @param string $url source URL, used to resolve relative links
     * @param string $body HTML body text
     * @return mixed string with URL or false if no target found
     * @throws FeedSubBadHTMLException if the body is empty or cannot be parsed
     */
    public function discoverFromHTML($url, $body)
    {
        // DOMDocument::loadHTML() rejects empty input (a warning on older
        // PHP, an error on newer versions), so reject it up front.
        if ((string)$body === '') {
            throw new FeedSubBadHTMLException();
        }

        // Real-world HTML is full of markup libxml complains about; collect
        // those complaints internally instead of emitting PHP warnings.
        $oldSetting = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        $ok = $dom->loadHTML($body);
        libxml_clear_errors();
        libxml_use_internal_errors($oldSetting);

        if (!$ok) {
            throw new FeedSubBadHTMLException();
        }

        // Autodiscovery links may be relative to the page's URL or <base href>.
        // Per HTML, only the first <base> carrying an href counts.
        $base = false;
        foreach ($dom->getElementsByTagName('base') as $node) {
            if ($node->hasAttribute('href')) {
                $base = trim($node->getAttribute('href'));
                break;
            }
        }
        if ($base) {
            $base = $this->resolveURI($base, $url);
        } else {
            $base = $url;
        }

        // Ok... now on to the links!
        // Types listed in order of priority -- we'll prefer Atom if available.
        // @fixme merge with the munger link checks
        $feeds = array(
            'application/atom+xml' => false,
            'application/rss+xml' => false,
        );

        foreach ($dom->getElementsByTagName('link') as $node) {
            if (!$node->hasAttribute('rel')
                || !$node->hasAttribute('type')
                || !$node->hasAttribute('href')) {
                continue;
            }

            // rel is a whitespace-separated, case-insensitive token list
            // (e.g. rel="alternate feed"); MIME types are case-insensitive too.
            $relTokens = preg_split(
                '/\s+/',
                strtolower($node->getAttribute('rel')),
                -1,
                PREG_SPLIT_NO_EMPTY
            );
            $type = strtolower(trim($node->getAttribute('type')));
            $href = trim($node->getAttribute('href'));

            if ($href === '' || !in_array('alternate', $relTokens, true)) {
                continue;
            }

            if (array_key_exists($type, $feeds) && empty($feeds[$type])) {
                // Save the first feed found of each type...
                $feeds[$type] = $this->resolveURI($href, $base);
            }
        }

        // Return the highest-priority feed found
        foreach ($feeds as $feedUrl) {
            if ($feedUrl) {
                return $feedUrl;
            }
        }

        return false;
    }

    /**
     * Resolve a possibly relative URL against some absolute base URL
     *
     * @param string $rel relative or absolute URL
     * @param string $base absolute URL
     * @return string absolute URL, or original URL if could not be resolved.
     */
    public function resolveURI($rel, $base)
    {
        require_once "Net/URL2.php";
        try {
            $relUrl = new Net_URL2($rel);
            if ($relUrl->isAbsolute()) {
                return $rel;
            }
            $baseUrl = new Net_URL2($base);
            $absUrl = $baseUrl->resolve($relUrl);
            return $absUrl->getURL();
        } catch (Exception $e) {
            // Best effort: log and fall back to the unresolved link.
            common_log(LOG_WARNING, 'Unable to resolve relative link "' .
                $rel . '" against base "' . $base . '": ' . $e->getMessage());
            return $rel;
        }
    }
}