From 99454be38cf1dc7f962441d23ccc0a59e7b05f3d Mon Sep 17 00:00:00 2001 From: Evan Prodromou Date: Sat, 20 Mar 2010 16:06:22 -0500 Subject: Move activity classes to their own files Moved the various classes used by the Activity class to their own files. There were >10 classes in the same file, with around 1500 lines in the file. Just too big. This change makes autoloading work for these classes, so also removed the hard require in lib/common.php. --- lib/activityutils.php | 243 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 lib/activityutils.php (limited to 'lib/activityutils.php') diff --git a/lib/activityutils.php b/lib/activityutils.php new file mode 100644 index 000000000..c85a3db55 --- /dev/null +++ b/lib/activityutils.php @@ -0,0 +1,243 @@ +. + * + * @category Feed + * @package StatusNet + * @author Evan Prodromou + * @author Zach Copley + * @copyright 2010 StatusNet, Inc. + * @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPLv3 + * @link http://status.net/ + */ + +if (!defined('STATUSNET')) { + exit(1); +} + +/** + * Utilities for turning DOMish things into Activityish things + * + * Some common functions that I didn't have the bandwidth to try to factor + * into some kind of reasonable superclass, so just dumped here. Might + * be useful to have an ActivityObject parent class or something. + * + * @category OStatus + * @package StatusNet + * @author Evan Prodromou + * @copyright 2010 StatusNet, Inc. + * @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPLv3 + * @link http://status.net/ + */ + +class ActivityUtils +{ + const ATOM = 'http://www.w3.org/2005/Atom'; + + const LINK = 'link'; + const REL = 'rel'; + const TYPE = 'type'; + const HREF = 'href'; + + const CONTENT = 'content'; + const SRC = 'src'; + + /** + * Get the permalink for an Activity object + * + * @param DOMElement $element A DOM element + * + * @return string related link, if any + */ + + static function getPermalink($element) + { + return self::getLink($element, 'alternate', 'text/html'); + } + + /** + * Get the permalink for an Activity object + * + * @param DOMElement $element A DOM element + * + * @return string related link, if any + */ + + static function getLink(DOMNode $element, $rel, $type=null) + { + $els = $element->childNodes; + + foreach ($els as $link) { + + if (!($link instanceof DOMElement)) { + continue; + } + + if ($link->localName == self::LINK && $link->namespaceURI == self::ATOM) { + + $linkRel = $link->getAttribute(self::REL); + $linkType = $link->getAttribute(self::TYPE); + + if ($linkRel == $rel && + (is_null($type) || $linkType == $type)) { + return $link->getAttribute(self::HREF); + } + } + } + + return null; + } + + static function getLinks(DOMNode $element, $rel, $type=null) + { + $els = $element->childNodes; + $out = array(); + + foreach ($els as $link) { + if ($link->localName == self::LINK && $link->namespaceURI == self::ATOM) { + + $linkRel = $link->getAttribute(self::REL); + $linkType = $link->getAttribute(self::TYPE); + + if ($linkRel == $rel && + (is_null($type) || $linkType == $type)) { + $out[] = $link; + } + } + } + + return $out; + } + + /** + * Gets the first child element with the given tag + * + * @param DOMElement $element element to pick at + * @param string $tag tag to look for + * @param string $namespace Namespace to look under + * + * @return DOMElement found element or null + */ + + static function child(DOMNode $element, $tag, $namespace=self::ATOM) + { + $els = $element->childNodes; + if (empty($els) || $els->length == 0) { + return null; + } else { + for ($i = 0; $i < $els->length; $i++) { + $el = $els->item($i); + if ($el->localName == $tag && $el->namespaceURI == $namespace) { + return $el; + } + } + } + } + + /** + * Grab the text content of a DOM element child of the current element + * + * @param DOMElement $element Element whose children we examine + * @param string $tag Tag to look up + * @param string $namespace Namespace to use, defaults to Atom + * + * @return string content of the child + */ + + static function childContent(DOMNode $element, $tag, $namespace=self::ATOM) + { + $el = self::child($element, $tag, $namespace); + + if (empty($el)) { + return null; + } else { + return $el->textContent; + } + } + + static function childHtmlContent(DOMNode $element, $tag, $namespace=self::ATOM) + { + $el = self::child($element, $tag, $namespace); + + if (empty($el)) { + return null; + } else { + return self::textConstruct($el); + } + } + + /** + * Get the content of an atom:entry-like object + * + * @param DOMElement $element The element to examine. + * + * @return string unencoded HTML content of the element, like "This -< is HTML." + * + * @todo handle remote content + * @todo handle embedded XML mime types + * @todo handle base64-encoded non-XML and non-text mime types + */ + + static function getContent($element) + { + return self::childHtmlContent($element, self::CONTENT, self::ATOM); + } + + static function textConstruct($el) + { + $src = $el->getAttribute(self::SRC); + + if (!empty($src)) { + throw new ClientException(_("Can't handle remote content yet.")); + } + + $type = $el->getAttribute(self::TYPE); + + // slavishly following http://atompub.org/rfc4287.html#rfc.section.4.1.3.3 + + if (empty($type) || $type == 'text') { + return $el->textContent; + } else if ($type == 'html') { + $text = $el->textContent; + return htmlspecialchars_decode($text, ENT_QUOTES); + } else if ($type == 'xhtml') { + $divEl = ActivityUtils::child($el, 'div', 'http://www.w3.org/1999/xhtml'); + if (empty($divEl)) { + return null; + } + $doc = $divEl->ownerDocument; + $text = ''; + $children = $divEl->childNodes; + + for ($i = 0; $i < $children->length; $i++) { + $child = $children->item($i); + $text .= $doc->saveXML($child); + } + return trim($text); + } else if (in_array($type, array('text/xml', 'application/xml')) || + preg_match('#(+|/)xml$#', $type)) { + throw new ClientException(_("Can't handle embedded XML content yet.")); + } else if (strncasecmp($type, 'text/', 5)) { + return $el->textContent; + } else { + throw new ClientException(_("Can't handle embedded Base64 content yet.")); + } + } +} -- cgit v1.2.3-54-g00ecf From fcb614d0eb1f98bf8704654ed06e1f9d9733d359 Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Sun, 21 Mar 2010 16:25:12 -0700 Subject: Pull info as well as when we have an old-style ActivityStreams feed. This fixes subscription setup for Cliqset feeds, which currently have a bogus activity:actor/atom:id but a good atom:author/atom:uri --- lib/activityobject.php | 21 +++++++++++++++++++-- lib/activityutils.php | 22 ++++++++++++++++++++++ plugins/OStatus/classes/Ostatus_profile.php | 6 +----- 3 files changed, 42 insertions(+), 7 deletions(-) (limited to 'lib/activityutils.php') diff --git a/lib/activityobject.php b/lib/activityobject.php index e5cea727b..0a358ccab 100644 --- a/lib/activityobject.php +++ b/lib/activityobject.php @@ -156,7 +156,11 @@ class ActivityObject { $this->type = self::PERSON; // XXX: is this fair? $this->title = $this->_childContent($element, self::NAME); - $this->id = $this->_childContent($element, self::URI); + + $id = $this->_childContent($element, self::URI); + if (ActivityUtils::validateUri($id)) { + $this->id = $id; + } if (empty($this->id)) { $email = $this->_childContent($element, self::EMAIL); @@ -169,6 +173,15 @@ class ActivityObject private function _fromAtomEntry($element) { + if ($element->localName == 'actor') { + // Old-fashioned ... + // First pull anything from , then we'll add on top. + $author = ActivityUtils::child($element->parentNode, 'author'); + if ($author) { + $this->_fromAuthor($author); + } + } + $this->type = $this->_childContent($element, Activity::OBJECTTYPE, Activity::SPEC); @@ -176,7 +189,11 @@ class ActivityObject $this->type = ActivityObject::NOTE; } - $this->id = $this->_childContent($element, self::ID); + $id = $this->_childContent($element, self::ID); + if (ActivityUtils::validateUri($id)) { + $this->id = $id; + } + $this->summary = ActivityUtils::childHtmlContent($element, self::SUMMARY); $this->content = ActivityUtils::getContent($element); diff --git a/lib/activityutils.php b/lib/activityutils.php index c85a3db55..a7e99fb11 100644 --- a/lib/activityutils.php +++ b/lib/activityutils.php @@ -240,4 +240,26 @@ class ActivityUtils throw new ClientException(_("Can't handle embedded Base64 content yet.")); } } + + /** + * Is this a valid URI for remote profile/notice identification? + * Does not have to be a resolvable URL. + * @param string $uri + * @return boolean + */ + static function validateUri($uri) + { + if (Validate::uri($uri)) { + return true; + } + + // Possibly an upstream bug; tag: URIs aren't validated properly + // unless you explicitly ask for them. All other schemes are accepted + // for basic URI validation without asking. + if (Validate::uri($uri, array('allowed_scheme' => array('tag')))) { + return true; + } + + return false; + } } diff --git a/plugins/OStatus/classes/Ostatus_profile.php b/plugins/OStatus/classes/Ostatus_profile.php index 5595a9d29..e33509c47 100644 --- a/plugins/OStatus/classes/Ostatus_profile.php +++ b/plugins/OStatus/classes/Ostatus_profile.php @@ -1170,11 +1170,7 @@ class Ostatus_profile extends Memcached_DataObject protected static function getActivityObjectProfileURI($object) { if ($object->id) { - // Possibly an upstream bug; tag: URIs are rejected unless you - // explicitly ask for them. All other schemes are accepted for - // basic URI validation without asking. - if (Validate::uri($object->id) || - Validate::uri($object->id, array('allowed_scheme' => array('tag')))) { + if (ActivityUtils::validateUri($object->id)) { return $object->id; } } -- cgit v1.2.3-54-g00ecf From 8fd0059bf69ed16ed4efad7b8e16dc2afda32e18 Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Fri, 23 Apr 2010 15:40:48 -0700 Subject: Test cases and fixes for Atom and RSS content decoding. Fix extraction of Atom and ; we were failing to escape plaintext source data to HTML, and doing an extraneous double-deescape on HTML source resulting in breakage of notices containing text that looks like HTML. Only was working correctly previously. Fixes for RSS2 content processing: we were failing to load at all due to using wrong element name, and were applying an extraneous de-escape for rather than the escaping that is required to turn plaintext into HTML. (Per spec, must be plaintext.) --- lib/activity.php | 14 ++++++-- lib/activityutils.php | 12 +++++-- tests/ActivityParseTests.php | 77 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 5 deletions(-) (limited to 'lib/activityutils.php') diff --git a/lib/activity.php b/lib/activity.php index 5d6230c6d..27f09ab4d 100644 --- a/lib/activity.php +++ b/lib/activity.php @@ -83,6 +83,7 @@ class Activity const CREATOR = 'creator'; const CONTENTNS = 'http://purl.org/rss/1.0/modules/content/'; + const ENCODED = 'encoded'; public $actor; // an ActivityObject public $verb; // a string (the URL) @@ -268,14 +269,21 @@ class Activity $this->title = ActivityUtils::childContent($item, ActivityObject::TITLE, self::RSS); - $contentEl = ActivityUtils::child($item, ActivityUtils::CONTENT, self::CONTENTNS); + $contentEl = ActivityUtils::child($item, self::ENCODED, self::CONTENTNS); if (!empty($contentEl)) { - $this->content = htmlspecialchars_decode($contentEl->textContent, ENT_QUOTES); + // XML node's text content is HTML; no further processing needed. + $this->content = $contentEl->textContent; } else { $descriptionEl = ActivityUtils::child($item, self::DESCRIPTION, self::RSS); if (!empty($descriptionEl)) { - $this->content = htmlspecialchars_decode($descriptionEl->textContent, ENT_QUOTES); + // Per spec, must be plaintext. + // In practice, often there's HTML... but these days good + // feeds are using which is explicitly + // real HTML. + // We'll treat this following spec, and do HTML escaping + // to convert from plaintext to HTML. + $this->content = htmlspecialchars($descriptionEl->textContent); } } diff --git a/lib/activityutils.php b/lib/activityutils.php index a7e99fb11..401fd7fc2 100644 --- a/lib/activityutils.php +++ b/lib/activityutils.php @@ -213,11 +213,19 @@ class ActivityUtils // slavishly following http://atompub.org/rfc4287.html#rfc.section.4.1.3.3 if (empty($type) || $type == 'text') { - return $el->textContent; + // We have plaintext saved as the XML text content. + // Since we want HTML, we need to escape any special chars. + return htmlspecialchars($el->textContent); } else if ($type == 'html') { + // We have HTML saved as the XML text content. + // No additional processing required once we've got it. $text = $el->textContent; - return htmlspecialchars_decode($text, ENT_QUOTES); + return $text; } else if ($type == 'xhtml') { + // Per spec, the contains a single + // HTML
with XHTML namespace on it as a child node. + // We need to pull all of that
's child nodes and + // serialize them back to an (X)HTML source fragment. $divEl = ActivityUtils::child($el, 'div', 'http://www.w3.org/1999/xhtml'); if (empty($divEl)) { return null; diff --git a/tests/ActivityParseTests.php b/tests/ActivityParseTests.php index 4563da914..378478d74 100644 --- a/tests/ActivityParseTests.php +++ b/tests/ActivityParseTests.php @@ -32,6 +32,18 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase $this->assertEquals('tag:versioncentral.example.org,2009:/change/1643245', $act->objects[0]->id); } + public function testExample2() + { + global $_example2; + $dom = DOMDocument::loadXML($_example2); + $act = new Activity($dom->documentElement); + + $this->assertFalse(empty($act)); + // Did we handle correctly with a typical payload? + $this->assertEquals("

Geraldine posted a Photo on PhotoPanic

\n " . + "", trim($act->content)); + } + public function testExample3() { global $_example3; @@ -305,6 +317,71 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase } + public function testAtomContent() + { + $tests = array(array("Some regular plain text.", + "Some regular plain text."), + array("<b>this is not HTML</b>", + "<b>this is not HTML</b>"), + array("Some regular plain HTML.", + "Some regular plain HTML."), + array("<b>this is too HTML</b>", + "this is too HTML"), + array("&lt;b&gt;but this is not HTML!&lt;/b&gt;", + "<b>but this is not HTML!</b>"), + array("
Some regular plain XHTML.
", + "Some regular plain XHTML."), + array("
This is some XHTML!
", + "This is some XHTML!"), + array("
<b>This is not some XHTML!</b>
", + "<b>This is not some XHTML!</b>"), + array("
&lt;b&gt;This is not some XHTML either!&lt;/b&gt;
", + "&lt;b&gt;This is not some XHTML either!&lt;/b&gt;")); + foreach ($tests as $data) { + list($source, $output) = $data; + $xml = "" . + "http://example.com/fakeid" . + "Test" . + "Atom content tests" . + $source . + ""; + $dom = DOMDocument::loadXML($xml); + $act = new Activity($dom->documentElement); + + $this->assertFalse(empty($act)); + $this->assertEquals($output, trim($act->content)); + } + } + + public function testRssContent() + { + $tests = array(array("Some regular plain HTML.", + "Some regular plain HTML."), + array("Some <b>exciting bold HTML</b>", + "Some exciting bold HTML"), + array("Some &lt;b&gt;escaped non-HTML.&lt;/b&gt;", + "Some <b>escaped non-HTML.</b>"), + array("Some plain text.", + "Some plain text."), + array("Some <b>non-HTML text</b>", + "Some <b>non-HTML text</b>"), + array("Some &lt;b&gt;double-escaped text&lt;/b&gt;", + "Some &lt;b&gt;double-escaped text&lt;/b&gt;")); + foreach ($tests as $data) { + list($source, $output) = $data; + $xml = "" . + "http://example.com/fakeid" . + "RSS content tests" . + $source . + ""; + $dom = DOMDocument::loadXML($xml); + $act = new Activity($dom->documentElement); + + $this->assertFalse(empty($act)); + $this->assertEquals($output, trim($act->content)); + } + } + } $_example1 = <<