diff options
author | Brion Vibber <brion@pobox.com> | 2010-04-23 15:40:48 -0700 |
---|---|---|
committer | Brion Vibber <brion@pobox.com> | 2010-04-23 15:40:48 -0700 |
commit | 8fd0059bf69ed16ed4efad7b8e16dc2afda32e18 (patch) | |
tree | 88b030330d671dbbb8dfd6430d45f29d9e4d8bea | |
parent | 9c8052e755e5ad4c8120ace9acdd75ee910e2ab7 (diff) |
Test cases and fixes for Atom and RSS content decoding.
Fix extraction of Atom <content type="text"> and <content type="html">: we were failing to escape plaintext source data to HTML, and were applying an extraneous de-escape to HTML source (in effect de-escaping it twice), which broke notices containing text that looks like HTML. Previously, only <content type="xhtml"> was handled correctly.
Fixes for RSS2 content processing: we were failing to load <content:encoded> at all because we looked it up under the wrong element name, and we were applying an extraneous de-escape to <description> rather than the escaping that is required to turn plaintext into HTML. (Per spec, <description> must be plaintext.)
-rw-r--r-- | lib/activity.php | 14 | ||||
-rw-r--r-- | lib/activityutils.php | 12 | ||||
-rw-r--r-- | tests/ActivityParseTests.php | 77 |
3 files changed, 98 insertions, 5 deletions
diff --git a/lib/activity.php b/lib/activity.php index 5d6230c6d..27f09ab4d 100644 --- a/lib/activity.php +++ b/lib/activity.php @@ -83,6 +83,7 @@ class Activity const CREATOR = 'creator'; const CONTENTNS = 'http://purl.org/rss/1.0/modules/content/'; + const ENCODED = 'encoded'; public $actor; // an ActivityObject public $verb; // a string (the URL) @@ -268,14 +269,21 @@ class Activity $this->title = ActivityUtils::childContent($item, ActivityObject::TITLE, self::RSS); - $contentEl = ActivityUtils::child($item, ActivityUtils::CONTENT, self::CONTENTNS); + $contentEl = ActivityUtils::child($item, self::ENCODED, self::CONTENTNS); if (!empty($contentEl)) { - $this->content = htmlspecialchars_decode($contentEl->textContent, ENT_QUOTES); + // <content:encoded> XML node's text content is HTML; no further processing needed. + $this->content = $contentEl->textContent; } else { $descriptionEl = ActivityUtils::child($item, self::DESCRIPTION, self::RSS); if (!empty($descriptionEl)) { - $this->content = htmlspecialchars_decode($descriptionEl->textContent, ENT_QUOTES); + // Per spec, <description> must be plaintext. + // In practice, often there's HTML... but these days good + // feeds are using <content:encoded> which is explicitly + // real HTML. + // We'll treat this following spec, and do HTML escaping + // to convert from plaintext to HTML. + $this->content = htmlspecialchars($descriptionEl->textContent); } } diff --git a/lib/activityutils.php b/lib/activityutils.php index a7e99fb11..401fd7fc2 100644 --- a/lib/activityutils.php +++ b/lib/activityutils.php @@ -213,11 +213,19 @@ class ActivityUtils // slavishly following http://atompub.org/rfc4287.html#rfc.section.4.1.3.3 if (empty($type) || $type == 'text') { - return $el->textContent; + // We have plaintext saved as the XML text content. + // Since we want HTML, we need to escape any special chars. + return htmlspecialchars($el->textContent); } else if ($type == 'html') { + // We have HTML saved as the XML text content. 
+ // No additional processing required once we've got it. $text = $el->textContent; - return htmlspecialchars_decode($text, ENT_QUOTES); + return $text; } else if ($type == 'xhtml') { + // Per spec, the <content type="xhtml"> contains a single + // HTML <div> with XHTML namespace on it as a child node. + // We need to pull all of that <div>'s child nodes and + // serialize them back to an (X)HTML source fragment. $divEl = ActivityUtils::child($el, 'div', 'http://www.w3.org/1999/xhtml'); if (empty($divEl)) { return null; diff --git a/tests/ActivityParseTests.php b/tests/ActivityParseTests.php index 4563da914..378478d74 100644 --- a/tests/ActivityParseTests.php +++ b/tests/ActivityParseTests.php @@ -32,6 +32,18 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase $this->assertEquals('tag:versioncentral.example.org,2009:/change/1643245', $act->objects[0]->id); } + public function testExample2() + { + global $_example2; + $dom = DOMDocument::loadXML($_example2); + $act = new Activity($dom->documentElement); + + $this->assertFalse(empty($act)); + // Did we handle <content type="html"> correctly with a typical payload? + $this->assertEquals("<p>Geraldine posted a Photo on PhotoPanic</p>\n " . 
+ "<img src=\"/geraldine/photo1.jpg\">", trim($act->content)); + } + public function testExample3() { global $_example3; @@ -305,6 +317,71 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase } + public function testAtomContent() + { + $tests = array(array("<content>Some regular plain text.</content>", + "Some regular plain text."), + array("<content><b>this is not HTML</b></content>", + "<b>this is not HTML</b>"), + array("<content type='html'>Some regular plain HTML.</content>", + "Some regular plain HTML."), + array("<content type='html'><b>this is too HTML</b></content>", + "<b>this is too HTML</b>"), + array("<content type='html'>&lt;b&gt;but this is not HTML!&lt;/b&gt;</content>", + "<b>but this is not HTML!</b>"), + array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'>Some regular plain XHTML.</div></content>", + "Some regular plain XHTML."), + array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'><b>This is some XHTML!</b></div></content>", + "<b>This is some XHTML!</b>"), + array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'><b>This is not some XHTML!</b></div></content>", + "<b>This is not some XHTML!</b>"), + array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'>&lt;b&gt;This is not some XHTML either!&lt;/b&gt;</div></content>", + "&lt;b&gt;This is not some XHTML either!&lt;/b&gt;")); + foreach ($tests as $data) { + list($source, $output) = $data; + $xml = "<entry xmlns='http://www.w3.org/2005/Atom'>" . + "<id>http://example.com/fakeid</id>" . + "<author><name>Test</name></author>" . + "<title>Atom content tests</title>" . + $source . 
+ "</entry>"; + $dom = DOMDocument::loadXML($xml); + $act = new Activity($dom->documentElement); + + $this->assertFalse(empty($act)); + $this->assertEquals($output, trim($act->content)); + } + } + + public function testRssContent() + { + $tests = array(array("<content:encoded>Some regular plain HTML.</content:encoded>", + "Some regular plain HTML."), + array("<content:encoded>Some <b>exciting bold HTML</b></content:encoded>", + "Some <b>exciting bold HTML</b>"), + array("<content:encoded>Some &lt;b&gt;escaped non-HTML.&lt;/b&gt;</content:encoded>", + "Some <b>escaped non-HTML.</b>"), + array("<description>Some plain text.</description>", + "Some plain text."), + array("<description>Some <b>non-HTML text</b></description>", + "Some <b>non-HTML text</b>"), + array("<description>Some &lt;b&gt;double-escaped text&lt;/b&gt;</description>", + "Some &lt;b&gt;double-escaped text&lt;/b&gt;")); + foreach ($tests as $data) { + list($source, $output) = $data; + $xml = "<item xmlns:content='http://purl.org/rss/1.0/modules/content/'>" . + "<guid>http://example.com/fakeid</guid>" . + "<title>RSS content tests</title>" . + $source . + "</item>"; + $dom = DOMDocument::loadXML($xml); + $act = new Activity($dom->documentElement); + + $this->assertFalse(empty($act)); + $this->assertEquals($output, trim($act->content)); + } + } + } $_example1 = <<<EXAMPLE1 |