summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrion Vibber <brion@pobox.com>2010-04-23 15:40:48 -0700
committerBrion Vibber <brion@pobox.com>2010-04-23 15:40:48 -0700
commit8fd0059bf69ed16ed4efad7b8e16dc2afda32e18 (patch)
tree88b030330d671dbbb8dfd6430d45f29d9e4d8bea
parent9c8052e755e5ad4c8120ace9acdd75ee910e2ab7 (diff)
Test cases and fixes for Atom and RSS content decoding.
Fix extraction of Atom <content type="text"> and <content type="html">; we were failing to escape plaintext source data to HTML, and doing an extraneous double-deescape on HTML source resulting in breakage of notices containing text that looks like HTML. Only <content type="xhtml"> was working correctly previously. Fixes for RSS2 content processing: we were failing to load <content:encoded> at all due to using wrong element name, and were applying an extraneous de-escape for <description> rather than the escaping that is required to turn plaintext into HTML. (Per spec, <description> must be plaintext.)
-rw-r--r--lib/activity.php14
-rw-r--r--lib/activityutils.php12
-rw-r--r--tests/ActivityParseTests.php77
3 files changed, 98 insertions, 5 deletions
diff --git a/lib/activity.php b/lib/activity.php
index 5d6230c6d..27f09ab4d 100644
--- a/lib/activity.php
+++ b/lib/activity.php
@@ -83,6 +83,7 @@ class Activity
const CREATOR = 'creator';
const CONTENTNS = 'http://purl.org/rss/1.0/modules/content/';
+ const ENCODED = 'encoded';
public $actor; // an ActivityObject
public $verb; // a string (the URL)
@@ -268,14 +269,21 @@ class Activity
$this->title = ActivityUtils::childContent($item, ActivityObject::TITLE, self::RSS);
- $contentEl = ActivityUtils::child($item, ActivityUtils::CONTENT, self::CONTENTNS);
+ $contentEl = ActivityUtils::child($item, self::ENCODED, self::CONTENTNS);
if (!empty($contentEl)) {
- $this->content = htmlspecialchars_decode($contentEl->textContent, ENT_QUOTES);
+ // <content:encoded> XML node's text content is HTML; no further processing needed.
+ $this->content = $contentEl->textContent;
} else {
$descriptionEl = ActivityUtils::child($item, self::DESCRIPTION, self::RSS);
if (!empty($descriptionEl)) {
- $this->content = htmlspecialchars_decode($descriptionEl->textContent, ENT_QUOTES);
+ // Per spec, <description> must be plaintext.
+ // In practice, often there's HTML... but these days good
+ // feeds are using <content:encoded> which is explicitly
+ // real HTML.
+ // We'll treat this following spec, and do HTML escaping
+ // to convert from plaintext to HTML.
+ $this->content = htmlspecialchars($descriptionEl->textContent);
}
}
diff --git a/lib/activityutils.php b/lib/activityutils.php
index a7e99fb11..401fd7fc2 100644
--- a/lib/activityutils.php
+++ b/lib/activityutils.php
@@ -213,11 +213,19 @@ class ActivityUtils
// slavishly following http://atompub.org/rfc4287.html#rfc.section.4.1.3.3
if (empty($type) || $type == 'text') {
- return $el->textContent;
+ // We have plaintext saved as the XML text content.
+ // Since we want HTML, we need to escape any special chars.
+ return htmlspecialchars($el->textContent);
} else if ($type == 'html') {
+ // We have HTML saved as the XML text content.
+ // No additional processing required once we've got it.
$text = $el->textContent;
- return htmlspecialchars_decode($text, ENT_QUOTES);
+ return $text;
} else if ($type == 'xhtml') {
+ // Per spec, the <content type="xhtml"> contains a single
+ // HTML <div> with XHTML namespace on it as a child node.
+ // We need to pull all of that <div>'s child nodes and
+ // serialize them back to an (X)HTML source fragment.
$divEl = ActivityUtils::child($el, 'div', 'http://www.w3.org/1999/xhtml');
if (empty($divEl)) {
return null;
diff --git a/tests/ActivityParseTests.php b/tests/ActivityParseTests.php
index 4563da914..378478d74 100644
--- a/tests/ActivityParseTests.php
+++ b/tests/ActivityParseTests.php
@@ -32,6 +32,18 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase
$this->assertEquals('tag:versioncentral.example.org,2009:/change/1643245', $act->objects[0]->id);
}
+ public function testExample2()
+ {
+ global $_example2;
+ $dom = DOMDocument::loadXML($_example2);
+ $act = new Activity($dom->documentElement);
+
+ $this->assertFalse(empty($act));
+ // Did we handle <content type="html"> correctly with a typical payload?
+ $this->assertEquals("<p>Geraldine posted a Photo on PhotoPanic</p>\n " .
+ "<img src=\"/geraldine/photo1.jpg\">", trim($act->content));
+ }
+
public function testExample3()
{
global $_example3;
@@ -305,6 +317,71 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase
}
+ public function testAtomContent()
+ {
+ $tests = array(array("<content>Some regular plain text.</content>",
+ "Some regular plain text."),
+ array("<content>&lt;b&gt;this is not HTML&lt;/b&gt;</content>",
+ "&lt;b&gt;this is not HTML&lt;/b&gt;"),
+ array("<content type='html'>Some regular plain HTML.</content>",
+ "Some regular plain HTML."),
+ array("<content type='html'>&lt;b&gt;this is too HTML&lt;/b&gt;</content>",
+ "<b>this is too HTML</b>"),
+ array("<content type='html'>&amp;lt;b&amp;gt;but this is not HTML!&amp;lt;/b&amp;gt;</content>",
+ "&lt;b&gt;but this is not HTML!&lt;/b&gt;"),
+ array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'>Some regular plain XHTML.</div></content>",
+ "Some regular plain XHTML."),
+ array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'><b>This is some XHTML!</b></div></content>",
+ "<b>This is some XHTML!</b>"),
+ array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'>&lt;b&gt;This is not some XHTML!&lt;/b&gt;</div></content>",
+ "&lt;b&gt;This is not some XHTML!&lt;/b&gt;"),
+ array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'>&amp;lt;b&amp;gt;This is not some XHTML either!&amp;lt;/b&amp;gt;</div></content>",
+ "&amp;lt;b&amp;gt;This is not some XHTML either!&amp;lt;/b&amp;gt;"));
+ foreach ($tests as $data) {
+ list($source, $output) = $data;
+ $xml = "<entry xmlns='http://www.w3.org/2005/Atom'>" .
+ "<id>http://example.com/fakeid</id>" .
+ "<author><name>Test</name></author>" .
+ "<title>Atom content tests</title>" .
+ $source .
+ "</entry>";
+ $dom = DOMDocument::loadXML($xml);
+ $act = new Activity($dom->documentElement);
+
+ $this->assertFalse(empty($act));
+ $this->assertEquals($output, trim($act->content));
+ }
+ }
+
+ public function testRssContent()
+ {
+ $tests = array(array("<content:encoded>Some regular plain HTML.</content:encoded>",
+ "Some regular plain HTML."),
+ array("<content:encoded>Some &lt;b&gt;exciting bold HTML&lt;/b&gt;</content:encoded>",
+ "Some <b>exciting bold HTML</b>"),
+ array("<content:encoded>Some &amp;lt;b&amp;gt;escaped non-HTML.&amp;lt;/b&amp;gt;</content:encoded>",
+ "Some &lt;b&gt;escaped non-HTML.&lt;/b&gt;"),
+ array("<description>Some plain text.</description>",
+ "Some plain text."),
+ array("<description>Some &lt;b&gt;non-HTML text&lt;/b&gt;</description>",
+ "Some &lt;b&gt;non-HTML text&lt;/b&gt;"),
+ array("<description>Some &amp;lt;b&amp;gt;double-escaped text&amp;lt;/b&amp;gt;</description>",
+ "Some &amp;lt;b&amp;gt;double-escaped text&amp;lt;/b&amp;gt;"));
+ foreach ($tests as $data) {
+ list($source, $output) = $data;
+ $xml = "<item xmlns:content='http://purl.org/rss/1.0/modules/content/'>" .
+ "<guid>http://example.com/fakeid</guid>" .
+ "<title>RSS content tests</title>" .
+ $source .
+ "</item>";
+ $dom = DOMDocument::loadXML($xml);
+ $act = new Activity($dom->documentElement);
+
+ $this->assertFalse(empty($act));
+ $this->assertEquals($output, trim($act->content));
+ }
+ }
+
}
$_example1 = <<<EXAMPLE1