diff options
author | Zach Copley <zach@status.net> | 2010-03-19 10:08:47 -0700 |
---|---|---|
committer | Zach Copley <zach@status.net> | 2010-03-19 10:08:47 -0700 |
commit | fb50a2d83c9a530479874d51f203ad8b3d4c9a7a (patch) | |
tree | 5322601414dcb9c3b7a39516bb81bb1d7063c980 | |
parent | 1f160bb7bc62d8672a4e600ab73fdddffd188194 (diff) | |
parent | 05e3768e6a833d99a5180d4306d68f59e2d8f8c9 (diff) |
Merge branch 'testing' of gitorious.org:statusnet/mainline into testing
* 'testing' of gitorious.org:statusnet/mainline:
Parse RSS items as activities
Remove hkit and do our own hcard parsing
Work around weird bug with HTML normalization via PHP DOM module; if source had xmlns and xml:lang I ended up with double output, breaking the subsequent parsing. Will have to track this down later and report upstream if not already resolved.
-rw-r--r-- | lib/activity.php | 272 | ||||
-rw-r--r-- | plugins/OStatus/extlib/hkit/hcard.profile.php | 105 | ||||
-rw-r--r-- | plugins/OStatus/extlib/hkit/hkit.class.php | 475 | ||||
-rw-r--r-- | plugins/OStatus/lib/discoveryhints.php | 229 | ||||
-rw-r--r-- | tests/ActivityParseTests.php | 95 |
5 files changed, 467 insertions, 709 deletions
diff --git a/lib/activity.php b/lib/activity.php index c67d090f7..5b304020d 100644 --- a/lib/activity.php +++ b/lib/activity.php @@ -643,38 +643,11 @@ class ActivityObject ); if ($element->tagName == 'author') { - - $this->type = self::PERSON; // XXX: is this fair? - $this->title = $this->_childContent($element, self::NAME); - $this->id = $this->_childContent($element, self::URI); - - if (empty($this->id)) { - $email = $this->_childContent($element, self::EMAIL); - if (!empty($email)) { - // XXX: acct: ? - $this->id = 'mailto:'.$email; - } - } - + $this->_fromAuthor($element); + } else if ($element->tagName == 'item') { + $this->_fromRssItem($element); } else { - - $this->type = $this->_childContent($element, Activity::OBJECTTYPE, - Activity::SPEC); - - if (empty($this->type)) { - $this->type = ActivityObject::NOTE; - } - - $this->id = $this->_childContent($element, self::ID); - $this->title = $this->_childContent($element, self::TITLE); - $this->summary = $this->_childContent($element, self::SUMMARY); - - $this->source = $this->_getSource($element); - - $this->content = ActivityUtils::getContent($element); - - $this->link = ActivityUtils::getPermalink($element); - + $this->_fromAtomEntry($element); } // Some per-type attributes... @@ -697,6 +670,72 @@ class ActivityObject } } + private function _fromAuthor($element) + { + $this->type = self::PERSON; // XXX: is this fair? + $this->title = $this->_childContent($element, self::NAME); + $this->id = $this->_childContent($element, self::URI); + + if (empty($this->id)) { + $email = $this->_childContent($element, self::EMAIL); + if (!empty($email)) { + // XXX: acct: ? + $this->id = 'mailto:'.$email; + } + } + } + + private function _fromAtomEntry($element) + { + $this->type = $this->_childContent($element, Activity::OBJECTTYPE, + Activity::SPEC); + + if (empty($this->type)) { + $this->type = ActivityObject::NOTE; + } + + $this->id = $this->_childContent($element, self::ID); + $this->title = $this->_childContent($element, self::TITLE); + $this->summary = $this->_childContent($element, self::SUMMARY); + + $this->source = $this->_getSource($element); + + $this->content = ActivityUtils::getContent($element); + + $this->link = ActivityUtils::getPermalink($element); + } + + // @fixme rationalize with Activity::_fromRssItem() + + private function _fromRssItem($item) + { + $this->title = ActivityUtils::childContent($item, ActivityObject::TITLE, Activity::RSS); + + $contentEl = ActivityUtils::child($item, ActivityUtils::CONTENT, Activity::CONTENTNS); + + if (!empty($contentEl)) { + $this->content = htmlspecialchars_decode($contentEl->textContent, ENT_QUOTES); + } else { + $descriptionEl = ActivityUtils::child($item, Activity::DESCRIPTION, Activity::RSS); + if (!empty($descriptionEl)) { + $this->content = htmlspecialchars_decode($descriptionEl->textContent, ENT_QUOTES); + } + } + + $this->link = ActivityUtils::childContent($item, ActivityUtils::LINK, Activity::RSS); + + $guidEl = ActivityUtils::child($item, Activity::GUID, Activity::RSS); + + if (!empty($guidEl)) { + $this->id = $guidEl->textContent; + + if ($guidEl->hasAttribute('isPermaLink')) { + // overwrites <link> + $this->link = $this->id; + } + } + } + private function _childContent($element, $tag, $namespace=ActivityUtils::ATOM) { return ActivityUtils::childContent($element, $tag, $namespace); @@ -1051,6 +1090,21 @@ class Activity const PUBLISHED = 'published'; const UPDATED = 'updated'; + const RSS = null; // no namespace! + + const PUBDATE = 'pubDate'; + const DESCRIPTION = 'description'; + const GUID = 'guid'; + const SELF = 'self'; + const IMAGE = 'image'; + const URL = 'url'; + + const DC = 'http://purl.org/dc/elements/1.1/'; + + const CREATOR = 'creator'; + + const CONTENTNS = 'http://purl.org/rss/1.0/modules/content/'; + public $actor; // an ActivityObject public $verb; // a string (the URL) public $object; // an ActivityObject @@ -1081,8 +1135,6 @@ class Activity return; } - $this->entry = $entry; - // Insist on a feed's root DOMElement; don't allow a DOMDocument if ($feed instanceof DOMDocument) { throw new ClientException( @@ -1090,8 +1142,22 @@ class Activity ); } + $this->entry = $entry; $this->feed = $feed; + if ($entry->namespaceURI == Activity::ATOM && + $entry->localName == 'entry') { + $this->_fromAtomEntry($entry, $feed); + } else if ($entry->namespaceURI == Activity::RSS && + $entry->localName == 'item') { + $this->_fromRssItem($entry, $feed); + } else { + throw new Exception("Unknown DOM element: {$entry->namespaceURI} {$entry->localName}"); + } + } + + function _fromAtomEntry($entry, $feed) + { $pubEl = $this->_child($entry, self::PUBLISHED, self::ATOM); if (!empty($pubEl)) { @@ -1177,6 +1243,69 @@ class Activity } } + function _fromRssItem($item, $rss) + { + $verbEl = $this->_child($item, self::VERB); + + if (!empty($verbEl)) { + $this->verb = trim($verbEl->textContent); + } else { + $this->verb = ActivityVerb::POST; + // XXX: do other implied stuff here + } + + $pubDateEl = $this->_child($item, self::PUBDATE, self::RSS); + + if (!empty($pubDateEl)) { + $this->time = strtotime($pubDateEl->textContent); + } + + $authorEl = $this->_child($item, self::AUTHOR, self::RSS); + + if (!empty($authorEl)) { + $this->actor = $this->_fromRssAuthor($authorEl); + } else { + $dcCreatorEl = $this->_child($item, self::CREATOR, self::DC); + if (!empty($dcCreatorEl)) { + $this->actor = $this->_fromDcCreator($dcCreatorEl); + } else if (!empty($rss)) { + $this->actor = $this->_fromRss($rss); + } + } + + $this->title = ActivityUtils::childContent($item, ActivityObject::TITLE, self::RSS); + + $contentEl = ActivityUtils::child($item, ActivityUtils::CONTENT, self::CONTENTNS); + + if (!empty($contentEl)) { + $this->content = htmlspecialchars_decode($contentEl->textContent, ENT_QUOTES); + } else { + $descriptionEl = ActivityUtils::child($item, self::DESCRIPTION, self::RSS); + if (!empty($descriptionEl)) { + $this->content = htmlspecialchars_decode($descriptionEl->textContent, ENT_QUOTES); + } + } + + $this->link = ActivityUtils::childContent($item, ActivityUtils::LINK, self::RSS); + + // @fixme enclosures + // @fixme thumbnails... maybe + + $guidEl = ActivityUtils::child($item, self::GUID, self::RSS); + + if (!empty($guidEl)) { + $this->id = $guidEl->textContent; + + if ($guidEl->hasAttribute('isPermaLink') && $guidEl->getAttribute('isPermaLink') != 'false') { + // overwrites <link> + $this->link = $this->id; + } + } + + $this->object = new ActivityObject($item); + $this->context = new ActivityContext($item); + } + /** * Returns an Atom <entry> based on this activity * @@ -1249,6 +1378,83 @@ class Activity return $xs->getString(); } + function _fromRssAuthor($el) + { + $text = $el->textContent; + + if (preg_match('/^(.*?) \((.*)\)$/', $text, $match)) { + $email = $match[1]; + $name = $match[2]; + } else if (preg_match('/^(.*?) <(.*)>$/', $text, $match)) { + $name = $match[1]; + $email = $match[2]; + } else if (preg_match('/.*@.*/', $text)) { + $email = $text; + $name = null; + } else { + $name = $text; + $email = null; + } + + // Not really enough info + + $actor = new ActivityObject(); + + $actor->element = $el; + + $actor->type = ActivityObject::PERSON; + $actor->title = $name; + + if (!empty($email)) { + $actor->id = 'mailto:'.$email; + } + + return $actor; + } + + function _fromDcCreator($el) + { + // Not really enough info + + $text = $el->textContent; + + $actor = new ActivityObject(); + + $actor->element = $el; + + $actor->title = $text; + $actor->type = ActivityObject::PERSON; + + return $actor; + } + + function _fromRss($el) + { + $actor = new ActivityObject(); + + $actor->element = $el; + + $actor->type = ActivityObject::PERSON; // @fixme guess better + + $actor->title = ActivityUtils::childContent($el, ActivityObject::TITLE, self::RSS); + $actor->link = ActivityUtils::childContent($el, ActivityUtils::LINK, self::RSS); + $actor->id = ActivityUtils::getLink($el, self::SELF); + + $desc = ActivityUtils::childContent($el, self::DESCRIPTION, self::RSS); + + if (!empty($desc)) { + $actor->content = htmlspecialchars_decode($desc, ENT_QUOTES); + } + + $imageEl = ActivityUtils::child($el, self::IMAGE, self::RSS); + + if (!empty($imageEl)) { + $actor->avatarLinks[] = ActivityUtils::childContent($imageEl, self::URL, self::RSS); + } + + return $actor; + } + private function _child($element, $tag, $namespace=self::SPEC) { return ActivityUtils::child($element, $tag, $namespace); diff --git a/plugins/OStatus/extlib/hkit/hcard.profile.php b/plugins/OStatus/extlib/hkit/hcard.profile.php deleted file mode 100644 index 6ec0dc890..000000000 --- a/plugins/OStatus/extlib/hkit/hcard.profile.php +++ /dev/null @@ -1,105 +0,0 @@ -<?php - // hcard profile for hkit - - $this->root_class = 'vcard'; - - $this->classes = array( - 'fn', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'), - 'n', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'), - 'adr', array('post-office-box', 'extended-address', 'street-address', 'postal-code', 'country-name', 'type', 'region', 'locality'), - 'label', 'bday', 'agent', 'nickname', 'photo', 'class', - 'email', array('type', 'value'), - 'category', 'key', 'logo', 'mailer', 'note', - 'org', array('organization-name', 'organization-unit'), - 'tel', array('type', 'value'), - 'geo', array('latitude', 'longitude'), - 'tz', 'uid', 'url', 'rev', 'role', 'sort-string', 'sound', 'title' - ); - - // classes that must only appear once per card - $this->singles = array( - 'fn' - ); - - // classes that are required (not strictly enforced - give at least one!) - $this->required = array( - 'fn' - ); - - $this->att_map = array( - 'fn' => array('IMG|alt'), - 'url' => array('A|href', 'IMG|src', 'AREA|href'), - 'photo' => array('IMG|src'), - 'bday' => array('ABBR|title'), - 'logo' => array('IMG|src'), - 'email' => array('A|href'), - 'geo' => array('ABBR|title') - ); - - - $this->callbacks = array( - 'url' => array($this, 'resolvePath'), - 'photo' => array($this, 'resolvePath'), - 'logo' => array($this, 'resolvePath'), - 'email' => array($this, 'resolveEmail') - ); - - - - function hKit_hcard_post($a) - { - - foreach ($a as &$vcard){ - - hKit_implied_n_optimization($vcard); - hKit_implied_n_from_fn($vcard); - - } - - return $a; - - } - - - function hKit_implied_n_optimization(&$vcard) - { - if (array_key_exists('fn', $vcard) && !is_array($vcard['fn']) && - !array_key_exists('n', $vcard) && (!array_key_exists('org', $vcard) || $vcard['fn'] != $vcard['org'])){ - - if (sizeof(explode(' ', $vcard['fn'])) == 2){ - $patterns = array(); - $patterns[] = array('/^(\S+),\s*(\S{1})$/', 2, 1); // Lastname, Initial - $patterns[] = array('/^(\S+)\s*(\S{1})\.*$/', 2, 1); // Lastname Initial(.) - $patterns[] = array('/^(\S+),\s*(\S+)$/', 2, 1); // Lastname, Firstname - $patterns[] = array('/^(\S+)\s*(\S+)$/', 1, 2); // Firstname Lastname - - foreach ($patterns as $pattern){ - if (preg_match($pattern[0], $vcard['fn'], $matches) === 1){ - $n = array(); - $n['given-name'] = $matches[$pattern[1]]; - $n['family-name'] = $matches[$pattern[2]]; - $vcard['n'] = $n; - - - break; - } - } - } - } - } - - - function hKit_implied_n_from_fn(&$vcard) - { - if (array_key_exists('fn', $vcard) && is_array($vcard['fn']) - && !array_key_exists('n', $vcard) && (!array_key_exists('org', $vcard) || $vcard['fn'] != $vcard['org'])){ - - $vcard['n'] = $vcard['fn']; - } - - if (array_key_exists('fn', $vcard) && is_array($vcard['fn'])){ - $vcard['fn'] = $vcard['fn']['text']; - } - } - -?>
\ No newline at end of file diff --git a/plugins/OStatus/extlib/hkit/hkit.class.php b/plugins/OStatus/extlib/hkit/hkit.class.php deleted file mode 100644 index c3a54cff6..000000000 --- a/plugins/OStatus/extlib/hkit/hkit.class.php +++ /dev/null @@ -1,475 +0,0 @@ -<?php - - /* - - hKit Library for PHP5 - a generic library for parsing Microformats - Copyright (C) 2006 Drew McLellan - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - - Author - Drew McLellan - http://allinthehead.com/ - - Contributors: - Scott Reynen - http://www.randomchaos.com/ - - Version 0.5, 22-Jul-2006 - fixed by-ref issue cropping up in PHP 5.0.5 - fixed a bug with a@title - added support for new fn=n optimisation - added support for new a.include include-pattern - Version 0.4, 23-Jun-2006 - prevented nested includes from causing infinite loops - returns false if URL can't be fetched - added pre-flight check for base support level - added deduping of once-only classnames - prevented accumulation of multiple 'value' values - tuned whitespace handling and treatment of DEL elements - Version 0.3, 21-Jun-2006 - added post-processor callback method into profiles - fixed minor problems raised by hcard testsuite - added support for include-pattern - added support for td@headers pattern - added implied-n optimization into default hcard profile - Version 0.2, 20-Jun-2006 - added class callback mechanism - added resolvePath & resolveEmail - added basic BASE support - Version 0.1.1, 19-Jun-2006 (different timezone, no time machine) - added external Tidy option - Version 0.1, 20-Jun-2006 - initial release - - - - - */ - - class hKit - { - - public $tidy_mode = 'proxy'; // 'proxy', 'exec', 'php' or 'none' - public $tidy_proxy = 'http://cgi.w3.org/cgi-bin/tidy?forceXML=on&docAddr='; // required only for tidy_mode=proxy - public $tmp_dir = '/path/to/writable/dir/'; // required only for tidy_mode=exec - - private $root_class = ''; - private $classes = ''; - private $singles = ''; - private $required = ''; - private $att_map = ''; - private $callbacks = ''; - private $processor = ''; - - private $url = ''; - private $base = ''; - private $doc = ''; - - - public function hKit() - { - // pre-flight checks - $pass = true; - $required = array('dom_import_simplexml', 'file_get_contents', 'simplexml_load_string'); - $missing = array(); - - foreach ($required as $f){ - if (!function_exists($f)){ - $pass = false; - $missing[] = $f . '()'; - } - } - - if (!$pass) - die('hKit error: these required functions are not available: <strong>' . implode(', ', $missing) . '</strong>'); - - } - - - public function getByURL($profile='', $url='') - { - - if ($profile=='' || $url == '') return false; - - $this->loadProfile($profile); - - $source = $this->loadURL($url); - - if ($source){ - $tidy_xhtml = $this->tidyThis($source); - - $fragment = false; - - if (strrchr($url, '#')) - $fragment = array_pop(explode('#', $url)); - - $doc = $this->loadDoc($tidy_xhtml, $fragment); - $s = $this->processNodes($doc, $this->classes); - $s = $this->postProcess($profile, $s); - - return $s; - }else{ - return false; - } - } - - public function getByString($profile='', $input_xml='') - { - if ($profile=='' || $input_xml == '') return false; - - $this->loadProfile($profile); - - $doc = $this->loadDoc($input_xml); - $s = $this->processNodes($doc, $this->classes); - $s = $this->postProcess($profile, $s); - - return $s; - - } - - private function processNodes($items, $classes, $allow_includes=true){ - - $out = array(); - - foreach($items as $item){ - $data = array(); - - for ($i=0; $i<sizeof($classes); $i++){ - - if (!is_array($classes[$i])){ - - $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' " . $classes[$i] . " ')]"; - $results = $item->xpath($xpath); - - if ($results){ - foreach ($results as $result){ - if (isset($classes[$i+1]) && is_array($classes[$i+1])){ - $nodes = $this->processNodes($results, $classes[$i+1]); - if (sizeof($nodes) > 0){ - $nodes = array_merge(array('text'=>$this->getNodeValue($result, $classes[$i])), $nodes); - $data[$classes[$i]] = $nodes; - }else{ - $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]); - } - - }else{ - if (isset($data[$classes[$i]])){ - if (is_array($data[$classes[$i]])){ - // is already an array - append - $data[$classes[$i]][] = $this->getNodeValue($result, $classes[$i]); - - }else{ - // make it an array - if ($classes[$i] == 'value'){ // unless it's the 'value' of a type/value pattern - $data[$classes[$i]] .= $this->getNodeValue($result, $classes[$i]); - }else{ - $old_val = $data[$classes[$i]]; - $data[$classes[$i]] = array($old_val, $this->getNodeValue($result, $classes[$i])); - $old_val = false; - } - } - }else{ - // set as normal value - $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]); - - } - } - - // td@headers pattern - if (strtoupper(dom_import_simplexml($result)->tagName)== "TD" && $result['headers']){ - $include_ids = explode(' ', $result['headers']); - $doc = $this->doc; - foreach ($include_ids as $id){ - $xpath = "//*[@id='$id']/.."; - $includes = $doc->xpath($xpath); - foreach ($includes as $include){ - $tmp = $this->processNodes($include, $this->classes); - if (is_array($tmp)) $data = array_merge($data, $tmp); - } - } - } - } - } - } - $result = false; - } - - // include-pattern - if ($allow_includes){ - $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' include ')]"; - $results = $item->xpath($xpath); - - if ($results){ - foreach ($results as $result){ - $tagName = strtoupper(dom_import_simplexml($result)->tagName); - if ((($tagName == "OBJECT" && $result['data']) || ($tagName == "A" && $result['href'])) - && preg_match('/\binclude\b/', $result['class'])){ - $att = ($tagName == "OBJECT" ? 'data' : 'href'); - $id = str_replace('#', '', $result[$att]); - $doc = $this->doc; - $xpath = "//*[@id='$id']"; - $includes = $doc->xpath($xpath); - foreach ($includes as $include){ - $include = simplexml_load_string('<root1><root2>'.$include->asXML().'</root2></root1>'); // don't ask. - $tmp = $this->processNodes($include, $this->classes, false); - if (is_array($tmp)) $data = array_merge($data, $tmp); - } - } - } - } - } - $out[] = $data; - } - - if (sizeof($out) > 1){ - return $out; - }else if (isset($data)){ - return $data; - }else{ - return array(); - } - } - - - private function getNodeValue($node, $className) - { - - $tag_name = strtoupper(dom_import_simplexml($node)->tagName); - $s = false; - - // ignore DEL tags - if ($tag_name == 'DEL') return $s; - - // look up att map values - if (array_key_exists($className, $this->att_map)){ - - foreach ($this->att_map[$className] as $map){ - if (preg_match("/$tag_name\|/", $map)){ - $s = ''.$node[array_pop($foo = explode('|', $map))]; - } - } - } - - // if nothing and OBJ, try data. - if (!$s && $tag_name=='OBJECT' && $node['data']) $s = ''.$node['data']; - - // if nothing and IMG, try alt. - if (!$s && $tag_name=='IMG' && $node['alt']) $s = ''.$node['alt']; - - // if nothing and AREA, try alt. - if (!$s && $tag_name=='AREA' && $node['alt']) $s = ''.$node['alt']; - - //if nothing and not A, try title. - if (!$s && $tag_name!='A' && $node['title']) $s = ''.$node['title']; - - - // if nothing found, go with node text - $s = ($s ? $s : implode(array_filter($node->xpath('child::node()'), array(&$this, "filterBlankValues")), ' ')); - - // callbacks - if (array_key_exists($className, $this->callbacks)){ - $s = preg_replace_callback('/.*/', $this->callbacks[$className], $s, 1); - } - - // trim and remove line breaks - if ($tag_name != 'PRE'){ - $s = trim(preg_replace('/[\r\n\t]+/', '', $s)); - $s = trim(preg_replace('/(\s{2})+/', ' ', $s)); - } - - return $s; - } - - private function filterBlankValues($s){ - return preg_match("/\w+/", $s); - } - - - private function tidyThis($source) - { - switch ( $this->tidy_mode ) - { - case 'exec': - $tmp_file = $this->tmp_dir.md5($source).'.txt'; - file_put_contents($tmp_file, $source); - exec("tidy -utf8 -indent -asxhtml -numeric -bare -quiet $tmp_file", $tidy); - unlink($tmp_file); - return implode("\n", $tidy); - break; - - case 'php': - $tidy = tidy_parse_string($source); - return tidy_clean_repair($tidy); - break; - - default: - return $source; - break; - } - - } - - - private function loadProfile($profile) - { - require_once("$profile.profile.php"); - } - - - private function loadDoc($input_xml, $fragment=false) - { - $xml = simplexml_load_string($input_xml); - - $this->doc = $xml; - - if ($fragment){ - $doc = $xml->xpath("//*[@id='$fragment']"); - $xml = simplexml_load_string($doc[0]->asXML()); - $doc = null; - } - - // base tag - if ($xml->head->base['href']) $this->base = $xml->head->base['href']; - - // xml:base attribute - PITA with SimpleXML - preg_match('/xml:base="(.*)"/', $xml->asXML(), $matches); - if (is_array($matches) && sizeof($matches)>1) $this->base = $matches[1]; - - return $xml->xpath("//*[contains(concat(' ',normalize-space(@class),' '),' $this->root_class ')]"); - - } - - - private function loadURL($url) - { - $this->url = $url; - - if ($this->tidy_mode == 'proxy' && $this->tidy_proxy != ''){ - $url = $this->tidy_proxy . $url; - } - - return @file_get_contents($url); - - } - - - private function postProcess($profile, $s) - { - $required = $this->required; - - if (is_array($s) && array_key_exists($required[0], $s)){ - $s = array($s); - } - - $s = $this->dedupeSingles($s); - - if (function_exists('hKit_'.$profile.'_post')){ - $s = call_user_func('hKit_'.$profile.'_post', $s); - } - - $s = $this->removeTextVals($s); - - return $s; - } - - - private function resolvePath($filepath) - { // ugly code ahoy: needs a serious tidy up - - $filepath = $filepath[0]; - - $base = $this->base; - $url = $this->url; - - if ($base != '' && strpos($base, '://') !== false) - $url = $base; - - $r = parse_url($url); - $domain = $r['scheme'] . '://' . $r['host']; - - if (!isset($r['path'])) $r['path'] = '/'; - $path = explode('/', $r['path']); - $file = explode('/', $filepath); - $new = array(''); - - if (strpos($filepath, '://') !== false || strpos($filepath, 'data:') !== false){ - return $filepath; - } - - if ($file[0] == ''){ - // absolute path - return ''.$domain . implode('/', $file); - }else{ - // relative path - if ($path[sizeof($path)-1] == '') array_pop($path); - if (strpos($path[sizeof($path)-1], '.') !== false) array_pop($path); - - foreach ($file as $segment){ - if ($segment == '..'){ - array_pop($path); - }else{ - $new[] = $segment; - } - } - return ''.$domain . implode('/', $path) . implode('/', $new); - } - } - - private function resolveEmail($v) - { - $parts = parse_url($v[0]); - return ($parts['path']); - } - - - private function dedupeSingles($s) - { - $singles = $this->singles; - - foreach ($s as &$item){ - foreach ($singles as $classname){ - if (array_key_exists($classname, $item) && is_array($item[$classname])){ - if (isset($item[$classname][0])) $item[$classname] = $item[$classname][0]; - } - } - } - - return $s; - } - - private function removeTextVals($s) - { - foreach ($s as $key => &$val){ - if ($key){ - $k = $key; - }else{ - $k = ''; - } - - if (is_array($val)){ - $val = $this->removeTextVals($val); - }else{ - if ($k == 'text'){ - $val = ''; - } - } - } - - return array_filter($s); - } - - } - - -?>
\ No newline at end of file diff --git a/plugins/OStatus/lib/discoveryhints.php b/plugins/OStatus/lib/discoveryhints.php index 4da2ec0f1..1bb0ad2ae 100644 --- a/plugins/OStatus/lib/discoveryhints.php +++ b/plugins/OStatus/lib/discoveryhints.php @@ -63,54 +63,12 @@ class DiscoveryHints { static function hcardHints($body, $url) { - common_debug("starting tidy"); - - $body = self::_tidy($body, $url); - - common_debug("done with tidy"); - - set_include_path(get_include_path() . PATH_SEPARATOR . INSTALLDIR . '/plugins/OStatus/extlib/hkit/'); - require_once('hkit.class.php'); - - // hKit code is not clean for notices and warnings - $old = error_reporting(); - error_reporting($old & ~E_NOTICE & ~E_WARNING); - - $h = new hKit; - $hcards = $h->getByString('hcard', $body); - - error_reporting($old); - - if (empty($hcards)) { - return array(); - } - - if (count($hcards) == 1) { - $hcard = $hcards[0]; - } else { - foreach ($hcards as $try) { - if (array_key_exists('url', $try)) { - if (is_string($try['url']) && $try['url'] == $url) { - $hcard = $try; - break; - } else if (is_array($try['url'])) { - foreach ($try['url'] as $tryurl) { - if ($tryurl == $url) { - $hcard = $try; - break 2; - } - } - } - } - } - // last chance; grab the first one - if (empty($hcard)) { - $hcard = $hcards[0]; - } - } + $hcard = self::_hcard($body, $url); $hints = array(); + // XXX: don't copy stuff into an array and then copy it again + if (array_key_exists('nickname', $hcard)) { $hints['nickname'] = $hcard['nickname']; } @@ -122,7 +80,7 @@ class DiscoveryHints { } if (array_key_exists('photo', $hcard)) { - $hints['avatar'] = $hcard['photo']; + $hints['avatar'] = $hcard['photo'][0]; } if (array_key_exists('note', $hcard)) { @@ -149,61 +107,142 @@ class DiscoveryHints { return $hints; } - /** - * hKit needs well-formed XML for its parsing. - * We'll take the HTML body here and normalize it to XML. - * - * @param string $body HTML document source, possibly not-well-formed - * @param string $url source URL - * @return string well-formed XML document source - * @throws Exception if HTML parsing failed. - */ - private static function _tidy($body, $url) + static function _hcard($body, $url) { - if (empty($body)) { - throw new Exception("Empty HTML could not be parsed."); - } - $dom = new DOMDocument(); + // DOMDocument::loadHTML may throw warnings on unrecognized elements. + + $old = error_reporting(error_reporting() & ~E_WARNING); - // Some HTML errors will trigger warnings, but still work. - $old = error_reporting(); - error_reporting($old & ~E_WARNING); - - $ok = $dom->loadHTML($body); + $doc = new DOMDocument(); + $doc->loadHTML($body); error_reporting($old); - - if ($ok) { - // hKit doesn't give us a chance to pass the source URL for - // resolving relative links, such as the avatar photo on a - // Google profile. We'll slip it into a <base> tag if there's - // not already one present. - $bases = $dom->getElementsByTagName('base'); - if ($bases && $bases->length >= 1) { - $base = $bases->item(0); - if ($base->hasAttribute('href')) { - $base->setAttribute('href', $url); - } - } else { - $base = $dom->createElement('base'); - $base->setAttribute('href', $url); - $heads = $dom->getElementsByTagName('head'); - if ($heads || $heads->length) { - $head = $heads->item(0); - } else { - $head = $dom->createElement('head'); - $root = $dom->documentRoot; - if ($root->firstChild) { - $root->insertBefore($head, $root->firstChild); - } else { - $root->appendChild($head); - } - } - $head->appendChild($base); + + $xp = new DOMXPath($doc); + + $hcardNodes = self::_getChildrenByClass($doc->documentElement, 'vcard', $xp); + + $hcards = array(); + + for ($i = 0; $i < $hcardNodes->length; $i++) { + + $hcardNode = $hcardNodes->item($i); + + $hcard = self::_hcardFromNode($hcardNode, $xp, $url); + + $hcards[] = $hcard; + } + + $repr = null; + + foreach ($hcards as $hcard) { + if (in_array($url, $hcard['url'])) { + $repr = $hcard; + break; } - return $dom->saveXML(); + } + + if (!is_null($repr)) { + return $repr; + } else if (count($hcards) > 0) { + return $hcards[0]; } else { - throw new Exception("Invalid HTML could not be parsed."); + return null; } } + + function _getChildrenByClass($el, $cls, $xp) + { + // borrowed from hkit. Thanks dudes! + + $qry = ".//*[contains(concat(' ',normalize-space(@class),' '),' $cls ')]"; + + $nodes = $xp->query($qry, $el); + + return $nodes; + } + + function _hcardFromNode($hcardNode, $xp, $base) + { + $hcard = array(); + + $hcard['url'] = array(); + + $urlNodes = self::_getChildrenByClass($hcardNode, 'url', $xp); + + for ($j = 0; $j < $urlNodes->length; $j++) { + + $urlNode = $urlNodes->item($j); + + if ($urlNode->hasAttribute('href')) { + $url = $urlNode->getAttribute('href'); + } else { + $url = $urlNode->textContent; + } + + $hcard['url'][] = self::_rel2abs($url, $base); + } + + $hcard['photo'] = array(); + + $photoNodes = self::_getChildrenByClass($hcardNode, 'photo', $xp); + + for ($j = 0; $j < $photoNodes->length; $j++) { + $photoNode = $photoNodes->item($j); + if ($photoNode->hasAttribute('src')) { + $url = $photoNode->getAttribute('src'); + } else if ($photoNode->hasAttribute('href')) { + $url = $photoNode->getAttribute('href'); + } else { + $url = $photoNode->textContent; + } + $hcard['photo'][] = self::_rel2abs($url, $base); + } + + $singles = array('nickname', 'note', 'fn', 'n', 'adr'); + + foreach ($singles as $single) { + + $nodes = self::_getChildrenByClass($hcardNode, $single, $xp); + + if ($nodes->length > 0) { + $node = $nodes->item(0); + $hcard[$single] = $node->textContent; + } + } + + return $hcard; + } + + // XXX: this is a first pass; we probably need + // to handle things like ../ and ./ and so on + + static function _rel2abs($rel, $wrt) + { + $parts = parse_url($rel); + + if ($parts === false) { + return false; + } + + // If it's got a scheme, use it + + if ($parts['scheme'] != '') { + return $rel; + } + + $w = parse_url($wrt); + + $base = $w['scheme'].'://'.$w['host']; + + if ($rel[0] == '/') { + return $base.$rel; + } + + $wp = explode('/', $w['path']); + + array_pop($wp); + + return $base.implode('/', $wp).'/'.$rel; + } } diff --git a/tests/ActivityParseTests.php b/tests/ActivityParseTests.php index 7bf9cec7c..b6980a6bb 100644 --- a/tests/ActivityParseTests.php +++ b/tests/ActivityParseTests.php @@ -138,9 +138,38 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase $this->assertEquals($poco->urls[0]->value, 'http://example.com/blog.html'); $this->assertEquals($poco->urls[0]->primary, 'true'); $this->assertEquals($act->actor->geopoint, '37.7749295 -122.4194155'); - } + public function testExample6() + { + global $_example6; + + $dom = DOMDocument::loadXML($_example6); + + $rss = $dom->documentElement; + + $channels = $dom->getElementsByTagName('channel'); + + $channel = $channels->item(0); + + $items = $channel->getElementsByTagName('item'); + + $item = $items->item(0); + + $act = new Activity($item, $channel); + + $this->assertEquals($act->verb, ActivityVerb::POST); + + $this->assertEquals($act->id, 'http://en.blog.wordpress.com/?p=3857'); + $this->assertEquals($act->link, 'http://en.blog.wordpress.com/2010/03/03/rub-a-dub-dub-in-the-pubsubhubbub/'); + $this->assertEquals($act->title, 'Rub-a-Dub-Dub in the PubSubHubbub'); + $this->assertEquals($act->time, 1267634892); + + $actor = $act->actor; + + $this->assertFalse(empty($actor)); + $this->assertEquals($actor->title, "Joseph Scott"); + } } $_example1 = <<<EXAMPLE1 @@ -330,3 +359,67 @@ $_example5 = <<<EXAMPLE5 </entry> </feed> EXAMPLE5; + +$_example6 = <<<EXAMPLE6 +<?xml version="1.0" encoding="UTF-8"?> +<rss version="2.0" + xmlns:content="http://purl.org/rss/1.0/modules/content/" + xmlns:wfw="http://wellformedweb.org/CommentAPI/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:atom="http://www.w3.org/2005/Atom" + xmlns:sy="http://purl.org/rss/1.0/modules/syndication/" + xmlns:slash="http://purl.org/rss/1.0/modules/slash/" + xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/" + > + + <channel> + <title>WordPress.com News</title> + <atom:link href="http://en.blog.wordpress.com/feed/" rel="self" type="application/rss+xml" /> + <link>http://en.blog.wordpress.com</link> + <description>The latest news on WordPress.com and the WordPress community.</description> + <lastBuildDate>Thu, 18 Mar 2010 23:25:35 +0000</lastBuildDate> + + <generator>http://wordpress.com/</generator> + <language>en</language> + <sy:updatePeriod>hourly</sy:updatePeriod> + <sy:updateFrequency>1</sy:updateFrequency> + <cloud domain='en.blog.wordpress.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' /> + <image> + <url>http://www.gravatar.com/blavatar/e6392390e3bcfadff3671c5a5653d95b?s=96&d=http://s2.wp.com/i/buttonw-com.png</url> + <title>WordPress.com News</title> + <link>http://en.blog.wordpress.com</link> + </image> + <atom:link rel="search" type="application/opensearchdescription+xml" href="http://en.blog.wordpress.com/osd.xml" title="WordPress.com News" /> + <atom:link rel='hub' href='http://en.blog.wordpress.com/?pushpress=hub'/> + + <item> + <title>Rub-a-Dub-Dub in the PubSubHubbub</title> + <link>http://en.blog.wordpress.com/2010/03/03/rub-a-dub-dub-in-the-pubsubhubbub/</link> + <comments>http://en.blog.wordpress.com/2010/03/03/rub-a-dub-dub-in-the-pubsubhubbub/#comments</comments> + <pubDate>Wed, 03 Mar 2010 16:48:12 +0000</pubDate> + <dc:creator>Joseph Scott</dc:creator> + + <category><![CDATA[Feeds]]></category> + <category><![CDATA[atom]]></category> + <category><![CDATA[pubsubhubbub]]></category> + <category><![CDATA[rss]]></category> + + <guid isPermaLink="false">http://en.blog.wordpress.com/?p=3857</guid> + <description><![CDATA[From the tongue twisting name department we welcome PubSubHubbub, or as some people have shortened it to: PuSH. Like rssCloud, PuSH is a way for services that subscribe to updates from your blog (think Google Reader, Bloglines or Netvibes) to get updates even faster. In a nutshell, instead of having to periodically ask [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=en.blog.wordpress.com&blog=3584907&post=3857&subd=en.blog&ref=&feed=1" />]]></description> + <content:encoded><![CDATA[<p>From the tongue twisting name department we welcome <a href="http://code.google.com/p/pubsubhubbub/">PubSubHubbub</a>, or as some people have shortened it to: PuSH. Like <a href="http://en.blog.wordpress.com/2009/09/07/rss-in-the-clouds/">rssCloud</a>, PuSH is a way for services that subscribe to updates from your blog (think Google Reader, Bloglines or Netvibes) to get updates even faster. In a nutshell, instead of having to periodically ask your blog if there are any updates they can now register to automatically receive updates each time you publish new content. In most cases these updates are sent out within a second or two of when you hit the publish button.</p> + <p>Today we’ve turned on PuSH support for the more than 10.5 million blogs on WordPress.com. There’s nothing to configure, it’s working right now behind the scenes to help others keep up to date with your posts.</p> + <p>For those using the WordPress.org software we are releasing a new PuSH plugin: <a href="http://wordpress.org/extend/plugins/pushpress/">PuSHPress</a>. This plugin differs from the current PuSH related plugins by including a built-in hub.</p> + <p>For more PuSH related reading check out the <a href="http://code.google.com/p/pubsubhubbub/">PubSubHubbub project site</a> and <a href="http://groups.google.com/group/pubsubhubbub?pli=1">Google Group</a>. And if you really want to geek out there’s always the <a href="http://pubsubhubbub.googlecode.com/svn/trunk/pubsubhubbub-core-0.3.html">PubSubHubbub Spec</a> <img src='http://s.wordpress.com/wp-includes/images/smilies/icon_smile.gif' alt=':-)' class='wp-smiley' /> </p> + <br /> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/en.blog.wordpress.com/3857/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/en.blog.wordpress.com/3857/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/en.blog.wordpress.com/3857/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/en.blog.wordpress.com/3857/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/en.blog.wordpress.com/3857/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/en.blog.wordpress.com/3857/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/en.blog.wordpress.com/3857/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/en.blog.wordpress.com/3857/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/en.blog.wordpress.com/3857/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/en.blog.wordpress.com/3857/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=en.blog.wordpress.com&blog=3584907&post=3857&subd=en.blog&ref=&feed=1" />]]></content:encoded> + <wfw:commentRss>http://en.blog.wordpress.com/2010/03/03/rub-a-dub-dub-in-the-pubsubhubbub/feed/</wfw:commentRss> + + <slash:comments>96</slash:comments> + + <media:content url="http://1.gravatar.com/avatar/582b66ad5ae1b69c7601a990cb9a661a?s=96&d=identicon" medium="image"> + <media:title type="html">josephscott</media:title> + </media:content> + </item> + </channel> +</rss> +EXAMPLE6; + |