diff options
-rw-r--r-- | plugins/OStatus/extlib/hkit/hcard.profile.php | 105 | ||||
-rw-r--r-- | plugins/OStatus/extlib/hkit/hkit.class.php | 475 | ||||
-rw-r--r-- | plugins/OStatus/lib/discoveryhints.php | 242 |
3 files changed, 131 insertions, 691 deletions
diff --git a/plugins/OStatus/extlib/hkit/hcard.profile.php b/plugins/OStatus/extlib/hkit/hcard.profile.php deleted file mode 100644 index 6ec0dc890..000000000 --- a/plugins/OStatus/extlib/hkit/hcard.profile.php +++ /dev/null @@ -1,105 +0,0 @@ -<?php - // hcard profile for hkit - - $this->root_class = 'vcard'; - - $this->classes = array( - 'fn', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'), - 'n', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'), - 'adr', array('post-office-box', 'extended-address', 'street-address', 'postal-code', 'country-name', 'type', 'region', 'locality'), - 'label', 'bday', 'agent', 'nickname', 'photo', 'class', - 'email', array('type', 'value'), - 'category', 'key', 'logo', 'mailer', 'note', - 'org', array('organization-name', 'organization-unit'), - 'tel', array('type', 'value'), - 'geo', array('latitude', 'longitude'), - 'tz', 'uid', 'url', 'rev', 'role', 'sort-string', 'sound', 'title' - ); - - // classes that must only appear once per card - $this->singles = array( - 'fn' - ); - - // classes that are required (not strictly enforced - give at least one!) - $this->required = array( - 'fn' - ); - - $this->att_map = array( - 'fn' => array('IMG|alt'), - 'url' => array('A|href', 'IMG|src', 'AREA|href'), - 'photo' => array('IMG|src'), - 'bday' => array('ABBR|title'), - 'logo' => array('IMG|src'), - 'email' => array('A|href'), - 'geo' => array('ABBR|title') - ); - - - $this->callbacks = array( - 'url' => array($this, 'resolvePath'), - 'photo' => array($this, 'resolvePath'), - 'logo' => array($this, 'resolvePath'), - 'email' => array($this, 'resolveEmail') - ); - - - - function hKit_hcard_post($a) - { - - foreach ($a as &$vcard){ - - hKit_implied_n_optimization($vcard); - hKit_implied_n_from_fn($vcard); - - } - - return $a; - - } - - - function hKit_implied_n_optimization(&$vcard) - { - if (array_key_exists('fn', $vcard) && !is_array($vcard['fn']) && - !array_key_exists('n', $vcard) && (!array_key_exists('org', $vcard) || $vcard['fn'] != $vcard['org'])){ - - if (sizeof(explode(' ', $vcard['fn'])) == 2){ - $patterns = array(); - $patterns[] = array('/^(\S+),\s*(\S{1})$/', 2, 1); // Lastname, Initial - $patterns[] = array('/^(\S+)\s*(\S{1})\.*$/', 2, 1); // Lastname Initial(.) - $patterns[] = array('/^(\S+),\s*(\S+)$/', 2, 1); // Lastname, Firstname - $patterns[] = array('/^(\S+)\s*(\S+)$/', 1, 2); // Firstname Lastname - - foreach ($patterns as $pattern){ - if (preg_match($pattern[0], $vcard['fn'], $matches) === 1){ - $n = array(); - $n['given-name'] = $matches[$pattern[1]]; - $n['family-name'] = $matches[$pattern[2]]; - $vcard['n'] = $n; - - - break; - } - } - } - } - } - - - function hKit_implied_n_from_fn(&$vcard) - { - if (array_key_exists('fn', $vcard) && is_array($vcard['fn']) - && !array_key_exists('n', $vcard) && (!array_key_exists('org', $vcard) || $vcard['fn'] != $vcard['org'])){ - - $vcard['n'] = $vcard['fn']; - } - - if (array_key_exists('fn', $vcard) && is_array($vcard['fn'])){ - $vcard['fn'] = $vcard['fn']['text']; - } - } - -?>
\ No newline at end of file diff --git a/plugins/OStatus/extlib/hkit/hkit.class.php b/plugins/OStatus/extlib/hkit/hkit.class.php deleted file mode 100644 index fec6f4d8f..000000000 --- a/plugins/OStatus/extlib/hkit/hkit.class.php +++ /dev/null @@ -1,475 +0,0 @@ -<?php - - /* - - hKit Library for PHP5 - a generic library for parsing Microformats - Copyright (C) 2006 Drew McLellan - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - - Author - Drew McLellan - http://allinthehead.com/ - - Contributors: - Scott Reynen - http://www.randomchaos.com/ - - Version 0.5, 22-Jul-2006 - fixed by-ref issue cropping up in PHP 5.0.5 - fixed a bug with a@title - added support for new fn=n optimisation - added support for new a.include include-pattern - Version 0.4, 23-Jun-2006 - prevented nested includes from causing infinite loops - returns false if URL can't be fetched - added pre-flight check for base support level - added deduping of once-only classnames - prevented accumulation of multiple 'value' values - tuned whitespace handling and treatment of DEL elements - Version 0.3, 21-Jun-2006 - added post-processor callback method into profiles - fixed minor problems raised by hcard testsuite - added support for include-pattern - added support for td@headers pattern - added implied-n optimization into default hcard profile - Version 0.2, 20-Jun-2006 - added class callback mechanism - added resolvePath & resolveEmail - added basic BASE support - Version 0.1.1, 19-Jun-2006 (different timezone, no time machine) - added external Tidy option - Version 0.1, 20-Jun-2006 - initial release - - - - - */ - - class hKit - { - - public $tidy_mode = 'proxy'; // 'proxy', 'exec', 'php' or 'none' - public $tidy_proxy = 'http://cgi.w3.org/cgi-bin/tidy?forceXML=on&docAddr='; // required only for tidy_mode=proxy - public $tmp_dir = '/path/to/writable/dir/'; // required only for tidy_mode=exec - - private $root_class = ''; - private $classes = ''; - private $singles = ''; - private $required = ''; - private $att_map = ''; - private $callbacks = ''; - private $processor = ''; - - private $url = ''; - private $base = ''; - private $doc = ''; - - - public function hKit() - { - // pre-flight checks - $pass = true; - $required = array('dom_import_simplexml', 'file_get_contents', 'simplexml_load_string'); - $missing = array(); - - foreach ($required as $f){ - if (!function_exists($f)){ - $pass = false; - $missing[] = $f . '()'; - } - } - - if (!$pass) - die('hKit error: these required functions are not available: <strong>' . implode(', ', $missing) . '</strong>'); - - } - - - public function getByURL($profile='', $url='') - { - - if ($profile=='' || $url == '') return false; - - $this->loadProfile($profile); - - $source = $this->loadURL($url); - - if ($source){ - $tidy_xhtml = $this->tidyThis($source); - - $fragment = false; - - if (strrchr($url, '#')) - $fragment = array_pop(explode('#', $url)); - - $doc = $this->loadDoc($tidy_xhtml, $fragment); - $s = $this->processNodes($doc, $this->classes); - $s = $this->postProcess($profile, $s); - - return $s; - }else{ - return false; - } - } - - public function getByString($profile='', $input_xml='') - { - if ($profile=='' || $input_xml == '') return false; - - $this->loadProfile($profile); - - $doc = $this->loadDoc($input_xml); - $s = $this->processNodes($doc, $this->classes); - $s = $this->postProcess($profile, $s); - - return $s; - - } - - private function processNodes($items, $classes, $allow_includes=true){ - - $out = array(); - - foreach($items as $item){ - $data = array(); - - for ($i=0; $i<sizeof($classes); $i++){ - - if (!is_array($classes[$i])){ - - $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' " . $classes[$i] . " ')]"; - $results = $item->xpath($xpath); - - if ($results){ - foreach ($results as $result){ - if (isset($classes[$i+1]) && is_array($classes[$i+1])){ - $nodes = $this->processNodes($results, $classes[$i+1]); - if (sizeof($nodes) > 0){ - $nodes = array_merge(array('text'=>$this->getNodeValue($result, $classes[$i])), $nodes); - $data[$classes[$i]] = $nodes; - }else{ - $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]); - } - - }else{ - if (isset($data[$classes[$i]])){ - if (is_array($data[$classes[$i]])){ - // is already an array - append - $data[$classes[$i]][] = $this->getNodeValue($result, $classes[$i]); - - }else{ - // make it an array - if ($classes[$i] == 'value'){ // unless it's the 'value' of a type/value pattern - $data[$classes[$i]] .= $this->getNodeValue($result, $classes[$i]); - }else{ - $old_val = $data[$classes[$i]]; - $data[$classes[$i]] = array($old_val, $this->getNodeValue($result, $classes[$i])); - $old_val = false; - } - } - }else{ - // set as normal value - $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]); - - } - } - - // td@headers pattern - if (strtoupper(dom_import_simplexml($result)->tagName)== "TD" && $result['headers']){ - $include_ids = explode(' ', $result['headers']); - $doc = $this->doc; - foreach ($include_ids as $id){ - $xpath = "//*[@id='$id']/.."; - $includes = $doc->xpath($xpath); - foreach ($includes as $include){ - $tmp = $this->processNodes($include, $this->classes); - if (is_array($tmp)) $data = array_merge($data, $tmp); - } - } - } - } - } - } - $result = false; - } - - // include-pattern - if ($allow_includes){ - $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' include ')]"; - $results = $item->xpath($xpath); - - if ($results){ - foreach ($results as $result){ - $tagName = strtoupper(dom_import_simplexml($result)->tagName); - if ((($tagName == "OBJECT" && $result['data']) || ($tagName == "A" && $result['href'])) - && preg_match('/\binclude\b/', $result['class'])){ - $att = ($tagName == "OBJECT" ? 'data' : 'href'); - $id = str_replace('#', '', $result[$att]); - $doc = $this->doc; - $xpath = "//*[@id='$id']"; - $includes = $doc->xpath($xpath); - foreach ($includes as $include){ - $include = simplexml_load_string('<root1><root2>'.$include->asXML().'</root2></root1>'); // don't ask. - $tmp = $this->processNodes($include, $this->classes, false); - if (is_array($tmp)) $data = array_merge($data, $tmp); - } - } - } - } - } - $out[] = $data; - } - - if (sizeof($out) > 1){ - return $out; - }else if (isset($data)){ - return $data; - }else{ - return array(); - } - } - - - private function getNodeValue($node, $className) - { - - $tag_name = strtoupper(dom_import_simplexml($node)->tagName); - $s = false; - - // ignore DEL tags - if ($tag_name == 'DEL') return $s; - - // look up att map values - if (array_key_exists($className, $this->att_map)){ - - foreach ($this->att_map[$className] as $map){ - if (preg_match("/$tag_name\|/", $map)){ - $s = ''.$node[array_pop($foo = explode('|', $map))]; - } - } - } - - // if nothing and OBJ, try data. - if (!$s && $tag_name=='OBJECT' && $node['data']) $s = ''.$node['data']; - - // if nothing and IMG, try alt. - if (!$s && $tag_name=='IMG' && $node['alt']) $s = ''.$node['alt']; - - // if nothing and AREA, try alt. - if (!$s && $tag_name=='AREA' && $node['alt']) $s = ''.$node['alt']; - - //if nothing and not A, try title. - if (!$s && $tag_name!='A' && $node['title']) $s = ''.$node['title']; - - - // if nothing found, go with node text - $s = ($s ? $s : implode(array_filter($node->xpath('child::node()'), array(&$this, "filterBlankValues")), ' ')); - - // callbacks - if (array_key_exists($className, $this->callbacks)){ - $s = preg_replace_callback('/.*/', $this->callbacks[$className], $s, 1); - } - - // trim and remove line breaks - if ($tag_name != 'PRE'){ - $s = trim(preg_replace('/[\r\n\t]+/', '', $s)); - $s = trim(preg_replace('/(\s{2})+/', ' ', $s)); - } - - return $s; - } - - private function filterBlankValues($s){ - return preg_match("/\w+/", $s); - } - - - private function tidyThis($source) - { - switch ( $this->tidy_mode ) - { - case 'exec': - $tmp_file = $this->tmp_dir.md5($source).'.txt'; - file_put_contents($tmp_file, $source); - exec("tidy -utf8 -indent -asxhtml -numeric -bare -quiet $tmp_file", $tidy); - unlink($tmp_file); - return implode("\n", $tidy); - break; - - case 'php': - $tidy = tidy_parse_string($source); - return tidy_clean_repair($tidy); - break; - - default: - return $source; - break; - } - - } - - - private function loadProfile($profile) - { - require_once("$profile.profile.php"); - } - - - private function loadDoc($input_xml, $fragment=false) - { - $xml = simplexml_load_string($input_xml); - - $this->doc = $xml; - - if ($fragment){ - $doc = $xml->xpath("//*[@id='$fragment']"); - $xml = simplexml_load_string($doc[0]->asXML()); - $doc = null; - } - - // base tag - if ($xml->head->base['href']) $this->base = $xml->head->base['href']; - - // xml:base attribute - PITA with SimpleXML - preg_match('/xml:base="(.*)"/', $xml->asXML(), $matches); - if (is_array($matches) && sizeof($matches)>1) $this->base = $matches[1]; - - return $xml->xpath("//*[contains(concat(' ',normalize-space(@class),' '),' $this->root_class ')]"); - - } - - - private function loadURL($url) - { - $this->url = $url; - - if ($this->tidy_mode == 'proxy' && $this->tidy_proxy != ''){ - $url = $this->tidy_proxy . $url; - } - - return @file_get_contents($url); - - } - - - private function postProcess($profile, $s) - { - $required = $this->required; - - if (is_array($s) && array_key_exists($required[0], $s)){ - $s = array($s); - } - - $s = $this->dedupeSingles($s); - - if (function_exists('hKit_'.$profile.'_post')){ - $s = call_user_func('hKit_'.$profile.'_post', $s); - } - - $s = $this->removeTextVals($s); - - return $s; - } - - - private function resolvePath($filepath) - { // ugly code ahoy: needs a serious tidy up - - $filepath = $filepath[0]; - - $base = $this->base; - $url = $this->url; - - if ($base != '' && strpos($base, '://') !== false) - $url = $base; - - $r = parse_url($url); - $domain = $r['scheme'] . '://' . $r['host']; - - if (!isset($r['path'])) $r['path'] = '/'; - $path = explode('/', $r['path']); - $file = explode('/', $filepath); - $new = array(''); - - if (strpos($filepath, '://') !== false || strpos($filepath, 'data:') !== false){ - return $filepath; - } - - if ($file[0] == ''){ - // absolute path - return ''.$domain . implode('/', $file); - }else{ - // relative path - if ($path[sizeof($path)-1] == '') array_pop($path); - if (strpos($path[sizeof($path)-1], '.') !== false) array_pop($path); - - foreach ($file as $segment){ - if ($segment == '..'){ - array_pop($path); - }else{ - $new[] = $segment; - } - } - return ''.$domain . implode('/', $path) . implode('/', $new); - } - } - - private function resolveEmail($v) - { - $parts = parse_url($v[0]); - return ($parts['path']); - } - - - private function dedupeSingles($s) - { - $singles = $this->singles; - - foreach ($s as &$item){ - foreach ($singles as $classname){ - if (array_key_exists($classname, $item) && is_array($item[$classname])){ - if (isset($item[$classname][0])) $item[$classname] = $item[$classname][0]; - } - } - } - - return $s; - } - - private function removeTextVals($s) - { - foreach ($s as $key => &$val){ - if ($key){ - $k = $key; - }else{ - $k = ''; - } - - if (is_array($val)){ - $val = $this->removeTextVals($val); - }else{ - if ($k == 'text'){ - $val = ''; - } - } - } - - return array_filter($s); - } - - } - - -?> diff --git a/plugins/OStatus/lib/discoveryhints.php b/plugins/OStatus/lib/discoveryhints.php index 0273b5a92..1bb0ad2ae 100644 --- a/plugins/OStatus/lib/discoveryhints.php +++ b/plugins/OStatus/lib/discoveryhints.php @@ -63,54 +63,12 @@ class DiscoveryHints { static function hcardHints($body, $url) { - common_debug("starting tidy"); - - $body = self::_tidy($body, $url); - - common_debug("done with tidy"); - - set_include_path(get_include_path() . PATH_SEPARATOR . INSTALLDIR . '/plugins/OStatus/extlib/hkit/'); - require_once('hkit.class.php'); - - // hKit code is not clean for notices and warnings - $old = error_reporting(); - error_reporting($old & ~E_NOTICE & ~E_WARNING); - - $h = new hKit; - $hcards = $h->getByString('hcard', $body); - - error_reporting($old); - - if (empty($hcards)) { - return array(); - } - - if (count($hcards) == 1) { - $hcard = $hcards[0]; - } else { - foreach ($hcards as $try) { - if (array_key_exists('url', $try)) { - if (is_string($try['url']) && $try['url'] == $url) { - $hcard = $try; - break; - } else if (is_array($try['url'])) { - foreach ($try['url'] as $tryurl) { - if ($tryurl == $url) { - $hcard = $try; - break 2; - } - } - } - } - } - // last chance; grab the first one - if (empty($hcard)) { - $hcard = $hcards[0]; - } - } + $hcard = self::_hcard($body, $url); $hints = array(); + // XXX: don't copy stuff into an array and then copy it again + if (array_key_exists('nickname', $hcard)) { $hints['nickname'] = $hcard['nickname']; } @@ -122,7 +80,7 @@ class DiscoveryHints { } if (array_key_exists('photo', $hcard)) { - $hints['avatar'] = $hcard['photo']; + $hints['avatar'] = $hcard['photo'][0]; } if (array_key_exists('note', $hcard)) { @@ -149,80 +107,142 @@ class DiscoveryHints { return $hints; } - /** - * hKit needs well-formed XML for its parsing. - * We'll take the HTML body here and normalize it to XML. - * - * @param string $body HTML document source, possibly not-well-formed - * @param string $url source URL - * @return string well-formed XML document source - * @throws Exception if HTML parsing failed. - */ - private static function _tidy($body, $url) + static function _hcard($body, $url) { - if (empty($body)) { - throw new Exception("Empty HTML could not be parsed."); - } - $dom = new DOMDocument(); + // DOMDocument::loadHTML may throw warnings on unrecognized elements. + + $old = error_reporting(error_reporting() & ~E_WARNING); - // Some HTML errors will trigger warnings, but still work. - $old = error_reporting(); - error_reporting($old & ~E_WARNING); - - $ok = $dom->loadHTML($body); + $doc = new DOMDocument(); + $doc->loadHTML($body); error_reporting($old); - - if ($ok) { - // If the original had xmlns or xml:lang attributes on the - // <html>, we seen to end up with duplicates, which causes - // parse errors. Remove em! - // - // For some reason we have to iterate and remove them twice, - // *plus* they don't show up on hasAttribute() or removeAttribute(). - // This might be some weird bug in PHP or libxml2, uncertain if - // it affects other folks consistently. - $root = $dom->documentElement; - foreach ($root->attributes as $i => $x) { - if ($i == 'xmlns' || $i == 'xml:lang') { - $root->removeAttributeNode($x); - } + + $xp = new DOMXPath($doc); + + $hcardNodes = self::_getChildrenByClass($doc->documentElement, 'vcard', $xp); + + $hcards = array(); + + for ($i = 0; $i < $hcardNodes->length; $i++) { + + $hcardNode = $hcardNodes->item($i); + + $hcard = self::_hcardFromNode($hcardNode, $xp, $url); + + $hcards[] = $hcard; + } + + $repr = null; + + foreach ($hcards as $hcard) { + if (in_array($url, $hcard['url'])) { + $repr = $hcard; + break; } - foreach ($root->attributes as $i => $x) { - if ($i == 'xmlns' || $i == 'xml:lang') { - $root->removeAttributeNode($x); - } + } + + if (!is_null($repr)) { + return $repr; + } else if (count($hcards) > 0) { + return $hcards[0]; + } else { + return null; + } + } + + function _getChildrenByClass($el, $cls, $xp) + { + // borrowed from hkit. Thanks dudes! + + $qry = ".//*[contains(concat(' ',normalize-space(@class),' '),' $cls ')]"; + + $nodes = $xp->query($qry, $el); + + return $nodes; + } + + function _hcardFromNode($hcardNode, $xp, $base) + { + $hcard = array(); + + $hcard['url'] = array(); + + $urlNodes = self::_getChildrenByClass($hcardNode, 'url', $xp); + + for ($j = 0; $j < $urlNodes->length; $j++) { + + $urlNode = $urlNodes->item($j); + + if ($urlNode->hasAttribute('href')) { + $url = $urlNode->getAttribute('href'); + } else { + $url = $urlNode->textContent; } - // hKit doesn't give us a chance to pass the source URL for - // resolving relative links, such as the avatar photo on a - // Google profile. We'll slip it into a <base> tag if there's - // not already one present. - $bases = $dom->getElementsByTagName('base'); - if ($bases && $bases->length >= 1) { - $base = $bases->item(0); - if ($base->hasAttribute('href')) { - $base->setAttribute('href', $url); - } + $hcard['url'][] = self::_rel2abs($url, $base); + } + + $hcard['photo'] = array(); + + $photoNodes = self::_getChildrenByClass($hcardNode, 'photo', $xp); + + for ($j = 0; $j < $photoNodes->length; $j++) { + $photoNode = $photoNodes->item($j); + if ($photoNode->hasAttribute('src')) { + $url = $photoNode->getAttribute('src'); + } else if ($photoNode->hasAttribute('href')) { + $url = $photoNode->getAttribute('href'); } else { - $base = $dom->createElement('base'); - $base->setAttribute('href', $url); - $heads = $dom->getElementsByTagName('head'); - if ($heads || $heads->length) { - $head = $heads->item(0); - } else { - $head = $dom->createElement('head'); - if ($root->firstChild) { - $root->insertBefore($head, $root->firstChild); - } else { - $root->appendChild($head); - } - } - $head->appendChild($base); + $url = $photoNode->textContent; } - return $dom->saveXML(); - } else { - throw new Exception("Invalid HTML could not be parsed."); + $hcard['photo'][] = self::_rel2abs($url, $base); + } + + $singles = array('nickname', 'note', 'fn', 'n', 'adr'); + + foreach ($singles as $single) { + + $nodes = self::_getChildrenByClass($hcardNode, $single, $xp); + + if ($nodes->length > 0) { + $node = $nodes->item(0); + $hcard[$single] = $node->textContent; + } + } + + return $hcard; + } + + // XXX: this is a first pass; we probably need + // to handle things like ../ and ./ and so on + + static function _rel2abs($rel, $wrt) + { + $parts = parse_url($rel); + + if ($parts === false) { + return false; } + + // If it's got a scheme, use it + + if ($parts['scheme'] != '') { + return $rel; + } + + $w = parse_url($wrt); + + $base = $w['scheme'].'://'.$w['host']; + + if ($rel[0] == '/') { + return $base.$rel; + } + + $wp = explode('/', $w['path']); + + array_pop($wp); + + return $base.implode('/', $wp).'/'.$rel; } } |