// Provenance: recovered from a scraped rendering of git patch
// 36f02f459b27a81e7017e3bb60b201677fd15a30 ("add hkit for hCard parsing",
// Evan Prodromou, Thu, 25 Feb 2010 19:11:39 -0500), which creates
//   plugins/OStatus/extlib/hkit/hcard.profile.php  (this section)
//   plugins/OStatus/extlib/hkit/hkit.class.php
//
// hCard profile: the configuration statements below assign to $this, so this
// script is meant to be included from inside a method of the parser object;
// the three functions are declared globally.
//
// NOTE(review): the scrape dropped everything between the hunk header and
// "root_class"; the "$this->" receiver on the first assignment is restored to
// match the other assignments - confirm against the real file.

// class name that marks the root element of one card
$this->root_class = 'vcard';

// Flat list of class names to extract. An array that directly follows a name
// lists the sub-properties looked up inside that property.
$this->classes = array(
    'fn', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'),
    'n', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'),
    'adr', array('post-office-box', 'extended-address', 'street-address', 'postal-code', 'country-name', 'type', 'region', 'locality'),
    'label', 'bday', 'agent', 'nickname', 'photo', 'class',
    'email', array('type', 'value'),
    'category', 'key', 'logo', 'mailer', 'note',
    'org', array('organization-name', 'organization-unit'),
    'tel', array('type', 'value'),
    'geo', array('latitude', 'longitude'),
    'tz', 'uid', 'url', 'rev', 'role', 'sort-string', 'sound', 'title'
);

// classes that must only appear once per card
$this->singles = array(
    'fn'
);

// classes that are required (not strictly enforced - give at least one!)
$this->required = array(
    'fn'
);

// class name => list of "TAG|attribute" pairs: for these tags the value is
// read from the attribute
$this->att_map = array(
    'fn'    => array('IMG|alt'),
    'url'   => array('A|href', 'IMG|src', 'AREA|href'),
    'photo' => array('IMG|src'),
    'bday'  => array('ABBR|title'),
    'logo'  => array('IMG|src'),
    'email' => array('A|href'),
    'geo'   => array('ABBR|title')
);

// class name => callback applied to the extracted value
$this->callbacks = array(
    'url'   => array($this, 'resolvePath'),
    'photo' => array($this, 'resolvePath'),
    'logo'  => array($this, 'resolvePath'),
    'email' => array($this, 'resolveEmail')
);


/**
 * Post-processing hook for the hCard profile.
 *
 * Runs both implied-"n" helpers on every card of the list.
 *
 * @param array $a list of parsed cards
 * @return array the list with implied "n" values added and "fn" flattened
 */
function hKit_hcard_post($a)
{
    foreach ($a as &$vcard){
        hKit_implied_n_optimization($vcard);
        hKit_implied_n_from_fn($vcard);
    }
    // break the reference so a later write to $vcard cannot touch the last card
    unset($vcard);

    return $a;
}


/**
 * Derives "n" (given-name / family-name) from a plain two-word "fn".
 *
 * Only acts when "fn" is a string, no "n" exists yet and "fn" differs from
 * "org". The first matching pattern wins.
 *
 * @param array $vcard one parsed card, modified in place
 * @return void
 */
function hKit_implied_n_optimization(&$vcard)
{
    // entries that are not arrays cannot carry an "fn"; array_key_exists()
    // would raise a TypeError on them under PHP 8
    if (!is_array($vcard)) return;

    if (array_key_exists('fn', $vcard) && !is_array($vcard['fn']) &&
        !array_key_exists('n', $vcard) && (!array_key_exists('org', $vcard) || $vcard['fn'] != $vcard['org'])){

        if (sizeof(explode(' ', $vcard['fn'])) == 2){
            // each entry: regex, capture group for given-name, capture group for family-name
            $patterns = array();
            $patterns[] = array('/^(\S+),\s*(\S{1})$/', 2, 1);      // Lastname, Initial
            $patterns[] = array('/^(\S+)\s*(\S{1})\.*$/', 2, 1);    // Lastname Initial(.)
            $patterns[] = array('/^(\S+),\s*(\S+)$/', 2, 1);        // Lastname, Firstname
            $patterns[] = array('/^(\S+)\s*(\S+)$/', 1, 2);         // Firstname Lastname

            foreach ($patterns as $pattern){
                if (preg_match($pattern[0], $vcard['fn'], $matches) === 1){
                    $n = array();
                    $n['given-name']  = $matches[$pattern[1]];
                    $n['family-name'] = $matches[$pattern[2]];
                    $vcard['n'] = $n;

                    break;
                }
            }
        }
    }
}


/**
 * Handles a structured "fn" (an array of sub-properties).
 *
 * Copies it to "n" when no "n" exists and "fn" differs from "org", then
 * replaces "fn" by its 'text' entry.
 *
 * @param array $vcard one parsed card, modified in place
 * @return void
 */
function hKit_implied_n_from_fn(&$vcard)
{
    if (!is_array($vcard)) return;

    // nothing to do unless "fn" was parsed into sub-properties
    if (!array_key_exists('fn', $vcard) || !is_array($vcard['fn'])) return;

    if (!array_key_exists('n', $vcard) && (!array_key_exists('org', $vcard) || $vcard['fn'] != $vcard['org'])){
        $vcard['n'] = $vcard['fn'];
    }

    // a missing 'text' entry yields null, as before, but without a notice
    $vcard['fn'] = isset($vcard['fn']['text']) ? $vcard['fn']['text'] : null;
}
''); + + } + + + public function getByURL($profile='', $url='') + { + + if ($profile=='' || $url == '') return false; + + $this->loadProfile($profile); + + $source = $this->loadURL($url); + + if ($source){ + $tidy_xhtml = $this->tidyThis($source); + + $fragment = false; + + if (strrchr($url, '#')) + $fragment = array_pop(explode('#', $url)); + + $doc = $this->loadDoc($tidy_xhtml, $fragment); + $s = $this->processNodes($doc, $this->classes); + $s = $this->postProcess($profile, $s); + + return $s; + }else{ + return false; + } + } + + public function getByString($profile='', $input_xml='') + { + if ($profile=='' || $input_xml == '') return false; + + $this->loadProfile($profile); + + $doc = $this->loadDoc($input_xml); + $s = $this->processNodes($doc, $this->classes); + $s = $this->postProcess($profile, $s); + + return $s; + + } + + private function processNodes($items, $classes, $allow_includes=true){ + + $out = array(); + + foreach($items as $item){ + $data = array(); + + for ($i=0; $ixpath($xpath); + + if ($results){ + foreach ($results as $result){ + if (isset($classes[$i+1]) && is_array($classes[$i+1])){ + $nodes = $this->processNodes($results, $classes[$i+1]); + if (sizeof($nodes) > 0){ + $nodes = array_merge(array('text'=>$this->getNodeValue($result, $classes[$i])), $nodes); + $data[$classes[$i]] = $nodes; + }else{ + $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]); + } + + }else{ + if (isset($data[$classes[$i]])){ + if (is_array($data[$classes[$i]])){ + // is already an array - append + $data[$classes[$i]][] = $this->getNodeValue($result, $classes[$i]); + + }else{ + // make it an array + if ($classes[$i] == 'value'){ // unless it's the 'value' of a type/value pattern + $data[$classes[$i]] .= $this->getNodeValue($result, $classes[$i]); + }else{ + $old_val = $data[$classes[$i]]; + $data[$classes[$i]] = array($old_val, $this->getNodeValue($result, $classes[$i])); + $old_val = false; + } + } + }else{ + // set as normal value + $data[$classes[$i]] = 
$this->getNodeValue($result, $classes[$i]); + + } + } + + // td@headers pattern + if (strtoupper(dom_import_simplexml($result)->tagName)== "TD" && $result['headers']){ + $include_ids = explode(' ', $result['headers']); + $doc = $this->doc; + foreach ($include_ids as $id){ + $xpath = "//*[@id='$id']/.."; + $includes = $doc->xpath($xpath); + foreach ($includes as $include){ + $tmp = $this->processNodes($include, $this->classes); + if (is_array($tmp)) $data = array_merge($data, $tmp); + } + } + } + } + } + } + $result = false; + } + + // include-pattern + if ($allow_includes){ + $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' include ')]"; + $results = $item->xpath($xpath); + + if ($results){ + foreach ($results as $result){ + $tagName = strtoupper(dom_import_simplexml($result)->tagName); + if ((($tagName == "OBJECT" && $result['data']) || ($tagName == "A" && $result['href'])) + && preg_match('/\binclude\b/', $result['class'])){ + $att = ($tagName == "OBJECT" ? 'data' : 'href'); + $id = str_replace('#', '', $result[$att]); + $doc = $this->doc; + $xpath = "//*[@id='$id']"; + $includes = $doc->xpath($xpath); + foreach ($includes as $include){ + $include = simplexml_load_string(''.$include->asXML().''); // don't ask. 
+ $tmp = $this->processNodes($include, $this->classes, false); + if (is_array($tmp)) $data = array_merge($data, $tmp); + } + } + } + } + } + $out[] = $data; + } + + if (sizeof($out) > 1){ + return $out; + }else if (isset($data)){ + return $data; + }else{ + return array(); + } + } + + + private function getNodeValue($node, $className) + { + + $tag_name = strtoupper(dom_import_simplexml($node)->tagName); + $s = false; + + // ignore DEL tags + if ($tag_name == 'DEL') return $s; + + // look up att map values + if (array_key_exists($className, $this->att_map)){ + + foreach ($this->att_map[$className] as $map){ + if (preg_match("/$tag_name\|/", $map)){ + $s = ''.$node[array_pop($foo = explode('|', $map))]; + } + } + } + + // if nothing and OBJ, try data. + if (!$s && $tag_name=='OBJECT' && $node['data']) $s = ''.$node['data']; + + // if nothing and IMG, try alt. + if (!$s && $tag_name=='IMG' && $node['alt']) $s = ''.$node['alt']; + + // if nothing and AREA, try alt. + if (!$s && $tag_name=='AREA' && $node['alt']) $s = ''.$node['alt']; + + //if nothing and not A, try title. + if (!$s && $tag_name!='A' && $node['title']) $s = ''.$node['title']; + + + // if nothing found, go with node text + $s = ($s ? 
$s : implode(array_filter($node->xpath('child::node()'), array(&$this, "filterBlankValues")), ' ')); + + // callbacks + if (array_key_exists($className, $this->callbacks)){ + $s = preg_replace_callback('/.*/', $this->callbacks[$className], $s, 1); + } + + // trim and remove line breaks + if ($tag_name != 'PRE'){ + $s = trim(preg_replace('/[\r\n\t]+/', '', $s)); + $s = trim(preg_replace('/(\s{2})+/', ' ', $s)); + } + + return $s; + } + + private function filterBlankValues($s){ + return preg_match("/\w+/", $s); + } + + + private function tidyThis($source) + { + switch ( $this->tidy_mode ) + { + case 'exec': + $tmp_file = $this->tmp_dir.md5($source).'.txt'; + file_put_contents($tmp_file, $source); + exec("tidy -utf8 -indent -asxhtml -numeric -bare -quiet $tmp_file", $tidy); + unlink($tmp_file); + return implode("\n", $tidy); + break; + + case 'php': + $tidy = tidy_parse_string($source); + return tidy_clean_repair($tidy); + break; + + default: + return $source; + break; + } + + } + + + private function loadProfile($profile) + { + require_once("$profile.profile.php"); + } + + + private function loadDoc($input_xml, $fragment=false) + { + $xml = simplexml_load_string($input_xml); + + $this->doc = $xml; + + if ($fragment){ + $doc = $xml->xpath("//*[@id='$fragment']"); + $xml = simplexml_load_string($doc[0]->asXML()); + $doc = null; + } + + // base tag + if ($xml->head->base['href']) $this->base = $xml->head->base['href']; + + // xml:base attribute - PITA with SimpleXML + preg_match('/xml:base="(.*)"/', $xml->asXML(), $matches); + if (is_array($matches) && sizeof($matches)>1) $this->base = $matches[1]; + + return $xml->xpath("//*[contains(concat(' ',normalize-space(@class),' '),' $this->root_class ')]"); + + } + + + private function loadURL($url) + { + $this->url = $url; + + if ($this->tidy_mode == 'proxy' && $this->tidy_proxy != ''){ + $url = $this->tidy_proxy . 
$url; + } + + return @file_get_contents($url); + + } + + + private function postProcess($profile, $s) + { + $required = $this->required; + + if (is_array($s) && array_key_exists($required[0], $s)){ + $s = array($s); + } + + $s = $this->dedupeSingles($s); + + if (function_exists('hKit_'.$profile.'_post')){ + $s = call_user_func('hKit_'.$profile.'_post', $s); + } + + $s = $this->removeTextVals($s); + + return $s; + } + + + private function resolvePath($filepath) + { // ugly code ahoy: needs a serious tidy up + + $filepath = $filepath[0]; + + $base = $this->base; + $url = $this->url; + + if ($base != '' && strpos($base, '://') !== false) + $url = $base; + + $r = parse_url($url); + $domain = $r['scheme'] . '://' . $r['host']; + + if (!isset($r['path'])) $r['path'] = '/'; + $path = explode('/', $r['path']); + $file = explode('/', $filepath); + $new = array(''); + + if (strpos($filepath, '://') !== false || strpos($filepath, 'data:') !== false){ + return $filepath; + } + + if ($file[0] == ''){ + // absolute path + return ''.$domain . implode('/', $file); + }else{ + // relative path + if ($path[sizeof($path)-1] == '') array_pop($path); + if (strpos($path[sizeof($path)-1], '.') !== false) array_pop($path); + + foreach ($file as $segment){ + if ($segment == '..'){ + array_pop($path); + }else{ + $new[] = $segment; + } + } + return ''.$domain . implode('/', $path) . 
implode('/', $new); + } + } + + private function resolveEmail($v) + { + $parts = parse_url($v[0]); + return ($parts['path']); + } + + + private function dedupeSingles($s) + { + $singles = $this->singles; + + foreach ($s as &$item){ + foreach ($singles as $classname){ + if (array_key_exists($classname, $item) && is_array($item[$classname])){ + if (isset($item[$classname][0])) $item[$classname] = $item[$classname][0]; + } + } + } + + return $s; + } + + private function removeTextVals($s) + { + foreach ($s as $key => &$val){ + if ($key){ + $k = $key; + }else{ + $k = ''; + } + + if (is_array($val)){ + $val = $this->removeTextVals($val); + }else{ + if ($k == 'text'){ + $val = ''; + } + } + } + + return array_filter($s); + } + + } + + +?> \ No newline at end of file -- cgit v1.2.3-54-g00ecf