summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEvan Prodromou <evan@status.net>2010-02-25 19:11:39 -0500
committerEvan Prodromou <evan@status.net>2010-02-25 19:11:39 -0500
commit36f02f459b27a81e7017e3bb60b201677fd15a30 (patch)
tree6224f70744bec7442da5e1efd922c437d32c3948
parent4aa82f75ed692e254c573096168e1cd7c0890ac6 (diff)
add hkit for hCard parsing
-rw-r--r--plugins/OStatus/extlib/hkit/hcard.profile.php105
-rw-r--r--plugins/OStatus/extlib/hkit/hkit.class.php475
2 files changed, 580 insertions, 0 deletions
diff --git a/plugins/OStatus/extlib/hkit/hcard.profile.php b/plugins/OStatus/extlib/hkit/hcard.profile.php
new file mode 100644
index 000000000..6ec0dc890
--- /dev/null
+++ b/plugins/OStatus/extlib/hkit/hcard.profile.php
@@ -0,0 +1,105 @@
+<?php
+ // hcard profile for hkit
+
+ $this->root_class = 'vcard';
+
+ $this->classes = array(
+ 'fn', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'),
+ 'n', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'),
+ 'adr', array('post-office-box', 'extended-address', 'street-address', 'postal-code', 'country-name', 'type', 'region', 'locality'),
+ 'label', 'bday', 'agent', 'nickname', 'photo', 'class',
+ 'email', array('type', 'value'),
+ 'category', 'key', 'logo', 'mailer', 'note',
+ 'org', array('organization-name', 'organization-unit'),
+ 'tel', array('type', 'value'),
+ 'geo', array('latitude', 'longitude'),
+ 'tz', 'uid', 'url', 'rev', 'role', 'sort-string', 'sound', 'title'
+ );
+
+ // classes that must only appear once per card
+ $this->singles = array(
+ 'fn'
+ );
+
+ // classes that are required (not strictly enforced - give at least one!)
+ $this->required = array(
+ 'fn'
+ );
+
+ $this->att_map = array(
+ 'fn' => array('IMG|alt'),
+ 'url' => array('A|href', 'IMG|src', 'AREA|href'),
+ 'photo' => array('IMG|src'),
+ 'bday' => array('ABBR|title'),
+ 'logo' => array('IMG|src'),
+ 'email' => array('A|href'),
+ 'geo' => array('ABBR|title')
+ );
+
+
+ $this->callbacks = array(
+ 'url' => array($this, 'resolvePath'),
+ 'photo' => array($this, 'resolvePath'),
+ 'logo' => array($this, 'resolvePath'),
+ 'email' => array($this, 'resolveEmail')
+ );
+
+
+
+ function hKit_hcard_post($a)
+ {
+
+ foreach ($a as &$vcard){
+
+ hKit_implied_n_optimization($vcard);
+ hKit_implied_n_from_fn($vcard);
+
+ }
+
+ return $a;
+
+ }
+
+
+ function hKit_implied_n_optimization(&$vcard)
+ {
+ if (array_key_exists('fn', $vcard) && !is_array($vcard['fn']) &&
+ !array_key_exists('n', $vcard) && (!array_key_exists('org', $vcard) || $vcard['fn'] != $vcard['org'])){
+
+ if (sizeof(explode(' ', $vcard['fn'])) == 2){
+ $patterns = array();
+ $patterns[] = array('/^(\S+),\s*(\S{1})$/', 2, 1); // Lastname, Initial
+ $patterns[] = array('/^(\S+)\s*(\S{1})\.*$/', 2, 1); // Lastname Initial(.)
+ $patterns[] = array('/^(\S+),\s*(\S+)$/', 2, 1); // Lastname, Firstname
+ $patterns[] = array('/^(\S+)\s*(\S+)$/', 1, 2); // Firstname Lastname
+
+ foreach ($patterns as $pattern){
+ if (preg_match($pattern[0], $vcard['fn'], $matches) === 1){
+ $n = array();
+ $n['given-name'] = $matches[$pattern[1]];
+ $n['family-name'] = $matches[$pattern[2]];
+ $vcard['n'] = $n;
+
+
+ break;
+ }
+ }
+ }
+ }
+ }
+
+
+ function hKit_implied_n_from_fn(&$vcard)
+ {
+ if (array_key_exists('fn', $vcard) && is_array($vcard['fn'])
+ && !array_key_exists('n', $vcard) && (!array_key_exists('org', $vcard) || $vcard['fn'] != $vcard['org'])){
+
+ $vcard['n'] = $vcard['fn'];
+ }
+
+ if (array_key_exists('fn', $vcard) && is_array($vcard['fn'])){
+ $vcard['fn'] = $vcard['fn']['text'];
+ }
+ }
+
+?> \ No newline at end of file
diff --git a/plugins/OStatus/extlib/hkit/hkit.class.php b/plugins/OStatus/extlib/hkit/hkit.class.php
new file mode 100644
index 000000000..c3a54cff6
--- /dev/null
+++ b/plugins/OStatus/extlib/hkit/hkit.class.php
@@ -0,0 +1,475 @@
+<?php
+
+ /*
+
+ hKit Library for PHP5 - a generic library for parsing Microformats
+ Copyright (C) 2006 Drew McLellan
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ Author
+ Drew McLellan - http://allinthehead.com/
+
+ Contributors:
+ Scott Reynen - http://www.randomchaos.com/
+
+ Version 0.5, 22-Jul-2006
+ fixed by-ref issue cropping up in PHP 5.0.5
+ fixed a bug with a@title
+ added support for new fn=n optimisation
+ added support for new a.include include-pattern
+ Version 0.4, 23-Jun-2006
+ prevented nested includes from causing infinite loops
+ returns false if URL can't be fetched
+ added pre-flight check for base support level
+ added deduping of once-only classnames
+ prevented accumulation of multiple 'value' values
+ tuned whitespace handling and treatment of DEL elements
+ Version 0.3, 21-Jun-2006
+ added post-processor callback method into profiles
+ fixed minor problems raised by hcard testsuite
+ added support for include-pattern
+ added support for td@headers pattern
+ added implied-n optimization into default hcard profile
+ Version 0.2, 20-Jun-2006
+ added class callback mechanism
+ added resolvePath & resolveEmail
+ added basic BASE support
+ Version 0.1.1, 19-Jun-2006 (different timezone, no time machine)
+ added external Tidy option
+ Version 0.1, 20-Jun-2006
+ initial release
+
+
+
+
+ */
+
+ class hKit
+ {
+
+ public $tidy_mode = 'proxy'; // 'proxy', 'exec', 'php' or 'none'
+ public $tidy_proxy = 'http://cgi.w3.org/cgi-bin/tidy?forceXML=on&docAddr='; // required only for tidy_mode=proxy
+ public $tmp_dir = '/path/to/writable/dir/'; // required only for tidy_mode=exec
+
+ private $root_class = '';
+ private $classes = '';
+ private $singles = '';
+ private $required = '';
+ private $att_map = '';
+ private $callbacks = '';
+ private $processor = '';
+
+ private $url = '';
+ private $base = '';
+ private $doc = '';
+
+
+ public function hKit()
+ {
+ // pre-flight checks
+ $pass = true;
+ $required = array('dom_import_simplexml', 'file_get_contents', 'simplexml_load_string');
+ $missing = array();
+
+ foreach ($required as $f){
+ if (!function_exists($f)){
+ $pass = false;
+ $missing[] = $f . '()';
+ }
+ }
+
+ if (!$pass)
+ die('hKit error: these required functions are not available: <strong>' . implode(', ', $missing) . '</strong>');
+
+ }
+
+
+ public function getByURL($profile='', $url='')
+ {
+
+ if ($profile=='' || $url == '') return false;
+
+ $this->loadProfile($profile);
+
+ $source = $this->loadURL($url);
+
+ if ($source){
+ $tidy_xhtml = $this->tidyThis($source);
+
+ $fragment = false;
+
+ if (strrchr($url, '#'))
+ $fragment = array_pop(explode('#', $url));
+
+ $doc = $this->loadDoc($tidy_xhtml, $fragment);
+ $s = $this->processNodes($doc, $this->classes);
+ $s = $this->postProcess($profile, $s);
+
+ return $s;
+ }else{
+ return false;
+ }
+ }
+
+ public function getByString($profile='', $input_xml='')
+ {
+ if ($profile=='' || $input_xml == '') return false;
+
+ $this->loadProfile($profile);
+
+ $doc = $this->loadDoc($input_xml);
+ $s = $this->processNodes($doc, $this->classes);
+ $s = $this->postProcess($profile, $s);
+
+ return $s;
+
+ }
+
+ private function processNodes($items, $classes, $allow_includes=true){
+
+ $out = array();
+
+ foreach($items as $item){
+ $data = array();
+
+ for ($i=0; $i<sizeof($classes); $i++){
+
+ if (!is_array($classes[$i])){
+
+ $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' " . $classes[$i] . " ')]";
+ $results = $item->xpath($xpath);
+
+ if ($results){
+ foreach ($results as $result){
+ if (isset($classes[$i+1]) && is_array($classes[$i+1])){
+ $nodes = $this->processNodes($results, $classes[$i+1]);
+ if (sizeof($nodes) > 0){
+ $nodes = array_merge(array('text'=>$this->getNodeValue($result, $classes[$i])), $nodes);
+ $data[$classes[$i]] = $nodes;
+ }else{
+ $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]);
+ }
+
+ }else{
+ if (isset($data[$classes[$i]])){
+ if (is_array($data[$classes[$i]])){
+ // is already an array - append
+ $data[$classes[$i]][] = $this->getNodeValue($result, $classes[$i]);
+
+ }else{
+ // make it an array
+ if ($classes[$i] == 'value'){ // unless it's the 'value' of a type/value pattern
+ $data[$classes[$i]] .= $this->getNodeValue($result, $classes[$i]);
+ }else{
+ $old_val = $data[$classes[$i]];
+ $data[$classes[$i]] = array($old_val, $this->getNodeValue($result, $classes[$i]));
+ $old_val = false;
+ }
+ }
+ }else{
+ // set as normal value
+ $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]);
+
+ }
+ }
+
+ // td@headers pattern
+ if (strtoupper(dom_import_simplexml($result)->tagName)== "TD" && $result['headers']){
+ $include_ids = explode(' ', $result['headers']);
+ $doc = $this->doc;
+ foreach ($include_ids as $id){
+ $xpath = "//*[@id='$id']/..";
+ $includes = $doc->xpath($xpath);
+ foreach ($includes as $include){
+ $tmp = $this->processNodes($include, $this->classes);
+ if (is_array($tmp)) $data = array_merge($data, $tmp);
+ }
+ }
+ }
+ }
+ }
+ }
+ $result = false;
+ }
+
+ // include-pattern
+ if ($allow_includes){
+ $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' include ')]";
+ $results = $item->xpath($xpath);
+
+ if ($results){
+ foreach ($results as $result){
+ $tagName = strtoupper(dom_import_simplexml($result)->tagName);
+ if ((($tagName == "OBJECT" && $result['data']) || ($tagName == "A" && $result['href']))
+ && preg_match('/\binclude\b/', $result['class'])){
+ $att = ($tagName == "OBJECT" ? 'data' : 'href');
+ $id = str_replace('#', '', $result[$att]);
+ $doc = $this->doc;
+ $xpath = "//*[@id='$id']";
+ $includes = $doc->xpath($xpath);
+ foreach ($includes as $include){
+ $include = simplexml_load_string('<root1><root2>'.$include->asXML().'</root2></root1>'); // don't ask.
+ $tmp = $this->processNodes($include, $this->classes, false);
+ if (is_array($tmp)) $data = array_merge($data, $tmp);
+ }
+ }
+ }
+ }
+ }
+ $out[] = $data;
+ }
+
+ if (sizeof($out) > 1){
+ return $out;
+ }else if (isset($data)){
+ return $data;
+ }else{
+ return array();
+ }
+ }
+
+
+ private function getNodeValue($node, $className)
+ {
+
+ $tag_name = strtoupper(dom_import_simplexml($node)->tagName);
+ $s = false;
+
+ // ignore DEL tags
+ if ($tag_name == 'DEL') return $s;
+
+ // look up att map values
+ if (array_key_exists($className, $this->att_map)){
+
+ foreach ($this->att_map[$className] as $map){
+ if (preg_match("/$tag_name\|/", $map)){
+ $s = ''.$node[array_pop($foo = explode('|', $map))];
+ }
+ }
+ }
+
+ // if nothing and OBJ, try data.
+ if (!$s && $tag_name=='OBJECT' && $node['data']) $s = ''.$node['data'];
+
+ // if nothing and IMG, try alt.
+ if (!$s && $tag_name=='IMG' && $node['alt']) $s = ''.$node['alt'];
+
+ // if nothing and AREA, try alt.
+ if (!$s && $tag_name=='AREA' && $node['alt']) $s = ''.$node['alt'];
+
+ //if nothing and not A, try title.
+ if (!$s && $tag_name!='A' && $node['title']) $s = ''.$node['title'];
+
+
+ // if nothing found, go with node text
+ $s = ($s ? $s : implode(array_filter($node->xpath('child::node()'), array(&$this, "filterBlankValues")), ' '));
+
+ // callbacks
+ if (array_key_exists($className, $this->callbacks)){
+ $s = preg_replace_callback('/.*/', $this->callbacks[$className], $s, 1);
+ }
+
+ // trim and remove line breaks
+ if ($tag_name != 'PRE'){
+ $s = trim(preg_replace('/[\r\n\t]+/', '', $s));
+ $s = trim(preg_replace('/(\s{2})+/', ' ', $s));
+ }
+
+ return $s;
+ }
+
+ private function filterBlankValues($s){
+ return preg_match("/\w+/", $s);
+ }
+
+
+ private function tidyThis($source)
+ {
+ switch ( $this->tidy_mode )
+ {
+ case 'exec':
+ $tmp_file = $this->tmp_dir.md5($source).'.txt';
+ file_put_contents($tmp_file, $source);
+ exec("tidy -utf8 -indent -asxhtml -numeric -bare -quiet $tmp_file", $tidy);
+ unlink($tmp_file);
+ return implode("\n", $tidy);
+ break;
+
+ case 'php':
+ $tidy = tidy_parse_string($source);
+ return tidy_clean_repair($tidy);
+ break;
+
+ default:
+ return $source;
+ break;
+ }
+
+ }
+
+
+ private function loadProfile($profile)
+ {
+ require_once("$profile.profile.php");
+ }
+
+
+ private function loadDoc($input_xml, $fragment=false)
+ {
+ $xml = simplexml_load_string($input_xml);
+
+ $this->doc = $xml;
+
+ if ($fragment){
+ $doc = $xml->xpath("//*[@id='$fragment']");
+ $xml = simplexml_load_string($doc[0]->asXML());
+ $doc = null;
+ }
+
+ // base tag
+ if ($xml->head->base['href']) $this->base = $xml->head->base['href'];
+
+ // xml:base attribute - PITA with SimpleXML
+ preg_match('/xml:base="(.*)"/', $xml->asXML(), $matches);
+ if (is_array($matches) && sizeof($matches)>1) $this->base = $matches[1];
+
+ return $xml->xpath("//*[contains(concat(' ',normalize-space(@class),' '),' $this->root_class ')]");
+
+ }
+
+
+ private function loadURL($url)
+ {
+ $this->url = $url;
+
+ if ($this->tidy_mode == 'proxy' && $this->tidy_proxy != ''){
+ $url = $this->tidy_proxy . $url;
+ }
+
+ return @file_get_contents($url);
+
+ }
+
+
+ private function postProcess($profile, $s)
+ {
+ $required = $this->required;
+
+ if (is_array($s) && array_key_exists($required[0], $s)){
+ $s = array($s);
+ }
+
+ $s = $this->dedupeSingles($s);
+
+ if (function_exists('hKit_'.$profile.'_post')){
+ $s = call_user_func('hKit_'.$profile.'_post', $s);
+ }
+
+ $s = $this->removeTextVals($s);
+
+ return $s;
+ }
+
+
+ private function resolvePath($filepath)
+ { // ugly code ahoy: needs a serious tidy up
+
+ $filepath = $filepath[0];
+
+ $base = $this->base;
+ $url = $this->url;
+
+ if ($base != '' && strpos($base, '://') !== false)
+ $url = $base;
+
+ $r = parse_url($url);
+ $domain = $r['scheme'] . '://' . $r['host'];
+
+ if (!isset($r['path'])) $r['path'] = '/';
+ $path = explode('/', $r['path']);
+ $file = explode('/', $filepath);
+ $new = array('');
+
+ if (strpos($filepath, '://') !== false || strpos($filepath, 'data:') !== false){
+ return $filepath;
+ }
+
+ if ($file[0] == ''){
+ // absolute path
+ return ''.$domain . implode('/', $file);
+ }else{
+ // relative path
+ if ($path[sizeof($path)-1] == '') array_pop($path);
+ if (strpos($path[sizeof($path)-1], '.') !== false) array_pop($path);
+
+ foreach ($file as $segment){
+ if ($segment == '..'){
+ array_pop($path);
+ }else{
+ $new[] = $segment;
+ }
+ }
+ return ''.$domain . implode('/', $path) . implode('/', $new);
+ }
+ }
+
+ private function resolveEmail($v)
+ {
+ $parts = parse_url($v[0]);
+ return ($parts['path']);
+ }
+
+
+ private function dedupeSingles($s)
+ {
+ $singles = $this->singles;
+
+ foreach ($s as &$item){
+ foreach ($singles as $classname){
+ if (array_key_exists($classname, $item) && is_array($item[$classname])){
+ if (isset($item[$classname][0])) $item[$classname] = $item[$classname][0];
+ }
+ }
+ }
+
+ return $s;
+ }
+
+ private function removeTextVals($s)
+ {
+ foreach ($s as $key => &$val){
+ if ($key){
+ $k = $key;
+ }else{
+ $k = '';
+ }
+
+ if (is_array($val)){
+ $val = $this->removeTextVals($val);
+ }else{
+ if ($k == 'text'){
+ $val = '';
+ }
+ }
+ }
+
+ return array_filter($s);
+ }
+
+ }
+
+
+?> \ No newline at end of file