summaryrefslogtreecommitdiff
path: root/plugins/OStatus/extlib/hkit
diff options
context:
space:
mode:
authorEvan Prodromou <evan@status.net>2010-03-18 20:52:00 -0500
committerEvan Prodromou <evan@status.net>2010-03-18 20:52:00 -0500
commit17c50f338ceb574780476f6b788f48e2d7d06017 (patch)
tree1c8e7d1b332a3c17d9bc665abdd1d322da703df5 /plugins/OStatus/extlib/hkit
parentdbd44e51a2a9c5c63a6211002e5dd3b14483fb60 (diff)
Remove hkit and do our own hcard parsing
Parsing hcards for the data we need wasn't hard enough to justify using hkit. It was dependent on a number of external systems (something to run tidy), and only could handle XHTML. We now parse HTML with the PHP dom libraries used elsewhere, and scrape out our own hcards. Seems to work nicer and faster and most of all works with Google Buzz profile URLs.
Diffstat (limited to 'plugins/OStatus/extlib/hkit')
-rw-r--r--plugins/OStatus/extlib/hkit/hcard.profile.php105
-rw-r--r--plugins/OStatus/extlib/hkit/hkit.class.php475
2 files changed, 0 insertions, 580 deletions
diff --git a/plugins/OStatus/extlib/hkit/hcard.profile.php b/plugins/OStatus/extlib/hkit/hcard.profile.php
deleted file mode 100644
index 6ec0dc890..000000000
--- a/plugins/OStatus/extlib/hkit/hcard.profile.php
+++ /dev/null
@@ -1,105 +0,0 @@
-<?php
- // hcard profile for hkit
-
- $this->root_class = 'vcard';
-
- $this->classes = array(
- 'fn', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'),
- 'n', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'),
- 'adr', array('post-office-box', 'extended-address', 'street-address', 'postal-code', 'country-name', 'type', 'region', 'locality'),
- 'label', 'bday', 'agent', 'nickname', 'photo', 'class',
- 'email', array('type', 'value'),
- 'category', 'key', 'logo', 'mailer', 'note',
- 'org', array('organization-name', 'organization-unit'),
- 'tel', array('type', 'value'),
- 'geo', array('latitude', 'longitude'),
- 'tz', 'uid', 'url', 'rev', 'role', 'sort-string', 'sound', 'title'
- );
-
- // classes that must only appear once per card
- $this->singles = array(
- 'fn'
- );
-
- // classes that are required (not strictly enforced - give at least one!)
- $this->required = array(
- 'fn'
- );
-
- $this->att_map = array(
- 'fn' => array('IMG|alt'),
- 'url' => array('A|href', 'IMG|src', 'AREA|href'),
- 'photo' => array('IMG|src'),
- 'bday' => array('ABBR|title'),
- 'logo' => array('IMG|src'),
- 'email' => array('A|href'),
- 'geo' => array('ABBR|title')
- );
-
-
- $this->callbacks = array(
- 'url' => array($this, 'resolvePath'),
- 'photo' => array($this, 'resolvePath'),
- 'logo' => array($this, 'resolvePath'),
- 'email' => array($this, 'resolveEmail')
- );
-
-
-
- function hKit_hcard_post($a)
- {
-
- foreach ($a as &$vcard){
-
- hKit_implied_n_optimization($vcard);
- hKit_implied_n_from_fn($vcard);
-
- }
-
- return $a;
-
- }
-
-
- function hKit_implied_n_optimization(&$vcard)
- {
- if (array_key_exists('fn', $vcard) && !is_array($vcard['fn']) &&
- !array_key_exists('n', $vcard) && (!array_key_exists('org', $vcard) || $vcard['fn'] != $vcard['org'])){
-
- if (sizeof(explode(' ', $vcard['fn'])) == 2){
- $patterns = array();
- $patterns[] = array('/^(\S+),\s*(\S{1})$/', 2, 1); // Lastname, Initial
- $patterns[] = array('/^(\S+)\s*(\S{1})\.*$/', 2, 1); // Lastname Initial(.)
- $patterns[] = array('/^(\S+),\s*(\S+)$/', 2, 1); // Lastname, Firstname
- $patterns[] = array('/^(\S+)\s*(\S+)$/', 1, 2); // Firstname Lastname
-
- foreach ($patterns as $pattern){
- if (preg_match($pattern[0], $vcard['fn'], $matches) === 1){
- $n = array();
- $n['given-name'] = $matches[$pattern[1]];
- $n['family-name'] = $matches[$pattern[2]];
- $vcard['n'] = $n;
-
-
- break;
- }
- }
- }
- }
- }
-
-
- function hKit_implied_n_from_fn(&$vcard)
- {
- if (array_key_exists('fn', $vcard) && is_array($vcard['fn'])
- && !array_key_exists('n', $vcard) && (!array_key_exists('org', $vcard) || $vcard['fn'] != $vcard['org'])){
-
- $vcard['n'] = $vcard['fn'];
- }
-
- if (array_key_exists('fn', $vcard) && is_array($vcard['fn'])){
- $vcard['fn'] = $vcard['fn']['text'];
- }
- }
-
-?> \ No newline at end of file
diff --git a/plugins/OStatus/extlib/hkit/hkit.class.php b/plugins/OStatus/extlib/hkit/hkit.class.php
deleted file mode 100644
index c3a54cff6..000000000
--- a/plugins/OStatus/extlib/hkit/hkit.class.php
+++ /dev/null
@@ -1,475 +0,0 @@
-<?php
-
- /*
-
- hKit Library for PHP5 - a generic library for parsing Microformats
- Copyright (C) 2006 Drew McLellan
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-
- Author
- Drew McLellan - http://allinthehead.com/
-
- Contributors:
- Scott Reynen - http://www.randomchaos.com/
-
- Version 0.5, 22-Jul-2006
- fixed by-ref issue cropping up in PHP 5.0.5
- fixed a bug with a@title
- added support for new fn=n optimisation
- added support for new a.include include-pattern
- Version 0.4, 23-Jun-2006
- prevented nested includes from causing infinite loops
- returns false if URL can't be fetched
- added pre-flight check for base support level
- added deduping of once-only classnames
- prevented accumulation of multiple 'value' values
- tuned whitespace handling and treatment of DEL elements
- Version 0.3, 21-Jun-2006
- added post-processor callback method into profiles
- fixed minor problems raised by hcard testsuite
- added support for include-pattern
- added support for td@headers pattern
- added implied-n optimization into default hcard profile
- Version 0.2, 20-Jun-2006
- added class callback mechanism
- added resolvePath & resolveEmail
- added basic BASE support
- Version 0.1.1, 19-Jun-2006 (different timezone, no time machine)
- added external Tidy option
- Version 0.1, 20-Jun-2006
- initial release
-
-
-
-
- */
-
- class hKit
- {
-
- public $tidy_mode = 'proxy'; // 'proxy', 'exec', 'php' or 'none'
- public $tidy_proxy = 'http://cgi.w3.org/cgi-bin/tidy?forceXML=on&docAddr='; // required only for tidy_mode=proxy
- public $tmp_dir = '/path/to/writable/dir/'; // required only for tidy_mode=exec
-
- private $root_class = '';
- private $classes = '';
- private $singles = '';
- private $required = '';
- private $att_map = '';
- private $callbacks = '';
- private $processor = '';
-
- private $url = '';
- private $base = '';
- private $doc = '';
-
-
- public function hKit()
- {
- // pre-flight checks
- $pass = true;
- $required = array('dom_import_simplexml', 'file_get_contents', 'simplexml_load_string');
- $missing = array();
-
- foreach ($required as $f){
- if (!function_exists($f)){
- $pass = false;
- $missing[] = $f . '()';
- }
- }
-
- if (!$pass)
- die('hKit error: these required functions are not available: <strong>' . implode(', ', $missing) . '</strong>');
-
- }
-
-
- public function getByURL($profile='', $url='')
- {
-
- if ($profile=='' || $url == '') return false;
-
- $this->loadProfile($profile);
-
- $source = $this->loadURL($url);
-
- if ($source){
- $tidy_xhtml = $this->tidyThis($source);
-
- $fragment = false;
-
- if (strrchr($url, '#'))
- $fragment = array_pop(explode('#', $url));
-
- $doc = $this->loadDoc($tidy_xhtml, $fragment);
- $s = $this->processNodes($doc, $this->classes);
- $s = $this->postProcess($profile, $s);
-
- return $s;
- }else{
- return false;
- }
- }
-
- public function getByString($profile='', $input_xml='')
- {
- if ($profile=='' || $input_xml == '') return false;
-
- $this->loadProfile($profile);
-
- $doc = $this->loadDoc($input_xml);
- $s = $this->processNodes($doc, $this->classes);
- $s = $this->postProcess($profile, $s);
-
- return $s;
-
- }
-
- private function processNodes($items, $classes, $allow_includes=true){
-
- $out = array();
-
- foreach($items as $item){
- $data = array();
-
- for ($i=0; $i<sizeof($classes); $i++){
-
- if (!is_array($classes[$i])){
-
- $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' " . $classes[$i] . " ')]";
- $results = $item->xpath($xpath);
-
- if ($results){
- foreach ($results as $result){
- if (isset($classes[$i+1]) && is_array($classes[$i+1])){
- $nodes = $this->processNodes($results, $classes[$i+1]);
- if (sizeof($nodes) > 0){
- $nodes = array_merge(array('text'=>$this->getNodeValue($result, $classes[$i])), $nodes);
- $data[$classes[$i]] = $nodes;
- }else{
- $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]);
- }
-
- }else{
- if (isset($data[$classes[$i]])){
- if (is_array($data[$classes[$i]])){
- // is already an array - append
- $data[$classes[$i]][] = $this->getNodeValue($result, $classes[$i]);
-
- }else{
- // make it an array
- if ($classes[$i] == 'value'){ // unless it's the 'value' of a type/value pattern
- $data[$classes[$i]] .= $this->getNodeValue($result, $classes[$i]);
- }else{
- $old_val = $data[$classes[$i]];
- $data[$classes[$i]] = array($old_val, $this->getNodeValue($result, $classes[$i]));
- $old_val = false;
- }
- }
- }else{
- // set as normal value
- $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]);
-
- }
- }
-
- // td@headers pattern
- if (strtoupper(dom_import_simplexml($result)->tagName)== "TD" && $result['headers']){
- $include_ids = explode(' ', $result['headers']);
- $doc = $this->doc;
- foreach ($include_ids as $id){
- $xpath = "//*[@id='$id']/..";
- $includes = $doc->xpath($xpath);
- foreach ($includes as $include){
- $tmp = $this->processNodes($include, $this->classes);
- if (is_array($tmp)) $data = array_merge($data, $tmp);
- }
- }
- }
- }
- }
- }
- $result = false;
- }
-
- // include-pattern
- if ($allow_includes){
- $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' include ')]";
- $results = $item->xpath($xpath);
-
- if ($results){
- foreach ($results as $result){
- $tagName = strtoupper(dom_import_simplexml($result)->tagName);
- if ((($tagName == "OBJECT" && $result['data']) || ($tagName == "A" && $result['href']))
- && preg_match('/\binclude\b/', $result['class'])){
- $att = ($tagName == "OBJECT" ? 'data' : 'href');
- $id = str_replace('#', '', $result[$att]);
- $doc = $this->doc;
- $xpath = "//*[@id='$id']";
- $includes = $doc->xpath($xpath);
- foreach ($includes as $include){
- $include = simplexml_load_string('<root1><root2>'.$include->asXML().'</root2></root1>'); // don't ask.
- $tmp = $this->processNodes($include, $this->classes, false);
- if (is_array($tmp)) $data = array_merge($data, $tmp);
- }
- }
- }
- }
- }
- $out[] = $data;
- }
-
- if (sizeof($out) > 1){
- return $out;
- }else if (isset($data)){
- return $data;
- }else{
- return array();
- }
- }
-
-
- private function getNodeValue($node, $className)
- {
-
- $tag_name = strtoupper(dom_import_simplexml($node)->tagName);
- $s = false;
-
- // ignore DEL tags
- if ($tag_name == 'DEL') return $s;
-
- // look up att map values
- if (array_key_exists($className, $this->att_map)){
-
- foreach ($this->att_map[$className] as $map){
- if (preg_match("/$tag_name\|/", $map)){
- $s = ''.$node[array_pop($foo = explode('|', $map))];
- }
- }
- }
-
- // if nothing and OBJ, try data.
- if (!$s && $tag_name=='OBJECT' && $node['data']) $s = ''.$node['data'];
-
- // if nothing and IMG, try alt.
- if (!$s && $tag_name=='IMG' && $node['alt']) $s = ''.$node['alt'];
-
- // if nothing and AREA, try alt.
- if (!$s && $tag_name=='AREA' && $node['alt']) $s = ''.$node['alt'];
-
- //if nothing and not A, try title.
- if (!$s && $tag_name!='A' && $node['title']) $s = ''.$node['title'];
-
-
- // if nothing found, go with node text
- $s = ($s ? $s : implode(array_filter($node->xpath('child::node()'), array(&$this, "filterBlankValues")), ' '));
-
- // callbacks
- if (array_key_exists($className, $this->callbacks)){
- $s = preg_replace_callback('/.*/', $this->callbacks[$className], $s, 1);
- }
-
- // trim and remove line breaks
- if ($tag_name != 'PRE'){
- $s = trim(preg_replace('/[\r\n\t]+/', '', $s));
- $s = trim(preg_replace('/(\s{2})+/', ' ', $s));
- }
-
- return $s;
- }
-
- private function filterBlankValues($s){
- return preg_match("/\w+/", $s);
- }
-
-
- private function tidyThis($source)
- {
- switch ( $this->tidy_mode )
- {
- case 'exec':
- $tmp_file = $this->tmp_dir.md5($source).'.txt';
- file_put_contents($tmp_file, $source);
- exec("tidy -utf8 -indent -asxhtml -numeric -bare -quiet $tmp_file", $tidy);
- unlink($tmp_file);
- return implode("\n", $tidy);
- break;
-
- case 'php':
- $tidy = tidy_parse_string($source);
- return tidy_clean_repair($tidy);
- break;
-
- default:
- return $source;
- break;
- }
-
- }
-
-
- private function loadProfile($profile)
- {
- require_once("$profile.profile.php");
- }
-
-
- private function loadDoc($input_xml, $fragment=false)
- {
- $xml = simplexml_load_string($input_xml);
-
- $this->doc = $xml;
-
- if ($fragment){
- $doc = $xml->xpath("//*[@id='$fragment']");
- $xml = simplexml_load_string($doc[0]->asXML());
- $doc = null;
- }
-
- // base tag
- if ($xml->head->base['href']) $this->base = $xml->head->base['href'];
-
- // xml:base attribute - PITA with SimpleXML
- preg_match('/xml:base="(.*)"/', $xml->asXML(), $matches);
- if (is_array($matches) && sizeof($matches)>1) $this->base = $matches[1];
-
- return $xml->xpath("//*[contains(concat(' ',normalize-space(@class),' '),' $this->root_class ')]");
-
- }
-
-
- private function loadURL($url)
- {
- $this->url = $url;
-
- if ($this->tidy_mode == 'proxy' && $this->tidy_proxy != ''){
- $url = $this->tidy_proxy . $url;
- }
-
- return @file_get_contents($url);
-
- }
-
-
- private function postProcess($profile, $s)
- {
- $required = $this->required;
-
- if (is_array($s) && array_key_exists($required[0], $s)){
- $s = array($s);
- }
-
- $s = $this->dedupeSingles($s);
-
- if (function_exists('hKit_'.$profile.'_post')){
- $s = call_user_func('hKit_'.$profile.'_post', $s);
- }
-
- $s = $this->removeTextVals($s);
-
- return $s;
- }
-
-
- private function resolvePath($filepath)
- { // ugly code ahoy: needs a serious tidy up
-
- $filepath = $filepath[0];
-
- $base = $this->base;
- $url = $this->url;
-
- if ($base != '' && strpos($base, '://') !== false)
- $url = $base;
-
- $r = parse_url($url);
- $domain = $r['scheme'] . '://' . $r['host'];
-
- if (!isset($r['path'])) $r['path'] = '/';
- $path = explode('/', $r['path']);
- $file = explode('/', $filepath);
- $new = array('');
-
- if (strpos($filepath, '://') !== false || strpos($filepath, 'data:') !== false){
- return $filepath;
- }
-
- if ($file[0] == ''){
- // absolute path
- return ''.$domain . implode('/', $file);
- }else{
- // relative path
- if ($path[sizeof($path)-1] == '') array_pop($path);
- if (strpos($path[sizeof($path)-1], '.') !== false) array_pop($path);
-
- foreach ($file as $segment){
- if ($segment == '..'){
- array_pop($path);
- }else{
- $new[] = $segment;
- }
- }
- return ''.$domain . implode('/', $path) . implode('/', $new);
- }
- }
-
- private function resolveEmail($v)
- {
- $parts = parse_url($v[0]);
- return ($parts['path']);
- }
-
-
- private function dedupeSingles($s)
- {
- $singles = $this->singles;
-
- foreach ($s as &$item){
- foreach ($singles as $classname){
- if (array_key_exists($classname, $item) && is_array($item[$classname])){
- if (isset($item[$classname][0])) $item[$classname] = $item[$classname][0];
- }
- }
- }
-
- return $s;
- }
-
- private function removeTextVals($s)
- {
- foreach ($s as $key => &$val){
- if ($key){
- $k = $key;
- }else{
- $k = '';
- }
-
- if (is_array($val)){
- $val = $this->removeTextVals($val);
- }else{
- if ($k == 'text'){
- $val = '';
- }
- }
- }
-
- return array_filter($s);
- }
-
- }
-
-
-?> \ No newline at end of file