summaryrefslogtreecommitdiff
path: root/plugins/OStatus/lib/discoveryhints.php
diff options
context:
space:
mode:
author Evan Prodromou <evan@status.net> 2010-03-18 20:52:00 -0500
committer Evan Prodromou <evan@status.net> 2010-03-18 20:52:00 -0500
commit 17c50f338ceb574780476f6b788f48e2d7d06017 (patch)
tree 1c8e7d1b332a3c17d9bc665abdd1d322da703df5 /plugins/OStatus/lib/discoveryhints.php
parent dbd44e51a2a9c5c63a6211002e5dd3b14483fb60 (diff)
Remove hkit and do our own hcard parsing
Parsing hcards for the data we need wasn't hard enough to justify using hkit. hkit depended on a number of external systems (something to run tidy) and could only handle XHTML. We now parse HTML with the PHP DOM libraries used elsewhere and scrape out our own hcards. This seems to work better and faster and, most of all, works with Google Buzz profile URLs.
Diffstat (limited to 'plugins/OStatus/lib/discoveryhints.php')
-rw-r--r-- plugins/OStatus/lib/discoveryhints.php 194
1 files changed, 130 insertions, 64 deletions
diff --git a/plugins/OStatus/lib/discoveryhints.php b/plugins/OStatus/lib/discoveryhints.php
index db13793dd..1bb0ad2ae 100644
--- a/plugins/OStatus/lib/discoveryhints.php
+++ b/plugins/OStatus/lib/discoveryhints.php
@@ -63,49 +63,12 @@ class DiscoveryHints {
static function hcardHints($body, $url)
{
- common_debug("starting tidy");
-
- $body = self::_tidy($body);
-
- common_debug("done with tidy");
-
- set_include_path(get_include_path() . PATH_SEPARATOR . INSTALLDIR . '/plugins/OStatus/extlib/hkit/');
- require_once('hkit.class.php');
-
- $h = new hKit;
-
- $hcards = $h->getByString('hcard', $body);
-
- if (empty($hcards)) {
- return array();
- }
-
- if (count($hcards) == 1) {
- $hcard = $hcards[0];
- } else {
- foreach ($hcards as $try) {
- if (array_key_exists('url', $try)) {
- if (is_string($try['url']) && $try['url'] == $url) {
- $hcard = $try;
- break;
- } else if (is_array($try['url'])) {
- foreach ($try['url'] as $tryurl) {
- if ($tryurl == $url) {
- $hcard = $try;
- break 2;
- }
- }
- }
- }
- }
- // last chance; grab the first one
- if (empty($hcard)) {
- $hcard = $hcards[0];
- }
- }
+ $hcard = self::_hcard($body, $url);
$hints = array();
+ // XXX: don't copy stuff into an array and then copy it again
+
if (array_key_exists('nickname', $hcard)) {
$hints['nickname'] = $hcard['nickname'];
}
@@ -117,7 +80,7 @@ class DiscoveryHints {
}
if (array_key_exists('photo', $hcard)) {
- $hints['avatar'] = $hcard['photo'];
+ $hints['avatar'] = $hcard['photo'][0];
}
if (array_key_exists('note', $hcard)) {
@@ -144,39 +107,142 @@ class DiscoveryHints {
return $hints;
}
- private static function _tidy($body)
+ static function _hcard($body, $url)
{
- if (function_exists('tidy_parse_string')) {
- common_debug("Tidying with extension");
- $text = tidy_parse_string($body);
- $text = tidy_clean_repair($text);
- return $body;
- } else if ($fullpath = self::_findProgram('tidy')) {
- common_debug("Tidying with program $fullpath");
- $tempfile = tempnam('/tmp', 'snht'); // statusnet hcard tidy
- file_put_contents($tempfile, $source);
- exec("$fullpath -utf8 -indent -asxhtml -numeric -bare -quiet $tempfile", $tidy);
- unlink($tempfile);
- return implode("\n", $tidy);
+ // DOMDocument::loadHTML may throw warnings on unrecognized elements.
+
+ $old = error_reporting(error_reporting() & ~E_WARNING);
+
+ $doc = new DOMDocument();
+ $doc->loadHTML($body);
+
+ error_reporting($old);
+
+ $xp = new DOMXPath($doc);
+
+ $hcardNodes = self::_getChildrenByClass($doc->documentElement, 'vcard', $xp);
+
+ $hcards = array();
+
+ for ($i = 0; $i < $hcardNodes->length; $i++) {
+
+ $hcardNode = $hcardNodes->item($i);
+
+ $hcard = self::_hcardFromNode($hcardNode, $xp, $url);
+
+ $hcards[] = $hcard;
+ }
+
+ $repr = null;
+
+ foreach ($hcards as $hcard) {
+ if (in_array($url, $hcard['url'])) {
+ $repr = $hcard;
+ break;
+ }
+ }
+
+ if (!is_null($repr)) {
+ return $repr;
+ } else if (count($hcards) > 0) {
+ return $hcards[0];
} else {
- common_debug("Not tidying.");
- return $body;
+ return null;
}
}
- private static function _findProgram($name)
+ function _getChildrenByClass($el, $cls, $xp)
+ {
+ // borrowed from hkit. Thanks dudes!
+
+ $qry = ".//*[contains(concat(' ',normalize-space(@class),' '),' $cls ')]";
+
+ $nodes = $xp->query($qry, $el);
+
+ return $nodes;
+ }
+
+ function _hcardFromNode($hcardNode, $xp, $base)
{
- $path = $_ENV['PATH'];
+ $hcard = array();
+
+ $hcard['url'] = array();
+
+ $urlNodes = self::_getChildrenByClass($hcardNode, 'url', $xp);
+
+ for ($j = 0; $j < $urlNodes->length; $j++) {
+
+ $urlNode = $urlNodes->item($j);
+
+ if ($urlNode->hasAttribute('href')) {
+ $url = $urlNode->getAttribute('href');
+ } else {
+ $url = $urlNode->textContent;
+ }
+
+ $hcard['url'][] = self::_rel2abs($url, $base);
+ }
+
+ $hcard['photo'] = array();
+
+ $photoNodes = self::_getChildrenByClass($hcardNode, 'photo', $xp);
+
+ for ($j = 0; $j < $photoNodes->length; $j++) {
+ $photoNode = $photoNodes->item($j);
+ if ($photoNode->hasAttribute('src')) {
+ $url = $photoNode->getAttribute('src');
+ } else if ($photoNode->hasAttribute('href')) {
+ $url = $photoNode->getAttribute('href');
+ } else {
+ $url = $photoNode->textContent;
+ }
+ $hcard['photo'][] = self::_rel2abs($url, $base);
+ }
+
+ $singles = array('nickname', 'note', 'fn', 'n', 'adr');
- $parts = explode(':', $path);
+ foreach ($singles as $single) {
- foreach ($parts as $part) {
- $fullpath = $part . '/' . $name;
- if (is_executable($fullpath)) {
- return $fullpath;
+ $nodes = self::_getChildrenByClass($hcardNode, $single, $xp);
+
+ if ($nodes->length > 0) {
+ $node = $nodes->item(0);
+ $hcard[$single] = $node->textContent;
}
}
- return null;
+ return $hcard;
+ }
+
+ // XXX: this is a first pass; we probably need
+ // to handle things like ../ and ./ and so on
+
+ static function _rel2abs($rel, $wrt)
+ {
+ $parts = parse_url($rel);
+
+ if ($parts === false) {
+ return false;
+ }
+
+ // If it's got a scheme, use it
+
+ if ($parts['scheme'] != '') {
+ return $rel;
+ }
+
+ $w = parse_url($wrt);
+
+ $base = $w['scheme'].'://'.$w['host'];
+
+ if ($rel[0] == '/') {
+ return $base.$rel;
+ }
+
+ $wp = explode('/', $w['path']);
+
+ array_pop($wp);
+
+ return $base.implode('/', $wp).'/'.$rel;
}
}