From f21f78364a9cbde2ca535a3983b384707ad097ae Mon Sep 17 00:00:00 2001 From: Evan Prodromou Date: Tue, 16 Mar 2010 11:25:18 -0500 Subject: Change the workflow to get better discovery Tried to re-structure the workflow of discovery to get more and richer data and hints. --- plugins/OStatus/lib/discovery.php | 78 ++++---------- plugins/OStatus/lib/discoveryhints.php | 182 +++++++++++++++++++++++++++++++++ plugins/OStatus/lib/feeddiscovery.php | 4 +- plugins/OStatus/lib/linkheader.php | 63 ++++++++++++ 4 files changed, 267 insertions(+), 60 deletions(-) create mode 100644 plugins/OStatus/lib/discoveryhints.php create mode 100644 plugins/OStatus/lib/linkheader.php (limited to 'plugins/OStatus/lib') diff --git a/plugins/OStatus/lib/discovery.php b/plugins/OStatus/lib/discovery.php index f8449b309..6d245677a 100644 --- a/plugins/OStatus/lib/discovery.php +++ b/plugins/OStatus/lib/discovery.php @@ -40,7 +40,7 @@ class Discovery const PROFILEPAGE = 'http://webfinger.net/rel/profile-page'; const UPDATESFROM = 'http://schemas.google.com/g/2010#updates-from'; const HCARD = 'http://microformats.org/profile/hcard'; - + public $methods = array(); public function __construct() @@ -50,12 +50,11 @@ class Discovery $this->registerMethod('Discovery_LRDD_Link_HTML'); } - public function registerMethod($class) { $this->methods[] = $class; } - + /** * Given a "user id" make sure it's normalized to either a webfinger * acct: uri or a profile HTTP URL. @@ -78,7 +77,7 @@ class Discovery public static function isWebfinger($user_id) { $uri = Discovery::normalize($user_id); - + return (substr($uri, 0, 5) == 'acct:'); } @@ -99,7 +98,7 @@ class Discovery } else { $xrd_uri = $link['href']; } - + $xrd = $this->fetchXrd($xrd_uri); if ($xrd) { return $xrd; @@ -114,14 +113,13 @@ class Discovery if (!is_array($links)) { return false; } - + foreach ($links as $link) { if ($link['rel'] == $service) { return $link; } } } - public static function applyTemplate($template, $id) { @@ -130,7 +128,6 @@ class Discovery return $template; } - public static function fetchXrd($url) { try { @@ -171,7 +168,7 @@ class Discovery_LRDD_Host_Meta implements Discovery_LRDD if ($xrd->host != $domain) { return false; } - + return $xrd->links; } } @@ -187,7 +184,7 @@ class Discovery_LRDD_Link_Header implements Discovery_LRDD } catch (HTTP_Request2_Exception $e) { return false; } - + if ($response->getStatus() != 200) { return false; } @@ -196,51 +193,17 @@ class Discovery_LRDD_Link_Header implements Discovery_LRDD if (!$link_header) { // return false; } - + return Discovery_LRDD_Link_Header::parseHeader($link_header); } protected static function parseHeader($header) { - preg_match('/^<[^>]+>/', $header, $uri_reference); - //if (empty($uri_reference)) return; - - $links = array(); - - $link_uri = trim($uri_reference[0], '<>'); - $link_rel = array(); - $link_type = null; - - // remove uri-reference from header - $header = substr($header, strlen($uri_reference[0])); - - // parse link-params - $params = explode(';', $header); - - foreach ($params as $param) { - if (empty($param)) continue; - list($param_name, $param_value) = explode('=', $param, 2); - $param_name = trim($param_name); - $param_value = preg_replace('(^"|"$)', '', trim($param_value)); - - // for now we only care about 'rel' and 'type' link params - // TODO do something with the other links-params - switch ($param_name) { - case 'rel': - $link_rel = trim($param_value); - break; - - case 'type': - $link_type = trim($param_value); - } - } - - $links[] = array( - 'href' => $link_uri, - 'rel' => $link_rel, - 'type' => $link_type); + $lh = new LinkHeader($header); - return $links; + return array('href' => $lh->href, + 'rel' => $lh->rel, + 'type' => $lh->type); } } @@ -262,49 +225,48 @@ class Discovery_LRDD_Link_HTML implements Discovery_LRDD return Discovery_LRDD_Link_HTML::parse($response->getBody()); } - public function parse($html) { $links = array(); - + preg_match('/]*)?>(.*?)<\/head>/is', $html, $head_matches); $head_html = $head_matches[2]; - + preg_match_all('/]*>/i', $head_html, $link_matches); - + foreach ($link_matches[0] as $link_html) { $link_url = null; $link_rel = null; $link_type = null; - + preg_match('/\srel=(("|\')([^\\2]*?)\\2|[^"\'\s]+)/i', $link_html, $rel_matches); if ( isset($rel_matches[3]) ) { $link_rel = $rel_matches[3]; } else if ( isset($rel_matches[1]) ) { $link_rel = $rel_matches[1]; } - + preg_match('/\shref=(("|\')([^\\2]*?)\\2|[^"\'\s]+)/i', $link_html, $href_matches); if ( isset($href_matches[3]) ) { $link_uri = $href_matches[3]; } else if ( isset($href_matches[1]) ) { $link_uri = $href_matches[1]; } - + preg_match('/\stype=(("|\')([^\\2]*?)\\2|[^"\'\s]+)/i', $link_html, $type_matches); if ( isset($type_matches[3]) ) { $link_type = $type_matches[3]; } else if ( isset($type_matches[1]) ) { $link_type = $type_matches[1]; } - + $links[] = array( 'href' => $link_url, 'rel' => $link_rel, 'type' => $link_type, ); } - + return $links; } } diff --git a/plugins/OStatus/lib/discoveryhints.php b/plugins/OStatus/lib/discoveryhints.php new file mode 100644 index 000000000..db13793dd --- /dev/null +++ b/plugins/OStatus/lib/discoveryhints.php @@ -0,0 +1,182 @@ +. + */ + +class DiscoveryHints { + + static function fromXRD($xrd) + { + $hints = array(); + + foreach ($xrd->links as $link) { + switch ($link['rel']) { + case Discovery::PROFILEPAGE: + $hints['profileurl'] = $link['href']; + break; + case Salmon::NS_REPLIES: + $hints['salmon'] = $link['href']; + break; + case Discovery::UPDATESFROM: + $hints['feedurl'] = $link['href']; + break; + case Discovery::HCARD: + $hints['hcardurl'] = $link['href']; + break; + default: + break; + } + } + + return $hints; + } + + static function fromHcardUrl($url) + { + $client = new HTTPClient(); + $client->setHeader('Accept', 'text/html,application/xhtml+xml'); + $response = $client->get($url); + + if (!$response->isOk()) { + return null; + } + + return self::hcardHints($response->getBody(), + $response->getUrl()); + } + + static function hcardHints($body, $url) + { + common_debug("starting tidy"); + + $body = self::_tidy($body); + + common_debug("done with tidy"); + + set_include_path(get_include_path() . PATH_SEPARATOR . INSTALLDIR . '/plugins/OStatus/extlib/hkit/'); + require_once('hkit.class.php'); + + $h = new hKit; + + $hcards = $h->getByString('hcard', $body); + + if (empty($hcards)) { + return array(); + } + + if (count($hcards) == 1) { + $hcard = $hcards[0]; + } else { + foreach ($hcards as $try) { + if (array_key_exists('url', $try)) { + if (is_string($try['url']) && $try['url'] == $url) { + $hcard = $try; + break; + } else if (is_array($try['url'])) { + foreach ($try['url'] as $tryurl) { + if ($tryurl == $url) { + $hcard = $try; + break 2; + } + } + } + } + } + // last chance; grab the first one + if (empty($hcard)) { + $hcard = $hcards[0]; + } + } + + $hints = array(); + + if (array_key_exists('nickname', $hcard)) { + $hints['nickname'] = $hcard['nickname']; + } + + if (array_key_exists('fn', $hcard)) { + $hints['fullname'] = $hcard['fn']; + } else if (array_key_exists('n', $hcard)) { + $hints['fullname'] = implode(' ', $hcard['n']); + } + + if (array_key_exists('photo', $hcard)) { + $hints['avatar'] = $hcard['photo']; + } + + if (array_key_exists('note', $hcard)) { + $hints['bio'] = $hcard['note']; + } + + if (array_key_exists('adr', $hcard)) { + if (is_string($hcard['adr'])) { + $hints['location'] = $hcard['adr']; + } else if (is_array($hcard['adr'])) { + $hints['location'] = implode(' ', $hcard['adr']); + } + } + + if (array_key_exists('url', $hcard)) { + if (is_string($hcard['url'])) { + $hints['homepage'] = $hcard['url']; + } else if (is_array($hcard['url'])) { + // HACK get the last one; that's how our hcards look + $hints['homepage'] = $hcard['url'][count($hcard['url'])-1]; + } + } + + return $hints; + } + + private static function _tidy($body) + { + if (function_exists('tidy_parse_string')) { + common_debug("Tidying with extension"); + $text = tidy_parse_string($body); + $text = tidy_clean_repair($text); + return $body; + } else if ($fullpath = self::_findProgram('tidy')) { + common_debug("Tidying with program $fullpath"); + $tempfile = tempnam('/tmp', 'snht'); // statusnet hcard tidy + file_put_contents($tempfile, $source); + exec("$fullpath -utf8 -indent -asxhtml -numeric -bare -quiet $tempfile", $tidy); + unlink($tempfile); + return implode("\n", $tidy); + } else { + common_debug("Not tidying."); + return $body; + } + } + + private static function _findProgram($name) + { + $path = $_ENV['PATH']; + + $parts = explode(':', $path); + + foreach ($parts as $part) { + $fullpath = $part . '/' . $name; + if (is_executable($fullpath)) { + return $fullpath; + } + } + + return null; + } +} diff --git a/plugins/OStatus/lib/feeddiscovery.php b/plugins/OStatus/lib/feeddiscovery.php index ff76b229e..f9ea3e713 100644 --- a/plugins/OStatus/lib/feeddiscovery.php +++ b/plugins/OStatus/lib/feeddiscovery.php @@ -117,7 +117,7 @@ class FeedDiscovery return $this->discoverFromURL($target, false); } } - + return $this->initFromResponse($response); } @@ -202,7 +202,7 @@ class FeedDiscovery 'application/atom+xml' => false, 'application/rss+xml' => false, ); - + $nodes = $dom->getElementsByTagName('link'); for ($i = 0; $i < $nodes->length; $i++) { $node = $nodes->item($i); diff --git a/plugins/OStatus/lib/linkheader.php b/plugins/OStatus/lib/linkheader.php new file mode 100644 index 000000000..2f6c66dc9 --- /dev/null +++ b/plugins/OStatus/lib/linkheader.php @@ -0,0 +1,63 @@ +]+>/', $str, $uri_reference); + //if (empty($uri_reference)) return; + + $this->uri = trim($uri_reference[0], '<>'); + $this->rel = array(); + $this->type = null; + + // remove uri-reference from header + $str = substr($str, strlen($uri_reference[0])); + + // parse link-params + $params = explode(';', $str); + + foreach ($params as $param) { + if (empty($param)) continue; + list($param_name, $param_value) = explode('=', $param, 2); + $param_name = trim($param_name); + $param_value = preg_replace('(^"|"$)', '', trim($param_value)); + + // for now we only care about 'rel' and 'type' link params + // TODO do something with the other links-params + switch ($param_name) { + case 'rel': + $this->rel = trim($param_value); + break; + + case 'type': + $this->type = trim($param_value); + } + } + } + + static function getLink($response, $rel=null, $type=null) + { + $headers = $response->getHeader('Link'); + + // Can get an array or string, so try to simplify the path + if (!is_array($headers)) { + $headers = array($headers); + } + + foreach ($headers as $header) { + $lh = new LinkHeader($header); + + if ((is_null($rel) || $lh->rel == $rel) && + (is_null($type) || $lh->type == $type)) { + return $lh->href; + } + } + + return null; + } +} \ No newline at end of file -- cgit v1.2.3-54-g00ecf