From 1301877dfe89c57c182246c0d7ba0ff6335fd17b Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Thu, 18 Mar 2010 17:08:19 -0700 Subject: OStatus discover fixes: * Subscription::start was sometimes passing users instead of profiles to hooks, which broke OStatus subscription notifications; now normalizing to profiles for processing. * H-card parsing would trigger a lot of PHP warnings and notices in hKit. Now suppressing warnings and notices for the duration of the call to keep them out of output when display_errors is on. * H-card parsing would trigger a PHP fatal error if the source page was not well-formed XML and Tidy was not present on the system. Switched normalization to use the PHP DOM module which is always present, as we have no need for Tidy's extra features here. * Trying to fetch avatars from Google profiles failed and triggered a PHP warning due to the relative URL not being resolved during h-card parsing. Now passing profile page URL into hKit by sneaking a <base> tag in while we normalize the HTML source. * Profile pages without a "Link" header could trigger PHP notices due to a bad NULL -> array(NULL) conversion in LinkHeader::getLink(). Now checking that there was a return value before converting single return value into array. --- plugins/OStatus/lib/discoveryhints.php | 91 ++++++++++++++++++++++------------ plugins/OStatus/lib/linkheader.php | 24 ++++----- 2 files changed, 71 insertions(+), 44 deletions(-) (limited to 'plugins/OStatus/lib') diff --git a/plugins/OStatus/lib/discoveryhints.php b/plugins/OStatus/lib/discoveryhints.php index db13793dd..4da2ec0f1 100644 --- a/plugins/OStatus/lib/discoveryhints.php +++ b/plugins/OStatus/lib/discoveryhints.php @@ -65,17 +65,22 @@ class DiscoveryHints { { common_debug("starting tidy"); - $body = self::_tidy($body); + $body = self::_tidy($body, $url); common_debug("done with tidy"); set_include_path(get_include_path() . PATH_SEPARATOR . INSTALLDIR . 
'/plugins/OStatus/extlib/hkit/'); require_once('hkit.class.php'); - $h = new hKit; + // hKit code is not clean for notices and warnings + $old = error_reporting(); + error_reporting($old & ~E_NOTICE & ~E_WARNING); + $h = new hKit; $hcards = $h->getByString('hcard', $body); + error_reporting($old); + if (empty($hcards)) { return array(); } @@ -144,39 +149,61 @@ class DiscoveryHints { return $hints; } - private static function _tidy($body) + /** + * hKit needs well-formed XML for its parsing. + * We'll take the HTML body here and normalize it to XML. + * + * @param string $body HTML document source, possibly not-well-formed + * @param string $url source URL + * @return string well-formed XML document source + * @throws Exception if HTML parsing failed. + */ + private static function _tidy($body, $url) { - if (function_exists('tidy_parse_string')) { - common_debug("Tidying with extension"); - $text = tidy_parse_string($body); - $text = tidy_clean_repair($text); - return $body; - } else if ($fullpath = self::_findProgram('tidy')) { - common_debug("Tidying with program $fullpath"); - $tempfile = tempnam('/tmp', 'snht'); // statusnet hcard tidy - file_put_contents($tempfile, $source); - exec("$fullpath -utf8 -indent -asxhtml -numeric -bare -quiet $tempfile", $tidy); - unlink($tempfile); - return implode("\n", $tidy); - } else { - common_debug("Not tidying."); - return $body; + if (empty($body)) { + throw new Exception("Empty HTML could not be parsed."); } - } - - private static function _findProgram($name) - { - $path = $_ENV['PATH']; - - $parts = explode(':', $path); - - foreach ($parts as $part) { - $fullpath = $part . '/' . $name; - if (is_executable($fullpath)) { - return $fullpath; + $dom = new DOMDocument(); + + // Some HTML errors will trigger warnings, but still work. 
+ $old = error_reporting(); + error_reporting($old & ~E_WARNING); + + $ok = $dom->loadHTML($body); + + error_reporting($old); + + if ($ok) { + // hKit doesn't give us a chance to pass the source URL for + // resolving relative links, such as the avatar photo on a + // Google profile. We'll slip it into a <base> tag if there's + // not already one present. + $bases = $dom->getElementsByTagName('base'); + if ($bases && $bases->length >= 1) { + $base = $bases->item(0); + if ($base->hasAttribute('href')) { + $base->setAttribute('href', $url); + } + } else { + $base = $dom->createElement('base'); + $base->setAttribute('href', $url); + $heads = $dom->getElementsByTagName('head'); + if ($heads || $heads->length) { + $head = $heads->item(0); + } else { + $head = $dom->createElement('head'); + $root = $dom->documentRoot; + if ($root->firstChild) { + $root->insertBefore($head, $root->firstChild); + } else { + $root->appendChild($head); + } + } + $head->appendChild($base); } + return $dom->saveXML(); + } else { + throw new Exception("Invalid HTML could not be parsed."); } - - return null; } } diff --git a/plugins/OStatus/lib/linkheader.php b/plugins/OStatus/lib/linkheader.php index 2f6c66dc9..afcd66d26 100644 --- a/plugins/OStatus/lib/linkheader.php +++ b/plugins/OStatus/lib/linkheader.php @@ -43,21 +43,21 @@ class LinkHeader static function getLink($response, $rel=null, $type=null) { $headers = $response->getHeader('Link'); + if ($headers) { + // Can get an array or string, so try to simplify the path + if (!is_array($headers)) { + $headers = array($headers); + } - // Can get an array or string, so try to simplify the path - if (!is_array($headers)) { - $headers = array($headers); - } - - foreach ($headers as $header) { - $lh = new LinkHeader($header); + foreach ($headers as $header) { + $lh = new LinkHeader($header); - if ((is_null($rel) || $lh->rel == $rel) && - (is_null($type) || $lh->type == $type)) { - return $lh->href; + if ((is_null($rel) || $lh->rel == $rel) && + 
(is_null($type) || $lh->type == $type)) { + return $lh->href; + } } } - return null; } -} \ No newline at end of file +} -- cgit v1.2.3-54-g00ecf From 4a6c9e445149e42a4f81d5140296e7770c60bc6c Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Thu, 18 Mar 2010 17:55:21 -0700 Subject: Work around weird bug with HTML normalization via PHP DOM module; if source had xmlns and xml:lang I ended up with double output, breaking the subsequent parsing. Will have to track this down later and report upstream if not already resolved. --- plugins/OStatus/extlib/hkit/hkit.class.php | 2 +- plugins/OStatus/lib/discoveryhints.php | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'plugins/OStatus/lib') diff --git a/plugins/OStatus/extlib/hkit/hkit.class.php b/plugins/OStatus/extlib/hkit/hkit.class.php index c3a54cff6..fec6f4d8f 100644 --- a/plugins/OStatus/extlib/hkit/hkit.class.php +++ b/plugins/OStatus/extlib/hkit/hkit.class.php @@ -472,4 +472,4 @@ } -?> \ No newline at end of file +?> diff --git a/plugins/OStatus/lib/discoveryhints.php b/plugins/OStatus/lib/discoveryhints.php index 4da2ec0f1..0273b5a92 100644 --- a/plugins/OStatus/lib/discoveryhints.php +++ b/plugins/OStatus/lib/discoveryhints.php @@ -174,6 +174,26 @@ class DiscoveryHints { error_reporting($old); if ($ok) { + // If the original had xmlns or xml:lang attributes on the + // <html>, we seem to end up with duplicates, which causes + // parse errors. Remove em! + // + // For some reason we have to iterate and remove them twice, + // *plus* they don't show up on hasAttribute() or removeAttribute(). + // This might be some weird bug in PHP or libxml2, uncertain if + // it affects other folks consistently. 
+ $root = $dom->documentElement; + foreach ($root->attributes as $i => $x) { + if ($i == 'xmlns' || $i == 'xml:lang') { + $root->removeAttributeNode($x); + } + } + foreach ($root->attributes as $i => $x) { + if ($i == 'xmlns' || $i == 'xml:lang') { + $root->removeAttributeNode($x); + } + } + // hKit doesn't give us a chance to pass the source URL for // resolving relative links, such as the avatar photo on a // Google profile. We'll slip it into a <base> tag if there's @@ -192,7 +212,6 @@ class DiscoveryHints { $head = $heads->item(0); } else { $head = $dom->createElement('head'); - $root = $dom->documentRoot; if ($root->firstChild) { $root->insertBefore($head, $root->firstChild); } else { -- cgit v1.2.3-54-g00ecf