* * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 * or see http://www.gnu.org/
 *
 * @ingroup DifferenceEngine
 */

/**
 * When detecting the last common parent of two nodes, all results are stored as
 * a LastCommonParentResult.
 */
class LastCommonParentResult {

    // Parent
    public $parent;

    // Splitting
    public $splittingNeeded = false;

    // Depth
    public $lastCommonParentDepth = -1;

    // Index
    public $indexInLastCommonParent = -1;
}

/**
 * Describes the modification attached to a leaf node: its type (one of the
 * class constants), the id of the run it belongs to, whether it is the first
 * leaf of that run, and - for CHANGED modifications - the change description.
 */
class Modification{

    const NONE = 1;
    const REMOVED = 2;
    const ADDED = 4;
    const CHANGED = 8;

    public $type;

    public $id = -1;

    public $firstOfID = false;

    public $changes;

    /**
     * @param int $type One of the Modification::* constants
     */
    function __construct($type) {
        $this->type = $type;
    }

    /**
     * Map a modification type constant to its lowercase name.
     *
     * @param int $type One of the Modification::* constants
     * @return string|null The name, or null when $type is not a known constant
     */
    public static function typeToString($type) {
        switch($type) {
            case self::NONE:
                return 'none';
            case self::REMOVED:
                return 'removed';
            case self::ADDED:
                return 'added';
            case self::CHANGED:
                return 'changed';
        }
        // Unknown type: same result as falling off the end of the function.
        return null;
    }
}

/**
 * Builds a node tree out of XML parser callbacks (startElement, endElement,
 * characters) and collects every leaf (text and image nodes) in document
 * order in $textNodes.
 */
class DomTreeBuilder {

    public $textNodes = array();

    public $bodyNode;

    private $currentParent;

    // Characters of the word currently being accumulated
    private $newWord = '';

    // NOTE(review): $bodyStarted / $bodyEnded are not referenced in the
    // visible code; kept because they are protected and may be used elsewhere.
    protected $bodyStarted = false;

    protected $bodyEnded = false;

    private $whiteSpaceBeforeThis = false;

    private $lastSibling;

    private $notInPre = true;

    function __construct() {
        $this->bodyNode = $this->currentParent = new BodyNode();
        $this->lastSibling = new DummyNode();
    }

    /**
     * Must be called manually
     */
    public function endDocument() {
        $this->endWord();
        HTMLDiffer::diffDebug( count($this->textNodes) . " text nodes in document.\n" );
    }

    /**
     * XML parser callback for an opening tag. Every tag except 'body' gets a
     * new TagNode appended to the current parent and becomes the new current
     * parent.
     *
     * @param resource $parser The XML parser (unused)
     * @param string $name Tag name
     * @param array $attributes Tag attributes
     */
    public function startElement($parser, $name, /*array*/ $attributes) {
        if (strcasecmp($name, 'body') != 0) {
            HTMLDiffer::diffDebug( "Starting $name node.\n" );
            // A tag boundary always terminates the word in progress.
            $this->endWord();

            $newNode = new TagNode($this->currentParent, $name, $attributes);
            $this->currentParent->children[] = $newNode;
            $this->currentParent = $newNode;
            $this->lastSibling = new DummyNode();
            // Pending whitespace is only carried onto tags not listed in TagNode::$blocks.
            if ($this->whiteSpaceBeforeThis && !in_array(strtolower($this->currentParent->qName),TagNode::$blocks)) {
                $this->currentParent->whiteBefore = true;
            }
            $this->whiteSpaceBeforeThis = false;
            if(strcasecmp($name, 'pre') == 0) {
                $this->notInPre = false;
            }
        }
    }

    /**
     * XML parser callback for a closing tag. Closing 'body' finishes the
     * document; any other tag pops the current parent. An 'img' tag
     * additionally gets an ImageNode leaf so that it takes part in the diff.
     *
     * @param resource $parser The XML parser (unused)
     * @param string $name Tag name
     */
    public function endElement($parser, $name) {
        if(strcasecmp($name, 'body') != 0) {
            HTMLDiffer::diffDebug( "Ending $name node.\n");
            if (0 == strcasecmp($name,'img')) {
                // Insert a dummy leaf for the image
                $img = new ImageNode($this->currentParent, $this->currentParent->attributes);
                $this->currentParent->children[] = $img;
                $img->whiteBefore = $this->whiteSpaceBeforeThis;
                $this->lastSibling = $img;
                $this->textNodes[] = $img;
            }
            $this->endWord();
            // Only a tag not listed in TagNode::$blocks can act as the
            // sibling that later receives trailing whitespace.
            if (!in_array(strtolower($this->currentParent->qName),TagNode::$blocks)) {
                $this->lastSibling = $this->currentParent;
            } else {
                $this->lastSibling = new DummyNode();
            }
            $this->currentParent = $this->currentParent->parent;
            $this->whiteSpaceBeforeThis = false;
            if (!$this->notInPre && strcasecmp($name, 'pre') == 0) {
                $this->notInPre = true;
            }
        } else {
            $this->endDocument();
        }
    }

    const regex = '/([\s\.\,\"\\\'\(\)\?\:\;\!\{\}\-\+\*\=\_\[\]\&\|\$]{1})/';
    const whitespace = '/^[\s]{1}$/';
    const delimiter = '/^[\s\.\,\"\\\'\(\)\?\:\;\!\{\}\-\+\*\=\_\[\]\&\|\$]{1}$/';

    /**
     * XML parser callback for character data. Splits the data on the
     * delimiter characters of self::regex: whitespace (outside 'pre') ends
     * the current word and is remembered as a flag, any other delimiter
     * becomes its own text node, and everything else is appended to the word
     * in progress.
     *
     * @param resource $parser The XML parser (unused)
     * @param string $data Character data
     */
    public function characters($parser, $data) {
        $matches = preg_split(self::regex, $data, -1, PREG_SPLIT_DELIM_CAPTURE);
        // Iterate by value: the pieces are only read, so no reference is needed.
        foreach($matches as $word) {
            if (preg_match(self::whitespace, $word) && $this->notInPre) {
                $this->endWord();
                $this->lastSibling->whiteAfter = true;
                $this->whiteSpaceBeforeThis = true;
            } else if (preg_match(self::delimiter, $word)) {
                $this->endWord();
                $textNode = new TextNode($this->currentParent, $word);
                $this->currentParent->children[] = $textNode;
                $textNode->whiteBefore = $this->whiteSpaceBeforeThis;
                $this->whiteSpaceBeforeThis = false;
                $this->lastSibling = $textNode;
                $this->textNodes[] = $textNode;
            } else {
                $this->newWord .= $word;
            }
        }
    }

    /**
     * Flush the word in progress (if any) into a new TextNode under the
     * current parent and register it as a leaf.
     */
    private function endWord() {
        if ($this->newWord !== '') {
            $node = new TextNode($this->currentParent, $this->newWord);
            $this->currentParent->children[] = $node;
            $node->whiteBefore = $this->whiteSpaceBeforeThis;
            $this->whiteSpaceBeforeThis = false;
            $this->lastSibling = $node;
            $this->textNodes[] = $node;
            $this->newWord = "";
        }
    }

    /**
     * @return array The result of TextNode::toDiffLine for every collected leaf
     */
    public function getDiffLines() {
        return array_map(array('TextNode','toDiffLine'), $this->textNodes);
    }
}

/**
 * Marks the leaves of the new tree as added or changed, and moves copies of
 * the leaves deleted from the old tree into the new tree.
 */
class TextNodeDiffer {

    private $textNodes;

    public $bodyNode;

    private $oldTextNodes;

    private $oldBodyNode;

    private $newID = 0;

    private $changedID = 0;

    private $changedIDUsed = false;

    // Change description of the most recently marked CHANGED leaf; used to
    // detect when a new run of changes starts.
    private $changes = null;

    // used to remove the whitespace between a red and green block
    private $whiteAfterLastChangedPart = false;

    private $deletedID = 0;

    /**
     * @param DomTreeBuilder $tree The new document
     * @param DomTreeBuilder $oldTree The old document
     */
    function __construct(DomTreeBuilder $tree, DomTreeBuilder $oldTree) {
        $this->textNodes = $tree->textNodes;
        $this->bodyNode = $tree->bodyNode;
        $this->oldTextNodes = $oldTree->textNodes;
        $this->oldBodyNode = $oldTree->bodyNode;
    }

    /**
     * Mark the leaves [$start, $end) of the new tree as ADDED, all sharing
     * one id.
     *
     * @param int $start First leaf index (inclusive)
     * @param int $end Last leaf index (exclusive)
     */
    public function markAsNew($start, $end) {
        if ($end <= $start) {
            return;
        }

        if ($this->whiteAfterLastChangedPart) {
            $this->textNodes[$start]->whiteBefore = false;
        }

        for ($i = $start; $i < $end; ++$i) {
            $mod = new Modification(Modification::ADDED);
            $mod->id = $this->newID;
            $this->textNodes[$i]->modification = $mod;
        }
        // The range is known to be non-empty here because of the guard above.
        $this->textNodes[$start]->modification->firstOfID = true;
        ++$this->newID;
    }

    /**
     * Walk two textually equal ranges in parallel and mark every new leaf
     * whose ancestor chain differs from its old counterpart as CHANGED.
     * Consecutive leaves with the same change description share one id.
     *
     * @param int $leftstart First index in the old leaves
     * @param int $leftend End index in the old leaves (not read by this method)
     * @param int $rightstart First index in the new leaves (inclusive)
     * @param int $rightend Last index in the new leaves (exclusive)
     */
    public function handlePossibleChangedPart($leftstart, $leftend, $rightstart, $rightend) {
        $i = $rightstart;
        $j = $leftstart;

        if ($this->changedIDUsed) {
            ++$this->changedID;
            $this->changedIDUsed = false;
        }

        while ($i < $rightend) {
            $acthis = new AncestorComparator($this->textNodes[$i]->getParentTree());
            $acother = new AncestorComparator($this->oldTextNodes[$j]->getParentTree());
            $result = $acthis->getResult($acother);
            unset($acthis, $acother);

            if ( $result ) {
                $mod = new Modification(Modification::CHANGED);

                if (!$this->changedIDUsed) {
                    $mod->firstOfID = true;
                } else if ($result !== $this->changes) {
                    // A different change description starts a new run.
                    ++$this->changedID;
                    $mod->firstOfID = true;
                }

                $mod->changes = $result;
                $mod->id = $this->changedID;

                $this->textNodes[$i]->modification = $mod;
                $this->changes = $result;
                $this->changedIDUsed = true;
            } else if ($this->changedIDUsed) {
                ++$this->changedID;
                $this->changedIDUsed = false;
            }
            ++$i;
            ++$j;
        }
    }

    /**
     * Mark the old leaves [$start, $end) as REMOVED and insert copies of the
     * minimal set of deleted nodes into the new tree around new leaf index
     * $before.
     *
     * @param int $start First old leaf index (inclusive)
     * @param int $end Last old leaf index (exclusive)
     * @param int $before Index in the new leaves before which the deleted
     *   content is inserted
     */
    public function markAsDeleted($start, $end, $before) {

        if ($end <= $start) {
            return;
        }

        $this->whiteAfterLastChangedPart =
            ($before > 0 && $this->textNodes[$before - 1]->whiteAfter);

        for ($i = $start; $i < $end; ++$i) {
            $mod = new Modification(Modification::REMOVED);
            $mod->id = $this->deletedID;

            // oldTextNodes is used here because we're going to move its deleted
            // elements to this tree!
            $this->oldTextNodes[$i]->modification = $mod;
        }
        $this->oldTextNodes[$start]->modification->firstOfID = true;

        $root = $this->oldTextNodes[$start]->getLastCommonParent($this->oldTextNodes[$end-1])->parent;

        $junk1 = $junk2 = null;
        $deletedNodes = $root->getMinimalDeletedSet($this->deletedID, $junk1, $junk2);

        HTMLDiffer::diffDebug( "Minimal set of deleted nodes of size " . count($deletedNodes) . "\n" );

        // Set prevLeaf to the leaf after which the old HTML needs to be
        // inserted
        if ($before > 0) {
            $prevLeaf = $this->textNodes[$before - 1];
        }
        // Set nextLeaf to the leaf before which the old HTML needs to be
        // inserted
        if ($before < count($this->textNodes)) {
            $nextLeaf = $this->textNodes[$before];
        }

        while (count($deletedNodes) > 0) {
            $lastIndex = count($deletedNodes) - 1;

            if (isset($prevLeaf)) {
                $prevResult = $prevLeaf->getLastCommonParent($deletedNodes[0]);
            } else {
                $prevResult = new LastCommonParentResult();
                $prevResult->parent = $this->bodyNode;
                $prevResult->indexInLastCommonParent = -1;
            }
            // Variable names are case-sensitive: this must test $nextLeaf,
            // otherwise the body-node fallback is always taken.
            if (isset($nextLeaf)) {
                $nextResult = $nextLeaf->getLastCommonParent($deletedNodes[$lastIndex]);
            } else {
                $nextResult = new LastCommonParentResult();
                $nextResult->parent = $this->bodyNode;
                $nextResult->indexInLastCommonParent = $this->bodyNode->getNbChildren();
            }

            if ($prevResult->lastCommonParentDepth == $nextResult->lastCommonParentDepth) {
                // We need some metric to choose which way to add-...
                if ($deletedNodes[0]->parent === $deletedNodes[$lastIndex]->parent
                        && $prevResult->parent === $nextResult->parent) {
                    // The difference is not in the parent
                    $prevResult->lastCommonParentDepth = $prevResult->lastCommonParentDepth + 1;
                } else {
                    // The difference is in the parent, so compare them
                    // now THIS is tricky
                    $distancePrev = $deletedNodes[0]->parent->getMatchRatio($prevResult->parent);
                    $distanceNext = $deletedNodes[$lastIndex]->parent->getMatchRatio($nextResult->parent);

                    if ($distancePrev <= $distanceNext) {
                        $prevResult->lastCommonParentDepth = $prevResult->lastCommonParentDepth + 1;
                    } else {
                        $nextResult->lastCommonParentDepth = $nextResult->lastCommonParentDepth + 1;
                    }
                }
            }

            if ($prevResult->lastCommonParentDepth > $nextResult->lastCommonParentDepth) {
                // Inserting at the front
                if ($prevResult->splittingNeeded) {
                    $prevLeaf->parent->splitUntil($prevResult->parent, $prevLeaf, true);
                }
                // Take the first deleted node; array_shift also reindexes the list.
                $prevLeaf = array_shift($deletedNodes)->copyTree();
                $prevLeaf->setParent($prevResult->parent);
                $prevResult->parent->addChildAbsolute($prevLeaf,$prevResult->indexInLastCommonParent + 1);
            } else if ($prevResult->lastCommonParentDepth < $nextResult->lastCommonParentDepth) {
                // Inserting at the back
                if ($nextResult->splittingNeeded) {
                    $splitOccured = $nextLeaf->parent->splitUntil($nextResult->parent, $nextLeaf, false);
                    if ($splitOccured) {
                        // The place where to insert is shifted one place to the
                        // right
                        $nextResult->indexInLastCommonParent = $nextResult->indexInLastCommonParent + 1;
                    }
                }
                // Take the last deleted node.
                $nextLeaf = array_pop($deletedNodes)->copyTree();
                $nextLeaf->setParent($nextResult->parent);
                $nextResult->parent->addChildAbsolute($nextLeaf,$nextResult->indexInLastCommonParent);
            }
        }
        ++$this->deletedID;
    }

    public function expandWhiteSpace() {
        $this->bodyNode->expandWhiteSpace();
    }

    /**
     * @return int Number of leaves in the new document
     */
    public function lengthNew(){
        return count($this->textNodes);
    }

    /**
     * @return int Number of leaves in the old document
     */
    public function lengthOld(){
        return count($this->oldTextNodes);
    }
}

/**
 * Entry point of the HTML diff: parses two documents, diffs their leaves and
 * hands the annotated new tree to an HTMLOutput.
 */
class HTMLDiffer {

    private $output;
    private static $debug = '';

    /**
     * @param mixed $output Passed through to the HTMLOutput constructor
     */
    function __construct($output) {
        $this->output = $output;
    }

    /**
     * Diff two HTML fragments and emit the result through HTMLOutput.
     * Parse errors are only recorded in the debug output.
     *
     * @param string $from Old HTML
     * @param string $to New HTML
     */
    function htmlDiff($from, $to) {
        wfProfileIn( __METHOD__ );
        // Create an XML parser
        $xml_parser = xml_parser_create('');

        $domfrom = new DomTreeBuilder();

        // Set the functions to handle opening and closing tags
        xml_set_element_handler($xml_parser, array($domfrom, "startElement"), array($domfrom, "endElement"));

        // Set the function to handle blocks of character data
        xml_set_character_data_handler($xml_parser, array($domfrom, "characters"));

        HTMLDiffer::diffDebug( "Parsing " . strlen($from) . " characters worth of HTML\n" );
        // NOTE(review): the literals around hackDocType() are empty apart from
        // a newline, yet DomTreeBuilder only finishes a document when a 'body'
        // element closes - confirm that no wrapper markup was lost here.
        if (!xml_parse($xml_parser, ''.Sanitizer::hackDocType().'
', false)
                || !xml_parse($xml_parser, $from, false)
                || !xml_parse($xml_parser, '', true)){
            $error = xml_error_string(xml_get_error_code($xml_parser));
            $line = xml_get_current_line_number($xml_parser);
            HTMLDiffer::diffDebug( "XML error: $error at line $line\n" );
        }
        xml_parser_free($xml_parser);
        unset($from);

        $xml_parser = xml_parser_create('');

        $domto = new DomTreeBuilder();

        // Set the functions to handle opening and closing tags
        xml_set_element_handler($xml_parser, array($domto, "startElement"), array($domto, "endElement"));

        // Set the function to handle blocks of character data
        xml_set_character_data_handler($xml_parser, array($domto, "characters"));

        HTMLDiffer::diffDebug( "Parsing " . strlen($to) . " characters worth of HTML\n" );
        if (!xml_parse($xml_parser, ''.Sanitizer::hackDocType().'', false)
                || !xml_parse($xml_parser, $to, false)
                || !xml_parse($xml_parser, '', true)){
            $error = xml_error_string(xml_get_error_code($xml_parser));
            $line = xml_get_current_line_number($xml_parser);
            HTMLDiffer::diffDebug( "XML error: $error at line $line\n" );
        }
        xml_parser_free($xml_parser);
        unset($to);

        $diffengine = new WikiDiff3();
        $differences = $this->preProcess($diffengine->diff_range($domfrom->getDiffLines(), $domto->getDiffLines()));
        unset($xml_parser, $diffengine);

        $domdiffer = new TextNodeDiffer($domto, $domfrom);

        $currentIndexLeft = 0;
        $currentIndexRight = 0;
        // Iterate by value: the range objects are only read.
        foreach ($differences as $d) {
            if ($d->leftstart > $currentIndexLeft) {
                // Leaves between two differences are textually equal but may
                // still differ in their ancestry.
                $domdiffer->handlePossibleChangedPart($currentIndexLeft, $d->leftstart,
                    $currentIndexRight, $d->rightstart);
            }
            if ($d->leftlength > 0) {
                $domdiffer->markAsDeleted($d->leftstart, $d->leftend, $d->rightstart);
            }
            $domdiffer->markAsNew($d->rightstart, $d->rightend);

            $currentIndexLeft = $d->leftend;
            $currentIndexRight = $d->rightend;
        }
        $oldLength = $domdiffer->lengthOld();
        if ($currentIndexLeft < $oldLength) {
            $domdiffer->handlePossibleChangedPart($currentIndexLeft, $oldLength,
                $currentIndexRight, $domdiffer->lengthNew());
        }
        $domdiffer->expandWhiteSpace();
        $output = new HTMLOutput('htmldiff', $this->output);
        $output->parse($domdiffer->bodyNode);
        wfProfileOut( __METHOD__ );
    }

    /**
     * Merge neighbouring differences whose score() exceeds the number of
     * unchanged leaves separating them.
     *
     * @param array $differences Range differences, in order
     * @return array Merged list of RangeDifference objects
     */
    private function preProcess(/*array*/ $differences) {
        $newRanges = array();

        $nbDifferences = count($differences);
        for ($i = 0; $i < $nbDifferences; ++$i) {
            $leftStart = $differences[$i]->leftstart;
            $leftEnd = $differences[$i]->leftend;
            $rightStart = $differences[$i]->rightstart;
            $rightEnd = $differences[$i]->rightend;

            $leftLength = $leftEnd - $leftStart;
            $rightLength = $rightEnd - $rightStart;

            // Keep absorbing the following difference while the heuristic
            // outweighs the gap between the two.
            while ($i + 1 < $nbDifferences
                    && self::score($leftLength,
                        $differences[$i + 1]->leftlength,
                        $rightLength,
                        $differences[$i + 1]->rightlength)
                        > ($differences[$i + 1]->leftstart - $leftEnd)) {
                $leftEnd = $differences[$i + 1]->leftend;
                $rightEnd = $differences[$i + 1]->rightend;
                $leftLength = $leftEnd - $leftStart;
                $rightLength = $rightEnd - $rightStart;
                ++$i;
            }
            $newRanges[] = new RangeDifference($leftStart, $leftEnd, $rightStart, $rightEnd);
        }
        return $newRanges;
    }

    /**
     * Heuristic to merge differences for readability.
     *
     * Returns 0 when both left lengths or both right lengths are zero.
     * Otherwise each length is summed with diminishing weight (every 3 units
     * taken halve the remainder) and the total is divided by 1.5 times the
     * number of lengths.
     *
     * @param int $ll Left length of the current difference
     * @param int $nll Left length of the next difference
     * @param int $rl Right length of the current difference
     * @param int $nrl Right length of the next difference
     * @return float|int
     */
    public static function score($ll, $nll, $rl, $nrl) {
        if (($ll == 0 && $nll == 0)
                || ($rl == 0 && $nrl == 0)) {
            return 0;
        }
        $numbers = array($ll, $nll, $rl, $nrl);
        $d = 0;
        // Iterate by value: only the local copy of each length is reduced.
        foreach ($numbers as $number) {
            while ($number > 3) {
                $d += 3;
                $number -= 3;
                $number *= 0.5;
            }
            $d += $number;
        }
        return $d / (1.5 * count($numbers));
    }

    /**
     * Add to debug output
     * @param string $str Debug output
     */
    public static function diffDebug( $str ) {
        self :: $debug .= $str;
    }

    /**
     * Get debug output
     * @return string
     */
    public static function getDebugOutput() {
        return self :: $debug;
    }
}

/**
 * Compares two tag trees by the sequence of their TextNode leaves only.
 */
class TextOnlyComparator {

    public $leafs = array();

    /**
     * Collect the TextNode leaves below $tree and convert them with
     * TextNode::toDiffLine.
     *
     * This was previously spelled "_construct", which PHP does not treat as
     * a constructor, so $leafs was never populated.
     *
     * @param TagNode $tree
     */
    function __construct(TagNode $tree) {
        $this->addRecursive($tree);
        $this->leafs = array_map(array('TextNode','toDiffLine'), $this->leafs);
    }

    /**
     * Depth-first collection of all TextNode descendants of $tree.
     *
     * @param TagNode $tree
     */
    private function addRecursive(TagNode $tree) {
        foreach ($tree->children as $child) {
            if ($child instanceof TagNode) {
                $this->addRecursive($child);
            } else if ($child instanceof TextNode) {
                $this->leafs[] = $child;
            }
        }
    }

    /**
     * Distance between the two leaf sequences, based on the length of their
     * longest common subsequence: 0 when the LCS covers both sequences
     * completely, 1 when nothing is shared.
     *
     * @param TextOnlyComparator $other
     * @return float INF when either side has no leaves
     */
    public function getMatchRatio(TextOnlyComparator $other) {
        $nbOthers = count($other->leafs);
        $nbThis = count($this->leafs);
        if($nbOthers == 0 || $nbThis == 0){
            // Same value the former "-log(0)" expression produced.
            return INF;
        }

        $diffengine = new WikiDiff3(25000, 1.35);
        $diffengine->diff($this->leafs, $other->leafs);
        $lcsLength = $diffengine->getLcsLength();

        return (2.0 - $lcsLength/$nbOthers - $lcsLength/$nbThis) / 2.0;
    }
}

/**
 * A comparator used when calculating the difference in ancestry of two Nodes.
 */
class AncestorComparator {

    public $ancestors;
    public $ancestorsText;

    public $compareTxt = "";

    /**
     * @param array $ancestors Ancestor nodes; each is also converted with
     *   TagNode::toDiffLine into $ancestorsText
     */
    function __construct(/*array*/ $ancestors) {
        $this->ancestors = $ancestors;
        $this->ancestorsText = array_map(array('TagNode','toDiffLine'), $ancestors);
    }

    /**
     * Diff the other comparator's ancestor chain against this one.
     *
     * @param AncestorComparator $other
     * @return string|null Null when the chains do not differ, otherwise the
     *   string form of the generated change text
     */
    public function getResult(AncestorComparator $other) {

        $diffengine = new WikiDiff3(10000, 1.35);
        $differences = $diffengine->diff_range($other->ancestorsText,$this->ancestorsText);

        if (count($differences) == 0){
            return null;
        }
        $changeTxt = new ChangeTextGenerator($this, $other);

        return $changeTxt->getChanged($differences)->toString();
    }
}

class ChangeTextGenerator {

    private $ancestorComparator;
    private $other;

    private $factory;

    function __construct(AncestorComparator $ancestorComparator, AncestorComparator $other) {
        $this->ancestorComparator = $ancestorComparator;
        $this->other = $other;
        $this->factory = new TagToStringFactory();
    }

    public function getChanged(/*array*/ $differences) {
        $txt = new ChangeText;
        $rootlistopened = false;
        if (count($differences) > 1) {
            $txt->addHtml('