diff options
author | Pierre Schmitz <pierre@archlinux.de> | 2010-07-28 11:52:48 +0200 |
---|---|---|
committer | Pierre Schmitz <pierre@archlinux.de> | 2010-07-28 11:52:48 +0200 |
commit | 222b01f5169f1c7e69762e0e8904c24f78f71882 (patch) | |
tree | 8e932e12546bb991357ec48eb1638d1770be7a35 /includes/search | |
parent | 00ab76a6b686e98a914afc1975812d2b1aaa7016 (diff) |
update to MediaWiki 1.16.0
Diffstat (limited to 'includes/search')
-rw-r--r-- | includes/search/SearchEngine.php | 1248 | ||||
-rw-r--r-- | includes/search/SearchIBM_DB2.php | 224 | ||||
-rw-r--r-- | includes/search/SearchMySQL.php | 412 | ||||
-rw-r--r-- | includes/search/SearchMySQL4.php | 34 | ||||
-rw-r--r-- | includes/search/SearchOracle.php | 268 | ||||
-rw-r--r-- | includes/search/SearchPostgres.php | 246 | ||||
-rw-r--r-- | includes/search/SearchSqlite.php | 344 | ||||
-rw-r--r-- | includes/search/SearchUpdate.php | 113 |
8 files changed, 2889 insertions, 0 deletions
diff --git a/includes/search/SearchEngine.php b/includes/search/SearchEngine.php new file mode 100644 index 00000000..f4ca700d --- /dev/null +++ b/includes/search/SearchEngine.php @@ -0,0 +1,1248 @@ +<?php +/** + * @defgroup Search Search + * + * @file + * @ingroup Search + */ + +/** + * Contain a class for special pages + * @ingroup Search + */ +class SearchEngine { + var $limit = 10; + var $offset = 0; + var $prefix = ''; + var $searchTerms = array(); + var $namespaces = array( NS_MAIN ); + var $showRedirects = false; + + /** + * Perform a full text search query and return a result set. + * If title searches are not supported or disabled, return null. + * STUB + * + * @param $term String: raw search term + * @return SearchResultSet + */ + function searchText( $term ) { + return null; + } + + /** + * Perform a title-only search query and return a result set. + * If title searches are not supported or disabled, return null. + * STUB + * + * @param $term String: raw search term + * @return SearchResultSet + */ + function searchTitle( $term ) { + return null; + } + + /** If this search backend can list/unlist redirects */ + function acceptListRedirects() { + return true; + } + + /** + * When overridden in derived class, performs database-specific conversions + * on text to be used for searching or updating search index. + * Default implementation does nothing (simply returns $string). + * + * @param $string string: String to process + * @return string + */ + public function normalizeText( $string ) { + return $string; + } + + /** + * Transform search term in cases when parts of the query came as different GET params (when supported) + * e.g. for prefix queries: search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive + */ + function transformSearchTerm( $term ) { + return $term; + } + + /** + * If an exact title match can be found, or a very slightly close match, + * return the title. If no match, returns NULL. + * + * @param $searchterm String + * @return Title + */ + public static function getNearMatch( $searchterm ) { + $title = self::getNearMatchInternal( $searchterm ); + + wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$title ) ); + return $title; + } + + /** + * Really find the title match. + */ + private static function getNearMatchInternal( $searchterm ) { + global $wgContLang; + + $allSearchTerms = array($searchterm); + + if ( $wgContLang->hasVariants() ) { + $allSearchTerms = array_merge($allSearchTerms,$wgContLang->convertLinkToAllVariants($searchterm)); + } + + if( !wfRunHooks( 'SearchGetNearMatchBefore', array( $allSearchTerms, &$titleResult ) ) ) { + return $titleResult; + } + + foreach($allSearchTerms as $term) { + + # Exact match? No need to look further. + $title = Title::newFromText( $term ); + if (is_null($title)) + return null; + + if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal() || $title->exists() ) { + return $title; + } + + # See if it still otherwise has content is some sane sense + $article = MediaWiki::articleFromTitle( $title ); + if( $article->hasViewableContent() ) { + return $title; + } + + # Now try all lower case (i.e. first letter capitalized) + # + $title = Title::newFromText( $wgContLang->lc( $term ) ); + if ( $title && $title->exists() ) { + return $title; + } + + # Now try capitalized string + # + $title = Title::newFromText( $wgContLang->ucwords( $term ) ); + if ( $title && $title->exists() ) { + return $title; + } + + # Now try all upper case + # + $title = Title::newFromText( $wgContLang->uc( $term ) ); + if ( $title && $title->exists() ) { + return $title; + } + + # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc + $title = Title::newFromText( $wgContLang->ucwordbreaks($term) ); + if ( $title && $title->exists() ) { + return $title; + } + + // Give hooks a chance at better match variants + $title = null; + if( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) { + return $title; + } + } + + $title = Title::newFromText( $searchterm ); + + # Entering an IP address goes to the contributions page + if ( ( $title->getNamespace() == NS_USER && User::isIP($title->getText() ) ) + || User::isIP( trim( $searchterm ) ) ) { + return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() ); + } + + + # Entering a user goes to the user page whether it's there or not + if ( $title->getNamespace() == NS_USER ) { + return $title; + } + + # Go to images that exist even if there's no local page. + # There may have been a funny upload, or it may be on a shared + # file repository such as Wikimedia Commons. + if( $title->getNamespace() == NS_FILE ) { + $image = wfFindFile( $title ); + if( $image ) { + return $title; + } + } + + # MediaWiki namespace? Page may be "implied" if not customized. + # Just return it, with caps forced as the message system likes it. + if( $title->getNamespace() == NS_MEDIAWIKI ) { + return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) ); + } + + # Quoted term? Try without the quotes... + $matches = array(); + if( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) { + return SearchEngine::getNearMatch( $matches[1] ); + } + + return null; + } + + public static function legalSearchChars() { + return "A-Za-z_'.0-9\\x80-\\xFF\\-"; + } + + /** + * Set the maximum number of results to return + * and how many to skip before returning the first. + * + * @param $limit Integer + * @param $offset Integer + */ + function setLimitOffset( $limit, $offset = 0 ) { + $this->limit = intval( $limit ); + $this->offset = intval( $offset ); + } + + /** + * Set which namespaces the search should include. + * Give an array of namespace index numbers. + * + * @param $namespaces Array + */ + function setNamespaces( $namespaces ) { + $this->namespaces = $namespaces; + } + + /** + * Parse some common prefixes: all (search everything) + * or namespace names + * + * @param $query String + */ + function replacePrefixes( $query ){ + global $wgContLang; + + $parsed = $query; + if( strpos($query,':') === false ) { // nothing to do + wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) ); + return $parsed; + } + + $allkeyword = wfMsgForContent('searchall').":"; + if( strncmp($query, $allkeyword, strlen($allkeyword)) == 0 ){ + $this->namespaces = null; + $parsed = substr($query,strlen($allkeyword)); + } else if( strpos($query,':') !== false ) { + $prefix = substr($query,0,strpos($query,':')); + $index = $wgContLang->getNsIndex($prefix); + if($index !== false){ + $this->namespaces = array($index); + $parsed = substr($query,strlen($prefix)+1); + } + } + if(trim($parsed) == '') + $parsed = $query; // prefix was the whole query + + wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) ); + + return $parsed; + } + + /** + * Make a list of searchable namespaces and their canonical names. + * @return Array + */ + public static function searchableNamespaces() { + global $wgContLang; + $arr = array(); + foreach( $wgContLang->getNamespaces() as $ns => $name ) { + if( $ns >= NS_MAIN ) { + $arr[$ns] = $name; + } + } + + wfRunHooks( 'SearchableNamespaces', array( &$arr ) ); + return $arr; + } + + /** + * Extract default namespaces to search from the given user's + * settings, returning a list of index numbers. + * + * @param $user User + * @return Array + */ + public static function userNamespaces( $user ) { + global $wgSearchEverythingOnlyLoggedIn; + + // get search everything preference, that can be set to be read for logged-in users + $searcheverything = false; + if( ( $wgSearchEverythingOnlyLoggedIn && $user->isLoggedIn() ) + || !$wgSearchEverythingOnlyLoggedIn ) + $searcheverything = $user->getOption('searcheverything'); + + // searcheverything overrides other options + if( $searcheverything ) + return array_keys(SearchEngine::searchableNamespaces()); + + $arr = Preferences::loadOldSearchNs( $user ); + $searchableNamespaces = SearchEngine::searchableNamespaces(); + + $arr = array_intersect( $arr, array_keys($searchableNamespaces) ); // Filter + + return $arr; + } + + /** + * Find snippet highlight settings for a given user + * + * @param $user User + * @return Array contextlines, contextchars + */ + public static function userHighlightPrefs( &$user ){ + //$contextlines = $user->getOption( 'contextlines', 5 ); + //$contextchars = $user->getOption( 'contextchars', 50 ); + $contextlines = 2; // Hardcode this. Old defaults sucked. :) + $contextchars = 75; // same as above.... :P + return array($contextlines, $contextchars); + } + + /** + * An array of namespaces indexes to be searched by default + * + * @return Array + */ + public static function defaultNamespaces(){ + global $wgNamespacesToBeSearchedDefault; + + return array_keys($wgNamespacesToBeSearchedDefault, true); + } + + /** + * Get a list of namespace names useful for showing in tooltips + * and preferences + * + * @param $namespaces Array + */ + public static function namespacesAsText( $namespaces ){ + global $wgContLang; + + $formatted = array_map( array($wgContLang,'getFormattedNsText'), $namespaces ); + foreach( $formatted as $key => $ns ){ + if ( empty($ns) ) + $formatted[$key] = wfMsg( 'blanknamespace' ); + } + return $formatted; + } + + /** + * Return the help namespaces to be shown on Special:Search + * + * @return Array + */ + public static function helpNamespaces() { + global $wgNamespacesToBeSearchedHelp; + + return array_keys( $wgNamespacesToBeSearchedHelp, true ); + } + + /** + * Return a 'cleaned up' search string + * + * @param $text String + * @return String + */ + function filter( $text ) { + $lc = $this->legalSearchChars(); + return trim( preg_replace( "/[^{$lc}]/", " ", $text ) ); + } + /** + * Load up the appropriate search engine class for the currently + * active database backend, and return a configured instance. + * + * @return SearchEngine + */ + public static function create() { + global $wgSearchType; + $dbr = wfGetDB( DB_SLAVE ); + if( $wgSearchType ) { + $class = $wgSearchType; + } else { + $class = $dbr->getSearchEngine(); + } + $search = new $class( $dbr ); + $search->setLimitOffset(0,0); + return $search; + } + + /** + * Create or update the search index record for the given page. + * Title and text should be pre-processed. + * STUB + * + * @param $id Integer + * @param $title String + * @param $text String + */ + function update( $id, $title, $text ) { + // no-op + } + + /** + * Update a search index record's title only. + * Title should be pre-processed. + * STUB + * + * @param $id Integer + * @param $title String + */ + function updateTitle( $id, $title ) { + // no-op + } + + /** + * Get OpenSearch suggestion template + * + * @return String + */ + public static function getOpenSearchTemplate() { + global $wgOpenSearchTemplate, $wgServer, $wgScriptPath; + if( $wgOpenSearchTemplate ) { + return $wgOpenSearchTemplate; + } else { + $ns = implode( '|', SearchEngine::defaultNamespaces() ); + if( !$ns ) $ns = "0"; + return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace='.$ns; + } + } + + /** + * Get internal MediaWiki Suggest template + * + * @return String + */ + public static function getMWSuggestTemplate() { + global $wgMWSuggestTemplate, $wgServer, $wgScriptPath; + if($wgMWSuggestTemplate) + return $wgMWSuggestTemplate; + else + return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest'; + } +} + +/** + * @ingroup Search + */ +class SearchResultSet { + /** + * Fetch an array of regular expression fragments for matching + * the search terms as parsed by this engine in a text extract. + * STUB + * + * @return Array + */ + function termMatches() { + return array(); + } + + function numRows() { + return 0; + } + + /** + * Return true if results are included in this result set. + * STUB + * + * @return Boolean + */ + function hasResults() { + return false; + } + + /** + * Some search modes return a total hit count for the query + * in the entire article database. This may include pages + * in namespaces that would not be matched on the given + * settings. + * + * Return null if no total hits number is supported. + * + * @return Integer + */ + function getTotalHits() { + return null; + } + + /** + * Some search modes return a suggested alternate term if there are + * no exact hits. Returns true if there is one on this set. + * + * @return Boolean + */ + function hasSuggestion() { + return false; + } + + /** + * @return String: suggested query, null if none + */ + function getSuggestionQuery(){ + return null; + } + + /** + * @return String: HTML highlighted suggested query, '' if none + */ + function getSuggestionSnippet(){ + return ''; + } + + /** + * Return information about how and from where the results were fetched, + * should be useful for diagnostics and debugging + * + * @return String + */ + function getInfo() { + return null; + } + + /** + * Return a result set of hits on other (multiple) wikis associated with this one + * + * @return SearchResultSet + */ + function getInterwikiResults() { + return null; + } + + /** + * Check if there are results on other wikis + * + * @return Boolean + */ + function hasInterwikiResults() { + return $this->getInterwikiResults() != null; + } + + + /** + * Fetches next search result, or false. + * STUB + * + * @return SearchResult + */ + function next() { + return false; + } + + /** + * Frees the result set, if applicable. + */ + function free() { + // ... + } +} + +/** + * This class is used for different SQL-based search engines shipped with MediaWiki + */ +class SqlSearchResultSet extends SearchResultSet { + function __construct( $resultSet, $terms ) { + $this->mResultSet = $resultSet; + $this->mTerms = $terms; + } + + function termMatches() { + return $this->mTerms; + } + + function numRows() { + return $this->mResultSet->numRows(); + } + + function next() { + if ($this->mResultSet === false ) + return false; + + $row = $this->mResultSet->fetchObject(); + if ($row === false) + return false; + return new SearchResult($row); + } + + function free() { + $this->mResultSet->free(); + } +} + +/** + * @ingroup Search + */ +class SearchResultTooMany { + ## Some search engines may bail out if too many matches are found +} + + +/** + * @todo Fixme: This class is horribly factored. It would probably be better to + * have a useful base class to which you pass some standard information, then + * let the fancy self-highlighters extend that. + * @ingroup Search + */ +class SearchResult { + var $mRevision = null; + var $mImage = null; + + function __construct( $row ) { + $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title ); + if( !is_null($this->mTitle) ){ + $this->mRevision = Revision::newFromTitle( $this->mTitle ); + if( $this->mTitle->getNamespace() === NS_FILE ) + $this->mImage = wfFindFile( $this->mTitle ); + } + } + + /** + * Check if this is result points to an invalid title + * + * @return Boolean + */ + function isBrokenTitle(){ + if( is_null($this->mTitle) ) + return true; + return false; + } + + /** + * Check if target page is missing, happens when index is out of date + * + * @return Boolean + */ + function isMissingRevision(){ + return !$this->mRevision && !$this->mImage; + } + + /** + * @return Title + */ + function getTitle() { + return $this->mTitle; + } + + /** + * @return Double or null if not supported + */ + function getScore() { + return null; + } + + /** + * Lazy initialization of article text from DB + */ + protected function initText(){ + if( !isset($this->mText) ){ + if($this->mRevision != null) + $this->mText = $this->mRevision->getText(); + else // TODO: can we fetch raw wikitext for commons images? + $this->mText = ''; + + } + } + + /** + * @param $terms Array: terms to highlight + * @return String: highlighted text snippet, null (and not '') if not supported + */ + function getTextSnippet($terms){ + global $wgUser, $wgAdvancedSearchHighlighting; + $this->initText(); + list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser); + $h = new SearchHighlighter(); + if( $wgAdvancedSearchHighlighting ) + return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars ); + else + return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars ); + } + + /** + * @param $terms Array: terms to highlight + * @return String: highlighted title, '' if not supported + */ + function getTitleSnippet($terms){ + return ''; + } + + /** + * @param $terms Array: terms to highlight + * @return String: highlighted redirect name (redirect to this page), '' if none or not supported + */ + function getRedirectSnippet($terms){ + return ''; + } + + /** + * @return Title object for the redirect to this page, null if none or not supported + */ + function getRedirectTitle(){ + return null; + } + + /** + * @return string highlighted relevant section name, null if none or not supported + */ + function getSectionSnippet(){ + return ''; + } + + /** + * @return Title object (pagename+fragment) for the section, null if none or not supported + */ + function getSectionTitle(){ + return null; + } + + /** + * @return String: timestamp + */ + function getTimestamp(){ + if( $this->mRevision ) + return $this->mRevision->getTimestamp(); + else if( $this->mImage ) + return $this->mImage->getTimestamp(); + return ''; + } + + /** + * @return Integer: number of words + */ + function getWordCount(){ + $this->initText(); + return str_word_count( $this->mText ); + } + + /** + * @return Integer: size in bytes + */ + function getByteSize(){ + $this->initText(); + return strlen( $this->mText ); + } + + /** + * @return Boolean if hit has related articles + */ + function hasRelated(){ + return false; + } + + /** + * @return String: interwiki prefix of the title (return iw even if title is broken) + */ + function getInterwikiPrefix(){ + return ''; + } +} + +/** + * Highlight bits of wikitext + * + * @ingroup Search + */ +class SearchHighlighter { + var $mCleanWikitext = true; + + function SearchHighlighter($cleanupWikitext = true){ + $this->mCleanWikitext = $cleanupWikitext; + } + + /** + * Default implementation of wikitext highlighting + * + * @param $text String + * @param $terms Array: terms to highlight (unescaped) + * @param $contextlines Integer + * @param $contextchars Integer + * @return String + */ + public function highlightText( $text, $terms, $contextlines, $contextchars ) { + global $wgLang, $wgContLang; + global $wgSearchHighlightBoundaries; + $fname = __METHOD__; + + if($text == '') + return ''; + + // spli text into text + templates/links/tables + $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)"; + // first capture group is for detecting nested templates/links/tables/references + $endPatterns = array( + 1 => '/(\{\{)|(\}\})/', // template + 2 => '/(\[\[)|(\]\])/', // image + 3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table + + // FIXME: this should prolly be a hook or something + if(function_exists('wfCite')){ + $spat .= '|(<ref>)'; // references via cite extension + $endPatterns[4] = '/(<ref>)|(<\/ref>)/'; + } + $spat .= '/'; + $textExt = array(); // text extracts + $otherExt = array(); // other extracts + wfProfileIn( "$fname-split" ); + $start = 0; + $textLen = strlen($text); + $count = 0; // sequence number to maintain ordering + while( $start < $textLen ){ + // find start of template/image/table + if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){ + $epat = ''; + foreach($matches as $key => $val){ + if($key > 0 && $val[1] != -1){ + if($key == 2){ + // see if this is an image link + $ns = substr($val[0],2,-1); + if( $wgContLang->getNsIndex($ns) != NS_FILE ) + break; + + } + $epat = $endPatterns[$key]; + $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) ); + $start = $val[1]; + break; + } + } + if( $epat ){ + // find end (and detect any nested elements) + $level = 0; + $offset = $start + 1; + $found = false; + while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){ + if( array_key_exists(2,$endMatches) ){ + // found end + if($level == 0){ + $len = strlen($endMatches[2][0]); + $off = $endMatches[2][1]; + $this->splitAndAdd( $otherExt, $count, + substr( $text, $start, $off + $len - $start ) ); + $start = $off + $len; + $found = true; + break; + } else{ + // end of nested element + $level -= 1; + } + } else{ + // nested + $level += 1; + } + $offset = $endMatches[0][1] + strlen($endMatches[0][0]); + } + if( ! $found ){ + // couldn't find appropriate closing tag, skip + $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) ); + $start += strlen($matches[0][0]); + } + continue; + } + } + // else: add as text extract + $this->splitAndAdd( $textExt, $count, substr($text,$start) ); + break; + } + + $all = $textExt + $otherExt; // these have disjunct key sets + + wfProfileOut( "$fname-split" ); + + // prepare regexps + foreach( $terms as $index => $term ) { + // manually do upper/lowercase stuff for utf-8 since PHP won't do it + if(preg_match('/[\x80-\xff]/', $term) ){ + $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]); + } else { + $terms[$index] = $term; + } + } + $anyterm = implode( '|', $terms ); + $phrase = implode("$wgSearchHighlightBoundaries+", $terms ); + + // FIXME: a hack to scale contextchars, a correct solution + // would be to have contextchars actually be char and not byte + // length, and do proper utf-8 substrings and lengths everywhere, + // but PHP is making that very hard and unclean to implement :( + $scale = strlen($anyterm) / mb_strlen($anyterm); + $contextchars = intval( $contextchars * $scale ); + + $patPre = "(^|$wgSearchHighlightBoundaries)"; + $patPost = "($wgSearchHighlightBoundaries|$)"; + + $pat1 = "/(".$phrase.")/ui"; + $pat2 = "/$patPre(".$anyterm.")$patPost/ui"; + + wfProfileIn( "$fname-extract" ); + + $left = $contextlines; + + $snippets = array(); + $offsets = array(); + + // show beginning only if it contains all words + $first = 0; + $firstText = ''; + foreach($textExt as $index => $line){ + if(strlen($line)>0 && $line[0] != ';' && $line[0] != ':'){ + $firstText = $this->extract( $line, 0, $contextchars * $contextlines ); + $first = $index; + break; + } + } + if( $firstText ){ + $succ = true; + // check if first text contains all terms + foreach($terms as $term){ + if( ! preg_match("/$patPre".$term."$patPost/ui", $firstText) ){ + $succ = false; + break; + } + } + if( $succ ){ + $snippets[$first] = $firstText; + $offsets[$first] = 0; + } + } + if( ! $snippets ) { + // match whole query on text + $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets); + // match whole query on templates/tables/images + $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets); + // match any words on text + $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets); + // match any words on templates/tables/images + $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets); + + ksort($snippets); + } + + // add extra chars to each snippet to make snippets constant size + $extended = array(); + if( count( $snippets ) == 0){ + // couldn't find the target words, just show beginning of article + $targetchars = $contextchars * $contextlines; + $snippets[$first] = ''; + $offsets[$first] = 0; + } else{ + // if begin of the article contains the whole phrase, show only that !! + if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first]) + && $offsets[$first] < $contextchars * 2 ){ + $snippets = array ($first => $snippets[$first]); + } + + // calc by how much to extend existing snippets + $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) ); + } + + foreach($snippets as $index => $line){ + $extended[$index] = $line; + $len = strlen($line); + if( $len < $targetchars - 20 ){ + // complete this line + if($len < strlen( $all[$index] )){ + $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]); + $len = strlen( $extended[$index] ); + } + + // add more lines + $add = $index + 1; + while( $len < $targetchars - 20 + && array_key_exists($add,$all) + && !array_key_exists($add,$snippets) ){ + $offsets[$add] = 0; + $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] ); + $extended[$add] = $tt; + $len += strlen( $tt ); + $add++; + } + } + } + + //$snippets = array_map('htmlspecialchars', $extended); + $snippets = $extended; + $last = -1; + $extract = ''; + foreach($snippets as $index => $line){ + if($last == -1) + $extract .= $line; // first line + elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last])) + $extract .= " ".$line; // continous lines + else + $extract .= '<b> ... </b>' . $line; + + $last = $index; + } + if( $extract ) + $extract .= '<b> ... </b>'; + + $processed = array(); + foreach($terms as $term){ + if( ! isset($processed[$term]) ){ + $pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word + $extract = preg_replace( $pat3, + "\\1<span class='searchmatch'>\\2</span>\\3", $extract ); + $processed[$term] = true; + } + } + + wfProfileOut( "$fname-extract" ); + + return $extract; + } + + /** + * Split text into lines and add it to extracts array + * + * @param $extracts Array: index -> $line + * @param $count Integer + * @param $text String + */ + function splitAndAdd(&$extracts, &$count, $text){ + $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text ); + foreach($split as $line){ + $tt = trim($line); + if( $tt ) + $extracts[$count++] = $tt; + } + } + + /** + * Do manual case conversion for non-ascii chars + * + * @param $matches Array + */ + function caseCallback($matches){ + global $wgContLang; + if( strlen($matches[0]) > 1 ){ + return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']'; + } else + return $matches[0]; + } + + /** + * Extract part of the text from start to end, but by + * not chopping up words + * @param $text String + * @param $start Integer + * @param $end Integer + * @param $posStart Integer: (out) actual start position + * @param $posEnd Integer: (out) actual end position + * @return String + */ + function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){ + global $wgContLang; + + if( $start != 0) + $start = $this->position( $text, $start, 1 ); + if( $end >= strlen($text) ) + $end = strlen($text); + else + $end = $this->position( $text, $end ); + + if(!is_null($posStart)) + $posStart = $start; + if(!is_null($posEnd)) + $posEnd = $end; + + if($end > $start) + return substr($text, $start, $end-$start); + else + return ''; + } + + /** + * Find a nonletter near a point (index) in the text + * + * @param $text String + * @param $point Integer + * @param $offset Integer: offset to found index + * @return Integer: nearest nonletter index, or beginning of utf8 char if none + */ + function position($text, $point, $offset=0 ){ + $tolerance = 10; + $s = max( 0, $point - $tolerance ); + $l = min( strlen($text), $point + $tolerance ) - $s; + $m = array(); + if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){ + return $m[0][1] + $s + $offset; + } else{ + // check if point is on a valid first UTF8 char + $char = ord( $text[$point] ); + while( $char >= 0x80 && $char < 0xc0 ) { + // skip trailing bytes + $point++; + if($point >= strlen($text)) + return strlen($text); + $char = ord( $text[$point] ); + } + return $point; + + } + } + + /** + * Search extracts for a pattern, and return snippets + * + * @param $pattern String: regexp for matching lines + * @param $extracts Array: extracts to search + * @param $linesleft Integer: number of extracts to make + * @param $contextchars Integer: length of snippet + * @param $out Array: map for highlighted snippets + * @param $offsets Array: map of starting points of snippets + * @protected + */ + function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){ + if($linesleft == 0) + return; // nothing to do + foreach($extracts as $index => $line){ + if( array_key_exists($index,$out) ) + continue; // this line already highlighted + + $m = array(); + if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) + continue; + + $offset = $m[0][1]; + $len = strlen($m[0][0]); + if($offset + $len < $contextchars) + $begin = 0; + elseif( $len > $contextchars) + $begin = $offset; + else + $begin = $offset + intval( ($len - $contextchars) / 2 ); + + $end = $begin + $contextchars; + + $posBegin = $begin; + // basic snippet from this line + $out[$index] = $this->extract($line,$begin,$end,$posBegin); + $offsets[$index] = $posBegin; + $linesleft--; + if($linesleft == 0) + return; + } + } + + /** + * Basic wikitext removal + * @protected + */ + function removeWiki($text) { + $fname = __METHOD__; + wfProfileIn( $fname ); + + //$text = preg_replace("/'{2,5}/", "", $text); + //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text); + //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text); + //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text); + //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text); + //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text); + $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text); + $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text); + $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text); + $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text); + //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text); + $text = preg_replace("/<\/?[^>]+>/", "", $text); + $text = preg_replace("/'''''/", "", $text); + $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text); + $text = preg_replace("/''/", "", $text); + + wfProfileOut( $fname ); + return $text; + } + + /** + * callback to replace [[target|caption]] kind of links, if + * the target is category or image, leave it + * + * @param $matches Array + */ + function linkReplace($matches){ + $colon = strpos( $matches[1], ':' ); + if( $colon === false ) + return $matches[2]; // replace with caption + global $wgContLang; + $ns = substr( $matches[1], 0, $colon ); + $index = $wgContLang->getNsIndex($ns); + if( $index !== false && ($index == NS_FILE || $index == NS_CATEGORY) ) + return $matches[0]; // return the whole thing + else + return $matches[2]; + + } + + /** + * Simple & fast snippet extraction, but gives completely unrelevant + * snippets + * + * @param $text String + * @param $terms Array + * @param $contextlines Integer + * @param $contextchars Integer + * @return String + */ + public function highlightSimple( $text, $terms, $contextlines, $contextchars ) { + global $wgLang, $wgContLang; + $fname = __METHOD__; + + $lines = explode( "\n", $text ); + + $terms = implode( '|', $terms ); + $max = intval( $contextchars ) + 1; + $pat1 = "/(.*)($terms)(.{0,$max})/i"; + + $lineno = 0; + + $extract = ""; + wfProfileIn( "$fname-extract" ); + foreach ( $lines as $line ) { + if ( 0 == $contextlines ) { + break; + } + ++$lineno; + $m = array(); + if ( ! preg_match( $pat1, $line, $m ) ) { + continue; + } + --$contextlines; + $pre = $wgContLang->truncate( $m[1], -$contextchars ); + + if ( count( $m ) < 3 ) { + $post = ''; + } else { + $post = $wgContLang->truncate( $m[3], $contextchars ); + } + + $found = $m[2]; + + $line = htmlspecialchars( $pre . $found . $post ); + $pat2 = '/(' . $terms . ")/i"; + $line = preg_replace( $pat2, + "<span class='searchmatch'>\\1</span>", $line ); + + $extract .= "${line}\n"; + } + wfProfileOut( "$fname-extract" ); + + return $extract; + } + +} + +/** + * Dummy class to be used when non-supported Database engine is present. + * @todo Fixme: dummy class should probably try something at least mildly useful, + * such as a LIKE search through titles. + * @ingroup Search + */ +class SearchEngineDummy extends SearchEngine { + // no-op +} diff --git a/includes/search/SearchIBM_DB2.php b/includes/search/SearchIBM_DB2.php new file mode 100644 index 00000000..d7587186 --- /dev/null +++ b/includes/search/SearchIBM_DB2.php @@ -0,0 +1,224 @@ +<?php +# Copyright (C) 2004 Brion Vibber <brion@pobox.com> +# http://www.mediawiki.org/ +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +# http://www.gnu.org/copyleft/gpl.html + +/** + * @file + * @ingroup Search + */ + +/** + * Search engine hook base class for IBM DB2 + * @ingroup Search + */ +class SearchIBM_DB2 extends SearchEngine { + function __construct($db) { + $this->db = $db; + } + + /** + * Perform a full text search query and return a result set. + * + * @param $term String: raw search term + * @return SqlSearchResultSet + */ + function searchText( $term ) { + $resultSet = $this->db->resultObject($this->db->query($this->getQuery($this->filter($term), true))); + return new SqlSearchResultSet($resultSet, $this->searchTerms); + } + + /** + * Perform a title-only search query and return a result set. + * + * @param $term String: taw search term + * @return SqlSearchResultSet + */ + function searchTitle($term) { + $resultSet = $this->db->resultObject($this->db->query($this->getQuery($this->filter($term), false))); + return new SqlSearchResultSet($resultSet, $this->searchTerms); + } + + + /** + * Return a partial WHERE clause to exclude redirects, if so set + * @return String + */ + function queryRedirect() { + if ($this->showRedirects) { + return ''; + } else { + return 'AND page_is_redirect=0'; + } + } + + /** + * Return a partial WHERE clause to limit the search to the given namespaces + * @return String + */ + function queryNamespaces() { + if( is_null($this->namespaces) ) + return ''; + $namespaces = implode(',', $this->namespaces); + if ($namespaces == '') { + $namespaces = '0'; + } + return 'AND page_namespace IN (' . $namespaces . ')'; + } + + /** + * Return a LIMIT clause to limit results on the query. + * @return String + */ + function queryLimit($sql) { + return $this->db->limitResult($sql, $this->limit, $this->offset); + } + + /** + * Does not do anything for generic search engine + * subclasses may define this though + * @return String + */ + function queryRanking($filteredTerm, $fulltext) { + // requires Net Search Extender or equivalent + // return ' ORDER BY score(1)'; + return ''; + } + + /** + * Construct the full SQL query to do the search. + * The guts shoulds be constructed in queryMain() + * @param string $filteredTerm String + * @param bool $fulltext Boolean + */ + function getQuery( $filteredTerm, $fulltext ) { + return $this->queryLimit($this->queryMain($filteredTerm, $fulltext) . ' ' . + $this->queryRedirect() . ' ' . + $this->queryNamespaces() . ' ' . + $this->queryRanking( $filteredTerm, $fulltext ) . ' '); + } + + + /** + * Picks which field to index on, depending on what type of query. + * @param $fulltext Boolean + * @return String + */ + function getIndexField($fulltext) { + return $fulltext ? 'si_text' : 'si_title'; + } + + /** + * Get the base part of the search query. + * + * @param string $filteredTerm String + * @param bool $fulltext Boolean + * @return String + */ + function queryMain( $filteredTerm, $fulltext ) { + $match = $this->parseQuery($filteredTerm, $fulltext); + $page = $this->db->tableName('page'); + $searchindex = $this->db->tableName('searchindex'); + return 'SELECT page_id, page_namespace, page_title ' . + "FROM $page,$searchindex " . + 'WHERE page_id=si_page AND ' . $match; + } + + /** @todo document */ + function parseQuery($filteredText, $fulltext) { + global $wgContLang; + $lc = SearchEngine::legalSearchChars(); + $this->searchTerms = array(); + + # FIXME: This doesn't handle parenthetical expressions. + $m = array(); + $q = array(); + + if (preg_match_all('/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/', + $filteredText, $m, PREG_SET_ORDER)) { + foreach($m as $terms) { + + // Search terms in all variant forms, only + // apply on wiki with LanguageConverter + $temp_terms = $wgContLang->autoConvertToAllVariants( $terms[2] ); + if( is_array( $temp_terms )) { + $temp_terms = array_unique( array_values( $temp_terms )); + foreach( $temp_terms as $t ) + $q[] = $terms[1] . $wgContLang->normalizeForSearch( $t ); + } + else + $q[] = $terms[1] . $wgContLang->normalizeForSearch( $terms[2] ); + + if (!empty($terms[3])) { + $regexp = preg_quote( $terms[3], '/' ); + if ($terms[4]) + $regexp .= "[0-9A-Za-z_]+"; + } else { + $regexp = preg_quote(str_replace('"', '', $terms[2]), '/'); + } + $this->searchTerms[] = $regexp; + } + } + + $searchon = $this->db->strencode(join(',', $q)); + $field = $this->getIndexField($fulltext); + + // requires Net Search Extender or equivalent + //return " CONTAINS($field, '$searchon') > 0 "; + + return " lcase($field) LIKE lcase('%$searchon%')"; + } + + /** + * Create or update the search index record for the given page. + * Title and text should be pre-processed. + * + * @param $id Integer + * @param $title String + * @param $text String + */ + function update($id, $title, $text) { + $dbw = wfGetDB(DB_MASTER); + $dbw->replace('searchindex', + array('si_page'), + array( + 'si_page' => $id, + 'si_title' => $title, + 'si_text' => $text + ), 'SearchIBM_DB2::update' ); + // ? + //$dbw->query("CALL ctx_ddl.sync_index('si_text_idx')"); + //$dbw->query("CALL ctx_ddl.sync_index('si_title_idx')"); + } + + /** + * Update a search index record's title only. + * Title should be pre-processed. + * + * @param $id Integer + * @param $title String + */ + function updateTitle($id, $title) { + $dbw = wfGetDB(DB_MASTER); + + $dbw->update('searchindex', + array('si_title' => $title), + array('si_page' => $id), + 'SearchIBM_DB2::updateTitle', + array()); + } +} diff --git a/includes/search/SearchMySQL.php b/includes/search/SearchMySQL.php new file mode 100644 index 00000000..0c238be8 --- /dev/null +++ b/includes/search/SearchMySQL.php @@ -0,0 +1,412 @@ +<?php +# Copyright (C) 2004 Brion Vibber <brion@pobox.com> +# http://www.mediawiki.org/ +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +# http://www.gnu.org/copyleft/gpl.html + +/** + * @file + * @ingroup Search + */ + +/** + * Search engine hook for MySQL 4+ + * @ingroup Search + */ +class SearchMySQL extends SearchEngine { + var $strictMatching = true; + static $mMinSearchLength; + + /** @todo document */ + function __construct( $db ) { + $this->db = $db; + } + + /** + * Parse the user's query and transform it into an SQL fragment which will + * become part of a WHERE clause + */ + function parseQuery( $filteredText, $fulltext ) { + global $wgContLang; + $lc = SearchEngine::legalSearchChars(); // Minus format chars + $searchon = ''; + $this->searchTerms = array(); + + # FIXME: This doesn't handle parenthetical expressions. + $m = array(); + if( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/', + $filteredText, $m, PREG_SET_ORDER ) ) { + foreach( $m as $bits ) { + @list( /* all */, $modifier, $term, $nonQuoted, $wildcard ) = $bits; + + if( $nonQuoted != '' ) { + $term = $nonQuoted; + $quote = ''; + } else { + $term = str_replace( '"', '', $term ); + $quote = '"'; + } + + if( $searchon !== '' ) $searchon .= ' '; + if( $this->strictMatching && ($modifier == '') ) { + // If we leave this out, boolean op defaults to OR which is rarely helpful. + $modifier = '+'; + } + + // Some languages such as Serbian store the input form in the search index, + // so we may need to search for matches in multiple writing system variants. + $convertedVariants = $wgContLang->autoConvertToAllVariants( $term ); + if( is_array( $convertedVariants ) ) { + $variants = array_unique( array_values( $convertedVariants ) ); + } else { + $variants = array( $term ); + } + + // The low-level search index does some processing on input to work + // around problems with minimum lengths and encoding in MySQL's + // fulltext engine. + // For Chinese this also inserts spaces between adjacent Han characters. + $strippedVariants = array_map( + array( $wgContLang, 'normalizeForSearch' ), + $variants ); + + // Some languages such as Chinese force all variants to a canonical + // form when stripping to the low-level search index, so to be sure + // let's check our variants list for unique items after stripping. + $strippedVariants = array_unique( $strippedVariants ); + + $searchon .= $modifier; + if( count( $strippedVariants) > 1 ) + $searchon .= '('; + foreach( $strippedVariants as $stripped ) { + $stripped = $this->normalizeText( $stripped ); + if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) { + // Hack for Chinese: we need to toss in quotes for + // multiple-character phrases since normalizeForSearch() + // added spaces between them to make word breaks. + $stripped = '"' . trim( $stripped ) . '"'; + } + $searchon .= "$quote$stripped$quote$wildcard "; + } + if( count( $strippedVariants) > 1 ) + $searchon .= ')'; + + // Match individual terms or quoted phrase in result highlighting... + // Note that variants will be introduced in a later stage for highlighting! + $regexp = $this->regexTerm( $term, $wildcard ); + $this->searchTerms[] = $regexp; + } + wfDebug( __METHOD__ . ": Would search with '$searchon'\n" ); + wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/\n" ); + } else { + wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" ); + } + + $searchon = $this->db->strencode( $searchon ); + $field = $this->getIndexField( $fulltext ); + return " MATCH($field) AGAINST('$searchon' IN BOOLEAN MODE) "; + } + + function regexTerm( $string, $wildcard ) { + global $wgContLang; + + $regex = preg_quote( $string, '/' ); + if( $wgContLang->hasWordBreaks() ) { + if( $wildcard ) { + // Don't cut off the final bit! + $regex = "\b$regex"; + } else { + $regex = "\b$regex\b"; + } + } else { + // For Chinese, words may legitimately abut other words in the text literal. + // Don't add \b boundary checks... note this could cause false positives + // for latin chars. + } + return $regex; + } + + public static function legalSearchChars() { + return "\"*" . parent::legalSearchChars(); + } + + /** + * Perform a full text search query and return a result set. + * + * @param $term String: raw search term + * @return MySQLSearchResultSet + */ + function searchText( $term ) { + return $this->searchInternal( $term, true ); + } + + /** + * Perform a title-only search query and return a result set. + * + * @param $term String: raw search term + * @return MySQLSearchResultSet + */ + function searchTitle( $term ) { + return $this->searchInternal( $term, false ); + } + + protected function searchInternal( $term, $fulltext ) { + global $wgCountTotalSearchHits; + + $filteredTerm = $this->filter( $term ); + $resultSet = $this->db->query( $this->getQuery( $filteredTerm, $fulltext ) ); + + $total = null; + if( $wgCountTotalSearchHits ) { + $totalResult = $this->db->query( $this->getCountQuery( $filteredTerm, $fulltext ) ); + $row = $totalResult->fetchObject(); + if( $row ) { + $total = intval( $row->c ); + } + $totalResult->free(); + } + + return new MySQLSearchResultSet( $resultSet, $this->searchTerms, $total ); + } + + + /** + * Return a partial WHERE clause to exclude redirects, if so set + * @return String + */ + function queryRedirect() { + if( $this->showRedirects ) { + return ''; + } else { + return 'AND page_is_redirect=0'; + } + } + + /** + * Return a partial WHERE clause to limit the search to the given namespaces + * @return String + */ + function queryNamespaces() { + if( is_null($this->namespaces) ) + return ''; # search all + if ( !count( $this->namespaces ) ) { + $namespaces = '0'; + } else { + $namespaces = $this->db->makeList( $this->namespaces ); + } + return 'AND page_namespace IN (' . $namespaces . ')'; + } + + /** + * Return a LIMIT clause to limit results on the query. + * @return String + */ + function queryLimit() { + return $this->db->limitResult( '', $this->limit, $this->offset ); + } + + /** + * Does not do anything for generic search engine + * subclasses may define this though + * @return String + */ + function queryRanking( $filteredTerm, $fulltext ) { + return ''; + } + + /** + * Construct the full SQL query to do the search. + * The guts shoulds be constructed in queryMain() + * @param $filteredTerm String + * @param $fulltext Boolean + */ + function getQuery( $filteredTerm, $fulltext ) { + return $this->queryMain( $filteredTerm, $fulltext ) . ' ' . + $this->queryRedirect() . ' ' . + $this->queryNamespaces() . ' ' . + $this->queryRanking( $filteredTerm, $fulltext ) . ' ' . + $this->queryLimit(); + } + + /** + * Picks which field to index on, depending on what type of query. + * @param $fulltext Boolean + * @return String + */ + function getIndexField( $fulltext ) { + return $fulltext ? 'si_text' : 'si_title'; + } + + /** + * Get the base part of the search query. + * The actual match syntax will depend on the server + * version; MySQL 3 and MySQL 4 have different capabilities + * in their fulltext search indexes. + * + * @param $filteredTerm String + * @param $fulltext Boolean + * @return String + */ + function queryMain( $filteredTerm, $fulltext ) { + $match = $this->parseQuery( $filteredTerm, $fulltext ); + $page = $this->db->tableName( 'page' ); + $searchindex = $this->db->tableName( 'searchindex' ); + return 'SELECT page_id, page_namespace, page_title ' . + "FROM $page,$searchindex " . + 'WHERE page_id=si_page AND ' . $match; + } + + function getCountQuery( $filteredTerm, $fulltext ) { + $match = $this->parseQuery( $filteredTerm, $fulltext ); + $page = $this->db->tableName( 'page' ); + $searchindex = $this->db->tableName( 'searchindex' ); + return "SELECT COUNT(*) AS c " . + "FROM $page,$searchindex " . + 'WHERE page_id=si_page AND ' . $match . + $this->queryRedirect() . ' ' . + $this->queryNamespaces(); + } + + /** + * Create or update the search index record for the given page. + * Title and text should be pre-processed. + * + * @param $id Integer + * @param $title String + * @param $text String + */ + function update( $id, $title, $text ) { + $dbw = wfGetDB( DB_MASTER ); + $dbw->replace( 'searchindex', + array( 'si_page' ), + array( + 'si_page' => $id, + 'si_title' => $this->normalizeText( $title ), + 'si_text' => $this->normalizeText( $text ) + ), __METHOD__ ); + } + + /** + * Update a search index record's title only. + * Title should be pre-processed. + * + * @param $id Integer + * @param $title String + */ + function updateTitle( $id, $title ) { + $dbw = wfGetDB( DB_MASTER ); + + $dbw->update( 'searchindex', + array( 'si_title' => $this->normalizeText( $title ) ), + array( 'si_page' => $id ), + __METHOD__, + array( $dbw->lowPriorityOption() ) ); + } + + /** + * Converts some characters for MySQL's indexing to grok it correctly, + * and pads short words to overcome limitations. + */ + function normalizeText( $string ) { + global $wgContLang; + + wfProfileIn( __METHOD__ ); + + // Some languages such as Chinese require word segmentation + $out = $wgContLang->wordSegmentation( $string ); + + // MySQL fulltext index doesn't grok utf-8, so we + // need to fold cases and convert to hex + $out = preg_replace_callback( + "/([\\xc0-\\xff][\\x80-\\xbf]*)/", + array( $this, 'stripForSearchCallback' ), + $wgContLang->lc( $out ) ); + + // And to add insult to injury, the default indexing + // ignores short words... Pad them so we can pass them + // through without reconfiguring the server... + $minLength = $this->minSearchLength(); + if( $minLength > 1 ) { + $n = $minLength - 1; + $out = preg_replace( + "/\b(\w{1,$n})\b/", + "$1u800", + $out ); + } + + // Periods within things like hostnames and IP addresses + // are also important -- we want a search for "example.com" + // or "192.168.1.1" to work sanely. + // + // MySQL's search seems to ignore them, so you'd match on + // "example.wikipedia.com" and "192.168.83.1" as well. + $out = preg_replace( + "/(\w)\.(\w|\*)/u", + "$1u82e$2", + $out ); + + wfProfileOut( __METHOD__ ); + + return $out; + } + + /** + * Armor a case-folded UTF-8 string to get through MySQL's + * fulltext search without being mucked up by funny charset + * settings or anything else of the sort. + */ + protected function stripForSearchCallback( $matches ) { + return 'u8' . bin2hex( $matches[1] ); + } + + /** + * Check MySQL server's ft_min_word_len setting so we know + * if we need to pad short words... + * + * @return int + */ + protected function minSearchLength() { + if( is_null( self::$mMinSearchLength ) ) { + $sql = "SHOW GLOBAL VARIABLES LIKE 'ft\\_min\\_word\\_len'"; + + $dbr = wfGetDB( DB_SLAVE ); + $result = $dbr->query( $sql ); + $row = $result->fetchObject(); + $result->free(); + + if( $row && $row->Variable_name == 'ft_min_word_len' ) { + self::$mMinSearchLength = intval( $row->Value ); + } else { + self::$mMinSearchLength = 0; + } + } + return self::$mMinSearchLength; + } +} + +/** + * @ingroup Search + */ +class MySQLSearchResultSet extends SqlSearchResultSet { + function MySQLSearchResultSet( $resultSet, $terms, $totalHits=null ) { + parent::__construct( $resultSet, $terms ); + $this->mTotalHits = $totalHits; + } + + function getTotalHits() { + return $this->mTotalHits; + } +}
\ No newline at end of file diff --git a/includes/search/SearchMySQL4.php b/includes/search/SearchMySQL4.php new file mode 100644 index 00000000..3e2bb2d1 --- /dev/null +++ b/includes/search/SearchMySQL4.php @@ -0,0 +1,34 @@ +<?php +# Copyright (C) 2004 Brion Vibber <brion@pobox.com> +# http://www.mediawiki.org/ +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +# http://www.gnu.org/copyleft/gpl.html + +/** + * @file + * @ingroup Search + */ + +/** + * Search engine hook for MySQL 4+ + * This class retained for backwards compatibility... + * The meat's been moved to SearchMySQL, since the 3.x variety is gone. + * @ingroup Search + * @deprecated + */ +class SearchMySQL4 extends SearchMySQL { + /* whee */ +} diff --git a/includes/search/SearchOracle.php b/includes/search/SearchOracle.php new file mode 100644 index 00000000..e4c5deee --- /dev/null +++ b/includes/search/SearchOracle.php @@ -0,0 +1,268 @@ +<?php +# Copyright (C) 2004 Brion Vibber <brion@pobox.com> +# http://www.mediawiki.org/ +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +# http://www.gnu.org/copyleft/gpl.html + +/** + * @file + * @ingroup Search + */ + +/** + * Search engine hook base class for Oracle (ConText). + * @ingroup Search + */ +class SearchOracle extends SearchEngine { + + private $reservedWords = array ('ABOUT' => 1, + 'ACCUM' => 1, + 'AND' => 1, + 'BT' => 1, + 'BTG' => 1, + 'BTI' => 1, + 'BTP' => 1, + 'FUZZY' => 1, + 'HASPATH' => 1, + 'INPATH' => 1, + 'MINUS' => 1, + 'NEAR' => 1, + 'NOT' => 1, + 'NT' => 1, + 'NTG' => 1, + 'NTI' => 1, + 'NTP' => 1, + 'OR' => 1, + 'PT' => 1, + 'RT' => 1, + 'SQE' => 1, + 'SYN' => 1, + 'TR' => 1, + 'TRSYN' => 1, + 'TT' => 1, + 'WITHIN' => 1); + + function __construct($db) { + $this->db = $db; + } + + /** + * Perform a full text search query and return a result set. + * + * @param $term String: raw search term + * @return SqlSearchResultSet + */ + function searchText( $term ) { + if ($term == '') + return new SqlSearchResultSet(false, ''); + + $resultSet = $this->db->resultObject($this->db->query($this->getQuery($this->filter($term), true))); + return new SqlSearchResultSet($resultSet, $this->searchTerms); + } + + /** + * Perform a title-only search query and return a result set. + * + * @param $term String: raw search term + * @return SqlSearchResultSet + */ + function searchTitle($term) { + if ($term == '') + return new SqlSearchResultSet(false, ''); + + $resultSet = $this->db->resultObject($this->db->query($this->getQuery($this->filter($term), false))); + return new MySQLSearchResultSet($resultSet, $this->searchTerms); + } + + + /** + * Return a partial WHERE clause to exclude redirects, if so set + * @return String + */ + function queryRedirect() { + if ($this->showRedirects) { + return ''; + } else { + return 'AND page_is_redirect=0'; + } + } + + /** + * Return a partial WHERE clause to limit the search to the given namespaces + * @return String + */ + function queryNamespaces() { + if( is_null($this->namespaces) ) + return ''; + if ( !count( $this->namespaces ) ) { + $namespaces = '0'; + } else { + $namespaces = $this->db->makeList( $this->namespaces ); + } + return 'AND page_namespace IN (' . $namespaces . ')'; + } + + /** + * Return a LIMIT clause to limit results on the query. + * @return String + */ + function queryLimit($sql) { + return $this->db->limitResult($sql, $this->limit, $this->offset); + } + + /** + * Does not do anything for generic search engine + * subclasses may define this though + * @return String + */ + function queryRanking($filteredTerm, $fulltext) { + return ' ORDER BY score(1)'; + } + + /** + * Construct the full SQL query to do the search. + * The guts shoulds be constructed in queryMain() + * @param $filteredTerm String + * @param $fulltext Boolean + */ + function getQuery( $filteredTerm, $fulltext ) { + return $this->queryLimit($this->queryMain($filteredTerm, $fulltext) . ' ' . + $this->queryRedirect() . ' ' . + $this->queryNamespaces() . ' ' . + $this->queryRanking( $filteredTerm, $fulltext ) . ' '); + } + + + /** + * Picks which field to index on, depending on what type of query. + * @param $fulltext Boolean + * @return String + */ + function getIndexField($fulltext) { + return $fulltext ? 'si_text' : 'si_title'; + } + + /** + * Get the base part of the search query. + * + * @param $filteredTerm String + * @param $fulltext Boolean + * @return String + */ + function queryMain( $filteredTerm, $fulltext ) { + $match = $this->parseQuery($filteredTerm, $fulltext); + $page = $this->db->tableName('page'); + $searchindex = $this->db->tableName('searchindex'); + return 'SELECT page_id, page_namespace, page_title ' . + "FROM $page,$searchindex " . + 'WHERE page_id=si_page AND ' . $match; + } + + /** + * Parse a user input search string, and return an SQL fragment to be used + * as part of a WHERE clause + */ + function parseQuery($filteredText, $fulltext) { + global $wgContLang; + $lc = SearchEngine::legalSearchChars(); + $this->searchTerms = array(); + + # FIXME: This doesn't handle parenthetical expressions. + $m = array(); + $searchon = ''; + if (preg_match_all('/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/', + $filteredText, $m, PREG_SET_ORDER)) { + foreach($m as $terms) { + // Search terms in all variant forms, only + // apply on wiki with LanguageConverter + $temp_terms = $wgContLang->autoConvertToAllVariants( $terms[2] ); + if( is_array( $temp_terms )) { + $temp_terms = array_unique( array_values( $temp_terms )); + foreach( $temp_terms as $t ) { + $searchon .= ($terms[1] == '-' ? ' ~' : ' & ') . $this->escapeTerm( $t ); + } + } + else { + $searchon .= ($terms[1] == '-' ? ' ~' : ' & ') . $this->escapeTerm( $terms[2] ); + } + if (!empty($terms[3])) { + $regexp = preg_quote( $terms[3], '/' ); + if ($terms[4]) + $regexp .= "[0-9A-Za-z_]+"; + } else { + $regexp = preg_quote(str_replace('"', '', $terms[2]), '/'); + } + $this->searchTerms[] = $regexp; + } + } + + + $searchon = $this->db->addQuotes(ltrim($searchon, ' &')); + $field = $this->getIndexField($fulltext); + return " CONTAINS($field, $searchon, 1) > 0 "; + } + + private function escapeTerm($t) { + global $wgContLang; + $t = $wgContLang->normalizeForSearch($t); + $t = isset($this->reservedWords[strtoupper($t)]) ? '{'.$t.'}' : $t; + $t = preg_replace('/^"(.*)"$/', '($1)', $t); + $t = preg_replace('/([-&|])/', '\\\\$1', $t); + return $t; + } + /** + * Create or update the search index record for the given page. + * Title and text should be pre-processed. + * + * @param $id Integer + * @param $title String + * @param $text String + */ + function update($id, $title, $text) { + $dbw = wfGetDB(DB_MASTER); + $dbw->replace('searchindex', + array('si_page'), + array( + 'si_page' => $id, + 'si_title' => $title, + 'si_text' => $text + ), 'SearchOracle::update' ); + $dbw->query("CALL ctx_ddl.sync_index('si_text_idx')"); + $dbw->query("CALL ctx_ddl.sync_index('si_title_idx')"); + } + + /** + * Update a search index record's title only. + * Title should be pre-processed. + * + * @param int $id + * @param string $title + */ + function updateTitle($id, $title) { + $dbw = wfGetDB(DB_MASTER); + + $dbw->update('searchindex', + array('si_title' => $title), + array('si_page' => $id), + 'SearchOracle::updateTitle', + array()); + } + + + public static function legalSearchChars() { + return "\"" . parent::legalSearchChars(); + } +} diff --git a/includes/search/SearchPostgres.php b/includes/search/SearchPostgres.php new file mode 100644 index 00000000..0006fa82 --- /dev/null +++ b/includes/search/SearchPostgres.php @@ -0,0 +1,246 @@ +<?php +# Copyright (C) 2006-2007 Greg Sabino Mullane <greg@turnstep.com> +# http://www.mediawiki.org/ +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +# http://www.gnu.org/copyleft/gpl.html + +/** + * @file + * @ingroup Search + */ + +/** + * Search engine hook base class for Postgres + * @ingroup Search + */ +class SearchPostgres extends SearchEngine { + + function __construct( $db ) { + $this->db = $db; + } + + /** + * Perform a full text search query via tsearch2 and return a result set. + * Currently searches a page's current title (page.page_title) and + * latest revision article text (pagecontent.old_text) + * + * @param $term String: raw search term + * @return PostgresSearchResultSet + */ + function searchTitle( $term ) { + $q = $this->searchQuery( $term , 'titlevector', 'page_title' ); + $olderror = error_reporting(E_ERROR); + $resultSet = $this->db->resultObject( $this->db->query( $q, 'SearchPostgres', true ) ); + error_reporting($olderror); + if (!$resultSet) { + // Needed for "Query requires full scan, GIN doesn't support it" + return new SearchResultTooMany(); + } + return new PostgresSearchResultSet( $resultSet, $this->searchTerms ); + } + function searchText( $term ) { + $q = $this->searchQuery( $term, 'textvector', 'old_text' ); + $olderror = error_reporting(E_ERROR); + $resultSet = $this->db->resultObject( $this->db->query( $q, 'SearchPostgres', true ) ); + error_reporting($olderror); + if (!$resultSet) { + return new SearchResultTooMany(); + } + return new PostgresSearchResultSet( $resultSet, $this->searchTerms ); + } + + + /* + * Transform the user's search string into a better form for tsearch2 + * Returns an SQL fragment consisting of quoted text to search for. + */ + function parseQuery( $term ) { + + wfDebug( "parseQuery received: $term \n" ); + + ## No backslashes allowed + $term = preg_replace('/\\\/', '', $term); + + ## Collapse parens into nearby words: + $term = preg_replace('/\s*\(\s*/', ' (', $term); + $term = preg_replace('/\s*\)\s*/', ') ', $term); + + ## Treat colons as word separators: + $term = preg_replace('/:/', ' ', $term); + + $searchstring = ''; + $m = array(); + if( preg_match_all('/([-!]?)(\S+)\s*/', $term, $m, PREG_SET_ORDER ) ) { + foreach( $m as $terms ) { + if (strlen($terms[1])) { + $searchstring .= ' & !'; + } + if (strtolower($terms[2]) === 'and') { + $searchstring .= ' & '; + } + else if (strtolower($terms[2]) === 'or' or $terms[2] === '|') { + $searchstring .= ' | '; + } + else if (strtolower($terms[2]) === 'not') { + $searchstring .= ' & !'; + } + else { + $searchstring .= " & $terms[2]"; + } + } + } + + ## Strip out leading junk + $searchstring = preg_replace('/^[\s\&\|]+/', '', $searchstring); + + ## Remove any doubled-up operators + $searchstring = preg_replace('/([\!\&\|]) +(?:[\&\|] +)+/', "$1 ", $searchstring); + + ## Remove any non-spaced operators (e.g. "Zounds!") + $searchstring = preg_replace('/([^ ])[\!\&\|]/', "$1", $searchstring); + + ## Remove any trailing whitespace or operators + $searchstring = preg_replace('/[\s\!\&\|]+$/', '', $searchstring); + + ## Remove unnecessary quotes around everything + $searchstring = preg_replace('/^[\'"](.*)[\'"]$/', "$1", $searchstring); + + ## Quote the whole thing + $searchstring = $this->db->addQuotes($searchstring); + + wfDebug( "parseQuery returned: $searchstring \n" ); + + return $searchstring; + + } + + /** + * Construct the full SQL query to do the search. + * @param $filteredTerm String + * @param $fulltext String + */ + function searchQuery( $term, $fulltext, $colname ) { + global $wgDBversion; + + if ( !isset( $wgDBversion ) ) { + $this->db->getServerVersion(); + $wgDBversion = $this->db->numeric_version; + } + $prefix = $wgDBversion < 8.3 ? "'default'," : ''; + + # Get the SQL fragment for the given term + $searchstring = $this->parseQuery( $term ); + + ## We need a separate query here so gin does not complain about empty searches + $SQL = "SELECT to_tsquery($prefix $searchstring)"; + $res = $this->db->doQuery($SQL); + if (!$res) { + ## TODO: Better output (example to catch: one 'two) + die ("Sorry, that was not a valid search string. Please go back and try again"); + } + $top = pg_fetch_result($res,0,0); + + if ($top === "") { ## e.g. if only stopwords are used XXX return something better + $query = "SELECT page_id, page_namespace, page_title, 0 AS score ". + "FROM page p, revision r, pagecontent c WHERE p.page_latest = r.rev_id " . + "AND r.rev_text_id = c.old_id AND 1=0"; + } + else { + $m = array(); + if( preg_match_all("/'([^']+)'/", $top, $m, PREG_SET_ORDER ) ) { + foreach( $m as $terms ) { + $this->searchTerms[$terms[1]] = $terms[1]; + } + } + + $rankscore = $wgDBversion > 8.2 ? 5 : 1; + $rank = $wgDBversion < 8.3 ? 'rank' : 'ts_rank'; + $query = "SELECT page_id, page_namespace, page_title, ". + "$rank($fulltext, to_tsquery($prefix $searchstring), $rankscore) AS score ". + "FROM page p, revision r, pagecontent c WHERE p.page_latest = r.rev_id " . + "AND r.rev_text_id = c.old_id AND $fulltext @@ to_tsquery($prefix $searchstring)"; + } + + ## Redirects + if (! $this->showRedirects) + $query .= ' AND page_is_redirect = 0'; + + ## Namespaces - defaults to 0 + if( !is_null($this->namespaces) ){ // null -> search all + if ( count($this->namespaces) < 1) + $query .= ' AND page_namespace = 0'; + else { + $namespaces = $this->db->makeList( $this->namespaces ); + $query .= " AND page_namespace IN ($namespaces)"; + } + } + + $query .= " ORDER BY score DESC, page_id DESC"; + + $query .= $this->db->limitResult( '', $this->limit, $this->offset ); + + wfDebug( "searchQuery returned: $query \n" ); + + return $query; + } + + ## Most of the work of these two functions are done automatically via triggers + + function update( $pageid, $title, $text ) { + ## We don't want to index older revisions + $SQL = "UPDATE pagecontent SET textvector = NULL WHERE old_id IN ". + "(SELECT rev_text_id FROM revision WHERE rev_page = " . intval( $pageid ) . + " ORDER BY rev_text_id DESC OFFSET 1)"; + $this->db->doQuery($SQL); + return true; + } + + function updateTitle( $id, $title ) { + return true; + } + +} ## end of the SearchPostgres class + +/** + * @ingroup Search + */ +class PostgresSearchResult extends SearchResult { + function __construct( $row ) { + parent::__construct($row); + $this->score = $row->score; + } + function getScore() { + return $this->score; + } +} + +/** + * @ingroup Search + */ +class PostgresSearchResultSet extends SqlSearchResultSet { + function __construct( $resultSet, $terms ) { + parent::__construct( $resultSet, $terms ); + } + + function next() { + $row = $this->mResultSet->fetchObject(); + if( $row === false ) { + return false; + } else { + return new PostgresSearchResult( $row ); + } + } +} diff --git a/includes/search/SearchSqlite.php b/includes/search/SearchSqlite.php new file mode 100644 index 00000000..fb55efec --- /dev/null +++ b/includes/search/SearchSqlite.php @@ -0,0 +1,344 @@ +<?php +# SQLite search backend, based upon SearchMysql +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +# http://www.gnu.org/copyleft/gpl.html + +/** + * @file + * @ingroup Search + */ + +/** + * Search engine hook for SQLite + * @ingroup Search + */ +class SearchSqlite extends SearchEngine { + // Cached because SearchUpdate keeps recreating our class + private static $fulltextSupported = null; + + /** + * Creates an instance of this class + * @param $db DatabaseSqlite: database object + */ + function __construct( $db ) { + $this->db = $db; + } + + /** + * Whether fulltext search is supported by current schema + * @return Boolean + */ + function fulltextSearchSupported() { + if ( self::$fulltextSupported === null ) { + self::$fulltextSupported = $this->db->selectField( + 'updatelog', + 'ul_key', + array( 'ul_key' => 'fts3' ), + __METHOD__ ) !== false; + } + return self::$fulltextSupported; + } + + /** + * Parse the user's query and transform it into an SQL fragment which will + * become part of a WHERE clause + */ + function parseQuery( $filteredText, $fulltext ) { + global $wgContLang; + $lc = SearchEngine::legalSearchChars(); // Minus format chars + $searchon = ''; + $this->searchTerms = array(); + + $m = array(); + if( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/', + $filteredText, $m, PREG_SET_ORDER ) ) { + foreach( $m as $bits ) { + @list( /* all */, $modifier, $term, $nonQuoted, $wildcard ) = $bits; + + if( $nonQuoted != '' ) { + $term = $nonQuoted; + $quote = ''; + } else { + $term = str_replace( '"', '', $term ); + $quote = '"'; + } + + if( $searchon !== '' ) $searchon .= ' '; + + // Some languages such as Serbian store the input form in the search index, + // so we may need to search for matches in multiple writing system variants. + $convertedVariants = $wgContLang->autoConvertToAllVariants( $term ); + if( is_array( $convertedVariants ) ) { + $variants = array_unique( array_values( $convertedVariants ) ); + } else { + $variants = array( $term ); + } + + // The low-level search index does some processing on input to work + // around problems with minimum lengths and encoding in MySQL's + // fulltext engine. + // For Chinese this also inserts spaces between adjacent Han characters. + $strippedVariants = array_map( + array( $wgContLang, 'normalizeForSearch' ), + $variants ); + + // Some languages such as Chinese force all variants to a canonical + // form when stripping to the low-level search index, so to be sure + // let's check our variants list for unique items after stripping. + $strippedVariants = array_unique( $strippedVariants ); + + $searchon .= $modifier; + if( count( $strippedVariants) > 1 ) + $searchon .= '('; + foreach( $strippedVariants as $stripped ) { + if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) { + // Hack for Chinese: we need to toss in quotes for + // multiple-character phrases since normalizeForSearch() + // added spaces between them to make word breaks. + $stripped = '"' . trim( $stripped ) . '"'; + } + $searchon .= "$quote$stripped$quote$wildcard "; + } + if( count( $strippedVariants) > 1 ) + $searchon .= ')'; + + // Match individual terms or quoted phrase in result highlighting... + // Note that variants will be introduced in a later stage for highlighting! + $regexp = $this->regexTerm( $term, $wildcard ); + $this->searchTerms[] = $regexp; + } + + } else { + wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" ); + } + + $searchon = $this->db->strencode( $searchon ); + $field = $this->getIndexField( $fulltext ); + return " $field MATCH '$searchon' "; + } + + function regexTerm( $string, $wildcard ) { + global $wgContLang; + + $regex = preg_quote( $string, '/' ); + if( $wgContLang->hasWordBreaks() ) { + if( $wildcard ) { + // Don't cut off the final bit! + $regex = "\b$regex"; + } else { + $regex = "\b$regex\b"; + } + } else { + // For Chinese, words may legitimately abut other words in the text literal. + // Don't add \b boundary checks... note this could cause false positives + // for latin chars. + } + return $regex; + } + + public static function legalSearchChars() { + return "\"*" . parent::legalSearchChars(); + } + + /** + * Perform a full text search query and return a result set. + * + * @param $term String: raw search term + * @return SqliteSearchResultSet + */ + function searchText( $term ) { + return $this->searchInternal( $term, true ); + } + + /** + * Perform a title-only search query and return a result set. + * + * @param $term String: raw search term + * @return SqliteSearchResultSet + */ + function searchTitle( $term ) { + return $this->searchInternal( $term, false ); + } + + protected function searchInternal( $term, $fulltext ) { + global $wgCountTotalSearchHits, $wgContLang; + + if ( !$this->fulltextSearchSupported() ) { + return null; + } + + $filteredTerm = $this->filter( $wgContLang->lc( $term ) ); + $resultSet = $this->db->query( $this->getQuery( $filteredTerm, $fulltext ) ); + + $total = null; + if( $wgCountTotalSearchHits ) { + $totalResult = $this->db->query( $this->getCountQuery( $filteredTerm, $fulltext ) ); + $row = $totalResult->fetchObject(); + if( $row ) { + $total = intval( $row->c ); + } + $totalResult->free(); + } + + return new SqliteSearchResultSet( $resultSet, $this->searchTerms, $total ); + } + + + /** + * Return a partial WHERE clause to exclude redirects, if so set + * @return String + */ + function queryRedirect() { + if( $this->showRedirects ) { + return ''; + } else { + return 'AND page_is_redirect=0'; + } + } + + /** + * Return a partial WHERE clause to limit the search to the given namespaces + * @return String + */ + function queryNamespaces() { + if( is_null($this->namespaces) ) + return ''; # search all + if ( !count( $this->namespaces ) ) { + $namespaces = '0'; + } else { + $namespaces = $this->db->makeList( $this->namespaces ); + } + return 'AND page_namespace IN (' . $namespaces . ')'; + } + + /** + * Returns a query with limit for number of results set. + * @param $sql String: + * @return String + */ + function limitResult( $sql ) { + return $this->db->limitResult( $sql, $this->limit, $this->offset ); + } + + /** + * Construct the full SQL query to do the search. + * The guts shoulds be constructed in queryMain() + * @param $filteredTerm String + * @param $fulltext Boolean + */ + function getQuery( $filteredTerm, $fulltext ) { + return $this->limitResult( + $this->queryMain( $filteredTerm, $fulltext ) . ' ' . + $this->queryRedirect() . ' ' . + $this->queryNamespaces() + ); + } + + /** + * Picks which field to index on, depending on what type of query. + * @param $fulltext Boolean + * @return String + */ + function getIndexField( $fulltext ) { + return $fulltext ? 'si_text' : 'si_title'; + } + + /** + * Get the base part of the search query. + * + * @param $filteredTerm String + * @param $fulltext Boolean + * @return String + */ + function queryMain( $filteredTerm, $fulltext ) { + $match = $this->parseQuery( $filteredTerm, $fulltext ); + $page = $this->db->tableName( 'page' ); + $searchindex = $this->db->tableName( 'searchindex' ); + return "SELECT $searchindex.rowid, page_namespace, page_title " . + "FROM $page,$searchindex " . + "WHERE page_id=$searchindex.rowid AND $match"; + } + + function getCountQuery( $filteredTerm, $fulltext ) { + $match = $this->parseQuery( $filteredTerm, $fulltext ); + $page = $this->db->tableName( 'page' ); + $searchindex = $this->db->tableName( 'searchindex' ); + return "SELECT COUNT(*) AS c " . + "FROM $page,$searchindex " . + "WHERE page_id=$searchindex.rowid AND $match" . + $this->queryRedirect() . ' ' . + $this->queryNamespaces(); + } + + /** + * Create or update the search index record for the given page. + * Title and text should be pre-processed. + * + * @param $id Integer + * @param $title String + * @param $text String + */ + function update( $id, $title, $text ) { + if ( !$this->fulltextSearchSupported() ) { + return; + } + // @todo: find a method to do it in a single request, + // couldn't do it so far due to typelessness of FTS3 tables. + $dbw = wfGetDB( DB_MASTER ); + + $dbw->delete( 'searchindex', array( 'rowid' => $id ), __METHOD__ ); + + $dbw->insert( 'searchindex', + array( + 'rowid' => $id, + 'si_title' => $title, + 'si_text' => $text + ), __METHOD__ ); + } + + /** + * Update a search index record's title only. + * Title should be pre-processed. + * + * @param $id Integer + * @param $title String + */ + function updateTitle( $id, $title ) { + if ( !$this->fulltextSearchSupported() ) { + return; + } + $dbw = wfGetDB( DB_MASTER ); + + $dbw->update( 'searchindex', + array( 'si_title' => $title ), + array( 'rowid' => $id ), + __METHOD__ ); + } +} + +/** + * @ingroup Search + */ +class SqliteSearchResultSet extends SqlSearchResultSet { + function SqliteSearchResultSet( $resultSet, $terms, $totalHits=null ) { + parent::__construct( $resultSet, $terms ); + $this->mTotalHits = $totalHits; + } + + function getTotalHits() { + return $this->mTotalHits; + } +}
\ No newline at end of file diff --git a/includes/search/SearchUpdate.php b/includes/search/SearchUpdate.php new file mode 100644 index 00000000..e30c70e6 --- /dev/null +++ b/includes/search/SearchUpdate.php @@ -0,0 +1,113 @@ +<?php +/** + * See deferred.txt + * @ingroup Search + */ +class SearchUpdate { + + /* private */ var $mId = 0, $mNamespace, $mTitle, $mText; + /* private */ var $mTitleWords; + + function SearchUpdate( $id, $title, $text = false ) { + $nt = Title::newFromText( $title ); + if( $nt ) { + $this->mId = $id; + $this->mText = $text; + + $this->mNamespace = $nt->getNamespace(); + $this->mTitle = $nt->getText(); # Discard namespace + + $this->mTitleWords = $this->mTextWords = array(); + } else { + wfDebug( "SearchUpdate object created with invalid title '$title'\n" ); + } + } + + function doUpdate() { + global $wgContLang, $wgDisableSearchUpdate; + + if( $wgDisableSearchUpdate || !$this->mId ) { + return false; + } + $fname = 'SearchUpdate::doUpdate'; + wfProfileIn( $fname ); + + $search = SearchEngine::create(); + $lc = SearchEngine::legalSearchChars() . '&#;'; + + if( $this->mText === false ) { + $search->updateTitle($this->mId, + Title::indexTitle( $this->mNamespace, $this->mTitle )); + wfProfileOut( $fname ); + return; + } + + # Language-specific strip/conversion + $text = $wgContLang->normalizeForSearch( $this->mText ); + + wfProfileIn( $fname.'-regexps' ); + $text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/", + ' ', $wgContLang->lc( " " . $text . " " ) ); # Strip HTML markup + $text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD", + "\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings + + # Strip external URLs + $uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\xA0-\\xFF"; + $protos = "http|https|ftp|mailto|news|gopher"; + $pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/"; + $text = preg_replace( $pat, "\\1 \\3", $text ); + + $p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/"; + $p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/"; + $text = preg_replace( $p1, "\\1 ", $text ); + $text = preg_replace( $p2, "\\1 \\3 ", $text ); + + # Internal image links + $pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i"; + $text = preg_replace( $pat2, " \\1 \\3", $text ); + + $text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/", + "\\1\\2 \\2\\3", $text ); # Handle [[game]]s + + # Strip all remaining non-search characters + $text = preg_replace( "/[^{$lc}]+/", " ", $text ); + + # Handle 's, s' + # + # $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text ); + # $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text ); + # + # These tail-anchored regexps are insanely slow. The worst case comes + # when Japanese or Chinese text (ie, no word spacing) is written on + # a wiki configured for Western UTF-8 mode. The Unicode characters are + # expanded to hex codes and the "words" are very long paragraph-length + # monstrosities. On a large page the above regexps may take over 20 + # seconds *each* on a 1GHz-level processor. + # + # Following are reversed versions which are consistently fast + # (about 3 milliseconds on 1GHz-level processor). + # + $text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) ); + $text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) ); + + # Strip wiki '' and ''' + $text = preg_replace( "/''[']*/", " ", $text ); + wfProfileOut( "$fname-regexps" ); + + wfRunHooks( 'SearchUpdate', array( $this->mId, $this->mNamespace, $this->mTitle, &$text ) ); + + # Perform the actual update + $search->update($this->mId, Title::indexTitle( $this->mNamespace, $this->mTitle ), + $text); + + wfProfileOut( $fname ); + } +} + +/** + * Placeholder class + * @ingroup Search + */ +class SearchUpdateMyISAM extends SearchUpdate { + # Inherits everything +} |