summaryrefslogtreecommitdiff
path: root/includes/search
diff options
context:
space:
mode:
authorPierre Schmitz <pierre@archlinux.de>2010-07-28 11:52:48 +0200
committerPierre Schmitz <pierre@archlinux.de>2010-07-28 11:52:48 +0200
commit222b01f5169f1c7e69762e0e8904c24f78f71882 (patch)
tree8e932e12546bb991357ec48eb1638d1770be7a35 /includes/search
parent00ab76a6b686e98a914afc1975812d2b1aaa7016 (diff)
update to MediaWiki 1.16.0
Diffstat (limited to 'includes/search')
-rw-r--r--includes/search/SearchEngine.php1248
-rw-r--r--includes/search/SearchIBM_DB2.php224
-rw-r--r--includes/search/SearchMySQL.php412
-rw-r--r--includes/search/SearchMySQL4.php34
-rw-r--r--includes/search/SearchOracle.php268
-rw-r--r--includes/search/SearchPostgres.php246
-rw-r--r--includes/search/SearchSqlite.php344
-rw-r--r--includes/search/SearchUpdate.php113
8 files changed, 2889 insertions, 0 deletions
diff --git a/includes/search/SearchEngine.php b/includes/search/SearchEngine.php
new file mode 100644
index 00000000..f4ca700d
--- /dev/null
+++ b/includes/search/SearchEngine.php
@@ -0,0 +1,1248 @@
+<?php
+/**
+ * @defgroup Search Search
+ *
+ * @file
+ * @ingroup Search
+ */
+
+/**
+ * Contain a class for special pages
+ * @ingroup Search
+ */
+class SearchEngine {
+ var $limit = 10;
+ var $offset = 0;
+ var $prefix = '';
+ var $searchTerms = array();
+ var $namespaces = array( NS_MAIN );
+ var $showRedirects = false;
+
+ /**
+ * Perform a full text search query and return a result set.
+ * If full text searches are not supported or disabled, return null.
+ * STUB: this base implementation performs no search and always
+ * returns null; backends such as SearchIBM_DB2 override it.
+ *
+ * @param $term String: raw search term
+ * @return SearchResultSet or null
+ */
+ function searchText( $term ) {
+ return null;
+ }
+
+ /**
+ * Perform a title-only search query and return a result set.
+ * If title searches are not supported or disabled, return null.
+ * STUB: this base implementation performs no search and always
+ * returns null; backends such as SearchIBM_DB2 override it.
+ *
+ * @param $term String: raw search term
+ * @return SearchResultSet or null
+ */
+ function searchTitle( $term ) {
+ return null;
+ }
+
+ /**
+ * If this search backend can list/unlist redirects.
+ * The base implementation always answers true.
+ *
+ * @return Boolean
+ */
+ function acceptListRedirects() {
+ return true;
+ }
+
+ /**
+ * When overridden in a derived class, performs database-specific
+ * conversions on text to be used for searching or updating the search
+ * index.
+ * The default implementation does nothing (simply returns $string).
+ *
+ * @param $string String: text to process
+ * @return String: the input, unchanged in this base class
+ */
+ public function normalizeText( $string ) {
+ return $string;
+ }
+
+ /**
+ * Transform search term in cases when parts of the query came as different GET params (when supported)
+ * e.g. for prefix queries: search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive
+ *
+ * The base implementation performs no transformation.
+ *
+ * @param $term String: search term
+ * @return String: the term, unchanged in this base class
+ */
+ function transformSearchTerm( $term ) {
+ return $term;
+ }
+
+ /**
+ * If an exact title match can be found, or a very slightly close match,
+ * return the title. If no match, returns NULL.
+ *
+ * The lookup itself is done by getNearMatchInternal(); afterwards the
+ * 'SearchGetNearMatchComplete' hook is run with $title passed by
+ * reference, so a hook handler may replace the result.
+ *
+ * @param $searchterm String
+ * @return Title or null
+ */
+ public static function getNearMatch( $searchterm ) {
+ $title = self::getNearMatchInternal( $searchterm );
+
+ wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$title ) );
+ return $title;
+ }
+
+ /**
+  * Really find the title match.
+  *
+  * Tries the search term and, when the content language has variants,
+  * each of its variant spellings. For every candidate the exact title is
+  * probed first, then several re-capitalised forms, then the
+  * 'SearchGetNearMatch' hook. If nothing matched, a few special cases
+  * based on the namespace of the original term are applied.
+  *
+  * @param $searchterm String
+  * @return Title or null
+  */
+ private static function getNearMatchInternal( $searchterm ) {
+     global $wgContLang;
+
+     $candidates = array( $searchterm );
+     if ( $wgContLang->hasVariants() ) {
+         $candidates = array_merge( $candidates, $wgContLang->convertLinkToAllVariants( $searchterm ) );
+     }
+
+     // A handler returning false short-circuits the lookup with its own result
+     if ( !wfRunHooks( 'SearchGetNearMatchBefore', array( $candidates, &$titleResult ) ) ) {
+         return $titleResult;
+     }
+
+     // Content-language methods producing the re-capitalised forms to probe,
+     // in order: all lower case (i.e. first letter capitalized), capitalized
+     // words, all upper case, Word-Caps-Breaking-At-Word-Breaks (for
+     // hyphenated names etc)
+     $caseMethods = array( 'lc', 'ucwords', 'uc', 'ucwordbreaks' );
+
+     foreach ( $candidates as $term ) {
+         # Exact match? No need to look further.
+         $title = Title::newFromText( $term );
+         if ( is_null( $title ) ) {
+             return null;
+         }
+
+         if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal() || $title->exists() ) {
+             return $title;
+         }
+
+         # See if it still otherwise has content in some sane sense
+         $article = MediaWiki::articleFromTitle( $title );
+         if ( $article->hasViewableContent() ) {
+             return $title;
+         }
+
+         foreach ( $caseMethods as $method ) {
+             $title = Title::newFromText( $wgContLang->$method( $term ) );
+             if ( $title && $title->exists() ) {
+                 return $title;
+             }
+         }
+
+         // Give hooks a chance at better match variants
+         $title = null;
+         if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
+             return $title;
+         }
+     }
+
+     $title = Title::newFromText( $searchterm );
+     $namespace = $title->getNamespace();
+
+     # Entering an IP address goes to the contributions page
+     $isIpUserPage = ( $namespace == NS_USER && User::isIP( $title->getText() ) );
+     if ( $isIpUserPage || User::isIP( trim( $searchterm ) ) ) {
+         return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
+     }
+
+     # Entering a user goes to the user page whether it's there or not
+     if ( $namespace == NS_USER ) {
+         return $title;
+     }
+
+     # Go to images that exist even if there's no local page.
+     # There may have been a funny upload, or it may be on a shared
+     # file repository such as Wikimedia Commons.
+     if ( $namespace == NS_FILE && wfFindFile( $title ) ) {
+         return $title;
+     }
+
+     # MediaWiki namespace? Page may be "implied" if not customized.
+     # Just return it, with caps forced as the message system likes it.
+     if ( $namespace == NS_MEDIAWIKI ) {
+         return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
+     }
+
+     # Quoted term? Try without the quotes...
+     $quoted = array();
+     if ( preg_match( '/^"([^"]+)"$/', $searchterm, $quoted ) ) {
+         return self::getNearMatch( $quoted[1] );
+     }
+
+     return null;
+ }
+
+ /**
+ * Characters considered legal in a search term, written as the inside
+ * of a regular expression character class. filter() interpolates the
+ * value into "/[^...]/" to blank out every other character.
+ *
+ * @return String
+ */
+ public static function legalSearchChars() {
+ return "A-Za-z_'.0-9\\x80-\\xFF\\-";
+ }
+
+ /**
+ * Set the maximum number of results to return
+ * and how many to skip before returning the first.
+ * Both values are coerced with intval() before being stored.
+ *
+ * @param $limit Integer
+ * @param $offset Integer
+ */
+ function setLimitOffset( $limit, $offset = 0 ) {
+ $this->limit = intval( $limit );
+ $this->offset = intval( $offset );
+ }
+
+ /**
+ * Set which namespaces the search should include.
+ * Give an array of namespace index numbers.
+ * The value is stored as given, without validation; note that
+ * replacePrefixes() stores null here for the "search everything" prefix.
+ *
+ * @param $namespaces Array
+ */
+ function setNamespaces( $namespaces ) {
+ $this->namespaces = $namespaces;
+ }
+
+ /**
+  * Parse some common prefixes: all (search everything)
+  * or namespace names
+  *
+  * When the query starts with the localized "all:" keyword,
+  * $this->namespaces is set to null; when it starts with a known
+  * namespace name followed by a colon, $this->namespaces is narrowed to
+  * that namespace. In both cases the prefix is stripped from the returned
+  * query, unless nothing but whitespace would remain. The
+  * 'SearchEngineReplacePrefixesComplete' hook may further alter the
+  * result.
+  *
+  * @param $query String
+  * @return String: the query with a recognised prefix removed
+  */
+ function replacePrefixes( $query ){
+     global $wgContLang;
+
+     $parsed = $query;
+     $colonPos = strpos( $query, ':' );
+
+     // Without a colon there cannot be a prefix: go straight to the hook
+     if ( $colonPos !== false ) {
+         $allkeyword = wfMsgForContent('searchall').":";
+         $allLength = strlen( $allkeyword );
+
+         if ( strncmp( $query, $allkeyword, $allLength ) == 0 ) {
+             $this->namespaces = null;
+             $parsed = substr( $query, $allLength );
+         } else {
+             $prefix = substr( $query, 0, $colonPos );
+             $index = $wgContLang->getNsIndex( $prefix );
+             if ( $index !== false ) {
+                 $this->namespaces = array( $index );
+                 $parsed = substr( $query, $colonPos + 1 );
+             }
+         }
+
+         if ( trim( $parsed ) == '' ) {
+             $parsed = $query; // prefix was the whole query
+         }
+     }
+
+     wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );
+
+     return $parsed;
+ }
+
+ /**
+  * Make a list of searchable namespaces and their canonical names.
+  *
+  * Every namespace of the content language with an index of NS_MAIN or
+  * above is included; the 'SearchableNamespaces' hook may then alter the
+  * list.
+  *
+  * @return Array: namespace index => name
+  */
+ public static function searchableNamespaces() {
+     global $wgContLang;
+
+     $searchable = array();
+     foreach ( $wgContLang->getNamespaces() as $index => $name ) {
+         if ( $index < NS_MAIN ) {
+             continue;
+         }
+         $searchable[$index] = $name;
+     }
+
+     wfRunHooks( 'SearchableNamespaces', array( &$searchable ) );
+     return $searchable;
+ }
+
+ /**
+  * Extract default namespaces to search from the given user's
+  * settings, returning a list of index numbers.
+  *
+  * The 'searcheverything' option wins over the per-namespace settings;
+  * it is only consulted for logged-in users when
+  * $wgSearchEverythingOnlyLoggedIn is set.
+  *
+  * @param $user User
+  * @return Array
+  */
+ public static function userNamespaces( $user ) {
+     global $wgSearchEverythingOnlyLoggedIn;
+
+     // get search everything preference, that can be set to be read for logged-in users
+     $searchEverything = false;
+     if ( !$wgSearchEverythingOnlyLoggedIn || $user->isLoggedIn() ) {
+         $searchEverything = $user->getOption('searcheverything');
+     }
+
+     // searcheverything overrides other options
+     if ( $searchEverything ) {
+         return array_keys( SearchEngine::searchableNamespaces() );
+     }
+
+     $preferred = Preferences::loadOldSearchNs( $user );
+     $searchable = array_keys( SearchEngine::searchableNamespaces() );
+
+     // Filter out anything that is not (or no longer) searchable
+     return array_intersect( $preferred, $searchable );
+ }
+
+ /**
+ * Find snippet highlight settings for a given user
+ *
+ * The per-user 'contextlines' / 'contextchars' options are no longer
+ * read (see the commented-out lines below); the same fixed values are
+ * returned for every user.
+ *
+ * @param $user User: currently unused
+ * @return Array contextlines, contextchars
+ */
+ public static function userHighlightPrefs( &$user ){
+ //$contextlines = $user->getOption( 'contextlines', 5 );
+ //$contextchars = $user->getOption( 'contextchars', 50 );
+ $contextlines = 2; // Hardcode this. Old defaults sucked. :)
+ $contextchars = 75; // same as above.... :P
+ return array($contextlines, $contextchars);
+ }
+
+ /**
+ * An array of namespaces indexes to be searched by default
+ *
+ * These are the keys of $wgNamespacesToBeSearchedDefault whose value
+ * compares equal to true (array_keys() with a search value, non-strict).
+ *
+ * @return Array
+ */
+ public static function defaultNamespaces(){
+ global $wgNamespacesToBeSearchedDefault;
+
+ return array_keys($wgNamespacesToBeSearchedDefault, true);
+ }
+
+ /**
+  * Get a list of namespace names useful for showing in tooltips
+  * and preferences
+  *
+  * Each index is converted with the content language's
+  * getFormattedNsText(); an empty name is replaced by the
+  * 'blanknamespace' message.
+  *
+  * @param $namespaces Array
+  * @return Array: display names, keyed like $namespaces
+  */
+ public static function namespacesAsText( $namespaces ){
+     global $wgContLang;
+
+     $names = array_map( array($wgContLang,'getFormattedNsText'), $namespaces );
+     foreach ( array_keys( $names ) as $key ) {
+         if ( empty( $names[$key] ) ) {
+             $names[$key] = wfMsg( 'blanknamespace' );
+         }
+     }
+     return $names;
+ }
+
+ /**
+ * Return the help namespaces to be shown on Special:Search
+ *
+ * These are the keys of $wgNamespacesToBeSearchedHelp whose value
+ * compares equal to true (array_keys() with a search value, non-strict).
+ *
+ * @return Array
+ */
+ public static function helpNamespaces() {
+ global $wgNamespacesToBeSearchedHelp;
+
+ return array_keys( $wgNamespacesToBeSearchedHelp, true );
+ }
+
+ /**
+ * Return a 'cleaned up' search string
+ *
+ * Every character outside legalSearchChars() is replaced by a space and
+ * the result is trimmed.
+ *
+ * @param $text String
+ * @return String
+ */
+ function filter( $text ) {
+ $lc = $this->legalSearchChars(); // called through $this so that a subclass's override, if any, is the one used
+ return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
+ }
+ /**
+  * Load up the appropriate search engine class for the currently
+  * active database backend, and return a configured instance.
+  *
+  * $wgSearchType, when set, names the class to instantiate; otherwise
+  * the database connection is asked for its search engine class. The
+  * new engine is handed the connection and starts with limit and offset
+  * both set to 0.
+  *
+  * @return SearchEngine
+  */
+ public static function create() {
+     global $wgSearchType;
+
+     $dbr = wfGetDB( DB_SLAVE );
+     $class = $wgSearchType ? $wgSearchType : $dbr->getSearchEngine();
+
+     $engine = new $class( $dbr );
+     $engine->setLimitOffset(0,0);
+     return $engine;
+ }
+
+ /**
+ * Create or update the search index record for the given page.
+ * Title and text should be pre-processed.
+ * STUB: the base implementation does nothing and returns nothing.
+ *
+ * @param $id Integer
+ * @param $title String
+ * @param $text String
+ */
+ function update( $id, $title, $text ) {
+ // no-op
+ }
+
+ /**
+ * Update a search index record's title only.
+ * Title should be pre-processed.
+ * STUB: the base implementation does nothing and returns nothing.
+ *
+ * @param $id Integer
+ * @param $title String
+ */
+ function updateTitle( $id, $title ) {
+ // no-op
+ }
+
+ /**
+  * Get OpenSearch suggestion template
+  *
+  * $wgOpenSearchTemplate is returned when configured. Otherwise an
+  * api.php URL is built from $wgServer and $wgScriptPath, listing the
+  * default search namespaces separated by '|' ("0" when that list is
+  * empty).
+  *
+  * @return String
+  */
+ public static function getOpenSearchTemplate() {
+     global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;
+
+     if ( $wgOpenSearchTemplate ) {
+         return $wgOpenSearchTemplate;
+     }
+
+     $namespaceList = implode( '|', SearchEngine::defaultNamespaces() );
+     if ( !$namespaceList ) {
+         $namespaceList = "0";
+     }
+     return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace=' . $namespaceList;
+ }
+
+ /**
+  * Get internal MediaWiki Suggest template
+  *
+  * $wgMWSuggestTemplate is returned when configured; otherwise an
+  * api.php URL built from $wgServer and $wgScriptPath, with the
+  * {searchTerms} and {namespaces} placeholders left in place.
+  *
+  * @return String
+  */
+ public static function getMWSuggestTemplate() {
+     global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;
+
+     if ( !$wgMWSuggestTemplate ) {
+         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest';
+     }
+     return $wgMWSuggestTemplate;
+ }
+}
+
+/**
+ * Base class for a set of search results. Every method here is a stub
+ * that reports an empty set (no rows, no hits, no suggestion, no
+ * interwiki results); SqlSearchResultSet overrides the ones it supports.
+ *
+ * @ingroup Search
+ */
+class SearchResultSet {
+ /**
+ * Fetch an array of regular expression fragments for matching
+ * the search terms as parsed by this engine in a text extract.
+ * STUB
+ *
+ * @return Array
+ */
+ function termMatches() {
+ return array();
+ }
+
+ function numRows() {
+ return 0; // STUB: the base result set never holds any rows
+ }
+
+ /**
+ * Return true if results are included in this result set.
+ * STUB
+ *
+ * @return Boolean
+ */
+ function hasResults() {
+ return false;
+ }
+
+ /**
+ * Some search modes return a total hit count for the query
+ * in the entire article database. This may include pages
+ * in namespaces that would not be matched on the given
+ * settings.
+ *
+ * Return null if no total hits number is supported.
+ *
+ * @return Integer
+ */
+ function getTotalHits() {
+ return null;
+ }
+
+ /**
+ * Some search modes return a suggested alternate term if there are
+ * no exact hits. Returns true if there is one on this set.
+ *
+ * @return Boolean
+ */
+ function hasSuggestion() {
+ return false;
+ }
+
+ /**
+ * @return String: suggested query, null if none
+ */
+ function getSuggestionQuery(){
+ return null;
+ }
+
+ /**
+ * @return String: HTML highlighted suggested query, '' if none
+ */
+ function getSuggestionSnippet(){
+ return '';
+ }
+
+ /**
+ * Return information about how and from where the results were fetched,
+ * should be useful for diagnostics and debugging
+ *
+ * @return String: null in this base class
+ */
+ function getInfo() {
+ return null;
+ }
+
+ /**
+ * Return a result set of hits on other (multiple) wikis associated with this one
+ *
+ * @return SearchResultSet: null in this base class
+ */
+ function getInterwikiResults() {
+ return null;
+ }
+
+ /**
+ * Check if there are results on other wikis
+ *
+ * Goes through getInterwikiResults(), so a subclass only needs to
+ * override that method.
+ *
+ * @return Boolean
+ */
+ function hasInterwikiResults() {
+ return $this->getInterwikiResults() != null;
+ }
+
+
+ /**
+ * Fetches next search result, or false.
+ * STUB
+ *
+ * @return SearchResult
+ */
+ function next() {
+ return false;
+ }
+
+ /**
+ * Frees the result set, if applicable.
+ */
+ function free() {
+ // ...
+ }
+}
+
+/**
+ * This class is used for different SQL-based search engines shipped with MediaWiki
+ *
+ * Wraps a database result object together with the term patterns used
+ * for highlighting. The wrapped result may be false; every method copes
+ * with that instead of calling a method on a non-object.
+ */
+class SqlSearchResultSet extends SearchResultSet {
+    /** Database result object, or false when no result is available */
+    var $mResultSet;
+    /** Array of regular expression fragments returned by termMatches() */
+    var $mTerms;
+
+    /**
+     * @param $resultSet Mixed: database result object, or false
+     * @param $terms Array: regular expression fragments for the search terms
+     */
+    function __construct( $resultSet, $terms ) {
+        $this->mResultSet = $resultSet;
+        $this->mTerms = $terms;
+    }
+
+    /**
+     * @return Array: the term patterns given to the constructor
+     */
+    function termMatches() {
+        return $this->mTerms;
+    }
+
+    /**
+     * @return Integer: number of rows, 0 when there is no result object
+     */
+    function numRows() {
+        if ( $this->mResultSet === false ) {
+            return 0;
+        }
+        return $this->mResultSet->numRows();
+    }
+
+    /**
+     * Fetches next search result, or false.
+     *
+     * @return SearchResult
+     */
+    function next() {
+        if ( $this->mResultSet === false ) {
+            return false;
+        }
+
+        $row = $this->mResultSet->fetchObject();
+        if ( $row === false ) {
+            return false;
+        }
+        return new SearchResult( $row );
+    }
+
+    /**
+     * Frees the underlying result object, if there is one.
+     */
+    function free() {
+        if ( $this->mResultSet === false ) {
+            return;
+        }
+        $this->mResultSet->free();
+    }
+}
+
+/**
+ * Marker class without any members.
+ * NOTE(review): presumably handed back in place of a result set when a
+ * backend gives up on a query with too many matches; confirm against
+ * the backends that use it.
+ *
+ * @ingroup Search
+ */
+class SearchResultTooMany {
+ ## Some search engines may bail out if too many matches are found
+}
+
+
+/**
+ * A single search hit: the page title plus its revision (and, for pages
+ * in the file namespace, the file object). The page text is loaded
+ * lazily, the first time a snippet, word count or byte size is requested.
+ *
+ * @todo Fixme: This class is horribly factored. It would probably be better to
+ * have a useful base class to which you pass some standard information, then
+ * let the fancy self-highlighters extend that.
+ * @ingroup Search
+ */
+class SearchResult {
+    /** Title of the hit, built from the result row */
+    var $mTitle = null;
+    /** Revision looked up from the title, null if none was found */
+    var $mRevision = null;
+    /** File object for hits in NS_FILE, null otherwise */
+    var $mImage = null;
+    /** Cached page text; stays null until initText() has run */
+    var $mText = null;
+
+    /**
+     * @param $row Object: result row providing page_namespace and page_title
+     */
+    function __construct( $row ) {
+        $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
+        if ( !is_null( $this->mTitle ) ) {
+            $this->mRevision = Revision::newFromTitle( $this->mTitle );
+            if ( $this->mTitle->getNamespace() === NS_FILE ) {
+                $this->mImage = wfFindFile( $this->mTitle );
+            }
+        }
+    }
+
+    /**
+     * Check if this result points to an invalid title
+     *
+     * @return Boolean
+     */
+    function isBrokenTitle(){
+        return is_null( $this->mTitle );
+    }
+
+    /**
+     * Check if target page is missing, happens when index is out of date
+     *
+     * @return Boolean
+     */
+    function isMissingRevision(){
+        return !$this->mRevision && !$this->mImage;
+    }
+
+    /**
+     * @return Title
+     */
+    function getTitle() {
+        return $this->mTitle;
+    }
+
+    /**
+     * @return Double or null if not supported
+     */
+    function getScore() {
+        return null;
+    }
+
+    /**
+     * Lazy initialization of article text from DB
+     *
+     * Without a revision the text is set to the empty string.
+     */
+    protected function initText(){
+        if ( !isset( $this->mText ) ) {
+            if ( $this->mRevision != null ) {
+                $this->mText = $this->mRevision->getText();
+            } else {
+                // TODO: can we fetch raw wikitext for commons images?
+                $this->mText = '';
+            }
+        }
+    }
+
+    /**
+     * @param $terms Array: terms to highlight
+     * @return String: highlighted text snippet, null (and not '') if not supported
+     */
+    function getTextSnippet($terms){
+        global $wgUser, $wgAdvancedSearchHighlighting;
+
+        $this->initText();
+        list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs( $wgUser );
+        $highlighter = new SearchHighlighter();
+        if ( $wgAdvancedSearchHighlighting ) {
+            return $highlighter->highlightText( $this->mText, $terms, $contextlines, $contextchars );
+        }
+        return $highlighter->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
+    }
+
+    /**
+     * @param $terms Array: terms to highlight
+     * @return String: highlighted title, '' if not supported
+     */
+    function getTitleSnippet($terms){
+        return '';
+    }
+
+    /**
+     * @param $terms Array: terms to highlight
+     * @return String: highlighted redirect name (redirect to this page), '' if none or not supported
+     */
+    function getRedirectSnippet($terms){
+        return '';
+    }
+
+    /**
+     * @return Title object for the redirect to this page, null if none or not supported
+     */
+    function getRedirectTitle(){
+        return null;
+    }
+
+    /**
+     * @return String: highlighted relevant section name, '' if none or not supported
+     */
+    function getSectionSnippet(){
+        return '';
+    }
+
+    /**
+     * @return Title object (pagename+fragment) for the section, null if none or not supported
+     */
+    function getSectionTitle(){
+        return null;
+    }
+
+    /**
+     * @return String: timestamp of the revision, else of the file, else ''
+     */
+    function getTimestamp(){
+        if ( $this->mRevision ) {
+            return $this->mRevision->getTimestamp();
+        }
+        if ( $this->mImage ) {
+            return $this->mImage->getTimestamp();
+        }
+        return '';
+    }
+
+    /**
+     * @return Integer: number of words
+     */
+    function getWordCount(){
+        $this->initText();
+        return str_word_count( $this->mText );
+    }
+
+    /**
+     * @return Integer: size in bytes
+     */
+    function getByteSize(){
+        $this->initText();
+        return strlen( $this->mText );
+    }
+
+    /**
+     * @return Boolean if hit has related articles
+     */
+    function hasRelated(){
+        return false;
+    }
+
+    /**
+     * @return String: interwiki prefix of the title (return iw even if title is broken)
+     */
+    function getInterwikiPrefix(){
+        return '';
+    }
+}
+
+/**
+ * Highlight bits of wikitext
+ *
+ * @ingroup Search
+ */
+class SearchHighlighter {
+ var $mCleanWikitext = true;
+
+ /**
+  * Declared as __construct(), like the other constructors in this file:
+  * a method merely named after its class is no longer treated as the
+  * constructor by PHP 8, which would leave $cleanupWikitext ignored.
+  *
+  * @param $cleanupWikitext Boolean: whether splitAndAdd() runs
+  *  removeWiki() over the text before splitting it into lines
+  */
+ function __construct($cleanupWikitext = true){
+     $this->mCleanWikitext = $cleanupWikitext;
+ }
+
+ /**
+  * Default implementation of wikitext highlighting
+  *
+  * Works in three phases:
+  *  1. split the text into plain-text lines and "other" lines
+  *     (templates, image links, tables and, with the Cite extension,
+  *     references), numbering them in document order;
+  *  2. pick the lines to show: the beginning of the article if it
+  *     contains all terms, otherwise lines matching the whole phrase or
+  *     any term, then pad the selection towards a constant total size;
+  *  3. join the selected lines and wrap each term occurrence in a
+  *     <span class='searchmatch'>.
+  *
+  * NOTE(review): the joined extract is not passed through
+  * htmlspecialchars() (the call is commented out below); confirm that
+  * callers treat the return value accordingly.
+  *
+  * @param $text String
+  * @param $terms Array: terms to highlight (unescaped)
+  * @param $contextlines Integer
+  * @param $contextchars Integer
+  * @return String
+  */
+ public function highlightText( $text, $terms, $contextlines, $contextchars ) {
+     global $wgContLang;
+     global $wgSearchHighlightBoundaries;
+     $fname = __METHOD__;
+
+     if($text == '')
+         return '';
+
+     // split text into text + templates/links/tables
+     $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
+     // first capture group is for detecting nested templates/links/tables/references
+     $endPatterns = array(
+         1 => '/(\{\{)|(\}\})/', // template
+         2 => '/(\[\[)|(\]\])/', // image
+         3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table
+
+     // FIXME: this should prolly be a hook or something
+     if(function_exists('wfCite')){
+         $spat .= '|(<ref>)'; // references via cite extension
+         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
+     }
+     $spat .= '/';
+     $textExt = array(); // text extracts
+     $otherExt = array(); // other extracts
+     wfProfileIn( "$fname-split" );
+     $start = 0;
+     $textLen = strlen($text);
+     $count = 0; // sequence number to maintain ordering
+     while( $start < $textLen ){
+         // find start of template/image/table
+         if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
+             $epat = '';
+             foreach($matches as $key => $val){
+                 if($key > 0 && $val[1] != -1){
+                     if($key == 2){
+                         // see if this is an image link
+                         $ns = substr($val[0],2,-1);
+                         if( $wgContLang->getNsIndex($ns) != NS_FILE )
+                             break;
+
+                     }
+                     $epat = $endPatterns[$key];
+                     $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
+                     $start = $val[1];
+                     break;
+                 }
+             }
+             if( $epat ){
+                 // find end (and detect any nested elements)
+                 $level = 0;
+                 $offset = $start + 1;
+                 $found = false;
+                 while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
+                     if( array_key_exists(2,$endMatches) ){
+                         // found end
+                         if($level == 0){
+                             $len = strlen($endMatches[2][0]);
+                             $off = $endMatches[2][1];
+                             $this->splitAndAdd( $otherExt, $count,
+                                 substr( $text, $start, $off + $len - $start ) );
+                             $start = $off + $len;
+                             $found = true;
+                             break;
+                         } else{
+                             // end of nested element
+                             $level -= 1;
+                         }
+                     } else{
+                         // nested
+                         $level += 1;
+                     }
+                     $offset = $endMatches[0][1] + strlen($endMatches[0][0]);
+                 }
+                 if( ! $found ){
+                     // couldn't find appropriate closing tag, skip
+                     $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
+                     $start += strlen($matches[0][0]);
+                 }
+                 continue;
+             }
+         }
+         // else: add as text extract
+         $this->splitAndAdd( $textExt, $count, substr($text,$start) );
+         break;
+     }
+
+     $all = $textExt + $otherExt; // these have disjunct key sets
+
+     wfProfileOut( "$fname-split" );
+
+     // prepare regexps
+     foreach( $terms as $index => $term ) {
+         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
+         if(preg_match('/[\x80-\xff]/', $term) ){
+             $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);
+         } else {
+             $terms[$index] = $term;
+         }
+     }
+     $anyterm = implode( '|', $terms );
+     $phrase = implode("$wgSearchHighlightBoundaries+", $terms );
+
+     // FIXME: a hack to scale contextchars, a correct solution
+     // would be to have contextchars actually be char and not byte
+     // length, and do proper utf-8 substrings and lengths everywhere,
+     // but PHP is making that very hard and unclean to implement :(
+     // With no terms (or only empty ones) $anyterm is '' and there is
+     // nothing to derive a ratio from: keep contextchars unscaled rather
+     // than dividing by zero.
+     $anytermChars = mb_strlen($anyterm);
+     $scale = $anytermChars > 0 ? strlen($anyterm) / $anytermChars : 1;
+     $contextchars = intval( $contextchars * $scale );
+
+     $patPre = "(^|$wgSearchHighlightBoundaries)";
+     $patPost = "($wgSearchHighlightBoundaries|$)";
+
+     $pat1 = "/(".$phrase.")/ui";
+     $pat2 = "/$patPre(".$anyterm.")$patPost/ui";
+
+     wfProfileIn( "$fname-extract" );
+
+     $left = $contextlines;
+
+     $snippets = array();
+     $offsets = array();
+
+     // show beginning only if it contains all words
+     $first = 0;
+     $firstText = '';
+     foreach($textExt as $index => $line){
+         if(strlen($line)>0 && $line[0] != ';' && $line[0] != ':'){
+             $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
+             $first = $index;
+             break;
+         }
+     }
+     if( $firstText ){
+         $succ = true;
+         // check if first text contains all terms
+         foreach($terms as $term){
+             if( ! preg_match("/$patPre".$term."$patPost/ui", $firstText) ){
+                 $succ = false;
+                 break;
+             }
+         }
+         if( $succ ){
+             $snippets[$first] = $firstText;
+             $offsets[$first] = 0;
+         }
+     }
+     if( ! $snippets ) {
+         // match whole query on text
+         $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
+         // match whole query on templates/tables/images
+         $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
+         // match any words on text
+         $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
+         // match any words on templates/tables/images
+         $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);
+
+         ksort($snippets);
+     }
+
+     // add extra chars to each snippet to make snippets constant size
+     $extended = array();
+     if( count( $snippets ) == 0){
+         // couldn't find the target words, just show beginning of article
+         $targetchars = $contextchars * $contextlines;
+         $snippets[$first] = '';
+         $offsets[$first] = 0;
+     } else{
+         // if begin of the article contains the whole phrase, show only that !!
+         if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
+             && $offsets[$first] < $contextchars * 2 ){
+             $snippets = array ($first => $snippets[$first]);
+         }
+
+         // calc by how much to extend existing snippets
+         $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
+     }
+
+     foreach($snippets as $index => $line){
+         $extended[$index] = $line;
+         $len = strlen($line);
+         if( $len < $targetchars - 20 ){
+             // complete this line
+             if($len < strlen( $all[$index] )){
+                 $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
+                 $len = strlen( $extended[$index] );
+             }
+
+             // add more lines
+             $add = $index + 1;
+             while( $len < $targetchars - 20
+                 && array_key_exists($add,$all)
+                 && !array_key_exists($add,$snippets) ){
+                 $offsets[$add] = 0;
+                 $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
+                 $extended[$add] = $tt;
+                 $len += strlen( $tt );
+                 $add++;
+             }
+         }
+     }
+
+     //$snippets = array_map('htmlspecialchars', $extended);
+     $snippets = $extended;
+     $last = -1;
+     $extract = '';
+     foreach($snippets as $index => $line){
+         if($last == -1)
+             $extract .= $line; // first line
+         elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
+             $extract .= " ".$line; // continuous lines
+         else
+             $extract .= '<b> ... </b>' . $line;
+
+         $last = $index;
+     }
+     if( $extract )
+         $extract .= '<b> ... </b>';
+
+     $processed = array();
+     foreach($terms as $term){
+         if( ! isset($processed[$term]) ){
+             $pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word
+             $extract = preg_replace( $pat3,
+                 "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
+             $processed[$term] = true;
+         }
+     }
+
+     wfProfileOut( "$fname-extract" );
+
+     return $extract;
+ }
+
+ /**
+  * Split text into lines and add it to extracts array
+  *
+  * The text is first run through removeWiki() when mCleanWikitext is
+  * set. Each line is trimmed; lines that are empty after trimming are
+  * skipped, every other line is stored under the next sequence number.
+  *
+  * @param $extracts Array: index -> $line
+  * @param $count Integer: (in/out) next sequence number to use
+  * @param $text String
+  */
+ function splitAndAdd(&$extracts, &$count, $text){
+     $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );
+     foreach($split as $line){
+         $tt = trim($line);
+         // compare against '' explicitly: a plain truthiness test would
+         // also throw away a line consisting of just "0"
+         if( $tt !== '' )
+             $extracts[$count++] = $tt;
+     }
+ }
+
+ /**
+  * Do manual case conversion for non-ascii chars
+  *
+  * Used as a preg_replace_callback() handler on single characters: a
+  * multi-byte character becomes a character class holding its lower and
+  * upper case forms, a single-byte character is returned untouched.
+  *
+  * @param $matches Array
+  * @return String
+  */
+ function caseCallback($matches){
+     global $wgContLang;
+
+     $char = $matches[0];
+     if ( strlen( $char ) <= 1 ) {
+         return $char;
+     }
+     return '[' . $wgContLang->lc( $char ) . $wgContLang->uc( $char ) . ']';
+ }
+
+ /**
+  * Extract part of the text from start to end, but by
+  * not chopping up words
+  *
+  * Both boundaries are moved with position(), except for a start of 0
+  * and an end at or beyond the end of the text. The out parameters are
+  * only written when the caller passed a non-null value.
+  *
+  * @param $text String
+  * @param $start Integer
+  * @param $end Integer
+  * @param $posStart Integer: (out) actual start position
+  * @param $posEnd Integer: (out) actual end position
+  * @return String: the extract, '' when the adjusted range is empty
+  */
+ function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
+     $textLength = strlen( $text );
+
+     if ( $start != 0 ) {
+         $start = $this->position( $text, $start, 1 );
+     }
+     $end = ( $end >= $textLength ) ? $textLength : $this->position( $text, $end );
+
+     if ( !is_null( $posStart ) ) {
+         $posStart = $start;
+     }
+     if ( !is_null( $posEnd ) ) {
+         $posEnd = $end;
+     }
+
+     return ( $end > $start ) ? substr( $text, $start, $end - $start ) : '';
+ }
+
+ /**
+  * Find a nonletter near a point (index) in the text
+  *
+  * A window of up to 10 bytes on either side of $point is scanned and
+  * the first nonletter found in it wins (not necessarily the one closest
+  * to $point). Without any nonletter in the window, $point itself is
+  * returned, moved forward past UTF-8 continuation bytes so that it
+  * never lands inside a multi-byte character.
+  *
+  * @param $text String
+  * @param $point Integer
+  * @param $offset Integer: offset to found index
+  * @return Integer: index of a nonletter near $point (plus $offset), or
+  *  beginning of utf8 char if none; never beyond the end of $text in
+  *  the latter case
+  */
+ function position($text, $point, $offset=0 ){
+     $tolerance = 10;
+     $textLength = strlen($text);
+     $s = max( 0, $point - $tolerance );
+     $l = min( $textLength, $point + $tolerance ) - $s;
+     $m = array();
+     if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
+         return $m[0][1] + $s + $offset;
+     }
+
+     // a point at or past the end has no byte to inspect: clamp it
+     // instead of reading an uninitialized string offset
+     if( $point >= $textLength )
+         return $textLength;
+
+     // check if point is on a valid first UTF8 char
+     $char = ord( $text[$point] );
+     while( $char >= 0x80 && $char < 0xc0 ) {
+         // skip trailing bytes
+         $point++;
+         if($point >= $textLength)
+             return $textLength;
+         $char = ord( $text[$point] );
+     }
+     return $point;
+ }
+
+ /**
+ * Search extracts for a pattern, and return snippets
+ *
+ * For each extract that matches and has no snippet yet, a window of
+ * $contextchars bytes is placed around the match, cut out with
+ * extract() and recorded in $out / $offsets under the extract's index.
+ * Stops as soon as $linesleft reaches 0.
+ *
+ * @param $pattern String: regexp for matching lines
+ * @param $extracts Array: extracts to search
+ * @param $linesleft Integer: number of extracts to make (decremented for each snippet made)
+ * @param $contextchars Integer: length of snippet
+ * @param $out Array: map for highlighted snippets
+ * @param $offsets Array: map of starting points of snippets
+ * @protected
+ */
+ function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
+ if($linesleft == 0)
+ return; // nothing to do
+ foreach($extracts as $index => $line){
+ if( array_key_exists($index,$out) )
+ continue; // this line already highlighted
+
+ $m = array();
+ if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
+ continue;
+
+ $offset = $m[0][1];
+ $len = strlen($m[0][0]);
+ if($offset + $len < $contextchars)
+ $begin = 0; // match ends within the first window: start at the line start
+ elseif( $len > $contextchars)
+ $begin = $offset; // match longer than the window: start at the match
+ else
+ $begin = $offset + intval( ($len - $contextchars) / 2 ); // centre the window on the match
+
+ $end = $begin + $contextchars;
+
+ $posBegin = $begin; // non-null on purpose: extract() only writes back to non-null out parameters
+ // basic snippet from this line
+ $out[$index] = $this->extract($line,$begin,$end,$posBegin);
+ $offsets[$index] = $posBegin;
+ $linesleft--;
+ if($linesleft == 0)
+ return;
+ }
+ }
+
+ /**
+ * Basic wikitext removal
+ *
+ * Applies a fixed sequence of regular expression replacements; the
+ * order matters, as later patterns see the output of earlier ones.
+ *
+ * @param $text String: wikitext
+ * @return String: text with the markup handled below removed
+ * @protected
+ */
+ function removeWiki($text) {
+ $fname = __METHOD__;
+ wfProfileIn( $fname );
+
+ //$text = preg_replace("/'{2,5}/", "", $text);
+ //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text);
+ //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text);
+ //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text);
+ //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text);
+ //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text);
+ $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text); // {{template}} without parameters: dropped
+ $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text); // {{template|...}}: keep what follows the first pipe
+ $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text); // [[target]]: keep the target
+ $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text); // [[target|caption]]: decided by linkReplace()
+ //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
+ $text = preg_replace("/<\/?[^>]+>/", "", $text); // any <tag> or </tag>
+ $text = preg_replace("/'''''/", "", $text); // bold+italic quotes
+ $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text); // bold quotes; the tag alternative finds nothing, tags are already gone
+ $text = preg_replace("/''/", "", $text); // italic quotes
+
+ wfProfileOut( $fname );
+ return $text;
+ }
+
+ /**
+  * callback to replace [[target|caption]] kind of links, if
+  * the target is category or image, leave it
+  *
+  * @param $matches Array: 0 => whole link, 1 => "target|", 2 => caption
+  * @return String: the whole link for file/category targets, else the caption
+  */
+ function linkReplace($matches){
+     global $wgContLang;
+
+     $colon = strpos( $matches[1], ':' );
+     if ( $colon === false ) {
+         return $matches[2]; // replace with caption
+     }
+
+     $index = $wgContLang->getNsIndex( substr( $matches[1], 0, $colon ) );
+     $keepLink = $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY );
+
+     // return the whole thing for file/category links, the caption otherwise
+     return $keepLink ? $matches[0] : $matches[2];
+ }
+
+ /**
+  * Simple & fast snippet extraction, but gives completely irrelevant
+  * snippets
+  *
+  * Goes through the text line by line and, for at most $contextlines
+  * matching lines, keeps up to $contextchars of context on either side
+  * of the last term occurrence on the line, HTML-escapes the result and
+  * wraps every term occurrence in a <span class='searchmatch'>.
+  *
+  * NOTE(review): the terms are interpolated into the patterns as-is, and
+  * the highlighting pattern runs on the already escaped line, so a term
+  * can also match inside an entity such as "&amp;"; confirm whether
+  * callers rely on this.
+  *
+  * @param $text String
+  * @param $terms Array
+  * @param $contextlines Integer
+  * @param $contextchars Integer
+  * @return String
+  */
+ public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
+     global $wgContLang;
+     $fname = __METHOD__;
+
+     $lines = explode( "\n", $text );
+
+     $terms = implode( '|', $terms );
+     $max = intval( $contextchars ) + 1;
+     $pat1 = "/(.*)($terms)(.{0,$max})/i";
+     // does not depend on the line: build it once
+     $pat2 = '/(' . $terms . ")/i";
+
+     $extract = "";
+     wfProfileIn( "$fname-extract" );
+     foreach ( $lines as $line ) {
+         if ( 0 == $contextlines ) {
+             break;
+         }
+         $m = array();
+         if ( ! preg_match( $pat1, $line, $m ) ) {
+             continue;
+         }
+         --$contextlines;
+         $pre = $wgContLang->truncate( $m[1], -$contextchars );
+
+         if ( count( $m ) < 3 ) {
+             $post = '';
+         } else {
+             $post = $wgContLang->truncate( $m[3], $contextchars );
+         }
+
+         $found = $m[2];
+
+         $line = htmlspecialchars( $pre . $found . $post );
+         $line = preg_replace( $pat2,
+             "<span class='searchmatch'>\\1</span>", $line );
+
+         $extract .= "{$line}\n";
+     }
+     wfProfileOut( "$fname-extract" );
+
+     return $extract;
+ }
+
+}
+
+/**
+ * Dummy class to be used when non-supported Database engine is present.
+ * It adds nothing to SearchEngine, so both search methods return null
+ * and index updates are no-ops.
+ * @todo Fixme: dummy class should probably try something at least mildly useful,
+ * such as a LIKE search through titles.
+ * @ingroup Search
+ */
+class SearchEngineDummy extends SearchEngine {
+ // no-op
+}
diff --git a/includes/search/SearchIBM_DB2.php b/includes/search/SearchIBM_DB2.php
new file mode 100644
index 00000000..d7587186
--- /dev/null
+++ b/includes/search/SearchIBM_DB2.php
@@ -0,0 +1,224 @@
+<?php
+# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
+# http://www.mediawiki.org/
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# http://www.gnu.org/copyleft/gpl.html
+
+/**
+ * @file
+ * @ingroup Search
+ */
+
+/**
+ * Search engine hook base class for IBM DB2
+ * @ingroup Search
+ */
+class SearchIBM_DB2 extends SearchEngine {
+ /**
+  * Remember the database connection the query methods of this class use.
+  *
+  * @param $db database connection object (stored as $this->db)
+  */
+ function __construct($db) {
+ $this->db = $db;
+ }
+
+ /**
+  * Run a full text search for the given term and wrap the rows in a
+  * result set.
+  *
+  * @param $term String: raw search term
+  * @return SqlSearchResultSet
+  */
+ function searchText( $term ) {
+     $filtered = $this->filter( $term );
+     $sql = $this->getQuery( $filtered, true );
+     $res = $this->db->query( $sql );
+     $resultSet = $this->db->resultObject( $res );
+     return new SqlSearchResultSet( $resultSet, $this->searchTerms );
+ }
+
+ /**
+  * Run a title-only search for the given term and wrap the rows in a
+  * result set.
+  *
+  * @param $term String: raw search term
+  * @return SqlSearchResultSet
+  */
+ function searchTitle($term) {
+     $filtered = $this->filter( $term );
+     $sql = $this->getQuery( $filtered, false );
+     $res = $this->db->query( $sql );
+     $resultSet = $this->db->resultObject( $res );
+     return new SqlSearchResultSet( $resultSet, $this->searchTerms );
+ }
+
+
+ /**
+  * Build the WHERE fragment that hides redirects.
+  * @return String: empty when redirects are to be shown
+  */
+ function queryRedirect() {
+     return $this->showRedirects ? '' : 'AND page_is_redirect=0';
+ }
+
+ /**
+  * Return a partial WHERE clause to limit the search to the given namespaces.
+  *
+  * A null namespace list means "search everything" and yields no clause;
+  * an empty list falls back to namespace 0.
+  *
+  * @return String
+  */
+ function queryNamespaces() {
+     if ( is_null( $this->namespaces ) ) {
+         return '';
+     }
+     // Namespace indexes are integers; cast each one so nothing else can be
+     // spliced into the SQL text. Values are left unquoted on purpose, as in
+     // the original query.
+     $namespaces = implode( ',', array_map( 'intval', $this->namespaces ) );
+     if ( $namespaces == '' ) {
+         $namespaces = '0';
+     }
+     return 'AND page_namespace IN (' . $namespaces . ')';
+ }
+
+ /**
+ * Apply this engine's limit and offset to a complete SQL statement by
+ * passing it through the database's limitResult().
+ * @param $sql String: the query to restrict
+ * @return String: whatever limitResult() returns for that query
+ */
+ function queryLimit($sql) {
+ return $this->db->limitResult($sql, $this->limit, $this->offset);
+ }
+
+ /**
+ * Ranking clause for the query; currently always empty for this backend.
+ * Both parameters are ignored.
+ * @param $filteredTerm String
+ * @param $fulltext Boolean
+ * @return String
+ */
+ function queryRanking($filteredTerm, $fulltext) {
+ // requires Net Search Extender or equivalent
+ // return ' ORDER BY score(1)';
+ return '';
+ }
+
+ /**
+  * Construct the full SQL query to do the search.
+  * The guts should be constructed in queryMain().
+  * @param string $filteredTerm String
+  * @param bool $fulltext Boolean
+  * @return String: the assembled query after queryLimit() has been applied
+  */
+ function getQuery( $filteredTerm, $fulltext ) {
+     $parts = array(
+         $this->queryMain( $filteredTerm, $fulltext ),
+         $this->queryRedirect(),
+         $this->queryNamespaces(),
+         $this->queryRanking( $filteredTerm, $fulltext ),
+     );
+     return $this->queryLimit( implode( ' ', $parts ) . ' ' );
+ }
+
+
+ /**
+  * Name of the searchindex column to match against.
+  * @param $fulltext Boolean: true for page text, false for titles
+  * @return String
+  */
+ function getIndexField($fulltext) {
+     if ( $fulltext ) {
+         return 'si_text';
+     }
+     return 'si_title';
+ }
+
+ /**
+  * Get the base part of the search query: the SELECT over page joined with
+  * searchindex, ending in the match condition from parseQuery().
+  *
+  * @param string $filteredTerm String
+  * @param bool $fulltext Boolean
+  * @return String
+  */
+ function queryMain( $filteredTerm, $fulltext ) {
+     $condition = $this->parseQuery($filteredTerm, $fulltext);
+     $pageTable = $this->db->tableName('page');
+     $indexTable = $this->db->tableName('searchindex');
+     $sql = 'SELECT page_id, page_namespace, page_title ';
+     $sql .= "FROM $pageTable,$indexTable ";
+     $sql .= 'WHERE page_id=si_page AND ' . $condition;
+     return $sql;
+ }
+
+ /**
+ * Turn the filtered search text into a WHERE condition and, as a side
+ * effect, rebuild $this->searchTerms with one highlighting regex per term.
+ *
+ * Terms are runs of legal search characters (optionally prefixed by one of
+ * - + < > ~ and optionally followed by *) or double-quoted phrases. All
+ * normalized terms, including their prefix character, are joined with
+ * commas and matched as a single case-insensitive LIKE substring.
+ *
+ * @param $filteredText String: search text, already passed through filter()
+ * @param $fulltext Boolean: true to match si_text, false for si_title
+ * @return String: SQL condition
+ */
+ function parseQuery($filteredText, $fulltext) {
+ global $wgContLang;
+ $lc = SearchEngine::legalSearchChars();
+ $this->searchTerms = array();
+
+ # FIXME: This doesn't handle parenthetical expressions.
+ $m = array();
+ $q = array();
+
+ // Per match: [1] modifier, [2] whole term, [3] unquoted word, [4] '*'
+ if (preg_match_all('/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
+ $filteredText, $m, PREG_SET_ORDER)) {
+ foreach($m as $terms) {
+
+ // Search terms in all variant forms, only
+ // apply on wiki with LanguageConverter
+ $temp_terms = $wgContLang->autoConvertToAllVariants( $terms[2] );
+ if( is_array( $temp_terms )) {
+ $temp_terms = array_unique( array_values( $temp_terms ));
+ foreach( $temp_terms as $t )
+ $q[] = $terms[1] . $wgContLang->normalizeForSearch( $t );
+ }
+ else
+ $q[] = $terms[1] . $wgContLang->normalizeForSearch( $terms[2] );
+
+ // Build the highlighting regex: unquoted words get a word-character
+ // tail when '*' was given, quoted phrases are used without their quotes
+ if (!empty($terms[3])) {
+ $regexp = preg_quote( $terms[3], '/' );
+ if ($terms[4])
+ $regexp .= "[0-9A-Za-z_]+";
+ } else {
+ $regexp = preg_quote(str_replace('"', '', $terms[2]), '/');
+ }
+ $this->searchTerms[] = $regexp;
+ }
+ }
+
+ // NOTE(review): LIKE wildcards (% and _) in the terms are presumably not
+ // escaped by strencode() - confirm against the DB2 database class.
+ $searchon = $this->db->strencode(join(',', $q));
+ $field = $this->getIndexField($fulltext);
+
+ // requires Net Search Extender or equivalent
+ //return " CONTAINS($field, '$searchon') > 0 ";
+
+ return " lcase($field) LIKE lcase('%$searchon%')";
+ }
+
+ /**
+  * Create or update the search index record for the given page.
+  * Title and text should be pre-processed.
+  *
+  * @param $id Integer
+  * @param $title String
+  * @param $text String
+  */
+ function update($id, $title, $text) {
+     $row = array(
+         'si_page' => $id,
+         'si_title' => $title,
+         'si_text' => $text,
+     );
+     $dbw = wfGetDB(DB_MASTER);
+     // Rows are replaced on their si_page key. The ctx_ddl.sync_index calls
+     // that used to sit here as comments were never enabled for this backend.
+     $dbw->replace('searchindex', array('si_page'), $row, 'SearchIBM_DB2::update');
+ }
+
+ /**
+  * Update a search index record's title only.
+  * Title should be pre-processed.
+  *
+  * @param $id Integer
+  * @param $title String
+  */
+ function updateTitle($id, $title) {
+     $newValues = array('si_title' => $title);
+     $where = array('si_page' => $id);
+
+     $dbw = wfGetDB(DB_MASTER);
+     $dbw->update('searchindex', $newValues, $where, 'SearchIBM_DB2::updateTitle', array());
+ }
+}
diff --git a/includes/search/SearchMySQL.php b/includes/search/SearchMySQL.php
new file mode 100644
index 00000000..0c238be8
--- /dev/null
+++ b/includes/search/SearchMySQL.php
@@ -0,0 +1,412 @@
+<?php
+# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
+# http://www.mediawiki.org/
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# http://www.gnu.org/copyleft/gpl.html
+
+/**
+ * @file
+ * @ingroup Search
+ */
+
+/**
+ * Search engine hook for MySQL 4+
+ * @ingroup Search
+ */
+class SearchMySQL extends SearchEngine {
+ var $strictMatching = true;
+ static $mMinSearchLength;
+
+ /**
+  * Remember the database connection used to build and run search queries.
+  *
+  * @param $db database connection object (stored as $this->db)
+  */
+ function __construct( $db ) {
+ $this->db = $db;
+ }
+
+ /**
+ * Parse the user's query and transform it into an SQL fragment which will
+ * become part of a WHERE clause
+ *
+ * As a side effect $this->searchTerms is rebuilt with one highlighting
+ * regex (see regexTerm()) per parsed term.
+ *
+ * @param $filteredText String: search text, already passed through filter()
+ * @param $fulltext Boolean: true to match si_text, false for si_title
+ * @return String: a MATCH(...) AGAINST(... IN BOOLEAN MODE) condition
+ */
+ function parseQuery( $filteredText, $fulltext ) {
+ global $wgContLang;
+ $lc = SearchEngine::legalSearchChars(); // Minus format chars
+ $searchon = '';
+ $this->searchTerms = array();
+
+ # FIXME: This doesn't handle parenthetical expressions.
+ $m = array();
+ if( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
+ $filteredText, $m, PREG_SET_ORDER ) ) {
+ foreach( $m as $bits ) {
+ // Quoted phrases produce no groups 3 and 4, hence the @ to silence
+ // the notices for the missing offsets
+ @list( /* all */, $modifier, $term, $nonQuoted, $wildcard ) = $bits;
+
+ if( $nonQuoted != '' ) {
+ $term = $nonQuoted;
+ $quote = '';
+ } else {
+ $term = str_replace( '"', '', $term );
+ $quote = '"';
+ }
+
+ if( $searchon !== '' ) $searchon .= ' ';
+ if( $this->strictMatching && ($modifier == '') ) {
+ // If we leave this out, boolean op defaults to OR which is rarely helpful.
+ $modifier = '+';
+ }
+
+ // Some languages such as Serbian store the input form in the search index,
+ // so we may need to search for matches in multiple writing system variants.
+ $convertedVariants = $wgContLang->autoConvertToAllVariants( $term );
+ if( is_array( $convertedVariants ) ) {
+ $variants = array_unique( array_values( $convertedVariants ) );
+ } else {
+ $variants = array( $term );
+ }
+
+ // The low-level search index does some processing on input to work
+ // around problems with minimum lengths and encoding in MySQL's
+ // fulltext engine.
+ // For Chinese this also inserts spaces between adjacent Han characters.
+ $strippedVariants = array_map(
+ array( $wgContLang, 'normalizeForSearch' ),
+ $variants );
+
+ // Some languages such as Chinese force all variants to a canonical
+ // form when stripping to the low-level search index, so to be sure
+ // let's check our variants list for unique items after stripping.
+ $strippedVariants = array_unique( $strippedVariants );
+
+ // Several variants of one term are grouped in parentheses behind a
+ // single modifier
+ $searchon .= $modifier;
+ if( count( $strippedVariants) > 1 )
+ $searchon .= '(';
+ foreach( $strippedVariants as $stripped ) {
+ $stripped = $this->normalizeText( $stripped );
+ if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
+ // Hack for Chinese: we need to toss in quotes for
+ // multiple-character phrases since normalizeForSearch()
+ // added spaces between them to make word breaks.
+ $stripped = '"' . trim( $stripped ) . '"';
+ }
+ $searchon .= "$quote$stripped$quote$wildcard ";
+ }
+ if( count( $strippedVariants) > 1 )
+ $searchon .= ')';
+
+ // Match individual terms or quoted phrase in result highlighting...
+ // Note that variants will be introduced in a later stage for highlighting!
+ $regexp = $this->regexTerm( $term, $wildcard );
+ $this->searchTerms[] = $regexp;
+ }
+ wfDebug( __METHOD__ . ": Would search with '$searchon'\n" );
+ wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/\n" );
+ } else {
+ wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" );
+ }
+
+ // When nothing could be parsed $searchon is still '' and the condition
+ // below is built with an empty search string
+ $searchon = $this->db->strencode( $searchon );
+ $field = $this->getIndexField( $fulltext );
+ return " MATCH($field) AGAINST('$searchon' IN BOOLEAN MODE) ";
+ }
+
+ /**
+  * Build the highlighting regex fragment for one search term.
+  *
+  * @param $string String: the literal term
+  * @param $wildcard String: non-empty when the term ended in a wildcard
+  * @return String: regex fragment, escaped for use with '/' delimiters
+  */
+ function regexTerm( $string, $wildcard ) {
+     global $wgContLang;
+
+     $quoted = preg_quote( $string, '/' );
+     if ( !$wgContLang->hasWordBreaks() ) {
+         // For Chinese, words may legitimately abut other words in the text
+         // literal, so no \b boundary checks are added... note this could
+         // cause false positives for latin chars.
+         return $quoted;
+     }
+     // With a wildcard the trailing boundary is left off so the rest of
+     // the word is not cut off.
+     return $wildcard ? "\b$quoted" : "\b$quoted\b";
+ }
+
+ /**
+  * The parent's legal search characters, preceded by the double quote and
+  * the asterisk.
+  * @return String
+  */
+ public static function legalSearchChars() {
+ return "\"*" . parent::legalSearchChars();
+ }
+
+ /**
+ * Perform a full text search query and return a result set.
+ * Delegates to searchInternal() with $fulltext = true.
+ *
+ * @param $term String: raw search term
+ * @return MySQLSearchResultSet
+ */
+ function searchText( $term ) {
+ return $this->searchInternal( $term, true );
+ }
+
+ /**
+ * Perform a title-only search query and return a result set.
+ * Delegates to searchInternal() with $fulltext = false.
+ *
+ * @param $term String: raw search term
+ * @return MySQLSearchResultSet
+ */
+ function searchTitle( $term ) {
+ return $this->searchInternal( $term, false );
+ }
+
+ /**
+  * Shared implementation of searchText() and searchTitle().
+  *
+  * Runs the search query and, when $wgCountTotalSearchHits is set, a
+  * second COUNT(*) query for the total number of hits.
+  *
+  * @param $term String: raw search term
+  * @param $fulltext Boolean: true for page text, false for titles
+  * @return MySQLSearchResultSet
+  */
+ protected function searchInternal( $term, $fulltext ) {
+     global $wgCountTotalSearchHits;
+
+     $filtered = $this->filter( $term );
+     $sql = $this->getQuery( $filtered, $fulltext );
+     $resultSet = $this->db->query( $sql );
+
+     $total = null;
+     if ( $wgCountTotalSearchHits ) {
+         $countSql = $this->getCountQuery( $filtered, $fulltext );
+         $countResult = $this->db->query( $countSql );
+         $countRow = $countResult->fetchObject();
+         $total = $countRow ? intval( $countRow->c ) : null;
+         $countResult->free();
+     }
+
+     return new MySQLSearchResultSet( $resultSet, $this->searchTerms, $total );
+ }
+
+
+ /**
+  * Build the WHERE fragment that hides redirects.
+  * @return String: empty when redirects are to be shown
+  */
+ function queryRedirect() {
+     return $this->showRedirects ? '' : 'AND page_is_redirect=0';
+ }
+
+ /**
+  * Build the WHERE fragment restricting the search to $this->namespaces.
+  * A null list means "search all"; an empty list falls back to namespace 0.
+  * @return String
+  */
+ function queryNamespaces() {
+     if ( $this->namespaces === null ) {
+         return ''; # search all
+     }
+     $list = count( $this->namespaces )
+         ? $this->db->makeList( $this->namespaces )
+         : '0';
+     return 'AND page_namespace IN (' . $list . ')';
+ }
+
+ /**
+ * Return a LIMIT clause to limit results on the query.
+ * Obtained by calling the database's limitResult() on an empty statement
+ * with this engine's limit and offset.
+ * @return String
+ */
+ function queryLimit() {
+ return $this->db->limitResult( '', $this->limit, $this->offset );
+ }
+
+ /**
+ * Does not do anything for generic search engine
+ * subclasses may define this though
+ * @param $filteredTerm String: ignored here
+ * @param $fulltext Boolean: ignored here
+ * @return String: always empty
+ */
+ function queryRanking( $filteredTerm, $fulltext ) {
+ return '';
+ }
+
+ /**
+  * Construct the full SQL query to do the search.
+  * The guts should be constructed in queryMain().
+  * @param $filteredTerm String
+  * @param $fulltext Boolean
+  * @return String
+  */
+ function getQuery( $filteredTerm, $fulltext ) {
+     $parts = array(
+         $this->queryMain( $filteredTerm, $fulltext ),
+         $this->queryRedirect(),
+         $this->queryNamespaces(),
+         $this->queryRanking( $filteredTerm, $fulltext ),
+         $this->queryLimit(),
+     );
+     return implode( ' ', $parts );
+ }
+
+ /**
+  * Name of the searchindex column to match against.
+  * @param $fulltext Boolean: true for page text, false for titles
+  * @return String
+  */
+ function getIndexField( $fulltext ) {
+     if ( $fulltext ) {
+         return 'si_text';
+     }
+     return 'si_title';
+ }
+
+ /**
+  * Get the base part of the search query: the SELECT over page joined with
+  * searchindex, ending in the match condition from parseQuery().
+  *
+  * @param $filteredTerm String
+  * @param $fulltext Boolean
+  * @return String
+  */
+ function queryMain( $filteredTerm, $fulltext ) {
+     $condition = $this->parseQuery( $filteredTerm, $fulltext );
+     $pageTable = $this->db->tableName( 'page' );
+     $indexTable = $this->db->tableName( 'searchindex' );
+     $sql = 'SELECT page_id, page_namespace, page_title ';
+     $sql .= "FROM $pageTable,$indexTable ";
+     $sql .= 'WHERE page_id=si_page AND ' . $condition;
+     return $sql;
+ }
+
+ /**
+  * Build the query that counts all matches, ignoring limit and offset.
+  * The count is selected under the alias "c".
+  *
+  * @param $filteredTerm String
+  * @param $fulltext Boolean
+  * @return String
+  */
+ function getCountQuery( $filteredTerm, $fulltext ) {
+     $condition = $this->parseQuery( $filteredTerm, $fulltext );
+     $pageTable = $this->db->tableName( 'page' );
+     $indexTable = $this->db->tableName( 'searchindex' );
+     $sql = "SELECT COUNT(*) AS c ";
+     $sql .= "FROM $pageTable,$indexTable ";
+     // No separator after the match condition: parseQuery()'s fragment
+     // already ends in a space.
+     $sql .= 'WHERE page_id=si_page AND ' . $condition;
+     $sql .= $this->queryRedirect() . ' ';
+     $sql .= $this->queryNamespaces();
+     return $sql;
+ }
+
+ /**
+  * Create or update the search index record for the given page.
+  * Title and text should be pre-processed; both are additionally run
+  * through normalizeText() before being stored.
+  *
+  * @param $id Integer
+  * @param $title String
+  * @param $text String
+  */
+ function update( $id, $title, $text ) {
+     $dbw = wfGetDB( DB_MASTER );
+     $row = array(
+         'si_page' => $id,
+         'si_title' => $this->normalizeText( $title ),
+         'si_text' => $this->normalizeText( $text ),
+     );
+     $dbw->replace( 'searchindex', array( 'si_page' ), $row, __METHOD__ );
+ }
+
+ /**
+  * Update a search index record's title only.
+  * Title should be pre-processed; it is additionally run through
+  * normalizeText() before being stored.
+  *
+  * @param $id Integer
+  * @param $title String
+  */
+ function updateTitle( $id, $title ) {
+     $dbw = wfGetDB( DB_MASTER );
+
+     $newValues = array( 'si_title' => $this->normalizeText( $title ) );
+     $where = array( 'si_page' => $id );
+     $options = array( $dbw->lowPriorityOption() );
+
+     $dbw->update( 'searchindex', $newValues, $where, __METHOD__, $options );
+ }
+
+ /**
+ * Converts some characters for MySQL's indexing to grok it correctly,
+ * and pads short words to overcome limitations.
+ *
+ * The steps run in a fixed order: word segmentation, lower-casing plus
+ * hex-armoring of multi-byte sequences, padding of short words, and
+ * finally replacing periods that sit between word characters.
+ *
+ * @param $string String: text to normalize
+ * @return String
+ */
+ function normalizeText( $string ) {
+ global $wgContLang;
+
+ wfProfileIn( __METHOD__ );
+
+ // Some languages such as Chinese require word segmentation
+ $out = $wgContLang->wordSegmentation( $string );
+
+ // MySQL fulltext index doesn't grok utf-8, so we
+ // need to fold cases and convert to hex
+ // (each lead byte plus its continuation bytes becomes "u8" + hex,
+ // see stripForSearchCallback())
+ $out = preg_replace_callback(
+ "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+ array( $this, 'stripForSearchCallback' ),
+ $wgContLang->lc( $out ) );
+
+ // And to add insult to injury, the default indexing
+ // ignores short words... Pad them so we can pass them
+ // through without reconfiguring the server...
+ $minLength = $this->minSearchLength();
+ if( $minLength > 1 ) {
+ $n = $minLength - 1;
+ // Words of up to $n characters get the literal suffix "u800"
+ $out = preg_replace(
+ "/\b(\w{1,$n})\b/",
+ "$1u800",
+ $out );
+ }
+
+ // Periods within things like hostnames and IP addresses
+ // are also important -- we want a search for "example.com"
+ // or "192.168.1.1" to work sanely.
+ //
+ // MySQL's search seems to ignore them, so you'd match on
+ // "example.wikipedia.com" and "192.168.83.1" as well.
+ // (the period is replaced by the literal "u82e")
+ $out = preg_replace(
+ "/(\w)\.(\w|\*)/u",
+ "$1u82e$2",
+ $out );
+
+ wfProfileOut( __METHOD__ );
+
+ return $out;
+ }
+
+ /**
+  * Armor a case-folded UTF-8 string to get through MySQL's
+  * fulltext search without being mucked up by funny charset
+  * settings or anything else of the sort.
+  *
+  * @param $matches Array: regex match; element 1 holds the bytes to armor
+  * @return String: "u8" followed by the hex form of those bytes
+  */
+ protected function stripForSearchCallback( $matches ) {
+     $hex = bin2hex( $matches[1] );
+     return 'u8' . $hex;
+ }
+
+ /**
+  * Check MySQL server's ft_min_word_len setting so we know
+  * if we need to pad short words...
+  * The value is looked up once and cached in a static member.
+  *
+  * @return int: the setting, or 0 when it could not be read
+  */
+ protected function minSearchLength() {
+     if ( !is_null( self::$mMinSearchLength ) ) {
+         return self::$mMinSearchLength;
+     }
+
+     $sql = "SHOW GLOBAL VARIABLES LIKE 'ft\\_min\\_word\\_len'";
+     $dbr = wfGetDB( DB_SLAVE );
+     $result = $dbr->query( $sql );
+     $row = $result->fetchObject();
+     $result->free();
+
+     $found = $row && $row->Variable_name == 'ft_min_word_len';
+     self::$mMinSearchLength = $found ? intval( $row->Value ) : 0;
+
+     return self::$mMinSearchLength;
+ }
+}
+
+/**
+ * Result set returned by SearchMySQL; carries the optional total hit count
+ * next to the rows handled by SqlSearchResultSet.
+ * @ingroup Search
+ */
+class MySQLSearchResultSet extends SqlSearchResultSet {
+    /** Total number of matches, or null when it was not counted */
+    var $mTotalHits = null;
+
+    /**
+     * Named __construct() because PHP 8 no longer treats a method named
+     * after its class as the constructor; with the old name the total hit
+     * count would silently never be stored.
+     *
+     * @param $resultSet database result to wrap
+     * @param $terms Array: highlighting regexes for the search terms
+     * @param $totalHits Integer|null: total match count, if known
+     */
+    function __construct( $resultSet, $terms, $totalHits = null ) {
+        parent::__construct( $resultSet, $terms );
+        $this->mTotalHits = $totalHits;
+    }
+
+    /**
+     * @return Integer|null: total match count passed to the constructor
+     */
+    function getTotalHits() {
+        return $this->mTotalHits;
+    }
+} \ No newline at end of file
diff --git a/includes/search/SearchMySQL4.php b/includes/search/SearchMySQL4.php
new file mode 100644
index 00000000..3e2bb2d1
--- /dev/null
+++ b/includes/search/SearchMySQL4.php
@@ -0,0 +1,34 @@
+<?php
+# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
+# http://www.mediawiki.org/
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# http://www.gnu.org/copyleft/gpl.html
+
+/**
+ * @file
+ * @ingroup Search
+ */
+
+/**
+ * Search engine hook for MySQL 4+
+ * This class retained for backwards compatibility...
+ * The meat's been moved to SearchMySQL, since the 3.x variety is gone.
+ * It adds nothing of its own; use SearchMySQL instead.
+ * @ingroup Search
+ * @deprecated
+ */
+class SearchMySQL4 extends SearchMySQL {
+ /* whee */
+}
diff --git a/includes/search/SearchOracle.php b/includes/search/SearchOracle.php
new file mode 100644
index 00000000..e4c5deee
--- /dev/null
+++ b/includes/search/SearchOracle.php
@@ -0,0 +1,268 @@
+<?php
+# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
+# http://www.mediawiki.org/
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# http://www.gnu.org/copyleft/gpl.html
+
+/**
+ * @file
+ * @ingroup Search
+ */
+
+/**
+ * Search engine hook base class for Oracle (ConText).
+ * @ingroup Search
+ */
+class SearchOracle extends SearchEngine {
+
+ private $reservedWords = array ('ABOUT' => 1,
+ 'ACCUM' => 1,
+ 'AND' => 1,
+ 'BT' => 1,
+ 'BTG' => 1,
+ 'BTI' => 1,
+ 'BTP' => 1,
+ 'FUZZY' => 1,
+ 'HASPATH' => 1,
+ 'INPATH' => 1,
+ 'MINUS' => 1,
+ 'NEAR' => 1,
+ 'NOT' => 1,
+ 'NT' => 1,
+ 'NTG' => 1,
+ 'NTI' => 1,
+ 'NTP' => 1,
+ 'OR' => 1,
+ 'PT' => 1,
+ 'RT' => 1,
+ 'SQE' => 1,
+ 'SYN' => 1,
+ 'TR' => 1,
+ 'TRSYN' => 1,
+ 'TT' => 1,
+ 'WITHIN' => 1);
+
+ /**
+  * Remember the database connection the query methods of this class use.
+  *
+  * @param $db database connection object (stored as $this->db)
+  */
+ function __construct($db) {
+ $this->db = $db;
+ }
+
+ /**
+  * Run a full text search for the given term and wrap the rows in a
+  * result set. An empty term skips the query and yields a result set
+  * built around false.
+  *
+  * @param $term String: raw search term
+  * @return SqlSearchResultSet
+  */
+ function searchText( $term ) {
+     if ( $term == '' ) {
+         return new SqlSearchResultSet(false, '');
+     }
+
+     $filtered = $this->filter( $term );
+     $sql = $this->getQuery( $filtered, true );
+     $res = $this->db->query( $sql );
+     $resultSet = $this->db->resultObject( $res );
+     return new SqlSearchResultSet( $resultSet, $this->searchTerms );
+ }
+
+ /**
+  * Perform a title-only search query and return a result set.
+  * An empty term skips the query and yields a result set built around
+  * false.
+  *
+  * @param $term String: raw search term
+  * @return SqlSearchResultSet
+  */
+ function searchTitle($term) {
+     if ($term == '') {
+         return new SqlSearchResultSet(false, '');
+     }
+
+     $resultSet = $this->db->resultObject($this->db->query($this->getQuery($this->filter($term), false)));
+     // Was MySQLSearchResultSet, a copy-paste slip: this backend documents
+     // and (in searchText) returns the generic SQL result set.
+     return new SqlSearchResultSet($resultSet, $this->searchTerms);
+ }
+
+
+ /**
+  * Build the WHERE fragment that hides redirects.
+  * @return String: empty when redirects are to be shown
+  */
+ function queryRedirect() {
+     return $this->showRedirects ? '' : 'AND page_is_redirect=0';
+ }
+
+ /**
+  * Build the WHERE fragment restricting the search to $this->namespaces.
+  * A null list yields no clause; an empty list falls back to namespace 0.
+  * @return String
+  */
+ function queryNamespaces() {
+     if ( $this->namespaces === null ) {
+         return '';
+     }
+     $list = count( $this->namespaces )
+         ? $this->db->makeList( $this->namespaces )
+         : '0';
+     return 'AND page_namespace IN (' . $list . ')';
+ }
+
+ /**
+ * Apply this engine's limit and offset to a complete SQL statement by
+ * passing it through the database's limitResult().
+ * @param $sql String: the query to restrict
+ * @return String: whatever limitResult() returns for that query
+ */
+ function queryLimit($sql) {
+ return $this->db->limitResult($sql, $this->limit, $this->offset);
+ }
+
+ /**
+ * Ranking clause for the query: orders by score(1), which pairs with the
+ * label 1 given to CONTAINS() in parseQuery(). Both parameters are ignored.
+ * NOTE(review): no DESC is given, so the order is ascending - confirm that
+ * is intended.
+ * @param $filteredTerm String
+ * @param $fulltext Boolean
+ * @return String
+ */
+ function queryRanking($filteredTerm, $fulltext) {
+ return ' ORDER BY score(1)';
+ }
+
+ /**
+  * Construct the full SQL query to do the search.
+  * The guts should be constructed in queryMain().
+  * @param $filteredTerm String
+  * @param $fulltext Boolean
+  * @return String: the assembled query after queryLimit() has been applied
+  */
+ function getQuery( $filteredTerm, $fulltext ) {
+     $parts = array(
+         $this->queryMain( $filteredTerm, $fulltext ),
+         $this->queryRedirect(),
+         $this->queryNamespaces(),
+         $this->queryRanking( $filteredTerm, $fulltext ),
+     );
+     return $this->queryLimit( implode( ' ', $parts ) . ' ' );
+ }
+
+
+ /**
+  * Name of the searchindex column to match against.
+  * @param $fulltext Boolean: true for page text, false for titles
+  * @return String
+  */
+ function getIndexField($fulltext) {
+     if ( $fulltext ) {
+         return 'si_text';
+     }
+     return 'si_title';
+ }
+
+ /**
+  * Get the base part of the search query: the SELECT over page joined with
+  * searchindex, ending in the match condition from parseQuery().
+  *
+  * @param $filteredTerm String
+  * @param $fulltext Boolean
+  * @return String
+  */
+ function queryMain( $filteredTerm, $fulltext ) {
+     $condition = $this->parseQuery($filteredTerm, $fulltext);
+     $pageTable = $this->db->tableName('page');
+     $indexTable = $this->db->tableName('searchindex');
+     $sql = 'SELECT page_id, page_namespace, page_title ';
+     $sql .= "FROM $pageTable,$indexTable ";
+     $sql .= 'WHERE page_id=si_page AND ' . $condition;
+     return $sql;
+ }
+
+ /**
+ * Parse a user input search string, and return an SQL fragment to be used
+ * as part of a WHERE clause
+ *
+ * As a side effect $this->searchTerms is rebuilt with one highlighting
+ * regex per parsed term. Terms prefixed with '-' are joined with ' ~',
+ * all other terms with ' & '.
+ *
+ * @param $filteredText String: search text, already passed through filter()
+ * @param $fulltext Boolean: true to match si_text, false for si_title
+ * @return String: a CONTAINS(...) > 0 condition
+ */
+ function parseQuery($filteredText, $fulltext) {
+ global $wgContLang;
+ $lc = SearchEngine::legalSearchChars();
+ $this->searchTerms = array();
+
+ # FIXME: This doesn't handle parenthetical expressions.
+ $m = array();
+ $searchon = '';
+ // Per match: [1] modifier, [2] whole term, [3] unquoted word, [4] '*'
+ if (preg_match_all('/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
+ $filteredText, $m, PREG_SET_ORDER)) {
+ foreach($m as $terms) {
+ // Search terms in all variant forms, only
+ // apply on wiki with LanguageConverter
+ $temp_terms = $wgContLang->autoConvertToAllVariants( $terms[2] );
+ if( is_array( $temp_terms )) {
+ $temp_terms = array_unique( array_values( $temp_terms ));
+ foreach( $temp_terms as $t ) {
+ $searchon .= ($terms[1] == '-' ? ' ~' : ' & ') . $this->escapeTerm( $t );
+ }
+ }
+ else {
+ $searchon .= ($terms[1] == '-' ? ' ~' : ' & ') . $this->escapeTerm( $terms[2] );
+ }
+ // Build the highlighting regex: unquoted words get a word-character
+ // tail when '*' was given, quoted phrases are used without their quotes
+ if (!empty($terms[3])) {
+ $regexp = preg_quote( $terms[3], '/' );
+ if ($terms[4])
+ $regexp .= "[0-9A-Za-z_]+";
+ } else {
+ $regexp = preg_quote(str_replace('"', '', $terms[2]), '/');
+ }
+ $this->searchTerms[] = $regexp;
+ }
+ }
+
+
+ // Drop the leading ' & ' left by the first term, then quote for SQL
+ $searchon = $this->db->addQuotes(ltrim($searchon, ' &'));
+ $field = $this->getIndexField($fulltext);
+ return " CONTAINS($field, $searchon, 1) > 0 ";
+ }
+
+ /**
+  * Prepare one search term for use inside the CONTAINS() query string.
+  *
+  * In order: normalize for search, wrap reserved words in braces, turn a
+  * fully double-quoted term into a parenthesised one, and backslash-escape
+  * the characters - & |.
+  *
+  * @param $t String: raw term
+  * @return String
+  */
+ private function escapeTerm($t) {
+     global $wgContLang;
+     $term = $wgContLang->normalizeForSearch($t);
+     if ( isset($this->reservedWords[strtoupper($term)]) ) {
+         $term = '{' . $term . '}';
+     }
+     $term = preg_replace('/^"(.*)"$/', '($1)', $term);
+     return preg_replace('/([-&|])/', '\\\\$1', $term);
+ }
+ /**
+  * Create or update the search index record for the given page, then
+  * sync both text indexes.
+  * Title and text should be pre-processed.
+  *
+  * @param $id Integer
+  * @param $title String
+  * @param $text String
+  */
+ function update($id, $title, $text) {
+     $row = array(
+         'si_page' => $id,
+         'si_title' => $title,
+         'si_text' => $text,
+     );
+     $dbw = wfGetDB(DB_MASTER);
+     $dbw->replace('searchindex', array('si_page'), $row, 'SearchOracle::update');
+     foreach ( array( 'si_text_idx', 'si_title_idx' ) as $index ) {
+         $dbw->query("CALL ctx_ddl.sync_index('$index')");
+     }
+ }
+
+ /**
+  * Update a search index record's title only.
+  * Title should be pre-processed.
+  *
+  * @param int $id
+  * @param string $title
+  */
+ function updateTitle($id, $title) {
+     $newValues = array('si_title' => $title);
+     $where = array('si_page' => $id);
+
+     $dbw = wfGetDB(DB_MASTER);
+     $dbw->update('searchindex', $newValues, $where, 'SearchOracle::updateTitle', array());
+ }
+
+
+ /**
+  * The parent's legal search characters, preceded by the double quote.
+  * @return String
+  */
+ public static function legalSearchChars() {
+ return "\"" . parent::legalSearchChars();
+ }
+}
diff --git a/includes/search/SearchPostgres.php b/includes/search/SearchPostgres.php
new file mode 100644
index 00000000..0006fa82
--- /dev/null
+++ b/includes/search/SearchPostgres.php
@@ -0,0 +1,246 @@
+<?php
+# Copyright (C) 2006-2007 Greg Sabino Mullane <greg@turnstep.com>
+# http://www.mediawiki.org/
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# http://www.gnu.org/copyleft/gpl.html
+
+/**
+ * @file
+ * @ingroup Search
+ */
+
+/**
+ * Search engine hook base class for Postgres
+ * @ingroup Search
+ */
+class SearchPostgres extends SearchEngine {
+
+ /**
+  * Remember the database connection the query methods of this class use.
+  *
+  * @param $db database connection object (stored as $this->db)
+  */
+ function __construct( $db ) {
+ $this->db = $db;
+ }
+
+ /**
+  * Perform a title search via tsearch2 (the titlevector column) and return
+  * a result set. PHP error reporting is lowered to E_ERROR while the query
+  * runs and restored afterwards.
+  *
+  * @param $term String: raw search term
+  * @return PostgresSearchResultSet, or SearchResultTooMany when the query
+  *   produced no result object
+  */
+ function searchTitle( $term ) {
+     $sql = $this->searchQuery( $term , 'titlevector', 'page_title' );
+     $previousLevel = error_reporting(E_ERROR);
+     $res = $this->db->query( $sql, 'SearchPostgres', true );
+     $resultSet = $this->db->resultObject( $res );
+     error_reporting($previousLevel);
+     if ( $resultSet ) {
+         return new PostgresSearchResultSet( $resultSet, $this->searchTerms );
+     }
+     // Needed for "Query requires full scan, GIN doesn't support it"
+     return new SearchResultTooMany();
+ }
+ /**
+  * Perform a full text search via tsearch2 (the textvector column) and
+  * return a result set. PHP error reporting is lowered to E_ERROR while
+  * the query runs and restored afterwards.
+  *
+  * @param $term String: raw search term
+  * @return PostgresSearchResultSet, or SearchResultTooMany when the query
+  *   produced no result object
+  */
+ function searchText( $term ) {
+     $sql = $this->searchQuery( $term, 'textvector', 'old_text' );
+     $previousLevel = error_reporting(E_ERROR);
+     $res = $this->db->query( $sql, 'SearchPostgres', true );
+     $resultSet = $this->db->resultObject( $res );
+     error_reporting($previousLevel);
+     if ( $resultSet ) {
+         return new PostgresSearchResultSet( $resultSet, $this->searchTerms );
+     }
+     return new SearchResultTooMany();
+ }
+
+
+ /**
+ * Transform the user's search string into a better form for tsearch2
+ * Returns an SQL fragment consisting of quoted text to search for.
+ *
+ * Words are joined with ' & '; the words "and", "or" and "not" as well as
+ * '|' and a leading '-' or '!' are mapped to tsquery operators, after
+ * which stray operators are cleaned up.
+ *
+ * @param $term String: raw search term
+ * @return String: the search string as quoted by the database's addQuotes()
+ */
+ function parseQuery( $term ) {
+
+ wfDebug( "parseQuery received: $term \n" );
+
+ ## No backslashes allowed
+ $term = preg_replace('/\\\/', '', $term);
+
+ ## Collapse parens into nearby words:
+ $term = preg_replace('/\s*\(\s*/', ' (', $term);
+ $term = preg_replace('/\s*\)\s*/', ') ', $term);
+
+ ## Treat colons as word separators:
+ $term = preg_replace('/:/', ' ', $term);
+
+ $searchstring = '';
+ $m = array();
+ ## Per match: [1] optional leading - or !, [2] the word itself
+ if( preg_match_all('/([-!]?)(\S+)\s*/', $term, $m, PREG_SET_ORDER ) ) {
+ foreach( $m as $terms ) {
+ if (strlen($terms[1])) {
+ $searchstring .= ' & !';
+ }
+ if (strtolower($terms[2]) === 'and') {
+ $searchstring .= ' & ';
+ }
+ else if (strtolower($terms[2]) === 'or' or $terms[2] === '|') {
+ $searchstring .= ' | ';
+ }
+ else if (strtolower($terms[2]) === 'not') {
+ $searchstring .= ' & !';
+ }
+ else {
+ $searchstring .= " & $terms[2]";
+ }
+ }
+ }
+
+ ## Strip out leading junk
+ $searchstring = preg_replace('/^[\s\&\|]+/', '', $searchstring);
+
+ ## Remove any doubled-up operators
+ $searchstring = preg_replace('/([\!\&\|]) +(?:[\&\|] +)+/', "$1 ", $searchstring);
+
+ ## Remove any non-spaced operators (e.g. "Zounds!")
+ $searchstring = preg_replace('/([^ ])[\!\&\|]/', "$1", $searchstring);
+
+ ## Remove any trailing whitespace or operators
+ $searchstring = preg_replace('/[\s\!\&\|]+$/', '', $searchstring);
+
+ ## Remove unnecessary quotes around everything
+ $searchstring = preg_replace('/^[\'"](.*)[\'"]$/', "$1", $searchstring);
+
+ ## Quote the whole thing
+ $searchstring = $this->db->addQuotes($searchstring);
+
+ wfDebug( "parseQuery returned: $searchstring \n" );
+
+ return $searchstring;
+
+ }
+
+ /**
+ * Construct the full SQL query to do the search.
+ *
+ * A preliminary "SELECT to_tsquery(...)" is run first; if it fails the
+ * request is ended with die(), and if it comes back empty a query that
+ * matches nothing (1=0) is built instead. Lexemes found in the tsquery
+ * result are added to $this->searchTerms.
+ *
+ * @param $term String: raw search term
+ * @param $fulltext String: name of the tsvector column to search
+ * @param $colname String: not used by this method
+ * @return String: the SQL query
+ */
+ function searchQuery( $term, $fulltext, $colname ) {
+ global $wgDBversion;
+
+ if ( !isset( $wgDBversion ) ) {
+ $this->db->getServerVersion();
+ $wgDBversion = $this->db->numeric_version;
+ }
+ ## Before 8.3 the tsearch2 calls take a configuration name as first argument
+ $prefix = $wgDBversion < 8.3 ? "'default'," : '';
+
+ # Get the SQL fragment for the given term
+ $searchstring = $this->parseQuery( $term );
+
+ ## We need a separate query here so gin does not complain about empty searches
+ $SQL = "SELECT to_tsquery($prefix $searchstring)";
+ $res = $this->db->doQuery($SQL);
+ if (!$res) {
+ ## TODO: Better output (example to catch: one 'two)
+ die ("Sorry, that was not a valid search string. Please go back and try again");
+ }
+ $top = pg_fetch_result($res,0,0);
+
+ if ($top === "") { ## e.g. if only stopwords are used XXX return something better
+ $query = "SELECT page_id, page_namespace, page_title, 0 AS score ".
+ "FROM page p, revision r, pagecontent c WHERE p.page_latest = r.rev_id " .
+ "AND r.rev_text_id = c.old_id AND 1=0";
+ }
+ else {
+ ## Collect the single-quoted lexemes of the parsed query as search terms
+ $m = array();
+ if( preg_match_all("/'([^']+)'/", $top, $m, PREG_SET_ORDER ) ) {
+ foreach( $m as $terms ) {
+ $this->searchTerms[$terms[1]] = $terms[1];
+ }
+ }
+
+ $rankscore = $wgDBversion > 8.2 ? 5 : 1;
+ $rank = $wgDBversion < 8.3 ? 'rank' : 'ts_rank';
+ $query = "SELECT page_id, page_namespace, page_title, ".
+ "$rank($fulltext, to_tsquery($prefix $searchstring), $rankscore) AS score ".
+ "FROM page p, revision r, pagecontent c WHERE p.page_latest = r.rev_id " .
+ "AND r.rev_text_id = c.old_id AND $fulltext @@ to_tsquery($prefix $searchstring)";
+ }
+
+ ## Redirects
+ if (! $this->showRedirects)
+ $query .= ' AND page_is_redirect = 0';
+
+ ## Namespaces - defaults to 0
+ if( !is_null($this->namespaces) ){ // null -> search all
+ if ( count($this->namespaces) < 1)
+ $query .= ' AND page_namespace = 0';
+ else {
+ $namespaces = $this->db->makeList( $this->namespaces );
+ $query .= " AND page_namespace IN ($namespaces)";
+ }
+ }
+
+ $query .= " ORDER BY score DESC, page_id DESC";
+
+ $query .= $this->db->limitResult( '', $this->limit, $this->offset );
+
+ wfDebug( "searchQuery returned: $query \n" );
+
+ return $query;
+ }
+
+ ## Most of the work of these two functions are done automatically via triggers
+
+ /**
+  * Clear the text vector of all but the newest text row of a page, so
+  * older revisions are not indexed.
+  *
+  * @param $pageid Integer
+  * @param $title String: not used here
+  * @param $text String: not used here
+  * @return Boolean: always true
+  */
+ function update( $pageid, $title, $text ) {
+     ## We don't want to index older revisions
+     $olderTextIds = "SELECT rev_text_id FROM revision WHERE rev_page = " . intval( $pageid ) .
+         " ORDER BY rev_text_id DESC OFFSET 1";
+     $this->db->doQuery( "UPDATE pagecontent SET textvector = NULL WHERE old_id IN " .
+         "(" . $olderTextIds . ")" );
+     return true;
+ }
+
+ /**
+  * Nothing to do here for a title change.
+  *
+  * @param $id Integer: not used here
+  * @param $title String: not used here
+  * @return Boolean: always true
+  */
+ function updateTitle( $id, $title ) {
+     return true;
+ }
+
+} ## end of the SearchPostgres class
+
+/**
+ * A single search hit from SearchPostgres, carrying the score column of
+ * its result row.
+ * @ingroup Search
+ */
+class PostgresSearchResult extends SearchResult {
+    /** Value of the "score" column of the result row */
+    var $score;
+
+    /**
+     * @param $row result row object; must provide a "score" field
+     */
+    function __construct( $row ) {
+        parent::__construct($row);
+        // Declared above rather than created on the fly: dynamic properties
+        // are deprecated as of PHP 8.2.
+        $this->score = $row->score;
+    }
+
+    /**
+     * @return the score taken from the result row
+     */
+    function getScore() {
+        return $this->score;
+    }
+}
+
+/**
+ * Result set for SearchPostgres; hands out PostgresSearchResult objects.
+ * @ingroup Search
+ */
+class PostgresSearchResultSet extends SqlSearchResultSet {
+    /**
+     * @param $resultSet database result to wrap
+     * @param $terms Array: search terms
+     */
+    function __construct( $resultSet, $terms ) {
+        parent::__construct( $resultSet, $terms );
+    }
+
+    /**
+     * Fetch the next row of the result.
+     * @return PostgresSearchResult, or false when there are no more rows
+     */
+    function next() {
+        $row = $this->mResultSet->fetchObject();
+        return ( $row === false ) ? false : new PostgresSearchResult( $row );
+    }
+}
diff --git a/includes/search/SearchSqlite.php b/includes/search/SearchSqlite.php
new file mode 100644
index 00000000..fb55efec
--- /dev/null
+++ b/includes/search/SearchSqlite.php
@@ -0,0 +1,344 @@
+<?php
+# SQLite search backend, based upon SearchMysql
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# http://www.gnu.org/copyleft/gpl.html
+
+/**
+ * @file
+ * @ingroup Search
+ */
+
+/**
+ * Search engine hook for SQLite
+ * @ingroup Search
+ */
+class SearchSqlite extends SearchEngine {
+ // Cached because SearchUpdate keeps recreating our class
+ private static $fulltextSupported = null;
+
+ /**
+ * Creates an instance of this class
+ * @param $db DatabaseSqlite: database object; stored in $this->db, which the
+ * query-building and search methods of this class read from
+ */
+ function __construct( $db ) {
+ $this->db = $db;
+ }
+
+ /**
+  * Whether fulltext search is supported by current schema.
+  *
+  * The answer is looked up once (an 'fts3' key in the updatelog table) and
+  * then kept in a static property shared by all instances.
+  * @return Boolean
+  */
+ function fulltextSearchSupported() {
+     if ( self::$fulltextSupported !== null ) {
+         return self::$fulltextSupported;
+     }
+     $marker = $this->db->selectField(
+         'updatelog',
+         'ul_key',
+         array( 'ul_key' => 'fts3' ),
+         __METHOD__ );
+     self::$fulltextSupported = ( $marker !== false );
+     return self::$fulltextSupported;
+ }
+
+ /**
+  * Parse the user's query and transform it into an SQL fragment which will
+  * become part of a WHERE clause.
+  *
+  * Side effect: $this->searchTerms is emptied and then filled with one regex
+  * fragment per parsed term (built by regexTerm()), for result highlighting.
+  *
+  * @param $filteredText String: search text to parse
+  * @param $fulltext Boolean: forwarded to getIndexField() to pick the column
+  * @return String: " <field> MATCH '<expression>' ", with the expression
+  *   escaped by the database's strencode()
+  */
+ function parseQuery( $filteredText, $fulltext ) {
+     global $wgContLang;
+     $lc = SearchEngine::legalSearchChars(); // Minus format chars
+     $searchon = '';
+     $this->searchTerms = array();
+
+     $m = array();
+     if( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
+             $filteredText, $m, PREG_SET_ORDER ) ) {
+         foreach( $m as $bits ) {
+             // When the quoted-phrase alternative matches, the two trailing
+             // groups are absent from $bits. Pad the match to a fixed length
+             // instead of silencing the resulting notices with @.
+             list( /* all */, $modifier, $term, $nonQuoted, $wildcard ) = array_pad( $bits, 5, '' );
+
+             // One strict test used everywhere below, so that a bare term
+             // such as "0" is treated the same way in both places.
+             $isBareWord = ( $nonQuoted !== '' );
+
+             if( $isBareWord ) {
+                 $term = $nonQuoted;
+                 $quote = '';
+             } else {
+                 $term = str_replace( '"', '', $term );
+                 $quote = '"';
+             }
+
+             if( $searchon !== '' ) $searchon .= ' ';
+
+             // Some languages such as Serbian store the input form in the search index,
+             // so we may need to search for matches in multiple writing system variants.
+             $convertedVariants = $wgContLang->autoConvertToAllVariants( $term );
+             if( is_array( $convertedVariants ) ) {
+                 $variants = array_unique( array_values( $convertedVariants ) );
+             } else {
+                 $variants = array( $term );
+             }
+
+             // Run each variant through normalizeForSearch(), the same
+             // language-level processing SearchUpdate applies to page text
+             // before it is indexed.
+             // For Chinese this also inserts spaces between adjacent Han characters.
+             $strippedVariants = array_map(
+                 array( $wgContLang, 'normalizeForSearch' ),
+                 $variants );
+
+             // Some languages such as Chinese force all variants to a canonical
+             // form when stripping to the low-level search index, so to be sure
+             // let's check our variants list for unique items after stripping.
+             $strippedVariants = array_unique( $strippedVariants );
+
+             $searchon .= $modifier;
+             if( count( $strippedVariants ) > 1 )
+                 $searchon .= '(';
+             foreach( $strippedVariants as $stripped ) {
+                 if( $isBareWord && strpos( $stripped, ' ' ) !== false ) {
+                     // Hack for Chinese: we need to toss in quotes for
+                     // multiple-character phrases since normalizeForSearch()
+                     // added spaces between them to make word breaks.
+                     $stripped = '"' . trim( $stripped ) . '"';
+                 }
+                 $searchon .= "$quote$stripped$quote$wildcard ";
+             }
+             if( count( $strippedVariants ) > 1 )
+                 $searchon .= ')';
+
+             // Match individual terms or quoted phrase in result highlighting...
+             // Note that variants will be introduced in a later stage for highlighting!
+             $regexp = $this->regexTerm( $term, $wildcard );
+             $this->searchTerms[] = $regexp;
+         }
+
+     } else {
+         wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" );
+     }
+
+     $searchon = $this->db->strencode( $searchon );
+     $field = $this->getIndexField( $fulltext );
+     return " $field MATCH '$searchon' ";
+ }
+
+ /**
+  * Build the regex fragment used to highlight one search term.
+  *
+  * @param $string String: the term; quoted with preg_quote() for "/" delimiters
+  * @param $wildcard String: non-empty when the term ended in "*"
+  * @return String: the quoted term, wrapped in \b word-boundary checks when
+  *   the content language reports word breaks
+  */
+ function regexTerm( $string, $wildcard ) {
+     global $wgContLang;
+
+     $quoted = preg_quote( $string, '/' );
+     if( !$wgContLang->hasWordBreaks() ) {
+         // For Chinese, words may legitimately abut other words in the text literal.
+         // No \b boundary checks here... note this could cause false positives
+         // for latin chars.
+         return $quoted;
+     }
+
+     $pattern = '\b' . $quoted;
+     if( !$wildcard ) {
+         // Only close the word when there is no wildcard: don't cut off the final bit!
+         $pattern .= '\b';
+     }
+     return $pattern;
+ }
+
+ /**
+ * Characters accepted in search input for this backend: a double quote and
+ * an asterisk, followed by whatever the parent class allows.
+ * @return String
+ */
+ public static function legalSearchChars() {
+ return "\"*" . parent::legalSearchChars();
+ }
+
+ /**
+ * Perform a full text search query and return a result set.
+ * Delegates to searchInternal() with $fulltext set to true.
+ *
+ * @param $term String: raw search term
+ * @return SqliteSearchResultSet, or null when searchInternal() finds that
+ * fulltext search is not supported
+ */
+ function searchText( $term ) {
+ return $this->searchInternal( $term, true );
+ }
+
+ /**
+ * Perform a title-only search query and return a result set.
+ * Delegates to searchInternal() with $fulltext set to false.
+ *
+ * @param $term String: raw search term
+ * @return SqliteSearchResultSet, or null when searchInternal() finds that
+ * fulltext search is not supported
+ */
+ function searchTitle( $term ) {
+ return $this->searchInternal( $term, false );
+ }
+
+ /**
+  * Shared implementation of searchText() and searchTitle().
+  *
+  * @param $term String: raw search term; lower-cased and filtered before use
+  * @param $fulltext Boolean: true to match on page text, false for titles
+  * @return SqliteSearchResultSet, or null when fulltext search is unsupported
+  */
+ protected function searchInternal( $term, $fulltext ) {
+     global $wgCountTotalSearchHits, $wgContLang;
+
+     if ( !$this->fulltextSearchSupported() ) {
+         return null;
+     }
+
+     $needle = $this->filter( $wgContLang->lc( $term ) );
+     $searchSql = $this->getQuery( $needle, $fulltext );
+     $matches = $this->db->query( $searchSql );
+
+     // The total is only computed when the wiki is configured to count hits
+     $hitCount = null;
+     if( $wgCountTotalSearchHits ) {
+         $countSql = $this->getCountQuery( $needle, $fulltext );
+         $countResult = $this->db->query( $countSql );
+         $countRow = $countResult->fetchObject();
+         $hitCount = $countRow ? intval( $countRow->c ) : null;
+         $countResult->free();
+     }
+
+     return new SqliteSearchResultSet( $matches, $this->searchTerms, $hitCount );
+ }
+
+
+ /**
+  * Partial WHERE clause that filters out redirects, or an empty string when
+  * redirects are to be shown.
+  * @return String
+  */
+ function queryRedirect() {
+     return $this->showRedirects ? '' : 'AND page_is_redirect=0';
+ }
+
+ /**
+  * Partial WHERE clause restricting the search to $this->namespaces.
+  * A null namespace list means "search all" and yields an empty string;
+  * an empty list falls back to namespace 0.
+  * @return String
+  */
+ function queryNamespaces() {
+     if( is_null( $this->namespaces ) ) {
+         return ''; # search all
+     }
+     $nsList = count( $this->namespaces )
+         ? $this->db->makeList( $this->namespaces )
+         : '0';
+     return "AND page_namespace IN ($nsList)";
+ }
+
+ /**
+ * Returns the given query with this engine's result limit applied.
+ * @param $sql String: query to add the limit to
+ * @return String: whatever the database object's limitResult() builds from
+ * $sql, $this->limit and $this->offset
+ */
+ function limitResult( $sql ) {
+ return $this->db->limitResult( $sql, $this->limit, $this->offset );
+ }
+
+ /**
+  * Construct the full SQL query to do the search.
+  * The guts should be constructed in queryMain().
+  * @param $filteredTerm String
+  * @param $fulltext Boolean
+  * @return String: main query, redirect filter and namespace filter joined
+  *   by single spaces, then passed through limitResult()
+  */
+ function getQuery( $filteredTerm, $fulltext ) {
+     $clauses = array(
+         $this->queryMain( $filteredTerm, $fulltext ),
+         $this->queryRedirect(),
+         $this->queryNamespaces()
+     );
+     return $this->limitResult( implode( ' ', $clauses ) );
+ }
+
+ /**
+  * Name of the search index column to match against.
+  * @param $fulltext Boolean: true for the text column, false for the title column
+  * @return String: 'si_text' or 'si_title'
+  */
+ function getIndexField( $fulltext ) {
+     if ( $fulltext ) {
+         return 'si_text';
+     }
+     return 'si_title';
+ }
+
+ /**
+  * Get the base part of the search query: a join of the page table with the
+  * search index on the page id, restricted by the parsed MATCH condition.
+  *
+  * @param $filteredTerm String
+  * @param $fulltext Boolean
+  * @return String
+  */
+ function queryMain( $filteredTerm, $fulltext ) {
+     $condition = $this->parseQuery( $filteredTerm, $fulltext );
+     $pageTable = $this->db->tableName( 'page' );
+     $indexTable = $this->db->tableName( 'searchindex' );
+
+     $select = "SELECT {$indexTable}.rowid, page_namespace, page_title ";
+     $from = "FROM {$pageTable},{$indexTable} ";
+     $where = "WHERE page_id={$indexTable}.rowid AND {$condition}";
+     return $select . $from . $where;
+ }
+
+ /**
+  * Build the query that counts all matches, using the same conditions as
+  * getQuery() but without a limit. The count is selected under the alias "c".
+  *
+  * @param $filteredTerm String
+  * @param $fulltext Boolean
+  * @return String
+  */
+ function getCountQuery( $filteredTerm, $fulltext ) {
+     $match = $this->parseQuery( $filteredTerm, $fulltext );
+     $page = $this->db->tableName( 'page' );
+     $searchindex = $this->db->tableName( 'searchindex' );
+     // Separate every clause explicitly, as getQuery() does, instead of
+     // relying on parseQuery() happening to return a trailing space.
+     return "SELECT COUNT(*) AS c " .
+         "FROM $page,$searchindex " .
+         "WHERE page_id=$searchindex.rowid AND $match" . ' ' .
+         $this->queryRedirect() . ' ' .
+         $this->queryNamespaces();
+ }
+
+ /**
+  * Create or update the search index record for the given page.
+  * Title and text should be pre-processed.
+  * Does nothing when fulltext search is not supported.
+  *
+  * @param $id Integer
+  * @param $title String
+  * @param $text String
+  */
+ function update( $id, $title, $text ) {
+     if ( $this->fulltextSearchSupported() ) {
+         // @todo: find a method to do it in a single request,
+         // couldn't do it so far due to typelessness of FTS3 tables.
+         $dbw = wfGetDB( DB_MASTER );
+
+         // Replace the record: drop any existing row, then insert the new one
+         $dbw->delete( 'searchindex', array( 'rowid' => $id ), __METHOD__ );
+
+         $indexRow = array(
+             'rowid' => $id,
+             'si_title' => $title,
+             'si_text' => $text
+         );
+         $dbw->insert( 'searchindex', $indexRow, __METHOD__ );
+     }
+ }
+
+ /**
+  * Update a search index record's title only.
+  * Title should be pre-processed.
+  * Does nothing when fulltext search is not supported.
+  *
+  * @param $id Integer
+  * @param $title String
+  */
+ function updateTitle( $id, $title ) {
+     if ( $this->fulltextSearchSupported() ) {
+         $newValues = array( 'si_title' => $title );
+         $rowFilter = array( 'rowid' => $id );
+         wfGetDB( DB_MASTER )->update( 'searchindex', $newValues, $rowFilter, __METHOD__ );
+     }
+ }
+}
+
+/**
+ * Result set of the SQLite backend, optionally carrying a total hit count.
+ * @ingroup Search
+ */
+class SqliteSearchResultSet extends SqlSearchResultSet {
+    /**
+     * Declared as __construct() rather than as a method named after the
+     * class: class-named constructors are deprecated since PHP 7.0 and are
+     * no longer run by `new` since PHP 8.0.
+     *
+     * @param $resultSet database result passed on to the parent class
+     * @param $terms Array: search terms passed on to the parent class
+     * @param $totalHits Integer: total number of matches, or null if not counted
+     */
+    function __construct( $resultSet, $terms, $totalHits=null ) {
+        parent::__construct( $resultSet, $terms );
+        $this->mTotalHits = $totalHits;
+    }
+
+    /**
+     * @return Integer|null: the value given to the constructor
+     */
+    function getTotalHits() {
+        return $this->mTotalHits;
+    }
+} \ No newline at end of file
diff --git a/includes/search/SearchUpdate.php b/includes/search/SearchUpdate.php
new file mode 100644
index 00000000..e30c70e6
--- /dev/null
+++ b/includes/search/SearchUpdate.php
@@ -0,0 +1,113 @@
+<?php
+/**
+ * See deferred.txt
+ * @ingroup Search
+ */
+class SearchUpdate {
+
+ /* private */ var $mId = 0, $mNamespace, $mTitle, $mText;
+ /* private */ var $mTitleWords;
+
+ /**
+  * Prepare a search index update for one page.
+  *
+  * Declared as __construct() rather than as a method named after the class:
+  * class-named constructors are deprecated since PHP 7.0 and are no longer
+  * run by `new` since PHP 8.0, which would leave the object uninitialised.
+  *
+  * @param $id Integer: page id
+  * @param $title String: title text, parsed with Title::newFromText()
+  * @param $text String|false: page text; false makes doUpdate() refresh
+  *   only the title
+  */
+ function __construct( $id, $title, $text = false ) {
+     $nt = Title::newFromText( $title );
+     if( !$nt ) {
+         // mId keeps its default of 0, so doUpdate() will return early
+         wfDebug( "SearchUpdate object created with invalid title '$title'\n" );
+         return;
+     }
+
+     $this->mId = $id;
+     $this->mText = $text;
+
+     $this->mNamespace = $nt->getNamespace();
+     $this->mTitle = $nt->getText(); # Discard namespace
+
+     $this->mTitleWords = $this->mTextWords = array();
+ }
+
+ /**
+ * Push this page's title, and its text if one was given, to the search engine.
+ *
+ * When no text was supplied (mText === false) only the title is refreshed.
+ * Otherwise the text is normalised and stripped of markup by the chain of
+ * regexps below, offered to the 'SearchUpdate' hook, and then handed to
+ * the engine's update().
+ *
+ * @return false when updates are disabled or no page id is set; otherwise
+ * nothing is returned
+ */
+ function doUpdate() {
+ global $wgContLang, $wgDisableSearchUpdate;
+
+ # mId is still 0 when the constructor was given an invalid title
+ if( $wgDisableSearchUpdate || !$this->mId ) {
+ return false;
+ }
+ $fname = 'SearchUpdate::doUpdate';
+ wfProfileIn( $fname );
+
+ $search = SearchEngine::create();
+ # Character class body used by the regexps below; '&#;' is added to the legal search characters
+ $lc = SearchEngine::legalSearchChars() . '&#;';
+
+ # Title-only update: no text to process
+ if( $this->mText === false ) {
+ $search->updateTitle($this->mId,
+ Title::indexTitle( $this->mNamespace, $this->mTitle ));
+ wfProfileOut( $fname );
+ return;
+ }
+
+ # Language-specific strip/conversion
+ $text = $wgContLang->normalizeForSearch( $this->mText );
+
+ wfProfileIn( $fname.'-regexps' );
+ $text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
+ ' ', $wgContLang->lc( " " . $text . " " ) ); # Strip HTML markup
+ # The replacement repeats the heading text three times
+ $text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD",
+ "\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings
+
+ # Strip external URLs
+ $uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\xA0-\\xFF";
+ $protos = "http|https|ftp|mailto|news|gopher";
+ $pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
+ $text = preg_replace( $pat, "\\1 \\3", $text );
+
+ # Bracketed links: $p1 drops "[url]" entirely, $p2 keeps only the label of "[url label]"
+ $p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
+ $p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
+ $text = preg_replace( $p1, "\\1 ", $text );
+ $text = preg_replace( $p2, "\\1 \\3 ", $text );
+
+ # Internal image links
+ $pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
+ $text = preg_replace( $pat2, " \\1 \\3", $text );
+
+ $text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
+ "\\1\\2 \\2\\3", $text ); # Handle [[game]]s
+
+ # Strip all remaining non-search characters
+ $text = preg_replace( "/[^{$lc}]+/", " ", $text );
+
+ # Handle 's, s'
+ #
+ # $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
+ # $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
+ #
+ # These tail-anchored regexps are insanely slow. The worst case comes
+ # when Japanese or Chinese text (ie, no word spacing) is written on
+ # a wiki configured for Western UTF-8 mode. The Unicode characters are
+ # expanded to hex codes and the "words" are very long paragraph-length
+ # monstrosities. On a large page the above regexps may take over 20
+ # seconds *each* on a 1GHz-level processor.
+ #
+ # Following are reversed versions which are consistently fast
+ # (about 3 milliseconds on 1GHz-level processor).
+ #
+ $text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
+ $text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );
+
+ # Strip wiki '' and '''
+ $text = preg_replace( "/''[']*/", " ", $text );
+ wfProfileOut( "$fname-regexps" );
+
+ # $text is passed by reference, so hook handlers may alter it before indexing
+ wfRunHooks( 'SearchUpdate', array( $this->mId, $this->mNamespace, $this->mTitle, &$text ) );
+
+ # Perform the actual update
+ $search->update($this->mId, Title::indexTitle( $this->mNamespace, $this->mTitle ),
+ $text);
+
+ wfProfileOut( $fname );
+ }
+}
+
+/**
+ * Placeholder class: adds nothing of its own and behaves exactly like
+ * SearchUpdate.
+ * @ingroup Search
+ */
+class SearchUpdateMyISAM extends SearchUpdate {
+ # Inherits everything
+}