diff options
Diffstat (limited to 'extensions/SpamBlacklist/SpamRegexBatch.php')
-rw-r--r-- | extensions/SpamBlacklist/SpamRegexBatch.php | 175 |
1 files changed, 175 insertions, 0 deletions
diff --git a/extensions/SpamBlacklist/SpamRegexBatch.php b/extensions/SpamBlacklist/SpamRegexBatch.php new file mode 100644 index 00000000..b14d671e --- /dev/null +++ b/extensions/SpamBlacklist/SpamRegexBatch.php @@ -0,0 +1,175 @@ +<?php + +/** + * Utility class for working with blacklists + */ +class SpamRegexBatch { + /** + * Build a set of regular expressions matching URLs with the list of regex fragments. + * Returns an empty list if the input list is empty. + * + * @param array $lines list of fragments which will match in URLs + * @param BaseBlacklist $blacklist + * @param int $batchSize largest allowed batch regex; + * if 0, will produce one regex per line + * @return array + */ + static function buildRegexes( $lines, BaseBlacklist $blacklist, $batchSize=4096 ) { + # Make regex + # It's faster using the S modifier even though it will usually only be run once + //$regex = 'https?://+[a-z0-9_\-.]*(' . implode( '|', $lines ) . ')'; + //return '/' . str_replace( '/', '\/', preg_replace('|\\\*/|', '/', $regex) ) . '/Sim'; + $regexes = array(); + $regexStart = $blacklist->getRegexStart(); + $regexEnd = $blacklist->getRegexEnd( $batchSize ); + $build = false; + foreach( $lines as $line ) { + if( substr( $line, -1, 1 ) == "\\" ) { + // Final \ will break silently on the batched regexes. + // Skip it here to avoid breaking the next line; + // warnings from getBadLines() will still trigger on + // edit to keep new ones from floating in. + continue; + } + // FIXME: not very robust size check, but should work. :) + if( $build === false ) { + $build = $line; + } elseif( strlen( $build ) + strlen( $line ) > $batchSize ) { + $regexes[] = $regexStart . + str_replace( '/', '\/', preg_replace('|\\\*/|u', '/', $build) ) . + $regexEnd; + $build = $line; + } else { + $build .= '|'; + $build .= $line; + } + } + if( $build !== false ) { + $regexes[] = $regexStart . + str_replace( '/', '\/', preg_replace('|\\\*/|u', '/', $build) ) . + $regexEnd; + } + return $regexes; + } + + /** + * Confirm that a set of regexes is either empty or valid. + * + * @param $regexes array set of regexes + * @return bool true if ok, false if contains invalid lines + */ + static function validateRegexes( $regexes ) { + foreach( $regexes as $regex ) { + wfSuppressWarnings(); + $ok = preg_match( $regex, '' ); + wfRestoreWarnings(); + + if( $ok === false ) { + return false; + } + } + return true; + } + + /** + * Strip comments and whitespace, then remove blanks + * + * @param $lines array + * @return array + */ + static function stripLines( $lines ) { + return array_filter( + array_map( 'trim', + preg_replace( '/#.*$/', '', + $lines ) ) ); + } + + /** + * Do a sanity check on the batch regex. + * + * @param $lines string unsanitized input lines + * @param $blacklist BaseBlacklist + * @param $fileName bool|string optional for debug reporting + * @return array of regexes + */ + static function buildSafeRegexes( $lines, BaseBlacklist $blacklist, $fileName=false ) { + $lines = SpamRegexBatch::stripLines( $lines ); + $regexes = SpamRegexBatch::buildRegexes( $lines, $blacklist ); + if( SpamRegexBatch::validateRegexes( $regexes ) ) { + return $regexes; + } else { + // _Something_ broke... rebuild line-by-line; it'll be + // slower if there's a lot of blacklist lines, but one + // broken line won't take out hundreds of its brothers. + if( $fileName ) { + wfDebugLog( 'SpamBlacklist', "Spam blacklist warning: bogus line in $fileName\n" ); + } + return SpamRegexBatch::buildRegexes( $lines, $blacklist, 0 ); + } + } + + /** + * Returns an array of invalid lines + * + * @param array $lines + * @param $blacklist BaseBlacklist + * @return array of input lines which produce invalid input, or empty array if no problems + */ + static function getBadLines( $lines, BaseBlacklist $blacklist ) { + $lines = SpamRegexBatch::stripLines( $lines ); + + $badLines = array(); + foreach( $lines as $line ) { + if( substr( $line, -1, 1 ) == "\\" ) { + // Final \ will break silently on the batched regexes. + $badLines[] = $line; + } + } + + $regexes = SpamRegexBatch::buildRegexes( $lines, $blacklist ); + if( SpamRegexBatch::validateRegexes( $regexes ) ) { + // No other problems! + return $badLines; + } + + // Something failed in the batch, so check them one by one. + foreach( $lines as $line ) { + $regexes = SpamRegexBatch::buildRegexes( array( $line ), $blacklist ); + if( !SpamRegexBatch::validateRegexes( $regexes ) ) { + $badLines[] = $line; + } + } + return $badLines; + } + + /** + * Build a set of regular expressions from the given multiline input text, + * with empty lines and comments stripped. + * + * @param $source string + * @param $blacklist BaseBlacklist + * @param $fileName bool|string optional, for reporting of bad files + * @return array of regular expressions, potentially empty + */ + static function regexesFromText( $source, BaseBlacklist $blacklist, $fileName=false ) { + $lines = explode( "\n", $source ); + return SpamRegexBatch::buildSafeRegexes( $lines, $blacklist, $fileName ); + } + + /** + * Build a set of regular expressions from a MediaWiki message. + * Will be correctly empty if the message isn't present. + * + * @param $message string + * @param $blacklist BaseBlacklist + * @return array of regular expressions, potentially empty + */ + static function regexesFromMessage( $message, BaseBlacklist $blacklist ) { + $source = wfMessage( $message )->inContentLanguage(); + if( !$source->isDisabled() ) { + return SpamRegexBatch::regexesFromText( $source->plain(), $blacklist ); + } else { + return array(); + } + } +} |