diff options
author | Luke Shumaker <lukeshu@sbcglobal.net> | 2016-05-01 15:31:33 -0400 |
---|---|---|
committer | Luke Shumaker <lukeshu@sbcglobal.net> | 2016-05-01 15:31:33 -0400 |
commit | 150f94f051128f367bc89f6b7e5f57eb2a69fc62 (patch) | |
tree | 181f454813b310ee97385058c6c6f2e3f34d5fd8 /includes/HtmlFormatter.php | |
parent | 7e85254903c7c0cb49e381f16b18441ea7b058cc (diff) | |
parent | 80f7dc77d430774192b929d780f96260066df2ee (diff) |
Merge commit '80f7dc'
# Conflicts:
# extensions/ArchInterWiki.sql
Diffstat (limited to 'includes/HtmlFormatter.php')
-rw-r--r-- | includes/HtmlFormatter.php | 19 |
1 file changed, 17 insertions, 2 deletions
```diff
diff --git a/includes/HtmlFormatter.php b/includes/HtmlFormatter.php
index b2926d17..221cefbb 100644
--- a/includes/HtmlFormatter.php
+++ b/includes/HtmlFormatter.php
@@ -63,7 +63,15 @@ class HtmlFormatter {
 	 */
 	public function getDoc() {
 		if ( !$this->doc ) {
-			$html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
+			// DOMDocument::loadHTML apparently isn't very good with encodings, so
+			// convert input to ASCII by encoding everything above 128 as entities.
+			if ( function_exists( 'mb_convert_encoding' ) ) {
+				$html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
+			} else {
+				$html = preg_replace_callback( '/[\x{80}-\x{10ffff}]/u', function ( $m ) {
+					return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
+				}, $this->html );
+			}
 
 			// Workaround for bug that caused spaces before references
 			// to disappear during processing:
@@ -244,7 +252,14 @@ class HtmlFormatter {
 			) );
 		}
 		$html = $replacements->replace( $html );
-		$html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
+
+		if ( function_exists( 'mb_convert_encoding' ) ) {
+			// Just in case the conversion in getDoc() above used named
+			// entities that aren't known to html_entity_decode().
+			$html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
+		} else {
+			$html = html_entity_decode( $html, ENT_COMPAT, 'utf-8' );
+		}
 		return $html;
 	}
 
```