summaryrefslogtreecommitdiff
path: root/includes/parser/MWTidy.php
blob: b310862f3f72289d114432016cfc6a2357f2ef88 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
<?php
/**
 * HTML validation and correction
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Parser
 */

/**
 * Class used to hide mw:editsection tokens from Tidy so that it doesn't break them
 * or break on them. This is a bit of a hack for now, but hopefully in the future
 * we may create a real postprocessor or something that will replace this.
 * It's called wrapper because for now it basically takes over MWTidy::tidy's task
 * of wrapping the text in a xhtml block
 *
 * This re-uses some of the parser's UNIQ tricks, though some of it is private so it's
 * duplicated. Perhaps we should create an abstract marker hiding class.
 *
 * @ingroup Parser
 */
class MWTidyWrapper {

	/**
	 * @var ReplacementArray
	 */
	protected $mTokens;

	protected $mUniqPrefix;

	protected $mMarkerIndex;

	public function __construct() {
		$this->mTokens = null;
		$this->mUniqPrefix = null;
	}

	/**
	 * @param string $text
	 * @return string
	 */
	public function getWrapped( $text ) {
		$this->mTokens = new ReplacementArray;
		$this->mUniqPrefix = "\x7fUNIQ" .
			dechex( mt_rand( 0, 0x7fffffff ) ) . dechex( mt_rand( 0, 0x7fffffff ) );
		$this->mMarkerIndex = 0;

		// Replace <mw:editsection> elements with placeholders
		$wrappedtext = preg_replace_callback( ParserOutput::EDITSECTION_REGEX,
			array( &$this, 'replaceCallback' ), $text );
		// ...and <mw:toc> markers
		$wrappedtext = preg_replace_callback( '/\<\\/?mw:toc\>/',
			array( &$this, 'replaceCallback' ), $wrappedtext );
		// ... and <math> tags
		$wrappedtext = preg_replace_callback( '/\<math(.*?)\<\\/math\>/s',
			array( &$this, 'replaceCallback' ), $wrappedtext );
		// Modify inline Microdata <link> and <meta> elements so they say <html-link> and <html-meta> so
		// we can trick Tidy into not stripping them out by including them in tidy's new-empty-tags config
		$wrappedtext = preg_replace( '!<(link|meta)([^>]*?)(/{0,1}>)!', '<html-$1$2$3', $wrappedtext );

		// Wrap the whole thing in a doctype and body for Tidy.
		$wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"' .
			' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>' .
			'<head><title>test</title></head><body>' . $wrappedtext . '</body></html>';

		return $wrappedtext;
	}

	/**
	 * @param array $m
	 *
	 * @return string
	 */
	public function replaceCallback( $m ) {
		$marker = "{$this->mUniqPrefix}-item-{$this->mMarkerIndex}" . Parser::MARKER_SUFFIX;
		$this->mMarkerIndex++;
		$this->mTokens->setPair( $marker, $m[0] );
		return $marker;
	}

	/**
	 * @param string $text
	 * @return string
	 */
	public function postprocess( $text ) {
		// Revert <html-{link,meta}> back to <{link,meta}>
		$text = preg_replace( '!<html-(link|meta)([^>]*?)(/{0,1}>)!', '<$1$2$3', $text );

		// Restore the contents of placeholder tokens
		$text = $this->mTokens->replace( $text );

		return $text;
	}

}

/**
 * Class to interact with HTML tidy
 *
 * Either the external tidy program or the in-process tidy extension
 * will be used depending on availability. Override the default
 * $wgTidyInternal setting to disable the internal if it's not working.
 *
 * @ingroup Parser
 */
class MWTidy {
	/**
	 * Interface with html tidy, used if $wgUseTidy = true.
	 * If tidy isn't able to correct the markup, the original will be
	 * returned in all its glory with a warning comment appended.
	 *
	 * @param string $text Hideous HTML input
	 * @return string Corrected HTML output
	 */
	public static function tidy( $text ) {
		global $wgTidyInternal;

		$wrapper = new MWTidyWrapper;
		$wrappedtext = $wrapper->getWrapped( $text );

		$retVal = null;
		if ( $wgTidyInternal ) {
			$correctedtext = self::execInternalTidy( $wrappedtext, false, $retVal );
		} else {
			$correctedtext = self::execExternalTidy( $wrappedtext, false, $retVal );
		}

		if ( $retVal < 0 ) {
			wfDebug( "Possible tidy configuration error!\n" );
			return $text . "\n<!-- Tidy was unable to run -->\n";
		} elseif ( is_null( $correctedtext ) ) {
			wfDebug( "Tidy error detected!\n" );
			return $text . "\n<!-- Tidy found serious XHTML errors -->\n";
		}

		$correctedtext = $wrapper->postprocess( $correctedtext ); // restore any hidden tokens

		return $correctedtext;
	}

	/**
	 * Check HTML for errors, used if $wgValidateAllHtml = true.
	 *
	 * @param string $text
	 * @param string &$errorStr Return the error string
	 * @return bool Whether the HTML is valid
	 */
	public static function checkErrors( $text, &$errorStr = null ) {
		global $wgTidyInternal;

		$retval = 0;
		if ( $wgTidyInternal ) {
			$errorStr = self::execInternalTidy( $text, true, $retval );
		} else {
			$errorStr = self::execExternalTidy( $text, true, $retval );
		}

		return ( $retval < 0 && $errorStr == '' ) || $retval == 0;
	}

	/**
	 * Spawn an external HTML tidy process and get corrected markup back from it.
	 * Also called in OutputHandler.php for full page validation
	 *
	 * @param string $text HTML to check
	 * @param bool $stderr Whether to read result from STDERR rather than STDOUT
	 * @param int &$retval Exit code (-1 on internal error)
	 * @return string|null
	 */
	private static function execExternalTidy( $text, $stderr = false, &$retval = null ) {
		global $wgTidyConf, $wgTidyBin, $wgTidyOpts;
		wfProfileIn( __METHOD__ );

		$cleansource = '';
		$opts = ' -utf8';

		if ( $stderr ) {
			$descriptorspec = array(
				0 => array( 'pipe', 'r' ),
				1 => array( 'file', wfGetNull(), 'a' ),
				2 => array( 'pipe', 'w' )
			);
		} else {
			$descriptorspec = array(
				0 => array( 'pipe', 'r' ),
				1 => array( 'pipe', 'w' ),
				2 => array( 'file', wfGetNull(), 'a' )
			);
		}

		$readpipe = $stderr ? 2 : 1;
		$pipes = array();

		$process = proc_open(
			"$wgTidyBin -config $wgTidyConf $wgTidyOpts$opts", $descriptorspec, $pipes );

		//NOTE: At least on linux, the process will be created even if tidy is not installed.
		//      This means that missing tidy will be treated as a validation failure.

		if ( is_resource( $process ) ) {
			// Theoretically, this style of communication could cause a deadlock
			// here. If the stdout buffer fills up, then writes to stdin could
			// block. This doesn't appear to happen with tidy, because tidy only
			// writes to stdout after it's finished reading from stdin. Search
			// for tidyParseStdin and tidySaveStdout in console/tidy.c
			fwrite( $pipes[0], $text );
			fclose( $pipes[0] );
			while ( !feof( $pipes[$readpipe] ) ) {
				$cleansource .= fgets( $pipes[$readpipe], 1024 );
			}
			fclose( $pipes[$readpipe] );
			$retval = proc_close( $process );
		} else {
			wfWarn( "Unable to start external tidy process" );
			$retval = -1;
		}

		if ( !$stderr && $cleansource == '' && $text != '' ) {
			// Some kind of error happened, so we couldn't get the corrected text.
			// Just give up; we'll use the source text and append a warning.
			$cleansource = null;
		}

		wfProfileOut( __METHOD__ );
		return $cleansource;
	}

	/**
	 * Use the HTML tidy extension to use the tidy library in-process,
	 * saving the overhead of spawning a new process.
	 *
	 * @param string $text HTML to check
	 * @param bool $stderr Whether to read result from error status instead of output
	 * @param int &$retval Exit code (-1 on internal error)
	 * @return string|null
	 */
	private static function execInternalTidy( $text, $stderr = false, &$retval = null ) {
		global $wgTidyConf, $wgDebugTidy;
		wfProfileIn( __METHOD__ );

		if ( !class_exists( 'tidy' ) ) {
			wfWarn( "Unable to load internal tidy class." );
			$retval = -1;

			wfProfileOut( __METHOD__ );
			return null;
		}

		$tidy = new tidy;
		$tidy->parseString( $text, $wgTidyConf, 'utf8' );

		if ( $stderr ) {
			$retval = $tidy->getStatus();

			wfProfileOut( __METHOD__ );
			return $tidy->errorBuffer;
		}

		$tidy->cleanRepair();
		$retval = $tidy->getStatus();
		if ( $retval == 2 ) {
			// 2 is magic number for fatal error
			// http://www.php.net/manual/en/function.tidy-get-status.php
			$cleansource = null;
		} else {
			$cleansource = tidy_get_output( $tidy );
			if ( $wgDebugTidy && $retval > 0 ) {
				$cleansource .= "<!--\nTidy reports:\n" .
					str_replace( '-->', '--&gt;', $tidy->errorBuffer ) .
					"\n-->";
			}
		}

		wfProfileOut( __METHOD__ );
		return $cleansource;
	}
}