diff options
author | Craig Andrews <candrews@integralblue.com> | 2009-07-27 13:42:03 -0400 |
---|---|---|
committer | Craig Andrews <candrews@integralblue.com> | 2009-07-27 13:42:03 -0400 |
commit | ac75772150c3fe9411408ac44db04e774d095aa0 (patch) | |
tree | fcf7b18289a31e602a821a7ea22f82e4c3cd3a54 /extlib/htmLawed | |
parent | b9cf19a2ee4b483709f1e964860fcf9209c4ba05 (diff) |
Sanitize html returned by oEmbed providers to protect laconica from XSS attacks
Diffstat (limited to 'extlib/htmLawed')
-rw-r--r-- | extlib/htmLawed/htmLawed.php | 715 | ||||
-rw-r--r-- | extlib/htmLawed/htmLawedTest.php | 592 | ||||
-rw-r--r-- | extlib/htmLawed/htmLawed_README.htm | 1979 | ||||
-rw-r--r-- | extlib/htmLawed/htmLawed_README.txt | 1600 | ||||
-rw-r--r-- | extlib/htmLawed/htmLawed_TESTCASE.txt | 370 |
5 files changed, 5256 insertions, 0 deletions
diff --git a/extlib/htmLawed/htmLawed.php b/extlib/htmLawed/htmLawed.php new file mode 100644 index 000000000..17f6e98ca --- /dev/null +++ b/extlib/htmLawed/htmLawed.php @@ -0,0 +1,715 @@ +<?php + +/* +htmLawed 1.1.8.1, 16 July 2009 +Copyright Santosh Patnaik +GPL v3 license +A PHP Labware internal utility; www.bioinformatics.org/phplabware/internal_utilities/htmLawed + +See htmLawed_README.txt/htm +*/ + +function htmLawed($t, $C=1, $S=array()){ +$C = is_array($C) ? $C : array(); +if(!empty($C['valid_xhtml'])){ + $C['elements'] = empty($C['elements']) ? '*-center-dir-font-isindex-menu-s-strike-u' : $C['elements']; + $C['make_tag_strict'] = isset($C['make_tag_strict']) ? $C['make_tag_strict'] : 2; + $C['xml:lang'] = isset($C['xml:lang']) ? $C['xml:lang'] : 2; +} +// config eles +$e = array('a'=>1, 'abbr'=>1, 'acronym'=>1, 'address'=>1, 'applet'=>1, 'area'=>1, 'b'=>1, 'bdo'=>1, 'big'=>1, 'blockquote'=>1, 'br'=>1, 'button'=>1, 'caption'=>1, 'center'=>1, 'cite'=>1, 'code'=>1, 'col'=>1, 'colgroup'=>1, 'dd'=>1, 'del'=>1, 'dfn'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'dt'=>1, 'em'=>1, 'embed'=>1, 'fieldset'=>1, 'font'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'isindex'=>1, 'kbd'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'object'=>1, 'ol'=>1, 'optgroup'=>1, 'option'=>1, 'p'=>1, 'param'=>1, 'pre'=>1, 'q'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'script'=>1, 'select'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'sup'=>1, 'table'=>1, 'tbody'=>1, 'td'=>1, 'textarea'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1, 'tt'=>1, 'u'=>1, 'ul'=>1, 'var'=>1); // 86/deprecated+embed+ruby +if(!empty($C['safe'])){ + unset($e['applet'], $e['embed'], $e['iframe'], $e['object'], $e['script']); +} +$x = !empty($C['elements']) ? 
str_replace(array("\n", "\r", "\t", ' '), '', $C['elements']) : '*'; +if($x == '-*'){$e = array();} +elseif(strpos($x, '*') === false){$e = array_flip(explode(',', $x));} +else{ + if(isset($x[1])){ + preg_match_all('`(?:^|-|\+)[^\-+]+?(?=-|\+|$)`', $x, $m, PREG_SET_ORDER); + for($i=count($m); --$i>=0;){$m[$i] = $m[$i][0];} + foreach($m as $v){ + if($v[0] == '+'){$e[substr($v, 1)] = 1;} + if($v[0] == '-' && isset($e[($v = substr($v, 1))]) && !in_array('+'. $v, $m)){unset($e[$v]);} + } + } +} +$C['elements'] =& $e; +// config attrs +$x = !empty($C['deny_attribute']) ? str_replace(array("\n", "\r", "\t", ' '), '', $C['deny_attribute']) : ''; +$x = array_flip((isset($x[0]) && $x[0] == '*') ? explode('-', $x) : explode(',', $x. (!empty($C['safe']) ? ',on*' : ''))); +if(isset($x['on*'])){ + unset($x['on*']); + $x += array('onblur'=>1, 'onchange'=>1, 'onclick'=>1, 'ondblclick'=>1, 'onfocus'=>1, 'onkeydown'=>1, 'onkeypress'=>1, 'onkeyup'=>1, 'onmousedown'=>1, 'onmousemove'=>1, 'onmouseout'=>1, 'onmouseover'=>1, 'onmouseup'=>1, 'onreset'=>1, 'onselect'=>1, 'onsubmit'=>1); +} +$C['deny_attribute'] = $x; +// config URL +$x = (isset($C['schemes'][2]) && strpos($C['schemes'], ':')) ? strtolower($C['schemes']) : 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; *:file, http, https'; +$C['schemes'] = array(); +foreach(explode(';', str_replace(array(' ', "\t", "\r", "\n"), '', $x)) as $v){ + $x = $x2 = null; list($x, $x2) = explode(':', $v, 2); + if($x2){$C['schemes'][$x] = array_flip(explode(',', $x2));} +} +if(!isset($C['schemes']['*'])){$C['schemes']['*'] = array('file'=>1, 'http'=>1, 'https'=>1,);} +if(!empty($C['safe']) && empty($C['schemes']['style'])){$C['schemes']['style'] = array('nil'=>1);} +$C['abs_url'] = isset($C['abs_url']) ? 
$C['abs_url'] : 0; +if(!isset($C['base_url']) or !preg_match('`^[a-zA-Z\d.+\-]+://[^/]+/(.+?/)?$`', $C['base_url'])){ + $C['base_url'] = $C['abs_url'] = 0; +} +// config rest +$C['and_mark'] = empty($C['and_mark']) ? 0 : 1; +$C['anti_link_spam'] = (isset($C['anti_link_spam']) && is_array($C['anti_link_spam']) && count($C['anti_link_spam']) == 2 && (empty($C['anti_link_spam'][0]) or hl_regex($C['anti_link_spam'][0])) && (empty($C['anti_link_spam'][1]) or hl_regex($C['anti_link_spam'][1]))) ? $C['anti_link_spam'] : 0; +$C['anti_mail_spam'] = isset($C['anti_mail_spam']) ? $C['anti_mail_spam'] : 0; +$C['balance'] = isset($C['balance']) ? (bool)$C['balance'] : 1; +$C['cdata'] = isset($C['cdata']) ? $C['cdata'] : (empty($C['safe']) ? 3 : 0); +$C['clean_ms_char'] = empty($C['clean_ms_char']) ? 0 : $C['clean_ms_char']; +$C['comment'] = isset($C['comment']) ? $C['comment'] : (empty($C['safe']) ? 3 : 0); +$C['css_expression'] = empty($C['css_expression']) ? 0 : 1; +$C['hexdec_entity'] = isset($C['hexdec_entity']) ? $C['hexdec_entity'] : 1; +$C['hook'] = (!empty($C['hook']) && function_exists($C['hook'])) ? $C['hook'] : 0; +$C['hook_tag'] = (!empty($C['hook_tag']) && function_exists($C['hook_tag'])) ? $C['hook_tag'] : 0; +$C['keep_bad'] = isset($C['keep_bad']) ? $C['keep_bad'] : 6; +$C['lc_std_val'] = isset($C['lc_std_val']) ? (bool)$C['lc_std_val'] : 1; +$C['make_tag_strict'] = isset($C['make_tag_strict']) ? $C['make_tag_strict'] : 1; +$C['named_entity'] = isset($C['named_entity']) ? (bool)$C['named_entity'] : 1; +$C['no_deprecated_attr'] = isset($C['no_deprecated_attr']) ? $C['no_deprecated_attr'] : 1; +$C['parent'] = isset($C['parent'][0]) ? strtolower($C['parent']) : 'body'; +$C['show_setting'] = !empty($C['show_setting']) ? $C['show_setting'] : 0; +$C['style_pass'] = empty($C['style_pass']) ? 0 : 1; +$C['tidy'] = empty($C['tidy']) ? 0 : $C['tidy']; +$C['unique_ids'] = isset($C['unique_ids']) ? $C['unique_ids'] : 1; +$C['xml:lang'] = isset($C['xml:lang']) ? 
$C['xml:lang'] : 0; + +if(isset($GLOBALS['C'])){$reC = $GLOBALS['C'];} +$GLOBALS['C'] = $C; +$S = is_array($S) ? $S : hl_spec($S); +if(isset($GLOBALS['S'])){$reS = $GLOBALS['S'];} +$GLOBALS['S'] = $S; + +$t = preg_replace('`[\x00-\x08\x0b-\x0c\x0e-\x1f]`', '', $t); +if($C['clean_ms_char']){ + $x = array("\x7f"=>'', "\x80"=>'€', "\x81"=>'', "\x83"=>'ƒ', "\x85"=>'…', "\x86"=>'†', "\x87"=>'‡', "\x88"=>'ˆ', "\x89"=>'‰', "\x8a"=>'Š', "\x8b"=>'‹', "\x8c"=>'Œ', "\x8d"=>'', "\x8e"=>'Ž', "\x8f"=>'', "\x90"=>'', "\x95"=>'•', "\x96"=>'–', "\x97"=>'—', "\x98"=>'˜', "\x99"=>'™', "\x9a"=>'š', "\x9b"=>'›', "\x9c"=>'œ', "\x9d"=>'', "\x9e"=>'ž', "\x9f"=>'Ÿ'); + $x = $x + ($C['clean_ms_char'] == 1 ? array("\x82"=>'‚', "\x84"=>'„', "\x91"=>'‘', "\x92"=>'’', "\x93"=>'“', "\x94"=>'”') : array("\x82"=>'\'', "\x84"=>'"', "\x91"=>'\'', "\x92"=>'\'', "\x93"=>'"', "\x94"=>'"')); + $t = strtr($t, $x); +} +if($C['cdata'] or $C['comment']){$t = preg_replace_callback('`<!(?:(?:--.*?--)|(?:\[CDATA\[.*?\]\]))>`sm', 'hl_cmtcd', $t);} +$t = preg_replace_callback('`&([A-Za-z][A-Za-z0-9]{1,30}|#(?:[0-9]{1,8}|[Xx][0-9A-Fa-f]{1,7}));`', 'hl_ent', str_replace('&', '&', $t)); +if($C['unique_ids'] && !isset($GLOBALS['hl_Ids'])){$GLOBALS['hl_Ids'] = array();} +if($C['hook']){$t = $C['hook']($t, $C, $S);} +if($C['show_setting'] && preg_match('`^[a-z][a-z0-9_]*$`i', $C['show_setting'])){ + $GLOBALS[$C['show_setting']] = array('config'=>$C, 'spec'=>$S, 'time'=>microtime()); +} +// main +$t = preg_replace_callback('`<(?:(?:\s|$)|(?:[^>]*(?:>|$)))|>`m', 'hl_tag', $t); +$t = $C['balance'] ? hl_bal($t, $C['keep_bad'], $C['parent']) : $t; +$t = (($C['cdata'] or $C['comment']) && strpos($t, "\x01") !== false) ? str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05"), array('', '', '&', '<', '>'), $t) : $t; +$t = $C['tidy'] ? 
hl_tidy($t, $C['tidy'], $C['parent']) : $t; +unset($C, $e); +if(isset($reC)){$GLOBALS['C'] = $reC;} +if(isset($reS)){$GLOBALS['S'] = $reS;} +return $t; +// eof +} + +function hl_attrval($t, $p){ +// check attr val against $S +$o = 1; $l = strlen($t); +foreach($p as $k=>$v){ + switch($k){ + case 'maxlen':if($l > $v){$o = 0;} + break; case 'minlen': if($l < $v){$o = 0;} + break; case 'maxval': if((float)($t) > $v){$o = 0;} + break; case 'minval': if((float)($t) < $v){$o = 0;} + break; case 'match': if(!preg_match($v, $t)){$o = 0;} + break; case 'nomatch': if(preg_match($v, $t)){$o = 0;} + break; case 'oneof': + $m = 0; + foreach(explode('|', $v) as $n){if($t == $n){$m = 1; break;}} + $o = $m; + break; case 'noneof': + $m = 1; + foreach(explode('|', $v) as $n){if($t == $n){$m = 0; break;}} + $o = $m; + break; default: + break; + } + if(!$o){break;} +} +return ($o ? $t : (isset($p['default']) ? $p['default'] : 0)); +// eof +} + +function hl_bal($t, $do=1, $in='div'){ +// balance tags +// by content +$cB = array('blockquote'=>1, 'form'=>1, 'map'=>1, 'noscript'=>1); // Block +$cE = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1); // Empty +$cF = array('button'=>1, 'del'=>1, 'div'=>1, 'dd'=>1, 'fieldset'=>1, 'iframe'=>1, 'ins'=>1, 'li'=>1, 'noscript'=>1, 'object'=>1, 'td'=>1, 'th'=>1); // Flow; later context-wise dynamic move of ins & del to $cI +$cI = array('a'=>1, 'abbr'=>1, 'acronym'=>1, 'address'=>1, 'b'=>1, 'bdo'=>1, 'big'=>1, 'caption'=>1, 'cite'=>1, 'code'=>1, 'dfn'=>1, 'dt'=>1, 'em'=>1, 'font'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'i'=>1, 'kbd'=>1, 'label'=>1, 'legend'=>1, 'p'=>1, 'pre'=>1, 'q'=>1, 'rb'=>1, 'rt'=>1, 's'=>1, 'samp'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'sup'=>1, 'tt'=>1, 'u'=>1, 'var'=>1); // Inline +$cN = array('a'=>array('a'=>1), 'button'=>array('a'=>1, 'button'=>1, 'fieldset'=>1, 'form'=>1, 'iframe'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 
'textarea'=>1), 'fieldset'=>array('fieldset'=>1), 'form'=>array('form'=>1), 'label'=>array('label'=>1), 'noscript'=>array('script'=>1), 'pre'=>array('big'=>1, 'font'=>1, 'img'=>1, 'object'=>1, 'script'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1), 'rb'=>array('ruby'=>1), 'rt'=>array('ruby'=>1)); // Illegal +$cN2 = array_keys($cN); +$cR = array('blockquote'=>1, 'dir'=>1, 'dl'=>1, 'form'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'select'=>1, 'table'=>1, 'tbody'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1); +$cS = array('colgroup'=>array('col'=>1), 'dir'=>array('li'), 'dl'=>array('dd'=>1, 'dt'=>1), 'menu'=>array('li'=>1), 'ol'=>array('li'=>1), 'optgroup'=>array('option'=>1), 'option'=>array('#pcdata'=>1), 'rbc'=>array('rb'=>1), 'rp'=>array('#pcdata'=>1), 'rtc'=>array('rt'=>1), 'ruby'=>array('rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1), 'select'=>array('optgroup'=>1, 'option'=>1), 'script'=>array('#pcdata'=>1), 'table'=>array('caption'=>1, 'col'=>1, 'colgroup'=>1, 'tfoot'=>1, 'tbody'=>1, 'tr'=>1, 'thead'=>1), 'tbody'=>array('tr'=>1), 'tfoot'=>array('tr'=>1), 'textarea'=>array('#pcdata'=>1), 'thead'=>array('tr'=>1), 'tr'=>array('td'=>1, 'th'=>1), 'ul'=>array('li'=>1)); // Specific - immediate parent-child +$cO = array('address'=>array('p'=>1), 'applet'=>array('param'=>1), 'blockquote'=>array('script'=>1), 'fieldset'=>array('legend'=>1, '#pcdata'=>1), 'form'=>array('script'=>1), 'map'=>array('area'=>1), 'object'=>array('param'=>1, 'embed'=>1)); // Other +$cT = array('colgroup'=>1, 'dd'=>1, 'dt'=>1, 'li'=>1, 'option'=>1, 'p'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1); // Omitable closing +// block/inline type; ins & del both type; #pcdata: text +$eB = array('address'=>1, 'blockquote'=>1, 'center'=>1, 'del'=>1, 'dir'=>1, 'dl'=>1, 'div'=>1, 'fieldset'=>1, 'form'=>1, 'ins'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'isindex'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'table'=>1, 
'ul'=>1); +$eI = array('#pcdata'=>1, 'a'=>1, 'abbr'=>1, 'acronym'=>1, 'applet'=>1, 'b'=>1, 'bdo'=>1, 'big'=>1, 'br'=>1, 'button'=>1, 'cite'=>1, 'code'=>1, 'del'=>1, 'dfn'=>1, 'em'=>1, 'embed'=>1, 'font'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'kbd'=>1, 'label'=>1, 'map'=>1, 'object'=>1, 'param'=>1, 'q'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'select'=>1, 'script'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1, 'tt'=>1, 'u'=>1, 'var'=>1); +$eN = array('a'=>1, 'big'=>1, 'button'=>1, 'fieldset'=>1, 'font'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'label'=>1, 'object'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1); // Exclude from specific ele; $cN values +$eO = array('area'=>1, 'caption'=>1, 'col'=>1, 'colgroup'=>1, 'dd'=>1, 'dt'=>1, 'legend'=>1, 'li'=>1, 'optgroup'=>1, 'option'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'script'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'thead'=>1, 'th'=>1, 'tr'=>1); // Missing in $eB & $eI +$eF = $eB + $eI; + +// $in sets allowed child +$in = ((isset($eF[$in]) && $in != '#pcdata') or isset($eO[$in])) ? $in : 'div'; +if(isset($cE[$in])){ + return (!$do ? 
'' : str_replace(array('<', '>'), array('<', '>'), $t)); +} +if(isset($cS[$in])){$inOk = $cS[$in];} +elseif(isset($cI[$in])){$inOk = $eI; $cI['del'] = 1; $cI['ins'] = 1;} +elseif(isset($cF[$in])){$inOk = $eF; unset($cI['del'], $cI['ins']);} +elseif(isset($cB[$in])){$inOk = $eB; unset($cI['del'], $cI['ins']);} +if(isset($cO[$in])){$inOk = $inOk + $cO[$in];} +if(isset($cN[$in])){$inOk = array_diff_assoc($inOk, $cN[$in]);} + +$t = explode('<', $t); +$ok = $q = array(); // $q seq list of open non-empty ele +ob_start(); + +for($i=-1, $ci=count($t); ++$i<$ci;){ + // allowed $ok in parent $p + if($ql = count($q)){ + $p = array_pop($q); + $q[] = $p; + if(isset($cS[$p])){$ok = $cS[$p];} + elseif(isset($cI[$p])){$ok = $eI; $cI['del'] = 1; $cI['ins'] = 1;} + elseif(isset($cF[$p])){$ok = $eF; unset($cI['del'], $cI['ins']);} + elseif(isset($cB[$p])){$ok = $eB; unset($cI['del'], $cI['ins']);} + if(isset($cO[$p])){$ok = $ok + $cO[$p];} + if(isset($cN[$p])){$ok = array_diff_assoc($ok, $cN[$p]);} + }else{$ok = $inOk; unset($cI['del'], $cI['ins']);} + // bad tags, & ele content + if(isset($e) && ($do == 1 or (isset($ok['#pcdata']) && ($do == 3 or $do == 5)))){ + echo '<', $s, $e, $a, '>'; + } + if(isset($x[0])){ + if($do < 3 or isset($ok['#pcdata'])){echo $x;} + elseif(strpos($x, "\x02\x04")){ + foreach(preg_split('`(\x01\x02[^\x01\x02]+\x02\x01)`', $x, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $v){ + echo (substr($v, 0, 2) == "\x01\x02" ? $v : ($do > 4 ? 
preg_replace('`\S`', '', $v) : '')); + } + }elseif($do > 4){echo preg_replace('`\S`', '', $x);} + } + // get markup + if(!preg_match('`^(/?)([a-zA-Z1-6]+)([^>]*)>(.*)`sm', $t[$i], $r)){$x = $t[$i]; continue;} + $s = null; $e = null; $a = null; $x = null; list($all, $s, $e, $a, $x) = $r; + // close tag + if($s){ + if(isset($cE[$e]) or !in_array($e, $q)){continue;} // Empty/unopen + if($p == $e){array_pop($q); echo '</', $e, '>'; unset($e); continue;} // Last open + $add = ''; // Nesting - close open tags that need to be + for($j=-1, $cj=count($q); ++$j<$cj;){ + if(($d = array_pop($q)) == $e){break;} + else{$add .= "</{$d}>";} + } + echo $add, '</', $e, '>'; unset($e); continue; + } + // open tag + // $cB ele needs $eB ele as child + if(isset($cB[$e]) && strlen(trim($x))){ + $t[$i] = "{$e}{$a}>"; + array_splice($t, $i+1, 0, 'div>'. $x); unset($e, $x); ++$ci; --$i; continue; + } + if((($ql && isset($cB[$p])) or (isset($cB[$in]) && !$ql)) && !isset($eB[$e]) && !isset($ok[$e])){ + array_splice($t, $i, 0, 'div>'); unset($e, $x); ++$ci; --$i; continue; + } + // if no open ele, $in = parent; mostly immediate parent-child relation should hold + if(!$ql or !isset($eN[$e]) or !array_intersect($q, $cN2)){ + if(!isset($ok[$e])){ + if($ql && isset($cT[$p])){echo '</', array_pop($q), '>'; unset($e, $x); --$i;} + continue; + } + if(!isset($cE[$e])){$q[] = $e;} + echo '<', $e, $a, '>'; unset($e); continue; + } + // specific parent-child + if(isset($cS[$p][$e])){ + if(!isset($cE[$e])){$q[] = $e;} + echo '<', $e, $a, '>'; unset($e); continue; + } + // nesting + $add = ''; + $q2 = array(); + for($k=-1, $kc=count($q); ++$k<$kc;){ + $d = $q[$k]; + $ok2 = array(); + if(isset($cS[$d])){$q2[] = $d; continue;} + $ok2 = isset($cI[$d]) ? 
$eI : $eF; + if(isset($cO[$d])){$ok2 = $ok2 + $cO[$d];} + if(isset($cN[$d])){$ok2 = array_diff_assoc($ok2, $cN[$d]);} + if(!isset($ok2[$e])){ + if(!$k && !isset($inOk[$e])){continue 2;} + $add = "</{$d}>"; + for(;++$k<$kc;){$add = "</{$q[$k]}>{$add}";} + break; + } + else{$q2[] = $d;} + } + $q = $q2; + if(!isset($cE[$e])){$q[] = $e;} + echo $add, '<', $e, $a, '>'; unset($e); continue; +} + +// end +if($ql = count($q)){ + $p = array_pop($q); + $q[] = $p; + if(isset($cS[$p])){$ok = $cS[$p];} + elseif(isset($cI[$p])){$ok = $eI; $cI['del'] = 1; $cI['ins'] = 1;} + elseif(isset($cF[$p])){$ok = $eF; unset($cI['del'], $cI['ins']);} + elseif(isset($cB[$p])){$ok = $eB; unset($cI['del'], $cI['ins']);} + if(isset($cO[$p])){$ok = $ok + $cO[$p];} + if(isset($cN[$p])){$ok = array_diff_assoc($ok, $cN[$p]);} +}else{$ok = $inOk; unset($cI['del'], $cI['ins']);} +if(isset($e) && ($do == 1 or (isset($ok['#pcdata']) && ($do == 3 or $do == 5)))){ + echo '<', $s, $e, $a, '>'; +} +if(isset($x[0])){ + if(strlen(trim($x)) && (($ql && isset($cB[$p])) or (isset($cB[$in]) && !$ql))){ + echo '<div>', $x, '</div>'; + } + elseif($do < 3 or isset($ok['#pcdata'])){echo $x;} + elseif(strpos($x, "\x02\x04")){ + foreach(preg_split('`(\x01\x02[^\x01\x02]+\x02\x01)`', $x, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $v){ + echo (substr($v, 0, 2) == "\x01\x02" ? $v : ($do > 4 ? preg_replace('`\S`', '', $v) : '')); + } + }elseif($do > 4){echo preg_replace('`\S`', '', $x);} +} +while(!empty($q) && ($e = array_pop($q))){echo '</', $e, '>';} +$o = ob_get_contents(); +ob_end_clean(); +return $o; +// eof +} + +function hl_cmtcd($t){ +// comment/CDATA sec handler +$t = $t[0]; +global $C; +if($t[3] == '-'){ + if(!$C['comment']){return $t;} + if($C['comment'] == 1){return '';} + if(substr(($t = preg_replace('`--+`', '-', substr($t, 4, -3))), -1) != ' '){$t .= ' ';} + $t = $C['comment'] == 2 ? 
str_replace(array('&', '<', '>'), array('&', '<', '>'), $t) : $t; + $t = "\x01\x02\x04!--$t--\x05\x02\x01"; +}else{ // CDATA + if(!$C['cdata']){return $t;} + if($C['cdata'] == 1){return '';} + $t = substr($t, 1, -1); + $t = $C['cdata'] == 2 ? str_replace(array('&', '<', '>'), array('&', '<', '>'), $t) : $t; + $t = "\x01\x01\x04$t\x05\x01\x01"; +} +return str_replace(array('&', '<', '>'), array("\x03", "\x04", "\x05"), $t); +// eof +} + +function hl_ent($t){ +// entitity handler +global $C; +$t = $t[1]; +static $U = array('quot'=>1,'amp'=>1,'lt'=>1,'gt'=>1); +static $N = array('fnof'=>'402', 'Alpha'=>'913', 'Beta'=>'914', 'Gamma'=>'915', 'Delta'=>'916', 'Epsilon'=>'917', 'Zeta'=>'918', 'Eta'=>'919', 'Theta'=>'920', 'Iota'=>'921', 'Kappa'=>'922', 'Lambda'=>'923', 'Mu'=>'924', 'Nu'=>'925', 'Xi'=>'926', 'Omicron'=>'927', 'Pi'=>'928', 'Rho'=>'929', 'Sigma'=>'931', 'Tau'=>'932', 'Upsilon'=>'933', 'Phi'=>'934', 'Chi'=>'935', 'Psi'=>'936', 'Omega'=>'937', 'alpha'=>'945', 'beta'=>'946', 'gamma'=>'947', 'delta'=>'948', 'epsilon'=>'949', 'zeta'=>'950', 'eta'=>'951', 'theta'=>'952', 'iota'=>'953', 'kappa'=>'954', 'lambda'=>'955', 'mu'=>'956', 'nu'=>'957', 'xi'=>'958', 'omicron'=>'959', 'pi'=>'960', 'rho'=>'961', 'sigmaf'=>'962', 'sigma'=>'963', 'tau'=>'964', 'upsilon'=>'965', 'phi'=>'966', 'chi'=>'967', 'psi'=>'968', 'omega'=>'969', 'thetasym'=>'977', 'upsih'=>'978', 'piv'=>'982', 'bull'=>'8226', 'hellip'=>'8230', 'prime'=>'8242', 'Prime'=>'8243', 'oline'=>'8254', 'frasl'=>'8260', 'weierp'=>'8472', 'image'=>'8465', 'real'=>'8476', 'trade'=>'8482', 'alefsym'=>'8501', 'larr'=>'8592', 'uarr'=>'8593', 'rarr'=>'8594', 'darr'=>'8595', 'harr'=>'8596', 'crarr'=>'8629', 'lArr'=>'8656', 'uArr'=>'8657', 'rArr'=>'8658', 'dArr'=>'8659', 'hArr'=>'8660', 'forall'=>'8704', 'part'=>'8706', 'exist'=>'8707', 'empty'=>'8709', 'nabla'=>'8711', 'isin'=>'8712', 'notin'=>'8713', 'ni'=>'8715', 'prod'=>'8719', 'sum'=>'8721', 'minus'=>'8722', 'lowast'=>'8727', 'radic'=>'8730', 'prop'=>'8733', 
'infin'=>'8734', 'ang'=>'8736', 'and'=>'8743', 'or'=>'8744', 'cap'=>'8745', 'cup'=>'8746', 'int'=>'8747', 'there4'=>'8756', 'sim'=>'8764', 'cong'=>'8773', 'asymp'=>'8776', 'ne'=>'8800', 'equiv'=>'8801', 'le'=>'8804', 'ge'=>'8805', 'sub'=>'8834', 'sup'=>'8835', 'nsub'=>'8836', 'sube'=>'8838', 'supe'=>'8839', 'oplus'=>'8853', 'otimes'=>'8855', 'perp'=>'8869', 'sdot'=>'8901', 'lceil'=>'8968', 'rceil'=>'8969', 'lfloor'=>'8970', 'rfloor'=>'8971', 'lang'=>'9001', 'rang'=>'9002', 'loz'=>'9674', 'spades'=>'9824', 'clubs'=>'9827', 'hearts'=>'9829', 'diams'=>'9830', 'apos'=>'39', 'OElig'=>'338', 'oelig'=>'339', 'Scaron'=>'352', 'scaron'=>'353', 'Yuml'=>'376', 'circ'=>'710', 'tilde'=>'732', 'ensp'=>'8194', 'emsp'=>'8195', 'thinsp'=>'8201', 'zwnj'=>'8204', 'zwj'=>'8205', 'lrm'=>'8206', 'rlm'=>'8207', 'ndash'=>'8211', 'mdash'=>'8212', 'lsquo'=>'8216', 'rsquo'=>'8217', 'sbquo'=>'8218', 'ldquo'=>'8220', 'rdquo'=>'8221', 'bdquo'=>'8222', 'dagger'=>'8224', 'Dagger'=>'8225', 'permil'=>'8240', 'lsaquo'=>'8249', 'rsaquo'=>'8250', 'euro'=>'8364', 'nbsp'=>'160', 'iexcl'=>'161', 'cent'=>'162', 'pound'=>'163', 'curren'=>'164', 'yen'=>'165', 'brvbar'=>'166', 'sect'=>'167', 'uml'=>'168', 'copy'=>'169', 'ordf'=>'170', 'laquo'=>'171', 'not'=>'172', 'shy'=>'173', 'reg'=>'174', 'macr'=>'175', 'deg'=>'176', 'plusmn'=>'177', 'sup2'=>'178', 'sup3'=>'179', 'acute'=>'180', 'micro'=>'181', 'para'=>'182', 'middot'=>'183', 'cedil'=>'184', 'sup1'=>'185', 'ordm'=>'186', 'raquo'=>'187', 'frac14'=>'188', 'frac12'=>'189', 'frac34'=>'190', 'iquest'=>'191', 'Agrave'=>'192', 'Aacute'=>'193', 'Acirc'=>'194', 'Atilde'=>'195', 'Auml'=>'196', 'Aring'=>'197', 'AElig'=>'198', 'Ccedil'=>'199', 'Egrave'=>'200', 'Eacute'=>'201', 'Ecirc'=>'202', 'Euml'=>'203', 'Igrave'=>'204', 'Iacute'=>'205', 'Icirc'=>'206', 'Iuml'=>'207', 'ETH'=>'208', 'Ntilde'=>'209', 'Ograve'=>'210', 'Oacute'=>'211', 'Ocirc'=>'212', 'Otilde'=>'213', 'Ouml'=>'214', 'times'=>'215', 'Oslash'=>'216', 'Ugrave'=>'217', 'Uacute'=>'218', 'Ucirc'=>'219', 
'Uuml'=>'220', 'Yacute'=>'221', 'THORN'=>'222', 'szlig'=>'223', 'agrave'=>'224', 'aacute'=>'225', 'acirc'=>'226', 'atilde'=>'227', 'auml'=>'228', 'aring'=>'229', 'aelig'=>'230', 'ccedil'=>'231', 'egrave'=>'232', 'eacute'=>'233', 'ecirc'=>'234', 'euml'=>'235', 'igrave'=>'236', 'iacute'=>'237', 'icirc'=>'238', 'iuml'=>'239', 'eth'=>'240', 'ntilde'=>'241', 'ograve'=>'242', 'oacute'=>'243', 'ocirc'=>'244', 'otilde'=>'245', 'ouml'=>'246', 'divide'=>'247', 'oslash'=>'248', 'ugrave'=>'249', 'uacute'=>'250', 'ucirc'=>'251', 'uuml'=>'252', 'yacute'=>'253', 'thorn'=>'254', 'yuml'=>'255'); +if($t[0] != '#'){ + return ($C['and_mark'] ? "\x06" : '&'). (isset($U[$t]) ? $t : (isset($N[$t]) ? (!$C['named_entity'] ? '#'. ($C['hexdec_entity'] > 1 ? 'x'. dechex($N[$t]) : $N[$t]) : $t) : 'amp;'. $t)). ';'; +} +if(($n = ctype_digit($t = substr($t, 1)) ? intval($t) : hexdec(substr($t, 1))) < 9 or ($n > 13 && $n < 32) or $n == 11 or $n == 12 or ($n > 126 && $n < 160 && $n != 133) or ($n > 55295 && ($n < 57344 or ($n > 64975 && $n < 64992) or $n == 65534 or $n == 65535 or $n > 1114111))){ + return ($C['and_mark'] ? "\x06" : '&'). "amp;#{$t};"; +} +return ($C['and_mark'] ? "\x06" : '&'). '#'. (((ctype_digit($t) && $C['hexdec_entity'] < 2) or !$C['hexdec_entity']) ? $n : 'x'. dechex($n)). ';'; +// eof +} + +function hl_prot($p, $c=null){ +// check URL scheme +global $C; +$b = $a = ''; +if($c == null){$c = 'style'; $b = $p[1]; $a = $p[3]; $p = trim($p[2]);} +$c = isset($C['schemes'][$c]) ? 
$C['schemes'][$c] : $C['schemes']['*']; +if(isset($c['*']) or !strcspn($p, '#?;')){return "{$b}{$p}{$a}";} // All ok, frag, query, param +if(preg_match('`^([a-z\d\-+.&#; ]+?)(:|&#(58|x3a);|%3a|\\\\0{0,4}3a).`i', $p, $m) && !isset($c[strtolower($m[1])])){ // Denied prot + return "{$b}denied:{$p}{$a}"; +} +if($C['abs_url']){ + if($C['abs_url'] == -1 && strpos($p, $C['base_url']) === 0){ // Make url rel + $p = substr($p, strlen($C['base_url'])); + }elseif(empty($m[1])){ // Make URL abs + if(substr($p, 0, 2) == '//'){$p = substr($C['base_url'], 0, strpos($C['base_url'], ':')+1). $p;} + elseif($p[0] == '/'){$p = preg_replace('`(^.+?://[^/]+)(.*)`', '$1', $C['base_url']). $p;} + elseif(strcspn($p, './')){$p = $C['base_url']. $p;} + else{ + preg_match('`^([a-zA-Z\d\-+.]+://[^/]+)(.*)`', $C['base_url'], $m); + $p = preg_replace('`(?<=/)\./`', '', $m[2]. $p); + while(preg_match('`(?<=/)([^/]{3,}|[^/.]+?|\.[^/.]|[^/.]\.)/\.\./`', $p)){ + $p = preg_replace('`(?<=/)([^/]{3,}|[^/.]+?|\.[^/.]|[^/.]\.)/\.\./`', '', $p); + } + $p = $m[1]. $p; + } + } +} +return "{$b}{$p}{$a}"; +// eof +} + +function hl_regex($p){ +// ?regex +if(empty($p)){return 0;} +if($t = ini_get('track_errors')){$o = isset($php_errormsg) ? $php_errormsg : null;} +else{ini_set('track_errors', 1);} +unset($php_errormsg); +if(($d = ini_get('display_errors'))){ini_set('display_errors', 0);} +preg_match($p, ''); +if($d){ini_set('display_errors', 1);} +$r = isset($php_errormsg) ? 0 : 1; +if($t){$php_errormsg = isset($o) ? 
$o : null;} +else{ini_set('track_errors', 0);} +return $r; +// eof +} + +function hl_spec($t){ +// final $spec +$s = array(); +$t = str_replace(array("\t", "\r", "\n", ' '), '', preg_replace('/"(?>(`.|[^"])*)"/sme', 'substr(str_replace(array(";", "|", "~", " ", ",", "/", "(", ")", \'`"\'), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08", "\""), "$0"), 1, -1)', trim($t))); +for($i = count(($t = explode(';', $t))); --$i>=0;){ + $w = $t[$i]; + if(empty($w) or ($e = strpos($w, '=')) === false or !strlen(($a = substr($w, $e+1)))){continue;} + $y = $n = array(); + foreach(explode(',', $a) as $v){ + if(!preg_match('`^([a-z:\-\*]+)(?:\((.*?)\))?`i', $v, $m)){continue;} + if(($x = strtolower($m[1])) == '-*'){$n['*'] = 1; continue;} + if($x[0] == '-'){$n[substr($x, 1)] = 1; continue;} + if(!isset($m[2])){$y[$x] = 1; continue;} + foreach(explode('/', $m[2]) as $m){ + if(empty($m) or ($p = strpos($m, '=')) == 0 or $p < 5){$y[$x] = 1; continue;} + $y[$x][strtolower(substr($m, 0, $p))] = str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08"), array(";", "|", "~", " ", ",", "/", "(", ")"), substr($m, $p+1)); + } + if(isset($y[$x]['match']) && !hl_regex($y[$x]['match'])){unset($y[$x]['match']);} + if(isset($y[$x]['nomatch']) && !hl_regex($y[$x]['nomatch'])){unset($y[$x]['nomatch']);} + } + if(!count($y) && !count($n)){continue;} + foreach(explode(',', substr($w, 0, $e)) as $v){ + if(!strlen(($v = strtolower($v)))){continue;} + if(count($y)){$s[$v] = $y;} + if(count($n)){$s[$v]['n'] = $n;} + } +} +return $s; +// eof +} + +function hl_tag($t){ +// tag/attribute handler +global $C; +$t = $t[0]; +// invalid < > +if($t == '< '){return '< ';} +if($t == '>'){return '>';} +if(!preg_match('`^<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>$`m', $t, $m)){ + return str_replace(array('<', '>'), array('<', '>'), $t); +}elseif(!isset($C['elements'][($e = strtolower($m[2]))])){ + return (($C['keep_bad']%2) ? 
str_replace(array('<', '>'), array('<', '>'), $t) : ''); +} +// attr string +$a = str_replace(array("\xad", "\n", "\r", "\t"), ' ', trim($m[3])); +if(strpos($a, '&') !== false){ + str_replace(array('­', '­', '­'), ' ', $a); +} +// tag transform +static $eD = array('applet'=>1, 'center'=>1, 'dir'=>1, 'embed'=>1, 'font'=>1, 'isindex'=>1, 'menu'=>1, 's'=>1, 'strike'=>1, 'u'=>1); // Deprecated +if($C['make_tag_strict'] && isset($eD[$e])){ + $trt = hl_tag2($e, $a, $C['make_tag_strict']); + if(!$e){return (($C['keep_bad']%2) ? str_replace(array('<', '>'), array('<', '>'), $t) : '');} +} +// close tag +static $eE = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1); // Empty ele +if(!empty($m[1])){ + return (!isset($eE[$e]) ? "</$e>" : (($C['keep_bad'])%2 ? str_replace(array('<', '>'), array('<', '>'), $t) : '')); +} + +// open tag & attr +static $aN = array('abbr'=>array('td'=>1, 'th'=>1), 'accept-charset'=>array('form'=>1), 'accept'=>array('form'=>1, 'input'=>1), 'accesskey'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'legend'=>1, 'textarea'=>1), 'action'=>array('form'=>1), 'align'=>array('caption'=>1, 'embed'=>1, 'applet'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'legend'=>1, 'table'=>1, 'hr'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'p'=>1, 'col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'alt'=>array('applet'=>1, 'area'=>1, 'img'=>1, 'input'=>1), 'archive'=>array('applet'=>1, 'object'=>1), 'axis'=>array('td'=>1, 'th'=>1), 'bgcolor'=>array('embed'=>1, 'table'=>1, 'tr'=>1, 'td'=>1, 'th'=>1), 'border'=>array('table'=>1, 'img'=>1, 'object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellpadding'=>array('table'=>1), 'cellspacing'=>array('table'=>1), 'char'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charoff'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 
'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charset'=>array('a'=>1, 'script'=>1), 'checked'=>array('input'=>1), 'cite'=>array('blockquote'=>1, 'q'=>1, 'del'=>1, 'ins'=>1), 'classid'=>array('object'=>1), 'clear'=>array('br'=>1), 'code'=>array('applet'=>1), 'codebase'=>array('object'=>1, 'applet'=>1), 'codetype'=>array('object'=>1), 'color'=>array('font'=>1), 'cols'=>array('textarea'=>1), 'colspan'=>array('td'=>1, 'th'=>1), 'compact'=>array('dir'=>1, 'dl'=>1, 'menu'=>1, 'ol'=>1, 'ul'=>1), 'coords'=>array('area'=>1, 'a'=>1), 'data'=>array('object'=>1), 'datetime'=>array('del'=>1, 'ins'=>1), 'declare'=>array('object'=>1), 'defer'=>array('script'=>1), 'dir'=>array('bdo'=>1), 'disabled'=>array('button'=>1, 'input'=>1, 'optgroup'=>1, 'option'=>1, 'select'=>1, 'textarea'=>1), 'enctype'=>array('form'=>1), 'face'=>array('font'=>1), 'for'=>array('label'=>1), 'frame'=>array('table'=>1), 'frameborder'=>array('iframe'=>1), 'headers'=>array('td'=>1, 'th'=>1), 'height'=>array('embed'=>1, 'iframe'=>1, 'td'=>1, 'th'=>1, 'img'=>1, 'object'=>1, 'applet'=>1), 'href'=>array('a'=>1, 'area'=>1), 'hreflang'=>array('a'=>1), 'hspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'ismap'=>array('img'=>1, 'input'=>1), 'label'=>array('option'=>1, 'optgroup'=>1), 'language'=>array('script'=>1), 'longdesc'=>array('img'=>1, 'iframe'=>1), 'marginheight'=>array('iframe'=>1), 'marginwidth'=>array('iframe'=>1), 'maxlength'=>array('input'=>1), 'method'=>array('form'=>1), 'model'=>array('embed'=>1), 'multiple'=>array('select'=>1), 'name'=>array('button'=>1, 'embed'=>1, 'textarea'=>1, 'applet'=>1, 'select'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'a'=>1, 'input'=>1, 'object'=>1, 'map'=>1, 'param'=>1), 'nohref'=>array('area'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'object'=>array('applet'=>1), 'onblur'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onchange'=>array('input'=>1, 'select'=>1, 'textarea'=>1), 'onfocus'=>array('a'=>1, 
'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onreset'=>array('form'=>1), 'onselect'=>array('input'=>1, 'textarea'=>1), 'onsubmit'=>array('form'=>1), 'pluginspage'=>array('embed'=>1), 'pluginurl'=>array('embed'=>1), 'prompt'=>array('isindex'=>1), 'readonly'=>array('textarea'=>1, 'input'=>1), 'rel'=>array('a'=>1), 'rev'=>array('a'=>1), 'rows'=>array('textarea'=>1), 'rowspan'=>array('td'=>1, 'th'=>1), 'rules'=>array('table'=>1), 'scope'=>array('td'=>1, 'th'=>1), 'scrolling'=>array('iframe'=>1), 'selected'=>array('option'=>1), 'shape'=>array('area'=>1, 'a'=>1), 'size'=>array('hr'=>1, 'font'=>1, 'input'=>1, 'select'=>1), 'span'=>array('col'=>1, 'colgroup'=>1), 'src'=>array('embed'=>1, 'script'=>1, 'input'=>1, 'iframe'=>1, 'img'=>1), 'standby'=>array('object'=>1), 'start'=>array('ol'=>1), 'summary'=>array('table'=>1), 'tabindex'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'object'=>1, 'select'=>1, 'textarea'=>1), 'target'=>array('a'=>1, 'area'=>1, 'form'=>1), 'type'=>array('a'=>1, 'embed'=>1, 'object'=>1, 'param'=>1, 'script'=>1, 'input'=>1, 'li'=>1, 'ol'=>1, 'ul'=>1, 'button'=>1), 'usemap'=>array('img'=>1, 'input'=>1, 'object'=>1), 'valign'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'value'=>array('input'=>1, 'option'=>1, 'param'=>1, 'button'=>1, 'li'=>1), 'valuetype'=>array('param'=>1), 'vspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'width'=>array('embed'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'object'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'applet'=>1, 'col'=>1, 'colgroup'=>1, 'pre'=>1), 'wmode'=>array('embed'=>1), 'xml:space'=>array('pre'=>1, 'script'=>1, 'style'=>1)); // Ele-specific +static $aNE = array('checked'=>1, 'compact'=>1, 'declare'=>1, 'defer'=>1, 'disabled'=>1, 'ismap'=>1, 'multiple'=>1, 'nohref'=>1, 'noresize'=>1, 'noshade'=>1, 'nowrap'=>1, 'readonly'=>1, 'selected'=>1); // Empty +static $aNP = array('action'=>1, 'cite'=>1, 'classid'=>1, 'codebase'=>1, 'data'=>1, 
'href'=>1, 'longdesc'=>1, 'model'=>1, 'pluginspage'=>1, 'pluginurl'=>1, 'usemap'=>1); // Need scheme check; excludes style, on* & src +static $aNU = array('class'=>array('param'=>1, 'script'=>1), 'dir'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'id'=>array('script'=>1), 'lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'xml:lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'onclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'ondblclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeydown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeypress'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeyup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousedown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousemove'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseout'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseover'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'style'=>array('param'=>1, 'script'=>1), 'title'=>array('param'=>1, 'script'=>1)); // Univ & exceptions + +if($C['lc_std_val']){ + // predef attr vals for $eAL & $aNE ele + static $aNL = array('all'=>1, 'baseline'=>1, 'bottom'=>1, 'button'=>1, 'center'=>1, 'char'=>1, 'checkbox'=>1, 'circle'=>1, 'col'=>1, 'colgroup'=>1, 'cols'=>1, 'data'=>1, 'default'=>1, 'file'=>1, 'get'=>1, 
'groups'=>1, 'hidden'=>1, 'image'=>1, 'justify'=>1, 'left'=>1, 'ltr'=>1, 'middle'=>1, 'none'=>1, 'object'=>1, 'password'=>1, 'poly'=>1, 'post'=>1, 'preserve'=>1, 'radio'=>1, 'rect'=>1, 'ref'=>1, 'reset'=>1, 'right'=>1, 'row'=>1, 'rowgroup'=>1, 'rows'=>1, 'rtl'=>1, 'submit'=>1, 'text'=>1, 'top'=>1); + static $eAL = array('a'=>1, 'area'=>1, 'bdo'=>1, 'button'=>1, 'col'=>1, 'form'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1, 'xml:space'=>1); + $lcase = isset($eAL[$e]) ? 1 : 0; +} + +$depTr = 0; +if($C['no_deprecated_attr']){ + // dep attr:applicable ele + static $aND = array('align'=>array('caption'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'object'=>1, 'p'=>1, 'table'=>1), 'bgcolor'=>array('table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1), 'border'=>array('img'=>1, 'object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'clear'=>array('br'=>1), 'compact'=>array('dl'=>1, 'ol'=>1, 'ul'=>1), 'height'=>array('td'=>1, 'th'=>1), 'hspace'=>array('img'=>1, 'object'=>1), 'language'=>array('script'=>1), 'name'=>array('a'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'map'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'size'=>array('hr'=>1), 'start'=>array('ol'=>1), 'type'=>array('li'=>1, 'ol'=>1, 'ul'=>1), 'value'=>array('li'=>1), 'vspace'=>array('img'=>1, 'object'=>1), 'width'=>array('hr'=>1, 'pre'=>1, 'td'=>1, 'th'=>1)); + static $eAD = array('a'=>1, 'br'=>1, 'caption'=>1, 'div'=>1, 'dl'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'li'=>1, 'map'=>1, 'object'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'script'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1, 'ul'=>1); + $depTr = isset($eAD[$e]) ? 
1 : 0; +} + +// attr name-vals +if(strpos($a, "\x01") !== false){$a = preg_replace('`\x01[^\x01]*\x01`', '', $a);} // No comment/CDATA sec +$mode = 0; $a = trim($a, ' /'); $aA = array(); +while(strlen($a)){ + $w = 0; + switch($mode){ + case 0: // Name + if(preg_match('`^[a-zA-Z][\-a-zA-Z:]+`', $a, $m)){ + $nm = strtolower($m[0]); + $w = $mode = 1; $a = ltrim(substr_replace($a, '', 0, strlen($m[0]))); + } + break; case 1: + if($a[0] == '='){ // = + $w = 1; $mode = 2; $a = ltrim($a, '= '); + }else{ // No val + $w = 1; $mode = 0; $a = ltrim($a); + $aA[$nm] = ''; + } + break; case 2: // Val + if(preg_match('`^"[^"]*"`', $a, $m) or preg_match("`^'[^']*'`", $a, $m) or preg_match("`^\s*[^\s\"']+`", $a, $m)){ + $m = $m[0]; $w = 1; $mode = 0; $a = ltrim(substr_replace($a, '', 0, strlen($m))); + $aA[$nm] = trim(($m[0] == '"' or $m[0] == '\'') ? substr($m, 1, -1) : $m); + } + break; + } + if($w == 0){ // Parse errs, deal with space, " & ' + $a = preg_replace('`^(?:"[^"]*("|$)|\'[^\']*(\'|$)|\S)*\s*`', '', $a); + $mode = 0; + } +} +if($mode == 1){$aA[$nm] = '';} + +// clean attrs +global $S; +$rl = isset($S[$e]) ? $S[$e] : array(); +$a = array(); $nfr = 0; +foreach($aA as $k=>$v){ + if(((isset($C['deny_attribute']['*']) ? isset($C['deny_attribute'][$k]) : !isset($C['deny_attribute'][$k])) or isset($rl[$k])) && ((!isset($rl['n'][$k]) && !isset($rl['n']['*'])) or isset($rl[$k])) && (isset($aN[$k][$e]) or (isset($aNU[$k]) && !isset($aNU[$k][$e])))){ + if(isset($aNE[$k])){$v = $k;} + elseif(!empty($lcase) && (($e != 'button' or $e != 'input') or $k == 'type')){ // Rather loose but ?not cause issues + $v = (isset($aNL[($v2 = strtolower($v))])) ? 
$v2 : $v; + } + if($k == 'style' && !$C['style_pass']){ + if(false !== strpos($v, '&#')){ + static $sC = array(' '=>' ', ' '=>' ', 'E'=>'e', 'E'=>'e', 'e'=>'e', 'e'=>'e', 'X'=>'x', 'X'=>'x', 'x'=>'x', 'x'=>'x', 'P'=>'p', 'P'=>'p', 'p'=>'p', 'p'=>'p', 'S'=>'s', 'S'=>'s', 's'=>'s', 's'=>'s', 'I'=>'i', 'I'=>'i', 'i'=>'i', 'i'=>'i', 'O'=>'o', 'O'=>'o', 'o'=>'o', 'o'=>'o', 'N'=>'n', 'N'=>'n', 'n'=>'n', 'n'=>'n', 'U'=>'u', 'U'=>'u', 'u'=>'u', 'u'=>'u', 'R'=>'r', 'R'=>'r', 'r'=>'r', 'r'=>'r', 'L'=>'l', 'L'=>'l', 'l'=>'l', 'l'=>'l', '('=>'(', '('=>'(', ')'=>')', ')'=>')', ' '=>':', ' '=>':', '"'=>'"', '"'=>'"', '''=>"'", '''=>"'", '/'=>'/', '/'=>'/', '*'=>'*', '*'=>'*', '\'=>'\\', '\'=>'\\'); + $v = strtr($v, $sC); + } + $v = preg_replace_callback('`(url(?:\()(?: )*(?:\'|"|&(?:quot|apos);)?)(.+)((?:\'|"|&(?:quot|apos);)?(?: )*(?:\)))`iS', 'hl_prot', $v); + $v = !$C['css_expression'] ? preg_replace('`expression`i', ' ', preg_replace('`\\\\\S|(/|(%2f))(\*|(%2a))`i', ' ', $v)) : $v; + }elseif(isset($aNP[$k]) or strpos($k, 'src') !== false or $k[0] == 'o'){ + $v = hl_prot($v, $k); + if($k == 'href'){ // X-spam + if($C['anti_mail_spam'] && strpos($v, 'mailto:') === 0){ + $v = str_replace('@', htmlspecialchars($C['anti_mail_spam']), $v); + }elseif($C['anti_link_spam']){ + $r1 = $C['anti_link_spam'][1]; + if(!empty($r1) && preg_match($r1, $v)){continue;} + $r0 = $C['anti_link_spam'][0]; + if(!empty($r0) && preg_match($r0, $v)){ + if(isset($a['rel'])){ + if(!preg_match('`\bnofollow\b`i', $a['rel'])){$a['rel'] .= ' nofollow';} + }elseif(isset($aA['rel'])){ + if(!preg_match('`\bnofollow\b`i', $aA['rel'])){$nfr = 1;} + }else{$a['rel'] = 'nofollow';} + } + } + } + } + if(isset($rl[$k]) && is_array($rl[$k]) && ($v = hl_attrval($v, $rl[$k])) === 0){continue;} + $a[$k] = str_replace('"', '"', $v); + } +} +if($nfr){$a['rel'] = isset($a['rel']) ? $a['rel']. 
' nofollow' : 'nofollow';} + +// rqd attr +static $eAR = array('area'=>array('alt'=>'area'), 'bdo'=>array('dir'=>'ltr'), 'form'=>array('action'=>''), 'img'=>array('src'=>'', 'alt'=>'image'), 'map'=>array('name'=>''), 'optgroup'=>array('label'=>''), 'param'=>array('name'=>''), 'script'=>array('type'=>'text/javascript'), 'textarea'=>array('rows'=>'10', 'cols'=>'50')); +if(isset($eAR[$e])){ + foreach($eAR[$e] as $k=>$v){ + if(!isset($a[$k])){$a[$k] = isset($v[0]) ? $v : $k;} + } +} + +// depr attrs +if($depTr){ + $c = array(); + foreach($a as $k=>$v){ + if($k == 'style' or !isset($aND[$k][$e])){continue;} + if($k == 'align'){ + unset($a['align']); + if($e == 'img' && ($v == 'left' or $v == 'right')){$c[] = 'float: '. $v;} + elseif(($e == 'div' or $e == 'table') && $v == 'center'){$c[] = 'margin: auto';} + else{$c[] = 'text-align: '. $v;} + }elseif($k == 'bgcolor'){ + unset($a['bgcolor']); + $c[] = 'background-color: '. $v; + }elseif($k == 'border'){ + unset($a['border']); $c[] = "border: {$v}px"; + }elseif($k == 'bordercolor'){ + unset($a['bordercolor']); $c[] = 'border-color: '. $v; + }elseif($k == 'clear'){ + unset($a['clear']); $c[] = 'clear: '. ($v != 'all' ? $v : 'both'); + }elseif($k == 'compact'){ + unset($a['compact']); $c[] = 'font-size: 85%'; + }elseif($k == 'height' or $k == 'width'){ + unset($a[$k]); $c[] = $k. ': '. ($v[0] != '*' ? $v. (ctype_digit($v) ? 'px' : '') : 'auto'); + }elseif($k == 'hspace'){ + unset($a['hspace']); $c[] = "margin-left: {$v}px; margin-right: {$v}px"; + }elseif($k == 'language' && !isset($a['type'])){ + unset($a['language']); + $a['type'] = 'text/'. 
strtolower($v); + }elseif($k == 'name'){ + if($C['no_deprecated_attr'] == 2 or ($e != 'a' && $e != 'map')){unset($a['name']);} + if(!isset($a['id']) && preg_match('`[a-zA-Z][a-zA-Z\d.:_\-]*`', $v)){$a['id'] = $v;} + }elseif($k == 'noshade'){ + unset($a['noshade']); $c[] = 'border-style: none; border: 0; background-color: gray; color: gray'; + }elseif($k == 'nowrap'){ + unset($a['nowrap']); $c[] = 'white-space: nowrap'; + }elseif($k == 'size'){ + unset($a['size']); $c[] = 'size: '. $v. 'px'; + }elseif($k == 'start' or $k == 'value'){ + unset($a[$k]); + }elseif($k == 'type'){ + unset($a['type']); + static $ol_type = array('i'=>'lower-roman', 'I'=>'upper-roman', 'a'=>'lower-latin', 'A'=>'upper-latin', '1'=>'decimal'); + $c[] = 'list-style-type: '. (isset($ol_type[$v]) ? $ol_type[$v] : 'decimal'); + }elseif($k == 'vspace'){ + unset($a['vspace']); $c[] = "margin-top: {$v}px; margin-bottom: {$v}px"; + } + } + if(count($c)){ + $c = implode('; ', $c); + $a['style'] = isset($a['style']) ? rtrim($a['style'], ' ;'). '; '. $c. ';': $c. ';'; + } +} +// unique ID +if($C['unique_ids'] && isset($a['id'])){ + if(!preg_match('`^[A-Za-z][A-Za-z0-9_\-.:]*$`', ($id = $a['id'])) or (isset($GLOBALS['hl_Ids'][$id]) && $C['unique_ids'] == 1)){unset($a['id']); + }else{ + while(isset($GLOBALS['hl_Ids'][$id])){$id = $C['unique_ids']. $id;} + $GLOBALS['hl_Ids'][($a['id'] = $id)] = 1; + } +} +// xml:lang +if($C['xml:lang'] && isset($a['lang'])){ + $a['xml:lang'] = isset($a['xml:lang']) ? $a['xml:lang'] : $a['lang']; + if($C['xml:lang'] == 2){unset($a['lang']);} +} +// for transformed tag +if(!empty($trt)){ + $a['style'] = isset($a['style']) ? rtrim($a['style'], ' ;'). '; '. $trt : $trt; +} +// return with empty ele / +if(empty($C['hook_tag'])){ + $aA = ''; + foreach($a as $k=>$v){$aA .= " {$k}=\"{$v}\"";} + return "<{$e}{$aA}". (isset($eE[$e]) ? ' /' : ''). 
'>'; +} +else{return $C['hook_tag']($e, $a);} +// eof +} +

function hl_tag2(&$e, &$a, $t=1){
// transform tag
// Maps a deprecated presentational element to its strict replacement.
//  $e  element name, by reference; rewritten to the replacement element
//  $a  raw attribute string of the tag; only read (for 'font'), never changed
//  $t  when 2, an element that is not handled here is rejected: $e is set
//      to 0 and 0 is returned
// Returns the CSS declarations that reproduce the old rendering ('' when none
// are needed).
if($e == 'center'){$e = 'div'; return 'text-align: center;';}
if($e == 'dir' or $e == 'menu'){$e = 'ul'; return '';}
if($e == 's' or $e == 'strike'){$e = 'span'; return 'text-decoration: line-through;';}
if($e == 'u'){$e = 'span'; return 'text-decoration: underline;';}
static $fs = array('0'=>'xx-small', '1'=>'xx-small', '2'=>'small', '3'=>'medium', '4'=>'large', '5'=>'x-large', '6'=>'xx-large', '7'=>'300%', '-1'=>'smaller', '-2'=>'60%', '+1'=>'larger', '+2'=>'150%', '+3'=>'200%', '+4'=>'300%');
if($e == 'font'){
  // The returned text is CSS destined for a style="..." value, so chars that
  // could close that value or open another declaration are neutralised.
  // NOTE(review): hl_tag seems to append this text after its own style
  // checks have run - confirm against the call site.
  static $bad = array('"', ';', ':');
  $a2 = '';
  // quoted value first; else an unquoted run of non-space chars (group 1 is
  // a dummy so that the value is group 2 in both patterns)
  if(preg_match('`face\s*=\s*(\'|")([^=]+?)\\1`i', $a, $m) or preg_match('`face\s*=(\s*)(\S+)`i', $a, $m)){
    $a2 .= ' font-family: '. str_replace($bad, '\'', trim($m[2])). ';';
  }
  if(preg_match('`color\s*=\s*(\'|")?(.+?)(\\1|\s|$)`i', $a, $m)){
    $a2 .= ' color: '. str_replace($bad, '\'', trim($m[2])). ';';
  }
  // size is only honoured when it is one of the known keys of $fs
  if(preg_match('`size\s*=\s*(\'|")?(.+?)(\\1|\s|$)`i', $a, $m) && isset($fs[($m = trim($m[2]))])){
    $a2 .= ' font-size: '. $fs[$m]. ';';
  }
  $e = 'span'; return ltrim($a2);
}
if($t == 2){$e = 0; return 0;}
return '';
// eof
}

function hl_tidy_mask($m){
// hl_tidy helper (preg_replace_callback): inside a protected section swap
// markup and white-space chars for control-char placeholders so that the
// tidying patterns cannot touch them; $m[1]/$m[4] are the section's opening
// and closing delimiters, $m[3] its content
return $m[1]. str_replace(array('<', '>', "\n", "\r", "\t", ' '), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4];
// eof
}

function hl_tidy($t, $w, $p){
// Tidy/compact HTM
//  $t  markup to process
//  $w  format spec: -1 = compact only; otherwise a string where 't' selects
//      tab indentation (else spaces), the first digit is the number of indent
//      chars per level (default 1 tab or 2 spaces), a digit 1-9 right after
//      't' or 's' is the starting depth, and 'r' / 'r' plus 'n' select "\r" /
//      "\r\n" line-breaks instead of "\n"
//  $p  name of the parent element of $t
// Returns the compacted or beautified markup; content of CDATA sections,
// comments, pre, script and textarea is left as it was.
static $raw = array('pre'=>1, 'script'=>1, 'textarea'=>1);
if(isset($raw[$p])){return $t;} // white-space is significant in these parents

$hid = array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07");
$vis = array('<', '>', "\n", "\r", "\t", ' ');

// 1. mask protected sections; 2. collapse white-space; 3. drop space before closing tags
$t = preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea).*?>)(.+?)(</\2>)`sm'), 'hl_tidy_mask', $t);
$t = preg_replace(array('`(<\w[^>]*(?<!/)>)\s+`', '`\s+`', '`(<\w[^>]*(?<!/)>) `'), array(' $1', ' ', '$1'), $t);
$t = str_replace(' </', '</', $t);

if(($w = strtolower((string)$w)) == -1){
  return str_replace($hid, $vis, $t);
}

$s = strpos(" $w", 't') ? "\t" : ' ';
$s = preg_match('`\d`', $w, $m) ? str_repeat($s, (int)$m[0]) : str_repeat($s, ($s == "\t" ? 1 : 2));
$n = preg_match('`[ts]([1-9])`', $w, $m) ? (int)$m[1] : 0;

// line-break placement per element group
$a = array('br'=>1); // break after
$b = array('button'=>1, 'input'=>1, 'option'=>1); // break before
$c = array('caption'=>1, 'dd'=>1, 'dt'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'isindex'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'object'=>1, 'p'=>1, 'pre'=>1, 'td'=>1, 'textarea'=>1, 'th'=>1); // break before opening, after closing
$d = array('address'=>1, 'blockquote'=>1, 'center'=>1, 'colgroup'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'fieldset'=>1, 'form'=>1, 'hr'=>1, 'iframe'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1); // own line, content indented one level

// output is collected in a string, so nothing is left buffered on an error
$o = isset($d[$p]) ? str_repeat($s, ++$n) : '';
$t = explode('<', $t);
$o .= ltrim(array_shift($t));
foreach($t as $frag){
  // tag text and the text following it; limit 2 keeps a later '>' in the text
  $frag = explode('>', $frag, 2);
  $e = $frag[0];
  $r = isset($frag[1]) ? $frag[1] : '';
  $e0 = isset($e[0]) ? $e[0] : '';
  // $x: 0 closing tag, 1 self-closed tag, 2 opening tag, -1 '<!...' item
  $x = $e0 == '/' ? 0 : (substr($e, -1) == '/' ? 1 : ($e0 != '!' ? 2 : -1));
  // $y: element name (0 for '<!...' items)
  $y = !$x ? ltrim($e, '/') : ($x > 0 ? substr($e, 0, strcspn($e, ' ')) : 0);
  $e = "<$e>";
  if(isset($d[$y])){
    if(!$x){
      // an unmatched closing tag must not push the depth below zero
      $n = $n > 0 ? $n - 1 : 0;
      $o .= "\n". str_repeat($s, $n). "$e\n". str_repeat($s, $n);
    }else{
      $o .= "\n". str_repeat($s, $n). "$e\n";
      if($x != 1){++$n;}
      $o .= str_repeat($s, $n);
    }
    $o .= ltrim($r);
    continue;
  }
  $f = "\n". str_repeat($s, $n);
  if(isset($c[$y])){
    $o .= !$x ? $e. $f. ltrim($r) : $f. $e. $r;
  }elseif(isset($b[$y])){
    $o .= $f. $e. $r;
  }elseif(isset($a[$y])){
    $o .= $e. $f. ltrim($r);
  }elseif(!$y){
    $o .= $f. $e. $f. ltrim($r);
  }else{
    $o .= $e. $r;
  }
}

// drop blank/white-space-only lines, apply the line-break style, unmask
$t = preg_replace('`[\n]\s*?[\n]+`', "\n", $o);
if(($l = strpos(" $w", 'r') ? (strpos(" $w", 'n') ? "\r\n" : "\r") : 0)){
  $t = str_replace("\n", $l, $t);
}
return str_replace($hid, $vis, $t);
// eof
}

function hl_version(){
// rel
// Returns the release number of this copy of the library as a string.
return '1.1.8.1';
// eof
}

function kses($t, $h, $p=array('http', 'https', 'ftp', 'news', 'nntp', 'telnet', 'gopher', 'mailto')){
// kses compat
//  $t  text to filter
//  $h  array keyed by allowed element name (kses 'allowed HTML' style)
//  $p  list of allowed URL schemes
// Builds an htmLawed configuration from the kses-style arguments and returns
// the result of htmLawed().
foreach(array_keys($h) as $k){
  // 'n' => '*' is read by hl_tag as: deny every attribute of the element
  // that is not itself listed in its spec
  $h[$k]['n']['*'] = 1;
}
$C = array();
$C['cdata'] = $C['comment'] = $C['make_tag_strict'] = $C['no_deprecated_attr'] = $C['unique_ids'] = 0;
$C['keep_bad'] = 1;
// with no element listed, '-*' is passed as the element spec
$C['elements'] = count($h) ? strtolower(implode(',', array_keys($h))) : '-*';
$C['hook'] = 'kses_hook';
$C['schemes'] = '*:'. implode(',', $p);
return htmLawed($t, $C, $h);
// eof
}

function kses_hook($t, &$C, &$S){
// kses compat
// Hook named in the configuration built by kses(); returns the text
// unchanged and does not touch $C or $S.
return $t;
// eof
}
\ No newline at end of file diff --git a/extlib/htmLawed/htmLawedTest.php b/extlib/htmLawed/htmLawedTest.php new file mode 100644 index 000000000..776828699 --- /dev/null +++ b/extlib/htmLawed/htmLawedTest.php @@ -0,0 +1,592 @@ +<?php + +/* +htmLawedTest.php, 16 July 2009 +htmLawed 1.1.8.1, 16 July 2009 +Copyright Santosh Patnaik +GPL v3 license +A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed + +Test htmLawed; user provides text input; input and processed input are shown as highlighted code and rendered HTML; also shown are execution time and peak memory usage +*/ + +// config +$_errs = 0; // display PHP errors +$_limit = 8000; // input character limit + +// more config +$_hlimit = 1000; // input character limit for showing hexdumps +$_hilite = 1; // 0 turns off slow Javascript-based code-highlighting, e.g., if $_limit is high +$_w3c_validate = 1; // 1 to show buttons to send input/output to w3c validator +$_sid = 'sid'; // session name; alphanum. +$_slife = 30; // session life in min. + +// errors +error_reporting(E_ALL | (defined('E_STRICT') ? E_STRICT : 1)); +ini_set('display_errors', $_errs); + +// session +session_name($_sid); +session_cache_limiter('private'); +session_cache_expire($_slife); +ini_set('session.gc_maxlifetime', $_slife * 60); +ini_set('session.use_only_cookies', 1); +ini_set('session.cookie_lifetime', 0); +session_start(); +if(!isset($_SESSION['token'])){ + $_SESSION['token'] = md5(uniqid(rand(), 1)); +} + +// slashes +if(get_magic_quotes_gpc()){ + foreach($_POST as $k => $v){ + $_POST[$k] = stripslashes($v); + } + ini_set('magic_quotes_gpc', 0); +} +set_magic_quotes_runtime(0); + +$_POST['enc'] = (isset($_POST['enc']) and preg_match('`^[-\w]+$`', $_POST['enc'])) ? 
$_POST['enc'] : 'utf-8'; + +// token for anti-CSRF +if(count($_POST)){ + if((empty($_GET['pre']) and ((!empty($_POST['token']) and !empty($_SESSION['token']) and $_POST['token'] != $_SESSION['token']) or empty($_POST[$_sid]) or $_POST[$_sid] != session_id() or empty($_COOKIE[$_sid]) or $_COOKIE[$_sid] != session_id())) or ($_POST[$_sid] != session_id())){ + $_POST = array('enc'=>'utf-8'); + } +} +if(empty($_GET['pre'])){ + $_SESSION['token'] = md5(uniqid(rand(), 1)); + $token = $_SESSION['token']; + session_regenerate_id(1); +} + +// compress +if(function_exists('gzencode') && isset($_SERVER['HTTP_ACCEPT_ENCODING']) && preg_match('`gzip|deflate`i', $_SERVER['HTTP_ACCEPT_ENCODING']) && !ini_get('zlib.output_compression')){ + ob_start('ob_gzhandler'); +} + +// HTM for unprocessed +if(isset($_POST['inputH'])){ + echo '<html><head><title>htmLawed test: HTML view of unprocessed input</title></head><body style="margin:0; padding: 0;"><p style="background-color: black; color: white; padding: 2px;"> Rendering of unprocessed input without an HTML doctype or charset declaration <small><a style="color: white; text-decoration: none;" href="1" onclick="javascript:window.close(this); return false;">close window</a> | <a style="color: white; text-decoration: none;" href="htmLawedTest.php" onclick="javascript: window.open(\'htmLawedTest.php\', \'hlmain\'); window.close(this); return false;">htmLawed test page</a></small></p><div>', $_POST['inputH'], '</div></body></html>'; + exit; +} + +// main +$_POST['text'] = isset($_POST['text']) ? $_POST['text'] : 'text to process; < '. $_limit. ' characters'. ($_hlimit ? ' (for binary hexdump view, < '. $_hlimit. ')' : ''); +$do = (!empty($_POST[$_sid]) && isset($_POST['text'][0]) && !isset($_POST['text'][$_limit])) ? 1 : 0; +$limit_exceeded = isset($_POST['text'][$_limit]) ? 1 : 0; +$pre_mem = memory_get_usage(); +$validation = (!empty($_POST[$_sid]) and isset($_POST['w3c_validate'][0])) ? 
1 : 0; +include './htmLawed.php'; + +function format($t){ + $t = "\n". str_replace(array("\t", "\r\n", "\r", '&', '<', '>', "\n"), array(' ', "\n", "\n", '&', '<', '>', "<span class=\"newline\">¬</span><br />\n"), $t); + return str_replace(array('<br />', "\n ", ' '), array("\n<br />\n", "\n ", ' '), $t); +} + +function hexdump($d){ +// Mainly by Aidan Lister <aidan@php.net>, Peter Waller <iridum@php.net> + $hexi = ''; + $ascii = ''; + ob_start(); + echo '<pre>'; + $offset = 0; + $len = strlen($d); + for($i=$j=0; $i<$len; $i++) + { + // Convert to hexidecimal + $hexi .= sprintf("%02X ", ord($d[$i])); + // Replace non-viewable bytes with '.' + if(ord($d[$i]) >= 32){ + $ascii .= htmlspecialchars($d[$i]); + }else{ + $ascii .= '.'; + } + // Add extra column spacing + if($j == 7){ + $hexi .= ' '; + $ascii .= ' '; + } + // Add row + if(++$j == 16 || $i == $len-1){ + // Join the hexi / ascii output + echo sprintf("%04X %-49s %s", $offset, $hexi, $ascii); + // Reset vars + $hexi = $ascii = ''; + $offset += 16; + $j = 0; + // Add newline + if ($i !== $len-1){ + echo "\n"; + } + } + } + echo '</pre>'; + $o = ob_get_contents(); + ob_end_clean(); + return $o; +} +?> + +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html lang="en" xml:lang="en"> +<head> +<meta http-equiv="Content-Type" content="text/html; charset=<?php echo htmlspecialchars($_POST['enc']); ?>" /> +<meta name="description" content="htmLawed <?php echo hl_version();?> test page" /> +<style type="text/css"><!--/*--><![CDATA[/*><!--*/ +a, a.resizer{text-decoration:none;} +a:hover, a.resizer:hover{color:red;} +a.resizer{color:green; float:right;} +body{background-color:#efefef;} +body, button, div, html, input, p{font-size:13px; font-family:'Lucida grande', Verdana, Arial, Helvetica, sans-serif;} +button, input{font-size: 85%;} +div.help{border-top: 1px dotted gray; margin-top: 15px; padding-top: 15px; color:#999999;} +#inputC, #inputD, 
#inputF, #inputR, #outputD, #outputF, #outputH, #outputR, #settingF{display:block;} +#inputC, #settingF{background-color:white; border:1px gray solid; padding:3px;} +#inputC li{margin: 0; padding: 0;} +#inputC ul{margin: 0; padding: 0; margin-left: 14px;} +#inputC input{margin: 0; margin-left: 2px; margin-right: 2px; padding: 1px; vertical-align: middle;} +#inputD{overflow:auto; background-color:#ffff99; border:1px #cc9966 solid; padding:3px;} +#inputR{overflow:auto; background-color:#ffffcc; border:1px #ffcc99 solid; padding:3px;} +#inputC, #settingF, #inputD, #inputR, #outputD, #outputR, textarea{font-size:100%; font-family:'Bitstream vera sans mono', 'courier new', 'courier', monospace;} +#outputD{overflow:auto; background-color: #99ffcc; border:1px #66cc99 solid; padding:3px;} +#outputH{overflow:auto; background-color:white; padding:3px; border:1px #dcdcdc solid;} +#outputR{overflow:auto; background-color: #ccffcc; border:1px #99cc99 solid; padding:3px;} +span.cmtcdata{color: orange;} +span.ctag{color:red;} +span.ent{border-bottom:1px dotted #999999;} +span.etag{color:purple;} +span.help{color:#999999;} +span.newline{color:#dcdcdc;} +span.notice{color:green;} +span.otag{color:blue;} +#topmost{margin:auto; width:98%;} +/*]]>*/--></style> +<script type="text/javascript"><!--//--><![CDATA[//><!-- +window.name = 'hlmain'; +function hl(i){ + <?php if(!$_hilite){echo 'return;'; }?> + var e = document.getElementById(i); + if(!e){return;} + run(e, '</[a-z1-6]+>', 'ctag'); + run(e, '<[a-z]+(?:[^>]*)/>', 'etag'); + run(e, '<[a-z1-6]+(?:[^>]*)>', 'otag'); + run(e, '&[#a-z0-9]+;', 'ent'); + run(e, '<!(?:(?:--(?:.|\n)*?--)|(?:\\[CDATA\\[(?:.|\n)*?\\]\\]))>', 'cmtcdata'); +} +function sndProc(){ + var f = document.getElementById('testform'); + if(!f){return;} + var e = document.createElement('input'); + e.type = 'hidden'; + e.name = '<?php echo htmlspecialchars($_sid); ?>'; + e.id = '<?php echo htmlspecialchars($_sid); ?>'; + e.value = readCookie('<?php echo 
htmlspecialchars($_sid); ?>'); + f.appendChild(e); + f.submit(); +} +function readCookie(n){ + var ne = n + '='; + var ca = document.cookie.split(';'); + for(var i=0;i < ca.length;i++){ + var c = ca[i]; + while(c.charAt(0)==' '){ + c = c.substring(1,c.length); + } + if(c.indexOf(ne) == 0){ + return c.substring(ne.length,c.length); + } + } + return null; +} +function run(e, q, c){ + var q = new RegExp(q); + if(e.firstChild == null){ + var m = q.exec(e.data); + if(m){ + var v = m[0]; + var k2 = e.splitText(m.index); + var k3 = k2.splitText(v.length); + var s = e.ownerDocument.createElement('span'); + e.parentNode.replaceChild(s, k2); + s.className = c; s.appendChild(k2); + } + } + for(var k = e.firstChild; k != null; k = k.nextSibling){ + if(k.nodeType == 3){ + var m = q.exec(k.data); + if(m){ + var v = m[0]; + var k2 = k.splitText(m.index); + var k3 = k2.splitText(v.length); + var s = k.ownerDocument.createElement('span'); + k.parentNode.replaceChild(s, k2); + s.className = c; s.appendChild(k2); + } + } + else if(c == 'ent' && k.nodeType == 1){ + var d = k.firstChild; + if(d){ + var m = q.exec(d.data); + if(m){ + var v = m[0]; + var d2 = d.splitText(m.index); + var d3 = d2.splitText(v.length); + var s = d.ownerDocument.createElement('span'); + d.parentNode.replaceChild(s, d2); + s.className = c; s.appendChild(d2); + } + } + } + } +} +function toggle(i){ + var e = document.getElementById(i); + if(!e){return;} + if(e.style){ + var a = e.style.display; + if(a == 'block'){e.style.display = 'none'; return;} + if(a == 'none'){e.style.display = 'block';} + else{e.style.display = 'none';} + return; + } + var a = e.visibility; + if(a == 'hidden'){e.visibility = 'show'; return;} + if(a == 'show'){e.visibility = 'hidden';} +} +function sndUnproc(){ + var i = document.getElementById('text'); + if(!i){return;} + i = i.value; + i = i.replace(/>/g, '>'); + i = i.replace(/</g, '<'); + i = i.replace(/"/g, '"'); + var w = window.open('htmLawedTest.php?pre=1', 'hlprehtm'); + var f = 
document.createElement('form'); + f.enctype = 'application/x-www-form-urlencoded'; + f.method = 'post'; + f.acceptCharset = '<?php echo htmlspecialchars($_POST['enc']); ?>'; + if(f.style){f.style.display = 'none';} + else{f.visibility = 'hidden';} + f.innerHTML = '<p style="display:none;"><input style="display:none;" type="hidden" name="token" id="token" value="<?php echo $token; ?>" /><input style="display:none;" type="hidden" name="<?php echo htmlspecialchars($_sid); ?>" id="<?php echo htmlspecialchars($_sid); ?>" value="' + readCookie('<?php echo htmlspecialchars($_sid); ?>') + '" /><input style="display:none;" type="hidden" name="inputH" id="inputH" value="'+ i+ '" /></p>'; + f.action = 'htmLawedTest.php?pre=1'; + f.target = 'hlprehtm'; + f.method = 'post'; + var b = document.getElementsByTagName('body')[0]; + b.appendChild(f); + f.submit(); + w.focus; +} +function sndValidn(id, type){ + var i = document.getElementById(id); + if(!i){return;} + i = i.value; + i = i.replace(/>/g, '>'); + i = i.replace(/</g, '<'); + i = i.replace(/"/g, '"'); + var w = window.open('http://validator.w3.org/check', 'validate'+id+type); + var f = document.createElement('form'); + f.enctype = 'application/x-www-form-urlencoded'; + f.method = 'post'; + f.acceptCharset = '<?php echo htmlspecialchars($_POST['enc']); ?>'; + if(f.style){f.style.display = 'none';} + else{f.visibility = 'hidden';} + f.innerHTML = '<p style="display:none;"><input style="display:none;" type="hidden" name="fragment" id="fragment" value="'+ i+ '" /><input style="display:none;" type="hidden" name="prefill" id="prefill" value="1" /><input style="display:none;" type="hidden" name="prefill_doctype" id="prefill_doctype" value="'+ type+ '" /><input style="display:none;" type="hidden" name="group" id="group" value="1" /><input type="hidden" name="ss" id="ss" value="1" /></p>'; + f.action = 'http://validator.w3.org/check'; + f.target = 'validate'+id+type; + var b = document.getElementsByTagName('body')[0]; + 
b.appendChild(f); + f.submit(); + w.focus; +} +tRs = { + formEl: null, + resizeClass: 'textarea', + adEv: function(t,ev,fn){ + if(typeof document.addEventListener != 'undefined'){ + t.addEventListener(ev,fn,false); + }else{ + t.attachEvent('on' + ev, fn); + } + }, + rmEv: function(t,ev,fn){ + if(typeof document.removeEventListener != 'undefined'){ + t.removeEventListener(ev,fn,false); + }else + { + t.detachEvent('on' + ev, fn); + } + }, + adBtn: function(){ + var textareas = document.getElementsByTagName('textarea'); + for(var i = 0; i < textareas.length; i++){ + var txtclass=textareas[i].className; + if(txtclass.substring(0,tRs.resizeClass.length)==tRs.resizeClass || + txtclass.substring(txtclass.length -tRs.resizeClass.length)==tRs.resizeClass){ + var a = document.createElement('a'); + a.appendChild(document.createTextNode("\u2195")); + a.style.cursor = 'n-resize'; + a.className= 'resizer'; + a.title = 'click-drag to resize' + tRs.adEv(a, 'mousedown', tRs.initResize); + textareas[i].parentNode.appendChild(a); + } + } + }, + initResize: function(event){ + if(typeof event == 'undefined'){ + event = window.event; + } + if(event.srcElement){ + var target = event.srcElement.previousSibling; + }else{ + var target = event.target.previousSibling; + } + if(target.nodeName.toLowerCase() == 'textarea' || (target.nodeName.toLowerCase() == 'input' && target.type == 'text')){ + tRs.formEl = target; + tRs.formEl.startHeight = tRs.formEl.clientHeight; + tRs.formEl.startY = event.clientY; + tRs.adEv(document, 'mousemove', tRs.resize); + tRs.adEv(document, 'mouseup', tRs.stopResize); + tRs.formEl.parentNode.style.cursor = 'n-resize'; + tRs.formEl.style.cursor = 'n-resize'; + try{ + event.preventDefault(); + }catch(e){ + } + } + }, + resize: function(event){ + if(typeof event == 'undefined'){ + event = window.event; + } + if(tRs.formEl.nodeName.toLowerCase() == 'textarea'){ + tRs.formEl.style.height = event.clientY - tRs.formEl.startY + tRs.formEl.startHeight + 'px'; + } + }, + 
stopResize: function(event){ + tRs.rmEv(document, 'mousedown', tRs.initResize); + tRs.rmEv(document, 'mousemove', tRs.resize); + tRs.formEl.style.cursor = 'text'; + tRs.formEl.parentNode.style.cursor = 'auto'; + return false; + } +}; +tRs.adEv(window, 'load', tRs.adBtn); +//--><!]]></script> +<title>htmLawed (<?php echo hl_version();?>) test</title> +</head> +<body> +<div id="topmost"> + +<h5 style="float: left; display: inline; margin-top: 0; margin-bottom: 5px;"><a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/index.php" title="htmLawed home">HTM<big><big>L</big></big>AWED</a> <?php echo hl_version();?> <a href="htmLawedTest.php" title="test home">TEST</a></h5> +<span style="float: right;" class="help"><a href="htmLawed_README.htm"><span class="notice">htm</span></a> / <a href="htmLawed_README.txt"><span class="notice">txt</span></a> documentation</span><br style="clear:both;" /> + +<a href="htmLawedTest.php" title="[toggle visibility] type or copy-paste" onclick="javascript:toggle('inputF'); return false;"><span class="notice">Input »</span> <span class="help" title="limit lower with multibyte characters<?php echo (($_hlimit < $_limit && $_hlimit)? '; limit is '. $_hlimit. ' for viewing binaries' : ''); ?>"><small>(max. 
<?php echo htmlspecialchars($_limit);?> chars)</small></span></a> + +<form id="testform" name="testform" action="htmLawedTest.php" method="post" accept-charset="<?php echo htmlspecialchars($_POST['enc']); ?>" style="padding:0; margin: 0; display:inline;"> + +<div id="inputF" style="display: block;"> + +<input type="hidden" name="token" id="token" value="<?php echo $token; ?>" /> +<div><textarea id="text" class="textarea" name="text" rows="5" cols="100" style="width: 100%;"><?php echo htmlspecialchars($_POST['text']);?></textarea></div> +<input type="submit" id="submitF" name="submitF" value="Process" style="float:left;" title="filter using htmLawed" onclick="javascript: sndProc(); return false;" onkeypress="javascript: sndProc(); return false;" /> + +<?php +if($do){ + if($validation){ + echo '<input type="hidden" value="1" name="w3c_validate" id="w3c_validate" />'; + } +?> + +<button type="button" title="rendered as web-page without a doctype or charset declaration" style="float: right;" onclick="javascript: sndUnproc(); return false;" onkeypress="javascript: sndUnproc(); return false;">View unprocessed</button> +<button type="button" onclick="javascript:document.getElementById('text').focus();document.getElementById('text').select()" title="select all to copy" style="float:right;">Select all</button> + +<?php +if($_w3c_validate && $validation){ +?> + +<button type="button" title="HTML 4.01 W3C online validation" style="float: right;" onclick="javascript: sndValidn('text', 'html401'); return false;" onkeypress="javascript: sndValidn('text', 'html401'); return false;">Check HTML</button> +<button type="button" title="XHTML 1.1 W3C online validation" style="float: right;" onclick="javascript: sndValidn('text', 'xhtml110'); return false;" onkeypress="javascript: sndValidn('text', 'xhtml110'); return false;">Check XHTML</button> + +<?php + } +} +else{ + if($_w3c_validate){ + echo '<span style="float: right;" class="help" title="for direct submission of input or output 
code to W3C validator for (X)HTML validation"><span style="font-size: 85%;"> Validator tools: </span><input type="checkbox" value="1" name="w3c_validate" id="w3c_validate" style="vertical-align: middle;"', ($validation ? ' checked="checked"' : ''), ' /></span>'; + } +} +?> + +<span style="float:right;" class="help"><span style="font-size: 85%;">Encoding: </span><input type="text" size="8" id="enc" name="enc" style="vertical-align: middle;" value="<?php echo htmlspecialchars($_POST['enc']); ?>" title="IANA-recognized name of the input character-set; can be multiple ;- or space-separated values; may not work in some browsers" /></span> + +</div> +<br style="clear:both;" /> + +<?php +if($limit_exceeded){ + echo '<br /><strong>Input text is too long!</strong><br />'; +} +?> + +<br /> + +<a href="htmLawedTest.php" title="[toggle visibility] htmLawed configuration" onclick="javascript:toggle('inputC'); return false;"><span class="notice">Settings »</span></a> + +<div id="inputC" style="display: none;"> +<table summary="none"> +<tr> +<td><span class="help" title="$config argument">Config:</span></td> +<td><ul> + +<?php +$cfg = array( +'abs_url'=>array('3', '0', 'absolute/relative URL conversion', '-1'), +'and_mark'=>array('2', '0', 'mark original <em>&</em> chars', '0', 'd'=>1), // 'd' to disable +'anti_link_spam'=>array('1', '0', 'modify <em>href</em> values as an anti-link spam measure', '0', array(array('30', '1', '', 'regex for extra <em>rel</em>'), array('30', '2', '', 'regex for no <em>href</em>'))), +'anti_mail_spam'=>array('1', '0', 'replace <em>@</em> in <em>mailto:</em> URLs', '0', '8', 'NO@SPAM', 'replacement'), +'balance'=>array('2', '1', 'fix nestings and balance tags', '0'), +'base_url'=>array('', '', 'base URL', '25'), +'cdata'=>array('4', 'nil', 'allow <em>CDATA</em> sections', 'nil'), +'clean_ms_char'=>array('3', '0', 'replace bad characters introduced by Microsoft apps. 
like <em>Word</em>', '0'), +'comment'=>array('4', 'nil', 'allow HTML comments', 'nil'), +'css_expression'=>array('2', 'nil', 'allow dynamic expressions in CSS style properties', 'nil'), +'deny_attribute'=>array('1', '0', 'denied attributes', '0', '50', '', 'these'), +'elements'=>array('', '', 'allowed elements', '50'), +'hexdec_entity'=>array('3', '1', 'convert hexadecimal numeric entities to decimal ones, or vice versa', '0'), +'hook'=>array('', '', 'name of hook function', '25'), +'hook_tag'=>array('', '', 'name of custom function to further check attribute values', '25'), +'keep_bad'=>array('7', '6', 'keep, or remove <em>bad</em> tag content', '0'), +'lc_std_val'=>array('2', '1', 'lower-case std. attribute values like <em>radio</em>', '0'), +'make_tag_strict'=>array('3', 'nil', 'transform deprecated elements', 'nil'), +'named_entity'=>array('2', '1', 'allow named entities, or convert numeric ones', '0'), +'no_deprecated_attr'=>array('3', '1', 'allow deprecated attributes, or transform them', '0'), +'parent'=>array('', 'div', 'name of parent element', '25'), +'safe'=>array('2', '0', 'for most <em>safe</em> HTML', '0'), +'schemes'=>array('', 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; *:file, http, https', 'allowed URL protocols', '50'), +'show_setting'=>array('', 'htmLawed_setting', 'variable name to record <em>finalized</em> htmLawed settings', '25', 'd'=>1), +'style_pass'=>array('2', 'nil', 'do not look at <em>style</em> attribute values', 'nil'), +'tidy'=>array('3', '0', 'beautify/compact', '-1', '8', '1t1', 'format'), +'unique_ids'=>array('2', '1', 'unique <em>id</em> values', '0', '8', 'my_', 'prefix'), +'valid_xhtml'=>array('2', 'nil', 'auto-set various parameters for most valid XHTML', 'nil'), +'xml:lang'=>array('3', 'nil', 'auto-add <em>xml:lang</em> attribute', '0'), +); +foreach($cfg as $k=>$v){ + echo '<li>', $k, ': '; + if(!empty($v[0])){ // input radio + $j = $v[3]; + for($i = $j-1; ++$i < 
$v[0]+$v[3];++$j){ + echo '<input type="radio" name="h', $k, '" value="', $i, '"', (!isset($_POST['h'. $k]) ? ($v[1] == $i ? ' checked="checked"' : '') : ($_POST['h'. $k] == $i ? ' checked="checked"' : '')), (isset($v['d']) ? ' disabled="disabled"' : ''), ' />', $i, ' '; + } + if($v[1] == 'nil'){ + echo '<input type="radio" name="h', $k, '" value="nil"', ((!isset($_POST['h'. $k]) or $_POST['h'. $k] == 'nil') ? ' checked="checked"' : ''), (isset($v['d']) ? ' disabled="disabled"' : ''), ' />not set '; + } + if(!empty($v[4])){ // + input text box + echo '<input type="radio" name="h', $k, '" value="', $j, '"', (((isset($_POST['h'. $k]) && $_POST['h'. $k] == $j) or (!isset($_POST['h'. $k]) && $j == $v[1])) ? ' checked="checked"' : ''), (isset($v['d']) ? ' disabled="disabled"' : ''), ' />'; + if(!is_array($v[4])){ + echo $v[6], ': <input type="text" size="', $v[4], '" name="h', $k. $j, '" value="', htmlspecialchars(isset($_POST['h'. $k. $j][0]) ? $_POST['h'. $k. $j] : $v[5]), '"', (isset($v['d']) ? ' disabled="disabled"' : ''), ' />'; + } + else{ + foreach($v[4] as $z){ + echo ' ', $z[3], ': <input type="text" size="', $z[0], '" name="h', $k. $j. $z[1], '" value="', htmlspecialchars(isset($_POST['h'. $k. $j. $z[1]][0]) ? $_POST['h'. $k. $j. $z[1]] : $z[2]), '"', (isset($v['d']) ? ' disabled="disabled"' : ''), ' />'; + } + } + } + } + elseif(ctype_digit($v[3])){ // input text + echo '<input type="text" size="', $v[3], '" name="h', $k, '" value="', htmlspecialchars(isset($_POST['h'. $k][0]) ? $_POST['h'. $k] : $v[1]), '"', (isset($v['d']) ? ' disabled="disabled"' : ''), ' />'; + } + else{} // text-area + echo ' <span class="help">', $v[2], '</span></li>'; +} +echo '</ul></td></tr><tr><td><span style="vertical-align: top;" class="help" title="$spec argument: element-specific attribute rules">Spec:</span></td><td><textarea name="spec" id="spec" cols="70" rows="3" style="width:80%;">', htmlspecialchars((isset($_POST['spec']) ? 
$_POST['spec'] : '')), '</textarea></td></tr></table>'; +?> + +</div> +</form> + +<?php +if($do){ + $cfg = array(); /* htmLawed $config, rebuilt from every posted field whose name starts with 'h'; the value 'nil' means "leave unset" */ + foreach($_POST as $k=>$v){ + if($k[0] == 'h' && $v != 'nil'){ + $cfg[substr($k, 1)] = $v; + } + } + + if($cfg['anti_link_spam'] && (!empty($cfg['anti_link_spam11']) or !empty($cfg['anti_link_spam12']))){ + $cfg['anti_link_spam'] = array($cfg['anti_link_spam11'], $cfg['anti_link_spam12']); + } + unset($cfg['anti_link_spam11'], $cfg['anti_link_spam12']); + if($cfg['anti_mail_spam'] == 1){ + $cfg['anti_mail_spam'] = isset($cfg['anti_mail_spam1'][0]) ? $cfg['anti_mail_spam1'] : 0; + } + unset($cfg['anti_mail_spam1']); /* drop the helper text-box key (same name as read just above) so it does not reach htmLawed() as a stray config entry */ + if($cfg['deny_attribute'] == 1){ + $cfg['deny_attribute'] = isset($cfg['deny_attribute1'][0]) ? $cfg['deny_attribute1'] : 0; + } + unset($cfg['deny_attribute1']); + if($cfg['tidy'] == 2){ + $cfg['tidy'] = isset($cfg['tidy2'][0]) ? $cfg['tidy2'] : 0; + } + unset($cfg['tidy2']); + if($cfg['unique_ids'] == 2){ + $cfg['unique_ids'] = isset($cfg['unique_ids2'][0]) ? $cfg['unique_ids2'] : 1; + } + unset($cfg['unique_ids2']); + unset($cfg['and_mark']); // disabling and_mark + + $cfg['show_setting'] = 'hlcfg'; + $st = microtime(); + $out = htmLawed($_POST['text'], $cfg, str_replace(array('$', '{'), '', $_POST['spec'])); + $et = microtime(); + echo '<br /><a href="htmLawedTest.php" title="[toggle visibility] syntax-highlighted" onclick="javascript:toggle(\'inputR\'); return false;"><span class="notice">Input code »</span></a> <span class="help" title="tags estimated as half of total > and < chars; values may be inaccurate for non-ASCII text"><small><big>', strlen($_POST['text']), '</big> chars, ~<big>', round((substr_count($_POST['text'], '>') + substr_count($_POST['text'], '<'))/2), '</big> tags</small> </span><div id="inputR" style="display: none;">', format($_POST['text']), '</div><script type="text/javascript">hl(\'inputR\');</script>', (!isset($_POST['text'][$_hlimit]) ? 
' <a href="htmLawedTest.php" title="[toggle visibility] hexdump; non-viewable characters like line-returns are shown as dots" onclick="javascript:toggle(\'inputD\'); return false;"><span class="notice">Input binary » </span></a><div id="inputD" style="display: none;">'. hexdump($_POST['text']). '</div>' : ''), ' <a href="htmLawedTest.php" title="[toggle visibility] finalized internal settings as interpreted by htmLawed; for developers" onclick="javascript:toggle(\'settingF\'); return false;"><span class="notice">Finalized internal settings » </span></a> <div id="settingF" style="display: none;">', str_replace(array(' ', "\t", ' '), array(' ', ' ', ' '), nl2br(htmlspecialchars(print_r($GLOBALS['hlcfg']['config'], true)))), '</div><script type="text/javascript">hl(\'settingF\');</script>', '<br /><a href="htmLawedTest.php" title="[toggle visibility] suitable for copy-paste" onclick="javascript:toggle(\'outputF\'); return false;"><span class="notice">Output »</span></a> <span class="help" title="approx., server-specific value excluding the \'include()\' call"><small>htmLawed processing time <big>', number_format(((substr($et,0,9)) + (substr($et,-10)) - (substr($st,0,9)) - (substr($st,-10))),4), '</big> s</small></span>', (($mem = memory_get_peak_usage()) !== false ? '<span class="help"><small>, peak memory usage <big>'. round(($mem-$pre_mem)/1048576, 2). 
'</big> <small>MB</small>' : ''), '</small></span><div id="outputF" style="display: block;"><div><textarea id="text2" class="textarea" name="text2" rows="5" cols="100" style="width: 100%;">', htmlspecialchars($out), '</textarea></div><button type="button" onclick="javascript:document.getElementById(\'text2\').focus();document.getElementById(\'text2\').select()" title="select all to copy" style="float:right;">Select all</button>'; + if($_w3c_validate && $validation) + { +?> + +<button type="button" title="HTML 4.01 W3C online validation" style="float: right;" onclick="javascript: sndValidn('text2', 'html401'); return false;" onkeypress="javascript: sndValidn('text2', 'html401'); return false;">Check HTML</button> +<button type="button" title="XHTML 1.1 W3C online validation" style="float: right;" onclick="javascript: sndValidn('text2', 'xhtml110'); return false;" onkeypress="javascript: sndValidn('text2', 'xhtml110'); return false;">Check XHTML</button> + +<?php + } + echo '</div><br /><a href="htmLawedTest.php" title="[toggle visibility] syntax-highlighted" onclick="javascript:toggle(\'outputR\'); return false;"><span class="notice">Output code »</span></a><div id="outputR" style="display: block;">', format($out), '</div><script type="text/javascript">hl(\'outputR\');</script>', (!isset($_POST['text'][$_hlimit]) ? '<br /><a href="htmLawedTest.php" title="[toggle visibility] hexdump; non-viewable characters like line-returns are shown as dots" onclick="javascript:toggle(\'outputD\'); return false;"><span class="notice">Output binary »</span></a><div id="outputD" style="display: none;">'. hexdump($out). 
'</div>' : ''), '<br /><a href="htmLawedTest.php" title="[toggle visibility] XHTML 1 Transitional doctype" onclick="javascript:toggle(\'outputH\'); return false;"><span class="notice">Output rendered »</span></a><div id="outputH" style="display: block;">', $out, '</div>'; +} +else{ +?> + +<br /> + +<div class="help">Use with a Javascript- and cookie-enabled, relatively new version of a common browser. <em>Submitted input will also be HTML-rendered (XHTML 1) after htmLawed-filtering.</em> + +<?php echo (file_exists('./htmLawed_TESTCASE.txt') ? '<br /><br />You can use text from <a href="htmLawed_TESTCASE.txt"><span class="notice">this collection of test-cases</span></a> in the input. Set the character encoding of the browser to Unicode/utf-8 before copying.' : ''); ?> + +<br /><br />For anti-XSS tests, try the <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/htmLawedSafeModeTest.php"><span class="notice">special test-page</span></a> or see <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/rsnake/RSnakeXSSTest.htm"><span class="notice">these results</span></a>. + +<br /><br /><small>Change <em>Encoding</em> to reflect the character encoding of the input text. Even then, it may not work or some characters may not display properly because of variable browser support and because of the form interface. Developers can write some PHP code to capture the filtered input to a file if this is important. +<br /><br />Refer to the htmLawed documentation (<a href="htmLawed_README.htm"><span class="notice">htm</span></a>/<a href="htmLawed_README.txt"><span class="notice">txt</span></a>) for details about <em>Settings</em>, and htmLawed's behavior and limitations. For <em>Settings</em>, incorrectly-specified values like regular expressions are silently ignored. One or more settings form-fields may have been disabled. Some characters are not allowed in the <em>Spec</em> field. 
+ + +<br /><br />Hovering the mouse over some of the text can provide additional information in some browsers.</small> + +<?php +if($_w3c_validate){ +?> + +<small><br /><br />Because of character-encoding issues, the W3C validator (anyway not perfect) may reject validation requests or invalidate otherwise-valid code, esp. if text was copy-pasted in the input box. Local applications like the <em>HTML Validator</em> Firefox browser add-on may be useful in such cases.</small> + +<?php +} +?> + +</div> + +<?php +} +?> + +</div> +</body> +</html>
\ No newline at end of file diff --git a/extlib/htmLawed/htmLawed_README.htm b/extlib/htmLawed/htmLawed_README.htm new file mode 100644 index 000000000..e560e2eb2 --- /dev/null +++ b/extlib/htmLawed/htmLawed_README.htm @@ -0,0 +1,1979 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> +<head> +<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> +<meta http-equiv="Content-Language" content="en" /> +<meta name="description" content="htmLawed PHP software is a free, open-source, customizable HTML input purifier and filter - htmLawed_README.txt - presented with rTxt2htm, a PHP Labware utility" /> +<meta name="keywords" content="htmLawed, HTM, HTML, HTML Tidy, converter, filter, formatter, purifier, sanitizer, XSS, input, PHP, software, code, script, security, cross-site scripting, hack, sanitize, remove, standards, tags, attributes, elements, htmLawed_README.txt, rTxt2htm, PHP Labware" /> +<style type="text/css" media="all"> +<!--/*--><![CDATA[/*><!--*/ +a {text-decoration:none; color: blue;} +a:hover {color: red;} +a:visited {color: blue;} +body {margin: 0; padding: 0;} +body, div, html, p {font-family: Georgia, 'Times new roman', Times;} +code.code {font-family: 'Bitstream vera sans mono', 'Courier New', 'Courier', monospace;} +div.comment {padding: 5px; color: #999999; font-size: 80%;} +div.comment a {color: #6699cc;} +div#body {width: 70%; margin: 5px; padding: 5px;} /* holds non-toc content */ +div#toc {position: fixed; top: 5px; left: 73%; z-index: 2; margin-top: 5px; margin-left: 5px; border: 1px solid gray; padding: 5px; background-color: #ededed; width: 23%; overflow: auto; max-height:94%; font-size: 90%;} /* holds content table (toc) */ +div#top {font-size: 14px; margin: 5px; padding: 5px;} /* holds all content */ +div.monospace {overflow: auto; font-family: 'Bitstream vera sans mono', 'Courier New', 'Courier', 
monospace;} +div.sub-section {padding-left: 15px;} +div.sub-sub-section {padding-left: 30px;} +h1 {font-size: 22px; margin-top: 5px; margin-bottom: 5px;} +h2 {font-size: 20px; float: left; margin-top: 15px; margin-bottom: 5px;} +h3 {font-size: 18px; float: left; margin-top: 15px; margin-bottom: 5px;} +h4 {font-size: 16px; float: left; margin-top: 15px; margin-bottom: 5px;} +hr {margin-top: 15px; margin-bottom: 5px;} +input, textarea {font-family: 'Bitstream vera sans mono', 'Courier New', 'Courier', monospace;} +p.subtle {color: gray; padding: 0; padding-top: 10px; margin: 0;} +p.subtle a, p.subtle a:visited {color: #6699cc;} +span.item-no {color: black;} +span.subtle {color: gray; margin: 0; padding:0;} +span.subtle a, span.subtle a:visited {color: #6699cc;} +span.term {font-family: 'Bitstream vera sans mono', 'Courier New', 'Courier', monospace;} +span.toc-item {color: black;} +span.totop {float: right; margin-top: 15px; margin-bottom: 5px;} +span.totop a, span.totop a:visited {color: #6699cc;} +@media screen { /* fixes for old IE */ + * html, * html body {overflow-y: auto!important; height: 100%; margin: 0; padding: 0;} + * html div#body {height: 100%; overflow-y: auto; position: relative;} + * html div#toc {position: absolute;} +} +/*]]>*/--> +</style> +<title>htmLawed documentation | htmLawed PHP software is a free, open-source, customizable HTML input purifier and filter</title> +</head> +<body> +<div id="top"> +<h1><a id="peak" name="peak"></a>htmLawed documentation</h1> + +<div id="toc"><span class="toc-item"><a href="#s1"><span class="item-no">1</span>  About htmLawed</a></span><br /> +  <span class="toc-item"><a href="#s1.1"><span class="item-no">1.1</span>  Example uses</a></span><br /> +  <span class="toc-item"><a href="#s1.2"><span class="item-no">1.2</span>  Features</a></span><br /> +  <span class="toc-item"><a href="#s1.3"><span class="item-no">1.3</span>  History</a></span><br /> +  <span class="toc-item"><a href="#s1.4"><span 
class="item-no">1.4</span>  License & copyright</a></span><br /> +  <span class="toc-item"><a href="#s1.5"><span class="item-no">1.5</span>  Terms used here</a></span><br /> +<span class="toc-item"><a href="#s2"><span class="item-no">2</span>  Usage</a></span><br /> +  <span class="toc-item"><a href="#s2.1"><span class="item-no">2.1</span>  Simple</a></span><br /> +  <span class="toc-item"><a href="#s2.2"><span class="item-no">2.2</span>  Configuring htmLawed using the <span class="term">$config</span> parameter</a></span><br /> +  <span class="toc-item"><a href="#s2.3"><span class="item-no">2.3</span>  Extra HTML specifications using the <span class="term">$spec</span> parameter</a></span><br /> +  <span class="toc-item"><a href="#s2.4"><span class="item-no">2.4</span>  Performance time & memory usage</a></span><br /> +  <span class="toc-item"><a href="#s2.5"><span class="item-no">2.5</span>  Some security risks to keep in mind</a></span><br /> +  <span class="toc-item"><a href="#s2.6"><span class="item-no">2.6</span>  Use without modifying old <span class="term">kses()</span> code</a></span><br /> +  <span class="toc-item"><a href="#s2.7"><span class="item-no">2.7</span>  Tolerance for ill-written HTML</a></span><br /> +  <span class="toc-item"><a href="#s2.8"><span class="item-no">2.8</span>  Limitations & work-arounds</a></span><br /> +  <span class="toc-item"><a href="#s2.9"><span class="item-no">2.9</span>  Examples</a></span><br /> +<span class="toc-item"><a href="#s3"><span class="item-no">3</span>  Details</a></span><br /> +  <span class="toc-item"><a href="#s3.1"><span class="item-no">3.1</span>  Invalid/dangerous characters</a></span><br /> +  <span class="toc-item"><a href="#s3.2"><span class="item-no">3.2</span>  Character references/entities</a></span><br /> +  <span class="toc-item"><a href="#s3.3"><span class="item-no">3.3</span>  HTML elements</a></span><br /> +    <span class="toc-item"><a href="#s3.3.1"><span class="item-no">3.3.1</span>  HTML 
comments and <span class="term">CDATA</span> sections</a></span><br /> +    <span class="toc-item"><a href="#s3.3.2"><span class="item-no">3.3.2</span>  Tag-transformation for better XHTML-Strict</a></span><br /> +    <span class="toc-item"><a href="#s3.3.3"><span class="item-no">3.3.3</span>  Tag balancing and proper nesting</a></span><br /> +    <span class="toc-item"><a href="#s3.3.4"><span class="item-no">3.3.4</span>  Elements requiring child elements</a></span><br /> +    <span class="toc-item"><a href="#s3.3.5"><span class="item-no">3.3.5</span>  Beautify or compact HTML</a></span><br /> +  <span class="toc-item"><a href="#s3.4"><span class="item-no">3.4</span>  Attributes</a></span><br /> +    <span class="toc-item"><a href="#s3.4.1"><span class="item-no">3.4.1</span>  Auto-addition of XHTML-required attributes</a></span><br /> +    <span class="toc-item"><a href="#s3.4.2"><span class="item-no">3.4.2</span>  Duplicate/invalid <span class="term">id</span> values</a></span><br /> +    <span class="toc-item"><a href="#s3.4.3"><span class="item-no">3.4.3</span>  URL schemes (protocols) and scripts in attribute values</a></span><br /> +    <span class="toc-item"><a href="#s3.4.4"><span class="item-no">3.4.4</span>  Absolute & relative URLs</a></span><br /> +    <span class="toc-item"><a href="#s3.4.5"><span class="item-no">3.4.5</span>  Lower-cased, standard attribute values</a></span><br /> +    <span class="toc-item"><a href="#s3.4.6"><span class="item-no">3.4.6</span>  Transformation of deprecated attributes</a></span><br /> +    <span class="toc-item"><a href="#s3.4.7"><span class="item-no">3.4.7</span>  Anti-spam & <span class="term">href</span></a></span><br /> +    <span class="toc-item"><a href="#s3.4.8"><span class="item-no">3.4.8</span>  Inline style properties</a></span><br /> +    <span class="toc-item"><a href="#s3.4.9"><span class="item-no">3.4.9</span>  Hook function for tag content</a></span><br /> +  <span class="toc-item"><a href="#s3.5"><span 
class="item-no">3.5</span>  Simple configuration directive for most valid XHTML</a></span><br /> +  <span class="toc-item"><a href="#s3.6"><span class="item-no">3.6</span>  Simple configuration directive for most <em>safe</em> HTML</a></span><br /> +  <span class="toc-item"><a href="#s3.7"><span class="item-no">3.7</span>  Using a hook function</a></span><br /> +  <span class="toc-item"><a href="#s3.8"><span class="item-no">3.8</span>  Obtaining <em>finalized</em> parameter values</a></span><br /> +  <span class="toc-item"><a href="#s3.9"><span class="item-no">3.9</span>  Retaining non-HTML tags in input with mixed markup</a></span><br /> +<span class="toc-item"><a href="#s4"><span class="item-no">4</span>  Other</a></span><br /> +  <span class="toc-item"><a href="#s4.1"><span class="item-no">4.1</span>  Support</a></span><br /> +  <span class="toc-item"><a href="#s4.2"><span class="item-no">4.2</span>  Known issues</a></span><br /> +  <span class="toc-item"><a href="#s4.3"><span class="item-no">4.3</span>  Change-log</a></span><br /> +  <span class="toc-item"><a href="#s4.4"><span class="item-no">4.4</span>  Testing</a></span><br /> +  <span class="toc-item"><a href="#s4.5"><span class="item-no">4.5</span>  Upgrade, & old versions</a></span><br /> +  <span class="toc-item"><a href="#s4.6"><span class="item-no">4.6</span>  Comparison with <span class="term">HTMLPurifier</span></a></span><br /> +  <span class="toc-item"><a href="#s4.7"><span class="item-no">4.7</span>  Use through application plug-ins/modules</a></span><br /> +  <span class="toc-item"><a href="#s4.8"><span class="item-no">4.8</span>  Use in non-PHP applications</a></span><br /> +  <span class="toc-item"><a href="#s4.9"><span class="item-no">4.9</span>  Donate</a></span><br /> +  <span class="toc-item"><a href="#s4.10"><span class="item-no">4.10</span>  Acknowledgements</a></span><br /> +<span class="toc-item"><a href="#s5"><span class="item-no">5</span>  Appendices</a></span><br /> +  <span 
class="toc-item"><a href="#s5.1"><span class="item-no">5.1</span>  Characters discouraged in HTML</a></span><br /> +  <span class="toc-item"><a href="#s5.2"><span class="item-no">5.2</span>  Valid attribute-element combinations</a></span><br /> +  <span class="toc-item"><a href="#s5.3"><span class="item-no">5.3</span>  CSS 2.1 properties accepting URLs</a></span><br /> +  <span class="toc-item"><a href="#s5.4"><span class="item-no">5.4</span>  Microsoft Windows 1252 character replacements</a></span><br /> +  <span class="toc-item"><a href="#s5.5"><span class="item-no">5.5</span>  URL format</a></span><br /> +  <span class="toc-item"><a href="#s5.6"><span class="item-no">5.6</span>  Brief on htmLawed code</a></span></div><!-- ended div toc --> + +<div id="body"> +<br /> +<div class="comment">htmLawed_README.txt, 16 July 2009<br /> +htmLawed 1.1.8.1, 16 July 2009 <br /> +Copyright Santosh Patnaik<br /> +GPL v3 license<br /> +A PHP Labware internal utility - <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed</a> </div> +<br /> + +<div class="section"><h2> +<a name="s1" id="s1"></a><span class="item-no">1</span>  About htmLawed +</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  htmLawed is a highly customizable single-file PHP script to make text secure, and standard- and admin policy-compliant for use in the body of HTML 4, XHTML 1 or 1.1, or generic XML documents. It is thus a configurable input (X)HTML filter, processor, purifier, sanitizer, beautifier, etc., and an alternative to the <a href="http://tidy.sourceforge.net">HTMLTidy</a> application.<br /> +<br /> +  The <em>lawing in</em> of input text is needed to ensure that HTML code in the text is standard-compliant, does not introduce security vulnerabilities, and does not break the aesthetics, design or layout of web-pages. 
htmLawed tries to do this by, for example, making HTML well-formed with balanced and properly nested tags, neutralizing code that may be used for cross-site scripting (<span class="term">XSS</span>) attacks, and allowing only specified HTML elements/tags and attributes.<br /> + +<div class="sub-section"><h3> +<a name="s1.1" id="s1.1"></a><span class="item-no">1.1</span>  Example uses +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  *  Filtering of text submitted as comments on blogs to allow only certain HTML elements<br /> +<br /> +  *  Making RSS/Atom newsfeed item-content standard-compliant: often one uses an excerpt from an HTML document for the content, and with unbalanced tags, non-numerical entities, etc., such excerpts may not be XML-compliant<br /> +<br /> +  *  Text processing for stricter XML standard-compliance: e.g., to have lowercased <span class="term">x</span> in hexadecimal numeric entities becomes necessary if an XHTML document with MathML content needs to be served as <span class="term">application/xml</span><br /> +<br /> +  *  Scraping text or data from web-pages<br /> +<br /> +  *  Pretty-printing HTML code<br /> + +</div> +<div class="sub-section"><h3> +<a name="s1.2" id="s1.2"></a><span class="item-no">1.2</span>  Features +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  Key: <span class="term">*</span> security feature, <span class="term">^</span> standard compliance, <span class="term">~</span> requires setting right options, <span class="term">`</span> different from <span class="term">Kses</span><br /> +<br /> +  *  make input more <strong>secure</strong> and <strong>standard-compliant</strong><br /> +  *  use for HTML 4, XHTML 1.0 or 1.1, or even generic <strong>XML</strong> documents  ^~`<br /> +<br /> +  *  <strong>beautify</strong> or <strong>compact</strong> HTML  ^~`<br /> +<br /> +  *  <strong>restrict elements</strong>  ^~`<br /> 
+  *  proper closure of empty elements like <span class="term">img</span>  ^`<br /> +  *  <strong>transform deprecated elements</strong> like <span class="term">u</span>  ^~`<br /> +  *  HTML <strong>comments</strong> and <span class="term">CDATA</span> sections can be permitted  ^~`<br /> +  *  elements like <span class="term">script</span>, <span class="term">object</span> and <span class="term">form</span> can be permitted  ~<br /> +<br /> +  *  <strong>restrict attributes</strong>, including <strong>element-specifically</strong>  ^~`<br /> +  *  remove <strong>invalid attributes</strong>  ^`<br /> +  *  element and attribute names are <strong>lower-cased</strong>  ^<br /> +  *  provide <strong>required attributes</strong>, like <span class="term">alt</span> for <span class="term">image</span>  ^`<br /> +  *  <strong>transform deprecated attributes</strong>  ^~`<br /> +  *  attributes <strong>declared only once</strong>  ^`<br /> +<br /> +  *  <strong>restrict attribute values</strong>, including <strong>element-specifically</strong>  ^~`<br /> +  *  a value is declared for <em>empty</em> (<em>minimized</em>) attributes like <span class="term">checked</span>  ^<br /> +  *  check for potentially dangerous attribute values  *~<br /> +  *  ensure <strong>unique</strong> <span class="term">id</span> attribute values  ^~`<br /> +  *  <strong>double-quote</strong> attribute values  ^<br /> +  *  lower-case <strong>standard attribute values</strong> like <span class="term">password</span>  ^`<br /> +<br /> +  *  <strong>attribute-specific URL protocol/scheme restriction</strong>  *~`<br /> +  *  disable <strong>dynamic expressions</strong> in <span class="term">style</span> values  *~`<br /> +<br /> +  *  neutralize invalid named character entities  ^`<br /> +  *  <strong>convert</strong> hexadecimal numeric entities to decimal ones, or vice versa  ^~`<br /> +  *  convert named entities to numeric ones for generic XML use  ^~`<br /> +<br /> +  *  remove 
<strong>null</strong> characters  *<br /> +  *  neutralize potentially dangerous proprietary Netscape <strong>Javascript entities</strong>  *<br /> +  *  replace potentially dangerous <strong>soft-hyphen</strong> character in attribute values with spaces  *<br /> +<br /> +  *  remove common <strong>invalid characters</strong> not allowed in HTML or XML  ^`<br /> +  *  replace <strong>characters from Microsoft applications</strong> like <span class="term">Word</span> that are discouraged in HTML or XML  ^~`<br /> +  *  neutralize entities for characters invalid or discouraged in HTML or XML  ^`<br /> +  *  appropriately neutralize <span class="term"><</span>, <span class="term">&</span>, <span class="term">"</span>, and <span class="term">></span> characters  ^*`<br /> +<br /> +  *  understands improperly spaced tag content (like, spread over more than a line) and properly spaces them  `<br /> +  *  attempts to <strong>balance tags</strong> for well-formedness  ^~`<br /> +  *  understands when <strong>omitable closing tags</strong> like <span class="term"></p></span> (allowed in HTML 4, transitional, e.g.) 
are missing  ^~`<br /> +  *  attempts to permit only <strong>validly nested tags</strong>  ^~`<br /> +  *  option to <strong>remove or neutralize bad content</strong> ^~`<br /> +  *  attempts to <strong>rectify common errors of plain-text misplacement</strong> (e.g., directly inside <span class="term">blockquote</span>) ^~`<br /> +<br /> +  *  fast, <strong>non-OOP</strong> code of ~45 kb incurring peak basal memory usage of ~0.5 MB<br /> +  *  <strong>compatible</strong> with pre-existing code using <span class="term">Kses</span> (the filter used by <span class="term">WordPress</span>)<br /> +<br /> +  *  optional <strong>anti-spam</strong> measures such as addition of <span class="term">rel="nofollow"</span> and link-disabling  ~`<br /> +  *  optionally makes <strong>relative URLs absolute</strong>, and vice versa  ~`<br /> +<br /> +  *  optionally mark <span class="term">&</span> to identify the entities for <span class="term">&</span>, <span class="term"><</span> and <span class="term">></span> introduced by htmLawed  ~`<br /> +<br /> +  *  allows deployment of powerful <strong>hook functions</strong> to <strong>inject</strong> HTML, <strong>consolidate</strong> <span class="term">style</span> attributes to <span class="term">class</span>, finely check attribute values, etc.  ~`<br /> +<br /> +  *  <strong>independent of character encoding</strong> of input and does not affect it<br /> +<br /> +  *  <strong>tolerance for ill-written HTML</strong> to a certain degree<br /> + +</div> +<div class="sub-section"><h3> +<a name="s1.3" id="s1.3"></a><span class="item-no">1.3</span>  History +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  htmLawed was developed for use with <span class="term">LabWiki</span>, a wiki software developed at PHP Labware, as a suitable software could not be found. 
Existing PHP software like <span class="term">Kses</span> and <span class="term">HTMLPurifier</span> were deemed inadequate, slow, resource-intensive, or dependent on external applications like <span class="term">HTML Tidy</span>.<br /> +<br /> +  htmLawed started as a modification of Ulf Harnhammar's <span class="term">Kses</span> (version 0.2.2) software, and is compatible with code that uses <span class="term">Kses</span>; see <a href="#s2.6">section 2.6</a>.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s1.4" id="s1.4"></a><span class="item-no">1.4</span>  License & copyright +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  htmLawed is free and open-source software licensed under GPL license version <a href="http://www.gnu.org/licenses/gpl-3.0.txt">3</a>, and copyrighted by Santosh Patnaik, MD, PhD.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s1.5" id="s1.5"></a><span class="item-no">1.5</span>  Terms used here +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  *  <em>administrator</em> - or admin; person setting up the code to pass input through htmLawed; also, <em>user</em><br /> +  *  <em>attributes</em> - name-value pairs like <span class="term">href="http://x.com"</span> in opening tags<br /> +  *  <em>author</em> - <em>writer</em><br /> +  *  <em>character</em> - atomic unit of text; internally represented by a numeric <em>code-point</em> as specified by the <em>encoding</em> or <em>charset</em> in use<br /> +  *  <em>entity</em> - markup like <span class="term">&gt;</span> and <span class="term">&#160;</span> used to refer to a character<br /> +  *  <em>element</em> - HTML element like <span class="term">a</span> and <span class="term">img</span><br /> +  *  <em>element content</em> -  content between the opening and closing tags of an element, like <span class="term">click</span> of <span class="term"><a 
href="x">click</a></span><br /> +  *  <em>HTML</em> - implies XHTML unless specified otherwise<br /> +  *  <em>input</em> - text string given to htmLawed to process<br /> +  *  <em>processing</em> - involves filtering, correction, etc., of input<br /> +  *  <em>safe</em> - absence or reduction of certain characters and HTML elements and attributes in the input that can otherwise potentially and circumstantially expose web-site users to security vulnerabilities like cross-site scripting attacks (XSS)<br /> +  *  <em>scheme</em> - URL protocol like <span class="term">http</span> and <span class="term">ftp</span><br /> +  *  <em>specs</em> - standard specifications<br /> +  *  <em>style property</em> - terms like <span class="term">border</span> and <span class="term">height</span> for which declarations are made in values for the <span class="term">style</span> attribute of elements<br /> +  *  <em>tag</em> - markers like <span class="term"><a href="x"></span> and <span class="term"></a></span> delineating element content; the opening tag can contain attributes<br /> +  *  <em>tag content</em> - consists of tag markers <span class="term"><</span> and <span class="term">></span>, element names like <span class="term">div</span>, and possibly attributes<br /> +  *  <em>user</em> - administrator<br /> +  *  <em>writer</em> - end-user like a blog commenter providing the input that is to be processed; also, <em>author</em><br /> + +</div> +</div> +<div class="section"><h2> +<a name="s2" id="s2"></a><span class="item-no">2</span>  Usage +</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  htmLawed should work with PHP 4.3 and higher. 
Either <span class="term">include()</span> the <span class="term">htmLawed.php</span> file or copy-paste the entire code.<br /> +<br /> +  To easily <strong>test</strong> htmLawed using a form-based interface, use the provided <a href="htmLawedTest.php">demo</a> (<span class="term">htmLawed.php</span> and <span class="term">htmLawedTest.php</span> should be in the same directory on the web-server).<br /> + +<div class="sub-section"><h3> +<a name="s2.1" id="s2.1"></a><span class="item-no">2.1</span>  Simple +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  The input text to be processed, <span class="term">$text</span>, is passed as an argument of type string; <span class="term">htmLawed()</span> returns the processed string:<br /> +<br /> + +<code class="code">    $processed = htmLawed($text);</code> +<br /> +<br /> +  <strong>Note</strong>: If input is from a <span class="term">$_GET</span> or <span class="term">$_POST</span> value, and <span class="term">magic quotes</span> are enabled on the PHP setup, run <span class="term">stripslashes()</span> on the input before passing to htmLawed.<br /> +<br /> +  By default, htmLawed will process the text allowing all valid HTML elements/tags, secure URL scheme/CSS style properties, etc. It will allow <span class="term">CDATA</span> sections and HTML comments, balance tags, and ensure proper nesting of elements. Such actions can be configured using two other optional arguments -- <span class="term">$config</span> and <span class="term">$spec</span>:<br /> +<br /> + +<code class="code">    $processed = htmLawed($text, $config, $spec);</code> +<br /> +<br /> +  These extra parameters are detailed below. 
Some examples are shown in <a href="#s2.9">section 2.9</a>.<br /> +<br /> +  <strong>Note</strong>: For maximum protection against <span class="term">XSS</span> and other scripting attacks (e.g., by disallowing Javascript code), consider using the <span class="term">safe</span> parameter; see <a href="#s3.6">section 3.6</a>.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s2.2" id="s2.2"></a><span class="item-no">2.2</span>  Configuring htmLawed using the <span class="term">$config</span> parameter +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  <span class="term">$config</span> instructs htmLawed on how to tackle certain tasks. When <span class="term">$config</span> is not specified, or not set as an array (e.g., <span class="term">$config = 1</span>), htmLawed will take default actions. One or many of the task-action or value-specification pairs can be specified in <span class="term">$config</span> as array key-value pairs. If a parameter is not specified, htmLawed will use the default value/action indicated further below.<br /> +<br /> + +<code class="code">    $config = array('comment'=>0, 'cdata'=>1);</code> +<br /> + +<code class="code">    $processed = htmLawed($text, $config);</code> +<br /> +<br /> +  Or,<br /> +<br /> + +<code class="code">    $processed = htmLawed($text, array('comment'=>0, 'cdata'=>1));</code> +<br /> +<br /> +  Below are the possible value-specification combinations. 
In PHP code, values that are integers should not be quoted and should be used as numeric types (unless meant as string/text).<br /> +<br /> +  Key: <span class="term">*</span> default, <span class="term">^</span> different default when htmLawed is used in the Kses-compatible mode (see <a href="#s2.6">section 2.6</a>), <span class="term">~</span> different default when <span class="term">valid_xhtml</span> is set to <span class="term">1</span> (see <a href="#s3.5">section 3.5</a>), <span class="term">"</span> different default when <span class="term">safe</span> is set to <span class="term">1</span> (see <a href="#s3.6">section 3.6</a>)<br /> +<br /> +  <strong>abs_url</strong><br /> +  Make URLs absolute or relative; <span class="term">$config["base_url"]</span> needs to be set; see <a href="#s3.4.4">section 3.4.4</a><br /> +<br /> +  <span class="term">-1</span> - make relative<br /> +  <span class="term">0</span> - no action  *<br /> +  <span class="term">1</span> - make absolute<br /> +<br /> +  <strong>and_mark</strong><br /> +  Mark <span class="term">&</span> characters in the original input; see <a href="#s3.2">section 3.2</a><br /> +<br /> +  <strong>anti_link_spam</strong><br /> +  Anti-link-spam measure; see <a href="#s3.4.7">section 3.4.7</a><br /> +<br /> +  <span class="term">0</span> - no measure taken  *<br /> +  <span class="term">array("regex1", "regex2")</span> - will ensure a <span class="term">rel</span> attribute with <span class="term">nofollow</span> in its value in case the <span class="term">href</span> attribute value matches the regular expression pattern <span class="term">regex1</span>, and/or will remove <span class="term">href</span> if its value matches the regular expression pattern <span class="term">regex2</span>. 
E.g., <span class="term">array("/./", "/://\W*(?!(abc\.com|xyz\.org))/")</span>; see <a href="#s3.4.7">section 3.4.7</a> for more.<br /> +<br /> +  <strong>anti_mail_spam</strong><br /> +  Anti-mail-spam measure; see <a href="#s3.4.7">section 3.4.7</a><br /> +<br /> +  <span class="term">0</span> - no measure taken  *<br /> +  <span class="term">word</span> - <span class="term">@</span> in mail address in <span class="term">href</span> attribute value is replaced with specified <span class="term">word</span><br /> +<br /> +  <strong>balance</strong><br /> +  Balance tags for well-formedness and proper nesting; see <a href="#s3.3.3">section 3.3.3</a><br /> +<br /> +  <span class="term">0</span> - no<br /> +  <span class="term">1</span> - yes  *<br /> +<br /> +  <strong>base_url</strong><br /> +  Base URL value that needs to be set if <span class="term">$config["abs_url"]</span> is not <span class="term">0</span>; see <a href="#s3.4.4">section 3.4.4</a><br /> +<br /> +  <strong>cdata</strong><br /> +  Handling of <span class="term">CDATA</span> sections; see <a href="#s3.3.1">section 3.3.1</a><br /> +<br /> +  <span class="term">0</span> - don't consider <span class="term">CDATA</span> sections as markup and proceed as if plain text  ^"<br /> +  <span class="term">1</span> - remove<br /> +  <span class="term">2</span> - allow, but neutralize any <span class="term"><</span>, <span class="term">></span>, and <span class="term">&</span> inside by converting them to named entities<br /> +  <span class="term">3</span> - allow  *<br /> +<br /> +  <strong>clean_ms_char</strong><br /> +  Replace discouraged characters introduced by Microsoft Word, etc.; see <a href="#s3.1">section 3.1</a><br /> +<br /> +  <span class="term">0</span> - no  *<br /> +  <span class="term">1</span> - yes<br /> +  <span class="term">2</span> - yes, but replace special single & double quotes with ordinary ones<br /> +<br /> +  <strong>comment</strong><br /> +  Handling of HTML comments; see <a 
href="#s3.3.1">section 3.3.1</a><br /> +<br /> +  <span class="term">0</span> - don't consider comments as markup and proceed as if plain text  ^"<br /> +  <span class="term">1</span> - remove<br /> +  <span class="term">2</span> - allow, but neutralize any <span class="term"><</span>, <span class="term">></span>, and <span class="term">&</span> inside by converting to named entities<br /> +  <span class="term">3</span> - allow  *<br /> +<br /> +  <strong>css_expression</strong><br /> +  Allow dynamic CSS expression by not removing the expression from CSS property values in <span class="term">style</span> attributes; see <a href="#s3.4.8">section 3.4.8</a><br /> +<br /> +  <span class="term">0</span> - remove  *<br /> +  <span class="term">1</span> - allow<br /> +<br /> +  <strong>deny_attribute</strong><br /> +  Denied HTML attributes; see <a href="#s3.4">section 3.4</a><br /> +<br /> +  <span class="term">0</span> - none  *<br /> +  <span class="term">string</span> - dictated by values in <span class="term">string</span><br /> +  <span class="term">on*</span> (like <span class="term">onfocus</span>) attributes not allowed - "<br /> +<br /> +  <strong>elements</strong><br /> +  Allowed HTML elements; see <a href="#s3.3">section 3.3</a><br /> +<br /> +  <span class="term">* -center -dir -font -isindex -menu -s -strike -u</span> -  ~<br /> +  <span class="term">applet, embed, iframe, object, script</span> not allowed - "<br /> +<br /> +  <strong>hexdec_entity</strong><br /> +  Allow hexadecimal numeric entities and do not convert to the more widely accepted decimal ones, or convert decimal to hexadecimal ones; see <a href="#s3.2">section 3.2</a><br /> +<br /> +  <span class="term">0</span> - no<br /> +  <span class="term">1</span> - yes  *<br /> +  <span class="term">2</span> - convert decimal to hexadecimal ones<br /> +<br /> +  <strong>hook</strong><br /> +  Name of an optional hook function to alter the input string, <span class="term">$config</span> or <span 
class="term">$spec</span> before htmLawed starts its main work; see <a href="#s3.7">section 3.7</a><br /> +<br /> +  <span class="term">0</span> - no hook function  *<br /> +  <span class="term">name</span> - <span class="term">name</span> is name of the hook function (<span class="term">kses_hook</span>  ^)<br /> +<br /> +  <strong>hook_tag</strong><br /> +  Name of an optional hook function to alter tag content finalized by htmLawed; see <a href="#s3.4.9">section 3.4.9</a><br /> +<br /> +  <span class="term">0</span> - no hook function  *<br /> +  <span class="term">name</span> - <span class="term">name</span> is name of the hook function<br /> +<br /> +  <strong>keep_bad</strong><br /> +  Neutralize bad tags by converting <span class="term"><</span> and <span class="term">></span> to entities, or remove them; see <a href="#s3.3.3">section 3.3.3</a><br /> +<br /> +  <span class="term">0</span> - remove  ^<br /> +  <span class="term">1</span> - neutralize both tags and element content<br /> +  <span class="term">2</span> - remove tags but neutralize element content<br /> +  <span class="term">3</span> and <span class="term">4</span> - like <span class="term">1</span> and <span class="term">2</span> but remove if text (<span class="term">pcdata</span>) is invalid in parent element<br /> +  <span class="term">5</span> and <span class="term">6</span> * -  like <span class="term">3</span> and <span class="term">4</span> but line-breaks, tabs and spaces are left<br /> +<br /> +  <strong>lc_std_val</strong><br /> +  For XHTML compliance, predefined, standard attribute values, like <span class="term">get</span> for the <span class="term">method</span> attribute of <span class="term">form</span>, must be lowercased; see <a href="#s3.4.5">section 3.4.5</a><br /> +<br /> +  <span class="term">0</span> - no<br /> +  <span class="term">1</span> - yes  *<br /> +<br /> +  <strong>make_tag_strict</strong><br /> +  Transform/remove these non-strict XHTML elements, even if they 
are allowed by the admin: <span class="term">applet</span> <span class="term">center</span> <span class="term">dir</span> <span class="term">embed</span> <span class="term">font</span> <span class="term">isindex</span> <span class="term">menu</span> <span class="term">s</span> <span class="term">strike</span> <span class="term">u</span>; see <a href="#s3.3.2">section 3.3.2</a><br /> +<br /> +  <span class="term">0</span> - no  ^<br /> +  <span class="term">1</span> - yes, but leave <span class="term">applet</span>, <span class="term">embed</span> and <span class="term">isindex</span> elements that currently can't be transformed  *<br /> +  <span class="term">2</span> - yes, removing <span class="term">applet</span>, <span class="term">embed</span> and <span class="term">isindex</span> elements and their contents (nested elements remain)  ~<br /> +<br /> +  <strong>named_entity</strong><br /> +  Allow non-universal named HTML entities, or convert to numeric ones; see <a href="#s3.2">section 3.2</a><br /> +<br /> +  <span class="term">0</span> - convert<br /> +  <span class="term">1</span> - allow  *<br /> +<br /> +  <strong>no_deprecated_attr</strong><br /> +  Allow deprecated attributes or transform them; see <a href="#s3.4.6">section 3.4.6</a><br /> +<br /> +  <span class="term">0</span> - allow  ^<br /> +  <span class="term">1</span> - transform, but <span class="term">name</span> attributes for <span class="term">a</span> and <span class="term">map</span> are retained  *<br /> +  <span class="term">2</span> - transform<br /> +<br /> +  <strong>parent</strong><br /> +  Name of the parent element, possibly imagined, that will hold the input; see <a href="#s3.3">section 3.3</a><br /> +<br /> +  <strong>safe</strong><br /> +  Magic parameter to make input the most secure against XSS without needing to specify other relevant <span class="term">$config</span> parameters; see <a href="#s3.6">section 3.6</a><br /> +<br /> +  <span class="term">0</span> - no  *<br /> +  
<span class="term">1</span> - will auto-adjust other relevant <span class="term">$config</span> parameters (indicated by <span class="term">"</span> in this list)<br /> +<br /> +  <strong>schemes</strong><br /> +  Array of attribute-specific, comma-separated, lower-cased list of schemes (protocols) allowed in attributes accepting URLs; <span class="term">*</span> covers all unspecified attributes; see <a href="#s3.4.3">section 3.4.3</a><br /> +<br /> +  <span class="term">href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; *:file, http, https</span>  *<br /> +  <span class="term">*: ftp, gopher, http, https, mailto, news, nntp, telnet</span>  ^<br /> +  <span class="term">href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; style: nil; *:file, http, https</span>  "<br /> +<br /> +  <strong>show_setting</strong><br /> +  Name of a PHP variable to assign the <em>finalized</em> <span class="term">$config</span> and <span class="term">$spec</span> values; see <a href="#s3.8">section 3.8</a><br /> +<br /> +  <strong>style_pass</strong><br /> +  Do not look at <span class="term">style</span> attribute values, letting them through without any alteration<br /> +<br /> +  <span class="term">0</span> - no *<br /> +  <span class="term">1</span> - htmLawed will let through any <span class="term">style</span> value; see <a href="#s3.4.8">section 3.4.8</a><br /> +<br /> +  <strong>tidy</strong><br /> +  Beautify or compact HTML code; see <a href="#s3.3.5">section 3.3.5</a><br /> +<br /> +  <span class="term">-1</span> - compact<br /> +  <span class="term">0</span> - no  *<br /> +  <span class="term">1</span> or <span class="term">string</span> - beautify (custom format specified by <span class="term">string</span>)<br /> +<br /> +  <strong>unique_ids</strong><br /> +  <span class="term">id</span> attribute value checks; see <a href="#s3.4.2">section 3.4.2</a><br /> +<br /> +  <span 
class="term">0</span> - no  ^<br /> +  <span class="term">1</span> - remove duplicate and/or invalid ones  *<br /> +  <span class="term">word</span> - remove invalid ones and replace duplicate ones with new and unique ones based on the <span class="term">word</span>; the admin-specified <span class="term">word</span>, like <span class="term">my_</span>, should begin with a letter (a-z) and can contain letters, digits, <span class="term">.</span>, <span class="term">_</span>, <span class="term">-</span>, and <span class="term">:</span>.<br /> +<br /> +  <strong>valid_xhtml</strong><br /> +  Magic parameter to make input the most valid XHTML without needing to specify other relevant <span class="term">$config</span> parameters; see <a href="#s3.5">section 3.5</a><br /> +<br /> +  <span class="term">0</span> - no  *<br /> +  <span class="term">1</span> - will auto-adjust other relevant <span class="term">$config</span> parameters (indicated by <span class="term">~</span> in this list)<br /> +<br /> +  <strong>xml:lang</strong><br /> +  Auto-adding <span class="term">xml:lang</span> attribute; see <a href="#s3.4.1">section 3.4.1</a><br /> +<br /> +  <span class="term">0</span> - no  *<br /> +  <span class="term">1</span> - add if <span class="term">lang</span> attribute is present<br /> +  <span class="term">2</span> - add if <span class="term">lang</span> attribute is present, and remove <span class="term">lang</span>  ~<br /> + +</div> +<div class="sub-section"><h3> +<a name="s2.3" id="s2.3"></a><span class="item-no">2.3</span>  Extra HTML specifications using the $spec parameter +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  The <span class="term">$spec</span> argument can be used to disallow an otherwise legal attribute for an element, or to restrict the attribute's values. 
This can also be helpful as a security measure (e.g., in certain versions of browsers, certain values can cause buffer overflows and denial of service attacks), or in enforcing admin policy compliance. <span class="term">$spec</span> is specified as a string of text containing one or more <em>rules</em>, with multiple rules separated from each other by a semi-colon (<span class="term">;</span>). E.g.,<br /> +<br /> + +<code class="code">    $spec = 'i=-*; td, tr=style, id, -*; a=id(match="/[a-z][a-z\d.:\-`"]*/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt';</code> +<br /> + +<code class="code">    $processed = htmLawed($text, $config, $spec);</code> +<br /> +<br /> +  Or,<br /> +<br /> + +<code class="code">    $processed = htmLawed($text, $config, 'i=-*; td, tr=style, id, -*; a=id(match="/[a-z][a-z\d.:\-`"]*/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt');</code> +<br /> +<br /> +  A rule begins with an HTML <strong>element</strong> name(s) (<em>rule-element</em>), for which the rule applies, followed by an equal (<span class="term">=</span>) sign. A rule-element may represent multiple elements if comma (,)-separated element names are used. E.g., <span class="term">th,td,tr=</span>.<br /> +<br /> +  Rest of the rule consists of comma-separated HTML <strong>attribute names</strong>. A minus (<span class="term">-</span>) character before an attribute means that the attribute is not permitted inside the rule-element. E.g., <span class="term">-width</span>. 
To deny all attributes, <span class="term">-*</span> can be used.<br /> +<br /> +  Following shows examples of rule excerpts with rule-element <span class="term">a</span> and the attributes that are being permitted:<br /> +<br /> +  *  <span class="term">a=</span> - all<br /> +  *  <span class="term">a=id</span> - all<br /> +  *  <span class="term">a=href, title, -id, -onclick</span> - all except <span class="term">id</span> and <span class="term">onclick</span><br /> +  *  <span class="term">a=*, id, -id</span> - all except <span class="term">id</span><br /> +  *  <span class="term">a=-*</span> - none<br /> +  *  <span class="term">a=-*, href, title</span> - none except <span class="term">href</span> and <span class="term">title</span><br /> +  *  <span class="term">a=-*, -id, href, title</span> - none except <span class="term">href</span> and <span class="term">title</span><br /> +<br /> +  Rules regarding <strong>attribute values</strong> are optionally specified inside round brackets after attribute names in slash ('/')-separated <em>parameter = value</em> pairs. E.g., <span class="term">title(maxlen=30/minlen=5)</span>. 
None, or one or more of the following parameters may be specified:<br /> +<br /> +  *  <span class="term">oneof</span> - one or more choices separated by <span class="term">|</span> that the value should match; if only one choice is provided, then the value must match that choice<br /> +<br /> +  *  <span class="term">noneof</span> - one or more choices separated by <span class="term">|</span> that the value should not match<br /> +<br /> +  *  <span class="term">maxlen</span> and <span class="term">minlen</span> - upper and lower limits for the number of characters in the attribute value; specified in numbers<br /> +<br /> +  *  <span class="term">maxval</span> and <span class="term">minval</span> - upper and lower limits for the numerical value specified in the attribute value; specified in numbers<br /> +<br /> +  *  <span class="term">match</span> and <span class="term">nomatch</span> - pattern that the attribute value should or should not match; specified as PHP/PCRE-compatible regular expressions with delimiters and possibly modifiers<br /> +<br /> +  *  <span class="term">default</span> - a value to force on the attribute if the value provided by the writer does not fit any of the specified parameters<br /> +<br /> +  If <span class="term">default</span> is not set and the attribute value does not satisfy any of the specified parameters, then the attribute is removed. 
The <span class="term">default</span> value can also be used to force all attribute declarations to take the same value (by getting the values declared illegal by setting, e.g., <span class="term">maxlen</span> to <span class="term">-1</span>).<br /> +<br /> +  Examples with <em>input</em> <span class="term"><input title="WIDTH" value="10em" /><input title="length" value="5" /></span> are shown below.<br /> +<br /> +  <em>Rule</em>: <span class="term">input=title(maxlen=60/minlen=6), value</span><br /> +  <em>Output</em>: <span class="term"><input value="10em" /><input title="length" value="5" /></span><br /> +<br /> +  <em>Rule</em>: <span class="term">input=title(), value(maxval=8/default=6)</span><br /> +  <em>Output</em>: <span class="term"><input title="WIDTH" value="6" /><input title="length" value="5" /></span><br /> +<br /> +  <em>Rule</em>: <span class="term">input=title(nomatch=$w.d$i), value(match=$em$/default=6em)</span><br /> +  <em>Output</em>: <span class="term"><input value="10em" /><input title="length" value="6em" /></span><br /> +<br /> +  <em>Rule</em>: <span class="term">input=title(oneof=height|depth/default=depth), value(noneof=5|6)</span><br /> +  <em>Output</em>: <span class="term"><input title="depth" value="10em" /><input title="depth" /></span><br /> +<br /> +  <strong>Special characters</strong>: The characters <span class="term">;</span>, <span class="term">,</span>, <span class="term">/</span>, <span class="term">(</span>, <span class="term">)</span>, <span class="term">|</span>, <span class="term">~</span> and space have special meanings in the rules. Words in the rules that use such characters, or the characters themselves, should be <em>escaped</em> by enclosing in pairs of double-quotes (<span class="term">"</span>). A back-tick (<span class="term">`</span>) can be used to escape a literal <span class="term">"</span>. 
An example rule illustrating this is <span class="term">input=value(maxlen=30/match="/^\w/"/default="your `"ID`"")</span>.<br /> +<br /> +  <strong>Note</strong>: To deny an attribute for all elements for which it is legal, <span class="term">$config["deny_attribute"]</span> (see <a href="#s3.4">section 3.4</a>) can be used instead of <span class="term">$spec</span>. Also, attributes can be allowed element-specifically through <span class="term">$spec</span> while being denied globally through <span class="term">$config["deny_attribute"]</span>. The <span class="term">hook_tag</span> parameter (<a href="#s3.4.9">section 3.4.9</a>) can also be used to implement the <span class="term">$spec</span> functionality.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s2.4" id="s2.4"></a><span class="item-no">2.4</span>  Performance time & memory usage +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  The time and memory used by htmLawed depends on its configuration and the size of the input, and the amount, nestedness and well-formedness of the HTML markup within it. In particular, tag balancing and beautification each can increase the processing time by about a quarter.<br /> +<br /> +  The htmLawed <a href="htmLawedTest.php">demo</a> can be used to evaluate the performance and effects of different types of input and <span class="term">$config</span>.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s2.5" id="s2.5"></a><span class="item-no">2.5</span>  Some security risks to keep in mind +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially <em>dangerous</em> HTML code. 
(This may not be a problem if the authors are trusted.)<br /> +<br /> +  For example, the following increase security risks:<br /> +<br /> +  *  Allowing <span class="term">script</span>, <span class="term">applet</span>, <span class="term">embed</span>, <span class="term">iframe</span> or <span class="term">object</span> elements, or certain of their attributes like <span class="term">allowscriptaccess</span><br /> +<br /> +  *  Allowing HTML comments (some Internet Explorer versions are vulnerable with, e.g., <span class="term"><!--[if gte IE 4]><script>alert("xss");</script><![endif]--></span>)<br /> +<br /> +  *  Allowing dynamic CSS expressions (a feature of the IE browser)<br /> +<br /> +  <em>Unsafe</em> HTML can be removed by setting <span class="term">$config</span> appropriately. E.g., <span class="term">$config["elements"] = "* -script"</span> (<a href="#s3.3">section 3.3</a>), <span class="term">$config["safe"] = 1</span> (<a href="#s3.6">section 3.6</a>), etc.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s2.6" id="s2.6"></a><span class="item-no">2.6</span>  Use without modifying old <span class="term">kses()</span> code +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  The <span class="term">Kses</span> PHP script is used by many applications (like <span class="term">WordPress</span>). It is possible to have such applications use htmLawed instead, since it is compatible with code that calls the <span class="term">kses()</span> function declared in the <span class="term">Kses</span> file (usually named <span class="term">kses.php</span>). 
E.g., application code like this will continue to work after replacing <span class="term">Kses</span> with htmLawed:<br /> +<br /> + +<code class="code">    $comment_filtered = kses($comment_input, array('a'=>array(), 'b'=>array(), 'i'=>array()));</code> +<br /> +<br /> +  For some of the <span class="term">$config</span> parameters, htmLawed will use values other than the default ones. These are indicated by <span class="term">^</span> in <a href="#s2.2">section 2.2</a>. To force htmLawed to use other values, function <span class="term">kses()</span> in the htmLawed code should be edited -- a few configurable parameters/variables need to be changed.<br /> +<br /> +  If the application uses a <span class="term">Kses</span> file that has the <span class="term">kses()</span> function declared, then, to have the application use htmLawed instead of <span class="term">Kses</span>, simply rename <span class="term">htmLawed.php</span> (to <span class="term">kses.php</span>, e.g.) and replace the <span class="term">Kses</span> file (or just replace the code in the <span class="term">Kses</span> file with the htmLawed code). If the <span class="term">kses()</span> function in the <span class="term">Kses</span> file had been renamed by the application developer (e.g., in <span class="term">WordPress</span>, it is named <span class="term">wp_kses()</span>), then appropriately rename the <span class="term">kses()</span> function in the htmLawed code.<br /> +<br /> +  If the <span class="term">Kses</span> file used by the application has been highly altered by the application developers, then one may need a different approach. 
E.g., with <span class="term">WordPress</span>, it is best to copy the htmLawed code to <span class="term">wp-includes/kses.php</span>, rename the newly added function <span class="term">kses()</span> to <span class="term">wp_kses()</span>, and delete the code for the original <span class="term">wp_kses()</span> function.<br /> +<br /> +  If the <span class="term">Kses</span> code has a non-empty hook function (e.g., <span class="term">wp_kses_hook()</span> in case of <span class="term">WordPress</span>), then the code for htmLawed's <span class="term">kses_hook()</span> function should be appropriately edited. However, the requirement of the hook function should be re-evaluated considering that htmLawed has extra capabilities. With <span class="term">WordPress</span>, the hook function is an essential one. The following code is suggested for the htmLawed <span class="term">kses_hook()</span> in case of <span class="term">WordPress</span>:<br /> +<br /> + +<code class="code">    function kses_hook($string, &$cf, &$spec){</code> +<br /> + +<code class="code">    // kses compatibility</code> +<br /> + +<code class="code">    $allowed_html = $spec;</code> +<br /> + +<code class="code">    $allowed_protocols = array();</code> +<br /> + +<code class="code">    foreach($cf['schemes'] as $v){</code> +<br /> + +<code class="code">     foreach($v as $k2=>$v2){</code> +<br /> + +<code class="code">      if(!in_array($k2, $allowed_protocols)){</code> +<br /> + +<code class="code">       $allowed_protocols[] = $k2;</code> +<br /> + +<code class="code">      }</code> +<br /> + +<code class="code">     }</code> +<br /> + +<code class="code">    }</code> +<br /> + +<code class="code">    return wp_kses_hook($string, $allowed_html, $allowed_protocols);</code> +<br /> + +<code class="code">    // eof</code> +<br /> + +<code class="code">    }</code> +<br /> + +</div> +<div class="sub-section"><h3> +<a name="s2.7" id="s2.7"></a><span class="item-no">2.7</span>  Tolerance for 
ill-written HTML +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  htmLawed can work with ill-written HTML code in the input. However, HTML that is too ill-written may not be <em>read</em> as HTML, and be considered mere plain text instead. Following statements indicate the degree of <em>looseness</em> that htmLawed can work with, and can be provided in instructions to writers:<br /> +<br /> +  *  Tags must be flanked by <span class="term"><</span> and <span class="term">></span> with no <span class="term">></span> inside -- any needed <span class="term">></span> should be put in as <span class="term">&gt;</span>. It is possible for tag content (element name and attributes) to be spread over many lines instead of being on one. A space may be present between the tag content and <span class="term">></span>, like <span class="term"><div ></span> and <span class="term"><img / ></span>, but not after the <span class="term"><</span>.<br /> +<br /> +  *  Element and attribute names need not be lower-cased.<br /> +<br /> +  *  Attribute string of elements may be liberally spaced with tabs, line-breaks, etc.<br /> +<br /> +  *  Attribute values need not be double-quoted, or may be single-quoted.<br /> +<br /> +  *  Left-padding of numeric entities (like, <span class="term">&#0160;</span>, <span class="term">&#x07ff;</span>) with <span class="term">0</span> is okay as long as the number of characters between the <span class="term">&</span> and the <span class="term">;</span> does not exceed 8. All entities must end with <span class="term">;</span> though.<br /> +<br /> +  *  Named character entities must be properly cased. 
E.g., <span class="term">&Lt;</span> or <span class="term">&TILDE;</span> will not be let through without modification.<br /> +<br /> +  *  HTML comments should not be inside element tags (okay between tags), and should begin with <span class="term"><!--</span> and end with <span class="term">--></span>. Characters like <span class="term"><</span>, <span class="term">></span>, and <span class="term">&</span> may be allowed inside depending on <span class="term">$config</span>, but any <span class="term">--></span> inside should be put in as <span class="term">--&gt;</span>. Any <span class="term">--</span> inside will be automatically converted to <span class="term">-</span>, and a space will be added before the comment delimiter <span class="term">--></span>.<br /> +<br /> +  *  <span class="term">CDATA</span> sections should not be inside element tags, and can be in element content only if plain text is allowed for that element. They should begin with <span class="term"><![CDATA[</span> and end with <span class="term">]]></span>. Characters like <span class="term"><</span>, <span class="term">></span>, and <span class="term">&</span> may be allowed inside depending on <span class="term">$config</span>, but any <span class="term">]]></span> inside should be put in as <span class="term">]]&gt;</span>.<br /> +<br /> +  *  For attribute values, character entities <span class="term">&lt;</span>, <span class="term">&gt;</span> and <span class="term">&amp;</span> should be used instead of characters <span class="term"><</span>, <span class="term">></span> and <span class="term">&</span> (when <span class="term">&</span> is not part of a character entity). 
This applies even for Javascript code in values of attributes like <span class="term">onclick</span>.<br /> +<br /> +  *  Characters <span class="term"><</span>, <span class="term">></span>, <span class="term">&</span> and <span class="term">"</span> that are part of actual Javascript, etc., code in <span class="term">script</span> elements should be used as such and not be put in as entities like <span class="term">&gt;</span>. Otherwise, though the HTML will be valid, the code may fail to work. Further, if such characters have to be used, then they should be put inside <span class="term">CDATA</span> sections.<br /> +<br /> +  *  Simple instructions like "an opening tag cannot be present between two closing tags" and "nested elements should be closed in the reverse order of how they were opened" can help authors write balanced HTML. If tags are imbalanced, htmLawed will try to balance them, but in the process, depending on <span class="term">$config["keep_bad"]</span>, some code/text may be lost.<br /> +<br /> +  *  Input authors should be notified of admin-specified allowed elements, attributes, configuration values (like conversion of named entities to numeric ones), etc.<br /> +<br /> +  *  With <span class="term">$config["unique_ids"]</span> not <span class="term">0</span> and the <span class="term">id</span> attribute being permitted, writers should carefully avoid using duplicate or invalid <span class="term">id</span> values as even though htmLawed will correct/remove the values, the final output may not be the one desired. 
E.g., <span class="term"><a id="home"></a><input id="home" /><label for="home"></label></span> is processed into<br /> +<span class="term"><a id="home"></a><input id="prefix_home" /><label for="home"></label></span>.<br /> +<br /> +  *  Note that even if intended HTML is lost in a highly ill-written input, the processed output will be more secure and standard-compliant.<br /> +<br /> +  *  For URLs, unless <span class="term">$config["schemes"]</span> is appropriately set, writers should avoid using escape characters or entities in schemes. E.g., <span class="term">htt&#112;</span> (which many browsers will read as the harmless <span class="term">http</span>) may be considered bad by htmLawed.<br /> +<br /> +  *  htmLawed will attempt to put plain text present directly inside <span class="term">blockquote</span>, <span class="term">form</span>, <span class="term">map</span> and <span class="term">noscript</span> elements (illegal as per the specs) inside auto-generated <span class="term">div</span> elements.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s2.8" id="s2.8"></a><span class="item-no">2.8</span>  Limitations & work-arounds +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  htmLawed's main objective is to make the input text <em>more</em> standard-compliant, secure for web-page readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with work-arounds.<br /> +<br /> +  It should be borne in mind that no browser application is 100% standard-compliant, and that some of the standard specs (like asking for normalization of white-spacing within <span class="term">textarea</span> elements) are clearly wrong. 
Regarding security, note that <em>unsafe</em> HTML code is not necessarily legally invalid.<br /> +<br /> +  *  htmLawed is meant for input that goes into the <span class="term">body</span> of HTML documents. HTML's head-level elements are not supported, nor are the frameset elements <span class="term">frameset</span>, <span class="term">frame</span> and <span class="term">noframes</span>.<br /> +<br /> +  *  It cannot transform the non-standard <span class="term">embed</span> elements to the standard-compliant <span class="term">object</span> elements. Yet, it can allow <span class="term">embed</span> elements if permitted (<span class="term">embed</span> is widely used and supported). Admins can certainly use the <span class="term">hook_tag</span> parameter (<a href="#s3.4.9">section 3.4.9</a>) to deploy a custom embed-to-object converter function.<br /> +<br /> +  *  The only non-standard element that may be permitted is <span class="term">embed</span>; others like <span class="term">noembed</span> and <span class="term">nobr</span> cannot be permitted without modifying the htmLawed code.<br /> +<br /> +  *  It cannot handle input that has non-HTML code like <span class="term">SVG</span> and <span class="term">MathML</span>. One way around is to break the input into pieces and pass only those without non-HTML code to htmLawed. Another is described in <a href="#s3.9">section 3.9</a>. A third way may be to somehow take advantage of the <span class="term">$config["and_mark"]</span> parameter (see <a href="#s3.2">section 3.2</a>).<br /> +<br /> +  *  By default, htmLawed won't check many attribute values for standard compliance. E.g., <span class="term">width="20m"</span> with the dimension in non-standard <span class="term">m</span> is let through. Implementing universal and strict attribute value checks can make htmLawed slow and resource-intensive. 
Admins should look at the <span class="term">hook_tag</span> parameter (<a href="#s3.4.9">section 3.4.9</a>) or <span class="term">$spec</span> to enforce finer checks.<br /> +<br /> +  *  The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specs. Only a few of the proprietary attributes are supported.<br /> +<br /> +  *  Except for contained URLs and dynamic expressions (also optional), htmLawed does not check CSS style property values. Admins should look at using the <span class="term">hook_tag</span> parameter (<a href="#s3.4.9">section 3.4.9</a>) or <span class="term">$spec</span> for finer checks. Perhaps the best option is to disallow <span class="term">style</span> but allow <span class="term">class</span> attributes with the right <span class="term">oneof</span> or <span class="term">match</span> values for <span class="term">class</span>, and have the various class style properties in <span class="term">.css</span> CSS stylesheet files.<br /> +<br /> +  *  htmLawed does not parse emoticons, decode <em>BBcode</em>, or <em>wikify</em>, auto-converting text to proper HTML. Similarly, it won't convert line-breaks to <span class="term">br</span> elements. Such functions are beyond its purview. Admins should use other code to pre- or post-process the input for such purposes.<br /> +<br /> +  *  htmLawed cannot be used to have links force-opened in new windows (by auto-adding appropriate <span class="term">target</span> and <span class="term">onclick</span> attributes to <span class="term">a</span>). Admins should look at Javascript-based DOM-modifying solutions for this. Admins may also be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span> parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br /> +<br /> +  *  Nesting-based checks are not possible. 
E.g., one cannot disallow <span class="term">p</span> elements specifically inside <span class="term">td</span> while permitting it elsewhere. Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span> parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br /> +<br /> +  *  Except for optionally converting absolute or relative URLs to the other type, htmLawed will not alter URLs (e.g., to change the value of query strings or to convert <span class="term">http</span> to <span class="term">https</span>). Having absolute URLs may be a standard-requirement, e.g., when HTML is embedded in email messages, whereas altering URLs for other purposes is beyond htmLawed's goals. Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span> parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br /> +<br /> +  *  Pairs of opening and closing tags that do not enclose any content (like <span class="term"><em></em></span>) are not removed. This may be against the standard specs for certain elements (e.g., <span class="term">table</span>). However, presence of such standard-incompliant code will not break the display or layout of content. Admins can also use simple regex-based code to filter out such code.<br /> +<br /> +  *  htmLawed does not check for certain element orderings described in the standard specs (e.g., in a <span class="term">table</span>, <span class="term">tbody</span> is allowed before <span class="term">tfoot</span>). Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span> parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br /> +<br /> +  *  htmLawed does not check the number of nested elements. E.g., it will allow two <span class="term">caption</span> elements in a <span class="term">table</span> element, illegal as per the specs. 
Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span> parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br /> +<br /> +  *  htmLawed might convert certain entities to actual characters and remove backslashes and CSS comment-markers (<span class="term">/*</span>) in <span class="term">style</span> attribute values in order to detect malicious HTML like crafted IE-specific dynamic expressions like <span class="term">&#101;xpression...</span>. If this is too harsh, admins can allow CSS expressions through htmLawed core but then use a custom function through the <span class="term">hook_tag</span> parameter (<a href="#s3.4.9">section 3.4.9</a>) to more specifically identify CSS expressions in the <span class="term">style</span> attribute values. Also, using <span class="term">$config["style_pass"]</span>, it is possible to have htmLawed pass <span class="term">style</span> attribute values without even looking at them (<a href="#s3.4.8">section 3.4.8</a>).<br /> +<br /> +  *  htmLawed does not correct certain possible attribute-based security vulnerabilities (e.g., <span class="term"><a href="http://x%22+style=%22background-image:xss">x</a></span>). These arise when browsers mis-identify markup in <em>escaped</em> text, defeating the very purpose of escaping text (a bad browser will read the given example as <span class="term"><a href="http://x" style="background-image:xss">x</a></span>).<br /> +<br /> +  *  Because of poor Unicode support in PHP, htmLawed does not remove the <em>high value</em> HTML-invalid characters with multi-byte code-points. Such characters however are extremely unlikely to be in the input. 
(see <a href="#s3.1">section 3.1</a>).<br /> +<br /> +  *  Like any script using PHP's PCRE regex functions, PHP setup-specific low PCRE limit values can cause htmLawed to at least partially fail with very long input texts.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s2.9" id="s2.9"></a><span class="item-no">2.9</span>  Examples +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  <strong>1.</strong> A blog administrator wants to allow only <span class="term">a</span>, <span class="term">em</span>, <span class="term">strike</span>, <span class="term">strong</span> and <span class="term">u</span> in comments, but needs <span class="term">strike</span> and <span class="term">u</span> transformed to <span class="term">span</span> for better XHTML 1-strict compliance, and, he wants the <span class="term">a</span> links to be to <span class="term">http</span> or <span class="term">https</span> resources:<br /> +<br /> + +<code class="code">    $processed = htmLawed($in, array('elements'=>'a, em, strike, strong, u', 'make_tag_strict'=>1, 'safe'=>1, 'schemes'=>'*:http, https'), 'a=href');</code> +<br /> +<br /> +  <strong>2.</strong> An author uses a custom-made web application to load content on his web-site. He is the only one using that application and the content he generates has all types of HTML, including scripts. The web application uses htmLawed primarily as a tool to correct errors that creep in while writing HTML and to take care of the occasional <em>bad</em> characters in copy-paste text introduced by Microsoft Office. The web application provides a preview before submitted input is added to the content. 
For the previewing process, htmLawed is set up as follows:<br /> +<br /> + +<code class="code">    $processed = htmLawed($in, array('css_expression'=>1, 'keep_bad'=>1, 'make_tag_strict'=>1, 'schemes'=>'*:*', 'valid_xhtml'=>1));</code> +<br /> +<br /> +  For the final submission process, <span class="term">keep_bad</span> is set to <span class="term">6</span>. A value of <span class="term">1</span> for the preview process allows the author to note and correct any HTML mistake without losing any of the typed text.<br /> +<br /> +  <strong>3.</strong> A data-miner is scraping information in a specific table of similar web-pages and is collating the data rows, and uses htmLawed to reduce unnecessary markup and white-spaces:<br /> +<br /> + +<code class="code">    $processed = htmLawed($in, array('elements'=>'tr, td', 'tidy'=>-1), 'tr, td =');</code> +<br /> + +</div> +</div> +<div class="section"><h2> +<a name="s3" id="s3"></a><span class="item-no">3</span>  Details +</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<div class="sub-section"><h3> +<a name="s3.1" id="s3.1"></a><span class="item-no">3.1</span>  Invalid/dangerous characters +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  Valid characters (more correctly, their code-points) in HTML or XML are, hexadecimally, <span class="term">9</span>, <span class="term">a</span>, <span class="term">d</span>, <span class="term">20</span> to <span class="term">d7ff</span>, and <span class="term">e000</span> to <span class="term">10ffff</span>, except <span class="term">fffe</span> and <span class="term">ffff</span> (decimally, <span class="term">9</span>, <span class="term">10</span>, <span class="term">13</span>, <span class="term">32</span> to <span class="term">55295</span>, and <span class="term">57344</span> to <span class="term">1114111</span>, except <span class="term">65534</span> and <span class="term">65535</span>). 
htmLawed removes the invalid characters <span class="term">0</span> to <span class="term">8</span>, <span class="term">b</span>, <span class="term">c</span>, and <span class="term">e</span> to <span class="term">1f</span>.<br /> +<br /> +  Because of PHP's poor native support for multi-byte characters, htmLawed cannot check for the remaining invalid code-points. However, for various reasons, it is very unlikely for any of those characters to be in the input.<br /> +<br /> +  Characters that are discouraged (see <a href="#s5.1">section 5.1</a>) but not invalid are not removed by htmLawed.<br /> +<br /> +  It (function <span class="term">hl_tag()</span>) also replaces the potentially dangerous (in some Mozilla [Firefox] and Opera browsers) soft-hyphen character (code-point, hexadecimally, <span class="term">ad</span>, or decimally, <span class="term">173</span>) in attribute values with spaces. Where required, the characters <span class="term"><</span>, <span class="term">></span>, <span class="term">&</span>, and <span class="term">"</span> are converted to entities.<br /> +<br /> +  With <span class="term">$config["clean_ms_char"]</span> set as <span class="term">1</span> or <span class="term">2</span>, many of the discouraged characters (decimal code-points <span class="term">127</span> to <span class="term">159</span> except <span class="term">133</span>) that many Microsoft applications incorrectly use (as per the <span class="term">Windows 1252</span> [<span class="term">Cp-1252</span>] or a similar encoding system), and the character for decimal code-point <span class="term">133</span>, are converted to appropriate decimal numerical entities (or removed for a few cases)-- see appendix in <a href="#s5.4">section 5.4</a>. 
This can help avoid some display issues arising from copying-pasting of content.<br /> +<br /> +  With <span class="term">$config["clean_ms_char"]</span> set as <span class="term">2</span>, characters for the hexadecimal code-points <span class="term">82</span>, <span class="term">91</span>, and <span class="term">92</span> (for special single-quotes), and <span class="term">84</span>, <span class="term">93</span>, and <span class="term">94</span> (for special double-quotes) are converted to ordinary single and double quotes respectively and not to entities.<br /> +<br /> +  The character values are replaced with entities/characters and not character values referred to by the entities/characters to keep this task independent of the character-encoding of input text.<br /> +<br /> +  The <span class="term">$config["clean_ms_char"]</span> parameter need not be used if authors do not copy-paste Microsoft-created text or if the input text is not believed to use the <span class="term">Windows 1252</span> or a similar encoding. Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s3.2" id="s3.2"></a><span class="item-no">3.2</span>  Character references/entities +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  Valid character entities take the form <span class="term">&*;</span> where <span class="term">*</span> is <span class="term">#x</span> followed by a hexadecimal number (hexadecimal numeric entity; like <span class="term">&#xA0;</span> for non-breaking space), or alphanumeric like <span class="term">gt</span> (external or named entity; like <span class="term">&nbsp;</span> for non-breaking space), or <span class="term">#</span> followed by a number (decimal numeric entity; like <span class="term">&#160;</span> for non-breaking space). 
Character entities referring to the soft-hyphen character (the <span class="term">&shy;</span> or <span class="term">\xad</span> character; hexadecimal code-point <span class="term">ad</span> [decimal <span class="term">173</span>]) in attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers.<br /> +<br /> +  htmLawed (function <span class="term">hl_ent()</span>):<br /> +<br /> +  *  Neutralizes entities with multiple leading zeroes or missing semi-colons (potentially dangerous)<br /> +<br /> +  *  Lowercases the <span class="term">X</span> (for XML-compliance) and <span class="term">A-F</span> of hexadecimal numeric entities<br /> +<br /> +  *  Neutralizes entities referring to characters that are HTML-invalid (see <a href="#s3.1">section 3.1</a>)<br /> +<br /> +  *  Neutralizes entities referring to characters that are HTML-discouraged (code-points, hexadecimally, <span class="term">7f</span> to <span class="term">84</span>, <span class="term">86</span> to <span class="term">9f</span>, and <span class="term">fdd0</span> to <span class="term">fddf</span>, or decimally, <span class="term">127</span> to <span class="term">132</span>, <span class="term">134</span> to <span class="term">159</span>, and <span class="term">64976</span> to <span class="term">64991</span>). Entities referring to the remaining discouraged characters (see <a href="#s5.1">section 5.1</a> for a full list) are let through.<br /> +<br /> +  *  Neutralizes named entities that are not in the specs.<br /> +<br /> +  *  Optionally converts valid HTML-specific named entities except <span class="term">&gt;</span>, <span class="term">&lt;</span>, <span class="term">&quot;</span>, and <span class="term">&amp;</span> to decimal numeric ones (hexadecimal if $config["hexdec_entity"] is <span class="term">2</span>) for generic XML-compliance. 
For this, <span class="term">$config["named_entity"]</span> should be <span class="term">1</span>.<br /> +<br /> +  *  Optionally converts hexadecimal numeric entities to the more widely supported decimal ones. For this, <span class="term">$config["hexdec_entity"]</span> should be <span class="term">0</span>.<br /> +<br /> +  *  Optionally converts decimal numeric entities to the hexadecimal ones. For this, <span class="term">$config["hexdec_entity"]</span> should be <span class="term">2</span>.<br /> +<br /> +  <em>Neutralization</em> refers to the <em>entitification</em> of <span class="term">&</span> to <span class="term">&amp;</span>.<br /> +<br /> +  <strong>Note</strong>: htmLawed does not convert entities to the actual characters represented by them; one can pass the htmLawed output through PHP's <span class="term">html_entity_decode</span> <a href="http://www.php.net/html_entity_decode">function</a> for that.<br /> +<br /> +  <strong>Note</strong>: If <span class="term">$config["and_mark"]</span> is set, and set to a value other than <span class="term">0</span>, then the <span class="term">&</span> characters in the original input are replaced with the control character for the hexadecimal code-point <span class="term">6</span> (<span class="term">\x06</span>; <span class="term">&</span> characters introduced by htmLawed, e.g., after converting <span class="term"><</span> to <span class="term">&lt;</span>, are not affected). This allows one to distinguish, say, an <span class="term">&gt;</span> introduced by htmLawed and an <span class="term">&gt;</span> put in by the input writer, and can be helpful in further processing of the htmLawed-processed text (e.g., to identify the character sequence <span class="term">o(><)o</span> to generate an emoticon image). When this feature is active, admins should ensure that the htmLawed output is not directly used in web pages or XML documents as the presence of the <span class="term">\x06</span> can break documents. 
Before use in such documents, and preferably before any storage, any remaining <span class="term">\x06</span> should be changed back to <span class="term">&</span>, e.g., with:<br /> +<br /> + +<code class="code">    $final = str_replace("\x06", '&', $prelim);</code> +<br /> +<br /> +  Also, see <a href="#s3.9">section 3.9</a>.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s3.3" id="s3.3"></a><span class="item-no">3.3</span>  HTML elements +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  htmLawed can be configured to allow only certain HTML elements (tags) in the input. Disallowed elements (just tag-content, and not element-content), based on <span class="term">$config["keep_bad"]</span>, are either <em>neutralized</em> (converted to plain text by entitification of <span class="term"><</span> and <span class="term">></span>) or removed.<br /> +<br /> +  E.g., with only <span class="term">em</span> permitted:<br /> +<br /> +  Input:<br /> +<br /> + +<code class="code">      <em>My</em> website is <a href="http://a.com>a.com</a>.</code> +<br /> +<br /> +  Output, with <span class="term">$config["keep_bad"] = 0</span>:<br /> +<br /> + +<code class="code">      <em>My</em> website is a.com.</code> +<br /> +<br /> +  Output, with <span class="term">$config["keep_bad"]</span> not <span class="term">0</span>:<br /> +<br /> + +<code class="code">      <em>My</em> website is &lt;a href=""&gt;a.com&lt;/a&gt;.</code> +<br /> +<br /> +  See <a href="#s3.3.3">section 3.3.3</a> for differences between the various non-zero <span class="term">$config["keep_bad"]</span> values.<br /> +<br /> +  htmLawed by default permits these 86 elements:<br /> +<br /> + +<code class="code">    a, abbr, acronym, address, applet, area, b, bdo, big, blockquote, br, button, caption, center, cite, code, col, colgroup, dd, del, dfn, dir, div, dl, dt, em, embed, fieldset, font, form, h1, h2, h3, h4, h5, h6, hr, i, iframe, img, input, ins, 
isindex, kbd, label, legend, li, map, menu, noscript, object, ol, optgroup, option, p, param, pre, q, rb, rbc, rp, rt, rtc, ruby, s, samp, script, select, small, span, strike, strong, sub, sup, table, tbody, td, textarea, tfoot, th, thead, tr, tt, u, ul, var</code> +<br /> +<br /> +  Except for <span class="term">embed</span> (included because of its wide-spread use) and the Ruby elements (<span class="term">rb</span>, <span class="term">rbc</span>, <span class="term">rp</span>, <span class="term">rt</span>, <span class="term">rtc</span>, <span class="term">ruby</span>; part of XHTML 1.1), these are all the elements in the HTML 4/XHTML 1 specs. Strict-specific specs. exclude <span class="term">center</span>, <span class="term">dir</span>, <span class="term">font</span>, <span class="term">isindex</span>, <span class="term">menu</span>, <span class="term">s</span>, <span class="term">strike</span>, and <span class="term">u</span>.<br /> +<br /> +  With <span class="term">$config["safe"] = 1</span>, the default set will exclude <span class="term">applet</span>, <span class="term">embed</span>, <span class="term">iframe</span>, <span class="term">object</span> and <span class="term">script</span>; see <a href="#s3.6">section 3.6</a>.<br /> +<br /> +  When <span class="term">$config["elements"]</span>, which specifies allowed elements, is <em>properly</em> defined, and neither empty nor set to <span class="term">0</span> or <span class="term">*</span>, the default set is not used. To have elements added to or removed from the default set, a <span class="term">+/-</span> notation is used. E.g., <span class="term">*-script-object</span> implies that only <span class="term">script</span> and <span class="term">object</span> are disallowed, whereas <span class="term">*+noembed</span> means that <span class="term">noembed</span> is also allowed. Elements can also be specified as comma separated names. 
E.g., <span class="term">a, b, i</span> means only <span class="term">a</span>, <span class="term">b</span> and <span class="term">i</span> are permitted. In this notation, <span class="term">*</span>, <span class="term">+</span> and <span class="term">-</span> have no significance and can actually cause a mis-reading.<br /> +<br /> +  Some more examples of <span class="term">$config["elements"]</span> values indicating permitted elements (note that empty spaces are liberally allowed for clarity):<br /> +<br /> +  *  <span class="term">a, blockquote, code, em, strong</span> -- only <span class="term">a</span>, <span class="term">blockquote</span>, <span class="term">code</span>, <span class="term">em</span>, and <span class="term">strong</span><br /> +  *  <span class="term">*-script</span> -- all excluding <span class="term">script</span><br /> +  *  <span class="term">* -center -dir -font -isindex -menu -s -strike -u</span> -- only XHTML-Strict elements<br /> +  *  <span class="term">*+noembed-script</span> -- all including <span class="term">noembed</span> excluding <span class="term">script</span><br /> +<br /> +  Some mis-usages (and the resulting permitted elements) that can be avoided:<br /> +<br /> +  *  <span class="term">-*</span> -- none; instead of htmLawed, one might just use, e.g., the <span class="term">htmlspecialchars()</span> PHP function<br /> +  *  <span class="term">*, -script</span> -- all except <span class="term">script</span>; admin probably meant <span class="term">*-script</span><br /> +  *  <span class="term">-*, a, em, strong</span> -- all; admin probably meant <span class="term">a, em, strong</span><br /> +  *  <span class="term">*</span> -- all; admin need not have set <span class="term">elements</span><br /> +  *  <span class="term">*-form+form</span> -- all; a <span class="term">+</span> will always over-ride any <span class="term">-</span><br /> +  *  <span class="term">*, noembed</span> -- only <span class="term">noembed</span>; 
admin probably meant <span class="term">*+noembed</span><br /> +  *  <span class="term">a, +b, i</span> -- only <span class="term">a</span> and <span class="term">i</span>; admin probably meant <span class="term">a, b, i</span><br /> +<br /> +  Basically, when using the <span class="term">+/-</span> notation, commas (<span class="term">,</span>) should not be used, and vice versa, and <span class="term">*</span> should be used with the former but not the latter.<br /> +<br /> +  <strong>Note</strong>: Even if an element that is not in the default set is allowed through <span class="term">$config["elements"]</span>, like <span class="term">noembed</span> in the last example, it will eventually be removed during tag balancing unless such balancing is turned off (<span class="term">$config["balance"]</span> set to <span class="term">0</span>). Currently, the only way around this, which actually is simple, is to edit the various arrays in the function <span class="term">hl_bal()</span> to accommodate the element and its nesting properties.<br /> +<br /> +  <strong>A possibly second way to specify allowed elements</strong> is to set <span class="term">$config["parent"]</span> to an element name that supposedly will hold the input, and to set <span class="term">$config["balance"]</span> to <span class="term">1</span>. During tag balancing (see <a href="#s3.3.3">section 3.3.3</a>), all elements that cannot legally nest inside the parent element will be removed. 
The parent element is auto-reset to <span class="term">div</span> if <span class="term">$config["parent"]</span> is empty, <span class="term">body</span>, or an element not in htmLawed's default set of 86 elements.<br /> +<br /> +  <em>Tag transformation</em> is possible for improving XHTML-Strict compliance -- most of the deprecated elements are removed or converted to valid XHTML-Strict ones; see <a href="#s3.3.2">section 3.3.2</a>.<br /> + +<div class="sub-sub-section"><h4> +<a name="s3.3.1" id="s3.3.1"></a><span class="item-no">3.3.1</span>  Handling of comments and CDATA sections +</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  <span class="term">CDATA</span> sections have the format <span class="term"><![CDATA[...anything but not "]]>"...]]></span>, and HTML comments, <span class="term"><!--...anything but not "-->"... --></span>. Neither HTML comments nor <span class="term">CDATA</span> sections can reside inside tags. HTML comments can exist anywhere else, but <span class="term">CDATA</span> sections can exist only where plain text is allowed (e.g., immediately inside <span class="term">td</span> element content but not immediately inside <span class="term">tr</span> element content).<br /> +<br /> +  htmLawed (function <span class="term">hl_cmtcd()</span>) handles HTML comments or <span class="term">CDATA</span> sections depending on the values of <span class="term">$config["comment"]</span> or <span class="term">$config["cdata"]</span>. If <span class="term">0</span>, such markup is not looked for and the text is processed like plain text. If <span class="term">1</span>, it is removed completely. If <span class="term">2</span>, it is preserved but any <span class="term"><</span>, <span class="term">></span> and <span class="term">&</span> inside are changed to entities. 
If <span class="term">3</span>, they are left as such.<br /> +<br /> +  Note that for the last two cases, HTML comments and <span class="term">CDATA</span> sections will always be removed from tag content (function <span class="term">hl_tag()</span>).<br /> +<br /> +  Examples:<br /> +<br /> +  Input:<br /> + +<code class="code">    <!-- home link --><a href="home.htm"><![CDATA[x=&y]]>Home</a></code> +<br /> +  Output (<span class="term">$config["comment"] = 0, $config["cdata"] = 2</span>):<br /> + +<code class="code">    &lt;-- home link --&gt;<a href="home.htm"><![CDATA[x=&amp;y]]>Home</a></code> +<br /> +  Output (<span class="term">$config["comment"] = 1, $config["cdata"] = 2</span>):<br /> + +<code class="code">    <a href="home.htm"><![CDATA[x=&amp;y]]>Home</a></code> +<br /> +  Output (<span class="term">$config["comment"] = 2, $config["cdata"] = 2</span>):<br /> + +<code class="code">    <!-- home link --><a href="home.htm"><![CDATA[x=&amp;y]]>Home</a></code> +<br /> +  Output (<span class="term">$config["comment"] = 2, $config["cdata"] = 1</span>):<br /> + +<code class="code">    <!-- home link --><a href="home.htm">Home</a></code> +<br /> +  Output (<span class="term">$config["comment"] = 3, $config["cdata"] = 3</span>):<br /> + +<code class="code">    <!-- home link --><a href="home.htm"><![CDATA[x=&y]]>Home</a></code> +<br /> +<br /> +  For standard-compliance, comments are given the form <span class="term"><!--comment --></span>, and any <span class="term">--</span> in the content is made <span class="term">-</span>.<br /> +<br /> +  When <span class="term">$config["safe"] = 1</span>, CDATA sections and comments are considered plain text unless <span class="term">$config["comment"]</span> or <span class="term">$config["cdata"]</span> is explicitly specified; see <a href="#s3.6">section 3.6</a>.<br /> + +</div> +<div class="sub-sub-section"><h4> +<a name="s3.3.2" id="s3.3.2"></a><span class="item-no">3.3.2</span>  Tag-transformation for better 
XHTML-Strict +</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  If <span class="term">$config["make_tag_strict"]</span> is set and not <span class="term">0</span>, following non-XHTML-Strict elements (and attributes), even if admin-permitted, are mutated as indicated (element content remains intact; function <span class="term">hl_tag2()</span>):<br /> +<br /> +  *  applet - (based on <span class="term">$config["make_tag_strict"]</span>, unchanged (<span class="term">1</span>) or removed (<span class="term">2</span>))<br /> +  *  center - <span class="term">div style="text-align: center;"</span><br /> +  *  dir - <span class="term">ul</span><br /> +  *  embed - (based on <span class="term">$config["make_tag_strict"]</span>, unchanged (<span class="term">1</span>) or removed (<span class="term">2</span>))<br /> +  *  font (face, size, color) -    <span class="term">span style="font-family: ; font-size: ; color: ;"</span> (size transformation <a href="http://style.cleverchimp.com/font_size_intervals/altintervals.html">reference</a>)<br /> +  *  isindex - (based on <span class="term">$config["make_tag_strict"]</span>, unchanged (<span class="term">1</span>) or removed (<span class="term">2</span>))<br /> +  *  menu - <span class="term">ul</span><br /> +  *  s - <span class="term">span style="text-decoration: line-through;"</span><br /> +  *  strike - <span class="term">span style="text-decoration: line-through;"</span><br /> +  *  u - <span class="term">span style="text-decoration: underline;"</span><br /> +<br /> +  For an element with a pre-existing <span class="term">style</span> attribute value, the extra style properties are appended.<br /> +<br /> +  Example input:<br /> +<br /> + +<code class="code">    <center></code> +<br /> + +<code class="code">     The PHP <s>software</s> script used for this <strike>web-page</strike> web-page is <font style="font-weight: bold " face=arial size='+3' color   =  "red  
">htmLawedTest.php</font>, from <u style= 'color:green'>PHP Labware</u>.</code> +<br /> + +<code class="code">    </center></code> +<br /> +<br /> +  The output:<br /> +<br /> + +<code class="code">    <div style="text-align: center;"></code> +<br /> + +<code class="code">     The PHP <span style="text-decoration: line-through;">software</span> script used for this <span style="text-decoration: line-through;">web-page</span> web-page is <span style="font-weight: bold; font-family: arial; color: red; font-size: 200%;">htmLawedTest.php</span>, from <span style="color:green; text-decoration: underline;">PHP Labware</span>.</code> +<br /> + +<code class="code">    </div></code> +<br /> + +</div> +<div class="sub-section"><h3> +<a name="s3.3.3" id="s3.3.3"></a><span class="item-no">3.3.3</span>  Tag balancing and proper nesting +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  If <span class="term">$config["balance"]</span> is set to <span class="term">1</span>, htmLawed (function <span class="term">hl_bal()</span>) checks and corrects the input to have properly balanced tags and legal element content (i.e., any element nesting should be valid, and plain text may be present only in the content of elements that allow them).<br /> +<br /> +  Depending on the value of <span class="term">$config["keep_bad"]</span> (see <a href="#s2.2">section 2.2</a> and <a href="#s3.3">section 3.3</a>), illegal content may be removed or neutralized to plain text by converting < and > to entities:<br /> +<br /> +  <span class="term">0</span> - remove; this option is available only to maintain Kses-compatibility and should not be used otherwise (see <a href="#s2.6">section 2.6</a>)<br /> +  <span class="term">1</span> - neutralize tags and keep element content<br /> +  <span class="term">2</span> - remove tags but keep element content<br /> +  <span class="term">3</span> and <span class="term">4</span> - like <span class="term">1</span> and 
<span class="term">2</span>, but keep element content only if text (<span class="term">pcdata</span>) is valid in parent element as per specs<br /> +  <span class="term">5</span> and <span class="term">6</span> -  like <span class="term">3</span> and <span class="term">4</span>, but line-breaks, tabs and spaces are left<br /> +<br /> +  Example input (disallowing the <span class="term">p</span> element):<br /> +<br /> + +<code class="code">    <*> Pseudo-tags <*></code> +<br /> + +<code class="code">    <xml>Non-HTML tag xml</xml></code> +<br /> + +<code class="code">    <p></code> +<br /> + +<code class="code">    Disallowed tag p</code> +<br /> + +<code class="code">    </p></code> +<br /> + +<code class="code">    <ul>Bad<li>OK</li></ul></code> +<br /> +<br /> +  The output with <span class="term">$config["keep_bad"] = 1</span>:<br /> +<br /> + +<code class="code">    &lt;*&gt; Pseudo-tags &lt;*&gt;</code> +<br /> + +<code class="code">    &lt;xml&gt;Non-HTML tag xml&lt;/xml&gt;</code> +<br /> + +<code class="code">    &lt;p&gt;</code> +<br /> + +<code class="code">    Disallowed tag p</code> +<br /> + +<code class="code">    &lt;/p&gt;</code> +<br /> + +<code class="code">    <ul>Bad<li>OK</li></ul></code> +<br /> +<br /> +  The output with <span class="term">$config["keep_bad"] = 3</span>:<br /> +<br /> + +<code class="code">    &lt;*&gt; Pseudo-tags &lt;*&gt;</code> +<br /> + +<code class="code">    &lt;xml&gt;Non-HTML tag xml&lt;/xml&gt;</code> +<br /> + +<code class="code">    &lt;p&gt;</code> +<br /> + +<code class="code">    Disallowed tag p</code> +<br /> + +<code class="code">    &lt;/p&gt;</code> +<br /> + +<code class="code">    <ul><li>OK</li></ul></code> +<br /> +<br /> +  The output with <span class="term">$config["keep_bad"] = 6</span>:<br /> +<br /> + +<code class="code">    &lt;*&gt; Pseudo-tags &lt;*&gt;</code> +<br /> + +<code class="code">    Non-HTML tag xml</code> +<br /> +<br /> + +<code class="code">    Disallowed tag p</code> +<br /> 
+<br /> + +<code class="code">    <ul><li>OK</li></ul></code> +<br /> +<br /> +  An option like <span class="term">1</span> is useful, e.g., when a writer previews his submission, whereas one like <span class="term">3</span> is useful before content is finalized and made available to all.<br /> +<br /> +  <strong>Note:</strong> In the example above, unlike <span class="term"><*></span>, <span class="term"><xml></span> gets considered as a tag (even though there is no HTML element named <span class="term">xml</span>). In general, text matching the regular expression pattern <span class="term"><(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?></span> is considered a tag (phrase enclosed by the angled brackets <span class="term"><</span> and <span class="term">></span>, and starting, optionally after a slash, with an alphanumeric word that begins with a letter...).<br /> +<br /> +  Nesting/content rules for each of the 86 elements in htmLawed's default set (see <a href="#s3.3">section 3.3</a>) are defined in function <span class="term">hl_bal()</span>. This means that if a non-standard element besides <span class="term">embed</span> is being permitted through <span class="term">$config["elements"]</span>, the element's tag content will end up getting removed if <span class="term">$config["balance"]</span> is set to <span class="term">1</span>.<br /> +<br /> +  Plain text and/or certain elements nested inside <span class="term">blockquote</span>, <span class="term">form</span>, <span class="term">map</span> and <span class="term">noscript</span> need to be in block-level elements. This point is often missed during manual writing of HTML code. htmLawed attempts to address this during balancing. 
E.g., if the parent container is set as <span class="term">form</span>, the input <span class="term">B:<input type="text" value="b" />C:<input type="text" value="c" /></span> is converted to <span class="term"><div>B:<input type="text" value="b" />C:<input type="text" value="c" /></div></span>.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s3.3.4" id="s3.3.4"></a><span class="item-no">3.3.4</span>  Elements requiring child elements +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  As per specs, the following elements require legal child elements nested inside them:<br /> +<br /> + +<code class="code">    blockquote, dir, dl, form, map, menu, noscript, ol, optgroup, rbc, rtc, ruby, select, table, tbody, tfoot, thead, tr, ul</code> +<br /> +<br /> +  In some cases, the specs stipulate the number and/or the ordering of the child elements. A <span class="term">table</span> can have 0 or 1 <span class="term">caption</span>, <span class="term">tbody</span>, <span class="term">tfoot</span>, and <span class="term">thead</span>, but they must be in this order: <span class="term">caption</span>, <span class="term">thead</span>, <span class="term">tfoot</span>, <span class="term">tbody</span>.<br /> +<br /> +  htmLawed currently does not check for conformance to these rules. 
Note that any non-compliance in this regard will not introduce security vulnerabilities, crash browser applications, or affect the rendering of web-pages.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s3.3.5" id="s3.3.5"></a><span class="item-no">3.3.5</span>  Beautify or compact HTML +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  By default, htmLawed will neither <em>beautify</em> HTML code by formatting it with indentations, etc., nor will it make it compact by removing un-needed white-space. (It does always properly white-space tag content.)<br /> +<br /> +  As per the HTML standards, spaces, tabs and line-breaks in web-pages (except those inside <span class="term">pre</span> elements) are all considered equivalent, and referred to as <em>white-spaces</em>. Browser applications are supposed to consider contiguous white-spaces as just a single space, and to disregard white-spaces trailing opening tags or preceding closing tags. This white-space <em>normalization</em> allows the use of text/code beautifully formatted with indentations and line-spacings for readability. Such <em>pretty</em> HTML can, however, increase the size of web-pages, or make the extraction or scraping of plain text cumbersome.<br /> +<br /> +  With the <span class="term">$config</span> parameter <span class="term">tidy</span>, htmLawed can be used to beautify or compact the input text. Input with just plain text and no HTML markup is also subject to this. 
Besides <span class="term">pre</span>, the <span class="term">script</span> and <span class="term">textarea</span> elements, CDATA sections, and HTML comments are not subjected to the tidying process.<br /> +<br /> +  To <em>compact</em>, use <span class="term">$config["tidy"] = -1</span>; single instances or runs of white-spaces are replaced with a single space, and white-spaces trailing and leading open and closing tags, respectively, are removed.<br /> +<br /> +  To <em>beautify</em>, <span class="term">$config["tidy"]</span> is set as <span class="term">1</span>, or for customized tidying, as a string like <span class="term">2s2n</span>. The <span class="term">s</span> or <span class="term">t</span> character specifies the use of spaces or tabs for indentation. The first and third characters, any of the digits 0-9, specify the number of spaces or tabs per indentation, and any parental lead spacing (extra indenting of the whole block of input text). The <span class="term">r</span> and <span class="term">n</span> characters are used to specify line-break characters: <span class="term">n</span> for <span class="term">\n</span> (Unix/Mac OS X line-breaks), <span class="term">rn</span> or <span class="term">nr</span> for <span class="term">\r\n</span> (Windows/DOS line-breaks), or <span class="term">r</span> for <span class="term">\r</span>.<br /> +<br /> +  The <span class="term">$config["tidy"]</span> value of <span class="term">1</span> is equivalent to <span class="term">2s0n</span>. 
Other <span class="term">$config["tidy"]</span> values are read loosely: a value of <span class="term">4</span> is equivalent to <span class="term">4s0n</span>; <span class="term">t2</span>, to <span class="term">1t2n</span>; <span class="term">s</span>, to <span class="term">2s0n</span>; <span class="term">2TR</span>, to <span class="term">2t0r</span>; <span class="term">T1</span>, to <span class="term">1t1n</span>; <span class="term">nr3</span>, to <span class="term">3s0nr</span>, and so on. Except in the indentations and line-spacings, runs of white-spaces are replaced with a single space during beautification.<br /> +<br /> +  Input formatting using <span class="term">$config["tidy"]</span> is not recommended when input text has mixed markup (like HTML + PHP).<br /> + +</div> +</div> +<div class="sub-section"><h3> +<a name="s3.4" id="s3.4"></a><span class="item-no">3.4</span>  Attributes +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  htmLawed will only permit attributes described in the HTML specs (including deprecated ones). It also permits some attributes for use with the <span class="term">embed</span> element (the non-standard <span class="term">embed</span> element is supported in htmLawed because of its widespread use), and the <span class="term">xml:space</span> attribute (valid only in XHTML 1.1). A list of such 111 attributes and the elements they are allowed in is in <a href="#s5.2">section 5.2</a>.<br /> +<br /> +  When <span class="term">$config["deny_attribute"]</span> is not set, or set to <span class="term">0</span>, or empty (<span class="term">""</span>), all the 111 attributes are permitted. Otherwise, <span class="term">$config["deny_attribute"]</span> can be set as a list of comma-separated names of the denied attributes. 
<span class="term">on*</span> can be used to refer to the group of potentially dangerous, script-accepting attributes: <span class="term">onblur</span>, <span class="term">onchange</span>, <span class="term">onclick</span>, <span class="term">ondblclick</span>, <span class="term">onfocus</span>, <span class="term">onkeydown</span>, <span class="term">onkeypress</span>, <span class="term">onkeyup</span>, <span class="term">onmousedown</span>, <span class="term">onmousemove</span>, <span class="term">onmouseout</span>, <span class="term">onmouseover</span>, <span class="term">onmouseup</span>, <span class="term">onreset</span>, <span class="term">onselect</span> and <span class="term">onsubmit</span>.<br /> +<br /> +  Note that attributes specified in <span class="term">$config["deny_attribute"]</span> are denied globally, for all elements. To deny attributes for only specific elements, <span class="term">$spec</span> (see <a href="#s2.3">section 2.3</a>) can be used. <span class="term">$spec</span> can also be used to element-specifically permit an attribute otherwise denied through <span class="term">$config["deny_attribute"]</span>.<br /> +<br /> +  With <span class="term">$config["safe"] = 1</span> (<a href="#s3.6">section 3.6</a>), the <span class="term">on*</span> attributes are automatically disallowed.<br /> +<br /> +  <strong>Note</strong>: To deny all but a few attributes globally, a simpler way to specify <span class="term">$config["deny_attribute"]</span> would be to use the notation <span class="term">* -attribute1 -attribute2 ...</span>. Thus, a value of <span class="term">* -title -href</span> implies that except <span class="term">href</span> and <span class="term">title</span> (where allowed as per standards) all other attributes are to be removed. 
With this notation, the value for the parameter <span class="term">safe</span> (<a href="#s3.6">section 3.6</a>) will have no effect on <span class="term">deny_attribute</span>.<br /> +<br /> +  htmLawed (function <span class="term">hl_tag()</span>) also:<br /> +<br /> +  *  Lower-cases attribute names<br /> +  *  Removes duplicate attributes (last one stays)<br /> +  *  Gives attributes the form <span class="term">name="value"</span> and single-spaces them, removing unnecessary white-spacing<br /> +  *  Provides <em>required</em> attributes (see <a href="#s3.4.1">section 3.4.1</a>)<br /> +  *  Double-quotes values and escapes any <span class="term">"</span> inside them<br /> +  *  Replaces the possibly dangerous soft-hyphen characters (hexadecimal code-point <span class="term">ad</span>) in the values with spaces<br /> +  *  Allows custom function to additionally filter/modify attribute values (see <a href="#s3.4.9">section 3.4.9</a>)<br /> + +<div class="sub-sub-section"><h4> +<a name="s3.4.1" id="s3.4.1"></a><span class="item-no">3.4.1</span>  Auto-addition of XHTML-required attributes +</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  If indicated attributes for the following elements are found missing, htmLawed (function <span class="term">hl_tag()</span>) will add them (with values same as attribute names unless indicated otherwise below):<br /> +<br /> +  *  area - alt (<span class="term">area</span>)<br /> +  *  area, img - src, alt (<span class="term">image</span>)<br /> +  *  bdo - dir (<span class="term">ltr</span>)<br /> +  *  form - action<br /> +  *  map - name<br /> +  *  optgroup - label<br /> +  *  param - name<br /> +  *  script - type (<span class="term">text/javascript</span>)<br /> +  *  textarea - rows (<span class="term">10</span>), cols (<span class="term">50</span>)<br /> +<br /> +  Additionally, with <span class="term">$config["xml:lang"]</span> set to <span class="term">1</span> or <span 
class="term">2</span>, if the <span class="term">lang</span> but not the <span class="term">xml:lang</span> attribute is declared, then the latter is added too, with a value copied from that of <span class="term">lang</span>. This is for better standard-compliance. With <span class="term">$config["xml:lang"]</span> set to <span class="term">2</span>, the <span class="term">lang</span> attribute is removed (XHTML 1.1 specs).<br /> +<br /> +  Note that the <span class="term">name</span> attribute for <span class="term">map</span>, invalid in XHTML 1.1, is also transformed if required -- see <a href="#s3.4.6">section 3.4.6</a>.<br /> + +</div> +<div class="sub-sub-section"><h4> +<a name="s3.4.2" id="s3.4.2"></a><span class="item-no">3.4.2</span>  Duplicate/invalid <span class="term">id</span> values +</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  If <span class="term">$config["unique_ids"]</span> is <span class="term">1</span>, htmLawed (function <span class="term">hl_tag()</span>) removes <span class="term">id</span> attributes with values that are not XHTML-compliant (must begin with a letter and can contain letters, digits, <span class="term">:</span>, <span class="term">.</span>, <span class="term">-</span> and <span class="term">_</span>) or duplicate. If <span class="term">$config["unique_ids"]</span> is a word, any duplicate but otherwise valid value will be appropriately prefixed with the word to ensure its uniqueness. The word should begin with a letter and should contain only letters, numbers, <span class="term">:</span>, <span class="term">.</span>, <span class="term">_</span> and <span class="term">-</span>.<br /> +<br /> +  Even if multiple inputs need to be filtered (through multiple calls to htmLawed), htmLawed ensures uniqueness of <span class="term">id</span> values as it uses a global variable (<span class="term">$GLOBALS["hl_Ids"]</span> array). 
Further, an admin can restrict the use of certain <span class="term">id</span> values by presetting this variable before htmLawed is called into use. E.g.:<br /> +<br /> + +<code class="code">    $GLOBALS['hl_Ids'] = array('top'=>1, 'bottom'=>1, 'myform'=>1); // id values not allowed in input</code> +<br /> + +<code class="code">    $processed = htmLawed($text); // filter input</code> +<br /> + +</div> +<div class="sub-sub-section"><h4> +<a name="s3.4.3" id="s3.4.3"></a><span class="item-no">3.4.3</span>  URL schemes (protocols) and scripts in attribute values +</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  htmLawed edits attributes that take URLs as values if they are found to contain un-permitted schemes. E.g., if the <span class="term">afp</span> scheme is not permitted, then <span class="term"><a href="afp://domain.org"></span> becomes <span class="term"><a href="denied:afp://domain.org"></span>, and if Javascript is not permitted <span class="term"><a onclick="javascript:xss();"></span> becomes <span class="term"><a onclick="denied:javascript:xss();"></span>.<br /> +<br /> +  By default htmLawed permits these schemes in URLs for the <span class="term">href</span> attribute:<br /> +<br /> + +<code class="code">    aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet</code> +<br /> +<br /> +  Also, only <span class="term">file</span>, <span class="term">http</span> and <span class="term">https</span> are permitted in attributes whose names start with <span class="term">o</span> (like <span class="term">onmouseover</span>), and in these attributes that accept URLs:<br /> +<br /> + +<code class="code">    action, cite, classid, codebase, data, href, longdesc, model, pluginspage, pluginurl, src, style, usemap</code> +<br /> +<br /> +  These default sets are used when <span class="term">$config["schemes"]</span> is not set (see <a href="#s2.2">section 2.2</a>). 
To over-ride the defaults, <span class="term">$config["schemes"]</span> is defined as a string of semi-colon-separated sub-strings of type <span class="term">attribute: comma-separated schemes</span>. E.g., <span class="term">href: mailto, http, https; onclick: javascript; src: http, https</span>. For unspecified attributes, <span class="term">file</span>, <span class="term">http</span> and <span class="term">https</span> are permitted. This can be changed by passing schemes for <span class="term">*</span> in <span class="term">$config["schemes"]</span>. E.g., <span class="term">href: mailto, http, https; *: http, https</span>.<br /> +<br /> +  <span class="term">*</span> can be put in the list of schemes to permit all protocols. E.g., <span class="term">style: *; img: http, https</span> results in protocols not being checked in <span class="term">style</span> attribute values. However, in such cases, any relative-to-absolute URL conversion, or vice versa, (<a href="#s3.4.4">section 3.4.4</a>) is not done.<br /> +<br /> +  Thus, <em>to allow Javascript</em>, one can set <span class="term">$config["schemes"]</span> as <span class="term">href: mailto, http, https; *: http, https, javascript</span>, or <span class="term">href: mailto, http, https, javascript; *: http, https, javascript</span>, or <span class="term">*: *</span>, and so on.<br /> +<br /> +  As a side-note, one may find <span class="term">style: *</span> useful as URLs in <span class="term">style</span> attributes can be specified in a variety of ways, and the patterns that htmLawed uses to identify URLs may mistakenly identify non-URL text.<br /> +<br /> +  <strong>Note</strong>: If URL-accepting attributes other than those listed above are being allowed, then the scheme will not be checked unless the attribute name contains the string <span class="term">src</span> (e.g., <span class="term">dynsrc</span>) or starts with <span class="term">o</span> (e.g., <span class="term">onbeforecopy</span>).<br /> 
+<br /> +  With <span class="term">$config["safe"] = 1</span>, all URLs are disallowed in the <span class="term">style</span> attribute values.<br /> + +</div> +<div class="sub-sub-section"><h4> +<a name="s3.4.4" id="s3.4.4"></a><span class="item-no">3.4.4</span>  Absolute & relative URLs in attribute values +</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  htmLawed can make absolute URLs in attributes like <span class="term">href</span> relative (<span class="term">$config["abs_url"]</span> is <span class="term">-1</span>), and vice versa (<span class="term">$config["abs_url"]</span> is <span class="term">1</span>). URLs in scripts are not considered for this, and neither are URLs like <span class="term">#section_6</span> (fragment), <span class="term">?name=Tim#show</span> (starting with query string), and <span class="term">;var=1?name=Tim#show</span> (starting with parameters). Further, this requires that <span class="term">$config["base_url"]</span> be set properly, with the <span class="term">://</span> and a trailing slash (<span class="term">/</span>), with no query string, etc. E.g., <span class="term">file:///D:/page/</span>, <span class="term">https://abc.com/x/y/</span>, or <span class="term">http://localhost/demo/</span> are okay, but <span class="term">file:///D:/page/?help=1</span>, <span class="term">abc.com/x/y/</span> and <span class="term">http://localhost/demo/index.htm</span> are not.<br /> +<br /> +  For making absolute URLs relative, only those URLs that have the <span class="term">$config["base_url"]</span> string at the beginning are converted. 
E.g., with <span class="term">$config["base_url"] = "https://abc.com/x/y/"</span>, <span class="term">https://abc.com/x/y/a.gif</span> and <span class="term">https://abc.com/x/y/z/b.gif</span> become <span class="term">a.gif</span> and <span class="term">z/b.gif</span> respectively, while <span class="term">https://abc.com/x/c.gif</span> is not changed.<br /> +<br /> +  When making relative URLs absolute, only values for scheme, network location (host-name) and path values in the base URL are inherited. See <a href="#s5.5">section 5.5</a> for more about the URL specification as per RFC <a href="http://www.ietf.org/rfc/rfc1808.txt">1808</a>.<br /> + +</div> +<div class="sub-sub-section"><h4> +<a name="s3.4.5" id="s3.4.5"></a><span class="item-no">3.4.5</span>  Lower-cased, standard attribute values +</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  Optionally, for standard-compliance, htmLawed (function <span class="term">hl_tag()</span>) lower-cases standard attribute values to give, e.g., <span class="term">input type="password"</span> instead of <span class="term">input type="Password"</span>, if <span class="term">$config["lc_std_val"]</span> is <span class="term">1</span>. 
Attribute values matching those listed below for any of the elements (plus those for the <span class="term">type</span> attribute of <span class="term">button</span> or <span class="term">input</span>) are lower-cased:<br /> +<br /> + +<code class="code">    all, baseline, bottom, button, center, char, checkbox, circle, col, colgroup, cols, data, default, file, get, groups, hidden, image, justify, left, ltr, middle, none, object, password, poly, post, preserve, radio, rect, ref, reset, right, row, rowgroup, rows, rtl, submit, text, top</code> +<br /> +<br /> + +<code class="code">    a, area, bdo, button, col, form, img, input, object, option, optgroup, param, script, select, table, td, tfoot, th, thead, tr, xml:space</code> +<br /> +<br /> +  The following <em>empty</em> (<em>minimized</em>) attributes are always assigned lower-cased values (same as the names):<br /> +<br /> + +<code class="code">    checked, compact, declare, defer, disabled, ismap, multiple, nohref, noresize, noshade, nowrap, readonly, selected</code> +<br /> + +</div> +<div class="sub-sub-section"><h4> +<a name="s3.4.6" id="s3.4.6"></a><span class="item-no">3.4.6</span>  Transformation of deprecated attributes +</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  If <span class="term">$config["no_deprecated_attr"]</span> is <span class="term">0</span>, then deprecated attributes (see appendix in <a href="#s5.2">section 5.2</a>) are removed and, in most cases, their values are transformed to CSS style properties and added to the <span class="term">style</span> attributes (function <span class="term">hl_tag()</span>). 
Except for <span class="term">bordercolor</span> for <span class="term">table</span>, <span class="term">tr</span> and <span class="term">td</span>, the scores of proprietary attributes that were never part of any cross-browser standard are not supported.<br /> +<br /> +  <strong>Note</strong>: The attribute <span class="term">target</span> for <span class="term">a</span> is allowed even though it is not in XHTML 1.0 specs. This is because of the attribute's wide-spread use and browser-support, and because the attribute is valid in XHTML 1.1 onwards.<br /> +<br /> +  *  align - for <span class="term">img</span> with value of <span class="term">left</span> or <span class="term">right</span>, becomes, e.g., <span class="term">float: left</span>; for <span class="term">div</span> and <span class="term">table</span> with value <span class="term">center</span>, becomes <span class="term">margin: auto</span>; all others become, e.g., <span class="term">text-align: right</span><br /> +<br /> +  *  bgcolor - E.g., <span class="term">bgcolor="#ffffff"</span> becomes <span class="term">background-color: #ffffff</span><br /> +  *  border - E.g., <span class="term">border="1"</span> becomes <span class="term">border: 1px</span><br /> +  *  bordercolor - E.g., <span class="term">bordercolor=#999999</span> becomes <span class="term">border-color: #999999;</span><br /> +  *  compact - <span class="term">font-size: 85%</span><br /> +  *  clear - E.g., <span class="term">clear="all"</span> becomes <span class="term">clear: both</span><br /> +<br /> +  *  height - E.g., <span class="term">height= "10"</span> becomes <span class="term">height: 10px</span> and <span class="term">height="*"</span> becomes <span class="term">height: auto</span><br /> +<br /> +  *  hspace - E.g., <span class="term">hspace="10"</span> becomes <span class="term">margin-left: 10px; margin-right: 10px</span><br /> +  *  language - <span class="term">language="VBScript"</span> becomes <span 
class="term">type="text/vbscript"</span><br /> +  *  name - E.g., <span class="term">name="xx"</span> becomes <span class="term">id="xx"</span><br /> +  *  noshade - <span class="term">border-style: none; border: 0; background-color: gray; color: gray</span><br /> +  *  nowrap - <span class="term">white-space: nowrap</span><br /> +  *  size - E.g., <span class="term">size="10"</span> becomes <span class="term">height: 10px</span><br /> +  *  start - removed<br /> +  *  type - E.g., <span class="term">type="i"</span> becomes <span class="term">list-style-type: lower-roman</span><br /> +  *  value - removed<br /> +  *  vspace - E.g., <span class="term">vspace="10"</span> becomes <span class="term">margin-top: 10px; margin-bottom: 10px</span><br /> +  *  width - like <span class="term">height</span><br /> +<br /> +  Example input:<br /> +<br /> + +<code class="code">    <img src="j.gif" alt="image" name="dad's" /><img src="k.gif" alt="image" id="dad_off" name="dad" /></code> +<br /> + +<code class="code">    <br clear="left" /></code> +<br /> + +<code class="code">    <hr noshade size="1" /></code> +<br /> + +<code class="code">    <img name="img" src="i.gif" align="left" alt="image" hspace="10" vspace="10" width="10em" height="20" border="1" style="padding:5px;" /></code> +<br /> + +<code class="code">    <table width="50em" align="center" bgcolor="red"></code> +<br /> + +<code class="code">     <tr></code> +<br /> + +<code class="code">      <td width="20%"></code> +<br /> + +<code class="code">       <div align="center"></code> +<br /> + +<code class="code">        <h3 align="right">Section</h3></code> +<br /> + +<code class="code">        <p align="right">Para</p></code> +<br /> + +<code class="code">        <ol type="a" start="e"><li value="x">First item</li></ol></code> +<br /> + +<code class="code">       </div></code> +<br /> + +<code class="code">      </td></code> +<br /> + +<code class="code">      <td width="*"></code> +<br /> + +<code class="code">       
<ol type="1"><li>First item</li></ol></code> +<br /> + +<code class="code">      </td></code> +<br /> + +<code class="code">     </tr></code> +<br /> + +<code class="code">    </table></code> +<br /> + +<code class="code">    <br clear="all" /></code> +<br /> +<br /> +  And the output with <span class="term">$config["no_deprecated_attr"] = 1</span>:<br /> +<br /> + +<code class="code">    <img src="j.gif" alt="image" /><img src="k.gif" alt="image" id="dad_off" /></code> +<br /> + +<code class="code">    <br style="clear: left;" /></code> +<br /> + +<code class="code">    <hr style="border-style: none; border: 0; background-color: gray; color: gray; size: 1px;" /></code> +<br /> + +<code class="code">    <img src="i.gif" alt="image" width="10em" height="20" style="padding:5px; float: left; margin-left: 10px; margin-right: 10px; margin-top: 10px; margin-bottom: 10px; border: 1px;" id="img" /></code> +<br /> + +<code class="code">    <table width="50em" style="margin: auto; background-color: red;"></code> +<br /> + +<code class="code">     <tr></code> +<br /> + +<code class="code">      <td style="width: 20%;"></code> +<br /> + +<code class="code">       <div style="margin: auto;"></code> +<br /> + +<code class="code">        <h3 style="text-align: right;">Section</h3></code> +<br /> + +<code class="code">        <p style="text-align: right;">Para</p></code> +<br /> + +<code class="code">        <ol style="list-style-type: lower-latin;"><li>First item</li></ol></code> +<br /> + +<code class="code">       </div></code> +<br /> + +<code class="code">      </td></code> +<br /> + +<code class="code">      <td style="width: auto;"></code> +<br /> + +<code class="code">       <ol style="list-style-type: decimal;"><li>First item</li></ol></code> +<br /> + +<code class="code">      </td></code> +<br /> + +<code class="code">     </tr></code> +<br /> + +<code class="code">    </table></code> +<br /> + +<code class="code">    <br style="clear: both;" /></code> +<br /> +<br /> + 
 For <span class="term">lang</span>, deprecated in XHTML 1.1, transformation is taken care of through <span class="term">$config["xml:lang"]</span>; see <a href="#s3.4.1">section 3.4.1</a>.<br /> +<br /> +  The attribute <span class="term">name</span> is deprecated in <span class="term">form</span>, <span class="term">iframe</span>, and <span class="term">img</span>, and is replaced with <span class="term">id</span> if an <span class="term">id</span> attribute doesn't exist and if the <span class="term">name</span> value is appropriate for <span class="term">id</span>. For such replacements for <span class="term">a</span> and <span class="term">map</span>, for which the <span class="term">name</span> attribute is deprecated in XHTML 1.1, <span class="term">$config["no_deprecated_attr"]</span> should be set to <span class="term">2</span> (when set to <span class="term">1</span>, for these two elements, the <span class="term">name</span> attribute is retained).<br /> + +</div> +<div class="sub-section"><h3> +<a name="s3.4.7" id="s3.4.7"></a><span class="item-no">3.4.7</span>  Anti-spam & <span class="term">href</span> +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  htmLawed (function <span class="term">hl_tag()</span>) can check the <span class="term">href</span> attribute values (link addresses) as an anti-spam (email or link spam) measure.<br /> +<br /> +  If <span class="term">$config["anti_mail_spam"]</span> is not <span class="term">0</span>, the <span class="term">@</span> of email addresses in <span class="term">href</span> values like <span class="term">mailto:a@b.com</span> is replaced with text specified by <span class="term">$config["anti_mail_spam"]</span>. 
The text should be of a form that makes it clear to others that the address needs to be edited before a mail is sent; e.g., <span class="term"><remove_this_antispam>@</span> (makes the example address <span class="term">a<remove_this_antispam>@b.com</span>).<br /> +<br /> +  For regular links, one can choose to have a <span class="term">rel</span> attribute with <span class="term">nofollow</span> in its value (which tells some search engines to not follow a link). This can discourage link spammers. Additionally, or as an alternative, one can choose to empty the <span class="term">href</span> value altogether (disable the link).<br /> +<br /> +  For use of these options, <span class="term">$config["anti_link_spam"]</span> should be set as an array with values <span class="term">regex1</span> and <span class="term">regex2</span>, both or one of which can be empty (like <span class="term">array("", "regex2")</span>) to indicate that that option is not to be used. Otherwise, <span class="term">regex1</span> or <span class="term">regex2</span> should be PHP- and PCRE-compatible regular expression patterns: <span class="term">href</span> values will be matched against them and those matching the pattern will accordingly be treated.<br /> +<br /> +  Note that the regular expressions should have <em>delimiters</em>, and be well-formed and preferably fast. 
Absolute efficiency/accuracy is often not needed.<br /> +<br /> +  An example, to have a <span class="term">rel</span> attribute with <span class="term">nofollow</span> for all links, and to disable links that do not point to domains <span class="term">abc.com</span> and <span class="term">xyz.org</span>:<br /> +<br /> + +<code class="code">    $config["anti_link_spam"] = array('`.`', '`://\W*(?!(abc\.com|xyz\.org))`');</code> +<br /> + +</div> +<div class="sub-section"><h3> +<a name="s3.4.8" id="s3.4.8"></a><span class="item-no">3.4.8</span>  Inline style properties +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  htmLawed can check URL schemes and dynamic expressions (to guard against Javascript, etc., script-based insecurities) in inline CSS style property values in the <span class="term">style</span> attributes. (CSS properties like <span class="term">background-image</span> that accept URLs in their values are noted in <a href="#s5.3">section 5.3</a>.) Dynamic CSS expressions that allow scripting in the IE browser, and can be a vulnerability, can be removed from property values by setting <span class="term">$config["css_expression"]</span> to <span class="term">1</span> (default setting).<br /> +<br /> +  <strong>Note</strong>: Because of the various ways of representing characters in attribute values (URL-escapement, entitification, etc.), htmLawed might alter the values of the <span class="term">style</span> attribute values, and may even falsely identify dynamic CSS expressions and URL schemes in them. If this is an important issue, checking of URLs and dynamic expressions can be turned off (<span class="term">$config["schemes"] = "...style:*..."</span>, see <a href="#s3.4.3">section 3.4.3</a>, and <span class="term">$config["css_expression"] = 0</span>). 
Alternately, admins can use their own custom function for finer handling of <span class="term">style</span> values through the <span class="term">hook_tag</span> parameter (see <a href="#s3.4.9">section 3.4.9</a>).<br /> +<br /> +  It is also possible to have htmLawed let through any <span class="term">style</span> value by setting <span class="term">$config["style_pass"]</span> to <span class="term">1</span>.<br /> +<br /> +  As such, it is better to set up a CSS file with class declarations, disallow the <span class="term">style</span> attribute, set a <span class="term">$spec</span> rule (see <a href="#s2.3">section 2.3</a>) for <span class="term">class</span> for the <span class="term">oneof</span> or <span class="term">match</span> parameter, and ask writers to make use of the <span class="term">class</span> attribute.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s3.4.9" id="s3.4.9"></a><span class="item-no">3.4.9</span>  Hook function for tag content +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  It is possible to utilize a custom hook function to alter the tag content htmLawed has finalized (i.e., after it has checked/corrected for required attributes, transformed attributes, lower-cased attribute names, etc.).<br /> +<br /> +  When <span class="term">$config</span> parameter <span class="term">hook_tag</span> is set to the name of a function, htmLawed (function <span class="term">hl_tag()</span>) will pass on the element name, and the <em>finalized</em> attribute name-value pairs as array elements to the function. 
The function is expected to return the full opening tag string like <span class="term"><element_name attribute_1_name="attribute_1_value"...></span> (for empty elements like <span class="term">img</span> and <span class="term">input</span>, the element-closing slash <span class="term">/</span> should also be included).<br /> +<br /> +  This is a <strong>powerful functionality</strong> that can be exploited for various objectives: consolidate-and-convert inline <span class="term">style</span> attributes to <span class="term">class</span>, convert <span class="term">embed</span> elements to <span class="term">object</span>, permit only one <span class="term">caption</span> element in a <span class="term">table</span> element, disallow embedding of certain types of media, <strong>inject HTML</strong>, use <a href="http://csstidy.sourceforge.net">CSSTidy</a> to sanitize <span class="term">style</span> attribute values, etc.<br /> +<br /> +  As an example, the custom hook code below can be used to force a series of specifically ordered <span class="term">id</span> attributes on all elements, and a specific <span class="term">param</span> element inside all <span class="term">object</span> elements:<br /> +<br /> + +<code class="code">    function my_tag_function($element, $attribute_array){</code> +<br /> + +<code class="code">      static $id = 0;</code> +<br /> + +<code class="code">      // Remove any duplicate element</code> +<br /> + +<code class="code">      if($element == 'param' && isset($attribute_array['allowscriptaccess'])){</code> +<br /> + +<code class="code">        return '';</code> +<br /> + +<code class="code">      }</code> +<br /> +<br /> + +<code class="code">      $new_element = '';</code> +<br /> +<br /> + +<code class="code">      // Force a serialized ID number</code> +<br /> + +<code class="code">      $attribute_array['id'] = 'my_'. 
$id;</code> +<br /> + +<code class="code">      ++$id;</code> +<br /> +<br /> + +<code class="code">      // Inject param for allowscriptaccess</code> +<br /> + +<code class="code">      if($element == 'object'){</code> +<br /> + +<code class="code">        $new_element = '<param id="my_'. $id. '" allowscriptaccess="never" />';</code> +<br /> + +<code class="code">        ++$id;</code> +<br /> + +<code class="code">      }</code> +<br /> +<br /> + +<code class="code">      $string = '';</code> +<br /> + +<code class="code">      foreach($attribute_array as $k=>$v){</code> +<br /> + +<code class="code">        $string .= " {$k}=\"{$v}\"";</code> +<br /> + +<code class="code">      }</code> +<br /> + +<code class="code">      return "<{$element}{$string}". (in_array($element, array('area', 'br', 'col', 'embed', 'hr', 'img', 'input', 'isindex', 'param')) ? ' /' : ''). '>'. $new_element;</code> +<br /> + +<code class="code">    }</code> +<br /> +<br /> +  The <span class="term">hook_tag</span> parameter is different from the <span class="term">hook</span> parameter (<a href="#s3.7">section 3.7</a>).<br /> +<br /> +  Snippets of hook function code developed by others may be available on the <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">htmLawed</a> website.<br /> + +</div> +</div> +<div class="sub-section"><h3> +<a name="s3.5" id="s3.5"></a><span class="item-no">3.5</span>  Simple configuration directive for most valid XHTML +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  If <span class="term">$config["valid_xhtml"]</span> is set to <span class="term">1</span>, some relevant <span class="term">$config</span> parameters (indicated by <span class="term">~</span> in <a href="#s2.2">section 2.2</a>) are auto-adjusted. This allows one to pass the <span class="term">$config</span> argument with a simpler value. 
If a value for a parameter auto-set through <span class="term">valid_xhtml</span> is still manually provided, then that value will over-ride the auto-set value.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s3.6" id="s3.6"></a><span class="item-no">3.6</span>  Simple configuration directive for most <em>safe</em> HTML +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  <em>Safe</em> HTML refers to HTML that is restricted to reduce the vulnerability to scripting attacks (such as XSS) based on HTML code which otherwise may still be legal and compliant with the HTML standard specs. When elements such as <span class="term">script</span> and <span class="term">object</span>, and attributes such as <span class="term">onmouseover</span> and <span class="term">style</span> are allowed in the input text, an input writer can introduce malevolent HTML code. Note that what is considered <span class="term">safe</span> depends on the nature of the web application and the trust-level accorded to its users.<br /> +<br /> +  htmLawed allows an admin to use <span class="term">$config["safe"]</span> to auto-adjust multiple <span class="term">$config</span> parameters (such as <span class="term">elements</span> which declares the allowed element-set), which otherwise would have to be manually set. The relevant parameters are indicated by <span class="term">"</span> in <a href="#s2.2">section 2.2</a>. Thus, one can pass the <span class="term">$config</span> argument with a simpler value.<br /> +<br /> +  With the value of <span class="term">1</span>, htmLawed considers <span class="term">CDATA</span> sections and HTML comments as plain text, and prohibits the <span class="term">applet</span>, <span class="term">embed</span>, <span class="term">iframe</span>, <span class="term">object</span> and <span class="term">script</span> elements, and the <span class="term">on*</span> attributes like <span class="term">onclick</span>. 
( There are <span class="term">$config</span> parameters like <span class="term">css_expression</span> that are not affected by the value set for <span class="term">safe</span> but whose default values still contribute towards a more <em>safe</em> output.) Further, URLs with schemes (see <a href="#s3.4.3">section 3.4.3</a>) are neutralized so that, e.g., <span class="term">style="moz-binding:url(http://danger)"</span> becomes <span class="term">style="moz-binding:url(denied:http://danger)"</span> while <span class="term">style="moz-binding:url(ok)"</span> remains intact.<br /> +<br /> +  Admins, however, may still want to completely deny the <span class="term">style</span> attribute, e.g., with code like<br /> +<br /> + +<code class="code">    $processed = htmLawed($text, array('safe'=>1, 'deny_attribute'=>'style'));</code> +<br /> +<br /> +  If a value for a parameter auto-set through <span class="term">safe</span> is still manually provided, then that value can over-ride the auto-set value. 
E.g., with <span class="term">$config["safe"] = 1</span> and <span class="term">$config["elements"] = "*+script"</span>, <span class="term">script</span>, but not <span class="term">applet</span>, is allowed.<br /> +<br /> +  A page illustrating the efficacy of htmLawed's anti-XSS abilities with <span class="term">safe</span> set to <span class="term">1</span> against XSS vectors listed by <a href="http://ha.ckers.org/xss.html">RSnake</a> may be available <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/rsnake/RSnakeXSSTest.htm">here</a>.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s3.7" id="s3.7"></a><span class="item-no">3.7</span>  Using a hook function +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  If <span class="term">$config["hook"]</span> is not set to <span class="term">0</span>, then htmLawed will allow preliminarily processed input to be altered by a hook function named by <span class="term">$config["hook"]</span> before starting the main work (but after handling of characters, entities, HTML comments and <span class="term">CDATA</span> sections -- see code for function <span class="term">htmLawed()</span>).<br /> +<br /> +  The hook function also allows one to alter the <em>finalized</em> values of <span class="term">$config</span> and <span class="term">$spec</span>.<br /> +<br /> +  Note that the <span class="term">hook</span> parameter is different from the <span class="term">hook_tag</span> parameter (<a href="#s3.4.9">section 3.4.9</a>).<br /> +<br /> +  Snippets of hook function code developed by others may be available on the <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">htmLawed</a> website.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s3.8" id="s3.8"></a><span class="item-no">3.8</span>  Obtaining <em>finalized</em> parameter values +</h3><span class="totop"><a href="#peak">(to top)</a></span><br 
style="clear: both;" /> +<br /> +  htmLawed can assign the <em>finalized</em> <span class="term">$config</span> and <span class="term">$spec</span> values to a variable named by <span class="term">$config["show_setting"]</span>. The variable, made global by htmLawed, is set as an array with three keys: <span class="term">config</span>, with the <span class="term">$config</span> value, <span class="term">spec</span>, with the <span class="term">$spec</span> value, and <span class="term">time</span>, with a value that is the Unix time (the output of PHP's <span class="term">microtime()</span> function) when the value was assigned. Admins should use a PHP-compliant variable name (e.g., one that does not begin with a numerical digit) that does not conflict with variable names in their non-htmLawed code.<br /> +<br /> +  The values, which are also post-hook function (if any), can be used to auto-generate information (on, e.g., the elements that are permitted) for input writers.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s3.9" id="s3.9"></a><span class="item-no">3.9</span>  Retaining non-HTML tags in input with mixed markup +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  htmLawed does not remove certain characters that, though valid, are nevertheless discouraged in HTML documents as per the specs (see <a href="#s5.1">section 5.1</a>). This can be utilized to deal with input that contains mixed markup. Input that may have HTML markup as well as some other markup that is based on the <span class="term"><</span>, <span class="term">></span> and <span class="term">&</span> characters is considered to have mixed markup. The non-HTML markup can be rather proprietary (like markup for emoticons/smileys), or standard (like MathML or SVG). 
Or it can be programming code meant for execution/evaluation (such as embedded PHP code).<br /> +<br /> +  To deal with such mixed markup, the input text can be pre-processed to hide the non-HTML markup by specifically replacing the <span class="term"><</span>, <span class="term">></span> and <span class="term">&</span> characters with some of the HTML-discouraged characters (see <a href="#s3.1.2">section 3.1.2</a>). Post-htmLawed processing, the replacements are reverted.<br /> +<br /> +  An example (mixed HTML and PHP code in input text):<br /> +<br /> + +<code class="code">    $text = preg_replace('`<\?php(.+?)\?>`sm', "\x83?php\\1?\x84", $text);</code> +<br /> + +<code class="code">    $processed = htmLawed($text);</code> +<br /> + +<code class="code">    $processed = preg_replace('`\x83\?php(.+?)\?\x84`sm', '<?php$1?>', $processed);</code> +<br /> +<br /> +  This code will not work if <span class="term">$config["clean_ms_char"]</span> is set to <span class="term">1</span> (<a href="#s3.1">section 3.1</a>), in which case one should instead deploy a hook function (<a href="#s3.7">section 3.7</a>). 
(htmLawed internally uses certain control characters, code-points <span class="term">1</span> to <span class="term">7</span>, and use of these characters as markers in the logic of hook functions may cause issues.)<br /> +<br /> +  Admins may also be able to use <span class="term">$config["and_mark"]</span> to deal with such mixed markup; see <a href="#s3.2">section 3.2</a>.<br /> + +</div> +</div> +<div class="section"><h2> +<a name="s4" id="s4"></a><span class="item-no">4</span>  Other +</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<div class="sub-section"><h3> +<a name="s4.1" id="s4.1"></a><span class="item-no">4.1</span>  Support +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  A careful re-reading of this documentation will very likely answer your questions.<br /> +<br /> +  Software updates and forum-based community-support may be found at <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed</a>. 
For general PHP issues (not htmLawed-specific), support may be found through internet searches and at <a href="http://php.net">http://php.net</a>.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s4.2" id="s4.2"></a><span class="item-no">4.2</span>  Known issues +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  See <a href="#s2.8">section 2.8</a>.<br /> +<br /> +  Readers are advised to cross-check information given in this document.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s4.3" id="s4.3"></a><span class="item-no">4.3</span>  Change-log +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  (The release date for the downloadable package of files containing documentation, demo script, test-cases, etc., besides the <span class="term">htmLawed.php</span> file may be updated independently if the secondary files are revised.)<br /> +<br /> +  <em>Version number - Release date. Notes</em><br /> +<br /> +  1.1.8.1 - 16 July 2009. Minor code-change to fix a PHP error notice<br /> +<br /> +  1.1.8 - 23 April 2009. Parameter <span class="term">deny_attribute</span> now accepts the wild-card <span class="term">*</span>, making it simpler to specify its value when all but a few attributes are being denied; fixed a bug in interpreting <span class="term">$spec</span><br /> +<br /> +  1.1.7 - 11-12 March 2009. Attributes globally denied through <span class="term">deny_attribute</span> can be allowed element-specifically through <span class="term">$spec</span>; <span class="term">$config["style_pass"]</span> allowing letting through any <span class="term">style</span> value introduced; altered logic to catch certain types of dynamic crafted CSS expressions<br /> +<br /> +  1.1.3-6 - 28-31 January - 4 February 2009. Altered logic to catch certain types of dynamic crafted CSS expressions<br /> +<br /> +  1.1.2 - 22 January 2009. 
Fixed bug in parsing of <span class="term">font</span> attributes during tag transformation<br /> +<br /> +  1.1.1 - 27 September 2008. Better nesting correction when omitable closing tags are absent<br /> +<br /> +  1.1 - 29 June 2008. <span class="term">$config["hook_tag"]</span> and <span class="term">$config["format"]</span> introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug<br /> +<br /> +  1.0.9 - 11 June 2008. Fixed bug in invalid HTML code-point entity check<br /> +<br /> +  1.0.8 - 15 May 2008. <span class="term">bordercolor</span> attribute for <span class="term">table</span>, <span class="term">td</span> and <span class="term">tr</span><br /> +<br /> +  1.0.7 - 1 May 2008. Support for <span class="term">wmode</span> attribute for <span class="term">embed</span>; <span class="term">$config["show_setting"]</span> introduced; improved <span class="term">$config["elements"]</span> evaluation<br /> +<br /> +  1.0.6 - 20 April 2008. <span class="term">$config["and_mark"]</span> introduced<br /> +<br /> +  1.0.5 - 12 March 2008. <span class="term">style</span> URL schemes essentially disallowed when $config <span class="term">safe</span> is on; improved regex for CSS expression search<br /> +<br /> +  1.0.4 - 10 March 2008. Improved corrections for <span class="term">blockquote</span>, <span class="term">form</span>, <span class="term">map</span> and <span class="term">noscript</span><br /> +<br /> +  1.0.3 - 3 March 2008. Character entities for soft-hyphens are now replaced with spaces (instead of being removed); a bug allowing <span class="term">td</span> directly inside <span class="term">table</span> fixed; <span class="term">safe</span> <span class="term">$config</span> parameter added<br /> +<br /> +  1.0.2 - 13 February 2008. Improved implementation of <span class="term">$config["keep_bad"]</span><br /> +<br /> +  1.0.1 - 7 November 2007. 
Improved regex for identifying URLs, protocols and dynamic expressions (<span class="term">hl_tag()</span> and <span class="term">hl_prot()</span>); no error display with <span class="term">hl_regex()</span><br /> +<br /> +  1.0 - 2 November 2007. First release<br /> + +</div> +<div class="sub-section"><h3> +<a name="s4.4" id="s4.4"></a><span class="item-no">4.4</span>  Testing +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  To test htmLawed using a form interface, a <a href="htmLawedTest.php">demo</a> web-page is provided with the htmLawed distribution (<span class="term">htmLawed.php</span> and <span class="term">htmLawedTest.php</span> should be in the same directory on the web-server). A file with <a href="htmLawed_TESTCASE.txt">test-cases</a> is also provided.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s4.5" id="s4.5"></a><span class="item-no">4.5</span>  Upgrade, & old versions +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  Upgrading is as simple as replacing the previous version of <span class="term">htmLawed.php</span> (assuming it was not modified for customized features). As htmLawed output is almost always used in static documents, upgrading should not affect old, finalized content.<br /> +<br /> +  Old versions of htmLawed may be available online. 
E.g., for version 1.0, check <a href="http://www.bioinformatics.org/phplabware/downloads/htmLawed1.zip">http://www.bioinformatics.org/phplabware/downloads/htmLawed1.zip</a>, for 1.1.1, htmLawed111.zip, and for 1.1.10, htmLawed1110.zip.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s4.6" id="s4.6"></a><span class="item-no">4.6</span>  Comparison with <span class="term">HTMLPurifier</span> +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it:<br /> +<br /> +  *  does not support PHP versions older than 5.0 (HTMLPurifier dropped PHP 4 support after version 2)<br /> +<br /> +  *  is 15-20 times bigger (scores of files totalling more than 750 kb)<br /> +<br /> +  *  consumes 10-15 times more RAM memory (just including the HTMLPurifier files without calling the filter requires a few MBs of memory)<br /> +<br /> +  *  is expectedly slower<br /> +<br /> +  *  does not allow admins to fully allow all valid HTML (because of incomplete HTML support, it always considers elements like <span class="term">script</span> illegal)<br /> +<br /> +  *  lacks many of the extra features of htmLawed (like entity conversions and code compaction/beautification)<br /> +<br /> +  *  has poor documentation<br /> +<br /> +  However, HTMLPurifier has finer checks for character encodings and attribute values, and can log warnings and errors. Visit the HTMLPurifier <a href="http://htmlpurifier.org">website</a> for updated information.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s4.7" id="s4.7"></a><span class="item-no">4.7</span>  Use through application plug-ins/modules +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  Plug-ins/modules to implement htmLawed in applications such as Drupal and DokuWiki may have been developed. 
Please check the application websites and the forum on the htmLawed <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">site</a>.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s4.8" id="s4.8"></a><span class="item-no">4.8</span>  Use in non-PHP applications +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  Non-PHP applications written in Python, Ruby, etc., may be able to use htmLawed through system calls to the PHP engine. Such code may have been documented on the internet. Also check the forum on the htmLawed <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">site</a>.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s4.9" id="s4.9"></a><span class="item-no">4.9</span>  Donate +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  A donation in any currency and amount to appreciate or support this software can be sent by <a href="http://paypal.com">PayPal</a> to this email address: drpatnaik at yahoo dot com.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s4.10" id="s4.10"></a><span class="item-no">4.10</span>  Acknowledgements +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  Bryan Blakey, Ulf Harnhammer, Gareth Heyes, Lukasz Pilorz, Shelley Powers, Edward Yang, and many anonymous users.<br /> +<br /> +  Thank you!<br /> + +</div> +</div> +<div class="section"><h2> +<a name="s5" id="s5"></a><span class="item-no">5</span>  Appendices +</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<div class="sub-section"><h3> +<a name="s5.1" id="s5.1"></a><span class="item-no">5.1</span>  Characters discouraged in XHTML +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  Characters represented by the following hexadecimal code-points are <em>not</em> invalid, even 
though some validators may issue messages stating otherwise.<br /> +<br /> +  <span class="term">7f</span> to <span class="term">84</span>, <span class="term">86</span> to <span class="term">9f</span>, <span class="term">fdd0</span> to <span class="term">fddf</span>, <span class="term">1fffe</span>, <span class="term">1ffff</span>, <span class="term">2fffe</span>, <span class="term">2ffff</span>, <span class="term">3fffe</span>, <span class="term">3ffff</span>, <span class="term">4fffe</span>, <span class="term">4ffff</span>, <span class="term">5fffe</span>, <span class="term">5ffff</span>, <span class="term">6fffe</span>, <span class="term">6ffff</span>, <span class="term">7fffe</span>, <span class="term">7ffff</span>, <span class="term">8fffe</span>, <span class="term">8ffff</span>, <span class="term">9fffe</span>, <span class="term">9ffff</span>, <span class="term">afffe</span>, <span class="term">affff</span>, <span class="term">bfffe</span>, <span class="term">bffff</span>, <span class="term">cfffe</span>, <span class="term">cffff</span>, <span class="term">dfffe</span>, <span class="term">dffff</span>, <span class="term">efffe</span>, <span class="term">effff</span>, <span class="term">ffffe</span>, <span class="term">fffff</span>, <span class="term">10fffe</span> and <span class="term">10ffff</span><br /> + +</div> +<div class="sub-section"><h3> +<a name="s5.2" id="s5.2"></a><span class="item-no">5.2</span>  Valid attribute-element combinations +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  Valid attribute-element combinations as per W3C specs.<br /> +<br /> +  *  includes deprecated attributes (marked <span class="term">^</span>), attributes for the non-standard <span class="term">embed</span> element (marked <span class="term">*</span>), and the proprietary <span class="term">bordercolor</span> (marked <span class="term">~</span>)<br /> +  *  only non-frameset, HTML body elements<br /> +  *  <span 
class="term">name</span> for <span class="term">a</span> and <span class="term">map</span>, and <span class="term">lang</span> are invalid in XHTML 1.1<br /> +  *  <span class="term">target</span> is valid for <span class="term">a</span> in XHTML 1.1 and higher<br /> +  *  <span class="term">xml:space</span> is only for XHTML 1.1<br /> +<br /> +  abbr - td, th<br /> +  accept - form, input<br /> +  accept-charset - form<br /> +  accesskey - a, area, button, input, label, legend, textarea<br /> +  action - form<br /> +  align - caption^, embed, applet, iframe, img^, input^, object^, legend^, table^, hr^, div^, h1^, h2^, h3^, h4^, h5^, h6^, p^, col, colgroup, tbody, td, tfoot, th, thead, tr<br /> +  alt - applet, area, img, input<br /> +  archive - applet, object<br /> +  axis - td, th<br /> +  bgcolor - embed, table^, tr^, td^, th^<br /> +  border - table, img^, object^<br /> +  bordercolor~ - table, td, tr<br /> +  cellpadding - table<br /> +  cellspacing - table<br /> +  char - col, colgroup, tbody, td, tfoot, th, thead, tr<br /> +  charoff - col, colgroup, tbody, td, tfoot, th, thead, tr<br /> +  charset - a, script<br /> +  checked - input<br /> +  cite - blockquote, q, del, ins<br /> +  classid - object<br /> +  clear - br^<br /> +  code - applet<br /> +  codebase - object, applet<br /> +  codetype - object<br /> +  color - font<br /> +  cols - textarea<br /> +  colspan - td, th<br /> +  compact - dir, dl^, menu, ol^, ul^<br /> +  coords - area, a<br /> +  data - object<br /> +  datetime - del, ins<br /> +  declare - object<br /> +  defer - script<br /> +  dir - bdo<br /> +  disabled - button, input, optgroup, option, select, textarea<br /> +  enctype - form<br /> +  face - font<br /> +  for - label<br /> +  frame - table<br /> +  frameborder - iframe<br /> +  headers - td, th<br /> +  height - embed, iframe, td^, th^, img, object, applet<br /> +  href - a, area<br /> +  hreflang - a<br /> +  hspace - applet, img^, object^<br /> +  ismap - img, input<br /> +  
label - option, optgroup<br /> +  language - script^<br /> +  longdesc - img, iframe<br /> +  marginheight - iframe<br /> +  marginwidth - iframe<br /> +  maxlength - input<br /> +  method - form<br /> +  model* - embed<br /> +  multiple - select<br /> +  name - button, embed, textarea, applet^, select, form^, iframe^, img^, a^, input, object, map^, param<br /> +  nohref - area<br /> +  noshade - hr^<br /> +  nowrap - td^, th^<br /> +  object - applet<br /> +  onblur - a, area, button, input, label, select, textarea<br /> +  onchange - input, select, textarea<br /> +  onfocus - a, area, button, input, label, select, textarea<br /> +  onreset - form<br /> +  onselect - input, textarea<br /> +  onsubmit - form<br /> +  pluginspage* - embed<br /> +  pluginurl* - embed<br /> +  prompt - isindex<br /> +  readonly - textarea, input<br /> +  rel - a<br /> +  rev - a<br /> +  rows - textarea<br /> +  rowspan - td, th<br /> +  rules - table<br /> +  scope - td, th<br /> +  scrolling - iframe<br /> +  selected - option<br /> +  shape - area, a<br /> +  size - hr^, font, input, select<br /> +  span - col, colgroup<br /> +  src - embed, script, input, iframe, img<br /> +  standby - object<br /> +  start - ol^<br /> +  summary - table<br /> +  tabindex - a, area, button, input, object, select, textarea<br /> +  target - a^, area, form<br /> +  type - a, embed, object, param, script, input, li^, ol^, ul^, button<br /> +  usemap - img, input, object<br /> +  valign - col, colgroup, tbody, td, tfoot, th, thead, tr<br /> +  value - input, option, param, button, li^<br /> +  valuetype - param<br /> +  vspace - applet, img^, object^<br /> +  width - embed, hr^, iframe, img, object, table, td^, th^, applet, col, colgroup, pre^<br /> +  wmode - embed<br /> +  xml:space - pre, script, style<br /> +<br /> +  These are allowed in all but the shown elements:<br /> +<br /> +  class - param, script<br /> +  dir - applet, bdo, br, iframe, param, script<br /> +  id - script<br /> +  lang - 
applet, br, iframe, param, script<br /> +  onclick - applet, bdo, br, font, iframe, isindex, param, script<br /> +  ondblclick - applet, bdo, br, font, iframe, isindex, param, script<br /> +  onkeydown - applet, bdo, br, font, iframe, isindex, param, script<br /> +  onkeypress - applet, bdo, br, font, iframe, isindex, param, script<br /> +  onkeyup - applet, bdo, br, font, iframe, isindex, param, script<br /> +  onmousedown - applet, bdo, br, font, iframe, isindex, param, script<br /> +  onmousemove - applet, bdo, br, font, iframe, isindex, param, script<br /> +  onmouseout - applet, bdo, br, font, iframe, isindex, param, script<br /> +  onmouseover - applet, bdo, br, font, iframe, isindex, param, script<br /> +  onmouseup - applet, bdo, br, font, iframe, isindex, param, script<br /> +  style - param, script<br /> +  title - param, script<br /> +  xml:lang - applet, br, iframe, param, script<br /> + +</div> +<div class="sub-section"><h3> +<a name="s5.3" id="s5.3"></a><span class="item-no">5.3</span>  CSS 2.1 properties accepting URLs +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  background<br /> +  background-image<br /> +  content<br /> +  cue-after<br /> +  cue-before<br /> +  cursor<br /> +  list-style<br /> +  list-style-image<br /> +  play-during<br /> + +</div> +<div class="sub-section"><h3> +<a name="s5.4" id="s5.4"></a><span class="item-no">5.4</span>  Microsoft Windows 1252 character replacements +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  Key: <span class="term">d</span> double, <span class="term">l</span> left, <span class="term">q</span> quote, <span class="term">r</span> right, <span class="term">s.</span> single<br /> +<br /> +  Code-point (decimal) - hexadecimal value - replacement entity - represented character<br /> +<br /> +  127 - 7f - (removed) - (not used)<br /> +  128 - 80 - &#8364; - euro<br /> +  129 - 81 - (removed) - (not used)<br 
/> +  130 - 82 - &#8218; - baseline s. q<br /> +  131 - 83 - &#402; - florin<br /> +  132 - 84 - &#8222; - baseline d q<br /> +  133 - 85 - &#8230; - ellipsis<br /> +  134 - 86 - &#8224; - dagger<br /> +  135 - 87 - &#8225; - d dagger<br /> +  136 - 88 - &#710; - circumflex accent<br /> +  137 - 89 - &#8240; - permile<br /> +  138 - 8a - &#352; - S Hacek<br /> +  139 - 8b - &#8249; - l s. guillemet<br /> +  140 - 8c - &#338; - OE ligature<br /> +  141 - 8d - (removed) - (not used)<br /> +  142 - 8e - &#381; - Z dieresis<br /> +  143 - 8f - (removed) - (not used)<br /> +  144 - 90 - (removed) - (not used)<br /> +  145 - 91 - &#8216; - l s. q<br /> +  146 - 92 - &#8217; - r s. q<br /> +  147 - 93 - &#8220; - l d q<br /> +  148 - 94 - &#8221; - r d q<br /> +  149 - 95 - &#8226; - bullet<br /> +  150 - 96 - &#8211; - en dash<br /> +  151 - 97 - &#8212; - em dash<br /> +  152 - 98 - &#732; - tilde accent<br /> +  153 - 99 - &#8482; - trademark<br /> +  154 - 9a - &#353; - s Hacek<br /> +  155 - 9b - &#8250; - r s. guillemet<br /> +  156 - 9c - &#339; - oe ligature<br /> +  157 - 9d - (removed) - (not used)<br /> +  158 - 9e - &#382; - z dieresis<br /> +  159 - 9f - &#376; - Y dieresis<br /> + +</div> +<div class="sub-section"><h3> +<a name="s5.5" id="s5.5"></a><span class="item-no">5.5</span>  URL format +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  An <em>absolute</em> URL has a <span class="term">protocol</span> or <span class="term">scheme</span>, a <span class="term">network location</span> or <span class="term">hostname</span>, and, optional <span class="term">path</span>, <span class="term">parameters</span>, <span class="term">query</span> and <span class="term">fragment</span> segments. 
Thus, an absolute URL has this generic structure:<br /> +<br /> + +<code class="code">    (scheme) : (//network location) /(path) ;(parameters) ?(query) #(fragment)</code> +<br /> +<br /> +  The schemes can only contain letters, digits, <span class="term">+</span>, <span class="term">.</span> and <span class="term">-</span>. Hostname is the portion after the <span class="term">//</span> and up to the first <span class="term">/</span> (if any; else, up to the end) when <span class="term">:</span> is followed by a <span class="term">//</span> (e.g., <span class="term">abc.com</span> in <span class="term">ftp://abc.com/def</span>); otherwise, it consists of everything after the <span class="term">:</span> (e.g., <span class="term">def@abc.com</span> in <span class="term">mailto:def@abc.com</span>).<br /> +<br /> +  <em>Relative</em> URLs do not have explicit schemes and network locations; such values are inherited from a <em>base</em> URL.<br /> + +</div> +<div class="sub-section"><h3> +<a name="s5.6" id="s5.6"></a><span class="item-no">5.6</span>  Brief on htmLawed code +</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> +<br /> +  Much of the code's logic and reasoning can be understood from the documentation above.<br /> +<br /> +  The <strong>output</strong> of htmLawed is a text string containing the processed input. There is no custom error tracking.<br /> +<br /> +  <strong>Function arguments</strong> for htmLawed are:<br /> +<br /> +  *  <span class="term">$in</span> - 1st argument; a text string; the <strong>input text</strong> to be processed. Any extraneous slashes added by PHP when <em>magic quotes</em> are enabled should be removed beforehand using PHP's <span class="term">stripslashes()</span> function.<br /> +<br /> +  *  <span class="term">$config</span> - 2nd argument; an associative array; optional (named <span class="term">$C</span> in htmLawed code). 
The array has keys with names like <span class="term">balance</span> and <span class="term">keep_bad</span>, and the values, which can be boolean, string, or array, depending on the key, are read to accordingly set the <strong>configurable parameters</strong> (indicated by the keys). All configurable parameters receive some default value if the value to be used is not specified by the user through <span class="term">$config</span>. <em>Finalized</em> <span class="term">$config</span> is thus a filtered and possibly larger array.<br /> +<br /> +  *  <span class="term">$spec</span> - 3rd argument; a text string; optional. The string has rules, written in an htmLawed-designated format, <strong>specifying</strong> element-specific attribute and attribute value restrictions. Function <span class="term">hl_spec()</span> is used to convert the string to an associative-array for internal use. <em>Finalized</em> <span class="term">$spec</span> is thus an array.<br /> +<br /> +  <em>Finalized</em> <span class="term">$config</span> and <span class="term">$spec</span> are made <strong>global variables</strong> while htmLawed is at work. Values of any pre-existing global variables with same names are noted, and their values are restored after htmLawed finishes processing the input (to capture the <em>finalized</em> values, the <span class="term">show_settings</span> parameter of <span class="term">$config</span> should be used). Depending on <span class="term">$config</span>, another global variable <span class="term">hl_Ids</span>, to track <span class="term">id</span> attribute values for uniqueness, may be set. Unlike the other two variables, this one is not reset (or unset) post-processing.<br /> +<br /> +  Except for the main function <span class="term">htmLawed()</span> and the functions <span class="term">kses()</span> and <span class="term">kses_hook()</span>, htmLawed's functions are <strong>name-spaced</strong> using the <span class="term">hl_</span> prefix. 
The <strong>functions</strong> and their roles are:<br /> +<br /> +  *  <span class="term">hl_attrval</span> - checking attribute values against $spec<br /> +  *  <span class="term">hl_bal</span> - tag balancing<br /> +  *  <span class="term">hl_cmtcd</span> - handling CDATA sections and HTML comments<br /> +  *  <span class="term">hl_ent</span> - entity handling<br /> +  *  <span class="term">hl_prot</span> - checking a URL scheme/protocol<br /> +  *  <span class="term">hl_regex</span> - checking syntax of a regular expression<br /> +  *  <span class="term">hl_spec</span> - converting user-supplied $spec value to one used by htmLawed internally<br /> +  *  <span class="term">hl_tag</span> - handling tags<br /> +  *  <span class="term">hl_tag2</span> - transforming tags<br /> +  *  <span class="term">hl_tidy</span> - compact/beautify HTML<br /> +  *  <span class="term">hl_version</span> - reporting htmLawed version<br /> +  *  <span class="term">htmLawed</span> - main function<br /> +  *  <span class="term">kses</span> - main function of <span class="term">kses</span><br /> +  *  <span class="term">kses_hook</span> - hook function of <span class="term">kses</span><br /> +<br /> +  The last two are for compatibility with pre-existing code using the <span class="term">kses</span> script. htmLawed's <span class="term">kses()</span> basically passes on the filtering task to <span class="term">htmLawed()</span> function after deciphering <span class="term">$config</span> and <span class="term">$spec</span> from the argument values supplied to it. <span class="term">kses_hook()</span> is an empty function and is meant for being filled with custom code if the <span class="term">kses</span> script users were using one.<br /> +<br /> +  <span class="term">htmLawed()</span> finalizes <span class="term">$spec</span> (with the help of <span class="term">hl_spec()</span>) and <span class="term">$config</span>, and globalizes them. 
Finalization of <span class="term">$config</span> involves setting default values if an inappropriate or invalid one is supplied. This includes calling <span class="term">hl_regex()</span> to check well-formedness of regular expression patterns if such expressions are user-supplied through <span class="term">$config</span>. <span class="term">htmLawed()</span> then removes invalid characters like nulls and <span class="term">x01</span> and appropriately handles entities using <span class="term">hl_ent()</span>. HTML comments and CDATA sections are identified and treated as per <span class="term">$config</span> with the help of <span class="term">hl_cmtcd()</span>. When retained, the <span class="term"><</span> and <span class="term">></span> characters identifying them, and the <span class="term"><</span>, <span class="term">></span> and <span class="term">&</span> characters inside them, are replaced with control characters (code-points <span class="term">1</span> to <span class="term">5</span>) till any tag balancing is completed.<br /> +<br /> +  After this <em>initial processing</em> <span class="term">htmLawed()</span> identifies tags using regex and processes them with the help of <span class="term">hl_tag()</span> --  a large function that analyzes tag content, filtering it as per HTML standards, <span class="term">$config</span> and <span class="term">$spec</span>. Among other things, <span class="term">hl_tag()</span> transforms deprecated elements using <span class="term">hl_tag2()</span>, removes attributes from closing tags, checks attribute values as per <span class="term">$spec</span> rules using <span class="term">hl_attrval()</span>, and checks URL protocols using <span class="term">hl_prot()</span>. <span class="term">htmLawed()</span> performs tag balancing and nesting checks with a call to <span class="term">hl_bal()</span>, and optionally compacts/beautifies the output with proper white-spacing with a call to <span class="term">hl_tidy()</span>. 
The latter temporarily replaces white-space, and <span class="term"><</span>, <span class="term">></span> and <span class="term">&</span> characters inside <span class="term">pre</span>, <span class="term">script</span> and <span class="term">textarea</span> elements, and HTML comments and CDATA sections with control characters (code-points <span class="term">1</span> to <span class="term">5</span>, and <span class="term">7</span>).<br /> +<br /> +  htmLawed permits the use of custom code or <strong>hook functions</strong> at two stages. The first, called inside <span class="term">htmLawed()</span>, allows the input text as well as the finalized $config and $spec values to be altered right after the initial processing (see <a href="#s3.7">section 3.7</a>). The second is called by <span class="term">hl_tag()</span> once the tag content is finalized (see <a href="#s3.4.9">section 3.4.9</a>).<br /> +<br /> +  Being dictated by the external and stable HTML standard, htmLawed's objective is very clear-cut and less concerned with tweakability. The code is only minimally annotated with comments -- it is not meant to instruct; PHP developers familiar with the HTML specs will see the logic, and others can always refer to the htmLawed documentation. The compact structuring of the statements is meant to aid in quickly grasping the logic, at least when viewed with code syntax highlighted. +</div> +</div> +<br /> +<hr /><br /><br /><span class="subtle"><small>HTM version of <em><a href="htmLawed_README.txt">htmLawed_README.txt</a></em> generated on 23 Apr, 2009 using <a href="http://www.bioinformatics.org/phplabware/internal_utilities">rTxt2htm</a> from PHP Labware</small></span> +</div><!-- ended div body --> +</div><!-- ended div top --> +</body> +</html>
\ No newline at end of file diff --git a/extlib/htmLawed/htmLawed_README.txt b/extlib/htmLawed/htmLawed_README.txt new file mode 100644 index 000000000..3ce4b9ac1 --- /dev/null +++ b/extlib/htmLawed/htmLawed_README.txt @@ -0,0 +1,1600 @@ +/* +htmLawed_README.txt, 16 July 2009 +htmLawed 1.1.8.1, 16 July 2009 +Copyright Santosh Patnaik +GPL v3 license +A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed +*/ + + +== Content ========================================================== + + +1 About htmLawed + 1.1 Example uses + 1.2 Features + 1.3 History + 1.4 License & copyright + 1.5 Terms used here +2 Usage + 2.1 Simple + 2.2 Configuring htmLawed using the '$config' parameter + 2.3 Extra HTML specifications using the '$spec' parameter + 2.4 Performance time & memory usage + 2.5 Some security risks to keep in mind + 2.6 Use without modifying old 'kses()' code + 2.7 Tolerance for ill-written HTML + 2.8 Limitations & work-arounds + 2.9 Examples +3 Details + 3.1 Invalid/dangerous characters + 3.2 Character references/entities + 3.3 HTML elements + 3.3.1 HTML comments and 'CDATA' sections + 3.3.2 Tag-transformation for better XHTML-Strict + 3.3.3 Tag balancing and proper nesting + 3.3.4 Elements requiring child elements + 3.3.5 Beautify or compact HTML + 3.4 Attributes + 3.4.1 Auto-addition of XHTML-required attributes + 3.4.2 Duplicate/invalid 'id' values + 3.4.3 URL schemes (protocols) and scripts in attribute values + 3.4.4 Absolute & relative URLs + 3.4.5 Lower-cased, standard attribute values + 3.4.6 Transformation of deprecated attributes + 3.4.7 Anti-spam & 'href' + 3.4.8 Inline style properties + 3.4.9 Hook function for tag content + 3.5 Simple configuration directive for most valid XHTML + 3.6 Simple configuration directive for most `safe` HTML + 3.7 Using a hook function + 3.8 Obtaining `finalized` parameter values + 3.9 Retaining non-HTML tags in input with mixed markup +4 Other + 4.1 Support + 4.2 Known issues + 
4.3 Change-log + 4.4 Testing + 4.5 Upgrade, & old versions + 4.6 Comparison with 'HTMLPurifier' + 4.7 Use through application plug-ins/modules + 4.8 Use in non-PHP applications + 4.9 Donate + 4.10 Acknowledgements +5 Appendices + 5.1 Characters discouraged in HTML + 5.2 Valid attribute-element combinations + 5.3 CSS 2.1 properties accepting URLs + 5.4 Microsoft Windows 1252 character replacements + 5.5 URL format + 5.6 Brief on htmLawed code + + +== 1 About htmLawed ================================================ + + + htmLawed is a highly customizable single-file PHP script to make text secure, and standard- and admin policy-compliant for use in the body of HTML 4, XHTML 1 or 1.1, or generic XML documents. It is thus a configurable input (X)HTML filter, processor, purifier, sanitizer, beautifier, etc., and an alternative to the HTMLTidy:- http://tidy.sourceforge.net application. + + The `lawing in` of input text is needed to ensure that HTML code in the text is standard-compliant, does not introduce security vulnerabilities, and does not break the aesthetics, design or layout of web-pages. htmLawed tries to do this by, for example, making HTML well-formed with balanced and properly nested tags, neutralizing code that may be used for cross-site scripting ('XSS') attacks, and allowing only specified HTML elements/tags and attributes. 
+ + +-- 1.1 Example uses ------------------------------------------------ + + + * Filtering of text submitted as comments on blogs to allow only certain HTML elements + + * Making RSS/Atom newsfeed item-content standard-compliant: often one uses an excerpt from an HTML document for the content, and with unbalanced tags, non-numerical entities, etc., such excerpts may not be XML-compliant + + * Text processing for stricter XML standard-compliance: e.g., to have lowercased 'x' in hexadecimal numeric entities becomes necessary if an XHTML document with MathML content needs to be served as 'application/xml' + + * Scraping text or data from web-pages + + * Pretty-printing HTML code + + +-- 1.2 Features ---------------------------------------------------o + + + Key: '*' security feature, '^' standard compliance, '~' requires setting right options, '`' different from 'Kses' + + * make input more *secure* and *standard-compliant* + * use for HTML 4, XHTML 1.0 or 1.1, or even generic *XML* documents ^~` + + * *beautify* or *compact* HTML ^~` + + * *restrict elements* ^~` + * proper closure of empty elements like 'img' ^` + * *transform deprecated elements* like 'u' ^~` + * HTML *comments* and 'CDATA' sections can be permitted ^~` + * elements like 'script', 'object' and 'form' can be permitted ~ + + * *restrict attributes*, including *element-specifically* ^~` + * remove *invalid attributes* ^` + * element and attribute names are *lower-cased* ^ + * provide *required attributes*, like 'alt' for 'img' ^` + * *transform deprecated attributes* ^~` + * attributes *declared only once* ^` + + * *restrict attribute values*, including *element-specifically* ^~` + * a value is declared for `empty` (`minimized`) attributes like 'checked' ^ + * check for potentially dangerous attribute values *~ + * ensure *unique* 'id' attribute values ^~` + * *double-quote* attribute values ^ + * lower-case *standard attribute values* like 'password' ^` + + * *attribute-specific URL 
protocol/scheme restriction* *~` + * disable *dynamic expressions* in 'style' values *~` + + * neutralize invalid named character entities ^` + * *convert* hexadecimal numeric entities to decimal ones, or vice versa ^~` + * convert named entities to numeric ones for generic XML use ^~` + + * remove *null* characters * + * neutralize potentially dangerous proprietary Netscape *Javascript entities* * + * replace potentially dangerous *soft-hyphen* character in attribute values with spaces * + + * remove common *invalid characters* not allowed in HTML or XML ^` + * replace *characters from Microsoft applications* like 'Word' that are discouraged in HTML or XML ^~` + * neutralize entities for characters invalid or discouraged in HTML or XML ^` + * appropriately neutralize '<', '&', '"', and '>' characters ^*` + + * understands improperly spaced tag content (like, spread over more than a line) and properly spaces them ` + * attempts to *balance tags* for well-formedness ^~` + * understands when *omitable closing tags* like '</p>' (allowed in HTML 4, transitional, e.g.) are missing ^~` + * attempts to permit only *validly nested tags* ^~` + * option to *remove or neutralize bad content* ^~` + * attempts to *rectify common errors of plain-text misplacement* (e.g., directly inside 'blockquote') ^~` + + * fast, *non-OOP* code of ~45 kb incurring peak basal memory usage of ~0.5 MB + * *compatible* with pre-existing code using 'Kses' (the filter used by 'WordPress') + + * optional *anti-spam* measures such as addition of 'rel="nofollow"' and link-disabling ~` + * optionally makes *relative URLs absolute*, and vice versa ~` + + * optionally mark '&' to identify the entities for '&', '<' and '>' introduced by htmLawed ~` + + * allows deployment of powerful *hook functions* to *inject* HTML, *consolidate* 'style' attributes to 'class', finely check attribute values, etc. 
~` + + * *independent of character encoding* of input and does not affect it + + * *tolerance for ill-written HTML* to a certain degree + + +-- 1.3 History ----------------------------------------------------o + + + htmLawed was developed for use with 'LabWiki', a wiki software developed at PHP Labware, as no suitable software could be found. Existing PHP software like 'Kses' and 'HTMLPurifier' were deemed inadequate, slow, resource-intensive, or dependent on external applications like 'HTML Tidy'. + + htmLawed started as a modification of Ulf Harnhammar's 'Kses' (version 0.2.2) software, and is compatible with code that uses 'Kses'; see section:- #2.6. + + +-- 1.4 License & copyright ----------------------------------------o + + + htmLawed is free and open-source software licensed under GPL license version 3:- http://www.gnu.org/licenses/gpl-3.0.txt, and copyrighted by Santosh Patnaik, MD, PhD. + + +-- 1.5 Terms used here --------------------------------------------o + + + * `administrator` - or admin; person setting up the code to pass input through htmLawed; also, `user` + * `attributes` - name-value pairs like 'href="http://x.com"' in opening tags + * `author` - `writer` + * `character` - atomic unit of text; internally represented by a numeric `code-point` as specified by the `encoding` or `charset` in use + * `entity` - markup like '&gt;' and '&nbsp;' used to refer to a character + * `element` - HTML element like 'a' and 'img' + * `element content` - content between the opening and closing tags of an element, like 'click' of '<a href="x">click</a>' + * `HTML` - implies XHTML unless specified otherwise + * `input` - text string given to htmLawed to process + * `processing` - involves filtering, correction, etc., of input + * `safe` - absence or reduction of certain characters and HTML elements and attributes in the input that can otherwise potentially and circumstantially expose web-site users to security vulnerabilities like cross-site scripting attacks (XSS) + * 
`scheme` - URL protocol like 'http' and 'ftp' + * `specs` - standard specifications + * `style property` - terms like 'border' and 'height' for which declarations are made in values for the 'style' attribute of elements + * `tag` - markers like '<a href="x">' and '</a>' delineating element content; the opening tag can contain attributes + * `tag content` - consists of tag markers '<' and '>', element names like 'div', and possibly attributes + * `user` - administrator + * `writer` - end-user like a blog commenter providing the input that is to be processed; also, `author` + + +== 2 Usage ========================================================oo + + + htmLawed should work with PHP 4.3 and higher. Either 'include()' the 'htmLawed.php' file or copy-paste the entire code. + + To easily *test* htmLawed using a form-based interface, use the provided demo:- htmLawedTest.php ('htmLawed.php' and 'htmLawedTest.php' should be in the same directory on the web-server). + + +-- 2.1 Simple ------------------------------------------------------ + + + The input text to be processed, '$text', is passed as an argument of type string; 'htmLawed()' returns the processed string: + + $processed = htmLawed($text); + + *Note*: If input is from a '$_GET' or '$_POST' value, and 'magic quotes' are enabled on the PHP setup, run 'stripslashes()' on the input before passing to htmLawed. + + By default, htmLawed will process the text allowing all valid HTML elements/tags, secure URL scheme/CSS style properties, etc. It will allow 'CDATA' sections and HTML comments, balance tags, and ensure proper nesting of elements. Such actions can be configured using two other optional arguments -- '$config' and '$spec': + + $processed = htmLawed($text, $config, $spec); + + These extra parameters are detailed below. Some examples are shown in section:- #2.9. 
+ + *Note*: For maximum protection against 'XSS' and other scripting attacks (e.g., by disallowing Javascript code), consider using the 'safe' parameter; see section:- #3.6. + + +-- 2.2 Configuring htmLawed using the '$config' parameter ---------o + + + '$config' instructs htmLawed on how to tackle certain tasks. When '$config' is not specified, or not set as an array (e.g., '$config = 1'), htmLawed will take default actions. One or many of the task-action or value-specification pairs can be specified in '$config' as array key-value pairs. If a parameter is not specified, htmLawed will use the default value/action indicated further below. + + $config = array('comment'=>0, 'cdata'=>1); + $processed = htmLawed($text, $config); + + Or, + + $processed = htmLawed($text, array('comment'=>0, 'cdata'=>1)); + + Below are the possible value-specification combinations. In PHP code, values that are integers should not be quoted and should be used as numeric types (unless meant as string/text). + + Key: '*' default, '^' different default when htmLawed is used in the Kses-compatible mode (see section:- #2.6), '~' different default when 'valid_xhtml' is set to '1' (see section:- #3.5), '"' different default when 'safe' is set to '1' (see section:- #3.6) + + *abs_url* + Make URLs absolute or relative; '$config["base_url"]' needs to be set; see section:- #3.4.4 + + '-1' - make relative + '0' - no action * + '1' - make absolute + + *and_mark* + Mark '&' characters in the original input; see section:- #3.2 + + *anti_link_spam* + Anti-link-spam measure; see section:- #3.4.7 + + '0' - no measure taken * + 'array("regex1", "regex2")' - will ensure a 'rel' attribute with 'nofollow' in its value in case the 'href' attribute value matches the regular expression pattern 'regex1', and/or will remove 'href' if its value matches the regular expression pattern 'regex2'. E.g., 'array("/./", "/://\W*(?!(abc\.com|xyz\.org))/")'; see section:- #3.4.7 for more. 
+ + *anti_mail_spam* + Anti-mail-spam measure; see section:- #3.4.7 + + '0' - no measure taken * + 'word' - '@' in mail address in 'href' attribute value is replaced with specified 'word' + + *balance* + Balance tags for well-formedness and proper nesting; see section:- #3.3.3 + + '0' - no + '1' - yes * + + *base_url* + Base URL value that needs to be set if '$config["abs_url"]' is not '0'; see section:- #3.4.4 + + *cdata* + Handling of 'CDATA' sections; see section:- #3.3.1 + + '0' - don't consider 'CDATA' sections as markup and proceed as if plain text ^" + '1' - remove + '2' - allow, but neutralize any '<', '>', and '&' inside by converting them to named entities + '3' - allow * + + *clean_ms_char* + Replace discouraged characters introduced by Microsoft Word, etc.; see section:- #3.1 + + '0' - no * + '1' - yes + '2' - yes, but replace special single & double quotes with ordinary ones + + *comment* + Handling of HTML comments; see section:- #3.3.1 + + '0' - don't consider comments as markup and proceed as if plain text ^" + '1' - remove + '2' - allow, but neutralize any '<', '>', and '&' inside by converting to named entities + '3' - allow * + + *css_expression* + Allow dynamic CSS expression by not removing the expression from CSS property values in 'style' attributes; see section:- #3.4.8 + + '0' - remove * + '1' - allow + + *deny_attribute* + Denied HTML attributes; see section:- #3.4 + + '0' - none * + 'string' - dictated by values in 'string' + 'on*' (like 'onfocus') attributes not allowed - " + + *elements* + Allowed HTML elements; see section:- #3.3 + + '* -center -dir -font -isindex -menu -s -strike -u' - ~ + 'applet, embed, iframe, object, script' not allowed - " + + *hexdec_entity* + Allow hexadecimal numeric entities and do not convert to the more widely accepted decimal ones, or convert decimal to hexadecimal ones; see section:- #3.2 + + '0' - no + '1' - yes * + '2' - convert decimal to hexadecimal ones + + *hook* + Name of an optional hook function 
to alter the input string, '$config' or '$spec' before htmLawed starts its main work; see section:- #3.7 + + '0' - no hook function * + 'name' - 'name' is name of the hook function ('kses_hook' ^) + + *hook_tag* + Name of an optional hook function to alter tag content finalized by htmLawed; see section:- #3.4.9 + + '0' - no hook function * + 'name' - 'name' is name of the hook function + + *keep_bad* + Neutralize bad tags by converting '<' and '>' to entities, or remove them; see section:- #3.3.3 + + '0' - remove ^ + '1' - neutralize both tags and element content + '2' - remove tags but neutralize element content + '3' and '4' - like '1' and '2' but remove if text ('pcdata') is invalid in parent element + '5' and '6' * - like '3' and '4' but line-breaks, tabs and spaces are left + + *lc_std_val* + For XHTML compliance, predefined, standard attribute values, like 'get' for the 'method' attribute of 'form', must be lowercased; see section:- #3.4.5 + + '0' - no + '1' - yes * + + *make_tag_strict* + Transform/remove these non-strict XHTML elements, even if they are allowed by the admin: 'applet' 'center' 'dir' 'embed' 'font' 'isindex' 'menu' 's' 'strike' 'u'; see section:- #3.3.2 + + '0' - no ^ + '1' - yes, but leave 'applet', 'embed' and 'isindex' elements that currently can't be transformed * + '2' - yes, removing 'applet', 'embed' and 'isindex' elements and their contents (nested elements remain) ~ + + *named_entity* + Allow non-universal named HTML entities, or convert to numeric ones; see section:- #3.2 + + '0' - convert + '1' - allow * + + *no_deprecated_attr* + Allow deprecated attributes or transform them; see section:- #3.4.6 + + '0' - allow ^ + '1' - transform, but 'name' attributes for 'a' and 'map' are retained * + '2' - transform + + *parent* + Name of the parent element, possibly imagined, that will hold the input; see section:- #3.3 + + *safe* + Magic parameter to make input the most secure against XSS without needing to specify other relevant '$config' 
parameters; see section:- #3.6 + + '0' - no * + '1' - will auto-adjust other relevant '$config' parameters (indicated by '"' in this list) + + *schemes* + Array of attribute-specific, comma-separated, lower-cased list of schemes (protocols) allowed in attributes accepting URLs; '*' covers all unspecified attributes; see section:- #3.4.3 + + 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; *:file, http, https' * + '*: ftp, gopher, http, https, mailto, news, nntp, telnet' ^ + 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; style: nil; *:file, http, https' " + + *show_setting* + Name of a PHP variable to assign the `finalized` '$config' and '$spec' values; see section:- #3.8 + + *style_pass* + Do not look at 'style' attribute values, letting them through without any alteration + + '0' - no * + '1' - htmLawed will let through any 'style' value; see section:- #3.4.8 + + *tidy* + Beautify or compact HTML code; see section:- #3.3.5 + + '-1' - compact + '0' - no * + '1' or 'string' - beautify (custom format specified by 'string') + + *unique_ids* + 'id' attribute value checks; see section:- #3.4.2 + + '0' - no ^ + '1' - remove duplicate and/or invalid ones * + 'word' - remove invalid ones and replace duplicate ones with new and unique ones based on the 'word'; the admin-specified 'word', like 'my_', should begin with a letter (a-z) and can contain letters, digits, '.', '_', '-', and ':'. 
+ + *valid_xhtml* + Magic parameter to make input the most valid XHTML without needing to specify other relevant '$config' parameters; see section:- #3.5 + + '0' - no * + '1' - will auto-adjust other relevant '$config' parameters (indicated by '~' in this list) + + *xml:lang* + Auto-adding 'xml:lang' attribute; see section:- #3.4.1 + + '0' - no * + '1' - add if 'lang' attribute is present + '2' - add if 'lang' attribute is present, and remove 'lang' ~ + + +-- 2.3 Extra HTML specifications using the $spec parameter --------o + + + The '$spec' argument can be used to disallow an otherwise legal attribute for an element, or to restrict the attribute's values. This can also be helpful as a security measure (e.g., in certain versions of browsers, certain values can cause buffer overflows and denial of service attacks), or in enforcing admin policy compliance. '$spec' is specified as a string of text containing one or more `rules`, with multiple rules separated from each other by a semi-colon (';'). E.g., + + $spec = 'i=-*; td, tr=style, id, -*; a=id(match="/[a-z][a-z\d.:\-`"]*/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt'; + $processed = htmLawed($text, $config, $spec); + + Or, + + $processed = htmLawed($text, $config, 'i=-*; td, tr=style, id, -*; a=id(match="/[a-z][a-z\d.:\-`"]*/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt'); + + A rule begins with an HTML *element* name(s) (`rule-element`), for which the rule applies, followed by an equal ('=') sign. A rule-element may represent multiple elements if comma (,)-separated element names are used. E.g., 'th,td,tr='. + + Rest of the rule consists of comma-separated HTML *attribute names*. A minus ('-') character before an attribute means that the attribute is not permitted inside the rule-element. E.g., '-width'. To deny all attributes, '-*' can be used. 
+ + Following shows examples of rule excerpts with rule-element 'a' and the attributes that are being permitted: + + * 'a=' - all + * 'a=id' - all + * 'a=href, title, -id, -onclick' - all except 'id' and 'onclick' + * 'a=*, id, -id' - all except 'id' + * 'a=-*' - none + * 'a=-*, href, title' - none except 'href' and 'title' + * 'a=-*, -id, href, title' - none except 'href' and 'title' + + Rules regarding *attribute values* are optionally specified inside round brackets after attribute names in slash ('/')-separated `parameter = value` pairs. E.g., 'title(maxlen=30/minlen=5)'. None, or one or more of the following parameters may be specified: + + * 'oneof' - one or more choices separated by '|' that the value should match; if only one choice is provided, then the value must match that choice + + * 'noneof' - one or more choices separated by '|' that the value should not match + + * 'maxlen' and 'minlen' - upper and lower limits for the number of characters in the attribute value; specified in numbers + + * 'maxval' and 'minval' - upper and lower limits for the numerical value specified in the attribute value; specified in numbers + + * 'match' and 'nomatch' - pattern that the attribute value should or should not match; specified as PHP/PCRE-compatible regular expressions with delimiters and possibly modifiers + + * 'default' - a value to force on the attribute if the value provided by the writer does not fit any of the specified parameters + + If 'default' is not set and the attribute value does not satisfy any of the specified parameters, then the attribute is removed. The 'default' value can also be used to force all attribute declarations to take the same value (by getting the values declared illegal by setting, e.g., 'maxlen' to '-1'). + + Examples with `input` '<input title="WIDTH" value="10em" /><input title="length" value="5" />' are shown below. 
+ + `Rule`: 'input=title(maxlen=60/minlen=6), value' + `Output`: '<input value="10em" /><input title="length" value="5" />' + + `Rule`: 'input=title(), value(maxval=8/default=6)' + `Output`: '<input title="WIDTH" value="6" /><input title="length" value="5" />' + + `Rule`: 'input=title(nomatch=$w.d$i), value(match=$em$/default=6em)' + `Output`: '<input value="10em" /><input title="length" value="6em" />' + + `Rule`: 'input=title(oneof=height|depth/default=depth), value(noneof=5|6)' + `Output`: '<input title="depth" value="10em" /><input title="depth" />' + + *Special characters*: The characters ';', ',', '/', '(', ')', '|', '~' and space have special meanings in the rules. Words in the rules that use such characters, or the characters themselves, should be `escaped` by enclosing in pairs of double-quotes ('"'). A back-tick ('`') can be used to escape a literal '"'. An example rule illustrating this is 'input=value(maxlen=30/match="/^\w/"/default="your `"ID`"")'. + + *Note*: To deny an attribute for all elements for which it is legal, '$config["deny_attribute"]' (see section:- #3.4) can be used instead of '$spec'. Also, attributes can be allowed element-specifically through '$spec' while being denied globally through '$config["deny_attribute"]'. The 'hook_tag' parameter (section:- #3.4.9) can also be used to implement the '$spec' functionality. + + +-- 2.4 Performance time & memory usage ----------------------------o + + + The time and memory used by htmLawed depends on its configuration and the size of the input, and the amount, nestedness and well-formedness of the HTML markup within it. In particular, tag balancing and beautification each can increase the processing time by about a quarter. + + The htmLawed demo:- htmLawedTest.php can be used to evaluate the performance and effects of different types of input and '$config'. 
+ + +-- 2.5 Some security risks to keep in mind ------------------------o + + + When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially `dangerous` HTML code. (This may not be a problem if the authors are trusted.) + + For example, following increase security risks: + + * Allowing 'script', 'applet', 'embed', 'iframe' or 'object' elements, or certain of their attributes like 'allowscriptaccess' + + * Allowing HTML comments (some Internet Explorer versions are vulnerable with, e.g., '<!--[if gte IE 4]><script>alert("xss");</script><![endif]-->' + + * Allowing dynamic CSS expressions (a feature of the IE browser) + + `Unsafe` HTML can be removed by setting '$config' appropriately. E.g., '$config["elements"] = "* -script"' (section:- #3.3), '$config["safe"] = 1' (section:- #3.6), etc. + + +-- 2.6 Use without modifying old 'kses()' code --------------------o + + + The 'Kses' PHP script is used by many applications (like 'WordPress'). It is possible to have such applications use htmLawed instead, since it is compatible with code that calls the 'kses()' function declared in the 'Kses' file (usually named 'kses.php'). E.g., application code like this will continue to work after replacing 'Kses' with htmLawed: + + $comment_filtered = kses($comment_input, array('a'=>array(), 'b'=>array(), 'i'=>array())); + + For some of the '$config' parameters, htmLawed will use values other than the default ones. These are indicated by '^' in section:- #2.2. To force htmLawed to use other values, function 'kses()' in the htmLawed code should be edited -- a few configurable parameters/variables need to be changed. + + If the application uses a 'Kses' file that has the 'kses()' function declared, then, to have the application use htmLawed instead of 'Kses', simply rename 'htmLawed.php' (to 'kses.php', e.g.) 
and replace the 'Kses' file (or just replace the code in the 'Kses' file with the htmLawed code). If the 'kses()' function in the 'Kses' file had been renamed by the application developer (e.g., in 'WordPress', it is named 'wp_kses()'), then appropriately rename the 'kses()' function in the htmLawed code. + + If the 'Kses' file used by the application has been highly altered by the application developers, then one may need a different approach. E.g., with 'WordPress', it is best to copy the htmLawed code to 'wp_includes/kses.php', rename the newly added function 'kses()' to 'wp_kses()', and delete the code for the original 'wp_kses()' function. + + If the 'Kses' code has a non-empty hook function (e.g., 'wp_kses_hook()' in case of 'WordPress'), then the code for htmLawed's 'kses_hook()' function should be appropriately edited. However, the requirement of the hook function should be re-evaluated considering that htmLawed has extra capabilities. With 'WordPress', the hook function is an essential one. The following code is suggested for the htmLawed 'kses_hook()' in case of 'WordPress': + + function kses_hook($string, &$cf, &$spec){ + // kses compatibility + $allowed_html = $spec; + $allowed_protocols = array(); + foreach($cf['schemes'] as $v){ + foreach($v as $k2=>$v2){ + if(!in_array($k2, $allowed_protocols)){ + $allowed_protocols[] = $k2; + } + } + } + return wp_kses_hook($string, $allowed_html, $allowed_protocols); + // eof + } + + +-- 2.7 Tolerance for ill-written HTML -----------------------------o + + + htmLawed can work with ill-written HTML code in the input. However, HTML that is too ill-written may not be `read` as HTML, and be considered mere plain text instead. Following statements indicate the degree of `looseness` that htmLawed can work with, and can be provided in instructions to writers: + + * Tags must be flanked by '<' and '>' with no '>' inside -- any needed '>' should be put in as '>'. 
It is possible for tag content (element name and attributes) to be spread over many lines instead of being on one. A space may be present between the tag content and '>', like '<div >' and '<img / >', but not after the '<'. + + * Element and attribute names need not be lower-cased. + + * Attribute string of elements may be liberally spaced with tabs, line-breaks, etc. + + * Attribute values may not be double-quoted, or may be single-quoted. + + * Left-padding of numeric entities (like, '&#0160;', '&#x07ff;') with '0' is okay as long as the number of characters between the '&' and the ';' does not exceed 8. All entities must end with ';' though. + + * Named character entities must be properly cased. E.g., '&Lt;' or '&TILDE;' will not be let through without modification. + + * HTML comments should not be inside element tags (okay between tags), and should begin with '<!--' and end with '-->'. Characters like '<', '>', and '&' may be allowed inside depending on '$config', but any '-->' inside should be put in as '--&gt;'. Any '--' inside will be automatically converted to '-', and a space will be added before the comment delimiter '-->'. + + * 'CDATA' sections should not be inside element tags, and can be in element content only if plain text is allowed for that element. They should begin with '<![CDATA[' and end with ']]>'. Characters like '<', '>', and '&' may be allowed inside depending on '$config', but any ']]>' inside should be put in as ']]&gt;'. + + * For attribute values, character entities '&lt;', '&gt;' and '&amp;' should be used instead of characters '<' and '>', and '&' (when '&' is not part of a character entity). This applies even for Javascript code in values of attributes like 'onclick'. + + * Characters '<', '>', '&' and '"' that are part of actual Javascript, etc., code in 'script' elements should be used as such and not be put in as entities like '&gt;'. Otherwise, though the HTML will be valid, the code may fail to work. 
Further, if such characters have to be used, then they should be put inside 'CDATA' sections. + + * Simple instructions like "an opening tag cannot be present between two closing tags" and "nested elements should be closed in the reverse order of how they were opened" can help authors write balanced HTML. If tags are imbalanced, htmLawed will try to balance them, but in the process, depending on '$config["keep_bad"]', some code/text may be lost. + + * Input authors should be notified of admin-specified allowed elements, attributes, configuration values (like conversion of named entities to numeric ones), etc. + + * With '$config["unique_ids"]' not '0' and the 'id' attribute being permitted, writers should carefully avoid using duplicate or invalid 'id' values as even though htmLawed will correct/remove the values, the final output may not be the one desired. E.g., when '<a id="home"></a><input id="home" /><label for="home"></label>' is processed into +'<a id="home"></a><input id="prefix_home" /><label for="home"></label>'. + + * Note that even if intended HTML is lost in a highly ill-written input, the processed output will be more secure and standard-compliant. + + * For URLs, unless '$config["schemes"]' is appropriately set, writers should avoid using escape characters or entities in schemes. E.g., '&#104;ttp' (which many browsers will read as the harmless 'http') may be considered bad by htmLawed. + + * htmLawed will attempt to put plain text present directly inside 'blockquote', 'form', 'map' and 'noscript' elements (illegal as per the specs) inside auto-generated 'div' elements. + + +-- 2.8 Limitations & work-arounds ---------------------------------o + + + htmLawed's main objective is to make the input text `more` standard-compliant, secure for web-page readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with work-arounds. 
+ + It should be borne in mind that no browser application is 100% standard-compliant, and that some of the standard specs (like asking for normalization of white-spacing within 'textarea' elements) are clearly wrong. Regarding security, note that `unsafe` HTML code is not necessarily legally invalid. + + * htmLawed is meant for input that goes into the 'body' of HTML documents. HTML's head-level elements are not supported, nor are the frameset elements 'frameset', 'frame' and 'noframes'. + + * It cannot transform the non-standard 'embed' elements to the standard-compliant 'object' elements. Yet, it can allow 'embed' elements if permitted ('embed' is widely used and supported). Admins can certainly use the 'hook_tag' parameter (section:- #3.4.9) to deploy a custom embed-to-object converter function. + + * The only non-standard element that may be permitted is 'embed'; others like 'noembed' and 'nobr' cannot be permitted without modifying the htmLawed code. + + * It cannot handle input that has non-HTML code like 'SVG' and 'MathML'. One way around is to break the input into pieces and passing only those without non-HTML code to htmLawed. Another is described in section:- #3.9. A third way may be to some how take advantage of the '$config["and_mark"]' parameter (see section:- #3.2). + + * By default, htmLawed won't check many attribute values for standard compliance. E.g., 'width="20m"' with the dimension in non-standard 'm' is let through. Implementing universal and strict attribute value checks can make htmLawed slow and resource-intensive. Admins should look at the 'hook_tag' parameter (section:- #3.4.9) or '$spec' to enforce finer checks. + + * The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specs. Only a few of the proprietary attributes are supported. + + * Except for contained URLs and dynamic expressions (also optional), htmLawed does not check CSS style property values. 
Admins should look at using the 'hook_tag' parameter (section:- #3.4.9) or '$spec' for finer checks. Perhaps the best option is to disallow 'style' but allow 'class' attributes with the right 'oneof' or 'match' values for 'class', and have the various class style properties in '.css' CSS stylesheet files. + + * htmLawed does not parse emoticons, decode `BBcode`, or `wikify`, auto-converting text to proper HTML. Similarly, it won't convert line-breaks to 'br' elements. Such functions are beyond its purview. Admins should use other code to pre- or post-process the input for such purposes. + + * htmLawed cannot be used to have links force-opened in new windows (by auto-adding appropriate 'target' and 'onclick' attributes to 'a'). Admins should look at Javascript-based DOM-modifying solutions for this. Admins may also be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). + + * Nesting-based checks are not possible. E.g., one cannot disallow 'p' elements specifically inside 'td' while permitting it elsewhere. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). + + * Except for optionally converting absolute or relative URLs to the other type, htmLawed will not alter URLs (e.g., to change the value of query strings or to convert 'http' to 'https'. Having absolute URLs may be a standard-requirement, e.g., when HTML is embedded in email messages, whereas altering URLs for other purposes is beyond htmLawed's goals. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). + + * Pairs of opening and closing tags that do not enclose any content (like '<em></em>') are not removed. This may be against the standard specs for certain elements (e.g., 'table'). However, presence of such standard-incompliant code will not break the display or layout of content. 
Admins can also use simple regex-based code to filter out such code. + + * htmLawed does not check for certain element orderings described in the standard specs (e.g., in a 'table', 'tbody' is allowed before 'tfoot'). Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). + + * htmLawed does not check the number of nested elements. E.g., it will allow two 'caption' elements in a 'table' element, illegal as per the specs. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). + + * htmLawed might convert certain entities to actual characters and remove backslashes and CSS comment-markers ('/*') in 'style' attribute values in order to detect malicious HTML like crafted IE-specific dynamic expressions like 'expression...'. If this is too harsh, admins can allow CSS expressions through htmLawed core but then use a custom function through the 'hook_tag' parameter (section:- #3.4.9) to more specifically identify CSS expressions in the 'style' attribute values. Also, using '$config["style_pass"]', it is possible to have htmLawed pass 'style' attribute values without even looking at them (section:- #3.4.8). + + * htmLawed does not correct certain possible attribute-based security vulnerabilities (e.g., '<a href="http://x%22+style=%22background-image:xss">x</a>'). These arise when browsers mis-identify markup in `escaped` text, defeating the very purpose of escaping text (a bad browser will read the given example as '<a href="http://x" style="background-image:xss">x</a>'). + + * Because of poor Unicode support in PHP, htmLawed does not remove the `high value` HTML-invalid characters with multi-byte code-points. Such characters however are extremely unlikely to be in the input. (see section:- #3.1). 
+ + * Like any script using PHP's PCRE regex functions, PHP setup-specific low PCRE limit values can cause htmLawed to at least partially fail with very long input texts. + + +-- 2.9 Examples ---------------------------------------------------o + + + *1.* A blog administrator wants to allow only 'a', 'em', 'strike', 'strong' and 'u' in comments, but needs 'strike' and 'u' transformed to 'span' for better XHTML 1-strict compliance, and, he wants the 'a' links to be to 'http' or 'https' resources: + + $processed = htmLawed($in, array('elements'=>'a, em, strike, strong, u', 'make_tag_strict'=>1, 'safe'=>1, 'schemes'=>'*:http, https'), 'a=href'); + + *2.* An author uses a custom-made web application to load content on his web-site. He is the only one using that application and the content he generates has all types of HTML, including scripts. The web application uses htmLawed primarily as a tool to correct errors that creep in while writing HTML and to take care of the occasional `bad` characters in copy-paste text introduced by Microsoft Office. The web application provides a preview before submitted input is added to the content. For the previewing process, htmLawed is set up as follows: + + $processed = htmLawed($in, array('css_expression'=>1, 'keep_bad'=>1, 'make_tag_strict'=>1, 'schemes'=>'*:*', 'valid_xhtml'=>1)); + + For the final submission process, 'keep_bad' is set to '6'. A value of '1' for the preview process allows the author to note and correct any HTML mistake without losing any of the typed text. 
+ + *3.* A data-miner is scraping information in a specific table of similar web-pages and is collating the data rows, and uses htmLawed to reduce unnecessary markup and white-spaces: + + $processed = htmLawed($in, array('elements'=>'tr, td', 'tidy'=>-1), 'tr, td ='); + + +== 3 Details =====================================================oo + + +-- 3.1 Invalid/dangerous characters -------------------------------- + + + Valid characters (more correctly, their code-points) in HTML or XML are, hexadecimally, '9', 'a', 'd', '20' to 'd7ff', and 'e000' to '10ffff', except 'fffe' and 'ffff' (decimally, '9', '10', '13', '32' to '55295', and '57344' to '1114111', except '65534' and '65535'). htmLawed removes the invalid characters '0' to '8', 'b', 'c', and 'e' to '1f'. + + Because of PHP's poor native support for multi-byte characters, htmLawed cannot check for the remaining invalid code-points. However, for various reasons, it is very unlikely for any of those characters to be in the input. + + Characters that are discouraged (see section:- #5.1) but not invalid are not removed by htmLawed. + + It (function 'hl_tag()') also replaces the potentially dangerous (in some Mozilla [Firefox] and Opera browsers) soft-hyphen character (code-point, hexadecimally, 'ad', or decimally, '173') in attribute values with spaces. Where required, the characters '<', '>', '&', and '"' are converted to entities. + + With '$config["clean_ms_char"]' set as '1' or '2', many of the discouraged characters (decimal code-points '127' to '159' except '133') that many Microsoft applications incorrectly use (as per the 'Windows 1252' ['Cp-1252'] or a similar encoding system), and the character for decimal code-point '133', are converted to appropriate decimal numerical entities (or removed for a few cases)-- see appendix in section:- #5.4. This can help avoid some display issues arising from copying-pasting of content. 
+ + With '$config["clean_ms_char"]' set as '2', characters for the hexadecimal code-points '82', '91', and '92' (for special single-quotes), and '84', '93', and '94' (for special double-quotes) are converted to ordinary single and double quotes respectively and not to entities. + + The character values are replaced with entities/characters and not character values referred to by the entities/characters to keep this task independent of the character-encoding of input text. + + The '$config["clean_ms_char"]' parameter need not be used if authors do not copy-paste Microsoft-created text or if the input text is not believed to use the 'Windows 1252' or a similar encoding. Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up. + + +-- 3.2 Character references/entities ------------------------------o + + + Valid character entities take the form '&*;' where '*' is '#x' followed by a hexadecimal number (hexadecimal numeric entity; like '&#xa0;' for non-breaking space), or alphanumeric like 'gt' (external or named entity; like '&nbsp;' for non-breaking space), or '#' followed by a number (decimal numeric entity; like '&#160;' for non-breaking space). Character entities referring to the soft-hyphen character (the '&shy;' or '\xad' character; hexadecimal code-point 'ad' [decimal '173']) in attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers. 
+ + htmLawed (function 'hl_ent()'): + + * Neutralizes entities with multiple leading zeroes or missing semi-colons (potentially dangerous) + + * Lowercases the 'X' (for XML-compliance) and 'A-F' of hexadecimal numeric entities + + * Neutralizes entities referring to characters that are HTML-invalid (see section:- #3.1) + + * Neutralizes entities referring to characters that are HTML-discouraged (code-points, hexadecimally, '7f' to '84', '86' to '9f', and 'fdd0' to 'fddf', or decimally, '127' to '132', '134' to '159', and '64976' to '64991'). Entities referring to the remaining discouraged characters (see section:- #5.1 for a full list) are let through. + + * Neutralizes named entities that are not in the specs. + + * Optionally converts valid HTML-specific named entities except '&gt;', '&lt;', '&quot;', and '&amp;' to decimal numeric ones (hexadecimal if $config["hexdec_entity"] is '2') for generic XML-compliance. For this, '$config["named_entity"]' should be '0'. + + * Optionally converts hexadecimal numeric entities to the more widely supported decimal ones. For this, '$config["hexdec_entity"]' should be '0'. + + * Optionally converts decimal numeric entities to the hexadecimal ones. For this, '$config["hexdec_entity"]' should be '2'. + + `Neutralization` refers to the `entitification` of '&' to '&amp;'. + + *Note*: htmLawed does not convert entities to the actual characters represented by them; one can pass the htmLawed output through PHP's 'html_entity_decode' function:- http://www.php.net/html_entity_decode for that. + + *Note*: If '$config["and_mark"]' is set, and set to a value other than '0', then the '&' characters in the original input are replaced with the control character for the hexadecimal code-point '6' ('\x06'; '&' characters introduced by htmLawed, e.g., after converting '<' to '&lt;', are not affected). 
This allows one to distinguish, say, an '&gt;' introduced by htmLawed and an '&gt;' put in by the input writer, and can be helpful in further processing of the htmLawed-processed text (e.g., to identify the character sequence 'o(&gt;&lt;)o' to generate an emoticon image). When this feature is active, admins should ensure that the htmLawed output is not directly used in web pages or XML documents as the presence of the '\x06' can break documents. Before use in such documents, and preferably before any storage, any remaining '\x06' should be changed back to '&', e.g., with: + + $final = str_replace("\x06", '&', $prelim); + + Also, see section:- #3.9. + + +-- 3.3 HTML elements ----------------------------------------------o + + + htmLawed can be configured to allow only certain HTML elements (tags) in the input. Disallowed elements (just tag-content, and not element-content), based on '$config["keep_bad"]', are either `neutralized` (converted to plain text by entitification of '<' and '>') or removed. + + E.g., with only 'em' permitted: + + Input: + + <em>My</em> website is <a href="http://a.com>a.com</a>. + + Output, with '$config["keep_bad"] = 0': + + <em>My</em> website is a.com. + + Output, with '$config["keep_bad"]' not '0': + + <em>My</em> website is &lt;a href=""&gt;a.com&lt;/a&gt;. + + See section:- #3.3.3 for differences between the various non-zero '$config["keep_bad"]' values.
+ + htmLawed by default permits these 86 elements: + + a, abbr, acronym, address, applet, area, b, bdo, big, blockquote, br, button, caption, center, cite, code, col, colgroup, dd, del, dfn, dir, div, dl, dt, em, embed, fieldset, font, form, h1, h2, h3, h4, h5, h6, hr, i, iframe, img, input, ins, isindex, kbd, label, legend, li, map, menu, noscript, object, ol, optgroup, option, p, param, pre, q, rb, rbc, rp, rt, rtc, ruby, s, samp, script, select, small, span, strike, strong, sub, sup, table, tbody, td, textarea, tfoot, th, thead, tr, tt, u, ul, var + + Except for 'embed' (included because of its wide-spread use) and the Ruby elements ('rb', 'rbc', 'rp', 'rt', 'rtc', 'ruby'; part of XHTML 1.1), these are all the elements in the HTML 4/XHTML 1 specs. Strict-specific specs. exclude 'center', 'dir', 'font', 'isindex', 'menu', 's', 'strike', and 'u'. + + With '$config["safe"] = 1', the default set will exclude 'applet', 'embed', 'iframe', 'object' and 'script'; see section:- #3.6. + + When '$config["elements"]', which specifies allowed elements, is `properly` defined, and neither empty nor set to '0' or '*', the default set is not used. To have elements added to or removed from the default set, a '+/-' notation is used. E.g., '*-script-object' implies that only 'script' and 'object' are disallowed, whereas '*+noembed' means that 'noembed' is also allowed. Elements can also be specified as comma separated names. E.g., 'a, b, i' means only 'a', 'b' and 'i' are permitted. In this notation, '*', '+' and '-' have no significance and can actually cause a mis-reading.
+ + Some more examples of '$config["elements"]' values indicating permitted elements (note that empty spaces are liberally allowed for clarity): + + * 'a, blockquote, code, em, strong' -- only 'a', 'blockquote', 'code', 'em', and 'strong' + * '*-script' -- all excluding 'script' + * '* -center -dir -font -isindex -menu -s -strike -u' -- only XHTML-Strict elements + * '*+noembed-script' -- all including 'noembed' excluding 'script' + + Some mis-usages (and the resulting permitted elements) that can be avoided: + + * '-*' -- none; instead of htmLawed, one might just use, e.g., the 'htmlspecialchars()' PHP function + * '*, -script' -- all except 'script'; admin probably meant '*-script' + * '-*, a, em, strong' -- all; admin probably meant 'a, em, strong' + * '*' -- all; admin need not have set 'elements' + * '*-form+form' -- all; a '+' will always over-ride any '-' + * '*, noembed' -- only 'noembed'; admin probably meant '*+noembed' + * 'a, +b, i' -- only 'a' and 'i'; admin probably meant 'a, b, i' + + Basically, when using the '+/-' notation, commas (',') should not be used, and vice versa, and '*' should be used with the former but not the latter. + + *Note*: Even if an element that is not in the default set is allowed through '$config["elements"]', like 'noembed' in the last example, it will eventually be removed during tag balancing unless such balancing is turned off ('$config["balance"]' set to '0'). Currently, the only way around this, which actually is simple, is to edit the various arrays in the function 'hl_bal()' to accommodate the element and its nesting properties. + + *A possibly second way to specify allowed elements* is to set '$config["parent"]' to an element name that supposedly will hold the input, and to set '$config["balance"]' to '1'. During tag balancing (see section:- #3.3.3), all elements that cannot legally nest inside the parent element will be removed. 
The parent element is auto-reset to 'div' if '$config["parent"]' is empty, 'body', or an element not in htmLawed's default set of 86 elements. + + `Tag transformation` is possible for improving XHTML-Strict compliance -- most of the deprecated elements are removed or converted to valid XHTML-Strict ones; see section:- #3.3.2. + + +.. 3.3.1 Handling of comments and CDATA sections ................... + + + 'CDATA' sections have the format '<![CDATA[...anything but not "]]>"...]]>', and HTML comments, '<!--...anything but not "-->"... -->'. Neither HTML comments nor 'CDATA' sections can reside inside tags. HTML comments can exist anywhere else, but 'CDATA' sections can exist only where plain text is allowed (e.g., immediately inside 'td' element content but not immediately inside 'tr' element content). + + htmLawed (function 'hl_cmtcd()') handles HTML comments or 'CDATA' sections depending on the values of '$config["comment"]' or '$config["cdata"]'. If '0', such markup is not looked for and the text is processed like plain text. If '1', it is removed completely. If '2', it is preserved but any '<', '>' and '&' inside are changed to entities. If '3', they are left as such. + + Note that for the last two cases, HTML comments and 'CDATA' sections will always be removed from tag content (function 'hl_tag()'). 
+ + Examples: + + Input: + <!-- home link --><a href="home.htm"><![CDATA[x=&y]]>Home</a> + Output ('$config["comment"] = 0, $config["cdata"] = 2'): + <-- home link --><a href="home.htm"><![CDATA[x=&y]]>Home</a> + Output ('$config["comment"] = 1, $config["cdata"] = 2'): + <a href="home.htm"><![CDATA[x=&y]]>Home</a> + Output ('$config["comment"] = 2, $config["cdata"] = 2'): + <!-- home link --><a href="home.htm"><![CDATA[x=&y]]>Home</a> + Output ('$config["comment"] = 2, $config["cdata"] = 1'): + <!-- home link --><a href="home.htm">Home</a> + Output ('$config["comment"] = 3, $config["cdata"] = 3'): + <!-- home link --><a href="home.htm"><![CDATA[x=&y]]>Home</a> + + For standard-compliance, comments are given the form '<!--comment -->', and any '--' in the content is made '-'. + + When '$config["safe"] = 1', CDATA sections and comments are considered plain text unless '$config["comment"]' or '$config["cdata"]' is explicitly specified; see section:- #3.6. + + +.. 3.3.2 Tag-transformation for better XHTML-Strict ................o + + + If '$config["make_tag_strict"]' is set and not '0', following non-XHTML-Strict elements (and attributes), even if admin-permitted, are mutated as indicated (element content remains intact; function 'hl_tag2()'): + + * applet - (based on '$config["make_tag_strict"]', unchanged ('1') or removed ('2')) + * center - 'div style="text-align: center;"' + * dir - 'ul' + * embed - (based on '$config["make_tag_strict"]', unchanged ('1') or removed ('2')) + * font (face, size, color) - 'span style="font-family: ; font-size: ; color: ;"' (size transformation reference:- http://style.cleverchimp.com/font_size_intervals/altintervals.html) + * isindex - (based on '$config["make_tag_strict"]', unchanged ('1') or removed ('2')) + * menu - 'ul' + * s - 'span style="text-decoration: line-through;"' + * strike - 'span style="text-decoration: line-through;"' + * u - 'span style="text-decoration: underline;"' + + For an element with a pre-existing 'style' 
attribute value, the extra style properties are appended. + + Example input: + + <center> + The PHP <s>software</s> script used for this <strike>web-page</strike> web-page is <font style="font-weight: bold " face=arial size='+3' color = "red ">htmLawedTest.php</font>, from <u style= 'color:green'>PHP Labware</u>. + </center> + + The output: + + <div style="text-align: center;"> + The PHP <span style="text-decoration: line-through;">software</span> script used for this <span style="text-decoration: line-through;">web-page</span> web-page is <span style="font-weight: bold; font-family: arial; color: red; font-size: 200%;">htmLawedTest.php</span>, from <span style="color:green; text-decoration: underline;">PHP Labware</span>. + </div> + + +-- 3.3.3 Tag balancing and proper nesting -------------------------o + + + If '$config["balance"]' is set to '1', htmLawed (function 'hl_bal()') checks and corrects the input to have properly balanced tags and legal element content (i.e., any element nesting should be valid, and plain text may be present only in the content of elements that allow them). 
+ + Depending on the value of '$config["keep_bad"]' (see section:- #2.2 and section:- #3.3), illegal content may be removed or neutralized to plain text by converting < and > to entities: + + '0' - remove; this option is available only to maintain Kses-compatibility and should not be used otherwise (see section:- #2.6) + '1' - neutralize tags and keep element content + '2' - remove tags but keep element content + '3' and '4' - like '1' and '2', but keep element content only if text ('pcdata') is valid in parent element as per specs + '5' and '6' - like '3' and '4', but line-breaks, tabs and spaces are left + + Example input (disallowing the 'p' element): + + <*> Pseudo-tags <*> + <xml>Non-HTML tag xml</xml> + <p> + Disallowed tag p + </p> + <ul>Bad<li>OK</li></ul> + + The output with '$config["keep_bad"] = 1': + + &lt;*&gt; Pseudo-tags &lt;*&gt; + &lt;xml&gt;Non-HTML tag xml&lt;/xml&gt; + &lt;p&gt; + Disallowed tag p + &lt;/p&gt; + <ul>Bad<li>OK</li></ul> + + The output with '$config["keep_bad"] = 3': + + &lt;*&gt; Pseudo-tags &lt;*&gt; + &lt;xml&gt;Non-HTML tag xml&lt;/xml&gt; + &lt;p&gt; + Disallowed tag p + &lt;/p&gt; + <ul><li>OK</li></ul> + + The output with '$config["keep_bad"] = 6': + + &lt;*&gt; Pseudo-tags &lt;*&gt; + Non-HTML tag xml + + Disallowed tag p + + <ul><li>OK</li></ul> + + An option like '1' is useful, e.g., when a writer previews his submission, whereas one like '3' is useful before content is finalized and made available to all. + + *Note:* In the example above, unlike '<*>', '<xml>' gets considered as a tag (even though there is no HTML element named 'xml'). In general, text matching the regular expression pattern '<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>' is considered a tag (phrase enclosed by the angled brackets '<' and '>', and starting [with an optional slash preceding] with an alphanumeric word that starts with a letter...). + + Nesting/content rules for each of the 86 elements in htmLawed's default set (see section:- #3.3) are defined in function 'hl_bal()'.
This means that if a non-standard element besides 'embed' is being permitted through '$config["elements"]', the element's tag content will end up getting removed if '$config["balance"]' is set to '1'. + + Plain text and/or certain elements nested inside 'blockquote', 'form', 'map' and 'noscript' need to be in block-level elements. This point is often missed during manual writing of HTML code. htmLawed attempts to address this during balancing. E.g., if the parent container is set as 'form', the input 'B:<input type="text" value="b" />C:<input type="text" value="c" />' is converted to '<div>B:<input type="text" value="b" />C:<input type="text" value="c" /></div>'. + + +-- 3.3.4 Elements requiring child elements ------------------------o + + + As per specs, the following elements require legal child elements nested inside them: + + blockquote, dir, dl, form, map, menu, noscript, ol, optgroup, rbc, rtc, ruby, select, table, tbody, tfoot, thead, tr, ul + + In some cases, the specs stipulate the number and/or the ordering of the child elements. A 'table' can have 0 or 1 'caption', 'tbody', 'tfoot', and 'thead', but they must be in this order: 'caption', 'thead', 'tfoot', 'tbody'. + + htmLawed currently does not check for conformance to these rules. Note that any non-compliance in this regard will not introduce security vulnerabilities, crash browser applications, or affect the rendering of web-pages. + + +-- 3.3.5 Beautify or compact HTML ---------------------------------o + + + By default, htmLawed will neither `beautify` HTML code by formatting it with indentations, etc., nor will it make it compact by removing un-needed white-space.(It does always properly white-space tag content.) + + As per the HTML standards, spaces, tabs and line-breaks in web-pages (except those inside 'pre' elements) are all considered equivalent, and referred to as `white-spaces`. 
Browser applications are supposed to consider contiguous white-spaces as just a single space, and to disregard white-spaces trailing opening tags or preceding closing tags. This white-space `normalization` allows the use of text/code beautifully formatted with indentations and line-spacings for readability. Such `pretty` HTML can, however, increase the size of web-pages, or make the extraction or scraping of plain text cumbersome. + + With the '$config' parameter 'tidy', htmLawed can be used to beautify or compact the input text. Input with just plain text and no HTML markup is also subject to this. Besides 'pre', the 'script' and 'textarea' elements, CDATA sections, and HTML comments are not subjected to the tidying process. + + To `compact`, use '$config["tidy"] = -1'; single instances or runs of white-spaces are replaced with a single space, and white-spaces trailing and leading open and closing tags, respectively, are removed. + + To `beautify`, '$config["tidy"]' is set as '1', or for customized tidying, as a string like '2s2n'. The 's' or 't' character specifies the use of spaces or tabs for indentation. The first and third characters, any of the digits 0-9, specify the number of spaces or tabs per indentation, and any parental lead spacing (extra indenting of the whole block of input text). The 'r' and 'n' characters are used to specify line-break characters: 'n' for '\n' (Unix/Mac OS X line-breaks), 'rn' or 'nr' for '\r\n' (Windows/DOS line-breaks), or 'r' for '\r'. + + The '$config["tidy"]' value of '1' is equivalent to '2s0n'. Other '$config["tidy"]' values are read loosely: a value of '4' is equivalent to '4s0n'; 't2', to '1t2n'; 's', to '2s0n'; '2TR', to '2t0r'; 'T1', to '1t1n'; 'nr3', to '3s0nr', and so on. Except in the indentations and line-spacings, runs of white-spaces are replaced with a single space during beautification. + + Input formatting using '$config["tidy"]' is not recommended when input text has mixed markup (like HTML + PHP). 
+ + +-- 3.4 Attributes ------------------------------------------------oo + + + htmLawed will only permit attributes described in the HTML specs (including deprecated ones). It also permits some attributes for use with the 'embed' element (the non-standard 'embed' element is supported in htmLawed because of its widespread use), and the 'xml:space' attribute (valid only in XHTML 1.1). A list of such 111 attributes and the elements they are allowed in is in section:- #5.2. + + When '$config["deny_attribute"]' is not set, or set to '0', or empty ('""'), all the 111 attributes are permitted. Otherwise, '$config["deny_attribute"]' can be set as a list of comma-separated names of the denied attributes. 'on*' can be used to refer to the group of potentially dangerous, script-accepting attributes: 'onblur', 'onchange', 'onclick', 'ondblclick', 'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover', 'onmouseup', 'onreset', 'onselect' and 'onsubmit'. + + Note that attributes specified in '$config["deny_attribute"]' are denied globally, for all elements. To deny attributes for only specific elements, '$spec' (see section:- #2.3) can be used. '$spec' can also be used to element-specifically permit an attribute otherwise denied through '$config["deny_attribute"]'. + + With '$config["safe"] = 1' (section:- #3.6), the 'on*' attributes are automatically disallowed. + + *Note*: To deny all but a few attributes globally, a simpler way to specify '$config["deny_attribute"]' would be to use the notation '* -attribute1 -attribute2 ...'. Thus, a value of '* -title -href' implies that except 'href' and 'title' (where allowed as per standards) all other attributes are to be removed. With this notation, the value for the parameter 'safe' (section:- #3.6) will have no effect on 'deny_attribute'.
+ + htmLawed (function 'hl_tag()') also: + + * Lower-cases attribute names + * Removes duplicate attributes (last one stays) + * Gives attributes the form 'name="value"' and single-spaces them, removing unnecessary white-spacing + * Provides `required` attributes (see section:- #3.4.1) + * Double-quotes values and escapes any '"' inside them + * Replaces the possibly dangerous soft-hyphen characters (hexadecimal code-point 'ad') in the values with spaces + * Allows custom function to additionally filter/modify attribute values (see section:- #3.4.9) + + +.. 3.4.1 Auto-addition of XHTML-required attributes ................ + + + If indicated attributes for the following elements are found missing, htmLawed (function 'hl_tag()') will add them (with values same as attribute names unless indicated otherwise below): + + * area - alt ('area') + * area, img - src, alt ('image') + * bdo - dir ('ltr') + * form - action + * map - name + * optgroup - label + * param - name + * script - type ('text/javascript') + * textarea - rows ('10'), cols ('50') + + Additionally, with '$config["xml:lang"]' set to '1' or '2', if the 'lang' but not the 'xml:lang' attribute is declared, then the latter is added too, with a value copied from that of 'lang'. This is for better standard-compliance. With '$config["xml:lang"]' set to '2', the 'lang' attribute is removed (XHTML 1.1 specs). + + Note that the 'name' attribute for 'map', invalid in XHTML 1.1, is also transformed if required -- see section:- #3.4.6. + + +.. 3.4.2 Duplicate/invalid 'id' values ............................o + + + If '$config["unique_ids"]' is '1', htmLawed (function 'hl_tag()') removes 'id' attributes with values that are not XHTML-compliant (must begin with a letter and can contain letters, digits, ':', '.', '-' and '_') or duplicate. If '$config["unique_ids"]' is a word, any duplicate but otherwise valid value will be appropriately prefixed with the word to ensure its uniqueness. 
The word should begin with a letter and should contain only letters, numbers, ':', '.', '_' and '-'. + + Even if multiple inputs need to be filtered (through multiple calls to htmLawed), htmLawed ensures uniqueness of 'id' values as it uses a global variable ('$GLOBALS["hl_Ids"]' array). Further, an admin can restrict the use of certain 'id' values by presetting this variable before htmLawed is called into use. E.g.: + + $GLOBALS['hl_Ids'] = array('top'=>1, 'bottom'=>1, 'myform'=>1); // id values not allowed in input + $processed = htmLawed($text); // filter input + + +.. 3.4.3 URL schemes (protocols) and scripts in attribute values ............o + + + htmLawed edits attributes that take URLs as values if they are found to contain un-permitted schemes. E.g., if the 'afp' scheme is not permitted, then '<a href="afp://domain.org">' becomes '<a href="denied:afp://domain.org">', and if Javascript is not permitted '<a onclick="javascript:xss();">' becomes '<a onclick="denied:javascript:xss();">'. + + By default htmLawed permits these schemes in URLs for the 'href' attribute: + + aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet + + Also, only 'file', 'http' and 'https' are permitted in attributes whose names start with 'o' (like 'onmouseover'), and in these attributes that accept URLs: + + action, cite, classid, codebase, data, href, longdesc, model, pluginspage, pluginurl, src, style, usemap + + These default sets are used when '$config["schemes"]' is not set (see section:- #2.2). To over-ride the defaults, '$config["schemes"]' is defined as a string of semi-colon-separated sub-strings of type 'attribute: comma-separated schemes'. E.g., 'href: mailto, http, https; onclick: javascript; src: http, https'. For unspecified attributes, 'file', 'http' and 'https' are permitted. This can be changed by passing schemes for '*' in '$config["schemes"]'. E.g., 'href: mailto, http, https; *: https, https'. 
+ + '*' can be put in the list of schemes to permit all protocols. E.g., 'style: *; img: http, https' results in protocols not being checked in 'style' attribute values. However, in such cases, any relative-to-absolute URL conversion, or vice versa, (section:- #3.4.4) is not done. + + Thus, `to allow Javascript`, one can set '$config["schemes"]' as 'href: mailto, http, https; *: http, https, javascript', or 'href: mailto, http, https, javascript; *: http, https, javascript', or '*: *', and so on. + + As a side-note, one may find 'style: *' useful as URLs in 'style' attributes can be specified in a variety of ways, and the patterns that htmLawed uses to identify URLs may mistakenly identify non-URL text. + + *Note*: If URL-accepting attributes other than those listed above are being allowed, then the scheme will not be checked unless the attribute name contains the string 'src' (e.g., 'dynsrc') or starts with 'o' (e.g., 'onbeforecopy'). + + With '$config["safe"] = 1', all URLs are disallowed in the 'style' attribute values. + + +.. 3.4.4 Absolute & relative URLs in attribute values .............o + + + htmLawed can make absolute URLs in attributes like 'href' relative ('$config["abs_url"]' is '-1'), and vice versa ('$config["abs_url"]' is '1'). URLs in scripts are not considered for this, and so are URLs like '#section_6' (fragment), '?name=Tim#show' (starting with query string), and ';var=1?name=Tim#show' (starting with parameters). Further, this requires that '$config["base_url"]' be set properly, with the '://' and a trailing slash ('/'), with no query string, etc. E.g., 'file:///D:/page/', 'https://abc.com/x/y/', or 'http://localhost/demo/' are okay, but 'file:///D:/page/?help=1', 'abc.com/x/y/' and 'http://localhost/demo/index.htm' are not. + + For making absolute URLs relative, only those URLs that have the '$config["base_url"]' string at the beginning are converted. 
E.g., with '$config["base_url"] = "https://abc.com/x/y/"', 'https://abc.com/x/y/a.gif' and 'https://abc.com/x/y/z/b.gif' become 'a.gif' and 'z/b.gif' respectively, while 'https://abc.com/x/c.gif' is not changed. + + When making relative URLs absolute, only values for scheme, network location (host-name) and path values in the base URL are inherited. See section:- #5.5 for more about the URL specification as per RFC 1808:- http://www.ietf.org/rfc/rfc1808.txt. + + +.. 3.4.5 Lower-cased, standard attribute values ....................o + + + Optionally, for standard-compliance, htmLawed (function 'hl_tag()') lower-cases standard attribute values to give, e.g., 'input type="password"' instead of 'input type="Password"', if '$config["lc_std_val"]' is '1'. Attribute values matching those listed below for any of the elements (plus those for the 'type' attribute of 'button' or 'input') are lower-cased: + + all, baseline, bottom, button, center, char, checkbox, circle, col, colgroup, cols, data, default, file, get, groups, hidden, image, justify, left, ltr, middle, none, object, password, poly, post, preserve, radio, rect, ref, reset, right, row, rowgroup, rows, rtl, submit, text, top + + a, area, bdo, button, col, form, img, input, object, option, optgroup, param, script, select, table, td, tfoot, th, thead, tr, xml:space + + The following `empty` (`minimized`) attributes are always assigned lower-cased values (same as the names): + + checked, compact, declare, defer, disabled, ismap, multiple, nohref, noresize, noshade, nowrap, readonly, selected + + +.. 3.4.6 Transformation of deprecated attributes ..................o + + + If '$config["no_deprecated_attr"]' is not '0', then deprecated attributes (see appendix in section:- #5.2) are removed and, in most cases, their values are transformed to CSS style properties and added to the 'style' attributes (function 'hl_tag()').
Except for 'bordercolor' for 'table', 'tr' and 'td', the scores of proprietary attributes that were never part of any cross-browser standard are not supported. + + *Note*: The attribute 'target' for 'a' is allowed even though it is not in XHTML 1.0 specs. This is because of the attribute's wide-spread use and browser-support, and because the attribute is valid in XHTML 1.1 onwards. + + * align - for 'img' with value of 'left' or 'right', becomes, e.g., 'float: left'; for 'div' and 'table' with value 'center', becomes 'margin: auto'; all others become, e.g., 'text-align: right' + + * bgcolor - E.g., 'bgcolor="#ffffff"' becomes 'background-color: #ffffff' + * border - E.g., 'border="1"' becomes 'border: 1px' + * bordercolor - E.g., 'bordercolor=#999999' becomes 'border-color: #999999;' + * compact - 'font-size: 85%' + * clear - E.g., 'clear="all"' becomes 'clear: both' + + * height - E.g., 'height= "10"' becomes 'height: 10px' and 'height="*"' becomes 'height: auto' + + * hspace - E.g., 'hspace="10"' becomes 'margin-left: 10px; margin-right: 10px' + * language - 'language="VBScript"' becomes 'type="text/vbscript"' + * name - E.g., 'name="xx"' becomes 'id="xx"' + * noshade - 'border-style: none; border: 0; background-color: gray; color: gray' + * nowrap - 'white-space: nowrap' + * size - E.g., 'size="10"' becomes 'height: 10px' + * start - removed + * type - E.g., 'type="i"' becomes 'list-style-type: lower-roman' + * value - removed + * vspace - E.g., 'vspace="10"' becomes 'margin-top: 10px; margin-bottom: 10px' + * width - like 'height' + + Example input: + + <img src="j.gif" alt="image" name="dad's" /><img src="k.gif" alt="image" id="dad_off" name="dad" /> + <br clear="left" /> + <hr noshade size="1" /> + <img name="img" src="i.gif" align="left" alt="image" hspace="10" vspace="10" width="10em" height="20" border="1" style="padding:5px;" /> + <table width="50em" align="center" bgcolor="red"> + <tr> + <td width="20%"> + <div align="center"> + <h3
align="right">Section</h3> + <p align="right">Para</p> + <ol type="a" start="e"><li value="x">First item</li></ol> + </div> + </td> + <td width="*"> + <ol type="1"><li>First item</li></ol> + </td> + </tr> + </table> + <br clear="all" /> + + And the output with '$config["no_deprecated_attr"] = 1': + + <img src="j.gif" alt="image" /><img src="k.gif" alt="image" id="dad_off" /> + <br style="clear: left;" /> + <hr style="border-style: none; border: 0; background-color: gray; color: gray; size: 1px;" /> + <img src="i.gif" alt="image" width="10em" height="20" style="padding:5px; float: left; margin-left: 10px; margin-right: 10px; margin-top: 10px; margin-bottom: 10px; border: 1px;" id="img" /> + <table width="50em" style="margin: auto; background-color: red;"> + <tr> + <td style="width: 20%;"> + <div style="margin: auto;"> + <h3 style="text-align: right;">Section</h3> + <p style="text-align: right;">Para</p> + <ol style="list-style-type: lower-latin;"><li>First item</li></ol> + </div> + </td> + <td style="width: auto;"> + <ol style="list-style-type: decimal;"><li>First item</li></ol> + </td> + </tr> + </table> + <br style="clear: both;" /> + + For 'lang', deprecated in XHTML 1.1, transformation is taken care of through '$config["xml:lang"]'; see section:- #3.4.1. + + The attribute 'name' is deprecated in 'form', 'iframe', and 'img', and is replaced with 'id' if an 'id' attribute doesn't exist and if the 'name' value is appropriate for 'id'. For such replacements for 'a' and 'map', for which the 'name' attribute is deprecated in XHTML 1.1, '$config["no_deprecated_attr"]' should be set to '2' (when set to '1', for these two elements, the 'name' attribute is retained). + + +-- 3.4.7 Anti-spam & 'href' ---------------------------------------o + + + htmLawed (function 'hl_tag()') can check the 'href' attribute values (link addresses) as an anti-spam (email or link spam) measure. 
+ + If '$config["anti_mail_spam"]' is not '0', the '@' of email addresses in 'href' values like 'mailto:a@b.com' is replaced with text specified by '$config["anti_mail_spam"]'. The text should be of a form that makes it clear to others that the address needs to be edited before a mail is sent; e.g., '<remove_this_antispam>@' (makes the example address 'a<remove_this_antispam>@b.com'). + + For regular links, one can choose to have a 'rel' attribute with 'nofollow' in its value (which tells some search engines to not follow a link). This can discourage link spammers. Additionally, or as an alternative, one can choose to empty the 'href' value altogether (disable the link). + + For use of these options, '$config["anti_link_spam"]' should be set as an array with values 'regex1' and 'regex2', both or one of which can be empty (like 'array("", "regex2")') to indicate that that option is not to be used. Otherwise, 'regex1' or 'regex2' should be PHP- and PCRE-compatible regular expression patterns: 'href' values will be matched against them and those matching the pattern will accordingly be treated. + + Note that the regular expressions should have `delimiters`, and be well-formed and preferably fast. Absolute efficiency/accuracy is often not needed. + + An example, to have a 'rel' attribute with 'nofollow' for all links, and to disable links that do not point to domains 'abc.com' and 'xyz.org': + + $config["anti_link_spam"] = array('`.`', '`://\W*(?!(abc\.com|xyz\.org))`'); + + +-- 3.4.8 Inline style properties ----------------------------------o + + + htmLawed can check URL schemes and dynamic expressions (to guard against Javascript, etc., script-based insecurities) in inline CSS style property values in the 'style' attributes. (CSS properties like 'background-image' that accept URLs in their values are noted in section:- #5.3.) 
Dynamic CSS expressions that allow scripting in the IE browser, and can be a vulnerability, can be removed from property values by setting '$config["css_expression"]' to '1' (default setting). + + *Note*: Because of the various ways of representing characters in attribute values (URL-escapement, entitification, etc.), htmLawed might alter the values of the 'style' attribute values, and may even falsely identify dynamic CSS expressions and URL schemes in them. If this is an important issue, checking of URLs and dynamic expressions can be turned off ('$config["schemes"] = "...style:*..."', see section:- #3.4.3, and '$config["css_expression"] = 0'). Alternately, admins can use their own custom function for finer handling of 'style' values through the 'hook_tag' parameter (see section:- #3.4.9). + + It is also possible to have htmLawed let through any 'style' value by setting '$config["style_pass"]' to '1'. + + As such, it is better to set up a CSS file with class declarations, disallow the 'style' attribute, set a '$spec' rule (see section:- #2.3) for 'class' for the 'oneof' or 'match' parameter, and ask writers to make use of the 'class' attribute. + + +-- 3.4.9 Hook function for tag content ----------------------------o + + + It is possible to utilize a custom hook function to alter the tag content htmLawed has finalized (i.e., after it has checked/corrected for required attributes, transformed attributes, lower-cased attribute names, etc.). + + When '$config' parameter 'hook_tag' is set to the name of a function, htmLawed (function 'hl_tag()') will pass on the element name, and the `finalized` attribute name-value pairs as array elements to the function. The function is expected to return the full opening tag string like '<element_name attribute_1_name="attribute_1_value"...>' (for empty elements like 'img' and 'input', the element-closing slash '/' should also be included). 
+ + This is a *powerful functionality* that can be exploited for various objectives: consolidate-and-convert inline 'style' attributes to 'class', convert 'embed' elements to 'object', permit only one 'caption' element in a 'table' element, disallow embedding of certain types of media, *inject HTML*, use CSSTidy:- http://csstidy.sourceforge.net to sanitize 'style' attribute values, etc. + + As an example, the custom hook code below can be used to force a series of specifically ordered 'id' attributes on all elements, and a specific 'param' element inside all 'object' elements: + + function my_tag_function($element, $attribute_array){ + static $id = 0; + static $empty_elements = array('area', 'br', 'col', 'embed', 'hr', 'img', 'input', 'isindex', 'param'); + // Remove any duplicate element + if($element == 'param' && isset($attribute_array['allowscriptaccess'])){ + return ''; + } + + $new_element = ''; + + // Force a serialized ID number + $attribute_array['id'] = 'my_'. $id; + ++$id; + + // Inject param for allowscriptaccess + if($element == 'object'){ + $new_element = '<param id="my_'. $id. '" allowscriptaccess="never" />'; + ++$id; + } + + $string = ''; + foreach($attribute_array as $k=>$v){ + $string .= " {$k}=\"{$v}\""; + } + return "<{$element}{$string}". (in_array($element, $empty_elements) ? ' /' : ''). '>'. $new_element; + } + + The 'hook_tag' parameter is different from the 'hook' parameter (section:- #3.7). + + Snippets of hook function code developed by others may be available on the htmLawed:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed website. + + +-- 3.5 Simple configuration directive for most valid XHTML -------oo + + + If '$config["valid_xhtml"]' is set to '1', some relevant '$config' parameters (indicated by '~' in section:- #2.2) are auto-adjusted. This allows one to pass the '$config' argument with a simpler value. If a value for a parameter auto-set through 'valid_xhtml' is still manually provided, then that value will over-ride the auto-set value.
+ + +-- 3.6 Simple configuration directive for most `safe` HTML --------o + + + `Safe` HTML refers to HTML that is restricted to reduce the vulnerability for scripting attacks (such as XSS) based on HTML code which otherwise may still be legal and compliant with the HTML standard specs. When elements such as 'script' and 'object', and attributes such as 'onmouseover' and 'style' are allowed in the input text, an input writer can introduce malevolent HTML code. Note that what is considered 'safe' depends on the nature of the web application and the trust-level accorded to its users. + + htmLawed allows an admin to use '$config["safe"]' to auto-adjust multiple '$config' parameters (such as 'elements' which declares the allowed element-set), which otherwise would have to be manually set. The relevant parameters are indicated by '"' in section:- #2.2. Thus, one can pass the '$config' argument with a simpler value. + + With the value of '1', htmLawed considers 'CDATA' sections and HTML comments as plain text, and prohibits the 'applet', 'embed', 'iframe', 'object' and 'script' elements, and the 'on*' attributes like 'onclick'. (There are '$config' parameters like 'css_expression' that are not affected by the value set for 'safe' but whose default values still contribute towards a more `safe` output.) Further, URLs with schemes (see section:- #3.4.3) are neutralized so that, e.g., 'style="moz-binding:url(http://danger)"' becomes 'style="moz-binding:url(denied:http://danger)"' while 'style="moz-binding:url(ok)"' remains intact. + + Admins, however, may still want to completely deny the 'style' attribute, e.g., with code like + + $processed = htmLawed($text, array('safe'=>1, 'deny_attribute'=>'style')); + + If a value for a parameter auto-set through 'safe' is still manually provided, then that value can over-ride the auto-set value. E.g., with '$config["safe"] = 1' and '$config["elements"] = "*+script"', 'script', but not 'applet', is allowed.
+ + A page illustrating the efficacy of htmLawed's anti-XSS abilities with 'safe' set to '1' against XSS vectors listed by RSnake:- http://ha.ckers.org/xss.html may be available here:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/rsnake/RSnakeXSSTest.htm. + + +-- 3.7 Using a hook function --------------------------------------o + + + If '$config["hook"]' is not set to '0', then htmLawed will allow preliminarily processed input to be altered by a hook function named by '$config["hook"]' before starting the main work (but after handling of characters, entities, HTML comments and 'CDATA' sections -- see code for function 'htmLawed()'). + + The hook function also allows one to alter the `finalized` values of '$config' and '$spec'. + + Note that the 'hook' parameter is different from the 'hook_tag' parameter (section:- #3.4.9). + + Snippets of hook function code developed by others may be available on the htmLawed:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed website. + + +-- 3.8 Obtaining `finalized` parameter values ---------------------o + + + htmLawed can assign the `finalized` '$config' and '$spec' values to a variable named by '$config["show_setting"]'. The variable, made global by htmLawed, is set as an array with three keys: 'config', with the '$config' value, 'spec', with the '$spec' value, and 'time', with a value that is the Unix time (the output of PHP's 'microtime()' function) when the value was assigned. Admins should use a PHP-compliant variable name (e.g., one that does not begin with a numerical digit) that does not conflict with variable names in their non-htmLawed code. + + The values, which are also post-hook function (if any), can be used to auto-generate information (on, e.g., the elements that are permitted) for input writers. 
+ + +-- 3.9 Retaining non-HTML tags in input with mixed markup ---------o + + + htmLawed does not remove certain characters that, though valid, are nevertheless discouraged in HTML documents as per the specs (see section:- #5.1). This can be utilized to deal with input that contains mixed markup. Input that may have HTML markup as well as some other markup that is based on the '<', '>' and '&' characters is considered to have mixed markup. The non-HTML markup can be rather proprietary (like markup for emoticons/smileys), or standard (like MathML or SVG). Or it can be programming code meant for execution/evaluation (such as embedded PHP code). + + To deal with such mixed markup, the input text can be pre-processed to hide the non-HTML markup by specifically replacing the '<', '>' and '&' characters with some of the HTML-discouraged characters (see section:- #3.1.2). Post-htmLawed processing, the replacements are reverted. + + An example (mixed HTML and PHP code in input text): + + $text = preg_replace('`<\?php(.+?)\?>`sm', "\x83?php\\1?\x84", $text); + $processed = htmLawed($text); + $processed = preg_replace('`\x83\?php(.+?)\?\x84`sm', '<?php$1?>', $processed); + + This code will not work if '$config["clean_ms_char"]' is set to '1' (section:- #3.1), in which case one should instead deploy a hook function (section:- #3.7). (htmLawed internally uses certain control characters, code-points '1' to '7', and use of these characters as markers in the logic of hook functions may cause issues.) + + Admins may also be able to use '$config["and_mark"]' to deal with such mixed markup; see section:- #3.2. + + +== 4 Other =======================================================oo + + +-- 4.1 Support ----------------------------------------------------- + + + A careful re-reading of this documentation will very likely answer your questions. + + Software updates and forum-based community-support may be found at http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed.
For general PHP issues (not htmLawed-specific), support may be found through internet searches and at http://php.net. + + +-- 4.2 Known issues -----------------------------------------------o + + + See section:- #2.8. + + Readers are advised to cross-check information given in this document. + + +-- 4.3 Change-log -------------------------------------------------o + + + (The release date for the downloadable package of files containing documentation, demo script, test-cases, etc., besides the 'htmLawed.php' file may be updated independently if the secondary files are revised.) + + `Version number - Release date. Notes` + + 1.1.8.1 - 16 July 2009. Minor code-change to fix a PHP error notice + + 1.1.8 - 23 April 2009. Parameter 'deny_attribute' now accepts the wild-card '*', making it simpler to specify its value when all but a few attributes are being denied; fixed a bug in interpreting '$spec' + + 1.1.7 - 11-12 March 2009. Attributes globally denied through 'deny_attribute' can be allowed element-specifically through '$spec'; '$config["style_pass"]' allowing letting through any 'style' value introduced; altered logic to catch certain types of dynamic crafted CSS expressions + + 1.1.3-6 - 28-31 January - 4 February 2009. Altered logic to catch certain types of dynamic crafted CSS expressions + + 1.1.2 - 22 January 2009. Fixed bug in parsing of 'font' attributes during tag transformation + + 1.1.1 - 27 September 2008. Better nesting correction when omitable closing tags are absent + + 1.1 - 29 June 2008. '$config["hook_tag"]' and '$config["format"]' introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug + + 1.0.9 - 11 June 2008. Fixed bug in invalid HTML code-point entity check + + 1.0.8 - 15 May 2008. 'bordercolor' attribute for 'table', 'td' and 'tr' + + 1.0.7 - 1 May 2008. 
Support for 'wmode' attribute for 'embed'; '$config["show_setting"]' introduced; improved '$config["elements"]' evaluation + + 1.0.6 - 20 April 2008. '$config["and_mark"]' introduced + + 1.0.5 - 12 March 2008. 'style' URL schemes essentially disallowed when $config 'safe' is on; improved regex for CSS expression search + + 1.0.4 - 10 March 2008. Improved corrections for 'blockquote', 'form', 'map' and 'noscript' + + 1.0.3 - 3 March 2008. Character entities for soft-hyphens are now replaced with spaces (instead of being removed); a bug allowing 'td' directly inside 'table' fixed; 'safe' '$config' parameter added + + 1.0.2 - 13 February 2008. Improved implementation of '$config["keep_bad"]' + + 1.0.1 - 7 November 2007. Improved regex for identifying URLs, protocols and dynamic expressions ('hl_tag()' and 'hl_prot()'); no error display with 'hl_regex()' + + 1.0 - 2 November 2007. First release + + +-- 4.4 Testing ----------------------------------------------------o + + + To test htmLawed using a form interface, a demo:- htmLawedTest.php web-page is provided with the htmLawed distribution ('htmLawed.php' and 'htmLawedTest.php' should be in the same directory on the web-server). A file with test-cases:- htmLawed_TESTCASE.txt is also provided. + + +-- 4.5 Upgrade, & old versions ------------------------------------o + + + Upgrading is as simple as replacing the previous version of 'htmLawed.php' (assuming it was not modified for customized features). As htmLawed output is almost always used in static documents, upgrading should not affect old, finalized content. + + Old versions of htmLawed may be available online. E.g., for version 1.0, check http://www.bioinformatics.org/phplabware/downloads/htmLawed1.zip, for 1.1.1, htmLawed111.zip, and for 1.1.10, htmLawed1110.zip. + + +-- 4.6 Comparison with 'HTMLPurifier' -----------------------------o + + + The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. 
Compared to htmLawed, it: + + * does not support PHP versions older than 5.0 (HTMLPurifier dropped PHP 4 support after version 2) + + * is 15-20 times bigger (scores of files totalling more than 750 kb) + + * consumes 10-15 times more RAM memory (just including the HTMLPurifier files without calling the filter requires a few MBs of memory) + + * is expectedly slower + + * does not allow admins to fully allow all valid HTML (because of incomplete HTML support, it always considers elements like 'script' illegal) + + * lacks many of the extra features of htmLawed (like entity conversions and code compaction/beautification) + + * has poor documentation + + However, HTMLPurifier has finer checks for character encodings and attribute values, and can log warnings and errors. Visit the HTMLPurifier website:- http://htmlpurifier.org for updated information. + + +-- 4.7 Use through application plug-ins/modules -------------------o + + + Plug-ins/modules to implement htmLawed in applications such as Drupal and DokuWiki may have been developed. Please check the application websites and the forum on the htmLawed site:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. + + +-- 4.8 Use in non-PHP applications --------------------------------o + + + Non-PHP applications written in Python, Ruby, etc., may be able to use htmLawed through system calls to the PHP engine. Such code may have been documented on the internet. Also check the forum on the htmLawed site:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. + + +-- 4.9 Donate -----------------------------------------------------o + + + A donation in any currency and amount to appreciate or support this software can be sent by PayPal:- http://paypal.com to this email address: drpatnaik at yahoo dot com. 
+ + +-- 4.10 Acknowledgements ------------------------------------------o + + + Bryan Blakey, Ulf Harnhammer, Gareth Heyes, Lukasz Pilorz, Shelley Powers, Edward Yang, and many anonymous users. + + Thank you! + + +== 5 Appendices ==================================================oo + + +-- 5.1 Characters discouraged in XHTML ----------------------------- + + + Characters represented by the following hexadecimal code-points are `not` invalid, even though some validators may issue messages stating otherwise. + + '7f' to '84', '86' to '9f', 'fdd0' to 'fddf', '1fffe', '1ffff', '2fffe', '2ffff', '3fffe', '3ffff', '4fffe', '4ffff', '5fffe', '5ffff', '6fffe', '6ffff', '7fffe', '7ffff', '8fffe', '8ffff', '9fffe', '9ffff', 'afffe', 'affff', 'bfffe', 'bffff', 'cfffe', 'cffff', 'dfffe', 'dffff', 'efffe', 'effff', 'ffffe', 'fffff', '10fffe' and '10ffff' + + +-- 5.2 Valid attribute-element combinations -----------------------o + + + Valid attribute-element combinations as per W3C specs. + + * includes deprecated attributes (marked '^'), attributes for the non-standard 'embed' element (marked '*'), and the proprietary 'bordercolor' (marked '~') + * only non-frameset, HTML body elements + * 'name' for 'a' and 'map', and 'lang' are invalid in XHTML 1.1 + * 'target' is valid for 'a' in XHTML 1.1 and higher + * 'xml:space' is only for XHTML 1.1 + + abbr - td, th + accept - form, input + accept-charset - form + accesskey - a, area, button, input, label, legend, textarea + action - form + align - caption^, embed, applet, iframe, img^, input^, object^, legend^, table^, hr^, div^, h1^, h2^, h3^, h4^, h5^, h6^, p^, col, colgroup, tbody, td, tfoot, th, thead, tr + alt - applet, area, img, input + archive - applet, object + axis - td, th + bgcolor - embed, table^, tr^, td^, th^ + border - table, img^, object^ + bordercolor~ - table, td, tr + cellpadding - table + cellspacing - table + char - col, colgroup, tbody, td, tfoot, th, thead, tr + charoff - col, colgroup, tbody, td, tfoot, th, 
thead, tr + charset - a, script + checked - input + cite - blockquote, q, del, ins + classid - object + clear - br^ + code - applet + codebase - object, applet + codetype - object + color - font + cols - textarea + colspan - td, th + compact - dir, dl^, menu, ol^, ul^ + coords - area, a + data - object + datetime - del, ins + declare - object + defer - script + dir - bdo + disabled - button, input, optgroup, option, select, textarea + enctype - form + face - font + for - label + frame - table + frameborder - iframe + headers - td, th + height - embed, iframe, td^, th^, img, object, applet + href - a, area + hreflang - a + hspace - applet, img^, object^ + ismap - img, input + label - option, optgroup + language - script^ + longdesc - img, iframe + marginheight - iframe + marginwidth - iframe + maxlength - input + method - form + model* - embed + multiple - select + name - button, embed, textarea, applet^, select, form^, iframe^, img^, a^, input, object, map^, param + nohref - area + noshade - hr^ + nowrap - td^, th^ + object - applet + onblur - a, area, button, input, label, select, textarea + onchange - input, select, textarea + onfocus - a, area, button, input, label, select, textarea + onreset - form + onselect - input, textarea + onsubmit - form + pluginspage* - embed + pluginurl* - embed + prompt - isindex + readonly - textarea, input + rel - a + rev - a + rows - textarea + rowspan - td, th + rules - table + scope - td, th + scrolling - iframe + selected - option + shape - area, a + size - hr^, font, input, select + span - col, colgroup + src - embed, script, input, iframe, img + standby - object + start - ol^ + summary - table + tabindex - a, area, button, input, object, select, textarea + target - a^, area, form + type - a, embed, object, param, script, input, li^, ol^, ul^, button + usemap - img, input, object + valign - col, colgroup, tbody, td, tfoot, th, thead, tr + value - input, option, param, button, li^ + valuetype - param + vspace - applet, img^, 
object^ + width - embed, hr^, iframe, img, object, table, td^, th^, applet, col, colgroup, pre^ + wmode - embed + xml:space - pre, script, style + + These are allowed in all but the shown elements: + + class - param, script + dir - applet, bdo, br, iframe, param, script + id - script + lang - applet, br, iframe, param, script + onclick - applet, bdo, br, font, iframe, isindex, param, script + ondblclick - applet, bdo, br, font, iframe, isindex, param, script + onkeydown - applet, bdo, br, font, iframe, isindex, param, script + onkeypress - applet, bdo, br, font, iframe, isindex, param, script + onkeyup - applet, bdo, br, font, iframe, isindex, param, script + onmousedown - applet, bdo, br, font, iframe, isindex, param, script + onmousemove - applet, bdo, br, font, iframe, isindex, param, script + onmouseout - applet, bdo, br, font, iframe, isindex, param, script + onmouseover - applet, bdo, br, font, iframe, isindex, param, script + onmouseup - applet, bdo, br, font, iframe, isindex, param, script + style - param, script + title - param, script + xml:lang - applet, br, iframe, param, script + + +-- 5.3 CSS 2.1 properties accepting URLs ------------------------o + + + background + background-image + content + cue-after + cue-before + cursor + list-style + list-style-image + play-during + + +-- 5.4 Microsoft Windows 1252 character replacements --------------o + + + Key: 'd' double, 'l' left, 'q' quote, 'r' right, 's.' single + + Code-point (decimal) - hexadecimal value - replacement entity - represented character + + 127 - 7f - (removed) - (not used) + 128 - 80 - € - euro + 129 - 81 - (removed) - (not used) + 130 - 82 - ‚ - baseline s. q + 131 - 83 - ƒ - florin + 132 - 84 - „ - baseline d q + 133 - 85 - … - ellipsis + 134 - 86 - † - dagger + 135 - 87 - ‡ - d dagger + 136 - 88 - ˆ - circumflex accent + 137 - 89 - ‰ - permile + 138 - 8a - Š - S Hacek + 139 - 8b - ‹ - l s. 
guillemet + 140 - 8c - Œ - OE ligature + 141 - 8d - (removed) - (not used) + 142 - 8e - Ž - Z Hacek + 143 - 8f - (removed) - (not used) + 144 - 90 - (removed) - (not used) + 145 - 91 - ‘ - l s. q + 146 - 92 - ’ - r s. q + 147 - 93 - “ - l d q + 148 - 94 - ” - r d q + 149 - 95 - • - bullet + 150 - 96 - – - en dash + 151 - 97 - — - em dash + 152 - 98 - ˜ - tilde accent + 153 - 99 - ™ - trademark + 154 - 9a - š - s Hacek + 155 - 9b - › - r s. guillemet + 156 - 9c - œ - oe ligature + 157 - 9d - (removed) - (not used) + 158 - 9e - ž - z Hacek + 159 - 9f - Ÿ - Y dieresis + + +-- 5.5 URL format -------------------------------------------------o + + + An `absolute` URL has a 'protocol' or 'scheme', a 'network location' or 'hostname', and optional 'path', 'parameters', 'query' and 'fragment' segments. Thus, an absolute URL has this generic structure: + + (scheme) : (//network location) /(path) ;(parameters) ?(query) #(fragment) + + The schemes can only contain letters, digits, '+', '.' and '-'. Hostname is the portion after the '//' and up to the first '/' (if any; else, up to the end) when ':' is followed by a '//' (e.g., 'abc.com' in 'ftp://abc.com/def'); otherwise, it consists of everything after the ':' (e.g., 'def@abc.com' in 'mailto:def@abc.com'). + + `Relative` URLs do not have explicit schemes and network locations; such values are inherited from a `base` URL. + + +-- 5.6 Brief on htmLawed code -------------------------------------o + + + Much of the code's logic and reasoning can be understood from the documentation above. + + The *output* of htmLawed is a text string containing the processed input. There is no custom error tracking. + + *Function arguments* for htmLawed are: + + * '$in' - 1st argument; a text string; the *input text* to be processed. Any extraneous slashes added by PHP when `magic quotes` are enabled should be removed beforehand using PHP's 'stripslashes()' function.
+ + * '$config' - 2nd argument; an associative array; optional (named '$C' in htmLawed code). The array has keys with names like 'balance' and 'keep_bad', and the values, which can be boolean, string, or array, depending on the key, are read to accordingly set the *configurable parameters* (indicated by the keys). All configurable parameters receive some default value if the value to be used is not specified by the user through '$config'. `Finalized` '$config' is thus a filtered and possibly larger array. + + * '$spec' - 3rd argument; a text string; optional. The string has rules, written in an htmLawed-designated format, *specifying* element-specific attribute and attribute value restrictions. Function 'hl_spec()' is used to convert the string to an associative-array for internal use. `Finalized` '$spec' is thus an array. + + `Finalized` '$config' and '$spec' are made *global variables* while htmLawed is at work. Values of any pre-existing global variables with same names are noted, and their values are restored after htmLawed finishes processing the input (to capture the `finalized` values, the 'show_setting' parameter of '$config' should be used). Depending on '$config', another global variable 'hl_Ids', to track 'id' attribute values for uniqueness, may be set. Unlike the other two variables, this one is not reset (or unset) post-processing. + + Except for the main function 'htmLawed()' and the functions 'kses()' and 'kses_hook()', htmLawed's functions are *name-spaced* using the 'hl_' prefix.
The *functions* and their roles are: + + * 'hl_attrval' - checking attribute values against $spec + * 'hl_bal' - tag balancing + * 'hl_cmtcd' - handling CDATA sections and HTML comments + * 'hl_ent' - entity handling + * 'hl_prot' - checking a URL scheme/protocol + * 'hl_regex' - checking syntax of a regular expression + * 'hl_spec' - converting user-supplied $spec value to one used by htmLawed internally + * 'hl_tag' - handling tags + * 'hl_tag2' - transforming tags + * 'hl_tidy' - compact/beautify HTML + * 'hl_version' - reporting htmLawed version + * 'htmLawed' - main function + * 'kses' - main function of 'kses' + * 'kses_hook' - hook function of 'kses' + + The last two are for compatibility with pre-existing code using the 'kses' script. htmLawed's 'kses()' basically passes on the filtering task to 'htmLawed()' function after deciphering '$config' and '$spec' from the argument values supplied to it. 'kses_hook()' is an empty function and is meant for being filled with custom code if the 'kses' script users were using one. + + 'htmLawed()' finalizes '$spec' (with the help of 'hl_spec()') and '$config', and globalizes them. Finalization of '$config' involves setting default values if an inappropriate or invalid one is supplied. This includes calling 'hl_regex()' to check well-formedness of regular expression patterns if such expressions are user-supplied through '$config'. 'htmLawed()' then removes invalid characters like nulls and 'x01' and appropriately handles entities using 'hl_ent()'. HTML comments and CDATA sections are identified and treated as per '$config' with the help of 'hl_cmtcd()'. When retained, the '<' and '>' characters identifying them, and the '<', '>' and '&' characters inside them, are replaced with control characters (code-points '1' to '5') till any tag balancing is completed. 
+ + After this `initial processing` 'htmLawed()' identifies tags using regex and processes them with the help of 'hl_tag()' -- a large function that analyzes tag content, filtering it as per HTML standards, '$config' and '$spec'. Among other things, 'hl_tag()' transforms deprecated elements using 'hl_tag2()', removes attributes from closing tags, checks attribute values as per '$spec' rules using 'hl_attrval()', and checks URL protocols using 'hl_prot()'. 'htmLawed()' performs tag balancing and nesting checks with a call to 'hl_bal()', and optionally compacts/beautifies the output with proper white-spacing with a call to 'hl_tidy()'. The latter temporarily replaces white-space, and '<', '>' and '&' characters inside 'pre', 'script' and 'textarea' elements, and HTML comments and CDATA sections with control characters (code-points '1' to '5', and '7'). + + htmLawed permits the use of custom code or *hook functions* at two stages. The first, called inside 'htmLawed()', allows the input text as well as the finalized $config and $spec values to be altered right after the initial processing (see section:- #3.7). The second is called by 'hl_tag()' once the tag content is finalized (see section:- #3.4.9). + + Being dictated by the external and stable HTML standard, htmLawed's objective is very clear-cut and less concerned with tweakability. The code is only minimally annotated with comments -- it is not meant to instruct; PHP developers familiar with the HTML specs will see the logic, and others can always refer to the htmLawed documentation. The compact structuring of the statements is meant to aid in quickly grasping the logic, at least when viewed with code syntax highlighted. 
+ +___________________________________________________________________oo + + +@@description: htmLawed PHP software is a free, open-source, customizable HTML input purifier and filter +@@encoding: utf-8 +@@keywords: htmLawed, HTM, HTML, HTML Tidy, converter, filter, formatter, purifier, sanitizer, XSS, input, PHP, software, code, script, security, cross-site scripting, hack, sanitize, remove, standards, tags, attributes, elements +@@language: en +@@title: htmLawed documentation
\ No newline at end of file diff --git a/extlib/htmLawed/htmLawed_TESTCASE.txt b/extlib/htmLawed/htmLawed_TESTCASE.txt new file mode 100644 index 000000000..366465ce3 --- /dev/null +++ b/extlib/htmLawed/htmLawed_TESTCASE.txt @@ -0,0 +1,370 @@ +/* +htmLawed_TESTCASE.txt, 23 April 2009 +htmLawed 1.1.8.1, 16 July 2009 +Copyright Santosh Patnaik +GPL v3 license +A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed +*/ + +This file has UTF-8-encoded text with both correct and incorrect/malformed HTML/XHTML code snippets to test htmLawed (test cases/samples). The entire text may also be used as a unit. + +************************************************ +when viewing this file in a web browser, set the +character encoding to Unicode/UTF-8 +************************************************ + +--------------------- start -------------------- + +<em>Try different $config and $spec values. Some text even when filtered in will not be displayed in a rendered web-page</em><br /> + +<h6>Attributes</h6> + +<strong>Xml:lang:</strong><a lang="en" xml:lang="en"></a>, <a lang="en"></a>, <a xml:lang="en"></a><br /> +<strong>Standard, predefined value, or empty attribute:</strong> <input type="text" disabled />, <input type="text" disabled="DISABLED" />, <input type="text" disabled="1" /><br /> +<strong>Required:</strong> <img />, <img alt="image" /><br /> +<strong>Quote & space variation:</strong> <a id=id1 name=xy>a</a>, <a id='id2' name="xy">a</a>, <a id=' id3 ' name = "n" >a</a><br /> +<strong>Invalid:</strong> <a id="id4" src="s">a</a><br /> +<strong>Duplicated:</strong> <a id="id5" id="id6">a</a><br /> +<strong>Deprecated:</strong> <a id="id7" target="self" name="n">a</a>, <hr noshade="noshade" /><br /> +<strong>Casing:</strong> <a HREF=""></a><br /> +<strong>Admin-restricted?:</strong> <a href="x" onclick="alert();"></a> + +<h6>Attribute values</h6> + +<strong>Duplicate ID value:</strong><a id="id8"></a>, <a id="my_id8"></a>, <a 
id="id8"></a><br /> +(try 'my_' for prefix)<br /> +<strong>Double-quotes in value:</strong><a title=ab"c"></a>, <a title="ab"c"></a>, <a title='ab"c'></a><br /> +(try filter for CSS expression)<br /> +<strong>CSS expression</strong>: <div style="prop:expression();"></div><div style="prop:expression()"></div><div style="prop: expression();"></div><div style="prop : expression()"></div><div style="prop:expression(js);"></div><div style="prop:expression(js;)"></div><div style="prop: expression('js');"></div><div style="prop : expr ession('js':)"></div><div style="prop:expression( 'js@ );"></div><br /> +<strong>Other:</strong> <input size="50" class="my" value="an input an input an input" />, <input size="5" class="your" value="an input" /><br /> +(try 'maxlen', 'maxval', etc., for 'input' in '$spec') + +<h6>Blockquotes</h6> + +<blockquote>abc</blockquote><br /> +<blockquote>abc<div>def</div></blockquote><br /> +<blockquote><div>abc</div>def</blockquote><br /> +<blockquote>abc<div>def</div>ghi</blockquote><br /> +abc<div>def</div>ghi<br /> +(try with blockquote parent) + +<h6>CDATA sections</h6> + +<strong>Special characters inside:</strong> <![CDATA[ ]]> ]]>, <![CDATA[ 3 < 4 > 3.5, & 4 > 4 ]]><br /> +<strong>Normal:</strong> <![CDATA[ check ]]>, <em>CDATA follows:<![CDATA[ check ]]></em><br /> +<strong>Malformed:</strong> <![cdata check ]]>, < ![CDATA check ]]>, <![CDATA check ]]>, < ![CDATA check ] ]><br /> +<strong>Invalid:</strong> <em <![CDATA[ check ]]>>CDATA in tag content</em>, <table><![CDATA[ check ]]><tr><td>text not allowed</td></tr></table> + +<h6>Complex-1: deprecated elements</h6> + +<center> +The PHP <s>software</s> script used for this <strike>web-page</strike> webpage is <font style="font-weight: bold " face=arial size='+3' color = "red ">htmLawedTest.php</font>, from <u style= 'color:green'>PHP Labware</u>. 
+</center> + +<h6>Complex-2: deprecated attributes</h6> + +<img src="s" alt="a" name="n" /><img src="s" alt="a" id="id9" name="n" /> +<br clear="left" /> +<hr noshade size="1" /> +<img name="id10" src="s" align="left" alt="image" hspace="10" vspace="10" width="10em" height="20" border="1" style="padding:5px;" /> +<table width="50em" align="center" bgcolor="red"> + <tr> + <td width="20%"> + <div align="center"> + <h3 align="right">Section</h3> + <p align="right">Para</p> + <ol type="a" start="e"><li value="x"><a name="x">First</a> <a name="x" id="id11">item</a></li></ol> + </div> + </td> + <td width="*"> + <ol type="1"><li>First item</li></ol> + </td> + </tr> + </table> +<br clear="all" /> + +<h6>Complex-3: embed, object, area</h6> + +<object width="425" height="350"><param name="movie" value="http://www.youtube.com/v/ls7gi1VwdIQ"></param><embed src="http://www.youtube.com/v/ls7gi1VwdIQ" type="application/x-shockwave-flash" width="425" height="350"></embed></object><br /> + +<embed src="http://www.youtube.com/v/ls7gi1VwdIQ" type="application/x-shockwave-flash" width="425" height="350"></embed><br /> + +<object data="1.gif" type="image/gif" usemap="#map1"><map name="map1"> +<p>navigate the site: <a href="1" shape="REct" coOrds="0,0,118,28">1</a> | <a href="3" shape="circle" coords="184,200,60">3</a> | <a href="4" shape="poly" coords="276,0,276,28,100,200,50,50,276,0">4</a></p> +<area href="5" shape="Rect" coords="0,0,118,28"> +</map></object> + +<h6>Complex-4: nested and other tables</h6> + +<table border="1" bgcolor="red"> <tr> <td> Cell </td> <td colspan="2" rowspan="2"> <table border="1" bgcolor="green"> <tr> <td> Cell </td> <td colspan="2" rowspan="2"> </td> </tr> <tr> <td> Cell </td> </tr> <tr> <td> Cell </td> <td> Cell </td> <td> Cell </td> </tr> </table> </td> </tr> <tr> <td> Cell </td> </tr> <tr> <td> Cell </td> <td> Cell </td> <td> Cell </td> </tr> </table><br /> +<strong>PCDATA wrong:</strong> <table>Well<caption>Hello</caption></table><br /> 
+<strong>Missing tr:</strong> <table><td>Well</td></table><br /> + +<h6>Complex-5: pseudo, disallowed or non-HTML tags</h6> + +(Try different 'keep_bad' values) +<*> Pseudotags <*> +<xml>Non-HTML tag xml</xml> +<p> +Disallowed tag p +</p> +<ul>Bad<li>OK</li></ul> + +<h6>Elements</h6> + +<strong>Unbalanced:</strong> <a href="h"><em>check</a></em><br /> +<strong>Non-XHTML:</strong> <div><center><dir></dir></center></div><br /> +<strong>Malformed:</strong> < a href=""></a>, <a href="" ></a>, <a href="" ></a>, <a href="" +></a>, <a href="">< /a>, < a href=""></a >, <img src="s" alt="a" />, <img src="s" alt="a"/ >, <imgsrc="s" alt="a" /><br /> +<strong>Invalid:</strong> <image src="s" alt="a" /><br /> +<strong>Empty:</strong> <img src="s" alt="a" />, <img src="s" alt="a"></img>, <img src="s" alt="a">text</img><br /> +<strong>Content invalid:</strong> <a href="h">1<a>2</a></a><br /> +<strong>Content invalid?:</strong> <form></form><br /> (try setting 'form' as parent) +<strong>Casing:</strong> <A href=""></a> + +<h6>Entities</h6> + +<strong>Special:</strong> & 3 < 2 & 5>4 and j >i >a & i<j>a<br /> +<strong>Padding:</strong> B B f f  <br /> +<strong>Malformed:</strong> & #x27;, &x27;, ' &TILDE;, &tilde<br /> +<strong>Invalid:</strong> , �, , �, , &bad;<br /> +<strong>Discouraged characters:</strong> , „, , <br /> +<strong>Context:</strong> '>', <?<br /> +<strong>Casing:</strong> ', ', &TILDE;, ˜ +<br /> +(also check named-to-numeric and hexdec-to-decimal, and vice versa, conversions) + +<h6>Format</h6> + +<strong>Valid but ill-formatted:</strong> text <!-- comment --> +text <!-- +A c o m m e n t --> +<script> + <![CDATA[ + code + ]]> +</script><!-- comment --><![CDATA[ cdata ]]> <a>text</b> text<pre id="none">p r e</pre> +<textarea>text</textarea> <textarea> + text text +</textarea> text text <br /><hr /> +text <img src="none" alt="none" /> t<em class="none">e<strong>x</strong>t</em> +text <img src="none" alt="none" /> <b>t<em> e <strong> x </strong> t</em></b> + <a 
href="a"> text <img src="none" alt="none" /> <b>t <em> e <strong> x </strong> t</em></b> + </a> +<span style="background-color: yellow;">text <img src="none" alt="none" /> <b> <em> t e <strong> x </strong> t</em></b></span> +<script>script</script> +<div> + <pre id="none">p <a>r</a> e <!-- comment --> </pre> + <pre> + pre + </pre> +</div> +<div><div><table border="1" style="background-color: red;"><tr><td>Cell</td><td colspan="2" rowspan="2"><table border="1" style="background-color: green;"><tr><td>Cell</td><td colspan="2" rowspan="2"></td></tr><tr><td>Cell</td></tr><tr><td>Cell</td><td>Cell</td><td>Cell</td></tr></table></td></tr><tr><td>Cell</td></tr><tr><td>Cell</td><td>Cell</td><td>Cell</td></tr></table></div></div> +(try to compact or beautify) + +<h6>Forms</h6> + +(note nesting of 'form', missing required attributes, etc.)<br /> +<form> +<script type="text/javascript">s</script> +<fieldset><legend>p</legend>l <input name="personal_lastname" type="text" tabindex="1"></fieldset> +<input name="h" type="checkbox" value="h" tabindex="20"> h +<textarea name="t">t</textarea> +<form action="a" method="get"></form></form><br /> +<form action="b" method="get"><p><input type="text" value="i" /></form><br /> +<form>B:<input type="text" value="b" />C:<input type="text" value="c" /></form><br /> +(try each of these lines separately)<br /> +<form action="a">what<br /> +<form action="a">what +(try with container as div and as form)<br /> +<form>c <a>a</a> <b>b</b><input /><script>s</script> + +<h6>HTML comments (also CDATA)</h6> + +Special characters inside: <!-- <![CDATA check ]]> -->, <!-- 3 < 4 > 3.5, & 4 > 4 -->, <!-- che--ck -->, <!--[if !IE]> <--><a>c</a><!--> <![endif]--><br /> +Normal: <!-- check -->, <!--check -->, <em>comment:<!-- check --></em><!-- check -->, <table><!-- check --><tr><td>text not allowed</td></tr></table><br /> +Malformed: <![cdata check ]]>, < ![CDATA check ]]>, < ![CDATA check ] ]><br /> +Invalid: <em <!-- check -->>comment in tag content</em>, 
<!--check--> + +<h6>Ins-Del</h6> + +(depending on context, these elements can be of either block or inline type)<br /> +<p><ins datetime="d" cite="c"><div>block</div></ins></p><br /> +<p><del>d</del></p><br /> +<p><ins><del>d</del></ins></p><div><ins><p><del><div>d</div></del></p></ins></div><ins><div>d</div></ins> + +<h6>Lists</h6> + +<strong>Invalid character data</strong>: <ul><li>(item</li>)</ul><br /> +<strong>Definition list</strong>: <dl><dt>a</dt>bad<dd>first <em>one</em></dd><dt>b</dt><dd>second</dd></dl><br /> +<strong>Definition list, close-tags omitted</strong>: <dl><dt>a</dt>bad<dd>first <em>one</em></dd><dt>b<dd>second</dl><br /> +<strong>Definition lists, nested</strong>: <dl> + <dt>T1</dt> + <dd>D1</dd> + <dt>T2</dt> + <dd>D2<dl><dt>t1</dt><dd>d1</dd><dt>t2</dt><dd>d2</dd></dl></dd> + <dt>T3</dt> + <dd>D3</dd> + <dt>T4</dt> + <dd>D4<dl><dt>t1</dt><dd>d1</dd></dl></dd> +</dl><br /> +<strong>Definition lists, nested, close-tags omitted</strong>: <dl> + <dt>T1 + <dd>D1</dd> + <dt>T2</dt> + <dd>D2<dl><dt>t1<dd>d1<dt>t2</dt><dd>d2</dd></dl></dd> + <dt>T3 + <dd>D3 + <dt>T4 + <dd>D4<dl><dt>t1<dd>d1</dl></dd> +</dl><br /> +<strong>Nested</strong>: <ul> + <li>l1</li> + <li>l2<ol><li>lo1</li><li>lo2</li></ol></li> + <li>l3</li> + <li>l4<ol><li>lo3</li><li>lo4<ol><li>lo5</li></ol></li></ol></li> +</ul><br /> +<strong>Nested, close-tags omitted</strong>: <ul> + <li>l1</li> + <li>l2<ol><li>lo1<li>lo2</ol> + <li>l3 + <li>l4<ol><li>lo3<li>lo4<ol><li>lo5</ol></ol> +</ul><br /> +<strong>Complex</strong>: +<ol><script></script><li><table><tr><td> +<ul><li id="search" class="widget widget_search"> <form id="searchform" method="get" action="http://kohei.us"> + <div> + + <input type="text" name="s" id="s" size="15" /><br /> + <input type="submit" value="Search" /> + </div> + </form> + </li></ul> +</td></tr></table></li></ol> + +<h6>Non-English text-1</h6> + +Inscrieţi-vă acum la a Zecea Conferinţă Internaţională<br /> +გთხოვთ ახლავე გაიაროთ რეგისტრაცია<br /> +večjezično 
računalništvo<br /> +<a title="הירשמו +כעת לכנס ">Зарегистрируйтесь сейчас +на Десятую Международную Конференцию по</a><br /> +(this file should have utf-8 encoding; some characters may not be displayed because of missing fonts, etc.) + +<h6>Non-English text-2: entities</h6> + +用统一码<br /> +გთხოვთ<br /> +Inscreva-se agora para a Décima Conferência Internacional Sobre O Unicode, realizada entre os dias 10 e 12 de março de 1997 em Mainz +na Alemanha. + +<h6>Ruby</h6> + +(need compatible browser)<br /> +<ruby xml:lang="ja"> + <rbc> + <rb>斎</rb> + <rb>藤</rb> + <rb>信</rb> + <rb>男</rb> + </rbc> + <rtc class="reading"> + <rt>さい</rt> + <rt>とう</rt> + <rt>のぶ</rt> + <rt>お</rt> + </rtc> + <rtc class="annotation"> + <rt rbspan="4" xml:lang="en">W3C Associate Chairman</rt> + </rtc> +</ruby><br /> +<ruby> + <rb>WWW</rb> + <rp>(</rp><rt>World Wide Web</rt><rp>)</rp> +</ruby><br /> +<ruby> + A + <rp>(</rp><rt>aaa</rt><rp>)</rp> +</ruby> + +<h6>Tables</h6> + +<strong>Omitted closing tags:</strong> <table> +<colgroup><col style="x" /><col style="y" /> +<thead> +<tr><th>h1c1<th>h1c2 +<tbody> +<tr><td>r1c1<td>r1c2 +<tr><td>r2c1<td>r2c2 +</table><br /> +<strong>Nested, omitted closing tags:</strong> <table> +<colgroup><col style="x" /><col style="y" /> +<thead> +<tr><th>h1c1<th>h1c2 +<tbody> +<tr><td>r1c1<td>r1c2<table> +<colgroup><col style="x" /><col style="y" /> +<thead> +<tr><th>h1c1<th>h1c2 +<tbody> +<tr><td>r1c1<td>r1c2 +<tr><td>r2c1<td>r2c2 +</table> +<tr><td>r2c1<td>r2c2 +</table><br /> + +<h6>URLs</h6> + +<strong>Relative and absolute:</strong> <a href="mailto:x"></a>, <a href="http://a.com/b/c/d.f"></a>, <a href="./../d.f"></a>, <a href="./d.f"></a>, <a href="d.f"></a>, <a href="#s"></a>, <a href="./../../d.f#s"></a><br /> +(try base URL value of 'http://a.com/b/')<br /> +<strong>CSS URLs:</strong> <div style="background-image: url('a.gif');"></div>, <div style="background-image: URL("a.gif");"></div>, <div style="background-image: url('http://a.com/a.gif');"></div>, <div 
style="background-image: url('./../a.gif');"></div>, <div style="background-image: url('js:xss')"></div><br /> +<strong>Anti-spam:</strong> (try regex for 'http://a.com', etc.) <a href="mailto:x@y.com"></a>, <a href="http://a.com/b@d.f"></a>, <a href="a.com/d.f" rel="nofollow"></a>, <a href="a.com/d.f" rel="1, 2"></a>, <a href="a.com/d.f"></a>, <a href="b.com/d.f"></a>, <a href="c.com/d.f"></a><br /> + +<h6>XSS</h6> + +'';!--"<xss>=&{()}<br /> +<img src="javascript%3Aalert('xss');" /><br /> +<img src="javascript:alert('xss');" /><br /> +<img src="java script:alert('xss');" /><br /> +<img +src=javascript:alert('XSS') /><br /> +<div style="javascript:alert('xss');"></div><br /> +<div style="background-image:url(javascript:alert('xss'));"></div><br /> +<div style="background-image:url("javascript:alert('xss')" );"></div><br /> +<!--[if gte IE 4]><script>alert('xss');</script><![endif]--><br /> +<script a=">" src="http://ha.ckers.org/xss.js"></script><br /> +<div style="background-image: url('js:xss')"></div><br /> +<a style=";-moz-binding:url(http://lukasz.pilorz.net/xss/xss.xml#xss)" href="http://example.com">test</a><br /> +<strong>Bad IE7:</strong> <a href="http://x&x=%22+style%3d%22background-image%3a+expression%28alert +%28%27xss%3f%29%29">x</a><br /> +<strong>Bad IE7:</strong> <a style=color:expr/*comment*/ession(alert(document.domain))>xxx</a><br /> +<strong>Bad IE7:</strong> <a href="xxx" style="background: expression(alert('xss'));">xxx</a><br /> +<strong>Bad IE7:</strong> <a href="xxx" style="background: expression(alert('xss'));">xxx</a><br /> +<strong>Bad IE7:</strong> <a href="xxx" style="background: %45xpression(alert('xss'));">xxx</a><br /> +<strong>Bad IE7:</strong> <a href="xxx" style="background:/**/expression(alert('xss'));">xxx</a><br /> +<strong>Bad IE7:</strong> <a href="xxx" style="background:/**/Expression(alert('xss'));">xxx</a><br /> +<strong>Bad IE7:</strong> <a href="xxx" style="background:/**/Expression(alert('xss'));">xxx</a><br /> 
+<strong>Bad IE7:</strong> <a href="xxx" style="background: expr%45ssion(alert('xss'));">xxx</a><br /> +<strong>Bad IE7:</strong> <a href="xxx" style="background: exp/* */ression(alert('xss'));">xxx</a><br /> +<strong>Bad IE7:</strong> <a href="xxx" style="background: exp /* */ression(alert('xss'));">xxx</a><br /> +<strong>Bad IE7:</strong> <a href="xxx" style="background: exp/ * * /ression(alert('xss'));">xxx</a><br /> +<strong>Bad IE7:</strong> <a href="xxx" style="background:/* x */expression(alert('xss'));">xxx</a><br /> +<strong>Bad IE7:</strong> <a href="xxx" style="background:/* */ */expression(alert('xss'));">xxx</a><br /> +<strong>Bad IE7:</strong> <a href="x" style="width: /****/**;;;;;;*/expression/**/(alert('xss'));">x</a><br /> +<strong>Bad IE7:</strong> <a href="x" style="padding:10px; background:/**/expression(alert('xss'));">x</a><br /> +<strong>Bad IE7:</strong> <a href="x" style="background: huh /* */ */expression(alert('xss'));">x</a><br /> +<strong>Bad IE7:</strong> <a href="x" style="background:/**/expression(alert('xss'));background:/**/expression(alert('xss'));">x</a><br /> +<strong>Bad IE7:</strong> exp/*<a style='no\xss:noxss("*//*");xss:ex/*XSS*//*/*/pression(alert("XSS"))'>x</a><br /> +<strong>Bad IE7:</strong> <a style="background:Expre\ssion(alert('xss'));">hi</a><br /> +<strong>Bad IE7:</strong> <a style="background:expre\ssion(alert('xss'));">hi</a><br /> +<strong>Bad IE7:</strong> <a style="color: \0065 \0078 \0070 \0072 \0065 \0073 \0073 \0069 \006f \006e \0028 \0061 \006c \0065 \0072 \0074 \0028 \0031 \0029 \0029">test</a><br /> +<strong>Bad IE7:</strong> <a style="xss:e\0078pression(window.x?0:(alert(/XSS/),window.x=1));">hi</a><br /> +<strong>Bad IE7:</strong> <a style="background:url('java +script:eval(document.all.mycode.expr)')">hi</a><br /> + +<h6>Other</h6> + +3 < 4 <br /> +3 > 4 <br /> + > 3 <br />
\ No newline at end of file |