From ac75772150c3fe9411408ac44db04e774d095aa0 Mon Sep 17 00:00:00 2001 From: Craig Andrews Date: Mon, 27 Jul 2009 13:42:03 -0400 Subject: Sanitize html returned by oEmbed providers to protect laconica from XSS attacks --- extlib/htmLawed/htmLawed.php | 715 ++++++++++++ extlib/htmLawed/htmLawedTest.php | 592 ++++++++++ extlib/htmLawed/htmLawed_README.htm | 1979 +++++++++++++++++++++++++++++++++ extlib/htmLawed/htmLawed_README.txt | 1600 ++++++++++++++++++++++++++ extlib/htmLawed/htmLawed_TESTCASE.txt | 370 ++++++ 5 files changed, 5256 insertions(+) create mode 100644 extlib/htmLawed/htmLawed.php create mode 100644 extlib/htmLawed/htmLawedTest.php create mode 100644 extlib/htmLawed/htmLawed_README.htm create mode 100644 extlib/htmLawed/htmLawed_README.txt create mode 100644 extlib/htmLawed/htmLawed_TESTCASE.txt (limited to 'extlib') diff --git a/extlib/htmLawed/htmLawed.php b/extlib/htmLawed/htmLawed.php new file mode 100644 index 000000000..17f6e98ca --- /dev/null +++ b/extlib/htmLawed/htmLawed.php @@ -0,0 +1,715 @@ +1, 'abbr'=>1, 'acronym'=>1, 'address'=>1, 'applet'=>1, 'area'=>1, 'b'=>1, 'bdo'=>1, 'big'=>1, 'blockquote'=>1, 'br'=>1, 'button'=>1, 'caption'=>1, 'center'=>1, 'cite'=>1, 'code'=>1, 'col'=>1, 'colgroup'=>1, 'dd'=>1, 'del'=>1, 'dfn'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'dt'=>1, 'em'=>1, 'embed'=>1, 'fieldset'=>1, 'font'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'isindex'=>1, 'kbd'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'object'=>1, 'ol'=>1, 'optgroup'=>1, 'option'=>1, 'p'=>1, 'param'=>1, 'pre'=>1, 'q'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'script'=>1, 'select'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'sup'=>1, 'table'=>1, 'tbody'=>1, 'td'=>1, 'textarea'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1, 'tt'=>1, 'u'=>1, 'ul'=>1, 'var'=>1); // 86/deprecated+embed+ruby +if(!empty($C['safe'])){ + unset($e['applet'], $e['embed'], $e['iframe'], $e['object'], $e['script']); +} +$x = !empty($C['elements']) ? str_replace(array("\n", "\r", "\t", ' '), '', $C['elements']) : '*'; +if($x == '-*'){$e = array();} +elseif(strpos($x, '*') === false){$e = array_flip(explode(',', $x));} +else{ + if(isset($x[1])){ + preg_match_all('`(?:^|-|\+)[^\-+]+?(?=-|\+|$)`', $x, $m, PREG_SET_ORDER); + for($i=count($m); --$i>=0;){$m[$i] = $m[$i][0];} + foreach($m as $v){ + if($v[0] == '+'){$e[substr($v, 1)] = 1;} + if($v[0] == '-' && isset($e[($v = substr($v, 1))]) && !in_array('+'. $v, $m)){unset($e[$v]);} + } + } +} +$C['elements'] =& $e; +// config attrs +$x = !empty($C['deny_attribute']) ? str_replace(array("\n", "\r", "\t", ' '), '', $C['deny_attribute']) : ''; +$x = array_flip((isset($x[0]) && $x[0] == '*') ? explode('-', $x) : explode(',', $x. (!empty($C['safe']) ? ',on*' : ''))); +if(isset($x['on*'])){ + unset($x['on*']); + $x += array('onblur'=>1, 'onchange'=>1, 'onclick'=>1, 'ondblclick'=>1, 'onfocus'=>1, 'onkeydown'=>1, 'onkeypress'=>1, 'onkeyup'=>1, 'onmousedown'=>1, 'onmousemove'=>1, 'onmouseout'=>1, 'onmouseover'=>1, 'onmouseup'=>1, 'onreset'=>1, 'onselect'=>1, 'onsubmit'=>1); +} +$C['deny_attribute'] = $x; +// config URL +$x = (isset($C['schemes'][2]) && strpos($C['schemes'], ':')) ? strtolower($C['schemes']) : 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; *:file, http, https'; +$C['schemes'] = array(); +foreach(explode(';', str_replace(array(' ', "\t", "\r", "\n"), '', $x)) as $v){ + $x = $x2 = null; list($x, $x2) = explode(':', $v, 2); + if($x2){$C['schemes'][$x] = array_flip(explode(',', $x2));} +} +if(!isset($C['schemes']['*'])){$C['schemes']['*'] = array('file'=>1, 'http'=>1, 'https'=>1,);} +if(!empty($C['safe']) && empty($C['schemes']['style'])){$C['schemes']['style'] = array('nil'=>1);} +$C['abs_url'] = isset($C['abs_url']) ? $C['abs_url'] : 0; +if(!isset($C['base_url']) or !preg_match('`^[a-zA-Z\d.+\-]+://[^/]+/(.+?/)?$`', $C['base_url'])){ + $C['base_url'] = $C['abs_url'] = 0; +} +// config rest +$C['and_mark'] = empty($C['and_mark']) ? 0 : 1; +$C['anti_link_spam'] = (isset($C['anti_link_spam']) && is_array($C['anti_link_spam']) && count($C['anti_link_spam']) == 2 && (empty($C['anti_link_spam'][0]) or hl_regex($C['anti_link_spam'][0])) && (empty($C['anti_link_spam'][1]) or hl_regex($C['anti_link_spam'][1]))) ? $C['anti_link_spam'] : 0; +$C['anti_mail_spam'] = isset($C['anti_mail_spam']) ? $C['anti_mail_spam'] : 0; +$C['balance'] = isset($C['balance']) ? (bool)$C['balance'] : 1; +$C['cdata'] = isset($C['cdata']) ? $C['cdata'] : (empty($C['safe']) ? 3 : 0); +$C['clean_ms_char'] = empty($C['clean_ms_char']) ? 0 : $C['clean_ms_char']; +$C['comment'] = isset($C['comment']) ? $C['comment'] : (empty($C['safe']) ? 3 : 0); +$C['css_expression'] = empty($C['css_expression']) ? 0 : 1; +$C['hexdec_entity'] = isset($C['hexdec_entity']) ? $C['hexdec_entity'] : 1; +$C['hook'] = (!empty($C['hook']) && function_exists($C['hook'])) ? $C['hook'] : 0; +$C['hook_tag'] = (!empty($C['hook_tag']) && function_exists($C['hook_tag'])) ? $C['hook_tag'] : 0; +$C['keep_bad'] = isset($C['keep_bad']) ? $C['keep_bad'] : 6; +$C['lc_std_val'] = isset($C['lc_std_val']) ? (bool)$C['lc_std_val'] : 1; +$C['make_tag_strict'] = isset($C['make_tag_strict']) ? $C['make_tag_strict'] : 1; +$C['named_entity'] = isset($C['named_entity']) ? (bool)$C['named_entity'] : 1; +$C['no_deprecated_attr'] = isset($C['no_deprecated_attr']) ? $C['no_deprecated_attr'] : 1; +$C['parent'] = isset($C['parent'][0]) ? strtolower($C['parent']) : 'body'; +$C['show_setting'] = !empty($C['show_setting']) ? $C['show_setting'] : 0; +$C['style_pass'] = empty($C['style_pass']) ? 0 : 1; +$C['tidy'] = empty($C['tidy']) ? 0 : $C['tidy']; +$C['unique_ids'] = isset($C['unique_ids']) ? $C['unique_ids'] : 1; +$C['xml:lang'] = isset($C['xml:lang']) ? $C['xml:lang'] : 0; + +if(isset($GLOBALS['C'])){$reC = $GLOBALS['C'];} +$GLOBALS['C'] = $C; +$S = is_array($S) ? $S : hl_spec($S); +if(isset($GLOBALS['S'])){$reS = $GLOBALS['S'];} +$GLOBALS['S'] = $S; + +$t = preg_replace('`[\x00-\x08\x0b-\x0c\x0e-\x1f]`', '', $t); +if($C['clean_ms_char']){ + $x = array("\x7f"=>'', "\x80"=>'€', "\x81"=>'', "\x83"=>'ƒ', "\x85"=>'…', "\x86"=>'†', "\x87"=>'‡', "\x88"=>'ˆ', "\x89"=>'‰', "\x8a"=>'Š', "\x8b"=>'‹', "\x8c"=>'Œ', "\x8d"=>'', "\x8e"=>'Ž', "\x8f"=>'', "\x90"=>'', "\x95"=>'•', "\x96"=>'–', "\x97"=>'—', "\x98"=>'˜', "\x99"=>'™', "\x9a"=>'š', "\x9b"=>'›', "\x9c"=>'œ', "\x9d"=>'', "\x9e"=>'ž', "\x9f"=>'Ÿ'); + $x = $x + ($C['clean_ms_char'] == 1 ? array("\x82"=>'‚', "\x84"=>'„', "\x91"=>'‘', "\x92"=>'’', "\x93"=>'“', "\x94"=>'”') : array("\x82"=>'\'', "\x84"=>'"', "\x91"=>'\'', "\x92"=>'\'', "\x93"=>'"', "\x94"=>'"')); + $t = strtr($t, $x); +} +if($C['cdata'] or $C['comment']){$t = preg_replace_callback('``sm', 'hl_cmtcd', $t);} +$t = preg_replace_callback('`&([A-Za-z][A-Za-z0-9]{1,30}|#(?:[0-9]{1,8}|[Xx][0-9A-Fa-f]{1,7}));`', 'hl_ent', str_replace('&', '&', $t)); +if($C['unique_ids'] && !isset($GLOBALS['hl_Ids'])){$GLOBALS['hl_Ids'] = array();} +if($C['hook']){$t = $C['hook']($t, $C, $S);} +if($C['show_setting'] && preg_match('`^[a-z][a-z0-9_]*$`i', $C['show_setting'])){ + $GLOBALS[$C['show_setting']] = array('config'=>$C, 'spec'=>$S, 'time'=>microtime()); +} +// main +$t = preg_replace_callback('`<(?:(?:\s|$)|(?:[^>]*(?:>|$)))|>`m', 'hl_tag', $t); +$t = $C['balance'] ? hl_bal($t, $C['keep_bad'], $C['parent']) : $t; +$t = (($C['cdata'] or $C['comment']) && strpos($t, "\x01") !== false) ? str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05"), array('', '', '&', '<', '>'), $t) : $t; +$t = $C['tidy'] ? hl_tidy($t, $C['tidy'], $C['parent']) : $t; +unset($C, $e); +if(isset($reC)){$GLOBALS['C'] = $reC;} +if(isset($reS)){$GLOBALS['S'] = $reS;} +return $t; +// eof +} + +function hl_attrval($t, $p){ +// check attr val against $S +$o = 1; $l = strlen($t); +foreach($p as $k=>$v){ + switch($k){ + case 'maxlen':if($l > $v){$o = 0;} + break; case 'minlen': if($l < $v){$o = 0;} + break; case 'maxval': if((float)($t) > $v){$o = 0;} + break; case 'minval': if((float)($t) < $v){$o = 0;} + break; case 'match': if(!preg_match($v, $t)){$o = 0;} + break; case 'nomatch': if(preg_match($v, $t)){$o = 0;} + break; case 'oneof': + $m = 0; + foreach(explode('|', $v) as $n){if($t == $n){$m = 1; break;}} + $o = $m; + break; case 'noneof': + $m = 1; + foreach(explode('|', $v) as $n){if($t == $n){$m = 0; break;}} + $o = $m; + break; default: + break; + } + if(!$o){break;} +} +return ($o ? $t : (isset($p['default']) ? $p['default'] : 0)); +// eof +} + +function hl_bal($t, $do=1, $in='div'){ +// balance tags +// by content +$cB = array('blockquote'=>1, 'form'=>1, 'map'=>1, 'noscript'=>1); // Block +$cE = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1); // Empty +$cF = array('button'=>1, 'del'=>1, 'div'=>1, 'dd'=>1, 'fieldset'=>1, 'iframe'=>1, 'ins'=>1, 'li'=>1, 'noscript'=>1, 'object'=>1, 'td'=>1, 'th'=>1); // Flow; later context-wise dynamic move of ins & del to $cI +$cI = array('a'=>1, 'abbr'=>1, 'acronym'=>1, 'address'=>1, 'b'=>1, 'bdo'=>1, 'big'=>1, 'caption'=>1, 'cite'=>1, 'code'=>1, 'dfn'=>1, 'dt'=>1, 'em'=>1, 'font'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'i'=>1, 'kbd'=>1, 'label'=>1, 'legend'=>1, 'p'=>1, 'pre'=>1, 'q'=>1, 'rb'=>1, 'rt'=>1, 's'=>1, 'samp'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'sup'=>1, 'tt'=>1, 'u'=>1, 'var'=>1); // Inline +$cN = array('a'=>array('a'=>1), 'button'=>array('a'=>1, 'button'=>1, 'fieldset'=>1, 'form'=>1, 'iframe'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'fieldset'=>array('fieldset'=>1), 'form'=>array('form'=>1), 'label'=>array('label'=>1), 'noscript'=>array('script'=>1), 'pre'=>array('big'=>1, 'font'=>1, 'img'=>1, 'object'=>1, 'script'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1), 'rb'=>array('ruby'=>1), 'rt'=>array('ruby'=>1)); // Illegal +$cN2 = array_keys($cN); +$cR = array('blockquote'=>1, 'dir'=>1, 'dl'=>1, 'form'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'select'=>1, 'table'=>1, 'tbody'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1); +$cS = array('colgroup'=>array('col'=>1), 'dir'=>array('li'), 'dl'=>array('dd'=>1, 'dt'=>1), 'menu'=>array('li'=>1), 'ol'=>array('li'=>1), 'optgroup'=>array('option'=>1), 'option'=>array('#pcdata'=>1), 'rbc'=>array('rb'=>1), 'rp'=>array('#pcdata'=>1), 'rtc'=>array('rt'=>1), 'ruby'=>array('rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1), 'select'=>array('optgroup'=>1, 'option'=>1), 'script'=>array('#pcdata'=>1), 'table'=>array('caption'=>1, 'col'=>1, 'colgroup'=>1, 'tfoot'=>1, 'tbody'=>1, 'tr'=>1, 'thead'=>1), 'tbody'=>array('tr'=>1), 'tfoot'=>array('tr'=>1), 'textarea'=>array('#pcdata'=>1), 'thead'=>array('tr'=>1), 'tr'=>array('td'=>1, 'th'=>1), 'ul'=>array('li'=>1)); // Specific - immediate parent-child +$cO = array('address'=>array('p'=>1), 'applet'=>array('param'=>1), 'blockquote'=>array('script'=>1), 'fieldset'=>array('legend'=>1, '#pcdata'=>1), 'form'=>array('script'=>1), 'map'=>array('area'=>1), 'object'=>array('param'=>1, 'embed'=>1)); // Other +$cT = array('colgroup'=>1, 'dd'=>1, 'dt'=>1, 'li'=>1, 'option'=>1, 'p'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1); // Omitable closing +// block/inline type; ins & del both type; #pcdata: text +$eB = array('address'=>1, 'blockquote'=>1, 'center'=>1, 'del'=>1, 'dir'=>1, 'dl'=>1, 'div'=>1, 'fieldset'=>1, 'form'=>1, 'ins'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'isindex'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'table'=>1, 'ul'=>1); +$eI = array('#pcdata'=>1, 'a'=>1, 'abbr'=>1, 'acronym'=>1, 'applet'=>1, 'b'=>1, 'bdo'=>1, 'big'=>1, 'br'=>1, 'button'=>1, 'cite'=>1, 'code'=>1, 'del'=>1, 'dfn'=>1, 'em'=>1, 'embed'=>1, 'font'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'kbd'=>1, 'label'=>1, 'map'=>1, 'object'=>1, 'param'=>1, 'q'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'select'=>1, 'script'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1, 'tt'=>1, 'u'=>1, 'var'=>1); +$eN = array('a'=>1, 'big'=>1, 'button'=>1, 'fieldset'=>1, 'font'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'label'=>1, 'object'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1); // Exclude from specific ele; $cN values +$eO = array('area'=>1, 'caption'=>1, 'col'=>1, 'colgroup'=>1, 'dd'=>1, 'dt'=>1, 'legend'=>1, 'li'=>1, 'optgroup'=>1, 'option'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'script'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'thead'=>1, 'th'=>1, 'tr'=>1); // Missing in $eB & $eI +$eF = $eB + $eI; + +// $in sets allowed child +$in = ((isset($eF[$in]) && $in != '#pcdata') or isset($eO[$in])) ? $in : 'div'; +if(isset($cE[$in])){ + return (!$do ? '' : str_replace(array('<', '>'), array('<', '>'), $t)); +} +if(isset($cS[$in])){$inOk = $cS[$in];} +elseif(isset($cI[$in])){$inOk = $eI; $cI['del'] = 1; $cI['ins'] = 1;} +elseif(isset($cF[$in])){$inOk = $eF; unset($cI['del'], $cI['ins']);} +elseif(isset($cB[$in])){$inOk = $eB; unset($cI['del'], $cI['ins']);} +if(isset($cO[$in])){$inOk = $inOk + $cO[$in];} +if(isset($cN[$in])){$inOk = array_diff_assoc($inOk, $cN[$in]);} + +$t = explode('<', $t); +$ok = $q = array(); // $q seq list of open non-empty ele +ob_start(); + +for($i=-1, $ci=count($t); ++$i<$ci;){ + // allowed $ok in parent $p + if($ql = count($q)){ + $p = array_pop($q); + $q[] = $p; + if(isset($cS[$p])){$ok = $cS[$p];} + elseif(isset($cI[$p])){$ok = $eI; $cI['del'] = 1; $cI['ins'] = 1;} + elseif(isset($cF[$p])){$ok = $eF; unset($cI['del'], $cI['ins']);} + elseif(isset($cB[$p])){$ok = $eB; unset($cI['del'], $cI['ins']);} + if(isset($cO[$p])){$ok = $ok + $cO[$p];} + if(isset($cN[$p])){$ok = array_diff_assoc($ok, $cN[$p]);} + }else{$ok = $inOk; unset($cI['del'], $cI['ins']);} + // bad tags, & ele content + if(isset($e) && ($do == 1 or (isset($ok['#pcdata']) && ($do == 3 or $do == 5)))){ + echo '<', $s, $e, $a, '>'; + } + if(isset($x[0])){ + if($do < 3 or isset($ok['#pcdata'])){echo $x;} + elseif(strpos($x, "\x02\x04")){ + foreach(preg_split('`(\x01\x02[^\x01\x02]+\x02\x01)`', $x, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $v){ + echo (substr($v, 0, 2) == "\x01\x02" ? $v : ($do > 4 ? preg_replace('`\S`', '', $v) : '')); + } + }elseif($do > 4){echo preg_replace('`\S`', '', $x);} + } + // get markup + if(!preg_match('`^(/?)([a-zA-Z1-6]+)([^>]*)>(.*)`sm', $t[$i], $r)){$x = $t[$i]; continue;} + $s = null; $e = null; $a = null; $x = null; list($all, $s, $e, $a, $x) = $r; + // close tag + if($s){ + if(isset($cE[$e]) or !in_array($e, $q)){continue;} // Empty/unopen + if($p == $e){array_pop($q); echo ''; unset($e); continue;} // Last open + $add = ''; // Nesting - close open tags that need to be + for($j=-1, $cj=count($q); ++$j<$cj;){ + if(($d = array_pop($q)) == $e){break;} + else{$add .= "";} + } + echo $add, ''; unset($e); continue; + } + // open tag + // $cB ele needs $eB ele as child + if(isset($cB[$e]) && strlen(trim($x))){ + $t[$i] = "{$e}{$a}>"; + array_splice($t, $i+1, 0, 'div>'. $x); unset($e, $x); ++$ci; --$i; continue; + } + if((($ql && isset($cB[$p])) or (isset($cB[$in]) && !$ql)) && !isset($eB[$e]) && !isset($ok[$e])){ + array_splice($t, $i, 0, 'div>'); unset($e, $x); ++$ci; --$i; continue; + } + // if no open ele, $in = parent; mostly immediate parent-child relation should hold + if(!$ql or !isset($eN[$e]) or !array_intersect($q, $cN2)){ + if(!isset($ok[$e])){ + if($ql && isset($cT[$p])){echo ''; unset($e, $x); --$i;} + continue; + } + if(!isset($cE[$e])){$q[] = $e;} + echo '<', $e, $a, '>'; unset($e); continue; + } + // specific parent-child + if(isset($cS[$p][$e])){ + if(!isset($cE[$e])){$q[] = $e;} + echo '<', $e, $a, '>'; unset($e); continue; + } + // nesting + $add = ''; + $q2 = array(); + for($k=-1, $kc=count($q); ++$k<$kc;){ + $d = $q[$k]; + $ok2 = array(); + if(isset($cS[$d])){$q2[] = $d; continue;} + $ok2 = isset($cI[$d]) ? $eI : $eF; + if(isset($cO[$d])){$ok2 = $ok2 + $cO[$d];} + if(isset($cN[$d])){$ok2 = array_diff_assoc($ok2, $cN[$d]);} + if(!isset($ok2[$e])){ + if(!$k && !isset($inOk[$e])){continue 2;} + $add = ""; + for(;++$k<$kc;){$add = "{$add}";} + break; + } + else{$q2[] = $d;} + } + $q = $q2; + if(!isset($cE[$e])){$q[] = $e;} + echo $add, '<', $e, $a, '>'; unset($e); continue; +} + +// end +if($ql = count($q)){ + $p = array_pop($q); + $q[] = $p; + if(isset($cS[$p])){$ok = $cS[$p];} + elseif(isset($cI[$p])){$ok = $eI; $cI['del'] = 1; $cI['ins'] = 1;} + elseif(isset($cF[$p])){$ok = $eF; unset($cI['del'], $cI['ins']);} + elseif(isset($cB[$p])){$ok = $eB; unset($cI['del'], $cI['ins']);} + if(isset($cO[$p])){$ok = $ok + $cO[$p];} + if(isset($cN[$p])){$ok = array_diff_assoc($ok, $cN[$p]);} +}else{$ok = $inOk; unset($cI['del'], $cI['ins']);} +if(isset($e) && ($do == 1 or (isset($ok['#pcdata']) && ($do == 3 or $do == 5)))){ + echo '<', $s, $e, $a, '>'; +} +if(isset($x[0])){ + if(strlen(trim($x)) && (($ql && isset($cB[$p])) or (isset($cB[$in]) && !$ql))){ + echo '
', $x, '
'; + } + elseif($do < 3 or isset($ok['#pcdata'])){echo $x;} + elseif(strpos($x, "\x02\x04")){ + foreach(preg_split('`(\x01\x02[^\x01\x02]+\x02\x01)`', $x, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $v){ + echo (substr($v, 0, 2) == "\x01\x02" ? $v : ($do > 4 ? preg_replace('`\S`', '', $v) : '')); + } + }elseif($do > 4){echo preg_replace('`\S`', '', $x);} +} +while(!empty($q) && ($e = array_pop($q))){echo '';} +$o = ob_get_contents(); +ob_end_clean(); +return $o; +// eof +} + +function hl_cmtcd($t){ +// comment/CDATA sec handler +$t = $t[0]; +global $C; +if($t[3] == '-'){ + if(!$C['comment']){return $t;} + if($C['comment'] == 1){return '';} + if(substr(($t = preg_replace('`--+`', '-', substr($t, 4, -3))), -1) != ' '){$t .= ' ';} + $t = $C['comment'] == 2 ? str_replace(array('&', '<', '>'), array('&', '<', '>'), $t) : $t; + $t = "\x01\x02\x04!--$t--\x05\x02\x01"; +}else{ // CDATA + if(!$C['cdata']){return $t;} + if($C['cdata'] == 1){return '';} + $t = substr($t, 1, -1); + $t = $C['cdata'] == 2 ? str_replace(array('&', '<', '>'), array('&', '<', '>'), $t) : $t; + $t = "\x01\x01\x04$t\x05\x01\x01"; +} +return str_replace(array('&', '<', '>'), array("\x03", "\x04", "\x05"), $t); +// eof +} + +function hl_ent($t){ +// entitity handler +global $C; +$t = $t[1]; +static $U = array('quot'=>1,'amp'=>1,'lt'=>1,'gt'=>1); +static $N = array('fnof'=>'402', 'Alpha'=>'913', 'Beta'=>'914', 'Gamma'=>'915', 'Delta'=>'916', 'Epsilon'=>'917', 'Zeta'=>'918', 'Eta'=>'919', 'Theta'=>'920', 'Iota'=>'921', 'Kappa'=>'922', 'Lambda'=>'923', 'Mu'=>'924', 'Nu'=>'925', 'Xi'=>'926', 'Omicron'=>'927', 'Pi'=>'928', 'Rho'=>'929', 'Sigma'=>'931', 'Tau'=>'932', 'Upsilon'=>'933', 'Phi'=>'934', 'Chi'=>'935', 'Psi'=>'936', 'Omega'=>'937', 'alpha'=>'945', 'beta'=>'946', 'gamma'=>'947', 'delta'=>'948', 'epsilon'=>'949', 'zeta'=>'950', 'eta'=>'951', 'theta'=>'952', 'iota'=>'953', 'kappa'=>'954', 'lambda'=>'955', 'mu'=>'956', 'nu'=>'957', 'xi'=>'958', 'omicron'=>'959', 'pi'=>'960', 'rho'=>'961', 'sigmaf'=>'962', 'sigma'=>'963', 'tau'=>'964', 'upsilon'=>'965', 'phi'=>'966', 'chi'=>'967', 'psi'=>'968', 'omega'=>'969', 'thetasym'=>'977', 'upsih'=>'978', 'piv'=>'982', 'bull'=>'8226', 'hellip'=>'8230', 'prime'=>'8242', 'Prime'=>'8243', 'oline'=>'8254', 'frasl'=>'8260', 'weierp'=>'8472', 'image'=>'8465', 'real'=>'8476', 'trade'=>'8482', 'alefsym'=>'8501', 'larr'=>'8592', 'uarr'=>'8593', 'rarr'=>'8594', 'darr'=>'8595', 'harr'=>'8596', 'crarr'=>'8629', 'lArr'=>'8656', 'uArr'=>'8657', 'rArr'=>'8658', 'dArr'=>'8659', 'hArr'=>'8660', 'forall'=>'8704', 'part'=>'8706', 'exist'=>'8707', 'empty'=>'8709', 'nabla'=>'8711', 'isin'=>'8712', 'notin'=>'8713', 'ni'=>'8715', 'prod'=>'8719', 'sum'=>'8721', 'minus'=>'8722', 'lowast'=>'8727', 'radic'=>'8730', 'prop'=>'8733', 'infin'=>'8734', 'ang'=>'8736', 'and'=>'8743', 'or'=>'8744', 'cap'=>'8745', 'cup'=>'8746', 'int'=>'8747', 'there4'=>'8756', 'sim'=>'8764', 'cong'=>'8773', 'asymp'=>'8776', 'ne'=>'8800', 'equiv'=>'8801', 'le'=>'8804', 'ge'=>'8805', 'sub'=>'8834', 'sup'=>'8835', 'nsub'=>'8836', 'sube'=>'8838', 'supe'=>'8839', 'oplus'=>'8853', 'otimes'=>'8855', 'perp'=>'8869', 'sdot'=>'8901', 'lceil'=>'8968', 'rceil'=>'8969', 'lfloor'=>'8970', 'rfloor'=>'8971', 'lang'=>'9001', 'rang'=>'9002', 'loz'=>'9674', 'spades'=>'9824', 'clubs'=>'9827', 'hearts'=>'9829', 'diams'=>'9830', 'apos'=>'39', 'OElig'=>'338', 'oelig'=>'339', 'Scaron'=>'352', 'scaron'=>'353', 'Yuml'=>'376', 'circ'=>'710', 'tilde'=>'732', 'ensp'=>'8194', 'emsp'=>'8195', 'thinsp'=>'8201', 'zwnj'=>'8204', 'zwj'=>'8205', 'lrm'=>'8206', 'rlm'=>'8207', 'ndash'=>'8211', 'mdash'=>'8212', 'lsquo'=>'8216', 'rsquo'=>'8217', 'sbquo'=>'8218', 'ldquo'=>'8220', 'rdquo'=>'8221', 'bdquo'=>'8222', 'dagger'=>'8224', 'Dagger'=>'8225', 'permil'=>'8240', 'lsaquo'=>'8249', 'rsaquo'=>'8250', 'euro'=>'8364', 'nbsp'=>'160', 'iexcl'=>'161', 'cent'=>'162', 'pound'=>'163', 'curren'=>'164', 'yen'=>'165', 'brvbar'=>'166', 'sect'=>'167', 'uml'=>'168', 'copy'=>'169', 'ordf'=>'170', 'laquo'=>'171', 'not'=>'172', 'shy'=>'173', 'reg'=>'174', 'macr'=>'175', 'deg'=>'176', 'plusmn'=>'177', 'sup2'=>'178', 'sup3'=>'179', 'acute'=>'180', 'micro'=>'181', 'para'=>'182', 'middot'=>'183', 'cedil'=>'184', 'sup1'=>'185', 'ordm'=>'186', 'raquo'=>'187', 'frac14'=>'188', 'frac12'=>'189', 'frac34'=>'190', 'iquest'=>'191', 'Agrave'=>'192', 'Aacute'=>'193', 'Acirc'=>'194', 'Atilde'=>'195', 'Auml'=>'196', 'Aring'=>'197', 'AElig'=>'198', 'Ccedil'=>'199', 'Egrave'=>'200', 'Eacute'=>'201', 'Ecirc'=>'202', 'Euml'=>'203', 'Igrave'=>'204', 'Iacute'=>'205', 'Icirc'=>'206', 'Iuml'=>'207', 'ETH'=>'208', 'Ntilde'=>'209', 'Ograve'=>'210', 'Oacute'=>'211', 'Ocirc'=>'212', 'Otilde'=>'213', 'Ouml'=>'214', 'times'=>'215', 'Oslash'=>'216', 'Ugrave'=>'217', 'Uacute'=>'218', 'Ucirc'=>'219', 'Uuml'=>'220', 'Yacute'=>'221', 'THORN'=>'222', 'szlig'=>'223', 'agrave'=>'224', 'aacute'=>'225', 'acirc'=>'226', 'atilde'=>'227', 'auml'=>'228', 'aring'=>'229', 'aelig'=>'230', 'ccedil'=>'231', 'egrave'=>'232', 'eacute'=>'233', 'ecirc'=>'234', 'euml'=>'235', 'igrave'=>'236', 'iacute'=>'237', 'icirc'=>'238', 'iuml'=>'239', 'eth'=>'240', 'ntilde'=>'241', 'ograve'=>'242', 'oacute'=>'243', 'ocirc'=>'244', 'otilde'=>'245', 'ouml'=>'246', 'divide'=>'247', 'oslash'=>'248', 'ugrave'=>'249', 'uacute'=>'250', 'ucirc'=>'251', 'uuml'=>'252', 'yacute'=>'253', 'thorn'=>'254', 'yuml'=>'255'); +if($t[0] != '#'){ + return ($C['and_mark'] ? "\x06" : '&'). (isset($U[$t]) ? $t : (isset($N[$t]) ? (!$C['named_entity'] ? '#'. ($C['hexdec_entity'] > 1 ? 'x'. dechex($N[$t]) : $N[$t]) : $t) : 'amp;'. $t)). ';'; +} +if(($n = ctype_digit($t = substr($t, 1)) ? intval($t) : hexdec(substr($t, 1))) < 9 or ($n > 13 && $n < 32) or $n == 11 or $n == 12 or ($n > 126 && $n < 160 && $n != 133) or ($n > 55295 && ($n < 57344 or ($n > 64975 && $n < 64992) or $n == 65534 or $n == 65535 or $n > 1114111))){ + return ($C['and_mark'] ? "\x06" : '&'). "amp;#{$t};"; +} +return ($C['and_mark'] ? "\x06" : '&'). '#'. (((ctype_digit($t) && $C['hexdec_entity'] < 2) or !$C['hexdec_entity']) ? $n : 'x'. dechex($n)). ';'; +// eof +} + +function hl_prot($p, $c=null){ +// check URL scheme +global $C; +$b = $a = ''; +if($c == null){$c = 'style'; $b = $p[1]; $a = $p[3]; $p = trim($p[2]);} +$c = isset($C['schemes'][$c]) ? $C['schemes'][$c] : $C['schemes']['*']; +if(isset($c['*']) or !strcspn($p, '#?;')){return "{$b}{$p}{$a}";} // All ok, frag, query, param +if(preg_match('`^([a-z\d\-+.&#; ]+?)(:|&#(58|x3a);|%3a|\\\\0{0,4}3a).`i', $p, $m) && !isset($c[strtolower($m[1])])){ // Denied prot + return "{$b}denied:{$p}{$a}"; +} +if($C['abs_url']){ + if($C['abs_url'] == -1 && strpos($p, $C['base_url']) === 0){ // Make url rel + $p = substr($p, strlen($C['base_url'])); + }elseif(empty($m[1])){ // Make URL abs + if(substr($p, 0, 2) == '//'){$p = substr($C['base_url'], 0, strpos($C['base_url'], ':')+1). $p;} + elseif($p[0] == '/'){$p = preg_replace('`(^.+?://[^/]+)(.*)`', '$1', $C['base_url']). $p;} + elseif(strcspn($p, './')){$p = $C['base_url']. $p;} + else{ + preg_match('`^([a-zA-Z\d\-+.]+://[^/]+)(.*)`', $C['base_url'], $m); + $p = preg_replace('`(?<=/)\./`', '', $m[2]. $p); + while(preg_match('`(?<=/)([^/]{3,}|[^/.]+?|\.[^/.]|[^/.]\.)/\.\./`', $p)){ + $p = preg_replace('`(?<=/)([^/]{3,}|[^/.]+?|\.[^/.]|[^/.]\.)/\.\./`', '', $p); + } + $p = $m[1]. $p; + } + } +} +return "{$b}{$p}{$a}"; +// eof +} + +function hl_regex($p){ +// ?regex +if(empty($p)){return 0;} +if($t = ini_get('track_errors')){$o = isset($php_errormsg) ? $php_errormsg : null;} +else{ini_set('track_errors', 1);} +unset($php_errormsg); +if(($d = ini_get('display_errors'))){ini_set('display_errors', 0);} +preg_match($p, ''); +if($d){ini_set('display_errors', 1);} +$r = isset($php_errormsg) ? 0 : 1; +if($t){$php_errormsg = isset($o) ? $o : null;} +else{ini_set('track_errors', 0);} +return $r; +// eof +} + +function hl_spec($t){ +// final $spec +$s = array(); +$t = str_replace(array("\t", "\r", "\n", ' '), '', preg_replace('/"(?>(`.|[^"])*)"/sme', 'substr(str_replace(array(";", "|", "~", " ", ",", "/", "(", ")", \'`"\'), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08", "\""), "$0"), 1, -1)', trim($t))); +for($i = count(($t = explode(';', $t))); --$i>=0;){ + $w = $t[$i]; + if(empty($w) or ($e = strpos($w, '=')) === false or !strlen(($a = substr($w, $e+1)))){continue;} + $y = $n = array(); + foreach(explode(',', $a) as $v){ + if(!preg_match('`^([a-z:\-\*]+)(?:\((.*?)\))?`i', $v, $m)){continue;} + if(($x = strtolower($m[1])) == '-*'){$n['*'] = 1; continue;} + if($x[0] == '-'){$n[substr($x, 1)] = 1; continue;} + if(!isset($m[2])){$y[$x] = 1; continue;} + foreach(explode('/', $m[2]) as $m){ + if(empty($m) or ($p = strpos($m, '=')) == 0 or $p < 5){$y[$x] = 1; continue;} + $y[$x][strtolower(substr($m, 0, $p))] = str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08"), array(";", "|", "~", " ", ",", "/", "(", ")"), substr($m, $p+1)); + } + if(isset($y[$x]['match']) && !hl_regex($y[$x]['match'])){unset($y[$x]['match']);} + if(isset($y[$x]['nomatch']) && !hl_regex($y[$x]['nomatch'])){unset($y[$x]['nomatch']);} + } + if(!count($y) && !count($n)){continue;} + foreach(explode(',', substr($w, 0, $e)) as $v){ + if(!strlen(($v = strtolower($v)))){continue;} + if(count($y)){$s[$v] = $y;} + if(count($n)){$s[$v]['n'] = $n;} + } +} +return $s; +// eof +} + +function hl_tag($t){ +// tag/attribute handler +global $C; +$t = $t[0]; +// invalid < > +if($t == '< '){return '< ';} +if($t == '>'){return '>';} +if(!preg_match('`^<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>$`m', $t, $m)){ + return str_replace(array('<', '>'), array('<', '>'), $t); +}elseif(!isset($C['elements'][($e = strtolower($m[2]))])){ + return (($C['keep_bad']%2) ? str_replace(array('<', '>'), array('<', '>'), $t) : ''); +} +// attr string +$a = str_replace(array("\xad", "\n", "\r", "\t"), ' ', trim($m[3])); +if(strpos($a, '&') !== false){ + str_replace(array('­', '­', '­'), ' ', $a); +} +// tag transform +static $eD = array('applet'=>1, 'center'=>1, 'dir'=>1, 'embed'=>1, 'font'=>1, 'isindex'=>1, 'menu'=>1, 's'=>1, 'strike'=>1, 'u'=>1); // Deprecated +if($C['make_tag_strict'] && isset($eD[$e])){ + $trt = hl_tag2($e, $a, $C['make_tag_strict']); + if(!$e){return (($C['keep_bad']%2) ? str_replace(array('<', '>'), array('<', '>'), $t) : '');} +} +// close tag +static $eE = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1); // Empty ele +if(!empty($m[1])){ + return (!isset($eE[$e]) ? "" : (($C['keep_bad'])%2 ? str_replace(array('<', '>'), array('<', '>'), $t) : '')); +} + +// open tag & attr +static $aN = array('abbr'=>array('td'=>1, 'th'=>1), 'accept-charset'=>array('form'=>1), 'accept'=>array('form'=>1, 'input'=>1), 'accesskey'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'legend'=>1, 'textarea'=>1), 'action'=>array('form'=>1), 'align'=>array('caption'=>1, 'embed'=>1, 'applet'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'legend'=>1, 'table'=>1, 'hr'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'p'=>1, 'col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'alt'=>array('applet'=>1, 'area'=>1, 'img'=>1, 'input'=>1), 'archive'=>array('applet'=>1, 'object'=>1), 'axis'=>array('td'=>1, 'th'=>1), 'bgcolor'=>array('embed'=>1, 'table'=>1, 'tr'=>1, 'td'=>1, 'th'=>1), 'border'=>array('table'=>1, 'img'=>1, 'object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellpadding'=>array('table'=>1), 'cellspacing'=>array('table'=>1), 'char'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charoff'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charset'=>array('a'=>1, 'script'=>1), 'checked'=>array('input'=>1), 'cite'=>array('blockquote'=>1, 'q'=>1, 'del'=>1, 'ins'=>1), 'classid'=>array('object'=>1), 'clear'=>array('br'=>1), 'code'=>array('applet'=>1), 'codebase'=>array('object'=>1, 'applet'=>1), 'codetype'=>array('object'=>1), 'color'=>array('font'=>1), 'cols'=>array('textarea'=>1), 'colspan'=>array('td'=>1, 'th'=>1), 'compact'=>array('dir'=>1, 'dl'=>1, 'menu'=>1, 'ol'=>1, 'ul'=>1), 'coords'=>array('area'=>1, 'a'=>1), 'data'=>array('object'=>1), 'datetime'=>array('del'=>1, 'ins'=>1), 'declare'=>array('object'=>1), 'defer'=>array('script'=>1), 'dir'=>array('bdo'=>1), 'disabled'=>array('button'=>1, 'input'=>1, 'optgroup'=>1, 'option'=>1, 'select'=>1, 'textarea'=>1), 'enctype'=>array('form'=>1), 'face'=>array('font'=>1), 'for'=>array('label'=>1), 'frame'=>array('table'=>1), 'frameborder'=>array('iframe'=>1), 'headers'=>array('td'=>1, 'th'=>1), 'height'=>array('embed'=>1, 'iframe'=>1, 'td'=>1, 'th'=>1, 'img'=>1, 'object'=>1, 'applet'=>1), 'href'=>array('a'=>1, 'area'=>1), 'hreflang'=>array('a'=>1), 'hspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'ismap'=>array('img'=>1, 'input'=>1), 'label'=>array('option'=>1, 'optgroup'=>1), 'language'=>array('script'=>1), 'longdesc'=>array('img'=>1, 'iframe'=>1), 'marginheight'=>array('iframe'=>1), 'marginwidth'=>array('iframe'=>1), 'maxlength'=>array('input'=>1), 'method'=>array('form'=>1), 'model'=>array('embed'=>1), 'multiple'=>array('select'=>1), 'name'=>array('button'=>1, 'embed'=>1, 'textarea'=>1, 'applet'=>1, 'select'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'a'=>1, 'input'=>1, 'object'=>1, 'map'=>1, 'param'=>1), 'nohref'=>array('area'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'object'=>array('applet'=>1), 'onblur'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onchange'=>array('input'=>1, 'select'=>1, 'textarea'=>1), 'onfocus'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onreset'=>array('form'=>1), 'onselect'=>array('input'=>1, 'textarea'=>1), 'onsubmit'=>array('form'=>1), 'pluginspage'=>array('embed'=>1), 'pluginurl'=>array('embed'=>1), 'prompt'=>array('isindex'=>1), 'readonly'=>array('textarea'=>1, 'input'=>1), 'rel'=>array('a'=>1), 'rev'=>array('a'=>1), 'rows'=>array('textarea'=>1), 'rowspan'=>array('td'=>1, 'th'=>1), 'rules'=>array('table'=>1), 'scope'=>array('td'=>1, 'th'=>1), 'scrolling'=>array('iframe'=>1), 'selected'=>array('option'=>1), 'shape'=>array('area'=>1, 'a'=>1), 'size'=>array('hr'=>1, 'font'=>1, 'input'=>1, 'select'=>1), 'span'=>array('col'=>1, 'colgroup'=>1), 'src'=>array('embed'=>1, 'script'=>1, 'input'=>1, 'iframe'=>1, 'img'=>1), 'standby'=>array('object'=>1), 'start'=>array('ol'=>1), 'summary'=>array('table'=>1), 'tabindex'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'object'=>1, 'select'=>1, 'textarea'=>1), 'target'=>array('a'=>1, 'area'=>1, 'form'=>1), 'type'=>array('a'=>1, 'embed'=>1, 'object'=>1, 'param'=>1, 'script'=>1, 'input'=>1, 'li'=>1, 'ol'=>1, 'ul'=>1, 'button'=>1), 'usemap'=>array('img'=>1, 'input'=>1, 'object'=>1), 'valign'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'value'=>array('input'=>1, 'option'=>1, 'param'=>1, 'button'=>1, 'li'=>1), 'valuetype'=>array('param'=>1), 'vspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'width'=>array('embed'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'object'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'applet'=>1, 'col'=>1, 'colgroup'=>1, 'pre'=>1), 'wmode'=>array('embed'=>1), 'xml:space'=>array('pre'=>1, 'script'=>1, 'style'=>1)); // Ele-specific +static $aNE = array('checked'=>1, 'compact'=>1, 'declare'=>1, 'defer'=>1, 'disabled'=>1, 'ismap'=>1, 'multiple'=>1, 'nohref'=>1, 'noresize'=>1, 'noshade'=>1, 'nowrap'=>1, 'readonly'=>1, 'selected'=>1); // Empty +static $aNP = array('action'=>1, 'cite'=>1, 'classid'=>1, 'codebase'=>1, 'data'=>1, 'href'=>1, 'longdesc'=>1, 'model'=>1, 'pluginspage'=>1, 'pluginurl'=>1, 'usemap'=>1); // Need scheme check; excludes style, on* & src +static $aNU = array('class'=>array('param'=>1, 'script'=>1), 'dir'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'id'=>array('script'=>1), 'lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'xml:lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'onclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'ondblclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeydown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeypress'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeyup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousedown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousemove'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseout'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseover'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'style'=>array('param'=>1, 'script'=>1), 'title'=>array('param'=>1, 'script'=>1)); // Univ & exceptions + +if($C['lc_std_val']){ + // predef attr vals for $eAL & $aNE ele + static $aNL = array('all'=>1, 'baseline'=>1, 'bottom'=>1, 'button'=>1, 'center'=>1, 'char'=>1, 'checkbox'=>1, 'circle'=>1, 'col'=>1, 'colgroup'=>1, 'cols'=>1, 'data'=>1, 'default'=>1, 'file'=>1, 'get'=>1, 'groups'=>1, 'hidden'=>1, 'image'=>1, 'justify'=>1, 'left'=>1, 'ltr'=>1, 'middle'=>1, 'none'=>1, 'object'=>1, 'password'=>1, 'poly'=>1, 'post'=>1, 'preserve'=>1, 'radio'=>1, 'rect'=>1, 'ref'=>1, 'reset'=>1, 'right'=>1, 'row'=>1, 'rowgroup'=>1, 'rows'=>1, 'rtl'=>1, 'submit'=>1, 'text'=>1, 'top'=>1); + static $eAL = array('a'=>1, 'area'=>1, 'bdo'=>1, 'button'=>1, 'col'=>1, 'form'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1, 'xml:space'=>1); + $lcase = isset($eAL[$e]) ? 1 : 0; +} + +$depTr = 0; +if($C['no_deprecated_attr']){ + // dep attr:applicable ele + static $aND = array('align'=>array('caption'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'object'=>1, 'p'=>1, 'table'=>1), 'bgcolor'=>array('table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1), 'border'=>array('img'=>1, 'object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'clear'=>array('br'=>1), 'compact'=>array('dl'=>1, 'ol'=>1, 'ul'=>1), 'height'=>array('td'=>1, 'th'=>1), 'hspace'=>array('img'=>1, 'object'=>1), 'language'=>array('script'=>1), 'name'=>array('a'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'map'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'size'=>array('hr'=>1), 'start'=>array('ol'=>1), 'type'=>array('li'=>1, 'ol'=>1, 'ul'=>1), 'value'=>array('li'=>1), 'vspace'=>array('img'=>1, 'object'=>1), 'width'=>array('hr'=>1, 'pre'=>1, 'td'=>1, 'th'=>1)); + static $eAD = array('a'=>1, 'br'=>1, 'caption'=>1, 'div'=>1, 'dl'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'li'=>1, 'map'=>1, 'object'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'script'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1, 'ul'=>1); + $depTr = isset($eAD[$e]) ? 1 : 0; +} + +// attr name-vals +if(strpos($a, "\x01") !== false){$a = preg_replace('`\x01[^\x01]*\x01`', '', $a);} // No comment/CDATA sec +$mode = 0; $a = trim($a, ' /'); $aA = array(); +while(strlen($a)){ + $w = 0; + switch($mode){ + case 0: // Name + if(preg_match('`^[a-zA-Z][\-a-zA-Z:]+`', $a, $m)){ + $nm = strtolower($m[0]); + $w = $mode = 1; $a = ltrim(substr_replace($a, '', 0, strlen($m[0]))); + } + break; case 1: + if($a[0] == '='){ // = + $w = 1; $mode = 2; $a = ltrim($a, '= '); + }else{ // No val + $w = 1; $mode = 0; $a = ltrim($a); + $aA[$nm] = ''; + } + break; case 2: // Val + if(preg_match('`^"[^"]*"`', $a, $m) or preg_match("`^'[^']*'`", $a, $m) or preg_match("`^\s*[^\s\"']+`", $a, $m)){ + $m = $m[0]; $w = 1; $mode = 0; $a = ltrim(substr_replace($a, '', 0, strlen($m))); + $aA[$nm] = trim(($m[0] == '"' or $m[0] == '\'') ? substr($m, 1, -1) : $m); + } + break; + } + if($w == 0){ // Parse errs, deal with space, " & ' + $a = preg_replace('`^(?:"[^"]*("|$)|\'[^\']*(\'|$)|\S)*\s*`', '', $a); + $mode = 0; + } +} +if($mode == 1){$aA[$nm] = '';} + +// clean attrs +global $S; +$rl = isset($S[$e]) ? $S[$e] : array(); +$a = array(); $nfr = 0; +foreach($aA as $k=>$v){ + if(((isset($C['deny_attribute']['*']) ? isset($C['deny_attribute'][$k]) : !isset($C['deny_attribute'][$k])) or isset($rl[$k])) && ((!isset($rl['n'][$k]) && !isset($rl['n']['*'])) or isset($rl[$k])) && (isset($aN[$k][$e]) or (isset($aNU[$k]) && !isset($aNU[$k][$e])))){ + if(isset($aNE[$k])){$v = $k;} + elseif(!empty($lcase) && (($e != 'button' or $e != 'input') or $k == 'type')){ // Rather loose but ?not cause issues + $v = (isset($aNL[($v2 = strtolower($v))])) ? $v2 : $v; + } + if($k == 'style' && !$C['style_pass']){ + if(false !== strpos($v, '&#')){ + static $sC = array(' '=>' ', ' '=>' ', 'E'=>'e', 'E'=>'e', 'e'=>'e', 'e'=>'e', 'X'=>'x', 'X'=>'x', 'x'=>'x', 'x'=>'x', 'P'=>'p', 'P'=>'p', 'p'=>'p', 'p'=>'p', 'S'=>'s', 'S'=>'s', 's'=>'s', 's'=>'s', 'I'=>'i', 'I'=>'i', 'i'=>'i', 'i'=>'i', 'O'=>'o', 'O'=>'o', 'o'=>'o', 'o'=>'o', 'N'=>'n', 'N'=>'n', 'n'=>'n', 'n'=>'n', 'U'=>'u', 'U'=>'u', 'u'=>'u', 'u'=>'u', 'R'=>'r', 'R'=>'r', 'r'=>'r', 'r'=>'r', 'L'=>'l', 'L'=>'l', 'l'=>'l', 'l'=>'l', '('=>'(', '('=>'(', ')'=>')', ')'=>')', ' '=>':', ' '=>':', '"'=>'"', '"'=>'"', '''=>"'", '''=>"'", '/'=>'/', '/'=>'/', '*'=>'*', '*'=>'*', '\'=>'\\', '\'=>'\\'); + $v = strtr($v, $sC); + } + $v = preg_replace_callback('`(url(?:\()(?: )*(?:\'|"|&(?:quot|apos);)?)(.+)((?:\'|"|&(?:quot|apos);)?(?: )*(?:\)))`iS', 'hl_prot', $v); + $v = !$C['css_expression'] ? preg_replace('`expression`i', ' ', preg_replace('`\\\\\S|(/|(%2f))(\*|(%2a))`i', ' ', $v)) : $v; + }elseif(isset($aNP[$k]) or strpos($k, 'src') !== false or $k[0] == 'o'){ + $v = hl_prot($v, $k); + if($k == 'href'){ // X-spam + if($C['anti_mail_spam'] && strpos($v, 'mailto:') === 0){ + $v = str_replace('@', htmlspecialchars($C['anti_mail_spam']), $v); + }elseif($C['anti_link_spam']){ + $r1 = $C['anti_link_spam'][1]; + if(!empty($r1) && preg_match($r1, $v)){continue;} + $r0 = $C['anti_link_spam'][0]; + if(!empty($r0) && preg_match($r0, $v)){ + if(isset($a['rel'])){ + if(!preg_match('`\bnofollow\b`i', $a['rel'])){$a['rel'] .= ' nofollow';} + }elseif(isset($aA['rel'])){ + if(!preg_match('`\bnofollow\b`i', $aA['rel'])){$nfr = 1;} + }else{$a['rel'] = 'nofollow';} + } + } + } + } + if(isset($rl[$k]) && is_array($rl[$k]) && ($v = hl_attrval($v, $rl[$k])) === 0){continue;} + $a[$k] = str_replace('"', '"', $v); + } +} +if($nfr){$a['rel'] = isset($a['rel']) ? $a['rel']. ' nofollow' : 'nofollow';} + +// rqd attr +static $eAR = array('area'=>array('alt'=>'area'), 'bdo'=>array('dir'=>'ltr'), 'form'=>array('action'=>''), 'img'=>array('src'=>'', 'alt'=>'image'), 'map'=>array('name'=>''), 'optgroup'=>array('label'=>''), 'param'=>array('name'=>''), 'script'=>array('type'=>'text/javascript'), 'textarea'=>array('rows'=>'10', 'cols'=>'50')); +if(isset($eAR[$e])){ + foreach($eAR[$e] as $k=>$v){ + if(!isset($a[$k])){$a[$k] = isset($v[0]) ? $v : $k;} + } +} + +// depr attrs +if($depTr){ + $c = array(); + foreach($a as $k=>$v){ + if($k == 'style' or !isset($aND[$k][$e])){continue;} + if($k == 'align'){ + unset($a['align']); + if($e == 'img' && ($v == 'left' or $v == 'right')){$c[] = 'float: '. $v;} + elseif(($e == 'div' or $e == 'table') && $v == 'center'){$c[] = 'margin: auto';} + else{$c[] = 'text-align: '. $v;} + }elseif($k == 'bgcolor'){ + unset($a['bgcolor']); + $c[] = 'background-color: '. $v; + }elseif($k == 'border'){ + unset($a['border']); $c[] = "border: {$v}px"; + }elseif($k == 'bordercolor'){ + unset($a['bordercolor']); $c[] = 'border-color: '. $v; + }elseif($k == 'clear'){ + unset($a['clear']); $c[] = 'clear: '. ($v != 'all' ? $v : 'both'); + }elseif($k == 'compact'){ + unset($a['compact']); $c[] = 'font-size: 85%'; + }elseif($k == 'height' or $k == 'width'){ + unset($a[$k]); $c[] = $k. ': '. ($v[0] != '*' ? $v. (ctype_digit($v) ? 'px' : '') : 'auto'); + }elseif($k == 'hspace'){ + unset($a['hspace']); $c[] = "margin-left: {$v}px; margin-right: {$v}px"; + }elseif($k == 'language' && !isset($a['type'])){ + unset($a['language']); + $a['type'] = 'text/'. strtolower($v); + }elseif($k == 'name'){ + if($C['no_deprecated_attr'] == 2 or ($e != 'a' && $e != 'map')){unset($a['name']);} + if(!isset($a['id']) && preg_match('`[a-zA-Z][a-zA-Z\d.:_\-]*`', $v)){$a['id'] = $v;} + }elseif($k == 'noshade'){ + unset($a['noshade']); $c[] = 'border-style: none; border: 0; background-color: gray; color: gray'; + }elseif($k == 'nowrap'){ + unset($a['nowrap']); $c[] = 'white-space: nowrap'; + }elseif($k == 'size'){ + unset($a['size']); $c[] = 'size: '. $v. 'px'; + }elseif($k == 'start' or $k == 'value'){ + unset($a[$k]); + }elseif($k == 'type'){ + unset($a['type']); + static $ol_type = array('i'=>'lower-roman', 'I'=>'upper-roman', 'a'=>'lower-latin', 'A'=>'upper-latin', '1'=>'decimal'); + $c[] = 'list-style-type: '. (isset($ol_type[$v]) ? $ol_type[$v] : 'decimal'); + }elseif($k == 'vspace'){ + unset($a['vspace']); $c[] = "margin-top: {$v}px; margin-bottom: {$v}px"; + } + } + if(count($c)){ + $c = implode('; ', $c); + $a['style'] = isset($a['style']) ? rtrim($a['style'], ' ;'). '; '. $c. ';': $c. ';'; + } +} +// unique ID +if($C['unique_ids'] && isset($a['id'])){ + if(!preg_match('`^[A-Za-z][A-Za-z0-9_\-.:]*$`', ($id = $a['id'])) or (isset($GLOBALS['hl_Ids'][$id]) && $C['unique_ids'] == 1)){unset($a['id']); + }else{ + while(isset($GLOBALS['hl_Ids'][$id])){$id = $C['unique_ids']. $id;} + $GLOBALS['hl_Ids'][($a['id'] = $id)] = 1; + } +} +// xml:lang +if($C['xml:lang'] && isset($a['lang'])){ + $a['xml:lang'] = isset($a['xml:lang']) ? $a['xml:lang'] : $a['lang']; + if($C['xml:lang'] == 2){unset($a['lang']);} +} +// for transformed tag +if(!empty($trt)){ + $a['style'] = isset($a['style']) ? rtrim($a['style'], ' ;'). '; '. $trt : $trt; +} +// return with empty ele / +if(empty($C['hook_tag'])){ + $aA = ''; + foreach($a as $k=>$v){$aA .= " {$k}=\"{$v}\"";} + return "<{$e}{$aA}". (isset($eE[$e]) ? ' /' : ''). '>'; +} +else{return $C['hook_tag']($e, $a);} +// eof +} + +function hl_tag2(&$e, &$a, $t=1){ +// transform tag +if($e == 'center'){$e = 'div'; return 'text-align: center;';} +if($e == 'dir' or $e == 'menu'){$e = 'ul'; return '';} +if($e == 's' or $e == 'strike'){$e = 'span'; return 'text-decoration: line-through;';} +if($e == 'u'){$e = 'span'; return 'text-decoration: underline;';} +static $fs = array('0'=>'xx-small', '1'=>'xx-small', '2'=>'small', '3'=>'medium', '4'=>'large', '5'=>'x-large', '6'=>'xx-large', '7'=>'300%', '-1'=>'smaller', '-2'=>'60%', '+1'=>'larger', '+2'=>'150%', '+3'=>'200%', '+4'=>'300%'); +if($e == 'font'){ + $a2 = ''; + if(preg_match('`face\s*=\s*(\'|")([^=]+?)\\1`i', $a, $m) or preg_match('`face\s*=\s*([^"])(\S+)`i', $a, $m)){ + $a2 .= ' font-family: '. str_replace('"', '\'', trim($m[2])). ';'; + } + if(preg_match('`color\s*=\s*(\'|")?(.+?)(\\1|\s|$)`i', $a, $m)){ + $a2 .= ' color: '. trim($m[2]). ';'; + } + if(preg_match('`size\s*=\s*(\'|")?(.+?)(\\1|\s|$)`i', $a, $m) && isset($fs[($m = trim($m[2]))])){ + $a2 .= ' font-size: '. $fs[$m]. ';'; + } + $e = 'span'; return ltrim($a2); +} +if($t == 2){$e = 0; return 0;} +return ''; +// eof +} + +function hl_tidy($t, $w, $p){ +// Tidy/compact HTM +if(strpos(' pre,script,textarea', "$p,")){return $t;} +$t = str_replace(' ]*(?)\s+`', '`\s+`', '`(<\w[^>]*(?) `'), array(' $1', ' ', '$1'), preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea).*?>)(.+?)()`sm'), create_function('$m', 'return $m[1]. str_replace(array("<", ">", "\n", "\r", "\t", " "), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4];'), $t))); +if(($w = strtolower($w)) == -1){ + return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array('<', '>', "\n", "\r", "\t", ' '), $t); +} +$s = strpos(" $w", 't') ? "\t" : ' '; +$s = preg_match('`\d`', $w, $m) ? str_repeat($s, $m[0]) : str_repeat($s, ($s == "\t" ? 1 : 2)); +$n = preg_match('`[ts]([1-9])`', $w, $m) ? $m[1] : 0; +$a = array('br'=>1); +$b = array('button'=>1, 'input'=>1, 'option'=>1); +$c = array('caption'=>1, 'dd'=>1, 'dt'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'isindex'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'object'=>1, 'p'=>1, 'pre'=>1, 'td'=>1, 'textarea'=>1, 'th'=>1); +$d = array('address'=>1, 'blockquote'=>1, 'center'=>1, 'colgroup'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'fieldset'=>1, 'form'=>1, 'hr'=>1, 'iframe'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1); +ob_start(); +if(isset($d[$p])){echo str_repeat($s, ++$n);} +$t = explode('<', $t); +echo ltrim(array_shift($t)); +for($i=-1, $j=count($t); ++$i<$j;){ + $r = ''; list($e, $r) = explode('>', $t[$i]); + $x = $e[0] == '/' ? 0 : (substr($e, -1) == '/' ? 1 : ($e[0] != '!' ? 2 : -1)); + $y = !$x ? ltrim($e, '/') : ($x > 0 ? substr($e, 0, strcspn($e, ' ')) : 0); + $e = "<$e>"; + if(isset($d[$y])){ + if(!$x){echo "\n", str_repeat($s, --$n), "$e\n", str_repeat($s, $n);} + else{echo "\n", str_repeat($s, $n), "$e\n", str_repeat($s, ($x != 1 ? ++$n : $n));} + echo ltrim($r); continue; + } + $f = "\n". str_repeat($s, $n); + if(isset($c[$y])){ + if(!$x){echo $e, $f, ltrim($r);} + else{echo $f, $e, $r;} + }elseif(isset($b[$y])){echo $f, $e, $r; + }elseif(isset($a[$y])){echo $e, $f, ltrim($r); + }elseif(!$y){echo $f, $e, $f, ltrim($r); + }else{echo $e, $r;} +} +$t = preg_replace('`[\n]\s*?[\n]+`', "\n", ob_get_contents()); +ob_end_clean(); +if(($l = strpos(" $w", 'r') ? (strpos(" $w", 'n') ? "\r\n" : "\r") : 0)){ + $t = str_replace("\n", $l, $t); +} +return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array('<', '>', "\n", "\r", "\t", ' '), $t); +// eof +} + +function hl_version(){ +// rel +return '1.1.8.1'; +// eof +} + +function kses($t, $h, $p=array('http', 'https', 'ftp', 'news', 'nntp', 'telnet', 'gopher', 'mailto')){ +// kses compat +foreach($h as $k=>$v){ + $h[$k]['n']['*'] = 1; +} +$C['cdata'] = $C['comment'] = $C['make_tag_strict'] = $C['no_deprecated_attr'] = $C['unique_ids'] = 0; +$C['keep_bad'] = 1; +$C['elements'] = count($h) ? strtolower(implode(',', array_keys($h))) : '-*'; +$C['hook'] = 'kses_hook'; +$C['schemes'] = '*:'. implode(',', $p); +return htmLawed($t, $C, $h); +// eof +} + +function kses_hook($t, &$C, &$S){ +// kses compat +return $t; +// eof +} \ No newline at end of file diff --git a/extlib/htmLawed/htmLawedTest.php b/extlib/htmLawed/htmLawedTest.php new file mode 100644 index 000000000..776828699 --- /dev/null +++ b/extlib/htmLawed/htmLawedTest.php @@ -0,0 +1,592 @@ + $v){ + $_POST[$k] = stripslashes($v); + } + ini_set('magic_quotes_gpc', 0); +} +set_magic_quotes_runtime(0); + +$_POST['enc'] = (isset($_POST['enc']) and preg_match('`^[-\w]+$`', $_POST['enc'])) ? $_POST['enc'] : 'utf-8'; + +// token for anti-CSRF +if(count($_POST)){ + if((empty($_GET['pre']) and ((!empty($_POST['token']) and !empty($_SESSION['token']) and $_POST['token'] != $_SESSION['token']) or empty($_POST[$_sid]) or $_POST[$_sid] != session_id() or empty($_COOKIE[$_sid]) or $_COOKIE[$_sid] != session_id())) or ($_POST[$_sid] != session_id())){ + $_POST = array('enc'=>'utf-8'); + } +} +if(empty($_GET['pre'])){ + $_SESSION['token'] = md5(uniqid(rand(), 1)); + $token = $_SESSION['token']; + session_regenerate_id(1); +} + +// compress +if(function_exists('gzencode') && isset($_SERVER['HTTP_ACCEPT_ENCODING']) && preg_match('`gzip|deflate`i', $_SERVER['HTTP_ACCEPT_ENCODING']) && !ini_get('zlib.output_compression')){ + ob_start('ob_gzhandler'); +} + +// HTM for unprocessed +if(isset($_POST['inputH'])){ + echo 'htmLawed test: HTML view of unprocessed input

  Rendering of unprocessed input without an HTML doctype or charset declaration     close window | htmLawed test page

', $_POST['inputH'], '
'; + exit; +} + +// main +$_POST['text'] = isset($_POST['text']) ? $_POST['text'] : 'text to process; < '. $_limit. ' characters'. ($_hlimit ? ' (for binary hexdump view, < '. $_hlimit. ')' : ''); +$do = (!empty($_POST[$_sid]) && isset($_POST['text'][0]) && !isset($_POST['text'][$_limit])) ? 1 : 0; +$limit_exceeded = isset($_POST['text'][$_limit]) ? 1 : 0; +$pre_mem = memory_get_usage(); +$validation = (!empty($_POST[$_sid]) and isset($_POST['w3c_validate'][0])) ? 1 : 0; +include './htmLawed.php'; + +function format($t){ + $t = "\n". str_replace(array("\t", "\r\n", "\r", '&', '<', '>', "\n"), array(' ', "\n", "\n", '&', '<', '>', "¬
\n"), $t); + return str_replace(array('
', "\n ", ' '), array("\n
\n", "\n ", '  '), $t); +} + +function hexdump($d){ +// Mainly by Aidan Lister , Peter Waller + $hexi = ''; + $ascii = ''; + ob_start(); + echo '
';
+ $offset = 0;
+ $len = strlen($d);
+ for($i=$j=0; $i<$len; $i++)
+ {
+  // Convert to hexidecimal
+  $hexi .= sprintf("%02X ", ord($d[$i]));
+  // Replace non-viewable bytes with '.'
+  if(ord($d[$i]) >= 32){
+   $ascii .= htmlspecialchars($d[$i]);
+  }else{
+   $ascii .= '.';
+  } 
+  // Add extra column spacing
+  if($j == 7){
+   $hexi .= ' ';
+   $ascii .= '  ';
+  }
+  // Add row
+  if(++$j == 16 || $i == $len-1){
+   // Join the hexi / ascii output
+   echo sprintf("%04X   %-49s   %s", $offset, $hexi, $ascii);   
+   // Reset vars
+   $hexi = $ascii = '';
+   $offset += 16;
+   $j = 0;  
+   // Add newline   
+   if ($i !== $len-1){
+    echo "\n";
+   }
+  }
+ }
+ echo '
'; + $o = ob_get_contents(); + ob_end_clean(); + return $o; +} +?> + + + + + + + + +htmLawed (<?php echo hl_version();?>) test + + +
+ +
HTMLAWED TEST
+htm / txt documentation
+ +Input » (max. chars) + +
+ +
+ + +
+ + +'; + } +?> + + + + + + + + + + Validator tools: '; + } +} +?> + +Encoding: + +
+
+ +Input text is too long!
'; +} +?> + +
+ +Settings » + + +
+ +$v){ + if($k[0] == 'h' && $v != 'nil'){ + $cfg[substr($k, 1)] = $v; + } + } + + if($cfg['anti_link_spam'] && (!empty($cfg['anti_link_spam11']) or !empty($cfg['anti_link_spam12']))){ + $cfg['anti_link_spam'] = array($cfg['anti_link_spam11'], $cfg['anti_link_spam12']); + } + unset($cfg['anti_link_spam11'], $cfg['anti_link_spam12']); + if($cfg['anti_mail_spam'] == 1){ + $cfg['anti_mail_spam'] = isset($cfg['anti_mail_spam1'][0]) ? $cfg['anti_mail_spam1'] : 0; + } + unset($cfg['anti_mail_spam11']); + if($cfg['deny_attribute'] == 1){ + $cfg['deny_attribute'] = isset($cfg['deny_attribute1'][0]) ? $cfg['deny_attribute1'] : 0; + } + unset($cfg['deny_attribute1']); + if($cfg['tidy'] == 2){ + $cfg['tidy'] = isset($cfg['tidy2'][0]) ? $cfg['tidy2'] : 0; + } + unset($cfg['tidy2']); + if($cfg['unique_ids'] == 2){ + $cfg['unique_ids'] = isset($cfg['unique_ids2'][0]) ? $cfg['unique_ids2'] : 1; + } + unset($cfg['unique_ids2']); + unset($cfg['and_mark']); // disabling and_mark + + $cfg['show_setting'] = 'hlcfg'; + $st = microtime(); + $out = htmLawed($_POST['text'], $cfg, str_replace(array('$', '{'), '', $_POST['spec'])); + $et = microtime(); + echo '
Input code » ', strlen($_POST['text']), ' chars, ~', round((substr_count($_POST['text'], '>') + substr_count($_POST['text'], '<'))/2), ' tags ', (!isset($_POST['text'][$_hlimit]) ? ' Input binary » ' : ''), ' Finalized internal settings »  ', '
Output » htmLawed processing time ', number_format(((substr($et,0,9)) + (substr($et,-10)) - (substr($st,0,9)) - (substr($st,-10))),4), ' s', (($mem = memory_get_peak_usage()) !== false ? ', peak memory usage '. round(($mem-$pre_mem)/1048576, 2). ' MB' : ''), '
'; + if($_w3c_validate && $validation) + { +?> + + + + +
Output code »
', format($out), '
', (!isset($_POST['text'][$_hlimit]) ? '
Output binary »' : ''), '
Output rendered »
', $out, '
'; +} +else{ +?> + +
+ +
Use with a Javascript- and cookie-enabled, relatively new version of a common browser. Submitted input will also be HTML-rendered (XHTML 1) after htmLawed-filtering. + +
You can use text from this collection of test-cases in the input. Set the character encoding of the browser to Unicode/utf-8 before copying.' : ''); ?> + +

For anti-XSS tests, try the special test-page or see these results. + +

Change Encoding to reflect the character encoding of the input text. Even then, it may not work or some characters may not display properly because of variable browser support and because of the form interface. Developers can write some PHP code to capture the filtered input to a file if this is important. +

Refer to the htmLawed documentation (htm/txt) for details about Settings, and htmLawed's behavior and limitations. For Settings, incorrectly-specified values like regular expressions are silently ignored. One or more settings form-fields may have been disabled. Some characters are not allowed in the Spec field. + + +

Hovering the mouse over some of the text can provide additional information in some browsers.
+ + + +

Because of character-encoding issues, the W3C validator (anyway not perfect) may reject validation requests or invalidate otherwise-valid code, esp. if text was copy-pasted in the input box. Local applications like the HTML Validator Firefox browser add-on may be useful in such cases.
+ + + +
+ + + +
+ + \ No newline at end of file diff --git a/extlib/htmLawed/htmLawed_README.htm b/extlib/htmLawed/htmLawed_README.htm new file mode 100644 index 000000000..e560e2eb2 --- /dev/null +++ b/extlib/htmLawed/htmLawed_README.htm @@ -0,0 +1,1979 @@ + + + + + + + + +htmLawed documentation | htmLawed PHP software is a free, open-source, customizable HTML input purifier and filter + + +
+

htmLawed documentation

+ +
1  About htmLawed
1.1  Example uses
1.2  Features
1.3  History
1.4  License & copyright
1.5  Terms used here
+2  Usage
2.1  Simple
2.2  Configuring htmLawed using the $config parameter
2.3  Extra HTML specifications using the $spec parameter
2.4  Performance time & memory usage
2.5  Some security risks to keep in mind
2.6  Use without modifying old kses() code
2.7  Tolerance for ill-written HTML
2.8  Limitations & work-arounds
2.9  Examples
+3  Details
3.1  Invalid/dangerous characters
3.2  Character references/entities
3.3  HTML elements
+    3.3.1  HTML comments and CDATA sections
+    3.3.2  Tag-transformation for better XHTML-Strict
+    3.3.3  Tag balancing and proper nesting
+    3.3.4  Elements requiring child elements
+    3.3.5  Beautify or compact HTML
3.4  Attributes
+    3.4.1  Auto-addition of XHTML-required attributes
+    3.4.2  Duplicate/invalid id values
+    3.4.3  URL schemes (protocols) and scripts in attribute values
+    3.4.4  Absolute & relative URLs
+    3.4.5  Lower-cased, standard attribute values
+    3.4.6  Transformation of deprecated attributes
+    3.4.7  Anti-spam & href
+    3.4.8  Inline style properties
+    3.4.9  Hook function for tag content
3.5  Simple configuration directive for most valid XHTML
3.6  Simple configuration directive for most safe HTML
3.7  Using a hook function
3.8  Obtaining finalized parameter values
3.9  Retaining non-HTML tags in input with mixed markup
+4  Other
4.1  Support
4.2  Known issues
4.3  Change-log
4.4  Testing
4.5  Upgrade, & old versions
4.6  Comparison with HTMLPurifier
4.7  Use through application plug-ins/modules
4.8  Use in non-PHP applications
4.9  Donate
4.10  Acknowledgements
+5  Appendices
5.1  Characters discouraged in HTML
5.2  Valid attribute-element combinations
5.3  CSS 2.1 properties accepting URLs
5.4  Microsoft Windows 1252 character replacements
5.5  URL format
5.6  Brief on htmLawed code
+ +
+
+
htmLawed_README.txt, 16 July 2009
+htmLawed 1.1.8.1, 16 July 2009
+Copyright Santosh Patnaik
+GPL v3 license
+A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed 
+
+ +

+1  About htmLawed +

(to top)
+
+  htmLawed is a highly customizable single-file PHP script to make text secure, and standard- and admin policy-compliant for use in the body of HTML 4, XHTML 1 or 1.1, or generic XML documents. It is thus a configurable input (X)HTML filter, processor, purifier, sanitizer, beautifier, etc., and an alternative to the HTMLTidy application.
+
+  The lawing in of input text is needed to ensure that HTML code in the text is standard-compliant, does not introduce security vulnerabilities, and does not break the aesthetics, design or layout of web-pages. htmLawed tries to do this by, for example, making HTML well-formed with balanced and properly nested tags, neutralizing code that may be used for cross-site scripting (XSS) attacks, and allowing only specified HTML elements/tags and attributes.
+ +

+1.1  Example uses +

(to top)
+
+  *  Filtering of text submitted as comments on blogs to allow only certain HTML elements
+
+  *  Making RSS/Atom newsfeed item-content standard-compliant: often one uses an excerpt from an HTML document for the content, and with unbalanced tags, non-numerical entities, etc., such excerpts may not be XML-compliant
+
+  *  Text processing for stricter XML standard-compliance: e.g., to have lowercased x in hexadecimal numeric entities becomes necessary if an XHTML document with MathML content needs to be served as application/xml
+
+  *  Scraping text or data from web-pages
+
+  *  Pretty-printing HTML code
+ +
+

+1.2  Features +

(to top)
+
+  Key: * security feature, ^ standard compliance, ~ requires setting right options, ` different from Kses
+
+  *  make input more secure and standard-compliant
+  *  use for HTML 4, XHTML 1.0 or 1.1, or even generic XML documents  ^~`
+
+  *  beautify or compact HTML  ^~`
+
+  *  restrict elements  ^~`
+  *  proper closure of empty elements like img  ^`
+  *  transform deprecated elements like u  ^~`
+  *  HTML comments and CDATA sections can be permitted  ^~`
+  *  elements like script, object and form can be permitted  ~
+
+  *  restrict attributes, including element-specifically  ^~`
+  *  remove invalid attributes  ^`
+  *  element and attribute names are lower-cased  ^
+  *  provide required attributes, like alt for image  ^`
+  *  transform deprecated attributes  ^~`
+  *  attributes declared only once  ^`
+
+  *  restrict attribute values, including element-specifically  ^~`
+  *  a value is declared for empty (minimized) attributes like checked  ^
+  *  check for potentially dangerous attribute values  *~
+  *  ensure unique id attribute values  ^~`
+  *  double-quote attribute values  ^
+  *  lower-case standard attribute values like password  ^`
+
+  *  attribute-specific URL protocol/scheme restriction  *~`
+  *  disable dynamic expressions in style values  *~`
+
+  *  neutralize invalid named character entities  ^`
+  *  convert hexadecimal numeric entities to decimal ones, or vice versa  ^~`
+  *  convert named entities to numeric ones for generic XML use  ^~`
+
+  *  remove null characters  *
+  *  neutralize potentially dangerous proprietary Netscape Javascript entities  *
+  *  replace potentially dangerous soft-hyphen character in attribute values with spaces  *
+
+  *  remove common invalid characters not allowed in HTML or XML  ^`
+  *  replace characters from Microsoft applications like Word that are discouraged in HTML or XML  ^~`
+  *  neutralize entities for characters invalid or discouraged in HTML or XML  ^`
+  *  appropriately neutralize <, &, ", and > characters  ^*`
+
+  *  understands improperly spaced tag content (like, spread over more than a line) and properly spaces them  `
+  *  attempts to balance tags for well-formedness  ^~`
+  *  understands when omitable closing tags like </p> (allowed in HTML 4, transitional, e.g.) are missing  ^~`
+  *  attempts to permit only validly nested tags  ^~`
+  *  option to remove or neutralize bad content ^~`
+  *  attempts to rectify common errors of plain-text misplacement (e.g., directly inside blockquote) ^~`
+
+  *  fast, non-OOP code of ~45 kb incurring peak basal memory usage of ~0.5 MB
+  *  compatible with pre-existing code using Kses (the filter used by WordPress)
+
+  *  optional anti-spam measures such as addition of rel="nofollow" and link-disabling  ~`
+  *  optionally makes relative URLs absolute, and vice versa  ~`
+
+  *  optionally mark & to identify the entities for &, < and > introduced by htmLawed  ~`
+
+  *  allows deployment of powerful hook functions to inject HTML, consolidate style attributes to class, finely check attribute values, etc.  ~`
+
+  *  independent of character encoding of input and does not affect it
+
+  *  tolerance for ill-written HTML to a certain degree
+ +
+

+1.3  History +

(to top)
+
+  htmLawed was developed for use with LabWiki, a wiki software developed at PHP Labware, as a suitable software could not be found. Existing PHP software like Kses and HTMLPurifier were deemed inadequate, slow, resource-intensive, or dependent on external applications like HTML Tidy.
+
+  htmLawed started as a modification of Ulf Harnhammar's Kses (version 0.2.2) software, and is compatible with code that uses Kses; see section 2.6.
+ +
+

+1.4  License & copyright +

(to top)
+
+  htmLawed is free and open-source software licensed under GPL license version 3, and copyrighted by Santosh Patnaik, MD, PhD.
+ +
+

+1.5  Terms used here +

(to top)
+
+  *  administrator - or admin; person setting up the code to pass input through htmLawed; also, user
+  *  attributes - name-value pairs like href="http://x.com" in opening tags
+  *  author - writer
+  *  character - atomic unit of text; internally represented by a numeric code-point as specified by the encoding or charset in use
+  *  entity - markup like &gt; and &#160; used to refer to a character
+  *  element - HTML element like a and img
+  *  element content -  content between the opening and closing tags of an element, like click of <a href="x">click</a>
+  *  HTML - implies XHTML unless specified otherwise
+  *  input - text string given to htmLawed to process
+  *  processing - involves filtering, correction, etc., of input
+  *  safe - absence or reduction of certain characters and HTML elements and attributes in the input that can otherwise potentially and circumstantially expose web-site users to security vulnerabilities like cross-site scripting attacks (XSS)
+  *  scheme - URL protocol like http and ftp
+  *  specs - standard specifications
+  *  style property - terms like border and height for which declarations are made in values for the style attribute of elements
+  *  tag - markers like <a href="x"> and </a> delineating element content; the opening tag can contain attributes
+  *  tag content - consists of tag markers < and >, element names like div, and possibly attributes
+  *  user - administrator
+  *  writer - end-user like a blog commenter providing the input that is to be processed; also, author
+ +
+
+

+2  Usage +

(to top)
+
+  htmLawed should work with PHP 4.3 and higher. Either include() the htmLawed.php file or copy-paste the entire code.
+
+  To easily test htmLawed using a form-based interface, use the provided demo (htmLawed.php and htmLawedTest.php should be in the same directory on the web-server).
+ +

+2.1  Simple +

(to top)
+
+  The input text to be processed, $text, is passed as an argument of type string; htmLawed() returns the processed string:
+
+ +    $processed = htmLawed($text); +
+
Note: If input is from a $_GET or $_POST value, and magic quotes are enabled on the PHP setup, run stripslashes() on the input before passing to htmLawed.
+
+  By default, htmLawed will process the text allowing all valid HTML elements/tags, secure URL scheme/CSS style properties, etc. It will allow CDATA sections and HTML comments, balance tags, and ensure proper nesting of elements. Such actions can be configured using two other optional arguments -- $config and $spec:
+
+ +    $processed = htmLawed($text, $config, $spec); +
+
+  These extra parameters are detailed below. Some examples are shown in section 2.9.
+
Note: For maximum protection against XSS and other scripting attacks (e.g., by disallowing Javascript code), consider using the safe parameter; see section 3.6.
+ +
+

+2.2  Configuring htmLawed using the $config parameter +

(to top)
+
$config instructs htmLawed on how to tackle certain tasks. When $config is not specified, or not set as an array (e.g., $config = 1), htmLawed will take default actions. One or many of the task-action or value-specification pairs can be specified in $config as array key-value pairs. If a parameter is not specified, htmLawed will use the default value/action indicated further below.
+
+ +    $config = array('comment'=>0, 'cdata'=>1); +
+ +    $processed = htmLawed($text, $config); +
+
+  Or,
+
+ +    $processed = htmLawed($text, array('comment'=>0, 'cdata'=>1)); +
+
+  Below are the possible value-specification combinations. In PHP code, values that are integers should not be quoted and should be used as numeric types (unless meant as string/text).
+
+  Key: * default, ^ different default when htmLawed is used in the Kses-compatible mode (see section 2.6), ~ different default when valid_xhtml is set to 1 (see section 3.5), " different default when safe is set to 1 (see section 3.6)
+
abs_url
+  Make URLs absolute or relative; $config["base_url"] needs to be set; see section 3.4.4
+
-1 - make relative
0 - no action  *
1 - make absolute
+
and_mark
+  Mark & characters in the original input; see section 3.2
+
anti_link_spam
+  Anti-link-spam measure; see section 3.4.7
+
0 - no measure taken  *
array("regex1", "regex2") - will ensure a rel attribute with nofollow in its value in case the href attribute value matches the regular expression pattern regex1, and/or will remove href if its value matches the regular expression pattern regex2. E.g., array("/./", "/://\W*(?!(abc\.com|xyz\.org))/"); see section 3.4.7 for more.
+
anti_mail_spam
+  Anti-mail-spam measure; see section 3.4.7
+
0 - no measure taken  *
word - @ in mail address in href attribute value is replaced with specified word
+
balance
+  Balance tags for well-formedness and proper nesting; see section 3.3.3
+
0 - no
1 - yes  *
+
base_url
+  Base URL value that needs to be set if $config["abs_url"] is not 0; see section 3.4.4
+
cdata
+  Handling of CDATA sections; see section 3.3.1
+
0 - don't consider CDATA sections as markup and proceed as if plain text  ^"
1 - remove
2 - allow, but neutralize any <, >, and & inside by converting them to named entities
3 - allow  *
+
clean_ms_char
+  Replace discouraged characters introduced by Microsoft Word, etc.; see section 3.1
+
0 - no  *
1 - yes
2 - yes, but replace special single & double quotes with ordinary ones
+
comment
+  Handling of HTML comments; see section 3.3.1
+
0 - don't consider comments as markup and proceed as if plain text  ^"
1 - remove
2 - allow, but neutralize any <, >, and & inside by converting to named entities
3 - allow  *
+
css_expression
+  Allow dynamic CSS expression by not removing the expression from CSS property values in style attributes; see section 3.4.8
+
0 - remove  *
1 - allow
+
deny_attribute
+  Denied HTML attributes; see section 3.4
+
0 - none  *
string - dictated by values in string
on* (like onfocus) attributes not allowed - "
+
elements
+  Allowed HTML elements; see section 3.3
+
* -center -dir -font -isindex -menu -s -strike -u -  ~
applet, embed, iframe, object, script not allowed - "
+
hexdec_entity
+  Allow hexadecimal numeric entities and do not convert to the more widely accepted decimal ones, or convert decimal to hexadecimal ones; see section 3.2
+
0 - no
1 - yes  *
2 - convert decimal to hexadecimal ones
+
hook
+  Name of an optional hook function to alter the input string, $config or $spec before htmLawed starts its main work; see section 3.7
+
0 - no hook function  *
name - name is name of the hook function (kses_hook  ^)
+
hook_tag
+  Name of an optional hook function to alter tag content finalized by htmLawed; see section 3.4.9
+
0 - no hook function  *
name - name is name of the hook function
+
keep_bad
+  Neutralize bad tags by converting < and > to entities, or remove them; see section 3.3.3
+
0 - remove  ^
1 - neutralize both tags and element content
2 - remove tags but neutralize element content
3 and 4 - like 1 and 2 but remove if text (pcdata) is invalid in parent element
5 and 6 * -  like 3 and 4 but line-breaks, tabs and spaces are left
+
lc_std_val
+  For XHTML compliance, predefined, standard attribute values, like get for the method attribute of form, must be lowercased; see section 3.4.5
+
0 - no
1 - yes  *
+
make_tag_strict
+  Transform/remove these non-strict XHTML elements, even if they are allowed by the admin: applet center dir embed font isindex menu s strike u; see section 3.3.2
+
0 - no  ^
1 - yes, but leave applet, embed and isindex elements that currently can't be transformed  *
2 - yes, removing applet, embed and isindex elements and their contents (nested elements remain)  ~
+
named_entity
+  Allow non-universal named HTML entities, or convert to numeric ones; see section 3.2
+
0 - convert
1 - allow  *
+
no_deprecated_attr
+  Allow deprecated attributes or transform them; see section 3.4.6
+
0 - allow  ^
1 - transform, but name attributes for a and map are retained  *
2 - transform
+
parent
+  Name of the parent element, possibly imagined, that will hold the input; see section 3.3
+
safe
+  Magic parameter to make input the most secure against XSS without needing to specify other relevant $config parameters; see section 3.6
+
0 - no  *
1 - will auto-adjust other relevant $config parameters (indicated by " in this list)
+
schemes
+  Array of attribute-specific, comma-separated, lower-cased list of schemes (protocols) allowed in attributes accepting URLs; * covers all unspecified attributes; see section 3.4.3
+
href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; *:file, http, https  *
*: ftp, gopher, http, https, mailto, news, nntp, telnet  ^
href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; style: nil; *:file, http, https  "
+
show_setting
+  Name of a PHP variable to assign the finalized $config and $spec values; see section 3.8
+
style_pass
+  Do not look at style attribute values, letting them through without any alteration
+
0 - no *
1 - htmLawed will let through any style value; see section 3.4.8
+
tidy
+  Beautify or compact HTML code; see section 3.3.5
+
-1 - compact
0 - no  *
1 or string - beautify (custom format specified by string)
+
unique_ids
id attribute value checks; see section 3.4.2
+
0 - no  ^
1 - remove duplicate and/or invalid ones  *
word - remove invalid ones and replace duplicate ones with new and unique ones based on the word; the admin-specified word, like my_, should begin with a letter (a-z) and can contain letters, digits, ., _, -, and :.
+
valid_xhtml
+  Magic parameter to make input the most valid XHTML without needing to specify other relevant $config parameters; see section 3.5
+
0 - no  *
1 - will auto-adjust other relevant $config parameters (indicated by ~ in this list)
+
xml:lang
+  Auto-adding xml:lang attribute; see section 3.4.1
+
0 - no  *
1 - add if lang attribute is present
2 - add if lang attribute is present, and remove lang  ~
+ +
+

+2.3  Extra HTML specifications using the $spec parameter +

(to top)
+
+  The $spec argument can be used to disallow an otherwise legal attribute for an element, or to restrict the attribute's values. This can also be helpful as a security measure (e.g., in certain versions of browsers, certain values can cause buffer overflows and denial of service attacks), or in enforcing admin policy compliance. $spec is specified as a string of text containing one or more rules, with multiple rules separated from each other by a semi-colon (;). E.g.,
+
+ +    $spec = 'i=-*; td, tr=style, id, -*; a=id(match="/[a-z][a-z\d.:\-`"]*/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt'; +
+ +    $processed = htmLawed($text, $config, $spec); +
+
+  Or,
+
+ +    $processed = htmLawed($text, $config, 'i=-*; td, tr=style, id, -*; a=id(match="/[a-z][a-z\d.:\-`"]*/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt'); +
+
+  A rule begins with an HTML element name(s) (rule-element), for which the rule applies, followed by an equal (=) sign. A rule-element may represent multiple elements if comma (,)-separated element names are used. E.g., th,td,tr=.
+
+  Rest of the rule consists of comma-separated HTML attribute names. A minus (-) character before an attribute means that the attribute is not permitted inside the rule-element. E.g., -width. To deny all attributes, -* can be used.
+
+  Following shows examples of rule excerpts with rule-element a and the attributes that are being permitted:
+
+  *  a= - all
+  *  a=id - all
+  *  a=href, title, -id, -onclick - all except id and onclick
+  *  a=*, id, -id - all except id
+  *  a=-* - none
+  *  a=-*, href, title - none except href and title
+  *  a=-*, -id, href, title - none except href and title
+
+  Rules regarding attribute values are optionally specified inside round brackets after attribute names in slash ('/')-separated parameter = value pairs. E.g., title(maxlen=30/minlen=5). None, or one or more of the following parameters may be specified:
+
+  *  oneof - one or more choices separated by | that the value should match; if only one choice is provided, then the value must match that choice
+
+  *  noneof - one or more choices separated by | that the value should not match
+
+  *  maxlen and minlen - upper and lower limits for the number of characters in the attribute value; specified in numbers
+
+  *  maxval and minval - upper and lower limits for the numerical value specified in the attribute value; specified in numbers
+
+  *  match and nomatch - pattern that the attribute value should or should not match; specified as PHP/PCRE-compatible regular expressions with delimiters and possibly modifiers
+
+  *  default - a value to force on the attribute if the value provided by the writer does not fit any of the specified parameters
+
+  If default is not set and the attribute value does not satisfy any of the specified parameters, then the attribute is removed. The default value can also be used to force all attribute declarations to take the same value (by getting the values declared illegal by setting, e.g., maxlen to -1).
+
+  Examples with input <input title="WIDTH" value="10em" /><input title="length" value="5" /> are shown below.
+
Rule: input=title(maxlen=60/minlen=6), value
Output: <input value="10em" /><input title="length" value="5" />
+
Rule: input=title(), value(maxval=8/default=6)
Output: <input title="WIDTH" value="6" /><input title="length" value="5" />
+
Rule: input=title(nomatch=$w.d$i), value(match=$em$/default=6em)
Output: <input value="10em" /><input title="length" value="6em" />
+
Rule: input=title(oneof=height|depth/default=depth), value(noneof=5|6)
Output: <input title="depth" value="10em" /><input title="depth" />
+
Special characters: The characters ;, ,, /, (, ), |, ~ and space have special meanings in the rules. Words in the rules that use such characters, or the characters themselves, should be escaped by enclosing in pairs of double-quotes ("). A back-tick (`) can be used to escape a literal ". An example rule illustrating this is input=value(maxlen=30/match="/^\w/"/default="your `"ID`"").
+
Note: To deny an attribute for all elements for which it is legal, $config["deny_attribute"] (see section 3.4) can be used instead of $spec. Also, attributes can be allowed element-specifically through $spec while being denied globally through $config["deny_attribute"]. The hook_tag parameter (section 3.4.9) can also be used to implement the $spec functionality.
+ +
+

+2.4  Performance time & memory usage +

(to top)
+
+  The time and memory used by htmLawed depends on its configuration and the size of the input, and the amount, nestedness and well-formedness of the HTML markup within it. In particular, tag balancing and beautification each can increase the processing time by about a quarter.
+
+  The htmLawed demo can be used to evaluate the performance and effects of different types of input and $config.
+ +
+

+2.5  Some security risks to keep in mind +

(to top)
+
+  When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially dangerous HTML code. (This may not be a problem if the authors are trusted.)
+
+  For example, following increase security risks:
+
+  *  Allowing script, applet, embed, iframe or object elements, or certain of their attributes like allowscriptaccess
+
+  *  Allowing HTML comments (some Internet Explorer versions are vulnerable with, e.g., <!--[if gte IE 4]><script>alert("xss");</script><![endif]-->
+
+  *  Allowing dynamic CSS expressions (a feature of the IE browser)
+
Unsafe HTML can be removed by setting $config appropriately. E.g., $config["elements"] = "* -script" (section 3.3), $config["safe"] = 1 (section 3.6), etc.
+ +
+

+2.6  Use without modifying old kses() code +

(to top)
+
+  The Kses PHP script is used by many applications (like WordPress). It is possible to have such applications use htmLawed instead, since it is compatible with code that calls the kses() function declared in the Kses file (usually named kses.php). E.g., application code like this will continue to work after replacing Kses with htmLawed:
+
+ +    $comment_filtered = kses($comment_input, array('a'=>array(), 'b'=>array(), 'i'=>array())); +
+
+  For some of the $config parameters, htmLawed will use values other than the default ones. These are indicated by ^ in section 2.2. To force htmLawed to use other values, function kses() in the htmLawed code should be edited -- a few configurable parameters/variables need to be changed.
+
+  If the application uses a Kses file that has the kses() function declared, then, to have the application use htmLawed instead of Kses, simply rename htmLawed.php (to kses.php, e.g.) and replace the Kses file (or just replace the code in the Kses file with the htmLawed code). If the kses() function in the Kses file had been renamed by the application developer (e.g., in WordPress, it is named wp_kses()), then appropriately rename the kses() function in the htmLawed code.
+
+  If the Kses file used by the application has been highly altered by the application developers, then one may need a different approach. E.g., with WordPress, it is best to copy the htmLawed code to wp_includes/kses.php, rename the newly added function kses() to wp_kses(), and delete the code for the original wp_kses() function.
+
+  If the Kses code has a non-empty hook function (e.g., wp_kses_hook() in case of WordPress), then the code for htmLawed's kses_hook() function should be appropriately edited. However, the requirement of the hook function should be re-evaluated considering that htmLawed has extra capabilities. With WordPress, the hook function is an essential one. The following code is suggested for the htmLawed kses_hook() in case of WordPress:
+
+ +    function kses_hook($string, &$cf, &$spec){ +
+ +    // kses compatibility +
+ +    $allowed_html = $spec; +
+ +    $allowed_protocols = array(); +
+ +    foreach($cf['schemes'] as $v){ +
+ +     foreach($v as $k2=>$v2){ +
+ +      if(!in_array($k2, $allowed_protocols)){ +
+ +       $allowed_protocols[] = $k2; +
+ +      } +
+ +     } +
+ +    } +
+ +    return wp_kses_hook($string, $allowed_html, $allowed_protocols); +
+ +    // eof +
+ +    } +
+ +
+

+2.7  Tolerance for ill-written HTML +

(to top)
+
+  htmLawed can work with ill-written HTML code in the input. However, HTML that is too ill-written may not be read as HTML, and be considered mere plain text instead. Following statements indicate the degree of looseness that htmLawed can work with, and can be provided in instructions to writers:
+
+  *  Tags must be flanked by < and > with no > inside -- any needed > should be put in as &gt;. It is possible for tag content (element name and attributes) to be spread over many lines instead of being on one. A space may be present between the tag content and >, like <div > and <img / >, but not after the <.
+
+  *  Element and attribute names need not be lower-cased.
+
+  *  Attribute string of elements may be liberally spaced with tabs, line-breaks, etc.
+
+  *  Attribute values may not be double-quoted, or may be single-quoted.
+
+  *  Left-padding of numeric entities (like, &#0160;, &x07ff;) with 0 is okay as long as the number of characters between between the & and the ; does not exceed 8. All entities must end with ; though.
+
+  *  Named character entities must be properly cased. E.g., &Lt; or &TILDE; will not be let through without modification.
+
+  *  HTML comments should not be inside element tags (okay between tags), and should begin with <!-- and end with -->. Characters like <, >, and & may be allowed inside depending on $config, but any --> inside should be put in as --&gt;. Any -- inside will be automatically converted to -, and a space will be added before the comment delimiter -->.
+
+  *  CDATA sections should not be inside element tags, and can be in element content only if plain text is allowed for that element. They should begin with <[CDATA[ and end with ]]>. Characters like <, >, and & may be allowed inside depending on $config, but any ]]> inside should be put in as ]]&gt;.
+
+  *  For attribute values, character entities &lt;, &gt; and &amp; should be used instead of characters < and >, and & (when & is not part of a character entity). This applies even for Javascript code in values of attributes like onclick.
+
+  *  Characters <, >, & and " that are part of actual Javascript, etc., code in script elements should be used as such and not be put in as entities like &gt;. Otherwise, though the HTML will be valid, the code may fail to work. Further, if such characters have to be used, then they should be put inside CDATA sections.
+
+  *  Simple instructions like "an opening tag cannot be present between two closing tags" and "nested elements should be closed in the reverse order of how they were opened" can help authors write balanced HTML. If tags are imbalanced, htmLawed will try to balance them, but in the process, depending on $config["keep_bad"], some code/text may be lost.
+
+  *  Input authors should be notified of admin-specified allowed elements, attributes, configuration values (like conversion of named entities to numeric ones), etc.
+
+  *  With $config["unique_ids"] not 0 and the id attribute being permitted, writers should carefully avoid using duplicate or invalid id values as even though htmLawed will correct/remove the values, the final output may not be the one desired. E.g., when <a id="home"></a><input id="home" /><label for="home"></label> is processed into
+<a id="home"></a><input id="prefix_home" /><label for="home"></label>.
+
+  *  Note that even if intended HTML is lost in a highly ill-written input, the processed output will be more secure and standard-compliant.
+
+  *  For URLs, unless $config["scheme"] is appropriately set, writers should avoid using escape characters or entities in schemes. E.g., htt&#112; (which many browsers will read as the harmless http) may be considered bad by htmLawed.
+
+  *  htmLawed will attempt to put plain text present directly inside blockquote, form, map and noscript elements (illegal as per the specs) inside auto-generated div elements.
+ +
+

+2.8  Limitations & work-arounds +

(to top)
+
+  htmLawed's main objective is to make the input text more standard-compliant, secure for web-page readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with work-arounds.
+
+  It should be borne in mind that no browser application is 100% standard-compliant, and that some of the standard specs (like asking for normalization of white-spacing within textarea elements) are clearly wrong. Regarding security, note that unsafe HTML code is not necessarily legally invalid.
+
+  *  htmLawed is meant for input that goes into the body of HTML documents. HTML's head-level elements are not supported, nor are the frameset elements frameset, frame and noframes.
+
+  *  It cannot transform the non-standard embed elements to the standard-compliant object elements. Yet, it can allow embed elements if permitted (embed is widely used and supported). Admins can certainly use the hook_tag parameter (section 3.4.9) to deploy a custom embed-to-object converter function.
+
+  *  The only non-standard element that may be permitted is embed; others like noembed and nobr cannot be permitted without modifying the htmLawed code.
+
+  *  It cannot handle input that has non-HTML code like SVG and MathML. One way around is to break the input into pieces and passing only those without non-HTML code to htmLawed. Another is described in section 3.9. A third way may be to some how take advantage of the $config["and_mark"] parameter (see section 3.2).
+
+  *  By default, htmLawed won't check many attribute values for standard compliance. E.g., width="20m" with the dimension in non-standard m is let through. Implementing universal and strict attribute value checks can make htmLawed slow and resource-intensive. Admins should look at the hook_tag parameter (section 3.4.9) or $spec to enforce finer checks.
+
+  *  The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specs. Only a few of the proprietary attributes are supported.
+
+  *  Except for contained URLs and dynamic expressions (also optional), htmLawed does not check CSS style property values. Admins should look at using the hook_tag parameter (section 3.4.9) or $spec for finer checks. Perhaps the best option is to disallow style but allow class attributes with the right oneof or match values for class, and have the various class style properties in .css CSS stylesheet files.
+
+  *  htmLawed does not parse emoticons, decode BBcode, or wikify, auto-converting text to proper HTML. Similarly, it won't convert line-breaks to br elements. Such functions are beyond its purview. Admins should use other code to pre- or post-process the input for such purposes.
+
+  *  htmLawed cannot be used to have links force-opened in new windows (by auto-adding appropriate target and onclick attributes to a). Admins should look at Javascript-based DOM-modifying solutions for this. Admins may also be able to use a custom hook function to enforce such checks (hook_tag parameter; see section 3.4.9).
+
+  *  Nesting-based checks are not possible. E.g., one cannot disallow p elements specifically inside td while permitting it elsewhere. Admins may be able to use a custom hook function to enforce such checks (hook_tag parameter; see section 3.4.9).
+
+  *  Except for optionally converting absolute or relative URLs to the other type, htmLawed will not alter URLs (e.g., to change the value of query strings or to convert http to https. Having absolute URLs may be a standard-requirement, e.g., when HTML is embedded in email messages, whereas altering URLs for other purposes is beyond htmLawed's goals. Admins may be able to use a custom hook function to enforce such checks (hook_tag parameter; see section 3.4.9).
+
+  *  Pairs of opening and closing tags that do not enclose any content (like <em></em>) are not removed. This may be against the standard specs for certain elements (e.g., table). However, presence of such standard-incompliant code will not break the display or layout of content. Admins can also use simple regex-based code to filter out such code.
+
+  *  htmLawed does not check for certain element orderings described in the standard specs (e.g., in a table, tbody is allowed before tfoot). Admins may be able to use a custom hook function to enforce such checks (hook_tag parameter; see section 3.4.9).
+
+  *  htmLawed does not check the number of nested elements. E.g., it will allow two caption elements in a table element, illegal as per the specs. Admins may be able to use a custom hook function to enforce such checks (hook_tag parameter; see section 3.4.9).
+
+  *  htmLawed might convert certain entities to actual characters and remove backslashes and CSS comment-markers (/*) in style attribute values in order to detect malicious HTML like crafted IE-specific dynamic expressions like &#101;xpression.... If this is too harsh, admins can allow CSS expressions through htmLawed core but then use a custom function through the hook_tag parameter (section 3.4.9) to more specifically identify CSS expressions in the style attribute values. Also, using $config["style_pass"], it is possible to have htmLawed pass style attribute values without even looking at them (section 3.4.8).
+
+  *  htmLawed does not correct certain possible attribute-based security vulnerabilities (e.g., <a href="http://x%22+style=%22background-image:xss">x</a>). These arise when browsers mis-identify markup in escaped text, defeating the very purpose of escaping text (a bad browser will read the given example as <a href="http://x" style="background-image:xss">x</a>).
+
+  *  Because of poor Unicode support in PHP, htmLawed does not remove the high value HTML-invalid characters with multi-byte code-points. Such characters however are extremely unlikely to be in the input. (see section 3.1).
+
+  *  Like any script using PHP's PCRE regex functions, PHP setup-specific low PCRE limit values can cause htmLawed to at least partially fail with very long input texts.
+ +
+

+2.9  Examples +

(to top)
+
1. A blog administrator wants to allow only a, em, strike, strong and u in comments, but needs strike and u transformed to span for better XHTML 1-strict compliance, and, he wants the a links to be to http or https resources:
+
+ +    $processed = htmLawed($in, array('elements'=>'a, em, strike, strong, u', 'make_tag_strict'=>1, 'safe'=>1, 'schemes'=>'*:http, https'), 'a=href'); +
+
2. An author uses a custom-made web application to load content on his web-site. He is the only one using that application and the content he generates has all types of HTML, including scripts. The web application uses htmLawed primarily as a tool to correct errors that creep in while writing HTML and to take care of the occasional bad characters in copy-paste text introduced by Microsoft Office. The web application provides a preview before submitted input is added to the content. For the previewing process, htmLawed is set up as follows:
+
+ +    $processed = htmLawed($in, array('css_expression'=>1, 'keep_bad'=>1, 'make_tag_strict'=>1, 'schemes'=>'*:*', 'valid_xhtml'=>1)); +
+
+  For the final submission process, keep_bad is set to 6. A value of 1 for the preview process allows the author to note and correct any HTML mistake without losing any of the typed text.
+
3. A data-miner is scraping information in a specific table of similar web-pages and is collating the data rows, and uses htmLawed to reduce unnecessary markup and white-spaces:
+
+ +    $processed = htmLawed($in, array('elements'=>'tr, td', 'tidy'=>-1), 'tr, td ='); +
+ +
+
+

+3  Details +

(to top)
+

+3.1  Invalid/dangerous characters +

(to top)
+
+  Valid characters (more correctly, their code-points) in HTML or XML are, hexadecimally, 9, a, d, 20 to d7ff, and e000 to 10ffff, except fffe and ffff (decimally, 9, 10, 13, 32 to 55295, and 57344 to 1114111, except 65534 and 65535). htmLawed removes the invalid characters 0 to 8, b, c, and e to 1f.
+
+  Because of PHP's poor native support for multi-byte characters, htmLawed cannot check for the remaining invalid code-points. However, for various reasons, it is very unlikely for any of those characters to be in the input.
+
+  Characters that are discouraged (see section 5.1) but not invalid are not removed by htmLawed.
+
+  It (function hl_tag()) also replaces the potentially dangerous (in some Mozilla [Firefox] and Opera browsers) soft-hyphen character (code-point, hexadecimally, ad, or decimally, 173) in attribute values with spaces. Where required, the characters <, >, &, and " are converted to entities.
+
+  With $config["clean_ms_char"] set as 1 or 2, many of the discouraged characters (decimal code-points 127 to 159 except 133) that many Microsoft applications incorrectly use (as per the Windows 1252 [Cp-1252] or a similar encoding system), and the character for decimal code-point 133, are converted to appropriate decimal numerical entities (or removed for a few cases)-- see appendix in section 5.4. This can help avoid some display issues arising from copying-pasting of content.
+
+  With $config["clean_ms_char"] set as 2, characters for the hexadecimal code-points 82, 91, and 92 (for special single-quotes), and 84, 93, and 94 (for special double-quotes) are converted to ordinary single and double quotes respectively and not to entities.
+
+  The character values are replaced with entities/characters and not character values referred to by the entities/characters to keep this task independent of the character-encoding of input text.
+
+  The $config["clean_ms_char"] parameter need not be used if authors do not copy-paste Microsoft-created text or if the input text is not believed to use the Windows 1252 or a similar encoding. Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up.
+ +
+

+3.2  Character references/entities +

(to top)
+
+  Valid character entities take the form &*; where * is #x followed by a hexadecimal number (hexadecimal numeric entity; like &#xA0; for non-breaking space), or alphanumeric like gt (external or named entity; like &nbsp; for non-breaking space), or # followed by a number (decimal numeric entity; like &#160; for non-breaking space). Character entities referring to the soft-hyphen character (the &shy; or \xad character; hexadecimal code-point ad [decimal 173]) in attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers.
+
+  htmLawed (function hl_ent()):
+
+  *  Neutralizes entities with multiple leading zeroes or missing semi-colons (potentially dangerous)
+
+  *  Lowercases the X (for XML-compliance) and A-F of hexadecimal numeric entities
+
+  *  Neutralizes entities referring to characters that are HTML-invalid (see section 3.1)
+
+  *  Neutralizes entities referring to characters that are HTML-discouraged (code-points, hexadecimally, 7f to 84, 86 to 9f, and fdd0 to fddf, or decimally, 127 to 132, 134 to 159, and 64991 to 64976). Entities referring to the remaining discouraged characters (see section 5.1 for a full list) are let through.
+
+  *  Neutralizes named entities that are not in the specs.
+
+  *  Optionally converts valid HTML-specific named entities except &gt;, &lt;, &quot;, and &amp; to decimal numeric ones (hexadecimal if $config["hexdec_entity"] is 2) for generic XML-compliance. For this, $config["named_entity"] should be 1.
+
+  *  Optionally converts hexadecimal numeric entities to the more widely supported decimal ones. For this, $config["hexdec_entity"] should be 0.
+
+  *  Optionally converts decimal numeric entities to the hexadecimal ones. For this, $config["hexdec_entity"] should be 2.
+
Neutralization refers to the entitification of & to &amp;.
+
Note: htmLawed does not convert entities to the actual characters represented by them; one can pass the htmLawed output through PHP's html_entity_decode function for that.
+
Note: If $config["and_mark"] is set, and set to a value other than 0, then the & characters in the original input are replaced with the control character for the hexadecimal code-point 6 (\x06; & characters introduced by htmLawed, e.g., after converting < to &lt;, are not affected). This allows one to distinguish, say, an &gt; introduced by htmLawed and an &gt; put in by the input writer, and can be helpful in further processing of the htmLawed-processed text (e.g., to identify the character sequence o(><)o to generate an emoticon image). When this feature is active, admins should ensure that the htmLawed output is not directly used in web pages or XML documents as the presence of the \x06 can break documents. Before use in such documents, and preferably before any storage, any remaining \x06 should be changed back to &, e.g., with:
+
+ +    $final = str_replace("\x06", '&', $prelim); +
+
+  Also, see section 3.9.
+ +
+

+3.3  HTML elements +

(to top)
+
+  htmLawed can be configured to allow only certain HTML elements (tags) in the input. Disallowed elements (just tag-content, and not element-content), based on $config["keep_bad"], are either neutralized (converted to plain text by entitification of < and >) or removed.
+
+  E.g., with only em permitted:
+
+  Input:
+
+ +      <em>My</em> website is <a href="http://a.com>a.com</a>. +
+
+  Output, with $config["keep_bad"] = 0:
+
+ +      <em>My</em> website is a.com. +
+
+  Output, with $config["keep_bad"] not 0:
+
+ +      <em>My</em> website is &lt;a href=""&gt;a.com&lt;/a&gt;. +
+
+  See section 3.3.3 for differences between the various non-zero $config["keep_bad"] values.
+
+  htmLawed by default permits these 86 elements:
+
+ +    a, abbr, acronym, address, applet, area, b, bdo, big, blockquote, br, button, caption, center, cite, code, col, colgroup, dd, del, dfn, dir, div, dl, dt, em, embed, fieldset, font, form, h1, h2, h3, h4, h5, h6, hr, i, iframe, img, input, ins, isindex, kbd, label, legend, li, map, menu, noscript, object, ol, optgroup, option, p, param, pre, q, rb, rbc, rp, rt, rtc, ruby, s, samp, script, select, small, span, strike, strong, sub, sup, table, tbody, td, textarea, tfoot, th, thead, tr, tt, u, ul, var +
+
+  Except for embed (included because of its wide-spread use) and the Ruby elements (rb, rbc, rp, rt, rtc, ruby; part of XHTML 1.1), these are all the elements in the HTML 4/XHTML 1 specs. Strict-specific specs. exclude center, dir, font, isindex, menu, s, strike, and u.
+
+  With $config["safe"] = 1, the default set will exclude applet, embed, iframe, object and script; see section 3.6.
+
+  When $config["elements"], which specifies allowed elements, is properly defined, and neither empty nor set to 0 or *, the default set is not used. To have elements added to or removed from the default set, a +/- notation is used. E.g., *-script-object implies that only script and object are disallowed, whereas *+embed means that noembed is also allowed. Elements can also be specified as comma separated names. E.g., a, b, i means only a, b and i are permitted. In this notation, *, + and - have no significance and can actually cause a mis-reading.
+
+  Some more examples of $config["elements"] values indicating permitted elements (note that empty spaces are liberally allowed for clarity):
+
+  *  a, blockquote, code, em, strong -- only a, blockquote, code, em, and strong
+  *  *-script -- all excluding script
+  *  * -center -dir -font -isindex -menu -s -strike -u -- only XHTML-Strict elements
+  *  *+noembed-script -- all including noembed excluding script
+
+  Some mis-usages (and the resulting permitted elements) that can be avoided:
+
+  *  -* -- none; instead of htmLawed, one might just use, e.g., the htmlspecialchars() PHP function
+  *  *, -script -- all except script; admin probably meant *-script
+  *  -*, a, em, strong -- all; admin probably meant a, em, strong
+  *  * -- all; admin need not have set elements
+  *  *-form+form -- all; a + will always over-ride any -
+  *  *, noembed -- only noembed; admin probably meant *+noembed
+  *  a, +b, i -- only a and i; admin probably meant a, b, i
+
+  Basically, when using the +/- notation, commas (,) should not be used, and vice versa, and * should be used with the former but not the latter.
+
Note: Even if an element that is not in the default set is allowed through $config["elements"], like noembed in the last example, it will eventually be removed during tag balancing unless such balancing is turned off ($config["balance"] set to 0). Currently, the only way around this, which actually is simple, is to edit the various arrays in the function hl_bal() to accommodate the element and its nesting properties.
+
A possibly second way to specify allowed elements is to set $config["parent"] to an element name that supposedly will hold the input, and to set $config["balance"] to 1. During tag balancing (see section 3.3.3), all elements that cannot legally nest inside the parent element will be removed. The parent element is auto-reset to div if $config["parent"] is empty, body, or an element not in htmLawed's default set of 86 elements.
+
Tag transformation is possible for improving XHTML-Strict compliance -- most of the deprecated elements are removed or converted to valid XHTML-Strict ones; see section 3.3.2.
+ +

+3.3.1  Handling of comments and CDATA sections +

(to top)
+
CDATA sections have the format <![CDATA[...anything but not "]]>"...]]>, and HTML comments, <!--...anything but not "-->"... -->. Neither HTML comments nor CDATA sections can reside inside tags. HTML comments can exist anywhere else, but CDATA sections can exist only where plain text is allowed (e.g., immediately inside td element content but not immediately inside tr element content).
+
+  htmLawed (function hl_cmtcd()) handles HTML comments or CDATA sections depending on the values of $config["comment"] or $config["cdata"]. If 0, such markup is not looked for and the text is processed like plain text. If 1, it is removed completely. If 2, it is preserved but any <, > and & inside are changed to entities. If 3, they are left as such.
+
+  Note that for the last two cases, HTML comments and CDATA sections will always be removed from tag content (function hl_tag()).
+
+  Examples:
+
+  Input:
+ +    <!-- home link --><a href="home.htm"><![CDATA[x=&y]]>Home</a> +
+  Output ($config["comment"] = 0, $config["cdata"] = 2):
+ +    &lt;-- home link --&gt;<a href="home.htm"><![CDATA[x=&amp;y]]>Home</a> +
+  Output ($config["comment"] = 1, $config["cdata"] = 2):
+ +    <a href="home.htm"><![CDATA[x=&amp;y]]>Home</a> +
+  Output ($config["comment"] = 2, $config["cdata"] = 2):
+ +    <!-- home link --><a href="home.htm"><![CDATA[x=&amp;y]]>Home</a> +
+  Output ($config["comment"] = 2, $config["cdata"] = 1):
+ +    <!-- home link --><a href="home.htm">Home</a> +
+  Output ($config["comment"] = 3, $config["cdata"] = 3):
+ +    <!-- home link --><a href="home.htm"><![CDATA[x=&y]]>Home</a> +
+
+  For standard-compliance, comments are given the form <!--comment -->, and any -- in the content is made -.
+
+  When $config["safe"] = 1, CDATA sections and comments are considered plain text unless $config["comment"] or $config["cdata"] is explicitly specified; see section 3.6.
+ +
+

+3.3.2  Tag-transformation for better XHTML-Strict +

(to top)
+
+  If $config["make_tag_strict"] is set and not 0, following non-XHTML-Strict elements (and attributes), even if admin-permitted, are mutated as indicated (element content remains intact; function hl_tag2()):
+
+  *  applet - (based on $config["make_tag_strict"], unchanged (1) or removed (2))
+  *  center - div style="text-align: center;"
+  *  dir - ul
+  *  embed - (based on $config["make_tag_strict"], unchanged (1) or removed (2))
+  *  font (face, size, color) -    span style="font-family: ; font-size: ; color: ;" (size transformation reference)
+  *  isindex - (based on $config["make_tag_strict"], unchanged (1) or removed (2))
+  *  menu - ul
+  *  s - span style="text-decoration: line-through;"
+  *  strike - span style="text-decoration: line-through;"
+  *  u - span style="text-decoration: underline;"
+
+  For an element with a pre-existing style attribute value, the extra style properties are appended.
+
+  Example input:
+
+ +    <center> +
+ +     The PHP <s>software</s> script used for this <strike>web-page</strike> web-page is <font style="font-weight: bold " face=arial size='+3' color   =  "red  ">htmLawedTest.php</font>, from <u style= 'color:green'>PHP Labware</u>. +
+ +    </center> +
+
+  The output:
+
+ +    <div style="text-align: center;"> +
+ +     The PHP <span style="text-decoration: line-through;">software</span> script used for this <span style="text-decoration: line-through;">web-page</span> web-page is <span style="font-weight: bold; font-family: arial; color: red; font-size: 200%;">htmLawedTest.php</span>, from <span style="color:green; text-decoration: underline;">PHP Labware</span>. +
+ +    </div> +
+ +
+

+3.3.3  Tag balancing and proper nesting +

(to top)
+
+  If $config["balance"] is set to 1, htmLawed (function hl_bal()) checks and corrects the input to have properly balanced tags and legal element content (i.e., any element nesting should be valid, and plain text may be present only in the content of elements that allow them).
+
+  Depending on the value of $config["keep_bad"] (see section 2.2 and section 3.3), illegal content may be removed or neutralized to plain text by converting < and > to entities:
+
0 - remove; this option is available only to maintain Kses-compatibility and should not be used otherwise (see section 2.6)
1 - neutralize tags and keep element content
2 - remove tags but keep element content
3 and 4 - like 1 and 2, but keep element content only if text (pcdata) is valid in parent element as per specs
5 and 6 -  like 3 and 4, but line-breaks, tabs and spaces are left
+
+  Example input (disallowing the p element):
+
+ +    <*> Pseudo-tags <*> +
+ +    <xml>Non-HTML tag xml</xml> +
+ +    <p> +
+ +    Disallowed tag p +
+ +    </p> +
+ +    <ul>Bad<li>OK</li></ul> +
+
+  The output with $config["keep_bad"] = 1:
+
+ +    &lt;*&gt; Pseudo-tags &lt;*&gt; +
+ +    &lt;xml&gt;Non-HTML tag xml&lt;/xml&gt; +
+ +    &lt;p&gt; +
+ +    Disallowed tag p +
+ +    &lt;/p&gt; +
+ +    <ul>Bad<li>OK</li></ul> +
+
+  The output with $config["keep_bad"] = 3:
+
+ +    &lt;*&gt; Pseudo-tags &lt;*&gt; +
+ +    &lt;xml&gt;Non-HTML tag xml&lt;/xml&gt; +
+ +    &lt;p&gt; +
+ +    Disallowed tag p +
+ +    &lt;/p&gt; +
+ +    <ul><li>OK</li></ul> +
+
+  The output with $config["keep_bad"] = 6:
+
+ +    &lt;*&gt; Pseudo-tags &lt;*&gt; +
+ +    Non-HTML tag xml +
+
+ +    Disallowed tag p +
+
+ +    <ul><li>OK</li></ul> +
+
+  An option like 1 is useful, e.g., when a writer previews his submission, whereas one like 3 is useful before content is finalized and made available to all.
+
Note: In the example above, unlike <*>, <xml> gets considered as a tag (even though there is no HTML element named xml). In general, text matching the regular expression pattern <(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?> is considered a tag (phrase enclosed by the angled brackets < and >, and starting [with an optional slash preceding] with an alphanumeric word that starts with an alphabet...).
+
+  Nesting/content rules for each of the 86 elements in htmLawed's default set (see section 3.3) are defined in function hl_bal(). This means that if a non-standard element besides embed is being permitted through $config["elements"], the element's tag content will end up getting removed if $config["balance"] is set to 1.
+
+  Plain text and/or certain elements nested inside blockquote, form, map and noscript need to be in block-level elements. This point is often missed during manual writing of HTML code. htmLawed attempts to address this during balancing. E.g., if the parent container is set as form, the input B:<input type="text" value="b" />C:<input type="text" value="c" /> is converted to <div>B:<input type="text" value="b" />C:<input type="text" value="c" /></div>.
+ +
+

+3.3.4  Elements requiring child elements +

(to top)
+
+  As per specs, the following elements require legal child elements nested inside them:
+
+ +    blockquote, dir, dl, form, map, menu, noscript, ol, optgroup, rbc, rtc, ruby, select, table, tbody, tfoot, thead, tr, ul +
+
+  In some cases, the specs stipulate the number and/or the ordering of the child elements. A table can have 0 or 1 caption, tbody, tfoot, and thead, but they must be in this order: caption, thead, tfoot, tbody.
+
+  htmLawed currently does not check for conformance to these rules. Note that any non-compliance in this regard will not introduce security vulnerabilities, crash browser applications, or affect the rendering of web-pages.
+ +
+

+3.3.5  Beautify or compact HTML +

(to top)
+
+  By default, htmLawed will neither beautify HTML code by formatting it with indentations, etc., nor will it make it compact by removing un-needed white-space.(It does always properly white-space tag content.)
+
+  As per the HTML standards, spaces, tabs and line-breaks in web-pages (except those inside pre elements) are all considered equivalent, and referred to as white-spaces. Browser applications are supposed to consider contiguous white-spaces as just a single space, and to disregard white-spaces trailing opening tags or preceding closing tags. This white-space normalization allows the use of text/code beautifully formatted with indentations and line-spacings for readability. Such pretty HTML can, however, increase the size of web-pages, or make the extraction or scraping of plain text cumbersome.
+
+  With the $config parameter tidy, htmLawed can be used to beautify or compact the input text. Input with just plain text and no HTML markup is also subject to this. Besides pre, the script and textarea elements, CDATA sections, and HTML comments are not subjected to the tidying process.
+
+  To compact, use $config["tidy"] = -1; single instances or runs of white-spaces are replaced with a single space, and white-spaces trailing and leading open and closing tags, respectively, are removed.
+
+  To beautify, $config["tidy"] is set as 1, or for customized tidying, as a string like 2s2n. The s or t character specifies the use of spaces or tabs for indentation. The first and third characters, any of the digits 0-9, specify the number of spaces or tabs per indentation, and any parental lead spacing (extra indenting of the whole block of input text). The r and n characters are used to specify line-break characters: n for \n (Unix/Mac OS X line-breaks), rn or nr for \r\n (Windows/DOS line-breaks), or r for \r.
+
+  The $config["tidy"] value of 1 is equivalent to 2s0n. Other $config["tidy"] values are read loosely: a value of 4 is equivalent to 4s0n; t2, to 1t2n; s, to 2s0n; 2TR, to 2t0r; T1, to 1t1n; nr3, to 3s0nr, and so on. Except in the indentations and line-spacings, runs of white-spaces are replaced with a single space during beautification.
+
+  Input formatting using $config["tidy"] is not recommended when input text has mixed markup (like HTML + PHP).
+ +
+
+

+3.4  Attributes +

(to top)
+
+  htmLawed will only permit attributes described in the HTML specs (including deprecated ones). It also permits some attributes for use with the embed element (the non-standard embed element is supported in htmLawed because of its widespread use), and the the xml:space attribute (valid only in XHTML 1.1). A list of such 111 attributes and the elements they are allowed in is in section 5.2.
+
+  When $config["deny_attribute"] is not set, or set to 0, or empty (""), all the 111 attributes are permitted. Otherwise, $config["deny_attribute"] can be set as a list of comma-separated names of the denied attributes. on* can be used to refer to the group of potentially dangerous, script-accepting attributes: onblur, onchange, onclick, ondblclick, onfocus, onkeydown, onkeypress, onkeyup, onmousedown, onmousemove, onmouseout, onmouseover, onmouseup, onreset, onselect and onsubmit.
+
+  Note that attributes specified in $config["deny_attribute"] are denied globally, for all elements. To deny attributes for only specific elements, $spec (see section 2.3) can be used. $spec can also be used to element-specifically permit an attribute otherwise denied through $config["deny_attribute"].
+
+  With $config["safe"] = 1 (section 3.6), the on* attributes are automatically disallowed.
+
Note: To deny all but a few attributes globally, a simpler way to specify $config["deny_attribute"] would be to use the notation * -attribute1 -attribute2 .... Thus, a value of * -title -href implies that except href and title (where allowed as per standards) all other attributes are to be removed. With this notation, the value for the parameter safe (section 3.6) will have no effect on deny_attribute.
+
+  htmLawed (function hl_tag()) also:
+
+  *  Lower-cases attribute names
+  *  Removes duplicate attributes (last one stays)
+  *  Gives attributes the form name="value" and single-spaces them, removing unnecessary white-spacing
+  *  Provides required attributes (see section 3.4.1)
+  *  Double-quotes values and escapes any " inside them
+  *  Replaces the possibly dangerous soft-hyphen characters (hexadecimal code-point ad) in the values with spaces
+  *  Allows custom function to additionally filter/modify attribute values (see section 3.4.9)
+ +

+3.4.1  Auto-addition of XHTML-required attributes +

(to top)
+
+  If indicated attributes for the following elements are found missing, htmLawed (function hl_tag()) will add them (with values same as attribute names unless indicated otherwise below):
+
+  *  area - alt (area)
+  *  area, img - src, alt (image)
+  *  bdo - dir (ltr)
+  *  form - action
+  *  map - name
+  *  optgroup - label
+  *  param - name
+  *  script - type (text/javascript)
+  *  textarea - rows (10), cols (50)
+
+  Additionally, with $config["xml:lang"] set to 1 or 2, if the lang but not the xml:lang attribute is declared, then the latter is added too, with a value copied from that of lang. This is for better standard-compliance. With $config["xml:lang"] set to 2, the lang attribute is removed (XHTML 1.1 specs).
+
+  Note that the name attribute for map, invalid in XHTML 1.1, is also transformed if required -- see section 3.4.6.
+ +
+

+3.4.2  Duplicate/invalid id values +

(to top)
+
+  If $config["unique_ids"] is 1, htmLawed (function hl_tag()) removes id attributes with values that are not XHTML-compliant (must begin with a letter and can contain letters, digits, :, ., - and _) or duplicate. If $config["unique_ids"] is a word, any duplicate but otherwise valid value will be appropriately prefixed with the word to ensure its uniqueness. The word should begin with a letter and should contain only letters, numbers, :, ., _ and -.
+
+  Even if multiple inputs need to be filtered (through multiple calls to htmLawed), htmLawed ensures uniqueness of id values as it uses a global variable ($GLOBALS["hl_Ids"] array). Further, an admin can restrict the use of certain id values by presetting this variable before htmLawed is called into use. E.g.:
+
+ +    $GLOBALS['hl_Ids'] = array('top'=>1, 'bottom'=>1, 'myform'=>1); // id values not allowed in input +
+ +    $processed = htmLawed($text); // filter input +
+ +
+

+3.4.3  URL schemes (protocols) and scripts in attribute values +

(to top)
+
+  htmLawed edits attributes that take URLs as values if they are found to contain un-permitted schemes. E.g., if the afp scheme is not permitted, then <a href="afp://domain.org"> becomes <a href="denied:afp://domain.org">, and if Javascript is not permitted <a onclick="javascript:xss();"> becomes <a onclick="denied:javascript:xss();">.
+
+  By default htmLawed permits these schemes in URLs for the href attribute:
+
+ +    aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet +
+
+  Also, only file, http and https are permitted in attributes whose names start with o (like onmouseover), and in these attributes that accept URLs:
+
+ +    action, cite, classid, codebase, data, href, longdesc, model, pluginspage, pluginurl, src, style, usemap +
+
+  These default sets are used when $config["schemes"] is not set (see section 2.2). To over-ride the defaults, $config["schemes"] is defined as a string of semi-colon-separated sub-strings of type attribute: comma-separated schemes. E.g., href: mailto, http, https; onclick: javascript; src: http, https. For unspecified attributes, file, http and https are permitted. This can be changed by passing schemes for * in $config["schemes"]. E.g., href: mailto, http, https; *: https, https.
+
* can be put in the list of schemes to permit all protocols. E.g., style: *; img: http, https results in protocols not being checked in style attribute values. However, in such cases, any relative-to-absolute URL conversion, or vice versa, (section 3.4.4) is not done.
+
+  Thus, to allow Javascript, one can set $config["schemes"] as href: mailto, http, https; *: http, https, javascript, or href: mailto, http, https, javascript; *: http, https, javascript, or *: *, and so on.
+
+  As a side-note, one may find style: * useful as URLs in style attributes can be specified in a variety of ways, and the patterns that htmLawed uses to identify URLs may mistakenly identify non-URL text.
+
Note: If URL-accepting attributes other than those listed above are being allowed, then the scheme will not be checked unless the attribute name contains the string src (e.g., dynsrc) or starts with o (e.g., onbeforecopy).
+
+  With $config["safe"] = 1, all URLs are disallowed in the style attribute values.
+ +
+

+3.4.4  Absolute & relative URLs in attribute values +

(to top)
+
+  htmLawed can make absolute URLs in attributes like href relative ($config["abs_url"] is -1), and vice versa ($config["abs_url"] is 1). URLs in scripts are not considered for this, and so are URLs like #section_6 (fragment), ?name=Tim#show (starting with query string), and ;var=1?name=Tim#show (starting with parameters). Further, this requires that $config["base_url"] be set properly, with the :// and a trailing slash (/), with no query string, etc. E.g., file:///D:/page/, https://abc.com/x/y/, or http://localhost/demo/ are okay, but file:///D:/page/?help=1, abc.com/x/y/ and http://localhost/demo/index.htm are not.
+
+  For making absolute URLs relative, only those URLs that have the $config["base_url"] string at the beginning are converted. E.g., with $config["base_url"] = "https://abc.com/x/y/", https://abc.com/x/y/a.gif and https://abc.com/x/y/z/b.gif become a.gif and z/b.gif respectively, while https://abc.com/x/c.gif is not changed.
+
+  When making relative URLs absolute, only values for scheme, network location (host-name) and path values in the base URL are inherited. See section 5.5 for more about the URL specification as per RFC 1808.
+ +
+

+3.4.5  Lower-cased, standard attribute values +

(to top)
+
+  Optionally, for standard-compliance, htmLawed (function hl_tag()) lower-cases standard attribute values to give, e.g., input type="password" instead of input type="Password", if $config["lc_std_val"] is 1. Attribute values matching those listed below for any of the elements (plus those for the type attribute of button or input) are lower-cased:
+
+ +    all, baseline, bottom, button, center, char, checkbox, circle, col, colgroup, cols, data, default, file, get, groups, hidden, image, justify, left, ltr, middle, none, object, password, poly, post, preserve, radio, rect, ref, reset, right, row, rowgroup, rows, rtl, submit, text, top +
+
+ +    a, area, bdo, button, col, form, img, input, object, option, optgroup, param, script, select, table, td, tfoot, th, thead, tr, xml:space +
+
+  The following empty (minimized) attributes are always assigned lower-cased values (same as the names):
+
+ +    checked, compact, declare, defer, disabled, ismap, multiple, nohref, noresize, noshade, nowrap, readonly, selected +
+ +
+

+3.4.6  Transformation of deprecated attributes +

(to top)
+
+  If $config["no_deprecated_attr"] is 0, then deprecated attributes (see appendix in section 5.2) are removed and, in most cases, their values are transformed to CSS style properties and added to the style attributes (function hl_tag()). Except for bordercolor for table, tr and td, the scores of proprietary attributes that were never part of any cross-browser standard are not supported.
+
Note: The attribute target for a is allowed even though it is not in XHTML 1.0 specs. This is because of the attribute's wide-spread use and browser-support, and because the attribute is valid in XHTML 1.1 onwards.
+
+  *  align - for img with value of left or right, becomes, e.g., float: left; for div and table with value center, becomes margin: auto; all others become, e.g., text-align: right
+
+  *  bgcolor - E.g., bgcolor="#ffffff" becomes background-color: #ffffff
+  *  border - E.g., height= "10" becomes height: 10px
+  *  bordercolor - E.g., bordercolor=#999999 becomes border-color: #999999;
+  *  compact - font-size: 85%
+  *  clear - E.g., 'clear="all" becomes clear: both
+
+  *  height - E.g., height= "10" becomes height: 10px and height="*" becomes height: auto
+
+  *  hspace - E.g., hspace="10" becomes margin-left: 10px; margin-right: 10px
+  *  language - language="VBScript" becomes type="text/vbscript"
+  *  name - E.g., name="xx" becomes id="xx"
+  *  noshade - border-style: none; border: 0; background-color: gray; color: gray
+  *  nowrap - white-space: nowrap
+  *  size - E.g., size="10" becomes height: 10px
+  *  start - removed
+  *  type - E.g., type="i" becomes list-style-type: lower-roman
+  *  value - removed
+  *  vspace - E.g., vspace="10" becomes margin-top: 10px; margin-bottom: 10px
+  *  width - like height
+
+  Example input:
+
+ +    <img src="j.gif" alt="image" name="dad's" /><img src="k.gif" alt="image" id="dad_off" name="dad" /> +
+ +    <br clear="left" /> +
+ +    <hr noshade size="1" /> +
+ +    <img name="img" src="i.gif" align="left" alt="image" hspace="10" vspace="10" width="10em" height="20" border="1" style="padding:5px;" /> +
+ +    <table width="50em" align="center" bgcolor="red"> +
+ +     <tr> +
+ +      <td width="20%"> +
+ +       <div align="center"> +
+ +        <h3 align="right">Section</h3> +
+ +        <p align="right">Para</p> +
+ +        <ol type="a" start="e"><li value="x">First item</li></ol> +
+ +       </div> +
+ +      </td> +
+ +      <td width="*"> +
+ +       <ol type="1"><li>First item</li></ol> +
+ +      </td> +
+ +     </tr> +
+ +    </table> +
+ +    <br clear="all" /> +
+
+  And the output with $config["no_deprecated_attr"] = 1:
+
+ +    <img src="j.gif" alt="image" /><img src="k.gif" alt="image" id="dad_off" /> +
+ +    <br style="clear: left;" /> +
+ +    <hr style="border-style: none; border: 0; background-color: gray; color: gray; size: 1px;" /> +
+ +    <img src="i.gif" alt="image" width="10em" height="20" style="padding:5px; float: left; margin-left: 10px; margin-right: 10px; margin-top: 10px; margin-bottom: 10px; border: 1px;" id="img" /> +
+ +    <table width="50em" style="margin: auto; background-color: red;"> +
+ +     <tr> +
+ +      <td style="width: 20%;"> +
+ +       <div style="margin: auto;"> +
+ +        <h3 style="text-align: right;">Section</h3> +
+ +        <p style="text-align: right;">Para</p> +
+ +        <ol style="list-style-type: lower-latin;"><li>First item</li></ol> +
+ +       </div> +
+ +      </td> +
+ +      <td style="width: auto;"> +
+ +       <ol style="list-style-type: decimal;"><li>First item</li></ol> +
+ +      </td> +
+ +     </tr> +
+ +    </table> +
+ +    <br style="clear: both;" /> +
+
+  For lang, deprecated in XHTML 1.1, transformation is taken care of through $config["xml:lang"]; see section 3.4.1.
+
+  The attribute name is deprecated in form, iframe, and img, and is replaced with id if an id attribute doesn't exist and if the name value is appropriate for id. For such replacements for a and map, for which the name attribute is deprecated in XHTML 1.1, $config["no_deprecated_attr"] should be set to 2 (when set to 1, for these two elements, the name attribute is retained).
+ +
+

+3.4.7  Anti-spam & href +

(to top)
+
+  htmLawed (function hl_tag()) can check the href attribute values (link addresses) as an anti-spam (email or link spam) measure.
+
+  If $config["anti_mail_spam"] is not 0, the @ of email addresses in href values like mailto:a@b.com is replaced with text specified by $config["anti_mail_spam"]. The text should be of a form that makes it clear to others that the address needs to be edited before a mail is sent; e.g., <remove_this_antispam>@ (makes the example address a<remove_this_antispam>@b.com).
+
+  For regular links, one can choose to have a rel attribute with nofollow in its value (which tells some search engines to not follow a link). This can discourage link spammers. Additionally, or as an alternative, one can choose to empty the href value altogether (disable the link).
+
+  For use of these options, $config["anti_link_spam"] should be set as an array with values regex1 and regex2, both or one of which can be empty (like array("", "regex2")) to indicate that that option is not to be used. Otherwise, regex1 or regex2 should be PHP- and PCRE-compatible regular expression patterns: href values will be matched against them and those matching the pattern will accordingly be treated.
+
+  Note that the regular expressions should have delimiters, and be well-formed and preferably fast. Absolute efficiency/accuracy is often not needed.
+
+  An example, to have a rel attribute with nofollow for all links, and to disable links that do not point to domains abc.com and xyz.org:
+
+ +    $config["anti_link_spam"] = array('`.`', '`://\W*(?!(abc\.com|xyz\.org))`'); +
+ +
+

+3.4.8  Inline style properties +

(to top)
+
+  htmLawed can check URL schemes and dynamic expressions (to guard against Javascript, etc., script-based insecurities) in inline CSS style property values in the style attributes. (CSS properties like background-image that accept URLs in their values are noted in section 5.3.) Dynamic CSS expressions that allow scripting in the IE browser, and can be a vulnerability, can be removed from property values by setting $config["css_expression"] to 1 (default setting).
+
Note: Because of the various ways of representing characters in attribute values (URL-escapement, entitification, etc.), htmLawed might alter the values of the style attribute values, and may even falsely identify dynamic CSS expressions and URL schemes in them. If this is an important issue, checking of URLs and dynamic expressions can be turned off ($config["schemes"] = "...style:*...", see section 3.4.3, and $config["css_expression"] = 0). Alternately, admins can use their own custom function for finer handling of style values through the hook_tag parameter (see section 3.4.9).
+
+  It is also possible to have htmLawed let through any style value by setting $config["style_pass"] to 1.
+
+  As such, it is better to set up a CSS file with class declarations, disallow the style attribute, set a $spec rule (see section 2.3) for class for the oneof or match parameter, and ask writers to make use of the class attribute.
+ +
+

+3.4.9  Hook function for tag content +

(to top)
+
+  It is possible to utilize a custom hook function to alter the tag content htmLawed has finalized (i.e., after it has checked/corrected for required attributes, transformed attributes, lower-cased attribute names, etc.).
+
+  When $config parameter hook_tag is set to the name of a function, htmLawed (function hl_tag()) will pass on the element name, and the finalized attribute name-value pairs as array elements to the function. The function is expected to return the full opening tag string like <element_name attribute_1_name="attribute_1_value"...> (for empty elements like img and input, the element-closing slash / should also be included).
+
+  This is a powerful functionality that can be exploited for various objectives: consolidate-and-convert inline style attributes to class, convert embed elements to object, permit only one caption element in a table element, disallow embedding of certain types of media, inject HTML, use CSSTidy to sanitize style attribute values, etc.
+
+  As an example, the custom hook code below can be used to force a series of specifically ordered id attributes on all elements, and a specific param element inside all object elements:
+
+ +    function my_tag_function($element, $attribute_array){ +
+ +      static $id = 0; +
+ +      // Remove any duplicate element +
+ +      if($element == 'param' && isset($attribute_array['allowscriptaccess'])){ +
+ +        return ''; +
+ +      } +
+
+ +      $new_element = ''; +
+
+ +      // Force a serialized ID number +
+ +      $attribute_array['id'] = 'my_'. $id; +
+ +      ++$id; +
+
+ +      // Inject param for allowscriptaccess +
+ +      if($element == 'object'){ +
+ +        $new_element = '<param id='my_'. $id; allowscriptaccess="never" />'; +
+ +        ++$id; +
+ +      } +
+
+ +      $string = ''; +
+ +      foreach($attribute_array as $k=>$v){ +
+ +        $string .= " {$k}=\"{$v}\""; +
+ +      } +
+ +      return "<{$element}{$string}". (isset($in_array($element, $empty_elements) ? ' /' : ''). '>'. $new_element; +
+ +    } +
+
+  The hook_tag parameter is different from the hook parameter (section 3.7).
+
+  Snippets of hook function code developed by others may be available on the htmLawed website.
+ +
+
+

+3.5  Simple configuration directive for most valid XHTML +

(to top)
+
+  If $config["valid_xhtml"] is set to 1, some relevant $config parameters (indicated by ~ in section 2.2) are auto-adjusted. This allows one to pass the $config argument with a simpler value. If a value for a parameter auto-set through valid_xhtml is still manually provided, then that value will over-ride the auto-set value.
+ +
+

+3.6  Simple configuration directive for most safe HTML +

(to top)
+
Safe HTML refers to HTML that is restricted to reduce the vulnerability for scripting attacks (such as XSS) based on HTML code which otherwise may still be legal and compliant with the HTML standard specs. When elements such as script and object, and attributes such as onmouseover and style are allowed in the input text, an input writer can introduce malevolent HTML code. Note that what is considered safe depends on the nature of the web application and the trust-level accorded to its users.
+
+  htmLawed allows an admin to use $config["safe"] to auto-adjust multiple $config parameters (such as elements which declares the allowed element-set), which otherwise would have to be manually set. The relevant parameters are indicated by " in section 2.2). Thus, one can pass the $config argument with a simpler value.
+
+  With the value of 1, htmLawed considers CDATA sections and HTML comments as plain text, and prohibits the applet, embed, iframe, object and script elements, and the on* attributes like onclick. ( There are $config parameters like css_expression that are not affected by the value set for safe but whose default values still contribute towards a more safe output.) Further, URLs with schemes (see section 3.4.3) are neutralized so that, e.g., style="moz-binding:url(http://danger)" becomes style="moz-binding:url(denied:http://danger)" while style="moz-binding:url(ok)" remains intact.
+
+  Admins, however, may still want to completely deny the style attribute, e.g., with code like
+
+ +    $processed = htmLawed($text, array('safe'=>1, 'deny_attribute'=>'style')); +
+
+  If a value for a parameter auto-set through safe is still manually provided, then that value can over-ride the auto-set value. E.g., with $config["safe"] = 1 and $config["elements"] = "*+script", script, but not applet, is allowed.
+
+  A page illustrating the efficacy of htmLawed's anti-XSS abilities with safe set to 1 against XSS vectors listed by RSnake may be available here.
+ +
+

+3.7  Using a hook function +

(to top)
+
+  If $config["hook"] is not set to 0, then htmLawed will allow preliminarily processed input to be altered by a hook function named by $config["hook"] before starting the main work (but after handling of characters, entities, HTML comments and CDATA sections -- see code for function htmLawed()).
+
+  The hook function also allows one to alter the finalized values of $config and $spec.
+
+  Note that the hook parameter is different from the hook_tag parameter (section 3.4.9).
+
+  Snippets of hook function code developed by others may be available on the htmLawed website.
+ +
+

+3.8  Obtaining finalized parameter values +

(to top)
+
+  htmLawed can assign the finalized $config and $spec values to a variable named by $config["show_setting"]. The variable, made global by htmLawed, is set as an array with three keys: config, with the $config value, spec, with the $spec value, and time, with a value that is the Unix time (the output of PHP's microtime() function) when the value was assigned. Admins should use a PHP-compliant variable name (e.g., one that does not begin with a numerical digit) that does not conflict with variable names in their non-htmLawed code.
+
+  The values, which are also post-hook function (if any), can be used to auto-generate information (on, e.g., the elements that are permitted) for input writers.
+ +
+

+3.9  Retaining non-HTML tags in input with mixed markup +

(to top)
+
+  htmLawed does not remove certain characters that though invalid are nevertheless discouraged in HTML documents as per the specs (see section 5.1). This can be utilized to deal with input that contains mixed markup. Input that may have HTML markup as well as some other markup that is based on the <, > and & characters is considered to have mixed markup. The non-HTML markup can be rather proprietary (like markup for emoticons/smileys), or standard (like MathML or SVG). Or it can be programming code meant for execution/evaluation (such as embedded PHP code).
+
+  To deal with such mixed markup, the input text can be pre-processed to hide the non-HTML markup by specifically replacing the <, > and & characters with some of the HTML-discouraged characters (see section 3.1.2). Post-htmLawed processing, the replacements are reverted.
+
+  An example (mixed HTML and PHP code in input text):
+
+ +    $text = preg_replace('`<\?php(.+?)\?>`sm', "\x83?php\\1?\x84", $text); +
+ +    $processed = htmLawed($text); +
+ +    $processed = preg_replace('`\x83\?php(.+?)\?\x84`sm', '<?php$1?>', $processed); +
+
+  This code will not work if $config["clean_ms_char"] is set to 1 (section 3.1), in which case one should instead deploy a hook function (section 3.7). (htmLawed internally uses certain control characters, code-points 1 to 7, and use of these characters as markers in the logic of hook functions may cause issues.)
+
+  Admins may also be able to use $config["and_mark"] to deal with such mixed markup; see section 3.2.
+ +
+
+

+4  Other +

(to top)
+

+4.1  Support +

(to top)
+
+  A careful re-reading of this documentation will very likely answer your questions.
+
+  Software updates and forum-based community-support may be found at http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. For general PHP issues (not htmLawed-specific), support may be found through internet searches and at http://php.net.
+ +
+

+4.2  Known issues +

(to top)
+
+  See section 2.8.
+
+  Readers are advised to cross-check information given in this document.
+ +
+

+4.3  Change-log +

(to top)
+
+  (The release date for the downloadable package of files containing documentation, demo script, test-cases, etc., besides the htmLawed.php file may be updated independently if the secondary files are revised.)
+
Version number - Release date. Notes
+
+  1.1.8.1 - 16 July 2009. Minor code-change to fix a PHP error notice
+
+  1.1.8 - 23 April 2009. Parameter deny_attribute now accepts the wild-card *, making it simpler to specify its value when all but a few attributes are being denied; fixed a bug in interpreting $spec
+
+  1.1.7 - 11-12 March 2009. Attributes globally denied through deny_attribute can be allowed element-specifically through $spec; $config["style_pass"] allowing letting through any style value introduced; altered logic to catch certain types of dynamic crafted CSS expressions
+
+  1.1.3-6 - 28-31 January - 4 February 2009. Altered logic to catch certain types of dynamic crafted CSS expressions
+
+  1.1.2 - 22 January 2009. Fixed bug in parsing of font attributes during tag transformation
+
+  1.1.1 - 27 September 2008. Better nesting correction when omitable closing tags are absent
+
+  1.1 - 29 June 2008. $config["hook_tag"] and $config["format"] introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug
+
+  1.0.9 - 11 June 2008. Fixed bug in invalid HTML code-point entity check
+
+  1.0.8 - 15 May 2008. bordercolor attribute for table, td and tr
+
+  1.0.7 - 1 May 2008. Support for wmode attribute for embed; $config["show_setting"] introduced; improved $config["elements"] evaluation
+
+  1.0.6 - 20 April 2008. $config["and_mark"] introduced
+
+  1.0.5 - 12 March 2008. style URL schemes essentially disallowed when $config safe is on; improved regex for CSS expression search
+
+  1.0.4 - 10 March 2008. Improved corrections for blockquote, form, map and noscript
+
+  1.0.3 - 3 March 2008. Character entities for soft-hyphens are now replaced with spaces (instead of being removed); a bug allowing td directly inside table fixed; safe $config parameter added
+
+  1.0.2 - 13 February 2008. Improved implementation of $config["keep_bad"]
+
+  1.0.1 - 7 November 2007. Improved regex for identifying URLs, protocols and dynamic expressions (hl_tag() and hl_prot()); no error display with hl_regex()
+
+  1.0 - 2 November 2007. First release
+ +
+

+4.4  Testing +

(to top)
+
+  To test htmLawed using a form interface, a demo web-page is provided with the htmLawed distribution (htmLawed.php and htmLawedTest.php should be in the same directory on the web-server). A file with test-cases is also provided.
+ +
+

+4.5  Upgrade, & old versions +

(to top)
+
+  Upgrading is as simple as replacing the previous version of htmLawed.php (assuming it was not modified for customized features). As htmLawed output is almost always used in static documents, upgrading should not affect old, finalized content.
+
+  Old versions of htmLawed may be available online. E.g., for version 1.0, check http://www.bioinformatics.org/phplabware/downloads/htmLawed1.zip, for 1.1.1, htmLawed111.zip, and for 1.1.10, htmLawed1110.zip.
+ +
+

+4.6  Comparison with HTMLPurifier +

(to top)
+
+  The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it:
+
+  *  does not support PHP versions older than 5.0 (HTMLPurifier dropped PHP 4 support after version 2)
+
+  *  is 15-20 times bigger (scores of files totalling more than 750 kb)
+
+  *  consumes 10-15 times more RAM memory (just including the HTMLPurifier files without calling the filter requires a few MBs of memory)
+
+  *  is expectedly slower
+
+  *  does not allow admins to fully allow all valid HTML (because of incomplete HTML support, it always considers elements like script illegal)
+
+  *  lacks many of the extra features of htmLawed (like entity conversions and code compaction/beautification)
+
+  *  has poor documentation
+
+  However, HTMLPurifier has finer checks for character encodings and attribute values, and can log warnings and errors. Visit the HTMLPurifier website for updated information.
+ +
+

+4.7  Use through application plug-ins/modules +

(to top)
+
+  Plug-ins/modules to implement htmLawed in applications such as Drupal and DokuWiki may have been developed. Please check the application websites and the forum on the htmLawed site.
+ +
+

+4.8  Use in non-PHP applications +

(to top)
+
+  Non-PHP applications written in Python, Ruby, etc., may be able to use htmLawed through system calls to the PHP engine. Such code may have been documented on the internet. Also check the forum on the htmLawed site.
+ +
+

+4.9  Donate +

(to top)
+
+  A donation in any currency and amount to appreciate or support this software can be sent by PayPal to this email address: drpatnaik at yahoo dot com.
+ +
+

+4.10  Acknowledgements +

(to top)
+
+  Bryan Blakey, Ulf Harnhammer, Gareth Heyes, Lukasz Pilorz, Shelley Powers, Edward Yang, and many anonymous users.
+
+  Thank you!
+ +
+
+

+5  Appendices +

(to top)
+

+5.1  Characters discouraged in XHTML +

(to top)
+
+  Characters represented by the following hexadecimal code-points are not invalid, even though some validators may issue messages stating otherwise.
+
7f to 84, 86 to 9f, fdd0 to fddf, 1fffe, 1ffff, 2fffe, 2ffff, 3fffe, 3ffff, 4fffe, 4ffff, 5fffe, 5ffff, 6fffe, 6ffff, 7fffe, 7ffff, 8fffe, 8ffff, 9fffe, 9ffff, afffe, affff, bfffe, bffff, cfffe, cffff, dfffe, dffff, efffe, effff, ffffe, fffff, 10fffe and 10ffff
+ +
+

+5.2  Valid attribute-element combinations +

(to top)
+
+  Valid attribute-element combinations as per W3C specs.
+
+  *  includes deprecated attributes (marked ^), attributes for the non-standard embed element (marked *), and the proprietary bordercolor (marked ~)
+  *  only non-frameset, HTML body elements
+  *  name for a and map, and lang are invalid in XHTML 1.1
+  *  target is valid for a in XHTML 1.1 and higher
+  *  xml:space is only for XHTML 1.1
+
+  abbr - td, th
+  accept - form, input
+  accept-charset - form
+  accesskey - a, area, button, input, label, legend, textarea
+  action - form
+  align - caption^, embed, applet, iframe, img^, input^, object^, legend^, table^, hr^, div^, h1^, h2^, h3^, h4^, h5^, h6^, p^, col, colgroup, tbody, td, tfoot, th, thead, tr
+  alt - applet, area, img, input
+  archive - applet, object
+  axis - td, th
+  bgcolor - embed, table^, tr^, td^, th^
+  border - table, img^, object^
+  bordercolor~ - table, td, tr
+  cellpadding - table
+  cellspacing - table
+  char - col, colgroup, tbody, td, tfoot, th, thead, tr
+  charoff - col, colgroup, tbody, td, tfoot, th, thead, tr
+  charset - a, script
+  checked - input
+  cite - blockquote, q, del, ins
+  classid - object
+  clear - br^
+  code - applet
+  codebase - object, applet
+  codetype - object
+  color - font
+  cols - textarea
+  colspan - td, th
+  compact - dir, dl^, menu, ol^, ul^
+  coords - area, a
+  data - object
+  datetime - del, ins
+  declare - object
+  defer - script
+  dir - bdo
+  disabled - button, input, optgroup, option, select, textarea
+  enctype - form
+  face - font
+  for - label
+  frame - table
+  frameborder - iframe
+  headers - td, th
+  height - embed, iframe, td^, th^, img, object, applet
+  href - a, area
+  hreflang - a
+  hspace - applet, img^, object^
+  ismap - img, input
+  label - option, optgroup
+  language - script^
+  longdesc - img, iframe
+  marginheight - iframe
+  marginwidth - iframe
+  maxlength - input
+  method - form
+  model* - embed
+  multiple - select
+  name - button, embed, textarea, applet^, select, form^, iframe^, img^, a^, input, object, map^, param
+  nohref - area
+  noshade - hr^
+  nowrap - td^, th^
+  object - applet
+  onblur - a, area, button, input, label, select, textarea
+  onchange - input, select, textarea
+  onfocus - a, area, button, input, label, select, textarea
+  onreset - form
+  onselect - input, textarea
+  onsubmit - form
+  pluginspage* - embed
+  pluginurl* - embed
+  prompt - isindex
+  readonly - textarea, input
+  rel - a
+  rev - a
+  rows - textarea
+  rowspan - td, th
+  rules - table
+  scope - td, th
+  scrolling - iframe
+  selected - option
+  shape - area, a
+  size - hr^, font, input, select
+  span - col, colgroup
+  src - embed, script, input, iframe, img
+  standby - object
+  start - ol^
+  summary - table
+  tabindex - a, area, button, input, object, select, textarea
+  target - a^, area, form
+  type - a, embed, object, param, script, input, li^, ol^, ul^, button
+  usemap - img, input, object
+  valign - col, colgroup, tbody, td, tfoot, th, thead, tr
+  value - input, option, param, button, li^
+  valuetype - param
+  vspace - applet, img^, object^
+  width - embed, hr^, iframe, img, object, table, td^, th^, applet, col, colgroup, pre^
+  wmode - embed
+  xml:space - pre, script, style
+
+  These are allowed in all but the shown elements:
+
+  class - param, script
+  dir - applet, bdo, br, iframe, param, script
+  id - script
+  lang - applet, br, iframe, param, script
+  onclick - applet, bdo, br, font, iframe, isindex, param, script
+  ondblclick - applet, bdo, br, font, iframe, isindex, param, script
+  onkeydown - applet, bdo, br, font, iframe, isindex, param, script
+  onkeypress - applet, bdo, br, font, iframe, isindex, param, script
+  onkeyup - applet, bdo, br, font, iframe, isindex, param, script
+  onmousedown - applet, bdo, br, font, iframe, isindex, param, script
+  onmousemove - applet, bdo, br, font, iframe, isindex, param, script
+  onmouseout - applet, bdo, br, font, iframe, isindex, param, script
+  onmouseover - applet, bdo, br, font, iframe, isindex, param, script
+  onmouseup - applet, bdo, br, font, iframe, isindex, param, script
+  style - param, script
+  title - param, script
+  xml:lang - applet, br, iframe, param, script
+ +
+

+5.3  CSS 2.1 properties accepting URLs +

(to top)
+
+  background
+  background-image
+  content
+  cue-after
+  cue-before
+  cursor
+  list-style
+  list-style-image
+  play-during
+ +
+

+5.4  Microsoft Windows 1252 character replacements +

(to top)
+
+  Key: d double, l left, q quote, r right, s. single
+
+  Code-point (decimal) - hexadecimal value - replacement entity - represented character
+
+  127 - 7f - (removed) - (not used)
+  128 - 80 - &#8364; - euro
+  129 - 81 - (removed) - (not used)
+  130 - 82 - &#8218; - baseline s. q
+  131 - 83 - &#402; - florin
+  132 - 84 - &#8222; - baseline d q
+  133 - 85 - &#8230; - ellipsis
+  134 - 86 - &#8224; - dagger
+  135 - 87 - &#8225; - d dagger
+  136 - 88 - &#710; - circumflex accent
+  137 - 89 - &#8240; - permile
+  138 - 8a - &#352; - S Hacek
+  139 - 8b - &#8249; - l s. guillemet
+  140 - 8c - &#338; - OE ligature
+  141 - 8d - (removed) - (not used)
+  142 - 8e - &#381; - Z dieresis
+  143 - 8f - (removed) - (not used)
+  144 - 90 - (removed) - (not used)
+  145 - 91 - &#8216; - l s. q
+  146 - 92 - &#8217; - r s. q
+  147 - 93 - &#8220; - l d q
+  148 - 94 - &#8221; - r d q
+  149 - 95 - &#8226; - bullet
+  150 - 96 - &#8211; - en dash
+  151 - 97 - &#8212; - em dash
+  152 - 98 - &#732; - tilde accent
+  153 - 99 - &#8482; - trademark
+  154 - 9a - &#353; - s Hacek
+  155 - 9b - &#8250; - r s. guillemet
+  156 - 9c - &#339; - oe ligature
+  157 - 9d - (removed) - (not used)
+  158 - 9e - &#382; - z dieresis
+  159 - 9f - &#376; - Y dieresis
+ +
+

+5.5  URL format +

(to top)
+
+  An absolute URL has a protocol or scheme, a network location or hostname, and, optional path, parameters, query and fragment segments. Thus, an absolute URL has this generic structure:
+
+ +    (scheme) : (//network location) /(path) ;(parameters) ?(query) #(fragment) +
+
+  The schemes can only contain letters, digits, +, . and -. Hostname is the portion after the // and up to the first / (if any; else, up to the end) when : is followed by a // (e.g., abc.com in ftp://abc.com/def); otherwise, it consists of everything after the : (e.g., def@abc.com in mailto:def@abc.com').
+
Relative URLs do not have explicit schemes and network locations; such values are inherited from a base URL.
+ +
+

+5.6  Brief on htmLawed code +

(to top)
+
+  Much of the code's logic and reasoning can be understood from the documentation above.
+
+  The output of htmLawed is a text string containing the processed input. There is no custom error tracking.
+
Function arguments for htmLawed are:
+
+  *  $in - 1st argument; a text string; the input text to be processed. Any extraneous slashes added by PHP when magic quotes are enabled should be removed beforehand using PHP's stripslashes() function.
+
+  *  $config - 2nd argument; an associative array; optional (named $C in htmLawed code). The array has keys with names like balance and keep_bad, and the values, which can be boolean, string, or array, depending on the key, are read to accordingly set the configurable parameters (indicated by the keys). All configurable parameters receive some default value if the value to be used is not specified by the user through $config. Finalized $config is thus a filtered and possibly larger array.
+
+  *  $spec - 3rd argument; a text string; optional. The string has rules, written in an htmLawed-designated format, specifying element-specific attribute and attribute value restrictions. Function hl_spec() is used to convert the string to an associative-array for internal use. Finalized $spec is thus an array.
+
Finalized $config and $spec are made global variables while htmLawed is at work. Values of any pre-existing global variables with same names are noted, and their values are restored after htmLawed finishes processing the input (to capture the finalized values, the show_settings parameter of $config should be used). Depending on $config, another global variable hl_Ids, to track id attribute values for uniqueness, may be set. Unlike the other two variables, this one is not reset (or unset) post-processing.
+
+  Except for the main function htmLawed() and the functions kses() and kses_hook(), htmLawed's functions are name-spaced using the hl_ prefix. The functions and their roles are:
+
+  *  hl_attrval - checking attribute values against $spec
+  *  hl_bal - tag balancing
+  *  hl_cmtcd - handling CDATA sections and HTML comments
+  *  hl_ent - entity handling
+  *  hl_prot - checking a URL scheme/protocol
+  *  hl_regex - checking syntax of a regular expression
+  *  hl_spec - converting user-supplied $spec value to one used by htmLawed internally
+  *  hl_tag - handling tags
+  *  hl_tag2 - transforming tags
+  *  hl_tidy - compact/beautify HTML
+  *  hl_version - reporting htmLawed version
+  *  htmLawed - main function
+  *  kses - main function of kses
+  *  kses_hook - hook function of kses
+
+  The last two are for compatibility with pre-existing code using the kses script. htmLawed's kses() basically passes on the filtering task to htmLawed() function after deciphering $config and $spec from the argument values supplied to it. kses_hook() is an empty function and is meant for being filled with custom code if the kses script users were using one.
+
htmLawed() finalizes $spec (with the help of hl_spec()) and $config, and globalizes them. Finalization of $config involves setting default values if an inappropriate or invalid one is supplied. This includes calling hl_regex() to check well-formedness of regular expression patterns if such expressions are user-supplied through $config. htmLawed() then removes invalid characters like nulls and x01 and appropriately handles entities using hl_ent(). HTML comments and CDATA sections are identified and treated as per $config with the help of hl_cmtcd(). When retained, the < and > characters identifying them, and the <, > and & characters inside them, are replaced with control characters (code-points 1 to 5) till any tag balancing is completed.
+
+  After this initial processing htmLawed() identifies tags using regex and processes them with the help of hl_tag() --  a large function that analyzes tag content, filtering it as per HTML standards, $config and $spec. Among other things, hl_tag() transforms deprecated elements using hl_tag2(), removes attributes from closing tags, checks attribute values as per $spec rules using hl_attrval(), and checks URL protocols using hl_prot(). htmLawed() performs tag balancing and nesting checks with a call to hl_bal(), and optionally compacts/beautifies the output with proper white-spacing with a call to hl_tidy(). The latter temporarily replaces white-space, and <, > and & characters inside pre, script and textarea elements, and HTML comments and CDATA sections with control characters (code-points 1 to 5, and 7).
+
+  htmLawed permits the use of custom code or hook functions at two stages. The first, called inside htmLawed(), allows the input text as well as the finalized $config and $spec values to be altered right after the initial processing (see section 3.7). The second is called by hl_tag() once the tag content is finalized (see section 3.4.9).
+
+  Being dictated by the external and stable HTML standard, htmLawed's objective is very clear-cut and less concerned with tweakability. The code is only minimally annotated with comments -- it is not meant to instruct; PHP developers familiar with the HTML specs will see the logic, and others can always refer to the htmLawed documentation. The compact structuring of the statements is meant to aid in quickly grasping the logic, at least when viewed with code syntax highlighted. +
+
+
+


HTM version of htmLawed_README.txt generated on 23 Apr, 2009 using rTxt2htm from PHP Labware +
+
+ + \ No newline at end of file diff --git a/extlib/htmLawed/htmLawed_README.txt b/extlib/htmLawed/htmLawed_README.txt new file mode 100644 index 000000000..3ce4b9ac1 --- /dev/null +++ b/extlib/htmLawed/htmLawed_README.txt @@ -0,0 +1,1600 @@ +/* +htmLawed_README.txt, 16 July 2009 +htmLawed 1.1.8.1, 16 July 2009 +Copyright Santosh Patnaik +GPL v3 license +A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed +*/ + + +== Content ========================================================== + + +1 About htmLawed + 1.1 Example uses + 1.2 Features + 1.3 History + 1.4 License & copyright + 1.5 Terms used here +2 Usage + 2.1 Simple + 2.2 Configuring htmLawed using the '$config' parameter + 2.3 Extra HTML specifications using the '$spec' parameter + 2.4 Performance time & memory usage + 2.5 Some security risks to keep in mind + 2.6 Use without modifying old 'kses()' code + 2.7 Tolerance for ill-written HTML + 2.8 Limitations & work-arounds + 2.9 Examples +3 Details + 3.1 Invalid/dangerous characters + 3.2 Character references/entities + 3.3 HTML elements + 3.3.1 HTML comments and 'CDATA' sections + 3.3.2 Tag-transformation for better XHTML-Strict + 3.3.3 Tag balancing and proper nesting + 3.3.4 Elements requiring child elements + 3.3.5 Beautify or compact HTML + 3.4 Attributes + 3.4.1 Auto-addition of XHTML-required attributes + 3.4.2 Duplicate/invalid 'id' values + 3.4.3 URL schemes (protocols) and scripts in attribute values + 3.4.4 Absolute & relative URLs + 3.4.5 Lower-cased, standard attribute values + 3.4.6 Transformation of deprecated attributes + 3.4.7 Anti-spam & 'href' + 3.4.8 Inline style properties + 3.4.9 Hook function for tag content + 3.5 Simple configuration directive for most valid XHTML + 3.6 Simple configuration directive for most `safe` HTML + 3.7 Using a hook function + 3.8 Obtaining `finalized` parameter values + 3.9 Retaining non-HTML tags in input with mixed markup +4 Other + 4.1 Support + 4.2 Known issues + 4.3 Change-log + 4.4 Testing + 4.5 Upgrade, & old versions + 4.6 Comparison with 'HTMLPurifier' + 4.7 Use through application plug-ins/modules + 4.8 Use in non-PHP applications + 4.9 Donate + 4.10 Acknowledgements +5 Appendices + 5.1 Characters discouraged in HTML + 5.2 Valid attribute-element combinations + 5.3 CSS 2.1 properties accepting URLs + 5.4 Microsoft Windows 1252 character replacements + 5.5 URL format + 5.6 Brief on htmLawed code + + +== 1 About htmLawed ================================================ + + + htmLawed is a highly customizable single-file PHP script to make text secure, and standard- and admin policy-compliant for use in the body of HTML 4, XHTML 1 or 1.1, or generic XML documents. It is thus a configurable input (X)HTML filter, processor, purifier, sanitizer, beautifier, etc., and an alternative to the HTMLTidy:- http://tidy.sourceforge.net application. + + The `lawing in` of input text is needed to ensure that HTML code in the text is standard-compliant, does not introduce security vulnerabilities, and does not break the aesthetics, design or layout of web-pages. htmLawed tries to do this by, for example, making HTML well-formed with balanced and properly nested tags, neutralizing code that may be used for cross-site scripting ('XSS') attacks, and allowing only specified HTML elements/tags and attributes. + + +-- 1.1 Example uses ------------------------------------------------ + + + * Filtering of text submitted as comments on blogs to allow only certain HTML elements + + * Making RSS/Atom newsfeed item-content standard-compliant: often one uses an excerpt from an HTML document for the content, and with unbalanced tags, non-numerical entities, etc., such excerpts may not be XML-compliant + + * Text processing for stricter XML standard-compliance: e.g., to have lowercased 'x' in hexadecimal numeric entities becomes necessary if an XHTML document with MathML content needs to be served as 'application/xml' + + * Scraping text or data from web-pages + + * Pretty-printing HTML code + + +-- 1.2 Features ---------------------------------------------------o + + + Key: '*' security feature, '^' standard compliance, '~' requires setting right options, '`' different from 'Kses' + + * make input more *secure* and *standard-compliant* + * use for HTML 4, XHTML 1.0 or 1.1, or even generic *XML* documents ^~` + + * *beautify* or *compact* HTML ^~` + + * *restrict elements* ^~` + * proper closure of empty elements like 'img' ^` + * *transform deprecated elements* like 'u' ^~` + * HTML *comments* and 'CDATA' sections can be permitted ^~` + * elements like 'script', 'object' and 'form' can be permitted ~ + + * *restrict attributes*, including *element-specifically* ^~` + * remove *invalid attributes* ^` + * element and attribute names are *lower-cased* ^ + * provide *required attributes*, like 'alt' for 'image' ^` + * *transform deprecated attributes* ^~` + * attributes *declared only once* ^` + + * *restrict attribute values*, including *element-specifically* ^~` + * a value is declared for `empty` (`minimized`) attributes like 'checked' ^ + * check for potentially dangerous attribute values *~ + * ensure *unique* 'id' attribute values ^~` + * *double-quote* attribute values ^ + * lower-case *standard attribute values* like 'password' ^` + + * *attribute-specific URL protocol/scheme restriction* *~` + * disable *dynamic expressions* in 'style' values *~` + + * neutralize invalid named character entities ^` + * *convert* hexadecimal numeric entities to decimal ones, or vice versa ^~` + * convert named entities to numeric ones for generic XML use ^~` + + * remove *null* characters * + * neutralize potentially dangerous proprietary Netscape *Javascript entities* * + * replace potentially dangerous *soft-hyphen* character in attribute values with spaces * + + * remove common *invalid characters* not allowed in HTML or XML ^` + * replace *characters from Microsoft applications* like 'Word' that are discouraged in HTML or XML ^~` + * neutralize entities for characters invalid or discouraged in HTML or XML ^` + * appropriately neutralize '<', '&', '"', and '>' characters ^*` + + * understands improperly spaced tag content (like, spread over more than a line) and properly spaces them ` + * attempts to *balance tags* for well-formedness ^~` + * understands when *omitable closing tags* like '

' (allowed in HTML 4, transitional, e.g.) are missing ^~` + * attempts to permit only *validly nested tags* ^~` + * option to *remove or neutralize bad content* ^~` + * attempts to *rectify common errors of plain-text misplacement* (e.g., directly inside 'blockquote') ^~` + + * fast, *non-OOP* code of ~45 kb incurring peak basal memory usage of ~0.5 MB + * *compatible* with pre-existing code using 'Kses' (the filter used by 'WordPress') + + * optional *anti-spam* measures such as addition of 'rel="nofollow"' and link-disabling ~` + * optionally makes *relative URLs absolute*, and vice versa ~` + + * optionally mark '&' to identify the entities for '&', '<' and '>' introduced by htmLawed ~` + + * allows deployment of powerful *hook functions* to *inject* HTML, *consolidate* 'style' attributes to 'class', finely check attribute values, etc. ~` + + * *independent of character encoding* of input and does not affect it + + * *tolerance for ill-written HTML* to a certain degree + + +-- 1.3 History ----------------------------------------------------o + + + htmLawed was developed for use with 'LabWiki', a wiki software developed at PHP Labware, as a suitable software could not be found. Existing PHP software like 'Kses' and 'HTMLPurifier' were deemed inadequate, slow, resource-intensive, or dependent on external applications like 'HTML Tidy'. + + htmLawed started as a modification of Ulf Harnhammar's 'Kses' (version 0.2.2) software, and is compatible with code that uses 'Kses'; see section:- #2.6. + + +-- 1.4 License & copyright ----------------------------------------o + + + htmLawed is free and open-source software licensed under GPL license version 3:- http://www.gnu.org/licenses/gpl-3.0.txt, and copyrighted by Santosh Patnaik, MD, PhD. + + +-- 1.5 Terms used here --------------------------------------------o + + + * `administrator` - or admin; person setting up the code to pass input through htmLawed; also, `user` + * `attributes` - name-value pairs like 'href="http://x.com"' in opening tags + * `author` - `writer` + * `character` - atomic unit of text; internally represented by a numeric `code-point` as specified by the `encoding` or `charset` in use + * `entity` - markup like '>' and ' ' used to refer to a character + * `element` - HTML element like 'a' and 'img' + * `element content` - content between the opening and closing tags of an element, like 'click' of 'click' + * `HTML` - implies XHTML unless specified otherwise + * `input` - text string given to htmLawed to process + * `processing` - involves filtering, correction, etc., of input + * `safe` - absence or reduction of certain characters and HTML elements and attributes in the input that can otherwise potentially and circumstantially expose web-site users to security vulnerabilities like cross-site scripting attacks (XSS) + * `scheme` - URL protocol like 'http' and 'ftp' + * `specs` - standard specifications + * `style property` - terms like 'border' and 'height' for which declarations are made in values for the 'style' attribute of elements + * `tag` - markers like '' and '' delineating element content; the opening tag can contain attributes + * `tag content` - consists of tag markers '<' and '>', element names like 'div', and possibly attributes + * `user` - administrator + * `writer` - end-user like a blog commenter providing the input that is to be processed; also, `author` + + +== 2 Usage ========================================================oo + + + htmLawed should work with PHP 4.3 and higher. Either 'include()' the 'htmLawed.php' file or copy-paste the entire code. + + To easily *test* htmLawed using a form-based interface, use the provided demo:- htmLawedTest.php ('htmLawed.php' and 'htmLawedTest.php' should be in the same directory on the web-server). + + +-- 2.1 Simple ------------------------------------------------------ + + + The input text to be processed, '$text', is passed as an argument of type string; 'htmLawed()' returns the processed string: + + $processed = htmLawed($text); + + *Note*: If input is from a '$_GET' or '$_POST' value, and 'magic quotes' are enabled on the PHP setup, run 'stripslashes()' on the input before passing to htmLawed. + + By default, htmLawed will process the text allowing all valid HTML elements/tags, secure URL scheme/CSS style properties, etc. It will allow 'CDATA' sections and HTML comments, balance tags, and ensure proper nesting of elements. Such actions can be configured using two other optional arguments -- '$config' and '$spec': + + $processed = htmLawed($text, $config, $spec); + + These extra parameters are detailed below. Some examples are shown in section:- #2.9. + + *Note*: For maximum protection against 'XSS' and other scripting attacks (e.g., by disallowing Javascript code), consider using the 'safe' parameter; see section:- #3.6. + + +-- 2.2 Configuring htmLawed using the '$config' parameter ---------o + + + '$config' instructs htmLawed on how to tackle certain tasks. When '$config' is not specified, or not set as an array (e.g., '$config = 1'), htmLawed will take default actions. One or many of the task-action or value-specification pairs can be specified in '$config' as array key-value pairs. If a parameter is not specified, htmLawed will use the default value/action indicated further below. + + $config = array('comment'=>0, 'cdata'=>1); + $processed = htmLawed($text, $config); + + Or, + + $processed = htmLawed($text, array('comment'=>0, 'cdata'=>1)); + + Below are the possible value-specification combinations. In PHP code, values that are integers should not be quoted and should be used as numeric types (unless meant as string/text). + + Key: '*' default, '^' different default when htmLawed is used in the Kses-compatible mode (see section:- #2.6), '~' different default when 'valid_xhtml' is set to '1' (see section:- #3.5), '"' different default when 'safe' is set to '1' (see section:- #3.6) + + *abs_url* + Make URLs absolute or relative; '$config["base_url"]' needs to be set; see section:- #3.4.4 + + '-1' - make relative + '0' - no action * + '1' - make absolute + + *and_mark* + Mark '&' characters in the original input; see section:- #3.2 + + *anti_link_spam* + Anti-link-spam measure; see section:- #3.4.7 + + '0' - no measure taken * + 'array("regex1", "regex2")' - will ensure a 'rel' attribute with 'nofollow' in its value in case the 'href' attribute value matches the regular expression pattern 'regex1', and/or will remove 'href' if its value matches the regular expression pattern 'regex2'. E.g., 'array("/./", "/://\W*(?!(abc\.com|xyz\.org))/")'; see section:- #3.4.7 for more. + + *anti_mail_spam* + Anti-mail-spam measure; see section:- #3.4.7 + + '0' - no measure taken * + 'word' - '@' in mail address in 'href' attribute value is replaced with specified 'word' + + *balance* + Balance tags for well-formedness and proper nesting; see section:- #3.3.3 + + '0' - no + '1' - yes * + + *base_url* + Base URL value that needs to be set if '$config["abs_url"]' is not '0'; see section:- #3.4.4 + + *cdata* + Handling of 'CDATA' sections; see section:- #3.3.1 + + '0' - don't consider 'CDATA' sections as markup and proceed as if plain text ^" + '1' - remove + '2' - allow, but neutralize any '<', '>', and '&' inside by converting them to named entities + '3' - allow * + + *clean_ms_char* + Replace discouraged characters introduced by Microsoft Word, etc.; see section:- #3.1 + + '0' - no * + '1' - yes + '2' - yes, but replace special single & double quotes with ordinary ones + + *comment* + Handling of HTML comments; see section:- #3.3.1 + + '0' - don't consider comments as markup and proceed as if plain text ^" + '1' - remove + '2' - allow, but neutralize any '<', '>', and '&' inside by converting to named entities + '3' - allow * + + *css_expression* + Allow dynamic CSS expression by not removing the expression from CSS property values in 'style' attributes; see section:- #3.4.8 + + '0' - remove * + '1' - allow + + *deny_attribute* + Denied HTML attributes; see section:- #3.4 + + '0' - none * + 'string' - dictated by values in 'string' + 'on*' (like 'onfocus') attributes not allowed - " + + *elements* + Allowed HTML elements; see section:- #3.3 + + '* -center -dir -font -isindex -menu -s -strike -u' - ~ + 'applet, embed, iframe, object, script' not allowed - " + + *hexdec_entity* + Allow hexadecimal numeric entities and do not convert to the more widely accepted decimal ones, or convert decimal to hexadecimal ones; see section:- #3.2 + + '0' - no + '1' - yes * + '2' - convert decimal to hexadecimal ones + + *hook* + Name of an optional hook function to alter the input string, '$config' or '$spec' before htmLawed starts its main work; see section:- #3.7 + + '0' - no hook function * + 'name' - 'name' is name of the hook function ('kses_hook' ^) + + *hook_tag* + Name of an optional hook function to alter tag content finalized by htmLawed; see section:- #3.4.9 + + '0' - no hook function * + 'name' - 'name' is name of the hook function + + *keep_bad* + Neutralize bad tags by converting '<' and '>' to entities, or remove them; see section:- #3.3.3 + + '0' - remove ^ + '1' - neutralize both tags and element content + '2' - remove tags but neutralize element content + '3' and '4' - like '1' and '2' but remove if text ('pcdata') is invalid in parent element + '5' and '6' * - like '3' and '4' but line-breaks, tabs and spaces are left + + *lc_std_val* + For XHTML compliance, predefined, standard attribute values, like 'get' for the 'method' attribute of 'form', must be lowercased; see section:- #3.4.5 + + '0' - no + '1' - yes * + + *make_tag_strict* + Transform/remove these non-strict XHTML elements, even if they are allowed by the admin: 'applet' 'center' 'dir' 'embed' 'font' 'isindex' 'menu' 's' 'strike' 'u'; see section:- #3.3.2 + + '0' - no ^ + '1' - yes, but leave 'applet', 'embed' and 'isindex' elements that currently can't be transformed * + '2' - yes, removing 'applet', 'embed' and 'isindex' elements and their contents (nested elements remain) ~ + + *named_entity* + Allow non-universal named HTML entities, or convert to numeric ones; see section:- #3.2 + + '0' - convert + '1' - allow * + + *no_deprecated_attr* + Allow deprecated attributes or transform them; see section:- #3.4.6 + + '0' - allow ^ + '1' - transform, but 'name' attributes for 'a' and 'map' are retained * + '2' - transform + + *parent* + Name of the parent element, possibly imagined, that will hold the input; see section:- #3.3 + + *safe* + Magic parameter to make input the most secure against XSS without needing to specify other relevant '$config' parameters; see section:- #3.6 + + '0' - no * + '1' - will auto-adjust other relevant '$config' parameters (indicated by '"' in this list) + + *schemes* + Array of attribute-specific, comma-separated, lower-cased list of schemes (protocols) allowed in attributes accepting URLs; '*' covers all unspecified attributes; see section:- #3.4.3 + + 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; *:file, http, https' * + '*: ftp, gopher, http, https, mailto, news, nntp, telnet' ^ + 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; style: nil; *:file, http, https' " + + *show_setting* + Name of a PHP variable to assign the `finalized` '$config' and '$spec' values; see section:- #3.8 + + *style_pass* + Do not look at 'style' attribute values, letting them through without any alteration + + '0' - no * + '1' - htmLawed will let through any 'style' value; see section:- #3.4.8 + + *tidy* + Beautify or compact HTML code; see section:- #3.3.5 + + '-1' - compact + '0' - no * + '1' or 'string' - beautify (custom format specified by 'string') + + *unique_ids* + 'id' attribute value checks; see section:- #3.4.2 + + '0' - no ^ + '1' - remove duplicate and/or invalid ones * + 'word' - remove invalid ones and replace duplicate ones with new and unique ones based on the 'word'; the admin-specified 'word', like 'my_', should begin with a letter (a-z) and can contain letters, digits, '.', '_', '-', and ':'. + + *valid_xhtml* + Magic parameter to make input the most valid XHTML without needing to specify other relevant '$config' parameters; see section:- #3.5 + + '0' - no * + '1' - will auto-adjust other relevant '$config' parameters (indicated by '~' in this list) + + *xml:lang* + Auto-adding 'xml:lang' attribute; see section:- #3.4.1 + + '0' - no * + '1' - add if 'lang' attribute is present + '2' - add if 'lang' attribute is present, and remove 'lang' ~ + + +-- 2.3 Extra HTML specifications using the $spec parameter --------o + + + The '$spec' argument can be used to disallow an otherwise legal attribute for an element, or to restrict the attribute's values. This can also be helpful as a security measure (e.g., in certain versions of browsers, certain values can cause buffer overflows and denial of service attacks), or in enforcing admin policy compliance. '$spec' is specified as a string of text containing one or more `rules`, with multiple rules separated from each other by a semi-colon (';'). E.g., + + $spec = 'i=-*; td, tr=style, id, -*; a=id(match="/[a-z][a-z\d.:\-`"]*/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt'; + $processed = htmLawed($text, $config, $spec); + + Or, + + $processed = htmLawed($text, $config, 'i=-*; td, tr=style, id, -*; a=id(match="/[a-z][a-z\d.:\-`"]*/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt'); + + A rule begins with an HTML *element* name(s) (`rule-element`), for which the rule applies, followed by an equal ('=') sign. A rule-element may represent multiple elements if comma (,)-separated element names are used. E.g., 'th,td,tr='. + + Rest of the rule consists of comma-separated HTML *attribute names*. A minus ('-') character before an attribute means that the attribute is not permitted inside the rule-element. E.g., '-width'. To deny all attributes, '-*' can be used. + + Following shows examples of rule excerpts with rule-element 'a' and the attributes that are being permitted: + + * 'a=' - all + * 'a=id' - all + * 'a=href, title, -id, -onclick' - all except 'id' and 'onclick' + * 'a=*, id, -id' - all except 'id' + * 'a=-*' - none + * 'a=-*, href, title' - none except 'href' and 'title' + * 'a=-*, -id, href, title' - none except 'href' and 'title' + + Rules regarding *attribute values* are optionally specified inside round brackets after attribute names in slash ('/')-separated `parameter = value` pairs. E.g., 'title(maxlen=30/minlen=5)'. None, or one or more of the following parameters may be specified: + + * 'oneof' - one or more choices separated by '|' that the value should match; if only one choice is provided, then the value must match that choice + + * 'noneof' - one or more choices separated by '|' that the value should not match + + * 'maxlen' and 'minlen' - upper and lower limits for the number of characters in the attribute value; specified in numbers + + * 'maxval' and 'minval' - upper and lower limits for the numerical value specified in the attribute value; specified in numbers + + * 'match' and 'nomatch' - pattern that the attribute value should or should not match; specified as PHP/PCRE-compatible regular expressions with delimiters and possibly modifiers + + * 'default' - a value to force on the attribute if the value provided by the writer does not fit any of the specified parameters + + If 'default' is not set and the attribute value does not satisfy any of the specified parameters, then the attribute is removed. The 'default' value can also be used to force all attribute declarations to take the same value (by getting the values declared illegal by setting, e.g., 'maxlen' to '-1'). + + Examples with `input` '' are shown below. + + `Rule`: 'input=title(maxlen=60/minlen=6), value' + `Output`: '' + + `Rule`: 'input=title(), value(maxval=8/default=6)' + `Output`: '' + + `Rule`: 'input=title(nomatch=$w.d$i), value(match=$em$/default=6em)' + `Output`: '' + + `Rule`: 'input=title(oneof=height|depth/default=depth), value(noneof=5|6)' + `Output`: '' + + *Special characters*: The characters ';', ',', '/', '(', ')', '|', '~' and space have special meanings in the rules. Words in the rules that use such characters, or the characters themselves, should be `escaped` by enclosing in pairs of double-quotes ('"'). A back-tick ('`') can be used to escape a literal '"'. An example rule illustrating this is 'input=value(maxlen=30/match="/^\w/"/default="your `"ID`"")'. + + *Note*: To deny an attribute for all elements for which it is legal, '$config["deny_attribute"]' (see section:- #3.4) can be used instead of '$spec'. Also, attributes can be allowed element-specifically through '$spec' while being denied globally through '$config["deny_attribute"]'. The 'hook_tag' parameter (section:- #3.4.9) can also be used to implement the '$spec' functionality. + + +-- 2.4 Performance time & memory usage ----------------------------o + + + The time and memory used by htmLawed depends on its configuration and the size of the input, and the amount, nestedness and well-formedness of the HTML markup within it. In particular, tag balancing and beautification each can increase the processing time by about a quarter. + + The htmLawed demo:- htmLawedTest.php can be used to evaluate the performance and effects of different types of input and '$config'. + + +-- 2.5 Some security risks to keep in mind ------------------------o + + + When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially `dangerous` HTML code. (This may not be a problem if the authors are trusted.) + + For example, following increase security risks: + + * Allowing 'script', 'applet', 'embed', 'iframe' or 'object' elements, or certain of their attributes like 'allowscriptaccess' + + * Allowing HTML comments (some Internet Explorer versions are vulnerable with, e.g., '' + + * Allowing dynamic CSS expressions (a feature of the IE browser) + + `Unsafe` HTML can be removed by setting '$config' appropriately. E.g., '$config["elements"] = "* -script"' (section:- #3.3), '$config["safe"] = 1' (section:- #3.6), etc. + + +-- 2.6 Use without modifying old 'kses()' code --------------------o + + + The 'Kses' PHP script is used by many applications (like 'WordPress'). It is possible to have such applications use htmLawed instead, since it is compatible with code that calls the 'kses()' function declared in the 'Kses' file (usually named 'kses.php'). E.g., application code like this will continue to work after replacing 'Kses' with htmLawed: + + $comment_filtered = kses($comment_input, array('a'=>array(), 'b'=>array(), 'i'=>array())); + + For some of the '$config' parameters, htmLawed will use values other than the default ones. These are indicated by '^' in section:- #2.2. To force htmLawed to use other values, function 'kses()' in the htmLawed code should be edited -- a few configurable parameters/variables need to be changed. + + If the application uses a 'Kses' file that has the 'kses()' function declared, then, to have the application use htmLawed instead of 'Kses', simply rename 'htmLawed.php' (to 'kses.php', e.g.) and replace the 'Kses' file (or just replace the code in the 'Kses' file with the htmLawed code). If the 'kses()' function in the 'Kses' file had been renamed by the application developer (e.g., in 'WordPress', it is named 'wp_kses()'), then appropriately rename the 'kses()' function in the htmLawed code. + + If the 'Kses' file used by the application has been highly altered by the application developers, then one may need a different approach. E.g., with 'WordPress', it is best to copy the htmLawed code to 'wp_includes/kses.php', rename the newly added function 'kses()' to 'wp_kses()', and delete the code for the original 'wp_kses()' function. + + If the 'Kses' code has a non-empty hook function (e.g., 'wp_kses_hook()' in case of 'WordPress'), then the code for htmLawed's 'kses_hook()' function should be appropriately edited. However, the requirement of the hook function should be re-evaluated considering that htmLawed has extra capabilities. With 'WordPress', the hook function is an essential one. The following code is suggested for the htmLawed 'kses_hook()' in case of 'WordPress': + + function kses_hook($string, &$cf, &$spec){ + // kses compatibility + $allowed_html = $spec; + $allowed_protocols = array(); + foreach($cf['schemes'] as $v){ + foreach($v as $k2=>$v2){ + if(!in_array($k2, $allowed_protocols)){ + $allowed_protocols[] = $k2; + } + } + } + return wp_kses_hook($string, $allowed_html, $allowed_protocols); + // eof + } + + +-- 2.7 Tolerance for ill-written HTML -----------------------------o + + + htmLawed can work with ill-written HTML code in the input. However, HTML that is too ill-written may not be `read` as HTML, and be considered mere plain text instead. Following statements indicate the degree of `looseness` that htmLawed can work with, and can be provided in instructions to writers: + + * Tags must be flanked by '<' and '>' with no '>' inside -- any needed '>' should be put in as '>'. It is possible for tag content (element name and attributes) to be spread over many lines instead of being on one. A space may be present between the tag content and '>', like '
' and '', but not after the '<'. + + * Element and attribute names need not be lower-cased. + + * Attribute string of elements may be liberally spaced with tabs, line-breaks, etc. + + * Attribute values may not be double-quoted, or may be single-quoted. + + * Left-padding of numeric entities (like, ' ', '&x07ff;') with '0' is okay as long as the number of characters between between the '&' and the ';' does not exceed 8. All entities must end with ';' though. + + * Named character entities must be properly cased. E.g., '≪' or '&TILDE;' will not be let through without modification. + + * HTML comments should not be inside element tags (okay between tags), and should begin with ''. Characters like '<', '>', and '&' may be allowed inside depending on '$config', but any '-->' inside should be put in as '-->'. Any '--' inside will be automatically converted to '-', and a space will be added before the comment delimiter '-->'. + + * 'CDATA' sections should not be inside element tags, and can be in element content only if plain text is allowed for that element. They should begin with '<[CDATA[' and end with ']]>'. Characters like '<', '>', and '&' may be allowed inside depending on '$config', but any ']]>' inside should be put in as ']]>'. + + * For attribute values, character entities '<', '>' and '&' should be used instead of characters '<' and '>', and '&' (when '&' is not part of a character entity). This applies even for Javascript code in values of attributes like 'onclick'. + + * Characters '<', '>', '&' and '"' that are part of actual Javascript, etc., code in 'script' elements should be used as such and not be put in as entities like '>'. Otherwise, though the HTML will be valid, the code may fail to work. Further, if such characters have to be used, then they should be put inside 'CDATA' sections. + + * Simple instructions like "an opening tag cannot be present between two closing tags" and "nested elements should be closed in the reverse order of how they were opened" can help authors write balanced HTML. If tags are imbalanced, htmLawed will try to balance them, but in the process, depending on '$config["keep_bad"]', some code/text may be lost. + + * Input authors should be notified of admin-specified allowed elements, attributes, configuration values (like conversion of named entities to numeric ones), etc. + + * With '$config["unique_ids"]' not '0' and the 'id' attribute being permitted, writers should carefully avoid using duplicate or invalid 'id' values as even though htmLawed will correct/remove the values, the final output may not be the one desired. E.g., when '' is processed into +''. + + * Note that even if intended HTML is lost in a highly ill-written input, the processed output will be more secure and standard-compliant. + + * For URLs, unless '$config["scheme"]' is appropriately set, writers should avoid using escape characters or entities in schemes. E.g., 'http' (which many browsers will read as the harmless 'http') may be considered bad by htmLawed. + + * htmLawed will attempt to put plain text present directly inside 'blockquote', 'form', 'map' and 'noscript' elements (illegal as per the specs) inside auto-generated 'div' elements. + + +-- 2.8 Limitations & work-arounds ---------------------------------o + + + htmLawed's main objective is to make the input text `more` standard-compliant, secure for web-page readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with work-arounds. + + It should be borne in mind that no browser application is 100% standard-compliant, and that some of the standard specs (like asking for normalization of white-spacing within 'textarea' elements) are clearly wrong. Regarding security, note that `unsafe` HTML code is not necessarily legally invalid. + + * htmLawed is meant for input that goes into the 'body' of HTML documents. HTML's head-level elements are not supported, nor are the frameset elements 'frameset', 'frame' and 'noframes'. + + * It cannot transform the non-standard 'embed' elements to the standard-compliant 'object' elements. Yet, it can allow 'embed' elements if permitted ('embed' is widely used and supported). Admins can certainly use the 'hook_tag' parameter (section:- #3.4.9) to deploy a custom embed-to-object converter function. + + * The only non-standard element that may be permitted is 'embed'; others like 'noembed' and 'nobr' cannot be permitted without modifying the htmLawed code. + + * It cannot handle input that has non-HTML code like 'SVG' and 'MathML'. One way around is to break the input into pieces and passing only those without non-HTML code to htmLawed. Another is described in section:- #3.9. A third way may be to some how take advantage of the '$config["and_mark"]' parameter (see section:- #3.2). + + * By default, htmLawed won't check many attribute values for standard compliance. E.g., 'width="20m"' with the dimension in non-standard 'm' is let through. Implementing universal and strict attribute value checks can make htmLawed slow and resource-intensive. Admins should look at the 'hook_tag' parameter (section:- #3.4.9) or '$spec' to enforce finer checks. + + * The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specs. Only a few of the proprietary attributes are supported. + + * Except for contained URLs and dynamic expressions (also optional), htmLawed does not check CSS style property values. Admins should look at using the 'hook_tag' parameter (section:- #3.4.9) or '$spec' for finer checks. Perhaps the best option is to disallow 'style' but allow 'class' attributes with the right 'oneof' or 'match' values for 'class', and have the various class style properties in '.css' CSS stylesheet files. + + * htmLawed does not parse emoticons, decode `BBcode`, or `wikify`, auto-converting text to proper HTML. Similarly, it won't convert line-breaks to 'br' elements. Such functions are beyond its purview. Admins should use other code to pre- or post-process the input for such purposes. + + * htmLawed cannot be used to have links force-opened in new windows (by auto-adding appropriate 'target' and 'onclick' attributes to 'a'). Admins should look at Javascript-based DOM-modifying solutions for this. Admins may also be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). + + * Nesting-based checks are not possible. E.g., one cannot disallow 'p' elements specifically inside 'td' while permitting it elsewhere. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). + + * Except for optionally converting absolute or relative URLs to the other type, htmLawed will not alter URLs (e.g., to change the value of query strings or to convert 'http' to 'https'. Having absolute URLs may be a standard-requirement, e.g., when HTML is embedded in email messages, whereas altering URLs for other purposes is beyond htmLawed's goals. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). + + * Pairs of opening and closing tags that do not enclose any content (like '') are not removed. This may be against the standard specs for certain elements (e.g., 'table'). However, presence of such standard-incompliant code will not break the display or layout of content. Admins can also use simple regex-based code to filter out such code. + + * htmLawed does not check for certain element orderings described in the standard specs (e.g., in a 'table', 'tbody' is allowed before 'tfoot'). Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). + + * htmLawed does not check the number of nested elements. E.g., it will allow two 'caption' elements in a 'table' element, illegal as per the specs. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). + + * htmLawed might convert certain entities to actual characters and remove backslashes and CSS comment-markers ('/*') in 'style' attribute values in order to detect malicious HTML like crafted IE-specific dynamic expressions like 'expression...'. If this is too harsh, admins can allow CSS expressions through htmLawed core but then use a custom function through the 'hook_tag' parameter (section:- #3.4.9) to more specifically identify CSS expressions in the 'style' attribute values. Also, using '$config["style_pass"]', it is possible to have htmLawed pass 'style' attribute values without even looking at them (section:- #3.4.8). + + * htmLawed does not correct certain possible attribute-based security vulnerabilities (e.g., 'x'). These arise when browsers mis-identify markup in `escaped` text, defeating the very purpose of escaping text (a bad browser will read the given example as 'x'). + + * Because of poor Unicode support in PHP, htmLawed does not remove the `high value` HTML-invalid characters with multi-byte code-points. Such characters however are extremely unlikely to be in the input. (see section:- #3.1). + + * Like any script using PHP's PCRE regex functions, PHP setup-specific low PCRE limit values can cause htmLawed to at least partially fail with very long input texts. + + +-- 2.9 Examples ---------------------------------------------------o + + + *1.* A blog administrator wants to allow only 'a', 'em', 'strike', 'strong' and 'u' in comments, but needs 'strike' and 'u' transformed to 'span' for better XHTML 1-strict compliance, and, he wants the 'a' links to be to 'http' or 'https' resources: + + $processed = htmLawed($in, array('elements'=>'a, em, strike, strong, u', 'make_tag_strict'=>1, 'safe'=>1, 'schemes'=>'*:http, https'), 'a=href'); + + *2.* An author uses a custom-made web application to load content on his web-site. He is the only one using that application and the content he generates has all types of HTML, including scripts. The web application uses htmLawed primarily as a tool to correct errors that creep in while writing HTML and to take care of the occasional `bad` characters in copy-paste text introduced by Microsoft Office. The web application provides a preview before submitted input is added to the content. For the previewing process, htmLawed is set up as follows: + + $processed = htmLawed($in, array('css_expression'=>1, 'keep_bad'=>1, 'make_tag_strict'=>1, 'schemes'=>'*:*', 'valid_xhtml'=>1)); + + For the final submission process, 'keep_bad' is set to '6'. A value of '1' for the preview process allows the author to note and correct any HTML mistake without losing any of the typed text. + + *3.* A data-miner is scraping information in a specific table of similar web-pages and is collating the data rows, and uses htmLawed to reduce unnecessary markup and white-spaces: + + $processed = htmLawed($in, array('elements'=>'tr, td', 'tidy'=>-1), 'tr, td ='); + + +== 3 Details =====================================================oo + + +-- 3.1 Invalid/dangerous characters -------------------------------- + + + Valid characters (more correctly, their code-points) in HTML or XML are, hexadecimally, '9', 'a', 'd', '20' to 'd7ff', and 'e000' to '10ffff', except 'fffe' and 'ffff' (decimally, '9', '10', '13', '32' to '55295', and '57344' to '1114111', except '65534' and '65535'). htmLawed removes the invalid characters '0' to '8', 'b', 'c', and 'e' to '1f'. + + Because of PHP's poor native support for multi-byte characters, htmLawed cannot check for the remaining invalid code-points. However, for various reasons, it is very unlikely for any of those characters to be in the input. + + Characters that are discouraged (see section:- #5.1) but not invalid are not removed by htmLawed. + + It (function 'hl_tag()') also replaces the potentially dangerous (in some Mozilla [Firefox] and Opera browsers) soft-hyphen character (code-point, hexadecimally, 'ad', or decimally, '173') in attribute values with spaces. Where required, the characters '<', '>', '&', and '"' are converted to entities. + + With '$config["clean_ms_char"]' set as '1' or '2', many of the discouraged characters (decimal code-points '127' to '159' except '133') that many Microsoft applications incorrectly use (as per the 'Windows 1252' ['Cp-1252'] or a similar encoding system), and the character for decimal code-point '133', are converted to appropriate decimal numerical entities (or removed for a few cases)-- see appendix in section:- #5.4. This can help avoid some display issues arising from copying-pasting of content. + + With '$config["clean_ms_char"]' set as '2', characters for the hexadecimal code-points '82', '91', and '92' (for special single-quotes), and '84', '93', and '94' (for special double-quotes) are converted to ordinary single and double quotes respectively and not to entities. + + The character values are replaced with entities/characters and not character values referred to by the entities/characters to keep this task independent of the character-encoding of input text. + + The '$config["clean_ms_char"]' parameter need not be used if authors do not copy-paste Microsoft-created text or if the input text is not believed to use the 'Windows 1252' or a similar encoding. Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up. + + +-- 3.2 Character references/entities ------------------------------o + + + Valid character entities take the form '&*;' where '*' is '#x' followed by a hexadecimal number (hexadecimal numeric entity; like ' ' for non-breaking space), or alphanumeric like 'gt' (external or named entity; like ' ' for non-breaking space), or '#' followed by a number (decimal numeric entity; like ' ' for non-breaking space). Character entities referring to the soft-hyphen character (the '­' or '\xad' character; hexadecimal code-point 'ad' [decimal '173']) in attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers. + + htmLawed (function 'hl_ent()'): + + * Neutralizes entities with multiple leading zeroes or missing semi-colons (potentially dangerous) + + * Lowercases the 'X' (for XML-compliance) and 'A-F' of hexadecimal numeric entities + + * Neutralizes entities referring to characters that are HTML-invalid (see section:- #3.1) + + * Neutralizes entities referring to characters that are HTML-discouraged (code-points, hexadecimally, '7f' to '84', '86' to '9f', and 'fdd0' to 'fddf', or decimally, '127' to '132', '134' to '159', and '64991' to '64976'). Entities referring to the remaining discouraged characters (see section:- #5.1 for a full list) are let through. + + * Neutralizes named entities that are not in the specs. + + * Optionally converts valid HTML-specific named entities except '>', '<', '"', and '&' to decimal numeric ones (hexadecimal if $config["hexdec_entity"] is '2') for generic XML-compliance. For this, '$config["named_entity"]' should be '1'. + + * Optionally converts hexadecimal numeric entities to the more widely supported decimal ones. For this, '$config["hexdec_entity"]' should be '0'. + + * Optionally converts decimal numeric entities to the hexadecimal ones. For this, '$config["hexdec_entity"]' should be '2'. + + `Neutralization` refers to the `entitification` of '&' to '&'. + + *Note*: htmLawed does not convert entities to the actual characters represented by them; one can pass the htmLawed output through PHP's 'html_entity_decode' function:- http://www.php.net/html_entity_decode for that. + + *Note*: If '$config["and_mark"]' is set, and set to a value other than '0', then the '&' characters in the original input are replaced with the control character for the hexadecimal code-point '6' ('\x06'; '&' characters introduced by htmLawed, e.g., after converting '<' to '<', are not affected). This allows one to distinguish, say, an '>' introduced by htmLawed and an '>' put in by the input writer, and can be helpful in further processing of the htmLawed-processed text (e.g., to identify the character sequence 'o(><)o' to generate an emoticon image). When this feature is active, admins should ensure that the htmLawed output is not directly used in web pages or XML documents as the presence of the '\x06' can break documents. Before use in such documents, and preferably before any storage, any remaining '\x06' should be changed back to '&', e.g., with: + + $final = str_replace("\x06", '&', $prelim); + + Also, see section:- #3.9. + + +-- 3.3 HTML elements ----------------------------------------------o + + + htmLawed can be configured to allow only certain HTML elements (tags) in the input. Disallowed elements (just tag-content, and not element-content), based on '$config["keep_bad"]', are either `neutralized` (converted to plain text by entitification of '<' and '>') or removed. + + E.g., with only 'em' permitted: + + Input: + + My website is My website is a.com. + + Output, with '$config["keep_bad"]' not '0': + + My website is <a href="">a.com</a>. + + See section:- #3.3.3 for differences between the various non-zero '$config["keep_bad"]' values. + + htmLawed by default permits these 86 elements: + + a, abbr, acronym, address, applet, area, b, bdo, big, blockquote, br, button, caption, center, cite, code, col, colgroup, dd, del, dfn, dir, div, dl, dt, em, embed, fieldset, font, form, h1, h2, h3, h4, h5, h6, hr, i, iframe, img, input, ins, isindex, kbd, label, legend, li, map, menu, noscript, object, ol, optgroup, option, p, param, pre, q, rb, rbc, rp, rt, rtc, ruby, s, samp, script, select, small, span, strike, strong, sub, sup, table, tbody, td, textarea, tfoot, th, thead, tr, tt, u, ul, var + + Except for 'embed' (included because of its wide-spread use) and the Ruby elements ('rb', 'rbc', 'rp', 'rt', 'rtc', 'ruby'; part of XHTML 1.1), these are all the elements in the HTML 4/XHTML 1 specs. Strict-specific specs. exclude 'center', 'dir', 'font', 'isindex', 'menu', 's', 'strike', and 'u'. + + With '$config["safe"] = 1', the default set will exclude 'applet', 'embed', 'iframe', 'object' and 'script'; see section:- #3.6. + + When '$config["elements"]', which specifies allowed elements, is `properly` defined, and neither empty nor set to '0' or '*', the default set is not used. To have elements added to or removed from the default set, a '+/-' notation is used. E.g., '*-script-object' implies that only 'script' and 'object' are disallowed, whereas '*+embed' means that 'noembed' is also allowed. Elements can also be specified as comma separated names. E.g., 'a, b, i' means only 'a', 'b' and 'i' are permitted. In this notation, '*', '+' and '-' have no significance and can actually cause a mis-reading. + + Some more examples of '$config["elements"]' values indicating permitted elements (note that empty spaces are liberally allowed for clarity): + + * 'a, blockquote, code, em, strong' -- only 'a', 'blockquote', 'code', 'em', and 'strong' + * '*-script' -- all excluding 'script' + * '* -center -dir -font -isindex -menu -s -strike -u' -- only XHTML-Strict elements + * '*+noembed-script' -- all including 'noembed' excluding 'script' + + Some mis-usages (and the resulting permitted elements) that can be avoided: + + * '-*' -- none; instead of htmLawed, one might just use, e.g., the 'htmlspecialchars()' PHP function + * '*, -script' -- all except 'script'; admin probably meant '*-script' + * '-*, a, em, strong' -- all; admin probably meant 'a, em, strong' + * '*' -- all; admin need not have set 'elements' + * '*-form+form' -- all; a '+' will always over-ride any '-' + * '*, noembed' -- only 'noembed'; admin probably meant '*+noembed' + * 'a, +b, i' -- only 'a' and 'i'; admin probably meant 'a, b, i' + + Basically, when using the '+/-' notation, commas (',') should not be used, and vice versa, and '*' should be used with the former but not the latter. + + *Note*: Even if an element that is not in the default set is allowed through '$config["elements"]', like 'noembed' in the last example, it will eventually be removed during tag balancing unless such balancing is turned off ('$config["balance"]' set to '0'). Currently, the only way around this, which actually is simple, is to edit the various arrays in the function 'hl_bal()' to accommodate the element and its nesting properties. + + *A possibly second way to specify allowed elements* is to set '$config["parent"]' to an element name that supposedly will hold the input, and to set '$config["balance"]' to '1'. During tag balancing (see section:- #3.3.3), all elements that cannot legally nest inside the parent element will be removed. The parent element is auto-reset to 'div' if '$config["parent"]' is empty, 'body', or an element not in htmLawed's default set of 86 elements. + + `Tag transformation` is possible for improving XHTML-Strict compliance -- most of the deprecated elements are removed or converted to valid XHTML-Strict ones; see section:- #3.3.2. + + +.. 3.3.1 Handling of comments and CDATA sections ................... + + + 'CDATA' sections have the format '"...]]>', and HTML comments, '"... -->'. Neither HTML comments nor 'CDATA' sections can reside inside tags. HTML comments can exist anywhere else, but 'CDATA' sections can exist only where plain text is allowed (e.g., immediately inside 'td' element content but not immediately inside 'tr' element content). + + htmLawed (function 'hl_cmtcd()') handles HTML comments or 'CDATA' sections depending on the values of '$config["comment"]' or '$config["cdata"]'. If '0', such markup is not looked for and the text is processed like plain text. If '1', it is removed completely. If '2', it is preserved but any '<', '>' and '&' inside are changed to entities. If '3', they are left as such. + + Note that for the last two cases, HTML comments and 'CDATA' sections will always be removed from tag content (function 'hl_tag()'). + + Examples: + + Input: + Home + Output ('$config["comment"] = 0, $config["cdata"] = 2'): + <-- home link -->Home + Output ('$config["comment"] = 1, $config["cdata"] = 2'): + Home + Output ('$config["comment"] = 2, $config["cdata"] = 2'): + Home + Output ('$config["comment"] = 2, $config["cdata"] = 1'): + Home + Output ('$config["comment"] = 3, $config["cdata"] = 3'): + Home + + For standard-compliance, comments are given the form '', and any '--' in the content is made '-'. + + When '$config["safe"] = 1', CDATA sections and comments are considered plain text unless '$config["comment"]' or '$config["cdata"]' is explicitly specified; see section:- #3.6. + + +.. 3.3.2 Tag-transformation for better XHTML-Strict ................o + + + If '$config["make_tag_strict"]' is set and not '0', following non-XHTML-Strict elements (and attributes), even if admin-permitted, are mutated as indicated (element content remains intact; function 'hl_tag2()'): + + * applet - (based on '$config["make_tag_strict"]', unchanged ('1') or removed ('2')) + * center - 'div style="text-align: center;"' + * dir - 'ul' + * embed - (based on '$config["make_tag_strict"]', unchanged ('1') or removed ('2')) + * font (face, size, color) - 'span style="font-family: ; font-size: ; color: ;"' (size transformation reference:- http://style.cleverchimp.com/font_size_intervals/altintervals.html) + * isindex - (based on '$config["make_tag_strict"]', unchanged ('1') or removed ('2')) + * menu - 'ul' + * s - 'span style="text-decoration: line-through;"' + * strike - 'span style="text-decoration: line-through;"' + * u - 'span style="text-decoration: underline;"' + + For an element with a pre-existing 'style' attribute value, the extra style properties are appended. + + Example input: + +
+ The PHP software script used for this web-page web-page is htmLawedTest.php, from PHP Labware. +
+ + The output: + +
+ The PHP software script used for this web-page web-page is htmLawedTest.php, from PHP Labware. +
+ + +-- 3.3.3 Tag balancing and proper nesting -------------------------o + + + If '$config["balance"]' is set to '1', htmLawed (function 'hl_bal()') checks and corrects the input to have properly balanced tags and legal element content (i.e., any element nesting should be valid, and plain text may be present only in the content of elements that allow them). + + Depending on the value of '$config["keep_bad"]' (see section:- #2.2 and section:- #3.3), illegal content may be removed or neutralized to plain text by converting < and > to entities: + + '0' - remove; this option is available only to maintain Kses-compatibility and should not be used otherwise (see section:- #2.6) + '1' - neutralize tags and keep element content + '2' - remove tags but keep element content + '3' and '4' - like '1' and '2', but keep element content only if text ('pcdata') is valid in parent element as per specs + '5' and '6' - like '3' and '4', but line-breaks, tabs and spaces are left + + Example input (disallowing the 'p' element): + + <*> Pseudo-tags <*> + Non-HTML tag xml +

+ Disallowed tag p +

+
    Bad
  • OK
+ + The output with '$config["keep_bad"] = 1': + + <*> Pseudo-tags <*> + <xml>Non-HTML tag xml</xml> + <p> + Disallowed tag p + </p> +
    Bad
  • OK
+ + The output with '$config["keep_bad"] = 3': + + <*> Pseudo-tags <*> + <xml>Non-HTML tag xml</xml> + <p> + Disallowed tag p + </p> +
  • OK
+ + The output with '$config["keep_bad"] = 6': + + <*> Pseudo-tags <*> + Non-HTML tag xml + + Disallowed tag p + +
  • OK
+ + An option like '1' is useful, e.g., when a writer previews his submission, whereas one like '3' is useful before content is finalized and made available to all. + + *Note:* In the example above, unlike '<*>', '' gets considered as a tag (even though there is no HTML element named 'xml'). In general, text matching the regular expression pattern '<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>' is considered a tag (phrase enclosed by the angled brackets '<' and '>', and starting [with an optional slash preceding] with an alphanumeric word that starts with an alphabet...). + + Nesting/content rules for each of the 86 elements in htmLawed's default set (see section:- #3.3) are defined in function 'hl_bal()'. This means that if a non-standard element besides 'embed' is being permitted through '$config["elements"]', the element's tag content will end up getting removed if '$config["balance"]' is set to '1'. + + Plain text and/or certain elements nested inside 'blockquote', 'form', 'map' and 'noscript' need to be in block-level elements. This point is often missed during manual writing of HTML code. htmLawed attempts to address this during balancing. E.g., if the parent container is set as 'form', the input 'B:C:' is converted to '
B:C:
'. + + +-- 3.3.4 Elements requiring child elements ------------------------o + + + As per specs, the following elements require legal child elements nested inside them: + + blockquote, dir, dl, form, map, menu, noscript, ol, optgroup, rbc, rtc, ruby, select, table, tbody, tfoot, thead, tr, ul + + In some cases, the specs stipulate the number and/or the ordering of the child elements. A 'table' can have 0 or 1 'caption', 'tbody', 'tfoot', and 'thead', but they must be in this order: 'caption', 'thead', 'tfoot', 'tbody'. + + htmLawed currently does not check for conformance to these rules. Note that any non-compliance in this regard will not introduce security vulnerabilities, crash browser applications, or affect the rendering of web-pages. + + +-- 3.3.5 Beautify or compact HTML ---------------------------------o + + + By default, htmLawed will neither `beautify` HTML code by formatting it with indentations, etc., nor will it make it compact by removing un-needed white-space.(It does always properly white-space tag content.) + + As per the HTML standards, spaces, tabs and line-breaks in web-pages (except those inside 'pre' elements) are all considered equivalent, and referred to as `white-spaces`. Browser applications are supposed to consider contiguous white-spaces as just a single space, and to disregard white-spaces trailing opening tags or preceding closing tags. This white-space `normalization` allows the use of text/code beautifully formatted with indentations and line-spacings for readability. Such `pretty` HTML can, however, increase the size of web-pages, or make the extraction or scraping of plain text cumbersome. + + With the '$config' parameter 'tidy', htmLawed can be used to beautify or compact the input text. Input with just plain text and no HTML markup is also subject to this. Besides 'pre', the 'script' and 'textarea' elements, CDATA sections, and HTML comments are not subjected to the tidying process. + + To `compact`, use '$config["tidy"] = -1'; single instances or runs of white-spaces are replaced with a single space, and white-spaces trailing and leading open and closing tags, respectively, are removed. + + To `beautify`, '$config["tidy"]' is set as '1', or for customized tidying, as a string like '2s2n'. The 's' or 't' character specifies the use of spaces or tabs for indentation. The first and third characters, any of the digits 0-9, specify the number of spaces or tabs per indentation, and any parental lead spacing (extra indenting of the whole block of input text). The 'r' and 'n' characters are used to specify line-break characters: 'n' for '\n' (Unix/Mac OS X line-breaks), 'rn' or 'nr' for '\r\n' (Windows/DOS line-breaks), or 'r' for '\r'. + + The '$config["tidy"]' value of '1' is equivalent to '2s0n'. Other '$config["tidy"]' values are read loosely: a value of '4' is equivalent to '4s0n'; 't2', to '1t2n'; 's', to '2s0n'; '2TR', to '2t0r'; 'T1', to '1t1n'; 'nr3', to '3s0nr', and so on. Except in the indentations and line-spacings, runs of white-spaces are replaced with a single space during beautification. + + Input formatting using '$config["tidy"]' is not recommended when input text has mixed markup (like HTML + PHP). + + +-- 3.4 Attributes ------------------------------------------------oo + + + htmLawed will only permit attributes described in the HTML specs (including deprecated ones). It also permits some attributes for use with the 'embed' element (the non-standard 'embed' element is supported in htmLawed because of its widespread use), and the the 'xml:space' attribute (valid only in XHTML 1.1). A list of such 111 attributes and the elements they are allowed in is in section:- #5.2. + + When '$config["deny_attribute"]' is not set, or set to '0', or empty ('""'), all the 111 attributes are permitted. Otherwise, '$config["deny_attribute"]' can be set as a list of comma-separated names of the denied attributes. 'on*' can be used to refer to the group of potentially dangerous, script-accepting attributes: 'onblur', 'onchange', 'onclick', 'ondblclick', 'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover', 'onmouseup', 'onreset', 'onselect' and 'onsubmit'. + + Note that attributes specified in '$config["deny_attribute"]' are denied globally, for all elements. To deny attributes for only specific elements, '$spec' (see section:- #2.3) can be used. '$spec' can also be used to element-specifically permit an attribute otherwise denied through '$config["deny_attribute"]'. + + With '$config["safe"] = 1' (section:- #3.6), the 'on*' attributes are automatically disallowed. + + *Note*: To deny all but a few attributes globally, a simpler way to specify '$config["deny_attribute"]' would be to use the notation '* -attribute1 -attribute2 ...'. Thus, a value of '* -title -href' implies that except 'href' and 'title' (where allowed as per standards) all other attributes are to be removed. With this notation, the value for the parameter 'safe' (section:- #3.6) will have no effect on 'deny_attribute'. + + htmLawed (function 'hl_tag()') also: + + * Lower-cases attribute names + * Removes duplicate attributes (last one stays) + * Gives attributes the form 'name="value"' and single-spaces them, removing unnecessary white-spacing + * Provides `required` attributes (see section:- #3.4.1) + * Double-quotes values and escapes any '"' inside them + * Replaces the possibly dangerous soft-hyphen characters (hexadecimal code-point 'ad') in the values with spaces + * Allows custom function to additionally filter/modify attribute values (see section:- #3.4.9) + + +.. 3.4.1 Auto-addition of XHTML-required attributes ................ + + + If indicated attributes for the following elements are found missing, htmLawed (function 'hl_tag()') will add them (with values same as attribute names unless indicated otherwise below): + + * area - alt ('area') + * area, img - src, alt ('image') + * bdo - dir ('ltr') + * form - action + * map - name + * optgroup - label + * param - name + * script - type ('text/javascript') + * textarea - rows ('10'), cols ('50') + + Additionally, with '$config["xml:lang"]' set to '1' or '2', if the 'lang' but not the 'xml:lang' attribute is declared, then the latter is added too, with a value copied from that of 'lang'. This is for better standard-compliance. With '$config["xml:lang"]' set to '2', the 'lang' attribute is removed (XHTML 1.1 specs). + + Note that the 'name' attribute for 'map', invalid in XHTML 1.1, is also transformed if required -- see section:- #3.4.6. + + +.. 3.4.2 Duplicate/invalid 'id' values ............................o + + + If '$config["unique_ids"]' is '1', htmLawed (function 'hl_tag()') removes 'id' attributes with values that are not XHTML-compliant (must begin with a letter and can contain letters, digits, ':', '.', '-' and '_') or duplicate. If '$config["unique_ids"]' is a word, any duplicate but otherwise valid value will be appropriately prefixed with the word to ensure its uniqueness. The word should begin with a letter and should contain only letters, numbers, ':', '.', '_' and '-'. + + Even if multiple inputs need to be filtered (through multiple calls to htmLawed), htmLawed ensures uniqueness of 'id' values as it uses a global variable ('$GLOBALS["hl_Ids"]' array). Further, an admin can restrict the use of certain 'id' values by presetting this variable before htmLawed is called into use. E.g.: + + $GLOBALS['hl_Ids'] = array('top'=>1, 'bottom'=>1, 'myform'=>1); // id values not allowed in input + $processed = htmLawed($text); // filter input + + +.. 3.4.3 URL schemes (protocols) and scripts in attribute values ............o + + + htmLawed edits attributes that take URLs as values if they are found to contain un-permitted schemes. E.g., if the 'afp' scheme is not permitted, then '' becomes '', and if Javascript is not permitted '' becomes ''. + + By default htmLawed permits these schemes in URLs for the 'href' attribute: + + aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet + + Also, only 'file', 'http' and 'https' are permitted in attributes whose names start with 'o' (like 'onmouseover'), and in these attributes that accept URLs: + + action, cite, classid, codebase, data, href, longdesc, model, pluginspage, pluginurl, src, style, usemap + + These default sets are used when '$config["schemes"]' is not set (see section:- #2.2). To over-ride the defaults, '$config["schemes"]' is defined as a string of semi-colon-separated sub-strings of type 'attribute: comma-separated schemes'. E.g., 'href: mailto, http, https; onclick: javascript; src: http, https'. For unspecified attributes, 'file', 'http' and 'https' are permitted. This can be changed by passing schemes for '*' in '$config["schemes"]'. E.g., 'href: mailto, http, https; *: https, https'. + + '*' can be put in the list of schemes to permit all protocols. E.g., 'style: *; img: http, https' results in protocols not being checked in 'style' attribute values. However, in such cases, any relative-to-absolute URL conversion, or vice versa, (section:- #3.4.4) is not done. + + Thus, `to allow Javascript`, one can set '$config["schemes"]' as 'href: mailto, http, https; *: http, https, javascript', or 'href: mailto, http, https, javascript; *: http, https, javascript', or '*: *', and so on. + + As a side-note, one may find 'style: *' useful as URLs in 'style' attributes can be specified in a variety of ways, and the patterns that htmLawed uses to identify URLs may mistakenly identify non-URL text. + + *Note*: If URL-accepting attributes other than those listed above are being allowed, then the scheme will not be checked unless the attribute name contains the string 'src' (e.g., 'dynsrc') or starts with 'o' (e.g., 'onbeforecopy'). + + With '$config["safe"] = 1', all URLs are disallowed in the 'style' attribute values. + + +.. 3.4.4 Absolute & relative URLs in attribute values .............o + + + htmLawed can make absolute URLs in attributes like 'href' relative ('$config["abs_url"]' is '-1'), and vice versa ('$config["abs_url"]' is '1'). URLs in scripts are not considered for this, and so are URLs like '#section_6' (fragment), '?name=Tim#show' (starting with query string), and ';var=1?name=Tim#show' (starting with parameters). Further, this requires that '$config["base_url"]' be set properly, with the '://' and a trailing slash ('/'), with no query string, etc. E.g., 'file:///D:/page/', 'https://abc.com/x/y/', or 'http://localhost/demo/' are okay, but 'file:///D:/page/?help=1', 'abc.com/x/y/' and 'http://localhost/demo/index.htm' are not. + + For making absolute URLs relative, only those URLs that have the '$config["base_url"]' string at the beginning are converted. E.g., with '$config["base_url"] = "https://abc.com/x/y/"', 'https://abc.com/x/y/a.gif' and 'https://abc.com/x/y/z/b.gif' become 'a.gif' and 'z/b.gif' respectively, while 'https://abc.com/x/c.gif' is not changed. + + When making relative URLs absolute, only values for scheme, network location (host-name) and path values in the base URL are inherited. See section:- #5.5 for more about the URL specification as per RFC 1808:- http://www.ietf.org/rfc/rfc1808.txt. + + +.. 3.4.5 Lower-cased, standard attribute values ....................o + + + Optionally, for standard-compliance, htmLawed (function 'hl_tag()') lower-cases standard attribute values to give, e.g., 'input type="password"' instead of 'input type="Password"', if '$config["lc_std_val"]' is '1'. Attribute values matching those listed below for any of the elements (plus those for the 'type' attribute of 'button' or 'input') are lower-cased: + + all, baseline, bottom, button, center, char, checkbox, circle, col, colgroup, cols, data, default, file, get, groups, hidden, image, justify, left, ltr, middle, none, object, password, poly, post, preserve, radio, rect, ref, reset, right, row, rowgroup, rows, rtl, submit, text, top + + a, area, bdo, button, col, form, img, input, object, option, optgroup, param, script, select, table, td, tfoot, th, thead, tr, xml:space + + The following `empty` (`minimized`) attributes are always assigned lower-cased values (same as the names): + + checked, compact, declare, defer, disabled, ismap, multiple, nohref, noresize, noshade, nowrap, readonly, selected + + +.. 3.4.6 Transformation of deprecated attributes ..................o + + + If '$config["no_deprecated_attr"]' is '0', then deprecated attributes (see appendix in section:- #5.2) are removed and, in most cases, their values are transformed to CSS style properties and added to the 'style' attributes (function 'hl_tag()'). Except for 'bordercolor' for 'table', 'tr' and 'td', the scores of proprietary attributes that were never part of any cross-browser standard are not supported. + + *Note*: The attribute 'target' for 'a' is allowed even though it is not in XHTML 1.0 specs. This is because of the attribute's wide-spread use and browser-support, and because the attribute is valid in XHTML 1.1 onwards. + + * align - for 'img' with value of 'left' or 'right', becomes, e.g., 'float: left'; for 'div' and 'table' with value 'center', becomes 'margin: auto'; all others become, e.g., 'text-align: right' + + * bgcolor - E.g., 'bgcolor="#ffffff"' becomes 'background-color: #ffffff' + * border - E.g., 'height= "10"' becomes 'height: 10px' + * bordercolor - E.g., 'bordercolor=#999999' becomes 'border-color: #999999;' + * compact - 'font-size: 85%' + * clear - E.g., 'clear="all" becomes 'clear: both' + + * height - E.g., 'height= "10"' becomes 'height: 10px' and 'height="*"' becomes 'height: auto' + + * hspace - E.g., 'hspace="10"' becomes 'margin-left: 10px; margin-right: 10px' + * language - 'language="VBScript"' becomes 'type="text/vbscript"' + * name - E.g., 'name="xx"' becomes 'id="xx"' + * noshade - 'border-style: none; border: 0; background-color: gray; color: gray' + * nowrap - 'white-space: nowrap' + * size - E.g., 'size="10"' becomes 'height: 10px' + * start - removed + * type - E.g., 'type="i"' becomes 'list-style-type: lower-roman' + * value - removed + * vspace - E.g., 'vspace="10"' becomes 'margin-top: 10px; margin-bottom: 10px' + * width - like 'height' + + Example input: + + imageimage +
+
+ image + + + + + +
+
+

Section

+

Para

+
  1. First item
+
+
+
  1. First item
+
+
+ + And the output with '$config["no_deprecated_attr"] = 1': + + imageimage +
+
+ image + + + + + +
+
+

Section

+

Para

+
  1. First item
+
+
+
  1. First item
+
+
+ + For 'lang', deprecated in XHTML 1.1, transformation is taken care of through '$config["xml:lang"]'; see section:- #3.4.1. + + The attribute 'name' is deprecated in 'form', 'iframe', and 'img', and is replaced with 'id' if an 'id' attribute doesn't exist and if the 'name' value is appropriate for 'id'. For such replacements for 'a' and 'map', for which the 'name' attribute is deprecated in XHTML 1.1, '$config["no_deprecated_attr"]' should be set to '2' (when set to '1', for these two elements, the 'name' attribute is retained). + + +-- 3.4.7 Anti-spam & 'href' ---------------------------------------o + + + htmLawed (function 'hl_tag()') can check the 'href' attribute values (link addresses) as an anti-spam (email or link spam) measure. + + If '$config["anti_mail_spam"]' is not '0', the '@' of email addresses in 'href' values like 'mailto:a@b.com' is replaced with text specified by '$config["anti_mail_spam"]'. The text should be of a form that makes it clear to others that the address needs to be edited before a mail is sent; e.g., '@' (makes the example address 'a@b.com'). + + For regular links, one can choose to have a 'rel' attribute with 'nofollow' in its value (which tells some search engines to not follow a link). This can discourage link spammers. Additionally, or as an alternative, one can choose to empty the 'href' value altogether (disable the link). + + For use of these options, '$config["anti_link_spam"]' should be set as an array with values 'regex1' and 'regex2', both or one of which can be empty (like 'array("", "regex2")') to indicate that that option is not to be used. Otherwise, 'regex1' or 'regex2' should be PHP- and PCRE-compatible regular expression patterns: 'href' values will be matched against them and those matching the pattern will accordingly be treated. + + Note that the regular expressions should have `delimiters`, and be well-formed and preferably fast. Absolute efficiency/accuracy is often not needed. + + An example, to have a 'rel' attribute with 'nofollow' for all links, and to disable links that do not point to domains 'abc.com' and 'xyz.org': + + $config["anti_link_spam"] = array('`.`', '`://\W*(?!(abc\.com|xyz\.org))`'); + + +-- 3.4.8 Inline style properties ----------------------------------o + + + htmLawed can check URL schemes and dynamic expressions (to guard against Javascript, etc., script-based insecurities) in inline CSS style property values in the 'style' attributes. (CSS properties like 'background-image' that accept URLs in their values are noted in section:- #5.3.) Dynamic CSS expressions that allow scripting in the IE browser, and can be a vulnerability, can be removed from property values by setting '$config["css_expression"]' to '1' (default setting). + + *Note*: Because of the various ways of representing characters in attribute values (URL-escapement, entitification, etc.), htmLawed might alter the values of the 'style' attribute values, and may even falsely identify dynamic CSS expressions and URL schemes in them. If this is an important issue, checking of URLs and dynamic expressions can be turned off ('$config["schemes"] = "...style:*..."', see section:- #3.4.3, and '$config["css_expression"] = 0'). Alternately, admins can use their own custom function for finer handling of 'style' values through the 'hook_tag' parameter (see section:- #3.4.9). + + It is also possible to have htmLawed let through any 'style' value by setting '$config["style_pass"]' to '1'. + + As such, it is better to set up a CSS file with class declarations, disallow the 'style' attribute, set a '$spec' rule (see section:- #2.3) for 'class' for the 'oneof' or 'match' parameter, and ask writers to make use of the 'class' attribute. + + +-- 3.4.9 Hook function for tag content ----------------------------o + + + It is possible to utilize a custom hook function to alter the tag content htmLawed has finalized (i.e., after it has checked/corrected for required attributes, transformed attributes, lower-cased attribute names, etc.). + + When '$config' parameter 'hook_tag' is set to the name of a function, htmLawed (function 'hl_tag()') will pass on the element name, and the `finalized` attribute name-value pairs as array elements to the function. The function is expected to return the full opening tag string like '' (for empty elements like 'img' and 'input', the element-closing slash '/' should also be included). + + This is a *powerful functionality* that can be exploited for various objectives: consolidate-and-convert inline 'style' attributes to 'class', convert 'embed' elements to 'object', permit only one 'caption' element in a 'table' element, disallow embedding of certain types of media, *inject HTML*, use CSSTidy:- http://csstidy.sourceforge.net to sanitize 'style' attribute values, etc. + + As an example, the custom hook code below can be used to force a series of specifically ordered 'id' attributes on all elements, and a specific 'param' element inside all 'object' elements: + + function my_tag_function($element, $attribute_array){ + static $id = 0; + // Remove any duplicate element + if($element == 'param' && isset($attribute_array['allowscriptaccess'])){ + return ''; + } + + $new_element = ''; + + // Force a serialized ID number + $attribute_array['id'] = 'my_'. $id; + ++$id; + + // Inject param for allowscriptaccess + if($element == 'object'){ + $new_element = ''; + ++$id; + } + + $string = ''; + foreach($attribute_array as $k=>$v){ + $string .= " {$k}=\"{$v}\""; + } + return "<{$element}{$string}". (isset($in_array($element, $empty_elements) ? ' /' : ''). '>'. $new_element; + } + + The 'hook_tag' parameter is different from the 'hook' parameter (section:- #3.7). + + Snippets of hook function code developed by others may be available on the htmLawed:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed website. + + +-- 3.5 Simple configuration directive for most valid XHTML -------oo + + + If '$config["valid_xhtml"]' is set to '1', some relevant '$config' parameters (indicated by '~' in section:- #2.2) are auto-adjusted. This allows one to pass the '$config' argument with a simpler value. If a value for a parameter auto-set through 'valid_xhtml' is still manually provided, then that value will over-ride the auto-set value. + + +-- 3.6 Simple configuration directive for most `safe` HTML --------o + + + `Safe` HTML refers to HTML that is restricted to reduce the vulnerability for scripting attacks (such as XSS) based on HTML code which otherwise may still be legal and compliant with the HTML standard specs. When elements such as 'script' and 'object', and attributes such as 'onmouseover' and 'style' are allowed in the input text, an input writer can introduce malevolent HTML code. Note that what is considered 'safe' depends on the nature of the web application and the trust-level accorded to its users. + + htmLawed allows an admin to use '$config["safe"]' to auto-adjust multiple '$config' parameters (such as 'elements' which declares the allowed element-set), which otherwise would have to be manually set. The relevant parameters are indicated by '"' in section:- #2.2). Thus, one can pass the '$config' argument with a simpler value. + + With the value of '1', htmLawed considers 'CDATA' sections and HTML comments as plain text, and prohibits the 'applet', 'embed', 'iframe', 'object' and 'script' elements, and the 'on*' attributes like 'onclick'. ( There are '$config' parameters like 'css_expression' that are not affected by the value set for 'safe' but whose default values still contribute towards a more `safe` output.) Further, URLs with schemes (see section:- #3.4.3) are neutralized so that, e.g., 'style="moz-binding:url(http://danger)"' becomes 'style="moz-binding:url(denied:http://danger)"' while 'style="moz-binding:url(ok)"' remains intact. + + Admins, however, may still want to completely deny the 'style' attribute, e.g., with code like + + $processed = htmLawed($text, array('safe'=>1, 'deny_attribute'=>'style')); + + If a value for a parameter auto-set through 'safe' is still manually provided, then that value can over-ride the auto-set value. E.g., with '$config["safe"] = 1' and '$config["elements"] = "*+script"', 'script', but not 'applet', is allowed. + + A page illustrating the efficacy of htmLawed's anti-XSS abilities with 'safe' set to '1' against XSS vectors listed by RSnake:- http://ha.ckers.org/xss.html may be available here:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/rsnake/RSnakeXSSTest.htm. + + +-- 3.7 Using a hook function --------------------------------------o + + + If '$config["hook"]' is not set to '0', then htmLawed will allow preliminarily processed input to be altered by a hook function named by '$config["hook"]' before starting the main work (but after handling of characters, entities, HTML comments and 'CDATA' sections -- see code for function 'htmLawed()'). + + The hook function also allows one to alter the `finalized` values of '$config' and '$spec'. + + Note that the 'hook' parameter is different from the 'hook_tag' parameter (section:- #3.4.9). + + Snippets of hook function code developed by others may be available on the htmLawed:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed website. + + +-- 3.8 Obtaining `finalized` parameter values ---------------------o + + + htmLawed can assign the `finalized` '$config' and '$spec' values to a variable named by '$config["show_setting"]'. The variable, made global by htmLawed, is set as an array with three keys: 'config', with the '$config' value, 'spec', with the '$spec' value, and 'time', with a value that is the Unix time (the output of PHP's 'microtime()' function) when the value was assigned. Admins should use a PHP-compliant variable name (e.g., one that does not begin with a numerical digit) that does not conflict with variable names in their non-htmLawed code. + + The values, which are also post-hook function (if any), can be used to auto-generate information (on, e.g., the elements that are permitted) for input writers. + + +-- 3.9 Retaining non-HTML tags in input with mixed markup ---------o + + + htmLawed does not remove certain characters that though invalid are nevertheless discouraged in HTML documents as per the specs (see section:- #5.1). This can be utilized to deal with input that contains mixed markup. Input that may have HTML markup as well as some other markup that is based on the '<', '>' and '&' characters is considered to have mixed markup. The non-HTML markup can be rather proprietary (like markup for emoticons/smileys), or standard (like MathML or SVG). Or it can be programming code meant for execution/evaluation (such as embedded PHP code). + + To deal with such mixed markup, the input text can be pre-processed to hide the non-HTML markup by specifically replacing the '<', '>' and '&' characters with some of the HTML-discouraged characters (see section:- #3.1.2). Post-htmLawed processing, the replacements are reverted. + + An example (mixed HTML and PHP code in input text): + + $text = preg_replace('`<\?php(.+?)\?>`sm', "\x83?php\\1?\x84", $text); + $processed = htmLawed($text); + $processed = preg_replace('`\x83\?php(.+?)\?\x84`sm', '', $processed); + + This code will not work if '$config["clean_ms_char"]' is set to '1' (section:- #3.1), in which case one should instead deploy a hook function (section:- #3.7). (htmLawed internally uses certain control characters, code-points '1' to '7', and use of these characters as markers in the logic of hook functions may cause issues.) + + Admins may also be able to use '$config["and_mark"]' to deal with such mixed markup; see section:- #3.2. + + +== 4 Other =======================================================oo + + +-- 4.1 Support ----------------------------------------------------- + + + A careful re-reading of this documentation will very likely answer your questions. + + Software updates and forum-based community-support may be found at http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. For general PHP issues (not htmLawed-specific), support may be found through internet searches and at http://php.net. + + +-- 4.2 Known issues -----------------------------------------------o + + + See section:- #2.8. + + Readers are advised to cross-check information given in this document. + + +-- 4.3 Change-log -------------------------------------------------o + + + (The release date for the downloadable package of files containing documentation, demo script, test-cases, etc., besides the 'htmLawed.php' file may be updated independently if the secondary files are revised.) + + `Version number - Release date. Notes` + + 1.1.8.1 - 16 July 2009. Minor code-change to fix a PHP error notice + + 1.1.8 - 23 April 2009. Parameter 'deny_attribute' now accepts the wild-card '*', making it simpler to specify its value when all but a few attributes are being denied; fixed a bug in interpreting '$spec' + + 1.1.7 - 11-12 March 2009. Attributes globally denied through 'deny_attribute' can be allowed element-specifically through '$spec'; '$config["style_pass"]' allowing letting through any 'style' value introduced; altered logic to catch certain types of dynamic crafted CSS expressions + + 1.1.3-6 - 28-31 January - 4 February 2009. Altered logic to catch certain types of dynamic crafted CSS expressions + + 1.1.2 - 22 January 2009. Fixed bug in parsing of 'font' attributes during tag transformation + + 1.1.1 - 27 September 2008. Better nesting correction when omitable closing tags are absent + + 1.1 - 29 June 2008. '$config["hook_tag"]' and '$config["format"]' introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug + + 1.0.9 - 11 June 2008. Fixed bug in invalid HTML code-point entity check + + 1.0.8 - 15 May 2008. 'bordercolor' attribute for 'table', 'td' and 'tr' + + 1.0.7 - 1 May 2008. Support for 'wmode' attribute for 'embed'; '$config["show_setting"]' introduced; improved '$config["elements"]' evaluation + + 1.0.6 - 20 April 2008. '$config["and_mark"]' introduced + + 1.0.5 - 12 March 2008. 'style' URL schemes essentially disallowed when $config 'safe' is on; improved regex for CSS expression search + + 1.0.4 - 10 March 2008. Improved corrections for 'blockquote', 'form', 'map' and 'noscript' + + 1.0.3 - 3 March 2008. Character entities for soft-hyphens are now replaced with spaces (instead of being removed); a bug allowing 'td' directly inside 'table' fixed; 'safe' '$config' parameter added + + 1.0.2 - 13 February 2008. Improved implementation of '$config["keep_bad"]' + + 1.0.1 - 7 November 2007. Improved regex for identifying URLs, protocols and dynamic expressions ('hl_tag()' and 'hl_prot()'); no error display with 'hl_regex()' + + 1.0 - 2 November 2007. First release + + +-- 4.4 Testing ----------------------------------------------------o + + + To test htmLawed using a form interface, a demo:- htmLawedTest.php web-page is provided with the htmLawed distribution ('htmLawed.php' and 'htmLawedTest.php' should be in the same directory on the web-server). A file with test-cases:- htmLawed_TESTCASE.txt is also provided. + + +-- 4.5 Upgrade, & old versions ------------------------------------o + + + Upgrading is as simple as replacing the previous version of 'htmLawed.php' (assuming it was not modified for customized features). As htmLawed output is almost always used in static documents, upgrading should not affect old, finalized content. + + Old versions of htmLawed may be available online. E.g., for version 1.0, check http://www.bioinformatics.org/phplabware/downloads/htmLawed1.zip, for 1.1.1, htmLawed111.zip, and for 1.1.10, htmLawed1110.zip. + + +-- 4.6 Comparison with 'HTMLPurifier' -----------------------------o + + + The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it: + + * does not support PHP versions older than 5.0 (HTMLPurifier dropped PHP 4 support after version 2) + + * is 15-20 times bigger (scores of files totalling more than 750 kb) + + * consumes 10-15 times more RAM memory (just including the HTMLPurifier files without calling the filter requires a few MBs of memory) + + * is expectedly slower + + * does not allow admins to fully allow all valid HTML (because of incomplete HTML support, it always considers elements like 'script' illegal) + + * lacks many of the extra features of htmLawed (like entity conversions and code compaction/beautification) + + * has poor documentation + + However, HTMLPurifier has finer checks for character encodings and attribute values, and can log warnings and errors. Visit the HTMLPurifier website:- http://htmlpurifier.org for updated information. + + +-- 4.7 Use through application plug-ins/modules -------------------o + + + Plug-ins/modules to implement htmLawed in applications such as Drupal and DokuWiki may have been developed. Please check the application websites and the forum on the htmLawed site:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. + + +-- 4.8 Use in non-PHP applications --------------------------------o + + + Non-PHP applications written in Python, Ruby, etc., may be able to use htmLawed through system calls to the PHP engine. Such code may have been documented on the internet. Also check the forum on the htmLawed site:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. + + +-- 4.9 Donate -----------------------------------------------------o + + + A donation in any currency and amount to appreciate or support this software can be sent by PayPal:- http://paypal.com to this email address: drpatnaik at yahoo dot com. + + +-- 4.10 Acknowledgements ------------------------------------------o + + + Bryan Blakey, Ulf Harnhammer, Gareth Heyes, Lukasz Pilorz, Shelley Powers, Edward Yang, and many anonymous users. + + Thank you! + + +== 5 Appendices ==================================================oo + + +-- 5.1 Characters discouraged in XHTML ----------------------------- + + + Characters represented by the following hexadecimal code-points are `not` invalid, even though some validators may issue messages stating otherwise. + + '7f' to '84', '86' to '9f', 'fdd0' to 'fddf', '1fffe', '1ffff', '2fffe', '2ffff', '3fffe', '3ffff', '4fffe', '4ffff', '5fffe', '5ffff', '6fffe', '6ffff', '7fffe', '7ffff', '8fffe', '8ffff', '9fffe', '9ffff', 'afffe', 'affff', 'bfffe', 'bffff', 'cfffe', 'cffff', 'dfffe', 'dffff', 'efffe', 'effff', 'ffffe', 'fffff', '10fffe' and '10ffff' + + +-- 5.2 Valid attribute-element combinations -----------------------o + + + Valid attribute-element combinations as per W3C specs. + + * includes deprecated attributes (marked '^'), attributes for the non-standard 'embed' element (marked '*'), and the proprietary 'bordercolor' (marked '~') + * only non-frameset, HTML body elements + * 'name' for 'a' and 'map', and 'lang' are invalid in XHTML 1.1 + * 'target' is valid for 'a' in XHTML 1.1 and higher + * 'xml:space' is only for XHTML 1.1 + + abbr - td, th + accept - form, input + accept-charset - form + accesskey - a, area, button, input, label, legend, textarea + action - form + align - caption^, embed, applet, iframe, img^, input^, object^, legend^, table^, hr^, div^, h1^, h2^, h3^, h4^, h5^, h6^, p^, col, colgroup, tbody, td, tfoot, th, thead, tr + alt - applet, area, img, input + archive - applet, object + axis - td, th + bgcolor - embed, table^, tr^, td^, th^ + border - table, img^, object^ + bordercolor~ - table, td, tr + cellpadding - table + cellspacing - table + char - col, colgroup, tbody, td, tfoot, th, thead, tr + charoff - col, colgroup, tbody, td, tfoot, th, thead, tr + charset - a, script + checked - input + cite - blockquote, q, del, ins + classid - object + clear - br^ + code - applet + codebase - object, applet + codetype - object + color - font + cols - textarea + colspan - td, th + compact - dir, dl^, menu, ol^, ul^ + coords - area, a + data - object + datetime - del, ins + declare - object + defer - script + dir - bdo + disabled - button, input, optgroup, option, select, textarea + enctype - form + face - font + for - label + frame - table + frameborder - iframe + headers - td, th + height - embed, iframe, td^, th^, img, object, applet + href - a, area + hreflang - a + hspace - applet, img^, object^ + ismap - img, input + label - option, optgroup + language - script^ + longdesc - img, iframe + marginheight - iframe + marginwidth - iframe + maxlength - input + method - form + model* - embed + multiple - select + name - button, embed, textarea, applet^, select, form^, iframe^, img^, a^, input, object, map^, param + nohref - area + noshade - hr^ + nowrap - td^, th^ + object - applet + onblur - a, area, button, input, label, select, textarea + onchange - input, select, textarea + onfocus - a, area, button, input, label, select, textarea + onreset - form + onselect - input, textarea + onsubmit - form + pluginspage* - embed + pluginurl* - embed + prompt - isindex + readonly - textarea, input + rel - a + rev - a + rows - textarea + rowspan - td, th + rules - table + scope - td, th + scrolling - iframe + selected - option + shape - area, a + size - hr^, font, input, select + span - col, colgroup + src - embed, script, input, iframe, img + standby - object + start - ol^ + summary - table + tabindex - a, area, button, input, object, select, textarea + target - a^, area, form + type - a, embed, object, param, script, input, li^, ol^, ul^, button + usemap - img, input, object + valign - col, colgroup, tbody, td, tfoot, th, thead, tr + value - input, option, param, button, li^ + valuetype - param + vspace - applet, img^, object^ + width - embed, hr^, iframe, img, object, table, td^, th^, applet, col, colgroup, pre^ + wmode - embed + xml:space - pre, script, style + + These are allowed in all but the shown elements: + + class - param, script + dir - applet, bdo, br, iframe, param, script + id - script + lang - applet, br, iframe, param, script + onclick - applet, bdo, br, font, iframe, isindex, param, script + ondblclick - applet, bdo, br, font, iframe, isindex, param, script + onkeydown - applet, bdo, br, font, iframe, isindex, param, script + onkeypress - applet, bdo, br, font, iframe, isindex, param, script + onkeyup - applet, bdo, br, font, iframe, isindex, param, script + onmousedown - applet, bdo, br, font, iframe, isindex, param, script + onmousemove - applet, bdo, br, font, iframe, isindex, param, script + onmouseout - applet, bdo, br, font, iframe, isindex, param, script + onmouseover - applet, bdo, br, font, iframe, isindex, param, script + onmouseup - applet, bdo, br, font, iframe, isindex, param, script + style - param, script + title - param, script + xml:lang - applet, br, iframe, param, script + + +-- 5.3 CSS 2.1 properties accepting URLs ------------------------o + + + background + background-image + content + cue-after + cue-before + cursor + list-style + list-style-image + play-during + + +-- 5.4 Microsoft Windows 1252 character replacements --------------o + + + Key: 'd' double, 'l' left, 'q' quote, 'r' right, 's.' single + + Code-point (decimal) - hexadecimal value - replacement entity - represented character + + 127 - 7f - (removed) - (not used) + 128 - 80 - € - euro + 129 - 81 - (removed) - (not used) + 130 - 82 - ‚ - baseline s. q + 131 - 83 - ƒ - florin + 132 - 84 - „ - baseline d q + 133 - 85 - … - ellipsis + 134 - 86 - † - dagger + 135 - 87 - ‡ - d dagger + 136 - 88 - ˆ - circumflex accent + 137 - 89 - ‰ - permile + 138 - 8a - Š - S Hacek + 139 - 8b - ‹ - l s. guillemet + 140 - 8c - Œ - OE ligature + 141 - 8d - (removed) - (not used) + 142 - 8e - Ž - Z dieresis + 143 - 8f - (removed) - (not used) + 144 - 90 - (removed) - (not used) + 145 - 91 - ‘ - l s. q + 146 - 92 - ’ - r s. q + 147 - 93 - “ - l d q + 148 - 94 - ” - r d q + 149 - 95 - • - bullet + 150 - 96 - – - en dash + 151 - 97 - — - em dash + 152 - 98 - ˜ - tilde accent + 153 - 99 - ™ - trademark + 154 - 9a - š - s Hacek + 155 - 9b - › - r s. guillemet + 156 - 9c - œ - oe ligature + 157 - 9d - (removed) - (not used) + 158 - 9e - ž - z dieresis + 159 - 9f - Ÿ - Y dieresis + + +-- 5.5 URL format -------------------------------------------------o + + + An `absolute` URL has a 'protocol' or 'scheme', a 'network location' or 'hostname', and, optional 'path', 'parameters', 'query' and 'fragment' segments. Thus, an absolute URL has this generic structure: + + (scheme) : (//network location) /(path) ;(parameters) ?(query) #(fragment) + + The schemes can only contain letters, digits, '+', '.' and '-'. Hostname is the portion after the '//' and up to the first '/' (if any; else, up to the end) when ':' is followed by a '//' (e.g., 'abc.com' in 'ftp://abc.com/def'); otherwise, it consists of everything after the ':' (e.g., 'def@abc.com' in mailto:def@abc.com'). + + `Relative` URLs do not have explicit schemes and network locations; such values are inherited from a `base` URL. + + +-- 5.6 Brief on htmLawed code -------------------------------------o + + + Much of the code's logic and reasoning can be understood from the documentation above. + + The *output* of htmLawed is a text string containing the processed input. There is no custom error tracking. + + *Function arguments* for htmLawed are: + + * '$in' - 1st argument; a text string; the *input text* to be processed. Any extraneous slashes added by PHP when `magic quotes` are enabled should be removed beforehand using PHP's 'stripslashes()' function. + + * '$config' - 2nd argument; an associative array; optional (named '$C' in htmLawed code). The array has keys with names like 'balance' and 'keep_bad', and the values, which can be boolean, string, or array, depending on the key, are read to accordingly set the *configurable parameters* (indicated by the keys). All configurable parameters receive some default value if the value to be used is not specified by the user through '$config'. `Finalized` '$config' is thus a filtered and possibly larger array. + + * '$spec' - 3rd argument; a text string; optional. The string has rules, written in an htmLawed-designated format, *specifying* element-specific attribute and attribute value restrictions. Function 'hl_spec()' is used to convert the string to an associative-array for internal use. `Finalized` '$spec' is thus an array. + + `Finalized` '$config' and '$spec' are made *global variables* while htmLawed is at work. Values of any pre-existing global variables with same names are noted, and their values are restored after htmLawed finishes processing the input (to capture the `finalized` values, the 'show_settings' parameter of '$config' should be used). Depending on '$config', another global variable 'hl_Ids', to track 'id' attribute values for uniqueness, may be set. Unlike the other two variables, this one is not reset (or unset) post-processing. + + Except for the main function 'htmLawed()' and the functions 'kses()' and 'kses_hook()', htmLawed's functions are *name-spaced* using the 'hl_' prefix. The *functions* and their roles are: + + * 'hl_attrval' - checking attribute values against $spec + * 'hl_bal' - tag balancing + * 'hl_cmtcd' - handling CDATA sections and HTML comments + * 'hl_ent' - entity handling + * 'hl_prot' - checking a URL scheme/protocol + * 'hl_regex' - checking syntax of a regular expression + * 'hl_spec' - converting user-supplied $spec value to one used by htmLawed internally + * 'hl_tag' - handling tags + * 'hl_tag2' - transforming tags + * 'hl_tidy' - compact/beautify HTML + * 'hl_version' - reporting htmLawed version + * 'htmLawed' - main function + * 'kses' - main function of 'kses' + * 'kses_hook' - hook function of 'kses' + + The last two are for compatibility with pre-existing code using the 'kses' script. htmLawed's 'kses()' basically passes on the filtering task to 'htmLawed()' function after deciphering '$config' and '$spec' from the argument values supplied to it. 'kses_hook()' is an empty function and is meant for being filled with custom code if the 'kses' script users were using one. + + 'htmLawed()' finalizes '$spec' (with the help of 'hl_spec()') and '$config', and globalizes them. Finalization of '$config' involves setting default values if an inappropriate or invalid one is supplied. This includes calling 'hl_regex()' to check well-formedness of regular expression patterns if such expressions are user-supplied through '$config'. 'htmLawed()' then removes invalid characters like nulls and 'x01' and appropriately handles entities using 'hl_ent()'. HTML comments and CDATA sections are identified and treated as per '$config' with the help of 'hl_cmtcd()'. When retained, the '<' and '>' characters identifying them, and the '<', '>' and '&' characters inside them, are replaced with control characters (code-points '1' to '5') till any tag balancing is completed. + + After this `initial processing` 'htmLawed()' identifies tags using regex and processes them with the help of 'hl_tag()' -- a large function that analyzes tag content, filtering it as per HTML standards, '$config' and '$spec'. Among other things, 'hl_tag()' transforms deprecated elements using 'hl_tag2()', removes attributes from closing tags, checks attribute values as per '$spec' rules using 'hl_attrval()', and checks URL protocols using 'hl_prot()'. 'htmLawed()' performs tag balancing and nesting checks with a call to 'hl_bal()', and optionally compacts/beautifies the output with proper white-spacing with a call to 'hl_tidy()'. The latter temporarily replaces white-space, and '<', '>' and '&' characters inside 'pre', 'script' and 'textarea' elements, and HTML comments and CDATA sections with control characters (code-points '1' to '5', and '7'). + + htmLawed permits the use of custom code or *hook functions* at two stages. The first, called inside 'htmLawed()', allows the input text as well as the finalized $config and $spec values to be altered right after the initial processing (see section:- #3.7). The second is called by 'hl_tag()' once the tag content is finalized (see section:- #3.4.9). + + Being dictated by the external and stable HTML standard, htmLawed's objective is very clear-cut and less concerned with tweakability. The code is only minimally annotated with comments -- it is not meant to instruct; PHP developers familiar with the HTML specs will see the logic, and others can always refer to the htmLawed documentation. The compact structuring of the statements is meant to aid in quickly grasping the logic, at least when viewed with code syntax highlighted. + +___________________________________________________________________oo + + +@@description: htmLawed PHP software is a free, open-source, customizable HTML input purifier and filter +@@encoding: utf-8 +@@keywords: htmLawed, HTM, HTML, HTML Tidy, converter, filter, formatter, purifier, sanitizer, XSS, input, PHP, software, code, script, security, cross-site scripting, hack, sanitize, remove, standards, tags, attributes, elements +@@language: en +@@title: htmLawed documentation \ No newline at end of file diff --git a/extlib/htmLawed/htmLawed_TESTCASE.txt b/extlib/htmLawed/htmLawed_TESTCASE.txt new file mode 100644 index 000000000..366465ce3 --- /dev/null +++ b/extlib/htmLawed/htmLawed_TESTCASE.txt @@ -0,0 +1,370 @@ +/* +htmLawed_TESTCASE.txt, 23 April 2009 +htmLawed 1.1.8.1, 16 July 2009 +Copyright Santosh Patnaik +GPL v3 license +A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed +*/ + +This file has UTF-8-encoded text with both correct and incorrect/malformed HTML/XHTML code snippets to test htmLawed (test cases/samples). The entire text may also be used as a unit. + +************************************************ +when viewing this file in a web browser, set the +character encoding to Unicode/UTF-8 +************************************************ + +--------------------- start -------------------- + +Try different $config and $spec values. Some text even when filtered in will not be displayed in a rendered web-page
+ +
Attributes
+ +Xml:lang:
, ,
+Standard, predefined value, or empty attribute: , ,
+Required: , image
+Quote & space variation: a, a, a
+Invalid: a
+Duplicated: a
+Deprecated: a,

+Casing:
+Admin-restricted?: + +
Attribute values
+ +Duplicate ID value:, ,
+(try 'my_' for prefix)
+Double-quotes in value:, ,
+(try filter for CSS expression)
+CSS expression:

+Other: ,
+(try 'maxlen', 'maxval', etc., for 'input' in '$spec') + +
Blockquotes
+ +
abc

+
abc
def

+
abc
def

+
abc
def
ghi

+abc
def
ghi
+(try with blockquote parent) + +
CDATA sections
+ +Special characters inside: ]]>, 3.5, & 4 > 4 ]]>
+Normal: , CDATA follows:
+Malformed: , < ![CDATA check ]]>, , < ![CDATA check ] ]>
+Invalid: >CDATA in tag content,
text not allowed
+ +
Complex-1: deprecated elements
+ +
+The PHP software script used for this web-page webpage is htmLawedTest.php, from PHP Labware. +
+ +
Complex-2: deprecated attributes
+ +aa +
+
+image + + + + + +
+
+

Section

+

Para

+
  1. First item
+
+
+
  1. First item
+
+
+ +
Complex-3: embed, object, area
+ +
+ +
+ + +

navigate the site: 1 | 3 | 4

+ +
+ +
Complex-4: nested and other tables
+ +
Cell
Cell
Cell
Cell Cell Cell
Cell
Cell Cell Cell

+PCDATA wrong: Well
Hello

+Missing tr:
Well

+ +
Complex-5: pseudo, disallowed or non-HTML tags
+ +(Try different 'keep_bad' values) +<*> Pseudotags <*> +Non-HTML tag xml +

+Disallowed tag p +

+
    Bad
  • OK
+ +
Elements
+ +Unbalanced: check
+Non-XHTML:

+Malformed: < a href="">, , , , < /a>, < a href="">, a, a,
+Invalid: a
+Empty: a, a, atext
+Content invalid: 12
+Content invalid?:

(try setting 'form' as parent) +Casing: + +
Entities
+ +Special: & 3 < 2 & 5>4 and j >i >a & ia
+Padding: B B f f  
+Malformed: & #x27;, &x27;, ' &TILDE;, &tilde
+Invalid: , �, , �, ￿, &bad;
+Discouraged characters: , „, ﷠, 􏿾
+Context: '>', <?
+Casing: ', ', &TILDE;, ˜ +
+(also check named-to-numeric and hexdec-to-decimal, and vice versa, conversions) + +
Format
+ +Valid but ill-formatted: text +text + text text
p r e
+ text text

+text none text +text none t e x t +
text none t e x t + +text none t e x t + +
+
p r e  
+
+				pre
+		
+
+
Cell
Cell
Cell
CellCellCell
Cell
CellCellCell
+(try to compact or beautify) + +
Forms
+ +(note nesting of 'form', missing required attributes, etc.)
+
+ +
pl
+ h + +

+


+
B:C:

+(try each of these lines separately)
+
what
+what +(try with container as div and as form)
+c a b + +
HTML comments (also CDATA)
+ +Special characters inside: , , , c
+Normal: , , comment:,
text not allowed

+Malformed: , < ![CDATA check ]]>, < ![CDATA check ] ]>
+Invalid: >comment in tag content, + +
Ins-Del
+ +(depending on context, these elements can be of either block or inline type)
+

block


+

d


+

d

d

d
+ +
Lists
+ +Invalid character data:
  • (item
  • )

+Definition list:
a
bad
first one
b
second

+Definition list, close-tags omitted:
a
bad
first one
b
second

+Definition lists, nested:
+
T1
+
D1
+
T2
+
D2
t1
d1
t2
d2
+
T3
+
D3
+
T4
+
D4
t1
d1
+

+Definition lists, nested, close-tags omitted:
+
T1 +
D1
+
T2
+
D2
t1
d1
t2
d2
+
T3 +
D3 +
T4 +
D4
t1
d1
+

+Nested:
    +
  • l1
  • +
  • l2
    1. lo1
    2. lo2
  • +
  • l3
  • +
  • l4
    1. lo3
    2. lo4
      1. lo5
  • +

+Nested, close-tags omitted:
    +
  • l1
  • +
  • l2
    1. lo1
    2. lo2
    +
  • l3 +
  • l4
    1. lo3
    2. lo4
      1. lo5
    +

+Complex: +
  1. +
    +
+ +
Non-English text-1
+ +Inscrieţi-vă acum la a Zecea Conferinţă Internaţională
+გთხოვთ ახლავე გაიაროთ რეგისტრაცია
+večjezično računalništvo
+Зарегистрируйтесь сейчас +на Десятую Международную Конференцию по
+(this file should have utf-8 encoding; some characters may not be displayed because of missing fonts, etc.) + +
Non-English text-2: entities
+ +用统一码
+გთხოვთ
+Inscreva-se agora para a Décima Conferência Internacional Sobre O Unicode, realizada entre os dias 10 e 12 de março de 1997 em Mainz +na Alemanha. + +
Ruby
+ +(need compatible browser)
+ + + + + + + + + さい + とう + のぶ + + + + W3C Associate Chairman + +
+ + WWW + (World Wide Web) +
+ + A + (aaa) + + +
Tables
+ +Omitted closing tags: ++ + +
h1c1h1c2 +
r1c1r1c2 +
r2c1r2c2 +

+Nested, omitted closing tags: ++ + +
h1c1h1c2 +
r1c1r1c2 ++ + +
h1c1h1c2 +
r1c1r1c2 +
r2c1r2c2 +
+
r2c1r2c2 +

+ +
URLs
+ +Relative and absolute: , , , , , ,
+(try base URL value of 'http://a.com/b/')
+CSS URLs:
,
,
,
,

+Anti-spam: (try regex for 'http://a.com', etc.) , , , , , ,
+ +
XSS
+ +'';!--"=&{()}
+
+
+
+
+

+

+

+
+
+

+test
+Bad IE7: x
+Bad IE7: xxx
+Bad IE7: xxx
+Bad IE7: xxx
+Bad IE7: xxx
+Bad IE7: xxx
+Bad IE7: xxx
+Bad IE7: xxx
+Bad IE7: xxx
+Bad IE7: xxx
+Bad IE7: xxx
+Bad IE7: xxx
+Bad IE7: xxx
+Bad IE7: xxx
+Bad IE7: x
+Bad IE7: x
+Bad IE7: x
+Bad IE7: x
+Bad IE7: exp/*x
+Bad IE7: hi
+Bad IE7: hi
+Bad IE7: test
+Bad IE7: hi
+Bad IE7: hi
+ +
Other
+ +3 < 4
+3 > 4
+ > 3
\ No newline at end of file -- cgit v1.2.3-54-g00ecf From 5c21a371d61b4cd3dd125f2c43aa80c52c9c316d Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Tue, 18 Aug 2009 20:59:18 -0700 Subject: Include php-gettext 1.0.7 into extlibs; loading it up if native gettext extension is not present. This provides a pure PHP implementation of the gettext functions. This should help get laconica running on shared hosting environments where PHP's gettext module may not be installed. Also gets us one step closer to running on Mac OS X 10.5 with Apple's preinstalled PHP, which doesn't provide an easy way to add modules. Source: http://savannah.nongnu.org/projects/php-gettext Copyright (c) 2005 Steven Armstrong GPLv2 or later --- extlib/php-gettext/.DS_Store | Bin 0 -> 6148 bytes extlib/php-gettext/AUTHORS | 3 + extlib/php-gettext/COPYING | 340 ++++++++++++++++++++++++++++++++++++++ extlib/php-gettext/ChangeLog | 144 +++++++++++++++++ extlib/php-gettext/README | 189 ++++++++++++++++++++++ extlib/php-gettext/gettext.inc | 318 ++++++++++++++++++++++++++++++++++++ extlib/php-gettext/gettext.php | 358 +++++++++++++++++++++++++++++++++++++++++ extlib/php-gettext/streams.php | 166 +++++++++++++++++++ install.php | 3 +- lib/common.php | 3 + 10 files changed, 1522 insertions(+), 2 deletions(-) create mode 100644 extlib/php-gettext/.DS_Store create mode 100644 extlib/php-gettext/AUTHORS create mode 100644 extlib/php-gettext/COPYING create mode 100644 extlib/php-gettext/ChangeLog create mode 100644 extlib/php-gettext/README create mode 100644 extlib/php-gettext/gettext.inc create mode 100644 extlib/php-gettext/gettext.php create mode 100644 extlib/php-gettext/streams.php (limited to 'extlib') diff --git a/extlib/php-gettext/.DS_Store b/extlib/php-gettext/.DS_Store new file mode 100644 index 000000000..5008ddfcf Binary files /dev/null and b/extlib/php-gettext/.DS_Store differ diff --git a/extlib/php-gettext/AUTHORS b/extlib/php-gettext/AUTHORS new file mode 100644 index 000000000..da6ade7b6 --- /dev/null +++ b/extlib/php-gettext/AUTHORS @@ -0,0 +1,3 @@ +Danilo Segan +Nico Kaiser (contributed most changes between 1.0.2 and 1.0.3, bugfix for 1.0.5) +Steven Armstrong (gettext.inc, leading to 1.0.6) diff --git a/extlib/php-gettext/COPYING b/extlib/php-gettext/COPYING new file mode 100644 index 000000000..5b6e7c66c --- /dev/null +++ b/extlib/php-gettext/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/extlib/php-gettext/ChangeLog b/extlib/php-gettext/ChangeLog new file mode 100644 index 000000000..5e0949dfd --- /dev/null +++ b/extlib/php-gettext/ChangeLog @@ -0,0 +1,144 @@ +2006-02-07 Danilo Šegan + + * examples/pigs_dropin.php: comment-out bind_textdomain_codeset + + * gettext.inc (T_bind_textdomain_codeset): bind_textdomain_codeset + is available only in PHP 4.2.0+ (thanks to Jens A. Tkotz). + + * Makefile: Include gettext.inc in DIST_FILES, VERSION up to + 1.0.7. + +2006-02-03 Danilo Šegan + + Added setlocale() emulation as well. + + * examples/pigs_dropin.php: Use T_setlocale() and locale_emulation(). + * examples/pigs_fallback.php: Use T_setlocale() and locale_emulation(). + + * gettext.inc: Added globals $EMULATEGETTEXT and $CURRENTLOCALE. + (locale_emulation): Whether emulation is active. + (_check_locale): Rewrite. + (_setlocale): Added emulated setlocale function. + (T_setlocale): Wrapper around _setlocale. + (_get_reader): Use variables and _setlocale. + +2006-02-02 Danilo Šegan + + Fix bug #12192. + + * examples/locale/sr_CS/LC_MESSAGES/messages.po: Correct grammar. + * examples/locale/sr_CS/LC_MESSAGES/messages.mo: Rebuild. + +2006-02-02 Danilo Šegan + + Fix bug #15419. + + * streams.php: Support for PHP 5.1.1 fread() which reads most 8kb. + (Fix by Piotr Szotkowski ) + +2006-02-02 Danilo Šegan + + Merge Steven Armstrong's changes, supporting standard gettext + interfaces: + + * examples/*: Restructured examples. + * gettext.inc: Added. + * AUTHORS: Added Steven. + * Makefile (VERSION): Up to 1.0.6. + +2006-01-28 Nico Kaiser + + * gettext.php (select_string): Fix "true" <-> 1 difference of PHP + +2005-07-29 Danilo Šegan + + * Makefile (VERSION): Up to 1.0.5. + +2005-07-29 Danilo Šegan + + Fixes bug #13850. + + * gettext.php (gettext_reader): check $Reader->error as well. + +2005-07-29 Danilo Šegan + + * Makefile (VERSION): Up to 1.0.4. + +2005-07-29 Danilo Šegan + + Fixes bug #13771. + + * gettext.php (gettext_reader->get_plural_forms): Plural forms + header extraction regex change. Reported by Edgar Gonzales. + +2005-02-28 Danilo Šegan + + * AUTHORS: Added Nico to the list. + + * Makefile (VERSION): Up to 1.0.3. + + * README: Updated. + +2005-02-28 Danilo Šegan + + * gettext.php: Added pre-loading, code documentation, and many + code clean-ups by Nico Kaiser . + +2005-02-28 Danilo Šegan + + * streams.php (FileReader.read): Handle read($bytes = 0). + + * examples/pigs.php: Prefix gettext function names with T or T_. + + * examples/update: Use the same keywords T_ and T_ngettext. + + * streams.php: Added CachedFileReader. + +2003-11-11 Danilo Šegan + + * gettext.php: Added hashing to find_string. + +2003-11-01 Danilo Šegan + + * Makefile (DIST_FILES): Replaced LICENSE with COPYING. + (VERSION): Up to 1.0.2. + + * AUTHORS: Minor edits. + + * README: Minor edits. + + * COPYING: Removed LICENSE, added this file. + + * gettext.php: Added copyright notice and disclaimer. + * streams.php: Same. + * examples/pigs.php: Same. + +2003-10-23 Danilo Šegan + + * Makefile: Upped version to 1.0.1. + + * gettext.php (gettext_reader): Remove a call to set_total_plurals. + (set_total_plurals): Removed unused function for some better days. + +2003-10-23 Danilo Šegan + + * Makefile: Added, version 1.0.0. + + * examples/*: Added an example of usage. + + * README: Described all the crap. + +2003-10-22 Danilo Šegan + + * gettext.php: Plural forms implemented too. + + * streams.php: Added FileReader for direct access to files (no + need to keep file in memory). + + * gettext.php: It works, except for plural forms. + + * streams.php: Created abstract class StreamReader. + Added StringReader class. + + * gettext.php: Started writing gettext_reader. + diff --git a/extlib/php-gettext/README b/extlib/php-gettext/README new file mode 100644 index 000000000..c7525e29c --- /dev/null +++ b/extlib/php-gettext/README @@ -0,0 +1,189 @@ +PHP-gettext 1.0 + +Copyright 2003, 2006 -- Danilo "angry with PHP[1]" Segan +Licensed under GPLv2 (or any later version, see COPYING) + +[1] PHP is actually cyrillic, and translates roughly to + "works-doesn't-work" (UTF-8: Ради-Не-Ради) + + +Introduction + + How many times did you look for a good translation tool, and + found out that gettext is best for the job? Many times. + + How many times did you try to use gettext in PHP, but failed + miserably, because either your hosting provider didn't support + it, or the server didn't have adequate locale? Many times. + + Well, this is a solution to your needs. It allows using gettext + tools for managing translations, yet it doesn't require gettext + library at all. It parses generated MO files directly, and thus + might be a bit slower than the (maybe provided) gettext library. + + PHP-gettext is a simple reader for GNU gettext MO files. Those + are binary containers for translations, produced by GNU msgfmt. + +Why? + + I got used to having gettext work even without gettext + library. It's there in my favourite language Python, so I was + surprised that I couldn't find it in PHP. I even Googled for it, + but to no avail. + + So, I said, what the heck, I'm going to write it for this + disguisting language of PHP, because I'm often constrained to it. + +Features + + o Support for simple translations + Just define a simple alias for translate() function (suggested + use of _() or gettext(); see provided example). + + o Support for ngettext calls (plural forms, see a note under bugs) + You may also use plural forms. Translations in MO files need to + provide this, and they must also provide "plural-forms" header. + Please see 'info gettext' for more details. + + o Support for reading straight files, or strings (!!!) + Since I can imagine many different backends for reading in the MO + file data, I used imaginary abstract class StreamReader to do all + the input (check streams.php). For your convenience, I've already + provided two classes for reading files: FileReader and + StringReader (CachedFileReader is a combination of the two: it + loads entire file contents into a string, and then works on that). + See example below for usage. You can for instance use StringReader + when you read in data from a database, or you can create your own + derivative of StreamReader for anything you like. + + +Bugs + + Plural-forms field in MO header (translation for empty string, + i.e. "") is treated according to PHP syntactic rules (it's + eval()ed). Since these should actually follow C syntax, there are + some problems. + + For instance, I'm used to using this: + Plural-Forms: nplurals=3; plural=n%10==1 && n%100!=11 ? 0 : \ + n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2; + but it fails with PHP (it sets $plural=2 instead of 0 for $n==1). + + The fix is usually simple, but I'm lazy to go into the details of + PHP operator precedence, and maybe try to fix it. In here, I had + to put everything after the first ':' in parenthesis: + Plural-Forms: nplurals=3; plural=n%10==1 && n%100!=11 ? 0 : \ + (n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2); + That works, and I'm satisfied. + + Besides this one, there are probably a bunch of other bugs, since + I hate PHP (did I mention it already? no? strange), and don't + know it very well. So, feel free to fix any of those and report + them back to me at . + +Usage + + Put files streams.php and gettext.php somewhere you can load them + from, and require 'em in where you want to use them. + + Then, create one 'stream reader' (a class that provides functions + like read(), seekto(), currentpos() and length()) which will + provide data for the 'gettext_reader', with eg. + $streamer = new FileStream('data.mo'); + + Then, use that as a parameter to gettext_reader constructor: + $wohoo = new gettext_reader($streamer); + + If you want to disable pre-loading of entire message catalog in + memory (if, for example, you have a multi-thousand message catalog + which you'll use only occasionally), use "false" for second + parameter to gettext_reader constructor: + $wohoo = new gettext_reader($streamer, false); + + From now on, you have all the benefits of gettext data at your + disposal, so may run: + print $wohoo->translate("This is a test"); + print $wohoo->ngettext("%d bird", "%d birds", $birds); + + You might need to pass parameter "-k" to xgettext to make it + extract all the strings. In above example, try with + xgettext -ktranslate -kngettext:1,2 file.php + what should create messages.po which contains two messages for + translation. + + I suggest creating simple aliases for these functions (see + example/pigs.php for how do I do it, which means it's probably a + bad way). + + +Usage with gettext.inc (standard gettext interfaces emulation) + + Check example in examples/pig_dropin.php, basically you include + gettext.inc and use all the standard gettext interfaces as + documented on: + + http://www.php.net/gettext + + The only catch is that you can check return value of setlocale() + to see if your locale is system supported or not. + + +Example + + See in examples/ subdirectory. There are a couple of files. + pigs.php is an example, serbian.po is a translation to Serbian + language, and serbian.mo is generated with + msgfmt -o serbian.mo serbian.po + There is also simple "update" script that can be used to generate + POT file and to update the translation using msgmerge. + +Interesting TODO: + + o Try to parse "plural-forms" header field, and to follow C syntax + rules. This won't be easy. + +Boring TODO: + + o Learn PHP and fix bugs, slowness and other stuff resulting from + my lack of knowledge (but *maybe*, it's not my knowledge that is + bad, but PHP itself ;-). + + (This is mostly done thanks to Nico Kaiser.) + + o Try to use hash tables in MO files: with pre-loading, would it + be useful at all? + +Never-asked-questions: + + o Why did you mark this as version 1.0 when this is the first code + release? + + Well, it's quite simple. I consider that the first released thing + should be labeled "version 1" (first, right?). Zero is there to + indicate that there's zero improvement and/or change compared to + "version 1". + + I plan to use version numbers 1.0.* for small bugfixes, and to + release 1.1 as "first stable release of version 1". + + This may trick someone that this is actually useful software, but + as with any other free software, I take NO RESPONSIBILITY for + creating such a masterpiece that will smoke crack, trash your + hard disk, and make lasers in your CD device dance to the tune of + Mozart's 40th Symphony (there is one like that, right?). + + o Can I...? + + Yes, you can. This is free software (as in freedom, free speech), + and you might do whatever you wish with it, provided you do not + limit freedom of others (GPL). + + I'm considering licensing this under LGPL, but I *do* want + *every* PHP-gettext user to contribute and respect ideas of free + software, so don't count on it happening anytime soon. + + I'm sorry that I'm taking away your freedom of taking others' + freedom away, but I believe that's neglible as compared to what + freedoms you could take away. ;-) + + Uhm, whatever. diff --git a/extlib/php-gettext/gettext.inc b/extlib/php-gettext/gettext.inc new file mode 100644 index 000000000..eb94b256a --- /dev/null +++ b/extlib/php-gettext/gettext.inc @@ -0,0 +1,318 @@ + + + Drop in replacement for native gettext. + + This file is part of PHP-gettext. + + PHP-gettext is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + PHP-gettext is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PHP-gettext; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +*/ +/* +LC_CTYPE 0 +LC_NUMERIC 1 +LC_TIME 2 +LC_COLLATE 3 +LC_MONETARY 4 +LC_MESSAGES 5 +LC_ALL 6 +*/ + +require('streams.php'); +require('gettext.php'); + + +// Variables + +global $text_domains, $default_domain, $LC_CATEGORIES, $EMULATEGETTEXT, $CURRENTLOCALE; +$text_domains = array(); +$default_domain = 'messages'; +$LC_CATEGORIES = array('LC_CTYPE', 'LC_NUMERIC', 'LC_TIME', 'LC_COLLATE', 'LC_MONETARY', 'LC_MESSAGES', 'LC_ALL'); +$EMULATEGETTEXT = 0; +$CURRENTLOCALE = ''; + + +// Utility functions + +/** + * Utility function to get a StreamReader for the given text domain. + */ +function _get_reader($domain=null, $category=5, $enable_cache=true) { + global $text_domains, $default_domain, $LC_CATEGORIES; + if (!isset($domain)) $domain = $default_domain; + if (!isset($text_domains[$domain]->l10n)) { + // get the current locale + $locale = _setlocale(LC_MESSAGES, 0); + $p = isset($text_domains[$domain]->path) ? $text_domains[$domain]->path : './'; + $path = $p . "$locale/". $LC_CATEGORIES[$category] ."/$domain.mo"; + if (file_exists($path)) { + $input = new FileReader($path); + } + else { + $input = null; + } + $text_domains[$domain]->l10n = new gettext_reader($input, $enable_cache); + } + return $text_domains[$domain]->l10n; +} + +/** + * Returns whether we are using our emulated gettext API or PHP built-in one. + */ +function locale_emulation() { + global $EMULATEGETTEXT; + return $EMULATEGETTEXT; +} + +/** + * Checks if the current locale is supported on this system. + */ +function _check_locale() { + global $EMULATEGETTEXT; + return !$EMULATEGETTEXT; +} + +/** + * Get the codeset for the given domain. + */ +function _get_codeset($domain=null) { + global $text_domains, $default_domain, $LC_CATEGORIES; + if (!isset($domain)) $domain = $default_domain; + return (isset($text_domains[$domain]->codeset))? $text_domains[$domain]->codeset : ini_get('mbstring.internal_encoding'); +} + +/** + * Convert the given string to the encoding set by bind_textdomain_codeset. + */ +function _encode($text) { + $source_encoding = mb_detect_encoding($text); + $target_encoding = _get_codeset(); + if ($source_encoding != $target_encoding) { + return mb_convert_encoding($text, $target_encoding, $source_encoding); + } + else { + return $text; + } +} + + + + +// Custom implementation of the standard gettext related functions + +/** + * Sets a requested locale, if needed emulates it. + */ +function _setlocale($category, $locale) { + global $CURRENTLOCALE, $EMULATEGETTEXT; + if ($locale === 0) { // use === to differentiate between string "0" + if ($CURRENTLOCALE != '') + return $CURRENTLOCALE; + else + // obey LANG variable, maybe extend to support all of LC_* vars + // even if we tried to read locale without setting it first + return _setlocale($category, $CURRENTLOCALE); + } else { + $ret = 0; + if (function_exists('setlocale')) // I don't know if this ever happens ;) + $ret = setlocale($category, $locale); + if (($ret and $locale == '') or ($ret == $locale)) { + $EMULATEGETTEXT = 0; + $CURRENTLOCALE = $ret; + } else { + if ($locale == '') // emulate variable support + $CURRENTLOCALE = getenv('LANG'); + else + $CURRENTLOCALE = $locale; + $EMULATEGETTEXT = 1; + } + return $CURRENTLOCALE; + } +} + +/** + * Sets the path for a domain. + */ +function _bindtextdomain($domain, $path) { + global $text_domains; + // ensure $path ends with a slash + if ($path[strlen($path) - 1] != '/') $path .= '/'; + elseif ($path[strlen($path) - 1] != '\\') $path .= '\\'; + $text_domains[$domain]->path = $path; +} + +/** + * Specify the character encoding in which the messages from the DOMAIN message catalog will be returned. + */ +function _bind_textdomain_codeset($domain, $codeset) { + global $text_domains; + $text_domains[$domain]->codeset = $codeset; +} + +/** + * Sets the default domain. + */ +function _textdomain($domain) { + global $default_domain; + $default_domain = $domain; +} + +/** + * Lookup a message in the current domain. + */ +function _gettext($msgid) { + $l10n = _get_reader(); + //return $l10n->translate($msgid); + return _encode($l10n->translate($msgid)); +} +/** + * Alias for gettext. + */ +function __($msgid) { + return _gettext($msgid); +} +/** + * Plural version of gettext. + */ +function _ngettext($single, $plural, $number) { + $l10n = _get_reader(); + //return $l10n->ngettext($single, $plural, $number); + return _encode($l10n->ngettext($single, $plural, $number)); +} + +/** + * Override the current domain. + */ +function _dgettext($domain, $msgid) { + $l10n = _get_reader($domain); + //return $l10n->translate($msgid); + return _encode($l10n->translate($msgid)); +} +/** + * Plural version of dgettext. + */ +function _dngettext($domain, $single, $plural, $number) { + $l10n = _get_reader($domain); + //return $l10n->ngettext($single, $plural, $number); + return _encode($l10n->ngettext($single, $plural, $number)); +} + +/** + * Overrides the domain and category for a single lookup. + */ +function _dcgettext($domain, $msgid, $category) { + $l10n = _get_reader($domain, $category); + //return $l10n->translate($msgid); + return _encode($l10n->translate($msgid)); +} +/** + * Plural version of dcgettext. + */ +function _dcngettext($domain, $single, $plural, $number, $category) { + $l10n = _get_reader($domain, $category); + //return $l10n->ngettext($single, $plural, $number); + return _encode($l10n->ngettext($single, $plural, $number)); +} + + + +// Wrappers to use if the standard gettext functions are available, but the current locale is not supported by the system. +// Use the standard impl if the current locale is supported, use the custom impl otherwise. + +function T_setlocale($category, $locale) { + return _setlocale($category, $locale); +} + +function T_bindtextdomain($domain, $path) { + if (_check_locale()) return bindtextdomain($domain, $path); + else return _bindtextdomain($domain, $path); +} +function T_bind_textdomain_codeset($domain, $codeset) { + // bind_textdomain_codeset is available only in PHP 4.2.0+ + if (_check_locale() and function_exists('bind_textdomain_codeset')) return bind_textdomain_codeset($domain, $codeset); + else return _bind_textdomain_codeset($domain, $codeset); +} +function T_textdomain($domain) { + if (_check_locale()) return textdomain($domain); + else return _textdomain($domain); +} +function T_gettext($msgid) { + if (_check_locale()) return gettext($msgid); + else return _gettext($msgid); +} +function T_($msgid) { + if (_check_locale()) return _($msgid); + return __($msgid); +} +function T_ngettext($single, $plural, $number) { + if (_check_locale()) return ngettext($single, $plural, $number); + else return _ngettext($single, $plural, $number); +} +function T_dgettext($domain, $msgid) { + if (_check_locale()) return dgettext($domain, $msgid); + else return _dgettext($domain, $msgid); +} +function T_dngettext($domain, $single, $plural, $number) { + if (_check_locale()) return dngettext($domain, $single, $plural, $number); + else return _dngettext($domain, $single, $plural, $number); +} +function T_dcgettext($domain, $msgid, $category) { + if (_check_locale()) return dcgettext($domain, $msgid, $category); + else return _dcgettext($domain, $msgid, $category); +} +function T_dcngettext($domain, $single, $plural, $number, $category) { + if (_check_locale()) return dcngettext($domain, $single, $plural, $number, $category); + else return _dcngettext($domain, $single, $plural, $number, $category); +} + + + +// Wrappers used as a drop in replacement for the standard gettext functions + +if (!function_exists('gettext')) { + function bindtextdomain($domain, $path) { + return _bindtextdomain($domain, $path); + } + function bind_textdomain_codeset($domain, $codeset) { + return _bind_textdomain_codeset($domain, $codeset); + } + function textdomain($domain) { + return _textdomain($domain); + } + function gettext($msgid) { + return _gettext($msgid); + } + function _($msgid) { + return __($msgid); + } + function ngettext($single, $plural, $number) { + return _ngettext($single, $plural, $number); + } + function dgettext($domain, $msgid) { + return _dgettext($domain, $msgid); + } + function dngettext($domain, $single, $plural, $number) { + return _dngettext($domain, $single, $plural, $number); + } + function dcgettext($domain, $msgid, $category) { + return _dcgettext($domain, $msgid, $category); + } + function dcngettext($domain, $single, $plural, $number, $category) { + return _dcngettext($domain, $single, $plural, $number, $category); + } +} + +?> \ No newline at end of file diff --git a/extlib/php-gettext/gettext.php b/extlib/php-gettext/gettext.php new file mode 100644 index 000000000..ad94a987b --- /dev/null +++ b/extlib/php-gettext/gettext.php @@ -0,0 +1,358 @@ +. + Copyright (c) 2005 Nico Kaiser + + This file is part of PHP-gettext. + + PHP-gettext is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + PHP-gettext is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PHP-gettext; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +*/ + +/** + * Provides a simple gettext replacement that works independently from + * the system's gettext abilities. + * It can read MO files and use them for translating strings. + * The files are passed to gettext_reader as a Stream (see streams.php) + * + * This version has the ability to cache all strings and translations to + * speed up the string lookup. + * While the cache is enabled by default, it can be switched off with the + * second parameter in the constructor (e.g. whenusing very large MO files + * that you don't want to keep in memory) + */ +class gettext_reader { + //public: + var $error = 0; // public variable that holds error code (0 if no error) + + //private: + var $BYTEORDER = 0; // 0: low endian, 1: big endian + var $STREAM = NULL; + var $short_circuit = false; + var $enable_cache = false; + var $originals = NULL; // offset of original table + var $translations = NULL; // offset of translation table + var $pluralheader = NULL; // cache header field for plural forms + var $total = 0; // total string count + var $table_originals = NULL; // table for original strings (offsets) + var $table_translations = NULL; // table for translated strings (offsets) + var $cache_translations = NULL; // original -> translation mapping + + + /* Methods */ + + + /** + * Reads a 32bit Integer from the Stream + * + * @access private + * @return Integer from the Stream + */ + function readint() { + if ($this->BYTEORDER == 0) { + // low endian + return array_shift(unpack('V', $this->STREAM->read(4))); + } else { + // big endian + return array_shift(unpack('N', $this->STREAM->read(4))); + } + } + + /** + * Reads an array of Integers from the Stream + * + * @param int count How many elements should be read + * @return Array of Integers + */ + function readintarray($count) { + if ($this->BYTEORDER == 0) { + // low endian + return unpack('V'.$count, $this->STREAM->read(4 * $count)); + } else { + // big endian + return unpack('N'.$count, $this->STREAM->read(4 * $count)); + } + } + + /** + * Constructor + * + * @param object Reader the StreamReader object + * @param boolean enable_cache Enable or disable caching of strings (default on) + */ + function gettext_reader($Reader, $enable_cache = true) { + // If there isn't a StreamReader, turn on short circuit mode. + if (! $Reader || isset($Reader->error) ) { + $this->short_circuit = true; + return; + } + + // Caching can be turned off + $this->enable_cache = $enable_cache; + + // $MAGIC1 = (int)0x950412de; //bug in PHP 5 + $MAGIC1 = (int) - 1794895138; + // $MAGIC2 = (int)0xde120495; //bug + $MAGIC2 = (int) - 569244523; + + $this->STREAM = $Reader; + $magic = $this->readint(); + if ($magic == $MAGIC1) { + $this->BYTEORDER = 0; + } elseif ($magic == $MAGIC2) { + $this->BYTEORDER = 1; + } else { + $this->error = 1; // not MO file + return false; + } + + // FIXME: Do we care about revision? We should. + $revision = $this->readint(); + + $this->total = $this->readint(); + $this->originals = $this->readint(); + $this->translations = $this->readint(); + } + + /** + * Loads the translation tables from the MO file into the cache + * If caching is enabled, also loads all strings into a cache + * to speed up translation lookups + * + * @access private + */ + function load_tables() { + if (is_array($this->cache_translations) && + is_array($this->table_originals) && + is_array($this->table_translations)) + return; + + /* get original and translations tables */ + $this->STREAM->seekto($this->originals); + $this->table_originals = $this->readintarray($this->total * 2); + $this->STREAM->seekto($this->translations); + $this->table_translations = $this->readintarray($this->total * 2); + + if ($this->enable_cache) { + $this->cache_translations = array (); + /* read all strings in the cache */ + for ($i = 0; $i < $this->total; $i++) { + $this->STREAM->seekto($this->table_originals[$i * 2 + 2]); + $original = $this->STREAM->read($this->table_originals[$i * 2 + 1]); + $this->STREAM->seekto($this->table_translations[$i * 2 + 2]); + $translation = $this->STREAM->read($this->table_translations[$i * 2 + 1]); + $this->cache_translations[$original] = $translation; + } + } + } + + /** + * Returns a string from the "originals" table + * + * @access private + * @param int num Offset number of original string + * @return string Requested string if found, otherwise '' + */ + function get_original_string($num) { + $length = $this->table_originals[$num * 2 + 1]; + $offset = $this->table_originals[$num * 2 + 2]; + if (! $length) + return ''; + $this->STREAM->seekto($offset); + $data = $this->STREAM->read($length); + return (string)$data; + } + + /** + * Returns a string from the "translations" table + * + * @access private + * @param int num Offset number of original string + * @return string Requested string if found, otherwise '' + */ + function get_translation_string($num) { + $length = $this->table_translations[$num * 2 + 1]; + $offset = $this->table_translations[$num * 2 + 2]; + if (! $length) + return ''; + $this->STREAM->seekto($offset); + $data = $this->STREAM->read($length); + return (string)$data; + } + + /** + * Binary search for string + * + * @access private + * @param string string + * @param int start (internally used in recursive function) + * @param int end (internally used in recursive function) + * @return int string number (offset in originals table) + */ + function find_string($string, $start = -1, $end = -1) { + if (($start == -1) or ($end == -1)) { + // find_string is called with only one parameter, set start end end + $start = 0; + $end = $this->total; + } + if (abs($start - $end) <= 1) { + // We're done, now we either found the string, or it doesn't exist + $txt = $this->get_original_string($start); + if ($string == $txt) + return $start; + else + return -1; + } else if ($start > $end) { + // start > end -> turn around and start over + return $this->find_string($string, $end, $start); + } else { + // Divide table in two parts + $half = (int)(($start + $end) / 2); + $cmp = strcmp($string, $this->get_original_string($half)); + if ($cmp == 0) + // string is exactly in the middle => return it + return $half; + else if ($cmp < 0) + // The string is in the upper half + return $this->find_string($string, $start, $half); + else + // The string is in the lower half + return $this->find_string($string, $half, $end); + } + } + + /** + * Translates a string + * + * @access public + * @param string string to be translated + * @return string translated string (or original, if not found) + */ + function translate($string) { + if ($this->short_circuit) + return $string; + $this->load_tables(); + + if ($this->enable_cache) { + // Caching enabled, get translated string from cache + if (array_key_exists($string, $this->cache_translations)) + return $this->cache_translations[$string]; + else + return $string; + } else { + // Caching not enabled, try to find string + $num = $this->find_string($string); + if ($num == -1) + return $string; + else + return $this->get_translation_string($num); + } + } + + /** + * Get possible plural forms from MO header + * + * @access private + * @return string plural form header + */ + function get_plural_forms() { + // lets assume message number 0 is header + // this is true, right? + $this->load_tables(); + + // cache header field for plural forms + if (! is_string($this->pluralheader)) { + if ($this->enable_cache) { + $header = $this->cache_translations[""]; + } else { + $header = $this->get_translation_string(0); + } + if (eregi("plural-forms: ([^\n]*)\n", $header, $regs)) + $expr = $regs[1]; + else + $expr = "nplurals=2; plural=n == 1 ? 0 : 1;"; + $this->pluralheader = $expr; + } + return $this->pluralheader; + } + + /** + * Detects which plural form to take + * + * @access private + * @param n count + * @return int array index of the right plural form + */ + function select_string($n) { + $string = $this->get_plural_forms(); + $string = str_replace('nplurals',"\$total",$string); + $string = str_replace("n",$n,$string); + $string = str_replace('plural',"\$plural",$string); + + $total = 0; + $plural = 0; + + eval("$string"); + if ($plural >= $total) $plural = $total - 1; + return $plural; + } + + /** + * Plural version of gettext + * + * @access public + * @param string single + * @param string plural + * @param string number + * @return translated plural form + */ + function ngettext($single, $plural, $number) { + if ($this->short_circuit) { + if ($number != 1) + return $plural; + else + return $single; + } + + // find out the appropriate form + $select = $this->select_string($number); + + // this should contains all strings separated by NULLs + $key = $single.chr(0).$plural; + + + if ($this->enable_cache) { + if (! array_key_exists($key, $this->cache_translations)) { + return ($number != 1) ? $plural : $single; + } else { + $result = $this->cache_translations[$key]; + $list = explode(chr(0), $result); + return $list[$select]; + } + } else { + $num = $this->find_string($key); + if ($num == -1) { + return ($number != 1) ? $plural : $single; + } else { + $result = $this->get_translation_string($num); + $list = explode(chr(0), $result); + return $list[$select]; + } + } + } + +} + +?> diff --git a/extlib/php-gettext/streams.php b/extlib/php-gettext/streams.php new file mode 100644 index 000000000..d57aac649 --- /dev/null +++ b/extlib/php-gettext/streams.php @@ -0,0 +1,166 @@ +. + + This file is part of PHP-gettext. + + PHP-gettext is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + PHP-gettext is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PHP-gettext; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +*/ + + +// Simple class to wrap file streams, string streams, etc. +// seek is essential, and it should be byte stream +class StreamReader { + // should return a string [FIXME: perhaps return array of bytes?] + function read($bytes) { + return false; + } + + // should return new position + function seekto($position) { + return false; + } + + // returns current position + function currentpos() { + return false; + } + + // returns length of entire stream (limit for seekto()s) + function length() { + return false; + } +} + +class StringReader { + var $_pos; + var $_str; + + function StringReader($str='') { + $this->_str = $str; + $this->_pos = 0; + } + + function read($bytes) { + $data = substr($this->_str, $this->_pos, $bytes); + $this->_pos += $bytes; + if (strlen($this->_str)<$this->_pos) + $this->_pos = strlen($this->_str); + + return $data; + } + + function seekto($pos) { + $this->_pos = $pos; + if (strlen($this->_str)<$this->_pos) + $this->_pos = strlen($this->_str); + return $this->_pos; + } + + function currentpos() { + return $this->_pos; + } + + function length() { + return strlen($this->_str); + } + +} + + +class FileReader { + var $_pos; + var $_fd; + var $_length; + + function FileReader($filename) { + if (file_exists($filename)) { + + $this->_length=filesize($filename); + $this->_pos = 0; + $this->_fd = fopen($filename,'rb'); + if (!$this->_fd) { + $this->error = 3; // Cannot read file, probably permissions + return false; + } + } else { + $this->error = 2; // File doesn't exist + return false; + } + } + + function read($bytes) { + if ($bytes) { + fseek($this->_fd, $this->_pos); + + // PHP 5.1.1 does not read more than 8192 bytes in one fread() + // the discussions at PHP Bugs suggest it's the intended behaviour + while ($bytes > 0) { + $chunk = fread($this->_fd, $bytes); + $data .= $chunk; + $bytes -= strlen($chunk); + } + $this->_pos = ftell($this->_fd); + + return $data; + } else return ''; + } + + function seekto($pos) { + fseek($this->_fd, $pos); + $this->_pos = ftell($this->_fd); + return $this->_pos; + } + + function currentpos() { + return $this->_pos; + } + + function length() { + return $this->_length; + } + + function close() { + fclose($this->_fd); + } + +} + +// Preloads entire file in memory first, then creates a StringReader +// over it (it assumes knowledge of StringReader internals) +class CachedFileReader extends StringReader { + function CachedFileReader($filename) { + if (file_exists($filename)) { + + $length=filesize($filename); + $fd = fopen($filename,'rb'); + + if (!$fd) { + $this->error = 3; // Cannot read file, probably permissions + return false; + } + $this->_str = fread($fd, $length); + fclose($fd); + + } else { + $this->error = 2; // File doesn't exist + return false; + } + } +} + + +?> \ No newline at end of file diff --git a/install.php b/install.php index 9bcee275f..c13f70272 100644 --- a/install.php +++ b/install.php @@ -49,8 +49,7 @@ function checkPrereqs() } $reqs = array('gd', 'curl', - 'xmlwriter', 'mbstring', - 'gettext'); + 'xmlwriter', 'mbstring'); foreach ($reqs as $req) { if (!checkExtension($req)) { diff --git a/lib/common.php b/lib/common.php index 6c4b856e0..72c093bf3 100644 --- a/lib/common.php +++ b/lib/common.php @@ -47,6 +47,9 @@ require_once('PEAR.php'); require_once('DB/DataObject.php'); require_once('DB/DataObject/Cast.php'); # for dates +if (!function_exists('gettext')) { + require_once("php-gettext/gettext.inc"); +} require_once(INSTALLDIR.'/lib/language.php'); // This gets included before the config file, so that admin code and plugins -- cgit v1.2.3-54-g00ecf From 8bca90b8d2804675421464a3db1b447a439a456c Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Tue, 18 Aug 2009 21:02:46 -0700 Subject: Tweak to php-gettext stream reader; initialize local variable before appending data to it to avoid triggering an E_NOTICE message. --- extlib/php-gettext/streams.php | 1 + 1 file changed, 1 insertion(+) (limited to 'extlib') diff --git a/extlib/php-gettext/streams.php b/extlib/php-gettext/streams.php index d57aac649..3eafa7482 100644 --- a/extlib/php-gettext/streams.php +++ b/extlib/php-gettext/streams.php @@ -108,6 +108,7 @@ class FileReader { // PHP 5.1.1 does not read more than 8192 bytes in one fread() // the discussions at PHP Bugs suggest it's the intended behaviour + $data = ''; while ($bytes > 0) { $chunk = fread($this->_fd, $bytes); $data .= $chunk; -- cgit v1.2.3-54-g00ecf From 8246977ef00970f76da28500e40f170e68544bf9 Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Tue, 18 Aug 2009 21:21:57 -0700 Subject: kill stupid Finder metadata file that made it into my checkin presumably due to 'git add php-gettext' deciding to find all hidden files in the directory for me :P --- extlib/php-gettext/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 extlib/php-gettext/.DS_Store (limited to 'extlib') diff --git a/extlib/php-gettext/.DS_Store b/extlib/php-gettext/.DS_Store deleted file mode 100644 index 5008ddfcf..000000000 Binary files a/extlib/php-gettext/.DS_Store and /dev/null differ -- cgit v1.2.3-54-g00ecf From 3fd0a9693dc666478b29c85a045be3b084bc37e2 Mon Sep 17 00:00:00 2001 From: Craig Andrews Date: Mon, 31 Aug 2009 15:49:11 -0400 Subject: Fix typo in Stomp Thanks Marcel|HSD --- extlib/Stomp.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'extlib') diff --git a/extlib/Stomp.php b/extlib/Stomp.php index 9e1c97b3b..abd9cba62 100644 --- a/extlib/Stomp.php +++ b/extlib/Stomp.php @@ -454,7 +454,7 @@ class Stomp */ public function disconnect () { - $header = array(); + $headers = array(); if ($this->clientId != null) { $headers["client-id"] = $this->clientId; -- cgit v1.2.3-54-g00ecf From 00032e11122463a3b69babc464caa26783c7c644 Mon Sep 17 00:00:00 2001 From: Craig Andrews Date: Mon, 31 Aug 2009 22:16:49 -0400 Subject: Allow the oEmbed tag to be split across lines --- extlib/Services/oEmbed.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'extlib') diff --git a/extlib/Services/oEmbed.php b/extlib/Services/oEmbed.php index 7d507b6f6..b05e3a1d1 100644 --- a/extlib/Services/oEmbed.php +++ b/extlib/Services/oEmbed.php @@ -303,7 +303,7 @@ class Services_oEmbed // Find all tags that have a valid oembed type set. We then // extract the href attribute for each type. $regexp = '#]*)type="' . - '(application/json|text/xml)\+oembed"([^>]*)>#i'; + '(application/json|text/xml)\+oembed"([^>]*)>#im'; $m = $ret = array(); if (!preg_match_all($regexp, $body, $m)) { -- cgit v1.2.3-54-g00ecf From 29d0dd740c329c2674de58bec172419148a1b495 Mon Sep 17 00:00:00 2001 From: Craig Andrews Date: Tue, 1 Sep 2009 23:17:45 -0400 Subject: Allow whitespace before and after the = and require space before the href in html --- extlib/Services/oEmbed.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'extlib') diff --git a/extlib/Services/oEmbed.php b/extlib/Services/oEmbed.php index b05e3a1d1..0dc8f01b2 100644 --- a/extlib/Services/oEmbed.php +++ b/extlib/Services/oEmbed.php @@ -256,7 +256,7 @@ class Services_oEmbed $code = curl_getinfo($ch, CURLINFO_HTTP_CODE); if (substr($code, 0, 1) != '2') { - throw new Services_oEmbed_Exception('Non-200 code returned'); + throw new Services_oEmbed_Exception('Non-200 code returned. Got code ' . $code); } curl_close($ch); @@ -302,7 +302,7 @@ class Services_oEmbed // Find all tags that have a valid oembed type set. We then // extract the href attribute for each type. - $regexp = '#]*)type="' . + $regexp = '#]*)type[\s\n]*=[\s\n]*"' . '(application/json|text/xml)\+oembed"([^>]*)>#im'; $m = $ret = array(); @@ -314,7 +314,7 @@ class Services_oEmbed foreach ($m[0] as $i => $link) { $h = array(); - if (preg_match('/href="([^"]+)"/i', $link, $h)) { + if (preg_match('/[\s\n]+href[\s\n]*=[\s\n]*"([^"]+)"/im', $link, $h)) { $ret[$m[2][$i]] = $h[1]; } } @@ -347,7 +347,7 @@ class Services_oEmbed $code = curl_getinfo($ch, CURLINFO_HTTP_CODE); if (substr($code, 0, 1) != '2') { - throw new Services_oEmbed_Exception('Non-200 code returned'); + throw new Services_oEmbed_Exception('Non-200 code returned. Got code ' . $code); } return $result; -- cgit v1.2.3-54-g00ecf From 5974871b7b00bd8e3f28dc5f5a9465a9eec0d3d3 Mon Sep 17 00:00:00 2001 From: Craig Andrews Date: Thu, 3 Sep 2009 18:34:30 -0400 Subject: Improve OAuth CGI compatibility Fixes http://status.net/trac/ticket/1822 Reported upstream at http://code.google.com/p/oauth/issues/detail?id=118 --- extlib/OAuth.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'extlib') diff --git a/extlib/OAuth.php b/extlib/OAuth.php index 029166175..fd4853554 100644 --- a/extlib/OAuth.php +++ b/extlib/OAuth.php @@ -199,7 +199,8 @@ class OAuthRequest {/*{{{*/ } else { // collect request parameters from query string (GET) and post-data (POST) if appropriate (note: POST vars have priority) $req_parameters = $_GET; - if ($http_method == "POST" && @strstr($request_headers["Content-Type"], "application/x-www-form-urlencoded") ) { + if ($http_method == "POST" && + ( @strstr($request_headers["Content-Type"], "application/x-www-form-urlencoded") || @strstr($_ENV["CONTENT_TYPE"], "application/x-www-form-urlencoded") )) { $req_parameters = array_merge($req_parameters, $_POST); } -- cgit v1.2.3-54-g00ecf From a73162d3eb237046f89ab21e17a9424ac47cd66d Mon Sep 17 00:00:00 2001 From: Marcel van der Boom Date: Tue, 8 Sep 2009 21:48:51 +0200 Subject: Silence the NOTICE log messages on port not defined, we deal with that properly, and most of the time it is indeed not define --- extlib/OAuth.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'extlib') diff --git a/extlib/OAuth.php b/extlib/OAuth.php index fd4853554..648627b57 100644 --- a/extlib/OAuth.php +++ b/extlib/OAuth.php @@ -327,7 +327,7 @@ class OAuthRequest {/*{{{*/ public function get_normalized_http_url() {/*{{{*/ $parts = parse_url($this->http_url); - $port = @$parts['port']; + $port = isset($parts['port']) ? $parts['port'] : null; $scheme = $parts['scheme']; $host = $parts['host']; $path = @$parts['path']; -- cgit v1.2.3-54-g00ecf From c04987018cd6c845c6da7a92d9857d8c651f7022 Mon Sep 17 00:00:00 2001 From: Marcel van der Boom Date: Tue, 8 Sep 2009 22:21:33 +0200 Subject: Several fixes to make RabbitMQ a player. * extlib/Stomp.php -spaces for tabs (we're on PEAR, right?) - send: initialize the $properties parameter as array() instead of null this prevents unsetting $headers if $properties was not set (besides that, it's the proper way to initialize an array) - subscribe: insert FIXME's on ActiveMQ specifics - ack: make sure the content-length header is set *and* is zero. I have seen the header set to '3' there but could not find where it came from, this is at least safe. - disconnect: typo in $headers variable - readFrame: use fgets() instead of gets() so that RabbitQ, which is more protocol strict can also play * extlib/Stomp/Frame.php - spaces for tabs - add note on possibly protocol violating linefeed * extlib/Stomp/Message.php - space for tabs - add content-length header for message * lib/stompqueuemanager.php - use the notice for logging, not the frame --- extlib/Stomp.php | 124 ++++++++++++++++++++++++++-------------------- extlib/Stomp/Frame.php | 87 ++++++++++++++++---------------- extlib/Stomp/Message.php | 6 ++- lib/stompqueuemanager.php | 5 +- 4 files changed, 123 insertions(+), 99 deletions(-) (limited to 'extlib') diff --git a/extlib/Stomp.php b/extlib/Stomp.php index abd9cba62..c9e90629c 100644 --- a/extlib/Stomp.php +++ b/extlib/Stomp.php @@ -26,7 +26,7 @@ require_once 'Stomp/Frame.php'; * * @package Stomp * @author Hiram Chirino - * @author Dejan Bosanac + * @author Dejan Bosanac * @author Michael Caplan * @version $Revision: 43 $ */ @@ -44,15 +44,15 @@ class Stomp * * @var int */ - public $prefetchSize = 1; - - /** + public $prefetchSize = 1; + + /** * Client id used for durable subscriptions * * @var string */ - public $clientId = null; - + public $clientId = null; + protected $_brokerUri = null; protected $_socket = null; protected $_hosts = array(); @@ -66,7 +66,7 @@ class Stomp protected $_sessionId; protected $_read_timeout_seconds = 60; protected $_read_timeout_milliseconds = 0; - + /** * Constructor * @@ -134,10 +134,10 @@ class Stomp require_once 'Stomp/Exception.php'; throw new Stomp_Exception("No broker defined"); } - + // force disconnect, if previous established connection exists $this->disconnect(); - + $i = $this->_currentHost; $att = 0; $connected = false; @@ -190,11 +190,11 @@ class Stomp if ($password != '') { $this->_password = $password; } - $headers = array('login' => $this->_username , 'passcode' => $this->_password); - if ($this->clientId != null) { - $headers["client-id"] = $this->clientId; - } - $frame = new Stomp_Frame("CONNECT", $headers); + $headers = array('login' => $this->_username , 'passcode' => $this->_password); + if ($this->clientId != null) { + $headers["client-id"] = $this->clientId; + } + $frame = new Stomp_Frame("CONNECT", $headers); $this->_writeFrame($frame); $frame = $this->readFrame(); if ($frame instanceof Stomp_Frame && $frame->command == 'CONNECTED') { @@ -209,7 +209,7 @@ class Stomp } } } - + /** * Check if client session has ben established * @@ -229,7 +229,7 @@ class Stomp return $this->_sessionId; } /** - * Send a message to a destination in the messaging system + * Send a message to a destination in the messaging system * * @param string $destination Destination queue * @param string|Stomp_Frame $msg Message @@ -237,7 +237,7 @@ class Stomp * @param boolean $sync Perform request synchronously * @return boolean */ - public function send ($destination, $msg, $properties = null, $sync = null) + public function send ($destination, $msg, $properties = array(), $sync = null) { if ($msg instanceof Stomp_Frame) { $msg->headers['destination'] = $destination; @@ -319,10 +319,12 @@ class Stomp public function subscribe ($destination, $properties = null, $sync = null) { $headers = array('ack' => 'client'); - $headers['activemq.prefetchSize'] = $this->prefetchSize; - if ($this->clientId != null) { - $headers["activemq.subcriptionName"] = $this->clientId; - } + // FIXME: this seems to be activemq specific, but not hurting rabbitmq? + $headers['activemq.prefetchSize'] = $this->prefetchSize; + if ($this->clientId != null) { + // FIXME: this seems to be activemq specific, but not hurting rabbitmq? + $headers["activemq.subcriptionName"] = $this->clientId; + } if (isset($properties)) { foreach ($properties as $name => $value) { $headers[$name] = $value; @@ -424,7 +426,7 @@ class Stomp } /** * Acknowledge consumption of a message from a subscription - * Note: This operation is always asynchronous + * Note: This operation is always asynchronous * * @param string|Stomp_Frame $messageMessage ID * @param string $transactionId @@ -433,20 +435,26 @@ class Stomp */ public function ack ($message, $transactionId = null) { + // Handle the headers, + $headers = array(); + if ($message instanceof Stomp_Frame) { - $frame = new Stomp_Frame('ACK', $message->headers); - $this->_writeFrame($frame); - return true; + // Copy headers from the object + // FIXME: at least content-length can be wrong here (set to 3 sometimes). + $headers = $message->headers; } else { - $headers = array(); if (isset($transactionId)) { $headers['transaction'] = $transactionId; } $headers['message-id'] = $message; - $frame = new Stomp_Frame('ACK', $headers); - $this->_writeFrame($frame); - return true; } + // An ACK has no content + $headers['content-length'] = 0; + + // Create it and write it out + $frame = new Stomp_Frame('ACK', $headers); + $this->_writeFrame($frame); + return true; } /** * Graceful disconnect from the server @@ -454,11 +462,11 @@ class Stomp */ public function disconnect () { - $headers = array(); + $headers = array(); - if ($this->clientId != null) { - $headers["client-id"] = $this->clientId; - } + if ($this->clientId != null) { + $headers["client-id"] = $this->clientId; + } if (is_resource($this->_socket)) { $this->_writeFrame(new Stomp_Frame('DISCONNECT', $headers)); @@ -490,19 +498,19 @@ class Stomp $this->_writeFrame($stompFrame); } } - + /** * Set timeout to wait for content to read * * @param int $seconds_to_wait Seconds to wait for a frame * @param int $milliseconds Milliseconds to wait for a frame */ - public function setReadTimeout($seconds, $milliseconds = 0) + public function setReadTimeout($seconds, $milliseconds = 0) { $this->_read_timeout_seconds = $seconds; $this->_read_timeout_milliseconds = $milliseconds; } - + /** * Read responce frame from server * @@ -513,19 +521,29 @@ class Stomp if (!$this->hasFrameToRead()) { return false; } - + $rb = 1024; $data = ''; - do { - $read = fgets($this->_socket, $rb); - if ($read === false) { - $this->_reconnect(); - return $this->readFrame(); - } - $data .= $read; - $len = strlen($data); - } while (($len < 2 || ! ($data[$len - 2] == "\x00" && $data[$len - 1] == "\n"))); - + do { + $read = fread($this->_socket, $rb); + if ($read === false) { + $this->_reconnect(); + return $this->readFrame(); + } + $data .= $read; + $len = strlen($data); + + $continue = true; + // ActiveMq apparently add \n after 0 char + if($data[$len - 2] == "\x00" && $data[$len - 1] == "\n") { + $continue = false; + } + + // RabbitMq does not + if($data[$len - 1] == "\x00") { + $continue = false; + } + } while ( $continue ); list ($header, $body) = explode("\n\n", $data, 2); $header = explode("\n", $header); $headers = array(); @@ -546,7 +564,7 @@ class Stomp return $frame; } } - + /** * Check if there is a frame to read * @@ -557,7 +575,7 @@ class Stomp $read = array($this->_socket); $write = null; $except = null; - + $has_frame_to_read = stream_select($read, $write, $except, $this->_read_timeout_seconds, $this->_read_timeout_milliseconds); if ($has_frame_to_read === false) { @@ -565,18 +583,18 @@ class Stomp } else if ($has_frame_to_read > 0) { return true; } else { - return false; + return false; } } - + /** * Reconnects and renews subscriptions (if there were any) - * Call this method when you detect connection problems + * Call this method when you detect connection problems */ protected function _reconnect () { $subscriptions = $this->_subscriptions; - + $this->connect($this->_username, $this->_password); foreach ($subscriptions as $dest => $properties) { $this->subscribe($dest, $properties); diff --git a/extlib/Stomp/Frame.php b/extlib/Stomp/Frame.php index dc59c1cb7..9fd97b4f5 100644 --- a/extlib/Stomp/Frame.php +++ b/extlib/Stomp/Frame.php @@ -17,46 +17,46 @@ */ /* vim: set expandtab tabstop=3 shiftwidth=3: */ - -/** - * Stomp Frames are messages that are sent and received on a StompConnection. - * - * @package Stomp - * @author Hiram Chirino - * @author Dejan Bosanac - * @author Michael Caplan - * @version $Revision: 36 $ - */ -class Stomp_Frame -{ - public $command; - public $headers = array(); - public $body; - - /** - * Constructor - * - * @param string $command - * @param array $headers - * @param string $body - */ - public function __construct ($command = null, $headers = null, $body = null) - { - $this->_init($command, $headers, $body); - } - - protected function _init ($command = null, $headers = null, $body = null) - { - $this->command = $command; - if ($headers != null) { - $this->headers = $headers; - } - $this->body = $body; - - if ($this->command == 'ERROR') { - require_once 'Stomp/Exception.php'; - throw new Stomp_Exception($this->headers['message'], 0, $this->body); - } + +/** + * Stomp Frames are messages that are sent and received on a StompConnection. + * + * @package Stomp + * @author Hiram Chirino + * @author Dejan Bosanac + * @author Michael Caplan + * @version $Revision: 36 $ + */ +class Stomp_Frame +{ + public $command; + public $headers = array(); + public $body; + + /** + * Constructor + * + * @param string $command + * @param array $headers + * @param string $body + */ + public function __construct ($command = null, $headers = null, $body = null) + { + $this->_init($command, $headers, $body); + } + + protected function _init ($command = null, $headers = null, $body = null) + { + $this->command = $command; + if ($headers != null) { + $this->headers = $headers; + } + $this->body = $body; + + if ($this->command == 'ERROR') { + require_once 'Stomp/Exception.php'; + throw new Stomp_Exception($this->headers['message'], 0, $this->body); + } } /** @@ -74,7 +74,8 @@ class Stomp_Frame $data .= "\n"; $data .= $this->body; - return $data .= "\x00\n"; - } -} + $data .= "\x00\n"; // Should there really be a linefeed here? + return $data; + } +} ?> \ No newline at end of file diff --git a/extlib/Stomp/Message.php b/extlib/Stomp/Message.php index 6bcad3efd..055662133 100644 --- a/extlib/Stomp/Message.php +++ b/extlib/Stomp/Message.php @@ -29,8 +29,12 @@ require_once 'Stomp/Frame.php'; */ class Stomp_Message extends Stomp_Frame { - public function __construct ($body, $headers = null) + public function __construct ($body, $headers = array()) { + if(!isset($headers['content-length'])) { + // TODO: log this, to see if this is correct + $headers['content-length'] = strlen($body); + } $this->_init("SEND", $headers, $body); } } diff --git a/lib/stompqueuemanager.php b/lib/stompqueuemanager.php index f059b42f0..5d8b2996b 100644 --- a/lib/stompqueuemanager.php +++ b/lib/stompqueuemanager.php @@ -141,10 +141,11 @@ class StompQueueManager $this->con->ack($frame); } else { if ($handler->handle_notice($notice)) { - $this->_log(LOG_INFO, 'Successfully handled notice '. $notice->id .' posted at ' . $frame->headers['created'] . ' in queue '. $queue); + $this->_log(LOG_INFO, 'Successfully handled notice '. $notice->id .' originally posted at ' . $notice->created . ' in queue '. $queue); + $this->con->ack($frame); } else { - $this->_log(LOG_WARNING, 'Failed handling notice '. $notice->id .' posted at ' . $frame->headers['created'] . ' in queue '. $queue); + $this->_log(LOG_WARNING, 'Failed handling notice '. $notice->id .' originally posted at ' . $notice->created . ' in queue '. $queue); // FIXME we probably shouldn't have to do // this kind of queue management ourselves $this->con->ack($frame); -- cgit v1.2.3-54-g00ecf From fe9473ac7810d317e001a0fec19cbacaafc0c909 Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Tue, 1 Sep 2009 20:04:36 -0300 Subject: Check that 'dl' function is available and usable before trying to call it with error suppression; if it's disabled or unavailable we end up with mysterious failures during installation or loading of libraries. Fixed for StatusNet installer as well as some external libraries that should be fixed upstream if they haven't already been: * PEAR * Auth/OpenID * Auth/Yadis --- extlib/Auth/OpenID/BigMath.php | 2 +- extlib/Auth/Yadis/XML.php | 2 +- extlib/PEAR.php | 2 +- install.php | 17 ++++++++++++----- 4 files changed, 15 insertions(+), 8 deletions(-) (limited to 'extlib') diff --git a/extlib/Auth/OpenID/BigMath.php b/extlib/Auth/OpenID/BigMath.php index 45104947d..b5fc627a0 100644 --- a/extlib/Auth/OpenID/BigMath.php +++ b/extlib/Auth/OpenID/BigMath.php @@ -376,7 +376,7 @@ function Auth_OpenID_detectMathLibrary($exts) // Try to load dynamic modules. if (!$loaded) { foreach ($extension['modules'] as $module) { - if (@dl($module . "." . PHP_SHLIB_SUFFIX)) { + if (function_exists('dl') && ini_get('enable_dl') && !ini_get('safe_mode') && @dl($module . "." . PHP_SHLIB_SUFFIX)) { $loaded = true; break; } diff --git a/extlib/Auth/Yadis/XML.php b/extlib/Auth/Yadis/XML.php index 4854f12bb..7232d6cbd 100644 --- a/extlib/Auth/Yadis/XML.php +++ b/extlib/Auth/Yadis/XML.php @@ -349,7 +349,7 @@ function &Auth_Yadis_getXMLParser() foreach ($extensions as $name => $params) { if (!extension_loaded($name)) { foreach ($params['libname'] as $libname) { - if (@dl($libname)) { + if (function_exists('dl') && ini_get('enable_dl') && !ini_get('safe_mode') && @dl($libname)) { $classname = $params['classname']; } } diff --git a/extlib/PEAR.php b/extlib/PEAR.php index 4c24c6006..fcefa964a 100644 --- a/extlib/PEAR.php +++ b/extlib/PEAR.php @@ -746,7 +746,7 @@ class PEAR { if (!extension_loaded($ext)) { // if either returns true dl() will produce a FATAL error, stop that - if ((ini_get('enable_dl') != 1) || (ini_get('safe_mode') == 1)) { + if ((ini_get('enable_dl') != 1) || (ini_get('safe_mode') == 1) || !function_exists('dl')) { return false; } if (OS_WINDOWS) { diff --git a/install.php b/install.php index c49043e5c..39984aa08 100644 --- a/install.php +++ b/install.php @@ -267,12 +267,19 @@ function checkPrereqs() function checkExtension($name) { - if (!extension_loaded($name)) { - if (!@dl($name.'.so')) { - return false; - } + if (extension_loaded($name)) { + return true; + } elseif (function_exists('dl') && ini_get('enable_dl') && !ini_get('safe_mode')) { + // dl will throw a fatal error if it's disabled or we're in safe mode. + // More fun, it may not even exist under some SAPIs in 5.3.0 or later... + $soname = $name . '.' . PHP_SHLIB_SUFFIX; + if (PHP_SHLIB_SUFFIX == 'dll') { + $soname = "php_" . $soname; + } + return @dl($soname); + } else { + return false; } - return true; } function showLibs() -- cgit v1.2.3-54-g00ecf From 48565a2cdc9df329dee1ad327a1c632dd8f1d4c3 Mon Sep 17 00:00:00 2001 From: Evan Prodromou Date: Tue, 15 Sep 2009 17:08:27 -0400 Subject: Revert "Several fixes to make RabbitMQ a player." This reverts commit c04987018cd6c845c6da7a92d9857d8c651f7022. --- extlib/Stomp.php | 124 ++++++++++++++++++++-------------------------- extlib/Stomp/Frame.php | 87 ++++++++++++++++---------------- extlib/Stomp/Message.php | 6 +-- lib/stompqueuemanager.php | 5 +- 4 files changed, 99 insertions(+), 123 deletions(-) (limited to 'extlib') diff --git a/extlib/Stomp.php b/extlib/Stomp.php index c9e90629c..abd9cba62 100644 --- a/extlib/Stomp.php +++ b/extlib/Stomp.php @@ -26,7 +26,7 @@ require_once 'Stomp/Frame.php'; * * @package Stomp * @author Hiram Chirino - * @author Dejan Bosanac + * @author Dejan Bosanac * @author Michael Caplan * @version $Revision: 43 $ */ @@ -44,15 +44,15 @@ class Stomp * * @var int */ - public $prefetchSize = 1; - - /** + public $prefetchSize = 1; + + /** * Client id used for durable subscriptions * * @var string */ - public $clientId = null; - + public $clientId = null; + protected $_brokerUri = null; protected $_socket = null; protected $_hosts = array(); @@ -66,7 +66,7 @@ class Stomp protected $_sessionId; protected $_read_timeout_seconds = 60; protected $_read_timeout_milliseconds = 0; - + /** * Constructor * @@ -134,10 +134,10 @@ class Stomp require_once 'Stomp/Exception.php'; throw new Stomp_Exception("No broker defined"); } - + // force disconnect, if previous established connection exists $this->disconnect(); - + $i = $this->_currentHost; $att = 0; $connected = false; @@ -190,11 +190,11 @@ class Stomp if ($password != '') { $this->_password = $password; } - $headers = array('login' => $this->_username , 'passcode' => $this->_password); - if ($this->clientId != null) { - $headers["client-id"] = $this->clientId; - } - $frame = new Stomp_Frame("CONNECT", $headers); + $headers = array('login' => $this->_username , 'passcode' => $this->_password); + if ($this->clientId != null) { + $headers["client-id"] = $this->clientId; + } + $frame = new Stomp_Frame("CONNECT", $headers); $this->_writeFrame($frame); $frame = $this->readFrame(); if ($frame instanceof Stomp_Frame && $frame->command == 'CONNECTED') { @@ -209,7 +209,7 @@ class Stomp } } } - + /** * Check if client session has ben established * @@ -229,7 +229,7 @@ class Stomp return $this->_sessionId; } /** - * Send a message to a destination in the messaging system + * Send a message to a destination in the messaging system * * @param string $destination Destination queue * @param string|Stomp_Frame $msg Message @@ -237,7 +237,7 @@ class Stomp * @param boolean $sync Perform request synchronously * @return boolean */ - public function send ($destination, $msg, $properties = array(), $sync = null) + public function send ($destination, $msg, $properties = null, $sync = null) { if ($msg instanceof Stomp_Frame) { $msg->headers['destination'] = $destination; @@ -319,12 +319,10 @@ class Stomp public function subscribe ($destination, $properties = null, $sync = null) { $headers = array('ack' => 'client'); - // FIXME: this seems to be activemq specific, but not hurting rabbitmq? - $headers['activemq.prefetchSize'] = $this->prefetchSize; - if ($this->clientId != null) { - // FIXME: this seems to be activemq specific, but not hurting rabbitmq? - $headers["activemq.subcriptionName"] = $this->clientId; - } + $headers['activemq.prefetchSize'] = $this->prefetchSize; + if ($this->clientId != null) { + $headers["activemq.subcriptionName"] = $this->clientId; + } if (isset($properties)) { foreach ($properties as $name => $value) { $headers[$name] = $value; @@ -426,7 +424,7 @@ class Stomp } /** * Acknowledge consumption of a message from a subscription - * Note: This operation is always asynchronous + * Note: This operation is always asynchronous * * @param string|Stomp_Frame $messageMessage ID * @param string $transactionId @@ -435,26 +433,20 @@ class Stomp */ public function ack ($message, $transactionId = null) { - // Handle the headers, - $headers = array(); - if ($message instanceof Stomp_Frame) { - // Copy headers from the object - // FIXME: at least content-length can be wrong here (set to 3 sometimes). - $headers = $message->headers; + $frame = new Stomp_Frame('ACK', $message->headers); + $this->_writeFrame($frame); + return true; } else { + $headers = array(); if (isset($transactionId)) { $headers['transaction'] = $transactionId; } $headers['message-id'] = $message; + $frame = new Stomp_Frame('ACK', $headers); + $this->_writeFrame($frame); + return true; } - // An ACK has no content - $headers['content-length'] = 0; - - // Create it and write it out - $frame = new Stomp_Frame('ACK', $headers); - $this->_writeFrame($frame); - return true; } /** * Graceful disconnect from the server @@ -462,11 +454,11 @@ class Stomp */ public function disconnect () { - $headers = array(); + $headers = array(); - if ($this->clientId != null) { - $headers["client-id"] = $this->clientId; - } + if ($this->clientId != null) { + $headers["client-id"] = $this->clientId; + } if (is_resource($this->_socket)) { $this->_writeFrame(new Stomp_Frame('DISCONNECT', $headers)); @@ -498,19 +490,19 @@ class Stomp $this->_writeFrame($stompFrame); } } - + /** * Set timeout to wait for content to read * * @param int $seconds_to_wait Seconds to wait for a frame * @param int $milliseconds Milliseconds to wait for a frame */ - public function setReadTimeout($seconds, $milliseconds = 0) + public function setReadTimeout($seconds, $milliseconds = 0) { $this->_read_timeout_seconds = $seconds; $this->_read_timeout_milliseconds = $milliseconds; } - + /** * Read responce frame from server * @@ -521,29 +513,19 @@ class Stomp if (!$this->hasFrameToRead()) { return false; } - + $rb = 1024; $data = ''; - do { - $read = fread($this->_socket, $rb); - if ($read === false) { - $this->_reconnect(); - return $this->readFrame(); - } - $data .= $read; - $len = strlen($data); - - $continue = true; - // ActiveMq apparently add \n after 0 char - if($data[$len - 2] == "\x00" && $data[$len - 1] == "\n") { - $continue = false; - } - - // RabbitMq does not - if($data[$len - 1] == "\x00") { - $continue = false; - } - } while ( $continue ); + do { + $read = fgets($this->_socket, $rb); + if ($read === false) { + $this->_reconnect(); + return $this->readFrame(); + } + $data .= $read; + $len = strlen($data); + } while (($len < 2 || ! ($data[$len - 2] == "\x00" && $data[$len - 1] == "\n"))); + list ($header, $body) = explode("\n\n", $data, 2); $header = explode("\n", $header); $headers = array(); @@ -564,7 +546,7 @@ class Stomp return $frame; } } - + /** * Check if there is a frame to read * @@ -575,7 +557,7 @@ class Stomp $read = array($this->_socket); $write = null; $except = null; - + $has_frame_to_read = stream_select($read, $write, $except, $this->_read_timeout_seconds, $this->_read_timeout_milliseconds); if ($has_frame_to_read === false) { @@ -583,18 +565,18 @@ class Stomp } else if ($has_frame_to_read > 0) { return true; } else { - return false; + return false; } } - + /** * Reconnects and renews subscriptions (if there were any) - * Call this method when you detect connection problems + * Call this method when you detect connection problems */ protected function _reconnect () { $subscriptions = $this->_subscriptions; - + $this->connect($this->_username, $this->_password); foreach ($subscriptions as $dest => $properties) { $this->subscribe($dest, $properties); diff --git a/extlib/Stomp/Frame.php b/extlib/Stomp/Frame.php index 9fd97b4f5..dc59c1cb7 100644 --- a/extlib/Stomp/Frame.php +++ b/extlib/Stomp/Frame.php @@ -17,46 +17,46 @@ */ /* vim: set expandtab tabstop=3 shiftwidth=3: */ - -/** - * Stomp Frames are messages that are sent and received on a StompConnection. - * - * @package Stomp - * @author Hiram Chirino - * @author Dejan Bosanac - * @author Michael Caplan - * @version $Revision: 36 $ - */ -class Stomp_Frame -{ - public $command; - public $headers = array(); - public $body; - - /** - * Constructor - * - * @param string $command - * @param array $headers - * @param string $body - */ - public function __construct ($command = null, $headers = null, $body = null) - { - $this->_init($command, $headers, $body); - } - - protected function _init ($command = null, $headers = null, $body = null) - { - $this->command = $command; - if ($headers != null) { - $this->headers = $headers; - } - $this->body = $body; - - if ($this->command == 'ERROR') { - require_once 'Stomp/Exception.php'; - throw new Stomp_Exception($this->headers['message'], 0, $this->body); - } + +/** + * Stomp Frames are messages that are sent and received on a StompConnection. + * + * @package Stomp + * @author Hiram Chirino + * @author Dejan Bosanac + * @author Michael Caplan + * @version $Revision: 36 $ + */ +class Stomp_Frame +{ + public $command; + public $headers = array(); + public $body; + + /** + * Constructor + * + * @param string $command + * @param array $headers + * @param string $body + */ + public function __construct ($command = null, $headers = null, $body = null) + { + $this->_init($command, $headers, $body); + } + + protected function _init ($command = null, $headers = null, $body = null) + { + $this->command = $command; + if ($headers != null) { + $this->headers = $headers; + } + $this->body = $body; + + if ($this->command == 'ERROR') { + require_once 'Stomp/Exception.php'; + throw new Stomp_Exception($this->headers['message'], 0, $this->body); + } } /** @@ -74,8 +74,7 @@ class Stomp_Frame $data .= "\n"; $data .= $this->body; - $data .= "\x00\n"; // Should there really be a linefeed here? - return $data; - } -} + return $data .= "\x00\n"; + } +} ?> \ No newline at end of file diff --git a/extlib/Stomp/Message.php b/extlib/Stomp/Message.php index 055662133..6bcad3efd 100644 --- a/extlib/Stomp/Message.php +++ b/extlib/Stomp/Message.php @@ -29,12 +29,8 @@ require_once 'Stomp/Frame.php'; */ class Stomp_Message extends Stomp_Frame { - public function __construct ($body, $headers = array()) + public function __construct ($body, $headers = null) { - if(!isset($headers['content-length'])) { - // TODO: log this, to see if this is correct - $headers['content-length'] = strlen($body); - } $this->_init("SEND", $headers, $body); } } diff --git a/lib/stompqueuemanager.php b/lib/stompqueuemanager.php index 5d8b2996b..f059b42f0 100644 --- a/lib/stompqueuemanager.php +++ b/lib/stompqueuemanager.php @@ -141,11 +141,10 @@ class StompQueueManager $this->con->ack($frame); } else { if ($handler->handle_notice($notice)) { - $this->_log(LOG_INFO, 'Successfully handled notice '. $notice->id .' originally posted at ' . $notice->created . ' in queue '. $queue); - + $this->_log(LOG_INFO, 'Successfully handled notice '. $notice->id .' posted at ' . $frame->headers['created'] . ' in queue '. $queue); $this->con->ack($frame); } else { - $this->_log(LOG_WARNING, 'Failed handling notice '. $notice->id .' originally posted at ' . $notice->created . ' in queue '. $queue); + $this->_log(LOG_WARNING, 'Failed handling notice '. $notice->id .' posted at ' . $frame->headers['created'] . ' in queue '. $queue); // FIXME we probably shouldn't have to do // this kind of queue management ourselves $this->con->ack($frame); -- cgit v1.2.3-54-g00ecf