root/branches/1.5/wp-includes/kses.php

Revision 2151, 18.8 kB (checked in by saxmatt, 4 years ago)

More filters and KSES cleanup.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
Line 
1 <?php
2 // Added wp_ prefix to avoid conflicts with existing kses users
3 # kses 0.2.1 - HTML/XHTML filter that only allows some elements and attributes
4 # Copyright (C) 2002, 2003  Ulf Harnhammar
5 # *** CONTACT INFORMATION ***
6 #
7 # E-mail:      metaur at users dot sourceforge dot net
8 # Web page:    http://sourceforge.net/projects/kses
9 # Paper mail:  (not at the moment)
10 #
11 # [kses strips evil scripts!]
# Unless something loaded earlier has already defined CUSTOM_TAGS, default it
# to false so that the stock whitelist below is installed.
if (!defined('CUSTOM_TAGS')) {
    define('CUSTOM_TAGS', false);
}

# Default whitelist used by wp_filter_kses(): element name => array of allowed
# attribute names, each mapped to an array of value checks (empty = no checks).
# You can override this in your my-hacks.php file by defining CUSTOM_TAGS as
# true and supplying your own $allowedtags. The commented-out entries are
# elements that are deliberately not allowed by default.
if (!CUSTOM_TAGS) {
    $allowedtags = array(
        'a'          => array('href' => array(), 'title' => array()),
        'abbr'       => array('title' => array()),
        'acronym'    => array('title' => array()),
        'b'          => array(),
        'blockquote' => array('cite' => array()),
//      'br'         => array(),
        'code'       => array(),
//      'del'        => array('datetime' => array()),
//      'dd'         => array(),
//      'dl'         => array(),
//      'dt'         => array(),
        'em'         => array(),
        'i'          => array(),
//      'ins'        => array('datetime' => array(), 'cite' => array()),
//      'li'         => array(),
//      'ol'         => array(),
//      'p'          => array(),
//      'q'          => array(),
        'strike'     => array(),
        'strong'     => array(),
//      'sub'        => array(),
//      'sup'        => array(),
//      'u'          => array(),
//      'ul'         => array(),
    );
}
function wp_kses($string, $allowed_html, $allowed_protocols =
               array('http', 'https', 'ftp', 'news', 'nntp', 'feed', 'gopher', 'mailto'))
###############################################################################
# Main entry point of the filter. Returns $string with only the HTML elements,
# attributes and attribute values permitted by $allowed_html, and with HTML
# entities normalized. URL protocols inside attribute values are restricted
# to $allowed_protocols.
#
# $string            - the text to filter; remove any slashes added by PHP's
#                      magic quotes before calling this function.
# $allowed_html      - array of element name => array of allowed attributes.
# $allowed_protocols - list of URL protocol names that may start a value.
###############################################################################
{
  # Clean-up passes, innermost call first: NUL removal, Netscape 4 JavaScript
  # entities, entity normalization, then the (user-extensible) hook.
  $cleaned = wp_kses_hook(
               wp_kses_normalize_entities(
                 wp_kses_js_entities(
                   wp_kses_no_null($string))));

  # The whitelist is matched case-insensitively, so lower-case its keys once
  # here before the tag-by-tag pass.
  $whitelist = wp_kses_array_lc($allowed_html);

  return wp_kses_split($cleaned, $whitelist, $allowed_protocols);
} # function wp_kses
63
64
function wp_kses_hook($string)
###############################################################################
# Extension point called by wp_kses() after entity normalization and before
# the string is split into tags. Add any custom kses hooks here; as shipped
# it hands the string back untouched.
###############################################################################
{
  # No hooks installed: pass the input straight through.
  return $string;
} # function wp_kses_hook
72
73
function wp_kses_version()
###############################################################################
# Reports which version of kses this file contains.
#
# Returns the version number as a string.
###############################################################################
{
  $version = '0.2.1';

  return $version;
} # function wp_kses_version
81
82
function wp_kses_split($string, $allowed_html, $allowed_protocols)
###############################################################################
# This function searches for HTML tags, no matter how malformed. It also
# matches stray ">" characters. Every match is handed to wp_kses_split2() and
# replaced by whatever that function returns; text between matches is left
# untouched.
#
# $string            - the text to scan.
# $allowed_html      - whitelist of elements/attributes (lower-cased keys).
# $allowed_protocols - list of allowed URL protocol names.
#
# The replacement used to be evaluated with the /e pattern modifier, which
# was deprecated in PHP 5.5 and removed in PHP 7.0. A callback does the same
# job without evaluating a string as code (closures require PHP 5.3+).
###############################################################################
{
  # EITHER: "<", things that aren't ">", then ">" or end of string
  # OR:     just a ">"
  $pattern = '%(<[^>]*(>|$)|>)%';

  return preg_replace_callback(
    $pattern,
    function ($match) use ($allowed_html, $allowed_protocols)
    {
      # wp_kses_split2() starts by turning \" back into " because that is how
      # the old /e quoting delivered double quotes. Hand the fragment over in
      # that same form so a backslash the author really typed in front of a
      # quote is not swallowed.
      $fragment = str_replace('"', '\\"', $match[1]);

      return wp_kses_split2($fragment, $allowed_html, $allowed_protocols);
    },
    $string);
} # function wp_kses_split
97
98
function wp_kses_split2($string, $allowed_html, $allowed_protocols)
###############################################################################
# This function does a lot of work. It rejects some very malformed things
# like <:::>. It returns an empty string, if the element isn't allowed (look
# ma, no strip_tags()!). Otherwise it splits the tag into an element and an
# attribute list and lets wp_kses_attr() rebuild the tag.
#
# $string            - one fragment matched by wp_kses_split(): either a tag
#                      (possibly unterminated) or a lone ">".
# $allowed_html      - whitelist of elements/attributes (lower-cased keys).
# $allowed_protocols - list of allowed URL protocol names.
#
# Returns the sanitized replacement for the fragment.
###############################################################################
{
  $string = wp_kses_stripslashes($string);

  if (substr($string, 0, 1) != '<')
    return '&gt;';
    # It matched a ">" character

  if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9]+)([^>]*)>?$%', $string, $matches))
    return '';
    # It's seriously malformed

  $slash = trim($matches[1]);
  $elem = $matches[2];
  $attrlist = $matches[3];

  # Check that the key exists before reading it: indexing the whitelist with
  # an element that is not listed raises an undefined-index notice/warning.
  $elemkey = strtolower($elem);
  if (!isset($allowed_html[$elemkey]) || !is_array($allowed_html[$elemkey]))
    return '';
    # They are using a not allowed HTML element

  return wp_kses_attr("$slash$elem", $attrlist, $allowed_html,
                   $allowed_protocols);
} # function wp_kses_split2
128
129
function wp_kses_attr($element, $attr, $allowed_html, $allowed_protocols)
###############################################################################
# This function removes all attributes, if none are allowed for this element.
# If some are allowed it calls wp_kses_hair() to split them further, and then
# it builds up new HTML code from the data that wp_kses_hair() returns. It
# also removes "<" and ">" characters, if there are any left. One more thing
# it does is to check if the tag has a closing XHTML slash, and if it does,
# it puts one in the returned code as well.
#
# $element           - element name as written; wp_kses_split2() prefixes it
#                      with "/" for closing tags.
# $attr              - the raw attribute list of the tag.
# $allowed_html      - whitelist of elements/attributes (lower-cased keys).
# $allowed_protocols - list of allowed URL protocol names.
#
# Returns the rebuilt tag as a string.
###############################################################################
{
# Is there a closing XHTML slash at the end of the attributes?

  $xhtml_slash = '';
  if (preg_match('%\s/\s*$%', $attr))
    $xhtml_slash = ' /';

# Are any attributes allowed at all for this element?
# For closing tags the key is "/name", which is normally not in the whitelist.
# Treat a missing entry as "no attributes allowed" instead of calling count()
# on an undefined value (a TypeError on PHP 8, a warning on PHP 7.2+).

  $elemkey = strtolower($element);
  $allowed_attrs = array();
  if (isset($allowed_html[$elemkey]) && is_array($allowed_html[$elemkey]))
    $allowed_attrs = $allowed_html[$elemkey];

  if (count($allowed_attrs) == 0)
    return "<$element$xhtml_slash>";

# Split it, go through the result, and save the allowed attributes for this
# element in $attr2

  $attr2 = '';

  foreach (wp_kses_hair($attr, $allowed_protocols) as $arreach)
  {
    $attrkey = strtolower($arreach['name']);

    # isset() avoids an undefined-index notice for attributes that are not in
    # the whitelist; the == '' test keeps rejecting empty whitelist entries.
    if (!isset($allowed_attrs[$attrkey]) || $allowed_attrs[$attrkey] == '')
      continue; # the attribute is not allowed

    $current = $allowed_attrs[$attrkey];

    if (!is_array($current))
    {
      # there are no checks
      $attr2 .= ' '.$arreach['whole'];
      continue;
    }

    # there are some checks; the first failing one rejects the attribute
    $ok = true;
    foreach ($current as $currkey => $currval)
      if (!wp_kses_check_attr_val($arreach['value'], $arreach['vless'],
                               $currkey, $currval))
      { $ok = false; break; }

    if ($ok)
      $attr2 .= ' '.$arreach['whole']; # it passed them
  } # foreach

# Remove any "<" or ">" characters

  $attr2 = preg_replace('/[<>]/', '', $attr2);

  return "<$element$attr2$xhtml_slash>";
} # function wp_kses_attr
191
192
function wp_kses_hair($attr, $allowed_protocols)
###############################################################################
# Parses an attribute list into an array of attribute records, trying to do
# the right thing even with weird input. Each record has the keys
#   'name'  - the attribute name as written,
#   'value' - the value after wp_kses_bad_protocol() ('' if valueless),
#   'whole' - the attribute re-serialized for output,
#   'vless' - 'y' for a valueless attribute, 'n' otherwise.
# Unquoted values are re-serialized with double quotes, to make it easier to
# produce HTML code that conforms to W3C's HTML specification.
#
# $attr              - the raw attribute list of one tag.
# $allowed_protocols - list of allowed URL protocol names.
###############################################################################
{
  $found = array();
  $name = '';

  # Parser state: 0 = expecting an attribute name,
  #               1 = expecting "=" or the whitespace ending a valueless attr,
  #               2 = expecting an attribute value.
  $state = 0;

  # The accepted spellings of a value, tried in this order, each with the
  # quote character used when the attribute is written back out.
  $value_forms = array(
    array('/^"([^"]*)"(\s+|$)/', '"'),     # "value"
    array("/^'([^']*)'(\s+|$)/", "'"),     # 'value'
    array("%^([^\s\"']+)(\s+|$)%", '"'),   # value (quotes get added)
  );

  while (strlen($attr) != 0)
  {
    $consumed = false; # did this pass recognise anything?

    if ($state == 0)
    {
      # attribute name, href for instance
      if (preg_match('/^([-a-zA-Z]+)/', $attr, $hit))
      {
        $name = $hit[1];
        $state = 1;
        $consumed = true;
        $attr = (string) substr($attr, strlen($hit[0]));
      }
    }
    elseif ($state == 1)
    {
      if (preg_match('/^\s*=\s*/', $attr, $hit))
      {
        # equals sign: a value follows
        $state = 2;
        $consumed = true;
        $attr = (string) substr($attr, strlen($hit[0]));
      }
      elseif (preg_match('/^\s+/', $attr, $hit))
      {
        # valueless ("selected")
        $found[] = array
                    ('name'  => $name,
                     'value' => '',
                     'whole' => $name,
                     'vless' => 'y');
        $state = 0;
        $consumed = true;
        $attr = (string) substr($attr, strlen($hit[0]));
      }
    }
    elseif ($state == 2)
    {
      # attribute value, a URL after href= for instance
      foreach ($value_forms as $form)
      {
        if (!preg_match($form[0], $attr, $hit))
          continue;

        $quote = $form[1];
        $clean = wp_kses_bad_protocol($hit[1], $allowed_protocols);

        $found[] = array
                    ('name'  => $name,
                     'value' => $clean,
                     'whole' => $name.'='.$quote.$clean.$quote,
                     'vless' => 'n');
        $state = 0;
        $consumed = true;
        $attr = (string) substr($attr, strlen($hit[0]));
        break;
      }
    }

    if (!$consumed)
    {
      # not well formed, remove the offending piece and try again
      $attr = wp_kses_html_error($attr);
      $state = 0;
    }
  } # while

  if ($state == 1)
  {
    # special case, for when the attribute list ends with a valueless
    # attribute like "selected"
    $found[] = array
                ('name'  => $name,
                 'value' => '',
                 'whole' => $name,
                 'vless' => 'y');
  }

  return $found;
} # function wp_kses_hair
316
317
function wp_kses_check_attr_val($value, $vless, $checkname, $checkvalue)
###############################################################################
# Runs one named check against an attribute value and returns true if the
# value passes. The implemented checks are "maxlen", "minlen", "maxval",
# "minval" and "valueless"; any other check name passes.
#
# $value      - the attribute value.
# $vless      - 'y' if the attribute was written without a value, else 'n'.
# $checkname  - name of the check (case-insensitive).
# $checkvalue - the limit or setting the check compares against.
###############################################################################
{
  # Shape required by the numeric checks: a non-negative integer of at most
  # six digits with at most six whitespace characters on either side.
  $int_shape = '/^\s{0,6}[0-9]{1,6}\s{0,6}$/';

  switch (strtolower($checkname))
  {
    case 'maxlen':
      # The value must not be longer than $checkvalue. This can be used to
      # avoid Buffer Overflows in WWW clients and various Internet servers.
      return !(strlen($value) > $checkvalue);

    case 'minlen':
      # The value must not be shorter than $checkvalue.
      return !(strlen($value) < $checkvalue);

    case 'maxval':
      # The value must have the integer shape above and must not be greater
      # than $checkvalue. This can be used to avoid Denial of Service attacks.
      return preg_match($int_shape, $value) && !($value > $checkvalue);

    case 'minval':
      # The value must have the integer shape above and must not be smaller
      # than $checkvalue.
      return preg_match($int_shape, $value) && !($value < $checkvalue);

    case 'valueless':
      # With "y"/"Y" the attribute must not have a value (<option selected>);
      # with "n"/"N" it must have one (<a href="blah">).
      return strtolower($checkvalue) == $vless;
  } # switch

  # Unknown check names do not reject anything.
  return true;
} # function wp_kses_check_attr_val
382
383
function wp_kses_bad_protocol($string, $allowed_protocols)
###############################################################################
# Removes all non-allowed protocols from the beginning of $string. The work
# is repeated until the string stops changing, so it won't be fooled by a
# string like "javascript:javascript:alert(57)".
#
# $string            - an attribute value.
# $allowed_protocols - list of allowed URL protocol names.
#
# Returns the cleaned value.
###############################################################################
{
  $string = wp_kses_no_null($string);

  # Keep stripping one leading protocol at a time until a pass leaves the
  # string as it was.
  do
  {
    $before = $string;
    $string = wp_kses_bad_protocol_once($string, $allowed_protocols);
  }
  while ($string != $before);

  return $string;
} # function wp_kses_bad_protocol
403
404
function wp_kses_no_null($string)
###############################################################################
# Strips NUL bytes from $string, and then strips the literal two-character
# sequence backslash-zero ("\0") as well.
#
# Returns the cleaned string.
###############################################################################
{
  # The patterns are applied in order: runs of real NUL bytes first, then
  # runs of the escaped spelling.
  $patterns = array('/\0+/', '/(\\\\0)+/');

  return preg_replace($patterns, '', $string);
} # function wp_kses_no_null
415
416
function wp_kses_stripslashes($string)
###############################################################################
# Turns every  \"  into a plain  "  and leaves all other backslashes alone.
# It's really weird, but the quoting from preg_replace(//e) seems to require
# this.
#
# Returns the unquoted string.
###############################################################################
{
  # A fixed two-character sequence, so a plain string replace is enough.
  return str_replace('\\"', '"', $string);
} # function wp_kses_stripslashes
426
427
function wp_kses_array_lc($inarray)
###############################################################################
# Returns a copy of a two-level array (element => attribute => checks) in
# which the keys of both levels are lower case. The innermost values are
# copied over unchanged.
###############################################################################
{
  $lowered = array();

  foreach ($inarray as $element => $attributes)
  {
    # Build the lower-cased attribute map for this element first ...
    $attrs_lc = array();
    foreach ($attributes as $attribute => $checks)
      $attrs_lc[strtolower($attribute)] = $checks;

    # ... then store it under the lower-cased element name. An element whose
    # name differs only in case from an earlier one replaces that entry.
    $lowered[strtolower($element)] = $attrs_lc;
  } # foreach $inarray

  return $lowered;
} # function wp_kses_array_lc
449
450
function wp_kses_js_entities($string)
###############################################################################
# Removes the HTML JavaScript entities ( &{...}; ) found in early versions of
# Netscape 4.
#
# Returns $string without them.
###############################################################################
{
  # "&", optional whitespace, "{", then everything up to the closing "}"
  # (with optional whitespace and ";") or up to the end of the string.
  $js_entity = '%&\s*\{[^}]*(\}\s*;?|$)%';

  return preg_replace($js_entity, '', $string);
} # function wp_kses_js_entities
459
460
function wp_kses_html_error($string)
###############################################################################
# Recovers from a parsing error in wp_kses_hair() by dropping the start of
# $string: everything up to and including the next run of whitespace. Quoted
# and apostrophed sections are skipped as a whole, even when unterminated.
#
# Returns what is left of $string.
###############################################################################
{
  $skip = '/^("[^"]*("|$)|\'[^\']*(\'|$)|\S)*\s*/';

  return preg_replace($skip, '', $string);
} # function wp_kses_html_error
470
471
function wp_kses_bad_protocol_once($string, $allowed_protocols)
###############################################################################
# This function searches for a URL protocol at the beginning of $string, while
# handling whitespace and HTML entities. When one is found, the protocol and
# its colon are replaced by whatever wp_kses_bad_protocol_once2() returns for
# it; otherwise $string is returned unchanged.
#
# $string            - an attribute value.
# $allowed_protocols - list of allowed URL protocol names.
#
# The replacement used to be evaluated with the /e pattern modifier, with the
# matched text interpolated into a double-quoted string. That modifier was
# removed in PHP 7.0, and it let text taken from the attribute value reach
# the PHP evaluator. A callback passes the text along as plain data instead
# (closures require PHP 5.3+).
###############################################################################
{
  # Leading entities/letters/digits/whitespace, then a colon that is either
  # literal or written as &#58; / &#x3A;, then optional whitespace.
  $pattern = '/^((&[^;]*;|[\sA-Za-z0-9])*)'.
             '(:|&#58;|&#[Xx]3[Aa];)\s*/';

  return preg_replace_callback(
    $pattern,
    function ($match) use ($allowed_protocols)
    {
      return wp_kses_bad_protocol_once2($match[1], $allowed_protocols);
    },
    $string);
} # function wp_kses_bad_protocol_once
483
484
function wp_kses_bad_protocol_once2($string, $allowed_protocols)
###############################################################################
# Decides what a candidate URL protocol is replaced with. The candidate is
# canonicalized (numeric entities decoded, whitespace and NULs removed, lower
# case) and compared with the whitelist.
#
# $string            - the text found in front of the colon.
# $allowed_protocols - list of allowed URL protocol names.
#
# Returns the canonical protocol followed by ":" if it is whitelisted, and
# an empty string otherwise.
###############################################################################
{
  $protocol = wp_kses_decode_entities($string);
  $protocol = preg_replace('/\s/', '', $protocol);
  $protocol = strtolower(wp_kses_no_null($protocol));

  # The first whitelist entry that matches settles it.
  foreach ($allowed_protocols as $candidate)
  {
    if (strtolower($candidate) == $protocol)
      return "$protocol:";
  }

  return '';
} # function wp_kses_bad_protocol_once2
509
510
function wp_kses_normalize_entities($string)
###############################################################################
# This function normalizes HTML entities. It will convert "AT&T" to the correct
# "AT&amp;T", "&#00058;" to "&#58;", "&#XYZZY;" to "&amp;#XYZZY;" and so on.
#
# Returns the normalized string.
#
# The decimal-entity pass used to rely on the /e pattern modifier, which was
# deprecated in PHP 5.5 and removed in PHP 7.0; it now uses a callback
# (closures require PHP 5.3+).
###############################################################################
{
# Disarm all entities by converting & to &amp;

  $string = str_replace('&', '&amp;', $string);

# Change back the allowed entities in our entity whitelist

  # Named entities: a letter followed by up to 19 letters or digits.
  $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]{0,19});/',
                         '&\\1;', $string);

  # Decimal entities: leading zeroes are dropped, and
  # wp_kses_normalize_entities2() decides whether the number is acceptable.
  $string = preg_replace_callback('/&amp;#0*([0-9]{1,5});/',
                                  function ($match)
                                  {
                                    return wp_kses_normalize_entities2($match[1]);
                                  },
                                  $string);

  # Hexadecimal entities: leading zeroes are dropped, two or four hex digits
  # are kept.
  $string = preg_replace('/&amp;#([Xx])0*(([0-9A-Fa-f]{2}){1,2});/',
                         '&#\\1\\2;', $string);

  return $string;
} # function wp_kses_normalize_entities
532
533
function wp_kses_normalize_entities2($i)
###############################################################################
# Helper for wp_kses_normalize_entities(): rebuilds a &#number; entity, but
# only accepts 16 bit values.
#
# $i - the entity's number, without leading zeroes.
#
# Returns "&#$i;" for numbers up to 65535 and the disarmed "&amp;#$i;" for
# anything larger.
###############################################################################
{
  if ($i > 65535)
    return "&amp;#$i;";

  return "&#$i;";
} # function wp_kses_normalize_entities2
542
543
function wp_kses_decode_entities($string)
###############################################################################
# This function decodes numeric HTML entities (&#65; and &#x41;). It doesn't
# do anything with other entities like &auml;, but we don't need them in the
# URL protocol whitelisting system anyway.
#
# Returns $string with every numeric entity replaced by chr() of its number.
#
# Both passes used to rely on the /e pattern modifier, which was deprecated
# in PHP 5.5 and removed in PHP 7.0; they now use callbacks (closures require
# PHP 5.3+).
###############################################################################
{
  # Decimal entities first ...
  $string = preg_replace_callback('/&#([0-9]+);/',
                                  function ($match)
                                  {
                                    return chr((int) $match[1]);
                                  },
                                  $string);

  # ... then hexadecimal ones, in the same order as before.
  $string = preg_replace_callback('/&#[Xx]([0-9A-Fa-f]+);/',
                                  function ($match)
                                  {
                                    return chr((int) hexdec($match[1]));
                                  },
                                  $string);

  return $string;
} # function wp_kses_decode_entities
557
# Filter callback: runs wp_kses() on $string with the global $allowedtags
# whitelist and the default protocol list, and returns the result.
function wp_filter_kses( $string ) {
    global $allowedtags;

    $filtered = wp_kses($string, $allowedtags);

    return $filtered;
}
562
563 ?>
Note: See TracBrowser for help on using the browser.