root/branches/1.5/wp-includes/functions-formatting.php

Revision 3627, 32.9 kB (checked in by ryan, 3 years ago)

Security backports from masquerade and Mark Jaquith.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
Line 
1 <?php
2
/**
 * Replaces plain ASCII punctuation in a piece of HTML with typographic
 * numeric entities (dashes, ellipsis, curly quotes, primes, (tm), the
 * multiplication sign) and encodes stray ampersands.
 *
 * The input is split into alternating text and tag pieces. Tags are never
 * texturized. The text piece that immediately follows an opening <code>,
 * <pre>, <kbd>, <style> or <script> tag is passed through untouched as well.
 * The bare-ampersand replacement is applied to every piece, tags included.
 *
 * @param string $text HTML or plain text to convert.
 * @return string The converted text; the input unchanged if it could not be split.
 */
function wptexturize($text) {
    $output = '';

    // Contractions that start with an apostrophe: it must become a closing
    // (right) single quote rather than an opening one.
    // This is a hack, look at this more later. It works pretty well though.
    $cockney = array("'tain't","'twere","'twas","'tis","'twill","'til","'bout","'nuff","'round","'cause");
    $cockneyreplace = array("&#8217;tain&#8217;t","&#8217;twere","&#8217;twas","&#8217;tis","&#8217;twill","&#8217;til","&#8217;bout","&#8217;nuff","&#8217;round","&#8217;cause");

    // Capture tags and everything inside them
    $textarr = preg_split("/(<.*>)/Us", $text, -1, PREG_SPLIT_DELIM_CAPTURE);
    if (false === $textarr) {
        // PCRE failure: hand the text back rather than lose it.
        return $text;
    }

    $stop = count($textarr);
    $next = true; // false while the upcoming text piece must be left alone
    for ($i = 0; $i < $stop; $i++) {
        $curl = $textarr[$i];

        if (isset($curl[0]) && '<' != $curl[0] && $next) { // If it's not a tag
            // Dashes; the order matters (longest first).
            $curl = str_replace('---', '&#8212;', $curl);
            $curl = str_replace(' -- ', ' &#8212; ', $curl);
            $curl = str_replace('--', '&#8211;', $curl);
            // Undo the en dash inside punycode ("xn--") host names.
            $curl = str_replace('xn&#8211;', 'xn--', $curl);
            $curl = str_replace('...', '&#8230;', $curl);
            $curl = str_replace('``', '&#8220;', $curl);

            $curl = str_replace($cockney, $cockneyreplace, $curl);

            // Quotes, apostrophes and primes. Each rule relies on what the
            // previous ones have already consumed, so keep them in order.
            $curl = preg_replace("/'s/", '&#8217;s', $curl);
            $curl = preg_replace("/'(\d\d(?:&#8217;|')?s)/", "&#8217;$1", $curl);
            $curl = preg_replace('/(\s|\A|")\'/', '$1&#8216;', $curl);
            $curl = preg_replace('/(\d+)"/', '$1&#8243;', $curl);
            $curl = preg_replace("/(\d+)'/", '$1&#8242;', $curl);
            $curl = preg_replace("/(\S)'([^'\s])/", "$1&#8217;$2", $curl);
            $curl = preg_replace('/(\s|\A)"(?!\s)/', '$1&#8220;', $curl);
            $curl = preg_replace('/"(\s|\S|\Z)/', '&#8221;$1', $curl);
            $curl = preg_replace("/'([\s.]|\Z)/", '&#8217;$1', $curl);
            $curl = preg_replace("/ \(tm\)/i", ' &#8482;', $curl);
            $curl = str_replace("''", '&#8221;', $curl);

            // 4x3 -> 4&#215;3
            $curl = preg_replace('/(\d+)x(\d+)/', "$1&#215;$2", $curl);

        } elseif (strstr($curl, '<code') || strstr($curl, '<pre') || strstr($curl, '<kbd') || strstr($curl, '<style') || strstr($curl, '<script')) {
            // strstr is fast
            // Opening tag of a verbatim element: skip the next text piece.
            $next = false;
        } else {
            $next = true;
        }

        // Encode ampersands that do not start an entity.
        $curl = preg_replace('/&([^#])(?![a-z12]{1,8};)/', '&#038;$1', $curl);
        $output .= $curl;
    }
    return $output;
}
49
/**
 * Strips the markup that paragraph formatting adds so that preformatted
 * content keeps its own line structure: '<br />' and '</p>' are removed and
 * '<p>' is turned into a newline.
 *
 * @param string $text Content taken from inside a <pre> element.
 * @return string The content without the added markup.
 */
function clean_pre($text) {
    // str_replace() applies the pairs one after another, in array order.
    $search  = array('<br />', '<p>', '</p>');
    $replace = array('',       "\n",  '');

    return str_replace($search, $replace, $text);
}
56
/**
 * Turns text separated by blank lines into HTML paragraphs.
 *
 * Double line breaks become <p>...</p> blocks, block-level elements are
 * kept out of paragraphs, and (optionally) single newlines become <br />.
 * The contents of <pre> elements are restored afterwards with clean_pre().
 *
 * @param string $pee The text to format.
 * @param int    $br  Optional. When truthy, remaining single newlines are
 *                    converted to "<br />\n". Default 1.
 * @return string The formatted text.
 */
function wpautop($pee, $br = 1) {
    $pee = $pee . "\n"; // just to make things a little easier, pad the end
    $pee = preg_replace('|<br />\s*<br />|', "\n\n", $pee);
    // Space things out a little
    $pee = preg_replace('!(<(?:table|thead|tfoot|caption|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|form|blockquote|address|math|p|h[1-6])[^>]*>)!', "\n$1", $pee);
    $pee = preg_replace('!(</(?:table|thead|tfoot|caption|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|form|blockquote|address|math|p|h[1-6])>)!', "$1\n", $pee);
    $pee = str_replace(array("\r\n", "\r"), "\n", $pee); // cross-platform newlines
    $pee = preg_replace("/\n\n+/", "\n\n", $pee); // take care of duplicates
    $pee = preg_replace('/\n?(.+?)(?:\n\s*\n|\z)/s', "\t<p>$1</p>\n", $pee); // make paragraphs, including one at the end
    $pee = preg_replace('|<p>\s*?</p>|', '', $pee); // under certain strange conditions it could create a P of entirely whitespace
    $pee = preg_replace('!<p>\s*(</?(?:table|thead|tfoot|caption|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|hr|pre|select|form|blockquote|address|math|p|h[1-6])[^>]*>)\s*</p>!', "$1", $pee); // don't pee all over a tag
    $pee = preg_replace("|<p>(<li.+?)</p>|", "$1", $pee); // problem with nested lists
    // Move the paragraph inside the blockquote instead of around it.
    $pee = preg_replace('|<p><blockquote([^>]*)>|i', "<blockquote$1><p>", $pee);
    $pee = str_replace('</blockquote></p>', '</p></blockquote>', $pee);
    // Drop a <p> that directly precedes, or a </p> that directly follows, a block-level tag.
    $pee = preg_replace('!<p>\s*(</?(?:table|thead|tfoot|caption|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|hr|pre|select|form|blockquote|address|math|p|h[1-6])[^>]*>)!', "$1", $pee);
    $pee = preg_replace('!(</?(?:table|thead|tfoot|caption|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|form|blockquote|address|math|p|h[1-6])[^>]*>)\s*</p>!', "$1", $pee);
    if ($br) $pee = preg_replace('|(?<!<br />)\s*\n|', "<br />\n", $pee); // optionally make line breaks
    // Remove line breaks that ended up next to block-level tags.
    $pee = preg_replace('!(</?(?:table|thead|tfoot|caption|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|form|blockquote|address|math|p|h[1-6])[^>]*>)\s*<br />!', "$1", $pee);
    $pee = preg_replace('!<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)>)!', '$1', $pee);
    // Undo the formatting inside <pre>. A callback is used instead of the
    // /e modifier: /e was removed in PHP 7 and, where it did run, its implicit
    // addslashes() left backslashes in front of double quotes.
    $pee = preg_replace_callback('!(<pre.*?>)(.*?)</pre>!is', '_wpautop_pre_callback', $pee);

    return $pee;
}

/**
 * preg_replace_callback() helper for wpautop(): rebuilds one <pre> element
 * with its contents passed through clean_pre().
 *
 * @param array $matches [1] is the opening <pre ...> tag, [2] the contents.
 * @return string The rebuilt <pre> element.
 */
function _wpautop_pre_callback($matches) {
    return $matches[1] . clean_pre($matches[2]) . '</pre>';
}
80
81
/**
 * Checks whether a byte string is structured like UTF-8.
 *
 * Every byte must be either ASCII or a lead byte announcing one to five
 * continuation bytes (10bbbbbb), all of which must be present. Only the
 * byte structure is checked, nothing else.
 *
 * by bmorel at ssi dot fr
 *
 * @param string $Str The bytes to examine.
 * @return bool True when every sequence is well formed, false otherwise.
 */
function seems_utf8($Str) {
    $length = strlen($Str);

    for ($pos = 0; $pos < $length; $pos++) {
        $lead = ord($Str[$pos]);

        if ($lead < 0x80) { # 0bbbbbbb
            continue;
        }

        # Work out how many continuation bytes the lead byte announces.
        if (($lead & 0xE0) == 0xC0) { # 110bbbbb
            $expected = 1;
        } elseif (($lead & 0xF0) == 0xE0) { # 1110bbbb
            $expected = 2;
        } elseif (($lead & 0xF8) == 0xF0) { # 11110bbb
            $expected = 3;
        } elseif (($lead & 0xFC) == 0xF8) { # 111110bb
            $expected = 4;
        } elseif (($lead & 0xFE) == 0xFC) { # 1111110b
            $expected = 5;
        } else {
            return false; # Does not match any model
        }

        # Each announced byte must exist and look like 10bbbbbb.
        while ($expected > 0) {
            $pos++;
            if ($pos == $length) {
                return false;
            }
            if ((ord($Str[$pos]) & 0xC0) != 0x80) {
                return false;
            }
            $expected--;
        }
    }

    return true;
}
98
/**
 * Escapes text for HTML output. Like htmlspecialchars() except that it does
 * not double-encode existing HTML entities.
 *
 * Ampersands that do not start an entity become '&#038;', and '<' and '>'
 * become '&lt;' and '&gt;'.
 *
 * @param string $text   The text to escape.
 * @param mixed  $quotes Optional. When truthy, double and single quotes are
 *                       converted to '&quot;' and '&#039;' as well. Default 0.
 * @return string The escaped text.
 */
function wp_specialchars( $text, $quotes = 0 ) {
    // Only touch an ampersand that is not followed by '#' and does not
    // begin something shaped like a named entity.
    $text = preg_replace('/&([^#])(?![a-z12]{1,8};)/', '&#038;$1', $text);
    $text = str_replace('<', '&lt;', $text);
    $text = str_replace('>', '&gt;', $text);
    if ( $quotes ) {
        $text = str_replace('"', '&quot;', $text);
        $text = str_replace("'", '&#039;', $text);
    }
    return $text;
}
110
/**
 * Percent-encodes the multi-byte UTF-8 sequences in a string for use in a URI.
 *
 * Bytes below 128 are copied through unchanged. Every byte of a two-, three-
 * or four-byte sequence is written as '%' followed by its lowercase hex value.
 *
 * The input is not validated: the length of a sequence is taken from its
 * first byte alone, and a sequence still incomplete at the end of the string
 * is left out of the result.
 *
 * @param string $utf8_string The UTF-8 text to encode.
 * @return string The encoded text.
 */
function utf8_uri_encode( $utf8_string ) {
  $unicode = '';
  $values = array();   // bytes collected so far for the current sequence
  $num_octets = 1;     // total bytes expected in the current sequence
  $length = strlen( $utf8_string );

  for ( $i = 0; $i < $length; $i++ ) {

    $value = ord( $utf8_string[ $i ] );

    if ( $value < 128 ) {
      $unicode .= chr($value);
    } else {
      if ( count( $values ) == 0 ) {
        // Lead byte: below 224 starts a 2-byte sequence, below 240 a 3-byte
        // one, anything higher a 4-byte one.
        if ( $value < 224 ) {
          $num_octets = 2;
        } elseif ( $value < 240 ) {
          $num_octets = 3;
        } else {
          $num_octets = 4;
        }
      }

      $values[] = $value;

      if ( count( $values ) == $num_octets ) {
        // Sequence complete: emit every byte as %xx. These bytes are all
        // >= 128, so dechex() always yields two digits.
        for ( $j = 0; $j < $num_octets; $j++ ) {
          $unicode .= '%' . dechex( $values[ $j ] );
        }

        $values = array();
        $num_octets = 1;
      }
    }
  }

  return $unicode;
}
142
143 function remove_accents($string) {
144     if (seems_utf8($string)) {
145         $chars = array(
146         // Decompositions for Latin-1 Supplement
147         chr(195).chr(128) => 'A', chr(195).chr(129) => 'A',
148         chr(195).chr(130) => 'A', chr(195).chr(131) => 'A',
149         chr(195).chr(132) => 'A', chr(195).chr(133) => 'A',
150         chr(195).chr(135) => 'C', chr(195).chr(136) => 'E',
151         chr(195).chr(137) => 'E', chr(195).chr(138) => 'E',
152         chr(195).chr(139) => 'E', chr(195).chr(140) => 'I',
153         chr(195).chr(141) => 'I', chr(195).chr(142) => 'I',
154         chr(195).chr(143) => 'I', chr(195).chr(145) => 'N',
155         chr(195).chr(146) => 'O', chr(195).chr(147) => 'O',
156         chr(195).chr(148) => 'O', chr(195).chr(149) => 'O',
157         chr(195).chr(150) => 'O', chr(195).chr(153) => 'U',
158         chr(195).chr(154) => 'U', chr(195).chr(155) => 'U',
159         chr(195).chr(156) => 'U', chr(195).chr(157) => 'Y',
160         chr(195).chr(159) => 's', chr(195).chr(160) => 'a',
161         chr(195).chr(161) => 'a', chr(195).chr(162) => 'a',
162         chr(195).chr(163) => 'a', chr(195).chr(164) => 'a',
163         chr(195).chr(165) => 'a', chr(195).chr(167) => 'c',
164         chr(195).chr(168) => 'e', chr(195).chr(169) => 'e',
165         chr(195).chr(170) => 'e', chr(195).chr(171) => 'e',
166         chr(195).chr(172) => 'i', chr(195).chr(173) => 'i',
167         chr(195).chr(174) => 'i', chr(195).chr(175) => 'i',
168         chr(195).chr(177) => 'n', chr(195).chr(178) => 'o',
169         chr(195).chr(179) => 'o', chr(195).chr(180) => 'o',
170         chr(195).chr(181) => 'o', chr(195).chr(182) => 'o',
171         chr(195).chr(182) => 'o', chr(195).chr(185) => 'u',
172         chr(195).chr(186) => 'u', chr(195).chr(187) => 'u',
173         chr(195).chr(188) => 'u', chr(195).chr(189) => 'y',
174         chr(195).chr(191) => 'y',
175         // Decompositions for Latin Extended-A
176         chr(196).chr(128) => 'A', chr(196).chr(129) => 'a',
177         chr(196).chr(130) => 'A', chr(196).chr(131) => 'a',
178         chr(196).chr(132) => 'A', chr(196).chr(133) => 'a',
179         chr(196).chr(134) => 'C', chr(196).chr(134) => 'c',
180         chr(196).chr(136) => 'C', chr(196).chr(137) => 'c',
181         chr(196).chr(138) => 'C', chr(196).chr(139) => 'c',
182         chr(196).chr(140) => 'C', chr(196).chr(141) => 'c',
183         chr(196).chr(142) => 'D', chr(196).chr(143) => 'd',
184         chr(196).chr(144) => 'D', chr(196).chr(145) => 'd',
185         chr(196).chr(146) => 'E', chr(196).chr(147) => 'e',
186         chr(196).chr(148) => 'E', chr(196).chr(149) => 'e',
187         chr(196).chr(150) => 'E', chr(196).chr(151) => 'e',
188         chr(196).chr(152) => 'E', chr(196).chr(153) => 'e',
189         chr(196).chr(154) => 'E', chr(196).chr(155) => 'e',
190         chr(196).chr(156) => 'G', chr(196).chr(157) => 'g',
191         chr(196).chr(158) => 'G', chr(196).chr(159) => 'g',
192         chr(196).chr(160) => 'G', chr(196).chr(161) => 'g',
193         chr(196).chr(162) => 'G', chr(196).chr(163) => 'g',
194         chr(196).chr(164) => 'H', chr(196).chr(165) => 'h',
195         chr(196).chr(166) => 'H', chr(196).chr(167) => 'h',
196         chr(196).chr(168) => 'I', chr(196).chr(169) => 'i',
197         chr(196).chr(170) => 'I', chr(196).chr(171) => 'i',
198         chr(196).chr(172) => 'I', chr(196).chr(173) => 'i',
199         chr(196).chr(174) => 'I', chr(196).chr(175) => 'i',
200         chr(196).chr(176) => 'I', chr(196).chr(177) => 'i',
201         chr(196).chr(178) => 'IJ',chr(196).chr(179) => 'ij',
202         chr(196).chr(180) => 'J', chr(196).chr(181) => 'j',
203         chr(196).chr(182) => 'K', chr(196).chr(183) => 'k',
204         chr(196).chr(184) => 'k', chr(196).chr(185) => 'L',
205         chr(196).chr(186) => 'l', chr(196).chr(187) => 'L',
206         chr(196).chr(188) => 'l', chr(196).chr(189) => 'L',
207         chr(196).chr(190) => 'l', chr(196).chr(191) => 'L',
208         chr(197).chr(128) => 'l', chr(196).chr(129) => 'L',
209         chr(197).chr(130) => 'l', chr(196).chr(131) => 'N',
210         chr(197).chr(132) => 'n', chr(196).chr(133) => 'N',
211         chr(197).chr(134) => 'n', chr(196).chr(135) => 'N',
212         chr(197).chr(136) => 'n', chr(196).chr(137) => 'N',
213         chr(197).chr(138) => 'n', chr(196).chr(139) => 'N',
214         chr(197).chr(140) => 'O', chr(196).chr(141) => 'o',
215         chr(197).chr(142) => 'O', chr(196).chr(143) => 'o',
216         chr(197).chr(144) => 'O', chr(196).chr(145) => 'o',
217         chr(197).chr(146) => 'OE',chr(197).chr(147) => 'oe',
218         chr(197).chr(148) => 'R',chr(197).chr(149) => 'r',
219         chr(197).chr(150) => 'R',chr(197).chr(151) => 'r',
220         chr(197).chr(152) => 'R',chr(197).chr(153) => 'r',
221         chr(197).chr(154) => 'S',chr(197).chr(155) => 's',
222         chr(197).chr(156) => 'S',chr(197).chr(157) => 's',
223         chr(197).chr(158) => 'S',chr(197).chr(159) => 's',
224         chr(197).chr(160) => 'S', chr(197).chr(161) => 's',
225         chr(197).chr(162) => 'T', chr(197).chr(163) => 't',
226         chr(197).chr(164) => 'T', chr(197).chr(165) => 't',
227         chr(197).chr(166) => 'T', chr(197).chr(167) => 't',
228         chr(197).chr(168) => 'U', chr(197).chr(169) => 'u',
229         chr(197).chr(170) => 'U', chr(197).chr(171) => 'u',
230         chr(197).chr(172) => 'U', chr(197).chr(173) => 'u',
231         chr(197).chr(174) => 'U', chr(197).chr(175) => 'u',
232         chr(197).chr(176) => 'U', chr(197).chr(177) => 'u',
233         chr(197).chr(178) => 'U', chr(197).chr(179) => 'u',
234