1: <?php
2: 3: 4: 5: 6: 7: 8: 9: 10: 11: 12: 13: 14: 15: 16: 17: 18: 19: 20: 21: 22: 23: 24: 25: 26: 27: 28: 29: 30: 31: 32: 33: 34: 35: 36: 37: 38: 39: 40: 41: 42: 43:
44:
45:
46: 47: 48: 49: 50: 51: 52: 53:
/**
 * Decodes HTML character references ("&amp;", "&#38;", "&#x26;", ...) in a
 * byte string, replacing each recognised reference with its UTF-8 encoding.
 *
 * Typical use:
 *     $decoder = new SimplePie_Decode_HTML_Entities($text);
 *     $decoded = $decoder->parse();
 *
 * The decoder works in place on $data with a byte cursor ($position). While a
 * single reference is being examined, the bytes read so far (starting with the
 * "&") are mirrored in $consumed, so that
 * $position - strlen($consumed) is always the offset of that "&".
 */
class SimplePie_Decode_HTML_Entities
{
    /**
     * Data being decoded; modified in place by parse().
     *
     * @var string
     */
    public $data = '';

    /**
     * Bytes consumed for the reference currently being examined,
     * including its leading "&". Reset by parse() after every reference.
     *
     * @var string
     */
    public $consumed = '';

    /**
     * Byte offset of the next character to read from $data.
     *
     * @var int
     */
    public $position = 0;

    /**
     * Create a decoder for the given input.
     *
     * @param string $data Input that may contain character references
     */
    public function __construct($data)
    {
        $this->data = $data;
    }

    /**
     * Decode every character reference found at or after the current position.
     *
     * @return string The decoded data
     */
    public function parse()
    {
        // The search result is kept in a local so that $position always
        // remains an integer offset (strpos() yields false when no "&" is left).
        while (($ampersand = strpos($this->data, '&', $this->position)) !== false)
        {
            $this->position = $ampersand;
            $this->consume();  // the "&" itself
            $this->entity();
            $this->consumed = '';
        }
        return $this->data;
    }

    /**
     * Read the character at the current position and advance past it.
     *
     * @return string|false The character read, or false at end of data
     */
    public function consume()
    {
        if (isset($this->data[$this->position]))
        {
            $this->consumed .= $this->data[$this->position];
            return $this->data[$this->position++];
        }
        else
        {
            return false;
        }
    }

    /**
     * Read the longest run of characters, starting at the current position,
     * that consists solely of characters listed in $chars.
     *
     * @param string $chars Characters allowed in the run
     * @return string|false The run that was read, or false if it is empty
     */
    public function consume_range($chars)
    {
        if ($len = strspn($this->data, $chars, $this->position))
        {
            $data = substr($this->data, $this->position, $len);
            $this->consumed .= $data;
            $this->position += $len;
            return $data;
        }
        else
        {
            return false;
        }
    }

    /**
     * Undo the most recent successful consume() of a single character.
     *
     * Must only be called after a consume() that actually returned a
     * character; otherwise cursor and $consumed are rewound one byte too far.
     */
    public function unconsume()
    {
        $this->consumed = substr($this->consumed, 0, -1);
        $this->position--;
    }

    /**
     * Examine the text following an already-consumed "&" and, if it forms a
     * character reference, replace the reference in $data with its UTF-8
     * encoding. When it does not, $data is left untouched.
     */
    public function entity()
    {
        $next = $this->consume();

        // End of data: a trailing "&" is plain text.
        if ($next === false)
        {
            return;
        }

        switch ($next)
        {
            // Whitespace or "<" directly after "&": not a reference.
            case "\x09":
            case "\x0A":
            case "\x0B":
            case "\x0C":
            case "\x20":
            case "\x3C":
                break;

            // "&&": the first "&" is plain text, but the second one may start
            // a real reference, so hand it back for the next parse() iteration.
            case "\x26":
                $this->unconsume();
                break;

            // "&#...": numeric reference.
            case "\x23":
                $this->numeric_entity();
                break;

            default:
                $this->named_entity();
                break;
        }
    }

    /**
     * Handle a numeric reference; "&#" has already been consumed.
     *
     * Code points listed in the Windows-1252 table below are replaced by the
     * table value; all others are converted via
     * SimplePie_Misc::codepoint_to_utf8(). A terminating ";" is optional.
     */
    private function numeric_entity()
    {
        static $windows_1252_specials = array(
            0x0D => "\x0A",
            0x80 => "\xE2\x82\xAC",
            0x81 => "\xEF\xBF\xBD",
            0x82 => "\xE2\x80\x9A",
            0x83 => "\xC6\x92",
            0x84 => "\xE2\x80\x9E",
            0x85 => "\xE2\x80\xA6",
            0x86 => "\xE2\x80\xA0",
            0x87 => "\xE2\x80\xA1",
            0x88 => "\xCB\x86",
            0x89 => "\xE2\x80\xB0",
            0x8A => "\xC5\xA0",
            0x8B => "\xE2\x80\xB9",
            0x8C => "\xC5\x92",
            0x8D => "\xEF\xBF\xBD",
            0x8E => "\xC5\xBD",
            0x8F => "\xEF\xBF\xBD",
            0x90 => "\xEF\xBF\xBD",
            0x91 => "\xE2\x80\x98",
            0x92 => "\xE2\x80\x99",
            0x93 => "\xE2\x80\x9C",
            0x94 => "\xE2\x80\x9D",
            0x95 => "\xE2\x80\xA2",
            0x96 => "\xE2\x80\x93",
            0x97 => "\xE2\x80\x94",
            0x98 => "\xCB\x9C",
            0x99 => "\xE2\x84\xA2",
            0x9A => "\xC5\xA1",
            0x9B => "\xE2\x80\xBA",
            0x9C => "\xC5\x93",
            0x9D => "\xEF\xBF\xBD",
            0x9E => "\xC5\xBE",
            0x9F => "\xC5\xB8"
        );

        // An "x"/"X" selects hexadecimal; anything else belongs to the
        // (decimal) digits or to the following text and is handed back.
        $marker = $this->consume();
        if ($marker === "\x78" || $marker === "\x58")
        {
            $range = '0123456789ABCDEFabcdef';
            $hex = true;
        }
        else
        {
            $range = '0123456789';
            $hex = false;
            // Only hand back a character that was really read; at end of
            // data there is nothing to unconsume.
            if ($marker !== false)
            {
                $this->unconsume();
            }
        }

        // NOTE: the result is tested for truthiness, so the lone digit
        // string "0" ("&#0;") counts as "no digits" and is left undecoded.
        $digits = $this->consume_range($range);
        if (!$digits)
        {
            return;
        }

        if ($hex)
        {
            $codepoint = hexdec($digits);
        }
        else
        {
            $codepoint = intval($digits);
        }

        if (isset($windows_1252_specials[$codepoint]))
        {
            $replacement = $windows_1252_specials[$codepoint];
        }
        else
        {
            $replacement = SimplePie_Misc::codepoint_to_utf8($codepoint);
        }

        // Swallow an optional terminating ";"; any other character is
        // ordinary text and goes back. false means end of data.
        if (!in_array($this->consume(), array(';', false), true))
        {
            $this->unconsume();
        }

        $consumed_length = strlen($this->consumed);
        $start = $this->position - $consumed_length;
        $this->data = substr_replace($this->data, $replacement, $start, $consumed_length);
        // Continue right after the replacement so its bytes are never re-scanned.
        $this->position = $start + strlen($replacement);
    }

    /**
     * Handle a named reference; "&" and the first character of the name
     * have already been consumed.
     *
     * Up to nine further characters are read and the longest prefix that is
     * a key of the entity table wins. If nothing matches, the cursor is put
     * back directly after the "&" so that a reference starting inside the
     * look-ahead window (e.g. "AT&T &amp; more") is still decoded.
     */
    private function named_entity()
    {
        static $entities = array(
            'Aacute' => "\xC3\x81",
            'aacute' => "\xC3\xA1",
            'Aacute;' => "\xC3\x81",
            'aacute;' => "\xC3\xA1",
            'Acirc' => "\xC3\x82",
            'acirc' => "\xC3\xA2",
            'Acirc;' => "\xC3\x82",
            'acirc;' => "\xC3\xA2",
            'acute' => "\xC2\xB4",
            'acute;' => "\xC2\xB4",
            'AElig' => "\xC3\x86",
            'aelig' => "\xC3\xA6",
            'AElig;' => "\xC3\x86",
            'aelig;' => "\xC3\xA6",
            'Agrave' => "\xC3\x80",
            'agrave' => "\xC3\xA0",
            'Agrave;' => "\xC3\x80",
            'agrave;' => "\xC3\xA0",
            'alefsym;' => "\xE2\x84\xB5",
            'Alpha;' => "\xCE\x91",
            'alpha;' => "\xCE\xB1",
            'AMP' => "\x26",
            'amp' => "\x26",
            'AMP;' => "\x26",
            'amp;' => "\x26",
            'and;' => "\xE2\x88\xA7",
            'ang;' => "\xE2\x88\xA0",
            'apos;' => "\x27",
            'Aring' => "\xC3\x85",
            'aring' => "\xC3\xA5",
            'Aring;' => "\xC3\x85",
            'aring;' => "\xC3\xA5",
            'asymp;' => "\xE2\x89\x88",
            'Atilde' => "\xC3\x83",
            'atilde' => "\xC3\xA3",
            'Atilde;' => "\xC3\x83",
            'atilde;' => "\xC3\xA3",
            'Auml' => "\xC3\x84",
            'auml' => "\xC3\xA4",
            'Auml;' => "\xC3\x84",
            'auml;' => "\xC3\xA4",
            'bdquo;' => "\xE2\x80\x9E",
            'Beta;' => "\xCE\x92",
            'beta;' => "\xCE\xB2",
            'brvbar' => "\xC2\xA6",
            'brvbar;' => "\xC2\xA6",
            'bull;' => "\xE2\x80\xA2",
            'cap;' => "\xE2\x88\xA9",
            'Ccedil' => "\xC3\x87",
            'ccedil' => "\xC3\xA7",
            'Ccedil;' => "\xC3\x87",
            'ccedil;' => "\xC3\xA7",
            'cedil' => "\xC2\xB8",
            'cedil;' => "\xC2\xB8",
            'cent' => "\xC2\xA2",
            'cent;' => "\xC2\xA2",
            'Chi;' => "\xCE\xA7",
            'chi;' => "\xCF\x87",
            'circ;' => "\xCB\x86",
            'clubs;' => "\xE2\x99\xA3",
            'cong;' => "\xE2\x89\x85",
            'COPY' => "\xC2\xA9",
            'copy' => "\xC2\xA9",
            'COPY;' => "\xC2\xA9",
            'copy;' => "\xC2\xA9",
            'crarr;' => "\xE2\x86\xB5",
            'cup;' => "\xE2\x88\xAA",
            'curren' => "\xC2\xA4",
            'curren;' => "\xC2\xA4",
            'Dagger;' => "\xE2\x80\xA1",
            'dagger;' => "\xE2\x80\xA0",
            'dArr;' => "\xE2\x87\x93",
            'darr;' => "\xE2\x86\x93",
            'deg' => "\xC2\xB0",
            'deg;' => "\xC2\xB0",
            'Delta;' => "\xCE\x94",
            'delta;' => "\xCE\xB4",
            'diams;' => "\xE2\x99\xA6",
            'divide' => "\xC3\xB7",
            'divide;' => "\xC3\xB7",
            'Eacute' => "\xC3\x89",
            'eacute' => "\xC3\xA9",
            'Eacute;' => "\xC3\x89",
            'eacute;' => "\xC3\xA9",
            'Ecirc' => "\xC3\x8A",
            'ecirc' => "\xC3\xAA",
            'Ecirc;' => "\xC3\x8A",
            'ecirc;' => "\xC3\xAA",
            'Egrave' => "\xC3\x88",
            'egrave' => "\xC3\xA8",
            'Egrave;' => "\xC3\x88",
            'egrave;' => "\xC3\xA8",
            'empty;' => "\xE2\x88\x85",
            'emsp;' => "\xE2\x80\x83",
            'ensp;' => "\xE2\x80\x82",
            'Epsilon;' => "\xCE\x95",
            'epsilon;' => "\xCE\xB5",
            'equiv;' => "\xE2\x89\xA1",
            'Eta;' => "\xCE\x97",
            'eta;' => "\xCE\xB7",
            'ETH' => "\xC3\x90",
            'eth' => "\xC3\xB0",
            'ETH;' => "\xC3\x90",
            'eth;' => "\xC3\xB0",
            'Euml' => "\xC3\x8B",
            'euml' => "\xC3\xAB",
            'Euml;' => "\xC3\x8B",
            'euml;' => "\xC3\xAB",
            'euro;' => "\xE2\x82\xAC",
            'exist;' => "\xE2\x88\x83",
            'fnof;' => "\xC6\x92",
            'forall;' => "\xE2\x88\x80",
            'frac12' => "\xC2\xBD",
            'frac12;' => "\xC2\xBD",
            'frac14' => "\xC2\xBC",
            'frac14;' => "\xC2\xBC",
            'frac34' => "\xC2\xBE",
            'frac34;' => "\xC2\xBE",
            'frasl;' => "\xE2\x81\x84",
            'Gamma;' => "\xCE\x93",
            'gamma;' => "\xCE\xB3",
            'ge;' => "\xE2\x89\xA5",
            'GT' => "\x3E",
            'gt' => "\x3E",
            'GT;' => "\x3E",
            'gt;' => "\x3E",
            'hArr;' => "\xE2\x87\x94",
            'harr;' => "\xE2\x86\x94",
            'hearts;' => "\xE2\x99\xA5",
            'hellip;' => "\xE2\x80\xA6",
            'Iacute' => "\xC3\x8D",
            'iacute' => "\xC3\xAD",
            'Iacute;' => "\xC3\x8D",
            'iacute;' => "\xC3\xAD",
            'Icirc' => "\xC3\x8E",
            'icirc' => "\xC3\xAE",
            'Icirc;' => "\xC3\x8E",
            'icirc;' => "\xC3\xAE",
            'iexcl' => "\xC2\xA1",
            'iexcl;' => "\xC2\xA1",
            'Igrave' => "\xC3\x8C",
            'igrave' => "\xC3\xAC",
            'Igrave;' => "\xC3\x8C",
            'igrave;' => "\xC3\xAC",
            'image;' => "\xE2\x84\x91",
            'infin;' => "\xE2\x88\x9E",
            'int;' => "\xE2\x88\xAB",
            'Iota;' => "\xCE\x99",
            'iota;' => "\xCE\xB9",
            'iquest' => "\xC2\xBF",
            'iquest;' => "\xC2\xBF",
            'isin;' => "\xE2\x88\x88",
            'Iuml' => "\xC3\x8F",
            'iuml' => "\xC3\xAF",
            'Iuml;' => "\xC3\x8F",
            'iuml;' => "\xC3\xAF",
            'Kappa;' => "\xCE\x9A",
            'kappa;' => "\xCE\xBA",
            'Lambda;' => "\xCE\x9B",
            'lambda;' => "\xCE\xBB",
            'lang;' => "\xE3\x80\x88",
            'laquo' => "\xC2\xAB",
            'laquo;' => "\xC2\xAB",
            'lArr;' => "\xE2\x87\x90",
            'larr;' => "\xE2\x86\x90",
            'lceil;' => "\xE2\x8C\x88",
            'ldquo;' => "\xE2\x80\x9C",
            'le;' => "\xE2\x89\xA4",
            'lfloor;' => "\xE2\x8C\x8A",
            'lowast;' => "\xE2\x88\x97",
            'loz;' => "\xE2\x97\x8A",
            'lrm;' => "\xE2\x80\x8E",
            'lsaquo;' => "\xE2\x80\xB9",
            'lsquo;' => "\xE2\x80\x98",
            'LT' => "\x3C",
            'lt' => "\x3C",
            'LT;' => "\x3C",
            'lt;' => "\x3C",
            'macr' => "\xC2\xAF",
            'macr;' => "\xC2\xAF",
            'mdash;' => "\xE2\x80\x94",
            'micro' => "\xC2\xB5",
            'micro;' => "\xC2\xB5",
            'middot' => "\xC2\xB7",
            'middot;' => "\xC2\xB7",
            'minus;' => "\xE2\x88\x92",
            'Mu;' => "\xCE\x9C",
            'mu;' => "\xCE\xBC",
            'nabla;' => "\xE2\x88\x87",
            'nbsp' => "\xC2\xA0",
            'nbsp;' => "\xC2\xA0",
            'ndash;' => "\xE2\x80\x93",
            'ne;' => "\xE2\x89\xA0",
            'ni;' => "\xE2\x88\x8B",
            'not' => "\xC2\xAC",
            'not;' => "\xC2\xAC",
            'notin;' => "\xE2\x88\x89",
            'nsub;' => "\xE2\x8A\x84",
            'Ntilde' => "\xC3\x91",
            'ntilde' => "\xC3\xB1",
            'Ntilde;' => "\xC3\x91",
            'ntilde;' => "\xC3\xB1",
            'Nu;' => "\xCE\x9D",
            'nu;' => "\xCE\xBD",
            'Oacute' => "\xC3\x93",
            'oacute' => "\xC3\xB3",
            'Oacute;' => "\xC3\x93",
            'oacute;' => "\xC3\xB3",
            'Ocirc' => "\xC3\x94",
            'ocirc' => "\xC3\xB4",
            'Ocirc;' => "\xC3\x94",
            'ocirc;' => "\xC3\xB4",
            'OElig;' => "\xC5\x92",
            'oelig;' => "\xC5\x93",
            'Ograve' => "\xC3\x92",
            'ograve' => "\xC3\xB2",
            'Ograve;' => "\xC3\x92",
            'ograve;' => "\xC3\xB2",
            'oline;' => "\xE2\x80\xBE",
            'Omega;' => "\xCE\xA9",
            'omega;' => "\xCF\x89",
            'Omicron;' => "\xCE\x9F",
            'omicron;' => "\xCE\xBF",
            'oplus;' => "\xE2\x8A\x95",
            'or;' => "\xE2\x88\xA8",
            'ordf' => "\xC2\xAA",
            'ordf;' => "\xC2\xAA",
            'ordm' => "\xC2\xBA",
            'ordm;' => "\xC2\xBA",
            'Oslash' => "\xC3\x98",
            'oslash' => "\xC3\xB8",
            'Oslash;' => "\xC3\x98",
            'oslash;' => "\xC3\xB8",
            'Otilde' => "\xC3\x95",
            'otilde' => "\xC3\xB5",
            'Otilde;' => "\xC3\x95",
            'otilde;' => "\xC3\xB5",
            'otimes;' => "\xE2\x8A\x97",
            'Ouml' => "\xC3\x96",
            'ouml' => "\xC3\xB6",
            'Ouml;' => "\xC3\x96",
            'ouml;' => "\xC3\xB6",
            'para' => "\xC2\xB6",
            'para;' => "\xC2\xB6",
            'part;' => "\xE2\x88\x82",
            'permil;' => "\xE2\x80\xB0",
            'perp;' => "\xE2\x8A\xA5",
            'Phi;' => "\xCE\xA6",
            'phi;' => "\xCF\x86",
            'Pi;' => "\xCE\xA0",
            'pi;' => "\xCF\x80",
            'piv;' => "\xCF\x96",
            'plusmn' => "\xC2\xB1",
            'plusmn;' => "\xC2\xB1",
            'pound' => "\xC2\xA3",
            'pound;' => "\xC2\xA3",
            'Prime;' => "\xE2\x80\xB3",
            'prime;' => "\xE2\x80\xB2",
            'prod;' => "\xE2\x88\x8F",
            'prop;' => "\xE2\x88\x9D",
            'Psi;' => "\xCE\xA8",
            'psi;' => "\xCF\x88",
            'QUOT' => "\x22",
            'quot' => "\x22",
            'QUOT;' => "\x22",
            'quot;' => "\x22",
            'radic;' => "\xE2\x88\x9A",
            'rang;' => "\xE3\x80\x89",
            'raquo' => "\xC2\xBB",
            'raquo;' => "\xC2\xBB",
            'rArr;' => "\xE2\x87\x92",
            'rarr;' => "\xE2\x86\x92",
            'rceil;' => "\xE2\x8C\x89",
            'rdquo;' => "\xE2\x80\x9D",
            'real;' => "\xE2\x84\x9C",
            'REG' => "\xC2\xAE",
            'reg' => "\xC2\xAE",
            'REG;' => "\xC2\xAE",
            'reg;' => "\xC2\xAE",
            'rfloor;' => "\xE2\x8C\x8B",
            'Rho;' => "\xCE\xA1",
            'rho;' => "\xCF\x81",
            'rlm;' => "\xE2\x80\x8F",
            'rsaquo;' => "\xE2\x80\xBA",
            'rsquo;' => "\xE2\x80\x99",
            'sbquo;' => "\xE2\x80\x9A",
            'Scaron;' => "\xC5\xA0",
            'scaron;' => "\xC5\xA1",
            'sdot;' => "\xE2\x8B\x85",
            'sect' => "\xC2\xA7",
            'sect;' => "\xC2\xA7",
            'shy' => "\xC2\xAD",
            'shy;' => "\xC2\xAD",
            'Sigma;' => "\xCE\xA3",
            'sigma;' => "\xCF\x83",
            'sigmaf;' => "\xCF\x82",
            'sim;' => "\xE2\x88\xBC",
            'spades;' => "\xE2\x99\xA0",
            'sub;' => "\xE2\x8A\x82",
            'sube;' => "\xE2\x8A\x86",
            'sum;' => "\xE2\x88\x91",
            'sup;' => "\xE2\x8A\x83",
            'sup1' => "\xC2\xB9",
            'sup1;' => "\xC2\xB9",
            'sup2' => "\xC2\xB2",
            'sup2;' => "\xC2\xB2",
            'sup3' => "\xC2\xB3",
            'sup3;' => "\xC2\xB3",
            'supe;' => "\xE2\x8A\x87",
            'szlig' => "\xC3\x9F",
            'szlig;' => "\xC3\x9F",
            'Tau;' => "\xCE\xA4",
            'tau;' => "\xCF\x84",
            'there4;' => "\xE2\x88\xB4",
            'Theta;' => "\xCE\x98",
            'theta;' => "\xCE\xB8",
            'thetasym;' => "\xCF\x91",
            'thinsp;' => "\xE2\x80\x89",
            'THORN' => "\xC3\x9E",
            'thorn' => "\xC3\xBE",
            'THORN;' => "\xC3\x9E",
            'thorn;' => "\xC3\xBE",
            'tilde;' => "\xCB\x9C",
            'times' => "\xC3\x97",
            'times;' => "\xC3\x97",
            'TRADE;' => "\xE2\x84\xA2",
            'trade;' => "\xE2\x84\xA2",
            'Uacute' => "\xC3\x9A",
            'uacute' => "\xC3\xBA",
            'Uacute;' => "\xC3\x9A",
            'uacute;' => "\xC3\xBA",
            'uArr;' => "\xE2\x87\x91",
            'uarr;' => "\xE2\x86\x91",
            'Ucirc' => "\xC3\x9B",
            'ucirc' => "\xC3\xBB",
            'Ucirc;' => "\xC3\x9B",
            'ucirc;' => "\xC3\xBB",
            'Ugrave' => "\xC3\x99",
            'ugrave' => "\xC3\xB9",
            'Ugrave;' => "\xC3\x99",
            'ugrave;' => "\xC3\xB9",
            'uml' => "\xC2\xA8",
            'uml;' => "\xC2\xA8",
            'upsih;' => "\xCF\x92",
            'Upsilon;' => "\xCE\xA5",
            'upsilon;' => "\xCF\x85",
            'Uuml' => "\xC3\x9C",
            'uuml' => "\xC3\xBC",
            'Uuml;' => "\xC3\x9C",
            'uuml;' => "\xC3\xBC",
            'weierp;' => "\xE2\x84\x98",
            'Xi;' => "\xCE\x9E",
            'xi;' => "\xCE\xBE",
            'Yacute' => "\xC3\x9D",
            'yacute' => "\xC3\xBD",
            'Yacute;' => "\xC3\x9D",
            'yacute;' => "\xC3\xBD",
            'yen' => "\xC2\xA5",
            'yen;' => "\xC2\xA5",
            'yuml' => "\xC3\xBF",
            'Yuml;' => "\xC5\xB8",
            'yuml;' => "\xC3\xBF",
            'Zeta;' => "\xCE\x96",
            'zeta;' => "\xCE\xB6",
            'zwj;' => "\xE2\x80\x8D",
            'zwnj;' => "\xE2\x80\x8C"
        );

        // Look ahead up to nine more characters (the longest key is ten
        // characters with the one already consumed), remembering the
        // longest candidate that is a known entity name.
        $match = null;
        for ($i = 0; $i < 9 && $this->consume() !== false; $i++)
        {
            $candidate = substr($this->consumed, 1);  // drop the leading "&"
            if (isset($entities[$candidate]))
            {
                $match = $candidate;
            }
        }

        // Offset of the "&" that introduced this (possible) reference.
        $start = $this->position - strlen($this->consumed);

        if ($match !== null)
        {
            // Replace "&" plus the matched name, then continue right after
            // the replacement so that e.g. "&amp;lt;" is not decoded twice.
            $this->data = substr_replace($this->data, $entities[$match], $start, strlen($match) + 1);
            $this->position = $start + strlen($entities[$match]);
        }
        else
        {
            // Not a reference: give back everything read during look-ahead
            // so that an "&" inside that window is not skipped.
            $this->position = $start + 1;
        }
    }
}
617:
618: