|
12 | 12 | */ |
13 | 13 | class Utils |
14 | 14 | { |
/**
 * Encodings considered when normalizing a document before parsing.
 *
 * Each key is an encoding name handed to mb_detect_encoding(); the matching
 * value is the charset label written into the Content-Type <META> tag that
 * is injected into the document head.
 *
 * @var array
 */
const ENCODINGS = [
    'ASCII'        => 'ascii',
    'UTF-8'        => 'utf-8',
    'SJIS'         => 'shift_jis',
    'Windows-1251' => 'windows-1251',
    'Windows-1252' => 'windows-1252',
    'Windows-1254' => 'windows-1254',
    'ISO-8859-1'   => 'iso-8859-1',
    'ISO-8859-2'   => 'iso-8859-2',
    'ISO-8859-3'   => 'iso-8859-3',
    'ISO-8859-4'   => 'iso-8859-4',
    'ISO-8859-5'   => 'iso-8859-5',
    'ISO-8859-6'   => 'iso-8859-6',
    'ISO-8859-7'   => 'iso-8859-7',
    'ISO-8859-8'   => 'iso-8859-8',
    'ISO-8859-9'   => 'iso-8859-9',
    'ISO-8859-10'  => 'iso-8859-10',
    'ISO-8859-13'  => 'iso-8859-13',
    'ISO-8859-14'  => 'iso-8859-14',
    'ISO-8859-15'  => 'iso-8859-15',
    'ISO-8859-16'  => 'iso-8859-16',
];
| 37 | + |
15 | 38 | /** |
16 | 39 | * Creates a <video> element. |
17 | 40 | * |
@@ -240,4 +263,54 @@ public static function xpathQuery(DOMDocument $document, $query, $returnFirst = |
240 | 263 | return $returnFirst ? $entries->item(0) : $entries; |
241 | 264 | } |
242 | 265 | } |
| 266 | + |
/**
 * Parse a string as HTML code.
 *
 * The markup is passed through self::normalize() and trimmed, then loaded
 * into a new DOMDocument. While loading, libxml errors caused by malformed
 * markup are buffered instead of being emitted as PHP warnings. The previous
 * libxml settings are restored before returning, even if loading throws.
 *
 * @param string $html
 *
 * @return DOMDocument An empty document when $html holds no markup
 */
public static function parse($html)
{
    $document = new DOMDocument();
    $html = trim(self::normalize($html));

    // DOMDocument::loadHTML() rejects an empty string (a warning before
    // PHP 8, a ValueError from PHP 8 on), so hand back the empty document.
    if ($html === '') {
        return $document;
    }

    $errors = libxml_use_internal_errors(true);

    // libxml_disable_entity_loader() is deprecated as of PHP 8.0, where
    // external entity loading is already disabled by default; only toggle
    // it on older runtimes to avoid the deprecation notice.
    $toggleEntityLoader = PHP_VERSION_ID < 80000;
    $entities = $toggleEntityLoader ? libxml_disable_entity_loader(true) : null;

    try {
        $document->loadHTML($html);
    } finally {
        // Always put the global libxml state back the way the caller had it.
        libxml_use_internal_errors($errors);

        if ($toggleEntityLoader) {
            libxml_disable_entity_loader($entities);
        }
    }

    return $document;
}
| 289 | + |
/**
 * Normalize the encoding of HTML code before parsing.
 *
 * Only markup containing the short charset declaration
 * (<meta charset="utf...) is touched; anything else is returned unchanged.
 * For such markup the encoding is detected among the keys of
 * self::ENCODINGS, every non-ASCII character is replaced by a numeric
 * character reference, and the opening <head> tag is replaced by <head>
 * followed by a <META HTTP-EQUIV="Content-Type"> declaration naming the
 * detected charset.
 *
 * @param string $string
 *
 * @return string
 */
private static function normalize($string)
{
    if (stripos($string, '<meta charset="utf') === false) {
        return $string;
    }

    $detected = mb_detect_encoding($string, implode(',', array_keys(self::ENCODINGS)), true);

    if (!$detected || !isset(self::ENCODINGS[$detected])) {
        return $string;
    }

    // Replaces mb_convert_encoding($string, 'HTML-ENTITIES', ...), which is
    // deprecated since PHP 8.2. The output uses numeric references where the
    // old call could emit named ones; both decode to the same characters.
    $string = mb_encode_numericentity($string, [0x80, 0x10FFFF, 0, 0x1FFFFF], $detected);

    // The lookahead keeps "<header ...>" from being mistaken for "<head>",
    // the "i" flag accepts "<HEAD>", and the limit of 1 ensures only the
    // first opening head tag is rewritten.
    $declared = preg_replace(
        '/<head(?=[\s>])[^>]*>/i',
        '<head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset='.self::ENCODINGS[$detected].'">',
        $string,
        1
    );

    // preg_replace() returns null on a PCRE error; fall back to the
    // entity-encoded markup rather than returning null to the parser.
    return $declared === null ? $string : $declared;
}
243 | 316 | } |
0 commit comments