Skip to content

Commit dc8125e

Browse files
committed
russian encoding #324
1 parent 5594b25 commit dc8125e

File tree

2 files changed

+74
-45
lines changed

2 files changed

+74
-45
lines changed

src/Http/Response.php

Lines changed: 1 addition & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -57,51 +57,7 @@ public function getHtmlContent()
5757
return $this->htmlContent = false;
5858
}
5959

60-
$errors = libxml_use_internal_errors(true);
61-
$entities = libxml_disable_entity_loader(true);
62-
63-
$this->htmlContent = new DOMDocument();
64-
65-
if (stripos($content, '<meta charset="utf') === false) {
66-
$encodings = [
67-
'ASCII' => 'ascii',
68-
'UTF-8' => 'utf-8',
69-
'SJIS' => 'shift_jis',
70-
'Windows-1251' => 'windows-1251',
71-
'Windows-1252' => 'windows-1252',
72-
'Windows-1254' => 'windows-1254',
73-
'ISO-8859-16' => 'iso-8859-16',
74-
'ISO-8859-15' => 'iso-8859-15',
75-
'ISO-8859-14' => 'iso-8859-14',
76-
'ISO-8859-13' => 'iso-8859-13',
77-
'ISO-8859-10' => 'iso-8859-10',
78-
'ISO-8859-9' => 'iso-8859-9',
79-
'ISO-8859-8' => 'iso-8859-8',
80-
'ISO-8859-7' => 'iso-8859-7',
81-
'ISO-8859-6' => 'iso-8859-6',
82-
'ISO-8859-5' => 'iso-8859-5',
83-
'ISO-8859-4' => 'iso-8859-4',
84-
'ISO-8859-3' => 'iso-8859-3',
85-
'ISO-8859-2' => 'iso-8859-2',
86-
'ISO-8859-1' => 'iso-8859-1',
87-
];
88-
89-
$detected = mb_detect_encoding($content, implode(',', array_keys($encodings)), true);
90-
91-
if ($detected && !empty($encodings[$detected])) {
92-
$content = mb_convert_encoding($content, 'HTML-ENTITIES', $detected);
93-
$content = preg_replace(
94-
'/<head[^>]*>/',
95-
'<head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset='.$encodings[$detected].'">',
96-
$content
97-
);
98-
}
99-
}
100-
101-
$this->htmlContent->loadHTML(trim($content));
102-
103-
libxml_use_internal_errors($errors);
104-
libxml_disable_entity_loader($entities);
60+
$this->htmlContent = Utils::parse($content);
10561
} catch (Exception $exception) {
10662
return $this->htmlContent = false;
10763
}

src/Utils.php

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,29 @@
1212
*/
1313
class Utils
1414
{
15+
/**
 * Candidate encodings used when normalizing html before parsing.
 *
 * The keys are the encoding names handed to mb_detect_encoding() and
 * mb_convert_encoding(); the values are the charset labels written into
 * the Content-Type <meta> tag injected by normalize().
 *
 * NOTE(review): mb_detect_encoding() may be sensitive to the order of the
 * candidates - confirm before reordering entries.
 */
const ENCODINGS = [
16+
'ASCII' => 'ascii',
17+
'UTF-8' => 'utf-8',
18+
'SJIS' => 'shift_jis',
19+
'Windows-1251' => 'windows-1251',
20+
'Windows-1252' => 'windows-1252',
21+
'Windows-1254' => 'windows-1254',
22+
'ISO-8859-1' => 'iso-8859-1',
23+
'ISO-8859-2' => 'iso-8859-2',
24+
'ISO-8859-3' => 'iso-8859-3',
25+
'ISO-8859-4' => 'iso-8859-4',
26+
'ISO-8859-5' => 'iso-8859-5',
27+
'ISO-8859-6' => 'iso-8859-6',
28+
'ISO-8859-7' => 'iso-8859-7',
29+
'ISO-8859-8' => 'iso-8859-8',
30+
'ISO-8859-9' => 'iso-8859-9',
31+
'ISO-8859-10' => 'iso-8859-10',
32+
'ISO-8859-13' => 'iso-8859-13',
33+
'ISO-8859-14' => 'iso-8859-14',
34+
'ISO-8859-15' => 'iso-8859-15',
35+
'ISO-8859-16' => 'iso-8859-16',
36+
];
37+
1538
/**
1639
* Creates a <video> element.
1740
*
@@ -240,4 +263,54 @@ public static function xpathQuery(DOMDocument $document, $query, $returnFirst =
240263
return $returnFirst ? $entries->item(0) : $entries;
241264
}
242265
}
266+
267+
/**
 * Parse a string as html code.
 *
 * The string is first passed through normalize() and trimmed. While the
 * document is loaded, libxml errors are collected internally instead of
 * being emitted; the previous libxml settings are restored afterwards,
 * even when normalizing or loading throws.
 *
 * @param string $html
 *
 * @return DOMDocument
 */
public static function parse($html)
{
    $errors = libxml_use_internal_errors(true);

    // libxml_disable_entity_loader() is deprecated as of PHP 8.0, where
    // libxml >= 2.9.0 is required and external entity loading is already
    // disabled by default. Calling it there only raises a deprecation.
    $toggleEntities = PHP_VERSION_ID < 80000;
    $entities = $toggleEntities ? libxml_disable_entity_loader(true) : null;

    try {
        $html = trim(self::normalize($html));

        $document = new DOMDocument();
        $document->loadHTML($html);

        return $document;
    } finally {
        // Always hand the global libxml state back to the caller.
        libxml_use_internal_errors($errors);

        if ($toggleEntities) {
            libxml_disable_entity_loader($entities);
        }
    }
}
289+
290+
/**
 * Normalize the encoding of a html code before parse.
 *
 * A document that already declares a UTF charset with
 * <meta charset="utf..."> is returned untouched. Otherwise the encoding is
 * detected among the keys of self::ENCODINGS, the text is converted to
 * HTML entities and a Content-Type <meta> naming the detected charset is
 * injected at the opening <head> tag. When no candidate encoding is
 * detected the string is returned unchanged.
 *
 * @param string $string
 *
 * @return string
 */
private static function normalize($string)
{
    // Skip the conversion only when the UTF declaration IS present.
    if (stripos($string, '<meta charset="utf') !== false) {
        return $string;
    }

    $detected = mb_detect_encoding($string, implode(',', array_keys(self::ENCODINGS)), true);

    if (!$detected || !isset(self::ENCODINGS[$detected])) {
        return $string;
    }

    $converted = mb_convert_encoding($string, 'HTML-ENTITIES', $detected);

    // The lookahead keeps <header ...> elements from being mistaken for
    // <head>, and the limit of 1 restricts the injection to the first tag.
    $withMeta = preg_replace(
        '/<head(?=[\s>])[^>]*>/i',
        '<head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset='.self::ENCODINGS[$detected].'">',
        $converted,
        1
    );

    // preg_replace() returns null on a PCRE error; keep the converted text
    // rather than handing null to the parser.
    return $withMeta === null ? $converted : $withMeta;
}
243316
}

0 commit comments

Comments
 (0)