@@ -61,20 +61,40 @@ public function getHtmlContent()
6161 $ entities = libxml_disable_entity_loader (true );
6262
6363 $ this ->htmlContent = new DOMDocument ();
64-
65- if (mb_detect_encoding ($ content , 'UTF-8 ' , true ) === 'UTF-8 ' ) {
66- $ content = mb_convert_encoding ($ content , 'HTML-ENTITIES ' , 'UTF-8 ' );
67- $ content = preg_replace (
68- '/<head[^>]*>/ ' ,
69- '<head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8"> ' ,
70- $ content
71- );
72- } elseif (mb_detect_encoding ($ content , 'SJIS ' , true ) === 'SJIS ' ) {
73- $ content = mb_convert_encoding ($ content , 'HTML-ENTITIES ' , 'SJIS ' );
74- $ content = preg_replace ('/<head[^>]*>/ ' , '<head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=shift_jis"> ' , $ content );
75- } elseif (mb_detect_encoding ($ content , 'ISO-8859-1 ' , true ) === 'ISO-8859-1 ' ) {
76- $ content = mb_convert_encoding ($ content , 'HTML-ENTITIES ' , 'ISO-8859-1 ' );
77- $ content = preg_replace ('/<head[^>]*>/ ' , '<head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1"> ' , $ content );
64+ if (stripos ($ content , '<meta charset= ' ) === false ) {
65+ $ encodings = [
66+ 'ASCII ' => 'ascii ' ,
67+ 'UTF-8 ' => 'utf-8 ' ,
68+ 'SJIS ' => 'shift_jis ' ,
69+ 'Windows-1251 ' => 'windows-1251 ' ,
70+ 'Windows-1252 ' => 'windows-1252 ' ,
71+ 'Windows-1254 ' => 'windows-1254 ' ,
72+ 'ISO-8859-16 ' => 'iso-8859-16 ' ,
73+ 'ISO-8859-15 ' => 'iso-8859-15 ' ,
74+ 'ISO-8859-14 ' => 'iso-8859-14 ' ,
75+ 'ISO-8859-13 ' => 'iso-8859-13 ' ,
76+ 'ISO-8859-10 ' => 'iso-8859-10 ' ,
77+ 'ISO-8859-9 ' => 'iso-8859-9 ' ,
78+ 'ISO-8859-8 ' => 'iso-8859-8 ' ,
79+ 'ISO-8859-7 ' => 'iso-8859-7 ' ,
80+ 'ISO-8859-6 ' => 'iso-8859-6 ' ,
81+ 'ISO-8859-5 ' => 'iso-8859-5 ' ,
82+ 'ISO-8859-4 ' => 'iso-8859-4 ' ,
83+ 'ISO-8859-3 ' => 'iso-8859-3 ' ,
84+ 'ISO-8859-2 ' => 'iso-8859-2 ' ,
85+ 'ISO-8859-1 ' => 'iso-8859-1 ' ,
86+ ];
87+
88+ $ detected = mb_detect_encoding ($ content , implode (', ' , array_keys ($ encodings )), true );
89+
90+ if ($ detected && !empty ($ encodings [$ detected ])) {
91+ $ content = mb_convert_encoding ($ content , 'HTML-ENTITIES ' , $ detected );
92+ $ content = preg_replace (
93+ '/<head[^>]*>/ ' ,
94+ '<head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset= ' .$ encodings [$ detected ].'"> ' ,
95+ $ content
96+ );
97+ }
7898 }
7999
80100 $ this ->htmlContent ->loadHTML (trim ($ content ));
0 commit comments