Improved HTML UTF-8 handling

This commit is contained in:
djmaze 2021-11-25 11:28:22 +01:00
parent 9218cc4785
commit bc986f323c
3 changed files with 26 additions and 86 deletions

View file

@ -91,27 +91,17 @@ abstract class HtmlUtils
$sResult = ''; $sResult = '';
if ($oElem instanceof \DOMDocument) if ($oElem instanceof \DOMDocument)
{ {
if (isset($oElem->documentElement)) $sResult = $oElem->saveHTML(isset($oElem->documentElement) ? $oElem->documentElement : null);
{
$sResult = $oElem->saveHTML($oElem->documentElement);
}
else
{
$sResult = $oElem->saveHTML();
}
} }
else if ($oElem) else if ($oDom)
{ {
if ($oDom) $sResult = $oDom->saveHTML($oElem);
{ }
$sResult = $oDom->saveHTML($oElem); else
} {
else $oTempDoc = self::createDOMDocument();
{ $oTempDoc->appendChild($oTempDoc->importNode($oElem->cloneNode(true), true));
$oTempDoc = self::createDOMDocument(); $sResult = $oTempDoc->saveHTML();
$oTempDoc->appendChild($oTempDoc->importNode($oElem->cloneNode(true), true));
$sResult = $oTempDoc->saveHTML();
}
} }
return \trim($sResult); return \trim($sResult);
@ -188,8 +178,8 @@ abstract class HtmlUtils
$sBodyAttrs = \preg_replace('/xmlns:[a-z]="[^"]*"/i', '', $sBodyAttrs); $sBodyAttrs = \preg_replace('/xmlns:[a-z]="[^"]*"/i', '', $sBodyAttrs);
$sBodyAttrs = \preg_replace('/xmlns:[a-z]=\'[^\']*\'/i', '', $sBodyAttrs); $sBodyAttrs = \preg_replace('/xmlns:[a-z]=\'[^\']*\'/i', '', $sBodyAttrs);
$sHtmlAttrs = trim($sHtmlAttrs); $sHtmlAttrs = \trim($sHtmlAttrs);
$sBodyAttrs = trim($sBodyAttrs); $sBodyAttrs = \trim($sBodyAttrs);
return $sHtml; return $sHtml;
} }
@ -1078,8 +1068,8 @@ abstract class HtmlUtils
$sText = \strtr($sText, array( $sText = \strtr($sText, array(
"\n" => "<br />", "\n" => "<br />",
"\t" => '&nbsp;&nbsp;&nbsp;', "\t" => "\xC2\xA0\xC2\xA0\xC2\xA0\xC2\xA0",
' ' => '&nbsp;&nbsp;' ' ' => "\xC2\xA0\xC2\xA0"
)); ));
return $sText; return $sText;
@ -1089,20 +1079,20 @@ abstract class HtmlUtils
{ {
$sText = \MailSo\Base\Utils::StripSpaces($sText); $sText = \MailSo\Base\Utils::StripSpaces($sText);
$sText = \preg_replace_callback('/<h([1-6])[^>]*>/', function($m) {
return "\n\n" . \str_repeat('#', $m[1]) . ' ';
}, $sText);
$sText = \preg_replace(array( $sText = \preg_replace(array(
"/\r/", "/\r/",
"/[\n\t]+/", "/[\n\t]+/",
'/<script[^>]*>.*?<\/script>/i', '/<script[^>]*>.*?<\/script>|<style[^>]*>.*?<\/style>|<title[^>]*>.*?<\/title>/i',
'/<style[^>]*>.*?<\/style>/i', '/<\/h[1-6]>/i',
'/<title[^>]*>.*?<\/title>/i',
'/<h[123][^>]*>(.+?)<\/h[123]>/i',
'/<h[456][^>]*>(.+?)<\/h[456]>/i',
'/<p[^>]*>/i', '/<p[^>]*>/i',
'/<br[^>]*>/i', '/<br[^>]*>/i',
'/<b[^>]*>(.+?)<\/b>/i', '/<b[^>]*>(.+?)<\/b>/i',
'/<i[^>]*>(.+?)<\/i>/i', '/<i[^>]*>(.+?)<\/i>/i',
'/(<ul[^>]*>|<\/ul>)/i', '/<ul[^>]*>|<\/ul>|<ol[^>]*>|<\/ol>/i',
'/(<ol[^>]*>|<\/ol>)/i',
'/<li[^>]*>/i', '/<li[^>]*>/i',
'/<a[^>]*href="([^"]+)"[^>]*>(.+?)<\/a>/i', '/<a[^>]*href="([^"]+)"[^>]*>(.+?)<\/a>/i',
'/<hr[^>]*>/i', '/<hr[^>]*>/i',
@ -1110,69 +1100,23 @@ abstract class HtmlUtils
'/(<tr[^>]*>|<\/tr>)/i', '/(<tr[^>]*>|<\/tr>)/i',
'/<td[^>]*>(.+?)<\/td>/i', '/<td[^>]*>(.+?)<\/td>/i',
'/<th[^>]*>(.+?)<\/th>/i', '/<th[^>]*>(.+?)<\/th>/i',
'/&nbsp;/i',
'/&quot;/i',
'/&amp;/i',
'/&copy;/i',
'/&trade;/i',
'/&#8220;/',
'/&#8221;/',
'/&#8211;/',
'/&#8217;/',
'/&#38;/',
'/&#169;/',
'/&#8482;/',
'/&#151;/',
'/&#147;/',
'/&#148;/',
'/&#149;/',
'/&reg;/i',
'/&bull;/i',
'/&[&;]+;/i',
'/&#39;/',
'/&#160;/'
), array( ), array(
'', '',
' ', ' ',
'', '',
'', "\n\n",
'',
"\n\n\\1\n\n",
"\n\n\\1\n\n",
"\n\n\t", "\n\n\t",
"\n", "\n",
'\\1', '\\1',
'\\1', '\\1',
"\n\n", "\n\n",
"\n\n",
"\n\t* ", "\n\t* ",
'\\2 (\\1)', '\\2 (\\1)',
"\n------------------------------------\n", "\n------------------------------------\n",
"\n", "\n",
"\n", "\n",
"\t\\1\n", "\t\\1\n",
"\t\\1\n", "\t\\1\n"
' ',
'"',
'&',
'(c)',
'(tm)',
'"',
'"',
'-',
"'",
'&',
'(c)',
'(tm)',
'--',
'"',
'"',
'*',
'(R)',
'*',
'',
'\'',
''
), $sText); ), $sText);
$sText = \str_ireplace('<div>',"\n<div>", $sText); $sText = \str_ireplace('<div>',"\n<div>", $sText);
@ -1180,13 +1124,7 @@ abstract class HtmlUtils
$sText = \preg_replace("/\n\\s+\n/", "\n", $sText); $sText = \preg_replace("/\n\\s+\n/", "\n", $sText);
$sText = \preg_replace("/[\n]{3,}/", "\n\n", $sText); $sText = \preg_replace("/[\n]{3,}/", "\n\n", $sText);
$sText = \preg_replace(array( $sText = \html_entity_decode($sText, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401, 'UTF-8');
'/&gt;/i',
'/&lt;/i'
), array(
'>',
'<'
), $sText);
return \trim($sText); return \trim($sText);
} }

View file

@ -133,7 +133,7 @@ class Utils
// return $sHtml; // return $sHtml;
return \preg_replace( return \preg_replace(
['@"\\s*/>@', '/\\s*&nbsp;/i', '/&nbsp;\\s*/i', '/[\\r\\n\\t]+/', '/>\\s+</'], ['@"\\s*/>@', '/\\s*&nbsp;/i', '/&nbsp;\\s*/i', '/[\\r\\n\\t]+/', '/>\\s+</'],
['">', '&nbsp;', '&nbsp;', ' ', '><'], ['">', "\xC2\xA0", "\xC2\xA0", ' ', '><'],
\trim($sHtml) \trim($sHtml)
); );
} }

View file

@ -146,6 +146,8 @@ if (defined('APP_VERSION'))
define('APP_PLUGINS_PATH', APP_PRIVATE_DATA.'plugins/'); define('APP_PLUGINS_PATH', APP_PRIVATE_DATA.'plugins/');
ini_set('default_charset', 'UTF-8');
ini_set('internal_encoding', 'UTF-8');
mb_internal_encoding('UTF-8'); mb_internal_encoding('UTF-8');
mb_language('uni'); mb_language('uni');