From 6f4404030b043e5278ae87ec16887f09c1c600a5 Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Fri, 31 Mar 2023 05:25:40 +0200 Subject: [PATCH] Do not use `mb_convert_encoding` with `HTML-ENTITIES` as target encoding This is deprecated since PHP 8.2: Deprecated: mb_convert_encoding(): Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead It was used because `DOMDocument`, which uses libxml2 internally, will parse the HTML as ISO-8859-1, unless the document contains an XML encoding declaration or HTML meta tag setting character set. Since the first such element wins, putting the `meta[charset]` up front will ensure the parser uses the correct encoding, even if the document contains an incorrect meta tag (e.g. when the document is converted to UTF-8 without also updating the metadata by the software passing it to Readability). https://stackoverflow.com/a/39148511/160386 (cherry picked from commit f14428e4c0fa34b28f6b8a7696e62790bcde363e) --- src/JSLikeHTMLElement.php | 3 +-- src/Readability.php | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/JSLikeHTMLElement.php b/src/JSLikeHTMLElement.php index 3f382e1..bb5c9ea 100644 --- a/src/JSLikeHTMLElement.php +++ b/src/JSLikeHTMLElement.php @@ -79,14 +79,13 @@ class JSLikeHTMLElement extends \DOMElement } else { // $value is probably ill-formed $f = new \DOMDocument(); - $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8'); // Using <htmlfragment> will generate a warning, but so will bad HTML // (and by this point, bad HTML is what we've got). // We use it (and suppress the warning) because an HTML fragment will // be wrapped around <p> tags which we don't really want to keep. // Note: despite the warning, if loadHTML succeeds it will return true. - $result = $f->loadHTML('<htmlfragment>' . $value . '</htmlfragment>'); + $result = $f->loadHTML('<meta charset="utf-8"><htmlfragment>' . $value . 
'</htmlfragment>'); if ($result) { $import = $f->getElementsByTagName('htmlfragment')->item(0); diff --git a/src/Readability.php b/src/Readability.php index 8c1e62b..08963c0 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -1430,7 +1430,7 @@ class Readability implements LoggerAwareInterface unset($tidy); } - $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8'); + $this->html = '<meta charset="utf-8">' . (string) $this->html; if ('html5lib' === $this->parser || 'html5' === $this->parser) { $this->dom = (new HTML5())->loadHTML($this->html);