diff --git a/src/Readability.php b/src/Readability.php index 1663884..b2b4d09 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -1422,7 +1422,7 @@ class Readability implements LoggerAwareInterface unset($tidy); } - $this->html = '' . (string) $this->html; + $this->html = self::ensureMetaCharset((string) $this->html); if ('html5lib' === $this->parser || 'html5' === $this->parser) { $this->dom = (new HTML5())->loadHTML($this->html); @@ -1510,4 +1510,45 @@ class Readability implements LoggerAwareInterface ) && !$node->hasAttribute('hidden'); }
+
+    /**
+     * Tries to insert `meta[charset]` tag into the proper place in the passed HTML document and returns the resulting markup.
+     *
+     * `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
+     * This means that UTF-8-encoded HTML fragments such as those coming from JSON-LD `articleBody` field would be parsed with incorrect encoding.
+     * Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause parser to auto-insert a `html` element, losing the attributes of the original `html` tag.
+     *
+     * @param string $html UTF-8 encoded document
+     */
+    private static function ensureMetaCharset(string $html): string
+    {
+        $charsetTag = '<meta charset="utf-8">';
+
+        // Only look at first 1024 bytes since, according to HTML5 specification,
+        // that’s where <meta> elements declaring a character encoding must be located.
+        // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#charset
+        $start = substr($html, 0, 1024);
+
+        if (1 === preg_match('/<meta[^>]+charset/i', $start)) {
+            // <meta> tag declaring a charset is already present, no need for modification.
+            return $html;
+        }
+
+        if (1 === preg_match('/<head(?:\s[^>]*)?>/i', $start)) {
+            // <head> tag was located (the pattern deliberately does not match <header>), <meta> tags go there.
+            $html = preg_replace('/<head(?:\s[^>]*)?>/i', '$0' . $charsetTag, $html, 1);
+
+            return $html;
+        }
+
+        if (1 === preg_match('/<html(?:\s[^>]*)?>/i', $start)) {
+            // <html> tag was located, let’s put it inside and have parser create <head>.
+            $html = preg_replace('/<html(?:\s[^>]*)?>/i', '$0' . $charsetTag, $html, 1);
+
+            return $html;
+        }
+
+        // Fallback – just plop the <meta> at the start of the fragment.
+        return $charsetTag . $html;
+    }
 }
diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index 8a29629..d32b29e 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -24,7 +24,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $readability->init(); $this->assertNull($readability->url); - $this->assertInstanceOf('DomDocument', $readability->dom); + $this->assertInstanceOf(\DOMDocument::class, $readability->dom); } public function testConstructHtml5Parser(): void @@ -33,7 +33,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $readability->init(); $this->assertSame('http://0.0.0.0', $readability->url); - $this->assertInstanceOf('DomDocument', $readability->dom); + $this->assertInstanceOf(\DOMDocument::class, $readability->dom); $this->assertSame('', $readability->original_html); } @@ -46,7 +46,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $readability->init(); $this->assertSame('http://0.0.0.0', $readability->url); - $this->assertInstanceOf('DomDocument', $readability->dom); + $this->assertInstanceOf(\DOMDocument::class, $readability->dom); $this->assertSame('', $readability->original_html); $this->assertTrue($readability->tidied); } @@ -60,7 +60,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $this->assertSame('', $readability->original_html); $this->assertFalse($readability->tidied); - $this->assertInstanceOf('DomDocument', $readability->dom); + $this->assertInstanceOf(\DOMDocument::class, $readability->dom); } public function testConstructSimpleWithoutTidy(): void @@ -69,7 +69,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $readability->init(); $this->assertSame('http://0.0.0.0', $readability->url); - $this->assertInstanceOf('DomDocument', $readability->dom); + $this->assertInstanceOf(\DOMDocument::class, $readability->dom); $this->assertSame('', 
$readability->original_html); $this->assertFalse($readability->tidied); } @@ -115,7 +115,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testInitDiv(): void { $readability = $this->getReadability('
' . str_repeat('This is the awesome content :)', 7) . '
', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -129,7 +128,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithFootnotes(): void { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . '
', 'http://0.0.0.0'); - $readability->debug = true; $readability->convertLinksToFootnotes = true; $res = $readability->init(); @@ -146,7 +144,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testStandardClean(): void { $readability = $this->getReadability('

Title

' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . 'will NOT be removed
', 'http://0.0.0.0'); - $readability->debug = true; $readability->lightClean = false; $res = $readability->init(); @@ -163,7 +160,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithIframe(): void { $readability = $this->getReadability('

Title

' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . '

This is an awesome text with some links, here there are

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -178,7 +174,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithArticle(): void { $readability = $this->getReadability('

' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '

This is an awesome text with some links, here there are

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -193,7 +188,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithAside(): void { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . '
', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -208,7 +202,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithClasses(): void { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . '
' . str_repeat('

This text should be removed

', 10) . '
', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -223,7 +216,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithClassesWithoutLightClean(): void { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . '
' . str_repeat('

This text should be removed

', 10) . '
', 'http://0.0.0.0'); - $readability->debug = true; $readability->lightClean = false; $res = $readability->init(); @@ -239,7 +231,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithTd(): void { $readability = $this->getReadability('' . str_repeat('', 7) . '

This is an awesome text with some links, here there are the awesome

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -252,7 +243,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithSameClasses(): void { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '
This text is also an awesome text and you should know that !
', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -266,7 +256,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithScript(): void { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -280,7 +269,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testTitle(): void { $readability = $this->getReadability('this is my title
' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -294,7 +282,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testTitleWithDash(): void { $readability = $this->getReadability(' title2 - title3
' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -308,7 +295,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testTitleWithDoubleDot(): void { $readability = $this->getReadability(' title2 : title3
' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -322,7 +308,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testTitleTooShortUseH1(): void { $readability = $this->getReadability('too short

this is my h1 title !

' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -369,7 +354,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase '; $readability = $this->getReadability($data, 'http://iosgames.ru/?p=22030'); - $readability->debug = true; $res = $readability->init(); @@ -437,7 +421,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase '; $readability = $this->getReadability($data, 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); @@ -474,7 +457,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $html = (string) file_get_contents('tests/fixtures/childNodeGoesNull.html'); $readability = $this->getReadability($html, 'http://0.0.0.0'); - $readability->debug = true; $readability->convertLinksToFootnotes = true; $res = $readability->init(); @@ -487,7 +469,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $html = (string) file_get_contents('tests/fixtures/keepFootnotes.html'); $readability = $this->getReadability($html, 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -501,7 +482,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $html = (string) file_get_contents('tests/fixtures/wipedBody.html'); $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -540,7 +520,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testVisibleNode(string $content, bool $shouldBeVisible): void { $readability = $this->getReadability($content, 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); if ($shouldBeVisible) { @@ -550,6 +529,48 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase } } + /** + * @return array + */ + public function dataForHtmlLang(): array + { + return [ + 'meta' => [ + '
' . str_repeat('

This is the awesome content :)

', 7) . '
', + 'fr', + ], + 'head' => [ + 'Foo
' . str_repeat('

This is the awesome content :)

', 7) . '
', + 'fr', + ], + 'headless' => [ + '
' . str_repeat('

This is the awesome content :)

', 7) . '
', + 'fr', + // tidy would add tag. + false, + ], + 'fragment' => [ + '
' . str_repeat('

This is the awesome content :)

', 7) . '
', + '', + // tidy would add . + false, + ], + ]; + } + + /** + * @dataProvider dataForHtmlLang + */ + public function testHtmlLang(string $html, string $lang, bool $useTidy = true): void + { + $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', $useTidy); + $res = $readability->init(); + + $this->assertTrue($res); + $this->assertInstanceOf(\DOMDocument::class, $readability->dom); + $this->assertSame($lang, $readability->dom->documentElement->getAttribute('lang')); + } + private function getReadability(string $html, ?string $url = null, string $parser = 'libxml', bool $useTidy = true): Readability { $readability = new Readability($html, $url, $parser, $useTidy);