diff --git a/src/JSLikeHTMLElement.php b/src/JSLikeHTMLElement.php index f67cd69..3f382e1 100644 --- a/src/JSLikeHTMLElement.php +++ b/src/JSLikeHTMLElement.php @@ -47,14 +47,16 @@ class JSLikeHTMLElement extends \DOMElement { if ('innerHTML' !== $name) { $trace = debug_backtrace(); - trigger_error('Undefined property via __set(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE); + trigger_error('Undefined property via __set(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], \E_USER_NOTICE); return; } // first, empty the element - for ($x = $this->childNodes->length - 1; $x >= 0; --$x) { - $this->removeChild($this->childNodes->item($x)); + if (isset($this->childNodes)) { + for ($x = $this->childNodes->length - 1; $x >= 0; --$x) { + $this->removeChild($this->childNodes->item($x)); + } } // $value holds our new inner HTML @@ -112,15 +114,17 @@ class JSLikeHTMLElement extends \DOMElement if ('innerHTML' === $name) { $inner = ''; - foreach ($this->childNodes as $child) { - $inner .= $this->ownerDocument->saveXML($child); + if (isset($this->childNodes)) { + foreach ($this->childNodes as $child) { + $inner .= $this->ownerDocument->saveXML($child); + } } return $inner; } $trace = debug_backtrace(); - trigger_error('Undefined property via __get(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE); + trigger_error('Undefined property via __get(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], \E_USER_NOTICE); } public function __toString() diff --git a/src/Readability.php b/src/Readability.php index 37638ab..f9320db 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -291,6 +291,11 @@ class Readability implements LoggerAwareInterface $innerDiv->appendChild($articleContent); $overlay->appendChild($innerDiv); + // without tidy the body can (sometimes) be wiped, so re-create it + if (false === isset($this->body->childNodes)) { + $this->body = $this->dom->createElement('body'); + } + // Clear the old HTML, insert the new content. $this->body->setInnerHtml(''); $this->body->appendChild($overlay); @@ -335,9 +340,9 @@ class Readability implements LoggerAwareInterface $footnoteLink = $articleLink->cloneNode(true); $refLink = $this->dom->createElement('a'); $footnote = $this->dom->createElement('li'); - $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST); + $linkDomain = @parse_url($footnoteLink->getAttribute('href'), \PHP_URL_HOST); if (!$linkDomain && isset($this->url)) { - $linkDomain = @parse_url($this->url, PHP_URL_HOST); + $linkDomain = @parse_url($this->url, \PHP_URL_HOST); } $linkText = $this->getInnerText($articleLink); @@ -934,7 +939,7 @@ class Readability implements LoggerAwareInterface case 'DD': case 'DT': case 'LI': - $readability->value -= 2 * round($this->getLinkDensity($node), 0, PHP_ROUND_HALF_UP); + $readability->value -= 2 * round($this->getLinkDensity($node), 0, \PHP_ROUND_HALF_UP); break; case 'ASIDE': case 'FOOTER': @@ -1025,7 +1030,7 @@ class Readability implements LoggerAwareInterface continue; } - if (XML_TEXT_NODE === $childNode->nodeType) { + if (\XML_TEXT_NODE === $childNode->nodeType) { $p = $this->dom->createElement('p'); $p->setInnerHtml($childNode->nodeValue); $p->setAttribute('data-readability-styled', 'true'); @@ -1151,7 +1156,7 @@ class Readability implements LoggerAwareInterface // relatively small link density (5% or less) and be mostly unaffected by this operation. // If not for this we would have used XPath to find maximum @readability. $readability = $item->getAttributeNode('readability'); - $readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, PHP_ROUND_HALF_UP); + $readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, \PHP_ROUND_HALF_UP); if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) { $this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . $readability->value); @@ -1223,7 +1228,7 @@ class Readability implements LoggerAwareInterface $siblingNode = $siblingNodes->item($s); $siblingNodeName = $siblingNode->nodeName; $append = false; - $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ((XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); + $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ((\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); if ($siblingNode->isSameNode($topCandidate)) { $append = true; @@ -1232,11 +1237,11 @@ class Readability implements LoggerAwareInterface $contentBonus = 0; // Give a bonus if sibling nodes and top candidates have the same classname. - if (XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) { + if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) { $contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2; } - if (XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) { + if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) { $append = true; } @@ -1381,7 +1386,7 @@ class Readability implements LoggerAwareInterface $this->logger->debug('Parsing URL: ' . $this->url); if ($this->url) { - $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, PHP_URL_HOST)), ['.' => '\.']) . '/'; + $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, \PHP_URL_HOST)), ['.' => '\.']) . '/'; } mb_internal_encoding('UTF-8'); @@ -1428,7 +1433,7 @@ class Readability implements LoggerAwareInterface $this->dom = new \DOMDocument(); $this->dom->preserveWhiteSpace = false; - $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR); + $this->dom->loadHTML($this->html, \LIBXML_NOBLANKS | \LIBXML_COMPACT | \LIBXML_NOERROR); libxml_use_internal_errors(false); } diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index 3be09c5..e345947 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -332,9 +332,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testAutoClosingIframeNotThrowingException() { - error_reporting(E_ALL | E_STRICT); + error_reporting(\E_ALL | \E_STRICT); ini_set('display_errors', true); - set_error_handler([$this, 'error2Exception'], E_ALL | E_STRICT); + set_error_handler([$this, 'error2Exception'], \E_ALL | \E_STRICT); $data = ' @@ -483,6 +483,19 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $this->assertContains('getContent()->getInnerHtml()); } + public function testWithWipedBody() + { + // from https://www.cs.cmu.edu/~rgs/alice-table.html + $html = file_get_contents('tests/fixtures/wipedBody.html'); + + $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false); + $readability->debug = true; + $res = $readability->init(); + + $this->assertTrue($res); + $this->assertContains('Down the Rabbit-Hole', $readability->getContent()->getInnerHtml()); + } + private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true) { $readability = new Readability($html, $url, $parser, $useTidy); diff --git a/tests/fixtures/wipedBody.html b/tests/fixtures/wipedBody.html new file mode 100644 index 0000000..58a2ffe --- /dev/null +++ b/tests/fixtures/wipedBody.html @@ -0,0 +1,67 @@ + + +
++
+ CHAPTER I: Down the Rabbit-Hole + CHAPTER II: The Pool of Tears + CHAPTER III: A Caucus-Race and a Long Tale + CHAPTER IV: The Rabbit Sends in a Little Bill + CHAPTER V: Advice from a Caterpillar + CHAPTER VI: Pig and Pepper + CHAPTER VII: A Mad Tea-Party + CHAPTER VIII: The Queen's Croquet-Ground + CHAPTER IX: The Mock Turtle's Story + CHAPTER X: The Lobster Quadrille + CHAPTER XI: Who Stole the Tarts? + CHAPTER XII: Alice's Evidence ++ +Robert Stockton +