diff --git a/src/JSLikeHTMLElement.php b/src/JSLikeHTMLElement.php index f67cd69..3f382e1 100644 --- a/src/JSLikeHTMLElement.php +++ b/src/JSLikeHTMLElement.php @@ -47,14 +47,16 @@ class JSLikeHTMLElement extends \DOMElement { if ('innerHTML' !== $name) { $trace = debug_backtrace(); - trigger_error('Undefined property via __set(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE); + trigger_error('Undefined property via __set(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], \E_USER_NOTICE); return; } // first, empty the element - for ($x = $this->childNodes->length - 1; $x >= 0; --$x) { - $this->removeChild($this->childNodes->item($x)); + if (isset($this->childNodes)) { + for ($x = $this->childNodes->length - 1; $x >= 0; --$x) { + $this->removeChild($this->childNodes->item($x)); + } } // $value holds our new inner HTML @@ -112,15 +114,17 @@ class JSLikeHTMLElement extends \DOMElement if ('innerHTML' === $name) { $inner = ''; - foreach ($this->childNodes as $child) { - $inner .= $this->ownerDocument->saveXML($child); + if (isset($this->childNodes)) { + foreach ($this->childNodes as $child) { + $inner .= $this->ownerDocument->saveXML($child); + } } return $inner; } $trace = debug_backtrace(); - trigger_error('Undefined property via __get(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE); + trigger_error('Undefined property via __get(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], \E_USER_NOTICE); } public function __toString() diff --git a/src/Readability.php b/src/Readability.php index 37638ab..f9320db 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -291,6 +291,11 @@ class Readability implements LoggerAwareInterface $innerDiv->appendChild($articleContent); $overlay->appendChild($innerDiv); + // without tidy the body can (sometimes) be wiped, so re-create it + if (false === isset($this->body->childNodes)) { + $this->body = $this->dom->createElement('body'); + } + // Clear the old HTML, insert the new content. $this->body->setInnerHtml(''); $this->body->appendChild($overlay); @@ -335,9 +340,9 @@ class Readability implements LoggerAwareInterface $footnoteLink = $articleLink->cloneNode(true); $refLink = $this->dom->createElement('a'); $footnote = $this->dom->createElement('li'); - $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST); + $linkDomain = @parse_url($footnoteLink->getAttribute('href'), \PHP_URL_HOST); if (!$linkDomain && isset($this->url)) { - $linkDomain = @parse_url($this->url, PHP_URL_HOST); + $linkDomain = @parse_url($this->url, \PHP_URL_HOST); } $linkText = $this->getInnerText($articleLink); @@ -934,7 +939,7 @@ class Readability implements LoggerAwareInterface case 'DD': case 'DT': case 'LI': - $readability->value -= 2 * round($this->getLinkDensity($node), 0, PHP_ROUND_HALF_UP); + $readability->value -= 2 * round($this->getLinkDensity($node), 0, \PHP_ROUND_HALF_UP); break; case 'ASIDE': case 'FOOTER': @@ -1025,7 +1030,7 @@ class Readability implements LoggerAwareInterface continue; } - if (XML_TEXT_NODE === $childNode->nodeType) { + if (\XML_TEXT_NODE === $childNode->nodeType) { $p = $this->dom->createElement('p'); $p->setInnerHtml($childNode->nodeValue); $p->setAttribute('data-readability-styled', 'true'); @@ -1151,7 +1156,7 @@ class Readability implements LoggerAwareInterface // relatively small link density (5% or less) and be mostly unaffected by this operation. // If not for this we would have used XPath to find maximum @readability. $readability = $item->getAttributeNode('readability'); - $readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, PHP_ROUND_HALF_UP); + $readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, \PHP_ROUND_HALF_UP); if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) { $this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . $readability->value); @@ -1223,7 +1228,7 @@ class Readability implements LoggerAwareInterface $siblingNode = $siblingNodes->item($s); $siblingNodeName = $siblingNode->nodeName; $append = false; - $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ((XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); + $this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ((\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); if ($siblingNode->isSameNode($topCandidate)) { $append = true; @@ -1232,11 +1237,11 @@ class Readability implements LoggerAwareInterface $contentBonus = 0; // Give a bonus if sibling nodes and top candidates have the same classname. - if (XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) { + if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) { $contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2; } - if (XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) { + if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) { $append = true; } @@ -1381,7 +1386,7 @@ class Readability implements LoggerAwareInterface $this->logger->debug('Parsing URL: ' . $this->url); if ($this->url) { - $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, PHP_URL_HOST)), ['.' => '\.']) . '/'; + $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, \PHP_URL_HOST)), ['.' => '\.']) . '/'; } mb_internal_encoding('UTF-8'); @@ -1428,7 +1433,7 @@ class Readability implements LoggerAwareInterface $this->dom = new \DOMDocument(); $this->dom->preserveWhiteSpace = false; - $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR); + $this->dom->loadHTML($this->html, \LIBXML_NOBLANKS | \LIBXML_COMPACT | \LIBXML_NOERROR); libxml_use_internal_errors(false); } diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index 3be09c5..e345947 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -332,9 +332,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testAutoClosingIframeNotThrowingException() { - error_reporting(E_ALL | E_STRICT); + error_reporting(\E_ALL | \E_STRICT); ini_set('display_errors', true); - set_error_handler([$this, 'error2Exception'], E_ALL | E_STRICT); + set_error_handler([$this, 'error2Exception'], \E_ALL | \E_STRICT); $data = ' @@ -483,6 +483,19 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $this->assertContains('getContent()->getInnerHtml()); } + public function testWithWipedBody() + { + // from https://www.cs.cmu.edu/~rgs/alice-table.html + $html = file_get_contents('tests/fixtures/wipedBody.html'); + + $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false); + $readability->debug = true; + $res = $readability->init(); + + $this->assertTrue($res); + $this->assertContains('Down the Rabbit-Hole', $readability->getContent()->getInnerHtml()); + } + private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true) { $readability = new Readability($html, $url, $parser, $useTidy); diff --git a/tests/fixtures/wipedBody.html b/tests/fixtures/wipedBody.html new file mode 100644 index 0000000..58a2ffe --- /dev/null +++ b/tests/fixtures/wipedBody.html @@ -0,0 +1,67 @@ + + + +Alice's Adventures in Wonderland (Project Gutenberg) + + + + + + + + +<BODY> +<H1>Alice's Adventures in Wonderland</H1> + <H1>Lewis Carroll</H1> + <H1>The Millennium Fulcrum Edition 3.0</H1> + +NOTE: This is a hypertext formatted version of the Project Gutenberg edition. +For more information, check the +<A HREF="alice-small.txt">small print</A> +or check out the +<A HREF="ftp://uiarchive.cso.uiuc.edu/pub/etext/gutenberg/etext91/alice30.txt"> +full ascii text</A>. The original Tenniel illustrations are also available +due to the efforts of Project Gutenberg. You can if you like, grab them as a +<A HREF="ftp://uiarchive.cso.uiuc.edu/pub/etext/gutenberg/etext94/algif10.zip"> +"zip file"</A> or read the <A HREF="algif-small.txt">small print</A> +that comes with them. +This document is part of a small, but growing collection of html formatted +etexts. (Others may be found in either my <A +HREF="http://www.cs.cmu.edu/Web/People/rgs/rgs-home.html">home page</A> or +John Ockerbloom's indexes by <A +HREF="http://www.cs.cmu.edu/Web/bookauthors.html">author</A> and <A +HREF="http://www.cs.cmu.edu/Web/booktitles.html">title</A>.) +I am still trying to figure out whether anyone else is interested in these +on-line readable documents. If you appreciate this document or would like to +see more such, send me mail at "rgs@cs.cmu.edu". +<P> +<A HREF="alice01a.gif"><IMG SRC="alice01th.gif"></A> +<P> +<H2>CONTENTS</H2> +<PRE> + CHAPTER I: <A HREF="alice-I.html">Down the Rabbit-Hole</A> + CHAPTER II: <A HREF="alice-II.html">The Pool of Tears</A> + CHAPTER III: <A HREF="alice-III.html">A Caucus-Race and a Long Tale</A> + CHAPTER IV: <A HREF="alice-IV.html">The Rabbit Sends in a Little Bill</A> + CHAPTER V: <A HREF="alice-V.html">Advice from a Caterpillar</A> + CHAPTER VI: <A HREF="alice-VI.html">Pig and Pepper</A> + CHAPTER VII: <A HREF="alice-VII.html">A Mad Tea-Party</A> + CHAPTER VIII: <A HREF="alice-VIII.html">The Queen's Croquet-Ground</A> + CHAPTER IX: <A HREF="alice-IX.html">The Mock Turtle's Story</A> + CHAPTER X: <A HREF="alice-X.html">The Lobster Quadrille</A> + CHAPTER XI: <A HREF="alice-XI.html">Who Stole the Tarts?</A> + CHAPTER XII: <A HREF="alice-XII.html">Alice's Evidence</A> +</PRE> + +<ADDRESS><A HREF="mailto:rgs@cs.cmu.edu">Robert Stockton</A></ADDRESS> +<P> +<!- Access counter added 5/25 1:49am -> +<A href="http://www.dbasics.com/cgi-bin/pages.cgi?143205747"><IMG SRC="http://www.dbasics.com/cgi-bin/counter.cgi?143205747.2&(none)"></A> Access statistics from htmlZine +<!- This page has been visited +A HREF="http://counter.digits.com/wc?--info=yes&--name=rgsalice" +IMG SRC="http://counter.digits.com/wc/-d/4/-r/-z/rgsalice" + ALIGN=absmiddle WIDTH=60 HEIGHT=20 BORDER=0 HSPACE=4 ALT="????"/A +times since March 2, 1996. -> +</BODY> + +