Merge pull request #62 from j0k3r/fix/avoid-wiped-body

Body can be wiped without tidy
pull/65/head
Jérémy Benoist 5 years ago committed by GitHub
commit 6f6b1f9e2b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 16
      src/JSLikeHTMLElement.php
  2. 25
      src/Readability.php
  3. 17
      tests/ReadabilityTest.php
  4. 67
      tests/fixtures/wipedBody.html

@ -47,14 +47,16 @@ class JSLikeHTMLElement extends \DOMElement
{
if ('innerHTML' !== $name) {
$trace = debug_backtrace();
trigger_error('Undefined property via __set(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE);
trigger_error('Undefined property via __set(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], \E_USER_NOTICE);
return;
}
// first, empty the element
for ($x = $this->childNodes->length - 1; $x >= 0; --$x) {
$this->removeChild($this->childNodes->item($x));
if (isset($this->childNodes)) {
for ($x = $this->childNodes->length - 1; $x >= 0; --$x) {
$this->removeChild($this->childNodes->item($x));
}
}
// $value holds our new inner HTML
@ -112,15 +114,17 @@ class JSLikeHTMLElement extends \DOMElement
if ('innerHTML' === $name) {
$inner = '';
foreach ($this->childNodes as $child) {
$inner .= $this->ownerDocument->saveXML($child);
if (isset($this->childNodes)) {
foreach ($this->childNodes as $child) {
$inner .= $this->ownerDocument->saveXML($child);
}
}
return $inner;
}
$trace = debug_backtrace();
trigger_error('Undefined property via __get(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], E_USER_NOTICE);
trigger_error('Undefined property via __get(): ' . $name . ' in ' . $trace[0]['file'] . ' on line ' . $trace[0]['line'], \E_USER_NOTICE);
}
public function __toString()

@ -291,6 +291,11 @@ class Readability implements LoggerAwareInterface
$innerDiv->appendChild($articleContent);
$overlay->appendChild($innerDiv);
// without tidy the body can (sometimes) be wiped, so re-create it
if (false === isset($this->body->childNodes)) {
$this->body = $this->dom->createElement('body');
}
// Clear the old HTML, insert the new content.
$this->body->setInnerHtml('');
$this->body->appendChild($overlay);
@ -335,9 +340,9 @@ class Readability implements LoggerAwareInterface
$footnoteLink = $articleLink->cloneNode(true);
$refLink = $this->dom->createElement('a');
$footnote = $this->dom->createElement('li');
$linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
$linkDomain = @parse_url($footnoteLink->getAttribute('href'), \PHP_URL_HOST);
if (!$linkDomain && isset($this->url)) {
$linkDomain = @parse_url($this->url, PHP_URL_HOST);
$linkDomain = @parse_url($this->url, \PHP_URL_HOST);
}
$linkText = $this->getInnerText($articleLink);
@ -934,7 +939,7 @@ class Readability implements LoggerAwareInterface
case 'DD':
case 'DT':
case 'LI':
$readability->value -= 2 * round($this->getLinkDensity($node), 0, PHP_ROUND_HALF_UP);
$readability->value -= 2 * round($this->getLinkDensity($node), 0, \PHP_ROUND_HALF_UP);
break;
case 'ASIDE':
case 'FOOTER':
@ -1025,7 +1030,7 @@ class Readability implements LoggerAwareInterface
continue;
}
if (XML_TEXT_NODE === $childNode->nodeType) {
if (\XML_TEXT_NODE === $childNode->nodeType) {
$p = $this->dom->createElement('p');
$p->setInnerHtml($childNode->nodeValue);
$p->setAttribute('data-readability-styled', 'true');
@ -1151,7 +1156,7 @@ class Readability implements LoggerAwareInterface
// relatively small link density (5% or less) and be mostly unaffected by this operation.
// If not for this we would have used XPath to find maximum @readability.
$readability = $item->getAttributeNode('readability');
$readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, PHP_ROUND_HALF_UP);
$readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, \PHP_ROUND_HALF_UP);
if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) {
$this->logger->debug('Candidate: ' . $item->getNodePath() . ' (' . $item->getAttribute('class') . ':' . $item->getAttribute('id') . ') with score ' . $readability->value);
@ -1223,7 +1228,7 @@ class Readability implements LoggerAwareInterface
$siblingNode = $siblingNodes->item($s);
$siblingNodeName = $siblingNode->nodeName;
$append = false;
$this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ((XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
$this->logger->debug('Looking at sibling node: ' . $siblingNode->getNodePath() . ((\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
if ($siblingNode->isSameNode($topCandidate)) {
$append = true;
@ -1232,11 +1237,11 @@ class Readability implements LoggerAwareInterface
$contentBonus = 0;
// Give a bonus if sibling nodes and top candidates have the same classname.
if (XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) {
if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->getAttribute('class') === $topCandidate->getAttribute('class') && '' !== $topCandidate->getAttribute('class')) {
$contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2;
}
if (XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
if (\XML_ELEMENT_NODE === $siblingNode->nodeType && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) {
$append = true;
}
@ -1381,7 +1386,7 @@ class Readability implements LoggerAwareInterface
$this->logger->debug('Parsing URL: ' . $this->url);
if ($this->url) {
$this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, PHP_URL_HOST)), ['.' => '\.']) . '/';
$this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, \PHP_URL_HOST)), ['.' => '\.']) . '/';
}
mb_internal_encoding('UTF-8');
@ -1428,7 +1433,7 @@ class Readability implements LoggerAwareInterface
$this->dom = new \DOMDocument();
$this->dom->preserveWhiteSpace = false;
$this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
$this->dom->loadHTML($this->html, \LIBXML_NOBLANKS | \LIBXML_COMPACT | \LIBXML_NOERROR);
libxml_use_internal_errors(false);
}

@ -332,9 +332,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testAutoClosingIframeNotThrowingException()
{
error_reporting(E_ALL | E_STRICT);
error_reporting(\E_ALL | \E_STRICT);
ini_set('display_errors', true);
set_error_handler([$this, 'error2Exception'], E_ALL | E_STRICT);
set_error_handler([$this, 'error2Exception'], \E_ALL | \E_STRICT);
$data = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="ru-RU" prefix="og: http://ogp.me/ns#">
@ -483,6 +483,19 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertContains('<a href="#fnref1:fnfeed_2" rev="footnote"', $readability->getContent()->getInnerHtml());
}
/**
 * Regression test: when the document is parsed without tidy, the body
 * can end up without child nodes; Readability must still be able to
 * extract the content instead of failing on the wiped body.
 *
 * The fixture is a frameset page whose real content lives inside
 * <noframes><body>…</body></noframes>.
 */
public function testWithWipedBody()
{
    // from https://www.cs.cmu.edu/~rgs/alice-table.html
    // Resolve the fixture relative to this test file so the test does not
    // depend on the directory PHPUnit happens to be launched from.
    $html = file_get_contents(__DIR__ . '/fixtures/wipedBody.html');

    // Last argument disables tidy, which is the condition under which the body gets wiped.
    $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false);
    $readability->debug = true;

    $res = $readability->init();

    $this->assertTrue($res);
    $this->assertContains('<a href="alice-I.html">Down the Rabbit-Hole</a>', $readability->getContent()->getInnerHtml());
}
private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true)
{
$readability = new Readability($html, $url, $parser, $useTidy);

@ -0,0 +1,67 @@
<HTML>
<HEAD>
<TITLE>Alice's Adventures in Wonderland (Project Gutenberg)</TITLE>
</HEAD>
<frameset Rows="50, *">
<frame src="alice-finfo.html">
<frame src="alice-ftitle.html" name="alice-main">
</frameset>
<noframes>
<BODY>
<H1>Alice's Adventures in Wonderland</H1>
<H1>Lewis Carroll</H1>
<H1>The Millennium Fulcrum Edition 3.0</H1>
NOTE: This is a hypertext formatted version of the Project Gutenberg edition.
For more information, check the
<A HREF="alice-small.txt">small print</A>
or check out the
<A HREF="ftp://uiarchive.cso.uiuc.edu/pub/etext/gutenberg/etext91/alice30.txt">
full ascii text</A>. The original Tenniel illustrations are also available
due to the efforts of Project Gutenberg. You can if you like, grab them as a
<A HREF="ftp://uiarchive.cso.uiuc.edu/pub/etext/gutenberg/etext94/algif10.zip">
"zip file"</A> or read the <A HREF="algif-small.txt">small print</A>
that comes with them.
This document is part of a small, but growing collection of html formatted
etexts. (Others may be found in either my <A
HREF="http://www.cs.cmu.edu/Web/People/rgs/rgs-home.html">home page</A> or
John Ockerbloom's indexes by <A
HREF="http://www.cs.cmu.edu/Web/bookauthors.html">author</A> and <A
HREF="http://www.cs.cmu.edu/Web/booktitles.html">title</A>.)
I am still trying to figure out whether anyone else is interested in these
on-line readable documents. If you appreciate this document or would like to
see more such, send me mail at "rgs@cs.cmu.edu".
<P>
<A HREF="alice01a.gif"><IMG SRC="alice01th.gif"></A>
<P>
<H2>CONTENTS</H2>
<PRE>
CHAPTER I: <A HREF="alice-I.html">Down the Rabbit-Hole</A>
CHAPTER II: <A HREF="alice-II.html">The Pool of Tears</A>
CHAPTER III: <A HREF="alice-III.html">A Caucus-Race and a Long Tale</A>
CHAPTER IV: <A HREF="alice-IV.html">The Rabbit Sends in a Little Bill</A>
CHAPTER V: <A HREF="alice-V.html">Advice from a Caterpillar</A>
CHAPTER VI: <A HREF="alice-VI.html">Pig and Pepper</A>
CHAPTER VII: <A HREF="alice-VII.html">A Mad Tea-Party</A>
CHAPTER VIII: <A HREF="alice-VIII.html">The Queen's Croquet-Ground</A>
CHAPTER IX: <A HREF="alice-IX.html">The Mock Turtle's Story</A>
CHAPTER X: <A HREF="alice-X.html">The Lobster Quadrille</A>
CHAPTER XI: <A HREF="alice-XI.html">Who Stole the Tarts?</A>
CHAPTER XII: <A HREF="alice-XII.html">Alice's Evidence</A>
</PRE>
<ADDRESS><A HREF="mailto:rgs@cs.cmu.edu">Robert Stockton</A></ADDRESS>
<P>
<!- Access counter added 5/25 1:49am ->
<A href="http://www.dbasics.com/cgi-bin/pages.cgi?143205747"><IMG SRC="http://www.dbasics.com/cgi-bin/counter.cgi?143205747.2&(none)"></A> Access statistics from htmlZine
<!- This page has been visited
A HREF="http://counter.digits.com/wc?--info=yes&--name=rgsalice"
IMG SRC="http://counter.digits.com/wc/-d/4/-r/-z/rgsalice"
ALIGN=absmiddle WIDTH=60 HEIGHT=20 BORDER=0 HSPACE=4 ALT="????"/A
times since March 2, 1996. ->
</BODY>
</noframes>
</HTML>
Loading…
Cancel
Save