Revert BC-breaking changes

- avoid method signature update
- revert moving logic out of the constructor
pull/14/head
Jeremy Benoist 10 years ago
parent 7b47e2f1de
commit 00f622e9b7
  1. 62
      src/Readability.php
  2. 29
      tests/ReadabilityTest.php

@ -174,14 +174,15 @@ class Readability implements LoggerAwareInterface
* @param string (optional) Which parser to use for turning raw HTML into a DOMDocument * @param string (optional) Which parser to use for turning raw HTML into a DOMDocument
* @param bool (optional) Use tidy * @param bool (optional) Use tidy
*/ */
public function __construct($html, $url = null, $parser = 'libxml', $useTidy = true) public function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true)
{ {
$this->url = $url; $this->url = $url;
$this->html = $html; $this->html = $html;
$this->parser = $parser; $this->parser = $parser;
$this->useTidy = $useTidy && function_exists('tidy_parse_string'); $this->useTidy = $use_tidy && function_exists('tidy_parse_string');
$this->logger = new NullLogger(); $this->logger = new NullLogger();
$this->loadHtml();
} }
public function setLogger(LoggerInterface $logger) public function setLogger(LoggerInterface $logger)
@ -235,6 +236,8 @@ class Readability implements LoggerAwareInterface
* Load HTML in a DOMDocument. * Load HTML in a DOMDocument.
* Apply Pre filters * Apply Pre filters
* Cleanup HTML using Tidy (or not). * Cleanup HTML using Tidy (or not).
*
* @todo This should be called in init() instead of from __construct
*/ */
private function loadHtml() private function loadHtml()
{ {
@ -266,7 +269,6 @@ class Readability implements LoggerAwareInterface
* Use tidy (if it exists). * Use tidy (if it exists).
* This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing. * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
* Although sometimes it makes matters worse, which is why there is an option to disable it. * Although sometimes it makes matters worse, which is why there is an option to disable it.
*
*/ */
if ($this->useTidy) { if ($this->useTidy) {
$this->logger->debug('Tidying document'); $this->logger->debug('Tidying document');
@ -314,8 +316,6 @@ class Readability implements LoggerAwareInterface
*/ */
public function init() public function init()
{ {
$this->loadHtml();
if (!isset($this->dom->documentElement)) { if (!isset($this->dom->documentElement)) {
return false; return false;
} }
@ -372,12 +372,31 @@ class Readability implements LoggerAwareInterface
return $this->success; return $this->success;
} }
/**
 * Forward a debug message to the configured logger.
 *
 * @deprecated use $this->logger->debug() instead
 *
 * @param mixed $msg value handed unchanged to the logger's debug() method
 */
protected function dbg($msg)
{
    $logger = $this->logger;
    $logger->debug($msg);
}
/**
* Dump debug info.
*
* This method is a no-op: its body is empty.
* NOTE(review): presumably kept only so existing callers do not break - confirm before removing.
*
* @deprecated since Monolog gathers the logs, we don't need it
*/
protected function dump_dbg()
{
}
/** /**
* Run any post-process modifications to article content as necessary. * Run any post-process modifications to article content as necessary.
* *
* @param \DOMElement $articleContent * @param \DOMElement $articleContent
*/ */
public function postProcessContent(\DOMElement $articleContent) public function postProcessContent($articleContent)
{ {
if ($this->convertLinksToFootnotes && !preg_match('/\bwiki/', $this->url)) { if ($this->convertLinksToFootnotes && !preg_match('/\bwiki/', $this->url)) {
$this->addFootnotes($articleContent); $this->addFootnotes($articleContent);
@ -462,7 +481,7 @@ class Readability implements LoggerAwareInterface
* *
* @param \DOMElement $articleContent * @param \DOMElement $articleContent
*/ */
public function addFootnotes(\DOMElement $articleContent) public function addFootnotes($articleContent)
{ {
$footnotesWrapper = $this->dom->createElement('footer'); $footnotesWrapper = $this->dom->createElement('footer');
$footnotesWrapper->setAttribute('class', 'readability-footnotes'); $footnotesWrapper->setAttribute('class', 'readability-footnotes');
@ -526,7 +545,7 @@ class Readability implements LoggerAwareInterface
* *
* @param \DOMElement $articleContent * @param \DOMElement $articleContent
*/ */
public function prepArticle(\DOMElement $articleContent) public function prepArticle($articleContent)
{ {
$this->logger->debug($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.'); $this->logger->debug($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.');
@ -623,7 +642,7 @@ class Readability implements LoggerAwareInterface
* *
* @param \DOMElement $node * @param \DOMElement $node
*/ */
protected function initializeNode(\DOMElement $node) protected function initializeNode($node)
{ {
if (!isset($node->tagName)) { if (!isset($node->tagName)) {
return; return;
@ -694,7 +713,7 @@ class Readability implements LoggerAwareInterface
* *
* @return \DOMElement|bool * @return \DOMElement|bool
*/ */
protected function grabArticle(\DOMElement $page = null) protected function grabArticle($page = null)
{ {
if (!$page) { if (!$page) {
$page = $this->dom; $page = $this->dom;
@ -743,8 +762,7 @@ class Readability implements LoggerAwareInterface
continue; continue;
} }
// XML_TEXT_NODE if ($childNode->nodeType === XML_TEXT_NODE) {
if ($childNode->nodeType == 3) {
$p = $this->dom->createElement('p'); $p = $this->dom->createElement('p');
$p->innerHTML = $childNode->nodeValue; $p->innerHTML = $childNode->nodeValue;
$p->setAttribute('data-readability-styled', 'true'); $p->setAttribute('data-readability-styled', 'true');
@ -770,7 +788,7 @@ class Readability implements LoggerAwareInterface
continue; continue;
} }
$grandParentNode = ($parentNode->parentNode instanceof \DOMElement) ? $parentNode->parentNode : null; $grandParentNode = $parentNode->parentNode instanceof \DOMElement ? $parentNode->parentNode : null;
$innerText = $this->getInnerText($nodesToScore[$pt]); $innerText = $this->getInnerText($nodesToScore[$pt]);
// If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it. // If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
@ -1051,7 +1069,7 @@ class Readability implements LoggerAwareInterface
* *
* @return string * @return string
*/ */
public function getInnerText(\DOMElement $e = null, $normalizeSpaces = true, $flattenLines = false) public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false)
{ {
if (null === $e || !isset($e->textContent) || $e->textContent === '') { if (null === $e || !isset($e->textContent) || $e->textContent === '') {
return ''; return '';
@ -1073,7 +1091,7 @@ class Readability implements LoggerAwareInterface
* *
* @param \DOMElement $e * @param \DOMElement $e
*/ */
public function cleanStyles(\DOMElement $e) public function cleanStyles($e)
{ {
if (!is_object($e)) { if (!is_object($e)) {
return; return;
@ -1121,7 +1139,7 @@ class Readability implements LoggerAwareInterface
* *
* @return int * @return int
*/ */
public function getLinkDensity(\DOMElement $e, $excludeExternal = false) public function getLinkDensity($e, $excludeExternal = false)
{ {
$links = $e->getElementsByTagName('a'); $links = $e->getElementsByTagName('a');
$textLength = mb_strlen($this->getInnerText($e, true, true)); $textLength = mb_strlen($this->getInnerText($e, true, true));
@ -1150,7 +1168,7 @@ class Readability implements LoggerAwareInterface
* *
* @return int * @return int
*/ */
protected function weightAttribute(\DOMElement $element, $attribute) protected function weightAttribute($element, $attribute)
{ {
if (!$element->hasAttribute($attribute)) { if (!$element->hasAttribute($attribute)) {
return 0; return 0;
@ -1185,7 +1203,7 @@ class Readability implements LoggerAwareInterface
* *
* @return int * @return int
*/ */
public function getWeight(\DOMElement $e) public function getWeight($e)
{ {
if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) { if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
return 0; return 0;
@ -1205,7 +1223,7 @@ class Readability implements LoggerAwareInterface
* *
* @param \DOMElement $node * @param \DOMElement $node
*/ */
public function killBreaks(\DOMElement $node) public function killBreaks($node)
{ {
$html = $node->innerHTML; $html = $node->innerHTML;
$html = preg_replace($this->regexps['killBreaks'], '<br />', $html); $html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
@ -1221,7 +1239,7 @@ class Readability implements LoggerAwareInterface
* @param \DOMElement $e * @param \DOMElement $e
* @param string $tag * @param string $tag
*/ */
public function clean(\DOMElement $e, $tag) public function clean($e, $tag)
{ {
$currentItem = null; $currentItem = null;
$targetList = $e->getElementsByTagName($tag); $targetList = $e->getElementsByTagName($tag);
@ -1257,7 +1275,7 @@ class Readability implements LoggerAwareInterface
* @param \DOMElement $e * @param \DOMElement $e
* @param string $tag * @param string $tag
*/ */
public function cleanConditionally(\DOMElement $e, $tag) public function cleanConditionally($e, $tag)
{ {
if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
return; return;
@ -1370,7 +1388,7 @@ class Readability implements LoggerAwareInterface
* *
* @param \DOMElement $e * @param \DOMElement $e
*/ */
public function cleanHeaders(\DOMElement $e) public function cleanHeaders($e)
{ {
for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) { for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
$headers = $e->getElementsByTagName('h'.$headerIndex); $headers = $e->getElementsByTagName('h'.$headerIndex);

@ -22,40 +22,47 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
return $readability; return $readability;
} }
/**
 * A Readability instance built from an empty HTML string has a null URL
 * and exposes a DOM document.
 *
 * @requires extension tidy
 */
public function testConstructDefault()
{
    $subject = $this->getReadability('');

    $this->assertNull($subject->url);
    $this->assertInstanceOf('DomDocument', $subject->dom);
}
/**
* @requires extension tidy
*/
public function testConstructSimple() public function testConstructSimple()
{ {
$readability = $this->getReadability('<html/>', 'http://0.0.0.0'); $readability = $this->getReadability('<html/>', 'http://0.0.0.0');
$readability->init();
$this->assertEquals('http://0.0.0.0', $readability->url); $this->assertEquals('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom);
$this->assertEquals('<html/>', $readability->original_html); $this->assertEquals('<html/>', $readability->original_html);
$this->assertTrue($readability->tidied); $this->assertTrue($readability->tidied);
$this->assertTrue($this->logHandler->hasDebugThatContains('Parsing URL: http://0.0.0.0'));
$this->assertTrue($this->logHandler->hasDebugThatContains('Tidying document'));
$this->assertTrue($this->logHandler->hasDebugThatContains('Light clean enabled.'));
} }
public function testConstructDefaultWithoutTidy() public function testConstructDefaultWithoutTidy()
{ {
$readability = $this->getReadability('', null, 'libxml', false); $readability = $this->getReadability('', null, 'libxml', false);
$readability->init();
$this->assertNull($readability->url); $this->assertNull($readability->url);
$this->assertEquals('', $readability->original_html); $this->assertEquals('', $readability->original_html);
$this->assertFalse($readability->tidied); $this->assertFalse($readability->tidied);
$this->assertTrue($this->logHandler->hasDebugThatContains('Parsing URL: ')); $this->assertInstanceOf('DomDocument', $readability->dom);
$this->assertFalse($this->logHandler->hasDebugThatContains('Tidying document'));
$this->assertTrue($this->logHandler->hasDebugThatContains('Light clean enabled.'));
} }
public function testConstructSimpleWithoutTidy() public function testConstructSimpleWithoutTidy()
{ {
$readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'libxml', false); $readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'libxml', false);
$readability->init();
$this->assertEquals('http://0.0.0.0', $readability->url); $this->assertEquals('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom);
$this->assertEquals('<html/>', $readability->original_html); $this->assertEquals('<html/>', $readability->original_html);
$this->assertFalse($readability->tidied); $this->assertFalse($readability->tidied);
} }
@ -447,6 +454,8 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testPreFilters() public function testPreFilters()
{ {
$this->markTestSkipped('Won\'t work until loadHtml() is moved in init() instead of __construct()');
$readability = $this->getReadability('<div>'.str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7).'</div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div>'.str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7).'</div>', 'http://0.0.0.0');
$readability->addPreFilter('!<b[^>]*>(.*?)</b>!is', ''); $readability->addPreFilter('!<b[^>]*>(.*?)</b>!is', '');

Loading…
Cancel
Save