'.str_repeat('This is an awesome text with some links, here there are: the awesome', 20).'
This is an awesome text with some links, here there are
diff --git a/README.md b/README.md index 60dfcb5..68a8f00 100644 --- a/README.md +++ b/README.md @@ -45,3 +45,20 @@ if ($result) { echo 'Looks like we couldn\'t find the content. :('; } ``` + +If you want to debug it, or check what's going on, you can inject a logger (which must follow `Psr\Log\LoggerInterface`, Monolog for example): + +```php +use Readability\Readability; +use Monolog\Logger; +use Monolog\Handler\StreamHandler; + +$url = 'http://www.medialens.org/index.php/alerts/alert-archive/alerts-2013/729-thatcher.html'; +$html = file_get_contents($url); + +$logger = new Logger('readability'); +$logger->pushHandler(new StreamHandler('path/to/your.log', Logger::DEBUG)); + +$readability = new Readability($html, $url); +$readability->setLogger($logger); +``` diff --git a/composer.json b/composer.json index 5487a5c..3921e36 100644 --- a/composer.json +++ b/composer.json @@ -24,7 +24,8 @@ "role": "Developer (original JS version)" }], "require": { - "php": ">=5.3.3" + "php": ">=5.3.3", + "monolog/monolog": "^1.13.1" }, "require-dev": { "satooshi/php-coveralls": "~0.6" diff --git a/src/Readability.php b/src/Readability.php index b40a544..2969c4b 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -2,6 +2,10 @@ namespace Readability; +use Psr\Log\LoggerAwareInterface; +use Psr\Log\LoggerInterface; +use Psr\Log\NullLogger; + /** * Arc90's Readability ported to PHP for FiveFilters.org * Based on readability.js version 1.7.1 (without multi-page support) @@ -45,7 +49,7 @@ namespace Readability; * existing DOMElement objects without passing an entire HTML document to * be parsed. */ -class Readability +class Readability implements LoggerAwareInterface { public $convertLinksToFootnotes = false; public $revertForcedParagraphElements = true; @@ -57,10 +61,9 @@ class Readability public $url = null; // preserves more content (experimental) public $lightClean = true; + // no more used, keept to avoid BC public $debug = false; public $tidied = false; - // error text for one time output - protected $debugText = ''; // article domain regexp for calibration protected $domainRegExp = null; protected $body = null; // @@ -70,6 +73,10 @@ class Readability protected $flags = 7; // indicates whether we were able to extract or not protected $success = false; + protected $logger; + protected $parser; + protected $html; + protected $useTidy; /** * All of the regular expressions in use within readability. @@ -167,13 +174,76 @@ class Readability * @param string (optional) Which parser to use for turning raw HTML into a DOMDocument * @param bool (optional) Use tidy */ - public function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true) + public function __construct($html, $url = null, $parser = 'libxml', $useTidy = true) { $this->url = $url; - $this->debugText = 'Parsing URL: '.$url."\n"; + $this->html = $html; + $this->parser = $parser; + $this->useTidy = $useTidy && function_exists('tidy_parse_string'); + + $this->logger = new NullLogger(); + } + + public function setLogger(LoggerInterface $logger) + { + $this->logger = $logger; + } + + /** + * Get article title element. + * + * @return \DOMElement + */ + public function getTitle() + { + return $this->articleTitle; + } + + /** + * Get article content element. + * + * @return \DOMElement + */ + public function getContent() + { + return $this->articleContent; + } + + /** + * Add pre filter for raw input HTML processing. + * + * @param string RegExp for replace + * @param string (optional) Replacer + */ + public function addPreFilter($filter, $replacer = '') + { + $this->pre_filters[$filter] = $replacer; + } + + /** + * Add post filter for raw output HTML processing. + * + * @param string RegExp for replace + * @param string (optional) Replacer + */ + public function addPostFilter($filter, $replacer = '') + { + $this->post_filters[$filter] = $replacer; + } + + /** + * Load HTML in a DOMDocument. + * Apply Pre filters + * Cleanup HTML using Tidy (or not). + */ + private function loadHtml() + { + $this->original_html = $this->html; - if ($url) { - $this->domainRegExp = '/'.strtr(preg_replace('/www\d*\./', '', parse_url($url, PHP_URL_HOST)), array('.' => '\.')).'/'; + $this->logger->debug('Parsing URL: '.$this->url); + + if ($this->url) { + $this->domainRegExp = '/'.strtr(preg_replace('/www\d*\./', '', parse_url($this->url, PHP_URL_HOST)), array('.' => '\.')).'/'; } mb_internal_encoding('UTF-8'); @@ -183,13 +253,13 @@ class Readability // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well... if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) { foreach ($this->pre_filters as $search => $replace) { - $html = preg_replace($search, $replace, $html); + $this->html = preg_replace($search, $replace, $this->html); } unset($search, $replace); } - if (trim($html) === '') { - $html = ''; + if (trim($this->html) === '') { + $this->html = ''; } /* @@ -198,30 +268,30 @@ class Readability * Although sometimes it makes matters worse, which is why there is an option to disable it. * */ - if ($use_tidy && function_exists('tidy_parse_string')) { - $this->debugText .= 'Tidying document'."\n"; - $tidy = tidy_parse_string($html, $this->tidy_config, 'UTF8'); + if ($this->useTidy) { + $this->logger->debug('Tidying document'); + + $tidy = tidy_parse_string($this->html, $this->tidy_config, 'UTF8'); if (tidy_clean_repair($tidy)) { - $this->original_html = $html; $this->tidied = true; - $html = $tidy->value; - $html = preg_replace('/[\r\n]+/is', "\n", $html); + $this->html = $tidy->value; + $this->html = preg_replace('/[\r\n]+/is', "\n", $this->html); } unset($tidy); } - $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'); + $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8'); - if (!($parser == 'html5lib' && ($this->dom = \HTML5_Parser::parse($html)))) { + if (!($this->parser == 'html5lib' && ($this->dom = \HTML5_Parser::parse($this->html)))) { libxml_use_internal_errors(true); $this->dom = new \DOMDocument(); $this->dom->preserveWhiteSpace = false; if (PHP_VERSION_ID >= 50400) { - $this->dom->loadHTML($html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR); + $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR); } else { - $this->dom->loadHTML($html); + $this->dom->loadHTML($this->html); } libxml_use_internal_errors(false); @@ -230,37 +300,6 @@ class Readability $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement'); } - /** - * Get article title element. - * - * @return \DOMElement - */ - public function getTitle() - { - return $this->articleTitle; - } - - /** - * Get article content element. - * - * @return \DOMElement - */ - public function getContent() - { - return $this->articleContent; - } - - /** - * Add post filter for raw output HTML processing. - * - * @param string RegExp for replace - * @param string (optional) Replacer - */ - public function addPostFilter($filter, $replacer = '') - { - $this->post_filters[$filter] = $replacer; - } - /** * Runs readability. * @@ -275,6 +314,8 @@ class Readability */ public function init() { + $this->loadHtml(); + if (!isset($this->dom->documentElement)) { return false; } @@ -327,34 +368,10 @@ class Readability // Set title and content instance variables. $this->articleTitle = $articleTitle; $this->articleContent = $articleContent; - $this->dump_dbg(); return $this->success; } - /** - * Debug. - * - * @param string $msg - */ - protected function dbg($msg) //, $error=false) - { - if ($this->debug) { - $this->debugText .= $msg."\n"; - } - } - - /** - * Dump debug info. - */ - protected function dump_dbg() - { - if ($this->debug) { - openlog('Readability PHP ', LOG_PID | LOG_PERROR, 0); - syslog(6, $this->debugText); // 1 - error 6 - info - } - } - /** * Run any post-process modifications to article content as necessary. * @@ -511,7 +528,8 @@ class Readability */ public function prepArticle(\DOMElement $articleContent) { - $this->dbg($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.'); + $this->logger->debug($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.'); + $this->cleanStyles($articleContent); $this->killBreaks($articleContent); @@ -594,7 +612,7 @@ class Readability } unset($search, $replace); } catch (\Exception $e) { - $this->dbg('Cleaning output HTML failed. Ignoring: '.$e->getMessage()); + $this->logger->error('Cleaning output HTML failed. Ignoring: '.$e->getMessage()); } } } @@ -702,7 +720,6 @@ class Readability // (as in, where they contain no other block level elements). if (strcasecmp($tagName, 'div') === 0 || strcasecmp($tagName, 'article') === 0 || strcasecmp($tagName, 'section') === 0) { if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { - //$this->dbg('Altering '.$node->getNodePath().' to p'); $newNode = $this->dom->createElement('p'); try { @@ -712,7 +729,7 @@ class Readability --$nodeIndex; $nodesToScore[] = $newNode; } catch (\Exception $e) { - $this->dbg('Could not alter div/article to p, reverting back to div: '.$e->getMessage()); + $this->logger->error('Could not alter div/article to p, reverting back to div: '.$e->getMessage()); } } else { // Will change these P elements back to text nodes after processing. @@ -728,7 +745,6 @@ class Readability // XML_TEXT_NODE if ($childNode->nodeType == 3) { - //$this->dbg('replacing text node with a P tag with the same content.'); $p = $this->dom->createElement('p'); $p->innerHTML = $childNode->nodeValue; $p->setAttribute('data-readability-styled', 'true'); @@ -814,7 +830,7 @@ class Readability $node = $candidates->item($c); // node should be readable but not inside of an article otherwise it's probably non-readable block if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) { - $this->dbg('Removing unlikely candidate (using note) '.$node->getNodePath().' by "'.$node->tagName.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); + $this->logger->debug('Removing unlikely candidate (using note) '.$node->getNodePath().' by "'.$node->tagName.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); $node->parentNode->removeChild($node); } } @@ -832,7 +848,7 @@ class Readability preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) ) { - $this->dbg('Removing unlikely candidate (using conf) '.$node->getNodePath().' by "'.$unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); + $this->logger->debug('Removing unlikely candidate (using conf) '.$node->getNodePath().' by "'.$unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); $node->parentNode->removeChild($node); --$nodeIndex; } @@ -859,7 +875,7 @@ class Readability $readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, PHP_ROUND_HALF_UP); if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) { - $this->dbg('Candidate: '.$item->getNodePath().' ('.$item->getAttribute('class').':'.$item->getAttribute('id').') with score '.$readability->value); + $this->logger->debug('Candidate: '.$item->getNodePath().' ('.$item->getAttribute('class').':'.$item->getAttribute('id').') with score '.$readability->value); $topCandidate = $item; } } @@ -877,9 +893,9 @@ class Readability if ($page instanceof \DOMDocument) { if (!isset($page->documentElement)) { // we don't have a body either? what a mess! :) - $this->dbg('The page has no body!'); + $this->logger->debug('The page has no body!'); } else { - $this->dbg('Setting body to a raw HTML of original page!'); + $this->logger->debug('Setting body to a raw HTML of original page!'); $topCandidate->innerHTML = $page->documentElement->innerHTML; $page->documentElement->innerHTML = ''; $this->reinitBody(); @@ -908,7 +924,7 @@ class Readability } } - $this->dbg('Top candidate: '.$topCandidate->getNodePath()); + $this->logger->debug('Top candidate: '.$topCandidate->getNodePath()); /* * Now that we have the top candidate, look through its siblings for content that might also be related. @@ -928,9 +944,8 @@ class Readability $siblingNode = $siblingNodes->item($s); $siblingNodeName = $siblingNode->nodeName; $append = false; - $this->dbg('Looking at sibling node: '.$siblingNode->getNodePath().(($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score '.$siblingNode->getAttribute('readability')) : '')); + $this->logger->debug('Looking at sibling node: '.$siblingNode->getNodePath().(($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score '.$siblingNode->getAttribute('readability')) : '')); - //$this->dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown')); if ($siblingNode->isSameNode($topCandidate)) { $append = true; } @@ -958,18 +973,18 @@ class Readability } if ($append) { - $this->dbg('Appending node: '.$siblingNode->getNodePath()); + $this->logger->debug('Appending node: '.$siblingNode->getNodePath()); if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) { // We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. - $this->dbg('Altering siblingNode "'.$siblingNodeName.'" to "div".'); + $this->logger->debug('Altering siblingNode "'.$siblingNodeName.'" to "div".'); $nodeToAppend = $this->dom->createElement('div'); try { $nodeToAppend->setAttribute('alt', $siblingNodeName); $nodeToAppend->innerHTML = $siblingNode->innerHTML; } catch (\Exception $e) { - $this->dbg('Could not alter siblingNode "'.$siblingNodeName.'" to "div", reverting to original.'); + $this->logger->debug('Could not alter siblingNode "'.$siblingNodeName.'" to "div", reverting to original.'); $nodeToAppend = $siblingNode; --$s; --$sl; @@ -1005,17 +1020,17 @@ class Readability if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); - $this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n"); + $this->logger->debug('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n"); return $this->grabArticle($this->body); } elseif ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) { $this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES); - $this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to weight attributes.\n"); + $this->logger->debug('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to weight attributes.\n"); return $this->grabArticle($this->body); } elseif ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); - $this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to clean at all.\n"); + $this->logger->debug('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to clean at all.\n"); return $this->grabArticle($this->body); } @@ -1036,9 +1051,9 @@ class Readability * * @return string */ - public function getInnerText(\DOMElement $e, $normalizeSpaces = true, $flattenLines = false) + public function getInnerText(\DOMElement $e = null, $normalizeSpaces = true, $flattenLines = false) { - if (!isset($e->textContent) || $e->textContent === '') { + if (null === $e || !isset($e->textContent) || $e->textContent === '') { return ''; } @@ -1262,10 +1277,10 @@ class Readability $node = $tagsList->item($i); $weight = $this->getWeight($node); $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0; - $this->dbg('Start conditional cleaning of '.$node->getNodePath().' (class='.$node->getAttribute('class').'; id='.$node->getAttribute('id').')'.(($node->hasAttribute('readability')) ? (' with score '.$node->getAttribute('readability')) : '')); + $this->logger->debug('Start conditional cleaning of '.$node->getNodePath().' (class='.$node->getAttribute('class').'; id='.$node->getAttribute('id').')'.(($node->hasAttribute('readability')) ? (' with score '.$node->getAttribute('readability')) : '')); if ($weight + $contentScore < 0) { - $this->dbg('Removing...'); + $this->logger->debug('Removing...'); $node->parentNode->removeChild($node); } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) { /* @@ -1299,51 +1314,51 @@ class Readability if ($this->lightClean) { if ($li > $p && $tag != 'ul' && $tag != 'ol') { - $this->dbg(' too many
This is the awesome content :)
', 7), 'http://0.0.0.0'); + $readability = $this->getReadability(str_repeat('This is the awesome content :)
', 7), 'http://0.0.0.0'); $res = $readability->init(); $this->assertTrue($res); @@ -96,7 +87,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase public function testInitDivP() { - $readability = new ReadabilityTested('This is the awesome content :)
', 7).'This is the awesome content :)
', 7).'This is an awesome text with some links, here there are: the awesome
', 7).'This is an awesome text with some links, here there are: the awesome
', 7).'This is an awesome text with some links, here there are: the awesome
', 7).'will NOT be removedThis is an awesome text with some links, here there are: the awesome
', 7).'will NOT be removedThis is an awesome text with some links, here there are: the awesome
', 7).'This is an awesome text with some links, here there are
This is an awesome text with some links, here there are: the awesome
', 7).'This is an awesome text with some links, here there are
'.str_repeat('This is an awesome text with some links, here there are: the awesome', 20).'
This is an awesome text with some links, here there are
'.str_repeat('This is an awesome text with some links, here there are: the awesome', 20).'
This is an awesome text with some links, here there are
This is an awesome text with some links, here there are: the awesome
', 7).'This is an awesome text with some links, here there are: the awesome
', 7).'This is an awesome text with some links, here there are: the awesome
', 7).'This is an awesome text with some links, here there are: the awesome
', 7).'This is an awesome text with some links, here there are: the awesome
', 7).'This is an awesome text with some links, here there are: the awesome
', 7).'This is an awesome text with some links, here there are the awesome | ', 7).'
This is an awesome text with some links, here there are the awesome | ', 7).'
This is an awesome text with some links, here there are the awesome
', 7).'This is an awesome text with some links, here there are the awesome
', 7).'This is an awesome text with some links, here there are the awesome
', 7).'This is an awesome text with some links, here there are the awesome
', 7).'This is an awesome text with some links, here there are the awesome
', 7).'This is an awesome text with some links, here there are the awesome
', 7).'This is an awesome text with some links, here there are the awesome
', 7).'This is an awesome text with some links, here there are the awesome
', 7).'This is an awesome text with some links, here there are the awesome
', 7).'This is an awesome text with some links, here there are the awesome
', 7).'This is an awesome text with some links, here there are the awesome
', 7).'This is an awesome text with some links, here there are the awesome
', 7).'This is the awesome content :)
', 7).'This is the awesome content :)
', 7).'This is the awesome content :)
', 7).'This is the awesome and WONDERFUL content :)
', 7).'