diff --git a/README.md b/README.md index 60dfcb5..68a8f00 100644 --- a/README.md +++ b/README.md @@ -45,3 +45,20 @@ if ($result) { echo 'Looks like we couldn\'t find the content. :('; } ``` + +If you want to debug it or see what's going on, you can inject a logger (it must implement `Psr\Log\LoggerInterface`; Monolog, for example): + +```php +use Readability\Readability; +use Monolog\Logger; +use Monolog\Handler\StreamHandler; + +$url = 'http://www.medialens.org/index.php/alerts/alert-archive/alerts-2013/729-thatcher.html'; +$html = file_get_contents($url); + +$logger = new Logger('readability'); +$logger->pushHandler(new StreamHandler('path/to/your.log', Logger::DEBUG)); + +$readability = new Readability($html, $url); +$readability->setLogger($logger); +``` diff --git a/composer.json b/composer.json index 5487a5c..3921e36 100644 --- a/composer.json +++ b/composer.json @@ -24,7 +24,8 @@ "role": "Developer (original JS version)" }], "require": { - "php": ">=5.3.3" + "php": ">=5.3.3", + "monolog/monolog": "^1.13.1" }, "require-dev": { "satooshi/php-coveralls": "~0.6" diff --git a/src/Readability.php b/src/Readability.php index b40a544..cb42741 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -2,6 +2,10 @@ namespace Readability; +use Psr\Log\LoggerAwareInterface; +use Psr\Log\LoggerInterface; +use Psr\Log\NullLogger; + /** * Arc90's Readability ported to PHP for FiveFilters.org * Based on readability.js version 1.7.1 (without multi-page support) @@ -45,7 +49,7 @@ namespace Readability; * existing DOMElement objects without passing an entire HTML document to * be parsed. 
*/ -class Readability +class Readability implements LoggerAwareInterface { public $convertLinksToFootnotes = false; public $revertForcedParagraphElements = true; @@ -57,10 +61,9 @@ class Readability public $url = null; // preserves more content (experimental) public $lightClean = true; + // no longer used, kept to avoid a BC break public $debug = false; public $tidied = false; - // error text for one time output - protected $debugText = ''; // article domain regexp for calibration protected $domainRegExp = null; protected $body = null; // @@ -70,6 +73,10 @@ class Readability protected $flags = 7; // indicates whether we were able to extract or not protected $success = false; + protected $logger; + protected $parser; + protected $html; + protected $useTidy; /** * All of the regular expressions in use within readability. @@ -167,13 +174,78 @@ class Readability * @param string (optional) Which parser to use for turning raw HTML into a DOMDocument * @param bool (optional) Use tidy */ - public function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true) + public function __construct($html, $url = null, $parser = 'libxml', $useTidy = true) { $this->url = $url; - $this->debugText = 'Parsing URL: '.$url."\n"; + $this->html = $html; + $this->parser = $parser; + $this->useTidy = $useTidy && function_exists('tidy_parse_string'); + + $this->logger = new NullLogger(); + } + + public function setLogger(LoggerInterface $logger) + { + $this->logger = $logger; + } + + /** + * Get article title element. + * + * @return \DOMElement + */ + public function getTitle() + { + return $this->articleTitle; + } + + /** + * Get article content element. + * + * @return \DOMElement + */ + public function getContent() + { + return $this->articleContent; + } + + /** + * Add pre filter for raw input HTML processing. 
+ * + * @param string RegExp for replace + * @param string (optional) Replacer + */ + public function addPreFilter($filter, $replacer = '') + { + $this->pre_filters[$filter] = $replacer; + } + + /** + * Add post filter for raw output HTML processing. + * + * @param string RegExp for replace + * @param string (optional) Replacer + */ + public function addPostFilter($filter, $replacer = '') + { + $this->post_filters[$filter] = $replacer; + } + + /** + * Load the HTML into a DOMDocument. + * Applies the pre filters, then + * cleans up the HTML using Tidy (when enabled). + * + * @return void + */ + private function loadHtml() + { + $this->original_html = $this->html; + + $this->logger->debug('Parsing URL: '.$this->url); - if ($url) { - $this->domainRegExp = '/'.strtr(preg_replace('/www\d*\./', '', parse_url($url, PHP_URL_HOST)), array('.' => '\.')).'/'; + if ($this->url) { + $this->domainRegExp = '/'.strtr(preg_replace('/www\d*\./', '', parse_url($this->url, PHP_URL_HOST)), array('.' => '\.')).'/'; } mb_internal_encoding('UTF-8'); @@ -183,13 +255,13 @@ class Readability // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well... if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) { foreach ($this->pre_filters as $search => $replace) { - $html = preg_replace($search, $replace, $html); + $this->html = preg_replace($search, $replace, $this->html); } unset($search, $replace); } - if (trim($html) === '') { - $html = ''; + if (trim($this->html) === '') { + $this->html = ''; } /* @@ -198,30 +270,30 @@ class Readability * Although sometimes it makes matters worse, which is why there is an option to disable it. 
* */ - if ($use_tidy && function_exists('tidy_parse_string')) { - $this->debugText .= 'Tidying document'."\n"; - $tidy = tidy_parse_string($html, $this->tidy_config, 'UTF8'); + if ($this->useTidy) { + $this->logger->debug('Tidying document'); + + $tidy = tidy_parse_string($this->html, $this->tidy_config, 'UTF8'); if (tidy_clean_repair($tidy)) { - $this->original_html = $html; $this->tidied = true; - $html = $tidy->value; - $html = preg_replace('/[\r\n]+/is', "\n", $html); + $this->html = $tidy->value; + $this->html = preg_replace('/[\r\n]+/is', "\n", $this->html); } unset($tidy); } - $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'); + $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8'); - if (!($parser == 'html5lib' && ($this->dom = \HTML5_Parser::parse($html)))) { + if (!($this->parser == 'html5lib' && ($this->dom = \HTML5_Parser::parse($this->html)))) { libxml_use_internal_errors(true); $this->dom = new \DOMDocument(); $this->dom->preserveWhiteSpace = false; if (PHP_VERSION_ID >= 50400) { - $this->dom->loadHTML($html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR); + $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR); } else { - $this->dom->loadHTML($html); + $this->dom->loadHTML($this->html); } libxml_use_internal_errors(false); @@ -230,37 +302,6 @@ class Readability $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement'); } - /** - * Get article title element. - * - * @return \DOMElement - */ - public function getTitle() - { - return $this->articleTitle; - } - - /** - * Get article content element. - * - * @return \DOMElement - */ - public function getContent() - { - return $this->articleContent; - } - - /** - * Add post filter for raw output HTML processing. 
- * - * @param string RegExp for replace - * @param string (optional) Replacer - */ - public function addPostFilter($filter, $replacer = '') - { - $this->post_filters[$filter] = $replacer; - } - /** * Runs readability. * @@ -275,6 +316,8 @@ class Readability */ public function init() { + $this->loadHtml(); + if (!isset($this->dom->documentElement)) { return false; } @@ -327,34 +370,10 @@ class Readability // Set title and content instance variables. $this->articleTitle = $articleTitle; $this->articleContent = $articleContent; - $this->dump_dbg(); return $this->success; } - /** - * Debug. - * - * @param string $msg - */ - protected function dbg($msg) //, $error=false) - { - if ($this->debug) { - $this->debugText .= $msg."\n"; - } - } - - /** - * Dump debug info. - */ - protected function dump_dbg() - { - if ($this->debug) { - openlog('Readability PHP ', LOG_PID | LOG_PERROR, 0); - syslog(6, $this->debugText); // 1 - error 6 - info - } - } - /** * Run any post-process modifications to article content as necessary. * @@ -511,7 +530,8 @@ class Readability */ public function prepArticle(\DOMElement $articleContent) { - $this->dbg($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.'); + $this->logger->debug($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.'); + $this->cleanStyles($articleContent); $this->killBreaks($articleContent); @@ -594,7 +614,7 @@ class Readability } unset($search, $replace); } catch (\Exception $e) { - $this->dbg('Cleaning output HTML failed. Ignoring: '.$e->getMessage()); + $this->logger->error('Cleaning output HTML failed. Ignoring: '.$e->getMessage()); } } } @@ -702,7 +722,6 @@ class Readability // (as in, where they contain no other block level elements). if (strcasecmp($tagName, 'div') === 0 || strcasecmp($tagName, 'article') === 0 || strcasecmp($tagName, 'section') === 0) { if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { - //$this->dbg('Altering '.$node->getNodePath().' 
to p'); $newNode = $this->dom->createElement('p'); try { @@ -712,7 +731,7 @@ class Readability --$nodeIndex; $nodesToScore[] = $newNode; } catch (\Exception $e) { - $this->dbg('Could not alter div/article to p, reverting back to div: '.$e->getMessage()); + $this->logger->error('Could not alter div/article to p, reverting back to div: '.$e->getMessage()); } } else { // Will change these P elements back to text nodes after processing. @@ -728,7 +747,6 @@ class Readability // XML_TEXT_NODE if ($childNode->nodeType == 3) { - //$this->dbg('replacing text node with a P tag with the same content.'); $p = $this->dom->createElement('p'); $p->innerHTML = $childNode->nodeValue; $p->setAttribute('data-readability-styled', 'true'); @@ -814,7 +832,7 @@ class Readability $node = $candidates->item($c); // node should be readable but not inside of an article otherwise it's probably non-readable block if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) { - $this->dbg('Removing unlikely candidate (using note) '.$node->getNodePath().' by "'.$node->tagName.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); + $this->logger->debug('Removing unlikely candidate (using note) '.$node->getNodePath().' by "'.$node->tagName.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); $node->parentNode->removeChild($node); } } @@ -832,7 +850,7 @@ class Readability preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) ) { - $this->dbg('Removing unlikely candidate (using conf) '.$node->getNodePath().' by "'.$unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? 
(int) $node->getAttributeNode('readability')->value : 0)); + $this->logger->debug('Removing unlikely candidate (using conf) '.$node->getNodePath().' by "'.$unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); $node->parentNode->removeChild($node); --$nodeIndex; } @@ -859,7 +877,7 @@ class Readability $readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, PHP_ROUND_HALF_UP); if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) { - $this->dbg('Candidate: '.$item->getNodePath().' ('.$item->getAttribute('class').':'.$item->getAttribute('id').') with score '.$readability->value); + $this->logger->debug('Candidate: '.$item->getNodePath().' ('.$item->getAttribute('class').':'.$item->getAttribute('id').') with score '.$readability->value); $topCandidate = $item; } } @@ -877,9 +895,9 @@ class Readability if ($page instanceof \DOMDocument) { if (!isset($page->documentElement)) { // we don't have a body either? what a mess! :) - $this->dbg('The page has no body!'); + $this->logger->debug('The page has no body!'); } else { - $this->dbg('Setting body to a raw HTML of original page!'); + $this->logger->debug('Setting body to a raw HTML of original page!'); $topCandidate->innerHTML = $page->documentElement->innerHTML; $page->documentElement->innerHTML = ''; $this->reinitBody(); @@ -908,7 +926,7 @@ class Readability } } - $this->dbg('Top candidate: '.$topCandidate->getNodePath()); + $this->logger->debug('Top candidate: '.$topCandidate->getNodePath()); /* * Now that we have the top candidate, look through its siblings for content that might also be related. 
@@ -928,9 +946,8 @@ class Readability $siblingNode = $siblingNodes->item($s); $siblingNodeName = $siblingNode->nodeName; $append = false; - $this->dbg('Looking at sibling node: '.$siblingNode->getNodePath().(($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score '.$siblingNode->getAttribute('readability')) : '')); + $this->logger->debug('Looking at sibling node: '.$siblingNode->getNodePath().(($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score '.$siblingNode->getAttribute('readability')) : '')); - //$this->dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown')); if ($siblingNode->isSameNode($topCandidate)) { $append = true; } @@ -958,18 +975,18 @@ class Readability } if ($append) { - $this->dbg('Appending node: '.$siblingNode->getNodePath()); + $this->logger->debug('Appending node: '.$siblingNode->getNodePath()); if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) { // We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. 
- $this->dbg('Altering siblingNode "'.$siblingNodeName.'" to "div".'); + $this->logger->debug('Altering siblingNode "'.$siblingNodeName.'" to "div".'); $nodeToAppend = $this->dom->createElement('div'); try { $nodeToAppend->setAttribute('alt', $siblingNodeName); $nodeToAppend->innerHTML = $siblingNode->innerHTML; } catch (\Exception $e) { - $this->dbg('Could not alter siblingNode "'.$siblingNodeName.'" to "div", reverting to original.'); + $this->logger->debug('Could not alter siblingNode "'.$siblingNodeName.'" to "div", reverting to original.'); $nodeToAppend = $siblingNode; --$s; --$sl; @@ -1005,17 +1022,17 @@ class Readability if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); - $this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n"); + $this->logger->debug('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n"); return $this->grabArticle($this->body); } elseif ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) { $this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES); - $this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to weight attributes.\n"); + $this->logger->debug('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to weight attributes.\n"); return $this->grabArticle($this->body); } elseif ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); - $this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to clean at all.\n"); + $this->logger->debug('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to clean at all.\n"); return $this->grabArticle($this->body); } @@ -1262,10 +1279,10 @@ class Readability $node = $tagsList->item($i); $weight = $this->getWeight($node); $contentScore = ($node->hasAttribute('readability')) ? 
(int) $node->getAttribute('readability') : 0; - $this->dbg('Start conditional cleaning of '.$node->getNodePath().' (class='.$node->getAttribute('class').'; id='.$node->getAttribute('id').')'.(($node->hasAttribute('readability')) ? (' with score '.$node->getAttribute('readability')) : '')); + $this->logger->debug('Start conditional cleaning of '.$node->getNodePath().' (class='.$node->getAttribute('class').'; id='.$node->getAttribute('id').')'.(($node->hasAttribute('readability')) ? (' with score '.$node->getAttribute('readability')) : '')); if ($weight + $contentScore < 0) { - $this->dbg('Removing...'); + $this->logger->debug('Removing...'); $node->parentNode->removeChild($node); } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) { /* @@ -1299,51 +1316,51 @@ class Readability if ($this->lightClean) { if ($li > $p && $tag != 'ul' && $tag != 'ol') { - $this->dbg(' too many
  • elements, and parent is not