Merge pull request #13 from j0k3r/monolog

Use Monolog instead of custom solution
pull/14/head 1.1.0
Jeremy Benoist 10 years ago
commit 7b47e2f1de
  1. 17
      README.md
  2. 3
      composer.json
  3. 239
      src/Readability.php
  4. 132
      tests/ReadabilityTest.php

@ -45,3 +45,20 @@ if ($result) {
echo 'Looks like we couldn\'t find the content. :('; echo 'Looks like we couldn\'t find the content. :(';
} }
``` ```
If you want to debug it or check what's going on, you can inject a logger (it must implement `Psr\Log\LoggerInterface`; Monolog, for example):
```php
use Readability\Readability;
use Monolog\Logger;
use Monolog\Handler\StreamHandler;
$url = 'http://www.medialens.org/index.php/alerts/alert-archive/alerts-2013/729-thatcher.html';
$html = file_get_contents($url);
$logger = new Logger('readability');
$logger->pushHandler(new StreamHandler('path/to/your.log', Logger::DEBUG));
$readability = new Readability($html, $url);
$readability->setLogger($logger);
```

@ -24,7 +24,8 @@
"role": "Developer (original JS version)" "role": "Developer (original JS version)"
}], }],
"require": { "require": {
"php": ">=5.3.3" "php": ">=5.3.3",
"monolog/monolog": "^1.13.1"
}, },
"require-dev": { "require-dev": {
"satooshi/php-coveralls": "~0.6" "satooshi/php-coveralls": "~0.6"

@ -2,6 +2,10 @@
namespace Readability; namespace Readability;
use Psr\Log\LoggerAwareInterface;
use Psr\Log\LoggerInterface;
use Psr\Log\NullLogger;
/** /**
* Arc90's Readability ported to PHP for FiveFilters.org * Arc90's Readability ported to PHP for FiveFilters.org
* Based on readability.js version 1.7.1 (without multi-page support) * Based on readability.js version 1.7.1 (without multi-page support)
@ -45,7 +49,7 @@ namespace Readability;
* existing DOMElement objects without passing an entire HTML document to * existing DOMElement objects without passing an entire HTML document to
* be parsed. * be parsed.
*/ */
class Readability class Readability implements LoggerAwareInterface
{ {
public $convertLinksToFootnotes = false; public $convertLinksToFootnotes = false;
public $revertForcedParagraphElements = true; public $revertForcedParagraphElements = true;
@ -57,10 +61,9 @@ class Readability
public $url = null; public $url = null;
// preserves more content (experimental) // preserves more content (experimental)
public $lightClean = true; public $lightClean = true;
// no longer used, kept to avoid a BC break
public $debug = false; public $debug = false;
public $tidied = false; public $tidied = false;
// error text for one time output
protected $debugText = '';
// article domain regexp for calibration // article domain regexp for calibration
protected $domainRegExp = null; protected $domainRegExp = null;
protected $body = null; // protected $body = null; //
@ -70,6 +73,10 @@ class Readability
protected $flags = 7; protected $flags = 7;
// indicates whether we were able to extract or not // indicates whether we were able to extract or not
protected $success = false; protected $success = false;
protected $logger;
protected $parser;
protected $html;
protected $useTidy;
/** /**
* All of the regular expressions in use within readability. * All of the regular expressions in use within readability.
@ -167,13 +174,76 @@ class Readability
* @param string (optional) Which parser to use for turning raw HTML into a DOMDocument * @param string (optional) Which parser to use for turning raw HTML into a DOMDocument
* @param bool (optional) Use tidy * @param bool (optional) Use tidy
*/ */
public function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true) public function __construct($html, $url = null, $parser = 'libxml', $useTidy = true)
{ {
$this->url = $url; $this->url = $url;
$this->debugText = 'Parsing URL: '.$url."\n"; $this->html = $html;
$this->parser = $parser;
$this->useTidy = $useTidy && function_exists('tidy_parse_string');
$this->logger = new NullLogger();
}
if ($url) { public function setLogger(LoggerInterface $logger)
$this->domainRegExp = '/'.strtr(preg_replace('/www\d*\./', '', parse_url($url, PHP_URL_HOST)), array('.' => '\.')).'/'; {
$this->logger = $logger;
}
/**
* Get article title element.
*
* @return \DOMElement
*/
public function getTitle()
{
return $this->articleTitle;
}
/**
* Get article content element.
*
* @return \DOMElement
*/
public function getContent()
{
return $this->articleContent;
}
/**
* Add pre filter for raw input HTML processing.
*
* @param string RegExp for replace
* @param string (optional) Replacer
*/
public function addPreFilter($filter, $replacer = '')
{
$this->pre_filters[$filter] = $replacer;
}
/**
* Add post filter for raw output HTML processing.
*
* @param string RegExp for replace
* @param string (optional) Replacer
*/
public function addPostFilter($filter, $replacer = '')
{
$this->post_filters[$filter] = $replacer;
}
/**
* Load the HTML into a DOMDocument.
* Apply pre filters.
* Clean up the HTML using Tidy (when enabled).
*/
private function loadHtml()
{
$this->original_html = $this->html;
$this->logger->debug('Parsing URL: '.$this->url);
if ($this->url) {
$this->domainRegExp = '/'.strtr(preg_replace('/www\d*\./', '', parse_url($this->url, PHP_URL_HOST)), array('.' => '\.')).'/';
} }
mb_internal_encoding('UTF-8'); mb_internal_encoding('UTF-8');
@ -183,13 +253,13 @@ class Readability
// HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well... // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) { if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
foreach ($this->pre_filters as $search => $replace) { foreach ($this->pre_filters as $search => $replace) {
$html = preg_replace($search, $replace, $html); $this->html = preg_replace($search, $replace, $this->html);
} }
unset($search, $replace); unset($search, $replace);
} }
if (trim($html) === '') { if (trim($this->html) === '') {
$html = '<html></html>'; $this->html = '<html></html>';
} }
/* /*
@ -198,30 +268,30 @@ class Readability
* Although sometimes it makes matters worse, which is why there is an option to disable it. * Although sometimes it makes matters worse, which is why there is an option to disable it.
* *
*/ */
if ($use_tidy && function_exists('tidy_parse_string')) { if ($this->useTidy) {
$this->debugText .= 'Tidying document'."\n"; $this->logger->debug('Tidying document');
$tidy = tidy_parse_string($html, $this->tidy_config, 'UTF8');
$tidy = tidy_parse_string($this->html, $this->tidy_config, 'UTF8');
if (tidy_clean_repair($tidy)) { if (tidy_clean_repair($tidy)) {
$this->original_html = $html;
$this->tidied = true; $this->tidied = true;
$html = $tidy->value; $this->html = $tidy->value;
$html = preg_replace('/[\r\n]+/is', "\n", $html); $this->html = preg_replace('/[\r\n]+/is', "\n", $this->html);
} }
unset($tidy); unset($tidy);
} }
$html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'); $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
if (!($parser == 'html5lib' && ($this->dom = \HTML5_Parser::parse($html)))) { if (!($this->parser == 'html5lib' && ($this->dom = \HTML5_Parser::parse($this->html)))) {
libxml_use_internal_errors(true); libxml_use_internal_errors(true);
$this->dom = new \DOMDocument(); $this->dom = new \DOMDocument();
$this->dom->preserveWhiteSpace = false; $this->dom->preserveWhiteSpace = false;
if (PHP_VERSION_ID >= 50400) { if (PHP_VERSION_ID >= 50400) {
$this->dom->loadHTML($html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR); $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
} else { } else {
$this->dom->loadHTML($html); $this->dom->loadHTML($this->html);
} }
libxml_use_internal_errors(false); libxml_use_internal_errors(false);
@ -230,37 +300,6 @@ class Readability
$this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement'); $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
} }
/**
* Get article title element.
*
* @return \DOMElement
*/
public function getTitle()
{
return $this->articleTitle;
}
/**
* Get article content element.
*
* @return \DOMElement
*/
public function getContent()
{
return $this->articleContent;
}
/**
* Add post filter for raw output HTML processing.
*
* @param string RegExp for replace
* @param string (optional) Replacer
*/
public function addPostFilter($filter, $replacer = '')
{
$this->post_filters[$filter] = $replacer;
}
/** /**
* Runs readability. * Runs readability.
* *
@ -275,6 +314,8 @@ class Readability
*/ */
public function init() public function init()
{ {
$this->loadHtml();
if (!isset($this->dom->documentElement)) { if (!isset($this->dom->documentElement)) {
return false; return false;
} }
@ -327,34 +368,10 @@ class Readability
// Set title and content instance variables. // Set title and content instance variables.
$this->articleTitle = $articleTitle; $this->articleTitle = $articleTitle;
$this->articleContent = $articleContent; $this->articleContent = $articleContent;
$this->dump_dbg();
return $this->success; return $this->success;
} }
/**
* Debug.
*
* @param string $msg
*/
protected function dbg($msg) //, $error=false)
{
if ($this->debug) {
$this->debugText .= $msg."\n";
}
}
/**
* Dump debug info.
*/
protected function dump_dbg()
{
if ($this->debug) {
openlog('Readability PHP ', LOG_PID | LOG_PERROR, 0);
syslog(6, $this->debugText); // 1 - error 6 - info
}
}
/** /**
* Run any post-process modifications to article content as necessary. * Run any post-process modifications to article content as necessary.
* *
@ -511,7 +528,8 @@ class Readability
*/ */
public function prepArticle(\DOMElement $articleContent) public function prepArticle(\DOMElement $articleContent)
{ {
$this->dbg($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.'); $this->logger->debug($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.');
$this->cleanStyles($articleContent); $this->cleanStyles($articleContent);
$this->killBreaks($articleContent); $this->killBreaks($articleContent);
@ -594,7 +612,7 @@ class Readability
} }
unset($search, $replace); unset($search, $replace);
} catch (\Exception $e) { } catch (\Exception $e) {
$this->dbg('Cleaning output HTML failed. Ignoring: '.$e->getMessage()); $this->logger->error('Cleaning output HTML failed. Ignoring: '.$e->getMessage());
} }
} }
} }
@ -702,7 +720,6 @@ class Readability
// (as in, where they contain no other block level elements). // (as in, where they contain no other block level elements).
if (strcasecmp($tagName, 'div') === 0 || strcasecmp($tagName, 'article') === 0 || strcasecmp($tagName, 'section') === 0) { if (strcasecmp($tagName, 'div') === 0 || strcasecmp($tagName, 'article') === 0 || strcasecmp($tagName, 'section') === 0) {
if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
//$this->dbg('Altering '.$node->getNodePath().' to p');
$newNode = $this->dom->createElement('p'); $newNode = $this->dom->createElement('p');
try { try {
@ -712,7 +729,7 @@ class Readability
--$nodeIndex; --$nodeIndex;
$nodesToScore[] = $newNode; $nodesToScore[] = $newNode;
} catch (\Exception $e) { } catch (\Exception $e) {
$this->dbg('Could not alter div/article to p, reverting back to div: '.$e->getMessage()); $this->logger->error('Could not alter div/article to p, reverting back to div: '.$e->getMessage());
} }
} else { } else {
// Will change these P elements back to text nodes after processing. // Will change these P elements back to text nodes after processing.
@ -728,7 +745,6 @@ class Readability
// XML_TEXT_NODE // XML_TEXT_NODE
if ($childNode->nodeType == 3) { if ($childNode->nodeType == 3) {
//$this->dbg('replacing text node with a P tag with the same content.');
$p = $this->dom->createElement('p'); $p = $this->dom->createElement('p');
$p->innerHTML = $childNode->nodeValue; $p->innerHTML = $childNode->nodeValue;
$p->setAttribute('data-readability-styled', 'true'); $p->setAttribute('data-readability-styled', 'true');
@ -814,7 +830,7 @@ class Readability
$node = $candidates->item($c); $node = $candidates->item($c);
// node should be readable but not inside of an article otherwise it's probably non-readable block // node should be readable but not inside of an article otherwise it's probably non-readable block
if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) { if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) {
$this->dbg('Removing unlikely candidate (using note) '.$node->getNodePath().' by "'.$node->tagName.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); $this->logger->debug('Removing unlikely candidate (using note) '.$node->getNodePath().' by "'.$node->tagName.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
$node->parentNode->removeChild($node); $node->parentNode->removeChild($node);
} }
} }
@ -832,7 +848,7 @@ class Readability
preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
!preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)
) { ) {
$this->dbg('Removing unlikely candidate (using conf) '.$node->getNodePath().' by "'.$unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); $this->logger->debug('Removing unlikely candidate (using conf) '.$node->getNodePath().' by "'.$unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
$node->parentNode->removeChild($node); $node->parentNode->removeChild($node);
--$nodeIndex; --$nodeIndex;
} }
@ -859,7 +875,7 @@ class Readability
$readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, PHP_ROUND_HALF_UP); $readability->value = round($readability->value * (1 - $this->getLinkDensity($item)), 0, PHP_ROUND_HALF_UP);
if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) { if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) {
$this->dbg('Candidate: '.$item->getNodePath().' ('.$item->getAttribute('class').':'.$item->getAttribute('id').') with score '.$readability->value); $this->logger->debug('Candidate: '.$item->getNodePath().' ('.$item->getAttribute('class').':'.$item->getAttribute('id').') with score '.$readability->value);
$topCandidate = $item; $topCandidate = $item;
} }
} }
@ -877,9 +893,9 @@ class Readability
if ($page instanceof \DOMDocument) { if ($page instanceof \DOMDocument) {
if (!isset($page->documentElement)) { if (!isset($page->documentElement)) {
// we don't have a body either? what a mess! :) // we don't have a body either? what a mess! :)
$this->dbg('The page has no body!'); $this->logger->debug('The page has no body!');
} else { } else {
$this->dbg('Setting body to a raw HTML of original page!'); $this->logger->debug('Setting body to a raw HTML of original page!');
$topCandidate->innerHTML = $page->documentElement->innerHTML; $topCandidate->innerHTML = $page->documentElement->innerHTML;
$page->documentElement->innerHTML = ''; $page->documentElement->innerHTML = '';
$this->reinitBody(); $this->reinitBody();
@ -908,7 +924,7 @@ class Readability
} }
} }
$this->dbg('Top candidate: '.$topCandidate->getNodePath()); $this->logger->debug('Top candidate: '.$topCandidate->getNodePath());
/* /*
* Now that we have the top candidate, look through its siblings for content that might also be related. * Now that we have the top candidate, look through its siblings for content that might also be related.
@ -928,9 +944,8 @@ class Readability
$siblingNode = $siblingNodes->item($s); $siblingNode = $siblingNodes->item($s);
$siblingNodeName = $siblingNode->nodeName; $siblingNodeName = $siblingNode->nodeName;
$append = false; $append = false;
$this->dbg('Looking at sibling node: '.$siblingNode->getNodePath().(($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score '.$siblingNode->getAttribute('readability')) : '')); $this->logger->debug('Looking at sibling node: '.$siblingNode->getNodePath().(($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score '.$siblingNode->getAttribute('readability')) : ''));
//$this->dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));
if ($siblingNode->isSameNode($topCandidate)) { if ($siblingNode->isSameNode($topCandidate)) {
$append = true; $append = true;
} }
@ -958,18 +973,18 @@ class Readability
} }
if ($append) { if ($append) {
$this->dbg('Appending node: '.$siblingNode->getNodePath()); $this->logger->debug('Appending node: '.$siblingNode->getNodePath());
if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) { if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) {
// We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. // We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident.
$this->dbg('Altering siblingNode "'.$siblingNodeName.'" to "div".'); $this->logger->debug('Altering siblingNode "'.$siblingNodeName.'" to "div".');
$nodeToAppend = $this->dom->createElement('div'); $nodeToAppend = $this->dom->createElement('div');
try { try {
$nodeToAppend->setAttribute('alt', $siblingNodeName); $nodeToAppend->setAttribute('alt', $siblingNodeName);
$nodeToAppend->innerHTML = $siblingNode->innerHTML; $nodeToAppend->innerHTML = $siblingNode->innerHTML;
} catch (\Exception $e) { } catch (\Exception $e) {
$this->dbg('Could not alter siblingNode "'.$siblingNodeName.'" to "div", reverting to original.'); $this->logger->debug('Could not alter siblingNode "'.$siblingNodeName.'" to "div", reverting to original.');
$nodeToAppend = $siblingNode; $nodeToAppend = $siblingNode;
--$s; --$s;
--$sl; --$sl;
@ -1005,17 +1020,17 @@ class Readability
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
$this->removeFlag(self::FLAG_STRIP_UNLIKELYS); $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
$this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n"); $this->logger->debug('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n");
return $this->grabArticle($this->body); return $this->grabArticle($this->body);
} elseif ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) { } elseif ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
$this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES); $this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES);
$this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to weight attributes.\n"); $this->logger->debug('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to weight attributes.\n");
return $this->grabArticle($this->body); return $this->grabArticle($this->body);
} elseif ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { } elseif ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
$this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
$this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to clean at all.\n"); $this->logger->debug('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to clean at all.\n");
return $this->grabArticle($this->body); return $this->grabArticle($this->body);
} }
@ -1036,9 +1051,9 @@ class Readability
* *
* @return string * @return string
*/ */
public function getInnerText(\DOMElement $e, $normalizeSpaces = true, $flattenLines = false) public function getInnerText(\DOMElement $e = null, $normalizeSpaces = true, $flattenLines = false)
{ {
if (!isset($e->textContent) || $e->textContent === '') { if (null === $e || !isset($e->textContent) || $e->textContent === '') {
return ''; return '';
} }
@ -1262,10 +1277,10 @@ class Readability
$node = $tagsList->item($i); $node = $tagsList->item($i);
$weight = $this->getWeight($node); $weight = $this->getWeight($node);
$contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0; $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0;
$this->dbg('Start conditional cleaning of '.$node->getNodePath().' (class='.$node->getAttribute('class').'; id='.$node->getAttribute('id').')'.(($node->hasAttribute('readability')) ? (' with score '.$node->getAttribute('readability')) : '')); $this->logger->debug('Start conditional cleaning of '.$node->getNodePath().' (class='.$node->getAttribute('class').'; id='.$node->getAttribute('id').')'.(($node->hasAttribute('readability')) ? (' with score '.$node->getAttribute('readability')) : ''));
if ($weight + $contentScore < 0) { if ($weight + $contentScore < 0) {
$this->dbg('Removing...'); $this->logger->debug('Removing...');
$node->parentNode->removeChild($node); $node->parentNode->removeChild($node);
} elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) { } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) {
/* /*
@ -1299,51 +1314,51 @@ class Readability
if ($this->lightClean) { if ($this->lightClean) {
if ($li > $p && $tag != 'ul' && $tag != 'ol') { if ($li > $p && $tag != 'ul' && $tag != 'ol') {
$this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); $this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>');
$toRemove = true; $toRemove = true;
} elseif ($input > floor($p / 3)) { } elseif ($input > floor($p / 3)) {
$this->dbg(' too many <input> elements'); $this->logger->debug(' too many <input> elements');
$toRemove = true; $toRemove = true;
} elseif ($contentLength < 6 && ($embedCount === 0 && ($img === 0 || $img > 2))) { } elseif ($contentLength < 6 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
$this->dbg(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images'); $this->logger->debug(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
$toRemove = true; $toRemove = true;
} elseif ($weight < 25 && $linkDensity > 0.25) { } elseif ($weight < 25 && $linkDensity > 0.25) {
$this->dbg(' weight is '.$weight.' < 25 and link density is '.sprintf('%.2f', $linkDensity).' > 0.25'); $this->logger->debug(' weight is '.$weight.' < 25 and link density is '.sprintf('%.2f', $linkDensity).' > 0.25');
$toRemove = true; $toRemove = true;
} elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
$this->dbg(' more than 2 links and weight is '.$weight.' > 25 but link density is '.sprintf('%.2f', $linkDensity).' > 0.5'); $this->logger->debug(' more than 2 links and weight is '.$weight.' > 25 but link density is '.sprintf('%.2f', $linkDensity).' > 0.5');
$toRemove = true; $toRemove = true;
} elseif ($embedCount > 3) { } elseif ($embedCount > 3) {
$this->dbg(' more than 3 embeds'); $this->logger->debug(' more than 3 embeds');
$toRemove = true; $toRemove = true;
} }
} else { } else {
if ($img > $p) { if ($img > $p) {
$this->dbg(' more image elements than paragraph elements'); $this->logger->debug(' more image elements than paragraph elements');
$toRemove = true; $toRemove = true;
} elseif ($li > $p && $tag != 'ul' && $tag != 'ol') { } elseif ($li > $p && $tag != 'ul' && $tag != 'ol') {
$this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); $this->logger->debug(' too many <li> elements, and parent is not <ul> or <ol>');
$toRemove = true; $toRemove = true;
} elseif ($input > floor($p / 3)) { } elseif ($input > floor($p / 3)) {
$this->dbg(' too many <input> elements'); $this->logger->debug(' too many <input> elements');
$toRemove = true; $toRemove = true;
} elseif ($contentLength < 10 && ($img === 0 || $img > 2)) { } elseif ($contentLength < 10 && ($img === 0 || $img > 2)) {
$this->dbg(' content length less than 10 chars and 0 images, or more than 2 images'); $this->logger->debug(' content length less than 10 chars and 0 images, or more than 2 images');
$toRemove = true; $toRemove = true;
} elseif ($weight < 25 && $linkDensity > 0.2) { } elseif ($weight < 25 && $linkDensity > 0.2) {
$this->dbg(' weight is '.$weight.' lower than 0 and link density is '.sprintf('%.2f', $linkDensity).' > 0.2'); $this->logger->debug(' weight is '.$weight.' lower than 0 and link density is '.sprintf('%.2f', $linkDensity).' > 0.2');
$toRemove = true; $toRemove = true;
} elseif ($weight >= 25 && $linkDensity > 0.5) { } elseif ($weight >= 25 && $linkDensity > 0.5) {
$this->dbg(' weight above 25 but link density is '.sprintf('%.2f', $linkDensity).' > 0.5'); $this->logger->debug(' weight above 25 but link density is '.sprintf('%.2f', $linkDensity).' > 0.5');
$toRemove = true; $toRemove = true;
} elseif (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) { } elseif (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
$this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed'); $this->logger->debug(' 1 embed and content length smaller than 75 chars, or more than one embed');
$toRemove = true; $toRemove = true;
} }
} }
if ($toRemove) { if ($toRemove) {
$this->dbg('Removing...'); $this->logger->debug('Removing...');
$node->parentNode->removeChild($node); $node->parentNode->removeChild($node);
} }
} }

@ -3,75 +3,66 @@
namespace Tests\Readability; namespace Tests\Readability;
use Readability\Readability; use Readability\Readability;
use Monolog\Logger;
use Monolog\Handler\TestHandler;
class ReadabilityTested extends Readability class ReadabilityTest extends \PHPUnit_Framework_TestCase
{
public function getDebugText()
{ {
return $this->debugText; public $logHandler;
} public $logger;
public function getDomainRegexp() private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true)
{ {
return $this->domainRegExp; $readability = new Readability($html, $url, $parser, $useTidy);
}
}
class ReadabilityTest extends \PHPUnit_Framework_TestCase $this->logHandler = new TestHandler();
{ $this->logger = new Logger('test', array($this->logHandler));
/** $readability->setLogger($this->logger);
* @requires extension tidy
*/
public function testConstructDefault()
{
$readability = new ReadabilityTested('');
$this->assertNull($readability->url); return $readability;
$this->assertContains('Parsing URL', $readability->getDebugText());
$this->assertContains('Tidying document', $readability->getDebugText());
$this->assertNull($readability->getDomainRegexp());
$this->assertInstanceOf('DomDocument', $readability->dom);
} }
/**
* @requires extension tidy
*/
public function testConstructSimple() public function testConstructSimple()
{ {
$readability = new ReadabilityTested('<html/>', 'http://0.0.0.0'); $readability = $this->getReadability('<html/>', 'http://0.0.0.0');
$readability->init();
$this->assertEquals('http://0.0.0.0', $readability->url); $this->assertEquals('http://0.0.0.0', $readability->url);
$this->assertContains('Parsing URL', $readability->getDebugText()); $this->assertEquals('<html/>', $readability->original_html);
$this->assertContains('Tidying document', $readability->getDebugText()); $this->assertTrue($readability->tidied);
$this->assertEquals('/0\.0\.0\.0/', $readability->getDomainRegexp());
$this->assertInstanceOf('DomDocument', $readability->dom); $this->assertTrue($this->logHandler->hasDebugThatContains('Parsing URL: http://0.0.0.0'));
$this->assertTrue($this->logHandler->hasDebugThatContains('Tidying document'));
$this->assertTrue($this->logHandler->hasDebugThatContains('Light clean enabled.'));
} }
public function testConstructDefaultWithoutTidy() public function testConstructDefaultWithoutTidy()
{ {
$readability = new ReadabilityTested('', null, 'libxml', false); $readability = $this->getReadability('', null, 'libxml', false);
$readability->init();
$this->assertNull($readability->url); $this->assertNull($readability->url);
$this->assertContains('Parsing URL', $readability->getDebugText()); $this->assertEquals('', $readability->original_html);
$this->assertNotContains('Tidying document', $readability->getDebugText()); $this->assertFalse($readability->tidied);
$this->assertNull($readability->getDomainRegexp());
$this->assertInstanceOf('DomDocument', $readability->dom); $this->assertTrue($this->logHandler->hasDebugThatContains('Parsing URL: '));
$this->assertFalse($this->logHandler->hasDebugThatContains('Tidying document'));
$this->assertTrue($this->logHandler->hasDebugThatContains('Light clean enabled.'));
} }
public function testConstructSimpleWithoutTidy() public function testConstructSimpleWithoutTidy()
{ {
$readability = new ReadabilityTested('<html/>', 'http://0.0.0.0', 'libxml', false); $readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'libxml', false);
$readability->init();
$this->assertEquals('http://0.0.0.0', $readability->url); $this->assertEquals('http://0.0.0.0', $readability->url);
$this->assertContains('Parsing URL', $readability->getDebugText()); $this->assertEquals('<html/>', $readability->original_html);
$this->assertNotContains('Tidying document', $readability->getDebugText()); $this->assertFalse($readability->tidied);
$this->assertEquals('/0\.0\.0\.0/', $readability->getDomainRegexp());
$this->assertInstanceOf('DomDocument', $readability->dom);
} }
public function testInitNoContent() public function testInitNoContent()
{ {
$readability = new ReadabilityTested('<html/>', 'http://0.0.0.0'); $readability = $this->getReadability('<html/>', 'http://0.0.0.0');
$res = $readability->init(); $res = $readability->init();
$this->assertFalse($res); $this->assertFalse($res);
@ -83,7 +74,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testInitP() public function testInitP()
{ {
$readability = new ReadabilityTested(str_repeat('<p>This is the awesome content :)</p>', 7), 'http://0.0.0.0'); $readability = $this->getReadability(str_repeat('<p>This is the awesome content :)</p>', 7), 'http://0.0.0.0');
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -96,7 +87,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testInitDivP() public function testInitDivP()
{ {
$readability = new ReadabilityTested('<div>'.str_repeat('<p>This is the awesome content :)</p>', 7).'</div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div>'.str_repeat('<p>This is the awesome content :)</p>', 7).'</div>', 'http://0.0.0.0');
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -109,7 +100,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testInitDiv() public function testInitDiv()
{ {
$readability = new ReadabilityTested('<div>'.str_repeat('This is the awesome content :)', 7).'</div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div>'.str_repeat('This is the awesome content :)', 7).'</div>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
$res = $readability->init(); $res = $readability->init();
@ -123,7 +114,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testWithFootnotes() public function testWithFootnotes()
{ {
$readability = new ReadabilityTested('<div>'.str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7).'</div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div>'.str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7).'</div>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
$readability->convertLinksToFootnotes = true; $readability->convertLinksToFootnotes = true;
$res = $readability->init(); $res = $readability->init();
@ -140,7 +131,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testStandardClean() public function testStandardClean()
{ {
$readability = new ReadabilityTested('<div><h2>Title</h2>'.str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7).'<a href="#nofollow" rel="nofollow">will NOT be removed</a></div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div><h2>Title</h2>'.str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7).'<a href="#nofollow" rel="nofollow">will NOT be removed</a></div>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
$readability->lightClean = false; $readability->lightClean = false;
$res = $readability->init(); $res = $readability->init();
@ -157,7 +148,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testWithIframe() public function testWithIframe()
{ {
$readability = new ReadabilityTested('<div><h2>Title</h2>'.str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7).'<p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div><h2>Title</h2>'.str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7).'<p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
$res = $readability->init(); $res = $readability->init();
@ -172,7 +163,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testWithArticle() public function testWithArticle()
{ {
$readability = new ReadabilityTested('<article><p>'.str_repeat('This is an awesome text with some links, here there are: the awesome', 20).'</p><p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article><p>'.str_repeat('This is an awesome text with some links, here there are: the awesome', 20).'</p><p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe></p></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
$res = $readability->init(); $res = $readability->init();
@ -187,7 +178,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testWithAside() public function testWithAside()
{ {
$readability = new ReadabilityTested('<article>'.str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7).'<footer><aside>'.str_repeat('<p>This is an awesome text with some links, here there are</p>', 8).'</aside></footer></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article>'.str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7).'<footer><aside>'.str_repeat('<p>This is an awesome text with some links, here there are</p>', 8).'</aside></footer></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
$res = $readability->init(); $res = $readability->init();
@ -202,7 +193,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testWithClasses() public function testWithClasses()
{ {
$readability = new ReadabilityTested('<article>'.str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7).'<div style="display:none">'.str_repeat('<p class="clock">This text should be removed</p>', 10).'</div></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article>'.str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7).'<div style="display:none">'.str_repeat('<p class="clock">This text should be removed</p>', 10).'</div></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
$res = $readability->init(); $res = $readability->init();
@ -217,7 +208,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testWithClassesWithoutLightClean() public function testWithClassesWithoutLightClean()
{ {
$readability = new ReadabilityTested('<article>'.str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7).'<div style="display:none">'.str_repeat('<p class="clock">This text should be removed</p>', 10).'</div></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article>'.str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7).'<div style="display:none">'.str_repeat('<p class="clock">This text should be removed</p>', 10).'</div></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
$readability->lightClean = false; $readability->lightClean = false;
$res = $readability->init(); $res = $readability->init();
@ -233,7 +224,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testWithTd() public function testWithTd()
{ {
$readability = new ReadabilityTested('<table><tr>'.str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7).'</tr></table>', 'http://0.0.0.0'); $readability = $this->getReadability('<table><tr>'.str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7).'</tr></table>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
$res = $readability->init(); $res = $readability->init();
@ -246,7 +237,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testWithSameClasses() public function testWithSameClasses()
{ {
$readability = new ReadabilityTested('<article class="awesomecontent">'.str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7).'<div class="awesomecontent">This text is also an awesome text and you should know that !</div></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article class="awesomecontent">'.str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7).'<div class="awesomecontent">This text is also an awesome text and you should know that !</div></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
$res = $readability->init(); $res = $readability->init();
@ -260,7 +251,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testWithScript() public function testWithScript()
{ {
$readability = new ReadabilityTested('<article class="awesomecontent">'.str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7).'<p><script>This text is also an awesome text and you should know that !</script></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article class="awesomecontent">'.str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7).'<p><script>This text is also an awesome text and you should know that !</script></p></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
$res = $readability->init(); $res = $readability->init();
@ -274,7 +265,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testTitle() public function testTitle()
{ {
$readability = new ReadabilityTested('<title>this is my title</title><article class="awesomecontent">'.str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7).'<p></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<title>this is my title</title><article class="awesomecontent">'.str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7).'<p></p></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
$res = $readability->init(); $res = $readability->init();
@ -288,7 +279,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testTitleWithDash() public function testTitleWithDash()
{ {
$readability = new ReadabilityTested('<title> title2 - title3 </title><article class="awesomecontent">'.str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7).'<p></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<title> title2 - title3 </title><article class="awesomecontent">'.str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7).'<p></p></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
$res = $readability->init(); $res = $readability->init();
@ -302,7 +293,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testTitleWithDoubleDot() public function testTitleWithDoubleDot()
{ {
$readability = new ReadabilityTested('<title> title2 : title3 </title><article class="awesomecontent">'.str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7).'<p></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<title> title2 : title3 </title><article class="awesomecontent">'.str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7).'<p></p></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
$res = $readability->init(); $res = $readability->init();
@ -316,7 +307,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testTitleTooShortUseH1() public function testTitleTooShortUseH1()
{ {
$readability = new ReadabilityTested('<title>too short</title><h1>this is my h1 title !</h1><article class="awesomecontent">'.str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7).'<p></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<title>too short</title><h1>this is my h1 title !</h1><article class="awesomecontent">'.str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7).'<p></p></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
$res = $readability->init(); $res = $readability->init();
@ -330,13 +321,9 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
// public function testConstructParser() // public function testConstructParser()
// { // {
// $readability = new ReadabilityTested('<html/>', 'http://0.0.0.0', 'html5lib'); // $readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'html5lib');
// $this->assertEquals('http://0.0.0.0', $readability->url); // $this->assertEquals('http://0.0.0.0', $readability->url);
// $this->assertContains('Parsing URL', $readability->getDebugText());
// $this->assertContains('Tidying document', $readability->getDebugText());
// $this->assertEquals('/0\.0\.0\.0/', $readability->getDomainRegexp());
// $this->assertInstanceOf('DomDocument', $readability->dom);
// } // }
// dummy function to be used to the next test // dummy function to be used to the next test
@ -376,7 +363,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
</body> </body>
</html>'; </html>';
$readability = new ReadabilityTested($data, 'http://iosgames.ru/?p=22030'); $readability = $this->getReadability($data, 'http://iosgames.ru/?p=22030');
$readability->debug = true; $readability->debug = true;
$res = $readability->init(); $res = $readability->init();
@ -437,7 +424,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
</body> </body>
</html>'; </html>';
$readability = new ReadabilityTested($data, 'http://0.0.0.0'); $readability = $this->getReadability($data, 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
$res = $readability->init(); $res = $readability->init();
@ -449,18 +436,23 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
public function testPostFilters() public function testPostFilters()
{ {
$readability = new ReadabilityTested('<div>'.str_repeat('<p>This <b>is</b> the awesome content :)</p>', 7).'</div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div>'.str_repeat('<p>This <b>is</b> the awesome content :)</p>', 7).'</div>', 'http://0.0.0.0');
$readability->addPostFilter('!<strong[^>]*>(.*?)</strong>!is', '');
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertContains('This <strong>is</strong> the awesome content :)', $readability->getContent()->innerHTML); $this->assertContains('This the awesome content :)', $readability->getContent()->innerHTML);
}
$readability = new ReadabilityTested('<div>'.str_repeat('<p>This <b>is</b> the awesome content :)</p>', 7).'</div>', 'http://0.0.0.0'); public function testPreFilters()
$readability->addPostFilter('!<strong[^>]*>(.*?)</strong>!is', ''); {
$readability = $this->getReadability('<div>'.str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7).'</div>', 'http://0.0.0.0');
$readability->addPreFilter('!<b[^>]*>(.*?)</b>!is', '');
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
$this->assertContains('This the awesome content :)', $readability->getContent()->innerHTML); $this->assertContains('This the awesome and WONDERFUL content :)', $readability->getContent()->innerHTML);
} }
} }

Loading…
Cancel
Save