pull/7/head v1.0.4
Jeremy Benoist 11 years ago
parent 908a49824f
commit c5a4a490e1
  1. 2
      src/JSLikeHTMLElement.php
  2. 74
      src/Readability.php

@ -47,7 +47,7 @@ class JSLikeHTMLElement extends \DOMElement
{ {
if ($name == 'innerHTML') { if ($name == 'innerHTML') {
// first, empty the element // first, empty the element
for ($x = $this->childNodes->length - 1; $x >= 0; $x--) { for ($x = $this->childNodes->length - 1; $x >= 0; --$x) {
$this->removeChild($this->childNodes->item($x)); $this->removeChild($this->childNodes->item($x));
} }
// $value holds our new inner HTML // $value holds our new inner HTML

@ -287,7 +287,7 @@ class Readability
// Build readability's DOM tree. // Build readability's DOM tree.
$overlay = $this->dom->createElement('div'); $overlay = $this->dom->createElement('div');
$innerDiv = $this->dom->createElement('div'); $innerDiv = $this->dom->createElement('div');
$articleTitle = $this->getArticleTitle(); $articleTitle = $this->getArticleTitle();
$articleContent = $this->grabArticle(); $articleContent = $this->grabArticle();
if (!$articleContent) { if (!$articleContent) {
$this->success = false; $this->success = false;
@ -406,11 +406,11 @@ class Readability
$this->body->setAttribute('id', 'readabilityBody'); $this->body->setAttribute('id', 'readabilityBody');
// Remove all style tags in head. // Remove all style tags in head.
$styleTags = $this->dom->getElementsByTagName('style'); $styleTags = $this->dom->getElementsByTagName('style');
for ($i = $styleTags->length - 1; $i >= 0; $i--) { for ($i = $styleTags->length - 1; $i >= 0; --$i) {
$styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); $styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
} }
$linkTags = $this->dom->getElementsByTagName('link'); $linkTags = $this->dom->getElementsByTagName('link');
for ($i = $linkTags->length - 1; $i >= 0; $i--) { for ($i = $linkTags->length - 1; $i >= 0; --$i) {
$linkTags->item($i)->parentNode->removeChild($linkTags->item($i)); $linkTags->item($i)->parentNode->removeChild($linkTags->item($i));
} }
} }
@ -430,12 +430,12 @@ class Readability
$footnotesWrapper->appendChild($articleFootnotes); $footnotesWrapper->appendChild($articleFootnotes);
$articleLinks = $articleContent->getElementsByTagName('a'); $articleLinks = $articleContent->getElementsByTagName('a');
$linkCount = 0; $linkCount = 0;
for ($i = 0; $i < $articleLinks->length; $i++) { for ($i = 0; $i < $articleLinks->length; ++$i) {
$articleLink = $articleLinks->item($i); $articleLink = $articleLinks->item($i);
$footnoteLink = $articleLink->cloneNode(true); $footnoteLink = $articleLink->cloneNode(true);
$refLink = $this->dom->createElement('a'); $refLink = $this->dom->createElement('a');
$footnote = $this->dom->createElement('li'); $footnote = $this->dom->createElement('li');
$linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST); $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
if (!$linkDomain && isset($this->url)) { if (!$linkDomain && isset($this->url)) {
$linkDomain = @parse_url($this->url, PHP_URL_HOST); $linkDomain = @parse_url($this->url, PHP_URL_HOST);
} }
@ -443,7 +443,7 @@ class Readability
if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
continue; continue;
} }
$linkCount++; ++$linkCount;
// Add a superscript reference after the article link. // Add a superscript reference after the article link.
$refLink->setAttribute('href', '#readabilityFootnoteLink-'.$linkCount); $refLink->setAttribute('href', '#readabilityFootnoteLink-'.$linkCount);
$refLink->innerHTML = '<small><sup>['.$linkCount.']</sup></small>'; $refLink->innerHTML = '<small><sup>['.$linkCount.']</sup></small>';
@ -492,20 +492,20 @@ class Readability
* which is what they were before. * which is what they were before.
*/ */
$elems = $xpath->query('.//p[@data-readability-styled]', $articleContent); $elems = $xpath->query('.//p[@data-readability-styled]', $articleContent);
for ($i = $elems->length - 1; $i >= 0; $i--) { for ($i = $elems->length - 1; $i >= 0; --$i) {
$e = $elems->item($i); $e = $elems->item($i);
$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
} }
} }
// Remove service data-candidate attribute. // Remove service data-candidate attribute.
$elems = $xpath->query('.//*[@data-candidate]', $articleContent); $elems = $xpath->query('.//*[@data-candidate]', $articleContent);
for ($i = $elems->length - 1; $i >= 0; $i--) { for ($i = $elems->length - 1; $i >= 0; --$i) {
$elems->item($i)->removeAttribute('data-candidate'); $elems->item($i)->removeAttribute('data-candidate');
} }
// Remove unrelated links and other unneeded stuff. // Remove unrelated links and other unneeded stuff.
// (not(*) and not(text()[normalize-space()])) or // What's wrong here? // (not(*) and not(text()[normalize-space()])) or // What's wrong here?
$elems = $xpath->query('.//a[@rel="nofollow"]', $articleContent); $elems = $xpath->query('.//a[@rel="nofollow"]', $articleContent);
for ($i = $elems->length - 1; $i >= 0; $i--) { for ($i = $elems->length - 1; $i >= 0; --$i) {
$elems->item($i)->parentNode->removeChild($elems->item($i)); $elems->item($i)->parentNode->removeChild($elems->item($i));
} }
// Clean out junk from the article content. // Clean out junk from the article content.
@ -534,7 +534,7 @@ class Readability
$this->cleanConditionally($articleContent, 'div'); $this->cleanConditionally($articleContent, 'div');
// Remove extra paragraphs. // Remove extra paragraphs.
$articleParagraphs = $articleContent->getElementsByTagName('p'); $articleParagraphs = $articleContent->getElementsByTagName('p');
for ($i = $articleParagraphs->length - 1; $i >= 0; $i--) { for ($i = $articleParagraphs->length - 1; $i >= 0; --$i) {
$imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length; $imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length;
$embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length; $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length;
$objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length; $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length;
@ -650,7 +650,7 @@ class Readability
$xpath = new \DOMXPath($page); $xpath = new \DOMXPath($page);
} }
$allElements = $page->getElementsByTagName('*'); $allElements = $page->getElementsByTagName('*');
for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) { for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); ++$nodeIndex) {
$tagName = $node->tagName; $tagName = $node->tagName;
// Some well-known site uses sections as paragraphs. // Some well-known site uses sections as paragraphs.
if (strcasecmp($tagName, 'p') === 0 || strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'section') === 0) { if (strcasecmp($tagName, 'p') === 0 || strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'section') === 0) {
@ -668,14 +668,14 @@ class Readability
//$newNode->setAttribute('class', $node->getAttribute('class')); //$newNode->setAttribute('class', $node->getAttribute('class'));
//$newNode->setAttribute('id', $node->getAttribute('id')); //$newNode->setAttribute('id', $node->getAttribute('id'));
$node = $node->parentNode->replaceChild($newNode, $node); $node = $node->parentNode->replaceChild($newNode, $node);
$nodeIndex--; --$nodeIndex;
$nodesToScore[] = $newNode; $nodesToScore[] = $newNode;
} catch (Exception $e) { } catch (Exception $e) {
$this->dbg('Could not alter div/article to p, reverting back to div: '.$e->getMessage()); $this->dbg('Could not alter div/article to p, reverting back to div: '.$e->getMessage());
} }
} else { } else {
// Will change these P elements back to text nodes after processing. // Will change these P elements back to text nodes after processing.
for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) { for ($i = 0, $il = $node->childNodes->length; $i < $il; ++$i) {
$childNode = $node->childNodes->item($i); $childNode = $node->childNodes->item($i);
if (is_object($childNode) && get_class($childNode) === 'DOMProcessingInstruction') { //executable tags (<?php or <?xml) warning if (is_object($childNode) && get_class($childNode) === 'DOMProcessingInstruction') { //executable tags (<?php or <?xml) warning
$childNode->parentNode->removeChild($childNode); $childNode->parentNode->removeChild($childNode);
@ -699,7 +699,7 @@ class Readability
* A score is determined by things like number of commas, class names, etc. * A score is determined by things like number of commas, class names, etc.
* Maybe eventually link density. * Maybe eventually link density.
*/ */
for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; $pt++) { for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; ++$pt) {
$parentNode = $nodesToScore[$pt]->parentNode; $parentNode = $nodesToScore[$pt]->parentNode;
// No parent node? Move on... // No parent node? Move on...
if (!$parentNode) { if (!$parentNode) {
@ -754,7 +754,7 @@ class Readability
*/ */
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) { if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) {
$candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement); $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement);
for ($node = null, $c = $candidates->length - 1; $c >= 0; $c--) { for ($node = null, $c = $candidates->length - 1; $c >= 0; --$c) {
$node = $candidates->item($c); $node = $candidates->item($c);
// The node should be readable but not inside of an article; otherwise it's probably a non-readable block. // The node should be readable but not inside of an article; otherwise it's probably a non-readable block.
if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) { if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) {
@ -763,7 +763,7 @@ class Readability
} }
} }
$candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement); $candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement);
for ($node = null, $c = $candidates->length - 1; $c >= 0; $c--) { for ($node = null, $c = $candidates->length - 1; $c >= 0; --$c) {
$node = $candidates->item($c); $node = $candidates->item($c);
$tagName = $node->tagName; $tagName = $node->tagName;
/* Remove unlikely candidates */ /* Remove unlikely candidates */
@ -775,7 +775,7 @@ class Readability
) { ) {
$this->dbg('Removing unlikely candidate '.$node->getNodePath().' by "'.$unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); $this->dbg('Removing unlikely candidate '.$node->getNodePath().' by "'.$unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
$node->parentNode->removeChild($node); $node->parentNode->removeChild($node);
$nodeIndex--; --$nodeIndex;
} }
} }
unset($candidates); unset($candidates);
@ -788,7 +788,7 @@ class Readability
if ($xpath) { if ($xpath) {
// Using array of DOMElements after deletion is a path to DOOMElement. // Using array of DOMElements after deletion is a path to DOOMElement.
$candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement); $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
for ($c = $candidates->length - 1; $c >= 0; $c--) { for ($c = $candidates->length - 1; $c >= 0; --$c) {
// Scale the final candidates score based on link density. Good content should have a // Scale the final candidates score based on link density. Good content should have a
// relatively small link density (5% or less) and be mostly unaffected by this operation. // relatively small link density (5% or less) and be mostly unaffected by this operation.
// If not for this we would have used XPath to find maximum @readability. // If not for this we would have used XPath to find maximum @readability.
@ -849,7 +849,7 @@ class Readability
$siblingNodes = new stdClass(); $siblingNodes = new stdClass();
$siblingNodes->length = 0; $siblingNodes->length = 0;
} }
for ($s = 0, $sl = $siblingNodes->length; $s < $sl; $s++) { for ($s = 0, $sl = $siblingNodes->length; $s < $sl; ++$s) {
$siblingNode = $siblingNodes->item($s); $siblingNode = $siblingNodes->item($s);
$siblingNodeName = $siblingNode->nodeName; $siblingNodeName = $siblingNode->nodeName;
$append = false; $append = false;
@ -869,7 +869,7 @@ class Readability
if (strcasecmp($siblingNodeName, 'p') === 0) { if (strcasecmp($siblingNodeName, 'p') === 0) {
$linkDensity = $this->getLinkDensity($siblingNode); $linkDensity = $this->getLinkDensity($siblingNode);
$nodeContent = $this->getInnerText($siblingNode, true, true); $nodeContent = $this->getInnerText($siblingNode, true, true);
$nodeLength = mb_strlen($nodeContent); $nodeLength = mb_strlen($nodeContent);
if ($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY) { if ($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY) {
$append = true; $append = true;
} elseif ($nodeLength < self::MIN_NODE_LENGTH && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) { } elseif ($nodeLength < self::MIN_NODE_LENGTH && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) {
@ -890,13 +890,13 @@ class Readability
} catch (Exception $e) { } catch (Exception $e) {
$this->dbg('Could not alter siblingNode '.$siblingNodeName.' to div, reverting to original.'); $this->dbg('Could not alter siblingNode '.$siblingNodeName.' to div, reverting to original.');
$nodeToAppend = $siblingNode; $nodeToAppend = $siblingNode;
$s--; --$s;
$sl--; --$sl;
} }
} else { } else {
$nodeToAppend = $siblingNode; $nodeToAppend = $siblingNode;
$s--; --$s;
$sl--; --$sl;
} }
// To ensure a node does not interfere with readability styles, remove its classnames & ids. // To ensure a node does not interfere with readability styles, remove its classnames & ids.
// Now done via RegExp post_filter. // Now done via RegExp post_filter.
@ -1022,7 +1022,7 @@ class Readability
$links = $e->getElementsByTagName('a'); $links = $e->getElementsByTagName('a');
$textLength = mb_strlen($this->getInnerText($e, true, true)); $textLength = mb_strlen($this->getInnerText($e, true, true));
$linkLength = 0; $linkLength = 0;
for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; $i++) { for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; ++$i) {
if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) { if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) {
continue; continue;
} }
@ -1116,7 +1116,7 @@ class Readability
{ {
$targetList = $e->getElementsByTagName($tag); $targetList = $e->getElementsByTagName($tag);
$isEmbed = ($tag === 'audio' || $tag === 'video' || $tag === 'iframe' || $tag === 'object' || $tag === 'embed'); $isEmbed = ($tag === 'audio' || $tag === 'video' || $tag === 'iframe' || $tag === 'object' || $tag === 'embed');
for ($cur_item = null, $y = $targetList->length - 1; $y >= 0; $y--) { for ($cur_item = null, $y = $targetList->length - 1; $y >= 0; --$y) {
/* Allow youtube and vimeo videos through as people usually want to see those. */ /* Allow youtube and vimeo videos through as people usually want to see those. */
$cur_item = $targetList->item($y); $cur_item = $targetList->item($y);
if ($isEmbed) { if ($isEmbed) {
@ -1155,7 +1155,7 @@ class Readability
* *
* TODO: Consider taking into account original contentScore here. * TODO: Consider taking into account original contentScore here.
*/ */
for ($node = null, $i = $curTagsLength - 1; $i >= 0; $i--) { for ($node = null, $i = $curTagsLength - 1; $i >= 0; --$i) {
$node = $tagsList->item($i); $node = $tagsList->item($i);
//$class = $node->getAttribute('class').' '.$node->getAttribute('id'); //debug //$class = $node->getAttribute('class').' '.$node->getAttribute('id'); //debug
$weight = $this->getWeight($node); $weight = $this->getWeight($node);
@ -1176,18 +1176,18 @@ class Readability
$a = $node->getElementsByTagName('a')->length; $a = $node->getElementsByTagName('a')->length;
$embedCount = 0; $embedCount = 0;
$embeds = $node->getElementsByTagName('embed'); $embeds = $node->getElementsByTagName('embed');
for ($ei = 0, $il = $embeds->length; $ei < $il; $ei++) { for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
$embedCount++; ++$embedCount;
} }
} }
$embeds = $node->getElementsByTagName('iframe'); $embeds = $node->getElementsByTagName('iframe');
for ($ei = 0, $il = $embeds->length; $ei < $il; $ei++) { for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) {
if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) {
$embedCount++; ++$embedCount;
} }
} }
$linkDensity = $this->getLinkDensity($node, true); $linkDensity = $this->getLinkDensity($node, true);
$contentLength = mb_strlen($this->getInnerText($node)); $contentLength = mb_strlen($this->getInnerText($node));
$toRemove = false; $toRemove = false;
if ($this->lightClean) { if ($this->lightClean) {
@ -1250,9 +1250,9 @@ class Readability
*/ */
public function cleanHeaders($e) public function cleanHeaders($e)
{ {
for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) { for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
$headers = $e->getElementsByTagName('h'.$headerIndex); $headers = $e->getElementsByTagName('h'.$headerIndex);
for ($i = $headers->length - 1; $i >= 0; $i--) { for ($i = $headers->length - 1; $i >= 0; --$i) {
if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) {
$headers->item($i)->parentNode->removeChild($headers->item($i)); $headers->item($i)->parentNode->removeChild($headers->item($i));
} }
@ -1276,9 +1276,7 @@ class Readability
} }
/** /**
* Will recreate previously deleted body property * Will recreate previously deleted body property.
*
* @return void
*/ */
protected function reinitBody() protected function reinitBody()
{ {

Loading…
Cancel
Save