Backport changes from wallabag

e9e4ff87f8
pull/7/head
Jeremy Benoist 11 years ago
parent 91b80b70e2
commit c67189248e
  1. 57
      src/Readability.php

@ -63,6 +63,7 @@ class Readability
protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
protected $flags = 7; // 1 | 2 | 4; // Start with all processing flags set.
protected $success = false; // indicates whether we were able to extract or not
/**
* All of the regular expressions in use within readability.
* Defined up here so we don't instantiate them repeatedly in loops.
@ -122,6 +123,7 @@ class Readability
'!</code>\s*</pre>!is' => '</pre>',
'!<[hb]r>!is' => '<\\1 />',
);
// flags
const FLAG_STRIP_UNLIKELYS = 1;
const FLAG_WEIGHT_ATTRIBUTES = 2;
@ -137,6 +139,7 @@ class Readability
const MIN_ARTICLE_LENGTH = 200;
const MIN_NODE_LENGTH = 80;
const MAX_LINK_DENSITY = 0.25;
/**
* Create instance of Readability.
*
@ -206,6 +209,7 @@ class Readability
$this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
}
/**
* Get article title element.
*
@ -215,6 +219,7 @@ class Readability
{
return $this->articleTitle;
}
/**
* Get article content element.
*
@ -224,6 +229,7 @@ class Readability
{
return $this->articleContent;
}
/**
* Add pre filter for raw input HTML processing.
*
@ -234,6 +240,7 @@ class Readability
{
$this->pre_filters[$filter] = $replacer;
}
/**
* Add post filter for raw output HTML processing.
*
@ -244,6 +251,7 @@ class Readability
{
$this->post_filters[$filter] = $replacer;
}
/**
* Runs readability.
*
@ -305,6 +313,7 @@ class Readability
return $this->success;
}
/**
* Debug.
*/
@ -325,6 +334,7 @@ class Readability
syslog(6, $this->debugText); // 1 - error 6 - info
}
}
/**
* Run any post-process modifications to article content as necessary.
*
@ -336,6 +346,7 @@ class Readability
$this->addFootnotes($articleContent);
}
}
/**
* Get the article title as an H1.
*
@ -349,6 +360,7 @@ class Readability
$curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
} catch (Exception $e) {
}
if (preg_match('/ [\|\-] /', $curTitle)) {
$curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
if (count(explode(' ', $curTitle)) < 3) {
@ -365,15 +377,18 @@ class Readability
$curTitle = $this->getInnerText($hOnes->item(0));
}
}
$curTitle = trim($curTitle);
if (count(explode(' ', $curTitle)) <= 4) {
$curTitle = $origTitle;
}
$articleTitle = $this->dom->createElement('h1');
$articleTitle->innerHTML = $curTitle;
return $articleTitle;
}
/**
* Prepare the HTML document for readability to scrape it.
* This includes things like stripping javascript, CSS, and handling terrible markup.
@ -399,6 +414,7 @@ class Readability
$linkTags->item($i)->parentNode->removeChild($linkTags->item($i));
}
}
/**
* For easier reading, convert this document to have footnotes at the bottom rather than inline links.
*
@ -453,6 +469,7 @@ class Readability
$articleContent->appendChild($footnotesWrapper);
}
}
/**
* Prepare the article node for display. Clean out any inline styles,
* iframes, forms, strip extraneous <p> tags, etc.
@ -547,6 +564,7 @@ class Readability
}
}
}
/**
* Initialize a node with the readability object. Also checks the
* className/id for special names to add to its score.
@ -614,6 +632,7 @@ class Readability
}
$readability->value += $this->getWeight($node);
}
/**
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
@ -796,6 +815,7 @@ class Readability
$this->dbg('Setting body to a raw HTML of original page!');
$topCandidate->innerHTML = $page->documentElement->innerHTML;
$page->documentElement->innerHTML = '';
$this->reinitBody();
$page->documentElement->appendChild($topCandidate);
}
} else {
@ -896,10 +916,8 @@ class Readability
* finding the -right- content.
*/
if (mb_strlen($this->getInnerText($articleContent, false)) < self::MIN_ARTICLE_LENGTH) {
if (!$this->body->hasChildNodes()) {
$this->body = $this->dom->createElement('body');
}
$this->body->innerHTML = $this->bodyCache;
$this->reinitBody();
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
$this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
$this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n");
@ -922,6 +940,7 @@ class Readability
return $articleContent;
}
/**
* Get the inner text of a node.
* This also strips out any excess whitespace to be found.
@ -946,6 +965,7 @@ class Readability
return $textContent;
}
/**
* Remove the style attribute on every $e and under.
*
@ -961,6 +981,7 @@ class Readability
$elem->removeAttribute('style');
}
}
/**
* Count the commas in a given text.
*
@ -972,6 +993,7 @@ class Readability
{
return substr_count($text, ',');
}
/**
* Count the spaces in a given text, as an estimate of its number of words.
* Input string should be normalized.
@ -984,6 +1006,7 @@ class Readability
{
return substr_count($text, ' ');
}
/**
* Get the density of links as a percentage of the content
* This is the amount of text that is inside a link divided by the total text in the node.
@ -1011,6 +1034,7 @@ class Readability
return 0;
}
}
/**
* Get an element weight by attribute.
* Uses regular expressions to tell if this element looks good or bad.
@ -1045,6 +1069,7 @@ class Readability
return $weight;
}
/**
* Get an element relative weight.
*
@ -1065,6 +1090,7 @@ class Readability
return $weight;
}
/**
* Remove extraneous break tags from a node.
*
@ -1076,6 +1102,7 @@ class Readability
$html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
$node->innerHTML = $html;
}
/**
* Clean a node of all elements of type "tag".
* (Unless it's a youtube/vimeo video. People love movies.).
@ -1106,6 +1133,7 @@ class Readability
$cur_item->parentNode->removeChild($cur_item);
}
}
/**
* Clean an element of all tags of type "tag" if they look fishy.
* "Fishy" is an algorithm based on content length, classnames,
@ -1192,8 +1220,8 @@ class Readability
} elseif ($input > floor($p / 3)) {
$this->dbg(' too many <input> elements');
$toRemove = true;
} elseif ($contentLength < 25 && ($img === 0 || $img > 2)) {
$this->dbg(' content length less than 25 chars and 0 images, or more than 2 images');
} elseif ($contentLength < 10 && ($img === 0 || $img > 2)) {
$this->dbg(' content length less than 10 chars and 0 images, or more than 2 images');
$toRemove = true;
} elseif ($weight < 25 && $linkDensity > 0.2) {
$this->dbg(' weight is '.$weight.' lower than 0 and link density is '.sprintf('%.2f', $linkDensity).' > 0.2');
@ -1214,6 +1242,7 @@ class Readability
}
}
}
/**
* Clean out spurious headers from an Element. Checks things like classnames and link density.
*
@ -1230,16 +1259,32 @@ class Readability
}
}
}
/**
 * Tell whether a processing flag is currently enabled.
 *
 * @param int $flag Bit mask to test against the current flags
 *
 * @return bool True when the bitwise AND of the current flags and $flag is greater than zero
 */
public function flagIsActive($flag)
{
    $matchingBits = $this->flags & $flag;

    return $matchingBits > 0;
}
/**
 * Enable a processing flag by OR-ing it into the current flags.
 *
 * @param int $flag Bit mask to switch on
 */
public function addFlag($flag)
{
    $this->flags |= $flag;
}
/**
 * Disable a processing flag by masking its bits out of the current flags.
 *
 * @param int $flag Bit mask to switch off
 */
public function removeFlag($flag)
{
    $this->flags &= ~$flag;
}
/**
 * Rebuild the body property when it is no longer usable.
 *
 * If $this->body->childNodes is still set, nothing happens. Otherwise a new
 * <body> element is created from the document and filled with the cached body HTML.
 *
 * @return void
 */
protected function reinitBody()
{
    // Body is still there: nothing to restore.
    if (isset($this->body->childNodes)) {
        return;
    }

    $this->body = $this->dom->createElement('body');
    $this->body->innerHTML = $this->bodyCache;
}
}

Loading…
Cancel
Save