diff --git a/src/Readability.php b/src/Readability.php index 98789fe..49460d2 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -63,6 +63,7 @@ class Readability protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later protected $flags = 7; // 1 | 2 | 4; // Start with all processing flags set. protected $success = false; // indicates whether we were able to extract or not + /** * All of the regular expressions in use within readability. * Defined up here so we don't instantiate them repeatedly in loops. @@ -122,6 +123,7 @@ class Readability '!\s*!is' => '', '!<[hb]r>!is' => '<\\1 />', ); + // flags const FLAG_STRIP_UNLIKELYS = 1; const FLAG_WEIGHT_ATTRIBUTES = 2; @@ -137,6 +139,7 @@ class Readability const MIN_ARTICLE_LENGTH = 200; const MIN_NODE_LENGTH = 80; const MAX_LINK_DENSITY = 0.25; + /** * Create instance of Readability. * @@ -206,6 +209,7 @@ class Readability $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement'); } + /** * Get article title element. * @@ -215,6 +219,7 @@ class Readability { return $this->articleTitle; } + /** * Get article content element. * @@ -224,6 +229,7 @@ class Readability { return $this->articleContent; } + /** * Add pre filter for raw input HTML processing. * @@ -234,6 +240,7 @@ class Readability { $this->pre_filters[$filter] = $replacer; } + /** * Add post filter for raw output HTML processing. * @@ -244,6 +251,7 @@ class Readability { $this->post_filters[$filter] = $replacer; } + /** * Runs readability. * @@ -305,6 +313,7 @@ class Readability return $this->success; } + /** * Debug. */ @@ -325,6 +334,7 @@ class Readability syslog(6, $this->debugText); // 1 - error 6 - info } } + /** * Run any post-process modifications to article content as necessary. * @@ -336,6 +346,7 @@ class Readability $this->addFootnotes($articleContent); } } + /** * Get the article title as an H1. 
* @@ -349,6 +360,7 @@ class Readability $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); } catch (Exception $e) { } + if (preg_match('/ [\|\-] /', $curTitle)) { $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); if (count(explode(' ', $curTitle)) < 3) { @@ -365,15 +377,18 @@ class Readability $curTitle = $this->getInnerText($hOnes->item(0)); } } + $curTitle = trim($curTitle); if (count(explode(' ', $curTitle)) <= 4) { $curTitle = $origTitle; } + $articleTitle = $this->dom->createElement('h1'); $articleTitle->innerHTML = $curTitle; return $articleTitle; } + /** * Prepare the HTML document for readability to scrape it. * This includes things like stripping javascript, CSS, and handling terrible markup. @@ -399,6 +414,7 @@ class Readability $linkTags->item($i)->parentNode->removeChild($linkTags->item($i)); } } + /** * For easier reading, convert this document to have footnotes at the bottom rather than inline links. * @@ -453,6 +469,7 @@ class Readability $articleContent->appendChild($footnotesWrapper); } } + /** * Prepare the article node for display. Clean out any inline styles, * iframes, forms, strip extraneous
tags, etc.
@@ -547,6 +564,7 @@ class Readability
}
}
}
+
/**
* Initialize a node with the readability object. Also checks the
* className/id for special names to add to its score.
@@ -614,6 +632,7 @@ class Readability
}
$readability->value += $this->getWeight($node);
}
+
/**
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
@@ -796,6 +815,7 @@ class Readability
$this->dbg('Setting body to a raw HTML of original page!');
$topCandidate->innerHTML = $page->documentElement->innerHTML;
$page->documentElement->innerHTML = '';
+ $this->reinitBody();
$page->documentElement->appendChild($topCandidate);
}
} else {
@@ -896,10 +916,8 @@ class Readability
* finding the -right- content.
*/
if (mb_strlen($this->getInnerText($articleContent, false)) < self::MIN_ARTICLE_LENGTH) {
- if (!$this->body->hasChildNodes()) {
- $this->body = $this->dom->createElement('body');
- }
- $this->body->innerHTML = $this->bodyCache;
+ $this->reinitBody();
+
if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
$this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
$this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n");
@@ -922,6 +940,7 @@ class Readability
return $articleContent;
}
+
/**
* Get the inner text of a node.
* This also strips out any excess whitespace to be found.
@@ -946,6 +965,7 @@ class Readability
return $textContent;
}
+
/**
* Remove the style attribute on every $e and under.
*
@@ -961,6 +981,7 @@ class Readability
$elem->removeAttribute('style');
}
}
+
/**
* Get the number of commas in a given text.
*
@@ -972,6 +993,7 @@ class Readability
{
return substr_count($text, ',');
}
+
/**
* Get the number of words in a given text, assuming the words are separated by a space.
* Input string should be normalized.
@@ -984,6 +1006,7 @@ class Readability
{
return substr_count($text, ' ');
}
+
/**
* Get the density of links as a percentage of the content
* This is the amount of text that is inside a link divided by the total text in the node.
@@ -1011,6 +1034,7 @@ class Readability
return 0;
}
}
+
/**
* Get an element weight by attribute.
* Uses regular expressions to tell if this element looks good or bad.
@@ -1045,6 +1069,7 @@ class Readability
return $weight;
}
+
/**
* Get an element relative weight.
*
@@ -1065,6 +1090,7 @@ class Readability
return $weight;
}
+
/**
* Remove extraneous break tags from a node.
*
@@ -1076,6 +1102,7 @@ class Readability
$html = preg_replace($this->regexps['killBreaks'], '
', $html);
$node->innerHTML = $html;
}
+
/**
* Clean a node of all elements of type "tag".
* (Unless it's a youtube/vimeo video. People love movies.).
@@ -1106,6 +1133,7 @@ class Readability
$cur_item->parentNode->removeChild($cur_item);
}
}
+
/**
* Clean an element of all tags of type "tag" if they look fishy.
* "Fishy" is an algorithm based on content length, classnames,
@@ -1192,8 +1220,8 @@ class Readability
} elseif ($input > floor($p / 3)) {
$this->dbg(' too many elements');
$toRemove = true;
- } elseif ($contentLength < 25 && ($img === 0 || $img > 2)) {
- $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images');
+ } elseif ($contentLength < 10 && ($img === 0 || $img > 2)) {
+ $this->dbg(' content length less than 10 chars and 0 images, or more than 2 images');
$toRemove = true;
} elseif ($weight < 25 && $linkDensity > 0.2) {
$this->dbg(' weight is '.$weight.' lower than 0 and link density is '.sprintf('%.2f', $linkDensity).' > 0.2');
@@ -1214,6 +1242,7 @@ class Readability
}
}
}
+
/**
* Clean out spurious headers from an Element. Checks things like classnames and link density.
*
@@ -1230,16 +1259,32 @@ class Readability
}
}
}
+
/**
 * Tell whether the given flag is currently switched on.
 *
 * @param int $flag Bit mask to look up in the current flags
 *
 * @return bool True when the bitwise AND of the current flags and $flag is greater than zero
 */
public function flagIsActive($flag)
{
    $matchingBits = $this->flags & $flag;

    return $matchingBits > 0;
}
+
/**
 * Switch the given flag on by OR-ing it into the current flags.
 *
 * @param int $flag Bit mask to set
 */
public function addFlag($flag)
{
    $this->flags |= $flag;
}
+
/**
 * Switch the given flag off by AND-ing the current flags with its complement.
 *
 * @param int $flag Bit mask to clear
 */
public function removeFlag($flag)
{
    $this->flags &= ~$flag;
}
+
+ /**
+  * Recreate the body property when it is no longer usable.
+  *
+  * When isset() reports that $this->body->childNodes is not available, a fresh
+  * <body> element is created from $this->dom and filled with the HTML kept in
+  * $this->bodyCache. Otherwise the current body is left untouched.
+  *
+  * @return void
+  */
+ protected function reinitBody()
+ {
+     // Nothing to do while the current body still exposes its child nodes.
+     if (isset($this->body->childNodes)) {
+         return;
+     }
+
+     $this->body = $this->dom->createElement('body');
+     $this->body->innerHTML = $this->bodyCache;
+ }
}