commit 881e441bdfb79130e90343fbd1bd853eb2f9e85a Author: Jeremy Date: Fri Dec 12 11:43:29 2014 +0100 Initial commit diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..2259096 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,12 @@ +; top-most EditorConfig file +root = true + +; Unix-style newlines +[*] +end_of_line = LF + +[*.php] +indent_style = space +indent_size = 4 +trim_trailing_whitespace = true +insert_final_newline = true diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..48b8bf9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +vendor/ diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..d398c9d --- /dev/null +++ b/.travis.yml @@ -0,0 +1,16 @@ +language: php + +php: + - 5.2 + - 5.3 + - 5.4 + - 5.5 + - 5.6 + +before_script: + - composer self-update + - echo 'date.timezone = "Europe/Paris"' >> ~/.phpenv/versions/$(phpenv version-name)/etc/php.ini + - composer install --prefer-dist --no-interaction + +script: + - phpunit --coverage-text diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..5c304d1 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,201 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..a5cf760 --- /dev/null +++ b/README.md @@ -0,0 +1,38 @@ +# Readability + +[![Build Status](https://travis-ci.org/j0k3r/php-readability.svg?branch=master)](https://travis-ci.org/j0k3r/php-readability) + +This is an extract of the Readability class from the [full-text-rss](https://github.com/Dither/full-text-rss) fork. It kind be defined as a better version of the original [php-readability](http://code.fivefilters.org/php-readability). + +## Differences + +The default php-readability lib is really old and needs to be improved. I found a great fork of [full-text-rss](http://fivefilters.org/content-only/) from @Dither which improve the Readability class. + + - I've extracted the class from its fork to be able to use it out of the box + - I've added some simple tests + - and changed the CS, run `php-cs-fixer` and added a namespace + +**But** the code is still really hard to understand / read ... + +## Usage + +```php +use Readability\Readability; + +$url = 'http://www.medialens.org/index.php/alerts/alert-archive/alerts-2013/729-thatcher.html'; + +// you can use whatever you want to retrieve the html content (Guzzle, Buzz, cURL ...) +$html = file_get_contents($url); + +$readability = new Readability($html, $url); +$result = $readability->init(); + +if ($result) { + // display the title of the page + echo $readability->getTitle()->textContent; + // display the *readability* content + echo $readability->getContent()->textContent; +} else { + echo 'Looks like we couldn\'t find the content. :('; +} +``` diff --git a/composer.json b/composer.json new file mode 100644 index 0000000..548459a --- /dev/null +++ b/composer.json @@ -0,0 +1,33 @@ +{ + "name": "j0k3r/php-readability", + "type": "library", + "description": "Automatic article extraction from HTML", + "keywords": ["article extraction","content extraction","extraction","article","content","html"], + "license": "Apache-2.0", + "authors": [{ + "name": "Jeremy Benoist", + "email": "jeremy.benoist@gmail.com", + "homepage": "http://www.j0k3r.net", + "role": "Developer" + },{ + "name": "DitherSky", + "homepage": "https://github.com/Dither", + "role": "Developer (https://github.com/Dither/full-text-rss)" + },{ + "name": "Keyvan Minoukadeh", + "email": "keyvan@keyvan.net", + "homepage": "http://keyvan.net", + "role": "Developer (ported original JS code to PHP)" + },{ + "name": "Arc90", + "homepage": "http://arc90.com", + "role": "Developer (original JS version)" + }], + "require": { + "php": ">=5.2", + "ext-tidy": ">=1.2" + }, + "autoload": { + "psr-4": { "Readability\\": "src/" } + } +} diff --git a/phpunit.xml.dist b/phpunit.xml.dist new file mode 100644 index 0000000..f727e46 --- /dev/null +++ b/phpunit.xml.dist @@ -0,0 +1,29 @@ + + + + + + ./tests/ + + + + + + ./src/TubeLink/ + + ./tests + + + + + diff --git a/src/JSLikeHTMLElement.php b/src/JSLikeHTMLElement.php new file mode 100644 index 0000000..6a732cb --- /dev/null +++ b/src/JSLikeHTMLElement.php @@ -0,0 +1,115 @@ +registerNodeClass('DOMElement', 'JSLikeHTMLElement'); + * $doc->loadHTML('

Para 1

Para 2

'); + * $elem = $doc->getElementsByTagName('div')->item(0); + * + * // print innerHTML + * echo $elem->innerHTML; // prints '

Para 1

Para 2

' + * echo "\n\n"; + * + * // set innerHTML + * $elem->innerHTML = 'FiveFilters.org'; + * echo $elem->innerHTML; // prints 'FiveFilters.org' + * echo "\n\n"; + * + * // print document (with our changes) + * echo $doc->saveXML(); + * + * @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net + * @see http://fivefilters.org (the project this was written for) + */ +class JSLikeHTMLElement extends \DOMElement +{ + /** + * Used for setting innerHTML like it's done in JavaScript: + * @code + * $div->innerHTML = '

Chapter 2

The story begins...

'; + * @endcode + */ + public function __set($name, $value) + { + if ($name == 'innerHTML') { + // first, empty the element + for ($x=$this->childNodes->length-1; $x>=0; $x--) { + $this->removeChild($this->childNodes->item($x)); + } + // $value holds our new inner HTML + if ($value != '') { + $f = $this->ownerDocument->createDocumentFragment(); + // appendXML() expects well-formed markup (XHTML) + $result = @$f->appendXML($value); // @ to suppress PHP warnings + if ($result) { + if ($f->hasChildNodes()) { + $this->appendChild($f); + } + } else { + // $value is probably ill-formed + $f = new \DOMDocument(); + $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8'); + // Using will generate a warning, but so will bad HTML + // (and by this point, bad HTML is what we've got). + // We use it (and suppress the warning) because an HTML fragment will + // be wrapped around tags which we don't really want to keep. + // Note: despite the warning, if loadHTML succeeds it will return true. + $result = @$f->loadHTML(''.$value.''); + if ($result) { + $import = $f->getElementsByTagName('htmlfragment')->item(0); + foreach ($import->childNodes as $child) { + $importedNode = $this->ownerDocument->importNode($child, true); + $this->appendChild($importedNode); + } + } else { + // oh well, we tried, we really did. :( + // this element is now empty + } + } + } + } else { + $trace = debug_backtrace(); + trigger_error('Undefined property via __set(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE); + } + } + + /** + * Used for getting innerHTML like it's done in JavaScript: + * @code + * $string = $div->innerHTML; + * @endcode + */ + public function __get($name) + { + if ($name == 'innerHTML') { + $inner = ''; + foreach ($this->childNodes as $child) { + $inner .= $this->ownerDocument->saveXML($child); + } + + return $inner; + } + + $trace = debug_backtrace(); + trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE); + + return null; + } + + public function __toString() + { + return '['.$this->tagName.']'; + } +} diff --git a/src/Readability.php b/src/Readability.php new file mode 100644 index 0000000..7f1837e --- /dev/null +++ b/src/Readability.php @@ -0,0 +1,1238 @@ + '/display\s*:\s*none|ignore|\binfo|annoy|clock|date|time|author|intro|links|hidd?e|about|archive|\bprint|bookmark|tags|share|search|social|robot|published|combx|comment|mast(?:head)|subscri|community|category|disqus|extra|head(?:er|note)|floor|foot(?:er|note)|menu|tool|function|nav|remark|rss|shoutbox|tool|widget|meta|banner|sponsor|adsense|inner-?ad|ad-|sponsor|\badv\b|\bads\b|agr?egate?|pager|sidebar|popup|tweet|twitter/i', + 'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page/i', + 'positive' => '/read|full|article|body|\bcontent|contain|entry|main|markdown|page|attach|pagination|post|text|blog|story/i', + 'negative' => '/bottom|stat|info|discuss|e[\-]?mail|comment|reply|log.{2}(n|ed)|sign|single|combx|com-|contact|_nav|link|media|\bout|promo|\bad-|related|scroll|shoutbox|sidebar|sponsor|shopping|teaser/i', + 'divToPElements' => '/<(?:blockquote|code|div|article|footer|aside|img|p|pre|dl|ol|ul)/mi', + 'killBreaks' => '/(([ \r\n\s]| ?)*)+/', + 'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|viddler)\.(?:com|be|org|net)/!i', + 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' + ); + public $tidy_config = array( + 'tidy-mark' => false, + 'vertical-space' => false, + 'doctype' => 'omit', + 'numeric-entities' => false, + // 'preserve-entities' => true, + 'break-before-br' => false, + 'clean' => true, + 'output-xhtml' => true, + 'logical-emphasis' => true, + 'show-body-only' => false, + 'new-blocklevel-tags' => 'article,aside,audio,details,figcaption,figure,footer,header,hgroup,nav,section,source,summary,temp,track,video', + 'new-empty-tags' => 'command,embed,keygen,source,track,wbr', + 'new-inline-tags' => 'audio,canvas,command,datalist,embed,keygen,mark,meter,output,progress,time,video,wbr', + 'wrap' => 0, + 'drop-empty-paras' => true, + 'drop-proprietary-attributes' => false, + 'enclose-text' => true, + 'enclose-block-text' => true, + 'merge-divs' => true, + // 'merge-spans' => true, + 'input-encoding' => '????', + 'output-encoding' => 'utf8', + 'hide-comments' => true + ); + // raw HTML filters + protected $pre_filters = array( + '!]*>(.*?)!is' => '', // remove obvious scripts + '!]*>(.*?)!is' => '', // remove obvious styles + '!]*>!is' => '', // remove spans as we redefine styles and they're probably special-styled + '!]*>\s*\[AD\]\s*!is' => '', // HACK: firewall-filtered content + '!(]*>[ \r\n\s]*){2,}!i' => '

', // HACK: replace linebreaks plus br's with p's + //'!!is' => '', // replace noscripts + '!<(/?)font[^>]*>!is' => '<\\1span>' // replace fonts to spans + ); + // output HTML filters + protected $post_filters = array( + '/\s*

']+/>!is' => '', // replace empty tags that break layouts + //'!<(\s*/?\s*(?:blockquote|br|hr|code|div|article|span|footer|aside|p|pre|dl|li|ul|ol)) [^>]+>!is' => "<\\1>", // remove all attributes on text tags + "/\n+/" => "\n", //single newlines cleanup + '!]*>\s* '\s*!is' => '', + '!<[hb]r>!is' => '<\\1 />' + ); + // flags + const FLAG_STRIP_UNLIKELYS = 1; + const FLAG_WEIGHT_ATTRIBUTES = 2; + const FLAG_CLEAN_CONDITIONALLY = 4; + const FLAG_DISABLE_PREFILTER = 8; + const FLAG_DISABLE_POSTFILTER = 16; + // constants + const SCORE_CHARS_IN_PARAGRAPH = 100; + const SCORE_WORDS_IN_PARAGRAPH = 20; + const GRANDPARENT_SCORE_DIVISOR = 2.2; + const MIN_PARAGRAPH_LENGTH = 20; + const MIN_COMMAS_IN_PARAGRAPH = 6; + const MIN_ARTICLE_LENGTH = 200; + const MIN_NODE_LENGTH = 80; + const MAX_LINK_DENSITY = 0.25; + /** + * Create instance of Readability + * @param string UTF-8 encoded string + * @param string (optional) URL associated with HTML (for footnotes) + * @param string (optional) Which parser to use for turning raw HTML into a DOMDocument + * @param boolean (optional) Use tidy + */ + public function __construct($html, $url=null, $parser='libxml', $use_tidy=true) + { + $this->url = $url; + $this->debugText = 'Parsing URL: '.$url."\n"; + + if ($url) { + $this->domainRegExp = '/'.strtr(preg_replace('/www\d*\./', '', parse_url($url, PHP_URL_HOST)), array('.' => '\.')).'/'; + } + + mb_internal_encoding("UTF-8"); + mb_http_output("UTF-8"); + mb_regex_encoding("UTF-8"); + + // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well... + if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) { + foreach ($this->pre_filters as $search => $replace) { + $html = preg_replace($search, $replace, $html); + } + unset($search, $replace); + } + + if (trim($html) === '') { + $html = ''; + } + + /** + * Use tidy (if it exists). + * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing. + * Although sometimes it makes matters worse, which is why there is an option to disable it. + * + */ + if ($use_tidy && function_exists('tidy_parse_string')) { + $this->debugText .= 'Tidying document'."\n"; + $tidy = tidy_parse_string($html, $this->tidy_config, 'UTF8'); + if (tidy_clean_repair($tidy)) { + $original_html = $html; + $this->tidied = true; + $html = $tidy->value; + $html = preg_replace('/]+>/i', '', $html); + $html = preg_replace('/[\r\n]+/is', "\n", $html); + } + unset($tidy); + } + $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); + + if (!($parser=='html5lib' && ($this->dom = \HTML5_Parser::parse($html)))) { + libxml_use_internal_errors(true); + $this->dom = new \DOMDocument(); + $this->dom->preserveWhiteSpace = false; + @$this->dom->loadHTML($html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR); + libxml_use_internal_errors(false); + } + + $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement'); + } + /** + * Get article title element + * @return DOMElement + */ + public function getTitle() + { + return $this->articleTitle; + } + /** + * Get article content element + * @return DOMElement + */ + public function getContent() + { + return $this->articleContent; + } + /** + * Add pre filter for raw input HTML processing + * @param string RegExp for replace + * @param string (optional) Replacer + */ + public function addPreFilter($filter, $replacer='') + { + $this->pre_filters[$filter] = $replacer; + } + /** + * Add post filter for raw output HTML processing + * @param string RegExp for replace + * @param string (optional) Replacer + */ + public function addPostFilter($filter, $replacer='') + { + $this->post_filters[$filter] = $replacer; + } + /** + * Runs readability. + * + * Workflow: + * 1. Prep the document by removing script tags, css, etc. + * 2. Build readability's DOM tree. + * 3. Grab the article content from the current dom tree. + * 4. Replace the current DOM tree with the new one. + * 5. Read peacefully. + * + * @return boolean true if we found content, false otherwise + */ + public function init() + { + if (!isset($this->dom->documentElement)) { + return false; + } + + // Assume successful outcome + $this->success = true; + $bodyElems = $this->dom->getElementsByTagName('body'); + // WTF multiple body nodes? + if ($this->bodyCache == null) { + $this->bodyCache = ''; + foreach ($bodyElems as $bodyNode) { + $this->bodyCache += $bodyNode->innerHTML; + } + } + if ($bodyElems->length > 0 && $this->body == null) { + $this->body = $bodyElems->item(0); + } + $this->prepDocument(); + // Build readability's DOM tree. + $overlay = $this->dom->createElement('div'); + $innerDiv = $this->dom->createElement('div'); + $articleTitle = $this->getArticleTitle(); + $articleContent = $this->grabArticle(); + if (!$articleContent) { + $this->success = false; + $articleContent = $this->dom->createElement('div'); + $articleContent->setAttribute('id', 'readability-content'); + $articleContent->innerHTML = '

Sorry, Readability was unable to parse this page for content.

'; + } + $overlay->setAttribute('id', 'readOverlay'); + $innerDiv->setAttribute('id', 'readInner'); + // Glue the structure of our document together. + $innerDiv->appendChild($articleTitle); + $innerDiv->appendChild($articleContent); + $overlay->appendChild($innerDiv); + // Clear the old HTML, insert the new content. + $this->body->innerHTML = ''; + $this->body->appendChild($overlay); + $this->body->removeAttribute('style'); + $this->postProcessContent($articleContent); + // Set title and content instance variables. + $this->articleTitle = $articleTitle; + $this->articleContent = $articleContent; + $this->dump_dbg(); + + return $this->success; + } + /** + * Debug + */ + protected function dbg($msg) //, $error=false) + { + if ($this->debug) { + $this->debugText .= $msg."\n"; + } + } + + /** + * Dump debug info + */ + protected function dump_dbg() + { + if ($this->debug) { + openlog("Readability PHP ", LOG_PID | LOG_PERROR, 0); + syslog(6, $this->debugText); // 1 - error 6 - info + } + } + /** + * Run any post-process modifications to article content as necessary. + * + * @param DOMElement + * @return void + */ + public function postProcessContent($articleContent) + { + if ($this->convertLinksToFootnotes && !preg_match('/\bwiki/', @$this->url)) { + $this->addFootnotes($articleContent); + } + } + /** + * Get the article title as an H1. + * + * @return DOMElement + */ + protected function getArticleTitle() + { + $curTitle = ''; + $origTitle = ''; + try { + $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); + } catch (Exception $e) {} + if (preg_match('/ [\|\-] /', $curTitle)) { + $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); + if (count(explode(' ', $curTitle)) < 3) { + $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); + } + } elseif (strpos($curTitle, ': ') !== false) { + $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); + if (count(explode(' ', $curTitle)) < 3) { + $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle); + } + } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) { + $hOnes = $this->dom->getElementsByTagName('h1'); + if ($hOnes->length == 1) { + $curTitle = $this->getInnerText($hOnes->item(0)); + } + } + $curTitle = trim($curTitle); + if (count(explode(' ', $curTitle)) <= 4) { + $curTitle = $origTitle; + } + $articleTitle = $this->dom->createElement('h1'); + $articleTitle->innerHTML = $curTitle; + + return $articleTitle; + } + /** + * Prepare the HTML document for readability to scrape it. + * This includes things like stripping javascript, CSS, and handling terrible markup. + * + * @return void + */ + protected function prepDocument() + { + /** + * In some cases a body element can't be found (if the HTML is totally hosed for example) + * so we create a new body node and append it to the document. + */ + if ($this->body == null) { + $this->body = $this->dom->createElement('body'); + $this->dom->documentElement->appendChild($this->body); + } + $this->body->setAttribute('id', 'readabilityBody'); + // Remove all style tags in head. + $styleTags = $this->dom->getElementsByTagName('style'); + for ($i = $styleTags->length-1; $i >= 0; $i--) { + $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); + } + $linkTags = $this->dom->getElementsByTagName('link'); + for ($i = $linkTags->length-1; $i >= 0; $i--) { + $linkTags->item($i)->parentNode->removeChild($linkTags->item($i)); + } + } + /** + * For easier reading, convert this document to have footnotes at the bottom rather than inline links. + * @see http://www.roughtype.com/archives/2010/05/experiments_in.php + * + * @return void + */ + public function addFootnotes($articleContent) + { + $footnotesWrapper = $this->dom->createElement('footer'); + $footnotesWrapper->setAttribute('id', 'readability-footnotes'); + $footnotesWrapper->innerHTML = '

References

'; + $articleFootnotes = $this->dom->createElement('ol'); + $articleFootnotes->setAttribute('id', 'readability-footnotes-list'); + $footnotesWrapper->appendChild($articleFootnotes); + $articleLinks = $articleContent->getElementsByTagName('a'); + $linkCount = 0; + for ($i = 0; $i < $articleLinks->length; $i++) { + $articleLink = $articleLinks->item($i); + $footnoteLink = $articleLink->cloneNode(true); + $refLink = $this->dom->createElement('a'); + $footnote = $this->dom->createElement('li'); + $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST); + if (!$linkDomain && isset($this->url)) { + $linkDomain = @parse_url($this->url, PHP_URL_HOST); + } + $linkText = $this->getInnerText($articleLink); + if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { + continue; + } + $linkCount++; + // Add a superscript reference after the article link. + $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount); + $refLink->innerHTML = '[' . $linkCount . ']'; + $refLink->setAttribute('class', 'readability-DoNotFootnote'); + $refLink->setAttribute('style', 'color: inherit;'); + if ($articleLink->parentNode->lastChild->isSameNode($articleLink)) { + $articleLink->parentNode->appendChild($refLink); + } else { + $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling); + } + $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); + $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); + $footnote->innerHTML = '^ '; + $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); + $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); + $footnote->appendChild($footnoteLink); + if ($linkDomain) { + $footnote->innerHTML = $footnote->innerHTML . ' (' . $linkDomain . ')'; + } + $articleFootnotes->appendChild($footnote); + } + if ($linkCount > 0) { + $articleContent->appendChild($footnotesWrapper); + } + } + /** + * Prepare the article node for display. Clean out any inline styles, + * iframes, forms, strip extraneous

tags, etc. + * + * @param DOMElement + * @return void + */ + public function prepArticle($articleContent) + { + if ($this->lightClean) { + $this->dbg('Light clean enabled.'); + } else { + $this->dbg('Standard clean enabled.'); + } + $this->cleanStyles($articleContent); + $this->killBreaks($articleContent); + $xpath = new \DOMXPath($articleContent->ownerDocument); + if ($this->revertForcedParagraphElements) { + /** + * Reverts P elements with class 'readability-styled' to text nodes: + * which is what they were before. + */ + $elems = $xpath->query('.//p[@data-readability-styled]', $articleContent); + for ($i = $elems->length-1; $i >= 0; $i--) { + $e = $elems->item($i); + $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); + } + } + // Remove service data-candidate attribute. + $elems = $xpath->query('.//*[@data-candidate]', $articleContent); + for ($i = $elems->length-1; $i >= 0; $i--) { + $elems->item($i)->removeAttribute('data-candidate'); + } + // Remove unrelated links and other unneded stuff. + // (not(*) and not(text()[normalize-space()])) or // What's wrong here? + $elems = $xpath->query('.//a[@rel="nofollow"]', $articleContent); + for ($i = $elems->length-1; $i >= 0; $i--) { + $elems->item($i)->parentNode->removeChild($elems->item($i)); + } + // Clean out junk from the article content. + $this->clean($articleContent, 'input'); + $this->clean($articleContent, 'button'); + $this->clean($articleContent, 'nav'); + $this->clean($articleContent, 'object'); + $this->clean($articleContent, 'iframe'); + $this->clean($articleContent, 'canvas'); + $this->clean($articleContent, 'h1'); + + /** + * If there is only one h2, they are probably using it as a main header, so remove it since we + * already have a header. + */ + $h2s = $articleContent->getElementsByTagName('h2'); + if ($h2s->length == 1 && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) { + $this->clean($articleContent, 'h2'); + } + $this->cleanHeaders($articleContent); + // Do these last as the previous stuff may have removed junk that will affect these. + $this->cleanConditionally($articleContent, 'form'); + $this->cleanConditionally($articleContent, 'table'); + $this->cleanConditionally($articleContent, 'ul'); + //if (!$this->lightClean) + $this->cleanConditionally($articleContent, 'div'); + // Remove extra paragraphs. + $articleParagraphs = $articleContent->getElementsByTagName('p'); + for ($i = $articleParagraphs->length-1; $i >= 0; $i--) { + $imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length; + $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length; + $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length; + $videoCount = $articleParagraphs->item($i)->getElementsByTagName('video')->length; + $audioCount = $articleParagraphs->item($i)->getElementsByTagName('audio')->length; + $iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length; + if ($iframeCount === 0 && $imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $videoCount === 0 && $audioCount === 0 && mb_strlen(preg_replace('/\s+/is', '', $this->getInnerText($articleParagraphs->item($i), false, false))) === 0) { + $articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i)); + } + // add extra text to iframe tag to avoid an auto-closing iframe and then break the html code + if ($iframeCount) { + $iframe = $articleParagraphs->item($i)->getElementsByTagName('iframe'); + $iframe->item(0)->nodeValue = ' '; + + $articleParagraphs->item($i)->parentNode->replaceChild($iframe->item(0), $articleParagraphs->item($i)); + } + } + + if (!$this->flagIsActive(self::FLAG_DISABLE_POSTFILTER)) { + try { + foreach ($this->post_filters as $search => $replace) { + $articleContent->innerHTML = preg_replace($search, $replace, $articleContent->innerHTML); + } + unset($search, $replace); + } catch (Exception $e) { + $this->dbg("Cleaning output HTML failed. Ignoring: " . $e->getMessage()); + } + } + } + /** + * Initialize a node with the readability object. Also checks the + * className/id for special names to add to its score. + * + * @param Element + * @return void + */ + protected function initializeNode($node) + { + if (!isset($node->tagName)) { + return; + } + + $readability = $this->dom->createAttribute('readability'); + // this is our contentScore + $readability->value = 0; + $node->setAttributeNode($readability); + + // using strtoupper just in case + switch (strtoupper($node->tagName)) { + case 'ARTICLE': + $readability->value += 15; + case 'DIV': + $readability->value += 5; + break; + case 'PRE': + case 'CODE': + case 'TD': + case 'BLOCKQUOTE': + case 'FIGURE': + $readability->value += 3; + break; +/* case 'SECTION': // often misused + $readability->value += 2; + break; +*/ + case 'OL': + case 'UL': + case 'DL': + case 'DD': + case 'DT': + case 'LI': + $readability->value -= 2 * round($this->getLinkDensity($node), 0, PHP_ROUND_HALF_UP); + break; + case 'ASIDE': + case 'FOOTER': + case 'HEADER': + case 'ADDRESS': + case 'FORM': + case 'BUTTON': + case 'TEXTAREA': + case 'INPUT': + case 'NAV': + $readability->value -= 3; + break; + case 'H1': + case 'H2': + case 'H3': + case 'H4': + case 'H5': + case 'H6': + case 'TH': + case 'HGROUP': + $readability->value -= 5; + break; + } + $readability->value += $this->getWeight($node); + } + /** + * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is + * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. + * + * @return DOMElement + */ + protected function grabArticle($page=null) + { + if (!$page) { + $page = $this->dom; + } + $xpath = null; + $nodesToScore = array(); + if ($page instanceof \DOMDocument && isset($page->documentElement)) { + $xpath = new \DOMXPath($page); + } + $allElements = $page->getElementsByTagName('*'); + for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) { + $tagName = $node->tagName; + // Some well known site uses sections as paragraphs. + if (strcasecmp($tagName, 'p') === 0 || strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'section') === 0) { + $nodesToScore[] = $node; + } + // Turn divs into P tags where they have been used inappropriately + // (as in, where they contain no other block level elements). + if (strcasecmp($tagName, 'div') === 0 || strcasecmp($tagName, 'article') === 0 || strcasecmp($tagName, 'section') === 0) { + if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { + //$this->dbg('Altering '.$node->getNodePath().' to p'); + $newNode = $this->dom->createElement('p'); + try { + $newNode->innerHTML = $node->innerHTML; + // It's easier to debug using original attributes. + //$newNode->setAttribute('class', $node->getAttribute('class')); + //$newNode->setAttribute('id', $node->getAttribute('id')); + $node = $node->parentNode->replaceChild($newNode, $node); + $nodeIndex--; + $nodesToScore[] = $newNode; + } catch (Exception $e) { + $this->dbg('Could not alter div/article to p, reverting back to div: ' . $e->getMessage()); + } + } else { + // Will change these P elements back to text nodes after processing. + for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) { + $childNode = $node->childNodes->item($i); + if (is_object($childNode) && get_class($childNode) === 'DOMProcessingInstruction') { //executable tags (parentNode->removeChild($childNode); + continue; + } + if ($childNode->nodeType == 3) { // XML_TEXT_NODE + //$this->dbg('replacing text node with a P tag with the same content.'); + $p = $this->dom->createElement('p'); + $p->innerHTML = $childNode->nodeValue; + $p->setAttribute('data-readability-styled', 'true'); + $childNode->parentNode->replaceChild($p, $childNode); + } + } + } + } + } + /** + * Loop through all paragraphs, and assign a score to them based on how content-y they look. + * Then add their score to their parent node. + * + * A score is determined by things like number of commas, class names, etc. + * Maybe eventually link density. + */ + for ($pt=0, $scored = count($nodesToScore); $pt < $scored; $pt++) { + $parentNode = $nodesToScore[$pt]->parentNode; + // No parent node? Move on... + if (!$parentNode) { + continue; + } + $grandParentNode = ($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null; + $innerText = $this->getInnerText($nodesToScore[$pt]); + // If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it. + if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) { + continue; + } + // Initialize readability data for the parent. + if (!$parentNode->hasAttribute('readability')) { + $this->initializeNode($parentNode); + $parentNode->setAttribute('data-candidate','true'); + } + // Initialize readability data for the grandparent. + if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) { + $this->initializeNode($grandParentNode); + $grandParentNode->setAttribute('data-candidate','true'); + } + // Add a point for the paragraph itself as a base. + $contentScore = 1; + // Add points for any commas within this paragraph. + $contentScore += $this->getCommaCount($innerText); + // For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points. + $contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3); + // For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points. + $contentScore += min(floor($this->getWordCount($innerText)/ self::SCORE_WORDS_IN_PARAGRAPH), 3); + /* TEST: For every positive/negative parent tag, add/substract half point. Up to 3 points. *\/ + $up = $nodesToScore[$pt]; + $score = 0; + while ($up->parentNode instanceof DOMElement) { + $up = $up->parentNode; + if (preg_match($this->regexps['positive'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) { + $score += 0.5; + } elseif (preg_match($this->regexps['negative'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) { + $score -= 0.5; + } + } + $score = floor($score); + $contentScore += max(min($score, 3), -3);/**/ + // Add the score to the parent. The grandparent gets half. + $parentNode->getAttributeNode('readability')->value += $contentScore; + if ($grandParentNode) { + $grandParentNode->getAttributeNode('readability')->value += $contentScore / self::GRANDPARENT_SCORE_DIVISOR; + } + } + /** + * Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc). + * This is faster to do before scoring but safer after. + */ + if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) { + $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement); + for ($node = null, $c = $candidates->length-1; $c >= 0; $c--) { + $node = $candidates->item($c); + // node should be readable but not inside of an article otherwise it's probably non-readable block + if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) { + $this->dbg('Removing unlikely candidate '.$node->getNodePath().' by "'.$node->tagName.'" with readability '.($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); + $node->parentNode->removeChild($node); + } + } + $candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement); + for ($node = null, $c = $candidates->length-1; $c >= 0; $c--) { + $node = $candidates->item($c); + $tagName = $node->tagName; + /* Remove unlikely candidates */ + $unlikelyMatchString = $node->getAttribute('class')." ".$node->getAttribute('id')." ".$node->getAttribute('style'); + //$this->dbg('Processing '.$node->getNodePath().' by "'. $unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int)$node->getAttributeNode('readability')->value : 0)); + if (mb_strlen($unlikelyMatchString) > 3 && // don't process "empty" strings + preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && + !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) + ) { + $this->dbg('Removing unlikely candidate '.$node->getNodePath().' by "'. $unlikelyMatchString.'" with readability '. ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); + $node->parentNode->removeChild($node); + $nodeIndex--; + } + } + unset($candidates); + } + /** + * After we've calculated scores, loop through all of the possible candidate nodes we found + * and find the one with the highest score. + */ + $topCandidate = null; + if ($xpath) { + // Using array of DOMElements after deletion is a path to DOOMElement. + $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement); + for ($c = $candidates->length-1; $c >= 0; $c--) { + // Scale the final candidates score based on link density. Good content should have a + // relatively small link density (5% or less) and be mostly unaffected by this operation. + // If not for this we would have used XPath to find maximum @readability. + $readability = $candidates->item($c)->getAttributeNode('readability'); + $readability->value = round($readability->value * (1 - $this->getLinkDensity($candidates->item($c))), 0, PHP_ROUND_HALF_UP); + if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) { + $this->dbg('Candidate: '.$candidates->item($c)->getNodePath().' ('.$candidates->item($c)->getAttribute('class').':'.$candidates->item($c)->getAttribute('id').') with score '.$readability->value); + $topCandidate = $candidates->item($c); + } + } + unset($candidates); + } + /** + * If we still have no top candidate, just use the body as a last resort. + * We also have to copy the body node so it is something we can modify. + */ + if ($topCandidate === null || strcasecmp($topCandidate->tagName, 'body') === 0) { + $topCandidate = $this->dom->createElement('div'); + if ($page instanceof \DOMDocument) { + if (!isset($page->documentElement)) { + // we don't have a body either? what a mess! :) + $this->dbg('The page has no body!'); + } else { + $this->dbg('Setting body to a raw HTML of original page!'); + $topCandidate->innerHTML = $page->documentElement->innerHTML; + $page->documentElement->innerHTML = ''; + $page->documentElement->appendChild($topCandidate); + } + } else { + $topCandidate->innerHTML = $page->innerHTML; + $page->innerHTML = ''; + $page->appendChild($topCandidate); + } + $this->initializeNode($topCandidate); + } + // Set table as the main node if resulted data is table element. + $tagName = $topCandidate->tagName; + if (strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'tr') === 0) { + $up = $topCandidate; + if ($up->parentNode instanceof DOMElement) { + $up = $up->parentNode; + if (strcasecmp($up->tagName, 'table') === 0) { + $topCandidate = $up; + } + } + } + $this->dbg('Top candidate: '.$topCandidate->getNodePath()); + /** + * Now that we have the top candidate, look through its siblings for content that might also be related. + * Things like preambles, content split by ads that we removed, etc. + */ + $articleContent = $this->dom->createElement('div'); + $articleContent->setAttribute('id', 'readability-content'); + $siblingScoreThreshold = max(10, ((int) $topCandidate->getAttribute('readability')) * 0.2); + $siblingNodes = $topCandidate->parentNode->childNodes; + if (!isset($siblingNodes)) { + $siblingNodes = new stdClass(); + $siblingNodes->length = 0; + } + for ($s = 0, $sl = $siblingNodes->length; $s < $sl; $s++) { + $siblingNode = $siblingNodes->item($s); + $siblingNodeName = $siblingNode->nodeName; + $append = false; + $this->dbg('Looking at sibling node: ' . $siblingNode->getNodePath() . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); + //$this->dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown')); + if ($siblingNode->isSameNode($topCandidate)) { + $append = true; + } + $contentBonus = 0; + // Give a bonus if sibling nodes and top candidates have the same classname. + if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') { + $contentBonus += ((int) $topCandidate->getAttribute('readability')) * 0.2; + } + if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int) $siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) { + $append = true; + } + if (strcasecmp($siblingNodeName, 'p') === 0) { + $linkDensity = $this->getLinkDensity($siblingNode); + $nodeContent = $this->getInnerText($siblingNode, true, true); + $nodeLength = mb_strlen($nodeContent); + if ($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY) { + $append = true; + } elseif ($nodeLength < self::MIN_NODE_LENGTH && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) { + $append = true; + } + } + if ($append) { + $this->dbg('Appending node: ' . $siblingNode->getNodePath()); + $nodeToAppend = null; + if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) { + /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ + $this->dbg('Altering siblingNode ' . $siblingNodeName . ' to div.'); + $nodeToAppend = $this->dom->createElement('div'); + try { + $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); + $nodeToAppend->setAttribute('alt', $siblingNodeName); + $nodeToAppend->innerHTML = $siblingNode->innerHTML; + } catch (Exception $e) { + $this->dbg('Could not alter siblingNode ' . $siblingNodeName . ' to div, reverting to original.'); + $nodeToAppend = $siblingNode; + $s--; + $sl--; + } + } else { + $nodeToAppend = $siblingNode; + $s--; + $sl--; + } + // To ensure a node does not interfere with readability styles, remove its classnames & ids. + // Now done via RegExp post_filter. + //$nodeToAppend->removeAttribute('class'); + //$nodeToAppend->removeAttribute('id'); + // Append sibling and subtract from our list as appending removes a node. + $articleContent->appendChild($nodeToAppend); + } + } + unset($xpath); + // So we have all of the content that we need. Now we clean it up for presentation. + $this->prepArticle($articleContent); + /** + * Now that we've gone through the full algorithm, check to see if we got any meaningful content. + * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher + * likelihood of finding the content, and the sieve approach gives us a higher likelihood of + * finding the -right- content. + */ + if (mb_strlen($this->getInnerText($articleContent, false)) < self::MIN_ARTICLE_LENGTH) { + if (!$this->body->hasChildNodes()) { + $this->body = $this->dom->createElement('body'); + } + $this->body->innerHTML = $this->bodyCache; + if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { + $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); + $this->dbg("...content is shorter than ".self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n"); + + return $this->grabArticle($this->body); + } elseif ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) { + $this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES); + $this->dbg("...content is shorter than ".self::MIN_ARTICLE_LENGTH." letters, trying not to weight attributes.\n"); + + return $this->grabArticle($this->body); + } elseif ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { + $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); + $this->dbg("...content is shorter than ".self::MIN_ARTICLE_LENGTH." letters, trying not to clean at all.\n"); + + return $this->grabArticle($this->body); + } else { + return false; + } + } + + return $articleContent; + } + /** + * Get the inner text of a node. + * This also strips out any excess whitespace to be found. + * + * @param DOMElement $e + * @param boolean $normalizeSpaces (default: true) + * @param boolean $flattenLines (default: false) + * @return string + */ + public function getInnerText($e, $normalizeSpaces=true, $flattenLines=false) + { + if (!isset($e->textContent) || $e->textContent === '') { + return ''; + } + $textContent = trim($e->textContent); + if ($flattenLines) { + $textContent = mb_ereg_replace('(?:[\r\n](?:\s| )*)+', '', $textContent); + } elseif ($normalizeSpaces) { + $textContent = mb_ereg_replace('\s\s+', ' ', $textContent); + } + + return $textContent; + } + /** + * Remove the style attribute on every $e and under. + * + * @param DOMElement $e + * @return void + */ + public function cleanStyles($e) + { + if (!is_object($e)) { + return; + } + $elems = $e->getElementsByTagName('*'); + foreach ($elems as $elem) { + $elem->removeAttribute('style'); + } + } + /** + * Get comma number for a given text. + * + * @param string $text + * @return number (integer) + */ + public function getCommaCount($text) + { + return substr_count($text, ','); + } + /** + * Get words number for a given text if words separated by a space. + * Input string should be normalized. + * + * @param string $text + * @return number (integer) + */ + public function getWordCount($text) + { + return substr_count($text, ' '); + } + /** + * Get the density of links as a percentage of the content + * This is the amount of text that is inside a link divided by the total text in the node. + * Can exclude external references to differentiate between simple text and menus/infoblocks. + * + * @param DOMElement $e + * @param string $excludeExternal + * @return number (float) + */ + public function getLinkDensity($e, $excludeExternal=false) + { + $links = $e->getElementsByTagName('a'); + $textLength = mb_strlen($this->getInnerText($e, true, true)); + $linkLength = 0; + for ($dRe = $this->domainRegExp, $i=0, $il=$links->length; $i < $il; $i++) { + if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) { + continue; + } + $linkLength += mb_strlen($this->getInnerText($links->item($i))); + } + if ($textLength > 0 && $linkLength > 0) { + return $linkLength / $textLength; + } else { + return 0; + } + } + /** + * Get an element weight by attribute. + * Uses regular expressions to tell if this element looks good or bad. + * + * @param DOMElement $element + * @param string $attribute + * @return number (Integer) + */ + protected function weightAttribute($element, $attribute) + { + if (!$element->hasAttribute($attribute)) { + return 0; + } + $weight = 0; + //$attribute_val = trim($element->getAttribute('class')." ".$element->getAttribute('id')); + $attribute_val = trim($element->getAttribute($attribute)); + if ($attribute_val != '') { + if (preg_match($this->regexps['negative'], $attribute_val)) { + $weight -= 25; + } + if (preg_match($this->regexps['positive'], $attribute_val)) { + $weight += 25; + } + if (preg_match($this->regexps['unlikelyCandidates'], $attribute_val)) { + $weight -= 5; + } + if (preg_match($this->regexps['okMaybeItsACandidate'], $attribute_val)) { + $weight += 5; + } + } + + return $weight; + } + /** + * Get an element relative weight. + * + * @param DOMElement $e + * @return number (Integer) + */ + public function getWeight($e) + { + if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) { + return 0; + } + $weight = 0; + /* Look for a special classname */ + $weight += $this->weightAttribute($e, 'class'); + /* Look for a special ID */ + $weight += $this->weightAttribute($e, 'id'); + + return $weight; + } + /** + * Remove extraneous break tags from a node. + * + * @param DOMElement $node + * @return void + */ + public function killBreaks($node) + { + $html = $node->innerHTML; + $html = preg_replace($this->regexps['killBreaks'], '
', $html); + $node->innerHTML = $html; + } + /** + * Clean a node of all elements of type "tag". + * (Unless it's a youtube/vimeo video. People love movies.) + * + * Updated 2012-09-18 to preserve youtube/vimeo iframes + * + * @param DOMElement $e + * @param string $tag + * @return void + */ + public function clean($e, $tag) + { + $targetList = $e->getElementsByTagName($tag); + $isEmbed = ($tag === 'audio' || $tag === 'video' || $tag === 'iframe' || $tag === 'object' || $tag === 'embed'); + for ($cur_item = null, $y = $targetList->length-1; $y >= 0; $y--) { + /* Allow youtube and vimeo videos through as people usually want to see those. */ + $cur_item = $targetList->item($y); + if ($isEmbed) { + $attributeValues = $cur_item->getAttribute('src').' '.$cur_item->getAttribute('href'); + /* First, check the elements attributes to see if any of them contain known media hosts */ + if (preg_match($this->regexps['media'], $attributeValues)) { + continue; + } + /* Then check the elements inside this element for the same. */ + if (preg_match($this->regexps['media'], $targetList->item($y)->innerHTML)) { + continue; + } + } + $cur_item->parentNode->removeChild($cur_item); + } + } + /** + * Clean an element of all tags of type "tag" if they look fishy. + * "Fishy" is an algorithm based on content length, classnames, + * link density, number of images & embeds, etc. + * + * @param DOMElement $e + * @param string $tag + * @return void + */ + public function cleanConditionally($e, $tag) + { + if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { + return; + } + $tagsList = $e->getElementsByTagName($tag); + $curTagsLength = $tagsList->length; + /** + * Gather counts for other typical elements embedded within. + * Traverse backwards so we can remove nodes at the same time without effecting the traversal. + * + * TODO: Consider taking into account original contentScore here. + */ + for ($node = null, $i = $curTagsLength - 1; $i >= 0; $i--) { + $node = $tagsList->item($i); + //$class = $node->getAttribute('class').' '.$node->getAttribute('id'); //debug + $weight = $this->getWeight($node); + $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0; + $this->dbg('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : '')); + if ($weight + $contentScore < 0) { + $this->dbg('Removing...'); + $node->parentNode->removeChild($node); + } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) { + /** + * If there are not very many commas, and the number of + * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. + */ + $p = $node->getElementsByTagName('p')->length; + $img = $node->getElementsByTagName('img')->length; + $li = $node->getElementsByTagName('li')->length-100; + $input = $node->getElementsByTagName('input')->length; + $a = $node->getElementsByTagName('a')->length; + $embedCount = 0; + $embeds = $node->getElementsByTagName('embed'); + for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { + if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { + $embedCount++; + } + } + $embeds = $node->getElementsByTagName('iframe'); + for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { + if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { + $embedCount++; + } + } + $linkDensity = $this->getLinkDensity($node, true); + $contentLength = mb_strlen($this->getInnerText($node)); + $toRemove = false; + if ($this->lightClean) { + if ($li > $p && $tag != 'ul' && $tag != 'ol') { + $this->dbg(' too many

  • elements, and parent is not
      or
        '); + $toRemove = true; + } elseif ( $input > floor($p/3) ) { + $this->dbg(' too many elements'); + $toRemove = true; + } elseif ($contentLength < 6 && ($embedCount === 0 && ($img === 0 || $img > 2))) { + $this->dbg(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images'); + $toRemove = true; + } elseif ($weight < 25 && $linkDensity > 0.25) { + $this->dbg(' weight is '.$weight.' < 25 and link density is '.sprintf("%.2f", $linkDensity).' > 0.25'); + $toRemove = true; + } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { + $this->dbg(' more than 2 links and weight is '.$weight.' > 25 but link density is '.sprintf("%.2f", $linkDensity).' > 0.5'); + $toRemove = true; + } elseif ($embedCount > 3) { + $this->dbg(' more than 3 embeds'); + $toRemove = true; + } + } else { + if ($img > $p) { + $this->dbg(' more image elements than paragraph elements'); + $toRemove = true; + } elseif ($li > $p && $tag != 'ul' && $tag != 'ol') { + $this->dbg(' too many
      1. elements, and parent is not
          or
            '); + $toRemove = true; + } elseif ( $input > floor($p/3) ) { + $this->dbg(' too many elements'); + $toRemove = true; + } elseif ($contentLength < 25 && ($img === 0 || $img > 2) ) { + $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images'); + $toRemove = true; + } elseif ($weight < 25 && $linkDensity > 0.2) { + $this->dbg(' weight is '.$weight.' lower than 0 and link density is '.sprintf("%.2f", $linkDensity).' > 0.2'); + $toRemove = true; + } elseif ($weight >= 25 && $linkDensity > 0.5) { + $this->dbg(' weight above 25 but link density is '.sprintf("%.2f", $linkDensity).' > 0.5'); + $toRemove = true; + } elseif (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) { + $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed'); + $toRemove = true; + } + } + if ($toRemove) { + //$this->dbg('Removing: '.$node->innerHTML); + $this->dbg('Removing...'); + $node->parentNode->removeChild($node); + } + } + } + } + /** + * Clean out spurious headers from an Element. Checks things like classnames and link density. + * + * @param DOMElement $e + * @return void + */ + public function cleanHeaders($e) + { + for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) { + $headers = $e->getElementsByTagName('h' . $headerIndex); + for ($i=$headers->length-1; $i >=0; $i--) { + if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { + $headers->item($i)->parentNode->removeChild($headers->item($i)); + } + } + } + } + public function flagIsActive($flag) + { + return ($this->flags & $flag) > 0; + } + public function addFlag($flag) + { + $this->flags = $this->flags | $flag; + } + public function removeFlag($flag) + { + $this->flags = $this->flags & ~$flag; + } +} diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php new file mode 100644 index 0000000..b36d234 --- /dev/null +++ b/tests/ReadabilityTest.php @@ -0,0 +1,228 @@ +debugText; + } + + public function getDomainRegexp() + { + return $this->domainRegExp; + } +} + +class ReadabilityTest extends \PHPUnit_Framework_TestCase +{ + protected function setUp() + { + } + + protected function tearDown() + { + } + + public function testConstructDefault() + { + $readability = new ReadabilityTested(''); + + $this->assertNull($readability->url); + $this->assertContains('Parsing URL', $readability->getDebugText()); + $this->assertContains('Tidying document', $readability->getDebugText()); + $this->assertNull($readability->getDomainRegexp()); + $this->assertInstanceOf('DomDocument', $readability->dom); + } + + public function testConstructSimple() + { + $readability = new ReadabilityTested('', 'http://0.0.0.0'); + + $this->assertEquals('http://0.0.0.0', $readability->url); + $this->assertContains('Parsing URL', $readability->getDebugText()); + $this->assertContains('Tidying document', $readability->getDebugText()); + $this->assertEquals('/0\.0\.0\.0/', $readability->getDomainRegexp()); + $this->assertInstanceOf('DomDocument', $readability->dom); + } + + public function testInitNoContent() + { + $readability = new ReadabilityTested('', 'http://0.0.0.0'); + $res = $readability->init(); + + $this->assertFalse($res); + $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent()); + $this->assertContains('Sorry, Readability was unable to parse this page for content.', $readability->getContent()->innerHTML); + } + + public function testInitP() + { + $readability = new ReadabilityTested(str_repeat('

            This is the awesome content :)

            ', 7), 'http://0.0.0.0'); + $res = $readability->init(); + + $this->assertTrue($res); + $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent()); + $this->assertContains('
            getContent()->innerHTML); + } + + public function testInitDivP() + { + $readability = new ReadabilityTested('
            '.str_repeat('

            This is the awesome content :)

            ', 7).'
            ', 'http://0.0.0.0'); + $res = $readability->init(); + + $this->assertTrue($res); + $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent()); + $this->assertContains('
            getContent()->innerHTML); + } + + public function testInitDiv() + { + $readability = new ReadabilityTested('
            '.str_repeat('This is the awesome content :)', 7).'
            ', 'http://0.0.0.0'); + $readability->debug = true; + $res = $readability->init(); + + $this->assertTrue($res); + $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent()); + $this->assertContains('
            getContent()->innerHTML); + } + + public function testWithFootnotes() + { + $readability = new ReadabilityTested('
            '.str_repeat('

            This is an awesome text with some links, here there are: the awesome

            ', 7).'
            ', 'http://0.0.0.0'); + $readability->debug = true; + $readability->convertLinksToFootnotes = true; + $res = $readability->init(); + + $this->assertTrue($res); + $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent()); + $this->assertContains('
            getContent()->innerHTML); + $this->assertContains('readabilityFootnoteLink', $readability->getContent()->innerHTML); + $this->assertContains('readabilityLink-3', $readability->getContent()->innerHTML); + } + + public function testStandardClean() + { + $readability = new ReadabilityTested('

            Title

            '.str_repeat('

            This is an awesome text with some links, here there are: the awesome

            ', 7).'will be removed
            ', 'http://0.0.0.0'); + $readability->debug = true; + $readability->lightClean = false; + $res = $readability->init(); + + $this->assertTrue($res); + $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent()); + $this->assertContains('
            getContent()->innerHTML); + $this->assertNotContains('will be removed', $readability->getContent()->innerHTML); + $this->assertNotContains('

            ', $readability->getContent()->innerHTML); + } + + public function testWithIframe() + { + $readability = new ReadabilityTested('

            Title

            '.str_repeat('

            This is an awesome text with some links, here there are: the awesome

            ', 7).'

            This is an awesome text with some links, here there are

            ', 'http://0.0.0.0'); + $readability->debug = true; + $res = $readability->init(); + + $this->assertTrue($res); + $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent()); + $this->assertContains('
            getContent()->innerHTML); + $this->assertContains('nofollow', $readability->getContent()->innerHTML); + } + + public function testWithArticle() + { + $readability = new ReadabilityTested('

            '.str_repeat('This is an awesome text with some links, here there are: the awesome', 20).'

            This is an awesome text with some links, here there are

            ', 'http://0.0.0.0'); + $readability->debug = true; + $res = $readability->init(); + + $this->assertTrue($res); + $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent()); + $this->assertContains('alt="article"', $readability->getContent()->innerHTML); + $this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML); + $this->assertContains('nofollow', $readability->getContent()->innerHTML); + } + + public function testWithAside() + { + $readability = new ReadabilityTested('
            '.str_repeat('

            This is an awesome text with some links, here there are: the awesome

            ', 7).'
            ', 'http://0.0.0.0'); + $readability->debug = true; + $res = $readability->init(); + + $this->assertTrue($res); + $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent()); + $this->assertContains('alt="article"', $readability->getContent()->innerHTML); + $this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML); + $this->assertNotContains('