Merge pull request #3 from j0k3r/phpunit-travis

Improve Travis
11 years ago · b69619d386
parent ddd013e3f8 1963319a55
commit b69619d386
8 changed files with 161 additions and 129 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,3 @@
 vendor/
+coverage/
+composer.lock
--- a/.scrutinizer.yml
+++ b/.scrutinizer.yml
@ -0,0 +1,3 @@
+tools:
+    external_code_coverage:
+        timeout: 600
--- a/.travis.yml
+++ b/.travis.yml
@ -4,10 +4,29 @@ php:
    - 5.4
    - 5.5
    - 5.6
+    - nightly
+    - hhvm-nightly

-before_script:
+# run build against nightly but allow them to fail
+matrix:
+    fast_finish: true
+    allow_failures:
+        - php: nightly
+        - php: hhvm-nightly
+
+# faster builds on new travis setup not using sudo
+sudo: false
+
+install:
    - composer self-update
+
+before_script:
    - composer install --prefer-dist --no-interaction

 script:
-    - phpunit --coverage-text
+    - phpunit --coverage-clover=coverage.clover
+
+after_script:
+    - |
+        wget https://scrutinizer-ci.com/ocular.phar
+        php ocular.phar code-coverage:upload --format=php-clover coverage.clover
--- a/README.md
+++ b/README.md
@ -1,6 +1,7 @@
 # Readability

 [![Build Status](https://travis-ci.org/j0k3r/php-readability.svg?branch=master)](https://travis-ci.org/j0k3r/php-readability)
+[![Code Coverage](https://scrutinizer-ci.com/g/j0k3r/php-readability/badges/coverage.png?b=master)](https://scrutinizer-ci.com/g/j0k3r/php-readability/?branch=master)

 This is an extract of the Readability class from the [full-text-rss](https://github.com/Dither/full-text-rss) fork. It can be defined as a better version of the original [php-readability](http://code.fivefilters.org/php-readability).

--- a/phpunit.xml.dist
+++ b/phpunit.xml.dist
@ -19,11 +19,14 @@

    <filter>
        <whitelist>
-            <directory>./src/TubeLink/</directory>
+            <directory>./src/</directory>
            <exclude>
                <directory>./tests</directory>
            </exclude>
        </whitelist>
    </filter>

+    <logging>
+        <log type="coverage-html" target="coverage" title="FullText" charset="UTF-8" yui="true" highlight="true" lowUpperBound="35" highLowerBound="70"/>
+    </logging>
 </phpunit>
--- a/src/JSLikeHTMLElement.php
+++ b/src/JSLikeHTMLElement.php
@ -3,7 +3,7 @@
 namespace Readability;

 /**
- * JavaScript-like HTML DOM Element
+ * JavaScript-like HTML DOM Element.
 *
 * This class extends PHP's DOMElement to allow
 * users to get and set the innerHTML property of
@ -31,12 +31,14 @@ namespace Readability;
 *     echo $doc->saveXML();
 *
 * @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net
+ *
 * @see http://fivefilters.org (the project this was written for)
 */
 class JSLikeHTMLElement extends \DOMElement
 {
    /**
-     * Used for setting innerHTML like it's done in JavaScript:
+     * Used for setting innerHTML like it's done in JavaScript.
+     *
     * @code
     * $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>';
     * @endcode
@ -86,7 +88,8 @@ class JSLikeHTMLElement extends \DOMElement
    }

    /**
-     * Used for getting innerHTML like it's done in JavaScript:
+     * Used for getting innerHTML like it's done in JavaScript.
+     *
     * @code
     * $string = $div->innerHTML;
     * @endcode
@ -105,7 +108,7 @@ class JSLikeHTMLElement extends \DOMElement
        $trace = debug_backtrace();
        trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);

-        return null;
+        return;
    }

    public function __toString()
--- a/src/Readability.php
+++ b/src/Readability.php
@ -14,7 +14,7 @@ namespace Readability;
 * More information: http://fivefilters.org/content-only/
 * License: Apache License, Version 2.0
 * Requires: PHP version 5.2.0+
- * Date: 2013-08-02
+ * Date: 2013-08-02.
 *
 * Differences between the PHP port and the original
 * ------------------------------------------------------
@ -52,6 +52,7 @@ class Readability
    public $revertForcedParagraphElements = true;
    public $articleTitle;
    public $articleContent;
+    public $original_html;
    public $dom;
    public $url = null; // optional - URL where HTML was retrieved
    public $lightClean = true; // preserves more content (experimental)
@ -75,7 +76,7 @@ class Readability
        'divToPElements' => '/<(?:blockquote|code|div|article|footer|aside|img|p|pre|dl|ol|ul)/mi',
        'killBreaks' => '/(<br\s*\/?>([ \r\n\s]|&nbsp;?)*)+/',
        'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|viddler)\.(?:com|be|org|net)/!i',
-        'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
+        'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i',
    );
    public $tidy_config = array(
        'tidy-mark' => false,
@ -100,7 +101,7 @@ class Readability
        // 'merge-spans' => true,
        'input-encoding' => '????',
        'output-encoding' => 'utf8',
-        'hide-comments' => true
+        'hide-comments' => true,
    );
    // raw HTML filters
    protected $pre_filters = array(
@ -110,7 +111,7 @@ class Readability
        '!<font[^>]*>\s*\[AD\]\s*</font>!is' => '', // HACK: firewall-filtered content
        '!(<br[^>]*>[ \r\n\s]*){2,}!i' => '</p><p>', // HACK: replace linebreaks plus br's with p's
        //'!</?noscript>!is' => '', // replace noscripts
-        '!<(/?)font[^>]*>!is' => '<\\1span>' // replace fonts to spans
+        '!<(/?)font[^>]*>!is' => '<\\1span>', // replace fonts to spans
    );
    // output HTML filters
    protected $post_filters = array(
@ -120,7 +121,7 @@ class Readability
        "/\n+/" => "\n", //single newlines cleanup
        '!<pre[^>]*>\s*<code!is' => '<pre', // modern web...
        '!</code>\s*</pre>!is' => '</pre>',
-        '!<[hb]r>!is' => '<\\1 />'
+        '!<[hb]r>!is' => '<\\1 />',
    );
    // flags
    const FLAG_STRIP_UNLIKELYS = 1;
@ -138,11 +139,12 @@ class Readability
    const MIN_NODE_LENGTH = 80;
    const MAX_LINK_DENSITY = 0.25;
    /**
-     * Create instance of Readability
+     * Create instance of Readability.
+     *
     * @param string UTF-8 encoded string
     * @param string (optional) URL associated with HTML (for footnotes)
     * @param string (optional) Which parser to use for turning raw HTML into a DOMDocument
-     * @param boolean (optional) Use tidy
+     * @param bool (optional) Use tidy
     */
    public function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true)
    {
@ -153,9 +155,9 @@ class Readability
            $this->domainRegExp = '/'.strtr(preg_replace('/www\d*\./', '', parse_url($url, PHP_URL_HOST)), array('.' => '\.')).'/';
        }

-        mb_internal_encoding("UTF-8");
-        mb_http_output("UTF-8");
-        mb_regex_encoding("UTF-8");
+        mb_internal_encoding('UTF-8');
+        mb_http_output('UTF-8');
+        mb_regex_encoding('UTF-8');

        // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
        if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
@ -169,7 +171,7 @@ class Readability
            $html = '<html></html>';
        }

-        /**
+        /*
         * Use tidy (if it exists).
         * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
         * Although sometimes it makes matters worse, which is why there is an option to disable it.
@ -179,7 +181,7 @@ class Readability
            $this->debugText .= 'Tidying document'."\n";
            $tidy = tidy_parse_string($html, $this->tidy_config, 'UTF8');
            if (tidy_clean_repair($tidy)) {
-                $original_html = $html;
+                $this->original_html = $html;
                $this->tidied = true;
                $html = $tidy->value;
                $html = preg_replace('/<html[^>]+>/i', '<html>', $html);
@ -187,7 +189,7 @@ class Readability
            }
            unset($tidy);
        }
-        $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
+        $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');

        if (!($parser == 'html5lib' && ($this->dom = \HTML5_Parser::parse($html)))) {
            libxml_use_internal_errors(true);
@ -200,7 +202,8 @@ class Readability
        $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
    }
    /**
-     * Get article title element
+     * Get article title element.
+     *
     * @return DOMElement
     */
    public function getTitle()
@ -208,7 +211,8 @@ class Readability
        return $this->articleTitle;
    }
    /**
-     * Get article content element
+     * Get article content element.
+     *
     * @return DOMElement
     */
    public function getContent()
@ -216,7 +220,8 @@ class Readability
        return $this->articleContent;
    }
    /**
-     * Add pre filter for raw input HTML processing
+     * Add pre filter for raw input HTML processing.
+     *
     * @param string RegExp for replace
     * @param string (optional) Replacer
     */
@ -225,7 +230,8 @@ class Readability
        $this->pre_filters[$filter] = $replacer;
    }
    /**
-     * Add post filter for raw output HTML processing
+     * Add post filter for raw output HTML processing.
+     *
     * @param string RegExp for replace
     * @param string (optional) Replacer
     */
@ -243,7 +249,7 @@ class Readability
     *  4. Replace the current DOM tree with the new one.
     *  5. Read peacefully.
     *
-     * @return boolean true if we found content, false otherwise
+     * @return bool true if we found content, false otherwise
     */
    public function init()
    {
@ -258,7 +264,7 @@ class Readability
        if ($this->bodyCache == null) {
            $this->bodyCache = '';
            foreach ($bodyElems as $bodyNode) {
-                $this->bodyCache += $bodyNode->innerHTML;
+                $this->bodyCache .= trim($bodyNode->innerHTML);
            }
        }
        if ($bodyElems->length > 0 && $this->body == null) {
@ -295,7 +301,7 @@ class Readability
        return $this->success;
    }
    /**
-     * Debug
+     * Debug.
     */
    protected function dbg($msg) //, $error=false)
    {
@ -305,12 +311,12 @@ class Readability
    }

    /**
-     * Dump debug info
+     * Dump debug info.
     */
    protected function dump_dbg()
    {
        if ($this->debug) {
-            openlog("Readability PHP ", LOG_PID | LOG_PERROR, 0);
+            openlog('Readability PHP ', LOG_PID | LOG_PERROR, 0);
            syslog(6, $this->debugText); // 1 - error 6 - info
        }
    }
@ -318,7 +324,6 @@ class Readability
     * Run any post-process modifications to article content as necessary.
     *
     * @param DOMElement
-     * @return void
     */
    public function postProcessContent($articleContent)
    {
@ -337,7 +342,8 @@ class Readability
        $origTitle = '';
        try {
            $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
-        } catch (Exception $e) {}
+        } catch (Exception $e) {
+        }
        if (preg_match('/ [\|\-] /', $curTitle)) {
            $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
            if (count(explode(' ', $curTitle)) < 3) {
@ -366,12 +372,10 @@ class Readability
    /**
     * Prepare the HTML document for readability to scrape it.
     * This includes things like stripping javascript, CSS, and handling terrible markup.
-     *
-     * @return void
     */
    protected function prepDocument()
    {
-        /**
+        /*
         * In some cases a body element can't be found (if the HTML is totally hosed for example)
         * so we create a new body node and append it to the document.
         */
@ -392,9 +396,8 @@ class Readability
    }
    /**
     * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
-     * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
     *
-     * @return void
+     * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
     */
    public function addFootnotes($articleContent)
    {
@ -450,7 +453,6 @@ class Readability
     * iframes, forms, strip extraneous <p> tags, etc.
     *
     * @param DOMElement
-     * @return void
     */
    public function prepArticle($articleContent)
    {
@ -463,7 +465,7 @@ class Readability
        $this->killBreaks($articleContent);
        $xpath = new \DOMXPath($articleContent->ownerDocument);
        if ($this->revertForcedParagraphElements) {
-            /**
+            /*
             * Reverts P elements with class 'readability-styled' to text nodes:
             * which is what they were before.
             */
@ -493,7 +495,7 @@ class Readability
        $this->clean($articleContent, 'canvas');
        $this->clean($articleContent, 'h1');

-        /**
+        /*
         * If there is only one h2, they are probably using it as a main header, so remove it since we
         *  already have a header.
         */
@ -536,7 +538,7 @@ class Readability
                }
                unset($search, $replace);
            } catch (Exception $e) {
-                $this->dbg("Cleaning output HTML failed. Ignoring: " . $e->getMessage());
+                $this->dbg('Cleaning output HTML failed. Ignoring: '.$e->getMessage());
            }
        }
    }
@ -545,7 +547,6 @@ class Readability
     * className/id for special names to add to its score.
     *
     * @param Element
-     * @return void
     */
    protected function initializeNode($node)
    {
@ -667,7 +668,7 @@ class Readability
                }
            }
        }
-        /**
+        /*
         * Loop through all paragraphs, and assign a score to them based on how content-y they look.
         * Then add their score to their parent node.
         *
@ -723,7 +724,7 @@ class Readability
                $grandParentNode->getAttributeNode('readability')->value += $contentScore / self::GRANDPARENT_SCORE_DIVISOR;
            }
        }
-        /**
+        /*
         * Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc).
         * This is faster to do before scoring but safer after.
         */
@ -742,7 +743,7 @@ class Readability
                $node = $candidates->item($c);
                $tagName = $node->tagName;
                /* Remove unlikely candidates */
-                $unlikelyMatchString = $node->getAttribute('class')." ".$node->getAttribute('id')." ".$node->getAttribute('style');
+                $unlikelyMatchString = $node->getAttribute('class').' '.$node->getAttribute('id').' '.$node->getAttribute('style');
                //$this->dbg('Processing '.$node->getNodePath().' by "'. $unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int)$node->getAttributeNode('readability')->value : 0));
                if (mb_strlen($unlikelyMatchString) > 3 && // don't process "empty" strings
                    preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
@ -755,7 +756,7 @@ class Readability
            }
            unset($candidates);
        }
-        /**
+        /*
         * After we've calculated scores, loop through all of the possible candidate nodes we found
         * and find the one with the highest score.
         */
@ -776,7 +777,7 @@ class Readability
            }
            unset($candidates);
        }
-        /**
+        /*
         * If we still have no top candidate, just use the body as a last resort.
         * We also have to copy the body node so it is something we can modify.
         */
@ -811,7 +812,7 @@ class Readability
            }
        }
        $this->dbg('Top candidate: '.$topCandidate->getNodePath());
-        /**
+        /*
         * Now that we have the top candidate, look through its siblings for content that might also be related.
         * Things like preambles, content split by ads that we removed, etc.
         */
@ -883,7 +884,7 @@ class Readability
        unset($xpath);
        // So we have all of the content that we need. Now we clean it up for presentation.
        $this->prepArticle($articleContent);
-        /**
+        /*
         * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
         * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
         * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
@ -896,17 +897,17 @@ class Readability
            $this->body->innerHTML = $this->bodyCache;
            if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
                $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
-                $this->dbg("...content is shorter than ".self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n");
+                $this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to strip unlikely content.\n");

                return $this->grabArticle($this->body);
            } elseif ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
                $this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES);
-                $this->dbg("...content is shorter than ".self::MIN_ARTICLE_LENGTH." letters, trying not to weight attributes.\n");
+                $this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to weight attributes.\n");

                return $this->grabArticle($this->body);
            } elseif ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
                $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
-                $this->dbg("...content is shorter than ".self::MIN_ARTICLE_LENGTH." letters, trying not to clean at all.\n");
+                $this->dbg('...content is shorter than '.self::MIN_ARTICLE_LENGTH." letters, trying not to clean at all.\n");

                return $this->grabArticle($this->body);
            } else {
@ -921,8 +922,9 @@ class Readability
     * This also strips out any excess whitespace to be found.
     *
     * @param DOMElement $e
-     * @param  boolean    $normalizeSpaces (default: true)
-     * @param  boolean    $flattenLines    (default: false)
+     * @param bool       $normalizeSpaces (default: true)
+     * @param bool       $flattenLines    (default: false)
+     *
     * @return string
     */
    public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false)
@ -943,7 +945,6 @@ class Readability
     * Remove the style attribute on every $e and under.
     *
     * @param DOMElement $e
-     * @return void
     */
    public function cleanStyles($e)
    {
@ -959,6 +960,7 @@ class Readability
     * Get comma number for a given text.
     *
     * @param string $text
+     *
     * @return number (integer)
     */
    public function getCommaCount($text)
@ -970,6 +972,7 @@ class Readability
     * Input string should be normalized.
     *
     * @param string $text
+     *
     * @return number (integer)
     */
    public function getWordCount($text)
@ -983,6 +986,7 @@ class Readability
     *
     * @param DOMElement $e
     * @param string     $excludeExternal
+     *
     * @return number (float)
     */
    public function getLinkDensity($e, $excludeExternal = false)
@ -1008,6 +1012,7 @@ class Readability
     *
     * @param DOMElement $element
     * @param string     $attribute
+     *
     * @return number (Integer)
     */
    protected function weightAttribute($element, $attribute)
@ -1039,6 +1044,7 @@ class Readability
     * Get an element relative weight.
     *
     * @param DOMElement $e
+     *
     * @return number (Integer)
     */
    public function getWeight($e)
@ -1058,7 +1064,6 @@ class Readability
     * Remove extraneous break tags from a node.
     *
     * @param DOMElement $node
-     * @return void
     */
    public function killBreaks($node)
    {
@ -1068,13 +1073,12 @@ class Readability
    }
    /**
     * Clean a node of all elements of type "tag".
-     * (Unless it's a youtube/vimeo video. People love movies.)
+     * (Unless it's a youtube/vimeo video. People love movies.)
     *
     * Updated 2012-09-18 to preserve youtube/vimeo iframes
     *
     * @param DOMElement $e
     * @param string     $tag
-     * @return void
     */
    public function clean($e, $tag)
    {
@ -1104,7 +1108,6 @@ class Readability
     *
     * @param DOMElement $e
     * @param string     $tag
-     * @return void
     */
    public function cleanConditionally($e, $tag)
    {
@ -1113,7 +1116,7 @@ class Readability
        }
        $tagsList = $e->getElementsByTagName($tag);
        $curTagsLength = $tagsList->length;
-        /**
+        /*
         * Gather counts for other typical elements embedded within.
         * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
         *
@ -1129,7 +1132,7 @@ class Readability
                $this->dbg('Removing...');
                $node->parentNode->removeChild($node);
            } elseif ($this->getCommaCount($this->getInnerText($node)) < self::MIN_COMMAS_IN_PARAGRAPH) {
-                /**
+                /*
                 * If there are not very many commas, and the number of
                 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
                 */
@ -1165,10 +1168,10 @@ class Readability
                        $this->dbg(' content length less than 6 chars, 0 embeds and either 0 images or more than 2 images');
                        $toRemove = true;
                    } elseif ($weight < 25 && $linkDensity > 0.25) {
-                        $this->dbg(' weight is '.$weight.' < 25 and link density is '.sprintf("%.2f", $linkDensity).' > 0.25');
+                        $this->dbg(' weight is '.$weight.' < 25 and link density is '.sprintf('%.2f', $linkDensity).' > 0.25');
                        $toRemove = true;
                    } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
-                        $this->dbg('  more than 2 links and weight is '.$weight.' > 25 but link density is '.sprintf("%.2f", $linkDensity).' > 0.5');
+                        $this->dbg('  more than 2 links and weight is '.$weight.' > 25 but link density is '.sprintf('%.2f', $linkDensity).' > 0.5');
                        $toRemove = true;
                    } elseif ($embedCount > 3) {
                        $this->dbg(' more than 3 embeds');
@ -1188,10 +1191,10 @@ class Readability
                        $this->dbg('  content length less than 25 chars and 0 images, or more than 2 images');
                        $toRemove = true;
                    } elseif ($weight < 25 && $linkDensity > 0.2) {
-                        $this->dbg('  weight is '.$weight.' lower than 0 and link density is '.sprintf("%.2f", $linkDensity).' > 0.2');
+                        $this->dbg('  weight is '.$weight.' lower than 0 and link density is '.sprintf('%.2f', $linkDensity).' > 0.2');
                        $toRemove = true;
                    } elseif ($weight >= 25 && $linkDensity > 0.5) {
-                        $this->dbg('  weight above 25 but link density is '.sprintf("%.2f", $linkDensity).' > 0.5');
+                        $this->dbg('  weight above 25 but link density is '.sprintf('%.2f', $linkDensity).' > 0.5');
                        $toRemove = true;
                    } elseif (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
                        $this->dbg('  1 embed and content length smaller than 75 chars, or more than one embed');
@ -1210,7 +1213,6 @@ class Readability
     * Clean out spurious headers from an Element. Checks things like classnames and link density.
     *
     * @param DOMElement $e
-     * @return void
     */
    public function cleanHeaders($e)
    {
--- a/tests/ReadabilityTest.php
+++ b/tests/ReadabilityTest.php
@ -3,7 +3,6 @@
 namespace Tests\Readability;

 use Readability\Readability;
-use Readability\JSLikeHTMLElement;

 class ReadabilityTested extends Readability
 {