Merge pull request #69 from j0k3r/feature/php-7.2

Require PHP >= 7.2
pull/64/head
Jérémy Benoist 4 years ago committed by GitHub
commit 2e9349f076
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 6
      .github/workflows/coding-standards.yml
  2. 62
      .github/workflows/continuous-integration.yml
  3. 1
      .gitignore
  4. 12
      .scrutinizer.yml
  5. 2
      README.md
  6. 13
      composer.json
  7. 4
      phpstan.neon
  8. 2
      phpunit.xml.dist
  9. 227
      src/Readability.php
  10. 91
      tests/ReadabilityTest.php

@ -8,9 +8,6 @@ on:
branches: branches:
- master - master
env:
SYMFONY_PHPUNIT_VERSION: 7.5
jobs: jobs:
coding-standards: coding-standards:
name: "CS Fixer & PHPStan" name: "CS Fixer & PHPStan"
@ -35,9 +32,6 @@ jobs:
env: env:
COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: "Add PHPStan"
run: "composer require phpstan/phpstan phpstan/phpstan-phpunit --dev --no-progress --no-suggest"
- name: "Install dependencies with Composer" - name: "Install dependencies with Composer"
uses: "ramsey/composer-install@v1" uses: "ramsey/composer-install@v1"
with: with:

@ -19,13 +19,11 @@ jobs:
strategy: strategy:
matrix: matrix:
php: php:
- "5.6"
- "7.0"
- "7.1"
- "7.2" - "7.2"
- "7.3" - "7.3"
- "7.4" - "7.4"
- "8.0" - "8.0"
- "8.1"
steps: steps:
- name: "Checkout" - name: "Checkout"
@ -38,18 +36,12 @@ jobs:
with: with:
php-version: "${{ matrix.php }}" php-version: "${{ matrix.php }}"
coverage: "none" coverage: "none"
tools: composer:v1 tools: composer:v2
extensions: tidy extensions: tidy
ini-values: "date.timezone=Europe/Paris" ini-values: "date.timezone=Europe/Paris"
env: env:
COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: "Force PHPUnit version"
if: matrix.php >= '7.2'
run: "echo $SYMFONY_PHPUNIT_VERSION"
env:
SYMFONY_PHPUNIT_VERSION: 7.5
- name: "Remove useless deps" - name: "Remove useless deps"
run: "composer remove friendsofphp/php-cs-fixer --dev --no-progress --no-update" run: "composer remove friendsofphp/php-cs-fixer --dev --no-progress --no-update"
@ -84,7 +76,7 @@ jobs:
with: with:
php-version: "${{ matrix.php }}" php-version: "${{ matrix.php }}"
coverage: "xdebug" coverage: "xdebug"
tools: composer:v1 tools: composer:v2
extensions: tidy extensions: tidy
ini-values: "date.timezone=Europe/Paris" ini-values: "date.timezone=Europe/Paris"
env: env:
@ -103,8 +95,6 @@ jobs:
- name: "Run PHPUnit (with coverage)" - name: "Run PHPUnit (with coverage)"
run: "php vendor/bin/simple-phpunit -v --coverage-clover build/logs/clover.xml" run: "php vendor/bin/simple-phpunit -v --coverage-clover build/logs/clover.xml"
env:
SYMFONY_PHPUNIT_VERSION: 7.5
- name: "Retrieve Coveralls phar" - name: "Retrieve Coveralls phar"
run: "wget https://github.com/php-coveralls/php-coveralls/releases/download/v2.4.2/php-coveralls.phar" run: "wget https://github.com/php-coveralls/php-coveralls/releases/download/v2.4.2/php-coveralls.phar"
@ -126,49 +116,6 @@ jobs:
php: php:
- "7.2" - "7.2"
steps:
- name: "Checkout"
uses: "actions/checkout@v2"
with:
fetch-depth: 2
- name: "Install PHP"
uses: "shivammathur/setup-php@v2"
with:
php-version: "${{ matrix.php }}"
coverage: "none"
tools: composer:v1
extensions: tidy
ini-values: "date.timezone=Europe/Paris"
env:
COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: "Remove useless deps"
run: "composer remove friendsofphp/php-cs-fixer --dev --no-progress --no-update"
- name: "Install dependencies with Composer"
uses: "ramsey/composer-install@v1"
with:
composer-options: "--optimize-autoloader --prefer-dist"
dependency-versions: "lowest"
- name: "Setup logs"
run: "mkdir -p build/logs"
- name: "Run PHPUnit"
run: "php vendor/bin/simple-phpunit -v"
env:
SYMFONY_PHPUNIT_VERSION: 7.5
phpunit-composerv2:
name: "PHPUnit with Composer v2 (PHP ${{ matrix.php }})"
runs-on: "ubuntu-20.04"
strategy:
matrix:
php:
- "7.4"
steps: steps:
- name: "Checkout" - name: "Checkout"
uses: "actions/checkout@v2" uses: "actions/checkout@v2"
@ -193,11 +140,10 @@ jobs:
uses: "ramsey/composer-install@v1" uses: "ramsey/composer-install@v1"
with: with:
composer-options: "--optimize-autoloader --prefer-dist" composer-options: "--optimize-autoloader --prefer-dist"
dependency-versions: "lowest"
- name: "Setup logs" - name: "Setup logs"
run: "mkdir -p build/logs" run: "mkdir -p build/logs"
- name: "Run PHPUnit" - name: "Run PHPUnit"
run: "php vendor/bin/simple-phpunit -v" run: "php vendor/bin/simple-phpunit -v"
env:
SYMFONY_PHPUNIT_VERSION: 7.5

1
.gitignore vendored

@ -2,4 +2,5 @@ vendor/
coverage/ coverage/
composer.lock composer.lock
.php_cs.cache .php_cs.cache
.php-cs-fixer.cache
.phpunit.result.cache .phpunit.result.cache

@ -1,12 +0,0 @@
tools:
external_code_coverage: false
build:
nodes:
analysis:
tests:
override:
- php-scrutinizer-run
environment:
php:
version: 7.2

@ -5,7 +5,7 @@
[![Total Downloads](https://poser.pugx.org/j0k3r/php-readability/downloads)](https://packagist.org/packages/j0k3r/php-readability) [![Total Downloads](https://poser.pugx.org/j0k3r/php-readability/downloads)](https://packagist.org/packages/j0k3r/php-readability)
[![License](https://poser.pugx.org/j0k3r/php-readability/license)](https://packagist.org/packages/j0k3r/php-readability) [![License](https://poser.pugx.org/j0k3r/php-readability/license)](https://packagist.org/packages/j0k3r/php-readability)
This is an extract of the Readability class from this [full-text-rss](https://github.com/Dither/full-text-rss) fork. It can be defined as a better version of the original [php-readability](https://bitbucket.org/fivefilters/php-readability/overview). This is an extract of the Readability class from this [full-text-rss](https://github.com/Dither/full-text-rss) fork. It can be defined as a better version of the original [php-readability](https://bitbucket.org/fivefilters/php-readability).
## Differences ## Differences

@ -24,15 +24,17 @@
"role": "Developer (original JS version)" "role": "Developer (original JS version)"
}], }],
"require": { "require": {
"php": ">=5.6.0", "php": ">=7.2.0",
"ext-mbstring": "*", "ext-mbstring": "*",
"psr/log": "^1.0", "psr/log": "^1.0",
"masterminds/html5": "^2.7" "masterminds/html5": "^2.7"
}, },
"require-dev": { "require-dev": {
"friendsofphp/php-cs-fixer": "^2.14", "friendsofphp/php-cs-fixer": "^3.0",
"monolog/monolog": "^1.24|^2.1", "monolog/monolog": "^1.24|^2.1",
"symfony/phpunit-bridge": "^4.4|^5.3" "symfony/phpunit-bridge": "^4.4|^5.3|^6.0",
"phpstan/phpstan": "^1.3",
"phpstan/phpstan-phpunit": "^1.0"
}, },
"suggest": { "suggest": {
"ext-tidy": "Used to clean up given HTML and to avoid problems with bad HTML structure." "ext-tidy": "Used to clean up given HTML and to avoid problems with bad HTML structure."
@ -42,5 +44,10 @@
}, },
"autoload-dev": { "autoload-dev": {
"psr-4": { "Tests\\Readability\\": "tests/" } "psr-4": { "Tests\\Readability\\": "tests/" }
},
"config":{
"platform": {
"php": "7.2.34"
}
} }
} }

@ -6,7 +6,9 @@ parameters:
# https://github.com/phpstan/phpstan/issues/694#issuecomment-350724288 # https://github.com/phpstan/phpstan/issues/694#issuecomment-350724288
bootstrapFiles: bootstrapFiles:
- vendor/bin/.phpunit/phpunit-7.5-0/vendor/autoload.php - vendor/bin/.phpunit/phpunit-8.5-0/vendor/autoload.php
checkMissingIterableValueType: false
includes: includes:
- vendor/phpstan/phpstan-phpunit/extension.neon - vendor/phpstan/phpstan-phpunit/extension.neon

@ -11,7 +11,7 @@
> >
<testsuites> <testsuites>
<testsuite name="Readability Test Suite"> <testsuite name="Readability">
<directory>./tests/</directory> <directory>./tests/</directory>
</testsuite> </testsuite>
</testsuites> </testsuites>

@ -7,66 +7,23 @@ use Psr\Log\LoggerAwareInterface;
use Psr\Log\LoggerInterface; use Psr\Log\LoggerInterface;
use Psr\Log\NullLogger; use Psr\Log\NullLogger;
/**
* Arc90's Readability ported to PHP for FiveFilters.org
* Based on readability.js version 1.7.1 (without multi-page support)
* ------------------------------------------------------
* Original URL: http://lab.arc90.com/experiments/readability/js/readability.js
* Arc90's project URL: http://lab.arc90.com/experiments/readability/
* JS Source: http://code.google.com/p/arc90labs-readability
* Ported by: Keyvan Minoukadeh, http://www.keyvan.net
* Modded by: Dither, https://dithersky.wordpress.com
* More information: http://fivefilters.org/content-only/
* License: Apache License, Version 2.0
* Requires: PHP version 5.2.0+
* Date: 2013-08-02.
*
* Differences between the PHP port and the original
* ------------------------------------------------------
* Arc90's Readability is designed to run in the browser. It works on the DOM
* tree (the parsed HTML) after the page's CSS styles have been applied and
* Javascript code executed. This PHP port does not run inside a browser.
* We use PHP's ability to parse HTML to build our DOM tree, but we cannot
* rely on CSS or Javascript support. As such, the results will not always
* match Arc90's Readability. (For example, if a web page contains CSS style
* rules or Javascript code which hide certain HTML elements from display,
* Arc90's Readability will dismiss those from consideration but our PHP port,
* unable to understand CSS or Javascript, will not know any better.)
*
* Another significant difference is that the aim of Arc90's Readability is
* to re-present the main content block of a given web page so users can
* read it more easily in their browsers. Correct identification, clean up,
* and separation of the content block is only a part of this process.
* This PHP port is only concerned with this part, it does not include code
* that relates to presentation in the browser - Arc90 already do
* that extremely well, and for PDF output there's FiveFilters.org's
* PDF Newspaper: http://fivefilters.org/pdf-newspaper/.
*
* Finally, this class contains methods that might be useful for developers
* working on HTML document fragments. So without deviating too much from
* the original code (which I don't want to do because it makes debugging
* and updating more difficult), I've tried to make it a little more
* developer friendly. You should be able to use the methods here on
* existing DOMElement objects without passing an entire HTML document to
* be parsed.
*/
class Readability implements LoggerAwareInterface class Readability implements LoggerAwareInterface
{ {
// flags // flags
const FLAG_STRIP_UNLIKELYS = 1; public const FLAG_STRIP_UNLIKELYS = 1;
const FLAG_WEIGHT_ATTRIBUTES = 2; public const FLAG_WEIGHT_ATTRIBUTES = 2;
const FLAG_CLEAN_CONDITIONALLY = 4; public const FLAG_CLEAN_CONDITIONALLY = 4;
const FLAG_DISABLE_PREFILTER = 8; public const FLAG_DISABLE_PREFILTER = 8;
const FLAG_DISABLE_POSTFILTER = 16; public const FLAG_DISABLE_POSTFILTER = 16;
// constants // constants
const SCORE_CHARS_IN_PARAGRAPH = 100; public const SCORE_CHARS_IN_PARAGRAPH = 100;
const SCORE_WORDS_IN_PARAGRAPH = 20; public const SCORE_WORDS_IN_PARAGRAPH = 20;
const GRANDPARENT_SCORE_DIVISOR = 2; public const GRANDPARENT_SCORE_DIVISOR = 2;
const MIN_PARAGRAPH_LENGTH = 20; public const MIN_PARAGRAPH_LENGTH = 20;
const MIN_COMMAS_IN_PARAGRAPH = 6; public const MIN_COMMAS_IN_PARAGRAPH = 6;
const MIN_ARTICLE_LENGTH = 200; public const MIN_ARTICLE_LENGTH = 200;
const MIN_NODE_LENGTH = 80; public const MIN_NODE_LENGTH = 80;
const MAX_LINK_DENSITY = 0.25; public const MAX_LINK_DENSITY = 0.25;
public $convertLinksToFootnotes = false; public $convertLinksToFootnotes = false;
public $revertForcedParagraphElements = true; public $revertForcedParagraphElements = true;
public $articleTitle; public $articleTitle;
@ -171,26 +128,22 @@ class Readability implements LoggerAwareInterface
/** /**
* Create instance of Readability. * Create instance of Readability.
* *
* @param string $html UTF-8 encoded string * @param string $html UTF-8 encoded string
* @param string $url URL associated with HTML (for footnotes) * @param string $url URL associated with HTML (for footnotes)
* @param string $parser Which parser to use for turning raw HTML into a DOMDocument * @param string $parser Which parser to use for turning raw HTML into a DOMDocument
* @param bool $use_tidy Use tidy * @param bool $useTidy Use tidy
*/ */
public function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true) public function __construct(string $html, string $url = null, string $parser = 'libxml', bool $useTidy = true)
{ {
$this->url = $url; $this->url = $url;
$this->html = $html; $this->html = $html;
$this->parser = $parser; $this->parser = $parser;
$this->useTidy = $use_tidy && \function_exists('tidy_parse_string'); $this->useTidy = $useTidy && \function_exists('tidy_parse_string');
$this->logger = new NullLogger(); $this->logger = new NullLogger();
$this->loadHtml();
} }
/** public function setLogger(LoggerInterface $logger): void
* @return void
*/
public function setLogger(LoggerInterface $logger)
{ {
$this->logger = $logger; $this->logger = $logger;
} }
@ -221,7 +174,7 @@ class Readability implements LoggerAwareInterface
* @param string $filter RegExp for replace * @param string $filter RegExp for replace
* @param string $replacer Replacer * @param string $replacer Replacer
*/ */
public function addPreFilter($filter, $replacer = '') public function addPreFilter(string $filter, string $replacer = ''): void
{ {
$this->pre_filters[$filter] = $replacer; $this->pre_filters[$filter] = $replacer;
} }
@ -232,7 +185,7 @@ class Readability implements LoggerAwareInterface
* @param string $filter RegExp for replace * @param string $filter RegExp for replace
* @param string $replacer Replacer * @param string $replacer Replacer
*/ */
public function addPostFilter($filter, $replacer = '') public function addPostFilter(string $filter, string $replacer = ''): void
{ {
$this->post_filters[$filter] = $replacer; $this->post_filters[$filter] = $replacer;
} }
@ -249,8 +202,10 @@ class Readability implements LoggerAwareInterface
* *
* @return bool true if we found content, false otherwise * @return bool true if we found content, false otherwise
*/ */
public function init() public function init(): bool
{ {
$this->loadHtml();
if (!isset($this->dom->documentElement)) { if (!isset($this->dom->documentElement)) {
return false; return false;
} }
@ -315,7 +270,7 @@ class Readability implements LoggerAwareInterface
/** /**
* Run any post-process modifications to article content as necessary. * Run any post-process modifications to article content as necessary.
*/ */
public function postProcessContent(\DOMElement $articleContent) public function postProcessContent(\DOMElement $articleContent): void
{ {
if ($this->convertLinksToFootnotes && !preg_match('/\bwiki/', $this->url)) { if ($this->convertLinksToFootnotes && !preg_match('/\bwiki/', $this->url)) {
$this->addFootnotes($articleContent); $this->addFootnotes($articleContent);
@ -327,7 +282,7 @@ class Readability implements LoggerAwareInterface
* *
* @see http://www.roughtype.com/archives/2010/05/experiments_in.php * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
*/ */
public function addFootnotes(\DOMElement $articleContent) public function addFootnotes(\DOMElement $articleContent): void
{ {
$footnotesWrapper = $this->dom->createElement('footer'); $footnotesWrapper = $this->dom->createElement('footer');
$footnotesWrapper->setAttribute('class', 'readability-footnotes'); $footnotesWrapper->setAttribute('class', 'readability-footnotes');
@ -389,7 +344,7 @@ class Readability implements LoggerAwareInterface
* Prepare the article node for display. Clean out any inline styles, * Prepare the article node for display. Clean out any inline styles,
* iframes, forms, strip extraneous <p> tags, etc. * iframes, forms, strip extraneous <p> tags, etc.
*/ */
public function prepArticle(\DOMNode $articleContent) public function prepArticle(\DOMNode $articleContent): void
{ {
if (!$articleContent instanceof \DOMElement) { if (!$articleContent instanceof \DOMElement) {
return; return;
@ -491,10 +446,8 @@ class Readability implements LoggerAwareInterface
* @param \DOMElement $e * @param \DOMElement $e
* @param bool $normalizeSpaces (default: true) * @param bool $normalizeSpaces (default: true)
* @param bool $flattenLines (default: false) * @param bool $flattenLines (default: false)
*
* @return string
*/ */
public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false) public function getInnerText($e, bool $normalizeSpaces = true, bool $flattenLines = false): string
{ {
if (null === $e || !isset($e->textContent) || '' === $e->textContent) { if (null === $e || !isset($e->textContent) || '' === $e->textContent) {
return ''; return '';
@ -503,9 +456,11 @@ class Readability implements LoggerAwareInterface
$textContent = trim($e->textContent); $textContent = trim($e->textContent);
if ($flattenLines) { if ($flattenLines) {
$textContent = mb_ereg_replace('(?:[\r\n](?:\s|&nbsp;)*)+', '', $textContent); return (string) mb_ereg_replace('(?:[\r\n](?:\s|&nbsp;)*)+', '', $textContent);
} elseif ($normalizeSpaces) { }
$textContent = mb_ereg_replace('\s\s+', ' ', $textContent);
if ($normalizeSpaces) {
return (string) mb_ereg_replace('\s\s+', ' ', $textContent);
} }
return $textContent; return $textContent;
@ -513,30 +468,22 @@ class Readability implements LoggerAwareInterface
/** /**
* Remove the style attribute on every $e and under. * Remove the style attribute on every $e and under.
*
* @param \DOMElement $e
*/ */
public function cleanStyles($e) public function cleanStyles(\DOMElement $e): void
{ {
if (!\is_object($e)) { if (\is_object($e)) {
return; $elems = $e->getElementsByTagName('*');
}
$elems = $e->getElementsByTagName('*');
foreach ($elems as $elem) { foreach ($elems as $elem) {
$elem->removeAttribute('style'); $elem->removeAttribute('style');
}
} }
} }
/** /**
* Get comma number for a given text. * Get comma number for a given text.
*
* @param string $text
*
* @return int
*/ */
public function getCommaCount($text) public function getCommaCount(string $text): int
{ {
return substr_count($text, ','); return substr_count($text, ',');
} }
@ -544,12 +491,8 @@ class Readability implements LoggerAwareInterface
/** /**
* Get words number for a given text if words separated by a space. * Get words number for a given text if words separated by a space.
* Input string should be normalized. * Input string should be normalized.
*
* @param string $text
*
* @return int
*/ */
public function getWordCount($text) public function getWordCount(string $text): int
{ {
return substr_count($text, ' '); return substr_count($text, ' ');
} }
@ -558,12 +501,8 @@ class Readability implements LoggerAwareInterface
* Get the density of links as a percentage of the content * Get the density of links as a percentage of the content
* This is the amount of text that is inside a link divided by the total text in the node. * This is the amount of text that is inside a link divided by the total text in the node.
* Can exclude external references to differentiate between simple text and menus/infoblocks. * Can exclude external references to differentiate between simple text and menus/infoblocks.
*
* @param bool $excludeExternal
*
* @return int
*/ */
public function getLinkDensity(\DOMElement $e, $excludeExternal = false) public function getLinkDensity(\DOMElement $e, bool $excludeExternal = false): float
{ {
$links = $e->getElementsByTagName('a'); $links = $e->getElementsByTagName('a');
$textLength = mb_strlen($this->getInnerText($e, true, true)); $textLength = mb_strlen($this->getInnerText($e, true, true));
@ -585,10 +524,8 @@ class Readability implements LoggerAwareInterface
/** /**
* Get an element relative weight. * Get an element relative weight.
*
* @return int
*/ */
public function getWeight(\DOMElement $e) public function getWeight(\DOMElement $e): int
{ {
if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) { if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
return 0; return 0;
@ -606,7 +543,7 @@ class Readability implements LoggerAwareInterface
/** /**
* Remove extraneous break tags from a node. * Remove extraneous break tags from a node.
*/ */
public function killBreaks(\DOMElement $node) public function killBreaks(\DOMElement $node): void
{ {
$html = $node->getInnerHTML(); $html = $node->getInnerHTML();
$html = preg_replace($this->regexps['killBreaks'], '<br />', $html); $html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
@ -618,10 +555,8 @@ class Readability implements LoggerAwareInterface
* (Unless it's a youtube/vimeo video. People love movies.). * (Unless it's a youtube/vimeo video. People love movies.).
* *
* Updated 2012-09-18 to preserve youtube/vimeo iframes * Updated 2012-09-18 to preserve youtube/vimeo iframes
*
* @param string $tag
*/ */
public function clean(\DOMElement $e, $tag) public function clean(\DOMElement $e, string $tag): void
{ {
$targetList = $e->getElementsByTagName($tag); $targetList = $e->getElementsByTagName($tag);
$isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag); $isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag);
@ -652,10 +587,8 @@ class Readability implements LoggerAwareInterface
* Clean an element of all tags of type "tag" if they look fishy. * Clean an element of all tags of type "tag" if they look fishy.
* "Fishy" is an algorithm based on content length, classnames, * "Fishy" is an algorithm based on content length, classnames,
* link density, number of images & embeds, etc. * link density, number of images & embeds, etc.
*
* @param string $tag
*/ */
public function cleanConditionally(\DOMElement $e, $tag) public function cleanConditionally(\DOMElement $e, string $tag): void
{ {
if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
return; return;
@ -765,7 +698,7 @@ class Readability implements LoggerAwareInterface
/** /**
* Clean out spurious headers from an Element. Checks things like classnames and link density. * Clean out spurious headers from an Element. Checks things like classnames and link density.
*/ */
public function cleanHeaders(\DOMElement $e) public function cleanHeaders(\DOMElement $e): void
{ {
for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) { for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
$headers = $e->getElementsByTagName('h' . $headerIndex); $headers = $e->getElementsByTagName('h' . $headerIndex);
@ -780,57 +713,28 @@ class Readability implements LoggerAwareInterface
/** /**
* Check if the given flag is active. * Check if the given flag is active.
*
* @param int $flag
*
* @return bool
*/ */
public function flagIsActive($flag) public function flagIsActive(int $flag): bool
{ {
return ($this->flags & $flag) > 0; return ($this->flags & $flag) > 0;
} }
/** /**
* Add a flag. * Add a flag.
*
* @param int $flag
*/ */
public function addFlag($flag) public function addFlag(int $flag): void
{ {
$this->flags = $this->flags | $flag; $this->flags = $this->flags | $flag;
} }
/** /**
* Remove a flag. * Remove a flag.
*
* @param int $flag
*/ */
public function removeFlag($flag) public function removeFlag(int $flag): void
{ {
$this->flags = $this->flags & ~$flag; $this->flags = $this->flags & ~$flag;
} }
/**
* Debug.
*
* @deprecated use $this->logger->debug() instead
* @codeCoverageIgnore
*/
protected function dbg($msg)
{
$this->logger->debug($msg);
}
/**
* Dump debug info.
*
* @deprecated since Monolog gather log, we don't need it
* @codeCoverageIgnore
*/
protected function dump_dbg()
{
}
/** /**
* Get the article title as an H1. * Get the article title as an H1.
* *
@ -877,7 +781,7 @@ class Readability implements LoggerAwareInterface
* Prepare the HTML document for readability to scrape it. * Prepare the HTML document for readability to scrape it.
* This includes things like stripping javascript, CSS, and handling terrible markup. * This includes things like stripping javascript, CSS, and handling terrible markup.
*/ */
protected function prepDocument() protected function prepDocument(): void
{ {
/* /*
* In some cases a body element can't be found (if the HTML is totally hosed for example) * In some cases a body element can't be found (if the HTML is totally hosed for example)
@ -906,7 +810,7 @@ class Readability implements LoggerAwareInterface
* Initialize a node with the readability object. Also checks the * Initialize a node with the readability object. Also checks the
* className/id for special names to add to its score. * className/id for special names to add to its score.
*/ */
protected function initializeNode(\DOMElement $node) protected function initializeNode(\DOMElement $node): void
{ {
if (!isset($node->tagName)) { if (!isset($node->tagName)) {
return; return;
@ -993,7 +897,8 @@ class Readability implements LoggerAwareInterface
$allElements = $page->getElementsByTagName('*'); $allElements = $page->getElementsByTagName('*');
for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); ++$nodeIndex) { for ($nodeIndex = 0; $allElements->item($nodeIndex); ++$nodeIndex) {
$node = $allElements->item($nodeIndex);
$tagName = $node->tagName; $tagName = $node->tagName;
$nodeContent = $node->getInnerHTML(); $nodeContent = $node->getInnerHTML();
@ -1107,7 +1012,7 @@ class Readability implements LoggerAwareInterface
$contentScore += max(min($score, 3), -3);/**/ $contentScore += max(min($score, 3), -3);/**/
// Add the score to the parent. The grandparent gets half. // Add the score to the parent. The grandparent gets half.
$parentNode->getAttributeNode('readability')->value += $contentScore; $parentNode->getAttributeNode('readability')->value = ((float) $parentNode->getAttributeNode('readability')->value) + $contentScore;
if ($grandParentNode) { if ($grandParentNode) {
$grandParentNode->getAttributeNode('readability')->value += round($contentScore / self::GRANDPARENT_SCORE_DIVISOR); $grandParentNode->getAttributeNode('readability')->value += round($contentScore / self::GRANDPARENT_SCORE_DIVISOR);
} }
@ -1228,7 +1133,7 @@ class Readability implements LoggerAwareInterface
$siblingScoreThreshold = max(10, ((int) $topCandidate->getAttribute('readability')) * 0.2); $siblingScoreThreshold = max(10, ((int) $topCandidate->getAttribute('readability')) * 0.2);
$siblingNodes = $topCandidate->parentNode->childNodes; $siblingNodes = $topCandidate->parentNode->childNodes;
if (null === $siblingNodes) { if (0 === $siblingNodes->length) {
$siblingNodes = new \stdClass(); $siblingNodes = new \stdClass();
$siblingNodes->length = 0; $siblingNodes->length = 0;
} }
@ -1260,7 +1165,7 @@ class Readability implements LoggerAwareInterface
$nodeLength = mb_strlen($nodeContent); $nodeLength = mb_strlen($nodeContent);
if (($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY) if (($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY)
|| ($nodeLength < self::MIN_NODE_LENGTH && 0 === $linkDensity && preg_match('/\.( |$)/', $nodeContent))) { || ($nodeLength < self::MIN_NODE_LENGTH && 0 === (int) $linkDensity && preg_match('/\.( |$)/', $nodeContent))) {
$append = true; $append = true;
} }
} }
@ -1337,12 +1242,8 @@ class Readability implements LoggerAwareInterface
/** /**
* Get an element weight by attribute. * Get an element weight by attribute.
* Uses regular expressions to tell if this element looks good or bad. * Uses regular expressions to tell if this element looks good or bad.
*
* @param string $attribute
*
* @return int
*/ */
protected function weightAttribute(\DOMElement $element, $attribute) protected function weightAttribute(\DOMElement $element, string $attribute): int
{ {
if (!$element->hasAttribute($attribute)) { if (!$element->hasAttribute($attribute)) {
return 0; return 0;
@ -1373,7 +1274,7 @@ class Readability implements LoggerAwareInterface
/** /**
* Will recreate previously deleted body property. * Will recreate previously deleted body property.
*/ */
protected function reinitBody() protected function reinitBody(): void
{ {
if (!isset($this->body->childNodes)) { if (!isset($this->body->childNodes)) {
$this->body = $this->dom->createElement('body'); $this->body = $this->dom->createElement('body');
@ -1385,17 +1286,15 @@ class Readability implements LoggerAwareInterface
* Load HTML in a DOMDocument. * Load HTML in a DOMDocument.
* Apply Pre filters * Apply Pre filters
* Cleanup HTML using Tidy (or not). * Cleanup HTML using Tidy (or not).
*
* @todo This should be called in init() instead of from __construct
*/ */
private function loadHtml() private function loadHtml(): void
{ {
$this->original_html = $this->html; $this->original_html = $this->html;
$this->logger->debug('Parsing URL: ' . $this->url); $this->logger->debug('Parsing URL: ' . $this->url);
if ($this->url) { if ($this->url) {
$this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, \PHP_URL_HOST)), ['.' => '\.']) . '/'; $this->domainRegExp = '/' . strtr((string) preg_replace('/www\d*\./', '', (string) parse_url($this->url, \PHP_URL_HOST)), ['.' => '\.']) . '/';
} }
mb_internal_encoding('UTF-8'); mb_internal_encoding('UTF-8');
@ -1431,7 +1330,7 @@ class Readability implements LoggerAwareInterface
unset($tidy); unset($tidy);
} }
$this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8'); $this->html = mb_convert_encoding((string) $this->html, 'HTML-ENTITIES', 'UTF-8');
if ('html5lib' === $this->parser || 'html5' === $this->parser) { if ('html5lib' === $this->parser || 'html5' === $this->parser) {
$this->dom = (new HTML5())->loadHTML($this->html); $this->dom = (new HTML5())->loadHTML($this->html);

@ -4,27 +4,32 @@ namespace Tests\Readability;
use Monolog\Handler\TestHandler; use Monolog\Handler\TestHandler;
use Monolog\Logger; use Monolog\Logger;
use Psr\Log\LoggerInterface;
use Readability\Readability; use Readability\Readability;
class ReadabilityTest extends \PHPUnit\Framework\TestCase class ReadabilityTest extends \PHPUnit\Framework\TestCase
{ {
/** @var TestHandler */
public $logHandler; public $logHandler;
/** @var LoggerInterface */
public $logger; public $logger;
/** /**
* @requires extension tidy * @requires extension tidy
*/ */
public function testConstructDefault() public function testConstructDefault(): void
{ {
$readability = $this->getReadability(''); $readability = $this->getReadability('');
$readability->init();
$this->assertNull($readability->url); $this->assertNull($readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom); $this->assertInstanceOf('DomDocument', $readability->dom);
} }
public function testConstructHtml5Parser() public function testConstructHtml5Parser(): void
{ {
$readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'html5lib'); $readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'html5lib');
$readability->init();
$this->assertSame('http://0.0.0.0', $readability->url); $this->assertSame('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom); $this->assertInstanceOf('DomDocument', $readability->dom);
@ -34,9 +39,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
/** /**
* @requires extension tidy * @requires extension tidy
*/ */
public function testConstructSimple() public function testConstructSimple(): void
{ {
$readability = $this->getReadability('<html/>', 'http://0.0.0.0'); $readability = $this->getReadability('<html/>', 'http://0.0.0.0');
$readability->init();
$this->assertSame('http://0.0.0.0', $readability->url); $this->assertSame('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom); $this->assertInstanceOf('DomDocument', $readability->dom);
@ -44,9 +50,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertTrue($readability->tidied); $this->assertTrue($readability->tidied);
} }
public function testConstructDefaultWithoutTidy() public function testConstructDefaultWithoutTidy(): void
{ {
$readability = $this->getReadability('', null, 'libxml', false); $readability = $this->getReadability('', null, 'libxml', false);
$readability->init();
$this->assertNull($readability->url); $this->assertNull($readability->url);
$this->assertSame('', $readability->original_html); $this->assertSame('', $readability->original_html);
@ -55,9 +62,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertInstanceOf('DomDocument', $readability->dom); $this->assertInstanceOf('DomDocument', $readability->dom);
} }
public function testConstructSimpleWithoutTidy() public function testConstructSimpleWithoutTidy(): void
{ {
$readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'libxml', false); $readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'libxml', false);
$readability->init();
$this->assertSame('http://0.0.0.0', $readability->url); $this->assertSame('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom); $this->assertInstanceOf('DomDocument', $readability->dom);
@ -65,7 +73,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertFalse($readability->tidied); $this->assertFalse($readability->tidied);
} }
public function testInitNoContent() public function testInitNoContent(): void
{ {
$readability = $this->getReadability('<html/>', 'http://0.0.0.0'); $readability = $this->getReadability('<html/>', 'http://0.0.0.0');
$res = $readability->init(); $res = $readability->init();
@ -77,7 +85,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('Sorry, Readability was unable to parse this page for content.', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('Sorry, Readability was unable to parse this page for content.', $readability->getContent()->getInnerHtml());
} }
public function testInitP() public function testInitP(): void
{ {
$readability = $this->getReadability(str_repeat('<p>This is the awesome content :)</p>', 7), 'http://0.0.0.0'); $readability = $this->getReadability(str_repeat('<p>This is the awesome content :)</p>', 7), 'http://0.0.0.0');
$res = $readability->init(); $res = $readability->init();
@ -90,7 +98,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('This is the awesome content :)', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is the awesome content :)', $readability->getContent()->getInnerHtml());
} }
public function testInitDivP() public function testInitDivP(): void
{ {
$readability = $this->getReadability('<div>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</div>', 'http://0.0.0.0');
$res = $readability->init(); $res = $readability->init();
@ -103,7 +111,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('This is the awesome content :)', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is the awesome content :)', $readability->getContent()->getInnerHtml());
} }
public function testInitDiv() public function testInitDiv(): void
{ {
$readability = $this->getReadability('<div>' . str_repeat('This is the awesome content :)', 7) . '</div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div>' . str_repeat('This is the awesome content :)', 7) . '</div>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -117,7 +125,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('This is the awesome content :)', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is the awesome content :)', $readability->getContent()->getInnerHtml());
} }
public function testWithFootnotes() public function testWithFootnotes(): void
{ {
$readability = $this->getReadability('<div>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '</div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '</div>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -134,7 +142,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('readabilityLink-3', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('readabilityLink-3', $readability->getContent()->getInnerHtml());
} }
public function testStandardClean() public function testStandardClean(): void
{ {
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<a href="#nofollow" rel="nofollow">will NOT be removed</a></div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<a href="#nofollow" rel="nofollow">will NOT be removed</a></div>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -151,7 +159,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringNotContainsString('<h2>', $readability->getContent()->getInnerHtml()); $this->assertStringNotContainsString('<h2>', $readability->getContent()->getInnerHtml());
} }
public function testWithIframe() public function testWithIframe(): void
{ {
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -166,7 +174,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('nofollow', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('nofollow', $readability->getContent()->getInnerHtml());
} }
public function testWithArticle() public function testWithArticle(): void
{ {
$readability = $this->getReadability('<article><p>' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '</p><p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article><p>' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '</p><p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe></p></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -181,7 +189,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('nofollow', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('nofollow', $readability->getContent()->getInnerHtml());
} }
public function testWithAside() public function testWithAside(): void
{ {
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<footer><aside>' . str_repeat('<p>This is an awesome text with some links, here there are</p>', 8) . '</aside></footer></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<footer><aside>' . str_repeat('<p>This is an awesome text with some links, here there are</p>', 8) . '</aside></footer></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -196,7 +204,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('<footer readability="5"/>', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('<footer readability="5"/>', $readability->getContent()->getInnerHtml());
} }
public function testWithClasses() public function testWithClasses(): void
{ {
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -211,7 +219,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringNotContainsString('This text should be removed', $readability->getContent()->getInnerHtml()); $this->assertStringNotContainsString('This text should be removed', $readability->getContent()->getInnerHtml());
} }
public function testWithClassesWithoutLightClean() public function testWithClassesWithoutLightClean(): void
{ {
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -227,7 +235,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringNotContainsString('This text should be removed', $readability->getContent()->getInnerHtml()); $this->assertStringNotContainsString('This text should be removed', $readability->getContent()->getInnerHtml());
} }
public function testWithTd() public function testWithTd(): void
{ {
$readability = $this->getReadability('<table><tr>' . str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7) . '</tr></table>', 'http://0.0.0.0'); $readability = $this->getReadability('<table><tr>' . str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7) . '</tr></table>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -240,7 +248,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
} }
public function testWithSameClasses() public function testWithSameClasses(): void
{ {
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<div class="awesomecontent">This text is also an awesome text and you should know that !</div></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<div class="awesomecontent">This text is also an awesome text and you should know that !</div></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -254,7 +262,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
} }
public function testWithScript() public function testWithScript(): void
{ {
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p><script>This text is also an awesome text and you should know that !</script></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p><script>This text is also an awesome text and you should know that !</script></p></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -268,7 +276,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml()); $this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
} }
public function testTitle() public function testTitle(): void
{ {
$readability = $this->getReadability('<title>this is my title</title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<title>this is my title</title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -282,7 +290,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml()); $this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
} }
public function testTitleWithDash() public function testTitleWithDash(): void
{ {
$readability = $this->getReadability('<title> title2 - title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<title> title2 - title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -296,7 +304,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml()); $this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
} }
public function testTitleWithDoubleDot() public function testTitleWithDoubleDot(): void
{ {
$readability = $this->getReadability('<title> title2 : title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<title> title2 : title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -310,7 +318,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml()); $this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
} }
public function testTitleTooShortUseH1() public function testTitleTooShortUseH1(): void
{ {
$readability = $this->getReadability('<title>too short</title><h1>this is my h1 title !</h1><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<title>too short</title><h1>this is my h1 title !</h1><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -324,17 +332,14 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml()); $this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
} }
// dummy function to be used to the next test public function testAutoClosingIframeNotThrowingException(): void
public function error2Exception($code, $string, $file, $line, $context)
{
throw new \Exception($string, $code);
}
public function testAutoClosingIframeNotThrowingException()
{ {
error_reporting(\E_ALL | \E_STRICT); error_reporting(\E_ALL | \E_STRICT);
ini_set('display_errors', true); ini_set('display_errors', '1');
set_error_handler([$this, 'error2Exception'], \E_ALL | \E_STRICT); // dummy function to be used to the next test
set_error_handler(function (int $errno, string $errstr, string $errfile, int $errline, array $errcontext) {
throw new \Exception($errstr, $errno);
}, \E_ALL | \E_STRICT);
$data = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> $data = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="ru-RU" prefix="og: http://ogp.me/ns#"> <html xmlns="http://www.w3.org/1999/xhtml" lang="ru-RU" prefix="og: http://ogp.me/ns#">
@ -376,7 +381,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
/** /**
* This should generate an Exception "DOMElement::setAttribute(): ID post-60 already defined". * This should generate an Exception "DOMElement::setAttribute(): ID post-60 already defined".
*/ */
public function testAppendIdAlreadyHere() public function testAppendIdAlreadyHere(): void
{ {
$data = '<!DOCTYPE html> $data = '<!DOCTYPE html>
<html lang="fr"> <html lang="fr">
@ -432,7 +437,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle()); $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
} }
public function testPostFilters() public function testPostFilters(): void
{ {
$readability = $this->getReadability('<div>' . str_repeat('<p>This <strong>is</strong> the awesome content :)</p>', 10) . '</div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div>' . str_repeat('<p>This <strong>is</strong> the awesome content :)</p>', 10) . '</div>', 'http://0.0.0.0');
$readability->addPostFilter('!<strong[^>]*>(.*?)</strong>!is', ''); $readability->addPostFilter('!<strong[^>]*>(.*?)</strong>!is', '');
@ -443,10 +448,8 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('This the awesome content :)', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This the awesome content :)', $readability->getContent()->getInnerHtml());
} }
public function testPreFilters() public function testPreFilters(): void
{ {
$this->markTestSkipped('Won\'t work until loadHtml() is moved in init() instead of __construct()');
$readability = $this->getReadability('<div>' . str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7) . '</div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div>' . str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7) . '</div>', 'http://0.0.0.0');
$readability->addPreFilter('!<b[^>]*>(.*?)</b>!is', ''); $readability->addPreFilter('!<b[^>]*>(.*?)</b>!is', '');
@ -456,10 +459,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('This the awesome and WONDERFUL content :)', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('This the awesome and WONDERFUL content :)', $readability->getContent()->getInnerHtml());
} }
public function testChildNodeGoneNull() public function testChildNodeGoneNull(): void
{ {
// from http://www.ayyaantuu.net/ethiopia-targets-opposition-lawmakers/ // from http://www.ayyaantuu.net/ethiopia-targets-opposition-lawmakers/
$html = file_get_contents('tests/fixtures/childNodeGoesNull.html'); $html = (string) file_get_contents('tests/fixtures/childNodeGoesNull.html');
$readability = $this->getReadability($html, 'http://0.0.0.0'); $readability = $this->getReadability($html, 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -469,10 +472,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertTrue($res); $this->assertTrue($res);
} }
public function testKeepFootnotes() public function testKeepFootnotes(): void
{ {
// from https://www.schreibdichte.de/blog/feed-aggregator-und-spaeter-lesen-dienst-im-team // from https://www.schreibdichte.de/blog/feed-aggregator-und-spaeter-lesen-dienst-im-team
$html = file_get_contents('tests/fixtures/keepFootnotes.html'); $html = (string) file_get_contents('tests/fixtures/keepFootnotes.html');
$readability = $this->getReadability($html, 'http://0.0.0.0'); $readability = $this->getReadability($html, 'http://0.0.0.0');
$readability->debug = true; $readability->debug = true;
@ -483,10 +486,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('<a href="#fnref1:fnfeed_2" rev="footnote"', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('<a href="#fnref1:fnfeed_2" rev="footnote"', $readability->getContent()->getInnerHtml());
} }
public function testWithWipedBody() public function testWithWipedBody(): void
{ {
// from https://www.cs.cmu.edu/~rgs/alice-table.html // from https://www.cs.cmu.edu/~rgs/alice-table.html
$html = file_get_contents('tests/fixtures/wipedBody.html'); $html = (string) file_get_contents('tests/fixtures/wipedBody.html');
$readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false); $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false);
$readability->debug = true; $readability->debug = true;
@ -496,7 +499,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('<a href="alice-I.html">Down the Rabbit-Hole</a>', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('<a href="alice-I.html">Down the Rabbit-Hole</a>', $readability->getContent()->getInnerHtml());
} }
private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true) private function getReadability(string $html, string $url = null, string $parser = 'libxml', bool $useTidy = true): Readability
{ {
$readability = new Readability($html, $url, $parser, $useTidy); $readability = new Readability($html, $url, $parser, $useTidy);

Loading…
Cancel
Save