diff --git a/.github/workflows/coding-standards.yml b/.github/workflows/coding-standards.yml
index cd666ae..f26e7d4 100644
--- a/.github/workflows/coding-standards.yml
+++ b/.github/workflows/coding-standards.yml
@@ -8,9 +8,6 @@ on:
branches:
- master
-env:
- SYMFONY_PHPUNIT_VERSION: 7.5
-
jobs:
coding-standards:
name: "CS Fixer & PHPStan"
@@ -35,9 +32,6 @@ jobs:
env:
COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- - name: "Add PHPStan"
- run: "composer require phpstan/phpstan phpstan/phpstan-phpunit --dev --no-progress --no-suggest"
-
- name: "Install dependencies with Composer"
uses: "ramsey/composer-install@v1"
with:
diff --git a/.github/workflows/continuous-integration.yml b/.github/workflows/continuous-integration.yml
index 1424248..5f9bee1 100644
--- a/.github/workflows/continuous-integration.yml
+++ b/.github/workflows/continuous-integration.yml
@@ -19,13 +19,11 @@ jobs:
strategy:
matrix:
php:
- - "5.6"
- - "7.0"
- - "7.1"
- "7.2"
- "7.3"
- "7.4"
- "8.0"
+ - "8.1"
steps:
- name: "Checkout"
@@ -38,18 +36,12 @@ jobs:
with:
php-version: "${{ matrix.php }}"
coverage: "none"
- tools: composer:v1
+ tools: composer:v2
extensions: tidy
ini-values: "date.timezone=Europe/Paris"
env:
COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- - name: "Force PHPUnit version"
- if: matrix.php >= '7.2'
- run: "echo $SYMFONY_PHPUNIT_VERSION"
- env:
- SYMFONY_PHPUNIT_VERSION: 7.5
-
- name: "Remove useless deps"
run: "composer remove friendsofphp/php-cs-fixer --dev --no-progress --no-update"
@@ -84,7 +76,7 @@ jobs:
with:
php-version: "${{ matrix.php }}"
coverage: "xdebug"
- tools: composer:v1
+ tools: composer:v2
extensions: tidy
ini-values: "date.timezone=Europe/Paris"
env:
@@ -103,8 +95,6 @@ jobs:
- name: "Run PHPUnit (with coverage)"
run: "php vendor/bin/simple-phpunit -v --coverage-clover build/logs/clover.xml"
- env:
- SYMFONY_PHPUNIT_VERSION: 7.5
- name: "Retrieve Coveralls phar"
run: "wget https://github.com/php-coveralls/php-coveralls/releases/download/v2.4.2/php-coveralls.phar"
@@ -126,49 +116,6 @@ jobs:
php:
- "7.2"
- steps:
- - name: "Checkout"
- uses: "actions/checkout@v2"
- with:
- fetch-depth: 2
-
- - name: "Install PHP"
- uses: "shivammathur/setup-php@v2"
- with:
- php-version: "${{ matrix.php }}"
- coverage: "none"
- tools: composer:v1
- extensions: tidy
- ini-values: "date.timezone=Europe/Paris"
- env:
- COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
- - name: "Remove useless deps"
- run: "composer remove friendsofphp/php-cs-fixer --dev --no-progress --no-update"
-
- - name: "Install dependencies with Composer"
- uses: "ramsey/composer-install@v1"
- with:
- composer-options: "--optimize-autoloader --prefer-dist"
- dependency-versions: "lowest"
-
- - name: "Setup logs"
- run: "mkdir -p build/logs"
-
- - name: "Run PHPUnit"
- run: "php vendor/bin/simple-phpunit -v"
- env:
- SYMFONY_PHPUNIT_VERSION: 7.5
-
- phpunit-composerv2:
- name: "PHPUnit with Composer v2 (PHP ${{ matrix.php }})"
- runs-on: "ubuntu-20.04"
-
- strategy:
- matrix:
- php:
- - "7.4"
-
steps:
- name: "Checkout"
uses: "actions/checkout@v2"
@@ -193,11 +140,10 @@ jobs:
uses: "ramsey/composer-install@v1"
with:
composer-options: "--optimize-autoloader --prefer-dist"
+ dependency-versions: "lowest"
- name: "Setup logs"
run: "mkdir -p build/logs"
- name: "Run PHPUnit"
run: "php vendor/bin/simple-phpunit -v"
- env:
- SYMFONY_PHPUNIT_VERSION: 7.5
diff --git a/.gitignore b/.gitignore
index 160e7cc..65afc77 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,5 @@ vendor/
coverage/
composer.lock
.php_cs.cache
+.php-cs-fixer.cache
.phpunit.result.cache
diff --git a/README.md b/README.md
index e3f3cf7..d285e70 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
[](https://packagist.org/packages/j0k3r/php-readability)
[](https://packagist.org/packages/j0k3r/php-readability)
-This is an extract of the Readability class from this [full-text-rss](https://github.com/Dither/full-text-rss) fork. It can be defined as a better version of the original [php-readability](https://bitbucket.org/fivefilters/php-readability/overview).
+This is an extract of the Readability class from this [full-text-rss](https://github.com/Dither/full-text-rss) fork. It can be defined as a better version of the original [php-readability](https://bitbucket.org/fivefilters/php-readability).
## Differences
diff --git a/composer.json b/composer.json
index d32d576..5436e9e 100644
--- a/composer.json
+++ b/composer.json
@@ -24,15 +24,17 @@
"role": "Developer (original JS version)"
}],
"require": {
- "php": ">=5.6.0",
+ "php": ">=7.2.0",
"ext-mbstring": "*",
"psr/log": "^1.0",
"masterminds/html5": "^2.7"
},
"require-dev": {
- "friendsofphp/php-cs-fixer": "^2.14",
+ "friendsofphp/php-cs-fixer": "^3.0",
"monolog/monolog": "^1.24|^2.1",
- "symfony/phpunit-bridge": "^4.4|^5.3"
+ "symfony/phpunit-bridge": "^4.4|^5.3|^6.0",
+ "phpstan/phpstan": "^1.3",
+ "phpstan/phpstan-phpunit": "^1.0"
},
"suggest": {
"ext-tidy": "Used to clean up given HTML and to avoid problems with bad HTML structure."
@@ -42,5 +44,10 @@
},
"autoload-dev": {
"psr-4": { "Tests\\Readability\\": "tests/" }
+ },
+ "config":{
+ "platform": {
+ "php": "7.2.34"
+ }
}
}
diff --git a/phpstan.neon b/phpstan.neon
index 7c1f51c..4f4a583 100644
--- a/phpstan.neon
+++ b/phpstan.neon
@@ -6,7 +6,9 @@ parameters:
# https://github.com/phpstan/phpstan/issues/694#issuecomment-350724288
bootstrapFiles:
- - vendor/bin/.phpunit/phpunit-7.5-0/vendor/autoload.php
+ - vendor/bin/.phpunit/phpunit-8.5-0/vendor/autoload.php
+
+ checkMissingIterableValueType: false
includes:
- vendor/phpstan/phpstan-phpunit/extension.neon
diff --git a/phpunit.xml.dist b/phpunit.xml.dist
index 342a478..75d1741 100644
--- a/phpunit.xml.dist
+++ b/phpunit.xml.dist
@@ -11,7 +11,7 @@
>
tags, etc.
*/
- public function prepArticle(\DOMNode $articleContent)
+ public function prepArticle(\DOMNode $articleContent): void
{
if (!$articleContent instanceof \DOMElement) {
return;
@@ -491,10 +489,8 @@ class Readability implements LoggerAwareInterface
* @param \DOMElement $e
* @param bool $normalizeSpaces (default: true)
* @param bool $flattenLines (default: false)
- *
- * @return string
*/
- public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false)
+ public function getInnerText($e, bool $normalizeSpaces = true, bool $flattenLines = false): string
{
if (null === $e || !isset($e->textContent) || '' === $e->textContent) {
return '';
@@ -503,9 +499,11 @@ class Readability implements LoggerAwareInterface
$textContent = trim($e->textContent);
if ($flattenLines) {
- $textContent = mb_ereg_replace('(?:[\r\n](?:\s|&nbsp;)*)+', '', $textContent);
- } elseif ($normalizeSpaces) {
- $textContent = mb_ereg_replace('\s\s+', ' ', $textContent);
+ return (string) mb_ereg_replace('(?:[\r\n](?:\s|&nbsp;)*)+', '', $textContent);
+ }
+
+ if ($normalizeSpaces) {
+ return (string) mb_ereg_replace('\s\s+', ' ', $textContent);
}
return $textContent;
@@ -513,30 +511,22 @@ class Readability implements LoggerAwareInterface
/**
* Remove the style attribute on every $e and under.
- *
- * @param \DOMElement $e
*/
- public function cleanStyles($e)
+ public function cleanStyles(\DOMElement $e): void
{
- if (!\is_object($e)) {
- return;
- }
-
- $elems = $e->getElementsByTagName('*');
+ if (\is_object($e)) {
+ $elems = $e->getElementsByTagName('*');
- foreach ($elems as $elem) {
- $elem->removeAttribute('style');
+ foreach ($elems as $elem) {
+ $elem->removeAttribute('style');
+ }
}
}
/**
* Get comma number for a given text.
- *
- * @param string $text
- *
- * @return int
*/
- public function getCommaCount($text)
+ public function getCommaCount(string $text): int
{
return substr_count($text, ',');
}
@@ -544,12 +534,8 @@ class Readability implements LoggerAwareInterface
/**
* Get words number for a given text if words separated by a space.
* Input string should be normalized.
- *
- * @param string $text
- *
- * @return int
*/
- public function getWordCount($text)
+ public function getWordCount(string $text): int
{
return substr_count($text, ' ');
}
@@ -558,12 +544,8 @@ class Readability implements LoggerAwareInterface
* Get the density of links as a percentage of the content
* This is the amount of text that is inside a link divided by the total text in the node.
* Can exclude external references to differentiate between simple text and menus/infoblocks.
- *
- * @param bool $excludeExternal
- *
- * @return int
*/
- public function getLinkDensity(\DOMElement $e, $excludeExternal = false)
+ public function getLinkDensity(\DOMElement $e, bool $excludeExternal = false): float
{
$links = $e->getElementsByTagName('a');
$textLength = mb_strlen($this->getInnerText($e, true, true));
@@ -585,10 +567,8 @@ class Readability implements LoggerAwareInterface
/**
* Get an element relative weight.
- *
- * @return int
*/
- public function getWeight(\DOMElement $e)
+ public function getWeight(\DOMElement $e): int
{
if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
return 0;
@@ -606,7 +586,7 @@ class Readability implements LoggerAwareInterface
/**
* Remove extraneous break tags from a node.
*/
- public function killBreaks(\DOMElement $node)
+ public function killBreaks(\DOMElement $node): void
{
$html = $node->getInnerHTML();
$html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
@@ -618,10 +598,8 @@ class Readability implements LoggerAwareInterface
* (Unless it's a youtube/vimeo video. People love movies.).
*
* Updated 2012-09-18 to preserve youtube/vimeo iframes
- *
- * @param string $tag
*/
- public function clean(\DOMElement $e, $tag)
+ public function clean(\DOMElement $e, string $tag): void
{
$targetList = $e->getElementsByTagName($tag);
$isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag);
@@ -652,10 +630,8 @@ class Readability implements LoggerAwareInterface
* Clean an element of all tags of type "tag" if they look fishy.
* "Fishy" is an algorithm based on content length, classnames,
* link density, number of images & embeds, etc.
- *
- * @param string $tag
*/
- public function cleanConditionally(\DOMElement $e, $tag)
+ public function cleanConditionally(\DOMElement $e, string $tag): void
{
if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
return;
@@ -765,7 +741,7 @@ class Readability implements LoggerAwareInterface
/**
* Clean out spurious headers from an Element. Checks things like classnames and link density.
*/
- public function cleanHeaders(\DOMElement $e)
+ public function cleanHeaders(\DOMElement $e): void
{
for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
$headers = $e->getElementsByTagName('h' . $headerIndex);
@@ -780,57 +756,28 @@ class Readability implements LoggerAwareInterface
/**
* Check if the given flag is active.
- *
- * @param int $flag
- *
- * @return bool
*/
- public function flagIsActive($flag)
+ public function flagIsActive(int $flag): bool
{
return ($this->flags & $flag) > 0;
}
/**
* Add a flag.
- *
- * @param int $flag
*/
- public function addFlag($flag)
+ public function addFlag(int $flag): void
{
$this->flags = $this->flags | $flag;
}
/**
* Remove a flag.
- *
- * @param int $flag
*/
- public function removeFlag($flag)
+ public function removeFlag(int $flag): void
{
$this->flags = $this->flags & ~$flag;
}
- /**
- * Debug.
- *
- * @deprecated use $this->logger->debug() instead
- * @codeCoverageIgnore
- */
- protected function dbg($msg)
- {
- $this->logger->debug($msg);
- }
-
- /**
- * Dump debug info.
- *
- * @deprecated since Monolog gather log, we don't need it
- * @codeCoverageIgnore
- */
- protected function dump_dbg()
- {
- }
-
/**
* Get the article title as an H1.
*
@@ -877,7 +824,7 @@ class Readability implements LoggerAwareInterface
* Prepare the HTML document for readability to scrape it.
* This includes things like stripping javascript, CSS, and handling terrible markup.
*/
- protected function prepDocument()
+ protected function prepDocument(): void
{
/*
* In some cases a body element can't be found (if the HTML is totally hosed for example)
@@ -906,7 +853,7 @@ class Readability implements LoggerAwareInterface
* Initialize a node with the readability object. Also checks the
* className/id for special names to add to its score.
*/
- protected function initializeNode(\DOMElement $node)
+ protected function initializeNode(\DOMElement $node): void
{
if (!isset($node->tagName)) {
return;
@@ -993,7 +940,8 @@ class Readability implements LoggerAwareInterface
$allElements = $page->getElementsByTagName('*');
- for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); ++$nodeIndex) {
+ for ($nodeIndex = 0; $allElements->item($nodeIndex); ++$nodeIndex) {
+ $node = $allElements->item($nodeIndex);
$tagName = $node->tagName;
$nodeContent = $node->getInnerHTML();
@@ -1107,7 +1055,7 @@ class Readability implements LoggerAwareInterface
$contentScore += max(min($score, 3), -3);/**/
// Add the score to the parent. The grandparent gets half.
- $parentNode->getAttributeNode('readability')->value += $contentScore;
+ $parentNode->getAttributeNode('readability')->value = ((float) $parentNode->getAttributeNode('readability')->value) + $contentScore;
if ($grandParentNode) {
$grandParentNode->getAttributeNode('readability')->value += round($contentScore / self::GRANDPARENT_SCORE_DIVISOR);
}
@@ -1228,7 +1176,7 @@ class Readability implements LoggerAwareInterface
$siblingScoreThreshold = max(10, ((int) $topCandidate->getAttribute('readability')) * 0.2);
$siblingNodes = $topCandidate->parentNode->childNodes;
- if (null === $siblingNodes) {
+ if (0 === $siblingNodes->length) {
$siblingNodes = new \stdClass();
$siblingNodes->length = 0;
}
@@ -1260,7 +1208,7 @@ class Readability implements LoggerAwareInterface
$nodeLength = mb_strlen($nodeContent);
if (($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY)
- || ($nodeLength < self::MIN_NODE_LENGTH && 0 === $linkDensity && preg_match('/\.( |$)/', $nodeContent))) {
+ || ($nodeLength < self::MIN_NODE_LENGTH && 0 === (int) $linkDensity && preg_match('/\.( |$)/', $nodeContent))) {
$append = true;
}
}
@@ -1337,12 +1285,8 @@ class Readability implements LoggerAwareInterface
/**
* Get an element weight by attribute.
* Uses regular expressions to tell if this element looks good or bad.
- *
- * @param string $attribute
- *
- * @return int
*/
- protected function weightAttribute(\DOMElement $element, $attribute)
+ protected function weightAttribute(\DOMElement $element, string $attribute): int
{
if (!$element->hasAttribute($attribute)) {
return 0;
@@ -1373,7 +1317,7 @@ class Readability implements LoggerAwareInterface
/**
* Will recreate previously deleted body property.
*/
- protected function reinitBody()
+ protected function reinitBody(): void
{
if (!isset($this->body->childNodes)) {
$this->body = $this->dom->createElement('body');
@@ -1388,14 +1332,14 @@ class Readability implements LoggerAwareInterface
*
* @todo This should be called in init() instead of from __construct
*/
- private function loadHtml()
+ private function loadHtml(): void
{
$this->original_html = $this->html;
$this->logger->debug('Parsing URL: ' . $this->url);
if ($this->url) {
- $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, \PHP_URL_HOST)), ['.' => '\.']) . '/';
+ $this->domainRegExp = '/' . strtr((string) preg_replace('/www\d*\./', '', (string) parse_url($this->url, \PHP_URL_HOST)), ['.' => '\.']) . '/';
}
mb_internal_encoding('UTF-8');
@@ -1431,7 +1375,7 @@ class Readability implements LoggerAwareInterface
unset($tidy);
}
- $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
+ $this->html = mb_convert_encoding((string) $this->html, 'HTML-ENTITIES', 'UTF-8');
if ('html5lib' === $this->parser || 'html5' === $this->parser) {
$this->dom = (new HTML5())->loadHTML($this->html);
diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php
index a8e8cfb..5931b43 100644
--- a/tests/ReadabilityTest.php
+++ b/tests/ReadabilityTest.php
@@ -4,27 +4,32 @@ namespace Tests\Readability;
use Monolog\Handler\TestHandler;
use Monolog\Logger;
+use Psr\Log\LoggerInterface;
use Readability\Readability;
class ReadabilityTest extends \PHPUnit\Framework\TestCase
{
+ /** @var TestHandler */
public $logHandler;
+ /** @var LoggerInterface */
public $logger;
/**
* @requires extension tidy
*/
- public function testConstructDefault()
+ public function testConstructDefault(): void
{
$readability = $this->getReadability('');
+ $readability->init();
$this->assertNull($readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom);
}
- public function testConstructHtml5Parser()
+ public function testConstructHtml5Parser(): void
{
$readability = $this->getReadability('