Require PHP >= 7.2

- remove test on Composer v1
- remove deprecated functions (`dbg()` and `dump_dbg()`)
- call `loadHtml()` from `init()` instead of from `__construct()`

Kinda prepare 2.0 version :)
pull/69/head
Jeremy Benoist 4 years ago
parent b1a20a9575
commit 66215a6c80
No known key found for this signature in database
GPG Key ID: BCA73962457ACC3C
  1. 6
      .github/workflows/coding-standards.yml
  2. 62
      .github/workflows/continuous-integration.yml
  3. 1
      .gitignore
  4. 2
      README.md
  5. 13
      composer.json
  6. 4
      phpstan.neon
  7. 2
      phpunit.xml.dist
  8. 182
      src/Readability.php
  9. 91
      tests/ReadabilityTest.php

@ -8,9 +8,6 @@ on:
branches:
- master
env:
SYMFONY_PHPUNIT_VERSION: 7.5
jobs:
coding-standards:
name: "CS Fixer & PHPStan"
@ -35,9 +32,6 @@ jobs:
env:
COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: "Add PHPStan"
run: "composer require phpstan/phpstan phpstan/phpstan-phpunit --dev --no-progress --no-suggest"
- name: "Install dependencies with Composer"
uses: "ramsey/composer-install@v1"
with:

@ -19,13 +19,11 @@ jobs:
strategy:
matrix:
php:
- "5.6"
- "7.0"
- "7.1"
- "7.2"
- "7.3"
- "7.4"
- "8.0"
- "8.1"
steps:
- name: "Checkout"
@ -38,18 +36,12 @@ jobs:
with:
php-version: "${{ matrix.php }}"
coverage: "none"
tools: composer:v1
tools: composer:v2
extensions: tidy
ini-values: "date.timezone=Europe/Paris"
env:
COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: "Force PHPUnit version"
if: matrix.php >= '7.2'
run: "echo $SYMFONY_PHPUNIT_VERSION"
env:
SYMFONY_PHPUNIT_VERSION: 7.5
- name: "Remove useless deps"
run: "composer remove friendsofphp/php-cs-fixer --dev --no-progress --no-update"
@ -84,7 +76,7 @@ jobs:
with:
php-version: "${{ matrix.php }}"
coverage: "xdebug"
tools: composer:v1
tools: composer:v2
extensions: tidy
ini-values: "date.timezone=Europe/Paris"
env:
@ -103,8 +95,6 @@ jobs:
- name: "Run PHPUnit (with coverage)"
run: "php vendor/bin/simple-phpunit -v --coverage-clover build/logs/clover.xml"
env:
SYMFONY_PHPUNIT_VERSION: 7.5
- name: "Retrieve Coveralls phar"
run: "wget https://github.com/php-coveralls/php-coveralls/releases/download/v2.4.2/php-coveralls.phar"
@ -126,49 +116,6 @@ jobs:
php:
- "7.2"
steps:
- name: "Checkout"
uses: "actions/checkout@v2"
with:
fetch-depth: 2
- name: "Install PHP"
uses: "shivammathur/setup-php@v2"
with:
php-version: "${{ matrix.php }}"
coverage: "none"
tools: composer:v1
extensions: tidy
ini-values: "date.timezone=Europe/Paris"
env:
COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: "Remove useless deps"
run: "composer remove friendsofphp/php-cs-fixer --dev --no-progress --no-update"
- name: "Install dependencies with Composer"
uses: "ramsey/composer-install@v1"
with:
composer-options: "--optimize-autoloader --prefer-dist"
dependency-versions: "lowest"
- name: "Setup logs"
run: "mkdir -p build/logs"
- name: "Run PHPUnit"
run: "php vendor/bin/simple-phpunit -v"
env:
SYMFONY_PHPUNIT_VERSION: 7.5
phpunit-composerv2:
name: "PHPUnit with Composer v2 (PHP ${{ matrix.php }})"
runs-on: "ubuntu-20.04"
strategy:
matrix:
php:
- "7.4"
steps:
- name: "Checkout"
uses: "actions/checkout@v2"
@ -193,11 +140,10 @@ jobs:
uses: "ramsey/composer-install@v1"
with:
composer-options: "--optimize-autoloader --prefer-dist"
dependency-versions: "lowest"
- name: "Setup logs"
run: "mkdir -p build/logs"
- name: "Run PHPUnit"
run: "php vendor/bin/simple-phpunit -v"
env:
SYMFONY_PHPUNIT_VERSION: 7.5

1
.gitignore vendored

@ -2,4 +2,5 @@ vendor/
coverage/
composer.lock
.php_cs.cache
.php-cs-fixer.cache
.phpunit.result.cache

@ -5,7 +5,7 @@
[![Total Downloads](https://poser.pugx.org/j0k3r/php-readability/downloads)](https://packagist.org/packages/j0k3r/php-readability)
[![License](https://poser.pugx.org/j0k3r/php-readability/license)](https://packagist.org/packages/j0k3r/php-readability)
This is an extract of the Readability class from this [full-text-rss](https://github.com/Dither/full-text-rss) fork. It can be defined as a better version of the original [php-readability](https://bitbucket.org/fivefilters/php-readability/overview).
This is an extract of the Readability class from this [full-text-rss](https://github.com/Dither/full-text-rss) fork. It can be defined as a better version of the original [php-readability](https://bitbucket.org/fivefilters/php-readability).
## Differences

@ -24,15 +24,17 @@
"role": "Developer (original JS version)"
}],
"require": {
"php": ">=5.6.0",
"php": ">=7.2.0",
"ext-mbstring": "*",
"psr/log": "^1.0",
"masterminds/html5": "^2.7"
},
"require-dev": {
"friendsofphp/php-cs-fixer": "^2.14",
"friendsofphp/php-cs-fixer": "^3.0",
"monolog/monolog": "^1.24|^2.1",
"symfony/phpunit-bridge": "^4.4|^5.3"
"symfony/phpunit-bridge": "^4.4|^5.3|^6.0",
"phpstan/phpstan": "^1.3",
"phpstan/phpstan-phpunit": "^1.0"
},
"suggest": {
"ext-tidy": "Used to clean up given HTML and to avoid problems with bad HTML structure."
@ -42,5 +44,10 @@
},
"autoload-dev": {
"psr-4": { "Tests\\Readability\\": "tests/" }
},
"config":{
"platform": {
"php": "7.2.34"
}
}
}

@ -6,7 +6,9 @@ parameters:
# https://github.com/phpstan/phpstan/issues/694#issuecomment-350724288
bootstrapFiles:
- vendor/bin/.phpunit/phpunit-7.5-0/vendor/autoload.php
- vendor/bin/.phpunit/phpunit-8.5-0/vendor/autoload.php
checkMissingIterableValueType: false
includes:
- vendor/phpstan/phpstan-phpunit/extension.neon

@ -11,7 +11,7 @@
>
<testsuites>
<testsuite name="Readability Test Suite">
<testsuite name="Readability">
<directory>./tests/</directory>
</testsuite>
</testsuites>

@ -53,20 +53,20 @@ use Psr\Log\NullLogger;
class Readability implements LoggerAwareInterface
{
// flags
const FLAG_STRIP_UNLIKELYS = 1;
const FLAG_WEIGHT_ATTRIBUTES = 2;
const FLAG_CLEAN_CONDITIONALLY = 4;
const FLAG_DISABLE_PREFILTER = 8;
const FLAG_DISABLE_POSTFILTER = 16;
public const FLAG_STRIP_UNLIKELYS = 1;
public const FLAG_WEIGHT_ATTRIBUTES = 2;
public const FLAG_CLEAN_CONDITIONALLY = 4;
public const FLAG_DISABLE_PREFILTER = 8;
public const FLAG_DISABLE_POSTFILTER = 16;
// constants
const SCORE_CHARS_IN_PARAGRAPH = 100;
const SCORE_WORDS_IN_PARAGRAPH = 20;
const GRANDPARENT_SCORE_DIVISOR = 2;
const MIN_PARAGRAPH_LENGTH = 20;
const MIN_COMMAS_IN_PARAGRAPH = 6;
const MIN_ARTICLE_LENGTH = 200;
const MIN_NODE_LENGTH = 80;
const MAX_LINK_DENSITY = 0.25;
public const SCORE_CHARS_IN_PARAGRAPH = 100;
public const SCORE_WORDS_IN_PARAGRAPH = 20;
public const GRANDPARENT_SCORE_DIVISOR = 2;
public const MIN_PARAGRAPH_LENGTH = 20;
public const MIN_COMMAS_IN_PARAGRAPH = 6;
public const MIN_ARTICLE_LENGTH = 200;
public const MIN_NODE_LENGTH = 80;
public const MAX_LINK_DENSITY = 0.25;
public $convertLinksToFootnotes = false;
public $revertForcedParagraphElements = true;
public $articleTitle;
@ -171,26 +171,22 @@ class Readability implements LoggerAwareInterface
/**
* Create instance of Readability.
*
* @param string $html UTF-8 encoded string
* @param string $url URL associated with HTML (for footnotes)
* @param string $parser Which parser to use for turning raw HTML into a DOMDocument
* @param bool $use_tidy Use tidy
* @param string $html UTF-8 encoded string
* @param string $url URL associated with HTML (for footnotes)
* @param string $parser Which parser to use for turning raw HTML into a DOMDocument
* @param bool $useTidy Use tidy
*/
public function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true)
public function __construct(string $html, string $url = null, string $parser = 'libxml', bool $useTidy = true)
{
$this->url = $url;
$this->html = $html;
$this->parser = $parser;
$this->useTidy = $use_tidy && \function_exists('tidy_parse_string');
$this->useTidy = $useTidy && \function_exists('tidy_parse_string');
$this->logger = new NullLogger();
$this->loadHtml();
}
/**
* @return void
*/
public function setLogger(LoggerInterface $logger)
public function setLogger(LoggerInterface $logger): void
{
$this->logger = $logger;
}
@ -221,7 +217,7 @@ class Readability implements LoggerAwareInterface
* @param string $filter RegExp for replace
* @param string $replacer Replacer
*/
public function addPreFilter($filter, $replacer = '')
public function addPreFilter(string $filter, string $replacer = ''): void
{
$this->pre_filters[$filter] = $replacer;
}
@ -232,7 +228,7 @@ class Readability implements LoggerAwareInterface
* @param string $filter RegExp for replace
* @param string $replacer Replacer
*/
public function addPostFilter($filter, $replacer = '')
public function addPostFilter(string $filter, string $replacer = ''): void
{
$this->post_filters[$filter] = $replacer;
}
@ -249,8 +245,10 @@ class Readability implements LoggerAwareInterface
*
* @return bool true if we found content, false otherwise
*/
public function init()
public function init(): bool
{
$this->loadHtml();
if (!isset($this->dom->documentElement)) {
return false;
}
@ -315,7 +313,7 @@ class Readability implements LoggerAwareInterface
/**
* Run any post-process modifications to article content as necessary.
*/
public function postProcessContent(\DOMElement $articleContent)
public function postProcessContent(\DOMElement $articleContent): void
{
if ($this->convertLinksToFootnotes && !preg_match('/\bwiki/', $this->url)) {
$this->addFootnotes($articleContent);
@ -327,7 +325,7 @@ class Readability implements LoggerAwareInterface
*
* @see http://www.roughtype.com/archives/2010/05/experiments_in.php
*/
public function addFootnotes(\DOMElement $articleContent)
public function addFootnotes(\DOMElement $articleContent): void
{
$footnotesWrapper = $this->dom->createElement('footer');
$footnotesWrapper->setAttribute('class', 'readability-footnotes');
@ -389,7 +387,7 @@ class Readability implements LoggerAwareInterface
* Prepare the article node for display. Clean out any inline styles,
* iframes, forms, strip extraneous <p> tags, etc.
*/
public function prepArticle(\DOMNode $articleContent)
public function prepArticle(\DOMNode $articleContent): void
{
if (!$articleContent instanceof \DOMElement) {
return;
@ -491,10 +489,8 @@ class Readability implements LoggerAwareInterface
* @param \DOMElement $e
* @param bool $normalizeSpaces (default: true)
* @param bool $flattenLines (default: false)
*
* @return string
*/
public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false)
public function getInnerText($e, bool $normalizeSpaces = true, bool $flattenLines = false): string
{
if (null === $e || !isset($e->textContent) || '' === $e->textContent) {
return '';
@ -503,9 +499,11 @@ class Readability implements LoggerAwareInterface
$textContent = trim($e->textContent);
if ($flattenLines) {
$textContent = mb_ereg_replace('(?:[\r\n](?:\s|&nbsp;)*)+', '', $textContent);
} elseif ($normalizeSpaces) {
$textContent = mb_ereg_replace('\s\s+', ' ', $textContent);
return (string) mb_ereg_replace('(?:[\r\n](?:\s|&nbsp;)*)+', '', $textContent);
}
if ($normalizeSpaces) {
return (string) mb_ereg_replace('\s\s+', ' ', $textContent);
}
return $textContent;
@ -513,30 +511,22 @@ class Readability implements LoggerAwareInterface
/**
* Remove the style attribute on every $e and under.
*
* @param \DOMElement $e
*/
public function cleanStyles($e)
public function cleanStyles(\DOMElement $e): void
{
if (!\is_object($e)) {
return;
}
$elems = $e->getElementsByTagName('*');
if (\is_object($e)) {
$elems = $e->getElementsByTagName('*');
foreach ($elems as $elem) {
$elem->removeAttribute('style');
foreach ($elems as $elem) {
$elem->removeAttribute('style');
}
}
}
/**
* Get comma number for a given text.
*
* @param string $text
*
* @return int
*/
public function getCommaCount($text)
public function getCommaCount(string $text): int
{
return substr_count($text, ',');
}
@ -544,12 +534,8 @@ class Readability implements LoggerAwareInterface
/**
* Get words number for a given text if words separated by a space.
* Input string should be normalized.
*
* @param string $text
*
* @return int
*/
public function getWordCount($text)
public function getWordCount(string $text): int
{
return substr_count($text, ' ');
}
@ -558,12 +544,8 @@ class Readability implements LoggerAwareInterface
* Get the density of links as a percentage of the content
* This is the amount of text that is inside a link divided by the total text in the node.
* Can exclude external references to differentiate between simple text and menus/infoblocks.
*
* @param bool $excludeExternal
*
* @return int
*/
public function getLinkDensity(\DOMElement $e, $excludeExternal = false)
public function getLinkDensity(\DOMElement $e, bool $excludeExternal = false): float
{
$links = $e->getElementsByTagName('a');
$textLength = mb_strlen($this->getInnerText($e, true, true));
@ -585,10 +567,8 @@ class Readability implements LoggerAwareInterface
/**
* Get an element relative weight.
*
* @return int
*/
public function getWeight(\DOMElement $e)
public function getWeight(\DOMElement $e): int
{
if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
return 0;
@ -606,7 +586,7 @@ class Readability implements LoggerAwareInterface
/**
* Remove extraneous break tags from a node.
*/
public function killBreaks(\DOMElement $node)
public function killBreaks(\DOMElement $node): void
{
$html = $node->getInnerHTML();
$html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
@ -618,10 +598,8 @@ class Readability implements LoggerAwareInterface
* (Unless it's a youtube/vimeo video. People love movies.).
*
* Updated 2012-09-18 to preserve youtube/vimeo iframes
*
* @param string $tag
*/
public function clean(\DOMElement $e, $tag)
public function clean(\DOMElement $e, string $tag): void
{
$targetList = $e->getElementsByTagName($tag);
$isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag);
@ -652,10 +630,8 @@ class Readability implements LoggerAwareInterface
* Clean an element of all tags of type "tag" if they look fishy.
* "Fishy" is an algorithm based on content length, classnames,
* link density, number of images & embeds, etc.
*
* @param string $tag
*/
public function cleanConditionally(\DOMElement $e, $tag)
public function cleanConditionally(\DOMElement $e, string $tag): void
{
if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
return;
@ -765,7 +741,7 @@ class Readability implements LoggerAwareInterface
/**
* Clean out spurious headers from an Element. Checks things like classnames and link density.
*/
public function cleanHeaders(\DOMElement $e)
public function cleanHeaders(\DOMElement $e): void
{
for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
$headers = $e->getElementsByTagName('h' . $headerIndex);
@ -780,57 +756,28 @@ class Readability implements LoggerAwareInterface
/**
* Check if the given flag is active.
*
* @param int $flag
*
* @return bool
*/
public function flagIsActive($flag)
public function flagIsActive(int $flag): bool
{
return ($this->flags & $flag) > 0;
}
/**
* Add a flag.
*
* @param int $flag
*/
public function addFlag($flag)
public function addFlag(int $flag): void
{
$this->flags = $this->flags | $flag;
}
/**
* Remove a flag.
*
* @param int $flag
*/
public function removeFlag($flag)
public function removeFlag(int $flag): void
{
$this->flags = $this->flags & ~$flag;
}
/**
* Debug.
*
* @deprecated use $this->logger->debug() instead
* @codeCoverageIgnore
*/
protected function dbg($msg)
{
$this->logger->debug($msg);
}
/**
* Dump debug info.
*
* @deprecated since Monolog gather log, we don't need it
* @codeCoverageIgnore
*/
protected function dump_dbg()
{
}
/**
* Get the article title as an H1.
*
@ -877,7 +824,7 @@ class Readability implements LoggerAwareInterface
* Prepare the HTML document for readability to scrape it.
* This includes things like stripping javascript, CSS, and handling terrible markup.
*/
protected function prepDocument()
protected function prepDocument(): void
{
/*
* In some cases a body element can't be found (if the HTML is totally hosed for example)
@ -906,7 +853,7 @@ class Readability implements LoggerAwareInterface
* Initialize a node with the readability object. Also checks the
* className/id for special names to add to its score.
*/
protected function initializeNode(\DOMElement $node)
protected function initializeNode(\DOMElement $node): void
{
if (!isset($node->tagName)) {
return;
@ -993,7 +940,8 @@ class Readability implements LoggerAwareInterface
$allElements = $page->getElementsByTagName('*');
for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); ++$nodeIndex) {
for ($nodeIndex = 0; $allElements->item($nodeIndex); ++$nodeIndex) {
$node = $allElements->item($nodeIndex);
$tagName = $node->tagName;
$nodeContent = $node->getInnerHTML();
@ -1107,7 +1055,7 @@ class Readability implements LoggerAwareInterface
$contentScore += max(min($score, 3), -3);/**/
// Add the score to the parent. The grandparent gets half.
$parentNode->getAttributeNode('readability')->value += $contentScore;
$parentNode->getAttributeNode('readability')->value = ((float) $parentNode->getAttributeNode('readability')->value) + $contentScore;
if ($grandParentNode) {
$grandParentNode->getAttributeNode('readability')->value += round($contentScore / self::GRANDPARENT_SCORE_DIVISOR);
}
@ -1228,7 +1176,7 @@ class Readability implements LoggerAwareInterface
$siblingScoreThreshold = max(10, ((int) $topCandidate->getAttribute('readability')) * 0.2);
$siblingNodes = $topCandidate->parentNode->childNodes;
if (null === $siblingNodes) {
if (0 === $siblingNodes->length) {
$siblingNodes = new \stdClass();
$siblingNodes->length = 0;
}
@ -1260,7 +1208,7 @@ class Readability implements LoggerAwareInterface
$nodeLength = mb_strlen($nodeContent);
if (($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY)
|| ($nodeLength < self::MIN_NODE_LENGTH && 0 === $linkDensity && preg_match('/\.( |$)/', $nodeContent))) {
|| ($nodeLength < self::MIN_NODE_LENGTH && 0 === (int) $linkDensity && preg_match('/\.( |$)/', $nodeContent))) {
$append = true;
}
}
@ -1337,12 +1285,8 @@ class Readability implements LoggerAwareInterface
/**
* Get an element weight by attribute.
* Uses regular expressions to tell if this element looks good or bad.
*
* @param string $attribute
*
* @return int
*/
protected function weightAttribute(\DOMElement $element, $attribute)
protected function weightAttribute(\DOMElement $element, string $attribute): int
{
if (!$element->hasAttribute($attribute)) {
return 0;
@ -1373,7 +1317,7 @@ class Readability implements LoggerAwareInterface
/**
* Will recreate previously deleted body property.
*/
protected function reinitBody()
protected function reinitBody(): void
{
if (!isset($this->body->childNodes)) {
$this->body = $this->dom->createElement('body');
@ -1388,14 +1332,14 @@ class Readability implements LoggerAwareInterface
*
* Called from init() rather than from __construct().
*/
private function loadHtml()
private function loadHtml(): void
{
$this->original_html = $this->html;
$this->logger->debug('Parsing URL: ' . $this->url);
if ($this->url) {
$this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, \PHP_URL_HOST)), ['.' => '\.']) . '/';
$this->domainRegExp = '/' . strtr((string) preg_replace('/www\d*\./', '', (string) parse_url($this->url, \PHP_URL_HOST)), ['.' => '\.']) . '/';
}
mb_internal_encoding('UTF-8');
@ -1431,7 +1375,7 @@ class Readability implements LoggerAwareInterface
unset($tidy);
}
$this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
$this->html = mb_convert_encoding((string) $this->html, 'HTML-ENTITIES', 'UTF-8');
if ('html5lib' === $this->parser || 'html5' === $this->parser) {
$this->dom = (new HTML5())->loadHTML($this->html);

@ -4,27 +4,32 @@ namespace Tests\Readability;
use Monolog\Handler\TestHandler;
use Monolog\Logger;
use Psr\Log\LoggerInterface;
use Readability\Readability;
class ReadabilityTest extends \PHPUnit\Framework\TestCase
{
/** @var TestHandler */
public $logHandler;
/** @var LoggerInterface */
public $logger;
/**
* @requires extension tidy
*/
public function testConstructDefault()
public function testConstructDefault(): void
{
$readability = $this->getReadability('');
$readability->init();
$this->assertNull($readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom);
}
public function testConstructHtml5Parser()
public function testConstructHtml5Parser(): void
{
$readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'html5lib');
$readability->init();
$this->assertSame('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom);
@ -34,9 +39,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
/**
* @requires extension tidy
*/
public function testConstructSimple()
public function testConstructSimple(): void
{
$readability = $this->getReadability('<html/>', 'http://0.0.0.0');
$readability->init();
$this->assertSame('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom);
@ -44,9 +50,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertTrue($readability->tidied);
}
public function testConstructDefaultWithoutTidy()
public function testConstructDefaultWithoutTidy(): void
{
$readability = $this->getReadability('', null, 'libxml', false);
$readability->init();
$this->assertNull($readability->url);
$this->assertSame('', $readability->original_html);
@ -55,9 +62,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertInstanceOf('DomDocument', $readability->dom);
}
public function testConstructSimpleWithoutTidy()
public function testConstructSimpleWithoutTidy(): void
{
$readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'libxml', false);
$readability->init();
$this->assertSame('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom);
@ -65,7 +73,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertFalse($readability->tidied);
}
public function testInitNoContent()
public function testInitNoContent(): void
{
$readability = $this->getReadability('<html/>', 'http://0.0.0.0');
$res = $readability->init();
@ -77,7 +85,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('Sorry, Readability was unable to parse this page for content.', $readability->getContent()->getInnerHtml());
}
public function testInitP()
public function testInitP(): void
{
$readability = $this->getReadability(str_repeat('<p>This is the awesome content :)</p>', 7), 'http://0.0.0.0');
$res = $readability->init();
@ -90,7 +98,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('This is the awesome content :)', $readability->getContent()->getInnerHtml());
}
public function testInitDivP()
public function testInitDivP(): void
{
$readability = $this->getReadability('<div>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</div>', 'http://0.0.0.0');
$res = $readability->init();
@ -103,7 +111,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('This is the awesome content :)', $readability->getContent()->getInnerHtml());
}
public function testInitDiv()
public function testInitDiv(): void
{
$readability = $this->getReadability('<div>' . str_repeat('This is the awesome content :)', 7) . '</div>', 'http://0.0.0.0');
$readability->debug = true;
@ -117,7 +125,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('This is the awesome content :)', $readability->getContent()->getInnerHtml());
}
public function testWithFootnotes()
public function testWithFootnotes(): void
{
$readability = $this->getReadability('<div>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '</div>', 'http://0.0.0.0');
$readability->debug = true;
@ -134,7 +142,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('readabilityLink-3', $readability->getContent()->getInnerHtml());
}
public function testStandardClean()
public function testStandardClean(): void
{
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<a href="#nofollow" rel="nofollow">will NOT be removed</a></div>', 'http://0.0.0.0');
$readability->debug = true;
@ -151,7 +159,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringNotContainsString('<h2>', $readability->getContent()->getInnerHtml());
}
public function testWithIframe()
public function testWithIframe(): void
{
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>', 'http://0.0.0.0');
$readability->debug = true;
@ -166,7 +174,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('nofollow', $readability->getContent()->getInnerHtml());
}
public function testWithArticle()
public function testWithArticle(): void
{
$readability = $this->getReadability('<article><p>' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '</p><p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe></p></article>', 'http://0.0.0.0');
$readability->debug = true;
@ -181,7 +189,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('nofollow', $readability->getContent()->getInnerHtml());
}
public function testWithAside()
public function testWithAside(): void
{
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<footer><aside>' . str_repeat('<p>This is an awesome text with some links, here there are</p>', 8) . '</aside></footer></article>', 'http://0.0.0.0');
$readability->debug = true;
@ -196,7 +204,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('<footer readability="5"/>', $readability->getContent()->getInnerHtml());
}
public function testWithClasses()
public function testWithClasses(): void
{
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
$readability->debug = true;
@ -211,7 +219,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringNotContainsString('This text should be removed', $readability->getContent()->getInnerHtml());
}
public function testWithClassesWithoutLightClean()
public function testWithClassesWithoutLightClean(): void
{
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
$readability->debug = true;
@ -227,7 +235,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringNotContainsString('This text should be removed', $readability->getContent()->getInnerHtml());
}
public function testWithTd()
public function testWithTd(): void
{
$readability = $this->getReadability('<table><tr>' . str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7) . '</tr></table>', 'http://0.0.0.0');
$readability->debug = true;
@ -240,7 +248,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('This is an awesome text with some links, here there are', $readability->getContent()->getInnerHtml());
}
public function testWithSameClasses()
public function testWithSameClasses(): void
{
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<div class="awesomecontent">This text is also an awesome text and you should know that !</div></article>', 'http://0.0.0.0');
$readability->debug = true;
@ -254,7 +262,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
}
public function testWithScript()
public function testWithScript(): void
{
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p><script>This text is also an awesome text and you should know that !</script></p></article>', 'http://0.0.0.0');
$readability->debug = true;
@ -268,7 +276,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
}
public function testTitle()
public function testTitle(): void
{
$readability = $this->getReadability('<title>this is my title</title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
@ -282,7 +290,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
}
public function testTitleWithDash()
public function testTitleWithDash(): void
{
$readability = $this->getReadability('<title> title2 - title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
@ -296,7 +304,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
}
public function testTitleWithDoubleDot()
public function testTitleWithDoubleDot(): void
{
$readability = $this->getReadability('<title> title2 : title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
@ -310,7 +318,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
}
public function testTitleTooShortUseH1()
public function testTitleTooShortUseH1(): void
{
$readability = $this->getReadability('<title>too short</title><h1>this is my h1 title !</h1><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
@ -324,17 +332,14 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringNotContainsString('This text is also an awesome text and you should know that', $readability->getContent()->getInnerHtml());
}
// dummy function to be used to the next test
public function error2Exception($code, $string, $file, $line, $context)
{
throw new \Exception($string, $code);
}
public function testAutoClosingIframeNotThrowingException()
public function testAutoClosingIframeNotThrowingException(): void
{
error_reporting(\E_ALL | \E_STRICT);
ini_set('display_errors', true);
set_error_handler([$this, 'error2Exception'], \E_ALL | \E_STRICT);
ini_set('display_errors', '1');
// Turn every PHP error raised during this test into an exception.
// The handler declares only the four arguments PHP passes on every supported version:
// the fifth one ($errcontext) is no longer passed as of PHP 8.0, and requiring it would
// raise an ArgumentCountError instead of the exception below. PHP 7.x still passes it,
// and the surplus argument is simply ignored.
set_error_handler(static function (int $errno, string $errstr, string $errfile, int $errline): bool {
    // Never returns: the declared bool only mirrors the error-handler callback signature.
    throw new \Exception($errstr, $errno);
}, \E_ALL | \E_STRICT);
$data = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="ru-RU" prefix="og: http://ogp.me/ns#">
@ -376,7 +381,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
/**
* This should generate an Exception "DOMElement::setAttribute(): ID post-60 already defined".
*/
public function testAppendIdAlreadyHere()
public function testAppendIdAlreadyHere(): void
{
$data = '<!DOCTYPE html>
<html lang="fr">
@ -432,7 +437,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
}
public function testPostFilters()
public function testPostFilters(): void
{
$readability = $this->getReadability('<div>' . str_repeat('<p>This <strong>is</strong> the awesome content :)</p>', 10) . '</div>', 'http://0.0.0.0');
$readability->addPostFilter('!<strong[^>]*>(.*?)</strong>!is', '');
@ -443,10 +448,8 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('This the awesome content :)', $readability->getContent()->getInnerHtml());
}
public function testPreFilters()
public function testPreFilters(): void
{
$this->markTestSkipped('Won\'t work until loadHtml() is moved in init() instead of __construct()');
$readability = $this->getReadability('<div>' . str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7) . '</div>', 'http://0.0.0.0');
$readability->addPreFilter('!<b[^>]*>(.*?)</b>!is', '');
@ -456,10 +459,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('This the awesome and WONDERFUL content :)', $readability->getContent()->getInnerHtml());
}
public function testChildNodeGoneNull()
public function testChildNodeGoneNull(): void
{
// from http://www.ayyaantuu.net/ethiopia-targets-opposition-lawmakers/
$html = file_get_contents('tests/fixtures/childNodeGoesNull.html');
$html = (string) file_get_contents('tests/fixtures/childNodeGoesNull.html');
$readability = $this->getReadability($html, 'http://0.0.0.0');
$readability->debug = true;
@ -469,10 +472,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertTrue($res);
}
public function testKeepFootnotes()
public function testKeepFootnotes(): void
{
// from https://www.schreibdichte.de/blog/feed-aggregator-und-spaeter-lesen-dienst-im-team
$html = file_get_contents('tests/fixtures/keepFootnotes.html');
$html = (string) file_get_contents('tests/fixtures/keepFootnotes.html');
$readability = $this->getReadability($html, 'http://0.0.0.0');
$readability->debug = true;
@ -483,10 +486,10 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('<a href="#fnref1:fnfeed_2" rev="footnote"', $readability->getContent()->getInnerHtml());
}
public function testWithWipedBody()
public function testWithWipedBody(): void
{
// from https://www.cs.cmu.edu/~rgs/alice-table.html
$html = file_get_contents('tests/fixtures/wipedBody.html');
$html = (string) file_get_contents('tests/fixtures/wipedBody.html');
$readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false);
$readability->debug = true;
@ -496,7 +499,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('<a href="alice-I.html">Down the Rabbit-Hole</a>', $readability->getContent()->getInnerHtml());
}
private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true)
private function getReadability(string $html, string $url = null, string $parser = 'libxml', bool $useTidy = true): Readability
{
$readability = new Readability($html, $url, $parser, $useTidy);

Loading…
Cancel
Save