Merge pull request #98 from jtojnar/backports

[1.x] Backport fixes
pull/101/head 1.2.11
Jérémy Benoist 1 year ago committed by GitHub
commit 487ce3a517
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 8
      .github/workflows/coding-standards.yml
  2. 46
      .github/workflows/continuous-integration.yml
  3. 1
      .gitignore
  4. 4
      .php-cs-fixer.php
  5. 7
      composer.json
  6. 2
      phpstan.dist.neon
  7. 11
      src/JSLikeHTMLElement.php
  8. 32
      src/Readability.php
  9. 18
      tests/ReadabilityTest.php

@ -4,9 +4,11 @@ on:
pull_request: pull_request:
branches: branches:
- master - master
- 1.x
push: push:
branches: branches:
- master - master
- 1.x
env: env:
SYMFONY_PHPUNIT_VERSION: 7.5 SYMFONY_PHPUNIT_VERSION: 7.5
@ -14,7 +16,7 @@ env:
jobs: jobs:
coding-standards: coding-standards:
name: "CS Fixer & PHPStan" name: "CS Fixer & PHPStan"
runs-on: "ubuntu-20.04" runs-on: "ubuntu-22.04"
strategy: strategy:
matrix: matrix:
@ -23,7 +25,7 @@ jobs:
steps: steps:
- name: "Checkout" - name: "Checkout"
uses: "actions/checkout@v2" uses: "actions/checkout@v4"
- name: "Install PHP" - name: "Install PHP"
uses: "shivammathur/setup-php@v2" uses: "shivammathur/setup-php@v2"
@ -39,7 +41,7 @@ jobs:
run: "composer require phpstan/phpstan phpstan/phpstan-phpunit --dev --no-progress --no-suggest" run: "composer require phpstan/phpstan phpstan/phpstan-phpunit --dev --no-progress --no-suggest"
- name: "Install dependencies with Composer" - name: "Install dependencies with Composer"
uses: "ramsey/composer-install@v1" uses: "ramsey/composer-install@v3"
with: with:
composer-options: "--optimize-autoloader --prefer-dist" composer-options: "--optimize-autoloader --prefer-dist"

@ -4,9 +4,11 @@ on:
pull_request: pull_request:
branches: branches:
- "master" - "master"
- "1.x"
push: push:
branches: branches:
- "master" - "master"
- "1.x"
env: env:
fail-fast: true fail-fast: true
@ -14,7 +16,7 @@ env:
jobs: jobs:
phpunit: phpunit:
name: "PHPUnit (PHP ${{ matrix.php }})" name: "PHPUnit (PHP ${{ matrix.php }})"
runs-on: "ubuntu-20.04" runs-on: "ubuntu-22.04"
strategy: strategy:
matrix: matrix:
@ -26,10 +28,14 @@ jobs:
- "7.3" - "7.3"
- "7.4" - "7.4"
- "8.0" - "8.0"
- "8.1"
- "8.2"
- "8.3"
- "8.4"
steps: steps:
- name: "Checkout" - name: "Checkout"
uses: "actions/checkout@v2" uses: "actions/checkout@v4"
with: with:
fetch-depth: 2 fetch-depth: 2
@ -38,23 +44,17 @@ jobs:
with: with:
php-version: "${{ matrix.php }}" php-version: "${{ matrix.php }}"
coverage: "none" coverage: "none"
tools: composer:v1 tools: composer:v2
extensions: tidy extensions: tidy
ini-values: "date.timezone=Europe/Paris" ini-values: "date.timezone=Europe/Paris"
env: env:
COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: "Force PHPUnit version"
if: matrix.php >= '7.2'
run: "echo $SYMFONY_PHPUNIT_VERSION"
env:
SYMFONY_PHPUNIT_VERSION: 7.5
- name: "Remove useless deps" - name: "Remove useless deps"
run: "composer remove friendsofphp/php-cs-fixer --dev --no-progress --no-update" run: "composer remove friendsofphp/php-cs-fixer --dev --no-progress --no-update"
- name: "Install dependencies with Composer" - name: "Install dependencies with Composer"
uses: "ramsey/composer-install@v1" uses: "ramsey/composer-install@v3"
with: with:
composer-options: "--optimize-autoloader --prefer-dist" composer-options: "--optimize-autoloader --prefer-dist"
@ -66,7 +66,7 @@ jobs:
phpunit-coverage: phpunit-coverage:
name: "PHPUnit coverage (PHP ${{ matrix.php }})" name: "PHPUnit coverage (PHP ${{ matrix.php }})"
runs-on: "ubuntu-20.04" runs-on: "ubuntu-22.04"
strategy: strategy:
matrix: matrix:
@ -75,7 +75,7 @@ jobs:
steps: steps:
- name: "Checkout" - name: "Checkout"
uses: "actions/checkout@v2" uses: "actions/checkout@v4"
with: with:
fetch-depth: 2 fetch-depth: 2
@ -84,7 +84,7 @@ jobs:
with: with:
php-version: "${{ matrix.php }}" php-version: "${{ matrix.php }}"
coverage: "xdebug" coverage: "xdebug"
tools: composer:v1 tools: composer:v2
extensions: tidy extensions: tidy
ini-values: "date.timezone=Europe/Paris" ini-values: "date.timezone=Europe/Paris"
env: env:
@ -94,7 +94,7 @@ jobs:
run: "composer remove friendsofphp/php-cs-fixer --dev --no-progress --no-update" run: "composer remove friendsofphp/php-cs-fixer --dev --no-progress --no-update"
- name: "Install dependencies with Composer" - name: "Install dependencies with Composer"
uses: "ramsey/composer-install@v1" uses: "ramsey/composer-install@v3"
with: with:
composer-options: "--optimize-autoloader --prefer-dist" composer-options: "--optimize-autoloader --prefer-dist"
@ -103,8 +103,6 @@ jobs:
- name: "Run PHPUnit (with coverage)" - name: "Run PHPUnit (with coverage)"
run: "php vendor/bin/simple-phpunit -v --coverage-clover build/logs/clover.xml" run: "php vendor/bin/simple-phpunit -v --coverage-clover build/logs/clover.xml"
env:
SYMFONY_PHPUNIT_VERSION: 7.5
- name: "Retrieve Coveralls phar" - name: "Retrieve Coveralls phar"
run: "wget https://github.com/php-coveralls/php-coveralls/releases/download/v2.4.2/php-coveralls.phar" run: "wget https://github.com/php-coveralls/php-coveralls/releases/download/v2.4.2/php-coveralls.phar"
@ -119,7 +117,7 @@ jobs:
phpunit-lowest: phpunit-lowest:
name: "PHPUnit lowest deps (PHP ${{ matrix.php }})" name: "PHPUnit lowest deps (PHP ${{ matrix.php }})"
runs-on: "ubuntu-20.04" runs-on: "ubuntu-22.04"
strategy: strategy:
matrix: matrix:
@ -128,7 +126,7 @@ jobs:
steps: steps:
- name: "Checkout" - name: "Checkout"
uses: "actions/checkout@v2" uses: "actions/checkout@v4"
with: with:
fetch-depth: 2 fetch-depth: 2
@ -137,7 +135,7 @@ jobs:
with: with:
php-version: "${{ matrix.php }}" php-version: "${{ matrix.php }}"
coverage: "none" coverage: "none"
tools: composer:v1 tools: composer:v2
extensions: tidy extensions: tidy
ini-values: "date.timezone=Europe/Paris" ini-values: "date.timezone=Europe/Paris"
env: env:
@ -147,7 +145,7 @@ jobs:
run: "composer remove friendsofphp/php-cs-fixer --dev --no-progress --no-update" run: "composer remove friendsofphp/php-cs-fixer --dev --no-progress --no-update"
- name: "Install dependencies with Composer" - name: "Install dependencies with Composer"
uses: "ramsey/composer-install@v1" uses: "ramsey/composer-install@v3"
with: with:
composer-options: "--optimize-autoloader --prefer-dist" composer-options: "--optimize-autoloader --prefer-dist"
dependency-versions: "lowest" dependency-versions: "lowest"
@ -157,11 +155,9 @@ jobs:
- name: "Run PHPUnit" - name: "Run PHPUnit"
run: "php vendor/bin/simple-phpunit -v" run: "php vendor/bin/simple-phpunit -v"
env:
SYMFONY_PHPUNIT_VERSION: 7.5
phpunit-composerv2: phpunit-composerv2:
name: "PHPUnit with Composer v2 (PHP ${{ matrix.php }})" name: "PHPUnit with Composer v1 (PHP ${{ matrix.php }})"
runs-on: "ubuntu-20.04" runs-on: "ubuntu-20.04"
strategy: strategy:
@ -180,7 +176,7 @@ jobs:
with: with:
php-version: "${{ matrix.php }}" php-version: "${{ matrix.php }}"
coverage: "none" coverage: "none"
tools: composer:v2 tools: composer:v1
extensions: tidy extensions: tidy
ini-values: "date.timezone=Europe/Paris" ini-values: "date.timezone=Europe/Paris"
env: env:
@ -199,5 +195,3 @@ jobs:
- name: "Run PHPUnit" - name: "Run PHPUnit"
run: "php vendor/bin/simple-phpunit -v" run: "php vendor/bin/simple-phpunit -v"
env:
SYMFONY_PHPUNIT_VERSION: 7.5

1
.gitignore vendored

@ -3,3 +3,4 @@ coverage/
composer.lock composer.lock
.php_cs.cache .php_cs.cache
.phpunit.result.cache .phpunit.result.cache
phpstan.neon

@ -26,6 +26,10 @@ return (new PhpCsFixer\Config())
'strict_comparison' => true, 'strict_comparison' => true,
'strict_param' => true, 'strict_param' => true,
'concat_space' => ['spacing' => 'one'], 'concat_space' => ['spacing' => 'one'],
// Pulled in by @Symfony, we cannot add property types until we bump PHP to ≥ 7.4
'no_null_property_initialization' => false,
// Pulled in by @Symfony with `const` but const visibility requires PHP ≥ 7.1
'visibility_required' => ['elements' => ['method', 'property']],
]) ])
->setFinder($finder) ->setFinder($finder)
; ;

@ -32,7 +32,7 @@
"require-dev": { "require-dev": {
"friendsofphp/php-cs-fixer": "^2.14", "friendsofphp/php-cs-fixer": "^2.14",
"monolog/monolog": "^1.24|^2.1", "monolog/monolog": "^1.24|^2.1",
"symfony/phpunit-bridge": "^4.4|^5.3" "symfony/phpunit-bridge": "^4.4|^5.3|^6.0|^7.0"
}, },
"suggest": { "suggest": {
"ext-tidy": "Used to clean up given HTML and to avoid problems with bad HTML structure." "ext-tidy": "Used to clean up given HTML and to avoid problems with bad HTML structure."
@ -42,5 +42,10 @@
}, },
"autoload-dev": { "autoload-dev": {
"psr-4": { "Tests\\Readability\\": "tests/" } "psr-4": { "Tests\\Readability\\": "tests/" }
},
"scripts": {
"fix": "php-cs-fixer fix --verbose --diff",
"phpstan": "phpstan analyze --memory-limit 512M",
"test": "simple-phpunit -v"
} }
} }

@ -6,7 +6,7 @@ parameters:
# https://github.com/phpstan/phpstan/issues/694#issuecomment-350724288 # https://github.com/phpstan/phpstan/issues/694#issuecomment-350724288
bootstrapFiles: bootstrapFiles:
- vendor/bin/.phpunit/phpunit-7.5-0/vendor/autoload.php - vendor/bin/.phpunit/phpunit/vendor/autoload.php
includes: includes:
- vendor/phpstan/phpstan-phpunit/extension.neon - vendor/phpstan/phpstan-phpunit/extension.neon

@ -39,9 +39,9 @@ class JSLikeHTMLElement extends \DOMElement
/** /**
* Used for setting innerHTML like it's done in JavaScript:. * Used for setting innerHTML like it's done in JavaScript:.
* *
* @code * ```php
* $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>'; * $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>';
* @endcode * ```
*/ */
public function __set($name, $value) public function __set($name, $value)
{ {
@ -79,14 +79,13 @@ class JSLikeHTMLElement extends \DOMElement
} else { } else {
// $value is probably ill-formed // $value is probably ill-formed
$f = new \DOMDocument(); $f = new \DOMDocument();
$value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8');
// Using <htmlfragment> will generate a warning, but so will bad HTML // Using <htmlfragment> will generate a warning, but so will bad HTML
// (and by this point, bad HTML is what we've got). // (and by this point, bad HTML is what we've got).
// We use it (and suppress the warning) because an HTML fragment will // We use it (and suppress the warning) because an HTML fragment will
// be wrapped around <html><body> tags which we don't really want to keep. // be wrapped around <html><body> tags which we don't really want to keep.
// Note: despite the warning, if loadHTML succeeds it will return true. // Note: despite the warning, if loadHTML succeeds it will return true.
$result = $f->loadHTML('<htmlfragment>' . $value . '</htmlfragment>'); $result = $f->loadHTML('<meta charset="utf-8"><htmlfragment>' . $value . '</htmlfragment>');
if ($result) { if ($result) {
$import = $f->getElementsByTagName('htmlfragment')->item(0); $import = $f->getElementsByTagName('htmlfragment')->item(0);
@ -105,9 +104,9 @@ class JSLikeHTMLElement extends \DOMElement
/** /**
* Used for getting innerHTML like it's done in JavaScript:. * Used for getting innerHTML like it's done in JavaScript:.
* *
* @code * ```php
* $string = $div->innerHTML; * $string = $div->innerHTML;
* @endcode * ```
*/ */
public function __get($name) public function __get($name)
{ {

@ -144,7 +144,7 @@ class Readability implements LoggerAwareInterface
// HACK: replace linebreaks plus br's with p's // HACK: replace linebreaks plus br's with p's
'!(<br[^>]*>[ \r\n\s]*){2,}!i' => '</p><p>', '!(<br[^>]*>[ \r\n\s]*){2,}!i' => '</p><p>',
// replace noscripts // replace noscripts
//'!</?noscript>!is' => '', // '!</?noscript>!is' => '',
// replace fonts to spans // replace fonts to spans
'!<(/?)font[^>]*>!is' => '<\\1span>', '!<(/?)font[^>]*>!is' => '<\\1span>',
]; ];
@ -155,8 +155,8 @@ class Readability implements LoggerAwareInterface
// replace empty tags that break layouts // replace empty tags that break layouts
'!<(?:a|div|p|figure)[^>]+/>!is' => '', '!<(?:a|div|p|figure)[^>]+/>!is' => '',
// remove all attributes on text tags // remove all attributes on text tags
//'!<(\s*/?\s*(?:blockquote|br|hr|code|div|article|span|footer|aside|p|pre|dl|li|ul|ol)) [^>]+>!is' => "<\\1>", // '!<(\s*/?\s*(?:blockquote|br|hr|code|div|article|span|footer|aside|p|pre|dl|li|ul|ol)) [^>]+>!is' => "<\\1>",
//single newlines cleanup // single newlines cleanup
"/\n+/" => "\n", "/\n+/" => "\n",
// modern web... // modern web...
'!<pre[^>]*>\s*<code!is' => '<pre', '!<pre[^>]*>\s*<code!is' => '<pre',
@ -366,7 +366,7 @@ class Readability implements LoggerAwareInterface
$articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
$articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);
$footnote->setInnerHtml('<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> '); $footnote->setInnerHtml('<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ');
$footnoteLink->setInnerHtml(('' !== $footnoteLink->getAttribute('title') ? $footnoteLink->getAttribute('title') : $linkText)); $footnoteLink->setInnerHtml('' !== $footnoteLink->getAttribute('title') ? $footnoteLink->getAttribute('title') : $linkText);
$footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
$footnote->appendChild($footnoteLink); $footnote->appendChild($footnoteLink);
@ -796,7 +796,7 @@ class Readability implements LoggerAwareInterface
*/ */
public function addFlag($flag) public function addFlag($flag)
{ {
$this->flags = $this->flags | $flag; $this->flags |= $flag;
} }
/** /**
@ -806,13 +806,14 @@ class Readability implements LoggerAwareInterface
*/ */
public function removeFlag($flag) public function removeFlag($flag)
{ {
$this->flags = $this->flags & ~$flag; $this->flags &= ~$flag;
} }
/** /**
* Debug. * Debug.
* *
* @deprecated use $this->logger->debug() instead * @deprecated use $this->logger->debug() instead
*
* @codeCoverageIgnore * @codeCoverageIgnore
*/ */
protected function dbg($msg) protected function dbg($msg)
@ -824,6 +825,7 @@ class Readability implements LoggerAwareInterface
* Dump debug info. * Dump debug info.
* *
* @deprecated since Monolog gather log, we don't need it * @deprecated since Monolog gather log, we don't need it
*
* @codeCoverageIgnore * @codeCoverageIgnore
*/ */
protected function dump_dbg() protected function dump_dbg()
@ -973,11 +975,11 @@ class Readability implements LoggerAwareInterface
* Using a variety of metrics (content score, classname, element types), find the content that is * Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div. * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
* *
* @param \DOMElement $page * @param ?\DOMElement $page
* *
* @return \DOMElement|false * @return \DOMElement|false
*/ */
protected function grabArticle(\DOMElement $page = null) protected function grabArticle($page = null)
{ {
if (!$page) { if (!$page) {
$page = $this->dom; $page = $this->dom;
@ -992,7 +994,7 @@ class Readability implements LoggerAwareInterface
$allElements = $page->getElementsByTagName('*'); $allElements = $page->getElementsByTagName('*');
for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); ++$nodeIndex) { for ($nodeIndex = 0; $node = $allElements->item($nodeIndex); ++$nodeIndex) {
$tagName = $node->tagName; $tagName = $node->tagName;
$nodeContent = $node->getInnerHTML(); $nodeContent = $node->getInnerHTML();
@ -1136,9 +1138,9 @@ class Readability implements LoggerAwareInterface
// Remove unlikely candidates // Remove unlikely candidates
$unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style'); $unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style');
if (mb_strlen($unlikelyMatchString) > 3 && // don't process "empty" strings if (mb_strlen($unlikelyMatchString) > 3 // don't process "empty" strings
preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && && preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString)
!preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) && !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)
) { ) {
$this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0)); $this->logger->debug('Removing unlikely candidate (using conf) ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
$node->parentNode->removeChild($node); $node->parentNode->removeChild($node);
@ -1289,8 +1291,8 @@ class Readability implements LoggerAwareInterface
// To ensure a node does not interfere with readability styles, remove its classnames & ids. // To ensure a node does not interfere with readability styles, remove its classnames & ids.
// Now done via RegExp post_filter. // Now done via RegExp post_filter.
//$nodeToAppend->removeAttribute('class'); // $nodeToAppend->removeAttribute('class');
//$nodeToAppend->removeAttribute('id'); // $nodeToAppend->removeAttribute('id');
// Append sibling and subtract from our list as appending removes a node. // Append sibling and subtract from our list as appending removes a node.
$articleContent->appendChild($nodeToAppend); $articleContent->appendChild($nodeToAppend);
} }
@ -1430,7 +1432,7 @@ class Readability implements LoggerAwareInterface
unset($tidy); unset($tidy);
} }
$this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8'); $this->html = '<meta charset="utf-8">' . (string) $this->html;
if ('html5lib' === $this->parser || 'html5' === $this->parser) { if ('html5lib' === $this->parser || 'html5' === $this->parser) {
$this->dom = (new HTML5())->loadHTML($this->html); $this->dom = (new HTML5())->loadHTML($this->html);

@ -325,17 +325,18 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
} }
// dummy function to be used to the next test // dummy function to be used to the next test
public function error2Exception($code, $string, $file, $line, $context) public function error2Exception($code, $string, $file, $line)
{ {
throw new \Exception($string, $code); throw new \Exception($string, $code);
} }
public function testAutoClosingIframeNotThrowingException() public function testAutoClosingIframeNotThrowingException()
{ {
error_reporting(\E_ALL | \E_STRICT); $oldErrorReporting = error_reporting(\E_ALL);
ini_set('display_errors', true); $oldDisplayErrors = ini_set('display_errors', true);
set_error_handler([$this, 'error2Exception'], \E_ALL | \E_STRICT); set_error_handler([$this, 'error2Exception']);
try {
$data = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> $data = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="ru-RU" prefix="og: http://ogp.me/ns#"> <html xmlns="http://www.w3.org/1999/xhtml" lang="ru-RU" prefix="og: http://ogp.me/ns#">
@ -358,6 +359,8 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
</div> </div>
</div> </div>
</div> </div>
</div>
</div>
</body> </body>
</html>'; </html>';
@ -371,6 +374,13 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle()); $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
$this->assertStringContainsString('<iframe src="https://www.youtube.com/embed/PUep6xNeKjA" width="560" height="315" frameborder="0" allowfullscreen="allowfullscreen"> </iframe>', $readability->getContent()->getInnerHtml()); $this->assertStringContainsString('<iframe src="https://www.youtube.com/embed/PUep6xNeKjA" width="560" height="315" frameborder="0" allowfullscreen="allowfullscreen"> </iframe>', $readability->getContent()->getInnerHtml());
$this->assertStringContainsString('3D Touch', $readability->getTitle()->getInnerHtml()); $this->assertStringContainsString('3D Touch', $readability->getTitle()->getInnerHtml());
} finally {
restore_error_handler();
if (false !== $oldDisplayErrors) {
ini_set('display_errors', $oldDisplayErrors);
}
error_reporting($oldErrorReporting);
}
} }
/** /**

Loading…
Cancel
Save