diff --git a/.gitattributes b/.gitattributes index 8770321..5e1411f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -7,3 +7,4 @@ /README.md export-ignore /phpunit.xml.dist export-ignore /tests export-ignore +/phpstan.neon export-ignore diff --git a/.travis.yml b/.travis.yml index cdf1bd5..a669b13 100644 --- a/.travis.yml +++ b/.travis.yml @@ -29,12 +29,14 @@ install: - composer self-update before_script: + - if [ "$CS_FIXER" = "run" ]; then composer require phpstan/phpstan phpstan/phpstan-phpunit --dev -n ; fi; - composer install -o --prefer-dist --no-interaction script: - mkdir -p build/logs - php vendor/bin/simple-phpunit -v --coverage-clover build/logs/clover.xml - if [ "$CS_FIXER" = "run" ]; then php vendor/bin/php-cs-fixer fix --verbose --dry-run ; fi; + - if [ "$CS_FIXER" = "run" ]; then php vendor/bin/phpstan analyse src tests --no-progress --level 1 ; fi; after_script: - php vendor/bin/php-coveralls -v -x build/logs/clover.xml diff --git a/composer.json b/composer.json index 318cb65..c789dbf 100644 --- a/composer.json +++ b/composer.json @@ -40,5 +40,8 @@ }, "autoload": { "psr-4": { "Readability\\": "src/" } + }, + "autoload-dev": { + "psr-4": { "Tests\\Readability\\": "tests/" } } } diff --git a/phpstan.neon b/phpstan.neon new file mode 100644 index 0000000..334ba3a --- /dev/null +++ b/phpstan.neon @@ -0,0 +1,8 @@ +includes: + - vendor/phpstan/phpstan-phpunit/extension.neon + - vendor/phpstan/phpstan-phpunit/rules.neon + +parameters: + # https://github.com/phpstan/phpstan/issues/694#issuecomment-350724288 + autoload_files: + - vendor/bin/.phpunit/phpunit-6.5/vendor/autoload.php diff --git a/src/JSLikeHTMLElement.php b/src/JSLikeHTMLElement.php index b908d06..f67cd69 100644 --- a/src/JSLikeHTMLElement.php +++ b/src/JSLikeHTMLElement.php @@ -127,4 +127,14 @@ class JSLikeHTMLElement extends \DOMElement { return '[' . $this->tagName . ']'; } + + public function getInnerHtml() + { + return $this->__get('innerHTML'); + } + + public function setInnerHtml($value) + { + return $this->__set('innerHTML', $value); + } } diff --git a/src/Readability.php b/src/Readability.php index 8d0aa33..ad22691 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -72,6 +72,9 @@ class Readability implements LoggerAwareInterface public $articleTitle; public $articleContent; public $original_html; + /** + * @var \DOMDocument + */ public $dom; // optional - URL where HTML was retrieved public $url = null; @@ -169,10 +172,10 @@ class Readability implements LoggerAwareInterface /** * Create instance of Readability. * - * @param string UTF-8 encoded string - * @param string (optional) URL associated with HTML (for footnotes) - * @param string (optional) Which parser to use for turning raw HTML into a DOMDocument - * @param bool (optional) Use tidy + * @param string $html UTF-8 encoded string + * @param string $url URL associated with HTML (for footnotes) + * @param string $parser Which parser to use for turning raw HTML into a DOMDocument + * @param bool $use_tidy Use tidy */ public function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true) { @@ -213,8 +216,8 @@ class Readability implements LoggerAwareInterface /** * Add pre filter for raw input HTML processing. * - * @param string RegExp for replace - * @param string (optional) Replacer + * @param string $filter RegExp for replace + * @param string $replacer Replacer */ public function addPreFilter($filter, $replacer = '') { @@ -224,8 +227,8 @@ class Readability implements LoggerAwareInterface /** * Add post filter for raw output HTML processing. * - * @param string RegExp for replace - * @param string (optional) Replacer + * @param string $filter RegExp for replace + * @param string $replacer Replacer */ public function addPostFilter($filter, $replacer = '') { @@ -258,7 +261,7 @@ class Readability implements LoggerAwareInterface if (null === $this->bodyCache) { $this->bodyCache = ''; foreach ($bodyElems as $bodyNode) { - $this->bodyCache .= trim($bodyNode->innerHTML); + $this->bodyCache .= trim($bodyNode->getInnerHTML()); } } @@ -278,7 +281,7 @@ class Readability implements LoggerAwareInterface $this->success = false; $articleContent = $this->dom->createElement('div'); $articleContent->setAttribute('class', 'readability-content'); - $articleContent->innerHTML = '
Sorry, Readability was unable to parse this page for content.
'; + $articleContent->setInnerHtml('Sorry, Readability was unable to parse this page for content.
'); } $overlay->setAttribute('class', 'readOverlay'); @@ -290,7 +293,7 @@ class Readability implements LoggerAwareInterface $overlay->appendChild($innerDiv); // Clear the old HTML, insert the new content. - $this->body->innerHTML = ''; + $this->body->setInnerHtml(''); $this->body->appendChild($overlay); $this->body->removeAttribute('style'); $this->postProcessContent($articleContent); @@ -307,7 +310,7 @@ class Readability implements LoggerAwareInterface * * @param \DOMElement $articleContent */ - public function postProcessContent($articleContent) + public function postProcessContent(\DOMElement $articleContent) { if ($this->convertLinksToFootnotes && !preg_match('/\bwiki/', $this->url)) { $this->addFootnotes($articleContent); @@ -321,11 +324,11 @@ class Readability implements LoggerAwareInterface * * @param \DOMElement $articleContent */ - public function addFootnotes($articleContent) + public function addFootnotes(\DOMElement $articleContent) { $footnotesWrapper = $this->dom->createElement('footer'); $footnotesWrapper->setAttribute('class', 'readability-footnotes'); - $footnotesWrapper->innerHTML = ' tags, etc.
*
- * @param \DOMElement $articleContent
+ * @param \DOMNode $articleContent
*/
- public function prepArticle($articleContent)
+ public function prepArticle(\DOMNode $articleContent)
{
+ if (!$articleContent instanceof \DOMElement) {
+ return;
+ }
+
$this->logger->debug($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.');
$this->cleanStyles($articleContent);
@@ -467,7 +474,7 @@ class Readability implements LoggerAwareInterface
if (!$this->flagIsActive(self::FLAG_DISABLE_POSTFILTER)) {
try {
foreach ($this->post_filters as $search => $replace) {
- $articleContent->innerHTML = preg_replace($search, $replace, $articleContent->innerHTML);
+ $articleContent->setInnerHtml(preg_replace($search, $replace, $articleContent->getInnerHTML()));
}
unset($search, $replace);
} catch (\Exception $e) {
@@ -552,11 +559,11 @@ class Readability implements LoggerAwareInterface
* Can exclude external references to differentiate between simple text and menus/infoblocks.
*
* @param \DOMElement $e
- * @param string $excludeExternal
+ * @param bool $excludeExternal
*
* @return int
*/
- public function getLinkDensity($e, $excludeExternal = false)
+ public function getLinkDensity(\DOMElement $e, $excludeExternal = false)
{
$links = $e->getElementsByTagName('a');
$textLength = mb_strlen($this->getInnerText($e, true, true));
@@ -583,7 +590,7 @@ class Readability implements LoggerAwareInterface
*
* @return int
*/
- public function getWeight($e)
+ public function getWeight(\DOMElement $e)
{
if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
return 0;
@@ -603,11 +610,11 @@ class Readability implements LoggerAwareInterface
*
* @param \DOMElement $node
*/
- public function killBreaks($node)
+ public function killBreaks(\DOMElement $node)
{
- $html = $node->innerHTML;
+ $html = $node->getInnerHTML();
$html = preg_replace($this->regexps['killBreaks'], '
', $html);
- $node->innerHTML = $html;
+ $node->setInnerHtml($html);
}
/**
@@ -619,7 +626,7 @@ class Readability implements LoggerAwareInterface
* @param \DOMElement $e
* @param string $tag
*/
- public function clean($e, $tag)
+ public function clean(\DOMElement $e, $tag)
{
$currentItem = null;
$targetList = $e->getElementsByTagName($tag);
@@ -638,7 +645,7 @@ class Readability implements LoggerAwareInterface
}
// Then check the elements inside this element for the same.
- if (preg_match($this->regexps['media'], $targetList->item($y)->innerHTML)) {
+ if (preg_match($this->regexps['media'], $targetList->item($y)->getInnerHTML())) {
continue;
}
}
@@ -655,7 +662,7 @@ class Readability implements LoggerAwareInterface
* @param \DOMElement $e
* @param string $tag
*/
- public function cleanConditionally($e, $tag)
+ public function cleanConditionally(\DOMElement $e, $tag)
{
if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
return;
@@ -768,7 +775,7 @@ class Readability implements LoggerAwareInterface
*
* @param \DOMElement $e
*/
- public function cleanHeaders($e)
+ public function cleanHeaders(\DOMElement $e)
{
for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
$headers = $e->getElementsByTagName('h' . $headerIndex);
@@ -871,7 +878,7 @@ class Readability implements LoggerAwareInterface
}
$articleTitle = $this->dom->createElement('h1');
- $articleTitle->innerHTML = $curTitle;
+ $articleTitle->setInnerHtml($curTitle);
return $articleTitle;
}
@@ -911,7 +918,7 @@ class Readability implements LoggerAwareInterface
*
* @param \DOMElement $node
*/
- protected function initializeNode($node)
+ protected function initializeNode(\DOMElement $node)
{
if (!isset($node->tagName)) {
return;
@@ -981,9 +988,9 @@ class Readability implements LoggerAwareInterface
*
* @param \DOMElement $page
*
- * @return \DOMElement|bool
+ * @return \DOMElement|false
*/
- protected function grabArticle($page = null)
+ protected function grabArticle(\DOMElement $page = null)
{
if (!$page) {
$page = $this->dom;
@@ -1009,11 +1016,11 @@ class Readability implements LoggerAwareInterface
// Turn divs into P tags where they have been used inappropriately
// (as in, where they contain no other block level elements).
if (0 === strcasecmp($tagName, 'div') || 0 === strcasecmp($tagName, 'article') || 0 === strcasecmp($tagName, 'section')) {
- if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
+ if (!preg_match($this->regexps['divToPElements'], $node->getInnerHTML())) {
$newNode = $this->dom->createElement('p');
try {
- $newNode->innerHTML = $node->innerHTML;
+ $newNode->setInnerHtml($node->getInnerHTML());
$node->parentNode->replaceChild($newNode, $node);
--$nodeIndex;
@@ -1040,7 +1047,7 @@ class Readability implements LoggerAwareInterface
if (XML_TEXT_NODE === $childNode->nodeType) {
$p = $this->dom->createElement('p');
- $p->innerHTML = $childNode->nodeValue;
+ $p->setInnerHtml($childNode->nodeValue);
$p->setAttribute('data-readability-styled', 'true');
$childNode->parentNode->replaceChild($p, $childNode);
}
@@ -1190,14 +1197,14 @@ class Readability implements LoggerAwareInterface
$this->logger->debug('The page has no body!');
} else {
$this->logger->debug('Setting body to a raw HTML of original page!');
- $topCandidate->innerHTML = $page->documentElement->innerHTML;
- $page->documentElement->innerHTML = '';
+ $topCandidate->setInnerHtml($page->documentElement->getInnerHTML());
+ $page->documentElement->setInnerHtml('');
$this->reinitBody();
$page->documentElement->appendChild($topCandidate);
}
} else {
- $topCandidate->innerHTML = $page->innerHTML;
- $page->innerHTML = '';
+ $topCandidate->setInnerHtml($page->getInnerHTML());
+ $page->setInnerHtml('');
$page->appendChild($topCandidate);
}
@@ -1229,8 +1236,8 @@ class Readability implements LoggerAwareInterface
$siblingScoreThreshold = max(10, ((int) $topCandidate->getAttribute('readability')) * 0.2);
$siblingNodes = $topCandidate->parentNode->childNodes;
- if (!isset($siblingNodes)) {
- $siblingNodes = new stdClass();
+ if (null === $siblingNodes) {
+ $siblingNodes = new \stdClass();
$siblingNodes->length = 0;
}
@@ -1276,7 +1283,7 @@ class Readability implements LoggerAwareInterface
try {
$nodeToAppend->setAttribute('alt', $siblingNodeName);
- $nodeToAppend->innerHTML = $siblingNode->innerHTML;
+ $nodeToAppend->setInnerHtml($siblingNode->getInnerHTML());
} catch (\Exception $e) {
$this->logger->debug('Could not alter siblingNode "' . $siblingNodeName . '" to "div", reverting to original.');
$nodeToAppend = $siblingNode;
@@ -1344,7 +1351,7 @@ class Readability implements LoggerAwareInterface
*
* @return int
*/
- protected function weightAttribute($element, $attribute)
+ protected function weightAttribute(\DOMElement $element, $attribute)
{
if (!$element->hasAttribute($attribute)) {
return 0;
@@ -1379,7 +1386,7 @@ class Readability implements LoggerAwareInterface
{
if (!isset($this->body->childNodes)) {
$this->body = $this->dom->createElement('body');
- $this->body->innerHTML = $this->bodyCache;
+ $this->body->setInnerHtml($this->bodyCache);
}
}
@@ -1435,17 +1442,16 @@ class Readability implements LoggerAwareInterface
$this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
- if (!('html5lib' === $this->parser && ($this->dom = Parser::parse($this->html)))) {
+ if ('html5lib' === $this->parser) {
+ $this->dom = Parser::parse($this->html);
+ }
+
+ if ('libxml' === $this->parser) {
libxml_use_internal_errors(true);
$this->dom = new \DOMDocument();
$this->dom->preserveWhiteSpace = false;
-
- if (\PHP_VERSION_ID >= 50400) {
- $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
- } else {
- $this->dom->loadHTML($this->html);
- }
+ $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
libxml_use_internal_errors(false);
}
diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php
index 5a3efdd..dd58990 100644
--- a/tests/ReadabilityTest.php
+++ b/tests/ReadabilityTest.php
@@ -73,8 +73,8 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertFalse($res);
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
- $this->assertEmpty($readability->getTitle()->innerHTML);
- $this->assertContains('Sorry, Readability was unable to parse this page for content.', $readability->getContent()->innerHTML);
+ $this->assertEmpty($readability->getTitle()->getInnerHtml());
+ $this->assertContains('Sorry, Readability was unable to parse this page for content.', $readability->getContent()->getInnerHtml());
}
public function testInitP()
@@ -85,9 +85,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertTrue($res);
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
- $this->assertContains('