diff --git a/.php-cs-fixer.php b/.php-cs-fixer.php index 5f09a0c..d0c927c 100644 --- a/.php-cs-fixer.php +++ b/.php-cs-fixer.php @@ -28,8 +28,6 @@ return (new PhpCsFixer\Config()) 'concat_space' => ['spacing' => 'one'], // Pulled in by @Symfony:risky but we still support PHP 7.4 'modernize_strpos' => false, - // Pulled in by @Symfony, we cannot add property types until we bump PHP to ≥ 7.4 - 'no_null_property_initialization' => false, ]) ->setFinder($finder) ; diff --git a/src/Readability.php b/src/Readability.php index e7e1db5..e87aec6 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -24,26 +24,36 @@ class Readability implements LoggerAwareInterface public const MIN_ARTICLE_LENGTH = 200; public const MIN_NODE_LENGTH = 80; public const MAX_LINK_DENSITY = 0.25; - public $convertLinksToFootnotes = false; - public $revertForcedParagraphElements = false; - public $articleTitle; - public $articleContent; - public $original_html; + + public bool $convertLinksToFootnotes = false; + public bool $revertForcedParagraphElements = false; + + public ?\DOMElement $articleTitle; + + public ?\DOMElement $articleContent; + + public ?string $original_html; + + public ?\DOMDocument $dom; + /** - * @var \DOMDocument + * @var ?string URL where HTML was retrieved */ - public $dom; - // optional - URL where HTML was retrieved - public $url = null; - // preserves more content (experimental) - public $lightClean = true; - public $tidied = false; + public ?string $url = null; /** - * All of the regular expressions in use within readability. + * @var bool preserves more content (experimental) + */ + public bool $lightClean = true; + + public bool $tidied = false; + + /** + * @var array All of the regular expressions in use within readability. + * * Defined up here so we don't instantiate them repeatedly in loops. */ - public $regexps = [ + public array $regexps = [ 'unlikelyCandidates' => '/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', 'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page|footnote|element/i', 'positive' => '/read|full|article|body|\bcontent|contain|entry|main|markdown|media|page|attach|pagination|post|text|blog|story/i', @@ -55,10 +65,18 @@ class Readability implements LoggerAwareInterface 'hasContent' => '/\S$/', 'isNotVisible' => '/display\s*:\s*none/', ]; - public $defaultTagsToScore = ['section', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'td', 'pre']; - // The commented out elements qualify as phrasing content but tend to be - // removed by readability when put into paragraphs, so we ignore them here. - public $phrasingElements = [ + + /** + * @var array + */ + public array $defaultTagsToScore = ['section', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'td', 'pre']; + + /** + * @var array + */ + public array $phrasingElements = [ + // The commented out elements qualify as phrasing content but tend to be + // removed by readability when put into paragraphs, so we ignore them here. // "CANVAS", "IFRAME", "SVG", "VIDEO", 'ABBR', 'AUDIO', 'B', 'BDO', 'BR', 'BUTTON', 'CITE', 'CODE', 'DATA', 'DATALIST', 'DFN', 'EM', 'EMBED', 'I', 'IMG', 'INPUT', 'KBD', 'LABEL', @@ -66,7 +84,11 @@ class Readability implements LoggerAwareInterface 'RUBY', 'SAMP', 'SCRIPT', 'SELECT', 'SMALL', 'SPAN', 'STRONG', 'SUB', 'SUP', 'TEXTAREA', 'TIME', 'VAR', 'WBR', ]; - public $tidy_config = [ + + /** + * @var array + */ + public array $tidy_config = [ 'tidy-mark' => false, 'vertical-space' => false, 'doctype' => 'omit', @@ -90,21 +112,41 @@ class Readability implements LoggerAwareInterface 'output-encoding' => 'utf8', 'hide-comments' => true, ]; - // article domain regexp for calibration - protected $domainRegExp = null; - protected $body = null; - // Cache the body HTML in case we need to re-use it later - protected $bodyCache = null; - // 1 | 2 | 4; // Start with all processing flags set. - protected $flags = 7; - // indicates whether we were able to extract or not - protected $success = false; - protected $logger; - protected $parser; - protected $html; - protected $useTidy; - // raw HTML filters - protected $pre_filters = [ + + /** + * @var ?string article domain regexp for calibration + */ + protected ?string $domainRegExp = null; + + protected ?\DOMElement $body = null; + + /** + * @var ?string Cache the body HTML in case we need to re-use it later + */ + protected ?string $bodyCache = null; + + /** + * @var int-mask-of start with all processing flags set + */ + protected int $flags = self::FLAG_STRIP_UNLIKELYS | self::FLAG_WEIGHT_ATTRIBUTES | self::FLAG_CLEAN_CONDITIONALLY; + + /** + * @var bool indicates whether we were able to extract or not + */ + protected bool $success = false; + + protected LoggerInterface $logger; + + protected string $parser; + + protected string $html; + + protected bool $useTidy; + + /** + * @var array raw HTML filters + */ + protected array $pre_filters = [ // remove spans as we redefine styles and they're probably special-styled '!]*>!is' => '', // HACK: firewall-filtered content @@ -116,8 +158,11 @@ class Readability implements LoggerAwareInterface // replace fonts to spans '!<(/?)font[^>]*>!is' => '<\\1span>', ]; - // output HTML filters - protected $post_filters = [ + + /** + * @var array output HTML filters + */ + protected array $post_filters = [ // replace excessive br's '/\s*

'articleTitle; } /** * Get article content element. - * - * @return \DOMElement */ - public function getContent() + public function getContent(): \DOMElement { return $this->articleContent; } @@ -452,12 +493,8 @@ class Readability implements LoggerAwareInterface /** * Get the inner text of a node. * This also strips out any excess whitespace to be found. - * - * @param \DOMElement $e - * @param bool $normalizeSpaces (default: true) - * @param bool $flattenLines (default: false) */ - public function getInnerText($e, bool $normalizeSpaces = true, bool $flattenLines = false): string + public function getInnerText(?\DOMNode $e, bool $normalizeSpaces = true, bool $flattenLines = false): string { if (null === $e || !isset($e->textContent) || '' === $e->textContent) { return ''; @@ -750,10 +787,8 @@ class Readability implements LoggerAwareInterface /** * Get the article title as an H1. - * - * @return \DOMElement */ - protected function getArticleTitle() + protected function getArticleTitle(): \DOMElement { try { $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index 2fee95f..cce4568 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -10,10 +10,8 @@ use Readability\Readability; class ReadabilityTest extends \PHPUnit\Framework\TestCase { - /** @var TestHandler */ - public $logHandler; - /** @var LoggerInterface */ - public $logger; + public TestHandler $logHandler; + public LoggerInterface $logger; /** * @requires extension tidy @@ -323,7 +321,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $oldErrorReporting = error_reporting(\E_ALL | \E_STRICT); $oldDisplayErrors = ini_set('display_errors', '1'); // dummy function to be used to the next test - set_error_handler(function (int $errno, string $errstr, string $errfile, int $errline, array $errcontext) { + set_error_handler(function (int $errno, string $errstr, string $errfile, int $errline, array $errcontext): bool { throw new \Exception($errstr, $errno); }, \E_ALL | \E_STRICT);