diff --git a/src/Readability.php b/src/Readability.php index c554cb9..c250f05 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -46,14 +46,27 @@ class Readability implements LoggerAwareInterface * Defined up here so we don't instantiate them repeatedly in loops. */ public $regexps = [ - 'unlikelyCandidates' => '/display\s*:\s*none|ignore|\binfos?\b|annoy|clock|date|time|author|intro|hidd?e|about|archive|\bprint|bookmark|tags|tag-list|share|search|social|robot|published|combx|comment|mast(?:head)|subscri|community|category|disqus|extra|head|head(?:er|note)|floor|foot(?:er|note)|menu|tool\b|function|nav|remark|rss|shoutbox|widget|meta|banner|sponsor|adsense|inner-?ad|ad-|sponsor|\badv\b|\bads\b|agr?egate?|pager|sidebar|popup|tweet|twitter/i', - 'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page|footnote/i', + 'unlikelyCandidates' => '/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', + 'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page|footnote|element/i', 'positive' => '/read|full|article|body|\bcontent|contain|entry|main|markdown|media|page|attach|pagination|post|text|blog|story/i', 'negative' => '/bottom|stat|info|discuss|e[\-]?mail|comment|reply|log.{2}(n|ed)|sign|single|combx|com-|contact|_nav|link|media|promo|\bad-|related|scroll|shoutbox|sidebar|sponsor|shopping|teaser|recommend/i', 'divToPElements' => '/<(?:blockquote|header|section|code|div|article|footer|aside|img|p|pre|dl|ol|ul)/mi', 'killBreaks' => '/(([ \r\n\s]| ?)*)+/', 'media' => '!//(?:[^\.\?/]+\.)?(?:youtu(?:be)?|giphy|soundcloud|dailymotion|vimeo|pornhub|xvideos|twitvid|rutube|openload\.co|viddler)\.(?:com|be|org|net)/!i', 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i', + 'hasContent' => '/\S$/', + 'isNotVisible' => '/display\s*:\s*none/', + ]; + public $defaultTagsToScore = ['section', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'td', 'pre']; + // The commented out elements qualify as phrasing content but tend to be + // removed by readability when put into paragraphs, so we ignore them here. + public $phrasingElements = [ + // "CANVAS", "IFRAME", "SVG", "VIDEO", + 'ABBR', 'AUDIO', 'B', 'BDO', 'BR', 'BUTTON', 'CITE', 'CODE', 'DATA', + 'DATALIST', 'DFN', 'EM', 'EMBED', 'I', 'IMG', 'INPUT', 'KBD', 'LABEL', + 'MARK', 'MATH', 'METER', 'NOSCRIPT', 'OBJECT', 'OUTPUT', 'PROGRESS', 'Q', + 'RUBY', 'SAMP', 'SCRIPT', 'SELECT', 'SMALL', 'SPAN', 'STRONG', 'SUB', + 'SUP', 'TEXTAREA', 'TIME', 'VAR', 'WBR', ]; public $tidy_config = [ 'tidy-mark' => false, @@ -62,7 +75,7 @@ class Readability implements LoggerAwareInterface 'numeric-entities' => false, // 'preserve-entities' => true, 'break-before-br' => false, - 'clean' => true, + 'clean' => false, 'output-xhtml' => true, 'logical-emphasis' => true, 'show-body-only' => false, @@ -485,7 +498,7 @@ class Readability implements LoggerAwareInterface */ public function getCommaCount(string $text): int { - return substr_count($text, ','); + return \count(explode(',', $text)); } /** @@ -609,6 +622,9 @@ class Readability implements LoggerAwareInterface $contentScore = ($node->hasAttribute('readability')) ? (int) $node->getAttribute('readability') : 0; $this->logger->debug('Start conditional cleaning of ' . $node->getNodePath() . ' (class=' . $node->getAttribute('class') . '; id=' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : '')); + // XXX Incomplete implementation + $isList = \in_array($node->tagName, ['ul', 'ol'], true); + if ($weight + $contentScore < 0) { $this->logger->debug('Removing...'); $node->parentNode->removeChild($node); @@ -643,16 +659,16 @@ class Readability implements LoggerAwareInterface $toRemove = false; if ($this->lightClean) { - if ($li > $p && 'ul' !== $tag && 'ol' !== $tag) { + if (!$isList && $li > $p) { $this->logger->debug(' too many
  • elements, and parent is not