From 5182d6cb111e5a61c6332b61625d1ef990dcd9e3 Mon Sep 17 00:00:00 2001 From: Jeremy Benoist Date: Sun, 2 Oct 2016 14:49:43 +0200 Subject: [PATCH] =?UTF-8?q?=E2=80=9Cinfo=E2=80=9D=20is=20too=20agressive?= =?UTF-8?q?=20in=20unlikelyCandidates?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some contents have an `infocontent` node (or something different) and they are real content. Using only `info` as regex is too aggressive and removes legitimate content. Matching the whole word `info` (or `infos`) should be a better choice --- src/Readability.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index f120c9f..d33c97a 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -66,7 +66,7 @@ class Readability implements LoggerAwareInterface public $tidied = false; // article domain regexp for calibration protected $domainRegExp = null; - protected $body = null; // + protected $body = null; // Cache the body HTML in case we need to re-use it later protected $bodyCache = null; // 1 | 2 | 4; // Start with all processing flags set. @@ -83,7 +83,7 @@ class Readability implements LoggerAwareInterface * Defined up here so we don't instantiate them repeatedly in loops. 
*/ public $regexps = array( - 'unlikelyCandidates' => '/display\s*:\s*none|ignore|\binfo|annoy|clock|date|time|author|intro|links|hidd?e|about|archive|\bprint|bookmark|tags|tag-list|share|search|social|robot|published|combx|comment|mast(?:head)|subscri|community|category|disqus|extra|head|head(?:er|note)|floor|foot(?:er|note)|menu|tool|function|nav|remark|rss|shoutbox|tool|widget|meta|banner|sponsor|adsense|inner-?ad|ad-|sponsor|\badv\b|\bads\b|agr?egate?|pager|sidebar|popup|tweet|twitter/i', + 'unlikelyCandidates' => '/display\s*:\s*none|ignore|\binfos?\b|annoy|clock|date|time|author|intro|links|hidd?e|about|archive|\bprint|bookmark|tags|tag-list|share|search|social|robot|published|combx|comment|mast(?:head)|subscri|community|category|disqus|extra|head|head(?:er|note)|floor|foot(?:er|note)|menu|tool|function|nav|remark|rss|shoutbox|tool|widget|meta|banner|sponsor|adsense|inner-?ad|ad-|sponsor|\badv\b|\bads\b|agr?egate?|pager|sidebar|popup|tweet|twitter/i', 'okMaybeItsACandidate' => '/article\b|contain|\bcontent|column|general|detail|shadow|lightbox|blog|body|entry|main|page/i', 'positive' => '/read|full|article|body|\bcontent|contain|entry|main|markdown|page|attach|pagination|post|text|blog|story/i', 'negative' => '/bottom|stat|info|discuss|e[\-]?mail|comment|reply|log.{2}(n|ed)|sign|single|combx|com-|contact|_nav|link|media|\bout|promo|\bad-|related|scroll|shoutbox|sidebar|sponsor|shopping|teaser|recommend/i',