Merge pull request #52 from nicofrand/master

Skip empty (empty innerHTML) nodes when grabbing article
pull/65/head 1.2.7
Jérémy Benoist 5 years ago committed by GitHub
commit 9a490fac07
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 10
      src/Readability.php

@@ -993,6 +993,12 @@ class Readability implements LoggerAwareInterface
for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); ++$nodeIndex) {
$tagName = $node->tagName;
+$nodeContent = $node->getInnerHTML();
+if (empty($nodeContent)) {
+$this->logger->debug('Skipping empty node');
+continue;
+}
// Some well known site uses sections as paragraphs.
if (0 === strcasecmp($tagName, 'p') || 0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'pre') || 0 === strcasecmp($tagName, 'section')) {
$nodesToScore[] = $node;
@@ -1001,11 +1007,11 @@ class Readability implements LoggerAwareInterface
// Turn divs into P tags where they have been used inappropriately
// (as in, where they contain no other block level elements).
if (0 === strcasecmp($tagName, 'div') || 0 === strcasecmp($tagName, 'article') || 0 === strcasecmp($tagName, 'section')) {
-if (!preg_match($this->regexps['divToPElements'], $node->getInnerHTML())) {
+if (!preg_match($this->regexps['divToPElements'], $nodeContent)) {
$newNode = $this->dom->createElement('p');
try {
-$newNode->setInnerHtml($node->getInnerHTML());
+$newNode->setInnerHtml($nodeContent);
$node->parentNode->replaceChild($newNode, $node);
--$nodeIndex;

Loading…
Cancel
Save