Keep h1 and other headings

Even though using h1 tags for sections inside an article is semantically
wrong, a lot of websites do it anyway. So the idea here is to stop
stripping headings, including h1, on Readability's side.

Fixes wallabag/wallabag#5805

Signed-off-by: Kevin Decherf <kevin@kdecherf.com>
pull/75/head
Kevin Decherf 4 years ago
parent 6689f19956
commit 41ef59212f
  1. 9
      src/Readability.php

@ -395,14 +395,17 @@ class Readability implements LoggerAwareInterface
$this->clean($articleContent, 'object'); $this->clean($articleContent, 'object');
$this->clean($articleContent, 'iframe'); $this->clean($articleContent, 'iframe');
$this->clean($articleContent, 'canvas'); $this->clean($articleContent, 'canvas');
$this->clean($articleContent, 'h1');
/* /*
* If there is only one h2, they are probably using it as a main header, so remove it since we * If there is only one h1 or h2, they are probably using it as a main header, so remove it since we
* already have a header. * already have a header.
*/ */
$h1s = $articleContent->getElementsByTagName('h1');
if (1 === $h1s->length && mb_strlen($this->getInnerText($h1s->item(0), true, true)) < 100) {
$this->clean($articleContent, 'h1');
}
$h2s = $articleContent->getElementsByTagName('h2'); $h2s = $articleContent->getElementsByTagName('h2');
if (1 === $h2s->length && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) { if (0 === $h1s->length && 1 === $h2s->length && mb_strlen($this->getInnerText($h2s->item(0), true, true)) < 100) {
$this->clean($articleContent, 'h2'); $this->clean($articleContent, 'h2');
} }

Loading…
Cancel
Save