From f64b06d90d835d5088301d2f74c997a9faf7ac1e Mon Sep 17 00:00:00 2001 From: pavel Date: Mon, 10 Mar 2025 03:03:50 +0200 Subject: [PATCH] add getVideos --- src/Readability.php | 207 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 195 insertions(+), 12 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index b2b4d09..e74cd0d 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -2,6 +2,7 @@ namespace Readability; +use DOMElement; use Masterminds\HTML5; use Psr\Log\LoggerAwareInterface; use Psr\Log\LoggerInterface; @@ -62,11 +63,45 @@ class Readability implements LoggerAwareInterface // removed by readability when put into paragraphs, so we ignore them here. public $phrasingElements = [ // "CANVAS", "IFRAME", "SVG", "VIDEO", - 'ABBR', 'AUDIO', 'B', 'BDO', 'BR', 'BUTTON', 'CITE', 'CODE', 'DATA', - 'DATALIST', 'DFN', 'EM', 'EMBED', 'I', 'IMG', 'INPUT', 'KBD', 'LABEL', - 'MARK', 'MATH', 'METER', 'NOSCRIPT', 'OBJECT', 'OUTPUT', 'PROGRESS', 'Q', - 'RUBY', 'SAMP', 'SCRIPT', 'SELECT', 'SMALL', 'SPAN', 'STRONG', 'SUB', - 'SUP', 'TEXTAREA', 'TIME', 'VAR', 'WBR', + 'ABBR', + 'AUDIO', + 'B', + 'BDO', + 'BR', + 'BUTTON', + 'CITE', + 'CODE', + 'DATA', + 'DATALIST', + 'DFN', + 'EM', + 'EMBED', + 'I', + 'IMG', + 'INPUT', + 'KBD', + 'LABEL', + 'MARK', + 'MATH', + 'METER', + 'NOSCRIPT', + 'OBJECT', + 'OUTPUT', + 'PROGRESS', + 'Q', + 'RUBY', + 'SAMP', + 'SCRIPT', + 'SELECT', + 'SMALL', + 'SPAN', + 'STRONG', + 'SUB', + 'SUP', + 'TEXTAREA', + 'TIME', + 'VAR', + 'WBR', ]; public $tidy_config = [ 'tidy-mark' => false, @@ -934,7 +969,8 @@ class Readability implements LoggerAwareInterface // Remove unlikely candidates $unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id') . ' ' . $node->getAttribute('style'); - if (mb_strlen($unlikelyMatchString) > 3 // don't process "empty" strings + if ( + mb_strlen($unlikelyMatchString) > 3 // don't process "empty" strings && preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) ) { @@ -1118,7 +1154,7 @@ class Readability implements LoggerAwareInterface $topCandidates = array_filter( $topCandidates, - fn ($v, $idx) => 0 === $idx || null !== $v, + fn($v, $idx) => 0 === $idx || null !== $v, \ARRAY_FILTER_USE_BOTH ); $topCandidate = $topCandidates[0]; @@ -1253,7 +1289,8 @@ class Readability implements LoggerAwareInterface $nodeLength = mb_strlen($nodeContent); if (($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY) - || ($nodeLength < self::MIN_NODE_LENGTH && 0 === $nodeLength && 0 === $linkDensity && preg_match('/\.( |$)/', $nodeContent))) { + || ($nodeLength < self::MIN_NODE_LENGTH && 0 === $nodeLength && 0 === $linkDensity && preg_match('/\.( |$)/', $nodeContent)) + ) { $append = true; } } @@ -1465,7 +1502,7 @@ class Readability implements LoggerAwareInterface && !\in_array( false, array_map( - fn ($c) => $this->isPhrasingContent($c), + fn($c) => $this->isPhrasingContent($c), iterator_to_array($node->childNodes) ), true @@ -1481,7 +1518,7 @@ class Readability implements LoggerAwareInterface private function hasSingleTagInsideElement(\DOMElement $node, string $tag): bool { $childNodes = iterator_to_array($node->childNodes); - $children = array_filter($childNodes, fn ($childNode) => $childNode instanceof \DOMElement); + $children = array_filter($childNodes, fn($childNode) => $childNode instanceof \DOMElement); // There should be exactly 1 element child with given tag if (1 !== \count($children) || $children[0]->nodeName !== $tag) { @@ -1490,7 +1527,7 @@ class Readability implements LoggerAwareInterface $a = array_filter( $childNodes, - fn ($childNode) => $childNode instanceof \DOMText && preg_match($this->regexps['hasContent'], $this->getInnerText($childNode)) + fn($childNode) => $childNode instanceof \DOMText && preg_match($this->regexps['hasContent'], $this->getInnerText($childNode)) ); return 0 === \count($a); @@ -1508,7 +1545,7 @@ class Readability implements LoggerAwareInterface $node->hasAttribute('style') && preg_match($this->regexps['isNotVisible'], $node->getAttribute('style')) ) - && !$node->hasAttribute('hidden'); + && !$node->hasAttribute('hidden'); } /** @@ -1551,4 +1588,150 @@ class Readability implements LoggerAwareInterface // Fallback – just plop the at the start of the fragment. return $charsetTag . $html; } + + /** + * Tries to get video urls + * + * @return array + */ + public function getVideos(): array + { + $videos = []; + + // Create a DOMXPath to query the document + $xpath = new \DOMXPath($this->dom); + + // Grab all nodes in one pass + // This finds