diff --git a/README.md b/README.md index 05d62aa..27d933a 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,11 @@ [![Build Status](https://travis-ci.org/j0k3r/php-readability.svg?branch=master)](https://travis-ci.org/j0k3r/php-readability) [![Code Coverage](https://scrutinizer-ci.com/g/j0k3r/php-readability/badges/coverage.png?b=master)](https://scrutinizer-ci.com/g/j0k3r/php-readability/?branch=master) -This is an extract of the Readability class from the [full-text-rss](https://github.com/Dither/full-text-rss) fork. It kind be defined as a better version of the original [php-readability](https://bitbucket.org/fivefilters/php-readability/overview). +This is an extract of the Readability class from this [full-text-rss](https://github.com/Dither/full-text-rss) fork. It can be defined as a better version of the original [php-readability](https://bitbucket.org/fivefilters/php-readability/overview). ## Differences -The default php-readability lib is really old and needs to be improved. I found a great fork of [full-text-rss](http://fivefilters.org/content-only/) from @Dither which improve the Readability class. +The default php-readability lib is really old and needs to be improved. I found a great fork of full-text-rss from [@Dither](https://github.com/Dither/full-text-rss) which improves the Readability class. - I've extracted the class from its fork to be able to use it out of the box - I've added some simple tests @@ -15,6 +15,12 @@ The default php-readability lib is really old and needs to be improved. I found **But** the code is still really hard to understand / read ... +## Requirements + +By default, this lib will use the [Tidy extension](https://github.com/htacg/tidy-html5) if it's available. Tidy is only used to clean up the given HTML and avoid problems with bad HTML structure, etc. + +Since Composer doesn't support suggestions for PHP extensions, I'm writing this suggestion here. 
+ ## Usage ```php @@ -26,6 +32,8 @@ $url = 'http://www.medialens.org/index.php/alerts/alert-archive/alerts-2013/729- $html = file_get_contents($url); $readability = new Readability($html, $url); +// or without Tidy +// $readability = new Readability($html, $url, 'libxml', false); $result = $readability->init(); if ($result) { diff --git a/composer.json b/composer.json index 07cbe59..6727aac 100644 --- a/composer.json +++ b/composer.json @@ -24,8 +24,7 @@ "role": "Developer (original JS version)" }], "require": { - "php": ">=5.3.3", - "ext-tidy": ">=1.2" + "php": ">=5.3.3" }, "autoload": { "psr-4": { "Readability\\": "src/" } diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index fe6ce60..e82c1d6 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -19,6 +19,9 @@ class ReadabilityTested extends Readability class ReadabilityTest extends \PHPUnit_Framework_TestCase { + /** + * @requires extension tidy + */ public function testConstructDefault() { $readability = new ReadabilityTested(''); @@ -30,6 +33,9 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase $this->assertInstanceOf('DomDocument', $readability->dom); } + /** + * @requires extension tidy + */ public function testConstructSimple() { $readability = new ReadabilityTested('', 'http://0.0.0.0'); @@ -41,6 +47,28 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase $this->assertInstanceOf('DomDocument', $readability->dom); } + public function testConstructDefaultWithoutTidy() + { + $readability = new ReadabilityTested('', null, 'libxml', false); + + $this->assertNull($readability->url); + $this->assertContains('Parsing URL', $readability->getDebugText()); + $this->assertNotContains('Tidying document', $readability->getDebugText()); + $this->assertNull($readability->getDomainRegexp()); + $this->assertInstanceOf('DomDocument', $readability->dom); + } + + public function testConstructSimpleWithoutTidy() + { + $readability = new ReadabilityTested('', 
'http://0.0.0.0', 'libxml', false); + + $this->assertEquals('http://0.0.0.0', $readability->url); + $this->assertContains('Parsing URL', $readability->getDebugText()); + $this->assertNotContains('Tidying document', $readability->getDebugText()); + $this->assertEquals('/0\.0\.0\.0/', $readability->getDomainRegexp()); + $this->assertInstanceOf('DomDocument', $readability->dom); + } + public function testInitNoContent() { $readability = new ReadabilityTested('', 'http://0.0.0.0');