commit
881e441bdf
10 changed files with 1911 additions and 0 deletions
@ -0,0 +1,12 @@ |
||||
; top-most EditorConfig file |
||||
root = true |
||||
|
||||
; Unix-style newlines |
||||
[*] |
||||
end_of_line = LF |
||||
|
||||
[*.php] |
||||
indent_style = space |
||||
indent_size = 4 |
||||
trim_trailing_whitespace = true |
||||
insert_final_newline = true |
||||
@ -0,0 +1 @@ |
||||
vendor/ |
||||
@ -0,0 +1,16 @@ |
||||
language: php |
||||
|
||||
php: |
||||
- 5.2 |
||||
- 5.3 |
||||
- 5.4 |
||||
- 5.5 |
||||
- 5.6 |
||||
|
||||
before_script: |
||||
- composer self-update |
||||
- echo 'date.timezone = "Europe/Paris"' >> ~/.phpenv/versions/$(phpenv version-name)/etc/php.ini |
||||
- composer install --prefer-dist --no-interaction |
||||
|
||||
script: |
||||
- phpunit --coverage-text |
||||
@ -0,0 +1,201 @@ |
||||
Apache License |
||||
Version 2.0, January 2004 |
||||
http://www.apache.org/licenses/ |
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION |
||||
|
||||
1. Definitions. |
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction, |
||||
and distribution as defined by Sections 1 through 9 of this document. |
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by |
||||
the copyright owner that is granting the License. |
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all |
||||
other entities that control, are controlled by, or are under common |
||||
control with that entity. For the purposes of this definition, |
||||
"control" means (i) the power, direct or indirect, to cause the |
||||
direction or management of such entity, whether by contract or |
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the |
||||
outstanding shares, or (iii) beneficial ownership of such entity. |
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity |
||||
exercising permissions granted by this License. |
||||
|
||||
"Source" form shall mean the preferred form for making modifications, |
||||
including but not limited to software source code, documentation |
||||
source, and configuration files. |
||||
|
||||
"Object" form shall mean any form resulting from mechanical |
||||
transformation or translation of a Source form, including but |
||||
not limited to compiled object code, generated documentation, |
||||
and conversions to other media types. |
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or |
||||
Object form, made available under the License, as indicated by a |
||||
copyright notice that is included in or attached to the work |
||||
(an example is provided in the Appendix below). |
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object |
||||
form, that is based on (or derived from) the Work and for which the |
||||
editorial revisions, annotations, elaborations, or other modifications |
||||
represent, as a whole, an original work of authorship. For the purposes |
||||
of this License, Derivative Works shall not include works that remain |
||||
separable from, or merely link (or bind by name) to the interfaces of, |
||||
the Work and Derivative Works thereof. |
||||
|
||||
"Contribution" shall mean any work of authorship, including |
||||
the original version of the Work and any modifications or additions |
||||
to that Work or Derivative Works thereof, that is intentionally |
||||
submitted to Licensor for inclusion in the Work by the copyright owner |
||||
or by an individual or Legal Entity authorized to submit on behalf of |
||||
the copyright owner. For the purposes of this definition, "submitted" |
||||
means any form of electronic, verbal, or written communication sent |
||||
to the Licensor or its representatives, including but not limited to |
||||
communication on electronic mailing lists, source code control systems, |
||||
and issue tracking systems that are managed by, or on behalf of, the |
||||
Licensor for the purpose of discussing and improving the Work, but |
||||
excluding communication that is conspicuously marked or otherwise |
||||
designated in writing by the copyright owner as "Not a Contribution." |
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity |
||||
on behalf of whom a Contribution has been received by Licensor and |
||||
subsequently incorporated within the Work. |
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of |
||||
this License, each Contributor hereby grants to You a perpetual, |
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable |
||||
copyright license to reproduce, prepare Derivative Works of, |
||||
publicly display, publicly perform, sublicense, and distribute the |
||||
Work and such Derivative Works in Source or Object form. |
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of |
||||
this License, each Contributor hereby grants to You a perpetual, |
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable |
||||
(except as stated in this section) patent license to make, have made, |
||||
use, offer to sell, sell, import, and otherwise transfer the Work, |
||||
where such license applies only to those patent claims licensable |
||||
by such Contributor that are necessarily infringed by their |
||||
Contribution(s) alone or by combination of their Contribution(s) |
||||
with the Work to which such Contribution(s) was submitted. If You |
||||
institute patent litigation against any entity (including a |
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work |
||||
or a Contribution incorporated within the Work constitutes direct |
||||
or contributory patent infringement, then any patent licenses |
||||
granted to You under this License for that Work shall terminate |
||||
as of the date such litigation is filed. |
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the |
||||
Work or Derivative Works thereof in any medium, with or without |
||||
modifications, and in Source or Object form, provided that You |
||||
meet the following conditions: |
||||
|
||||
(a) You must give any other recipients of the Work or |
||||
Derivative Works a copy of this License; and |
||||
|
||||
(b) You must cause any modified files to carry prominent notices |
||||
stating that You changed the files; and |
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works |
||||
that You distribute, all copyright, patent, trademark, and |
||||
attribution notices from the Source form of the Work, |
||||
excluding those notices that do not pertain to any part of |
||||
the Derivative Works; and |
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its |
||||
distribution, then any Derivative Works that You distribute must |
||||
include a readable copy of the attribution notices contained |
||||
within such NOTICE file, excluding those notices that do not |
||||
pertain to any part of the Derivative Works, in at least one |
||||
of the following places: within a NOTICE text file distributed |
||||
as part of the Derivative Works; within the Source form or |
||||
documentation, if provided along with the Derivative Works; or, |
||||
within a display generated by the Derivative Works, if and |
||||
wherever such third-party notices normally appear. The contents |
||||
of the NOTICE file are for informational purposes only and |
||||
do not modify the License. You may add Your own attribution |
||||
notices within Derivative Works that You distribute, alongside |
||||
or as an addendum to the NOTICE text from the Work, provided |
||||
that such additional attribution notices cannot be construed |
||||
as modifying the License. |
||||
|
||||
You may add Your own copyright statement to Your modifications and |
||||
may provide additional or different license terms and conditions |
||||
for use, reproduction, or distribution of Your modifications, or |
||||
for any such Derivative Works as a whole, provided Your use, |
||||
reproduction, and distribution of the Work otherwise complies with |
||||
the conditions stated in this License. |
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise, |
||||
any Contribution intentionally submitted for inclusion in the Work |
||||
by You to the Licensor shall be under the terms and conditions of |
||||
this License, without any additional terms or conditions. |
||||
Notwithstanding the above, nothing herein shall supersede or modify |
||||
the terms of any separate license agreement you may have executed |
||||
with Licensor regarding such Contributions. |
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade |
||||
names, trademarks, service marks, or product names of the Licensor, |
||||
except as required for reasonable and customary use in describing the |
||||
origin of the Work and reproducing the content of the NOTICE file. |
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or |
||||
agreed to in writing, Licensor provides the Work (and each |
||||
Contributor provides its Contributions) on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
||||
implied, including, without limitation, any warranties or conditions |
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A |
||||
PARTICULAR PURPOSE. You are solely responsible for determining the |
||||
appropriateness of using or redistributing the Work and assume any |
||||
risks associated with Your exercise of permissions under this License. |
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory, |
||||
whether in tort (including negligence), contract, or otherwise, |
||||
unless required by applicable law (such as deliberate and grossly |
||||
negligent acts) or agreed to in writing, shall any Contributor be |
||||
liable to You for damages, including any direct, indirect, special, |
||||
incidental, or consequential damages of any character arising as a |
||||
result of this License or out of the use or inability to use the |
||||
Work (including but not limited to damages for loss of goodwill, |
||||
work stoppage, computer failure or malfunction, or any and all |
||||
other commercial damages or losses), even if such Contributor |
||||
has been advised of the possibility of such damages. |
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing |
||||
the Work or Derivative Works thereof, You may choose to offer, |
||||
and charge a fee for, acceptance of support, warranty, indemnity, |
||||
or other liability obligations and/or rights consistent with this |
||||
License. However, in accepting such obligations, You may act only |
||||
on Your own behalf and on Your sole responsibility, not on behalf |
||||
of any other Contributor, and only if You agree to indemnify, |
||||
defend, and hold each Contributor harmless for any liability |
||||
incurred by, or claims asserted against, such Contributor by reason |
||||
of your accepting any such warranty or additional liability. |
||||
|
||||
END OF TERMS AND CONDITIONS |
||||
|
||||
APPENDIX: How to apply the Apache License to your work. |
||||
|
||||
To apply the Apache License to your work, attach the following |
||||
boilerplate notice, with the fields enclosed by brackets "{}" |
||||
replaced with your own identifying information. (Don't include |
||||
the brackets!) The text should be enclosed in the appropriate |
||||
comment syntax for the file format. We also recommend that a |
||||
file or class name and description of purpose be included on the |
||||
same "printed page" as the copyright notice for easier |
||||
identification within third-party archives. |
||||
|
||||
Copyright {yyyy} {name of copyright owner} |
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); |
||||
you may not use this file except in compliance with the License. |
||||
You may obtain a copy of the License at |
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
||||
Unless required by applicable law or agreed to in writing, software |
||||
distributed under the License is distributed on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
See the License for the specific language governing permissions and |
||||
limitations under the License. |
||||
@ -0,0 +1,38 @@ |
||||
# Readability |
||||
|
||||
[![Build Status](https://travis-ci.org/j0k3r/php-readability.svg?branch=master)](https://travis-ci.org/j0k3r/php-readability) |
||||
|
||||
This is an extract of the Readability class from the [full-text-rss](https://github.com/Dither/full-text-rss) fork. It can be described as a better version of the original [php-readability](http://code.fivefilters.org/php-readability). |
||||
|
||||
## Differences |
||||
|
||||
The default php-readability lib is really old and needs to be improved. I found a great fork of [full-text-rss](http://fivefilters.org/content-only/) from @Dither which improves the Readability class. |
||||
|
||||
- I've extracted the class from its fork to be able to use it out of the box |
||||
- I've added some simple tests |
||||
- I've changed the coding style (by running `php-cs-fixer`) and added a namespace |
||||
|
||||
**But** the code is still really hard to understand / read ... |
||||
|
||||
## Usage |
||||
|
||||
```php |
||||
use Readability\Readability; |
||||
|
||||
$url = 'http://www.medialens.org/index.php/alerts/alert-archive/alerts-2013/729-thatcher.html'; |
||||
|
||||
// you can use whatever you want to retrieve the html content (Guzzle, Buzz, cURL ...) |
||||
$html = file_get_contents($url); |
||||
|
||||
$readability = new Readability($html, $url); |
||||
$result = $readability->init(); |
||||
|
||||
if ($result) { |
||||
// display the title of the page |
||||
echo $readability->getTitle()->textContent; |
||||
// display the *readability* content |
||||
echo $readability->getContent()->textContent; |
||||
} else { |
||||
echo 'Looks like we couldn\'t find the content. :('; |
||||
} |
||||
``` |
||||
@ -0,0 +1,33 @@ |
||||
{ |
||||
"name": "j0k3r/php-readability", |
||||
"type": "library", |
||||
"description": "Automatic article extraction from HTML", |
||||
"keywords": ["article extraction","content extraction","extraction","article","content","html"], |
||||
"license": "Apache-2.0", |
||||
"authors": [{ |
||||
"name": "Jeremy Benoist", |
||||
"email": "jeremy.benoist@gmail.com", |
||||
"homepage": "http://www.j0k3r.net", |
||||
"role": "Developer" |
||||
},{ |
||||
"name": "DitherSky", |
||||
"homepage": "https://github.com/Dither", |
||||
"role": "Developer (https://github.com/Dither/full-text-rss)" |
||||
},{ |
||||
"name": "Keyvan Minoukadeh", |
||||
"email": "keyvan@keyvan.net", |
||||
"homepage": "http://keyvan.net", |
||||
"role": "Developer (ported original JS code to PHP)" |
||||
},{ |
||||
"name": "Arc90", |
||||
"homepage": "http://arc90.com", |
||||
"role": "Developer (original JS version)" |
||||
}], |
||||
"require": { |
||||
"php": ">=5.3.0", |
||||
"ext-tidy": ">=1.2" |
||||
}, |
||||
"autoload": { |
||||
"psr-4": { "Readability\\": "src/" } |
||||
} |
||||
} |
||||
@ -0,0 +1,29 @@ |
||||
<?xml version="1.0" encoding="UTF-8"?> |
||||
<phpunit backupGlobals="false" |
||||
backupStaticAttributes="false" |
||||
colors="true" |
||||
convertErrorsToExceptions="true" |
||||
convertNoticesToExceptions="true" |
||||
convertWarningsToExceptions="true" |
||||
processIsolation="false" |
||||
stopOnFailure="false" |
||||
syntaxCheck="false" |
||||
bootstrap="vendor/autoload.php" |
||||
> |
||||
|
||||
<testsuites> |
||||
<testsuite name="Readability Test Suite"> |
||||
<directory>./tests/</directory> |
||||
</testsuite> |
||||
</testsuites> |
||||
|
||||
<filter> |
||||
<whitelist> |
||||
<directory>./src/</directory> |
||||
<exclude> |
||||
<directory>./tests</directory> |
||||
</exclude> |
||||
</whitelist> |
||||
</filter> |
||||
|
||||
</phpunit> |
||||
@ -0,0 +1,115 @@ |
||||
<?php |
||||
|
||||
namespace Readability; |
||||
|
||||
/** |
||||
* JavaScript-like HTML DOM Element |
||||
* |
||||
* This class extends PHP's DOMElement to allow |
||||
* users to get and set the innerHTML property of |
||||
* HTML elements in the same way it's done in |
||||
* JavaScript. |
||||
* |
||||
* Example usage: |
||||
* require_once 'JSLikeHTMLElement.php'; |
||||
* header('Content-Type: text/plain'); |
||||
* $doc = new DOMDocument(); |
||||
* $doc->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement'); |
||||
* $doc->loadHTML('<div><p>Para 1</p><p>Para 2</p></div>'); |
||||
* $elem = $doc->getElementsByTagName('div')->item(0); |
||||
* |
||||
* // print innerHTML |
||||
* echo $elem->innerHTML; // prints '<p>Para 1</p><p>Para 2</p>' |
||||
* echo "\n\n"; |
||||
* |
||||
* // set innerHTML |
||||
* $elem->innerHTML = '<a href="http://fivefilters.org">FiveFilters.org</a>'; |
||||
* echo $elem->innerHTML; // prints '<a href="http://fivefilters.org">FiveFilters.org</a>' |
||||
* echo "\n\n"; |
||||
* |
||||
* // print document (with our changes) |
||||
* echo $doc->saveXML(); |
||||
* |
||||
* @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net |
||||
* @see http://fivefilters.org (the project this was written for) |
||||
*/ |
||||
class JSLikeHTMLElement extends \DOMElement
{
    /**
     * Magic setter that emulates writing to JavaScript's innerHTML:
     * @code
     * $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>';
     * @endcode
     *
     * All current children are removed first. A non-empty $value is then
     * parsed as XML; if that fails it is parsed again as (possibly broken)
     * HTML and the resulting nodes are imported. If both parsers fail the
     * element is simply left empty. Writing any other property raises an
     * E_USER_NOTICE.
     *
     * @param string $name  name of the property being written
     * @param mixed  $value markup to use as the new content
     */
    public function __set($name, $value)
    {
        if ($name != 'innerHTML') {
            // the backtrace is taken here so that frame 0 still describes the caller
            $trace = debug_backtrace();
            trigger_error('Undefined property via __set(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);

            return;
        }

        // start from a clean slate: strip the children, last one first
        while ($this->hasChildNodes()) {
            $this->removeChild($this->lastChild);
        }

        // nothing to insert
        if ($value == '') {
            return;
        }

        // first attempt: appendXML() only accepts well-formed markup (XHTML)
        $fragment = $this->ownerDocument->createDocumentFragment();
        $parsed = @$fragment->appendXML($value); // @ hides the parser warnings
        if ($parsed) {
            if ($fragment->hasChildNodes()) {
                $this->appendChild($fragment);
            }

            return;
        }

        // second attempt: the markup is probably ill-formed, so let the HTML
        // parser repair it. The made-up <htmlfragment> wrapper triggers a
        // warning (suppressed, as bad HTML would trigger one anyway) but gives
        // us a handle on our nodes without keeping the <html><body> wrapper
        // that loadHTML() adds. loadHTML() still returns true on success.
        $htmlDoc = new \DOMDocument();
        $encoded = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8');
        $loaded = @$htmlDoc->loadHTML('<htmlfragment>'.$encoded.'</htmlfragment>');
        if (!$loaded) {
            // both parsers gave up; the element stays empty
            return;
        }

        $wrapper = $htmlDoc->getElementsByTagName('htmlfragment')->item(0);
        foreach ($wrapper->childNodes as $node) {
            // nodes belong to $htmlDoc, so deep-copy them into our own document
            $this->appendChild($this->ownerDocument->importNode($node, true));
        }
    }

    /**
     * Magic getter that emulates reading JavaScript's innerHTML:
     * @code
     * $string = $div->innerHTML;
     * @endcode
     *
     * @param string $name name of the property being read
     *
     * @return string|null the children serialised with saveXML() and
     *                     concatenated, or null (after an E_USER_NOTICE)
     *                     for any property other than innerHTML
     */
    public function __get($name)
    {
        if ($name != 'innerHTML') {
            // the backtrace is taken here so that frame 0 still describes the caller
            $trace = debug_backtrace();
            trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);

            return null;
        }

        $markup = '';
        foreach ($this->childNodes as $node) {
            $markup .= $this->ownerDocument->saveXML($node);
        }

        return $markup;
    }

    /**
     * String form of the element: its tag name wrapped in square brackets.
     *
     * @return string e.g. "[div]"
     */
    public function __toString()
    {
        return sprintf('[%s]', $this->tagName);
    }
}
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,228 @@ |
||||
<?php |
||||
|
||||
namespace Tests\Readability; |
||||
|
||||
use Readability\Readability; |
||||
use Readability\JSLikeHTMLElement; |
||||
|
||||
/**
 * Thin subclass used only by the tests below: it adds public accessors for
 * two pieces of Readability state that the assertions need to inspect.
 *
 * NOTE(review): the properties are presumably non-public on Readability,
 * which is why a subclass is needed; confirm against that class.
 */
class ReadabilityTested extends Readability
{
    /**
     * @return mixed the current value of $this->debugText
     */
    public function getDebugText()
    {
        $text = $this->debugText;

        return $text;
    }

    /**
     * @return mixed the current value of $this->domainRegExp
     */
    public function getDomainRegexp()
    {
        $pattern = $this->domainRegExp;

        return $pattern;
    }
}
||||
|
||||
/**
 * Behavioural tests for Readability, driven through the ReadabilityTested
 * subclass so that the debug log and the domain regexp can be inspected.
 *
 * Repeated fixture construction and content assertions live in the private
 * helpers at the top of the class; each test keeps its own markup and its own
 * sequence of expectations.
 */
class ReadabilityTest extends \PHPUnit_Framework_TestCase
{
    protected function setUp()
    {
    }

    protected function tearDown()
    {
    }

    /**
     * Builds the subject under test for the dummy URL shared by most tests.
     *
     * @param string $html markup handed to the constructor
     *
     * @return ReadabilityTested
     */
    private function buildReadability($html)
    {
        return new ReadabilityTested($html, 'http://0.0.0.0');
    }

    /**
     * Repeats the paragraph-with-a-link fixture used by several tests.
     *
     * @param int $count number of repetitions
     *
     * @return string
     */
    private function linkParagraphs($count)
    {
        return str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', $count);
    }

    /**
     * Asserts that the two log entries expected after construction are present.
     */
    private function assertConstructionLogged(ReadabilityTested $readability)
    {
        foreach (array('Parsing URL', 'Tidying document') as $entry) {
            $this->assertContains($entry, $readability->getDebugText());
        }
    }

    /**
     * Asserts that getContent() returns a JSLikeHTMLElement.
     */
    private function assertContentIsElement(ReadabilityTested $readability)
    {
        $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
    }

    /**
     * Asserts that the extracted content's innerHTML contains $needle.
     */
    private function assertContentContains($needle, ReadabilityTested $readability)
    {
        $this->assertContains($needle, $readability->getContent()->innerHTML);
    }

    /**
     * Asserts that the extracted content's innerHTML does not contain $needle.
     */
    private function assertContentNotContains($needle, ReadabilityTested $readability)
    {
        $this->assertNotContains($needle, $readability->getContent()->innerHTML);
    }

    public function testConstructDefault()
    {
        $readability = new ReadabilityTested('');

        $this->assertNull($readability->url);
        $this->assertConstructionLogged($readability);
        $this->assertNull($readability->getDomainRegexp());
        $this->assertInstanceOf('DomDocument', $readability->dom);
    }

    public function testConstructSimple()
    {
        $readability = $this->buildReadability('<html/>');

        $this->assertEquals('http://0.0.0.0', $readability->url);
        $this->assertConstructionLogged($readability);
        $this->assertEquals('/0\.0\.0\.0/', $readability->getDomainRegexp());
        $this->assertInstanceOf('DomDocument', $readability->dom);
    }

    public function testInitNoContent()
    {
        $readability = $this->buildReadability('<html/>');

        $this->assertFalse($readability->init());
        $this->assertContentIsElement($readability);
        $this->assertContentContains('Sorry, Readability was unable to parse this page for content.', $readability);
    }

    public function testInitP()
    {
        $html = str_repeat('<p>This is the awesome content :)</p>', 7);
        $readability = $this->buildReadability($html);

        $this->assertTrue($readability->init());
        $this->assertContentIsElement($readability);
        $this->assertContentContains('<div readability=', $readability);
        $this->assertContentContains('This is the awesome content :)', $readability);
    }

    public function testInitDivP()
    {
        $html = '<div>'.str_repeat('<p>This is the awesome content :)</p>', 7).'</div>';
        $readability = $this->buildReadability($html);

        $this->assertTrue($readability->init());
        $this->assertContentIsElement($readability);
        $this->assertContentContains('<div readability=', $readability);
        $this->assertContentContains('This is the awesome content :)', $readability);
    }

    public function testInitDiv()
    {
        $html = '<div>'.str_repeat('This is the awesome content :)', 7).'</div>';
        $readability = $this->buildReadability($html);
        $readability->debug = true;

        $this->assertTrue($readability->init());
        $this->assertContentIsElement($readability);
        $this->assertContentContains('<div readability=', $readability);
        $this->assertContentContains('This is the awesome content :)', $readability);
    }

    public function testWithFootnotes()
    {
        $html = '<div>'.$this->linkParagraphs(7).'</div>';
        $readability = $this->buildReadability($html);
        $readability->debug = true;
        $readability->convertLinksToFootnotes = true;

        $this->assertTrue($readability->init());
        $this->assertContentIsElement($readability);
        $this->assertContentContains('<div readability=', $readability);
        $this->assertContentContains('This is an awesome text with some links, here there are', $readability);
        $this->assertContentContains('readabilityFootnoteLink', $readability);
        $this->assertContentContains('readabilityLink-3', $readability);
    }

    public function testStandardClean()
    {
        $html = '<div><h2>Title</h2>'.$this->linkParagraphs(7).'<a href="#nofollow" rel="nofollow">will be removed</a></div>';
        $readability = $this->buildReadability($html);
        $readability->debug = true;
        $readability->lightClean = false;

        $this->assertTrue($readability->init());
        $this->assertContentIsElement($readability);
        $this->assertContentContains('<div readability=', $readability);
        $this->assertContentContains('This is an awesome text with some links, here there are', $readability);
        $this->assertContentNotContains('will be removed', $readability);
        $this->assertContentNotContains('<h2>', $readability);
    }

    public function testWithIframe()
    {
        $html = '<div><h2>Title</h2>'.$this->linkParagraphs(7).'<p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>';
        $readability = $this->buildReadability($html);
        $readability->debug = true;

        $this->assertTrue($readability->init());
        $this->assertContentIsElement($readability);
        $this->assertContentContains('<div readability=', $readability);
        $this->assertContentContains('This is an awesome text with some links, here there are', $readability);
        $this->assertContentContains('nofollow', $readability);
    }

    public function testWithArticle()
    {
        $html = '<article><p>'.str_repeat('This is an awesome text with some links, here there are: the awesome', 20).'</p><p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe></p></article>';
        $readability = $this->buildReadability($html);
        $readability->debug = true;

        $this->assertTrue($readability->init());
        $this->assertContentIsElement($readability);
        $this->assertContentContains('alt="article"', $readability);
        $this->assertContentContains('This is an awesome text with some links, here there are', $readability);
        $this->assertContentContains('nofollow', $readability);
    }

    public function testWithAside()
    {
        $html = '<article>'.$this->linkParagraphs(7).'<footer><aside>'.str_repeat('<p>This is an awesome text with some links, here there are</p>', 8).'</aside></footer></article>';
        $readability = $this->buildReadability($html);
        $readability->debug = true;

        $this->assertTrue($readability->init());
        $this->assertContentIsElement($readability);
        $this->assertContentContains('alt="article"', $readability);
        $this->assertContentContains('This is an awesome text with some links, here there are', $readability);
        $this->assertContentNotContains('<aside>', $readability);
        $this->assertContentContains('<footer/>', $readability);
    }

    public function testWithClasses()
    {
        $html = '<article>'.$this->linkParagraphs(7).'<div style="display:none">'.str_repeat('<p class="clock">This text should be removed</p>', 10).'</div></article>';
        $readability = $this->buildReadability($html);
        $readability->debug = true;

        $this->assertTrue($readability->init());
        $this->assertContentIsElement($readability);
        $this->assertContentContains('alt="article"', $readability);
        $this->assertContentContains('This is an awesome text with some links, here there are', $readability);
        $this->assertContentNotContains('This text should be removed', $readability);
    }

    public function testWithTd()
    {
        $html = '<table><tr>'.str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7).'</tr></table>';
        $readability = $this->buildReadability($html);
        $readability->debug = true;

        $this->assertTrue($readability->init());
        $this->assertContentIsElement($readability);
        $this->assertContentContains('alt="tr"', $readability);
        $this->assertContentContains('This is an awesome text with some links, here there are', $readability);
    }

    public function testWithSameClasses()
    {
        $html = '<article class="awesomecontent">'.str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7).'<div class="awesomecontent">This text is also an awesome text and you should know that !</div></article>';
        $readability = $this->buildReadability($html);
        $readability->debug = true;

        $this->assertTrue($readability->init());
        $this->assertContentIsElement($readability);
        $this->assertContentContains('alt="article"', $readability);
        $this->assertContentContains('This is an awesome text with some links, here there are', $readability);
        $this->assertContentContains('This text is also an awesome text and you should know that', $readability);
    }

    public function testWithScript()
    {
        $html = '<article class="awesomecontent">'.str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7).'<p><script>This text is also an awesome text and you should know that !</script></p></article>';
        $readability = $this->buildReadability($html);
        $readability->debug = true;

        $this->assertTrue($readability->init());
        $this->assertContentIsElement($readability);
        $this->assertContentContains('alt="article"', $readability);
        $this->assertContentContains('This is an awesome text with some links, here there are', $readability);
        $this->assertContentNotContains('This text is also an awesome text and you should know that', $readability);
    }

    // Disabled test: it passes a parser name ('html5lib') as a third
    // constructor argument. Kept commented out, as in the original.
    //
    // public function testConstructParser()
    // {
    //     $readability = new ReadabilityTested('<html/>', 'http://0.0.0.0', 'html5lib');

    //     $this->assertEquals('http://0.0.0.0', $readability->url);
    //     $this->assertContains('Parsing URL', $readability->getDebugText());
    //     $this->assertContains('Tidying document', $readability->getDebugText());
    //     $this->assertEquals('/0\.0\.0\.0/', $readability->getDomainRegexp());
    //     $this->assertInstanceOf('DomDocument', $readability->dom);
    // }
}
||||
Loading…
Reference in new issue