commit
881e441bdf
10 changed files with 1911 additions and 0 deletions
@ -0,0 +1,12 @@ |
|||||||
|
; top-most EditorConfig file |
||||||
|
root = true |
||||||
|
|
||||||
|
; Unix-style newlines |
||||||
|
[*] |
||||||
|
end_of_line = LF |
||||||
|
|
||||||
|
[*.php] |
||||||
|
indent_style = space |
||||||
|
indent_size = 4 |
||||||
|
trim_trailing_whitespace = true |
||||||
|
insert_final_newline = true |
||||||
@ -0,0 +1 @@ |
|||||||
|
vendor/ |
||||||
@ -0,0 +1,16 @@ |
|||||||
|
language: php |
||||||
|
|
||||||
|
php: |
||||||
|
- 5.2 |
||||||
|
- 5.3 |
||||||
|
- 5.4 |
||||||
|
- 5.5 |
||||||
|
- 5.6 |
||||||
|
|
||||||
|
before_script: |
||||||
|
- composer self-update |
||||||
|
- echo 'date.timezone = "Europe/Paris"' >> ~/.phpenv/versions/$(phpenv version-name)/etc/php.ini |
||||||
|
- composer install --prefer-dist --no-interaction |
||||||
|
|
||||||
|
script: |
||||||
|
- phpunit --coverage-text |
||||||
@ -0,0 +1,201 @@ |
|||||||
|
Apache License |
||||||
|
Version 2.0, January 2004 |
||||||
|
http://www.apache.org/licenses/ |
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION |
||||||
|
|
||||||
|
1. Definitions. |
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction, |
||||||
|
and distribution as defined by Sections 1 through 9 of this document. |
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by |
||||||
|
the copyright owner that is granting the License. |
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all |
||||||
|
other entities that control, are controlled by, or are under common |
||||||
|
control with that entity. For the purposes of this definition, |
||||||
|
"control" means (i) the power, direct or indirect, to cause the |
||||||
|
direction or management of such entity, whether by contract or |
||||||
|
otherwise, or (ii) ownership of fifty percent (50%) or more of the |
||||||
|
outstanding shares, or (iii) beneficial ownership of such entity. |
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity |
||||||
|
exercising permissions granted by this License. |
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications, |
||||||
|
including but not limited to software source code, documentation |
||||||
|
source, and configuration files. |
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical |
||||||
|
transformation or translation of a Source form, including but |
||||||
|
not limited to compiled object code, generated documentation, |
||||||
|
and conversions to other media types. |
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or |
||||||
|
Object form, made available under the License, as indicated by a |
||||||
|
copyright notice that is included in or attached to the work |
||||||
|
(an example is provided in the Appendix below). |
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object |
||||||
|
form, that is based on (or derived from) the Work and for which the |
||||||
|
editorial revisions, annotations, elaborations, or other modifications |
||||||
|
represent, as a whole, an original work of authorship. For the purposes |
||||||
|
of this License, Derivative Works shall not include works that remain |
||||||
|
separable from, or merely link (or bind by name) to the interfaces of, |
||||||
|
the Work and Derivative Works thereof. |
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including |
||||||
|
the original version of the Work and any modifications or additions |
||||||
|
to that Work or Derivative Works thereof, that is intentionally |
||||||
|
submitted to Licensor for inclusion in the Work by the copyright owner |
||||||
|
or by an individual or Legal Entity authorized to submit on behalf of |
||||||
|
the copyright owner. For the purposes of this definition, "submitted" |
||||||
|
means any form of electronic, verbal, or written communication sent |
||||||
|
to the Licensor or its representatives, including but not limited to |
||||||
|
communication on electronic mailing lists, source code control systems, |
||||||
|
and issue tracking systems that are managed by, or on behalf of, the |
||||||
|
Licensor for the purpose of discussing and improving the Work, but |
||||||
|
excluding communication that is conspicuously marked or otherwise |
||||||
|
designated in writing by the copyright owner as "Not a Contribution." |
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity |
||||||
|
on behalf of whom a Contribution has been received by Licensor and |
||||||
|
subsequently incorporated within the Work. |
||||||
|
|
||||||
|
2. Grant of Copyright License. Subject to the terms and conditions of |
||||||
|
this License, each Contributor hereby grants to You a perpetual, |
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable |
||||||
|
copyright license to reproduce, prepare Derivative Works of, |
||||||
|
publicly display, publicly perform, sublicense, and distribute the |
||||||
|
Work and such Derivative Works in Source or Object form. |
||||||
|
|
||||||
|
3. Grant of Patent License. Subject to the terms and conditions of |
||||||
|
this License, each Contributor hereby grants to You a perpetual, |
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable |
||||||
|
(except as stated in this section) patent license to make, have made, |
||||||
|
use, offer to sell, sell, import, and otherwise transfer the Work, |
||||||
|
where such license applies only to those patent claims licensable |
||||||
|
by such Contributor that are necessarily infringed by their |
||||||
|
Contribution(s) alone or by combination of their Contribution(s) |
||||||
|
with the Work to which such Contribution(s) was submitted. If You |
||||||
|
institute patent litigation against any entity (including a |
||||||
|
cross-claim or counterclaim in a lawsuit) alleging that the Work |
||||||
|
or a Contribution incorporated within the Work constitutes direct |
||||||
|
or contributory patent infringement, then any patent licenses |
||||||
|
granted to You under this License for that Work shall terminate |
||||||
|
as of the date such litigation is filed. |
||||||
|
|
||||||
|
4. Redistribution. You may reproduce and distribute copies of the |
||||||
|
Work or Derivative Works thereof in any medium, with or without |
||||||
|
modifications, and in Source or Object form, provided that You |
||||||
|
meet the following conditions: |
||||||
|
|
||||||
|
(a) You must give any other recipients of the Work or |
||||||
|
Derivative Works a copy of this License; and |
||||||
|
|
||||||
|
(b) You must cause any modified files to carry prominent notices |
||||||
|
stating that You changed the files; and |
||||||
|
|
||||||
|
(c) You must retain, in the Source form of any Derivative Works |
||||||
|
that You distribute, all copyright, patent, trademark, and |
||||||
|
attribution notices from the Source form of the Work, |
||||||
|
excluding those notices that do not pertain to any part of |
||||||
|
the Derivative Works; and |
||||||
|
|
||||||
|
(d) If the Work includes a "NOTICE" text file as part of its |
||||||
|
distribution, then any Derivative Works that You distribute must |
||||||
|
include a readable copy of the attribution notices contained |
||||||
|
within such NOTICE file, excluding those notices that do not |
||||||
|
pertain to any part of the Derivative Works, in at least one |
||||||
|
of the following places: within a NOTICE text file distributed |
||||||
|
as part of the Derivative Works; within the Source form or |
||||||
|
documentation, if provided along with the Derivative Works; or, |
||||||
|
within a display generated by the Derivative Works, if and |
||||||
|
wherever such third-party notices normally appear. The contents |
||||||
|
of the NOTICE file are for informational purposes only and |
||||||
|
do not modify the License. You may add Your own attribution |
||||||
|
notices within Derivative Works that You distribute, alongside |
||||||
|
or as an addendum to the NOTICE text from the Work, provided |
||||||
|
that such additional attribution notices cannot be construed |
||||||
|
as modifying the License. |
||||||
|
|
||||||
|
You may add Your own copyright statement to Your modifications and |
||||||
|
may provide additional or different license terms and conditions |
||||||
|
for use, reproduction, or distribution of Your modifications, or |
||||||
|
for any such Derivative Works as a whole, provided Your use, |
||||||
|
reproduction, and distribution of the Work otherwise complies with |
||||||
|
the conditions stated in this License. |
||||||
|
|
||||||
|
5. Submission of Contributions. Unless You explicitly state otherwise, |
||||||
|
any Contribution intentionally submitted for inclusion in the Work |
||||||
|
by You to the Licensor shall be under the terms and conditions of |
||||||
|
this License, without any additional terms or conditions. |
||||||
|
Notwithstanding the above, nothing herein shall supersede or modify |
||||||
|
the terms of any separate license agreement you may have executed |
||||||
|
with Licensor regarding such Contributions. |
||||||
|
|
||||||
|
6. Trademarks. This License does not grant permission to use the trade |
||||||
|
names, trademarks, service marks, or product names of the Licensor, |
||||||
|
except as required for reasonable and customary use in describing the |
||||||
|
origin of the Work and reproducing the content of the NOTICE file. |
||||||
|
|
||||||
|
7. Disclaimer of Warranty. Unless required by applicable law or |
||||||
|
agreed to in writing, Licensor provides the Work (and each |
||||||
|
Contributor provides its Contributions) on an "AS IS" BASIS, |
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
||||||
|
implied, including, without limitation, any warranties or conditions |
||||||
|
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A |
||||||
|
PARTICULAR PURPOSE. You are solely responsible for determining the |
||||||
|
appropriateness of using or redistributing the Work and assume any |
||||||
|
risks associated with Your exercise of permissions under this License. |
||||||
|
|
||||||
|
8. Limitation of Liability. In no event and under no legal theory, |
||||||
|
whether in tort (including negligence), contract, or otherwise, |
||||||
|
unless required by applicable law (such as deliberate and grossly |
||||||
|
negligent acts) or agreed to in writing, shall any Contributor be |
||||||
|
liable to You for damages, including any direct, indirect, special, |
||||||
|
incidental, or consequential damages of any character arising as a |
||||||
|
result of this License or out of the use or inability to use the |
||||||
|
Work (including but not limited to damages for loss of goodwill, |
||||||
|
work stoppage, computer failure or malfunction, or any and all |
||||||
|
other commercial damages or losses), even if such Contributor |
||||||
|
has been advised of the possibility of such damages. |
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability. While redistributing |
||||||
|
the Work or Derivative Works thereof, You may choose to offer, |
||||||
|
and charge a fee for, acceptance of support, warranty, indemnity, |
||||||
|
or other liability obligations and/or rights consistent with this |
||||||
|
License. However, in accepting such obligations, You may act only |
||||||
|
on Your own behalf and on Your sole responsibility, not on behalf |
||||||
|
of any other Contributor, and only if You agree to indemnify, |
||||||
|
defend, and hold each Contributor harmless for any liability |
||||||
|
incurred by, or claims asserted against, such Contributor by reason |
||||||
|
of your accepting any such warranty or additional liability. |
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS |
||||||
|
|
||||||
|
APPENDIX: How to apply the Apache License to your work. |
||||||
|
|
||||||
|
To apply the Apache License to your work, attach the following |
||||||
|
boilerplate notice, with the fields enclosed by brackets "{}" |
||||||
|
replaced with your own identifying information. (Don't include |
||||||
|
the brackets!) The text should be enclosed in the appropriate |
||||||
|
comment syntax for the file format. We also recommend that a |
||||||
|
file or class name and description of purpose be included on the |
||||||
|
same "printed page" as the copyright notice for easier |
||||||
|
identification within third-party archives. |
||||||
|
|
||||||
|
Copyright {yyyy} {name of copyright owner} |
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
you may not use this file except in compliance with the License. |
||||||
|
You may obtain a copy of the License at |
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software |
||||||
|
distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
See the License for the specific language governing permissions and |
||||||
|
limitations under the License. |
||||||
@ -0,0 +1,38 @@ |
|||||||
|
# Readability |
||||||
|
|
||||||
|
[](https://travis-ci.org/j0k3r/php-readability) |
||||||
|
|
||||||
|
This is an extract of the Readability class from the [full-text-rss](https://github.com/Dither/full-text-rss) fork. It can be defined as a better version of the original [php-readability](http://code.fivefilters.org/php-readability). |
||||||
|
|
||||||
|
## Differences |
||||||
|
|
||||||
|
The default php-readability lib is really old and needs to be improved. I found a great fork of [full-text-rss](http://fivefilters.org/content-only/) from @Dither which improves the Readability class. |
||||||
|
|
||||||
|
- I've extracted the class from its fork to be able to use it out of the box |
||||||
|
- I've added some simple tests |
||||||
|
- and changed the CS, run `php-cs-fixer` and added a namespace |
||||||
|
|
||||||
|
**But** the code is still really hard to understand / read ... |
||||||
|
|
||||||
|
## Usage |
||||||
|
|
||||||
|
```php |
||||||
|
use Readability\Readability; |
||||||
|
|
||||||
|
$url = 'http://www.medialens.org/index.php/alerts/alert-archive/alerts-2013/729-thatcher.html'; |
||||||
|
|
||||||
|
// you can use whatever you want to retrieve the html content (Guzzle, Buzz, cURL ...) |
||||||
|
$html = file_get_contents($url); |
||||||
|
|
||||||
|
$readability = new Readability($html, $url); |
||||||
|
$result = $readability->init(); |
||||||
|
|
||||||
|
if ($result) { |
||||||
|
// display the title of the page |
||||||
|
echo $readability->getTitle()->textContent; |
||||||
|
// display the *readability* content |
||||||
|
echo $readability->getContent()->textContent; |
||||||
|
} else { |
||||||
|
echo 'Looks like we couldn\'t find the content. :('; |
||||||
|
} |
||||||
|
``` |
||||||
@ -0,0 +1,33 @@ |
|||||||
|
{ |
||||||
|
"name": "j0k3r/php-readability", |
||||||
|
"type": "library", |
||||||
|
"description": "Automatic article extraction from HTML", |
||||||
|
"keywords": ["article extraction","content extraction","extraction","article","content","html"], |
||||||
|
"license": "Apache-2.0", |
||||||
|
"authors": [{ |
||||||
|
"name": "Jeremy Benoist", |
||||||
|
"email": "jeremy.benoist@gmail.com", |
||||||
|
"homepage": "http://www.j0k3r.net", |
||||||
|
"role": "Developer" |
||||||
|
},{ |
||||||
|
"name": "DitherSky", |
||||||
|
"homepage": "https://github.com/Dither", |
||||||
|
"role": "Developer (https://github.com/Dither/full-text-rss)" |
||||||
|
},{ |
||||||
|
"name": "Keyvan Minoukadeh", |
||||||
|
"email": "keyvan@keyvan.net", |
||||||
|
"homepage": "http://keyvan.net", |
||||||
|
"role": "Developer (ported original JS code to PHP)" |
||||||
|
},{ |
||||||
|
"name": "Arc90", |
||||||
|
"homepage": "http://arc90.com", |
||||||
|
"role": "Developer (original JS version)" |
||||||
|
}], |
||||||
|
"require": { |
||||||
|
"php": ">=5.3", |
||||||
|
"ext-tidy": ">=1.2" |
||||||
|
}, |
||||||
|
"autoload": { |
||||||
|
"psr-4": { "Readability\\": "src/" } |
||||||
|
} |
||||||
|
} |
||||||
@ -0,0 +1,29 @@ |
|||||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||||
|
<phpunit backupGlobals="false" |
||||||
|
backupStaticAttributes="false" |
||||||
|
colors="true" |
||||||
|
convertErrorsToExceptions="true" |
||||||
|
convertNoticesToExceptions="true" |
||||||
|
convertWarningsToExceptions="true" |
||||||
|
processIsolation="false" |
||||||
|
stopOnFailure="false" |
||||||
|
syntaxCheck="false" |
||||||
|
bootstrap="vendor/autoload.php" |
||||||
|
> |
||||||
|
|
||||||
|
<testsuites> |
||||||
|
<testsuite name="Readability Test Suite"> |
||||||
|
<directory>./tests/</directory> |
||||||
|
</testsuite> |
||||||
|
</testsuites> |
||||||
|
|
||||||
|
<filter> |
||||||
|
<whitelist> |
||||||
|
<directory>./src/</directory> |
||||||
|
<exclude> |
||||||
|
<directory>./tests</directory> |
||||||
|
</exclude> |
||||||
|
</whitelist> |
||||||
|
</filter> |
||||||
|
|
||||||
|
</phpunit> |
||||||
@ -0,0 +1,115 @@ |
|||||||
|
<?php |
||||||
|
|
||||||
|
namespace Readability; |
||||||
|
|
||||||
|
/**
 * JavaScript-like HTML DOM Element.
 *
 * This class extends PHP's DOMElement to allow users to get and set the
 * innerHTML property of HTML elements in the same way it's done in JavaScript.
 *
 * Example usage (the class must be registered with its fully-qualified name):
 *
 *     require_once 'vendor/autoload.php';
 *     header('Content-Type: text/plain');
 *     $doc = new \DOMDocument();
 *     $doc->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
 *     $doc->loadHTML('<div><p>Para 1</p><p>Para 2</p></div>');
 *     $elem = $doc->getElementsByTagName('div')->item(0);
 *
 *     // print innerHTML
 *     echo $elem->innerHTML; // prints '<p>Para 1</p><p>Para 2</p>'
 *     echo "\n\n";
 *
 *     // set innerHTML
 *     $elem->innerHTML = '<a href="http://fivefilters.org">FiveFilters.org</a>';
 *     echo $elem->innerHTML; // prints '<a href="http://fivefilters.org">FiveFilters.org</a>'
 *     echo "\n\n";
 *
 *     // print document (with our changes)
 *     echo $doc->saveXML();
 *
 * @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net
 * @see http://fivefilters.org (the project this was written for)
 */
class JSLikeHTMLElement extends \DOMElement
{
    /**
     * Used for setting innerHTML like it's done in JavaScript:
     * @code
     * $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>';
     * @endcode
     *
     * The element is always emptied first. The new markup is parsed as XML;
     * if that fails it is re-parsed with the lenient HTML parser. If both
     * attempts fail the element is simply left empty.
     *
     * Any property name other than "innerHTML" raises an E_USER_NOTICE.
     *
     * @param string $name  Property being written (only "innerHTML" is supported)
     * @param string $value New inner markup; a value loosely equal to '' only empties the element
     */
    public function __set($name, $value)
    {
        if ($name != 'innerHTML') {
            $trace = debug_backtrace();
            trigger_error('Undefined property via __set(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);

            return;
        }

        // first, empty the element
        while ($this->firstChild !== null) {
            $this->removeChild($this->firstChild);
        }

        // $value holds our new inner HTML; nothing more to do when it is empty
        if ($value == '') {
            return;
        }

        $fragment = $this->ownerDocument->createDocumentFragment();
        // appendXML() expects well-formed markup (XHTML)
        $result = @$fragment->appendXML($value); // @ to suppress PHP warnings

        if ($result) {
            if ($fragment->hasChildNodes()) {
                $this->appendChild($fragment);
            }

            return;
        }

        // $value is probably ill-formed: fall back to the HTML parser
        $htmlDoc = new \DOMDocument();
        $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8');

        // Using <htmlfragment> will generate a warning, but so will bad HTML
        // (and by this point, bad HTML is what we've got).
        // We use it (and suppress the warning) because an HTML fragment will
        // be wrapped around <html><body> tags which we don't really want to keep.
        // Note: despite the warning, if loadHTML succeeds it will return true.
        $result = @$htmlDoc->loadHTML('<htmlfragment>'.$value.'</htmlfragment>');

        if (!$result) {
            // oh well, we tried, we really did. :(
            // this element is now empty
            return;
        }

        $import = $htmlDoc->getElementsByTagName('htmlfragment')->item(0);

        // item(0) yields null when the parser did not keep the wrapper element;
        // leave the element empty rather than dereferencing null.
        if (null === $import) {
            return;
        }

        foreach ($import->childNodes as $child) {
            // nodes belong to $htmlDoc, so they must be imported before being attached
            $importedNode = $this->ownerDocument->importNode($child, true);
            $this->appendChild($importedNode);
        }
    }

    /**
     * Used for getting innerHTML like it's done in JavaScript:
     * @code
     * $string = $div->innerHTML;
     * @endcode
     *
     * Children are serialised with saveXML(), so empty elements come back in
     * self-closed form.
     *
     * Any property name other than "innerHTML" raises an E_USER_NOTICE.
     *
     * @param string $name Property being read (only "innerHTML" is supported)
     *
     * @return string|null Serialised child nodes, or null for an unknown property
     */
    public function __get($name)
    {
        if ($name == 'innerHTML') {
            $inner = '';
            foreach ($this->childNodes as $child) {
                $inner .= $this->ownerDocument->saveXML($child);
            }

            return $inner;
        }

        $trace = debug_backtrace();
        trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);

        return null;
    }

    /**
     * String form of the element: its tag name wrapped in square brackets.
     *
     * @return string
     */
    public function __toString()
    {
        return '['.$this->tagName.']';
    }
}
||||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,228 @@ |
|||||||
|
<?php |
||||||
|
|
||||||
|
namespace Tests\Readability; |
||||||
|
|
||||||
|
use Readability\Readability; |
||||||
|
use Readability\JSLikeHTMLElement; |
||||||
|
|
||||||
|
/**
 * Thin subclass of Readability used only by the test suite.
 *
 * It adds read accessors for two inherited properties so the tests can
 * assert on them.
 */
class ReadabilityTested extends Readability
{
    /**
     * @return mixed Current value of the inherited $debugText property
     */
    public function getDebugText()
    {
        return $this->debugText;
    }

    /**
     * @return mixed Current value of the inherited $domainRegExp property
     */
    public function getDomainRegexp()
    {
        return $this->domainRegExp;
    }
}
||||||
|
|
||||||
|
/**
 * Behavioural tests for Readability, driven through the ReadabilityTested subclass.
 *
 * Each test feeds a small HTML snippet to the extractor and checks the outcome
 * of init() together with the markup exposed by getContent()->innerHTML.
 */
class ReadabilityTest extends \PHPUnit_Framework_TestCase
{
    /**
     * No shared fixture is needed.
     */
    protected function setUp()
    {
    }

    /**
     * Nothing to clean up.
     */
    protected function tearDown()
    {
    }

    /**
     * Constructing with an empty document and no URL.
     */
    public function testConstructDefault()
    {
        $readability = new ReadabilityTested('');

        $this->assertNull($readability->url);
        $this->assertContains('Parsing URL', $readability->getDebugText());
        $this->assertContains('Tidying document', $readability->getDebugText());
        $this->assertNull($readability->getDomainRegexp());
        $this->assertInstanceOf('DomDocument', $readability->dom);
    }

    /**
     * Constructing with a minimal document and a URL.
     */
    public function testConstructSimple()
    {
        $url = 'http://0.0.0.0';
        $readability = new ReadabilityTested('<html/>', $url);

        $this->assertEquals('http://0.0.0.0', $readability->url);
        $this->assertContains('Parsing URL', $readability->getDebugText());
        $this->assertContains('Tidying document', $readability->getDebugText());
        $this->assertEquals('/0\.0\.0\.0/', $readability->getDomainRegexp());
        $this->assertInstanceOf('DomDocument', $readability->dom);
    }

    /**
     * init() on a document without content fails and exposes the "sorry" message.
     */
    public function testInitNoContent()
    {
        $readability = new ReadabilityTested('<html/>', 'http://0.0.0.0');
        $success = $readability->init();

        $this->assertFalse($success);
        $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
        $this->assertContains('Sorry, Readability was unable to parse this page for content.', $readability->getContent()->innerHTML);
    }

    /**
     * A bare run of paragraphs is accepted as content.
     */
    public function testInitP()
    {
        $html = str_repeat('<p>This is the awesome content :)</p>', 7);

        $readability = new ReadabilityTested($html, 'http://0.0.0.0');
        $success = $readability->init();

        $this->assertTrue($success);
        $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
        $this->assertContains('<div readability=', $readability->getContent()->innerHTML);
        $this->assertContains('This is the awesome content :)', $readability->getContent()->innerHTML);
    }

    /**
     * Paragraphs wrapped in a div are accepted as content.
     */
    public function testInitDivP()
    {
        $paragraphs = str_repeat('<p>This is the awesome content :)</p>', 7);
        $html = '<div>'.$paragraphs.'</div>';

        $readability = new ReadabilityTested($html, 'http://0.0.0.0');
        $success = $readability->init();

        $this->assertTrue($success);
        $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
        $this->assertContains('<div readability=', $readability->getContent()->innerHTML);
        $this->assertContains('This is the awesome content :)', $readability->getContent()->innerHTML);
    }

    /**
     * Plain text directly inside a div is accepted as content.
     */
    public function testInitDiv()
    {
        $text = str_repeat('This is the awesome content :)', 7);
        $html = '<div>'.$text.'</div>';

        $readability = new ReadabilityTested($html, 'http://0.0.0.0');
        $readability->debug = true;
        $success = $readability->init();

        $this->assertTrue($success);
        $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
        $this->assertContains('<div readability=', $readability->getContent()->innerHTML);
        $this->assertContains('This is the awesome content :)', $readability->getContent()->innerHTML);
    }

    /**
     * With convertLinksToFootnotes enabled, footnote markers appear in the output.
     */
    public function testWithFootnotes()
    {
        $paragraphs = str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7);
        $html = '<div>'.$paragraphs.'</div>';

        $readability = new ReadabilityTested($html, 'http://0.0.0.0');
        $readability->debug = true;
        $readability->convertLinksToFootnotes = true;
        $success = $readability->init();

        $this->assertTrue($success);
        $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
        $this->assertContains('<div readability=', $readability->getContent()->innerHTML);
        $this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
        $this->assertContains('readabilityFootnoteLink', $readability->getContent()->innerHTML);
        $this->assertContains('readabilityLink-3', $readability->getContent()->innerHTML);
    }

    /**
     * With lightClean disabled, the trailing nofollow link and the h2 are expected to be gone.
     */
    public function testStandardClean()
    {
        $paragraphs = str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7);
        $html = '<div><h2>Title</h2>'.$paragraphs.'<a href="#nofollow" rel="nofollow">will be removed</a></div>';

        $readability = new ReadabilityTested($html, 'http://0.0.0.0');
        $readability->debug = true;
        $readability->lightClean = false;
        $success = $readability->init();

        $this->assertTrue($success);
        $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
        $this->assertContains('<div readability=', $readability->getContent()->innerHTML);
        $this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
        $this->assertNotContains('will be removed', $readability->getContent()->innerHTML);
        $this->assertNotContains('<h2>', $readability->getContent()->innerHTML);
    }

    /**
     * Iframe markup (recognised here by its "nofollow" attribute value) survives extraction.
     */
    public function testWithIframe()
    {
        $paragraphs = str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7);
        $html = '<div><h2>Title</h2>'.$paragraphs.'<p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>';

        $readability = new ReadabilityTested($html, 'http://0.0.0.0');
        $readability->debug = true;
        $success = $readability->init();

        $this->assertTrue($success);
        $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
        $this->assertContains('<div readability=', $readability->getContent()->innerHTML);
        $this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
        $this->assertContains('nofollow', $readability->getContent()->innerHTML);
    }

    /**
     * Content inside an article element is reported with alt="article".
     */
    public function testWithArticle()
    {
        $text = str_repeat('This is an awesome text with some links, here there are: the awesome', 20);
        $html = '<article><p>'.$text.'</p><p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe></p></article>';

        $readability = new ReadabilityTested($html, 'http://0.0.0.0');
        $readability->debug = true;
        $success = $readability->init();

        $this->assertTrue($success);
        $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
        $this->assertContains('alt="article"', $readability->getContent()->innerHTML);
        $this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
        $this->assertContains('nofollow', $readability->getContent()->innerHTML);
    }

    /**
     * An aside nested in a footer is expected to be dropped, leaving an empty footer.
     */
    public function testWithAside()
    {
        $paragraphs = str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7);
        $asideParagraphs = str_repeat('<p>This is an awesome text with some links, here there are</p>', 8);
        $html = '<article>'.$paragraphs.'<footer><aside>'.$asideParagraphs.'</aside></footer></article>';

        $readability = new ReadabilityTested($html, 'http://0.0.0.0');
        $readability->debug = true;
        $success = $readability->init();

        $this->assertTrue($success);
        $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
        $this->assertContains('alt="article"', $readability->getContent()->innerHTML);
        $this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
        $this->assertNotContains('<aside>', $readability->getContent()->innerHTML);
        $this->assertContains('<footer/>', $readability->getContent()->innerHTML);
    }

    /**
     * A display:none div holding "clock"-classed paragraphs is expected to be dropped.
     */
    public function testWithClasses()
    {
        $paragraphs = str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7);
        $hiddenParagraphs = str_repeat('<p class="clock">This text should be removed</p>', 10);
        $html = '<article>'.$paragraphs.'<div style="display:none">'.$hiddenParagraphs.'</div></article>';

        $readability = new ReadabilityTested($html, 'http://0.0.0.0');
        $readability->debug = true;
        $success = $readability->init();

        $this->assertTrue($success);
        $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
        $this->assertContains('alt="article"', $readability->getContent()->innerHTML);
        $this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
        $this->assertNotContains('This text should be removed', $readability->getContent()->innerHTML);
    }

    /**
     * Content spread over table cells is reported with alt="tr".
     */
    public function testWithTd()
    {
        $cells = str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7);
        $html = '<table><tr>'.$cells.'</tr></table>';

        $readability = new ReadabilityTested($html, 'http://0.0.0.0');
        $readability->debug = true;
        $success = $readability->init();

        $this->assertTrue($success);
        $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
        $this->assertContains('alt="tr"', $readability->getContent()->innerHTML);
        $this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
    }

    /**
     * A nested div sharing the article's class keeps its text in the output.
     */
    public function testWithSameClasses()
    {
        $paragraphs = str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7);
        $html = '<article class="awesomecontent">'.$paragraphs.'<div class="awesomecontent">This text is also an awesome text and you should know that !</div></article>';

        $readability = new ReadabilityTested($html, 'http://0.0.0.0');
        $readability->debug = true;
        $success = $readability->init();

        $this->assertTrue($success);
        $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
        $this->assertContains('alt="article"', $readability->getContent()->innerHTML);
        $this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
        $this->assertContains('This text is also an awesome text and you should know that', $readability->getContent()->innerHTML);
    }

    /**
     * Text inside a script element is expected to be absent from the output.
     */
    public function testWithScript()
    {
        $paragraphs = str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7);
        $html = '<article class="awesomecontent">'.$paragraphs.'<p><script>This text is also an awesome text and you should know that !</script></p></article>';

        $readability = new ReadabilityTested($html, 'http://0.0.0.0');
        $readability->debug = true;
        $success = $readability->init();

        $this->assertTrue($success);
        $this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
        $this->assertContains('alt="article"', $readability->getContent()->innerHTML);
        $this->assertContains('This is an awesome text with some links, here there are', $readability->getContent()->innerHTML);
        $this->assertNotContains('This text is also an awesome text and you should know that', $readability->getContent()->innerHTML);
    }

    // Disabled test for constructing with the 'html5lib' parser argument.
    // public function testConstructParser()
    // {
    //     $readability = new ReadabilityTested('<html/>', 'http://0.0.0.0', 'html5lib');

    //     $this->assertEquals('http://0.0.0.0', $readability->url);
    //     $this->assertContains('Parsing URL', $readability->getDebugText());
    //     $this->assertContains('Tidying document', $readability->getDebugText());
    //     $this->assertEquals('/0\.0\.0\.0/', $readability->getDomainRegexp());
    //     $this->assertInstanceOf('DomDocument', $readability->dom);
    // }
}
||||||
Loading…
Reference in new issue