├── .gitignore
├── README.md
├── blacklist.txt
├── cache
└── .gitkeep
├── common-sites.json
├── composer.json
├── composer.lock
├── composer.phar
├── example.env
├── index.php
├── lib
├── FileSystemCache
│ ├── .gitignore
│ ├── .travis.yml
│ ├── README.md
│ ├── composer.json
│ ├── lib
│ │ └── FileSystemCache.php
│ ├── phpunit.xml.dist
│ └── tests
│ │ └── FileSystemCacheTest.php
├── ansi-color.php
└── fivefilters-php-readability
│ ├── JSLikeHTMLElement.php
│ ├── README.md
│ ├── Readability.php
│ ├── composer.json
│ └── examples
│ ├── JSLikeHTMLElement.php
│ └── Readability.php
├── src
├── Fetcher.php
├── Generator.php
├── Parser.php
├── Uploader.php
└── templates
│ ├── fullhn.manifest.mustache
│ ├── index.mustache
│ ├── latest.mustache
│ └── partials
│ └── head.mustache
└── www
├── apple-touch-icon.png
├── css
├── img
│ └── loader.gif
└── style.css
├── favicon.ico
├── js
├── app.js
├── jquery-2.0.3.min.js
├── moment.min.js
└── waypoints.min.js
└── robots.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .env
3 | index.html
4 | latest.html
5 | cache.manifest
6 | cache/*
7 | vendor/*
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # FullHackerNews
2 |
3 | Read all Hacker News articles in one single static page, optimized for reading.
4 | I use it to load all articles for offline reading on my iPhone.
5 |
6 | It can also work with any other feed.
7 |
8 | # Requirements
9 | * PHP >= 7.3 (the locked dependencies, e.g. Guzzle 7 and myclabs/php-enum, no longer support older versions)
10 | * Amazon S3 account
11 |
12 | # Installing
13 |
14 | * Make the `cache` folder writable
15 | * Create an S3 bucket, configured as a Web server
16 | * Upload the contents of `www` to the S3 bucket
17 | * Copy `example.env` to `.env` and update the values, or set the corresponding environment variables
18 | * Install dependencies: `$ php composer.phar install`
19 | * Run `php index.php` periodically
20 | * Enjoy
21 |
22 | # License
23 |
24 | This project is released under the BSD license.
25 |
--------------------------------------------------------------------------------
/blacklist.txt:
--------------------------------------------------------------------------------
1 | www.fullhn.com
2 | dolphin-emu.org
--------------------------------------------------------------------------------
/cache/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mauricesvay/FullHackerNews/73117bd1f81ddb163048e5828ee6da1edabd157e/cache/.gitkeep
--------------------------------------------------------------------------------
/common-sites.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "name": "github",
4 | "pattern": "/http(s)?:\\/\\/github.com\\/([^\\/]+)\\/([^\\/]+)/",
5 | "path": "#readme > article.entry-content"
6 | },
7 | {
8 | "name": "tweet",
9 | "pattern": "/http(s)?:\\/\\/twitter.com\\/(\\S+)\\/status\\/\\d+/",
10 | "path": ".js-tweet-text-container > .TweetTextSize--jumbo"
11 | },
12 | {
13 | "name": "tweet (mobile)",
14 | "pattern": "/http(s)?:\\/\\/mobile.twitter.com\\/(\\S+)\\/status\\/\\d+/",
15 | "path": "[data-testid='tweetDetail']"
16 | },
17 | {
18 | "name": "arxiv.org",
19 | "pattern": "/http(s)?:\\/\\/arxiv.org\\/abs\\/(\\S+)/",
20 | "path": ".abstract"
21 | }
22 | ]
23 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "require": {
3 | "mustache/mustache": "^2.12",
4 | "ezyang/htmlpurifier": "^4.10",
5 | "simplepie/simplepie": "^1.5",
6 | "guzzlehttp/guzzle": "^7.0",
7 | "euskadi31/opengraph": "^1.0",
8 | "aws/aws-sdk-php": "^3.99",
9 | "vlucas/phpdotenv": "^3.3",
10 | "paquettg/php-html-parser": "^3.1"
11 | }
12 | }
--------------------------------------------------------------------------------
/composer.lock:
--------------------------------------------------------------------------------
1 | {
2 | "_readme": [
3 | "This file locks the dependencies of your project to a known state",
4 | "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
5 | "This file is @generated automatically"
6 | ],
7 | "content-hash": "137a1b4f54362a7dee1624a111fdbb25",
8 | "packages": [
9 | {
10 | "name": "aws/aws-crt-php",
11 | "version": "v1.0.2",
12 | "source": {
13 | "type": "git",
14 | "url": "https://github.com/awslabs/aws-crt-php.git",
15 | "reference": "3942776a8c99209908ee0b287746263725685732"
16 | },
17 | "dist": {
18 | "type": "zip",
19 | "url": "https://api.github.com/repos/awslabs/aws-crt-php/zipball/3942776a8c99209908ee0b287746263725685732",
20 | "reference": "3942776a8c99209908ee0b287746263725685732",
21 | "shasum": ""
22 | },
23 | "require": {
24 | "php": ">=5.5"
25 | },
26 | "require-dev": {
27 | "phpunit/phpunit": "^4.8.35|^5.4.3"
28 | },
29 | "type": "library",
30 | "autoload": {
31 | "classmap": [
32 | "src/"
33 | ]
34 | },
35 | "notification-url": "https://packagist.org/downloads/",
36 | "license": [
37 | "Apache-2.0"
38 | ],
39 | "authors": [
40 | {
41 | "name": "AWS SDK Common Runtime Team",
42 | "email": "aws-sdk-common-runtime@amazon.com"
43 | }
44 | ],
45 | "description": "AWS Common Runtime for PHP",
46 | "homepage": "http://aws.amazon.com/sdkforphp",
47 | "keywords": [
48 | "amazon",
49 | "aws",
50 | "crt",
51 | "sdk"
52 | ],
53 | "support": {
54 | "issues": "https://github.com/awslabs/aws-crt-php/issues",
55 | "source": "https://github.com/awslabs/aws-crt-php/tree/v1.0.2"
56 | },
57 | "time": "2021-09-03T22:57:30+00:00"
58 | },
59 | {
60 | "name": "aws/aws-sdk-php",
61 | "version": "3.194.5",
62 | "source": {
63 | "type": "git",
64 | "url": "https://github.com/aws/aws-sdk-php.git",
65 | "reference": "33f755378debdbc7e010157811fc47aebf090c53"
66 | },
67 | "dist": {
68 | "type": "zip",
69 | "url": "https://api.github.com/repos/aws/aws-sdk-php/zipball/33f755378debdbc7e010157811fc47aebf090c53",
70 | "reference": "33f755378debdbc7e010157811fc47aebf090c53",
71 | "shasum": ""
72 | },
73 | "require": {
74 | "aws/aws-crt-php": "^1.0.2",
75 | "ext-json": "*",
76 | "ext-pcre": "*",
77 | "ext-simplexml": "*",
78 | "guzzlehttp/guzzle": "^5.3.3|^6.2.1|^7.0",
79 | "guzzlehttp/promises": "^1.4.0",
80 | "guzzlehttp/psr7": "^1.7.0",
81 | "mtdowling/jmespath.php": "^2.6",
82 | "php": ">=5.5"
83 | },
84 | "require-dev": {
85 | "andrewsville/php-token-reflection": "^1.4",
86 | "aws/aws-php-sns-message-validator": "~1.0",
87 | "behat/behat": "~3.0",
88 | "doctrine/cache": "~1.4",
89 | "ext-dom": "*",
90 | "ext-openssl": "*",
91 | "ext-pcntl": "*",
92 | "ext-sockets": "*",
93 | "nette/neon": "^2.3",
94 | "paragonie/random_compat": ">= 2",
95 | "phpunit/phpunit": "^4.8.35|^5.4.3",
96 | "psr/cache": "^1.0",
97 | "psr/simple-cache": "^1.0",
98 | "sebastian/comparator": "^1.2.3"
99 | },
100 | "suggest": {
101 | "aws/aws-php-sns-message-validator": "To validate incoming SNS notifications",
102 | "doctrine/cache": "To use the DoctrineCacheAdapter",
103 | "ext-curl": "To send requests using cURL",
104 | "ext-openssl": "Allows working with CloudFront private distributions and verifying received SNS messages",
105 | "ext-sockets": "To use client-side monitoring"
106 | },
107 | "type": "library",
108 | "extra": {
109 | "branch-alias": {
110 | "dev-master": "3.0-dev"
111 | }
112 | },
113 | "autoload": {
114 | "psr-4": {
115 | "Aws\\": "src/"
116 | },
117 | "files": [
118 | "src/functions.php"
119 | ]
120 | },
121 | "notification-url": "https://packagist.org/downloads/",
122 | "license": [
123 | "Apache-2.0"
124 | ],
125 | "authors": [
126 | {
127 | "name": "Amazon Web Services",
128 | "homepage": "http://aws.amazon.com"
129 | }
130 | ],
131 | "description": "AWS SDK for PHP - Use Amazon Web Services in your PHP project",
132 | "homepage": "http://aws.amazon.com/sdkforphp",
133 | "keywords": [
134 | "amazon",
135 | "aws",
136 | "cloud",
137 | "dynamodb",
138 | "ec2",
139 | "glacier",
140 | "s3",
141 | "sdk"
142 | ],
143 | "support": {
144 | "forum": "https://forums.aws.amazon.com/forum.jspa?forumID=80",
145 | "issues": "https://github.com/aws/aws-sdk-php/issues",
146 | "source": "https://github.com/aws/aws-sdk-php/tree/3.194.5"
147 | },
148 | "time": "2021-09-24T18:25:24+00:00"
149 | },
150 | {
151 | "name": "euskadi31/opengraph",
152 | "version": "v1.0.0",
153 | "source": {
154 | "type": "git",
155 | "url": "https://github.com/euskadi31/Opengraph.git",
156 | "reference": "35645b1bbe9309f8b46734e3eb8f01e6613aeb52"
157 | },
158 | "dist": {
159 | "type": "zip",
160 | "url": "https://api.github.com/repos/euskadi31/Opengraph/zipball/35645b1bbe9309f8b46734e3eb8f01e6613aeb52",
161 | "reference": "35645b1bbe9309f8b46734e3eb8f01e6613aeb52",
162 | "shasum": ""
163 | },
164 | "require": {
165 | "php": ">=5.3.0"
166 | },
167 | "require-dev": {
168 | "atoum/atoum": "dev-master"
169 | },
170 | "type": "library",
171 | "autoload": {
172 | "psr-0": {
173 | "Opengraph": "src/"
174 | }
175 | },
176 | "notification-url": "https://packagist.org/downloads/",
177 | "license": [
178 | "MIT"
179 | ],
180 | "authors": [
181 | {
182 | "name": "Axel Etcheverry",
183 | "email": "axel@etcheverry.biz",
184 | "homepage": "http://www.axel-etcheverry.com"
185 | }
186 | ],
187 | "description": "A PHP 5.3+ framework for OpenGraph Protocol",
188 | "keywords": [
189 | "OpenGraph Protocol",
190 | "og",
191 | "opengraph",
192 | "sdk"
193 | ],
194 | "support": {
195 | "issues": "https://github.com/euskadi31/Opengraph/issues",
196 | "source": "https://github.com/euskadi31/Opengraph/tree/master"
197 | },
198 | "time": "2013-11-25T14:33:37+00:00"
199 | },
200 | {
201 | "name": "ezyang/htmlpurifier",
202 | "version": "v4.13.0",
203 | "source": {
204 | "type": "git",
205 | "url": "https://github.com/ezyang/htmlpurifier.git",
206 | "reference": "08e27c97e4c6ed02f37c5b2b20488046c8d90d75"
207 | },
208 | "dist": {
209 | "type": "zip",
210 | "url": "https://api.github.com/repos/ezyang/htmlpurifier/zipball/08e27c97e4c6ed02f37c5b2b20488046c8d90d75",
211 | "reference": "08e27c97e4c6ed02f37c5b2b20488046c8d90d75",
212 | "shasum": ""
213 | },
214 | "require": {
215 | "php": ">=5.2"
216 | },
217 | "require-dev": {
218 | "simpletest/simpletest": "dev-master#72de02a7b80c6bb8864ef9bf66d41d2f58f826bd"
219 | },
220 | "type": "library",
221 | "autoload": {
222 | "psr-0": {
223 | "HTMLPurifier": "library/"
224 | },
225 | "files": [
226 | "library/HTMLPurifier.composer.php"
227 | ],
228 | "exclude-from-classmap": [
229 | "/library/HTMLPurifier/Language/"
230 | ]
231 | },
232 | "notification-url": "https://packagist.org/downloads/",
233 | "license": [
234 | "LGPL-2.1-or-later"
235 | ],
236 | "authors": [
237 | {
238 | "name": "Edward Z. Yang",
239 | "email": "admin@htmlpurifier.org",
240 | "homepage": "http://ezyang.com"
241 | }
242 | ],
243 | "description": "Standards compliant HTML filter written in PHP",
244 | "homepage": "http://htmlpurifier.org/",
245 | "keywords": [
246 | "html"
247 | ],
248 | "support": {
249 | "issues": "https://github.com/ezyang/htmlpurifier/issues",
250 | "source": "https://github.com/ezyang/htmlpurifier/tree/master"
251 | },
252 | "time": "2020-06-29T00:56:53+00:00"
253 | },
254 | {
255 | "name": "guzzlehttp/guzzle",
256 | "version": "7.3.0",
257 | "source": {
258 | "type": "git",
259 | "url": "https://github.com/guzzle/guzzle.git",
260 | "reference": "7008573787b430c1c1f650e3722d9bba59967628"
261 | },
262 | "dist": {
263 | "type": "zip",
264 | "url": "https://api.github.com/repos/guzzle/guzzle/zipball/7008573787b430c1c1f650e3722d9bba59967628",
265 | "reference": "7008573787b430c1c1f650e3722d9bba59967628",
266 | "shasum": ""
267 | },
268 | "require": {
269 | "ext-json": "*",
270 | "guzzlehttp/promises": "^1.4",
271 | "guzzlehttp/psr7": "^1.7 || ^2.0",
272 | "php": "^7.2.5 || ^8.0",
273 | "psr/http-client": "^1.0"
274 | },
275 | "provide": {
276 | "psr/http-client-implementation": "1.0"
277 | },
278 | "require-dev": {
279 | "bamarni/composer-bin-plugin": "^1.4.1",
280 | "ext-curl": "*",
281 | "php-http/client-integration-tests": "^3.0",
282 | "phpunit/phpunit": "^8.5.5 || ^9.3.5",
283 | "psr/log": "^1.1"
284 | },
285 | "suggest": {
286 | "ext-curl": "Required for CURL handler support",
287 | "ext-intl": "Required for Internationalized Domain Name (IDN) support",
288 | "psr/log": "Required for using the Log middleware"
289 | },
290 | "type": "library",
291 | "extra": {
292 | "branch-alias": {
293 | "dev-master": "7.3-dev"
294 | }
295 | },
296 | "autoload": {
297 | "psr-4": {
298 | "GuzzleHttp\\": "src/"
299 | },
300 | "files": [
301 | "src/functions_include.php"
302 | ]
303 | },
304 | "notification-url": "https://packagist.org/downloads/",
305 | "license": [
306 | "MIT"
307 | ],
308 | "authors": [
309 | {
310 | "name": "Michael Dowling",
311 | "email": "mtdowling@gmail.com",
312 | "homepage": "https://github.com/mtdowling"
313 | },
314 | {
315 | "name": "Márk Sági-Kazár",
316 | "email": "mark.sagikazar@gmail.com",
317 | "homepage": "https://sagikazarmark.hu"
318 | }
319 | ],
320 | "description": "Guzzle is a PHP HTTP client library",
321 | "homepage": "http://guzzlephp.org/",
322 | "keywords": [
323 | "client",
324 | "curl",
325 | "framework",
326 | "http",
327 | "http client",
328 | "psr-18",
329 | "psr-7",
330 | "rest",
331 | "web service"
332 | ],
333 | "support": {
334 | "issues": "https://github.com/guzzle/guzzle/issues",
335 | "source": "https://github.com/guzzle/guzzle/tree/7.3.0"
336 | },
337 | "funding": [
338 | {
339 | "url": "https://github.com/GrahamCampbell",
340 | "type": "github"
341 | },
342 | {
343 | "url": "https://github.com/Nyholm",
344 | "type": "github"
345 | },
346 | {
347 | "url": "https://github.com/alexeyshockov",
348 | "type": "github"
349 | },
350 | {
351 | "url": "https://github.com/gmponos",
352 | "type": "github"
353 | }
354 | ],
355 | "time": "2021-03-23T11:33:13+00:00"
356 | },
357 | {
358 | "name": "guzzlehttp/promises",
359 | "version": "1.4.1",
360 | "source": {
361 | "type": "git",
362 | "url": "https://github.com/guzzle/promises.git",
363 | "reference": "8e7d04f1f6450fef59366c399cfad4b9383aa30d"
364 | },
365 | "dist": {
366 | "type": "zip",
367 | "url": "https://api.github.com/repos/guzzle/promises/zipball/8e7d04f1f6450fef59366c399cfad4b9383aa30d",
368 | "reference": "8e7d04f1f6450fef59366c399cfad4b9383aa30d",
369 | "shasum": ""
370 | },
371 | "require": {
372 | "php": ">=5.5"
373 | },
374 | "require-dev": {
375 | "symfony/phpunit-bridge": "^4.4 || ^5.1"
376 | },
377 | "type": "library",
378 | "extra": {
379 | "branch-alias": {
380 | "dev-master": "1.4-dev"
381 | }
382 | },
383 | "autoload": {
384 | "psr-4": {
385 | "GuzzleHttp\\Promise\\": "src/"
386 | },
387 | "files": [
388 | "src/functions_include.php"
389 | ]
390 | },
391 | "notification-url": "https://packagist.org/downloads/",
392 | "license": [
393 | "MIT"
394 | ],
395 | "authors": [
396 | {
397 | "name": "Michael Dowling",
398 | "email": "mtdowling@gmail.com",
399 | "homepage": "https://github.com/mtdowling"
400 | }
401 | ],
402 | "description": "Guzzle promises library",
403 | "keywords": [
404 | "promise"
405 | ],
406 | "support": {
407 | "issues": "https://github.com/guzzle/promises/issues",
408 | "source": "https://github.com/guzzle/promises/tree/1.4.1"
409 | },
410 | "time": "2021-03-07T09:25:29+00:00"
411 | },
412 | {
413 | "name": "guzzlehttp/psr7",
414 | "version": "1.8.2",
415 | "source": {
416 | "type": "git",
417 | "url": "https://github.com/guzzle/psr7.git",
418 | "reference": "dc960a912984efb74d0a90222870c72c87f10c91"
419 | },
420 | "dist": {
421 | "type": "zip",
422 | "url": "https://api.github.com/repos/guzzle/psr7/zipball/dc960a912984efb74d0a90222870c72c87f10c91",
423 | "reference": "dc960a912984efb74d0a90222870c72c87f10c91",
424 | "shasum": ""
425 | },
426 | "require": {
427 | "php": ">=5.4.0",
428 | "psr/http-message": "~1.0",
429 | "ralouphie/getallheaders": "^2.0.5 || ^3.0.0"
430 | },
431 | "provide": {
432 | "psr/http-message-implementation": "1.0"
433 | },
434 | "require-dev": {
435 | "ext-zlib": "*",
436 | "phpunit/phpunit": "~4.8.36 || ^5.7.27 || ^6.5.14 || ^7.5.20 || ^8.5.8 || ^9.3.10"
437 | },
438 | "suggest": {
439 | "laminas/laminas-httphandlerrunner": "Emit PSR-7 responses"
440 | },
441 | "type": "library",
442 | "extra": {
443 | "branch-alias": {
444 | "dev-master": "1.7-dev"
445 | }
446 | },
447 | "autoload": {
448 | "psr-4": {
449 | "GuzzleHttp\\Psr7\\": "src/"
450 | },
451 | "files": [
452 | "src/functions_include.php"
453 | ]
454 | },
455 | "notification-url": "https://packagist.org/downloads/",
456 | "license": [
457 | "MIT"
458 | ],
459 | "authors": [
460 | {
461 | "name": "Michael Dowling",
462 | "email": "mtdowling@gmail.com",
463 | "homepage": "https://github.com/mtdowling"
464 | },
465 | {
466 | "name": "Tobias Schultze",
467 | "homepage": "https://github.com/Tobion"
468 | }
469 | ],
470 | "description": "PSR-7 message implementation that also provides common utility methods",
471 | "keywords": [
472 | "http",
473 | "message",
474 | "psr-7",
475 | "request",
476 | "response",
477 | "stream",
478 | "uri",
479 | "url"
480 | ],
481 | "support": {
482 | "issues": "https://github.com/guzzle/psr7/issues",
483 | "source": "https://github.com/guzzle/psr7/tree/1.8.2"
484 | },
485 | "time": "2021-04-26T09:17:50+00:00"
486 | },
487 | {
488 | "name": "mtdowling/jmespath.php",
489 | "version": "2.6.1",
490 | "source": {
491 | "type": "git",
492 | "url": "https://github.com/jmespath/jmespath.php.git",
493 | "reference": "9b87907a81b87bc76d19a7fb2d61e61486ee9edb"
494 | },
495 | "dist": {
496 | "type": "zip",
497 | "url": "https://api.github.com/repos/jmespath/jmespath.php/zipball/9b87907a81b87bc76d19a7fb2d61e61486ee9edb",
498 | "reference": "9b87907a81b87bc76d19a7fb2d61e61486ee9edb",
499 | "shasum": ""
500 | },
501 | "require": {
502 | "php": "^5.4 || ^7.0 || ^8.0",
503 | "symfony/polyfill-mbstring": "^1.17"
504 | },
505 | "require-dev": {
506 | "composer/xdebug-handler": "^1.4 || ^2.0",
507 | "phpunit/phpunit": "^4.8.36 || ^7.5.15"
508 | },
509 | "bin": [
510 | "bin/jp.php"
511 | ],
512 | "type": "library",
513 | "extra": {
514 | "branch-alias": {
515 | "dev-master": "2.6-dev"
516 | }
517 | },
518 | "autoload": {
519 | "psr-4": {
520 | "JmesPath\\": "src/"
521 | },
522 | "files": [
523 | "src/JmesPath.php"
524 | ]
525 | },
526 | "notification-url": "https://packagist.org/downloads/",
527 | "license": [
528 | "MIT"
529 | ],
530 | "authors": [
531 | {
532 | "name": "Michael Dowling",
533 | "email": "mtdowling@gmail.com",
534 | "homepage": "https://github.com/mtdowling"
535 | }
536 | ],
537 | "description": "Declaratively specify how to extract elements from a JSON document",
538 | "keywords": [
539 | "json",
540 | "jsonpath"
541 | ],
542 | "support": {
543 | "issues": "https://github.com/jmespath/jmespath.php/issues",
544 | "source": "https://github.com/jmespath/jmespath.php/tree/2.6.1"
545 | },
546 | "time": "2021-06-14T00:11:39+00:00"
547 | },
548 | {
549 | "name": "mustache/mustache",
550 | "version": "v2.13.0",
551 | "source": {
552 | "type": "git",
553 | "url": "https://github.com/bobthecow/mustache.php.git",
554 | "reference": "e95c5a008c23d3151d59ea72484d4f72049ab7f4"
555 | },
556 | "dist": {
557 | "type": "zip",
558 | "url": "https://api.github.com/repos/bobthecow/mustache.php/zipball/e95c5a008c23d3151d59ea72484d4f72049ab7f4",
559 | "reference": "e95c5a008c23d3151d59ea72484d4f72049ab7f4",
560 | "shasum": ""
561 | },
562 | "require": {
563 | "php": ">=5.2.4"
564 | },
565 | "require-dev": {
566 | "friendsofphp/php-cs-fixer": "~1.11",
567 | "phpunit/phpunit": "~3.7|~4.0|~5.0"
568 | },
569 | "type": "library",
570 | "autoload": {
571 | "psr-0": {
572 | "Mustache": "src/"
573 | }
574 | },
575 | "notification-url": "https://packagist.org/downloads/",
576 | "license": [
577 | "MIT"
578 | ],
579 | "authors": [
580 | {
581 | "name": "Justin Hileman",
582 | "email": "justin@justinhileman.info",
583 | "homepage": "http://justinhileman.com"
584 | }
585 | ],
586 | "description": "A Mustache implementation in PHP.",
587 | "homepage": "https://github.com/bobthecow/mustache.php",
588 | "keywords": [
589 | "mustache",
590 | "templating"
591 | ],
592 | "support": {
593 | "issues": "https://github.com/bobthecow/mustache.php/issues",
594 | "source": "https://github.com/bobthecow/mustache.php/tree/master"
595 | },
596 | "time": "2019-11-23T21:40:31+00:00"
597 | },
598 | {
599 | "name": "myclabs/php-enum",
600 | "version": "1.8.3",
601 | "source": {
602 | "type": "git",
603 | "url": "https://github.com/myclabs/php-enum.git",
604 | "reference": "b942d263c641ddb5190929ff840c68f78713e937"
605 | },
606 | "dist": {
607 | "type": "zip",
608 | "url": "https://api.github.com/repos/myclabs/php-enum/zipball/b942d263c641ddb5190929ff840c68f78713e937",
609 | "reference": "b942d263c641ddb5190929ff840c68f78713e937",
610 | "shasum": ""
611 | },
612 | "require": {
613 | "ext-json": "*",
614 | "php": "^7.3 || ^8.0"
615 | },
616 | "require-dev": {
617 | "phpunit/phpunit": "^9.5",
618 | "squizlabs/php_codesniffer": "1.*",
619 | "vimeo/psalm": "^4.6.2"
620 | },
621 | "type": "library",
622 | "autoload": {
623 | "psr-4": {
624 | "MyCLabs\\Enum\\": "src/"
625 | }
626 | },
627 | "notification-url": "https://packagist.org/downloads/",
628 | "license": [
629 | "MIT"
630 | ],
631 | "authors": [
632 | {
633 | "name": "PHP Enum contributors",
634 | "homepage": "https://github.com/myclabs/php-enum/graphs/contributors"
635 | }
636 | ],
637 | "description": "PHP Enum implementation",
638 | "homepage": "http://github.com/myclabs/php-enum",
639 | "keywords": [
640 | "enum"
641 | ],
642 | "support": {
643 | "issues": "https://github.com/myclabs/php-enum/issues",
644 | "source": "https://github.com/myclabs/php-enum/tree/1.8.3"
645 | },
646 | "funding": [
647 | {
648 | "url": "https://github.com/mnapoli",
649 | "type": "github"
650 | },
651 | {
652 | "url": "https://tidelift.com/funding/github/packagist/myclabs/php-enum",
653 | "type": "tidelift"
654 | }
655 | ],
656 | "time": "2021-07-05T08:18:36+00:00"
657 | },
658 | {
659 | "name": "paquettg/php-html-parser",
660 | "version": "3.1.1",
661 | "source": {
662 | "type": "git",
663 | "url": "https://github.com/paquettg/php-html-parser.git",
664 | "reference": "4e01a438ad5961cc2d7427eb9798d213c8a12629"
665 | },
666 | "dist": {
667 | "type": "zip",
668 | "url": "https://api.github.com/repos/paquettg/php-html-parser/zipball/4e01a438ad5961cc2d7427eb9798d213c8a12629",
669 | "reference": "4e01a438ad5961cc2d7427eb9798d213c8a12629",
670 | "shasum": ""
671 | },
672 | "require": {
673 | "ext-curl": "*",
674 | "ext-mbstring": "*",
675 | "ext-zlib": "*",
676 | "guzzlehttp/guzzle": "^7.0",
677 | "guzzlehttp/psr7": "^1.6",
678 | "myclabs/php-enum": "^1.7",
679 | "paquettg/string-encode": "~1.0.0",
680 | "php": ">=7.2",
681 | "php-http/httplug": "^2.1"
682 | },
683 | "require-dev": {
684 | "friendsofphp/php-cs-fixer": "^2.16",
685 | "infection/infection": "^0.13.4",
686 | "mockery/mockery": "^1.2",
687 | "phan/phan": "^2.4",
688 | "phpunit/phpunit": "^7.5.1"
689 | },
690 | "type": "library",
691 | "autoload": {
692 | "psr-4": {
693 | "PHPHtmlParser\\": "src/PHPHtmlParser"
694 | }
695 | },
696 | "notification-url": "https://packagist.org/downloads/",
697 | "license": [
698 | "MIT"
699 | ],
700 | "authors": [
701 | {
702 | "name": "Gilles Paquette",
703 | "email": "paquettg@gmail.com",
704 | "homepage": "http://gillespaquette.ca"
705 | }
706 | ],
707 | "description": "An HTML DOM parser. It allows you to manipulate HTML. Find tags on an HTML page with selectors just like jQuery.",
708 | "homepage": "https://github.com/paquettg/php-html-parser",
709 | "keywords": [
710 | "dom",
711 | "html",
712 | "parser"
713 | ],
714 | "support": {
715 | "issues": "https://github.com/paquettg/php-html-parser/issues",
716 | "source": "https://github.com/paquettg/php-html-parser/tree/3.1.1"
717 | },
718 | "funding": [
719 | {
720 | "url": "https://tidelift.com/funding/github/packagist/paquettg/php-html-parser",
721 | "type": "tidelift"
722 | }
723 | ],
724 | "time": "2020-11-01T20:34:43+00:00"
725 | },
726 | {
727 | "name": "paquettg/string-encode",
728 | "version": "1.0.1",
729 | "source": {
730 | "type": "git",
731 | "url": "https://github.com/paquettg/string-encoder.git",
732 | "reference": "a8708e9fac9d5ddfc8fc2aac6004e2cd05d80fee"
733 | },
734 | "dist": {
735 | "type": "zip",
736 | "url": "https://api.github.com/repos/paquettg/string-encoder/zipball/a8708e9fac9d5ddfc8fc2aac6004e2cd05d80fee",
737 | "reference": "a8708e9fac9d5ddfc8fc2aac6004e2cd05d80fee",
738 | "shasum": ""
739 | },
740 | "require": {
741 | "php": ">=7.1"
742 | },
743 | "require-dev": {
744 | "phpunit/phpunit": "^7.5.1"
745 | },
746 | "type": "library",
747 | "autoload": {
748 | "psr-0": {
749 | "stringEncode": "src/"
750 | }
751 | },
752 | "notification-url": "https://packagist.org/downloads/",
753 | "license": [
754 | "MIT"
755 | ],
756 | "authors": [
757 | {
758 | "name": "Gilles Paquette",
759 | "email": "paquettg@gmail.com",
760 | "homepage": "http://gillespaquette.ca"
761 | }
762 | ],
763 | "description": "Facilitating the process of altering string encoding in PHP.",
764 | "homepage": "https://github.com/paquettg/string-encoder",
765 | "keywords": [
766 | "charset",
767 | "encoding",
768 | "string"
769 | ],
770 | "support": {
771 | "issues": "https://github.com/paquettg/string-encoder/issues",
772 | "source": "https://github.com/paquettg/string-encoder/tree/1.0.1"
773 | },
774 | "time": "2018-12-21T02:25:09+00:00"
775 | },
776 | {
777 | "name": "php-http/httplug",
778 | "version": "2.2.0",
779 | "source": {
780 | "type": "git",
781 | "url": "https://github.com/php-http/httplug.git",
782 | "reference": "191a0a1b41ed026b717421931f8d3bd2514ffbf9"
783 | },
784 | "dist": {
785 | "type": "zip",
786 | "url": "https://api.github.com/repos/php-http/httplug/zipball/191a0a1b41ed026b717421931f8d3bd2514ffbf9",
787 | "reference": "191a0a1b41ed026b717421931f8d3bd2514ffbf9",
788 | "shasum": ""
789 | },
790 | "require": {
791 | "php": "^7.1 || ^8.0",
792 | "php-http/promise": "^1.1",
793 | "psr/http-client": "^1.0",
794 | "psr/http-message": "^1.0"
795 | },
796 | "require-dev": {
797 | "friends-of-phpspec/phpspec-code-coverage": "^4.1",
798 | "phpspec/phpspec": "^5.1 || ^6.0"
799 | },
800 | "type": "library",
801 | "extra": {
802 | "branch-alias": {
803 | "dev-master": "2.x-dev"
804 | }
805 | },
806 | "autoload": {
807 | "psr-4": {
808 | "Http\\Client\\": "src/"
809 | }
810 | },
811 | "notification-url": "https://packagist.org/downloads/",
812 | "license": [
813 | "MIT"
814 | ],
815 | "authors": [
816 | {
817 | "name": "Eric GELOEN",
818 | "email": "geloen.eric@gmail.com"
819 | },
820 | {
821 | "name": "Márk Sági-Kazár",
822 | "email": "mark.sagikazar@gmail.com",
823 | "homepage": "https://sagikazarmark.hu"
824 | }
825 | ],
826 | "description": "HTTPlug, the HTTP client abstraction for PHP",
827 | "homepage": "http://httplug.io",
828 | "keywords": [
829 | "client",
830 | "http"
831 | ],
832 | "support": {
833 | "issues": "https://github.com/php-http/httplug/issues",
834 | "source": "https://github.com/php-http/httplug/tree/master"
835 | },
836 | "time": "2020-07-13T15:43:23+00:00"
837 | },
838 | {
839 | "name": "php-http/promise",
840 | "version": "1.1.0",
841 | "source": {
842 | "type": "git",
843 | "url": "https://github.com/php-http/promise.git",
844 | "reference": "4c4c1f9b7289a2ec57cde7f1e9762a5789506f88"
845 | },
846 | "dist": {
847 | "type": "zip",
848 | "url": "https://api.github.com/repos/php-http/promise/zipball/4c4c1f9b7289a2ec57cde7f1e9762a5789506f88",
849 | "reference": "4c4c1f9b7289a2ec57cde7f1e9762a5789506f88",
850 | "shasum": ""
851 | },
852 | "require": {
853 | "php": "^7.1 || ^8.0"
854 | },
855 | "require-dev": {
856 | "friends-of-phpspec/phpspec-code-coverage": "^4.3.2",
857 | "phpspec/phpspec": "^5.1.2 || ^6.2"
858 | },
859 | "type": "library",
860 | "extra": {
861 | "branch-alias": {
862 | "dev-master": "1.1-dev"
863 | }
864 | },
865 | "autoload": {
866 | "psr-4": {
867 | "Http\\Promise\\": "src/"
868 | }
869 | },
870 | "notification-url": "https://packagist.org/downloads/",
871 | "license": [
872 | "MIT"
873 | ],
874 | "authors": [
875 | {
876 | "name": "Joel Wurtz",
877 | "email": "joel.wurtz@gmail.com"
878 | },
879 | {
880 | "name": "Márk Sági-Kazár",
881 | "email": "mark.sagikazar@gmail.com"
882 | }
883 | ],
884 | "description": "Promise used for asynchronous HTTP requests",
885 | "homepage": "http://httplug.io",
886 | "keywords": [
887 | "promise"
888 | ],
889 | "support": {
890 | "issues": "https://github.com/php-http/promise/issues",
891 | "source": "https://github.com/php-http/promise/tree/1.1.0"
892 | },
893 | "time": "2020-07-07T09:29:14+00:00"
894 | },
895 | {
896 | "name": "phpoption/phpoption",
897 | "version": "1.8.0",
898 | "source": {
899 | "type": "git",
900 | "url": "https://github.com/schmittjoh/php-option.git",
901 | "reference": "5455cb38aed4523f99977c4a12ef19da4bfe2a28"
902 | },
903 | "dist": {
904 | "type": "zip",
905 | "url": "https://api.github.com/repos/schmittjoh/php-option/zipball/5455cb38aed4523f99977c4a12ef19da4bfe2a28",
906 | "reference": "5455cb38aed4523f99977c4a12ef19da4bfe2a28",
907 | "shasum": ""
908 | },
909 | "require": {
910 | "php": "^7.0 || ^8.0"
911 | },
912 | "require-dev": {
913 | "bamarni/composer-bin-plugin": "^1.4.1",
914 | "phpunit/phpunit": "^6.5.14 || ^7.0.20 || ^8.5.19 || ^9.5.8"
915 | },
916 | "type": "library",
917 | "extra": {
918 | "branch-alias": {
919 | "dev-master": "1.8-dev"
920 | }
921 | },
922 | "autoload": {
923 | "psr-4": {
924 | "PhpOption\\": "src/PhpOption/"
925 | }
926 | },
927 | "notification-url": "https://packagist.org/downloads/",
928 | "license": [
929 | "Apache-2.0"
930 | ],
931 | "authors": [
932 | {
933 | "name": "Johannes M. Schmitt",
934 | "email": "schmittjoh@gmail.com"
935 | },
936 | {
937 | "name": "Graham Campbell",
938 | "email": "hello@gjcampbell.co.uk"
939 | }
940 | ],
941 | "description": "Option Type for PHP",
942 | "keywords": [
943 | "language",
944 | "option",
945 | "php",
946 | "type"
947 | ],
948 | "support": {
949 | "issues": "https://github.com/schmittjoh/php-option/issues",
950 | "source": "https://github.com/schmittjoh/php-option/tree/1.8.0"
951 | },
952 | "funding": [
953 | {
954 | "url": "https://github.com/GrahamCampbell",
955 | "type": "github"
956 | },
957 | {
958 | "url": "https://tidelift.com/funding/github/packagist/phpoption/phpoption",
959 | "type": "tidelift"
960 | }
961 | ],
962 | "time": "2021-08-28T21:27:29+00:00"
963 | },
964 | {
965 | "name": "psr/http-client",
966 | "version": "1.0.1",
967 | "source": {
968 | "type": "git",
969 | "url": "https://github.com/php-fig/http-client.git",
970 | "reference": "2dfb5f6c5eff0e91e20e913f8c5452ed95b86621"
971 | },
972 | "dist": {
973 | "type": "zip",
974 | "url": "https://api.github.com/repos/php-fig/http-client/zipball/2dfb5f6c5eff0e91e20e913f8c5452ed95b86621",
975 | "reference": "2dfb5f6c5eff0e91e20e913f8c5452ed95b86621",
976 | "shasum": ""
977 | },
978 | "require": {
979 | "php": "^7.0 || ^8.0",
980 | "psr/http-message": "^1.0"
981 | },
982 | "type": "library",
983 | "extra": {
984 | "branch-alias": {
985 | "dev-master": "1.0.x-dev"
986 | }
987 | },
988 | "autoload": {
989 | "psr-4": {
990 | "Psr\\Http\\Client\\": "src/"
991 | }
992 | },
993 | "notification-url": "https://packagist.org/downloads/",
994 | "license": [
995 | "MIT"
996 | ],
997 | "authors": [
998 | {
999 | "name": "PHP-FIG",
1000 | "homepage": "http://www.php-fig.org/"
1001 | }
1002 | ],
1003 | "description": "Common interface for HTTP clients",
1004 | "homepage": "https://github.com/php-fig/http-client",
1005 | "keywords": [
1006 | "http",
1007 | "http-client",
1008 | "psr",
1009 | "psr-18"
1010 | ],
1011 | "support": {
1012 | "source": "https://github.com/php-fig/http-client/tree/master"
1013 | },
1014 | "time": "2020-06-29T06:28:15+00:00"
1015 | },
1016 | {
1017 | "name": "psr/http-message",
1018 | "version": "1.0.1",
1019 | "source": {
1020 | "type": "git",
1021 | "url": "https://github.com/php-fig/http-message.git",
1022 | "reference": "f6561bf28d520154e4b0ec72be95418abe6d9363"
1023 | },
1024 | "dist": {
1025 | "type": "zip",
1026 | "url": "https://api.github.com/repos/php-fig/http-message/zipball/f6561bf28d520154e4b0ec72be95418abe6d9363",
1027 | "reference": "f6561bf28d520154e4b0ec72be95418abe6d9363",
1028 | "shasum": ""
1029 | },
1030 | "require": {
1031 | "php": ">=5.3.0"
1032 | },
1033 | "type": "library",
1034 | "extra": {
1035 | "branch-alias": {
1036 | "dev-master": "1.0.x-dev"
1037 | }
1038 | },
1039 | "autoload": {
1040 | "psr-4": {
1041 | "Psr\\Http\\Message\\": "src/"
1042 | }
1043 | },
1044 | "notification-url": "https://packagist.org/downloads/",
1045 | "license": [
1046 | "MIT"
1047 | ],
1048 | "authors": [
1049 | {
1050 | "name": "PHP-FIG",
1051 | "homepage": "http://www.php-fig.org/"
1052 | }
1053 | ],
1054 | "description": "Common interface for HTTP messages",
1055 | "homepage": "https://github.com/php-fig/http-message",
1056 | "keywords": [
1057 | "http",
1058 | "http-message",
1059 | "psr",
1060 | "psr-7",
1061 | "request",
1062 | "response"
1063 | ],
1064 | "support": {
1065 | "source": "https://github.com/php-fig/http-message/tree/master"
1066 | },
1067 | "time": "2016-08-06T14:39:51+00:00"
1068 | },
1069 | {
1070 | "name": "ralouphie/getallheaders",
1071 | "version": "3.0.3",
1072 | "source": {
1073 | "type": "git",
1074 | "url": "https://github.com/ralouphie/getallheaders.git",
1075 | "reference": "120b605dfeb996808c31b6477290a714d356e822"
1076 | },
1077 | "dist": {
1078 | "type": "zip",
1079 | "url": "https://api.github.com/repos/ralouphie/getallheaders/zipball/120b605dfeb996808c31b6477290a714d356e822",
1080 | "reference": "120b605dfeb996808c31b6477290a714d356e822",
1081 | "shasum": ""
1082 | },
1083 | "require": {
1084 | "php": ">=5.6"
1085 | },
1086 | "require-dev": {
1087 | "php-coveralls/php-coveralls": "^2.1",
1088 | "phpunit/phpunit": "^5 || ^6.5"
1089 | },
1090 | "type": "library",
1091 | "autoload": {
1092 | "files": [
1093 | "src/getallheaders.php"
1094 | ]
1095 | },
1096 | "notification-url": "https://packagist.org/downloads/",
1097 | "license": [
1098 | "MIT"
1099 | ],
1100 | "authors": [
1101 | {
1102 | "name": "Ralph Khattar",
1103 | "email": "ralph.khattar@gmail.com"
1104 | }
1105 | ],
1106 | "description": "A polyfill for getallheaders.",
1107 | "support": {
1108 | "issues": "https://github.com/ralouphie/getallheaders/issues",
1109 | "source": "https://github.com/ralouphie/getallheaders/tree/develop"
1110 | },
1111 | "time": "2019-03-08T08:55:37+00:00"
1112 | },
1113 | {
1114 | "name": "simplepie/simplepie",
1115 | "version": "1.5.6",
1116 | "source": {
1117 | "type": "git",
1118 | "url": "https://github.com/simplepie/simplepie.git",
1119 | "reference": "1c68e14ca3ac84346b6e6fe3c5eedf725d0f92c6"
1120 | },
1121 | "dist": {
1122 | "type": "zip",
1123 | "url": "https://api.github.com/repos/simplepie/simplepie/zipball/1c68e14ca3ac84346b6e6fe3c5eedf725d0f92c6",
1124 | "reference": "1c68e14ca3ac84346b6e6fe3c5eedf725d0f92c6",
1125 | "shasum": ""
1126 | },
1127 | "require": {
1128 | "ext-pcre": "*",
1129 | "ext-xml": "*",
1130 | "ext-xmlreader": "*",
1131 | "php": ">=5.6.0"
1132 | },
1133 | "require-dev": {
1134 | "phpunit/phpunit": "~5.4.3 || ~6.5"
1135 | },
1136 | "suggest": {
1137 | "ext-curl": "",
1138 | "ext-iconv": "",
1139 | "ext-intl": "",
1140 | "ext-mbstring": "",
1141 | "mf2/mf2": "Microformat module that allows for parsing HTML for microformats"
1142 | },
1143 | "type": "library",
1144 | "autoload": {
1145 | "psr-0": {
1146 | "SimplePie": "library"
1147 | }
1148 | },
1149 | "notification-url": "https://packagist.org/downloads/",
1150 | "license": [
1151 | "BSD-3-Clause"
1152 | ],
1153 | "authors": [
1154 | {
1155 | "name": "Ryan Parman",
1156 | "homepage": "http://ryanparman.com/",
1157 | "role": "Creator, alumnus developer"
1158 | },
1159 | {
1160 | "name": "Sam Sneddon",
1161 | "homepage": "https://gsnedders.com/",
1162 | "role": "Alumnus developer"
1163 | },
1164 | {
1165 | "name": "Ryan McCue",
1166 | "email": "me@ryanmccue.info",
1167 | "homepage": "http://ryanmccue.info/",
1168 | "role": "Developer"
1169 | }
1170 | ],
1171 | "description": "A simple Atom/RSS parsing library for PHP",
1172 | "homepage": "http://simplepie.org/",
1173 | "keywords": [
1174 | "atom",
1175 | "feeds",
1176 | "rss"
1177 | ],
1178 | "support": {
1179 | "issues": "https://github.com/simplepie/simplepie/issues",
1180 | "source": "https://github.com/simplepie/simplepie/tree/1.5.6"
1181 | },
1182 | "time": "2020-10-14T07:17:22+00:00"
1183 | },
1184 | {
1185 | "name": "symfony/polyfill-ctype",
1186 | "version": "v1.23.0",
1187 | "source": {
1188 | "type": "git",
1189 | "url": "https://github.com/symfony/polyfill-ctype.git",
1190 | "reference": "46cd95797e9df938fdd2b03693b5fca5e64b01ce"
1191 | },
1192 | "dist": {
1193 | "type": "zip",
1194 | "url": "https://api.github.com/repos/symfony/polyfill-ctype/zipball/46cd95797e9df938fdd2b03693b5fca5e64b01ce",
1195 | "reference": "46cd95797e9df938fdd2b03693b5fca5e64b01ce",
1196 | "shasum": ""
1197 | },
1198 | "require": {
1199 | "php": ">=7.1"
1200 | },
1201 | "suggest": {
1202 | "ext-ctype": "For best performance"
1203 | },
1204 | "type": "library",
1205 | "extra": {
1206 | "branch-alias": {
1207 | "dev-main": "1.23-dev"
1208 | },
1209 | "thanks": {
1210 | "name": "symfony/polyfill",
1211 | "url": "https://github.com/symfony/polyfill"
1212 | }
1213 | },
1214 | "autoload": {
1215 | "psr-4": {
1216 | "Symfony\\Polyfill\\Ctype\\": ""
1217 | },
1218 | "files": [
1219 | "bootstrap.php"
1220 | ]
1221 | },
1222 | "notification-url": "https://packagist.org/downloads/",
1223 | "license": [
1224 | "MIT"
1225 | ],
1226 | "authors": [
1227 | {
1228 | "name": "Gert de Pagter",
1229 | "email": "BackEndTea@gmail.com"
1230 | },
1231 | {
1232 | "name": "Symfony Community",
1233 | "homepage": "https://symfony.com/contributors"
1234 | }
1235 | ],
1236 | "description": "Symfony polyfill for ctype functions",
1237 | "homepage": "https://symfony.com",
1238 | "keywords": [
1239 | "compatibility",
1240 | "ctype",
1241 | "polyfill",
1242 | "portable"
1243 | ],
1244 | "support": {
1245 | "source": "https://github.com/symfony/polyfill-ctype/tree/v1.23.0"
1246 | },
1247 | "funding": [
1248 | {
1249 | "url": "https://symfony.com/sponsor",
1250 | "type": "custom"
1251 | },
1252 | {
1253 | "url": "https://github.com/fabpot",
1254 | "type": "github"
1255 | },
1256 | {
1257 | "url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
1258 | "type": "tidelift"
1259 | }
1260 | ],
1261 | "time": "2021-02-19T12:13:01+00:00"
1262 | },
1263 | {
1264 | "name": "symfony/polyfill-mbstring",
1265 | "version": "v1.23.1",
1266 | "source": {
1267 | "type": "git",
1268 | "url": "https://github.com/symfony/polyfill-mbstring.git",
1269 | "reference": "9174a3d80210dca8daa7f31fec659150bbeabfc6"
1270 | },
1271 | "dist": {
1272 | "type": "zip",
1273 | "url": "https://api.github.com/repos/symfony/polyfill-mbstring/zipball/9174a3d80210dca8daa7f31fec659150bbeabfc6",
1274 | "reference": "9174a3d80210dca8daa7f31fec659150bbeabfc6",
1275 | "shasum": ""
1276 | },
1277 | "require": {
1278 | "php": ">=7.1"
1279 | },
1280 | "suggest": {
1281 | "ext-mbstring": "For best performance"
1282 | },
1283 | "type": "library",
1284 | "extra": {
1285 | "branch-alias": {
1286 | "dev-main": "1.23-dev"
1287 | },
1288 | "thanks": {
1289 | "name": "symfony/polyfill",
1290 | "url": "https://github.com/symfony/polyfill"
1291 | }
1292 | },
1293 | "autoload": {
1294 | "psr-4": {
1295 | "Symfony\\Polyfill\\Mbstring\\": ""
1296 | },
1297 | "files": [
1298 | "bootstrap.php"
1299 | ]
1300 | },
1301 | "notification-url": "https://packagist.org/downloads/",
1302 | "license": [
1303 | "MIT"
1304 | ],
1305 | "authors": [
1306 | {
1307 | "name": "Nicolas Grekas",
1308 | "email": "p@tchwork.com"
1309 | },
1310 | {
1311 | "name": "Symfony Community",
1312 | "homepage": "https://symfony.com/contributors"
1313 | }
1314 | ],
1315 | "description": "Symfony polyfill for the Mbstring extension",
1316 | "homepage": "https://symfony.com",
1317 | "keywords": [
1318 | "compatibility",
1319 | "mbstring",
1320 | "polyfill",
1321 | "portable",
1322 | "shim"
1323 | ],
1324 | "support": {
1325 | "source": "https://github.com/symfony/polyfill-mbstring/tree/v1.23.1"
1326 | },
1327 | "funding": [
1328 | {
1329 | "url": "https://symfony.com/sponsor",
1330 | "type": "custom"
1331 | },
1332 | {
1333 | "url": "https://github.com/fabpot",
1334 | "type": "github"
1335 | },
1336 | {
1337 | "url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
1338 | "type": "tidelift"
1339 | }
1340 | ],
1341 | "time": "2021-05-27T12:26:48+00:00"
1342 | },
1343 | {
1344 | "name": "vlucas/phpdotenv",
1345 | "version": "v3.6.8",
1346 | "source": {
1347 | "type": "git",
1348 | "url": "https://github.com/vlucas/phpdotenv.git",
1349 | "reference": "5e679f7616db829358341e2d5cccbd18773bdab8"
1350 | },
1351 | "dist": {
1352 | "type": "zip",
1353 | "url": "https://api.github.com/repos/vlucas/phpdotenv/zipball/5e679f7616db829358341e2d5cccbd18773bdab8",
1354 | "reference": "5e679f7616db829358341e2d5cccbd18773bdab8",
1355 | "shasum": ""
1356 | },
1357 | "require": {
1358 | "php": "^5.4 || ^7.0 || ^8.0",
1359 | "phpoption/phpoption": "^1.5.2",
1360 | "symfony/polyfill-ctype": "^1.17"
1361 | },
1362 | "require-dev": {
1363 | "ext-filter": "*",
1364 | "ext-pcre": "*",
1365 | "phpunit/phpunit": "^4.8.36 || ^5.7.27 || ^6.5.14 || ^7.5.20"
1366 | },
1367 | "suggest": {
1368 | "ext-filter": "Required to use the boolean validator.",
1369 | "ext-pcre": "Required to use most of the library."
1370 | },
1371 | "type": "library",
1372 | "extra": {
1373 | "branch-alias": {
1374 | "dev-master": "3.6-dev"
1375 | }
1376 | },
1377 | "autoload": {
1378 | "psr-4": {
1379 | "Dotenv\\": "src/"
1380 | }
1381 | },
1382 | "notification-url": "https://packagist.org/downloads/",
1383 | "license": [
1384 | "BSD-3-Clause"
1385 | ],
1386 | "authors": [
1387 | {
1388 | "name": "Graham Campbell",
1389 | "email": "graham@alt-three.com",
1390 | "homepage": "https://gjcampbell.co.uk/"
1391 | },
1392 | {
1393 | "name": "Vance Lucas",
1394 | "email": "vance@vancelucas.com",
1395 | "homepage": "https://vancelucas.com/"
1396 | }
1397 | ],
1398 | "description": "Loads environment variables from `.env` to `getenv()`, `$_ENV` and `$_SERVER` automagically.",
1399 | "keywords": [
1400 | "dotenv",
1401 | "env",
1402 | "environment"
1403 | ],
1404 | "support": {
1405 | "issues": "https://github.com/vlucas/phpdotenv/issues",
1406 | "source": "https://github.com/vlucas/phpdotenv/tree/v3.6.8"
1407 | },
1408 | "funding": [
1409 | {
1410 | "url": "https://github.com/GrahamCampbell",
1411 | "type": "github"
1412 | },
1413 | {
1414 | "url": "https://tidelift.com/funding/github/packagist/vlucas/phpdotenv",
1415 | "type": "tidelift"
1416 | }
1417 | ],
1418 | "time": "2021-01-20T14:39:46+00:00"
1419 | }
1420 | ],
1421 | "packages-dev": [],
1422 | "aliases": [],
1423 | "minimum-stability": "stable",
1424 | "stability-flags": [],
1425 | "prefer-stable": false,
1426 | "prefer-lowest": false,
1427 | "platform": [],
1428 | "platform-dev": [],
1429 | "plugin-api-version": "2.1.0"
1430 | }
1431 |
--------------------------------------------------------------------------------
/composer.phar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mauricesvay/FullHackerNews/73117bd1f81ddb163048e5828ee6da1edabd157e/composer.phar
--------------------------------------------------------------------------------
/example.env:
--------------------------------------------------------------------------------
1 | # Copy this file to .env
2 |
3 | # Feed
4 | FEED_URL="http://news.ycombinator.com/rss"
5 |
6 | # AWS
7 | AWS_BUCKET="www.fullhn.com"
8 | AWS_ACCESS_KEY_ID=""
9 | AWS_SECRET_ACCESS_KEY=""
10 |
--------------------------------------------------------------------------------
/index.php:
--------------------------------------------------------------------------------
1 | load();
6 |
7 | require_once __DIR__ . "/src/Fetcher.php";
8 | require_once __DIR__ . "/src/Parser.php";
9 | require_once __DIR__ . "/src/Generator.php";
10 | require_once __DIR__ . "/src/Uploader.php";
11 | require_once __DIR__ . "/lib/ansi-color.php";
12 |
13 | use PhpAnsiColor\Color;
14 |
// Folder that receives the generated static files before they are uploaded.
$out_folder = __DIR__ . '/www';

// Load the feed named by the FEED_URL environment variable, caching it in ./cache.
$feed = new SimplePie();
$feed->set_cache_duration(600);
$feed->set_cache_location(__DIR__ . '/cache');
$feed->set_feed_url(getenv('FEED_URL'));
$feed->init();

// One entry per feed item: position, link, link host, title and comments URL.
$articles = [];

foreach ($feed->get_items() as $i => $item) {
    $permalink = $item->get_permalink();

    // parse_url() yields false for a malformed URL and null when there is no
    // host component; fall back to an empty domain instead of reading a
    // missing array offset.
    $host = parse_url((string) $permalink, PHP_URL_HOST);

    // get_item_tags() returns null when the item has no <comments> element,
    // and count(null) is a TypeError on PHP 8, so test the offset directly.
    $comment_tags = $item->get_item_tags('', 'comments');

    $articles[] = [
        'index' => $i,
        'url' => $permalink,
        'domain' => is_string($host) ? $host : '',
        'title' => $item->get_title(),
        'comments' => isset($comment_tags[0]['data']) ? $comment_tags[0]['data'] : '',
    ];
}
36 |
// Download every article, extract its readable content and lead image, and
// log a short report per article. A failure on one article is logged and
// leaves that article with empty content instead of aborting the run.
foreach ($articles as $position => $entry) {
    $link = $entry['url'];

    error_log("================================================================================");
    error_log(Color::set($link, "yellow"));
    error_log("title: " . $entry['title'] . " (" . $entry['domain'] . ")");

    try {
        $html = FullFeed\Fetcher::fetch($link);
        $readable = FullFeed\Parser::parse($link, $html);
        $picture = FullFeed\Parser::extractImage($html);
    } catch (Exception $failure) {
        error_log(Color::set($failure->getMessage(), "red"));
        // Reset all three fields, even those computed before the failure.
        $html = "";
        $readable = "";
        $picture = "";
    }

    $articles[$position]['content'] = $html;
    $articles[$position]['parsed'] = $readable;
    $articles[$position]['image'] = $picture;

    error_log("comments: " . $entry['comments']);
    error_log("image: " . $picture);
    error_log("content: " . strlen($html));
    error_log("parsed: " . strlen($readable));
}
56 |
error_log("================================================================================");
error_log(Color::set("Uploading to S3", "yellow"));

// Render each page from the template of the same name into the output folder.
foreach (['index', 'latest'] as $page) {
    $rendered = FullFeed\Generator::renderTemplateWithArticles($page, $articles);
    file_put_contents($out_folder . '/' . $page . '.html', $rendered);
}

// The manifest is generated after the pages so it is built from the fresh output folder.
$manifest = FullFeed\Generator::generateManifest($out_folder, date('r'));
file_put_contents($out_folder . '/cache.manifest', $manifest);

FullFeed\Uploader::upload($out_folder);
--------------------------------------------------------------------------------
/lib/FileSystemCache/.gitignore:
--------------------------------------------------------------------------------
1 | cache/
2 | examples/cache/
3 | vendor/
4 | tests/cache/
5 |
--------------------------------------------------------------------------------
/lib/FileSystemCache/.travis.yml:
--------------------------------------------------------------------------------
1 | language: php
2 | php:
3 | - 5.4
4 | - 5.3
5 | before_script: mkdir tests/cache; mkdir tests/cache/test; mkdir tests/cache/test2; mkdir tests/cache/test/test; chmod -R 777 tests/cache;
6 | script: phpunit --coverage-text
7 |
--------------------------------------------------------------------------------
/lib/FileSystemCache/README.md:
--------------------------------------------------------------------------------
1 | FileSystemCache
2 | ===============
3 |
4 | A simple PHP class for caching data in the filesystem. Major features include:
5 |
6 | * Support for TTL when storing data
7 | * Support for "Newer Than" parameter when retrieving data
8 | * Every call is an atomic operation with proper file locking
9 | * Can group cache keys together for easy invalidation
10 | * Composer support
11 | * PHPUnit tests
12 |
13 | [Build status on Travis CI](http://travis-ci.org/jdorn/FileSystemCache)
14 |
15 | Getting Started
16 | ------------------
17 | FileSystemCache can be installed with Composer or downloaded manually.
18 |
19 | ### With Composer
20 |
21 | If you're already using Composer, just add `jdorn/file-system-cache` to your `composer.json` file.
22 | FileSystemCache works with Composer's autoloader out of the box.
23 | ```js
24 | {
25 | "require": {
26 | "jdorn/file-system-cache": "dev-master"
27 | }
28 | }
29 | ```
30 |
31 | ### Manually
32 |
33 | If you aren't using Composer, you just need to include `lib/FileSystemCache.php` in your script.
34 |
35 | ```php
36 | require_once("path/to/FileSystemCache.php");
37 | ```
38 |
39 | Setting the Cache Directory
40 | -----------------------
41 |
42 | By default, all cached data is stored in the `cache` directory relative to the currently executing script.
43 | You can change this by setting the $cacheDir static property.
44 |
45 | ```php
46 | 1001,
65 | 'ip address'=>'10.1.1.1'
66 | );
67 |
68 | //string
69 | $key_data = 'my_key';
70 |
71 | //object
72 | $key_data = new SomeObject();
73 |
74 | //number
75 | $key_data = 1005;
76 |
77 |
78 | //generate a key object
79 | $key = FileSystemCache::generateCacheKey($key_data);
80 | ```
81 |
82 | You can group cache keys together to better organize your data and make invalidation easier.
83 |
84 | ```php
85 | 'is some data I want to cache',
116 | 'it'=>'can be a string, array, object, or number.'
117 | );
118 |
119 | $key = FileSystemCache::generateCacheKey('mykey');
120 |
121 | FileSystemCache::store($key, $data);
122 | ```
123 |
124 | If you want the data to expire automatically after a set amount of time, use the optional `ttl` parameter.
125 |
126 | ```php
127 | // Expire automatically after 1 hour (3600 seconds)
128 | FileSystemCache::store($key, $data, 3600);
129 | ```
130 |
131 | Retrieve
132 | --------------------
133 | You retrieve data using the same cache key you used to store it. `False` will be returned if the data was not cached or expired.
134 |
135 | ```php
136 | $data = FileSystemCache::retrieve($key);
137 |
138 | // If there was a cache miss
139 | if($data === false) {
140 | ...
141 | }
142 | ```
143 |
144 | You can specify a `newer than` timestamp to only retrieve cached data that was stored after a certain time.
145 | This is useful for storing a compiled version of a source file.
146 |
147 | ```php
148 | $file = 'source_file.txt';
149 | $modified = filemtime($file);
150 |
151 | $key = FileSystemCache::generateCacheKey($file);
152 |
153 | $data = FileSystemCache::retrieve($key, $modified);
154 |
155 | // If there was a cache miss
156 | if($data === false) {
157 | ...
158 | }
159 | ```
160 |
161 | Get and Modify
162 | ------------------
163 | There is an atomic `Get and Modify` method as well.
164 |
165 | ```php
166 | FileSystemCache::getAndModify($key, function($value) {
167 | $value->count++;
168 |
169 | return $value;
170 | });
171 | ```
172 |
173 | If the data was originally cached with a TTL, you can pass `true` as the 3rd parameter to reset the TTL.
174 | Otherwise, it will be based on the original time it was stored.
175 |
176 |
177 | Invalidate
178 | -------------------
179 | You can invalidate a single cache key or a group of cache keys.
180 |
181 | ```php
182 | FileSystemCache::invalidate($key);
183 |
184 | FileSystemCache::invalidateGroup('mygroup');
185 | ```
186 |
187 | Invalidating a group is done recursively by default and all sub-groups will also be invalidated.
188 | If you pass `false` as the 2nd parameter, you can make it non-recursive.
189 |
190 | ```php
191 | FileSystemCache::invalidateGroup('mygroup', false);
192 | ```
193 |
194 | Running the Tests
195 | ------------------
196 | You need PHPUnit installed to run the tests. Configuration is defined in `phpunit.xml.dist`. Running the tests is easy:
197 |
198 | ```
199 | phpunit
200 | ```
201 |
--------------------------------------------------------------------------------
/lib/FileSystemCache/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "jdorn/file-system-cache",
3 | "description": "an easy way to cache data in the file system",
4 | "homepage": "https://github.com/jdorn/FileSystemCache/",
5 | "keywords": ["cache", "file system"],
6 | "minimum-stability": "dev",
7 | "license": "LGPL",
8 | "type": "library",
9 | "require": {
10 | "php": ">=5.3.0"
11 | },
12 | "authors": [
13 | {
14 | "name": "Jeremy Dorn",
15 | "email": "jeremy@jeremydorn.com",
16 | "homepage": "http://jeremydorn.com/"
17 | }
18 | ],
19 | "autoload": {
20 | "classmap": ["lib"]
21 | },
22 | "extra": {
23 | "branch-alias": {
24 | "dev-master": "1.0.x-dev"
25 | }
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/lib/FileSystemCache/lib/FileSystemCache.php:
--------------------------------------------------------------------------------
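1 | <?php
2 | /**
3 | * FileSystemCache
4 | * A simple PHP class for caching data in the filesystem.
5 | *
6 | * @author Jeremy Dorn <jeremy@jeremydorn.com>
7 | * @copyright 2012 Jeremy Dorn
8 | * @license http://www.opensource.org/licenses/lgpl-license.php LGPL
9 | * @link http://github.com/jdorn/FileSystemCache
10 | * @version 1.0.0
11 | */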
class FileSystemCache {
    /**
     * The root cache directory. Everything will be cached relative to this directory.
     * @var string
     */
    public static $cacheDir = 'cache';

    /**
     * Generates a cache key to use with store, retrieve, getAndModify, and invalidate
     * @param mixed $key_data Unique data that identifies the key. Can be a string, array, number, or object.
     * @param String $group An optional group to put the cache key in. Must be in the format "groupname" or "groupname/subgroupname".
     * @return FileSystemCacheKey The cache key object.
     */
    public static function generateCacheKey($key_data, $group=null) {
        return new FileSystemCacheKey($key_data,$group);
    }

    /**
     * Stores data in the cache, replacing whatever was stored under the key before.
     * @param FileSystemCacheKey $key The cache key
     * @param mixed $data The data to store (will be serialized before storing)
     * @param int $ttl The number of seconds until the cache expires. (optional)
     * @return boolean True on success, false on failure
     */
    public static function store(FileSystemCacheKey $key, $data, $ttl=null) {
        $filename = $key->getFileName();

        $data = new FileSystemCacheValue($key,$data,$ttl);

        $fh = self::getFileHandle($filename,'c');
        if(!$fh) return false;

        //mode 'c' keeps the previous contents, so drop them before writing;
        //otherwise a shorter payload would leave stale bytes behind it.
        //emptyFile() closes the handle itself when it fails.
        if(!self::emptyFile($fh)) return false;

        //putContents() closes the handle and releases the lock
        return self::putContents($fh,$data);
    }

    /**
     * Retrieve data from cache
     * @param FileSystemCacheKey $key The cache key
     * @param int $newer_than If passed, only return if the cached value was created after this time
     * @return mixed The cached data or FALSE if not found or expired
     */
    public static function retrieve(FileSystemCacheKey $key, $newer_than=null) {
        $filename = $key->getFileName();

        if(!file_exists($filename)) return false;

        //if cached data is not newer than $newer_than
        if($newer_than && filemtime($filename) < $newer_than) return false;

        $fh = self::getFileHandle($filename,'r');
        if(!$fh) return false;

        //getContents() closes the handle itself when it returns false
        $data = self::getContents($fh,$key);
        if(!$data) return false;

        self::closeFile($fh);
        return $data->value;
    }

    /**
     * Atomically retrieve data from cache, modify it, and store it back
     * @param FileSystemCacheKey $key The cache key
     * @param Closure $callback A closure function to modify the cache value.
     * Takes the old value as an argument and returns new value.
     * If this function returns false, the cached value will be invalidated.
     * @param bool $resetTtl If set to true, the expiration date will be recalculated using the previous TTL
     * @return mixed The new value if it was stored successfully or false if it wasn't
     * @throws Exception Any exception thrown by the callback is re-thrown after the file lock is released
     */
    public static function getAndModify(FileSystemCacheKey $key, Closure $callback, $resetTtl=false) {
        $filename = $key->getFileName();

        if(!file_exists($filename)) return false;

        //open a file handle
        $fh = self::getFileHandle($filename,'c+');
        if(!$fh) return false;

        //get the data (the handle is already closed when this returns false)
        $data = self::getContents($fh,$key);
        if(!$data) return false;

        //snapshot the old value in serialized form: an object handed to the
        //callback is the same instance afterwards, so an identity comparison
        //could never detect a change made to it in place
        $old_serialized = serialize($data->value);

        //get new value from callback function, releasing the lock if it throws
        try {
            $data->value = $callback($data->value);
        }
        catch(Exception $e) {
            self::closeFile($fh);
            throw $e;
        }

        //if the callback function returns false
        if($data->value === false) {
            self::closeFile($fh);

            //delete the cache file
            self::invalidate($key);
            return false;
        }

        //if value didn't change
        if(!$resetTtl && serialize($data->value) === $old_serialized) {
            self::closeFile($fh);
            return $data->value;
        }

        //if we're resetting the ttl to now
        if($resetTtl) {
            $data->created = time();
            if($data->ttl) {
                $data->expires = $data->created + $data->ttl;
            }
        }

        if(!self::emptyFile($fh)) return false;

        //write contents and close the file handle
        if(!self::putContents($fh,$data)) return false;

        //return the new value after modifying
        return $data->value;
    }

    /**
     * Invalidate a specific cache key
     * @param FileSystemCacheKey $key The cache key
     * @return boolean True on success. Currently never returns false.
     */
    public static function invalidate(FileSystemCacheKey $key) {
        $filename = $key->getFileName();
        if(file_exists($filename)) {
            unlink($filename);
        }
        return true;
    }

    /**
     * Invalidate a group of cache keys
     * @param string $name The name of the group to invalidate (e.g. 'groupname', 'groupname/subgroupname', etc.). If null, the entire cache will be invalidated.
     * @param boolean $recursive If set to false, none of the subgroups will be invalidated.
     * @throws Exception If an invalid group name is given
     */
    public static function invalidateGroup($name=null, $recursive=true) {
        //if invalidating a group, make sure it's valid
        if($name) {
            //it needs to have a trailing slash and no leading slashes
            $name = trim($name,'/').'/';

            //make sure the key isn't going up a directory
            if(strpos($name,'..') !== false) {
                throw new Exception("Invalidate path cannot go up directories.");
            }
        }

        //glob() returns false on error, so only act on a real, non-empty result
        $files = glob(self::$cacheDir.'/'.$name.'*.cache');
        if($files) {
            array_map("unlink", $files);
        }

        //if recursively invalidating
        if($recursive) {
            $subdirs = glob(self::$cacheDir.'/'.$name.'*',GLOB_ONLYDIR);
            if(!$subdirs) return;

            foreach($subdirs as $dir) {
                $dir = basename($dir);

                //skip all subdirectories that start with '.'
                if($dir[0] == '.') continue;

                self::invalidateGroup($name.$dir,true);
            }
        }
    }


    /**
     * Get a file handle from a file name. Will create the directory if it doesn't exist already. Also, automatically locks the file with the proper read or write lock.
     * @param String $filename The full file path.
     * @param String $mode The file mode. Accepted modes are 'c', 'c+', and 'r'.
     * @return resource|boolean The locked file handle, or false if the directory or file could not be prepared
     */
    private static function getFileHandle($filename, $mode='c') {
        $write = in_array($mode,array('c','c+'));

        if($write) {
            //make sure the directory exists and is writable
            $directory = dirname($filename);
            if(!file_exists($directory)) {
                //another process may create the directory between the check
                //and mkdir(); that is only a failure if it still isn't there
                if(!mkdir($directory,0777,true) && !is_dir($directory)) {
                    return false;
                }
            }
            elseif(!is_dir($directory)) {
                return false;
            }
            elseif(!is_writable($directory)) {
                return false;
            }
        }

        //get file pointer
        $fh = fopen($filename,$mode);

        if(!$fh) return false;

        //exclusive lock for writers, shared lock for readers
        if(!flock($fh,$write ? LOCK_EX : LOCK_SH)) {
            self::closeFile($fh);
            return false;
        }

        return $fh;
    }

    /**
     * Empties a file. If empty fails, the file will be closed and it will return false.
     * @param resource $fh The file handle
     * @return boolean true for success, false for failure
     */
    private static function emptyFile($fh) {
        rewind($fh);
        if(!ftruncate($fh,0)) {
            //release lock
            self::closeFile($fh);
            return false;
        }

        return true;
    }

    /**
     * Closes a file. Also releases any locks on the file.
     * @param resource $fh The file handle
     */
    private static function closeFile($fh) {
        flock($fh,LOCK_UN);
        fclose($fh);
    }

    /**
     * Returns the contents of a cache file. If the data is not in the right form or expired, it will be invalidated.
     * @param resource $fh The file handle
     * @param FileSystemCacheKey $key The cache key. This is used to invalidate the key when the data is expired.
     * @return boolean|FileSystemCacheValue FALSE if something went wrong or the data is expired (the handle is closed in that case). Otherwise, a FileSystemCacheValue object will be returned and the handle stays open.
     */
    private static function getContents($fh,FileSystemCacheKey $key) {
        //get the existing file contents
        $contents = stream_get_contents($fh);

        //NOTE(review): unserialize() instantiates whatever classes the file
        //names, so the cache directory must not be writable by untrusted parties
        $data = @unserialize($contents);

        //if we can't unserialize the data or if the data is expired
        if(!$data || !($data instanceof FileSystemCacheValue) || $data->isExpired()) {
            //release lock
            self::closeFile($fh);

            //delete the cache file so we don't try to retrieve it again
            self::invalidate($key);

            return false;
        }

        return $data;
    }

    /**
     * Writes to a file. Also closes and releases any locks on the file.
     * @param resource $fh The file handle
     * @param FileSystemCacheValue $data The cache value to store in the file.
     * @return boolean True if the whole payload was written and flushed, false otherwise.
     */
    private static function putContents($fh,FileSystemCacheValue $data) {
        $payload = serialize($data);

        $written = fwrite($fh,$payload);
        $ok = ($written === strlen($payload)) && fflush($fh);

        //release lock
        self::closeFile($fh);

        return $ok;
    }
}
297 |
/**
 * Class that represents a cache key.
 * Most of the time, you would get a FileSystemCacheKey object from FileSystemCache::generateCacheKey();
 */
class FileSystemCacheKey {
    /**
     * @var mixed The key data used to generate the cache key
     */
    public $key;
    /**
     * @var string The group (if any) that the key will be stored in. Can be null.
     */
    public $group;

    /**
     * Creates a FileSystemCacheKey object
     * @param mixed $key Key data that will be used to generate a cache key
     * @param string $group The group (if any) that the key will be stored in. Can be null.
     */
    public function __construct($key,$group) {
        $this->key = $key;
        $this->group = $group;
    }

    /**
     * Returns the generated cache key.
     * Non-string key data is serialized; any key containing characters other
     * than letters, digits, '_', '-' and '.' is replaced by its md5 hash.
     * The sanitized group path, if one remains, is prepended as "group/sub/key".
     * @return string The generated cache key.
     */
    public function __toString() {
        $key = $this->key;

        //convert arrays and objects into strings
        if(!is_string($key)) {
            $key = serialize($key);
        }

        //if we can't use the key directly, md5 it
        if(preg_match('/[^a-zA-Z0-9_\-\.]/',$key)) {
            $key = md5($key);
        }

        //if it contains a group ("0" is a legitimate group name, so compare
        //against the empty string instead of relying on truthiness)
        if($this->group !== null && (string)$this->group !== '') {
            //sanitize each path segment and drop the ones that end up empty
            $segments = array();
            foreach(explode('/',(string)$this->group) as $segment) {
                $segment = preg_replace('/[^a-zA-Z0-9_\-]/','',$segment);

                if($segment !== '') $segments[] = $segment;
            }

            //only prefix when something is left, so the key never starts with '/'
            if($segments) {
                $key = implode('/',$segments).'/'.$key;
            }
        }

        return $key;
    }

    /**
     * Returns the full path to the cache file for this key.
     * @return string The full path to the cache file for this key.
     */
    public function getFileName() {
        return FileSystemCache::$cacheDir . '/' . $this->__toString() . '.cache';
    }
}
366 |
/**
 * Container for what actually gets written to a cache file: the value
 * itself plus the bookkeeping needed to decide whether it is still valid.
 * You should never need to use this class directly.
 */
class FileSystemCacheValue {
    /**
     * @var FileSystemCacheKey The cache key the file is stored under.
     */
    public $key;
    /**
     * @var mixed The value being cached
     */
    public $value;
    /**
     * @var int Lifetime of the data in seconds. If null, the data won't expire.
     */
    public $ttl;
    /**
     * @var int Unix timestamp at which the data expires. If null, the data won't expire.
     */
    public $expires;
    /**
     * @var int Unix timestamp of when the value was created.
     */
    public $created;

    /**
     * Creates a FileSystemCacheValue object stamped with the current time.
     * @param FileSystemCacheKey $key The cache key the file is stored under.
     * @param mixed $value The data being stored
     * @param int $ttl Number of seconds the data stays valid. A null (or otherwise empty) value means no expiry.
     */
    public function __construct($key,$value,$ttl = null) {
        $this->key = $key;
        $this->value = $value;
        $this->ttl = $ttl;
        $this->created = time();

        // The expiry is derived once, relative to the creation time.
        $this->expires = $ttl ? $this->created + $ttl : null;
    }

    /**
     * Checks if a value is expired
     * @return bool True once the current time is past the expiry timestamp; always false for values without one.
     */
    public function isExpired() {
        $hasDeadline = (bool) $this->expires;

        return $hasDeadline && time() > $this->expires;
    }
}
421 |
--------------------------------------------------------------------------------
/lib/FileSystemCache/phpunit.xml.dist:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | ./tests
7 |
8 |
9 |
10 |
11 |
12 | ./
13 |
14 | ./tests
15 | ./vendor
16 | ./examples
17 |
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/lib/FileSystemCache/tests/FileSystemCacheTest.php:
--------------------------------------------------------------------------------
1 | assertInstanceOf('FileSystemCacheKey', $key);
13 | }
14 |
15 | /**
16 | * @dataProvider dataProvider
17 | */
function testStoreDataTypes($data) {
    $cacheKey = FileSystemCache::generateCacheKey('mytestkey');

    // Start clean so a leftover entry cannot satisfy the assertions.
    FileSystemCache::invalidate($cacheKey);
    $this->assertFalse(FileSystemCache::retrieve($cacheKey));

    // Round-trip the provided value through the cache.
    FileSystemCache::store($cacheKey, $data);
    $this->assertEquals($data, FileSystemCache::retrieve($cacheKey));

    // After invalidation the entry must be gone again.
    FileSystemCache::invalidate($cacheKey);
    $this->assertFalse(FileSystemCache::retrieve($cacheKey));
}
33 |
34 | /**
35 | * @dataProvider keyDataProvider
36 | */
function testStore($key_data, $group) {
    $cacheKey = FileSystemCache::generateCacheKey($key_data, $group);

    // A timestamped payload keeps runs from accidentally matching old data.
    $payload = 'test'.microtime(true);

    FileSystemCache::invalidate($cacheKey);
    $this->assertFalse(FileSystemCache::retrieve($cacheKey));

    FileSystemCache::store($cacheKey, $payload);
    $this->assertEquals($payload, FileSystemCache::retrieve($cacheKey));

    FileSystemCache::invalidate($cacheKey);
    $this->assertFalse(FileSystemCache::retrieve($cacheKey));
}
54 |
function testStoreTtl() {
    $cacheKey = FileSystemCache::generateCacheKey('ttl test');
    $payload = 'test ttl '.microtime(true);

    FileSystemCache::invalidate($cacheKey);
    $this->assertFalse(FileSystemCache::retrieve($cacheKey));

    // Store with a one second lifetime: readable right away...
    FileSystemCache::store($cacheKey, $payload, 1);
    $this->assertEquals($payload, FileSystemCache::retrieve($cacheKey));

    // ...and gone once the lifetime has passed.
    sleep(2);
    $this->assertFalse(FileSystemCache::retrieve($cacheKey));
}
71 |
function testRetrieveNewerThan() {
    $cacheKey = FileSystemCache::generateCacheKey('newer than test');
    $payload = 'test newer than data';
    FileSystemCache::store($cacheKey, $payload);

    // A threshold five seconds ahead of now must reject the fresh entry...
    $future = time() + 5;
    $this->assertFalse(FileSystemCache::retrieve($cacheKey, $future));

    // ...while a threshold five seconds back must accept it.
    $past = time() - 5;
    $this->assertEquals($payload, FileSystemCache::retrieve($cacheKey, $past));

    FileSystemCache::invalidate($cacheKey);
    $this->assertFalse(FileSystemCache::retrieve($cacheKey));
}
83 |
function testGetAndModifyReturnFalse() {
    $cacheKey = FileSystemCache::generateCacheKey('get and modify key');
    $payload = 'get and modify data';

    FileSystemCache::store($cacheKey, $payload, 1);
    $this->assertEquals($payload, FileSystemCache::retrieve($cacheKey));

    // A callback that returns false is expected to leave nothing retrievable.
    $discard = function($value) {
        return false;
    };
    FileSystemCache::getAndModify($cacheKey, $discard);

    $this->assertFalse(FileSystemCache::retrieve($cacheKey));
}
97 |
function testGetAndModify() {
    $cacheKey = FileSystemCache::generateCacheKey('get and modify key');
    $payload = 'get and modify data';

    FileSystemCache::store($cacheKey, $payload, 1);
    $this->assertEquals($payload, FileSystemCache::retrieve($cacheKey));

    // The callback's return value becomes the new cached value.
    $appendSuffix = function($value) {
        $value .= 'test';
        return $value;
    };
    FileSystemCache::getAndModify($cacheKey, $appendSuffix);

    $this->assertEquals($payload.'test', FileSystemCache::retrieve($cacheKey));

    // The original one second lifetime still applies to the modified value.
    sleep(2);
    $this->assertFalse(FileSystemCache::retrieve($cacheKey));
}
116 |
function testGetAndModifyResetTtl() {
    $cacheKey = FileSystemCache::generateCacheKey('get and modify reset ttl key');
    $payload = 'get and modify reset ttl data';

    FileSystemCache::store($cacheKey, $payload, 3);
    sleep(2);
    // Two of the three seconds are used up; the entry is still readable.
    $this->assertEquals($payload, FileSystemCache::retrieve($cacheKey));

    // Passing true as the third argument asks for the lifetime to restart.
    $appendSuffix = function($value) {
        $value .= 'test';
        return $value;
    };
    FileSystemCache::getAndModify($cacheKey, $appendSuffix, true);

    sleep(2);

    // Past the original deadline, yet inside the restarted lifetime.
    $this->assertEquals($payload.'test', FileSystemCache::retrieve($cacheKey));

    sleep(2);

    // Now the restarted lifetime has run out as well.
    $this->assertFalse(FileSystemCache::retrieve($cacheKey));
}
140 |
function testGetAndModifyUnchanged() {
    $cacheKey = FileSystemCache::generateCacheKey('get and modify unchanged');
    $payload = 'get and modify unchanged';

    FileSystemCache::store($cacheKey, $payload);

    // An identity callback: the value handed in is handed straight back.
    $identity = function($value) {
        return $value;
    };
    $returned = FileSystemCache::getAndModify($cacheKey, $identity);

    $this->assertEquals($payload, $returned);
    $this->assertEquals($payload, FileSystemCache::retrieve($cacheKey));
}
155 |
156 | /**
157 | * @expectedException Exception
158 | */
function testHackedGroupInvalidation() {
    // A group name that tries to climb out of the cache directory.
    $traversalGroup = 'this/../../is/a/hack';
    FileSystemCache::invalidateGroup($traversalGroup);
}
162 |
function testGroupInvalidation() {
    $keys = array(
        'root'   => FileSystemCache::generateCacheKey('mykey'),
        'group1' => FileSystemCache::generateCacheKey('mykey1','test'),
        'group2' => FileSystemCache::generateCacheKey('mykey2','test'),
        'sub'    => FileSystemCache::generateCacheKey('mykey','test/test'),
        'other'  => FileSystemCache::generateCacheKey('mykey','test2'),
    );

    $payload = 'group invalidation';

    // Populate every key, then confirm each one is readable.
    foreach($keys as $cacheKey) {
        FileSystemCache::store($cacheKey, $payload);
    }
    foreach($keys as $cacheKey) {
        $this->assertEquals($payload, FileSystemCache::retrieve($cacheKey));
    }

    // Non-recursive invalidation of "test".
    FileSystemCache::invalidateGroup('test', false);

    // Only the two keys directly inside "test" may be gone.
    $this->assertEquals($payload, FileSystemCache::retrieve($keys['root']));
    $this->assertFalse(FileSystemCache::retrieve($keys['group1']));
    $this->assertFalse(FileSystemCache::retrieve($keys['group2']));
    $this->assertEquals($payload, FileSystemCache::retrieve($keys['sub']));
    $this->assertEquals($payload, FileSystemCache::retrieve($keys['other']));

    // Clean up the survivors.
    $survivors = array('root', 'sub', 'other');
    foreach($survivors as $name) {
        FileSystemCache::invalidate($keys[$name]);
    }
    foreach($survivors as $name) {
        $this->assertFalse(FileSystemCache::retrieve($keys[$name]));
    }
}
200 |
201 |
function testGroupInvalidationRecursive() {
    $keys = array(
        'root'   => FileSystemCache::generateCacheKey('mykey'),
        'group1' => FileSystemCache::generateCacheKey('mykey1','test'),
        'group2' => FileSystemCache::generateCacheKey('mykey2','test'),
        'sub'    => FileSystemCache::generateCacheKey('mykey','test/test'),
        'other'  => FileSystemCache::generateCacheKey('mykey','test2'),
    );

    $payload = 'group invalidation recursive';

    // Populate every key, then confirm each one is readable.
    foreach($keys as $cacheKey) {
        FileSystemCache::store($cacheKey, $payload);
    }
    foreach($keys as $cacheKey) {
        $this->assertEquals($payload, FileSystemCache::retrieve($cacheKey));
    }

    // Invalidation with the default second argument.
    FileSystemCache::invalidateGroup('test');

    // Everything under "test", including the nested "test/test", must be gone.
    $this->assertEquals($payload, FileSystemCache::retrieve($keys['root']));
    $this->assertFalse(FileSystemCache::retrieve($keys['group1']));
    $this->assertFalse(FileSystemCache::retrieve($keys['group2']));
    $this->assertFalse(FileSystemCache::retrieve($keys['sub']));
    $this->assertEquals($payload, FileSystemCache::retrieve($keys['other']));

    // Clean up the survivors.
    $survivors = array('root', 'other');
    foreach($survivors as $name) {
        FileSystemCache::invalidate($keys[$name]);
    }
    foreach($survivors as $name) {
        $this->assertFalse(FileSystemCache::retrieve($keys[$name]));
    }
}
237 |
function keyProvider() {
    // One key without a group, one in a group, one in a nested group.
    $ungrouped = FileSystemCache::generateCacheKey('mykey');
    $grouped = FileSystemCache::generateCacheKey('mykey','test');
    $nested = FileSystemCache::generateCacheKey('mykey','test/test');

    return array(
        array($ungrouped),
        array($grouped),
        array($nested),
    );
}
245 |
function keyDataProvider() {
    $dataRows = $this->dataProvider();
    $groupRows = $this->groupProvider();

    // Cross product: every data sample paired with every group.
    $rows = array();
    foreach($dataRows as $dataRow) {
        foreach($groupRows as $groupRow) {
            $rows[] = array($dataRow[0], $groupRow[0]);
        }
    }

    return $rows;
}
function dataProvider() {
    // One sample per kind of value: integer, string, mixed-key array, object.
    $rows = array();
    $rows[] = array(99);
    $rows[] = array('string');
    $rows[] = array(array('an','array','with'=>'data'));
    $rows[] = array(new DateTime());

    return $rows;
}
function groupProvider() {
    // No group, a top-level group, and a nested group.
    $rows = array();
    foreach(array(null, 'test', 'test/test') as $group) {
        $rows[] = array($group);
    }

    return $rows;
}
279 | }
280 |
--------------------------------------------------------------------------------
/lib/ansi-color.php:
--------------------------------------------------------------------------------
1 | 0,
30 | "bold" => 1,
31 | "italic" => 3,
32 | "underline" => 4,
33 | "blink" => 5,
34 | "inverse" => 7,
35 | "hidden" => 8,
36 | "black" => 30,
37 | "red" => 31,
38 | "green" => 32,
39 | "yellow" => 33,
40 | "blue" => 34,
41 | "magenta" => 35,
42 | "cyan" => 36,
43 | "white" => 37,
44 | "black_bg" => 40,
45 | "red_bg" => 41,
46 | "green_bg" => 42,
47 | "yellow_bg" => 43,
48 | "blue_bg" => 44,
49 | "magenta_bg" => 45,
50 | "cyan_bg" => 46,
51 | "white_bg" => 47
52 | );
53 |
public static function set($str, $color)
{
    // Every "+"-separated attribute name contributes its own escape sequence.
    $prefix = "";
    foreach (explode("+", $color) as $name) {
        $prefix .= "\033[" . self::$ANSI_CODES[$name] . "m";
    }

    // The text is always followed by the "off" sequence.
    $reset = "\033[" . self::$ANSI_CODES["off"] . "m";

    return $prefix . $str . $reset;
}
64 |
public static function log($message, $color)
{
    // Colorize first, then hand the result to PHP's error log.
    $colored = self::set($message, $color);
    error_log($colored);
}
69 |
public static function replace($full_text, $search_regexp, $color)
{
    // Wrap the caller's expression in a capture group so the callback
    // receives the matched text as $matches[1].
    $pattern = "/($search_regexp)/";
    $colorize = function ($matches) use ($color) {
        return Color::set($matches[1], $color);
    };

    $result = preg_replace_callback($pattern, $colorize, $full_text);

    // preg_replace_callback() yields null on error; keep the input then.
    if (is_null($result)) {
        return $full_text;
    }
    return $result;
}
81 | }
82 |
83 |
--------------------------------------------------------------------------------
/lib/fivefilters-php-readability/JSLikeHTMLElement.php:
--------------------------------------------------------------------------------
1 | registerNodeClass('DOMElement', 'JSLikeHTMLElement');
16 | * $doc->loadHTML('
');
17 | * $elem = $doc->getElementsByTagName('div')->item(0);
18 | *
19 | * // print innerHTML
20 | * echo $elem->innerHTML; // prints 'Para 1
Para 2
'
21 | * echo "\n\n";
22 | *
23 | * // set innerHTML
24 | * $elem->innerHTML = 'FiveFilters.org';
25 | * echo $elem->innerHTML; // prints 'FiveFilters.org'
26 | * echo "\n\n";
27 | *
28 | * // print document (with our changes)
29 | * echo $doc->saveXML();
30 | * @endcode
31 | *
32 | * @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net
33 | * @see http://fivefilters.org (the project this was written for)
34 | */
class JSLikeHTMLElement extends DOMElement
{
    /**
     * Used for setting innerHTML like it's done in JavaScript:
     * @code
     * $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>';
     * @endcode
     *
     * The element is emptied first. The new markup is then parsed as XML;
     * if that fails it is parsed leniently as HTML and the resulting nodes
     * are imported. If both attempts fail the element is left empty.
     * Setting any other property raises an E_USER_NOTICE.
     *
     * @param string $name  Property name; only 'innerHTML' is supported.
     * @param string $value New inner markup.
     */
    public function __set($name, $value) {
        if ($name == 'innerHTML') {
            // first, empty the element
            for ($x=$this->childNodes->length-1; $x>=0; $x--) {
                $this->removeChild($this->childNodes->item($x));
            }
            // $value holds our new inner HTML
            if ($value != '') {
                $f = $this->ownerDocument->createDocumentFragment();
                // appendXML() expects well-formed markup (XHTML)
                $result = @$f->appendXML($value); // @ to suppress PHP warnings
                if ($result) {
                    if ($f->hasChildNodes()) $this->appendChild($f);
                } else {
                    // $value is probably ill-formed
                    $f = new DOMDocument();
                    $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8');
                    // Using <htmlfragment> will generate a warning, but so will bad HTML
                    // (and by this point, bad HTML is what we've got).
                    // We use it (and suppress the warning) because an HTML fragment will
                    // be wrapped around <html><body> tags which we don't really want to keep.
                    // Note: despite the warning, if loadHTML succeeds it will return true.
                    $result = @$f->loadHTML('<htmlfragment>'.$value.'</htmlfragment>');
                    if ($result) {
                        $import = $f->getElementsByTagName('htmlfragment')->item(0);
                        // guard against the wrapper not surviving the parse
                        if ($import !== null) {
                            foreach ($import->childNodes as $child) {
                                $importedNode = $this->ownerDocument->importNode($child, true);
                                $this->appendChild($importedNode);
                            }
                        }
                    } else {
                        // oh well, we tried, we really did. :(
                        // this element is now empty
                    }
                }
            }
        } else {
            $trace = debug_backtrace();
            trigger_error('Undefined property via __set(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
        }
    }

    /**
     * Used for getting innerHTML like it's done in JavaScript:
     * @code
     * $string = $div->innerHTML;
     * @endcode
     *
     * @param string $name Property name; only 'innerHTML' is supported.
     * @return string|null Serialized child nodes, or null (after an
     *                     E_USER_NOTICE) for any other property.
     */
    public function __get($name)
    {
        if ($name == 'innerHTML') {
            $inner = '';
            foreach ($this->childNodes as $child) {
                $inner .= $this->ownerDocument->saveXML($child);
            }
            return $inner;
        }

        $trace = debug_backtrace();
        trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
        return null;
    }

    /**
     * @return string The tag name wrapped in square brackets, e.g. "[div]".
     */
    public function __toString()
    {
        return '['.$this->tagName.']';
    }
}
--------------------------------------------------------------------------------
/lib/fivefilters-php-readability/README.md:
--------------------------------------------------------------------------------
1 | PHP Readability
2 | ================
3 |
4 | This is a PHP port of Arc90's original Javascript version of Readability. (Arc90 has since relaunched the project.)
5 |
6 | For instructions on how to use this, please see
7 |
8 | For a more flexible and robust solution to article extraction, take a look at [Full-Text RSS](http://fivefilters.org/content-only/) - it makes use of PHP Readability, but offers much more.
9 |
10 | Feel free to fork this and change/improve it. I would love to see your results. Please do share them and I'll consider pulling them in.
11 |
12 | PHP Readability is licensed under the Apache License, Version 2.0 (the same license as the original JS version). The original Javascript version can be found here: (readability.js)
13 |
14 | ### Simple example
15 |
16 | cleanRepair();
36 | $html = $tidy->value;
37 | }
38 |
39 | // give it to Readability
40 | $readability = new Readability($html, $url);
41 | // print debug output?
42 | // useful to compare against Arc90's original JS version -
43 | // simply click the bookmarklet with FireBug's console window open
44 | $readability->debug = false;
45 | // convert links to footnotes?
46 | $readability->convertLinksToFootnotes = true;
47 | // process it
48 | $result = $readability->init();
49 | // does it look like we found what we wanted?
50 | if ($result) {
51 | echo "== Title =====================================\n";
52 | echo $readability->getTitle()->textContent, "\n\n";
53 | echo "== Body ======================================\n";
54 | $content = $readability->getContent()->innerHTML;
55 | // if we've got Tidy, let's clean it up for output
56 | if (function_exists('tidy_parse_string')) {
57 | $tidy = tidy_parse_string($content, array('indent'=>true, 'show-body-only' => true), 'UTF8');
58 | $tidy->cleanRepair();
59 | $content = $tidy->value;
60 | }
61 | echo $content;
62 | } else {
63 | echo 'Looks like we couldn\'t find the content. :(';
64 | }
--------------------------------------------------------------------------------
/lib/fivefilters-php-readability/Readability.php:
--------------------------------------------------------------------------------
1 | init();
64 | echo $r->articleContent->innerHTML;
65 | */
66 |
67 | class Readability
68 | {
69 | public $version = '1.7.1-without-multi-page';
70 | public $convertLinksToFootnotes = false;
71 | public $revertForcedParagraphElements = true;
72 | public $articleTitle;
73 | public $articleContent;
74 | public $dom;
75 | public $url = null; // optional - URL where HTML was retrieved
76 | public $debug = false;
77 | public $lightClean = true; // preserves more content (experimental) added 2012-09-19
78 | protected $body = null; //
79 | protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
80 | protected $flags = 7; // 1 | 2 | 4; // Start with all flags set.
81 | protected $success = false; // indicates whether we were able to extract or not
82 |
/**
 * All of the regular expressions in use within readability.
 * Defined up here so we don't instantiate them repeatedly in loops.
 **/
public $regexps = array(
    'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i',
    'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
    'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i',
    'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
    'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
    // two or more consecutive <br> tags, optionally separated by whitespace
    'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
    'replaceFonts' => '/<(\/?)font[^>]*>/i',
    // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
    'normalize' => '/\s{2,}/',
    // a <br> followed by any run of whitespace / non-breaking-space entities
    'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
    'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
    'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
);
101 |
102 | /* constants */
103 | const FLAG_STRIP_UNLIKELYS = 1;
104 | const FLAG_WEIGHT_CLASSES = 2;
105 | const FLAG_CLEAN_CONDITIONALLY = 4;
106 |
107 | /**
108 | * Create instance of Readability
109 | * @param string UTF-8 encoded string
110 | * @param string (optional) URL associated with HTML (used for footnotes)
111 | * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
112 | */
function __construct($html, $url=null, $parser='libxml')
{
    $this->url = $url;
    /* Turn all double br's into p's */
    $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
    /* Turn <font> tags (opening and closing) into <span> tags */
    $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    // loadHTML() does not accept empty input, so parse a minimal document instead
    if (trim($html) == '') $html = '<html></html>';
    if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) {
        // all good
    } else {
        // default parser, and fallback when html5lib produced nothing
        $this->dom = new DOMDocument();
        $this->dom->preserveWhiteSpace = false;
        @$this->dom->loadHTML($html);
    }
    // elements created from now on expose the innerHTML property
    $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
}
130 |
/**
 * Accessor for the title element stored by init().
 * @return DOMElement|null The article title element; null when init() has not stored one yet.
 */
public function getTitle()
{
    return $this->articleTitle;
}
138 |
/**
 * Accessor for the content element stored by init().
 * @return DOMElement|null The article content element; null when init() has not stored one yet.
 */
public function getContent()
{
    return $this->articleContent;
}
146 |
/**
 * Runs readability.
 *
 * Workflow:
 *  1. Prep the document by removing script tags, css, etc.
 *  2. Build readability's DOM tree.
 *  3. Grab the article content from the current dom tree.
 *  4. Replace the current DOM tree with the new one.
 *  5. Read peacefully.
 *
 * When no article content can be extracted, a placeholder paragraph is
 * used as the content and false is returned. The title and content
 * elements are stored on the instance in either case.
 *
 * @return boolean true if we found content, false otherwise (including
 *                 when the document has no root element at all)
 **/
public function init()
{
    if (!isset($this->dom->documentElement)) return false;
    $this->removeScripts($this->dom);

    // Assume successful outcome
    $this->success = true;

    $bodyElems = $this->dom->getElementsByTagName('body');
    if ($bodyElems->length > 0) {
        // remember the untouched body markup and the body element itself
        if ($this->bodyCache == null) {
            $this->bodyCache = $bodyElems->item(0)->innerHTML;
        }
        if ($this->body == null) {
            $this->body = $bodyElems->item(0);
        }
    }

    $this->prepDocument();

    /* Build readability's DOM tree */
    $overlay        = $this->dom->createElement('div');
    $innerDiv       = $this->dom->createElement('div');
    $articleTitle   = $this->getArticleTitle();
    $articleContent = $this->grabArticle();

    if (!$articleContent) {
        $this->success = false;
        $articleContent = $this->dom->createElement('div');
        $articleContent->setAttribute('id', 'readability-content');
        $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
    }

    $overlay->setAttribute('id', 'readOverlay');
    $innerDiv->setAttribute('id', 'readInner');

    /* Glue the structure of our document together. */
    $innerDiv->appendChild($articleTitle);
    $innerDiv->appendChild($articleContent);
    $overlay->appendChild($innerDiv);

    /* Clear the old HTML, insert the new content. */
    $this->body->innerHTML = '';
    $this->body->appendChild($overlay);
    $this->body->removeAttribute('style');

    $this->postProcessContent($articleContent);

    // Set title and content instance variables
    $this->articleTitle = $articleTitle;
    $this->articleContent = $articleContent;

    return $this->success;
}
219 |
/**
 * Prints a debug line ("* message") when the debug flag is set.
 *
 * @param string $msg
 * @return void
 */
protected function dbg($msg) {
    if (!$this->debug) {
        return;
    }
    echo '* ',$msg, "\n";
}
226 |
/**
 * Run any post-process modifications to article content as necessary.
 *
 * Currently this only converts links to footnotes, and only when
 * convertLinksToFootnotes is enabled and the page URL does not contain
 * "wikipedia.org".
 *
 * @param DOMElement $articleContent
 * @return void
 */
public function postProcessContent($articleContent) {
    if (!$this->convertLinksToFootnotes) {
        return;
    }
    // The URL is optional; cast so an unset (null) URL is matched as an
    // empty string rather than being passed to preg_match() as null.
    if (preg_match('/wikipedia\.org/', (string) $this->url)) {
        return;
    }
    $this->addFootnotes($articleContent);
}
238 |
/**
 * Builds the article title as an H1 element.
 *
 * Starts from the document's <title> text and tries to strip a site name
 * separated by " | ", " - " or ": ". Very long or very short titles fall
 * back to the page's only <h1>, if there is exactly one. A result of four
 * words or fewer is discarded in favour of the full <title> text.
 *
 * @return DOMElement
 */
protected function getArticleTitle() {
    $fullTitle = '';

    try {
        $fullTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
    } catch(Exception $e) {}

    $title = $fullTitle;

    // number of space-separated pieces in a string
    $wordCount = function ($text) {
        return count(explode(' ', $text));
    };

    if (preg_match('/ [\|\-] /', $title)) {
        // keep what precedes the last separator...
        $title = preg_replace('/(.*)[\|\-] .*/i', '$1', $fullTitle);

        // ...unless that is too short; then keep what follows the first one
        if ($wordCount($title) < 3) {
            $title = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $fullTitle);
        }
    } elseif (strpos($title, ': ') !== false) {
        // keep what follows the last colon...
        $title = preg_replace('/.*:(.*)/i', '$1', $fullTitle);

        // ...unless that is too short; then keep what follows the first one
        if ($wordCount($title) < 3) {
            $title = preg_replace('/[^:]*[:](.*)/i','$1', $fullTitle);
        }
    } else {
        $length = strlen($title);
        if ($length > 150 || $length < 15) {
            $headings = $this->dom->getElementsByTagName('h1');
            if ($headings->length == 1) {
                $title = $this->getInnerText($headings->item(0));
            }
        }
    }

    $title = trim($title);

    if ($wordCount($title) <= 4) {
        $title = $fullTitle;
    }

    $heading = $this->dom->createElement('h1');
    $heading->innerHTML = $title;

    return $heading;
}
288 |
/**
 * Prepare the HTML document for readability to scrape it:
 * make sure there is a body element, tag it with a known id and
 * remove every <style> element from the document.
 *
 * @return void
 **/
protected function prepDocument() {
    // A body element may be missing (if the HTML is totally hosed for
    // example); create one and attach it to the document root.
    if ($this->body == null) {
        $replacementBody = $this->dom->createElement('body');
        $this->body = $replacementBody;
        $this->dom->documentElement->appendChild($replacementBody);
    }
    $this->body->setAttribute('id', 'readabilityBody');

    // Walk the list back to front while removing its members.
    $styleTags = $this->dom->getElementsByTagName('style');
    $remaining = $styleTags->length;
    while ($remaining-- > 0) {
        $styleTag = $styleTags->item($remaining);
        $styleTag->parentNode->removeChild($styleTag);
    }

    // Note: the double-<br> and <font> replacements happen in the
    // constructor, where the raw HTML string is still available.
}
320 |
/**
 * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
 * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
 *
 * Every link inside $articleContent, except links carrying the
 * "readability-DoNotFootnote" class or whose text matches the
 * skipFootnoteLink pattern, gets a numbered "[n]" reference inserted
 * directly after it and a matching entry in a "References" list. The
 * list is appended to $articleContent only if at least one link was
 * footnoted.
 *
 * @param DOMElement $articleContent
 * @return void
 **/
public function addFootnotes($articleContent) {
    $footnotesWrapper = $this->dom->createElement('div');
    $footnotesWrapper->setAttribute('id', 'readability-footnotes');
    $footnotesWrapper->innerHTML = '<h3>References</h3>';

    $articleFootnotes = $this->dom->createElement('ol');
    $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
    $footnotesWrapper->appendChild($articleFootnotes);

    // Live list: the reference links inserted below are <a> elements too
    // and get visited by this loop, where their class makes them skipped.
    $articleLinks = $articleContent->getElementsByTagName('a');

    $linkCount = 0;
    for ($i = 0; $i < $articleLinks->length; $i++)
    {
        $articleLink = $articleLinks->item($i);
        $linkText    = $this->getInnerText($articleLink);

        if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
            continue;
        }

        $footnoteLink = $articleLink->cloneNode(true);
        $refLink      = $this->dom->createElement('a');
        $footnote     = $this->dom->createElement('li');
        // fall back to the page's own host for relative links
        $linkDomain   = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
        if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);

        $linkCount++;

        /** Add a superscript reference after the article link */
        $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
        $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
        $refLink->setAttribute('class', 'readability-DoNotFootnote');
        $refLink->setAttribute('style', 'color: inherit;');

        // insertBefore() appends when the reference node is null, so this
        // also covers the case where the link is its parent's last child.
        $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);

        $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
        $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);

        // "^" jumps back to the anchor name just set on the article link
        $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';

        $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);
        $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);

        $footnote->appendChild($footnoteLink);
        if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>';

        $articleFootnotes->appendChild($footnote);
    }

    if ($linkCount > 0) {
        $articleContent->appendChild($footnotesWrapper);
    }
}
387 |
/**
 * Turns every P element with class 'readability-styled' found below
 * $articleContent back into a plain text node holding the paragraph's
 * text content.
 *
 * @param DOMElement $articleContent
 * @return void
 */
function revertReadabilityStyledElements($articleContent) {
    $document = $articleContent->ownerDocument;
    $xpath = new DOMXPath($document);
    $styled = $xpath->query('.//p[@class="readability-styled"]', $articleContent);

    // Work from the end of the result list towards its start.
    $index = $styled->length;
    while (--$index >= 0) {
        $paragraph = $styled->item($index);
        $plainText = $document->createTextNode($paragraph->textContent);
        $paragraph->parentNode->replaceChild($plainText, $paragraph);
    }
}
407 |
/**
 * Prepare the article node for display: strip inline styles, collapse
 * breaks, drop forms/objects/iframes/h1 (and a lone h2), clean headers,
 * conditionally clean tables/lists/divs and remove empty paragraphs.
 *
 * @param DOMElement $articleContent the extracted article wrapper; modified in place
 * @return void
 */
function prepArticle($articleContent) {
    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);
    if ($this->revertForcedParagraphElements) {
        $this->revertReadabilityStyledElements($articleContent);
    }

    /* Clean out junk from the article content */
    $this->cleanConditionally($articleContent, 'form');
    $this->clean($articleContent, 'object');
    $this->clean($articleContent, 'h1');

    /**
     * If there is only one h2, they are probably using it
     * as a header and not a subheader, so remove it since we already have a header.
     ***/
    if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {
        $this->clean($articleContent, 'h2');
    }
    $this->clean($articleContent, 'iframe');

    $this->cleanHeaders($articleContent);

    /* Do these last as the previous stuff may have removed junk that will affect these */
    $this->cleanConditionally($articleContent, 'table');
    $this->cleanConditionally($articleContent, 'ul');
    $this->cleanConditionally($articleContent, 'div');

    /* Remove paragraphs that hold no text and no img/embed/object/iframe. */
    /* The node list is live, so walk it backwards while removing. */
    $articleParagraphs = $articleContent->getElementsByTagName('p');
    for ($i = $articleParagraphs->length - 1; $i >= 0; $i--) {
        $paragraph = $articleParagraphs->item($i);

        $hasMedia = $paragraph->getElementsByTagName('img')->length !== 0
            || $paragraph->getElementsByTagName('embed')->length !== 0
            || $paragraph->getElementsByTagName('object')->length !== 0
            || $paragraph->getElementsByTagName('iframe')->length !== 0;

        if (!$hasMedia && $this->getInnerText($paragraph, false) == '') {
            $paragraph->parentNode->removeChild($paragraph);
        }
    }

    /* Drop a <br> that sits directly in front of an opening <p>. */
    /* NOTE(review): pattern and replacement were reconstructed from a garbled copy of this file - confirm against upstream. */
    try {
        $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
    }
    catch (Exception $e) {
        $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
    }
}
466 |
/**
 * Attach a 'readability' attribute (the content score) to a node.
 *
 * The score starts at 0, is adjusted by a fixed amount that depends on the
 * tag name, and finally receives the class/id weight from getClassWeight().
 *
 * @param DOMElement $node element to initialise
 * @return void
 **/
protected function initializeNode($node) {
    $scoreAttr = $this->dom->createAttribute('readability');
    $scoreAttr->value = 0; // this is our contentScore
    $node->setAttributeNode($scoreAttr);

    // Per-tag adjustment; tags that are not listed leave the score untouched.
    $tagAdjustments = array(
        'DIV'        => 5,

        'PRE'        => 3,
        'TD'         => 3,
        'BLOCKQUOTE' => 3,

        'ADDRESS'    => -3,
        'OL'         => -3,
        'UL'         => -3,
        'DL'         => -3,
        'DD'         => -3,
        'DT'         => -3,
        'LI'         => -3,
        'FORM'       => -3,

        'H1'         => -5,
        'H2'         => -5,
        'H3'         => -5,
        'H4'         => -5,
        'H5'         => -5,
        'H6'         => -5,
        'TH'         => -5,
    );

    // Tag name is upper-cased so the lookup does not depend on its case.
    $tag = strtoupper($node->tagName);
    if (isset($tagAdjustments[$tag])) {
        $scoreAttr->value += $tagAdjustments[$tag];
    }

    $scoreAttr->value += $this->getClassWeight($node);
}
513 |
/***
 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
 *
 * If the extracted text is shorter than 250 characters, the body is restored from $this->bodyCache and the
 * method calls itself again with one more flag removed (FLAG_STRIP_UNLIKELYS, then FLAG_WEIGHT_CLASSES,
 * then FLAG_CLEAN_CONDITIONALLY).
 *
 * @param DOMDocument|DOMElement|null $page node to analyse; $this->dom is used when empty
 * @return DOMElement|false the div with id "readability-content", or false once no flag is left to remove
 **/
protected function grabArticle($page=null) {
    $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
    if (!$page) $page = $this->dom;
    $allElements = $page->getElementsByTagName('*');

    /**
     * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
     * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
     *
     * $allElements is a live list: whenever the current node is removed or replaced, $nodeIndex is decremented
     * so the element that now occupies that position is visited on the next iteration.
     **/
    $nodesToScore = array();
    for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
        $tagName = strtoupper($node->tagName);

        /* Remove unlikely candidates */
        if ($stripUnlikelyCandidates) {
            $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
            if (
                preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
                !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
                $tagName != 'BODY'
            )
            {
                $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
                $node->parentNode->removeChild($node);
                $nodeIndex--;
                continue;
            }
        }

        if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
            $nodesToScore[] = $node;
        }

        /* Turn all divs that don't have children block level elements into p's */
        if ($tagName == 'DIV') {
            if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
                $newNode = $this->dom->createElement('p');
                try {
                    $newNode->innerHTML = $node->innerHTML;
                    $node->parentNode->replaceChild($newNode, $node);
                    /* Step back so the new <p> is visited next and queued for scoring by the check above. */
                    /* The replaced <div> is detached (no parent), so it is not queued. */
                    $nodeIndex--;
                }
                catch(Exception $e) {
                    $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
                }
            }
            else
            {
                /* EXPERIMENTAL */
                /* Wrap direct text children in inline <p class="readability-styled"> elements. */
                /* revertReadabilityStyledElements() can turn them back into text nodes. */
                for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
                    $childNode = $node->childNodes->item($i);
                    if ($childNode->nodeType == 3) { // XML_TEXT_NODE
                        $p = $this->dom->createElement('p');
                        $p->innerHTML = $childNode->nodeValue;
                        $p->setAttribute('style', 'display: inline;');
                        $p->setAttribute('class', 'readability-styled');
                        $childNode->parentNode->replaceChild($p, $childNode);
                    }
                }
            }
        }
    }

    /**
     * Loop through all paragraphs, and assign a score to them based on how content-y they look.
     * Then add their score to their parent node.
     *
     * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
     **/
    $candidates = array();
    for ($pt = 0, $ptl = count($nodesToScore); $pt < $ptl; $pt++) {
        $parentNode = $nodesToScore[$pt]->parentNode;
        $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
        $innerText = $this->getInnerText($nodesToScore[$pt]);

        /* Skip nodes that were detached in the meantime or whose parent is not an element. */
        if (!$parentNode || !isset($parentNode->tagName)) {
            continue;
        }

        /* If this paragraph is less than 25 characters, don't even count it. */
        if (strlen($innerText) < 25) {
            continue;
        }

        /* Initialize readability data for the parent. */
        if (!$parentNode->hasAttribute('readability'))
        {
            $this->initializeNode($parentNode);
            $candidates[] = $parentNode;
        }

        /* Initialize readability data for the grandparent. */
        if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
        {
            $this->initializeNode($grandParentNode);
            $candidates[] = $grandParentNode;
        }

        $contentScore = 0;

        /* Add a point for the paragraph itself as a base. */
        $contentScore++;

        /* Add one point per comma-separated segment of the text (number of commas + 1). */
        $contentScore += count(explode(',', $innerText));

        /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
        $contentScore += min(floor(strlen($innerText) / 100), 3);

        /* Add the score to the parent. The grandparent gets half. */
        $parentNode->getAttributeNode('readability')->value += $contentScore;

        if ($grandParentNode) {
            $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
        }
    }

    /**
     * After we've calculated scores, loop through all of the possible candidate nodes we found
     * and find the one with the highest score.
     **/
    $topCandidate = null;
    for ($c = 0, $cl = count($candidates); $c < $cl; $c++)
    {
        /**
         * Scale the final candidates score based on link density. Good content should have a
         * relatively small link density (5% or less) and be mostly unaffected by this operation.
         **/
        $readability = $candidates[$c]->getAttributeNode('readability');
        $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));

        $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);

        /* Scores are fractional after the scaling above, so compare them as floats. */
        /* Truncating the current best to int would let a lower score displace it. */
        if (!$topCandidate || (float)$readability->value > (float)$topCandidate->getAttribute('readability')) {
            $topCandidate = $candidates[$c];
        }
    }

    /**
     * If we still have no top candidate, just use the body as a last resort.
     * We also have to copy the body node so it is something we can modify.
     **/
    if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
    {
        $topCandidate = $this->dom->createElement('div');
        if ($page instanceof DOMDocument) {
            if (!isset($page->documentElement)) {
                // we don't have a body either? what a mess! :)
                // the new div stays detached in this case (no parent node)
            } else {
                $topCandidate->innerHTML = $page->documentElement->innerHTML;
                $page->documentElement->innerHTML = '';
                $page->documentElement->appendChild($topCandidate);
            }
        } else {
            $topCandidate->innerHTML = $page->innerHTML;
            $page->innerHTML = '';
            $page->appendChild($topCandidate);
        }
        $this->initializeNode($topCandidate);
    }

    /**
     * Now that we have the top candidate, look through its siblings for content that might also be related.
     * Things like preambles, content split by ads that we removed, etc.
     **/
    $articleContent = $this->dom->createElement('div');
    $articleContent->setAttribute('id', 'readability-content');
    $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);

    /* A detached top candidate has no parent; treat that as "no siblings" instead of dereferencing null. */
    $topCandidateParent = $topCandidate->parentNode;
    $siblingNodes = $topCandidateParent ? $topCandidateParent->childNodes : null;
    $sl = $siblingNodes ? $siblingNodes->length : 0;

    for ($s = 0; $s < $sl; $s++)
    {
        $siblingNode = $siblingNodes->item($s);
        $append = false;

        $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));

        if ($siblingNode === $topCandidate)
        {
            $append = true;
        }

        $contentBonus = 0;
        /* Give a bonus if sibling nodes and top candidates have the exact same classname */
        if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
            $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
        }

        if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
        {
            $append = true;
        }

        if (strtoupper($siblingNode->nodeName) == 'P') {
            $linkDensity = $this->getLinkDensity($siblingNode);
            $nodeContent = $this->getInnerText($siblingNode);
            $nodeLength = strlen($nodeContent);

            if ($nodeLength > 80 && $linkDensity < 0.25)
            {
                $append = true;
            }
            else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
            {
                $append = true;
            }
        }

        if ($append)
        {
            $this->dbg('Appending node: ' . $siblingNode->nodeName);

            $nodeToAppend = null;
            $sibNodeName = strtoupper($siblingNode->nodeName);
            if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
                /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */

                $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
                $nodeToAppend = $this->dom->createElement('div');
                try {
                    $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
                    $nodeToAppend->innerHTML = $siblingNode->innerHTML;
                }
                catch(Exception $e)
                {
                    $this->dbg('Could not alter siblingNode to div, reverting back to original.');
                    $nodeToAppend = $siblingNode;
                    $s--;
                    $sl--;
                }
            } else {
                /* The sibling itself is moved out of the live list, so step the index and length back. */
                $nodeToAppend = $siblingNode;
                $s--;
                $sl--;
            }

            /* To ensure a node does not interfere with readability styles, remove its classnames */
            $nodeToAppend->removeAttribute('class');

            /* Appending to another node removes the node from its current parent. */
            $articleContent->appendChild($nodeToAppend);
        }
    }

    /**
     * So we have all of the content that we need. Now we clean it up for presentation.
     **/
    $this->prepArticle($articleContent);

    /**
     * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
     * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
     * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
     * finding the -right- content.
     **/
    if (strlen($this->getInnerText($articleContent, false)) < 250)
    {
        // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
        // in the meantime, we check and create an empty element if it's not there.
        if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
        $this->body->innerHTML = $this->bodyCache;

        if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
            $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
            return $this->grabArticle($this->body);
        }
        else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
            $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
            return $this->grabArticle($this->body);
        }
        else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
            $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
            return $this->grabArticle($this->body);
        }
        else {
            return false;
        }
    }
    return $articleContent;
}
818 |
/**
 * Delete every <script> element found in the given document or element.
 *
 * The node list returned by getElementsByTagName() is live, so it is
 * consumed from the end towards the start.
 *
 * @param DOMDocument|DOMElement $doc node whose script descendants are removed
 * @return void
 */
public function removeScripts($doc) {
    $scriptTags = $doc->getElementsByTagName('script');

    $remaining = $scriptTags->length;
    while ($remaining-- > 0) {
        $script = $scriptTags->item($remaining);
        $script->parentNode->removeChild($script);
    }
}
832 |
/**
 * Return the trimmed text content of a node.
 *
 * An empty string is returned when the node has no textContent or when
 * that text is empty. With $normalizeSpaces enabled, the trimmed text is
 * additionally passed through the 'normalize' regexp, each match being
 * replaced by a single space.
 *
 * @param DOMNode $e node to read the text from
 * @param boolean $normalizeSpaces (default: true)
 * @return string
 **/
public function getInnerText($e, $normalizeSpaces=true) {
    $rawText = isset($e->textContent) ? $e->textContent : '';
    if ($rawText == '') {
        return '';
    }

    $trimmed = trim($rawText);
    if (!$normalizeSpaces) {
        return $trimmed;
    }
    return preg_replace($this->regexps['normalize'], ' ', $trimmed);
}
856 |
/**
 * Count how often the string $s occurs in the inner text of node $e
 * (the text as returned by getInnerText() with its default arguments).
 *
 * @param DOMElement $e
 * @param string $s what to count. Default is ","
 * @return number (integer)
 **/
public function getCharCount($e, $s=',') {
    $text = $this->getInnerText($e);
    $occurrences = substr_count($text, $s);
    return $occurrences;
}
867 |
/**
 * Remove the style attribute from every element below $e.
 *
 * Only descendants are touched; a style attribute on $e itself is left
 * in place. Non-object arguments are ignored.
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanStyles($e) {
    if (!is_object($e)) {
        return;
    }

    $descendants = $e->getElementsByTagName('*');
    for ($n = 0, $total = $descendants->length; $n < $total; $n++) {
        $descendants->item($n)->removeAttribute('style');
    }
}
881 |
/**
 * Compute the link density of a node: the summed length of the inner text
 * of all <a> descendants divided by the length of the node's own inner text.
 *
 * Returns 0 when the node has no text at all.
 *
 * @param DOMElement $e
 * @return number (float)
 */
public function getLinkDensity($e) {
    $anchors = $e->getElementsByTagName('a');
    $totalLength = strlen($this->getInnerText($e));

    $anchorLength = 0;
    $anchorCount = $anchors->length;
    for ($n = 0; $n < $anchorCount; $n++) {
        $anchorText = $this->getInnerText($anchors->item($n));
        $anchorLength += strlen($anchorText);
    }

    if ($totalLength > 0) {
        return $anchorLength / $totalLength;
    }
    return 0;
}
903 |
/**
 * Weigh an element by its class and id attributes.
 *
 * Each non-empty attribute is tested against the 'negative' and the
 * 'positive' regexp; a negative match subtracts 25 and a positive match
 * adds 25. Always 0 while FLAG_WEIGHT_CLASSES is inactive.
 *
 * @param DOMElement $e
 * @return number (Integer)
 */
public function getClassWeight($e) {
    if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
        return 0;
    }

    $weight = 0;

    /* The same rules apply to the class name and to the ID, in that order. */
    foreach (array('class', 'id') as $attributeName) {
        if (!$e->hasAttribute($attributeName)) {
            continue;
        }
        $attributeValue = $e->getAttribute($attributeName);
        if ($attributeValue == '') {
            continue;
        }

        if (preg_match($this->regexps['negative'], $attributeValue)) {
            $weight -= 25;
        }
        if (preg_match($this->regexps['positive'], $attributeValue)) {
            $weight += 25;
        }
    }

    return $weight;
}
941 |
/**
 * Remove extraneous break tags from a node: every match of the
 * 'killBreaks' regexp in the node's innerHTML is replaced by a single
 * '<br />' and the result is written back to the node.
 *
 * @param DOMElement $node
 * @return void
 */
public function killBreaks($node) {
    $html = $node->innerHTML;
    /* NOTE(review): the replacement literal was reconstructed from a garbled copy of this file - confirm against upstream. */
    $html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
    $node->innerHTML = $html;
}
953 |
/**
 * Remove every descendant element of type $tag from $e.
 *
 * For iframe, object and embed tags an element is kept when the 'video'
 * regexp matches either its attribute values (joined with '|') or its
 * innerHTML.
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function clean($e, $tag) {
    $matches = $e->getElementsByTagName($tag);
    $mayHoldVideo = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed');

    /* Live list: walk it backwards so removals do not shift pending items. */
    for ($pos = $matches->length - 1; $pos >= 0; $pos--) {
        $target = $matches->item($pos);

        if ($mayHoldVideo) {
            $attributes = $target->attributes;
            $joinedValues = '';
            for ($a = 0, $attributeCount = $attributes->length; $a < $attributeCount; $a++) {
                $joinedValues .= $attributes->item($a)->value . '|';
            }

            /* Keep the element if its attributes or its contents match the video regexp. */
            if (
                preg_match($this->regexps['video'], $joinedValues) ||
                preg_match($this->regexps['video'], $target->innerHTML)
            ) {
                continue;
            }
        }

        $target->parentNode->removeChild($target);
    }
}
989 |
990 | /**
991 | * Clean an element of all tags of type "tag" if they look fishy.
992 | * "Fishy" is an algorithm based on content length, classnames,
993 | * link density, number of images & embeds, etc.
994 | *
995 | * @param DOMElement $e
996 | * @param string $tag
997 | * @return void
998 | */
999 | public function cleanConditionally($e, $tag) {
1000 | if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
1001 | return;
1002 | }
1003 |
1004 | $tagsList = $e->getElementsByTagName($tag);
1005 | $curTagsLength = $tagsList->length;
1006 |
1007 | /**
1008 | * Gather counts for other typical elements embedded within.
1009 | * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
1010 | *
1011 | * TODO: Consider taking into account original contentScore here.
1012 | */
1013 | for ($i=$curTagsLength-1; $i >= 0; $i--) {
1014 | $weight = $this->getClassWeight($tagsList->item($i));
1015 | $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0;
1016 |
1017 | $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : ''));
1018 |
1019 | if ($weight + $contentScore < 0) {
1020 | $tagsList->item($i)->parentNode->removeChild($tagsList->item($i));
1021 | }
1022 | else if ( $this->getCharCount($tagsList->item($i), ',') < 10) {
1023 | /**
1024 | * If there are not very many commas, and the number of
1025 | * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
1026 | **/
1027 | $p = $tagsList->item($i)->getElementsByTagName('p')->length;
1028 | $img = $tagsList->item($i)->getElementsByTagName('img')->length;
1029 | $li = $tagsList->item($i)->getElementsByTagName('li')->length-100;
1030 | $input = $tagsList->item($i)->getElementsByTagName('input')->length;
1031 | $a = $tagsList->item($i)->getElementsByTagName('a')->length;
1032 |
1033 | $embedCount = 0;
1034 | $embeds = $tagsList->item($i)->getElementsByTagName('embed');
1035 | for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
1036 | if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
1037 | $embedCount++;
1038 | }
1039 | }
1040 | $embeds = $tagsList->item($i)->getElementsByTagName('iframe');
1041 | for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
1042 | if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
1043 | $embedCount++;
1044 | }
1045 | }
1046 |
1047 | $linkDensity = $this->getLinkDensity($tagsList->item($i));
1048 | $contentLength = strlen($this->getInnerText($tagsList->item($i)));
1049 | $toRemove = false;
1050 |
1051 | if ($this->lightClean) {
1052 | $this->dbg('Light clean...');
1053 | if ( ($img > $p) && ($img > 4) ) {
1054 | $this->dbg(' more than 4 images and more image elements than paragraph elements');
1055 | $toRemove = true;
1056 | } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
1057 | $this->dbg(' too many
elements, and parent is not