├── LICENSE ├── MOBIClass ├── CharacterEntities.php ├── ContentProvider.php ├── EXTHHelper.php ├── FileByte.php ├── FileDate.php ├── FileElement.php ├── FileInt.php ├── FileObject.php ├── FileRecord.php ├── FileShort.php ├── FileString.php ├── FileTri.php ├── Http.php ├── ImageHandler.php ├── LinkedStringBuilder.php ├── MOBI.php ├── MOBIFile.php ├── MultipleFileHandler.php ├── OnlineArticle.php ├── PalmRecord.php ├── Prc.php ├── PreprocessedArticle.php ├── RecognizeURL.php ├── Record.php ├── RecordFactory.php ├── Settings.php ├── constants.php ├── downloaders │ └── FanFictionNet.php ├── http_build_url.php └── readability │ ├── JSLikeHTMLElement.php │ └── Readability.php ├── README.md └── index.php /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2013 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
/**
 * Build the two parallel tables that the CP1252 conversion feeds to str_replace().
 *
 * Both tables are indexed 0..255. The replacement table ($to) holds the
 * single byte chr($index) in every slot. The search table ($from) starts as
 * the same identity mapping, after which selected slots are overwritten
 * with the literal character that the byte at that index stands for.
 *
 * @return array Two-element array: array($from, $to)
 */
private static function generateTables(){
    // Identity mapping: slot n holds the single byte chr(n).
    $to = array();
    for($code = 0; $code < 256; $code++){
        $to[$code] = chr($code);
    }

    // Search-side literals, keyed by the byte they are converted to.
    // Slots that are absent here keep their identity entry.
    $literals = array(
        0x80 => "€", 0x82 => "‚", 0x83 => "ƒ", 0x84 => "„",
        0x85 => "…", 0x86 => "†", 0x87 => "‡", 0x88 => "ˆ",
        0x89 => "‰", 0x8A => "Š", 0x8B => "‹", 0x8C => "Œ",
        0x8E => "Ž",

        0x91 => "‘", 0x92 => "’", 0x93 => "“", 0x94 => "”",
        0x95 => "•", 0x96 => "–", 0x97 => "—", 0x98 => "˜",
        0x99 => "™", 0x9A => "š", 0x9B => "›", 0x9C => "œ",
        0x9E => "ž", 0x9F => "Ÿ",

        0xA1 => "¡", 0xA2 => "¢", 0xA3 => "£", 0xA4 => "¤",
        0xA5 => "¥", 0xA6 => "¦", 0xA7 => "§", 0xA8 => "¨",
        0xA9 => "©", 0xAA => "ª", 0xAB => "«", 0xAC => "¬",
        0xAE => "®", 0xAF => "¯",

        0xB0 => "°", 0xB1 => "±", 0xB2 => "²", 0xB3 => "³",
        0xB4 => "´", 0xB5 => "µ", 0xB6 => "¶", 0xB7 => "·",
        0xB8 => "¸", 0xB9 => "¹", 0xBA => "º", 0xBB => "»",
        0xBC => "¼", 0xBD => "½", 0xBE => "¾", 0xBF => "¿",

        0xC0 => "À", 0xC1 => "Á", 0xC2 => "Â", 0xC3 => "Ã",
        0xC4 => "Ä", 0xC5 => "Å", 0xC6 => "Æ", 0xC7 => "Ç",
        0xC8 => "È", 0xC9 => "É", 0xCA => "Ê", 0xCB => "Ë",
        0xCC => "Ì", 0xCD => "Í", 0xCE => "Î", 0xCF => "Ï",

        0xD0 => "Ð", 0xD1 => "Ñ", 0xD2 => "Ò", 0xD3 => "Ó",
        0xD4 => "Ô", 0xD5 => "Õ", 0xD6 => "Ö", 0xD7 => "×",
        0xD8 => "Ø", 0xD9 => "Ù", 0xDA => "Ú", 0xDB => "Û",
        0xDC => "Ü", 0xDD => "Ý", 0xDE => "Þ", 0xDF => "ß",

        0xE0 => "à", 0xE1 => "á", 0xE2 => "â", 0xE3 => "ã",
        0xE4 => "ä", 0xE5 => "å", 0xE6 => "æ", 0xE7 => "ç",
        0xE8 => "è", 0xE9 => "é", 0xEA => "ê", 0xEB => "ë",
        0xEC => "ì", 0xED => "í", 0xEE => "î", 0xEF => "ï",

        0xF0 => "ð", 0xF1 => "ñ", 0xF2 => "ò", 0xF3 => "ó",
        0xF4 => "ô", 0xF5 => "õ", 0xF6 => "ö", 0xF7 => "÷",
        0xF8 => "ø", 0xF9 => "ù", 0xFA => "ú", 0xFB => "û",
        0xFC => "ü", 0xFD => "ý", 0xFE => "þ", 0xFF => "ÿ",
    );

    // array_replace() overwrites values in place, so $from keeps the
    // ascending 0..255 key order that str_replace() iterates in.
    $from = array_replace($to, $literals);

    return array($from, $to);
}
FEED 170 | 0B = U+000B : VERTICAL TABULATION 171 | 0C = U+000C : FORM FEED 172 | 0D = U+000D : CARRIAGE RETURN 173 | 0E = U+000E : SHIFT OUT 174 | 0F = U+000F : SHIFT IN 175 | 10 = U+0010 : DATA LINK ESCAPE 176 | 11 = U+0011 : DEVICE CONTROL ONE 177 | 12 = U+0012 : DEVICE CONTROL TWO 178 | 13 = U+0013 : DEVICE CONTROL THREE 179 | 14 = U+0014 : DEVICE CONTROL FOUR 180 | 15 = U+0015 : NEGATIVE ACKNOWLEDGE 181 | 16 = U+0016 : SYNCHRONOUS IDLE 182 | 17 = U+0017 : END OF TRANSMISSION BLOCK 183 | 18 = U+0018 : CANCEL 184 | 19 = U+0019 : END OF MEDIUM 185 | 1A = U+001A : SUBSTITUTE 186 | 1B = U+001B : ESCAPE 187 | 1C = U+001C : FILE SEPARATOR 188 | 1D = U+001D : GROUP SEPARATOR 189 | 1E = U+001E : RECORD SEPARATOR 190 | 1F = U+001F : UNIT SEPARATOR 191 | 20 = U+0020 : SPACE 192 | 21 = U+0021 : EXCLAMATION MARK 193 | 22 = U+0022 : QUOTATION MARK 194 | 23 = U+0023 : NUMBER SIGN 195 | 24 = U+0024 : DOLLAR SIGN 196 | 25 = U+0025 : PERCENT SIGN 197 | 26 = U+0026 : AMPERSAND 198 | 27 = U+0027 : APOSTROPHE 199 | 28 = U+0028 : LEFT PARENTHESIS 200 | 29 = U+0029 : RIGHT PARENTHESIS 201 | 2A = U+002A : ASTERISK 202 | 2B = U+002B : PLUS SIGN 203 | 2C = U+002C : COMMA 204 | 2D = U+002D : HYPHEN-MINUS 205 | 2E = U+002E : FULL STOP 206 | 2F = U+002F : SOLIDUS 207 | 30 = U+0030 : DIGIT ZERO 208 | 31 = U+0031 : DIGIT ONE 209 | 32 = U+0032 : DIGIT TWO 210 | 33 = U+0033 : DIGIT THREE 211 | 34 = U+0034 : DIGIT FOUR 212 | 35 = U+0035 : DIGIT FIVE 213 | 36 = U+0036 : DIGIT SIX 214 | 37 = U+0037 : DIGIT SEVEN 215 | 38 = U+0038 : DIGIT EIGHT 216 | 39 = U+0039 : DIGIT NINE 217 | 3A = U+003A : COLON 218 | 3B = U+003B : SEMICOLON 219 | 3C = U+003C : LESS-THAN SIGN 220 | 3D = U+003D : EQUALS SIGN 221 | 3E = U+003E : GREATER-THAN SIGN 222 | 3F = U+003F : QUESTION MARK 223 | 40 = U+0040 : COMMERCIAL AT 224 | 41 = U+0041 : LATIN CAPITAL LETTER A 225 | 42 = U+0042 : LATIN CAPITAL LETTER B 226 | 43 = U+0043 : LATIN CAPITAL LETTER C 227 | 44 = U+0044 : LATIN CAPITAL LETTER D 228 | 45 = U+0045 : LATIN 
CAPITAL LETTER E 229 | 46 = U+0046 : LATIN CAPITAL LETTER F 230 | 47 = U+0047 : LATIN CAPITAL LETTER G 231 | 48 = U+0048 : LATIN CAPITAL LETTER H 232 | 49 = U+0049 : LATIN CAPITAL LETTER I 233 | 4A = U+004A : LATIN CAPITAL LETTER J 234 | 4B = U+004B : LATIN CAPITAL LETTER K 235 | 4C = U+004C : LATIN CAPITAL LETTER L 236 | 4D = U+004D : LATIN CAPITAL LETTER M 237 | 4E = U+004E : LATIN CAPITAL LETTER N 238 | 4F = U+004F : LATIN CAPITAL LETTER O 239 | 50 = U+0050 : LATIN CAPITAL LETTER P 240 | 51 = U+0051 : LATIN CAPITAL LETTER Q 241 | 52 = U+0052 : LATIN CAPITAL LETTER R 242 | 53 = U+0053 : LATIN CAPITAL LETTER S 243 | 54 = U+0054 : LATIN CAPITAL LETTER T 244 | 55 = U+0055 : LATIN CAPITAL LETTER U 245 | 56 = U+0056 : LATIN CAPITAL LETTER V 246 | 57 = U+0057 : LATIN CAPITAL LETTER W 247 | 58 = U+0058 : LATIN CAPITAL LETTER X 248 | 59 = U+0059 : LATIN CAPITAL LETTER Y 249 | 5A = U+005A : LATIN CAPITAL LETTER Z 250 | 5B = U+005B : LEFT SQUARE BRACKET 251 | 5C = U+005C : REVERSE SOLIDUS 252 | 5D = U+005D : RIGHT SQUARE BRACKET 253 | 5E = U+005E : CIRCUMFLEX ACCENT 254 | 5F = U+005F : LOW LINE 255 | 60 = U+0060 : GRAVE ACCENT 256 | 61 = U+0061 : LATIN SMALL LETTER A 257 | 62 = U+0062 : LATIN SMALL LETTER B 258 | 63 = U+0063 : LATIN SMALL LETTER C 259 | 64 = U+0064 : LATIN SMALL LETTER D 260 | 65 = U+0065 : LATIN SMALL LETTER E 261 | 66 = U+0066 : LATIN SMALL LETTER F 262 | 67 = U+0067 : LATIN SMALL LETTER G 263 | 68 = U+0068 : LATIN SMALL LETTER H 264 | 69 = U+0069 : LATIN SMALL LETTER I 265 | 6A = U+006A : LATIN SMALL LETTER J 266 | 6B = U+006B : LATIN SMALL LETTER K 267 | 6C = U+006C : LATIN SMALL LETTER L 268 | 6D = U+006D : LATIN SMALL LETTER M 269 | 6E = U+006E : LATIN SMALL LETTER N 270 | 6F = U+006F : LATIN SMALL LETTER O 271 | 70 = U+0070 : LATIN SMALL LETTER P 272 | 71 = U+0071 : LATIN SMALL LETTER Q 273 | 72 = U+0072 : LATIN SMALL LETTER R 274 | 73 = U+0073 : LATIN SMALL LETTER S 275 | 74 = U+0074 : LATIN SMALL LETTER T 276 | 75 = U+0075 : LATIN SMALL LETTER U 
277 | 76 = U+0076 : LATIN SMALL LETTER V 278 | 77 = U+0077 : LATIN SMALL LETTER W 279 | 78 = U+0078 : LATIN SMALL LETTER X 280 | 79 = U+0079 : LATIN SMALL LETTER Y 281 | 7A = U+007A : LATIN SMALL LETTER Z 282 | 7B = U+007B : LEFT CURLY BRACKET 283 | 7C = U+007C : VERTICAL LINE 284 | 7D = U+007D : RIGHT CURLY BRACKET 285 | 7E = U+007E : TILDE 286 | 7F = U+007F : DELETE 287 | 80 = U+20AC : EURO SIGN 288 | 82 = U+201A : SINGLE LOW-9 QUOTATION MARK 289 | 83 = U+0192 : LATIN SMALL LETTER F WITH HOOK 290 | 84 = U+201E : DOUBLE LOW-9 QUOTATION MARK 291 | 85 = U+2026 : HORIZONTAL ELLIPSIS 292 | 86 = U+2020 : DAGGER 293 | 87 = U+2021 : DOUBLE DAGGER 294 | 88 = U+02C6 : MODIFIER LETTER CIRCUMFLEX ACCENT 295 | 89 = U+2030 : PER MILLE SIGN 296 | 8A = U+0160 : LATIN CAPITAL LETTER S WITH CARON 297 | 8B = U+2039 : SINGLE LEFT-POINTING ANGLE QUOTATION MARK 298 | 8C = U+0152 : LATIN CAPITAL LIGATURE OE 299 | 8E = U+017D : LATIN CAPITAL LETTER Z WITH CARON 300 | 91 = U+2018 : LEFT SINGLE QUOTATION MARK 301 | 92 = U+2019 : RIGHT SINGLE QUOTATION MARK 302 | 93 = U+201C : LEFT DOUBLE QUOTATION MARK 303 | 94 = U+201D : RIGHT DOUBLE QUOTATION MARK 304 | 95 = U+2022 : BULLET 305 | 96 = U+2013 : EN DASH 306 | 97 = U+2014 : EM DASH 307 | 98 = U+02DC : SMALL TILDE 308 | 99 = U+2122 : TRADE MARK SIGN 309 | 9A = U+0161 : LATIN SMALL LETTER S WITH CARON 310 | 9B = U+203A : SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 311 | 9C = U+0153 : LATIN SMALL LIGATURE OE 312 | 9E = U+017E : LATIN SMALL LETTER Z WITH CARON 313 | 9F = U+0178 : LATIN CAPITAL LETTER Y WITH DIAERESIS 314 | A0 = U+00A0 : NO-BREAK SPACE 315 | A1 = U+00A1 : INVERTED EXCLAMATION MARK 316 | A2 = U+00A2 : CENT SIGN 317 | A3 = U+00A3 : POUND SIGN 318 | A4 = U+00A4 : CURRENCY SIGN 319 | A5 = U+00A5 : YEN SIGN 320 | A6 = U+00A6 : BROKEN BAR 321 | A7 = U+00A7 : SECTION SIGN 322 | A8 = U+00A8 : DIAERESIS 323 | A9 = U+00A9 : COPYRIGHT SIGN 324 | AA = U+00AA : FEMININE ORDINAL INDICATOR 325 | AB = U+00AB : LEFT-POINTING DOUBLE ANGLE 
QUOTATION MARK 326 | AC = U+00AC : NOT SIGN 327 | AD = U+00AD : SOFT HYPHEN 328 | AE = U+00AE : REGISTERED SIGN 329 | AF = U+00AF : MACRON 330 | B0 = U+00B0 : DEGREE SIGN 331 | B1 = U+00B1 : PLUS-MINUS SIGN 332 | B2 = U+00B2 : SUPERSCRIPT TWO 333 | B3 = U+00B3 : SUPERSCRIPT THREE 334 | B4 = U+00B4 : ACUTE ACCENT 335 | B5 = U+00B5 : MICRO SIGN 336 | B6 = U+00B6 : PILCROW SIGN 337 | B7 = U+00B7 : MIDDLE DOT 338 | B8 = U+00B8 : CEDILLA 339 | B9 = U+00B9 : SUPERSCRIPT ONE 340 | BA = U+00BA : MASCULINE ORDINAL INDICATOR 341 | BB = U+00BB : RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 342 | BC = U+00BC : VULGAR FRACTION ONE QUARTER 343 | BD = U+00BD : VULGAR FRACTION ONE HALF 344 | BE = U+00BE : VULGAR FRACTION THREE QUARTERS 345 | BF = U+00BF : INVERTED QUESTION MARK 346 | C0 = U+00C0 : LATIN CAPITAL LETTER A WITH GRAVE 347 | C1 = U+00C1 : LATIN CAPITAL LETTER A WITH ACUTE 348 | C2 = U+00C2 : LATIN CAPITAL LETTER A WITH CIRCUMFLEX 349 | C3 = U+00C3 : LATIN CAPITAL LETTER A WITH TILDE 350 | C4 = U+00C4 : LATIN CAPITAL LETTER A WITH DIAERESIS 351 | C5 = U+00C5 : LATIN CAPITAL LETTER A WITH RING ABOVE 352 | C6 = U+00C6 : LATIN CAPITAL LETTER AE 353 | C7 = U+00C7 : LATIN CAPITAL LETTER C WITH CEDILLA 354 | C8 = U+00C8 : LATIN CAPITAL LETTER E WITH GRAVE 355 | C9 = U+00C9 : LATIN CAPITAL LETTER E WITH ACUTE 356 | CA = U+00CA : LATIN CAPITAL LETTER E WITH CIRCUMFLEX 357 | CB = U+00CB : LATIN CAPITAL LETTER E WITH DIAERESIS 358 | CC = U+00CC : LATIN CAPITAL LETTER I WITH GRAVE 359 | CD = U+00CD : LATIN CAPITAL LETTER I WITH ACUTE 360 | CE = U+00CE : LATIN CAPITAL LETTER I WITH CIRCUMFLEX 361 | CF = U+00CF : LATIN CAPITAL LETTER I WITH DIAERESIS 362 | D0 = U+00D0 : LATIN CAPITAL LETTER ETH 363 | D1 = U+00D1 : LATIN CAPITAL LETTER N WITH TILDE 364 | D2 = U+00D2 : LATIN CAPITAL LETTER O WITH GRAVE 365 | D3 = U+00D3 : LATIN CAPITAL LETTER O WITH ACUTE 366 | D4 = U+00D4 : LATIN CAPITAL LETTER O WITH CIRCUMFLEX 367 | D5 = U+00D5 : LATIN CAPITAL LETTER O WITH TILDE 368 | D6 = U+00D6 : 
LATIN CAPITAL LETTER O WITH DIAERESIS 369 | D7 = U+00D7 : MULTIPLICATION SIGN 370 | D8 = U+00D8 : LATIN CAPITAL LETTER O WITH STROKE 371 | D9 = U+00D9 : LATIN CAPITAL LETTER U WITH GRAVE 372 | DA = U+00DA : LATIN CAPITAL LETTER U WITH ACUTE 373 | DB = U+00DB : LATIN CAPITAL LETTER U WITH CIRCUMFLEX 374 | DC = U+00DC : LATIN CAPITAL LETTER U WITH DIAERESIS 375 | DD = U+00DD : LATIN CAPITAL LETTER Y WITH ACUTE 376 | DE = U+00DE : LATIN CAPITAL LETTER THORN 377 | DF = U+00DF : LATIN SMALL LETTER SHARP S 378 | E0 = U+00E0 : LATIN SMALL LETTER A WITH GRAVE 379 | E1 = U+00E1 : LATIN SMALL LETTER A WITH ACUTE 380 | E2 = U+00E2 : LATIN SMALL LETTER A WITH CIRCUMFLEX 381 | E3 = U+00E3 : LATIN SMALL LETTER A WITH TILDE 382 | E4 = U+00E4 : LATIN SMALL LETTER A WITH DIAERESIS 383 | E5 = U+00E5 : LATIN SMALL LETTER A WITH RING ABOVE 384 | E6 = U+00E6 : LATIN SMALL LETTER AE 385 | E7 = U+00E7 : LATIN SMALL LETTER C WITH CEDILLA 386 | E8 = U+00E8 : LATIN SMALL LETTER E WITH GRAVE 387 | E9 = U+00E9 : LATIN SMALL LETTER E WITH ACUTE 388 | EA = U+00EA : LATIN SMALL LETTER E WITH CIRCUMFLEX 389 | EB = U+00EB : LATIN SMALL LETTER E WITH DIAERESIS 390 | EC = U+00EC : LATIN SMALL LETTER I WITH GRAVE 391 | ED = U+00ED : LATIN SMALL LETTER I WITH ACUTE 392 | EE = U+00EE : LATIN SMALL LETTER I WITH CIRCUMFLEX 393 | EF = U+00EF : LATIN SMALL LETTER I WITH DIAERESIS 394 | F0 = U+00F0 : LATIN SMALL LETTER ETH 395 | F1 = U+00F1 : LATIN SMALL LETTER N WITH TILDE 396 | F2 = U+00F2 : LATIN SMALL LETTER O WITH GRAVE 397 | F3 = U+00F3 : LATIN SMALL LETTER O WITH ACUTE 398 | F4 = U+00F4 : LATIN SMALL LETTER O WITH CIRCUMFLEX 399 | F5 = U+00F5 : LATIN SMALL LETTER O WITH TILDE 400 | F6 = U+00F6 : LATIN SMALL LETTER O WITH DIAERESIS 401 | F7 = U+00F7 : DIVISION SIGN 402 | F8 = U+00F8 : LATIN SMALL LETTER O WITH STROKE 403 | F9 = U+00F9 : LATIN SMALL LETTER U WITH GRAVE 404 | FA = U+00FA : LATIN SMALL LETTER U WITH ACUTE 405 | FB = U+00FB : LATIN SMALL LETTER U WITH CIRCUMFLEX 406 | FC = U+00FC : LATIN 
/**
 * Render a binary string as lowercase two-digit hex bytes separated by spaces.
 *
 * Example: "\x01\xff" becomes "01 ff". An empty input yields an empty string.
 *
 * @param string $value Raw bytes to render
 * @return string Space-separated hex dump
 */
static function toHex($value){
    $pairs = array();
    $count = strlen($value);
    for($pos = 0; $pos < $count; $pos++){
        // %02x zero-pads to two digits, matching dechex() plus manual padding.
        $pairs[] = sprintf("%02x", ord($value[$pos]));
    }
    return implode(" ", $pairs);
}
"watermark", 81 | 209 => "tamper proof keys", 82 | 300 => "fontsignature", 83 | 401 => "clippinglimit", 84 | 402 => "publisherlimit", 85 | 403 => "403", 86 | 404 => "ttsflag", 87 | 501 => "cdetype", 88 | 502 => "lastupdatetime", 89 | 503 => "updatedtitle" 90 | ); 91 | static private $flippedTypes = array( 92 | "drm server id" => 1, 93 | "drm commerce id" => 2, 94 | "drm ebookbase book id" => 3, 95 | "author" => 100, 96 | "publisher" => 101, 97 | "imprint" => 102, 98 | "description" => 103, 99 | "isbn" => 104, 100 | "subject" => 105, 101 | "publishingdate" => 106, 102 | "review" => 107, 103 | "contributor" => 108, 104 | "rights" => 109, 105 | "subjectcode" => 110, 106 | "type" => 111, 107 | "source" => 112, 108 | "asin" => 113, 109 | "versionnumber" => 114, 110 | "sample" => 115, 111 | "startreading" => 116, 112 | "retail price" => 118, 113 | "retail price currency" => 119, 114 | "coveroffset" => 201, 115 | "thumboffset" => 202, 116 | "hasfakecover" => 203, 117 | "Creator Software" => 204, 118 | "Creator Major Version" => 205, 119 | "Creator Minor Version" => 206, 120 | "Creator Build Number" => 207, 121 | "watermark" => 208, 122 | "tamper proof keys" => 209, 123 | "fontsignature" => 300, 124 | "clippinglimit" => 401, 125 | "publisherlimit" => 402, 126 | "403" => 403, 127 | "ttsflag" => 404, 128 | "cdetype" => 501, 129 | "lastupdatetime" => 502, 130 | "updatedtitle" => 503 131 | ); 132 | } -------------------------------------------------------------------------------- /MOBIClass/FileByte.php: -------------------------------------------------------------------------------- 1 | set($n); 18 | } 19 | 20 | public function get(){ 21 | return $this->data; 22 | } 23 | 24 | public function set($value){ 25 | $this->data = intval($value) & 0xFF; 26 | } 27 | 28 | public function serialize() { 29 | return $this->byteToString($this->data); 30 | } 31 | 32 | public function unserialize($data) { 33 | __construct($this->toInt($data)); 34 | } 35 | 36 | 37 | public function 
/**
 * Compute the byte offset at which a named entry starts inside this element.
 *
 * Entries are walked in their stored order and getByteLength() of every
 * entry preceding $name is summed. If no entry has that name, the loop
 * runs to the end and the combined length of all entries is returned.
 *
 * @param string $name Key of the entry to locate
 * @return int Number of bytes that precede the entry
 */
public function offsetToEntry($name){
    $wanted = (string)$name;
    $offset = 0;
    foreach($this->elements as $key => $entry){
        // Compare as strings rather than with ==, so type juggling cannot
        // make unrelated keys match (e.g. a non-numeric name against the
        // integer key 0 on PHP < 8, or "1e1" against 10). PHP stores
        // numeric-string keys as integers, hence the cast on $key.
        if((string)$key === $wanted){
            break;
        }
        $offset += $entry->getByteLength();
    }
    return $offset;
}
/**
 * Decode up to four leading bytes of a string as a big-endian integer.
 *
 * Only the first min(4, strlen($string)) bytes are read; the first byte
 * becomes the most significant one. An empty string decodes to 0.
 *
 * @param string $string Input bytes
 * @return int Decoded integer
 */
public function toInt($string){
    $width = min(4, strlen($string));
    $value = 0;
    // Shift for the first (most significant) byte; drops by 8 per byte.
    $shift = ($width - 1) * 8;
    for($pos = 0; $pos < $width; $pos++){
        $value |= ord($string[$pos]) << $shift;
        $shift -= 8;
    }
    return $value;
}
/**
 * Re-initialise this wrapper from serialized data.
 *
 * The data is handed to the wrapped record's own unserialize() and the
 * result is passed back through this object's constructor.
 *
 * The constructor has to be invoked through $this: a bare __construct(...)
 * is resolved as a call to a global function of that name, which does not
 * exist, so it fails at runtime.
 *
 * NOTE(review): this stores whatever $this->record->unserialize() returns
 * as the new record -- confirm that the record class returns a record there.
 *
 * @param string $data String representation
 */
public function unserialize($data) {
    $this->__construct($this->record->unserialize($data));
}
/**
 * Produce the byte string for this field.
 *
 * Without a forced length (negative forcedLength) the stored data is
 * returned unchanged. With a forced length the data is right-padded with
 * NUL bytes or cut off so that the result is exactly that many bytes.
 *
 * @return string Serialized field
 */
public function serialize() {
    $payload = $this->data;
    $have = strlen($payload);
    $want = $this->forcedLength;

    // Only a non-negative forced length that differs from the current
    // length requires any adjustment.
    if($want >= 0 && $want != $have){
        return $want > $have
            ? str_pad($payload, $want, "\0", STR_PAD_RIGHT)
            : substr($payload, 0, $want);
    }
    return $payload;
}
public function unserialize($data) { 71 | /* Must be a method call: a bare "__construct(...)" resolves to an undefined global function. */ $this->__construct($data); 72 | } 73 | 74 | public function __toString(){ 75 | $out = "FileString"; 76 | if($this->forcedLength >= 0){ 77 | $out .= " ".$this->forcedLength; 78 | } 79 | $out .= ": {\"".str_replace(array(" ", "\0"), " ", $this->serialize())."\"}"; 80 | return $out; 81 | } 82 | } 83 | ?> 84 | -------------------------------------------------------------------------------- /MOBIClass/FileTri.php: -------------------------------------------------------------------------------- 1 | set($n); 18 | } 19 | 20 | public function get(){ 21 | return $this->data; 22 | } 23 | 24 | public function set($value){ 25 | $this->data = intval($value) & 0xFFFFFF; 26 | } 27 | 28 | public function serialize() { 29 | return $this->triToString($this->data); 30 | } 31 | 32 | public function unserialize($data) { 33 | /* Must be a method call: a bare "__construct(...)" resolves to an undefined global function. */ $this->__construct($this->toInt($data)); 34 | } 35 | 36 | 37 | public function __toString(){ 38 | return "FileTri: {".$this->triAsString($this->data)."}"; 39 | } 40 | } 41 | ?> 42 | -------------------------------------------------------------------------------- /MOBIClass/Http.php: -------------------------------------------------------------------------------- 1 | 'val1', 'var2' => 'val2') */ 19 | $postdata = array(), /* HTTP POST Data ie. array('var1' => 'val1', 'var2' => 'val2') */ 20 | $cookie = array(), /* HTTP Cookie Data ie. array('var1' => 'val1', 'var2' => 'val2') */ 21 | $custom_headers = array(), /* Custom HTTP headers ie. 
array('Referer: http://localhost/ */ 22 | $timeout = 1000, /* Socket timeout in milliseconds */ 23 | $req_hdr = false, /* Include HTTP request headers */ 24 | $res_hdr = false, /* Include HTTP response headers */ 25 | $depth = 4 /* Depth of the iteration left (to avoid redirection loops) */ 26 | ) 27 | { 28 | if(self::$cache){ 29 | $cacheFile = "cache/".$ip."/".str_replace("/", "...", $uri); 30 | 31 | if(is_file($cacheFile)){ 32 | $data = file_get_contents($cacheFile); 33 | 34 | /* The cache is written after chunked bodies have been resolved, so the stored data must be returned as-is, not de-chunked a second time. */ return $data; 35 | } 36 | } 37 | $ret = ''; 38 | $verb = strtoupper($verb); 39 | $cookie_str = ''; 40 | $getdata_str = count($getdata) ? '?' : ''; 41 | $postdata_str = ''; 42 | 43 | foreach ($getdata as $k => $v) 44 | $getdata_str .= ($getdata_str === '?' ? '' : '&') . urlencode($k) .'='. urlencode($v); /* pairs after the first need an '&' separator */ 45 | 46 | foreach ($postdata as $k => $v) 47 | $postdata_str .= urlencode($k) .'='. urlencode($v) .'&'; 48 | 49 | foreach ($cookie as $k => $v) 50 | $cookie_str .= urlencode($k) .'='. urlencode($v) .'; '; 51 | 52 | $crlf = "\r\n"; 53 | $req = $verb .' '. $uri . $getdata_str .' HTTP/1.1' . $crlf; 54 | $req .= 'Host: '. $ip . $crlf; 55 | $req .= 'User-Agent: Mozilla/5.0 Firefox/3.6.12' . $crlf; 56 | $req .= 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' . $crlf; 57 | $req .= 'Accept-Language: en-us,en;q=0.5' . $crlf; 58 | $req .= 'Accept-Encoding: deflate' . $crlf; 59 | $req .= 'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7' . $crlf; 60 | 61 | 62 | foreach ($custom_headers as $k => $v) 63 | $req .= $k .': '. $v . $crlf; 64 | 65 | if (!empty($cookie_str)) 66 | $req .= 'Cookie: '. substr($cookie_str, 0, -2) . $crlf; 67 | 68 | if ($verb == 'POST' && !empty($postdata_str)) 69 | { 70 | $postdata_str = substr($postdata_str, 0, -1); 71 | $req .= 'Content-Type: application/x-www-form-urlencoded' . $crlf; 72 | $req .= 'Content-Length: '. strlen($postdata_str) . $crlf . 
$crlf; 73 | $req .= $postdata_str; 74 | } 75 | else $req .= $crlf; 76 | 77 | if ($req_hdr) 78 | $ret .= $req; 79 | 80 | if (($fp = @fsockopen($ip, $port, $errno, $errstr)) == false) 81 | return "Error $errno: $errstr\n"; 82 | 83 | stream_set_timeout($fp, 0, $timeout * 1000); 84 | 85 | fputs($fp, $req); 86 | $ret .= stream_get_contents($fp); 87 | fclose($fp); 88 | 89 | $headerSplit = strpos($ret, "\r\n\r\n"); 90 | $header = substr($ret, 0, $headerSplit); 91 | 92 | $redirectURL = self::CheckForRedirect($header); 93 | 94 | if($redirectURL !== false){ 95 | if($depth > 0){ 96 | $url_parts = parse_url($redirectURL); 97 | $url_parts["port"] = isset($url_parts["port"]) ? $url_parts["port"] : 80; 98 | $url_parts["path"] = isset($url_parts["path"]) ? $url_parts["path"] : "/"; 99 | 100 | return self::FullRequest($verb, $url_parts["host"], $url_parts["port"], $url_parts["path"], $getdata, $postdata, $cookie, $custom_headers, $timeout, $req_hdr, $res_hdr, $depth-1); 101 | }else{ 102 | return "Redirect loop, stopping..."; 103 | } 104 | } 105 | 106 | $truncated = false; 107 | $headerLines = explode("\r\n", $header); 108 | foreach($headerLines as $line){ 109 | /* Split on the first colon only and pad to two parts: the status line has no colon and header values may contain more. */ list($name, $value) = array_pad(explode(":", $line, 2), 2, ""); 110 | $name = trim($name); 111 | $value = trim($value); 112 | 113 | if(strtolower($name) == "transfer-encoding" && strtolower($value) == "chunked"){ //TODO: Put right values! 
114 | $truncated = true; 115 | } 116 | } 117 | 118 | if (!$res_hdr) 119 | $ret = substr($ret, $headerSplit + 4); 120 | 121 | if($truncated){ 122 | $ret = self::resolveTruncated($ret); 123 | } 124 | if(self::$cache){ 125 | if(!is_dir("cache")){ 126 | mkdir("cache"); 127 | } 128 | if(!is_dir("cache/".$ip)){ 129 | mkdir("cache/".$ip); 130 | } 131 | if(!is_file("cache/".$ip."/".str_replace("/", "...", $uri))){ 132 | $h = fopen("cache/".$ip."/".str_replace("/", "...", $uri), "w"); 133 | fwrite($h, $ret); 134 | fclose($h); 135 | } 136 | } 137 | 138 | return $ret; 139 | } 140 | 141 | /* Concatenate the chunks of a chunked body: each chunk is a hexadecimal size line, CRLF, then that many bytes. */ private static function resolveTruncated($data){ 142 | $pos = 0; 143 | $end = strlen($data); 144 | $out = ""; 145 | 146 | while($pos < $end){ 147 | $endVal = strpos($data, "\r\n", $pos); if($endVal === false) break; /* no further size line: stop rather than compute offsets from false */ 148 | $value = hexdec(substr($data, $pos, $endVal-$pos)); 149 | $out .= substr($data, $endVal+2, $value); 150 | $pos = $endVal+2+$value; 151 | } 152 | 153 | return $out; 154 | } 155 | 156 | /* Return the Location value when the status code starts with 3, otherwise false. */ private static function CheckForRedirect($header){ 157 | $eol = strpos($header, "\r\n"); $firstLine = ($eol === false) ? $header : substr($header, 0, $eol); /* a header block may consist of the status line only */ 158 | list($httpVersion, $statusCode, $message) = array_pad(explode(" ", $firstLine, 3), 3, ""); 159 | 160 | if(substr($statusCode, 0, 1) == "3"){ 161 | $locPos = strpos(strtolower($header), "location: "); if($locPos === false) return false; $part = substr($header, $locPos+strlen("location: ")); 162 | $eol = strpos($part, "\r\n"); $location = trim(($eol === false) ? $part : substr($part, 0, $eol)); /* Location can be the last header line, with no CRLF after it */ 163 | 164 | if(strlen($location) > 0){ 165 | return $location; 166 | } 167 | } 168 | return false; 169 | } 170 | } 171 | ?> -------------------------------------------------------------------------------- /MOBIClass/ImageHandler.php: -------------------------------------------------------------------------------- 1 | 41 | -------------------------------------------------------------------------------- /MOBIClass/LinkedStringBuilder.php: -------------------------------------------------------------------------------- 1 | links[$name] = $this->length(); 12 | } 13 | 14 | public function resolveLink($name, $value) { 15 | 
$this->resolutions[$name] = $value; 16 | } 17 | 18 | public function append($string) { 19 | $len = strlen($string); 20 | 21 | $this->length += $len; 22 | $this->partSize[] = $len; 23 | $this->parts[] = $string; 24 | } 25 | 26 | public function replace($from, $to, $replacement) { 27 | $partStart = 0; 28 | $partEnd = 0; 29 | for ($i = 0, $len = sizeof($this->partSize); $i < $len; $i++) { 30 | $partEnd += $this->partSize[$i]; 31 | if ($partEnd > $from) { 32 | if ($partEnd < $to) { 33 | $this->replace($partEnd, $to, substr($replacement, $partEnd - $from)); 34 | $replacement = substr($replacement, 0, $partEnd - $from); 35 | $to = $partEnd; 36 | } 37 | 38 | $cur = $this->parts[$i]; 39 | 40 | for ($j = 0; $j < $to - $from; $j++) { 41 | $cur[$from - $partStart + $j] = $replacement[$j]; 42 | } 43 | 44 | $this->parts[$i] = $cur; 45 | return true; 46 | } 47 | $partStart = $partEnd; 48 | } 49 | 50 | throw new Exception("Couldn't replace string (target longer than source?)"); 51 | } 52 | 53 | public function length() { 54 | return $this->length; 55 | } 56 | 57 | public function processLinks() { 58 | foreach ($this->resolutions as $name => $value) { 59 | if (isset($this->links[$name])) { 60 | $start = $this->links[$name]; 61 | $this->replace($start, $start + strlen($value), $value); 62 | 63 | unset($this->resolutions[$name]); 64 | } 65 | } 66 | } 67 | 68 | public function build() { 69 | $this->processLinks(); 70 | 71 | return implode("", $this->parts); 72 | } 73 | } -------------------------------------------------------------------------------- /MOBIClass/MOBI.php: -------------------------------------------------------------------------------- 1 | setInternetSource($url); //Load URL, the result will be cleaned using a Readability port 42 | * $mobi->setFileSource($file); //Load a local file without any extra changes 43 | * $mobi->setData($data); //Load data 44 | * 45 | * //If you want, you can set some optional settings (see Settings.php for all recognized settings) 46 | * 
$options = array( 47 | * "title"=>"Insert title here", 48 | * "author"=>"Author" 49 | * ); 50 | * $mobi->setOptions($options); 51 | * 52 | * //Then there are two ways to output it: 53 | * $mobi->save($file); //Save the file locally 54 | * $mobi->download($name); //Let the client download the file, make sure the page 55 | * //that calls it doesn't output anything, otherwise it might 56 | * //conflict with the download. $name contains the file name, 57 | * //usually something like "title.mobi" (where the title should 58 | * //be cleaned so as not to contain illegal characters). 59 | * 60 | * 61 | * @author Sander Kromwijk 62 | */ 63 | class MOBI { 64 | private $source = false; 65 | private $images = array(); 66 | private $optional = array(); 67 | private $imgCounter = 0; 68 | private $debug = false; 69 | private $prc = false; 70 | 71 | public function __construct(){ 72 | 73 | } 74 | 75 | public function getTitle(){ 76 | if(isset($this->optional["title"])){ 77 | return $this->optional["title"]; 78 | } 79 | return false; 80 | } 81 | 82 | /** 83 | * Set a content provider as source 84 | * @param ContentProvider $content Content Provider to use 85 | */ 86 | public function setContentProvider($content){ 87 | $this->setOptions($content->getMetaData()); 88 | $this->setImages($content->getImages()); 89 | $this->setData($content->getTextData()); 90 | } 91 | 92 | /** 93 | * Set a local file as source 94 | * @param string $file Path to the file 95 | */ 96 | public function setFileSource($file){ 97 | $this->setData(file_get_contents($file)); 98 | } 99 | 100 | /** 101 | * Set the data to use 102 | * @param string $data Data to put in the file 103 | */ 104 | public function setData($data){ 105 | //$data = utf8_encode($data); 106 | $data = CharacterEntities::convert($data); 107 | //$data = utf8_decode($data); 108 | //$this->source = iconv('UTF-8', 'ISO-8859-1//TRANSLIT', $data); 109 | $this->source = $data; 110 | $this->prc = false; 111 | } 112 | 113 | /** 114 | * Set the images to 
use 115 | * @param array $data Data to put in the file 116 | */ 117 | public function setImages($data){ 118 | $this->images = $data; 119 | $this->prc = false; 120 | } 121 | 122 | /** 123 | * Set options, usually for things like titles, authors, etc... 124 | * @param array $options Options to set 125 | */ 126 | public function setOptions($options){ 127 | $this->optional = $options; 128 | $this->prc = false; 129 | } 130 | 131 | /** 132 | * Prepare the prc file 133 | * @return Prc The file that can be used to be saved/downloaded 134 | */ 135 | private function preparePRC(){ 136 | if($this->source === false){ 137 | throw new Exception("No data set"); 138 | } 139 | if($this->prc !== false) return $this->prc; 140 | 141 | $data = $this->source; 142 | $len = strlen($data); 143 | 144 | $settings = new Settings($this->optional); 145 | $rec = new RecordFactory($settings); 146 | $dataRecords = $rec->createRecords($data); 147 | $nRecords = sizeof($dataRecords); 148 | $mobiHeader = new PalmRecord($settings, $dataRecords, $nRecords, $len, sizeof($this->images)); 149 | array_unshift($dataRecords, $mobiHeader); 150 | $dataRecords = array_merge($dataRecords, $this->images); 151 | $dataRecords[] = $rec->createFLISRecord(); 152 | $dataRecords[] = $rec->createFCISRecord($len); 153 | $dataRecords[] = $rec->createEOFRecord(); 154 | $this->prc = new Prc($settings, $dataRecords); 155 | return $this->prc; 156 | } 157 | 158 | /** 159 | * Save the file locally 160 | * @param string $filename Path to save the file 161 | */ 162 | public function save($filename){ 163 | $prc = $this->preparePRC(); 164 | $prc->save($filename); 165 | } 166 | 167 | /** 168 | * Let the client download the file. Warning! No data should be 169 | * outputted before or after. 
170 | * @param string $name Name used for download, usually "title.mobi" 171 | */ 172 | public function download($name){ 173 | $prc = $this->preparePRC(); 174 | $data = $prc->serialize(); 175 | $length = strlen($data); 176 | 177 | if($this->debug) return; //In debug mode, don't start the download 178 | 179 | header("Content-Type: application/x-mobipocket-ebook"); 180 | header("Content-Disposition: attachment; filename=\"".$name."\""); 181 | header("Content-Transfer-Encoding: binary"); 182 | header("Accept-Ranges: bytes"); 183 | header("Cache-control: private"); 184 | header('Pragma: private'); 185 | header("Expires: Mon, 26 Jul 1997 05:00:00 GMT"); 186 | header("Content-Length: ".$length); 187 | 188 | echo $data; 189 | //Finished! 190 | } 191 | 192 | } 193 | ?> -------------------------------------------------------------------------------- /MOBIClass/MOBIFile.php: -------------------------------------------------------------------------------- 1 | "Unknown Title", "toc" => true); 20 | private $parts = array(); 21 | private $images = array(); 22 | 23 | private $links = array(); 24 | 25 | /** 26 | * Get the text data (the "html" code) 27 | */ 28 | public function getTextData(){ 29 | $str = new LinkedStringBuilder(); 30 | 31 | $str->append(""); 32 | $str->append(""); 33 | $this->addGuide($str); 34 | $str->append(""); 35 | $str->append(""); 36 | 37 | $this->resolveFilepos($str, self::START_LINK); 38 | $str->append("

".$this->settings["title"]."

"); 39 | 40 | $entries = $this->addText($str); 41 | 42 | $this->addTOC($str, $entries); 43 | 44 | $str->append(""); 45 | $str->append(""); 46 | 47 | return $str->build(); 48 | } 49 | 50 | private function filepos($position) { 51 | return str_pad($position, 10, "0", STR_PAD_LEFT); 52 | } 53 | 54 | private function addFilepos($str, $name) { 55 | $str->addLink($name); 56 | $str->append($this->filepos(0)); 57 | } 58 | 59 | private function resolveFilepos($str, $name) { 60 | $str->resolveLink($name, $this->filepos($str->length())); 61 | } 62 | 63 | private function addGuide($str) { 64 | $str->append(""); 65 | $str->append(""); 68 | $str->append(""); 71 | $str->append(""); 72 | } 73 | 74 | 75 | /** 76 | * Generate the body's text and the chapter entries 77 | * @return array($string, $entries) $string is the html data, $entries 78 | * contains the level, the title and the position of the titles. 79 | */ 80 | private function addText($str){ 81 | $entries = array(); 82 | 83 | for($i = 0; $i < sizeof($this->parts); $i++){ 84 | list($type, $data) = $this->parts[$i]; 85 | $id = "title_".$i; 86 | 87 | switch($type){ 88 | case self::PARAGRAPH: 89 | $str->append("

".$data."

"); 90 | break; 91 | case self::PAGEBREAK: 92 | $str->append(""); 93 | break; 94 | case self::H2: 95 | $entries[] = array("level" => 2, "title" => $data, "id" => $id); 96 | $this->resolveFilepos($str, $id); 97 | $str->append("

".$data."

"); 98 | break; 99 | case self::H3: 100 | $entries[] = array("level" => 3, "title" => $data, "id" => $id); 101 | $this->resolveFilepos($str, $id); 102 | $str->append("

".$data."

"); 103 | break; 104 | case self::IMAGE: 105 | $str->append(""); 106 | break; 107 | } 108 | } 109 | return $entries; 110 | } 111 | 112 | /** 113 | * Generate a TOC 114 | * @param $str LinkedStringBuilder 115 | * @param $entries The entries array generated by generateText (contains "id" and "title" values) 116 | */ 117 | private function addTOC($str, $entries){ 118 | $this->resolveFilepos($str, self::TOC_LINK); 119 | $str->append("

Contents

"); 120 | $str->append("
"); 121 | for($i = 0, $len = sizeof($entries); $i < $len; $i++){ 122 | $entry = $entries[$i]; 123 | 124 | $str->append(""); 127 | } 128 | $str->append("
append(">".$entry["title"]."
"); 129 | } 130 | 131 | /** 132 | * Get the file records of the images 133 | */ 134 | public function getImages(){ 135 | return $this->images; 136 | } 137 | 138 | /** 139 | * Get the metadata 140 | */ 141 | public function getMetaData(){ 142 | return $this->settings; 143 | } 144 | 145 | /** 146 | * Change the file's settings. For example set("author", "John Doe") or set("title", "The adventures of John Doe"). 147 | * @param $key Key of the setting to insert. 148 | */ 149 | public function set($key, $value){ 150 | $this->settings[$key] = $value; 151 | } 152 | 153 | /** 154 | * Get the file's settings. 155 | */ 156 | public function get($key){ 157 | return $this->settings[$key]; 158 | } 159 | 160 | /** 161 | * Append a paragraph of text to the file. 162 | * @param string $text The text to insert. 163 | */ 164 | public function appendParagraph($text){ 165 | $this->parts[] = array(self::PARAGRAPH, $text); 166 | } 167 | 168 | /** 169 | * Append a chapter title (H2) 170 | * @param string $title The title to insert. 171 | */ 172 | public function appendChapterTitle($title){ 173 | $this->parts[] = array(self::H2, $title); 174 | } 175 | 176 | /** 177 | * Append a section title (H3) 178 | * @param string $title The title to insert. 179 | */ 180 | public function appendSectionTitle($title){ 181 | $this->parts[] = array(self::H3, $title); 182 | } 183 | 184 | public function appendPageBreak() { 185 | $this->parts[] = array(self::PAGEBREAK, null); 186 | } 187 | 188 | /** 189 | * Append an image. 
190 | * @param resource $img An image file (for example, created by `imagecreate`) 191 | */ 192 | public function appendImage($img){ 193 | $imgIndex = sizeof($this->images); 194 | $this->images[] = new FileRecord(new Record(ImageHandler::CreateImage($img))); 195 | $this->parts[] = array(self::IMAGE, $imgIndex); 196 | } 197 | } -------------------------------------------------------------------------------- /MOBIClass/MultipleFileHandler.php: -------------------------------------------------------------------------------- 1 | ".$title."".$contents.""; 34 | } 35 | $pos = 0; 36 | 37 | if(sizeof($this->toc) > 0){ 38 | $lastToc = $this->toc[sizeof($this->toc)-1]; 39 | $lastFile = $this->files[sizeof($this->files)-1]; 40 | $pos = $lastToc["pos"] + strlen($lastFile) + 1; 41 | } 42 | 43 | $this->files[] = $contents; 44 | $this->toc[] = array("title"=>$title, "pos"=>$pos); 45 | } 46 | 47 | /** 48 | * Add an image to the file 49 | * @param string $imageContents Data string containing the binary data of the image 50 | * @return int The reference of the image 51 | */ 52 | public function addImage($imageContents){ 53 | $this->images[] = $imageContents; 54 | return sizeof($this->images)-1; 55 | } 56 | 57 | /** 58 | * Add an image to the file 59 | * @param string $url Url to the image 60 | * @return int The reference of the image, false if the image couldn't be downloaded 61 | */ 62 | public function addImageFromUrl($url){ 63 | $image = ImageHandler::DownloadImage($url); 64 | 65 | if($image === false) return false; 66 | return $this->addImage($image); 67 | } 68 | 69 | /** 70 | * Set the metadata 71 | * @param string $key Key 72 | * @param string $value Value 73 | */ 74 | public function setMetadata($key, $value){ 75 | $this->metadata[$key] = $value; 76 | } 77 | 78 | /** 79 | * Get the text data to be integrated in the MOBI file 80 | * @return string 81 | */ 82 | public function getTextData(){ 83 | $data = implode("\n", $this->files); 84 | $begin = ""; 85 | $beforeTOC = 
$begin.$data; 86 | 87 | $tocPos = strlen($beforeTOC); 88 | 89 | $toc = $this->generateTOC(strlen($begin)); 90 | 91 | $customBegin = ""; 92 | $data = $customBegin.$data.$toc.""; 93 | return $data; 94 | } 95 | 96 | public function forceLength($n, $l){ 97 | $str = $n.""; 98 | $cur = strlen($str); 99 | while($cur < $l){ 100 | $str = "0".$str; 101 | $cur++; 102 | } 103 | return $str; 104 | } 105 | 106 | public function generateTOC($base = 0){ 107 | $toc = "

Contents

"; 108 | $toc .= "
"; 109 | for($i = 0, $len = sizeof($this->toc); $i < $len; $i++){ 110 | $entry = $this->toc[$i]; 111 | $position = $entry["pos"]+$base; 112 | $toc .= ""; 113 | } 114 | $toc .= "
".($i+1).".".$entry["title"]."
"; 115 | 116 | return $toc; 117 | } 118 | /** 119 | * Get the images (an array containing the jpeg data). Array entry 0 will 120 | * correspond to image record 0. 121 | * @return array 122 | */ 123 | public function getImages(){ 124 | return $this->images; 125 | } 126 | 127 | /** 128 | * Get the metadata in the form of a hashtable (for example, title or author). 129 | * @return array 130 | */ 131 | public function getMetaData(){ 132 | return $this->metadata; 133 | } 134 | 135 | } 136 | ?> 137 | -------------------------------------------------------------------------------- /MOBIClass/OnlineArticle.php: -------------------------------------------------------------------------------- 1 | init(); 23 | if(!isset($this->metadata["title"])){ 24 | $this->metadata["title"] = CharacterEntities::convert(strip_tags($r->getTitle()->innerHTML)); 25 | } 26 | if(!isset($this->metadata["author"])){ 27 | $parts = parse_url($url); 28 | $this->metadata["author"] = $parts["host"]; 29 | } 30 | 31 | $article = $r->getContent()->innerHTML; 32 | if(substr($article, 0, 5) == ""; 34 | }else{ 35 | $article = "".$article.""; 36 | } 37 | $doc = new DOMDocument(); 38 | @$doc->loadHTML($article) or die($article); 39 | $doc->normalizeDocument(); 40 | 41 | $this->images = $this->handleImages($doc, $url); 42 | $this->text = $doc->saveHTML(); 43 | } 44 | 45 | /** 46 | * Get the text data to be integrated in the MOBI file 47 | * @return string 48 | */ 49 | public function getTextData(){ 50 | return $this->text; 51 | } 52 | /** 53 | * Get the images (an array containing the jpeg data). Array entry 0 will 54 | * correspond to image record 0. 55 | * @return array 56 | */ 57 | public function getImages(){ 58 | return $this->images; 59 | } 60 | /** 61 | * Get the metadata in the form of a hashtable (for example, title or author). 
62 | * @return array 63 | */ 64 | public function getMetaData(){ 65 | return $this->metadata; 66 | } 67 | /** 68 | * 69 | * @param DOMElement $dom 70 | * @return array 71 | */ 72 | private function handleImages($dom, $url){ 73 | $images = array(); 74 | 75 | $parts = parse_url($url); 76 | 77 | $savedImages = array(); 78 | 79 | $imgElements = $dom->getElementsByTagName('img'); 80 | foreach($imgElements as $img) { 81 | $src = $img->getAttribute("src"); 82 | 83 | $is_root = false; 84 | if(substr($src, 0, 1) == "/"){ 85 | $is_root = true; 86 | } 87 | 88 | $parsed = parse_url($src); 89 | 90 | if(!isset($parsed["host"])){ 91 | if($is_root){ 92 | $src = http_build_url($url, $parsed, HTTP_URL_REPLACE); 93 | }else{ 94 | $src = http_build_url($url, $parsed, HTTP_URL_JOIN_PATH); 95 | } 96 | } 97 | $img->setAttribute("src", ""); 98 | if(isset($savedImages[$src])){ 99 | $img->setAttribute("recindex", $savedImages[$src]); 100 | }else{ 101 | $image = ImageHandler::DownloadImage($src); 102 | 103 | if($image !== false){ 104 | $images[$this->imgCounter] = new FileRecord(new Record($image)); 105 | 106 | $img->setAttribute("recindex", $this->imgCounter); 107 | $savedImages[$src] = $this->imgCounter; 108 | $this->imgCounter++; 109 | } 110 | } 111 | } 112 | 113 | return $images; 114 | } 115 | } 116 | ?> 117 | -------------------------------------------------------------------------------- /MOBIClass/PalmRecord.php: -------------------------------------------------------------------------------- 1 | elements = new FileElement(array( 15 | "compression"=>new FileShort(), 16 | "unused"=>new FileShort(), 17 | "textLength"=>new FileInt(), 18 | "recordCount"=>new FileShort(), 19 | "recordSize"=>new FileShort(), 20 | "encryptionType"=>new FileShort(), 21 | "unused2"=>new FileShort(), 22 | //MOBI Header 23 | "mobiIdentifier"=>new FileString("MOBI", 4), 24 | "mobiHeaderLength"=>new FileInt(), 25 | "mobiType"=>new FileInt(), 26 | "textEncoding"=>new FileInt(), 27 | "uniqueID"=>new FileInt(), 28 | 
"fileVersion"=>new FileInt(), 29 | "reserved"=>new FileString(40), 30 | "firstNonBookIndex"=>new FileInt(), 31 | "fullNameOffset"=>new FileInt(), 32 | "fullNameLength"=>new FileInt(), 33 | "locale"=>new FileInt(), 34 | "inputLanguage"=>new FileInt(), 35 | "outputLanguage"=>new FileInt(), 36 | "minimumVersion"=>new FileInt(), 37 | "firstImageIndex"=>new FileInt(), 38 | "huffmanRecordOffset"=>new FileInt(), 39 | "huffmanRecordCount"=>new FileInt(), 40 | "unused3"=>new FileString(8), 41 | "exthFlags"=>new FileInt(0x40), 42 | "unknown"=>new FileString(32), 43 | "drmOffset"=>new FileInt(0xFFFFFFFF), 44 | "drmCount"=>new FileShort(0xFFFFFFFF), 45 | "drmSize"=>new FileShort(), 46 | "drmFlags"=>new FileInt(), 47 | "mobiFiller"=>new FileString(72), 48 | //EXTH Header 49 | "exthIdentifier"=>new FileString("EXTH", 4), 50 | "exthHeaderLength"=>new FileInt(), 51 | "exthRecordCount"=>new FileInt(), 52 | "exthRecords"=>new FileElement(), 53 | "exthPadding"=>new FileString(), 54 | //"fullNamePadding"=>new FileString(100), 55 | "fullName"=>new FileString() 56 | )); 57 | 58 | //Set values from the info block 59 | foreach($settings->values as $name => $val){ 60 | //echo $name.", "; 61 | if($this->elements->exists($name)){ 62 | $this->elements->get($name)->set($settings->get($name)); 63 | } 64 | } 65 | 66 | $els = $settings->values; 67 | 68 | $exthElems = new FileElement(); 69 | $i = 0; 70 | $l = 0; 71 | foreach($els as $name=>$val){ 72 | $type = EXTHHelper::textToType($name); 73 | if($type !== false){ 74 | $type = new FileInt($type); 75 | $length = new FileInt(8+strlen($val)); 76 | $data = new FileString($val); 77 | $l += 8+strlen($val); 78 | $exthElems->add("type".$i, $type); 79 | $exthElems->add("length".$i, $length); 80 | $exthElems->add("data".$i, $data); 81 | $i++; 82 | } 83 | } 84 | 85 | if($images > 0){ 86 | $this->elements->get("firstImageIndex")->set($textRecords+1); 87 | } 88 | $this->elements->get("firstNonBookIndex")->set($textRecords+2+$images); 89 | 
$this->elements->get("reserved")->set(str_pad("", 40, chr(255), STR_PAD_RIGHT)); 90 | $this->elements->get("exthRecordCount")->set($i); 91 | $this->elements->set("exthRecords", $exthElems); 92 | $pad = $l%4; 93 | $pad = (4-$pad)%4; 94 | $this->elements->get("exthPadding")->set(str_pad("", $pad, "\0", STR_PAD_RIGHT)); 95 | $this->elements->get("exthHeaderLength")->set(12+$l+$pad); 96 | 97 | 98 | $this->elements->get("recordCount")->set($textRecords); 99 | 100 | $this->elements->get("fullNameOffset")->set($this->elements->offsetToEntry("fullName")); 101 | $this->elements->get("fullNameLength")->set(strlen($settings->get("title"))); 102 | $this->elements->get("fullName")->set($settings->get("title")); 103 | $this->elements->get("textLength")->set($textLength); 104 | } 105 | 106 | public function getByteLength(){ 107 | return $this->getLength(); 108 | } 109 | 110 | public function getLength(){ 111 | return $this->elements->getByteLength(); 112 | } 113 | 114 | public function get(){ 115 | return $this; 116 | } 117 | 118 | public function set($elements){ 119 | throw new Exception("Unallowed set"); 120 | } 121 | 122 | public function serialize() { 123 | return $this->elements->serialize(); 124 | } 125 | 126 | public function unserialize($data) { 127 | $this->elements->unserialize($data); 128 | } 129 | 130 | public function __toString(){ 131 | $output = "PalmDoc Record (".$this->getByteLength()." 
bytes):\n"; 132 | $output .= $this->elements; 133 | return $output; 134 | } 135 | } 136 | ?> 137 | -------------------------------------------------------------------------------- /MOBIClass/Prc.php: -------------------------------------------------------------------------------- 1 | new FileString(32), 12 | "attributes"=>new FileShort(), 13 | "version"=>new FileShort(), 14 | "creationTime"=>new FileDate(), 15 | "modificationTime"=>new FileDate(), 16 | "backupTime"=>new FileDate(), 17 | "modificationNumber"=>new FileInt(), 18 | "appInfoID"=>new FileInt(), 19 | "sortInfoID"=>new FileInt(), 20 | "prcType"=>new FileString(4), 21 | "creator"=>new FileString(4), 22 | "uniqueIDSeed"=>new FileInt(), 23 | "nextRecordListID"=>new FileInt(), 24 | "numberRecords"=>new FileShort(), 25 | "recordList"=>new FileElement(), 26 | "filler"=>new FileShort(), 27 | "records"=>new FileElement() 28 | )); 29 | 30 | //Set values from the info block 31 | foreach($this->elements as $name => $val){ 32 | if($settings->exists($name)){ 33 | $this->get($name)->set($settings->get($name)); 34 | } 35 | } 36 | 37 | $this->get("numberRecords")->set(sizeof($records)); 38 | 39 | $i = 0; 40 | foreach($records as $record){ 41 | $offset = new FileInt(); 42 | $attr = new FileByte(); 43 | $uniqueID = new FileTri($i); 44 | 45 | $this->elements["recordList"]->add("Rec".$i, new FileElement(array( 46 | "offset"=>$offset, 47 | "attribute"=>$attr, 48 | "uniqueID"=>$uniqueID 49 | ))); 50 | 51 | $this->elements["records"]->add("Rec".$i, $record); 52 | $i++; 53 | } 54 | 55 | $this->updateOffsets($records); 56 | } 57 | 58 | public function getByteLength(){ 59 | throw new Exception("Test"); 60 | } 61 | 62 | public function updateOffsets($records){ 63 | $base = $this->offsetToEntry("records"); 64 | 65 | $i = 0; 66 | 67 | foreach($records as $record){ 68 | $el = $this->elements["recordList"]->get("Rec".$i); 69 | 70 | $local = $this->elements["records"]->offsetToEntry("Rec".$i); 71 | 72 | 
$el->get("offset")->set($base+$local); 73 | 74 | $i++; 75 | } 76 | } 77 | 78 | public function save($file){ 79 | $handle = fopen($file, "wb"); /* binary mode: the serialized PRC is not text */ 80 | fwrite($handle, $this->serialize()); 81 | fclose($handle); 82 | } 83 | 84 | public function output(){ 85 | echo $this->serialize(); 86 | } 87 | 88 | public function __toString(){ 89 | $output = "Prc (".$this->getByteLength()." bytes): {\n"; 90 | foreach($this->elements as $key=>$value){ 91 | $output .= "\t".$key.": ".$value."\n"; 92 | } 93 | $output .= "}"; 94 | return $output; 95 | } 96 | } 97 | ?> 98 | -------------------------------------------------------------------------------- /MOBIClass/PreprocessedArticle.php: -------------------------------------------------------------------------------- 1 | text = $textData; 16 | $this->metadata = $metadata; 17 | 18 | $this->images = $this->downloadImages($imageLinks); 19 | } 20 | 21 | /** 22 | * Create a Preprocessed article from a json string 23 | * @param string $json JSON data. Should be of the following format: 24 | * {"text": "TEXT", "images": ["imageURL1", "imageURL2"], "metadata": {"key": "value"}} 25 | * 26 | * Note: Any image tags should have the recindex attribute set to the appropriate index (the 27 | * same index as the image in the array) 28 | * @return PreprocessedArticle The generated preprocessed article 29 | */ 30 | static public function CreateFromJson($json){ 31 | /* Decode to associative arrays: the fields are read with array syntax below, which fails on the stdClass objects json_decode returns by default. */ $data = json_decode($json, true); if(!is_array($data)) throw new Exception("Invalid JSON article data"); 32 | return new PreprocessedArticle($data["text"], $data["images"], $data["metadata"]); 33 | } 34 | 35 | /** 36 | * Get the text data to be integrated in the MOBI file 37 | * @return string 38 | */ 39 | public function getTextData(){ 40 | return $this->text; 41 | } 42 | /** 43 | * Get the images (an array containing the jpeg data). Array entry 0 will 44 | * correspond to image record 0. 45 | * @return array 46 | */ 47 | public function getImages(){ 48 | return $this->images; 49 | } 50 | /** 51 | * Get the metadata in the form of a hashtable (for example, title or author). 
52 | * @return array 53 | */ 54 | public function getMetaData(){ 55 | return $this->metadata; 56 | } 57 | /** 58 | * 59 | * @param DOMElement $dom 60 | * @return array 61 | */ 62 | private function downloadImages($links){ 63 | $images = array(); 64 | foreach($links as $link) { 65 | $imgFile = @imagecreatefromstring(Http::Request($link)); 66 | 67 | if($imgFile === false){ 68 | $imgFile = @imagecreate(1, 1); 69 | $black = @imagecolorallocate($imgFile, 255, 255, 255); 70 | } 71 | if($imgFile !== false){ 72 | @imagefilter($imgFile, IMG_FILTER_GRAYSCALE); 73 | 74 | ob_start(); 75 | @imagejpeg($imgFile); 76 | $image = ob_get_contents(); 77 | ob_end_clean(); 78 | 79 | $images[$this->imgCounter] = new FileRecord(new Record($image)); 80 | imagedestroy($imgFile); 81 | 82 | $this->imgCounter++; 83 | } 84 | } 85 | 86 | return $images; 87 | } 88 | } 89 | ?> 90 | -------------------------------------------------------------------------------- /MOBIClass/RecognizeURL.php: -------------------------------------------------------------------------------- 1 | 17 | -------------------------------------------------------------------------------- /MOBIClass/Record.php: -------------------------------------------------------------------------------- 1 | data = $data; 27 | if($length >= 0){ 28 | $this->length = $length; 29 | }else{ 30 | $this->length = strlen($data); 31 | } 32 | } 33 | 34 | public function compress($compression_method){ 35 | switch($compression_method){ 36 | case NO_COMPRESSION: 37 | //Finished! 
break;
            case PALMDOC_COMPRESSION:
                throw new Exception("Not implemented yet");
                break;
            case HUFF:
                throw new Exception("Not implemented yet");
                break;
            default:
                throw new Exception("Invalid argument");
        }
    }

    /**
     * Get the length of the record; returns the same value as getLength().
     * @return int
     */
    public function getByteLength(){
        return $this->getLength();
    }

    /**
     * Get the length of the record
     * @return int Length of the data
     */
    public function getLength(){
        return $this->length;
    }

    /**
     * Get the data contained in the record
     * @return string Data contained in the record
     */
    public function get(){
        return $this->data;
    }

    /**
     * Set the data contained in the record.
     * NOTE(review): the stored length is not updated here - confirm that is intended.
     * @param string $value Data contained in the record
     */
    public function set($value){
        $this->data = $value;
    }

    /**
     * @return string The data contained in the record
     */
    public function serialize(){
        return $this->data;
    }

    /**
     * Re-initialise this record from the given data.
     * @param string $data
     */
    public function unserialize($data){
        // Must be called on $this: a bare __construct() is looked up as a global
        // function, which does not exist, and aborts with a fatal error.
        $this->__construct($data);
    }

    /**
     * Debug representation: the HTML-escaped data, cut to 100 bytes plus "..."
     * when it is longer than 103 bytes.
     * @return string
     */
    public function __toString() {
        $toShow = $this->data;
        if(strlen($this->data) > 103){
            $toShow = substr($this->data, 0, 100)."...";
        }
        $out = "Record: {\n";
        $out .= "\t".htmlspecialchars($toShow)."\n";
        $out .= "}";
        return $out;
    }
}
?>
--------------------------------------------------------------------------------
/MOBIClass/RecordFactory.php:
--------------------------------------------------------------------------------
settings = $settings;
    }

    /**
     * Create records from a data string.
     *
     * The data is cut into chunks of at most "recordSize" bytes without splitting
     * a multi-byte character; each chunk becomes one Record, which is then passed
     * through compress() with the configured "compression" setting.
     *
     * @param string $data
     * @return array(Record)
     */
    public function createRecords($data){
        $size = $this->settings->get("recordSize");
        $compression = $this->settings->get("compression");

        $records = array();
        foreach(mobi_str_split_bytes($data, $size) as $i => $chunk){
            $records[$i] = new Record($chunk);
            $records[$i]->compress($compression);
        }

        return $records;
    }

    /**
     * Create the end-of-file record.
     * NOTE(review): the marker is passed to Record as an integer, not as a
     * 4-byte string - confirm how Record is expected to serialize it.
     * @return Record
     */
    public function createEOFRecord(){
        return new Record(0xe98e0d0a);
    }

    /**
     * Create the FCIS record.
     * @param int $textLength Value written into the fifth field of the record
     * @return Record
     */
    public function createFCISRecord($textLength){
        $r = "FCIS";
        $r .= $this->asString(20, 4);
        $r .= $this->asString(16, 4);
        $r .= $this->asString(1, 4);
        $r .= $this->asString(0, 4);
        $r .= $this->asString($textLength, 4);
        $r .= $this->asString(0, 4);
        $r .= $this->asString(32, 4);
        $r .= $this->asString(8, 4);
        $r .= $this->asString(1, 2);
        $r .= $this->asString(1, 2);
        $r .= $this->asString(0, 4);
        return new Record($r);
    }

    /**
     * Create the FLIS record.
     * @return Record
     */
    public function createFLISRecord(){
        $r = "FLIS";
        $r .= $this->asString(8, 4);
        $r .= $this->asString(65, 2);
        $r .= $this->asString(0, 2);
        $r .= $this->asString(0, 4);
        $r .= $this->asString(-1, 4);
        $r .= $this->asString(1, 2);
        $r .= $this->asString(3, 2);
        $r .= $this->asString(3, 4);
        $r .= $this->asString(1, 4);
        $r .= $this->asString(-1, 4);
        return new Record($r);
    }

    /**
     * Render the lowest $size bytes of $int as two-digit lowercase hex pairs,
     * most significant byte first, separated by single spaces (e.g. "00 00 00 14").
     * @param int $int Value to render
     * @param int $size Number of bytes to render
     * @return string
     */
    private function asString($int, $size){
        $bytes = array();
        for($i = 0; $i < $size; $i++){
            // Take the lowest byte and put it in front of the ones already taken.
            array_unshift($bytes, sprintf("%02x", $int & 0xFF));
            $int = $int >> 8;
        }
        return implode(" ", $bytes);
    }

    public function __toString() {
        $out = "Record Factory: {\n";
        $out .= "\tRecord Size: ".$this->settings->get("recordSize")."\n";
        $out .= "\tCompression: ".$this->settings->get("compression")."\n";
        $out .= "}";
        return $out;
    }
}

/**
 * Split string in chunks of at most split_length bytes, while respecting multi-byte
 * character boundaries.
 *
 * Backward-compatible alias, only declared when the name is still free. Since
 * PHP 7.4 mbstring provides its own mb_str_split() (which counts characters, not
 * bytes), and declaring a second function with that name is a fatal error. The
 * byte-based implementation therefore lives in mobi_str_split_bytes().
 */
if (!function_exists('mb_str_split')) {
    function mb_str_split($string, $split_length = 1){
        return mobi_str_split_bytes($string, $split_length);
    }
}

/**
 * Split a UTF-8 string in chunks of at most $split_length bytes, using mb_strcut()
 * so that a chunk never ends in the middle of a multi-byte character.
 *
 * Side effect: sets the mbstring internal and regex encodings to UTF-8.
 *
 * @param string $string String to split
 * @param int $split_length Maximum chunk size in bytes; values <= 0 are treated as 1
 * @return array The chunks, in order; the whole string as single entry when it fits
 * @throws Exception If mb_strcut() returns an empty chunk, i.e. no progress can be
 *                   made (presumably when the next character is wider than $split_length)
 */
function mobi_str_split_bytes($string, $split_length = 1){
    mb_internal_encoding('UTF-8');
    mb_regex_encoding('UTF-8');

    $split_length = ($split_length <= 0) ? 1 : $split_length;
    $bytes = strlen($string);

    if ($split_length >= $bytes) {
        return array($string);
    }

    $chunks = array();
    $offset = 0;
    while ($offset < $bytes) {
        $chunk = mb_strcut($string, $offset, $split_length);
        $chunk_bytes = strlen($chunk);

        if ($chunk_bytes == 0) {
            throw new Exception('Infinite loop in string split detected.');
        }
        $chunks[] = $chunk;
        $offset += $chunk_bytes;
    }

    return $chunks;
}
?>
--------------------------------------------------------------------------------
/MOBIClass/Settings.php:
--------------------------------------------------------------------------------
values = array(
            "attributes"=>0,
            "version"=>0,
            "creationTime"=>time()+94694400,
            "modificationTime"=>time()+94694400,
            "backupTime"=>0,
            "modificationNumber"=>0,
            "appInfoID"=>0,
            "sortInfoID"=>0,
            "prcType"=>"BOOK",
            "creator"=>"MOBI",
            "uniqueIDSeed"=>rand(),
            "nextRecordListID"=>0,
            "recordAttributes"=>0,
            "compression"=>NO_COMPRESSION,
            "recordSize"=>RECORD_SIZE,
            "encryptionType"=>NO_ENCRYPTION,
            "mobiIdentifier"=>"MOBI",
            "mobiHeaderLength"=>0xe8,
            "mobiType"=>MOBIPOCKET_BOOK,
            "textEncoding"=>UTF8,
            "uniqueID"=>rand(),
            "fileVersion"=>6,
            "locale"=>0x09,
            "inputLanguage"=>0,
            "outputLanguage"=>0,
            "minimumVersion"=>6,
            "huffmanRecordOffset"=>0,
            "huffmanRecordCount"=>0,
            "exthFlags"=>0x40,
            "drmOffset"=>0xFFFFFFFF,
            "drmCount"=>0,
            "drmSize"=>0,
            "drmFlags"=>0,
            "extraDataFlags"=>0,
            "exthIdentifier"=>"EXTH",
            // These can be changed without any risk
            "title"=>"Unknown title",
            "author"=>"Unknown author",
            "subject"=>"Unknown subject"
        );

        foreach($additionalSettings as $key=>$value){
            $this->values[$key] = $value;
        }
    }

    /**
     * Get a value from the settings
     * @param string $key Key of the setting
     * @return mixed The value of the setting, or null when the key is not set
     */
    public function get($key){
        // Explicit lookup so an unknown key yields null without an undefined-index notice.
        return isset($this->values[$key]) ? $this->values[$key] : null;
    }

    /**
     * Checks if a value is set
     * @param string $key Key of the setting
     * @return bool True if the value exists
     */
    public function exists($key){
        return isset($this->values[$key]);
    }

    /**
     * Debug representation: one "key: value" line per setting.
     * @return string
     */
    public function __toString() {
        $out = "Settings: {\n";
        foreach($this->values as $key=>$value){
            $out .= "\t".$key.": ".$value."\n";
        }
        $out .= "}";
        return $out;
    }
}
?>
--------------------------------------------------------------------------------
/MOBIClass/constants.php:
--------------------------------------------------------------------------------
id = intval(substr($ending, 0, strpos($ending, "/")));

        for($i = 1; $i <= max(1, $this->chapterCount); $i++){
            $this->addChapter($i);
        }
    }

    private function addChapter($n){
        $doc = new DOMDocument();
        $file = Http::Request(self::$prefix.$this->id."/".$n."/");
        @$doc->loadHTML($file) or die($file);

        if(!$this->downloadedMetadata){
            $this->loadMetadata($doc);
            $this->downloadedMetadata = true;
        }
        if($this->chapterCount < 0){
            $this->chapterCount = $this->getNumberChapters($doc);

            if($this->chapterCount > 4){
                die("Too many files to download, don't use php for this!");
            }
        }

        $textEl = $doc->getElementById("storytext");
        if($textEl == null) die("Error: ".$doc->saveHTML());
        $horizontalRulebars = $doc->getElementsByTagName('hr');
        /**
         * @var DOMNode
         */
        $hr;
        foreach($horizontalRulebars as $hr) {
$hr->setAttribute("size", "");
            // Empty strings instead of null: setAttribute() takes a string value, and
            // passing null to it is deprecated in recent PHP versions.
            $hr->setAttribute("noshade", "");
        }
        $text = $this->innerHtml($textEl);

        // The chapter title is the text after "N. " in the chapter drop-down.
        $title = "";
        $selects = $doc->getElementsByTagName('select');
        foreach($selects as $select) {
            if($select->hasAttribute("name") && $select->getAttribute("name") == "chapter"){
                $options = $select->getElementsByTagName("option");

                $test = $n.". ";
                foreach($options as $option){
                    $val = $option->nodeValue;
                    if(substr($val, 0, strlen($test)) == $test){
                        $title = substr($val, strlen($test));
                        break;
                    }
                }
                break;
            }
        }
        $this->addPage($text, $title);
    }

    /**
     * Count the chapters by counting the options of the "chapter" drop-down.
     * @param DOMDocument $doc Parsed chapter page
     * @return int Number of options in the first select named "chapter";
     *             1 when the page has no such select
     */
    private function getNumberChapters($doc){
        $selects = $doc->getElementsByTagName('select');
        foreach($selects as $select) {
            if($select->hasAttribute("name") && $select->getAttribute("name") == "chapter"){
                return $select->getElementsByTagName("option")->length;
            }
        }

        // No chapter selector found: report a single chapter instead of
        // falling off the end of the function and returning null.
        return 1;
    }

    /**
     * Extract author and title from the page and store them as metadata.
     * @param DOMDocument $doc Parsed chapter page
     */
    private function loadMetadata($doc){
        //Author: text of a link whose href starts with "/u/" (the last match wins)
        $links = $doc->getElementsByTagName('a');
        foreach($links as $link) {
            if($link->hasAttribute("href") && substr($link->getAttribute("href"), 0, 3) == "/u/"){
                $this->setMetadata("author", $link->nodeValue);
            }
        }
        //Title
        /*
        $links = $doc->getElementsByTagName('link');
        foreach($links as $link) {
            if($link->hasAttribute("rel") && $link->getAttribute("rel") == "canonical"){
                $url = $link->getAttribute("href");
                $title = str_replace("_", " ", substr($url, strrpos($url, "/")+1));
                $this->setMetadata("title", $title);
            }
        }*/

        //TODO: Find a more reliable way to extract the title
        // item(0) is null when the page has no <b> element; leave the title unset then.
        $firstBold = $doc->getElementsByTagName("b")->item(0);
        if($firstBold !== null){
            $this->setMetadata("title", $firstBold->nodeValue);
        }
    }

    private function innerHtml($node){
        $doc = new
DOMDocument(); 114 | foreach ($node->childNodes as $child) 115 | $doc->appendChild($doc->importNode($child, true)); 116 | 117 | return $doc->saveHTML(); 118 | } 119 | 120 | public static function Matches($url){ 121 | //TODO: Implement with regex 122 | return strpos($url, self::$prefix) !== false; 123 | } 124 | } 125 | ?> 126 | -------------------------------------------------------------------------------- /MOBIClass/http_build_url.php: -------------------------------------------------------------------------------- 1 | registerNodeClass('DOMElement', 'JSLikeHTMLElement'); 16 | * $doc->loadHTML('

<div><p>Para 1</p><p>Para 2</p></div>

'); 17 | * $elem = $doc->getElementsByTagName('div')->item(0); 18 | * 19 | * // print innerHTML 20 | * echo $elem->innerHTML; // prints '

<p>Para 1</p><p>Para 2</p>

' 21 | * echo "\n\n"; 22 | * 23 | * // set innerHTML 24 | * $elem->innerHTML = 'FiveFilters.org'; 25 | * echo $elem->innerHTML; // prints 'FiveFilters.org' 26 | * echo "\n\n"; 27 | * 28 | * // print document (with our changes) 29 | * echo $doc->saveXML(); 30 | * @endcode 31 | * 32 | * @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net 33 | * @see http://fivefilters.org (the project this was written for) 34 | */ 35 | class JSLikeHTMLElement extends DOMElement 36 | { 37 | /** 38 | * Used for setting innerHTML like it's done in JavaScript: 39 | * @code 40 | * $div->innerHTML = '

Chapter 2

The story begins...

'; 41 | * @endcode 42 | */ 43 | public function __set($name, $value) { 44 | if ($name == 'innerHTML') { 45 | // first, empty the element 46 | for ($x=$this->childNodes->length-1; $x>=0; $x--) { 47 | $this->removeChild($this->childNodes->item($x)); 48 | } 49 | // $value holds our new inner HTML 50 | if ($value != '') { 51 | $f = $this->ownerDocument->createDocumentFragment(); 52 | // appendXML() expects well-formed markup (XHTML) 53 | $result = @$f->appendXML($value); // @ to suppress PHP warnings 54 | if ($result) { 55 | if ($f->hasChildNodes()) $this->appendChild($f); 56 | } else { 57 | // $value is probably ill-formed 58 | $f = new DOMDocument(); 59 | $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8'); 60 | // Using will generate a warning, but so will bad HTML 61 | // (and by this point, bad HTML is what we've got). 62 | // We use it (and suppress the warning) because an HTML fragment will 63 | // be wrapped around tags which we don't really want to keep. 64 | // Note: despite the warning, if loadHTML succeeds it will return true. 65 | $result = @$f->loadHTML(''.$value.''); 66 | if ($result) { 67 | $import = $f->getElementsByTagName('htmlfragment')->item(0); 68 | foreach ($import->childNodes as $child) { 69 | $importedNode = $this->ownerDocument->importNode($child, true); 70 | $this->appendChild($importedNode); 71 | } 72 | } else { 73 | // oh well, we tried, we really did. :( 74 | // this element is now empty 75 | } 76 | } 77 | } 78 | } else { 79 | $trace = debug_backtrace(); 80 | trigger_error('Undefined property via __set(): '.$name.' in '.$trace[0]['file'].' 
on line '.$trace[0]['line'], E_USER_NOTICE); 81 | } 82 | } 83 | 84 | /** 85 | * Used for getting innerHTML like it's done in JavaScript: 86 | * @code 87 | * $string = $div->innerHTML; 88 | * @endcode 89 | */ 90 | public function __get($name) 91 | { 92 | if ($name == 'innerHTML') { 93 | $inner = ''; 94 | foreach ($this->childNodes as $child) { 95 | $inner .= $this->ownerDocument->saveXML($child); 96 | } 97 | return $inner; 98 | } 99 | 100 | $trace = debug_backtrace(); 101 | trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE); 102 | return null; 103 | } 104 | 105 | public function __toString() 106 | { 107 | return '['.$this->tagName.']'; 108 | } 109 | } 110 | ?> -------------------------------------------------------------------------------- /MOBIClass/readability/Readability.php: -------------------------------------------------------------------------------- 1 | init(); 62 | echo $r->articleContent->innerHTML; 63 | */ 64 | 65 | class Readability 66 | { 67 | public $version = '1.7.1-without-multi-page'; 68 | public $convertLinksToFootnotes = false; 69 | public $revertForcedParagraphElements = true; 70 | public $articleTitle; 71 | public $articleContent; 72 | public $dom; 73 | public $url = null; // optional - URL where HTML was retrieved 74 | public $debug = false; 75 | protected $body = null; // 76 | protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later 77 | protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. 78 | protected $success = false; // indicates whether we were able to extract or not 79 | 80 | /** 81 | * All of the regular expressions in use within readability. 82 | * Defined up here so we don't instantiate them repeatedly in loops. 
83 | **/ 84 | public $regexps = array( 85 | 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i', 86 | 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', 87 | 'positive' => '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i', 88 | 'negative' => '/combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', 89 | 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i', 90 | 'replaceBrs' => '/(]*>[ \n\r\t]*){2,}/i', 91 | 'replaceFonts' => '/<(\/?)font[^>]*>/i', 92 | // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() 93 | 'normalize' => '/\s{2,}/', 94 | 'killBreaks' => '/((\s| ?)*){1,}/', 95 | 'video' => '/http:\/\/(www\.)?(youtube|vimeo)\.com/i', 96 | 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' 97 | ); 98 | 99 | /* constants */ 100 | const FLAG_STRIP_UNLIKELYS = 1; 101 | const FLAG_WEIGHT_CLASSES = 2; 102 | const FLAG_CLEAN_CONDITIONALLY = 4; 103 | 104 | /** 105 | * Create instance of Readability 106 | * @param string UTF-8 encoded string 107 | * @param string (optional) URL associated with HTML (used for footnotes) 108 | */ 109 | function __construct($html, $url=null) 110 | { 111 | /* Turn all double br's into p's */ 112 | /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ 113 | $html = preg_replace($this->regexps['replaceBrs'], '

', $html); 114 | $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); 115 | $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); 116 | $this->dom = new DOMDocument(); 117 | $this->dom->preserveWhiteSpace = false; 118 | $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); 119 | @$this->dom->loadHTML($html); 120 | $this->url = $url; 121 | } 122 | 123 | /** 124 | * Get article title element 125 | * @return DOMElement 126 | */ 127 | public function getTitle() { 128 | return $this->articleTitle; 129 | } 130 | 131 | /** 132 | * Get article content element 133 | * @return DOMElement 134 | */ 135 | public function getContent() { 136 | return $this->articleContent; 137 | } 138 | 139 | /** 140 | * Runs readability. 141 | * 142 | * Workflow: 143 | * 1. Prep the document by removing script tags, css, etc. 144 | * 2. Build readability's DOM tree. 145 | * 3. Grab the article content from the current dom tree. 146 | * 4. Replace the current DOM tree with the new one. 147 | * 5. Read peacefully. 
148 | * 149 | * @return boolean true if we found content, false otherwise 150 | **/ 151 | public function init() 152 | { 153 | $this->removeScripts($this->dom); 154 | 155 | // Assume successful outcome 156 | $this->success = true; 157 | 158 | $bodyElems = $this->dom->getElementsByTagName('body'); 159 | if ($bodyElems->length > 0) { 160 | if ($this->bodyCache == null) { 161 | $this->bodyCache = $bodyElems->item(0)->innerHTML; 162 | } 163 | if ($this->body == null) { 164 | $this->body = $bodyElems->item(0); 165 | } 166 | } 167 | 168 | $this->prepDocument(); 169 | 170 | //die($this->dom->documentElement->parentNode->nodeType); 171 | //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); 172 | //die($this->getInnerHTML($this->dom->documentElement)); 173 | 174 | /* Build readability's DOM tree */ 175 | $overlay = $this->dom->createElement('div'); 176 | $innerDiv = $this->dom->createElement('div'); 177 | $articleTitle = $this->getArticleTitle(); 178 | $articleContent = $this->grabArticle(); 179 | 180 | if (!$articleContent) { 181 | $this->success = false; 182 | $articleContent = $this->dom->createElement('div'); 183 | $articleContent->setAttribute('id', 'readability-content'); 184 | $articleContent->innerHTML = '

Sorry, Readability was unable to parse this page for content.

'; 185 | } 186 | 187 | $overlay->setAttribute('id', 'readOverlay'); 188 | $innerDiv->setAttribute('id', 'readInner'); 189 | 190 | /* Glue the structure of our document together. */ 191 | $innerDiv->appendChild($articleTitle); 192 | $innerDiv->appendChild($articleContent); 193 | $overlay->appendChild($innerDiv); 194 | 195 | /* Clear the old HTML, insert the new content. */ 196 | $this->body->innerHTML = ''; 197 | $this->body->appendChild($overlay); 198 | //document.body.insertBefore(overlay, document.body.firstChild); 199 | $this->body->removeAttribute('style'); 200 | 201 | $this->postProcessContent($articleContent); 202 | 203 | // Set title and content instance variables 204 | $this->articleTitle = $articleTitle; 205 | $this->articleContent = $articleContent; 206 | 207 | return $this->success; 208 | } 209 | 210 | /** 211 | * Debug 212 | */ 213 | protected function dbg($msg) { 214 | if ($this->debug) echo '* ',$msg, '
', "\n"; 215 | } 216 | 217 | /** 218 | * Run any post-process modifications to article content as necessary. 219 | * 220 | * @param DOMElement 221 | * @return void 222 | */ 223 | public function postProcessContent($articleContent) { 224 | if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { 225 | $this->addFootnotes($articleContent); 226 | } 227 | } 228 | 229 | /** 230 | * Get the article title as an H1. 231 | * 232 | * @return DOMElement 233 | */ 234 | protected function getArticleTitle() { 235 | $curTitle = ''; 236 | $origTitle = ''; 237 | 238 | try { 239 | $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); 240 | } catch(Exception $e) {} 241 | 242 | if (preg_match('/ [\|\-] /', $curTitle)) 243 | { 244 | $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); 245 | 246 | if (count(explode(' ', $curTitle)) < 3) { 247 | $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); 248 | } 249 | } 250 | else if (strpos($curTitle, ': ') !== false) 251 | { 252 | $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); 253 | 254 | if (count(explode(' ', $curTitle)) < 3) { 255 | $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle); 256 | } 257 | } 258 | else if(strlen($curTitle) > 150 || strlen($curTitle) < 15) 259 | { 260 | $hOnes = $this->dom->getElementsByTagName('h1'); 261 | if($hOnes->length == 1) 262 | { 263 | $curTitle = $this->getInnerText($hOnes->item(0)); 264 | } 265 | } 266 | 267 | $curTitle = trim($curTitle); 268 | 269 | if (count(explode(' ', $curTitle)) <= 4) { 270 | $curTitle = $origTitle; 271 | } 272 | 273 | $articleTitle = $this->dom->createElement('h1'); 274 | $articleTitle->innerHTML = $curTitle; 275 | 276 | return $articleTitle; 277 | } 278 | 279 | /** 280 | * Prepare the HTML document for readability to scrape it. 281 | * This includes things like stripping javascript, CSS, and handling terrible markup. 
282 | * 283 | * @return void 284 | **/ 285 | protected function prepDocument() { 286 | /** 287 | * In some cases a body element can't be found (if the HTML is totally hosed for example) 288 | * so we create a new body node and append it to the document. 289 | */ 290 | if($this->dom->documentElement == null){ 291 | throw new Exception("No document element"); 292 | } 293 | if ($this->body == null) 294 | { 295 | $this->body = $this->dom->createElement('body'); 296 | $this->dom->documentElement->appendChild($this->body); 297 | } 298 | 299 | $this->body->setAttribute('id', 'readabilityBody'); 300 | 301 | /* Remove all style tags in head */ 302 | $styleTags = $this->dom->getElementsByTagName('style'); 303 | for ($i = $styleTags->length-1; $i >= 0; $i--) 304 | { 305 | $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); 306 | } 307 | 308 | /* Turn all double br's into p's */ 309 | /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ 310 | //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '

').replace(readability.regexps.replaceFonts, '<$1span>'); 311 | // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree. 312 | // Manipulating innerHTML as it's done in JS is not possible in PHP. 313 | } 314 | 315 | /** 316 | * For easier reading, convert this document to have footnotes at the bottom rather than inline links. 317 | * @see http://www.roughtype.com/archives/2010/05/experiments_in.php 318 | * 319 | * @return void 320 | **/ 321 | public function addFootnotes($articleContent) { 322 | $footnotesWrapper = $this->dom->createElement('div'); 323 | $footnotesWrapper->setAttribute('id', 'readability-footnotes'); 324 | $footnotesWrapper->innerHTML = '

References

'; 325 | 326 | $articleFootnotes = $this->dom->createElement('ol'); 327 | $articleFootnotes->setAttribute('id', 'readability-footnotes-list'); 328 | $footnotesWrapper->appendChild($articleFootnotes); 329 | 330 | $articleLinks = $articleContent->getElementsByTagName('a'); 331 | 332 | $linkCount = 0; 333 | for ($i = 0; $i < $articleLinks->length; $i++) 334 | { 335 | $articleLink = $articleLinks->item($i); 336 | $footnoteLink = $articleLink->cloneNode(true); 337 | $refLink = $this->dom->createElement('a'); 338 | $footnote = $this->dom->createElement('li'); 339 | $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST); 340 | if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST); 341 | //linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host, 342 | $linkText = $this->getInnerText($articleLink); 343 | 344 | if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { 345 | continue; 346 | } 347 | 348 | $linkCount++; 349 | 350 | /** Add a superscript reference after the article link */ 351 | $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount); 352 | $refLink->innerHTML = '[' . $linkCount . ']'; 353 | $refLink->setAttribute('class', 'readability-DoNotFootnote'); 354 | $refLink->setAttribute('style', 'color: inherit;'); 355 | 356 | //TODO: does this work or should we use DOMNode.isSameNode()? 357 | if ($articleLink->parentNode->lastChild == $articleLink) { 358 | $articleLink->parentNode->appendChild($refLink); 359 | } else { 360 | $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling); 361 | } 362 | 363 | $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); 364 | $articleLink->setAttribute('name', 'readabilityLink-' . 
$linkCount); 365 | 366 | $footnote->innerHTML = '^ '; 367 | 368 | $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); 369 | $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); 370 | 371 | $footnote->appendChild($footnoteLink); 372 | if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . ' (' . $linkDomain . ')'; 373 | 374 | $articleFootnotes->appendChild($footnote); 375 | } 376 | 377 | if ($linkCount > 0) { 378 | $articleContent->appendChild($footnotesWrapper); 379 | } 380 | } 381 | 382 | /** 383 | * Reverts P elements with class 'readability-styled' 384 | * to text nodes - which is what they were before. 385 | * 386 | * @param DOMElement 387 | * @return void 388 | */ 389 | function revertReadabilityStyledElements($articleContent) { 390 | $xpath = new DOMXPath($articleContent->ownerDocument); 391 | $elems = $xpath->query('.//p[@class="readability-styled"]', $articleContent); 392 | //$elems = $articleContent->getElementsByTagName('p'); 393 | for ($i = $elems->length-1; $i >= 0; $i--) { 394 | $e = $elems->item($i); 395 | $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); 396 | //if ($e->hasAttribute('class') && $e->getAttribute('class') == 'readability-styled') { 397 | // $e->parentNode->replaceChild($this->dom->createTextNode($e->textContent), $e); 398 | //} 399 | } 400 | } 401 | 402 | /** 403 | * Prepare the article node for display. Clean out any inline styles, 404 | * iframes, forms, strip extraneous

tags, etc. 405 | * 406 | * @param DOMElement 407 | * @return void 408 | */ 409 | function prepArticle($articleContent) { 410 | $this->cleanStyles($articleContent); 411 | $this->killBreaks($articleContent); 412 | if ($this->revertForcedParagraphElements) { 413 | $this->revertReadabilityStyledElements($articleContent); 414 | } 415 | 416 | /* Clean out junk from the article content */ 417 | $this->cleanConditionally($articleContent, 'form'); 418 | $this->clean($articleContent, 'object'); 419 | $this->clean($articleContent, 'h1'); 420 | 421 | /** 422 | * If there is only one h2, they are probably using it 423 | * as a header and not a subheader, so remove it since we already have a header. 424 | ***/ 425 | if ($articleContent->getElementsByTagName('h2')->length == 1) { 426 | $this->clean($articleContent, 'h2'); 427 | } 428 | $this->clean($articleContent, 'iframe'); 429 | 430 | $this->cleanHeaders($articleContent); 431 | 432 | /* Do these last as the previous stuff may have removed junk that will affect these */ 433 | $this->cleanConditionally($articleContent, 'table'); 434 | $this->cleanConditionally($articleContent, 'ul'); 435 | $this->cleanConditionally($articleContent, 'div'); 436 | 437 | /* Remove extra paragraphs */ 438 | $articleParagraphs = $articleContent->getElementsByTagName('p'); 439 | for ($i = $articleParagraphs->length-1; $i >= 0; $i--) 440 | { 441 | $imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length; 442 | $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length; 443 | $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length; 444 | 445 | if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '') 446 | { 447 | $articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i)); 448 | } 449 | } 450 | 451 | try { 452 | $articleContent->innerHTML = preg_replace('/]*>\s*

innerHTML); 453 | //articleContent.innerHTML = articleContent.innerHTML.replace(/]*>\s*

dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
        }
    }

    /**
     * Initialize a node with the readability object. Also checks the
     * className/id for special names to add to its score.
     *
     * The score is stored on the element itself as a `readability` attribute,
     * starting at 0, adjusted by a fixed per-tag bonus/penalty and then by
     * getClassWeight().
     *
     * @param DOMElement $node element to initialise (mutated in place)
     * @return void
     **/
    protected function initializeNode($node) {
        $readability = $this->dom->createAttribute('readability');
        $readability->value = 0; // this is our contentScore
        $node->setAttributeNode($readability);

        switch (strtoupper($node->tagName)) { // unsure if strtoupper is needed, but using it just in case
            case 'DIV':
                $readability->value += 5;
                break;

            case 'PRE':
            case 'TD':
            case 'BLOCKQUOTE':
                $readability->value += 3;
                break;

            case 'ADDRESS':
            case 'OL':
            case 'UL':
            case 'DL':
            case 'DD':
            case 'DT':
            case 'LI':
            case 'FORM':
                $readability->value -= 3;
                break;

            case 'H1':
            case 'H2':
            case 'H3':
            case 'H4':
            case 'H5':
            case 'H6':
            case 'TH':
                $readability->value -= 5;
                break;
        }
        $readability->value += $this->getClassWeight($node);
    }

    /***
     * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
     *               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
     *
     * Steps: (1) prune unlikely nodes and normalise <div>s, (2) score paragraphs and credit their
     * parent/grandparent, (3) pick the best candidate, (4) gather related siblings, (5) clean up via
     * prepArticle(). If fewer than 250 characters survive, the body is restored from $this->bodyCache
     * and the method re-runs with one more flag switched off.
     *
     * @param DOMDocument|DOMElement|null $page subtree to analyse; defaults to $this->dom
     * @return DOMElement|false the wrapped article, or false once every flag has been exhausted
     **/
    protected function grabArticle($page=null) {
        $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
        if (!$page) $page = $this->dom;
        $allElements = $page->getElementsByTagName('*');
        /**
         * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
         * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
         *
         * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
         * TODO: Shouldn't this be a reverse traversal?
         **/
        $node = null;
        $nodesToScore = array();
        for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
            $tagName = strtoupper($node->tagName);
            /* Remove unlikely candidates */
            if ($stripUnlikelyCandidates) {
                $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
                if (
                    preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
                    !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
                    $tagName != 'BODY'
                )
                {
                    $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
                    $node->parentNode->removeChild($node);
                    // The node list is live: the next element has slid into this index.
                    $nodeIndex--;
                    continue;
                }
            }

            if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
                $nodesToScore[] = $node;
            }

            /* Turn all divs that don't have children block level elements into p's */
            if ($tagName == 'DIV') {
                if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
                    $newNode = $this->dom->createElement('p');
                    try {
                        $newNode->innerHTML = $node->innerHTML;
                        $node->parentNode->replaceChild($newNode, $node);
                        // Revisit this index: the new <p> now sits here and is queued for
                        // scoring by the 'P' check above on the next pass. The replaced
                        // <div> is detached (no parent) and would only be skipped by the
                        // scoring loop, so it is no longer queued.
                        $nodeIndex--;
                    }
                    catch(Exception $e) {
                        $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
                    }
                }
                else
                {
                    /* EXPERIMENTAL */
                    // TODO: change these p elements back to text nodes after processing
                    for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
                        $childNode = $node->childNodes->item($i);
                        if ($childNode->nodeType == 3) { // XML_TEXT_NODE
                            $p = $this->dom->createElement('p');
                            // nodeValue is already entity-decoded text. Insert it as a text
                            // node instead of re-parsing it as markup, so "<", "&" or an
                            // escaped tag in the text cannot be mangled or become an element.
                            $p->appendChild($this->dom->createTextNode($childNode->nodeValue));
                            $p->setAttribute('style', 'display: inline;');
                            $p->setAttribute('class', 'readability-styled');
                            $childNode->parentNode->replaceChild($p, $childNode);
                        }
                    }
                }
            }
        }

        /**
         * Loop through all paragraphs, and assign a score to them based on how content-y they look.
         * Then add their score to their parent node.
         *
         * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
         **/
        $candidates = array();
        for ($pt=0; $pt < count($nodesToScore); $pt++) {
            $parentNode = $nodesToScore[$pt]->parentNode;
            // Only element grandparents can carry a score (the document node cannot).
            $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
            $innerText = $this->getInnerText($nodesToScore[$pt]);

            if (!$parentNode || !isset($parentNode->tagName)) {
                continue;
            }

            /* If this paragraph is less than 25 characters, don't even count it. */
            if(strlen($innerText) < 25) {
                continue;
            }

            /* Initialize readability data for the parent. */
            if (!$parentNode->hasAttribute('readability'))
            {
                $this->initializeNode($parentNode);
                $candidates[] = $parentNode;
            }

            /* Initialize readability data for the grandparent. */
            if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
            {
                $this->initializeNode($grandParentNode);
                $candidates[] = $grandParentNode;
            }

            $contentScore = 0;

            /* Add a point for the paragraph itself as a base. */
            $contentScore++;

            /* Add points for any commas within this paragraph (segment count, i.e. commas + 1) */
            $contentScore += count(explode(',', $innerText));

            /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
            $contentScore += min(floor(strlen($innerText) / 100), 3);

            /* Add the score to the parent. The grandparent gets half. */
            $parentNode->getAttributeNode('readability')->value += $contentScore;

            if ($grandParentNode) {
                $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
            }
        }

        /**
         * After we've calculated scores, loop through all of the possible candidate nodes we found
         * and find the one with the highest score.
         **/
        $topCandidate = null;
        for ($c=0, $cl=count($candidates); $c < $cl; $c++)
        {
            /**
             * Scale the final candidates score based on link density. Good content should have a
             * relatively small link density (5% or less) and be mostly unaffected by this operation.
             **/
            $readability = $candidates[$c]->getAttributeNode('readability');
            $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));

            $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);

            // Scores are fractional (half credit, link-density scaling), so read them
            // back as floats; an int cast would truncate and mis-rank close candidates.
            if (!$topCandidate || $readability->value > (float)$topCandidate->getAttribute('readability')) {
                $topCandidate = $candidates[$c];
            }
        }

        /**
         * If we still have no top candidate, just use the body as a last resort.
         * We also have to copy the body node so it is something we can modify.
         **/
        if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
        {
            $topCandidate = $this->dom->createElement('div');
            $topCandidate->innerHTML = ($page instanceof DOMDocument) ? $page->saveXML($page->documentElement) : $page->innerHTML;
            $page->innerHTML = '';
            $page->appendChild($topCandidate);
            $this->initializeNode($topCandidate);
        }

        /**
         * Now that we have the top candidate, look through its siblings for content that might also be related.
         * Things like preambles, content split by ads that we removed, etc.
         **/
        $articleContent = $this->dom->createElement('div');
        $articleContent->setAttribute('id', 'readability-content');
        $topScore = (float)$topCandidate->getAttribute('readability');
        $siblingScoreThreshold = max(10, $topScore * 0.2);
        $siblingNodes = $topCandidate->parentNode->childNodes;

        for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
        {
            $siblingNode = $siblingNodes->item($s);
            $append = false;

            $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));

            if ($siblingNode === $topCandidate)
            {
                $append = true;
            }

            $contentBonus = 0;
            /* Give a bonus if sibling nodes and top candidates have the exact same classname */
            if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
                $contentBonus += $topScore * 0.2;
            }

            if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((float)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
            {
                $append = true;
            }

            if (strtoupper($siblingNode->nodeName) == 'P') {
                $linkDensity = $this->getLinkDensity($siblingNode);
                $nodeContent = $this->getInnerText($siblingNode);
                $nodeLength = strlen($nodeContent);

                if ($nodeLength > 80 && $linkDensity < 0.25)
                {
                    $append = true;
                }
                else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
                {
                    $append = true;
                }
            }

            if ($append)
            {
                $this->dbg('Appending node: ' . $siblingNode->nodeName);

                $nodeToAppend = null;
                $sibNodeName = strtoupper($siblingNode->nodeName);
                if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
                    /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */

                    $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
                    $nodeToAppend = $this->dom->createElement('div');
                    try {
                        $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
                        $nodeToAppend->innerHTML = $siblingNode->innerHTML;
                    }
                    catch(Exception $e)
                    {
                        $this->dbg('Could not alter siblingNode to div, reverting back to original.');
                        $nodeToAppend = $siblingNode;
                        // Moving the sibling itself shrinks the live list: compensate.
                        $s--;
                        $sl--;
                    }
                } else {
                    $nodeToAppend = $siblingNode;
                    // Moving the sibling itself shrinks the live list: compensate.
                    $s--;
                    $sl--;
                }

                /* To ensure a node does not interfere with readability styles, remove its classnames */
                $nodeToAppend->removeAttribute('class');

                /* Append sibling and subtract from our list because it removes the node when you append to another node */
                $articleContent->appendChild($nodeToAppend);
            }
        }

        /**
         * So we have all of the content that we need. Now we clean it up for presentation.
         **/
        $this->prepArticle($articleContent);

        /**
         * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
         * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
         * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
         * finding the -right- content.
         **/
        if (strlen($this->getInnerText($articleContent, false)) < 250)
        {
            $this->body->innerHTML = $this->bodyCache;

            if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
                $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
                return $this->grabArticle($this->body);
            }
            else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
                $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
                return $this->grabArticle($this->body);
            }
            else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
                $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
                return $this->grabArticle($this->body);
            }
            else {
                return false;
            }
        }
        return $articleContent;
    }

    /**
     * Remove script tags from document
     *
     * Walks the live node list backwards so removals do not shift the
     * indexes still to be visited.
     *
     * @param DOMElement $doc node (or document) whose <script> descendants are removed
     * @return void
     */
    public function removeScripts($doc) {
        $scripts = $doc->getElementsByTagName('script');
        for($i = $scripts->length-1; $i >= 0; $i--)
        {
            $scripts->item($i)->parentNode->removeChild($scripts->item($i));
        }
    }

    /**
     * Get the inner text of a node.
     * This also strips out any excess whitespace to be found.
     *
     * @param DOMElement $e
     * @param boolean $normalizeSpaces (default: true) apply the 'normalize' regexp, replacing each match with one space
     * @return string trimmed text content, or '' when the node has none
     **/
    public function getInnerText($e, $normalizeSpaces=true) {
        $textContent = '';

        if (!isset($e->textContent) || $e->textContent == '') {
            return '';
        }

        $textContent = trim($e->textContent);

        if ($normalizeSpaces) {
            return preg_replace($this->regexps['normalize'], ' ', $textContent);
        } else {
            return $textContent;
        }
    }

    /**
     * Get the number of times a string $s appears in the node $e.
     *
     * @param DOMElement $e
     * @param string - what to count.
Default is ","
     * @return number (integer)
     **/
    public function getCharCount($e, $s=',') {
        // Count on the whitespace-normalised text, not the raw markup.
        $text = $this->getInnerText($e);

        return substr_count($text, $s);
    }

    /**
     * Strip the inline style attribute from every element beneath $e.
     * ($e itself is not part of the descendant list and is left untouched.)
     *
     * @param DOMElement $e
     * @return void
     */
    public function cleanStyles($e) {
        $descendants = $e->getElementsByTagName('*');
        for ($idx = 0, $total = $descendants->length; $idx < $total; $idx++) {
            $descendants->item($idx)->removeAttribute('style');
        }
    }

    /**
     * Link density: the share of a node's text that sits inside <a> elements.
     * Computed as (summed length of all link text) / (length of the node's text).
     *
     * @param DOMElement $e
     * @return number (float) the ratio, or 0 when the node has no text
     */
    public function getLinkDensity($e) {
        $totalLength = strlen($this->getInnerText($e));

        $anchorLength = 0;
        foreach ($e->getElementsByTagName('a') as $anchor) {
            $anchorLength += strlen($this->getInnerText($anchor));
        }

        // Guard the division: an empty node has no density.
        return ($totalLength > 0) ? $anchorLength / $totalLength : 0;
    }

    /**
     * Get an elements class/id weight. Uses regular expressions to tell if this
     * element looks good or bad.
*
     * Each of the class and id attributes contributes -25 when it matches the
     * 'negative' regexp and +25 when it matches the 'positive' regexp.
     *
     * @param DOMElement $e
     * @return number (Integer) the weight, or 0 when FLAG_WEIGHT_CLASSES is off
     */
    public function getClassWeight($e) {
        if(!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
            return 0;
        }

        $weight = 0;

        /* Look for a special classname */
        if ($e->hasAttribute('class') && $e->getAttribute('class') != '')
        {
            if (preg_match($this->regexps['negative'], $e->getAttribute('class'))) {
                $weight -= 25;
            }
            if (preg_match($this->regexps['positive'], $e->getAttribute('class'))) {
                $weight += 25;
            }
        }

        /* Look for a special ID */
        if ($e->hasAttribute('id') && $e->getAttribute('id') != '')
        {
            if (preg_match($this->regexps['negative'], $e->getAttribute('id'))) {
                $weight -= 25;
            }
            if (preg_match($this->regexps['positive'], $e->getAttribute('id'))) {
                $weight += 25;
            }
        }
        return $weight;
    }

    /**
     * Remove extraneous break tags from a node.
     *
     * Every match of the 'killBreaks' regexp is replaced by a single <br />.
     *
     * @param DOMElement $node
     * @return void
     */
    public function killBreaks($node) {
        $html = $node->innerHTML;
        // The replacement must be one literal break tag; replacing with bare
        // whitespace would delete the breaks instead of collapsing them.
        $html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
        $node->innerHTML = $html;
    }

    /**
     * Clean a node of all elements of type "tag".
     * (Unless it's a youtube/vimeo video. People love movies.)
     *
     * @param DOMElement $e
     * @param string $tag
     * @return void
     */
    public function clean($e, $tag) {
        $targetList = $e->getElementsByTagName($tag);
        $isEmbed = ($tag == 'object' || $tag == 'embed');

        // Backwards, so removals from the live list do not disturb pending indexes.
        for ($y=$targetList->length-1; $y >= 0; $y--) {
            $target = $targetList->item($y);

            /* Allow youtube and vimeo videos through as people usually want to see those. */
            if ($isEmbed) {
                $attributeValues = '';
                for ($i=0, $il=$target->attributes->length; $i < $il; $i++) {
                    $attributeValues .= $target->attributes->item($i)->value . '|';
                }

                /* First, check the elements attributes to see if any of them contain youtube or vimeo */
                if (preg_match($this->regexps['video'], $attributeValues)) {
                    continue;
                }

                /* Then check the elements inside this element for the same. */
                if (preg_match($this->regexps['video'], $target->innerHTML)) {
                    continue;
                }
            }
            $target->parentNode->removeChild($target);
        }
    }

    /**
     * Clean an element of all tags of type "tag" if they look fishy.
     * "Fishy" is an algorithm based on content length, classnames,
     * link density, number of images & embeds, etc.
     *
     * Does nothing when FLAG_CLEAN_CONDITIONALLY is off.
     *
     * @param DOMElement $e
     * @param string $tag
     * @return void
     */
    public function cleanConditionally($e, $tag) {
        if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
            return;
        }

        $tagsList = $e->getElementsByTagName($tag);
        $curTagsLength = $tagsList->length;

        /**
         * Gather counts for other typical elements embedded within.
         * Traverse backwards so we can remove nodes at the same time without affecting the traversal.
         *
         * TODO: Consider taking into account original contentScore here.
         */
        for ($i=$curTagsLength-1; $i >= 0; $i--) {
            $node = $tagsList->item($i);
            $weight = $this->getClassWeight($node);
            // Stored scores are fractional; an int cast would round e.g. -0.5 up to 0
            // and let a negatively scored node slip past the check below.
            $contentScore = ($node->hasAttribute('readability')) ? (float)$node->getAttribute('readability') : 0;

            $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : ''));

            if ($weight + $contentScore < 0) {
                $node->parentNode->removeChild($node);
            }
            else if ( $this->getCharCount($node, ',') < 10) {
                /**
                 * If there are not very many commas, and the number of
                 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
                 **/
                $p = $node->getElementsByTagName('p')->length;
                $img = $node->getElementsByTagName('img')->length;
                // The -100 offset means list items only count against a node once
                // they outnumber paragraphs by more than a hundred.
                $li = $node->getElementsByTagName('li')->length-100;
                $input = $node->getElementsByTagName('input')->length;

                // Only embeds whose src matches the 'video' regexp are counted.
                $embedCount = 0;
                $embeds = $node->getElementsByTagName('embed');
                for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
                    if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
                        $embedCount++;
                    }
                }

                $linkDensity = $this->getLinkDensity($node);
                $contentLength = strlen($this->getInnerText($node));
                $toRemove = false;

                if ( $img > $p ) {
                    $toRemove = true;
                } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
                    $toRemove = true;
                } else if ( $input > floor($p/3) ) {
                    $toRemove = true;
                } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) {
                    $toRemove = true;
                } else if($weight < 25 && $linkDensity > 0.2) {
                    $toRemove = true;
                } else if($weight >= 25 && $linkDensity > 0.5) {
                    $toRemove = true;
                } else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
                    $toRemove = true;
                }

                if ($toRemove) {
                    $node->parentNode->removeChild($node);
                }
            }
        }
    }

    /**
     * Clean out spurious headers from an Element. Checks things like classnames and link density.
     *
     * Only <h1> and <h2> are examined. A header is removed when its class weight
     * is negative or its link density exceeds 0.33.
     *
     * @param DOMElement $e
     * @return void
     */
    public function cleanHeaders($e) {
        for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) {
            $headers = $e->getElementsByTagName('h' . $headerIndex);
            for ($i=$headers->length-1; $i >=0; $i--) {
                $header = $headers->item($i);
                if ($this->getClassWeight($header) < 0 || $this->getLinkDensity($header) > 0.33) {
                    $header->parentNode->removeChild($header);
                }
            }
        }
    }

    /**
     * Tell whether a flag bit is currently set in $this->flags.
     *
     * @param int $flag bit mask to test
     * @return boolean
     */
    public function flagIsActive($flag) {
        return ($this->flags & $flag) > 0;
    }

    /**
     * Set a flag bit in $this->flags.
     *
     * @param int $flag bit mask to set
     * @return void
     */
    public function addFlag($flag) {
        $this->flags = $this->flags | $flag;
    }

    /**
     * Clear a flag bit in $this->flags.
     *
     * @param int $flag bit mask to clear
     * @return void
     */
    public function removeFlag($flag) {
        $this->flags = $this->flags & ~$flag;
    }
}
?>
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | phpMobi file generator 2 | ====================== 3 | 4 | phpMobi is a php script that can generate .mobi files from valid html 5 | files. While this was meant as an experiment, this tool works quite 6 | well and can be used to generate mobipocket files from most news articles. 7 | 8 | IMPORTANT: Do NOT use this on a public web server: most of it was coded in 9 | a weekend, with no testing and no special attention to security. Also, as no official 10 | documentation for the MOBI file format is available, there will be some bugs/problems 11 | in the generated files, but it works for relatively simple documents on the Kindle 12 | previewer and the Kindle 3. 13 | 14 | MobiPocket is an eBook format created by Mobipocket SA. This tool also 15 | uses a php readability port made by [Keyvan Minoukadeh](http://www.keyvan.net/2010/08/php-readability/). 16 | 17 | Code sample 18 | ------------ 19 | 20 | See index.php for an example of using this program.
21 | 22 | Sending an online article as a download: 23 | 24 | //Create the MOBI object 25 | $mobi = new MOBI(); 26 | 27 | //Set the content provider 28 | $content = new OnlineArticle("URL"); 29 | $mobi->setContentProvider($content); 30 | 31 | //Get title and make it a 12 character long url-safe filename 32 | $title = $mobi->getTitle(); 33 | if($title === false) 34 | $title = "file"; 35 | 36 | $title = urlencode(str_replace(" ", "_", strtolower(substr($title, 0, 12)))); 37 | 38 | //Send the mobi file as download 39 | $mobi->download($title.".mobi"); 40 | 41 | Using a previously generated/downloaded html file (will not download any images!): 42 | 43 | $data = "..."; 44 | $options = array( 45 | "title" => "Local document", 46 | "author" => "Author name", 47 | "subject" => "Subject" 48 | ); 49 | 50 | //Create the MOBI object 51 | $mobi = new MOBI(); 52 | 53 | //Set the data 54 | $mobi->setData($data); 55 | $mobi->setOptions($options); 56 | 57 | //Save the mobi file locally 58 | $mobi->save($options["title"].".mobi"); 59 | 60 | Implementation 61 | -------------- 62 | 63 | This code was implemented while reverse-engineering the MobiPocket format. 64 | Therefore this code absolutely isn't optimized for speed, but rather for 65 | easy changes, as getting it to produce valid files was quite fiddly. 66 | 67 | Features 68 | -------- 69 | 70 | Modular content provider system: 71 | Adding a new data source can be done by extending the ContentProvider 72 | class. See the OnlineArticle class for a simple but complete 73 | implementation of such a system. 74 | 75 | Image support: 76 | By default, the online article downloader (and any other content 77 | provider that supports images) will download images and integrate them 78 | into the mobi file. 79 | 80 | Partial UTF-8 support: 81 | In practice UTF-8 just works, but there are some unhandled corner 82 | cases (see missing features). 
83 | 84 | Missing Features 85 | ---------------- 86 | 87 | Compression: 88 | This won't be implemented (or if it is, only to serve as a 89 | reference of the format). 90 | 91 | Different eBook types: 92 | MobiPocket supports other formats/layouts, such as newspaper-like 93 | formats. At the moment only the book layout has been implemented. 94 | 95 | Full UTF-8 support: 96 | UTF-8 should work most of the time (it worked every time I 97 | tested it), but there might be some problems when the character 98 | is split over two "records". 99 | 100 | License 101 | ------- 102 | This code is released under the Apache license (version 2.0) 103 | -------------------------------------------------------------------------------- /index.php: -------------------------------------------------------------------------------- 1 | setContentProvider($content); 29 | 30 | //Get title and make it a 12 character long filename 31 | $title = $mobi->getTitle(); 32 | if($title === false) $title = "file"; 33 | $title = urlencode(str_replace(" ", "_", strtolower(substr($title, 0, 12)))); 34 | 35 | //Send the mobi file as download 36 | $mobi->download($title.".mobi"); 37 | die; 38 | }else{ 39 | //Create the mobi object 40 | $mobi = new MOBI(); 41 | 42 | $content = new MOBIFile(); 43 | 44 | $content->set("title", "My first eBook"); 45 | $content->set("author", "Me"); 46 | 47 | $content->appendChapterTitle("Introduction"); 48 | for($i = 0, $lenI = rand(5, 10); $i < $lenI; $i++){ 49 | $content->appendParagraph("P".($i+1)); 50 | } 51 | 52 | 53 | //Based on PHP's imagecreatetruecolor help page 54 | $im = imagecreatetruecolor(220, 200); 55 | $text_color = imagecolorallocate($im, 233, 14, 91); 56 | imagestring($im, 10, 5, 5, 'A Simple Text String', $text_color); 57 | imagestring($im, 5, 15, 75, 'A Simple Text String', $text_color); 58 | imagestring($im, 3, 25, 125, 'A Simple Text String', $text_color); 59 | imagestring($im, 2, 10, 155, 'A Simple Text String', $text_color); 60 | 
$content->appendImage($im); 61 | imagedestroy($im); 62 | 63 | $content->appendPageBreak(); 64 | 65 | for($i = 0, $lenI = rand(10, 15); $i < $lenI; $i++){ 66 | $content->appendChapterTitle(($i+1).". Chapter ".($i+1)); 67 | 68 | for($j = 0, $lenJ = rand(20, 40); $j < $lenJ; $j++){ 69 | $content->appendParagraph("P".($i+1).".".($j+1)." TEXT TEXT TEXT"); 70 | } 71 | 72 | $content->appendPageBreak(); 73 | } 74 | 75 | $mobi->setContentProvider($content); 76 | 77 | //Get title and make it a 12 character long filename 78 | $title = $mobi->getTitle(); 79 | if($title === false) $title = "file"; 80 | $title = urlencode(str_replace(" ", "_", strtolower(substr($title, 0, 12)))); 81 | 82 | //Send the mobi file as download 83 | $mobi->download($title.".mobi"); 84 | die; 85 | } 86 | } 87 | ?> 88 | 89 | 90 | 91 | 92 | Sample 93 | 94 | 95 | Download 96 | 97 | 98 | --------------------------------------------------------------------------------