├── LICENSE ├── MOBIClass ├── CharacterEntities.php ├── ContentProvider.php ├── EXTHHelper.php ├── FileByte.php ├── FileDate.php ├── FileElement.php ├── FileInt.php ├── FileObject.php ├── FileRecord.php ├── FileShort.php ├── FileString.php ├── FileTri.php ├── Http.php ├── ImageHandler.php ├── LinkedStringBuilder.php ├── MOBI.php ├── MOBIFile.php ├── MultipleFileHandler.php ├── OnlineArticle.php ├── PalmRecord.php ├── Prc.php ├── PreprocessedArticle.php ├── RecognizeURL.php ├── Record.php ├── RecordFactory.php ├── Settings.php ├── constants.php ├── downloaders │ └── FanFictionNet.php ├── http_build_url.php └── readability │ ├── JSLikeHTMLElement.php │ └── Readability.php ├── README.md └── index.php /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2013 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /MOBIClass/CharacterEntities.php: -------------------------------------------------------------------------------- 1 | output is UTF-8 11 | return $str; 12 | //return utf8_encode($str); 13 | //Convert to CP1252 14 | list($from, $to) = CharacterEntities::generateTables(); 15 | return str_replace($from, $to, $str); 16 | } 17 | 18 | private static function generateTables(){ 19 | $from = array(); 20 | $to = array(); 21 | 22 | for($i = 0; $i < 256; $i++){ 23 | $from[$i] = $to[$i] = chr($i); 24 | } 25 | 26 | $from[0x80] = "€"; 27 | $from[0x82] = "‚"; 28 | $from[0x83] = "ƒ"; 29 | $from[0x84] = "„"; 30 | $from[0x85] = "…"; 31 | $from[0x86] = "†"; 32 | $from[0x87] = "‡"; 33 | $from[0x88] = "ˆ"; 34 | $from[0x89] = "‰"; 35 | $from[0x8A] = "Š"; 36 | $from[0x8B] = "‹"; 37 | $from[0x8C] = "Œ"; 38 | $from[0x8E] = "Ž"; 39 | 40 | $from[0x91] = "‘"; 41 | $from[0x92] = "’"; 42 | $from[0x93] = "“"; 43 | $from[0x94] = "”"; 44 | $from[0x95] = "•"; 45 | $from[0x96] = "–"; 46 | $from[0x97] = "—"; 47 | $from[0x98] = "˜"; 48 | $from[0x99] = "™"; 49 | $from[0x9A] = "š"; 50 | $from[0x9B] = "›"; 51 | $from[0x9C] = "œ"; 52 | $from[0x9E] = "ž"; 53 | $from[0x9F] = "Ÿ"; 54 | 55 | $from[0xA1] = "¡"; 56 | $from[0xA2] = "¢"; 57 | $from[0xA3] = "£"; 58 | $from[0xA4] = "¤"; 59 | $from[0xA5] = "¥"; 60 | $from[0xA6] = "¦"; 61 | $from[0xA7] = "§"; 62 | $from[0xA8] = "¨"; 63 | $from[0xA9] = "©"; 64 | $from[0xAA] = "ª"; 65 | $from[0xAB] = "«"; 66 | $from[0xAC] = "¬"; 67 | $from[0xAE] = "®"; 68 | $from[0xAF] = "¯"; 69 | 70 | $from[0xB0] = "°"; 71 | $from[0xB1] = "±"; 72 | $from[0xB2] = "²"; 73 | $from[0xB3] = "³"; 74 | $from[0xB4] = "´"; 75 | $from[0xB5] = "µ"; 76 | $from[0xB6] = "¶"; 77 | $from[0xB7] = "·"; 78 | $from[0xB8] = "¸"; 79 | $from[0xB9] = "¹"; 80 | $from[0xBA] = "º"; 81 | $from[0xBB] = "»"; 82 | $from[0xBC] = "¼"; 83 | $from[0xBD] = "½"; 84 | $from[0xBE] = "¾"; 85 | $from[0xBF] = "¿"; 86 | 87 | $from[0xC0] = 
"À"; 88 | $from[0xC1] = "Á"; 89 | $from[0xC2] = "Â"; 90 | $from[0xC3] = "Ã"; 91 | $from[0xC4] = "Ä"; 92 | $from[0xC5] = "Å"; 93 | $from[0xC6] = "Æ"; 94 | $from[0xC7] = "Ç"; 95 | $from[0xC8] = "È"; 96 | $from[0xC9] = "É"; 97 | $from[0xCA] = "Ê"; 98 | $from[0xCB] = "Ë"; 99 | $from[0xCC] = "Ì"; 100 | $from[0xCD] = "Í"; 101 | $from[0xCE] = "Î"; 102 | $from[0xCF] = "Ï"; 103 | 104 | $from[0xD0] = "Ð"; 105 | $from[0xD1] = "Ñ"; 106 | $from[0xD2] = "Ò"; 107 | $from[0xD3] = "Ó"; 108 | $from[0xD4] = "Ô"; 109 | $from[0xD5] = "Õ"; 110 | $from[0xD6] = "Ö"; 111 | $from[0xD7] = "×"; 112 | $from[0xD8] = "Ø"; 113 | $from[0xD9] = "Ù"; 114 | $from[0xDA] = "Ú"; 115 | $from[0xDB] = "Û"; 116 | $from[0xDC] = "Ü"; 117 | $from[0xDD] = "Ý"; 118 | $from[0xDE] = "Þ"; 119 | $from[0xDF] = "ß"; 120 | 121 | $from[0xE0] = "à"; 122 | $from[0xE1] = "á"; 123 | $from[0xE2] = "â"; 124 | $from[0xE3] = "ã"; 125 | $from[0xE4] = "ä"; 126 | $from[0xE5] = "å"; 127 | $from[0xE6] = "æ"; 128 | $from[0xE7] = "ç"; 129 | $from[0xE8] = "è"; 130 | $from[0xE9] = "é"; 131 | $from[0xEA] = "ê"; 132 | $from[0xEB] = "ë"; 133 | $from[0xEC] = "ì"; 134 | $from[0xED] = "í"; 135 | $from[0xEE] = "î"; 136 | $from[0xEF] = "ï"; 137 | 138 | $from[0xF0] = "ð"; 139 | $from[0xF1] = "ñ"; 140 | $from[0xF2] = "ò"; 141 | $from[0xF3] = "ó"; 142 | $from[0xF4] = "ô"; 143 | $from[0xF5] = "õ"; 144 | $from[0xF6] = "ö"; 145 | $from[0xF7] = "÷"; 146 | $from[0xF8] = "ø"; 147 | $from[0xF9] = "ù"; 148 | $from[0xFA] = "ú"; 149 | $from[0xFB] = "û"; 150 | $from[0xFC] = "ü"; 151 | $from[0xFD] = "ý"; 152 | $from[0xFE] = "þ"; 153 | $from[0xFF] = "ÿ"; 154 | 155 | 156 | return array($from, $to); 157 | } 158 | /* 159 | 00 = U+0000 : NULL 160 | 01 = U+0001 : START OF HEADING 161 | 02 = U+0002 : START OF TEXT 162 | 03 = U+0003 : END OF TEXT 163 | 04 = U+0004 : END OF TRANSMISSION 164 | 05 = U+0005 : ENQUIRY 165 | 06 = U+0006 : ACKNOWLEDGE 166 | 07 = U+0007 : BELL 167 | 08 = U+0008 : BACKSPACE 168 | 09 = U+0009 : HORIZONTAL TABULATION 169 | 0A = U+000A : LINE 
FEED 170 | 0B = U+000B : VERTICAL TABULATION 171 | 0C = U+000C : FORM FEED 172 | 0D = U+000D : CARRIAGE RETURN 173 | 0E = U+000E : SHIFT OUT 174 | 0F = U+000F : SHIFT IN 175 | 10 = U+0010 : DATA LINK ESCAPE 176 | 11 = U+0011 : DEVICE CONTROL ONE 177 | 12 = U+0012 : DEVICE CONTROL TWO 178 | 13 = U+0013 : DEVICE CONTROL THREE 179 | 14 = U+0014 : DEVICE CONTROL FOUR 180 | 15 = U+0015 : NEGATIVE ACKNOWLEDGE 181 | 16 = U+0016 : SYNCHRONOUS IDLE 182 | 17 = U+0017 : END OF TRANSMISSION BLOCK 183 | 18 = U+0018 : CANCEL 184 | 19 = U+0019 : END OF MEDIUM 185 | 1A = U+001A : SUBSTITUTE 186 | 1B = U+001B : ESCAPE 187 | 1C = U+001C : FILE SEPARATOR 188 | 1D = U+001D : GROUP SEPARATOR 189 | 1E = U+001E : RECORD SEPARATOR 190 | 1F = U+001F : UNIT SEPARATOR 191 | 20 = U+0020 : SPACE 192 | 21 = U+0021 : EXCLAMATION MARK 193 | 22 = U+0022 : QUOTATION MARK 194 | 23 = U+0023 : NUMBER SIGN 195 | 24 = U+0024 : DOLLAR SIGN 196 | 25 = U+0025 : PERCENT SIGN 197 | 26 = U+0026 : AMPERSAND 198 | 27 = U+0027 : APOSTROPHE 199 | 28 = U+0028 : LEFT PARENTHESIS 200 | 29 = U+0029 : RIGHT PARENTHESIS 201 | 2A = U+002A : ASTERISK 202 | 2B = U+002B : PLUS SIGN 203 | 2C = U+002C : COMMA 204 | 2D = U+002D : HYPHEN-MINUS 205 | 2E = U+002E : FULL STOP 206 | 2F = U+002F : SOLIDUS 207 | 30 = U+0030 : DIGIT ZERO 208 | 31 = U+0031 : DIGIT ONE 209 | 32 = U+0032 : DIGIT TWO 210 | 33 = U+0033 : DIGIT THREE 211 | 34 = U+0034 : DIGIT FOUR 212 | 35 = U+0035 : DIGIT FIVE 213 | 36 = U+0036 : DIGIT SIX 214 | 37 = U+0037 : DIGIT SEVEN 215 | 38 = U+0038 : DIGIT EIGHT 216 | 39 = U+0039 : DIGIT NINE 217 | 3A = U+003A : COLON 218 | 3B = U+003B : SEMICOLON 219 | 3C = U+003C : LESS-THAN SIGN 220 | 3D = U+003D : EQUALS SIGN 221 | 3E = U+003E : GREATER-THAN SIGN 222 | 3F = U+003F : QUESTION MARK 223 | 40 = U+0040 : COMMERCIAL AT 224 | 41 = U+0041 : LATIN CAPITAL LETTER A 225 | 42 = U+0042 : LATIN CAPITAL LETTER B 226 | 43 = U+0043 : LATIN CAPITAL LETTER C 227 | 44 = U+0044 : LATIN CAPITAL LETTER D 228 | 45 = U+0045 : LATIN 
CAPITAL LETTER E 229 | 46 = U+0046 : LATIN CAPITAL LETTER F 230 | 47 = U+0047 : LATIN CAPITAL LETTER G 231 | 48 = U+0048 : LATIN CAPITAL LETTER H 232 | 49 = U+0049 : LATIN CAPITAL LETTER I 233 | 4A = U+004A : LATIN CAPITAL LETTER J 234 | 4B = U+004B : LATIN CAPITAL LETTER K 235 | 4C = U+004C : LATIN CAPITAL LETTER L 236 | 4D = U+004D : LATIN CAPITAL LETTER M 237 | 4E = U+004E : LATIN CAPITAL LETTER N 238 | 4F = U+004F : LATIN CAPITAL LETTER O 239 | 50 = U+0050 : LATIN CAPITAL LETTER P 240 | 51 = U+0051 : LATIN CAPITAL LETTER Q 241 | 52 = U+0052 : LATIN CAPITAL LETTER R 242 | 53 = U+0053 : LATIN CAPITAL LETTER S 243 | 54 = U+0054 : LATIN CAPITAL LETTER T 244 | 55 = U+0055 : LATIN CAPITAL LETTER U 245 | 56 = U+0056 : LATIN CAPITAL LETTER V 246 | 57 = U+0057 : LATIN CAPITAL LETTER W 247 | 58 = U+0058 : LATIN CAPITAL LETTER X 248 | 59 = U+0059 : LATIN CAPITAL LETTER Y 249 | 5A = U+005A : LATIN CAPITAL LETTER Z 250 | 5B = U+005B : LEFT SQUARE BRACKET 251 | 5C = U+005C : REVERSE SOLIDUS 252 | 5D = U+005D : RIGHT SQUARE BRACKET 253 | 5E = U+005E : CIRCUMFLEX ACCENT 254 | 5F = U+005F : LOW LINE 255 | 60 = U+0060 : GRAVE ACCENT 256 | 61 = U+0061 : LATIN SMALL LETTER A 257 | 62 = U+0062 : LATIN SMALL LETTER B 258 | 63 = U+0063 : LATIN SMALL LETTER C 259 | 64 = U+0064 : LATIN SMALL LETTER D 260 | 65 = U+0065 : LATIN SMALL LETTER E 261 | 66 = U+0066 : LATIN SMALL LETTER F 262 | 67 = U+0067 : LATIN SMALL LETTER G 263 | 68 = U+0068 : LATIN SMALL LETTER H 264 | 69 = U+0069 : LATIN SMALL LETTER I 265 | 6A = U+006A : LATIN SMALL LETTER J 266 | 6B = U+006B : LATIN SMALL LETTER K 267 | 6C = U+006C : LATIN SMALL LETTER L 268 | 6D = U+006D : LATIN SMALL LETTER M 269 | 6E = U+006E : LATIN SMALL LETTER N 270 | 6F = U+006F : LATIN SMALL LETTER O 271 | 70 = U+0070 : LATIN SMALL LETTER P 272 | 71 = U+0071 : LATIN SMALL LETTER Q 273 | 72 = U+0072 : LATIN SMALL LETTER R 274 | 73 = U+0073 : LATIN SMALL LETTER S 275 | 74 = U+0074 : LATIN SMALL LETTER T 276 | 75 = U+0075 : LATIN SMALL LETTER U 
277 | 76 = U+0076 : LATIN SMALL LETTER V 278 | 77 = U+0077 : LATIN SMALL LETTER W 279 | 78 = U+0078 : LATIN SMALL LETTER X 280 | 79 = U+0079 : LATIN SMALL LETTER Y 281 | 7A = U+007A : LATIN SMALL LETTER Z 282 | 7B = U+007B : LEFT CURLY BRACKET 283 | 7C = U+007C : VERTICAL LINE 284 | 7D = U+007D : RIGHT CURLY BRACKET 285 | 7E = U+007E : TILDE 286 | 7F = U+007F : DELETE 287 | 80 = U+20AC : EURO SIGN 288 | 82 = U+201A : SINGLE LOW-9 QUOTATION MARK 289 | 83 = U+0192 : LATIN SMALL LETTER F WITH HOOK 290 | 84 = U+201E : DOUBLE LOW-9 QUOTATION MARK 291 | 85 = U+2026 : HORIZONTAL ELLIPSIS 292 | 86 = U+2020 : DAGGER 293 | 87 = U+2021 : DOUBLE DAGGER 294 | 88 = U+02C6 : MODIFIER LETTER CIRCUMFLEX ACCENT 295 | 89 = U+2030 : PER MILLE SIGN 296 | 8A = U+0160 : LATIN CAPITAL LETTER S WITH CARON 297 | 8B = U+2039 : SINGLE LEFT-POINTING ANGLE QUOTATION MARK 298 | 8C = U+0152 : LATIN CAPITAL LIGATURE OE 299 | 8E = U+017D : LATIN CAPITAL LETTER Z WITH CARON 300 | 91 = U+2018 : LEFT SINGLE QUOTATION MARK 301 | 92 = U+2019 : RIGHT SINGLE QUOTATION MARK 302 | 93 = U+201C : LEFT DOUBLE QUOTATION MARK 303 | 94 = U+201D : RIGHT DOUBLE QUOTATION MARK 304 | 95 = U+2022 : BULLET 305 | 96 = U+2013 : EN DASH 306 | 97 = U+2014 : EM DASH 307 | 98 = U+02DC : SMALL TILDE 308 | 99 = U+2122 : TRADE MARK SIGN 309 | 9A = U+0161 : LATIN SMALL LETTER S WITH CARON 310 | 9B = U+203A : SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 311 | 9C = U+0153 : LATIN SMALL LIGATURE OE 312 | 9E = U+017E : LATIN SMALL LETTER Z WITH CARON 313 | 9F = U+0178 : LATIN CAPITAL LETTER Y WITH DIAERESIS 314 | A0 = U+00A0 : NO-BREAK SPACE 315 | A1 = U+00A1 : INVERTED EXCLAMATION MARK 316 | A2 = U+00A2 : CENT SIGN 317 | A3 = U+00A3 : POUND SIGN 318 | A4 = U+00A4 : CURRENCY SIGN 319 | A5 = U+00A5 : YEN SIGN 320 | A6 = U+00A6 : BROKEN BAR 321 | A7 = U+00A7 : SECTION SIGN 322 | A8 = U+00A8 : DIAERESIS 323 | A9 = U+00A9 : COPYRIGHT SIGN 324 | AA = U+00AA : FEMININE ORDINAL INDICATOR 325 | AB = U+00AB : LEFT-POINTING DOUBLE ANGLE 
QUOTATION MARK 326 | AC = U+00AC : NOT SIGN 327 | AD = U+00AD : SOFT HYPHEN 328 | AE = U+00AE : REGISTERED SIGN 329 | AF = U+00AF : MACRON 330 | B0 = U+00B0 : DEGREE SIGN 331 | B1 = U+00B1 : PLUS-MINUS SIGN 332 | B2 = U+00B2 : SUPERSCRIPT TWO 333 | B3 = U+00B3 : SUPERSCRIPT THREE 334 | B4 = U+00B4 : ACUTE ACCENT 335 | B5 = U+00B5 : MICRO SIGN 336 | B6 = U+00B6 : PILCROW SIGN 337 | B7 = U+00B7 : MIDDLE DOT 338 | B8 = U+00B8 : CEDILLA 339 | B9 = U+00B9 : SUPERSCRIPT ONE 340 | BA = U+00BA : MASCULINE ORDINAL INDICATOR 341 | BB = U+00BB : RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 342 | BC = U+00BC : VULGAR FRACTION ONE QUARTER 343 | BD = U+00BD : VULGAR FRACTION ONE HALF 344 | BE = U+00BE : VULGAR FRACTION THREE QUARTERS 345 | BF = U+00BF : INVERTED QUESTION MARK 346 | C0 = U+00C0 : LATIN CAPITAL LETTER A WITH GRAVE 347 | C1 = U+00C1 : LATIN CAPITAL LETTER A WITH ACUTE 348 | C2 = U+00C2 : LATIN CAPITAL LETTER A WITH CIRCUMFLEX 349 | C3 = U+00C3 : LATIN CAPITAL LETTER A WITH TILDE 350 | C4 = U+00C4 : LATIN CAPITAL LETTER A WITH DIAERESIS 351 | C5 = U+00C5 : LATIN CAPITAL LETTER A WITH RING ABOVE 352 | C6 = U+00C6 : LATIN CAPITAL LETTER AE 353 | C7 = U+00C7 : LATIN CAPITAL LETTER C WITH CEDILLA 354 | C8 = U+00C8 : LATIN CAPITAL LETTER E WITH GRAVE 355 | C9 = U+00C9 : LATIN CAPITAL LETTER E WITH ACUTE 356 | CA = U+00CA : LATIN CAPITAL LETTER E WITH CIRCUMFLEX 357 | CB = U+00CB : LATIN CAPITAL LETTER E WITH DIAERESIS 358 | CC = U+00CC : LATIN CAPITAL LETTER I WITH GRAVE 359 | CD = U+00CD : LATIN CAPITAL LETTER I WITH ACUTE 360 | CE = U+00CE : LATIN CAPITAL LETTER I WITH CIRCUMFLEX 361 | CF = U+00CF : LATIN CAPITAL LETTER I WITH DIAERESIS 362 | D0 = U+00D0 : LATIN CAPITAL LETTER ETH 363 | D1 = U+00D1 : LATIN CAPITAL LETTER N WITH TILDE 364 | D2 = U+00D2 : LATIN CAPITAL LETTER O WITH GRAVE 365 | D3 = U+00D3 : LATIN CAPITAL LETTER O WITH ACUTE 366 | D4 = U+00D4 : LATIN CAPITAL LETTER O WITH CIRCUMFLEX 367 | D5 = U+00D5 : LATIN CAPITAL LETTER O WITH TILDE 368 | D6 = U+00D6 : 
LATIN CAPITAL LETTER O WITH DIAERESIS 369 | D7 = U+00D7 : MULTIPLICATION SIGN 370 | D8 = U+00D8 : LATIN CAPITAL LETTER O WITH STROKE 371 | D9 = U+00D9 : LATIN CAPITAL LETTER U WITH GRAVE 372 | DA = U+00DA : LATIN CAPITAL LETTER U WITH ACUTE 373 | DB = U+00DB : LATIN CAPITAL LETTER U WITH CIRCUMFLEX 374 | DC = U+00DC : LATIN CAPITAL LETTER U WITH DIAERESIS 375 | DD = U+00DD : LATIN CAPITAL LETTER Y WITH ACUTE 376 | DE = U+00DE : LATIN CAPITAL LETTER THORN 377 | DF = U+00DF : LATIN SMALL LETTER SHARP S 378 | E0 = U+00E0 : LATIN SMALL LETTER A WITH GRAVE 379 | E1 = U+00E1 : LATIN SMALL LETTER A WITH ACUTE 380 | E2 = U+00E2 : LATIN SMALL LETTER A WITH CIRCUMFLEX 381 | E3 = U+00E3 : LATIN SMALL LETTER A WITH TILDE 382 | E4 = U+00E4 : LATIN SMALL LETTER A WITH DIAERESIS 383 | E5 = U+00E5 : LATIN SMALL LETTER A WITH RING ABOVE 384 | E6 = U+00E6 : LATIN SMALL LETTER AE 385 | E7 = U+00E7 : LATIN SMALL LETTER C WITH CEDILLA 386 | E8 = U+00E8 : LATIN SMALL LETTER E WITH GRAVE 387 | E9 = U+00E9 : LATIN SMALL LETTER E WITH ACUTE 388 | EA = U+00EA : LATIN SMALL LETTER E WITH CIRCUMFLEX 389 | EB = U+00EB : LATIN SMALL LETTER E WITH DIAERESIS 390 | EC = U+00EC : LATIN SMALL LETTER I WITH GRAVE 391 | ED = U+00ED : LATIN SMALL LETTER I WITH ACUTE 392 | EE = U+00EE : LATIN SMALL LETTER I WITH CIRCUMFLEX 393 | EF = U+00EF : LATIN SMALL LETTER I WITH DIAERESIS 394 | F0 = U+00F0 : LATIN SMALL LETTER ETH 395 | F1 = U+00F1 : LATIN SMALL LETTER N WITH TILDE 396 | F2 = U+00F2 : LATIN SMALL LETTER O WITH GRAVE 397 | F3 = U+00F3 : LATIN SMALL LETTER O WITH ACUTE 398 | F4 = U+00F4 : LATIN SMALL LETTER O WITH CIRCUMFLEX 399 | F5 = U+00F5 : LATIN SMALL LETTER O WITH TILDE 400 | F6 = U+00F6 : LATIN SMALL LETTER O WITH DIAERESIS 401 | F7 = U+00F7 : DIVISION SIGN 402 | F8 = U+00F8 : LATIN SMALL LETTER O WITH STROKE 403 | F9 = U+00F9 : LATIN SMALL LETTER U WITH GRAVE 404 | FA = U+00FA : LATIN SMALL LETTER U WITH ACUTE 405 | FB = U+00FB : LATIN SMALL LETTER U WITH CIRCUMFLEX 406 | FC = U+00FC : LATIN 
SMALL LETTER U WITH DIAERESIS 407 | FD = U+00FD : LATIN SMALL LETTER Y WITH ACUTE 408 | FE = U+00FE : LATIN SMALL LETTER THORN 409 | FF = U+00FF : LATIN SMALL LETTER Y WITH DIAERESIS 410 | * 411 | */ 412 | } 413 | ?> 414 | -------------------------------------------------------------------------------- /MOBIClass/ContentProvider.php: -------------------------------------------------------------------------------- 1 | 23 | -------------------------------------------------------------------------------- /MOBIClass/EXTHHelper.php: -------------------------------------------------------------------------------- 1 | > (8*$i)).$out; 25 | $mask = $mask << 8; 26 | } 27 | return $out; 28 | } 29 | 30 | static function getRightRepresentation($type, $value){ 31 | if($type >= 100 && $type < 200){ 32 | return $value; 33 | }else{ 34 | return self::toHex($value); 35 | } 36 | } 37 | 38 | static function toHex($value){ 39 | $out = ""; 40 | for($i = 0, $len = strlen($value); $i < $len; $i++){ 41 | if($i > 0) $out .= " "; 42 | $hex = dechex(ord($value[$i])); 43 | if(strlen($hex) < 2) $hex = "0".$hex; 44 | $out .= $hex; 45 | } 46 | return $out; 47 | } 48 | 49 | 50 | static private $types = array( 51 | 1 => "drm server id", 52 | 2 => "drm commerce id", 53 | 3 => "drm ebookbase book id", 54 | 100 => "author", 55 | 101 => "publisher", 56 | 102 => "imprint", 57 | 103 => "description", 58 | 104 => "isbn", 59 | 105 => "subject", 60 | 106 => "publishingdate", 61 | 107 => "review", 62 | 108 => "contributor", 63 | 109 => "rights", 64 | 110 => "subjectcode", 65 | 111 => "type", 66 | 112 => "source", 67 | 113 => "asin", 68 | 114 => "versionnumber", 69 | 115 => "sample", 70 | 116 => "startreading", 71 | 118 => "retail price", 72 | 119 => "retail price currency", 73 | 201 => "coveroffset", 74 | 202 => "thumboffset", 75 | 203 => "hasfakecover", 76 | 204 => "Creator Software", 77 | 205 => "Creator Major Version", 78 | 206 => "Creator Minor Version", 79 | 207 => "Creator Build Number", 80 | 208 =>
"watermark", 81 | 209 => "tamper proof keys", 82 | 300 => "fontsignature", 83 | 401 => "clippinglimit", 84 | 402 => "publisherlimit", 85 | 403 => "403", 86 | 404 => "ttsflag", 87 | 501 => "cdetype", 88 | 502 => "lastupdatetime", 89 | 503 => "updatedtitle" 90 | ); 91 | static private $flippedTypes = array( 92 | "drm server id" => 1, 93 | "drm commerce id" => 2, 94 | "drm ebookbase book id" => 3, 95 | "author" => 100, 96 | "publisher" => 101, 97 | "imprint" => 102, 98 | "description" => 103, 99 | "isbn" => 104, 100 | "subject" => 105, 101 | "publishingdate" => 106, 102 | "review" => 107, 103 | "contributor" => 108, 104 | "rights" => 109, 105 | "subjectcode" => 110, 106 | "type" => 111, 107 | "source" => 112, 108 | "asin" => 113, 109 | "versionnumber" => 114, 110 | "sample" => 115, 111 | "startreading" => 116, 112 | "retail price" => 118, 113 | "retail price currency" => 119, 114 | "coveroffset" => 201, 115 | "thumboffset" => 202, 116 | "hasfakecover" => 203, 117 | "Creator Software" => 204, 118 | "Creator Major Version" => 205, 119 | "Creator Minor Version" => 206, 120 | "Creator Build Number" => 207, 121 | "watermark" => 208, 122 | "tamper proof keys" => 209, 123 | "fontsignature" => 300, 124 | "clippinglimit" => 401, 125 | "publisherlimit" => 402, 126 | "403" => 403, 127 | "ttsflag" => 404, 128 | "cdetype" => 501, 129 | "lastupdatetime" => 502, 130 | "updatedtitle" => 503 131 | ); 132 | } -------------------------------------------------------------------------------- /MOBIClass/FileByte.php: -------------------------------------------------------------------------------- 1 | set($n); 18 | } 19 | 20 | public function get(){ 21 | return $this->data; 22 | } 23 | 24 | public function set($value){ 25 | $this->data = intval($value) & 0xFF; 26 | } 27 | 28 | public function serialize() { 29 | return $this->byteToString($this->data); 30 | } 31 | 32 | public function unserialize($data) { 33 | $this->set($this->toInt($data)); /* must go through $this: a bare __construct(...) is looked up as a global function and is fatal; set() applies the 0xFF mask */ 34 | } 35 | 36 | 37 | public function
__toString(){ 38 | return "FileByte: {".$this->byteAsString($this->data)."}"; 39 | } 40 | } 41 | ?> 42 | -------------------------------------------------------------------------------- /MOBIClass/FileDate.php: -------------------------------------------------------------------------------- 1 | set($n); 18 | } 19 | 20 | public function get(){ 21 | return $this->data; 22 | } 23 | 24 | public function set($value){ 25 | $this->data = intval($value); 26 | } 27 | 28 | public function serialize() { 29 | return $this->intToString($this->data); 30 | } 31 | 32 | public function unserialize($data) { 33 | $this->set($this->toInt($data)); /* must go through $this: a bare __construct(...) is looked up as a global function and is fatal; set() stores the decoded integer */ 34 | } 35 | 36 | public function __toString(){ 37 | return "FileDate: {".(date("r", $this->data-94694400))."}"; 38 | } 39 | } 40 | ?> 41 | -------------------------------------------------------------------------------- /MOBIClass/FileElement.php: -------------------------------------------------------------------------------- 1 | elements = $elements; 20 | } 21 | 22 | public function getByteLength(){ 23 | return $this->getLength(); 24 | } 25 | 26 | public function getLength(){ 27 | $total = 0; 28 | foreach($this->elements as $val){ 29 | $total += $val->getByteLength(); 30 | } 31 | return $total; 32 | } 33 | 34 | public function offsetToEntry($name){ 35 | $pos = 0; 36 | foreach($this->elements as $key=>$value){ 37 | if($name == $key){ 38 | break; 39 | } 40 | $pos += $value->getByteLength(); 41 | } 42 | return $pos; 43 | } 44 | 45 | public function exists($key){ 46 | return isset($this->elements[$key]); 47 | } 48 | /** 49 | * @param string $key 50 | * @return FileObject 51 | */ 52 | public function get($key){ 53 | return $this->elements[$key]; 54 | } 55 | 56 | /** 57 | * @param string $key 58 | * @param FileObject $value 59 | */ 60 | public function set($key, $value){ 61 | $this->elements[$key] = $value; 62 | } 63 | 64 | public function add($key, $value){ 65 | $this->elements[$key] = $value; 66 | } 67 | 68 | public function serialize() { 69 |
$result = array(); 70 | foreach($this->elements as $val){ 71 | $result[] = $val->serialize(); 72 | } 73 | return implode("", $result); 74 | } 75 | 76 | public function unserialize($data) { 77 | //TODO: If reading is needed -> way more complex 78 | } 79 | 80 | public function __toString(){ 81 | $output = "FileElement (".$this->getByteLength()." bytes): {\n"; 82 | foreach($this->elements as $key=>$value){ 83 | $output .= "\t".$key.": ".$value."\n"; 84 | } 85 | $output .= "}"; 86 | return $output; 87 | } 88 | } 89 | ?> 90 | -------------------------------------------------------------------------------- /MOBIClass/FileInt.php: -------------------------------------------------------------------------------- 1 | set($n); 18 | } 19 | 20 | public function get(){ 21 | return $this->data; 22 | } 23 | 24 | public function set($value){ 25 | $this->data = intval($value); 26 | } 27 | 28 | public function serialize() { 29 | return $this->intToString($this->data); 30 | } 31 | 32 | public function unserialize($data) { 33 | $this->set($this->toInt($data)); /* must go through $this: a bare __construct(...) is looked up as a global function and is fatal; set() stores the decoded integer */ 34 | } 35 | 36 | public function __toString(){ 37 | return "FileInt: {".$this->intAsString($this->data)."}"; 38 | } 39 | } 40 | ?> 41 | -------------------------------------------------------------------------------- /MOBIClass/FileObject.php: -------------------------------------------------------------------------------- 1 | byteLength = $byteLength; 13 | } 14 | 15 | public function getByteLength(){ 16 | if($this->byteLength >= 0){ 17 | return $this->byteLength; 18 | } 19 | return $this->getLength(); 20 | } 21 | 22 | public function getLength(){ 23 | throw new Exception("Sub-class needs to implement this if it doesn't have a fixed length"); 24 | } 25 | 26 | /** 27 | * Convert a string to byte format (maximum 4 bytes) 28 | * @param string $string Input string 29 | * @return int Output integer 30 | */ 31 | public function toInt($string){ 32 | $out = 0; 33 | for($i = 0, $len = min(4, strlen($string)); $i < $len; $i++){ 34 | $out
= $out | (ord($string[$i]) << (($len-$i-1)*8)); 35 | } 36 | return $out; 37 | } 38 | 39 | /** 40 | * Pack a byte (stored in an integer) into a 1-byte binary string 41 | * @param byte $int 42 | * @return string 43 | */ 44 | public function byteToString($int){ 45 | return $this->toString($int, 1); 46 | } 47 | 48 | /** 49 | * Format a byte (stored in an integer) as a two-digit hex string 50 | * @param byte $int 51 | * @return string 52 | */ 53 | public function byteAsString($int){ 54 | return $this->asString($int, 1); 55 | } 56 | 57 | /** 58 | * Pack a short (stored in an integer) into a 2-byte binary string 59 | * @param short $int 60 | * @return string 61 | */ 62 | public function shortToString($int){ 63 | return $this->toString($int, 2); 64 | } 65 | 66 | /** 67 | * Format a short (stored in an integer) as two space-separated hex bytes 68 | * @param short $int 69 | * @return string 70 | */ 71 | public function shortAsString($int){ 72 | return $this->asString($int, 2); 73 | } 74 | 75 | /** 76 | * Pack a tri-byte (stored in an integer) into a 3-byte binary string 77 | * @param tri-byte $int 78 | * @return string 79 | */ 80 | public function triToString($int){ 81 | return $this->toString($int, 3); 82 | } 83 | 84 | /** 85 | * Format a tri-byte (stored in an integer) as three space-separated hex bytes 86 | * @param tri-byte $int 87 | * @return string 88 | */ 89 | public function triAsString($int){ 90 | return $this->asString($int, 3); 91 | } 92 | 93 | /** 94 | * Pack an integer into a 4-byte binary string 95 | * @param int $int 96 | * @return string 97 | */ 98 | public function intToString($int){ 99 | return $this->toString($int, 4); 100 | } 101 | 102 | /** 103 | * Format an integer as four space-separated hex bytes 104 | * @param int $int 105 | * @return string 106 | */ 107 | public function intAsString($int){ 108 | return $this->asString($int, 4); 109 | } 110 | 111 | /** 112 | * Pack the low n bytes of a number into a binary string, most significant byte first 113 | * @param int $int Number that should be converted 114 | * @param int $size Number of bytes to convert 115 | * @return string Output string 116 | */ 117 | private function
toString($int, $size){ 118 | $out = ""; 119 | for($i = 0; $i < $size; $i++){ 120 | $out = chr($int & 0xFF).$out; 121 | $int = $int >> 8; 122 | } 123 | return $out; 124 | } 125 | 126 | /** 127 | * Format the low n bytes of a number as space-separated two-digit hex, most significant byte first 128 | * @param int $int Number that should be converted 129 | * @param int $size Number of bytes to convert 130 | * @return string Output string 131 | */ 132 | private function asString($int, $size){ 133 | $out = ""; 134 | for($i = 0; $i < $size; $i++){ 135 | if($i > 0) $out = " ".$out; 136 | $byte = dechex($int & 0xFF); 137 | if(strlen($byte) == 1) $byte = "0".$byte; 138 | $out = $byte.$out; 139 | $int = $int >> 8; 140 | } 141 | return $out; 142 | } 143 | 144 | /** 145 | * Get the value 146 | * @return mixed Value to get 147 | */ 148 | abstract public function get(); 149 | 150 | /** 151 | * Set the value 152 | * @param mixed $value Value to set 153 | */ 154 | abstract public function set($value); 155 | 156 | /** 157 | * Serialize the object 158 | * @return string String representation 159 | */ 160 | abstract public function serialize(); 161 | 162 | /** 163 | * Unserialize the object 164 | * @param string $data String representation 165 | */ 166 | abstract public function unserialize($data); 167 | } 168 | ?> 169 | -------------------------------------------------------------------------------- /MOBIClass/FileRecord.php: -------------------------------------------------------------------------------- 1 | record = $record; 20 | } 21 | 22 | public function getByteLength(){ 23 | return $this->getLength(); 24 | } 25 | 26 | public function getLength(){ 27 | return $this->record->getLength(); 28 | } 29 | 30 | public function get(){ 31 | return $this->record; 32 | } 33 | 34 | public function set($record){ 35 | $this->record = $record; 36 | } 37 | 38 | public function serialize() { 39 | return $this->record->serialize(); 40 | } 41 | 42 | public function unserialize($data) { 43 | $this->set($this->record->unserialize($data)); /* must go through $this: a bare __construct(...) is looked up as a global function and is fatal. NOTE(review): assumes the wrapped record's unserialize() returns the record to store - confirm */ 44 | } 45 | } 46 | ?>
47 | -------------------------------------------------------------------------------- /MOBIClass/FileShort.php: -------------------------------------------------------------------------------- 1 | set($n); 18 | } 19 | 20 | public function get(){ 21 | return $this->data; 22 | } 23 | 24 | public function set($value){ 25 | $this->data = intval($value) & 0xFFFF; 26 | } 27 | 28 | public function serialize() { 29 | return $this->shortToString($this->data); 30 | } 31 | 32 | public function unserialize($data) { 33 | $this->set($this->toInt($data)); /* must go through $this: a bare __construct(...) is looked up as a global function and is fatal; set() applies the 0xFFFF mask */ 34 | } 35 | 36 | 37 | public function __toString(){ 38 | return "FileShort: {".$this->shortAsString($this->data)."}"; 39 | } 40 | } 41 | ?> 42 | -------------------------------------------------------------------------------- /MOBIClass/FileString.php: -------------------------------------------------------------------------------- 1 | forcedLength = -1; 21 | $this->data = ""; 22 | 23 | if($second != null){ 24 | $this->data = $first; 25 | $this->forcedLength = $second; 26 | }else if($first != null){ 27 | if(is_string($first)){ 28 | $this->data = $first; 29 | }else{ 30 | $this->forcedLength = $first; 31 | } 32 | } 33 | } 34 | 35 | public function getByteLength(){ 36 | return $this->getLength(); 37 | } 38 | 39 | public function getLength(){ 40 | if($this->forcedLength >= 0){ 41 | return $this->forcedLength; 42 | } 43 | return strlen($this->data); 44 | } 45 | 46 | public function get(){ 47 | return $this->data; 48 | } 49 | 50 | public function set($value){ 51 | $this->data = $value; 52 | } 53 | 54 | public function serialize() { 55 | $output = $this->data; 56 | $curLength = strlen($output); 57 | 58 | if($this->forcedLength >= 0){ 59 | if($this->forcedLength > $curLength){ 60 | return str_pad($output, $this->forcedLength, "\0", STR_PAD_RIGHT); 61 | }elseif($this->forcedLength == $curLength){ 62 | return $output; 63 | }else{ 64 | return substr($output, 0, $this->forcedLength); 65 | } 66 | } 67 | return $output; 68 | } 69 | 70 |
public function unserialize($data) { 71 | __construct($data); 72 | } 73 | 74 | public function __toString(){ 75 | $out = "FileString"; 76 | if($this->forcedLength >= 0){ 77 | $out .= " ".$this->forcedLength; 78 | } 79 | $out .= ": {\"".str_replace(array(" ", "\0"), " ", $this->serialize())."\"}"; 80 | return $out; 81 | } 82 | } 83 | ?> 84 | -------------------------------------------------------------------------------- /MOBIClass/FileTri.php: -------------------------------------------------------------------------------- 1 | set($n); 18 | } 19 | 20 | public function get(){ 21 | return $this->data; 22 | } 23 | 24 | public function set($value){ 25 | $this->data = intval($value) & 0xFFFFFF; 26 | } 27 | 28 | public function serialize() { 29 | return $this->triToString($this->data); 30 | } 31 | 32 | public function unserialize($data) { 33 | __construct($this->toInt($data)); 34 | } 35 | 36 | 37 | public function __toString(){ 38 | return "FileTri: {".$this->triAsString($this->data)."}"; 39 | } 40 | } 41 | ?> 42 | -------------------------------------------------------------------------------- /MOBIClass/Http.php: -------------------------------------------------------------------------------- 1 | 'val1', 'var2' => 'val2') */ 19 | $postdata = array(), /* HTTP POST Data ie. array('var1' => 'val1', 'var2' => 'val2') */ 20 | $cookie = array(), /* HTTP Cookie Data ie. array('var1' => 'val1', 'var2' => 'val2') */ 21 | $custom_headers = array(), /* Custom HTTP headers ie. 
array('Referer: http://localhost/ */ 22 | $timeout = 1000, /* Socket timeout in milliseconds */ 23 | $req_hdr = false, /* Include HTTP request headers */ 24 | $res_hdr = false, /* Include HTTP response headers */ 25 | $depth = 4 /* Depth of the iteration left (to avoid redirection loops) */ 26 | ) 27 | { 28 | if(self::$cache){ 29 | $cacheFile = "cache/".$ip."/".str_replace("/", "...", $uri); 30 | 31 | if(is_file($cacheFile)){ 32 | $data = file_get_contents($cacheFile); 33 | 34 | return self::resolveTruncated($data); 35 | } 36 | } 37 | $ret = ''; 38 | $verb = strtoupper($verb); 39 | $cookie_str = ''; 40 | $getdata_str = count($getdata) ? '?' : ''; 41 | $postdata_str = ''; 42 | 43 | foreach ($getdata as $k => $v) 44 | $getdata_str .= urlencode($k) .'='. urlencode($v); 45 | 46 | foreach ($postdata as $k => $v) 47 | $postdata_str .= urlencode($k) .'='. urlencode($v) .'&'; 48 | 49 | foreach ($cookie as $k => $v) 50 | $cookie_str .= urlencode($k) .'='. urlencode($v) .'; '; 51 | 52 | $crlf = "\r\n"; 53 | $req = $verb .' '. $uri . $getdata_str .' HTTP/1.1' . $crlf; 54 | $req .= 'Host: '. $ip . $crlf; 55 | $req .= 'User-Agent: Mozilla/5.0 Firefox/3.6.12' . $crlf; 56 | $req .= 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' . $crlf; 57 | $req .= 'Accept-Language: en-us,en;q=0.5' . $crlf; 58 | $req .= 'Accept-Encoding: deflate' . $crlf; 59 | $req .= 'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7' . $crlf; 60 | 61 | 62 | foreach ($custom_headers as $k => $v) 63 | $req .= $k .': '. $v . $crlf; 64 | 65 | if (!empty($cookie_str)) 66 | $req .= 'Cookie: '. substr($cookie_str, 0, -2) . $crlf; 67 | 68 | if ($verb == 'POST' && !empty($postdata_str)) 69 | { 70 | $postdata_str = substr($postdata_str, 0, -1); 71 | $req .= 'Content-Type: application/x-www-form-urlencoded' . $crlf; 72 | $req .= 'Content-Length: '. strlen($postdata_str) . $crlf . 
$crlf; 73 | $req .= $postdata_str; 74 | } 75 | else $req .= $crlf; 76 | 77 | if ($req_hdr) 78 | $ret .= $req; 79 | 80 | if (($fp = @fsockopen($ip, $port, $errno, $errstr)) == false) 81 | return "Error $errno: $errstr\n"; 82 | 83 | stream_set_timeout($fp, 0, $timeout * 1000); 84 | 85 | fputs($fp, $req); 86 | $ret .= stream_get_contents($fp); 87 | fclose($fp); 88 | 89 | $headerSplit = strpos($ret, "\r\n\r\n"); 90 | $header = substr($ret, 0, $headerSplit); 91 | 92 | $redirectURL = self::CheckForRedirect($header); 93 | 94 | if($redirectURL !== false){ 95 | if($depth > 0){ 96 | $url_parts = parse_url($redirectURL); 97 | $url_parts["port"] = isset($url_parts["port"]) ? $url_parts["port"] : 80; 98 | $url_parts["path"] = isset($url_parts["path"]) ? $url_parts["path"] : "/"; 99 | 100 | return self::FullRequest($verb, $url_parts["host"], $url_parts["port"], $url_parts["path"], $getdata, $postdata, $cookie, $custom_headers, $timeout, $req_hdr, $res_hdr, $depth-1); 101 | }else{ 102 | return "Redirect loop, stopping..."; 103 | } 104 | } 105 | 106 | $truncated = false; 107 | $headerLines = explode("\r\n", $header); 108 | foreach($headerLines as $line){ 109 | list($name, $value) = explode(":", $line); 110 | $name = trim($name); 111 | $value = trim($value); 112 | 113 | if(strtolower($name) == "transfer-encoding" && strtolower($value) == "chunked"){ //TODO: Put right values! 
114 | $truncated = true; 115 | } 116 | } 117 | 118 | if (!$res_hdr) 119 | $ret = substr($ret, $headerSplit + 4); 120 | 121 | if($truncated){ 122 | $ret = self::resolveTruncated($ret); 123 | } 124 | if(self::$cache){ 125 | if(!is_dir("cache")){ 126 | mkdir("cache"); 127 | } 128 | if(!is_dir("cache/".$ip)){ 129 | mkdir("cache/".$ip); 130 | } 131 | if(!is_file("cache/".$ip."/".str_replace("/", "...", $uri))){ 132 | $h = fopen("cache/".$ip."/".str_replace("/", "...", $uri), "w"); 133 | fwrite($h, $ret); 134 | fclose($h); 135 | } 136 | } 137 | 138 | return $ret; 139 | } 140 | 141 | private static function resolveTruncated($data){ 142 | $pos = 0; 143 | $end = strlen($data); 144 | $out = ""; 145 | 146 | while($pos < $end){ 147 | $endVal = strpos($data, "\r\n", $pos); 148 | $value = hexdec(substr($data, $pos, $endVal-$pos)); 149 | $out .= substr($data, $endVal+2, $value); 150 | $pos = $endVal+2+$value; 151 | } 152 | 153 | return $out; 154 | } 155 | 156 | private static function CheckForRedirect($header){ 157 | $firstLine = substr($header, 0, strpos($header, "\r\n")); 158 | list($httpVersion, $statusCode, $message) = explode(" ", $firstLine); 159 | 160 | if(substr($statusCode, 0, 1) == "3"){ 161 | $part = substr($header, strpos(strtolower($header), "location: ")+strlen("location: ")); 162 | $location = trim(substr($part, 0, strpos($part, "\r\n"))); 163 | 164 | if(strlen($location) > 0){ 165 | return $location; 166 | } 167 | } 168 | return false; 169 | } 170 | } 171 | ?> -------------------------------------------------------------------------------- /MOBIClass/ImageHandler.php: -------------------------------------------------------------------------------- 1 | 41 | -------------------------------------------------------------------------------- /MOBIClass/LinkedStringBuilder.php: -------------------------------------------------------------------------------- 1 | links[$name] = $this->length(); 12 | } 13 | 14 | public function resolveLink($name, $value) { 15 | 
$this->resolutions[$name] = $value; 16 | } 17 | 18 | public function append($string) { 19 | $len = strlen($string); 20 | 21 | $this->length += $len; 22 | $this->partSize[] = $len; 23 | $this->parts[] = $string; 24 | } 25 | 26 | public function replace($from, $to, $replacement) { 27 | $partStart = 0; 28 | $partEnd = 0; 29 | for ($i = 0, $len = sizeof($this->partSize); $i < $len; $i++) { 30 | $partEnd += $this->partSize[$i]; 31 | if ($partEnd > $from) { 32 | if ($partEnd < $to) { 33 | $this->replace($partEnd, $to, substr($replacement, $partEnd - $from)); 34 | $replacement = substr($replacement, 0, $partEnd - $from); 35 | $to = $partEnd; 36 | } 37 | 38 | $cur = $this->parts[$i]; 39 | 40 | for ($j = 0; $j < $to - $from; $j++) { 41 | $cur[$from - $partStart + $j] = $replacement[$j]; 42 | } 43 | 44 | $this->parts[$i] = $cur; 45 | return true; 46 | } 47 | $partStart = $partEnd; 48 | } 49 | 50 | throw new Exception("Couldn't replace string (target longer than source?)"); 51 | } 52 | 53 | public function length() { 54 | return $this->length; 55 | } 56 | 57 | public function processLinks() { 58 | foreach ($this->resolutions as $name => $value) { 59 | if (isset($this->links[$name])) { 60 | $start = $this->links[$name]; 61 | $this->replace($start, $start + strlen($value), $value); 62 | 63 | unset($this->resolutions[$name]); 64 | } 65 | } 66 | } 67 | 68 | public function build() { 69 | $this->processLinks(); 70 | 71 | return implode("", $this->parts); 72 | } 73 | } -------------------------------------------------------------------------------- /MOBIClass/MOBI.php: -------------------------------------------------------------------------------- 1 | setInternetSource($url); //Load URL, the result will be cleaned using a Readability port 42 | * $mobi->setFileSource($file); //Load a local file without any extra changes 43 | * $mobi->setData($data); //Load data 44 | * 45 | * //If you want, you can set some optional settings (see Settings.php for all recognized settings) 46 | * 
$options = array( 47 | * "title"=>"Insert title here", 48 | * "author"=>"Author" 49 | * ); 50 | * $mobi->setOptions($options); 51 | * 52 | * //Then there are two ways to output it: 53 | * $mobi->save($file); //Save the file locally 54 | * $mobi->download($name); //Let the client download the file, make sure the page 55 | * //that calls it doesn't output anything, otherwise it might 56 | * //conflict with the download. $name contains the file name, 57 | * //usually something like "title.mobi" (where the title should 58 | * //be cleaned so as not to contain illegal characters). 59 | * 60 | * 61 | * @author Sander Kromwijk 62 | */ 63 | class MOBI { 64 | private $source = false; 65 | private $images = array(); 66 | private $optional = array(); 67 | private $imgCounter = 0; 68 | private $debug = false; 69 | private $prc = false; 70 | 71 | public function __construct(){ 72 | 73 | } 74 | 75 | public function getTitle(){ 76 | if(isset($this->optional["title"])){ 77 | return $this->optional["title"]; 78 | } 79 | return false; 80 | } 81 | 82 | /** 83 | * Set a content provider as source 84 | * @param ContentProvider $content Content Provider to use 85 | */ 86 | public function setContentProvider($content){ 87 | $this->setOptions($content->getMetaData()); 88 | $this->setImages($content->getImages()); 89 | $this->setData($content->getTextData()); 90 | } 91 | 92 | /** 93 | * Set a local file as source 94 | * @param string $file Path to the file 95 | */ 96 | public function setFileSource($file){ 97 | $this->setData(file_get_contents($file)); 98 | } 99 | 100 | /** 101 | * Set the data to use 102 | * @param string $data Data to put in the file 103 | */ 104 | public function setData($data){ 105 | //$data = utf8_encode($data); 106 | $data = CharacterEntities::convert($data); 107 | //$data = utf8_decode($data); 108 | //$this->source = iconv('UTF-8', 'ISO-8859-1//TRANSLIT', $data); 109 | $this->source = $data; 110 | $this->prc = false; 111 | } 112 | 113 | /** 114 | * Set the images to 
use 115 | * @param array $data Data to put in the file 116 | */ 117 | public function setImages($data){ 118 | $this->images = $data; 119 | $this->prc = false; 120 | } 121 | 122 | /** 123 | * Set options, usually for things like titles, authors, etc... 124 | * @param array $options Options to set 125 | */ 126 | public function setOptions($options){ 127 | $this->optional = $options; 128 | $this->prc = false; 129 | } 130 | 131 | /** 132 | * Prepare the prc file 133 | * @return Prc The file that can be used to be saved/downloaded 134 | */ 135 | private function preparePRC(){ 136 | if($this->source === false){ 137 | throw new Exception("No data set"); 138 | } 139 | if($this->prc !== false) return $this->prc; 140 | 141 | $data = $this->source; 142 | $len = strlen($data); 143 | 144 | $settings = new Settings($this->optional); 145 | $rec = new RecordFactory($settings); 146 | $dataRecords = $rec->createRecords($data); 147 | $nRecords = sizeof($dataRecords); 148 | $mobiHeader = new PalmRecord($settings, $dataRecords, $nRecords, $len, sizeof($this->images)); 149 | array_unshift($dataRecords, $mobiHeader); 150 | $dataRecords = array_merge($dataRecords, $this->images); 151 | $dataRecords[] = $rec->createFLISRecord(); 152 | $dataRecords[] = $rec->createFCISRecord($len); 153 | $dataRecords[] = $rec->createEOFRecord(); 154 | $this->prc = new Prc($settings, $dataRecords); 155 | return $this->prc; 156 | } 157 | 158 | /** 159 | * Save the file locally 160 | * @param string $filename Path to save the file 161 | */ 162 | public function save($filename){ 163 | $prc = $this->preparePRC(); 164 | $prc->save($filename); 165 | } 166 | 167 | /** 168 | * Let the client download the file. Warning! No data should be 169 | * outputted before or after. 
170 | * @param string $name Name used for download, usually "title.mobi"; control characters, double quotes and backslashes are removed before it is sent 171 | */ 172 | public function download($name){ 173 | $prc = $this->preparePRC(); 174 | $data = $prc->serialize(); 175 | $length = strlen($data); 176 | 177 | if($this->debug) return; //In debug mode, don't start the download 178 | 179 | header("Content-Type: application/x-mobipocket-ebook"); 180 | header("Content-Disposition: attachment; filename=\"".preg_replace('/[\x00-\x1F\x7F"\\\\]/', '', $name)."\""); //Strip characters that would end or corrupt the quoted filename parameter 181 | header("Content-Transfer-Encoding: binary"); 182 | header("Accept-Ranges: bytes"); 183 | header("Cache-control: private"); 184 | header('Pragma: private'); 185 | header("Expires: Mon, 26 Jul 1997 05:00:00 GMT"); 186 | header("Content-Length: ".$length); 187 | 188 | echo $data; 189 | //Finished! 190 | } 191 | 192 | } 193 | ?> -------------------------------------------------------------------------------- /MOBIClass/MOBIFile.php: -------------------------------------------------------------------------------- 1 | "Unknown Title", "toc" => true); 20 | private $parts = array(); 21 | private $images = array(); 22 | 23 | private $links = array(); 24 | 25 | /** 26 | * Get the text data (the "html" code) 27 | */ 28 | public function getTextData(){ 29 | $str = new LinkedStringBuilder(); 30 | 31 | $str->append(""); 32 | $str->append("
"); 33 | $this->addGuide($str); 34 | $str->append(""); 35 | $str->append(""); 36 | 37 | $this->resolveFilepos($str, self::START_LINK); 38 | $str->append("".$data."
"); 90 | break; 91 | case self::PAGEBREAK: 92 | $str->append("
"); 121 | for($i = 0, $len = sizeof($entries); $i < $len; $i++){ 122 | $entry = $entries[$i]; 123 | 124 | $str->append(" "); 127 | } 128 | $str->append(" append(">".$entry["title"]."
"; 115 | 116 | return $toc; 117 | } 118 | /** 119 | * Get the images (an array containing the jpeg data). Array entry 0 will 120 | * correspond to image record 0. 121 | * @return array 122 | */ 123 | public function getImages(){ 124 | return $this->images; 125 | } 126 | 127 | /** 128 | * Get the metadata in the form of a hashtable (for example, title or author). 129 | * @return array 130 | */ 131 | public function getMetaData(){ 132 | return $this->metadata; 133 | } 134 | 135 | } 136 | ?> 137 | -------------------------------------------------------------------------------- /MOBIClass/OnlineArticle.php: -------------------------------------------------------------------------------- 1 | init(); 23 | if(!isset($this->metadata["title"])){ 24 | $this->metadata["title"] = CharacterEntities::convert(strip_tags($r->getTitle()->innerHTML)); 25 | } 26 | if(!isset($this->metadata["author"])){ 27 | $parts = parse_url($url); 28 | $this->metadata["author"] = $parts["host"]; 29 | } 30 | 31 | $article = $r->getContent()->innerHTML; 32 | if(substr($article, 0, 5) == "".$article.""; 34 | }else{ 35 | $article = "".$article.""; 36 | } 37 | $doc = new DOMDocument(); 38 | @$doc->loadHTML($article) or die($article); 39 | $doc->normalizeDocument(); 40 | 41 | $this->images = $this->handleImages($doc, $url); 42 | $this->text = $doc->saveHTML(); 43 | } 44 | 45 | /** 46 | * Get the text data to be integrated in the MOBI file 47 | * @return string 48 | */ 49 | public function getTextData(){ 50 | return $this->text; 51 | } 52 | /** 53 | * Get the images (an array containing the jpeg data). Array entry 0 will 54 | * correspond to image record 0. 55 | * @return array 56 | */ 57 | public function getImages(){ 58 | return $this->images; 59 | } 60 | /** 61 | * Get the metadata in the form of a hashtable (for example, title or author). 
62 | * @return array 63 | */ 64 | public function getMetaData(){ 65 | return $this->metadata; 66 | } 67 | /** 68 | * 69 | * @param DOMElement $dom 70 | * @return array 71 | */ 72 | private function handleImages($dom, $url){ 73 | $images = array(); 74 | 75 | $parts = parse_url($url); 76 | 77 | $savedImages = array(); 78 | 79 | $imgElements = $dom->getElementsByTagName('img'); 80 | foreach($imgElements as $img) { 81 | $src = $img->getAttribute("src"); 82 | 83 | $is_root = false; 84 | if(substr($src, 0, 1) == "/"){ 85 | $is_root = true; 86 | } 87 | 88 | $parsed = parse_url($src); 89 | 90 | if(!isset($parsed["host"])){ 91 | if($is_root){ 92 | $src = http_build_url($url, $parsed, HTTP_URL_REPLACE); 93 | }else{ 94 | $src = http_build_url($url, $parsed, HTTP_URL_JOIN_PATH); 95 | } 96 | } 97 | $img->setAttribute("src", ""); 98 | if(isset($savedImages[$src])){ 99 | $img->setAttribute("recindex", $savedImages[$src]); 100 | }else{ 101 | $image = ImageHandler::DownloadImage($src); 102 | 103 | if($image !== false){ 104 | $images[$this->imgCounter] = new FileRecord(new Record($image)); 105 | 106 | $img->setAttribute("recindex", $this->imgCounter); 107 | $savedImages[$src] = $this->imgCounter; 108 | $this->imgCounter++; 109 | } 110 | } 111 | } 112 | 113 | return $images; 114 | } 115 | } 116 | ?> 117 | -------------------------------------------------------------------------------- /MOBIClass/PalmRecord.php: -------------------------------------------------------------------------------- 1 | elements = new FileElement(array( 15 | "compression"=>new FileShort(), 16 | "unused"=>new FileShort(), 17 | "textLength"=>new FileInt(), 18 | "recordCount"=>new FileShort(), 19 | "recordSize"=>new FileShort(), 20 | "encryptionType"=>new FileShort(), 21 | "unused2"=>new FileShort(), 22 | //MOBI Header 23 | "mobiIdentifier"=>new FileString("MOBI", 4), 24 | "mobiHeaderLength"=>new FileInt(), 25 | "mobiType"=>new FileInt(), 26 | "textEncoding"=>new FileInt(), 27 | "uniqueID"=>new FileInt(), 28 | 
"fileVersion"=>new FileInt(), 29 | "reserved"=>new FileString(40), 30 | "firstNonBookIndex"=>new FileInt(), 31 | "fullNameOffset"=>new FileInt(), 32 | "fullNameLength"=>new FileInt(), 33 | "locale"=>new FileInt(), 34 | "inputLanguage"=>new FileInt(), 35 | "outputLanguage"=>new FileInt(), 36 | "minimumVersion"=>new FileInt(), 37 | "firstImageIndex"=>new FileInt(), 38 | "huffmanRecordOffset"=>new FileInt(), 39 | "huffmanRecordCount"=>new FileInt(), 40 | "unused3"=>new FileString(8), 41 | "exthFlags"=>new FileInt(0x40), 42 | "unknown"=>new FileString(32), 43 | "drmOffset"=>new FileInt(0xFFFFFFFF), 44 | "drmCount"=>new FileShort(0xFFFFFFFF), 45 | "drmSize"=>new FileShort(), 46 | "drmFlags"=>new FileInt(), 47 | "mobiFiller"=>new FileString(72), 48 | //EXTH Header 49 | "exthIdentifier"=>new FileString("EXTH", 4), 50 | "exthHeaderLength"=>new FileInt(), 51 | "exthRecordCount"=>new FileInt(), 52 | "exthRecords"=>new FileElement(), 53 | "exthPadding"=>new FileString(), 54 | //"fullNamePadding"=>new FileString(100), 55 | "fullName"=>new FileString() 56 | )); 57 | 58 | //Set values from the info block 59 | foreach($settings->values as $name => $val){ 60 | //echo $name.", "; 61 | if($this->elements->exists($name)){ 62 | $this->elements->get($name)->set($settings->get($name)); 63 | } 64 | } 65 | 66 | $els = $settings->values; 67 | 68 | $exthElems = new FileElement(); 69 | $i = 0; 70 | $l = 0; 71 | foreach($els as $name=>$val){ 72 | $type = EXTHHelper::textToType($name); 73 | if($type !== false){ 74 | $type = new FileInt($type); 75 | $length = new FileInt(8+strlen($val)); 76 | $data = new FileString($val); 77 | $l += 8+strlen($val); 78 | $exthElems->add("type".$i, $type); 79 | $exthElems->add("length".$i, $length); 80 | $exthElems->add("data".$i, $data); 81 | $i++; 82 | } 83 | } 84 | 85 | if($images > 0){ 86 | $this->elements->get("firstImageIndex")->set($textRecords+1); 87 | } 88 | $this->elements->get("firstNonBookIndex")->set($textRecords+2+$images); 89 | 
$this->elements->get("reserved")->set(str_pad("", 40, chr(255), STR_PAD_RIGHT)); 90 | $this->elements->get("exthRecordCount")->set($i); 91 | $this->elements->set("exthRecords", $exthElems); 92 | $pad = $l%4; 93 | $pad = (4-$pad)%4; 94 | $this->elements->get("exthPadding")->set(str_pad("", $pad, "\0", STR_PAD_RIGHT)); 95 | $this->elements->get("exthHeaderLength")->set(12+$l+$pad); 96 | 97 | 98 | $this->elements->get("recordCount")->set($textRecords); 99 | 100 | $this->elements->get("fullNameOffset")->set($this->elements->offsetToEntry("fullName")); 101 | $this->elements->get("fullNameLength")->set(strlen($settings->get("title"))); 102 | $this->elements->get("fullName")->set($settings->get("title")); 103 | $this->elements->get("textLength")->set($textLength); 104 | } 105 | 106 | public function getByteLength(){ 107 | return $this->getLength(); 108 | } 109 | 110 | public function getLength(){ 111 | return $this->elements->getByteLength(); 112 | } 113 | 114 | public function get(){ 115 | return $this; 116 | } 117 | 118 | public function set($elements){ 119 | throw new Exception("Unallowed set"); 120 | } 121 | 122 | public function serialize() { 123 | return $this->elements->serialize(); 124 | } 125 | 126 | public function unserialize($data) { 127 | $this->elements->unserialize($data); 128 | } 129 | 130 | public function __toString(){ 131 | $output = "PalmDoc Record (".$this->getByteLength()." 
bytes):\n"; 132 | $output .= $this->elements; 133 | return $output; 134 | } 135 | } 136 | ?> 137 | -------------------------------------------------------------------------------- /MOBIClass/Prc.php: -------------------------------------------------------------------------------- 1 | new FileString(32), 12 | "attributes"=>new FileShort(), 13 | "version"=>new FileShort(), 14 | "creationTime"=>new FileDate(), 15 | "modificationTime"=>new FileDate(), 16 | "backupTime"=>new FileDate(), 17 | "modificationNumber"=>new FileInt(), 18 | "appInfoID"=>new FileInt(), 19 | "sortInfoID"=>new FileInt(), 20 | "prcType"=>new FileString(4), 21 | "creator"=>new FileString(4), 22 | "uniqueIDSeed"=>new FileInt(), 23 | "nextRecordListID"=>new FileInt(), 24 | "numberRecords"=>new FileShort(), 25 | "recordList"=>new FileElement(), 26 | "filler"=>new FileShort(), 27 | "records"=>new FileElement() 28 | )); 29 | 30 | //Set values from the info block 31 | foreach($this->elements as $name => $val){ 32 | if($settings->exists($name)){ 33 | $this->get($name)->set($settings->get($name)); 34 | } 35 | } 36 | 37 | $this->get("numberRecords")->set(sizeof($records)); 38 | 39 | $i = 0; 40 | foreach($records as $record){ 41 | $offset = new FileInt(); 42 | $attr = new FileByte(); 43 | $uniqueID = new FileTri($i); 44 | 45 | $this->elements["recordList"]->add("Rec".$i, new FileElement(array( 46 | "offset"=>$offset, 47 | "attribute"=>$attr, 48 | "uniqueID"=>$uniqueID 49 | ))); 50 | 51 | $this->elements["records"]->add("Rec".$i, $record); 52 | $i++; 53 | } 54 | 55 | $this->updateOffsets($records); 56 | } 57 | 58 | public function getByteLength(){ 59 | throw new Exception("Test"); 60 | } 61 | 62 | public function updateOffsets($records){ 63 | $base = $this->offsetToEntry("records"); 64 | 65 | $i = 0; 66 | 67 | foreach($records as $record){ 68 | $el = $this->elements["recordList"]->get("Rec".$i); 69 | 70 | $local = $this->elements["records"]->offsetToEntry("Rec".$i); 71 | 72 | 
$el->get("offset")->set($base+$local); 73 | 74 | $i++; 75 | } 76 | } 77 | 78 | public function save($file){ 79 | $handle = fopen($file, "w"); 80 | fwrite($handle, $this->serialize()); 81 | fclose($handle); 82 | } 83 | 84 | public function output(){ 85 | echo $this->serialize(); 86 | } 87 | 88 | public function __toString(){ 89 | $output = "Prc (".$this->getByteLength()." bytes): {\n"; 90 | foreach($this->elements as $key=>$value){ 91 | $output .= "\t".$key.": ".$value."\n"; 92 | } 93 | $output .= "}"; 94 | return $output; 95 | } 96 | } 97 | ?> 98 | -------------------------------------------------------------------------------- /MOBIClass/PreprocessedArticle.php: -------------------------------------------------------------------------------- 1 | text = $textData; 16 | $this->metadata = $metadata; 17 | 18 | $this->images = $this->downloadImages($imageLinks); 19 | } 20 | 21 | /** 22 | * Create a Preprocessed article from a json string 23 | * @param string $json JSON data. Should be of the following format: 24 | * {"text": "TEXT", "images": ["imageURL1", "imageURL2"], "metadata": {"key": "value"}} 25 | * 26 | * Note: Any image tags should have the recindex attribute set to the appropriate index (the 27 | * same index as the image in the array) 28 | * @return PreprocessedArticle The generated preprocessed article * @throws Exception If the JSON cannot be decoded or has no "text" entry 29 | */ 30 | static public function CreateFromJson($json){ 31 | $data = json_decode($json, true); //Decode objects as associative arrays, because the fields are read with array syntax 32 | if(!is_array($data) || !isset($data["text"])) throw new Exception("Invalid article JSON"); return new PreprocessedArticle($data["text"], isset($data["images"]) ? $data["images"] : array(), isset($data["metadata"]) ? $data["metadata"] : array()); 33 | } 34 | 35 | /** 36 | * Get the text data to be integrated in the MOBI file 37 | * @return string 38 | */ 39 | public function getTextData(){ 40 | return $this->text; 41 | } 42 | /** 43 | * Get the images (an array containing the jpeg data). Array entry 0 will 44 | * correspond to image record 0. 45 | * @return array 46 | */ 47 | public function getImages(){ 48 | return $this->images; 49 | } 50 | /** 51 | * Get the metadata in the form of a hashtable (for example, title or author).
52 | * @return array 53 | */ 54 | public function getMetaData(){ 55 | return $this->metadata; 56 | } 57 | /** 58 | * 59 | * @param DOMElement $dom 60 | * @return array 61 | */ 62 | private function downloadImages($links){ 63 | $images = array(); 64 | foreach($links as $link) { 65 | $imgFile = @imagecreatefromstring(Http::Request($link)); 66 | 67 | if($imgFile === false){ 68 | $imgFile = @imagecreate(1, 1); 69 | $black = @imagecolorallocate($imgFile, 255, 255, 255); 70 | } 71 | if($imgFile !== false){ 72 | @imagefilter($imgFile, IMG_FILTER_GRAYSCALE); 73 | 74 | ob_start(); 75 | @imagejpeg($imgFile); 76 | $image = ob_get_contents(); 77 | ob_end_clean(); 78 | 79 | $images[$this->imgCounter] = new FileRecord(new Record($image)); 80 | imagedestroy($imgFile); 81 | 82 | $this->imgCounter++; 83 | } 84 | } 85 | 86 | return $images; 87 | } 88 | } 89 | ?> 90 | -------------------------------------------------------------------------------- /MOBIClass/RecognizeURL.php: -------------------------------------------------------------------------------- 1 | 17 | -------------------------------------------------------------------------------- /MOBIClass/Record.php: -------------------------------------------------------------------------------- 1 | data = $data; 27 | if($length >= 0){ 28 | $this->length = $length; 29 | }else{ 30 | $this->length = strlen($data); 31 | } 32 | } 33 | 34 | public function compress($compression_method){ 35 | switch($compression_method){ 36 | case NO_COMPRESSION: 37 | //Finished! 
38 | break; 39 | case PALMDOC_COMPRESSION: 40 | throw new Exception("Not implemented yet"); 41 | break; 42 | case HUFF: 43 | throw new Exception("Not implemented yet"); 44 | break; 45 | default: 46 | throw new Exception("Invalid argument"); 47 | } 48 | } 49 | 50 | public function getByteLength(){ 51 | return $this->getLength(); 52 | } 53 | 54 | /** 55 | * Get the length of the record 56 | * @return int Length of the data 57 | */ 58 | public function getLength(){ 59 | return $this->length; 60 | } 61 | 62 | /** 63 | * Get the data contained in the record 64 | * @return string Data contained in the record 65 | */ 66 | public function get(){ 67 | return $this->data; 68 | } 69 | 70 | /** 71 | * Set the data contained in the record 72 | * @param string $value Data contained in the record 73 | */ 74 | public function set($value){ 75 | $this->data = $value; 76 | } 77 | 78 | public function serialize(){ 79 | return $this->data; 80 | } /** Load the record from serialized data by re-running the constructor on this instance */ 81 | public function unserialize($data){ 82 | $this->__construct($data); //A bare __construct() call would be looked up as a global function and fail 83 | } 84 | 85 | public function __toString() { 86 | $toShow = $this->data; 87 | if(strlen($this->data) > 103){ 88 | $toShow = substr($this->data, 0, 100)."..."; 89 | } 90 | $out = "Record: {\n"; 91 | $out .= "\t".htmlspecialchars($toShow)."\n"; 92 | $out .= "}"; 93 | return $out; 94 | } 95 | } 96 | ?> 97 | -------------------------------------------------------------------------------- /MOBIClass/RecordFactory.php: -------------------------------------------------------------------------------- 1 | settings = $settings; 22 | } 23 | 24 | /** 25 | * Create records from a data string 26 | * @param string $data 27 | * @return array(Record) 28 | */ 29 | public function createRecords($data){ 30 | $records = array(); 31 | $size = $this->settings->get("recordSize"); 32 | $compression = $this->settings->get("compression"); 33 | 34 | $dataEntries = mobi_str_split_bytes($data, $size); //Byte-based splitter defined at the end of this file 35 | 36 | for($i = 0, $len = sizeof($dataEntries); $i < $len; $i++){ 37 | $cur = $dataEntries[$i]; 38 | 39 |
$dataEntries[$i] = $cur; 40 | $records[$i] = new Record($dataEntries[$i]); 41 | $records[$i]->compress($compression); 42 | } 43 | 44 | return $records; 45 | } 46 | 47 | public function createEOFRecord(){ 48 | return new Record("\xe9\x8e\x0d\x0a"); //The four marker bytes E9 8E 0D 0A; an integer here would be written out as decimal text 49 | } 50 | 51 | public function createFCISRecord($textLength){ 52 | $r = "FCIS"; 53 | $r .= $this->asString(20, 4); 54 | $r .= $this->asString(16, 4); 55 | $r .= $this->asString(1, 4); 56 | $r .= $this->asString(0, 4); 57 | $r .= $this->asString($textLength, 4); 58 | $r .= $this->asString(0, 4); 59 | $r .= $this->asString(32, 4); 60 | $r .= $this->asString(8, 4); 61 | $r .= $this->asString(1, 2); 62 | $r .= $this->asString(1, 2); 63 | $r .= $this->asString(0, 4); 64 | return new Record($r); 65 | } 66 | 67 | public function createFLISRecord(){ 68 | $r = "FLIS"; 69 | $r .= $this->asString(8, 4); 70 | $r .= $this->asString(65, 2); 71 | $r .= $this->asString(0, 2); 72 | $r .= $this->asString(0, 4); 73 | $r .= $this->asString(-1, 4); 74 | $r .= $this->asString(1, 2); 75 | $r .= $this->asString(3, 2); 76 | $r .= $this->asString(3, 4); 77 | $r .= $this->asString(1, 4); 78 | $r .= $this->asString(-1, 4); 79 | return new Record($r); 80 | } 81 | /** Encode $int as $size raw bytes, most significant byte first (-1 becomes all 0xFF bytes) */ 82 | private function asString($int, $size){ 83 | $out = ""; 84 | for($i = 0; $i < $size; $i++){ 85 | 86 | $byte = chr($int & 0xFF); //Raw byte instead of its two-digit hex text 87 | 88 | $out = $byte.$out; 89 | $int = $int >> 8; 90 | } 91 | return $out; 92 | } 93 | 94 | public function __toString() { 95 | $out = "Record Factory: {\n"; 96 | $out .= "\tRecord Size: ".$this->settings->get("recordSize")."\n"; 97 | $out .= "\tCompression: ".$this->settings->get("compression")."\n"; 98 | $out .= "}"; 99 | return $out; 100 | } 101 | } 102 | 103 | /** 104 | * Split string in chunks of at most split_length bytes, while respecting multi-byte 105 | * character boundaries. Named mobi_str_split_bytes because PHP 7.4+ ships a built-in mb_str_split() that splits by characters, and redeclaring it is a fatal error.
106 | */ 107 | function mobi_str_split_bytes($string, $split_length = 1){ 108 | mb_internal_encoding('UTF-8'); 109 | mb_regex_encoding('UTF-8'); 110 | 111 | $split_length = ($split_length <= 0) ? 1 : $split_length; 112 | 113 | $bytes = strlen($string); 114 | 115 | $array = array(); 116 | 117 | if ($split_length >= $bytes) { 118 | $array[] = $string; 119 | return $array; 120 | } 121 | 122 | $i = 0; 123 | while ($i < $bytes) { 124 | $cut_string = mb_strcut($string, $i, $split_length); 125 | $n_bytes = strlen($cut_string); 126 | 127 | if ($n_bytes == 0) { 128 | throw new Exception('Inifite loop in string split detected.'); 129 | } 130 | $array[] = $cut_string; 131 | $i += strlen($cut_string); 132 | } 133 | 134 | return $array; 135 | } /* Keep the old name available on PHP versions without the built-in */ if (!function_exists('mb_str_split')) { function mb_str_split($string, $split_length = 1){ return mobi_str_split_bytes($string, $split_length); } } 136 | ?> 137 | -------------------------------------------------------------------------------- /MOBIClass/Settings.php: -------------------------------------------------------------------------------- 1 | values = array( 24 | "attributes"=>0, 25 | "version"=>0, 26 | "creationTime"=>time()+94694400, 27 | "modificationTime"=>time()+94694400, 28 | "backupTime"=>0, 29 | "modificationNumber"=>0, 30 | "appInfoID"=>0, 31 | "sortInfoID"=>0, 32 | "prcType"=>"BOOK", 33 | "creator"=>"MOBI", 34 | "uniqueIDSeed"=>rand(), 35 | "nextRecordListID"=>0, 36 | "recordAttributes"=>0, 37 | "compression"=>NO_COMPRESSION, 38 | "recordSize"=>RECORD_SIZE, 39 | "encryptionType"=>NO_ENCRYPTION, 40 | "mobiIdentifier"=>"MOBI", 41 | "mobiHeaderLength"=>0xe8, 42 | "mobiType"=>MOBIPOCKET_BOOK, 43 | "textEncoding"=>UTF8, 44 | "uniqueID"=>rand(), 45 | "fileVersion"=>6, 46 | "locale"=>0x09, 47 | "inputLanguage"=>0, 48 | "outputLanguage"=>0, 49 | "minimumVersion"=>6, 50 | "huffmanRecordOffset"=>0, 51 | "huffmanRecordCount"=>0, 52 | "exthFlags"=>0x40, 53 | "drmOffset"=>0xFFFFFFFF, 54 | "drmCount"=>0, 55 | "drmSize"=>0, 56 | "drmFlags"=>0, 57 | "extraDataFlags"=>0, 58 | "exthIdentifier"=>"EXTH", 59 | // These can be changed without any risk 60 | "title"=>"Unknown title", 61 |
"author"=>"Unknown author", 62 | "subject"=>"Unknown subject" 63 | ); 64 | 65 | foreach($additionalSettings as $key=>$value){ 66 | $this->values[$key] = $value; 67 | } 68 | } 69 | 70 | /** 71 | * Get a value from the settings 72 | * @param string $key Key of the setting 73 | * @return mixed The value of the setting 74 | */ 75 | public function get($key){ 76 | return $this->values[$key]; 77 | } 78 | 79 | /** 80 | * Checks if a value is set 81 | * @param string $key Key of the setting 82 | * @return bool True if the value exists 83 | */ 84 | public function exists($key){ 85 | return isset($this->values[$key]); 86 | } 87 | 88 | public function __toString() { 89 | $out = "Settings: {\n"; 90 | foreach($this->values as $key=>$value){ 91 | $out .= "\t".$key.": ".$value."\n"; 92 | } 93 | $out .= "}"; 94 | return $out; 95 | } 96 | } 97 | ?> 98 | -------------------------------------------------------------------------------- /MOBIClass/constants.php: -------------------------------------------------------------------------------- 1 | id = intval(substr($ending, 0, strpos($ending, "/"))); 17 | 18 | for($i = 1; $i <= max(1, $this->chapterCount); $i++){ 19 | $this->addChapter($i); 20 | } 21 | } 22 | 23 | private function addChapter($n){ 24 | $doc = new DOMDocument(); 25 | $file = Http::Request(self::$prefix.$this->id."/".$n."/"); 26 | @$doc->loadHTML($file) or die($file); 27 | 28 | if(!$this->downloadedMetadata){ 29 | $this->loadMetadata($doc); 30 | $this->downloadedMetadata = true; 31 | } 32 | if($this->chapterCount < 0){ 33 | $this->chapterCount = $this->getNumberChapters($doc); 34 | 35 | if($this->chapterCount > 4){ 36 | die("Too many files to download, don't use php for this!"); 37 | } 38 | } 39 | 40 | $textEl = $doc->getElementById("storytext"); 41 | if($textEl == null) die("Error: ".$doc->saveHTML()); 42 | $horizontalRulebars = $doc->getElementsByTagName('hr'); 43 | /** 44 | * @var DOMNode 45 | */ 46 | $hr; 47 | foreach($horizontalRulebars as $hr) { 48 | 
$hr->setAttribute("size", null); 49 | $hr->setAttribute("noshade", null); 50 | } 51 | $text = $this->innerHtml($textEl); 52 | 53 | $title = ""; 54 | $selects = $doc->getElementsByTagName('select'); 55 | foreach($selects as $select) { 56 | if($select->hasAttribute("name") && $select->getAttribute("name") == "chapter"){ 57 | $options = $select->getElementsByTagName("option"); 58 | 59 | $test = $n.". "; 60 | foreach($options as $option){ 61 | $val = $option->nodeValue; 62 | if(substr($val, 0, strlen($test)) == $test){ 63 | $title = substr($val, strlen($test)); 64 | break; 65 | } 66 | } 67 | break; 68 | } 69 | } 70 | $this->addPage($text, $title); 71 | } 72 | 73 | private function getNumberChapters($doc){ 74 | $selects = $doc->getElementsByTagName('select'); 75 | foreach($selects as $select) { 76 | if($select->hasAttribute("name") && $select->getAttribute("name") == "chapter"){ 77 | $options = $select->getElementsByTagName("option"); 78 | 79 | $count = $options->length; 80 | return $count; 81 | } 82 | } 83 | } 84 | 85 | private function loadMetadata($doc){ 86 | //Author 87 | $links = $doc->getElementsByTagName('a'); 88 | foreach($links as $link) { 89 | if($link == null){ 90 | var_dump($link); 91 | } 92 | if($link->hasAttribute("href") && substr($link->getAttribute("href"), 0, 3) == "/u/"){ 93 | $this->setMetadata("author", $link->nodeValue); 94 | } 95 | } 96 | //Title 97 | /* 98 | $links = $doc->getElementsByTagName('link'); 99 | foreach($links as $link) { 100 | if($link->hasAttribute("rel") && $link->getAttribute("rel") == "canonical"){ 101 | $url = $link->getAttribute("href"); 102 | $title = str_replace("_", " ", substr($url, strrpos($url, "/")+1)); 103 | $this->setMetadata("title", $title); 104 | } 105 | }*/ 106 | 107 | //TODO: Find a more reliable way to extract the title 108 | $title = $doc->getElementsByTagName("b")->item(0)->nodeValue; 109 | $this->setMetadata("title", $title); 110 | } 111 | 112 | private function innerHtml($node){ 113 | $doc = new 
DOMDocument(); 114 | foreach ($node->childNodes as $child) 115 | $doc->appendChild($doc->importNode($child, true)); 116 | 117 | return $doc->saveHTML(); 118 | } 119 | 120 | public static function Matches($url){ 121 | //TODO: Implement with regex 122 | return strpos($url, self::$prefix) !== false; 123 | } 124 | } 125 | ?> 126 | -------------------------------------------------------------------------------- /MOBIClass/http_build_url.php: -------------------------------------------------------------------------------- 1 | registerNodeClass('DOMElement', 'JSLikeHTMLElement'); 16 | * $doc->loadHTML('
"; 109 | for($i = 0, $len = sizeof($this->toc); $i < $len; $i++){ 110 | $entry = $this->toc[$i]; 111 | $position = $entry["pos"]+$base; 112 | $toc .= " "; 113 | } 114 | $toc .= " ".($i+1).". ".$entry["title"]."
Para 1
Para 2
Para 1
Para 2
' 21 | * echo "\n\n"; 22 | * 23 | * // set innerHTML 24 | * $elem->innerHTML = 'FiveFilters.org'; 25 | * echo $elem->innerHTML; // prints 'FiveFilters.org' 26 | * echo "\n\n"; 27 | * 28 | * // print document (with our changes) 29 | * echo $doc->saveXML(); 30 | * @endcode 31 | * 32 | * @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net 33 | * @see http://fivefilters.org (the project this was written for) 34 | */ 35 | class JSLikeHTMLElement extends DOMElement 36 | { 37 | /** 38 | * Used for setting innerHTML like it's done in JavaScript: 39 | * @code 40 | * $div->innerHTML = 'The story begins...
'; 41 | * @endcode 42 | */ 43 | public function __set($name, $value) { 44 | if ($name == 'innerHTML') { 45 | // first, empty the element 46 | for ($x=$this->childNodes->length-1; $x>=0; $x--) { 47 | $this->removeChild($this->childNodes->item($x)); 48 | } 49 | // $value holds our new inner HTML 50 | if ($value != '') { 51 | $f = $this->ownerDocument->createDocumentFragment(); 52 | // appendXML() expects well-formed markup (XHTML) 53 | $result = @$f->appendXML($value); // @ to suppress PHP warnings 54 | if ($result) { 55 | if ($f->hasChildNodes()) $this->appendChild($f); 56 | } else { 57 | // $value is probably ill-formed 58 | $f = new DOMDocument(); 59 | $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8'); 60 | // Using', $html); 114 | $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); 115 | $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); 116 | $this->dom = new DOMDocument(); 117 | $this->dom->preserveWhiteSpace = false; 118 | $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); 119 | @$this->dom->loadHTML($html); 120 | $this->url = $url; 121 | } 122 | 123 | /** 124 | * Get article title element 125 | * @return DOMElement 126 | */ 127 | public function getTitle() { 128 | return $this->articleTitle; 129 | } 130 | 131 | /** 132 | * Get article content element 133 | * @return DOMElement 134 | */ 135 | public function getContent() { 136 | return $this->articleContent; 137 | } 138 | 139 | /** 140 | * Runs readability. 141 | * 142 | * Workflow: 143 | * 1. Prep the document by removing script tags, css, etc. 144 | * 2. Build readability's DOM tree. 145 | * 3. Grab the article content from the current dom tree. 146 | * 4. Replace the current DOM tree with the new one. 147 | * 5. Read peacefully. 
148 | * 149 | * @return boolean true if we found content, false otherwise 150 | **/ 151 | public function init() 152 | { 153 | $this->removeScripts($this->dom); 154 | 155 | // Assume successful outcome 156 | $this->success = true; 157 | 158 | $bodyElems = $this->dom->getElementsByTagName('body'); 159 | if ($bodyElems->length > 0) { 160 | if ($this->bodyCache == null) { 161 | $this->bodyCache = $bodyElems->item(0)->innerHTML; 162 | } 163 | if ($this->body == null) { 164 | $this->body = $bodyElems->item(0); 165 | } 166 | } 167 | 168 | $this->prepDocument(); 169 | 170 | //die($this->dom->documentElement->parentNode->nodeType); 171 | //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); 172 | //die($this->getInnerHTML($this->dom->documentElement)); 173 | 174 | /* Build readability's DOM tree */ 175 | $overlay = $this->dom->createElement('div'); 176 | $innerDiv = $this->dom->createElement('div'); 177 | $articleTitle = $this->getArticleTitle(); 178 | $articleContent = $this->grabArticle(); 179 | 180 | if (!$articleContent) { 181 | $this->success = false; 182 | $articleContent = $this->dom->createElement('div'); 183 | $articleContent->setAttribute('id', 'readability-content'); 184 | $articleContent->innerHTML = '
Sorry, Readability was unable to parse this page for content.
'; 185 | } 186 | 187 | $overlay->setAttribute('id', 'readOverlay'); 188 | $innerDiv->setAttribute('id', 'readInner'); 189 | 190 | /* Glue the structure of our document together. */ 191 | $innerDiv->appendChild($articleTitle); 192 | $innerDiv->appendChild($articleContent); 193 | $overlay->appendChild($innerDiv); 194 | 195 | /* Clear the old HTML, insert the new content. */ 196 | $this->body->innerHTML = ''; 197 | $this->body->appendChild($overlay); 198 | //document.body.insertBefore(overlay, document.body.firstChild); 199 | $this->body->removeAttribute('style'); 200 | 201 | $this->postProcessContent($articleContent); 202 | 203 | // Set title and content instance variables 204 | $this->articleTitle = $articleTitle; 205 | $this->articleContent = $articleContent; 206 | 207 | return $this->success; 208 | } 209 | 210 | /** 211 | * Debug 212 | */ 213 | protected function dbg($msg) { 214 | if ($this->debug) echo '* ',$msg, '').replace(readability.regexps.replaceFonts, '<$1span>'); 311 | // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree. 312 | // Manipulating innerHTML as it's done in JS is not possible in PHP. 313 | } 314 | 315 | /** 316 | * For easier reading, convert this document to have footnotes at the bottom rather than inline links. 317 | * @see http://www.roughtype.com/archives/2010/05/experiments_in.php 318 | * 319 | * @return void 320 | **/ 321 | public function addFootnotes($articleContent) { 322 | $footnotesWrapper = $this->dom->createElement('div'); 323 | $footnotesWrapper->setAttribute('id', 'readability-footnotes'); 324 | $footnotesWrapper->innerHTML = '
tags, etc.
405 | *
406 | * @param DOMElement
407 | * @return void
408 | */
public function prepArticle($articleContent) {
    // Strip inline styles and collapse runs of breaks before any structural cleaning.
    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);
    if ($this->revertForcedParagraphElements) {
        $this->revertReadabilityStyledElements($articleContent);
    }

    /* Clean out junk from the article content */
    $this->cleanConditionally($articleContent, 'form');
    $this->clean($articleContent, 'object');
    $this->clean($articleContent, 'h1');

    /**
     * If there is only one h2, they are probably using it
     * as a header and not a subheader, so remove it since we already have a header.
     **/
    if ($articleContent->getElementsByTagName('h2')->length == 1) {
        $this->clean($articleContent, 'h2');
    }
    $this->clean($articleContent, 'iframe');

    $this->cleanHeaders($articleContent);

    /* Do these last as the previous stuff may have removed junk that will affect these */
    $this->cleanConditionally($articleContent, 'table');
    $this->cleanConditionally($articleContent, 'ul');
    $this->cleanConditionally($articleContent, 'div');

    /* Remove extra paragraphs: those with no text and no img/embed/object inside. */
    $articleParagraphs = $articleContent->getElementsByTagName('p');
    // Walk backwards because the node list is live and shrinks as paragraphs are removed.
    for ($i = $articleParagraphs->length - 1; $i >= 0; $i--) {
        $paragraph = $articleParagraphs->item($i);

        $imgCount    = $paragraph->getElementsByTagName('img')->length;
        $embedCount  = $paragraph->getElementsByTagName('embed')->length;
        $objectCount = $paragraph->getElementsByTagName('object')->length;

        if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $this->getInnerText($paragraph, false) == '') {
            $paragraph->parentNode->removeChild($paragraph);
        }
    }

    try {
        // Drop any <br> that sits directly in front of a paragraph opening tag.
        // NOTE(review): this statement was damaged in the source text (markup-like
        // characters were stripped); the pattern and replacement were restored from
        // the surviving fragments - confirm against the upstream Readability port.
        $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
    }
    catch (Exception $e) {
        $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
    }
}
459 |
/**
 * Give a node its starting content score, stored in a 'readability' attribute.
 *
 * The score starts from a per-tag base value (0 for tags not listed below)
 * and is then adjusted by the node's class/id weight.
 *
 * @param DOMElement $node element to initialise
 * @return void
 **/
protected function initializeNode($node) {
    // Base score by upper-cased tag name.
    $baseScores = array(
        'DIV'        => 5,

        'PRE'        => 3,
        'TD'         => 3,
        'BLOCKQUOTE' => 3,

        'ADDRESS'    => -3,
        'OL'         => -3,
        'UL'         => -3,
        'DL'         => -3,
        'DD'         => -3,
        'DT'         => -3,
        'LI'         => -3,
        'FORM'       => -3,

        'H1'         => -5,
        'H2'         => -5,
        'H3'         => -5,
        'H4'         => -5,
        'H5'         => -5,
        'H6'         => -5,
        'TH'         => -5,
    );

    $tag   = strtoupper($node->tagName);
    $score = isset($baseScores[$tag]) ? $baseScores[$tag] : 0;
    $score += $this->getClassWeight($node);

    // The attribute value is our contentScore.
    $scoreAttr = $this->dom->createAttribute('readability');
    $scoreAttr->value = (string) $score;
    $node->setAttributeNode($scoreAttr);
}
506 |
/**
 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
 *
 * Steps, in order:
 *  1. Prepare nodes: optionally remove "unlikely" elements, turn DIVs without block-level
 *     children into P elements, and wrap bare text inside other DIVs in styled P elements.
 *  2. Score each P/TD/PRE and add that score to its parent (full) and grandparent (half).
 *  3. Scale candidate scores by link density and pick the highest one.
 *  4. Collect the top candidate plus qualifying siblings into a 'readability-content' div.
 *  5. Clean the result with prepArticle(). If fewer than 250 bytes of text remain, restore
 *     the body from the cache and retry with one flag removed.
 *
 * @param DOMNode|null $page node to search; the whole document ($this->dom) when omitted
 * @return DOMElement|false the content div, or false when every retry has been exhausted
 **/
protected function grabArticle($page=null) {
    $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
    if (!$page) $page = $this->dom;
    $allElements = $page->getElementsByTagName('*');

    /**
     * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
     * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
     *
     * $allElements is a live list, so $nodeIndex is stepped back whenever the current node
     * is removed or replaced; the next iteration then looks at the same position again.
     **/
    $node = null;
    $nodesToScore = array();
    for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
        $tagName = strtoupper($node->tagName);

        /* Remove unlikely candidates */
        if ($stripUnlikelyCandidates) {
            $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
            if (
                preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
                !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
                $tagName != 'BODY'
            )
            {
                $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
                $node->parentNode->removeChild($node);
                $nodeIndex--;
                continue;
            }
        }

        if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
            $nodesToScore[] = $node;
        }

        /* Turn all divs that don't have children block level elements into p's */
        if ($tagName == 'DIV') {
            if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
                $newNode = $this->dom->createElement('p');
                try {
                    $newNode->innerHTML = $node->innerHTML;
                    $node->parentNode->replaceChild($newNode, $node);
                    // Re-visit this index: it now holds the new P, which the P/TD/PRE check
                    // above queues for scoring. The old DIV is detached (no parent), so
                    // queuing it would only add an entry the scoring loop skips.
                    $nodeIndex--;
                }
                catch(Exception $e) {
                    $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
                }
            }
            else
            {
                /* EXPERIMENTAL */
                // TODO: change these p elements back to text nodes after processing
                for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
                    $childNode = $node->childNodes->item($i);
                    if ($childNode->nodeType === XML_TEXT_NODE) {
                        // Wrap the text in a styled P. The text is copied as a text node,
                        // not assigned as innerHTML: nodeValue is already-decoded text, and
                        // re-parsing it as markup would corrupt any '<' or '&' it contains.
                        $p = $this->dom->createElement('p');
                        $p->appendChild($this->dom->createTextNode($childNode->nodeValue));
                        $p->setAttribute('style', 'display: inline;');
                        $p->setAttribute('class', 'readability-styled');
                        $childNode->parentNode->replaceChild($p, $childNode);
                    }
                }
            }
        }
    }

    /**
     * Loop through all paragraphs, and assign a score to them based on how content-y they look.
     * Then add their score to their parent node.
     *
     * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
     **/
    $candidates = array();
    foreach ($nodesToScore as $nodeToScore) {
        $parentNode = $nodeToScore->parentNode;
        // Only an element grandparent can carry a score attribute.
        $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
        $innerText = $this->getInnerText($nodeToScore);

        if (!$parentNode || !isset($parentNode->tagName)) {
            continue;
        }

        /* If this paragraph is less than 25 characters, don't even count it. */
        if (strlen($innerText) < 25) {
            continue;
        }

        /* Initialize readability data for the parent. */
        if (!$parentNode->hasAttribute('readability'))
        {
            $this->initializeNode($parentNode);
            $candidates[] = $parentNode;
        }

        /* Initialize readability data for the grandparent. */
        if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
        {
            $this->initializeNode($grandParentNode);
            $candidates[] = $grandParentNode;
        }

        $contentScore = 0;

        /* Add a point for the paragraph itself as a base. */
        $contentScore++;

        /* Add points for the comma-separated pieces of this paragraph (number of commas + 1). */
        $contentScore += count(explode(',', $innerText));

        /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
        $contentScore += min(floor(strlen($innerText) / 100), 3);

        /* Add the score to the parent. The grandparent gets half. */
        $parentNode->getAttributeNode('readability')->value += $contentScore;

        if ($grandParentNode) {
            $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
        }
    }

    /**
     * After we've calculated scores, loop through all of the possible candidate nodes we found
     * and find the one with the highest score.
     **/
    $topCandidate = null;
    for ($c=0, $cl=count($candidates); $c < $cl; $c++)
    {
        /**
         * Scale the final candidates score based on link density. Good content should have a
         * relatively small link density (5% or less) and be mostly unaffected by this operation.
         **/
        $readability = $candidates[$c]->getAttributeNode('readability');
        $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));

        $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);

        if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {
            $topCandidate = $candidates[$c];
        }
    }

    /**
     * If we still have no top candidate, just use the body as a last resort.
     * We also have to copy the body node so it is something we can modify.
     **/
    if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
    {
        $topCandidate = $this->dom->createElement('div');
        $topCandidate->innerHTML = ($page instanceof DOMDocument) ? $page->saveXML($page->documentElement) : $page->innerHTML;
        $page->innerHTML = '';
        $page->appendChild($topCandidate);
        $this->initializeNode($topCandidate);
    }

    /**
     * Now that we have the top candidate, look through its siblings for content that might also be related.
     * Things like preambles, content split by ads that we removed, etc.
     **/
    $articleContent = $this->dom->createElement('div');
    $articleContent->setAttribute('id', 'readability-content');
    $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);
    $siblingNodes = $topCandidate->parentNode->childNodes;

    for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
    {
        $siblingNode = $siblingNodes->item($s);
        $append = false;

        $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));

        if ($siblingNode === $topCandidate)
        {
            $append = true;
        }

        $contentBonus = 0;
        /* Give a bonus if sibling nodes and top candidates have the exact same classname */
        if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
            $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
        }

        if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
        {
            $append = true;
        }

        if (strtoupper($siblingNode->nodeName) == 'P') {
            $linkDensity = $this->getLinkDensity($siblingNode);
            $nodeContent = $this->getInnerText($siblingNode);
            $nodeLength  = strlen($nodeContent);

            if ($nodeLength > 80 && $linkDensity < 0.25)
            {
                $append = true;
            }
            else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
            {
                $append = true;
            }
        }

        if ($append)
        {
            $this->dbg('Appending node: ' . $siblingNode->nodeName);

            $nodeToAppend = null;
            $sibNodeName = strtoupper($siblingNode->nodeName);
            if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
                /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */

                $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
                $nodeToAppend = $this->dom->createElement('div');
                try {
                    $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
                    $nodeToAppend->innerHTML = $siblingNode->innerHTML;
                }
                catch(Exception $e)
                {
                    $this->dbg('Could not alter siblingNode to div, reverting back to original.');
                    $nodeToAppend = $siblingNode;
                    $s--;
                    $sl--;
                }
            } else {
                // The sibling itself is moved out of the live list below, so step the
                // index and the cached length back by one.
                $nodeToAppend = $siblingNode;
                $s--;
                $sl--;
            }

            /* To ensure a node does not interfere with readability styles, remove its classnames */
            $nodeToAppend->removeAttribute('class');

            /* Append sibling and subtract from our list because it removes the node when you append to another node */
            $articleContent->appendChild($nodeToAppend);
        }
    }

    /**
     * So we have all of the content that we need. Now we clean it up for presentation.
     **/
    $this->prepArticle($articleContent);

    /**
     * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
     * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
     * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
     * finding the -right- content.
     **/
    if (strlen($this->getInnerText($articleContent, false)) < 250)
    {
        // Undo this pass's DOM edits before trying again.
        $this->body->innerHTML = $this->bodyCache;

        if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
            $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
            return $this->grabArticle($this->body);
        }
        else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
            $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
            return $this->grabArticle($this->body);
        }
        else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
            $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
            return $this->grabArticle($this->body);
        }
        else {
            return false;
        }
    }
    return $articleContent;
}
794 |
/**
 * Strip every script element found under the given node.
 *
 * @param DOMDocument|DOMElement $doc node whose script descendants are removed
 * @return void
 */
public function removeScripts($doc) {
    $scriptNodes = $doc->getElementsByTagName('script');

    // The list is live: each removal shifts the next script into position 0.
    while ($scriptNodes->length > 0) {
        $script = $scriptNodes->item(0);
        $script->parentNode->removeChild($script);
    }
}
808 |
/**
 * Return the trimmed text content of a node.
 *
 * When $normalizeSpaces is true, every match of the 'normalize' pattern
 * in the trimmed text is replaced by a single space.
 *
 * @param DOMElement $e
 * @param boolean $normalizeSpaces (default: true)
 * @return string empty string when the node has no text content
 **/
public function getInnerText($e, $normalizeSpaces=true) {
    $hasText = isset($e->textContent) && $e->textContent != '';
    if (!$hasText) {
        return '';
    }

    $trimmed = trim($e->textContent);

    return $normalizeSpaces
        ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
        : $trimmed;
}
832 |
/**
 * Count how often the string $s occurs in the inner text of node $e.
 *
 * @param DOMElement $e
 * @param string $s what to count. Default is ","
 * @return number (integer)
 **/
public function getCharCount($e, $s=',') {
    $text = $this->getInnerText($e);

    return substr_count($text, $s);
}
843 |
/**
 * Remove the style attribute from every element underneath $e.
 * Only descendants are visited; $e's own attributes are left as they are.
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanStyles($e) {
    $descendants = $e->getElementsByTagName('*');

    for ($idx = 0, $total = $descendants->length; $idx < $total; $idx++) {
        $descendants->item($idx)->removeAttribute('style');
    }
}
856 |
/**
 * Get the share of a node's text that sits inside links.
 * This is the summed text length of all 'a' descendants divided by the
 * text length of the node itself.
 *
 * @param DOMElement $e
 * @return number the ratio, or 0 when the node has no text
 */
public function getLinkDensity($e) {
    $anchors     = $e->getElementsByTagName('a');
    $totalLength = strlen($this->getInnerText($e));

    $linkedLength = 0;
    foreach ($anchors as $anchor) {
        $linkedLength += strlen($this->getInnerText($anchor));
    }

    // Guard against dividing by zero for nodes without text.
    if ($totalLength <= 0) {
        return 0;
    }

    return $linkedLength / $totalLength;
}
878 |
/**
 * Weigh an element by its class and id attributes.
 *
 * Each non-empty attribute loses 25 points when it matches the 'negative'
 * pattern and gains 25 when it matches the 'positive' pattern; both can
 * apply to the same attribute. Returns 0 whenever FLAG_WEIGHT_CLASSES is off.
 *
 * @param DOMElement $e
 * @return number (Integer)
 */
public function getClassWeight($e) {
    if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
        return 0;
    }

    $weight = 0;

    /* Look for a special classname, then for a special ID */
    foreach (array('class', 'id') as $attrName) {
        if (!$e->hasAttribute($attrName)) {
            continue;
        }

        $attrValue = $e->getAttribute($attrName);
        if ($attrValue == '') {
            continue;
        }

        if (preg_match($this->regexps['negative'], $attrValue)) {
            $weight -= 25;
        }
        if (preg_match($this->regexps['positive'], $attrValue)) {
            $weight += 25;
        }
    }

    return $weight;
}
916 |
/**
 * Remove extraneous break tags from a node.
 *
 * Every match of the 'killBreaks' pattern in the node's markup is replaced
 * by a single break tag, and the result is written back to the node.
 *
 * @param DOMElement $node
 * @return void
 */
public function killBreaks($node) {
    $html = $node->innerHTML;
    // NOTE(review): the replacement literal was lost from the source text (markup-like
    // characters were stripped); '<br />' is restored here - confirm against the
    // upstream Readability port.
    $html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
    $node->innerHTML = $html;
}
928 |
/**
 * Remove every descendant of $e with the tag name $tag.
 *
 * For 'object' and 'embed' tags an element is kept when its attribute values
 * or its inner markup match the 'video' pattern.
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function clean($e, $tag) {
    $matches      = $e->getElementsByTagName($tag);
    $checkIfVideo = ($tag == 'object' || $tag == 'embed');

    // Backwards, so removals from the live list do not disturb the positions still to visit.
    for ($pos = $matches->length - 1; $pos >= 0; $pos--) {
        $target = $matches->item($pos);

        if ($checkIfVideo) {
            /* Gather all attribute values into one '|'-separated string */
            $joinedValues = '';
            foreach ($target->attributes as $attribute) {
                $joinedValues .= $attribute->value . '|';
            }

            /* Keep the element if its attributes, or failing that its contents, look like a video */
            if (preg_match($this->regexps['video'], $joinedValues)
                || preg_match($this->regexps['video'], $target->innerHTML)) {
                continue;
            }
        }

        $target->parentNode->removeChild($target);
    }
}
962 |
/**
 * Remove descendants of $e with tag name $tag when they look like junk.
 *
 * An element goes when its class weight plus stored 'readability' score is
 * negative. Otherwise, if its text holds fewer than 10 commas, it is judged on
 * counts of p/img/li/input/video-embed descendants, link density and text length.
 * Does nothing while FLAG_CLEAN_CONDITIONALLY is off.
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function cleanConditionally($e, $tag) {
    if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
        return;
    }

    $tagsList = $e->getElementsByTagName($tag);

    /**
     * Traverse backwards so we can remove nodes at the same time without affecting the traversal.
     *
     * TODO: Consider taking into account original contentScore here.
     */
    for ($pos = $tagsList->length - 1; $pos >= 0; $pos--) {
        $element  = $tagsList->item($pos);
        $weight   = $this->getClassWeight($element);
        $hasScore = $element->hasAttribute('readability');
        $contentScore = $hasScore ? (int)$element->getAttribute('readability') : 0;

        $this->dbg('Cleaning Conditionally ' . $element->tagName . ' (' . $element->getAttribute('class') . ':' . $element->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $element->getAttribute('readability')) : ''));

        if ($weight + $contentScore < 0) {
            $element->parentNode->removeChild($element);
            continue;
        }

        // Plenty of commas: treat as real content and leave it alone.
        if ($this->getCharCount($element, ',') >= 10) {
            continue;
        }

        /**
         * Not very many commas: remove the element when non-paragraph
         * elements outnumber paragraphs or other ominous signs show up.
         **/
        $paragraphCount = $element->getElementsByTagName('p')->length;
        $imageCount     = $element->getElementsByTagName('img')->length;
        $listItemCount  = $element->getElementsByTagName('li')->length - 100;
        $inputCount     = $element->getElementsByTagName('input')->length;

        /* Only embeds whose src matches the video pattern are counted */
        $videoEmbedCount = 0;
        foreach ($element->getElementsByTagName('embed') as $embed) {
            if (preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
                $videoEmbedCount++;
            }
        }

        $linkDensity   = $this->getLinkDensity($element);
        $contentLength = strlen($this->getInnerText($element));

        $toRemove =
            ($imageCount > $paragraphCount)
            || ($listItemCount > $paragraphCount && $tag != 'ul' && $tag != 'ol')
            || ($inputCount > floor($paragraphCount / 3))
            || ($contentLength < 25 && ($imageCount === 0 || $imageCount > 2))
            || ($weight < 25 && $linkDensity > 0.2)
            || ($weight >= 25 && $linkDensity > 0.5)
            || (($videoEmbedCount == 1 && $contentLength < 75) || $videoEmbedCount > 1);

        if ($toRemove) {
            $element->parentNode->removeChild($element);
        }
    }
}
1039 |
/**
 * Drop spurious h1 and h2 headers from an element: those with a negative
 * class weight or a link density above 0.33.
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanHeaders($e) {
    foreach (array('h1', 'h2') as $headerTag) {
        $headers = $e->getElementsByTagName($headerTag);

        // Backwards, because the list is live and shrinks on removal.
        for ($pos = $headers->length - 1; $pos >= 0; $pos--) {
            $header = $headers->item($pos);

            $looksSpurious = $this->getClassWeight($header) < 0
                || $this->getLinkDensity($header) > 0.33;

            if ($looksSpurious) {
                $header->parentNode->removeChild($header);
            }
        }
    }
}
1056 |
public function flagIsActive($flag) {
    // True when the bitwise AND of $flag and the current flags is greater than zero.
    $shared = $this->flags & $flag;

    return $shared > 0;
}
1060 |
public function addFlag($flag) {
    // Switch on the bits of $flag, leaving the others untouched.
    $this->flags |= $flag;
}
1064 |
public function removeFlag($flag) {
    // Switch off the bits of $flag, leaving the others untouched.
    $this->flags &= ~$flag;
}
1068 | }
1069 | ?>
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | phpMobi file generator
2 | ======================
3 |
4 | phpMobi is a php script that can generate .mobi files from valid html
5 | files. While this was meant as an experiment, this tool works quite
6 | well and can be used to generate mobipocket files from most news articles.
7 |
8 | IMPORTANT: Do NOT use this on a public web server: most of it was coded in
9 | a weekend, with no testing and no special attention to security. Also, as no official
10 | documentation for the MOBI file format is available, there will be some bugs/problems
11 | in the generated files, but it works for relatively simple documents on the Kindle
12 | previewer and the Kindle 3.
13 |
14 | MobiPocket is an eBook format created by Mobipocket SA. This tool also
15 | uses a php readability port made by [Keyvan Minoukadeh](http://www.keyvan.net/2010/08/php-readability/).
16 |
17 | Code sample
18 | ------------
19 |
20 | See index.php for an example of using this program.
21 |
22 | Sending an online article as a download:
23 |
24 | //Create the MOBI object
25 | $mobi = new MOBI();
26 |
27 | //Set the content provider
28 | $content = new OnlineArticle("URL");
29 | $mobi->setContentProvider($content);
30 |
31 | //Get title and make it a 12 character long url-safe filename
32 | $title = $mobi->getTitle();
33 | if($title === false)
34 | $title = "file";
35 |
36 | $title = urlencode(str_replace(" ", "_", strtolower(substr($title, 0, 12))));
37 |
38 | //Send the mobi file as download
39 | $mobi->download($title.".mobi");
40 |
41 | Using a previously generated/downloaded html file (will not download any images!):
42 |
43 | $data = "...";
44 | $options = array(
45 | "title" => "Local document",
46 | "author" => "Author name",
47 | "subject" => "Subject"
48 | );
49 |
50 | //Create the MOBI object
51 | $mobi = new MOBI();
52 |
53 | //Set the data
54 | $mobi->setData($data);
55 | $mobi->setOptions($options);
56 |
57 | //Save the mobi file locally
58 | $mobi->save($options["title"].".mobi");
59 |
60 | Implementation
61 | --------------
62 |
63 | This code was implemented while reverse-engineering the MobiPocket format.
64 | Therefore this code absolutely isn't optimized for speed, but rather for
65 | easy changes, as getting it to produce valid files was quite fiddly.
66 |
67 | Features
68 | --------
69 |
70 | Modular content provider system:
71 | Adding a new data source can be done by extending the ContentProvider
72 | class. See the OnlineArticle class for a simple but complete
73 | implementation of such a system.
74 |
75 | Image support:
76 | By default, the online article downloader (and any other content
77 | provider that supports images) will download images and integrate them
78 | into the mobi file.
79 |
80 | Partial UTF-8 support:
81 | In practice UTF-8 just works, but there are some unhandled corner
82 | cases (see missing features).
83 |
84 | Missing Features
85 | ----------------
86 |
87 | Compression:
88 | This won't be implemented (or if it is, only to serve as a
89 | reference of the format).
90 |
91 | Different eBook types:
92 | MobiPocket supports other formats/layouts, such as newspaper-like
93 | formats. At the moment only the book layout has been implemented.
94 |
95 | Full UTF-8 support:
96 | UTF-8 should work most of the time (it worked every time I
97 | tested it), but there might be some problems when the character
98 | is split over two "records".
99 |
100 | License
101 | -------
102 | This code is released under the Apache license (version 2.0)
103 |
--------------------------------------------------------------------------------
/index.php:
--------------------------------------------------------------------------------
1 | setContentProvider($content);
29 |
30 | //Get title and make it a 12 character long filename
31 | $title = $mobi->getTitle();
32 | if($title === false) $title = "file";
33 | $title = urlencode(str_replace(" ", "_", strtolower(substr($title, 0, 12))));
34 |
35 | //Send the mobi file as download
36 | $mobi->download($title.".mobi");
37 | die;
38 | }else{
39 | //Create the mobi object
40 | $mobi = new MOBI();
41 |
42 | $content = new MOBIFile();
43 |
44 | $content->set("title", "My first eBook");
45 | $content->set("author", "Me");
46 |
47 | $content->appendChapterTitle("Introduction");
48 | for($i = 0, $lenI = rand(5, 10); $i < $lenI; $i++){
49 | $content->appendParagraph("P".($i+1));
50 | }
51 |
52 |
53 | //Based on PHP's imagecreatetruecolor help page
54 | $im = imagecreatetruecolor(220, 200);
55 | $text_color = imagecolorallocate($im, 233, 14, 91);
56 | imagestring($im, 10, 5, 5, 'A Simple Text String', $text_color);
57 | imagestring($im, 5, 15, 75, 'A Simple Text String', $text_color);
58 | imagestring($im, 3, 25, 125, 'A Simple Text String', $text_color);
59 | imagestring($im, 2, 10, 155, 'A Simple Text String', $text_color);
60 | $content->appendImage($im);
61 | imagedestroy($im);
62 |
63 | $content->appendPageBreak();
64 |
65 | for($i = 0, $lenI = rand(10, 15); $i < $lenI; $i++){
66 | $content->appendChapterTitle(($i+1).". Chapter ".($i+1));
67 |
68 | for($j = 0, $lenJ = rand(20, 40); $j < $lenJ; $j++){
69 | $content->appendParagraph("P".($i+1).".".($j+1)." TEXT TEXT TEXT");
70 | }
71 |
72 | $content->appendPageBreak();
73 | }
74 |
75 | $mobi->setContentProvider($content);
76 |
77 | //Get title and make it a 12 character long filename
78 | $title = $mobi->getTitle();
79 | if($title === false) $title = "file";
80 | $title = urlencode(str_replace(" ", "_", strtolower(substr($title, 0, 12))));
81 |
82 | //Send the mobi file as download
83 | $mobi->download($title.".mobi");
84 | die;
85 | }
86 | }
87 | ?>
88 |
89 |
90 |