├── .idea ├── modules.xml └── vcs.xml ├── LICENSE ├── README.MD ├── lithestring.iml └── src └── lithe └── core └── LitheString.java /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 2.1, February 1999 3 | 4 | Copyright (C) 1991, 1999 Free Software Foundation, Inc. 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | (This is the first released version of the Lesser GPL. It also counts 10 | as the successor of the GNU Library Public License, version 2, hence 11 | the version number 2.1.) 12 | 13 | Preamble 14 | 15 | The licenses for most software are designed to take away your 16 | freedom to share and change it. By contrast, the GNU General Public 17 | Licenses are intended to guarantee your freedom to share and change 18 | free software--to make sure the software is free for all its users. 19 | 20 | This license, the Lesser General Public License, applies to some 21 | specially designated software packages--typically libraries--of the 22 | Free Software Foundation and other authors who decide to use it. You 23 | can use it too, but we suggest you first think carefully about whether 24 | this license or the ordinary General Public License is the better 25 | strategy to use in any particular case, based on the explanations below. 26 | 27 | When we speak of free software, we are referring to freedom of use, 28 | not price. 
Our General Public Licenses are designed to make sure that 29 | you have the freedom to distribute copies of free software (and charge 30 | for this service if you wish); that you receive source code or can get 31 | it if you want it; that you can change the software and use pieces of 32 | it in new free programs; and that you are informed that you can do 33 | these things. 34 | 35 | To protect your rights, we need to make restrictions that forbid 36 | distributors to deny you these rights or to ask you to surrender these 37 | rights. These restrictions translate to certain responsibilities for 38 | you if you distribute copies of the library or if you modify it. 39 | 40 | For example, if you distribute copies of the library, whether gratis 41 | or for a fee, you must give the recipients all the rights that we gave 42 | you. You must make sure that they, too, receive or can get the source 43 | code. If you link other code with the library, you must provide 44 | complete object files to the recipients, so that they can relink them 45 | with the library after making changes to the library and recompiling 46 | it. And you must show them these terms so they know their rights. 47 | 48 | We protect your rights with a two-step method: (1) we copyright the 49 | library, and (2) we offer you this license, which gives you legal 50 | permission to copy, distribute and/or modify the library. 51 | 52 | To protect each distributor, we want to make it very clear that 53 | there is no warranty for the free library. Also, if the library is 54 | modified by someone else and passed on, the recipients should know 55 | that what they have is not the original version, so that the original 56 | author's reputation will not be affected by problems that might be 57 | introduced by others. 58 | 59 | Finally, software patents pose a constant threat to the existence of 60 | any free program. 
We wish to make sure that a company cannot 61 | effectively restrict the users of a free program by obtaining a 62 | restrictive license from a patent holder. Therefore, we insist that 63 | any patent license obtained for a version of the library must be 64 | consistent with the full freedom of use specified in this license. 65 | 66 | Most GNU software, including some libraries, is covered by the 67 | ordinary GNU General Public License. This license, the GNU Lesser 68 | General Public License, applies to certain designated libraries, and 69 | is quite different from the ordinary General Public License. We use 70 | this license for certain libraries in order to permit linking those 71 | libraries into non-free programs. 72 | 73 | When a program is linked with a library, whether statically or using 74 | a shared library, the combination of the two is legally speaking a 75 | combined work, a derivative of the original library. The ordinary 76 | General Public License therefore permits such linking only if the 77 | entire combination fits its criteria of freedom. The Lesser General 78 | Public License permits more lax criteria for linking other code with 79 | the library. 80 | 81 | We call this license the "Lesser" General Public License because it 82 | does Less to protect the user's freedom than the ordinary General 83 | Public License. It also provides other free software developers Less 84 | of an advantage over competing non-free programs. These disadvantages 85 | are the reason we use the ordinary General Public License for many 86 | libraries. However, the Lesser license provides advantages in certain 87 | special circumstances. 88 | 89 | For example, on rare occasions, there may be a special need to 90 | encourage the widest possible use of a certain library, so that it becomes 91 | a de-facto standard. To achieve this, non-free programs must be 92 | allowed to use the library. 
A more frequent case is that a free 93 | library does the same job as widely used non-free libraries. In this 94 | case, there is little to gain by limiting the free library to free 95 | software only, so we use the Lesser General Public License. 96 | 97 | In other cases, permission to use a particular library in non-free 98 | programs enables a greater number of people to use a large body of 99 | free software. For example, permission to use the GNU C Library in 100 | non-free programs enables many more people to use the whole GNU 101 | operating system, as well as its variant, the GNU/Linux operating 102 | system. 103 | 104 | Although the Lesser General Public License is Less protective of the 105 | users' freedom, it does ensure that the user of a program that is 106 | linked with the Library has the freedom and the wherewithal to run 107 | that program using a modified version of the Library. 108 | 109 | The precise terms and conditions for copying, distribution and 110 | modification follow. Pay close attention to the difference between a 111 | "work based on the library" and a "work that uses the library". The 112 | former contains code derived from the library, whereas the latter must 113 | be combined with the library in order to run. 114 | 115 | GNU LESSER GENERAL PUBLIC LICENSE 116 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 117 | 118 | 0. This License Agreement applies to any software library or other 119 | program which contains a notice placed by the copyright holder or 120 | other authorized party saying it may be distributed under the terms of 121 | this Lesser General Public License (also called "this License"). 122 | Each licensee is addressed as "you". 123 | 124 | A "library" means a collection of software functions and/or data 125 | prepared so as to be conveniently linked with application programs 126 | (which use some of those functions and data) to form executables. 
127 | 128 | The "Library", below, refers to any such software library or work 129 | which has been distributed under these terms. A "work based on the 130 | Library" means either the Library or any derivative work under 131 | copyright law: that is to say, a work containing the Library or a 132 | portion of it, either verbatim or with modifications and/or translated 133 | straightforwardly into another language. (Hereinafter, translation is 134 | included without limitation in the term "modification".) 135 | 136 | "Source code" for a work means the preferred form of the work for 137 | making modifications to it. For a library, complete source code means 138 | all the source code for all modules it contains, plus any associated 139 | interface definition files, plus the scripts used to control compilation 140 | and installation of the library. 141 | 142 | Activities other than copying, distribution and modification are not 143 | covered by this License; they are outside its scope. The act of 144 | running a program using the Library is not restricted, and output from 145 | such a program is covered only if its contents constitute a work based 146 | on the Library (independent of the use of the Library in a tool for 147 | writing it). Whether that is true depends on what the Library does 148 | and what the program that uses the Library does. 149 | 150 | 1. You may copy and distribute verbatim copies of the Library's 151 | complete source code as you receive it, in any medium, provided that 152 | you conspicuously and appropriately publish on each copy an 153 | appropriate copyright notice and disclaimer of warranty; keep intact 154 | all the notices that refer to this License and to the absence of any 155 | warranty; and distribute a copy of this License along with the 156 | Library. 157 | 158 | You may charge a fee for the physical act of transferring a copy, 159 | and you may at your option offer warranty protection in exchange for a 160 | fee. 161 | 162 | 2. 
You may modify your copy or copies of the Library or any portion 163 | of it, thus forming a work based on the Library, and copy and 164 | distribute such modifications or work under the terms of Section 1 165 | above, provided that you also meet all of these conditions: 166 | 167 | a) The modified work must itself be a software library. 168 | 169 | b) You must cause the files modified to carry prominent notices 170 | stating that you changed the files and the date of any change. 171 | 172 | c) You must cause the whole of the work to be licensed at no 173 | charge to all third parties under the terms of this License. 174 | 175 | d) If a facility in the modified Library refers to a function or a 176 | table of data to be supplied by an application program that uses 177 | the facility, other than as an argument passed when the facility 178 | is invoked, then you must make a good faith effort to ensure that, 179 | in the event an application does not supply such function or 180 | table, the facility still operates, and performs whatever part of 181 | its purpose remains meaningful. 182 | 183 | (For example, a function in a library to compute square roots has 184 | a purpose that is entirely well-defined independent of the 185 | application. Therefore, Subsection 2d requires that any 186 | application-supplied function or table used by this function must 187 | be optional: if the application does not supply it, the square 188 | root function must still compute square roots.) 189 | 190 | These requirements apply to the modified work as a whole. If 191 | identifiable sections of that work are not derived from the Library, 192 | and can be reasonably considered independent and separate works in 193 | themselves, then this License, and its terms, do not apply to those 194 | sections when you distribute them as separate works. 
But when you 195 | distribute the same sections as part of a whole which is a work based 196 | on the Library, the distribution of the whole must be on the terms of 197 | this License, whose permissions for other licensees extend to the 198 | entire whole, and thus to each and every part regardless of who wrote 199 | it. 200 | 201 | Thus, it is not the intent of this section to claim rights or contest 202 | your rights to work written entirely by you; rather, the intent is to 203 | exercise the right to control the distribution of derivative or 204 | collective works based on the Library. 205 | 206 | In addition, mere aggregation of another work not based on the Library 207 | with the Library (or with a work based on the Library) on a volume of 208 | a storage or distribution medium does not bring the other work under 209 | the scope of this License. 210 | 211 | 3. You may opt to apply the terms of the ordinary GNU General Public 212 | License instead of this License to a given copy of the Library. To do 213 | this, you must alter all the notices that refer to this License, so 214 | that they refer to the ordinary GNU General Public License, version 2, 215 | instead of to this License. (If a newer version than version 2 of the 216 | ordinary GNU General Public License has appeared, then you can specify 217 | that version instead if you wish.) Do not make any other change in 218 | these notices. 219 | 220 | Once this change is made in a given copy, it is irreversible for 221 | that copy, so the ordinary GNU General Public License applies to all 222 | subsequent copies and derivative works made from that copy. 223 | 224 | This option is useful when you wish to copy part of the code of 225 | the Library into a program that is not a library. 226 | 227 | 4. 
You may copy and distribute the Library (or a portion or 228 | derivative of it, under Section 2) in object code or executable form 229 | under the terms of Sections 1 and 2 above provided that you accompany 230 | it with the complete corresponding machine-readable source code, which 231 | must be distributed under the terms of Sections 1 and 2 above on a 232 | medium customarily used for software interchange. 233 | 234 | If distribution of object code is made by offering access to copy 235 | from a designated place, then offering equivalent access to copy the 236 | source code from the same place satisfies the requirement to 237 | distribute the source code, even though third parties are not 238 | compelled to copy the source along with the object code. 239 | 240 | 5. A program that contains no derivative of any portion of the 241 | Library, but is designed to work with the Library by being compiled or 242 | linked with it, is called a "work that uses the Library". Such a 243 | work, in isolation, is not a derivative work of the Library, and 244 | therefore falls outside the scope of this License. 245 | 246 | However, linking a "work that uses the Library" with the Library 247 | creates an executable that is a derivative of the Library (because it 248 | contains portions of the Library), rather than a "work that uses the 249 | library". The executable is therefore covered by this License. 250 | Section 6 states terms for distribution of such executables. 251 | 252 | When a "work that uses the Library" uses material from a header file 253 | that is part of the Library, the object code for the work may be a 254 | derivative work of the Library even though the source code is not. 255 | Whether this is true is especially significant if the work can be 256 | linked without the Library, or if the work is itself a library. The 257 | threshold for this to be true is not precisely defined by law. 
258 | 259 | If such an object file uses only numerical parameters, data 260 | structure layouts and accessors, and small macros and small inline 261 | functions (ten lines or less in length), then the use of the object 262 | file is unrestricted, regardless of whether it is legally a derivative 263 | work. (Executables containing this object code plus portions of the 264 | Library will still fall under Section 6.) 265 | 266 | Otherwise, if the work is a derivative of the Library, you may 267 | distribute the object code for the work under the terms of Section 6. 268 | Any executables containing that work also fall under Section 6, 269 | whether or not they are linked directly with the Library itself. 270 | 271 | 6. As an exception to the Sections above, you may also combine or 272 | link a "work that uses the Library" with the Library to produce a 273 | work containing portions of the Library, and distribute that work 274 | under terms of your choice, provided that the terms permit 275 | modification of the work for the customer's own use and reverse 276 | engineering for debugging such modifications. 277 | 278 | You must give prominent notice with each copy of the work that the 279 | Library is used in it and that the Library and its use are covered by 280 | this License. You must supply a copy of this License. If the work 281 | during execution displays copyright notices, you must include the 282 | copyright notice for the Library among them, as well as a reference 283 | directing the user to the copy of this License. 
Also, you must do one 284 | of these things: 285 | 286 | a) Accompany the work with the complete corresponding 287 | machine-readable source code for the Library including whatever 288 | changes were used in the work (which must be distributed under 289 | Sections 1 and 2 above); and, if the work is an executable linked 290 | with the Library, with the complete machine-readable "work that 291 | uses the Library", as object code and/or source code, so that the 292 | user can modify the Library and then relink to produce a modified 293 | executable containing the modified Library. (It is understood 294 | that the user who changes the contents of definitions files in the 295 | Library will not necessarily be able to recompile the application 296 | to use the modified definitions.) 297 | 298 | b) Use a suitable shared library mechanism for linking with the 299 | Library. A suitable mechanism is one that (1) uses at run time a 300 | copy of the library already present on the user's computer system, 301 | rather than copying library functions into the executable, and (2) 302 | will operate properly with a modified version of the library, if 303 | the user installs one, as long as the modified version is 304 | interface-compatible with the version that the work was made with. 305 | 306 | c) Accompany the work with a written offer, valid for at 307 | least three years, to give the same user the materials 308 | specified in Subsection 6a, above, for a charge no more 309 | than the cost of performing this distribution. 310 | 311 | d) If distribution of the work is made by offering access to copy 312 | from a designated place, offer equivalent access to copy the above 313 | specified materials from the same place. 314 | 315 | e) Verify that the user has already received a copy of these 316 | materials or that you have already sent this user a copy. 
317 | 318 | For an executable, the required form of the "work that uses the 319 | Library" must include any data and utility programs needed for 320 | reproducing the executable from it. However, as a special exception, 321 | the materials to be distributed need not include anything that is 322 | normally distributed (in either source or binary form) with the major 323 | components (compiler, kernel, and so on) of the operating system on 324 | which the executable runs, unless that component itself accompanies 325 | the executable. 326 | 327 | It may happen that this requirement contradicts the license 328 | restrictions of other proprietary libraries that do not normally 329 | accompany the operating system. Such a contradiction means you cannot 330 | use both them and the Library together in an executable that you 331 | distribute. 332 | 333 | 7. You may place library facilities that are a work based on the 334 | Library side-by-side in a single library together with other library 335 | facilities not covered by this License, and distribute such a combined 336 | library, provided that the separate distribution of the work based on 337 | the Library and of the other library facilities is otherwise 338 | permitted, and provided that you do these two things: 339 | 340 | a) Accompany the combined library with a copy of the same work 341 | based on the Library, uncombined with any other library 342 | facilities. This must be distributed under the terms of the 343 | Sections above. 344 | 345 | b) Give prominent notice with the combined library of the fact 346 | that part of it is a work based on the Library, and explaining 347 | where to find the accompanying uncombined form of the same work. 348 | 349 | 8. You may not copy, modify, sublicense, link with, or distribute 350 | the Library except as expressly provided under this License. 
Any 351 | attempt otherwise to copy, modify, sublicense, link with, or 352 | distribute the Library is void, and will automatically terminate your 353 | rights under this License. However, parties who have received copies, 354 | or rights, from you under this License will not have their licenses 355 | terminated so long as such parties remain in full compliance. 356 | 357 | 9. You are not required to accept this License, since you have not 358 | signed it. However, nothing else grants you permission to modify or 359 | distribute the Library or its derivative works. These actions are 360 | prohibited by law if you do not accept this License. Therefore, by 361 | modifying or distributing the Library (or any work based on the 362 | Library), you indicate your acceptance of this License to do so, and 363 | all its terms and conditions for copying, distributing or modifying 364 | the Library or works based on it. 365 | 366 | 10. Each time you redistribute the Library (or any work based on the 367 | Library), the recipient automatically receives a license from the 368 | original licensor to copy, distribute, link with or modify the Library 369 | subject to these terms and conditions. You may not impose any further 370 | restrictions on the recipients' exercise of the rights granted herein. 371 | You are not responsible for enforcing compliance by third parties with 372 | this License. 373 | 374 | 11. If, as a consequence of a court judgment or allegation of patent 375 | infringement or for any other reason (not limited to patent issues), 376 | conditions are imposed on you (whether by court order, agreement or 377 | otherwise) that contradict the conditions of this License, they do not 378 | excuse you from the conditions of this License. If you cannot 379 | distribute so as to satisfy simultaneously your obligations under this 380 | License and any other pertinent obligations, then as a consequence you 381 | may not distribute the Library at all. 
For example, if a patent 382 | license would not permit royalty-free redistribution of the Library by 383 | all those who receive copies directly or indirectly through you, then 384 | the only way you could satisfy both it and this License would be to 385 | refrain entirely from distribution of the Library. 386 | 387 | If any portion of this section is held invalid or unenforceable under any 388 | particular circumstance, the balance of the section is intended to apply, 389 | and the section as a whole is intended to apply in other circumstances. 390 | 391 | It is not the purpose of this section to induce you to infringe any 392 | patents or other property right claims or to contest validity of any 393 | such claims; this section has the sole purpose of protecting the 394 | integrity of the free software distribution system which is 395 | implemented by public license practices. Many people have made 396 | generous contributions to the wide range of software distributed 397 | through that system in reliance on consistent application of that 398 | system; it is up to the author/donor to decide if he or she is willing 399 | to distribute software through any other system and a licensee cannot 400 | impose that choice. 401 | 402 | This section is intended to make thoroughly clear what is believed to 403 | be a consequence of the rest of this License. 404 | 405 | 12. If the distribution and/or use of the Library is restricted in 406 | certain countries either by patents or by copyrighted interfaces, the 407 | original copyright holder who places the Library under this License may add 408 | an explicit geographical distribution limitation excluding those countries, 409 | so that distribution is permitted only in or among countries not thus 410 | excluded. In such case, this License incorporates the limitation as if 411 | written in the body of this License. 412 | 413 | 13. 
The Free Software Foundation may publish revised and/or new 414 | versions of the Lesser General Public License from time to time. 415 | Such new versions will be similar in spirit to the present version, 416 | but may differ in detail to address new problems or concerns. 417 | 418 | Each version is given a distinguishing version number. If the Library 419 | specifies a version number of this License which applies to it and 420 | "any later version", you have the option of following the terms and 421 | conditions either of that version or of any later version published by 422 | the Free Software Foundation. If the Library does not specify a 423 | license version number, you may choose any version ever published by 424 | the Free Software Foundation. 425 | 426 | 14. If you wish to incorporate parts of the Library into other free 427 | programs whose distribution conditions are incompatible with these, 428 | write to the author to ask for permission. For software which is 429 | copyrighted by the Free Software Foundation, write to the Free 430 | Software Foundation; we sometimes make exceptions for this. Our 431 | decision will be guided by the two goals of preserving the free status 432 | of all derivatives of our free software and of promoting the sharing 433 | and reuse of software generally. 434 | 435 | NO WARRANTY 436 | 437 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO 438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR 440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY 441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE 442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 443 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE 444 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME 445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 
446 | 447 | 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN 448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY 449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU 450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR 451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING 453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A 454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF 455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 456 | DAMAGES. 457 | 458 | END OF TERMS AND CONDITIONS 459 | 460 | How to Apply These Terms to Your New Libraries 461 | 462 | If you develop a new library, and you want it to be of the greatest 463 | possible use to the public, we recommend making it free software that 464 | everyone can redistribute and change. You can do so by permitting 465 | redistribution under these terms (or, alternatively, under the terms of the 466 | ordinary General Public License). 467 | 468 | To apply these terms, attach the following notices to the library. It is 469 | safest to attach them to the start of each source file to most effectively 470 | convey the exclusion of warranty; and each file should have at least the 471 | "copyright" line and a pointer to where the full notice is found. 472 | 473 | {description} 474 | Copyright (C) {year} {fullname} 475 | 476 | This library is free software; you can redistribute it and/or 477 | modify it under the terms of the GNU Lesser General Public 478 | License as published by the Free Software Foundation; either 479 | version 2.1 of the License, or (at your option) any later version. 480 | 481 | This library is distributed in the hope that it will be useful, 482 | but WITHOUT ANY WARRANTY; without even the implied warranty of 483 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU 484 | Lesser General Public License for more details. 485 | 486 | You should have received a copy of the GNU Lesser General Public 487 | License along with this library; if not, write to the Free Software 488 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 489 | USA 490 | 491 | Also add information on how to contact you by electronic and paper mail. 492 | 493 | You should also get your employer (if you work as a programmer) or your 494 | school, if any, to sign a "copyright disclaimer" for the library, if 495 | necessary. Here is a sample; alter the names: 496 | 497 | Yoyodyne, Inc., hereby disclaims all copyright interest in the 498 | library `Frob' (a library for tweaking knobs) written by James Random 499 | Hacker. 500 | 501 | {signature of Ty Coon}, 1 April 1990 502 | Ty Coon, President of Vice 503 | 504 | That's all there is to it! -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | # lithestring 2 | 3 | ## Synopsis 4 | 5 | Java class to compress short (or long) strings 6 | 7 | Included in the lithecore library https://github.com/lithedream/lithecore 8 | 9 | ## Motivation 10 | 11 | I dreamed a compression algorithm especially useful for short strings, without fear of taking up more space than the UTF-8 encoding of the original string. 
12 | 13 | The compression algorithm chooses the best approach among: 14 | * Plain UTF-8 encoding (as is, without overhead) 15 | * An encoding which uses 5 bits for a-z, A-Z, space, and encodes every other UTF-8 character with 3 bits of overhead (for really short Latin strings) 16 | * An intermediate algorithm based on Huffman encoding (dictionary header, then encoded string) 17 | * 1 byte of overhead then GZIP compression (for long strings) 18 | 19 | The decompression algorithm checks whether the input is a plain UTF-8 encoding or a compressed one, and in the latter case reads the data header to apply the correct decoding algorithm. 20 | 21 | ## Code Example 22 | 23 | ```java 24 | String input = ...; 25 | byte[] compressed = LitheString.zip(input); // in the worst case, compressed is the plain UTF-8 encoding of input 26 | String uncompressed = LitheString.unzip(compressed); 27 | 28 | if (input.equals(uncompressed)){ 29 | System.out.println("It works!"); 30 | } else { 31 | System.out.println("Please submit a bug for "+input); 32 | } 33 | ``` 34 | 35 | ## Author 36 | 37 | * **lithedream** 38 | 39 | ## License 40 | 41 | LGPL-2.1 42 | -------------------------------------------------------------------------------- /lithestring.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/lithe/core/LitheString.java: -------------------------------------------------------------------------------- 1 | package lithe.core; 2 | 3 | import java.io.*; 4 | import java.nio.charset.StandardCharsets; 5 | import java.util.*; 6 | import java.util.zip.GZIPInputStream; 7 | import java.util.zip.GZIPOutputStream; 8 | 9 | 10 | public class LitheString { 11 | 12 | private byte[] content; 13 | 14 | public LitheString(String input) { 15 | this.content = zip(input); 16 | } 17 | 18 | @Override 19 | public boolean equals(Object obj) { 20 | 
return this == obj || (obj instanceof LitheString && Arrays.equals(((LitheString) obj).content, this.content)); 21 | } 22 | 23 | @Override 24 | public int hashCode() { 25 | return Arrays.hashCode(content); 26 | } 27 | 28 | /** 29 | * Returns the zipped byte[] content 30 | * 31 | * @return the zipped byte[] content 32 | */ 33 | public byte[] getBytes() { 34 | return content; 35 | } 36 | 37 | /** 38 | * Returns the corresponding String content 39 | * 40 | * @return the corresponding String content 41 | */ 42 | public String getString() { 43 | return unzip(content); 44 | } 45 | 46 | /** 47 | * Compresses the string as best as it can 48 | * 49 | * @param input 50 | * @return the compressed byte[] 51 | */ 52 | public static byte[] zip(String input) { 53 | byte[] z0 = input.getBytes(StandardCharsets.UTF_8); 54 | return zipUTF8(z0); 55 | } 56 | 57 | /** 58 | * Compresses the string already in UTF-8 form as best as it can 59 | * 60 | * @param utf8Input 61 | * @return the compressed byte[] 62 | */ 63 | public static byte[] zipUTF8(byte[] utf8Input) { 64 | byte[] z1 = z1UTF8(utf8Input); 65 | byte[] z2 = z2UTF8(utf8Input); 66 | byte[] z3 = z3UTF8(utf8Input); 67 | 68 | return shortest(utf8Input, z1, z2, z3); 69 | } 70 | 71 | /** 72 | * Compresses the string and checks if the encoding is correct, throwing exception if it didn't work 73 | * 74 | * @param input 75 | * @return the compressed byte[] 76 | */ 77 | public static byte[] secureZip(String input) { 78 | byte[] zipped = zip(input); 79 | String unzipped = unzip(zipped); 80 | if (!input.equals(unzipped)) { 81 | throw new IllegalArgumentException("Error in encoding String '" + (input.length() > 100 ? input.substring(0, 100) + "..." 
: input) + "'"); 82 | } 83 | return zipped; 84 | } 85 | 86 | /** 87 | * Compresses the string using a custom encoding with 5 bits for a-z and space charactes, and adds 3 bits to every other UTF-8 character 88 | * 89 | * @param input 90 | * @return the compressed byte[] 91 | */ 92 | public static byte[] z1(String input) { 93 | return z1UTF8(input.getBytes(StandardCharsets.UTF_8)); 94 | } 95 | 96 | /** 97 | * Like z1, but with an UTF-8 encoded string as input 98 | * 99 | * @param utf8Input 100 | * @return the compressed byte[] 101 | */ 102 | public static byte[] z1UTF8(byte[] utf8Input) { 103 | BitWriter output = new BitWriter(); 104 | output.write01("100"); 105 | 106 | boolean caps = false; 107 | try (ByteArrayInputStream bais = new ByteArrayInputStream(utf8Input)) { 108 | int in; 109 | while ((in = bais.read()) != -1) { 110 | byte byt = (byte) in; 111 | int nExtraByte = getNExtraBytes(byt); 112 | for (int i = 0; i < nExtraByte; i++) { 113 | bais.read(); 114 | } 115 | if (byt >= 97 && byt <= 122) { // lower 116 | caps = false; 117 | break; 118 | 119 | } 120 | if (byt >= 65 && byt <= 90) { // upper 121 | caps = true; 122 | break; 123 | } 124 | } 125 | } catch (IOException e) { 126 | 127 | } 128 | output.write(caps); 129 | 130 | try (PushbackInputStream bais = new PushbackInputStream(new ByteArrayInputStream(utf8Input))) { 131 | int in; 132 | while ((in = bais.read()) != -1) { 133 | byte byt = (byte) in; 134 | if (byt >= 97 && byt <= 122) { // lower 135 | if (caps) { 136 | int in2 = bais.read(); 137 | if (in2 != -1) { 138 | bais.unread(in2); // push it back 139 | if (in2 >= 65 && in2 <= 90) { // if the next is upper 140 | output.write01("111"); // write this in utf8 141 | output.write(byt); 142 | continue; 143 | } 144 | } 145 | output.write01("00000"); 146 | caps = !caps; 147 | } 148 | output.writeLast5Bits((byte) (byt - 96)); 149 | 150 | } else if (byt >= 65 && byt <= 90) { // upper 151 | if (!caps) { 152 | int in2 = bais.read(); 153 | if (in2 != -1) { 154 | 
bais.unread(in2); // push it back 155 | if (in2 >= 97 && in2 <= 122) { // if the next is lower 156 | output.write01("111"); // write this in utf8 157 | output.write(byt); 158 | continue; 159 | } 160 | } 161 | output.write01("00000"); 162 | caps = !caps; 163 | } 164 | output.writeLast5Bits((byte) (byt - 64)); 165 | 166 | } else if (byt == 32) { //space 167 | output.write01("11011"); 168 | } else { 169 | output.write01("111"); 170 | output.write(byt); 171 | for (int i = 0; i < getNExtraBytes(byt); i++) { 172 | output.write((byte) bais.read()); 173 | } 174 | } 175 | } 176 | } catch (IOException e) { 177 | 178 | } 179 | output.close(); 180 | return output.toByteArray(); 181 | } 182 | 183 | /** 184 | * Returns how many bytes are after this to complete the UTF-8 character 185 | * 186 | * @param byt 187 | * @return the number of extra bytes 188 | */ 189 | private static int getNExtraBytes(byte byt) { 190 | int nExtraByte = 0; 191 | if (!startsWith(byt, "0")) { //1 byte 192 | if (startsWith(byt, "110")) { //2 byte 193 | nExtraByte = 1; 194 | } else if (startsWith(byt, "1110")) { //3 byte 195 | nExtraByte = 2; 196 | } else if (startsWith(byt, "11110")) { //4 byte 197 | nExtraByte = 3; 198 | } 199 | } 200 | return nExtraByte; 201 | } 202 | 203 | /** 204 | * Returns if byte parameter starts with the sequence of "010..." as written in binaryString 205 | * 206 | * @param byt 207 | * @param binaryString 208 | * @return if the 010... 
of binaryString match the start of byt 209 | */ 210 | private static boolean startsWith(byte byt, String binaryString) { 211 | byte pos = 7; 212 | for (int i = 0; i < binaryString.length(); i++) { 213 | boolean iBitIsSet = binaryString.charAt(i) == ('1'); 214 | boolean bitValue = getNBitValue(byt, pos); 215 | if (bitValue != iBitIsSet) { 216 | return false; 217 | } 218 | if (pos == 0) { 219 | break; 220 | } else { 221 | pos--; 222 | } 223 | } 224 | return true; 225 | } 226 | 227 | /** 228 | * Compresses the string using a modified Huffman encoding 229 | * 230 | * @param input 231 | * @return the compressed byte[] 232 | */ 233 | public static byte[] z2(String input) { 234 | return z2UTF8(input.getBytes(StandardCharsets.UTF_8)); 235 | } 236 | 237 | /** 238 | * Like z2, but with an UTF-8 encoded string as input 239 | * 240 | * @param input 241 | * @return the compressed byte[] 242 | */ 243 | public static byte[] z2UTF8(byte[] input) { 244 | List listChars = new ArrayList<>(); 245 | try (ByteArrayInputStream bais = new ByteArrayInputStream(input)) { 246 | int in; 247 | while ((in = bais.read()) != -1) { 248 | byte byt = (byte) in; 249 | int nExtraByte = getNExtraBytes(byt); 250 | 251 | byte[] utf8Bytes = new byte[nExtraByte + 1]; 252 | utf8Bytes[0] = byt; 253 | 254 | for (int i = 0; i < nExtraByte; i++) { 255 | utf8Bytes[i + 1] = (byte) bais.read(); 256 | } 257 | listChars.add(new UTF8Char(utf8Bytes)); 258 | } 259 | } catch (IOException e) { 260 | 261 | } 262 | 263 | Map objectFreqs = Huffer.makeFreqs(listChars); 264 | { 265 | int howMany = 0; 266 | for (Iterator> it = objectFreqs.entrySet().iterator(); it.hasNext(); ) { 267 | Map.Entry next = it.next(); 268 | if (next.getValue() == 1) { 269 | it.remove(); 270 | howMany++; 271 | } 272 | } 273 | if (howMany > 0) { 274 | objectFreqs.put(UTF8Char.getInvalidChar(), howMany); 275 | } 276 | } 277 | Map huff = Huffer.makeMap(objectFreqs); 278 | 279 | List> listHuff = new ArrayList<>(); 280 | for (Map.Entry e : 
huff.entrySet()) { 281 | listHuff.add(new AbstractMap.SimpleEntry(e.getKey(), e.getValue())); 282 | } 283 | Collections.sort(listHuff, new Comparator>() { 284 | @Override 285 | public int compare(Map.Entry o1, Map.Entry o2) { 286 | return Integer.compare(o1.getValue().length(), o2.getValue().length()); 287 | } 288 | }); 289 | 290 | BitWriter output = innerZ2(listChars, 0, huff, listHuff); 291 | int spareBits = output.getSpareBits(); 292 | if (spareBits > 0) { 293 | output = innerZ2(listChars, spareBits, huff, listHuff); 294 | } 295 | output.close(); 296 | return output.toByteArray(); 297 | } 298 | 299 | /** 300 | * Inner workings of z2 301 | * 302 | * @param listChars 303 | * @param spareBits 304 | * @param huff 305 | * @param listHuff 306 | * @return 307 | */ 308 | private static BitWriter innerZ2(List listChars, int spareBits, Map huff, List> listHuff) { 309 | BitWriter output = new BitWriter(); 310 | output.write01("1010"); 311 | for (int i = 0; i < spareBits; i++) { 312 | output.write01("0"); 313 | } 314 | output.write01("1"); 315 | 316 | int length = 0; 317 | for (Map.Entry e : listHuff) { 318 | int difference = e.getValue().length() - length; 319 | if (difference > 0) { 320 | for (int i = 0; i < difference; i++) { 321 | output.write01("0"); 322 | } 323 | output.write01("1"); 324 | length = e.getValue().length(); 325 | } else { 326 | output.write01("10"); 327 | } 328 | output.write01(e.getValue()); 329 | if (e.getKey().isInvalid()) { 330 | output.write01("10"); 331 | } else { 332 | for (int i = 0; i < e.getKey().getBytes().length; i++) { 333 | output.write(e.getKey().getBytes()[i]); 334 | } 335 | } 336 | } 337 | output.write01("11"); 338 | for (UTF8Char c : listChars) { 339 | String s = huff.get(c); 340 | if (s != null) { 341 | output.write01(s); 342 | 343 | } else { 344 | output.write01(huff.get(UTF8Char.getInvalidChar())); 345 | for (byte b : c.getBytes()) { 346 | output.write(b); 347 | } 348 | } 349 | } 350 | return output; 351 | } 352 | 353 | /** 354 | * 
Compresses the string with 1 byte of header + standard gzip encoding of the UTF-8 content 355 | * 356 | * @param input 357 | * @return the compressed byte[] 358 | */ 359 | public static byte[] z3(String input) { 360 | ByteArrayOutputStream output = new ByteArrayOutputStream(); 361 | output.write(0b10111111); 362 | try { 363 | try (Writer writer = new OutputStreamWriter(new GZIPOutputStream(output), StandardCharsets.UTF_8)) { 364 | writer.write(input); 365 | } 366 | } catch (IOException e) { 367 | throw new IllegalArgumentException(e); 368 | } finally { 369 | try { 370 | output.close(); 371 | } catch (IOException e) { 372 | throw new IllegalArgumentException(e); 373 | } 374 | } 375 | 376 | return output.toByteArray(); 377 | } 378 | 379 | /** 380 | * Like z3, but with an UTF-8 encoded string as input 381 | * 382 | * @param input 383 | * @return the compressed byte[] 384 | */ 385 | public static byte[] z3UTF8(byte[] input) { 386 | ByteArrayOutputStream output = new ByteArrayOutputStream(); 387 | output.write(0b10111111); 388 | try { 389 | try (GZIPOutputStream writer = new GZIPOutputStream(output)) { 390 | writer.write(input, 0, input.length); 391 | } 392 | } catch (IOException e) { 393 | throw new IllegalArgumentException(e); 394 | } finally { 395 | try { 396 | output.close(); 397 | } catch (IOException e) { 398 | throw new IllegalArgumentException(e); 399 | } 400 | } 401 | return output.toByteArray(); 402 | } 403 | 404 | /** 405 | * Uncompresses the compressed content with the right algorithm 406 | * 407 | * @param content 408 | * @return the original string 409 | */ 410 | public static String unzip(byte[] content) { 411 | if (content == null) { 412 | return null; 413 | } 414 | if (content.length == 0) { 415 | return ""; 416 | } 417 | if (startsWith(content[0], "100")) { 418 | return unzip1(content); 419 | } 420 | if (startsWith(content[0], "1010")) { 421 | return unzip2(content); 422 | } 423 | if ((content[0] & 0xFF) == 0b10111111) { 424 | return unzip3(content); 
425 | } 426 | return new String(content, StandardCharsets.UTF_8); 427 | } 428 | 429 | /** 430 | * Uncompresses the compressed content using type1 algorithm 431 | * 432 | * @param content 433 | * @return the original string 434 | */ 435 | private static String unzip1(byte[] content) { 436 | BitReader bitReader = new BitReader(content); 437 | bitReader.advance(3); 438 | 439 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 440 | boolean caps = bitReader.read(1) == 1; 441 | while (!bitReader.isClosed()) { 442 | if (bitReader.peek01("111")) { 443 | bitReader.advance(3); 444 | byte read = bitReader.read(); 445 | 446 | baos.write(read); 447 | for (int i = 0; i < getNExtraBytes(read); i++) { 448 | baos.write(bitReader.read()); 449 | } 450 | } else { 451 | byte read = bitReader.read(5); 452 | if (read == 0) { 453 | caps = !caps; 454 | } else if ((read & 0xFF) == 0b00011011) { 455 | baos.write(32); // space 456 | } else { 457 | baos.write(read + (caps ? 64 : 96)); // UPPER:lower 458 | } 459 | } 460 | } 461 | return new String(baos.toByteArray(), StandardCharsets.UTF_8); 462 | } 463 | 464 | /** 465 | * Uncompresses the compressed content using type2 algorithm 466 | * 467 | * @param content 468 | * @return the original string 469 | */ 470 | private static String unzip2(byte[] content) { 471 | BitReader bitReader = new BitReader(content); 472 | bitReader.advance(4); 473 | while (bitReader.read(1) == 0) ; 474 | 475 | int keyLength = 0; 476 | Trie01 trie = new Trie01<>(); 477 | Map mmm = new HashMap<>(); 478 | while (!bitReader.isClosed()) { 479 | if (bitReader.peek01("0")) { 480 | while (bitReader.read(1) == 0) { 481 | keyLength++; 482 | } 483 | } else if (bitReader.peek01("10")) { 484 | bitReader.advance(2); 485 | } else if (bitReader.peek01("11")) { 486 | bitReader.advance(2); 487 | break; 488 | } 489 | String key = bitReader.readAsString(keyLength); 490 | if (bitReader.peek01("10")) { 491 | bitReader.advance(2); 492 | UTF8Char u = UTF8Char.getInvalidChar(); 493 | 
trie.add(key, u); 494 | mmm.put(key, u.asString()); 495 | } else { 496 | byte read = bitReader.read(); 497 | 498 | int nExtraBytes = getNExtraBytes(read); 499 | byte[] bytes = new byte[nExtraBytes + 1]; 500 | bytes[0] = read; 501 | for (int i = 0; i < nExtraBytes; i++) { 502 | bytes[i + 1] = bitReader.read(); 503 | } 504 | UTF8Char u = new UTF8Char(bytes); 505 | trie.add(key, u); 506 | mmm.put(key, u.asString()); 507 | } 508 | } 509 | 510 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 511 | while (!bitReader.isClosed()) { 512 | Trie01.Scanner scanner = trie.scan(bitReader.read01Char()); 513 | while (!scanner.hasValue()) { 514 | scanner.scan(bitReader.read01Char()); 515 | } 516 | UTF8Char value = scanner.getValue(); 517 | if (value.isInvalid()) { 518 | byte read = bitReader.read(); 519 | 520 | int nExtraBytes = getNExtraBytes(read); 521 | byte[] bytes = new byte[nExtraBytes + 1]; 522 | bytes[0] = read; 523 | for (int i = 0; i < nExtraBytes; i++) { 524 | bytes[i + 1] = bitReader.read(); 525 | } 526 | UTF8Char u = new UTF8Char(bytes); 527 | for (byte b : u.getBytes()) { 528 | baos.write(b); 529 | } 530 | } else { 531 | for (byte b : value.getBytes()) { 532 | baos.write(b); 533 | } 534 | } 535 | } 536 | return new String(baos.toByteArray(), StandardCharsets.UTF_8); 537 | } 538 | 539 | /** 540 | * Uncompresses the compressed content using type3 algorithm 541 | * 542 | * @param content 543 | * @return the original string 544 | */ 545 | private static String unzip3(byte[] content) { 546 | ByteArrayInputStream bais = new ByteArrayInputStream(content); 547 | bais.read(); 548 | try (GZIPInputStream gis = new GZIPInputStream(bais)) { 549 | byte[] buffer = new byte[1024]; 550 | ByteArrayOutputStream out = new ByteArrayOutputStream(); 551 | 552 | int len; 553 | while ((len = gis.read(buffer)) > 0) { 554 | out.write(buffer, 0, len); 555 | } 556 | 557 | gis.close(); 558 | out.close(); 559 | return new String(out.toByteArray(), StandardCharsets.UTF_8); 560 | } catch 
(IOException e) { 561 | throw new IllegalArgumentException(e); 562 | } 563 | } 564 | 565 | /** 566 | * Returns the shortest between these byte[] 567 | * 568 | * @param bytes 569 | * @return the shortest byte[] 570 | */ 571 | private static byte[] shortest(byte[]... bytes) { 572 | byte[] shortest = bytes[0]; 573 | for (int i = 1; i < bytes.length; i++) { 574 | if (bytes[i].length < shortest.length) { 575 | shortest = bytes[i]; 576 | } 577 | } 578 | return shortest; 579 | } 580 | 581 | /** 582 | * Returns the value of the n-th bit of the byte 583 | * 584 | * @param value 585 | * @param n 586 | * @return the value of the n-th bit of the byte 587 | */ 588 | private static boolean getNBitValue(byte value, byte n) { 589 | return (value & (1 << n)) != 0; 590 | } 591 | 592 | private static class BitWriter { 593 | private static final int START = 7; 594 | private final ByteArrayOutputStream b = new ByteArrayOutputStream(); 595 | private byte current = 0; 596 | private int curpos = START; 597 | 598 | private boolean closed = false; 599 | 600 | public BitWriter() { 601 | } 602 | 603 | public void write(boolean bit) { 604 | if (bit) { 605 | current |= 1 << curpos; 606 | } else { 607 | current &= ~(1 << curpos); 608 | } 609 | if (curpos == 0) { 610 | b.write(current); 611 | current = 0; 612 | curpos = START; 613 | } else { 614 | curpos--; 615 | } 616 | } 617 | 618 | public void write01(String binaryString) { 619 | char[] chars = binaryString.toCharArray(); 620 | for (char c : chars) { 621 | write(c == '1'); 622 | } 623 | } 624 | 625 | public void write(byte value) { 626 | for (byte bit = 8; bit-- > 0; ) { 627 | write(getNBitValue(value, bit)); 628 | } 629 | } 630 | 631 | public void writeLast5Bits(byte value) { 632 | for (byte bit = 5; bit-- > 0; ) { 633 | write(getNBitValue(value, bit)); 634 | } 635 | } 636 | 637 | public void close() { 638 | if (!closed) { 639 | closed = true; 640 | if (curpos != START) { 641 | b.write(current); 642 | } 643 | } 644 | } 645 | 646 | public int 
getSpareBits() { 647 | if (curpos == START) { 648 | return 0; 649 | } 650 | return curpos + 1; 651 | } 652 | 653 | public byte[] toByteArray() { 654 | if (closed || curpos == START) { 655 | return b.toByteArray(); 656 | } else { 657 | byte[] bytes = b.toByteArray(); 658 | byte[] bytes2 = new byte[bytes.length + 1]; 659 | System.arraycopy(bytes, 0, bytes2, 0, bytes.length); 660 | bytes2[bytes.length] = current; 661 | return bytes2; 662 | } 663 | } 664 | 665 | } 666 | 667 | private static class BitReader { 668 | private static final byte START = 7; 669 | private byte curpos = START; 670 | private byte[] bytes; 671 | private int bytePos = 0; 672 | 673 | public BitReader(byte[] b) { 674 | bytes = b; 675 | } 676 | 677 | public byte read() { 678 | if (curpos == START) { 679 | if (bytePos >= bytes.length) { 680 | return 0; //force return 0 because the stream ends 681 | } 682 | byte toRet = bytes[bytePos]; 683 | bytePos++; 684 | return toRet; 685 | } else { 686 | byte toRet = 0; 687 | for (int bit = 8; bit-- > 0; ) { 688 | if (bytePos >= bytes.length) { 689 | return 0; //force return 0 because the stream ends 690 | } 691 | if (getNBitValue(bytes[bytePos], curpos)) { 692 | toRet |= 1 << bit; 693 | } else { 694 | toRet &= ~(1 << bit); 695 | } 696 | if (curpos == 0) { 697 | bytePos++; 698 | curpos = START; 699 | } else { 700 | curpos--; 701 | } 702 | } 703 | return toRet; 704 | 705 | } 706 | } 707 | 708 | public byte read(int numberOfBits) { 709 | byte toRet = 0; 710 | for (int bit = numberOfBits; bit-- > 0; ) { 711 | if (bytePos >= bytes.length) { 712 | return 0; //force return 0 because the stream ends 713 | } 714 | if (getNBitValue(bytes[bytePos], curpos)) { 715 | toRet |= 1 << bit; 716 | } else { 717 | toRet &= ~(1 << bit); 718 | } 719 | if (curpos == 0) { 720 | bytePos++; 721 | curpos = START; 722 | } else { 723 | curpos--; 724 | } 725 | } 726 | return toRet; 727 | } 728 | 729 | public String readAsString(int numberOfBits) { 730 | StringBuilder sb = new StringBuilder(); 
731 | for (int bit = numberOfBits; bit-- > 0; ) { 732 | if (bytePos >= bytes.length) { 733 | return null; //force return null because the stream ends 734 | } 735 | if (getNBitValue(bytes[bytePos], curpos)) { 736 | sb.append('1'); 737 | } else { 738 | sb.append('0'); 739 | } 740 | if (curpos == 0) { 741 | bytePos++; 742 | curpos = START; 743 | } else { 744 | curpos--; 745 | } 746 | } 747 | return sb.toString(); 748 | } 749 | 750 | public char read01Char() { 751 | if (bytePos >= bytes.length) { 752 | return 0; //force return 0 because the stream ends 753 | } 754 | char c; 755 | if (getNBitValue(bytes[bytePos], curpos)) { 756 | c = '1'; 757 | } else { 758 | c = '0'; 759 | } 760 | if (curpos == 0) { 761 | bytePos++; 762 | curpos = START; 763 | } else { 764 | curpos--; 765 | } 766 | return c; 767 | } 768 | 769 | 770 | public boolean isClosed() { 771 | return bytePos >= bytes.length; 772 | } 773 | 774 | public void advance(int n) { 775 | for (int i = 0; i < n; i++) { 776 | if (curpos == 0) { 777 | bytePos++; 778 | curpos = START; 779 | } else { 780 | curpos--; 781 | } 782 | } 783 | } 784 | 785 | public boolean peek01(String s) { 786 | int bytePosCopy = bytePos; 787 | byte curPosCopy = curpos; 788 | 789 | for (int i = 0; i < s.length(); i++) { 790 | if (bytePosCopy >= bytes.length) { 791 | return false; 792 | } 793 | boolean bitValue = getNBitValue(bytes[bytePosCopy], curPosCopy); 794 | boolean iBitIsSet = s.charAt(i) == ('1'); 795 | if (bitValue != iBitIsSet) { 796 | return false; 797 | } 798 | 799 | if (curPosCopy == 0) { 800 | bytePosCopy++; 801 | curPosCopy = START; 802 | } else { 803 | curPosCopy--; 804 | } 805 | } 806 | return true; 807 | } 808 | } 809 | 810 | private static class Huffer { 811 | 812 | private static class HuffmanTree implements Comparable> { 813 | 814 | public final int freq; 815 | 816 | public final HuffmanTree l, r; 817 | 818 | public final T value; 819 | 820 | private final boolean isLeaf; 821 | 822 | public HuffmanTree(HuffmanTree l, HuffmanTree 
r) { 823 | freq = l.freq + r.freq; 824 | this.l = l; 825 | this.r = r; 826 | value = null; 827 | isLeaf = false; 828 | } 829 | 830 | public HuffmanTree(int freq, T value) { 831 | this.freq = freq; 832 | this.value = value; 833 | l = null; 834 | r = null; 835 | isLeaf = true; 836 | } 837 | 838 | public int compareTo(HuffmanTree o) { 839 | return freq - o.freq; 840 | } 841 | 842 | } 843 | 844 | private static void toMap(HuffmanTree tree, StringBuilder prefix, Map t) { 845 | if (tree.isLeaf) { 846 | t.put(tree.value, prefix.toString()); 847 | } else { 848 | prefix.append('0'); 849 | toMap(tree.l, prefix, t); 850 | prefix.deleteCharAt(prefix.length() - 1); 851 | 852 | prefix.append('1'); 853 | toMap(tree.r, prefix, t); 854 | prefix.deleteCharAt(prefix.length() - 1); 855 | } 856 | } 857 | 858 | private static HuffmanTree makeHuffmanTree(Map objFreqs) { 859 | PriorityQueue> huffmanTrees = new PriorityQueue>(); 860 | for (Map.Entry entry : objFreqs.entrySet()) { 861 | huffmanTrees.offer(new HuffmanTree(entry.getValue(), entry.getKey())); 862 | } 863 | while (huffmanTrees.size() > 1) { 864 | HuffmanTree l = huffmanTrees.poll(); 865 | HuffmanTree r = huffmanTrees.poll(); 866 | huffmanTrees.offer(new HuffmanTree(l, r)); 867 | } 868 | return huffmanTrees.poll(); 869 | } 870 | 871 | public static Map huff(Collection input) { 872 | Map objectFreqs = makeFreqs(input); 873 | return makeMap(objectFreqs); 874 | } 875 | 876 | private static Map makeFreqs(Collection input) { 877 | Map objectFreqs = new HashMap<>(); 878 | for (T obj : input) { 879 | Integer count = objectFreqs.get(obj); 880 | objectFreqs.put(obj, count == null ? 
1 : count + 1); 881 | } 882 | return objectFreqs; 883 | } 884 | 885 | private static Map makeMap(Map objectFreqs) { 886 | Map map = new LinkedHashMap(); 887 | if (objectFreqs.size() == 1) { 888 | map.put(objectFreqs.entrySet().iterator().next().getKey(), "1"); 889 | } else { 890 | toMap(makeHuffmanTree(objectFreqs), new StringBuilder(), map); 891 | } 892 | return map; 893 | } 894 | 895 | } 896 | 897 | private static class UTF8Char { 898 | private static UTF8Char invalidChar = null; 899 | private final byte[] bytes; 900 | 901 | public UTF8Char(byte[] bytes) { 902 | this.bytes = bytes; 903 | } 904 | 905 | @Override 906 | public boolean equals(Object obj) { 907 | if (obj == null || !(obj instanceof UTF8Char)) { 908 | return false; 909 | } 910 | return Arrays.equals(this.bytes, ((UTF8Char) obj).bytes); 911 | } 912 | 913 | public byte[] getBytes() { 914 | return bytes; 915 | } 916 | 917 | 918 | public byte getFirst() { 919 | return bytes[0]; 920 | } 921 | 922 | @Override 923 | public int hashCode() { 924 | if (bytes.length == 1) { 925 | return bytes[0]; 926 | } else if (bytes.length == 2) { 927 | return ((0xFF & bytes[1]) << 8) | (0xFF & bytes[0]); 928 | } else if (bytes.length == 3) { 929 | return ((0xFF & bytes[2]) << 16) | ((0xFF & bytes[1]) << 8) | (0xFF & bytes[0]); 930 | } else { 931 | return ((0xFF & bytes[3]) << 24) | ((0xFF & bytes[2]) << 16) | ((0xFF & bytes[1]) << 8) | (0xFF & bytes[0]); 932 | } 933 | } 934 | 935 | public String asString() { 936 | return new String(bytes, StandardCharsets.UTF_8); 937 | } 938 | 939 | @Override 940 | public String toString() { 941 | return "UTF8Char:[" + asString() + "][" + as01String() + "]"; 942 | } 943 | 944 | 945 | public String as01String() { 946 | StringBuilder sb = new StringBuilder(); 947 | for (byte b : bytes) { 948 | for (int bit = 8; bit-- > 0; ) { 949 | sb.append(((b & (1 << bit)) != 0) ? 
'1' : '0'); 950 | } 951 | sb.append(' '); 952 | } 953 | return sb.toString(); 954 | } 955 | 956 | public static UTF8Char getInvalidChar() { 957 | if (invalidChar == null) { 958 | invalidChar = new UTF8Char(new byte[]{(byte) 0b10000000}); 959 | } 960 | return invalidChar; 961 | } 962 | 963 | public boolean isInvalid() { 964 | return bytes.length == 1 && bytes[0] == (byte) 0b10000000; 965 | } 966 | } 967 | 968 | private static class Trie01 { 969 | 970 | private Trie01[] chldrn = null; 971 | 972 | private T value = null; 973 | 974 | public void add(String s, T value) { 975 | privateAdd(s, value, 0); 976 | } 977 | 978 | @SuppressWarnings("unchecked") 979 | private void privateAdd(String s, T value2, int i) { 980 | if (i < s.length()) { 981 | char charati = s.charAt(i); 982 | int index = charati == '0' ? 0 : charati == '1' ? 1 : -1; 983 | if (chldrn == null) chldrn = (Trie01[]) new Trie01[2]; 984 | if (chldrn[index] == null) chldrn[index] = new Trie01<>(); 985 | chldrn[index].privateAdd(s, value2, i + 1); 986 | } else { 987 | value = value2; 988 | } 989 | } 990 | 991 | public Scanner scan(char c) { 992 | Scanner sc = new Scanner(this); 993 | sc.scan(c); 994 | return sc; 995 | } 996 | 997 | private static class Scanner { 998 | 999 | private Trie01 curNode; 1000 | 1001 | private Scanner(Trie01 start) { 1002 | curNode = start; 1003 | } 1004 | 1005 | public boolean hasValue() { 1006 | return curNode.chldrn == null; 1007 | } 1008 | 1009 | public void scan(char c) { 1010 | curNode = curNode.chldrn[c == '0' ? 0 : c == '1' ? 1 : -1]; 1011 | } 1012 | 1013 | public T getValue() { 1014 | return curNode.value; 1015 | } 1016 | 1017 | } 1018 | 1019 | } 1020 | 1021 | } --------------------------------------------------------------------------------