├── .github └── workflows │ └── codeql-analysis.yml ├── .gitignore ├── .settings ├── org.eclipse.core.resources.prefs └── org.eclipse.m2e.core.prefs ├── LICENSE ├── README.md ├── SECURITY.md ├── build.gradle ├── dict_build-0.0.3.tar ├── gradle └── wrapper │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── pom.xml ├── settings.gradle └── src └── main ├── java ├── com │ └── fasterxml │ │ └── sort │ │ ├── DataReader.java │ │ ├── DataReaderFactory.java │ │ ├── DataWriter.java │ │ ├── DataWriterFactory.java │ │ ├── IterableSorterException.java │ │ ├── IteratingSorter.java │ │ ├── Merger.java │ │ ├── SortConfig.java │ │ ├── Sorter.java │ │ ├── SorterBase.java │ │ ├── SortingState.java │ │ ├── TempFileProvider.java │ │ ├── std │ │ ├── ByteArrayComparator.java │ │ ├── RawTextLineReader.java │ │ ├── RawTextLineWriter.java │ │ ├── StdComparator.java │ │ ├── StdTempFileProvider.java │ │ └── TextFileSorter.java │ │ └── util │ │ ├── BlockingQueueReader.java │ │ ├── CastingIterator.java │ │ ├── CollectionReader.java │ │ ├── NaturalComparator.java │ │ └── SegmentedBuffer.java ├── dict.properties ├── dict │ └── build │ │ ├── Builder.java │ │ ├── CounterMap.java │ │ ├── FastBuilder.java │ │ ├── LineReader.java │ │ ├── LineWriter.java │ │ ├── Main.java │ │ ├── PosProbability.java │ │ ├── SplitFileSorter.java │ │ ├── SplitStringComparator.java │ │ ├── TernaryNode.java │ │ └── TernaryTree.java └── pos_prop.txt └── resources ├── dict.properties ├── logback.xml └── pos_prop.txt /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. 
Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '42 2 * * 2' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | language: [ 'java' ] 32 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 33 | # Learn more: 34 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 35 | 36 | steps: 37 | - name: Checkout repository 38 | uses: actions/checkout@v2 39 | 40 | # Initializes the CodeQL tools for scanning. 41 | - name: Initialize CodeQL 42 | uses: github/codeql-action/init@v1 43 | with: 44 | languages: ${{ matrix.language }} 45 | # If you wish to specify custom queries, you can do so here or in a config file. 46 | # By default, queries listed here will override any specified in a config file. 47 | # Prefix the list here with "+" to use these queries and those in the config file. 48 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 49 | 50 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 51 | # If this step fails, then you should remove it and run the build manually (see below) 52 | - name: Autobuild 53 | uses: github/codeql-action/autobuild@v1 54 | 55 | # ℹ️ Command-line programs to run using the OS shell. 
56 | # 📚 https://git.io/JvXDl 57 | 58 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 59 | # and modify them (or add more) to build your code if your project 60 | # uses a compiled language 61 | 62 | #- run: | 63 | # make bootstrap 64 | # make release 65 | 66 | - name: Perform CodeQL Analysis 67 | uses: github/codeql-action/analyze@v1 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Mobile Tools for Java (J2ME) 4 | .mtj.tmp/ 5 | 6 | # Package Files # 7 | *.jar 8 | *.war 9 | *.ear 10 | 11 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 12 | hs_err_pid* 13 | /bin 14 | /target 15 | # idea IDE 16 | out 17 | .idea 18 | *.iml 19 | *.ipr 20 | *.iws 21 | *.ids 22 | 23 | #file 24 | *.pdf 25 | 26 | # eclipse IDE 27 | .classpath 28 | .project 29 | .settings 30 | 31 | # build 32 | .gradle 33 | /build 34 | 35 | #mac 36 | .DS_Store 37 | 38 | #log 39 | logs 40 | -------------------------------------------------------------------------------- /.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding/=UTF-8 3 | -------------------------------------------------------------------------------- /.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 
| 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 构建词库 2 | ========== 3 | 4 | 从原始文本中,自动构建词库,目前只适用于中文。参考: 5 | 6 | http://www.matrix67.com/blog/archives/5044 7 | 8 | ### new in 0.0.3 9 | 10 | 1. 使用radix tree代替ternary search tree,提升性能。 11 | 2. 加入LOG信息,展示抽取的进度。 12 | 13 | 14 | ### new in 0.0.2 15 | 16 | 1. 直接导入[java-merge-sort](https://github.com/cowtowncoder/java-merge-sort)源码, thx[@cowtowncoder](https://github.com/cowtowncoder) 17 | 2. 将之前的maven项目,转变为一个gradle项目,方便打包使用。 18 | 19 | ### 成词条件 20 | 21 | 1. 互信息 22 | 2. 左右熵 23 | 3. 位置成词概率 24 | 4. ngram 频率 25 | 26 | ### 运行方法 27 | 28 | 1. [下载](https://github.com/sing1ee/dict_build/blob/master/dict_build-0.0.3.tar?raw=true)或者gradle distTar打包程序 29 | 2. 解压dict_build-x.x.x.tar 30 | 3. 解压之后,进入bin. 运行:./dict_build 你的数据文件的绝对路径 31 | 4. 结束之后,在数据文件同目录有文件:words_sort.data 32 | 5. 五列分别为:词,词频,互信息,左右熵,位置成词概率. 
33 | 34 | ### 注意 35 | 36 | - 数据文件一定要是UTF8编码的 37 | - 如果数据文件较大, 出现out of memory问题,可以尝试如下方式,限mac和linux,其中2G可以根据实际情况调整 38 | 39 | ```shell 40 | export JAVA_OPTS=-Xmx2G 41 | ./dict_build 你的数据文件的绝对路径 42 | ``` 43 | 44 | ### 示例 45 | 46 | #### 《金瓶梅》抽取结果 47 | ```shell 48 | 西门庆 4754 6.727920454563199 2.0315193024276885 0.17472535684926388 49 | 月娘 1829 6.491853096329675 2.3714166640957095 0.22135096835144072 50 | 敬济 906 9.084808387804362 2.554594603718855 0.14485683987274656 51 | 春梅 799 8.134426320220927 2.7880175589451714 0.16484505593416485 52 | 玳安 796 8.228818690495881 2.865686193737731 0.11791820110723605 53 | 后边 617 6.6293566200796095 4.008365154080131 0.2160373686259245 54 | 玉楼 594 7.977279923499917 2.27346284978306 0.27518689925240297 55 | 明日 580 6.189824558880018 2.705423396095033 0.1774535638537181 56 | 两银子 458 6.129283016944967 2.351100547282295 0.3809078896437581 57 | 小厮 454 7.257387842692652 3.945653525477103 0.16666666666666666 58 | 打发 444 6.870364719583405 3.694604352707633 0.18409496065046307 59 | 如今 410 6.643856189774725 2.1460777430093394 0.1780766096169519 60 | 淫妇 382 7.768184324776926 3.277903508489837 0.2555205047318612 61 | 桂姐 371 7.584962500721156 2.5922046565140424 0.36255305256284687 62 | 老婆 331 6.266786540694902 3.5783015008688523 0.3758007117437722 63 | 衣服 309 8.90388184573618 2.786139685416002 0.13284518828451883 64 | 丫头 297 7.383704292474053 4.291010086795063 0.21875 65 | 潘金莲 288 8.276124405274238 2.4955186567189194 0.35333669524289796 66 | 昨日 285 6.857980995127572 2.6387249970833997 0.1774535638537181 67 | 王婆 284 7.1799090900149345 2.3129267619188907 0.3758007117437722 68 | ``` 69 | 70 | #### 《西游记》抽取结果 71 | ```shell 72 | 八戒 1807 7.88874324889826 2.00952580557629 0.36441586280814575 73 | 师父 1632 7.507794640198696 3.745294449785798 0.1371395690812608 74 | 大圣 1270 6.599912842187128 2.7790919785432147 0.13128460061010055 75 | 唐僧 1003 7.076815597050832 4.350465172292435 0.43277723258096173 76 | 菩萨 765 9.471675214392045 3.6013747138664756 
0.15910495734948696 77 | 妖精 634 7.199672344836364 3.1817261900583627 0.13134411600669268 78 | 徒弟 439 8.060695931687555 2.498555429145656 0.15553809897879026 79 | 兄弟 284 7.845490050944376 2.93037668783551 0.16085578446909668 80 | 宝贝 283 9.319672120946995 2.616164396748633 0.15108220492589827 81 | 今日 282 6.714245517666122 2.1303069812971214 0.1774535638537181 82 | 取经 263 7.539158811108032 2.663944888382171 0.10181178023912565 83 | 如今 259 6.189824558880018 2.056188859866133 0.1780766096169519 84 | 认得 223 6.357552004618085 2.9543379335926954 0.2326782564877803 85 | 东土 212 8.422064766172811 3.326253983395916 0.14745277618775043 86 | 孙大圣 202 6.022367813028454 2.4886576514017107 0.13128460061010055 87 | 变作 189 7.554588851677638 3.0713596792578635 0.23452975920036348 88 | 玉帝 189 8.912889336229961 2.973106046717708 0.27518689925240297 89 | 土地 179 7.499845887083206 3.1206506190132566 0.2819944064037033 90 | 欢喜 173 8.861086905995393 2.184918471204895 0.31727272727272726 91 | 贫僧 170 7.400879436282184 2.0731236036504477 0.43277723258096173 92 | ``` 93 | 94 | #### 拉勾JD语料抽取结果 95 | ```shell 96 | 工作 641962 11.645208082774683 4.083574124851783 0.11247281022865935 97 | 开发 348538 14.031184262140844 4.37645153459778 0.18409496065046307 98 | 相关 300517 10.477758266443889 5.038915743418073 0.1758213331033888 99 | 合作 159688 10.397674632948268 3.9963476653135794 0.19498851077798446 100 | 专业 158831 10.712527000439824 3.152041650598071 0.2640750670241287 101 | 测试 158179 13.65362883340751 4.464104436545589 0.18344308560677328 102 | 互联网 148818 16.106992250086762 3.9556191209604314 0.407386403912951 103 | 活动 131099 10.391243589427443 3.9155422678129406 0.20137250696976194 104 | 维护 120316 12.681677655209691 3.2400117935377266 0.1960306406685237 105 | 问题 112116 9.159871336778389 2.314215135279833 0.20283174185051037 106 | 优化 109563 11.324180546618742 4.331660381832997 0.2456782591010779 107 | 营销 105845 14.36850646150769 5.097001962525406 0.14961371773129828 108 | 平台 100783 9.002815015607053 
4.443804901153697 0.2877423571272965 109 | 培训 93204 9.041659151637216 3.8898570467819824 0.13345998575160295 110 | 资源 90339 8.651051691178928 4.063430372719874 0.14695817490494298 111 | 相关专业 87545 8.988684686772165 2.4897196388075598 0.2905199904149232 112 | 网站 87182 8.92184093707449 5.465843476701055 0.21266038137095059 113 | 独立 86111 9.074141462752506 3.1456261690072957 0.19050261614079594 114 | 一定 83798 8.335390354693924 2.107303660112154 0.26157299167679793 115 | 流程 83165 9.321928094887362 2.5509378861028074 0.2063141084699957 116 | 网络 82742 9.087462841250339 4.681429111504988 0.21266038137095059 117 | 优秀 74600 9.370687406807217 2.0756995478573135 0.2899855507391353 118 | 信息 71009 9.820178962415188 4.2602697278449755 0.18863532864443658 119 | 媒体 67533 10.556506054671928 4.615376861300178 0.17976710334788937 120 | 编写 64337 7.960001932068081 3.482400585501417 0.265625 121 | 思维 62351 8.741466986401146 2.4320664807326646 0.15396736072031514 122 | 规划 59733 7.851749041416057 2.936854928368285 0.14166201896263245 123 | 移动 59671 10.10459875356437 3.4421932833155653 0.20137250696976194 124 | 渠道 59072 9.513727595952437 4.597891463808354 0.23578595317725753 125 | 关系 58483 8.348728154231077 2.4369558675502927 0.3170022612253688 126 | 积极 57295 9.044394119358454 2.763249521041074 0.1746848469256496 127 | 实施 56645 7.781359713524661 4.371966846513886 0.15944453739334113 128 | 福利 55732 8.475733430966399 2.4036919305145426 0.20908952728378172 129 | 其他 55665 8.434628227636725 2.9614863103296867 0.15943975441289332 130 | 功能 55087 7.787902559391432 4.1663586610392755 0.18097560975609756 131 | 代码 52431 7.88874324889826 3.876917512626917 0.2135697048449972 132 | 微信 49143 8.945443836377912 3.6868130380800643 0.18215857916308253 133 | 企业 48799 9.422064766172813 5.568662443510237 0.2905199904149232 134 | 提升 48446 8.233619676759702 3.7390647282620666 0.29750778816199375 135 | 质量 47918 10.861862340059153 3.391825261582227 0.10921827734437191 136 | 人员 47109 7.774787059601174 
5.249783964892326 0.13589632038101343 137 | 数据库 45445 8.290018846932618 4.123423571610193 0.2640569395017794 138 | 商务 44047 8.189824558880018 3.44858516585648 0.12901085044961344 139 | 主动 42628 13.815583433851023 2.5049637884195137 0.1968791796700847 140 | 创意 41768 14.396470993910388 4.115068825929573 0.30544056771141337 141 | 工具 40227 9.927777962082342 2.208874047820781 0.11247281022865935 142 | 等相关 39230 11.919608238603255 3.0330398736413557 0.1758213331033888 143 | 提出 38741 10.179909090014934 4.46446156782086 0.13053040103492886 144 | 各类 38309 8.344295907915816 5.136417986953123 0.3969948596283116 145 | 操作 37061 9.06339508128851 4.676836974292029 0.23452975920036348 146 | 收集 36600 8.800899899920305 2.797691452951563 0.11388512456999896 147 | 过程 36534 8.214319120800766 2.5633950372758565 0.2063141084699957 148 | 数据分析 36081 8.442943495848729 3.5589033442862585 0.2640569395017794 149 | ``` 150 | 151 | #### 全宋词抽取结果 152 | ```shell 153 | 何处 388 6.491853096329675 3.3628674437455617 0.6815015936725298 154 | 东风 286 5.392317422778761 4.458774408044057 0.19724622030237582 155 | 江南 250 6.409390936137703 3.903802705407174 0.10545138034778331 156 | 春风 237 3.5849625007211565 4.927775131630969 0.16484505593416485 157 | 相思 225 6.614709844115209 4.358855443007008 0.242072962836686 158 | 千里 218 6.409390936137703 4.4108660037595 0.2562873368242496 159 | 人间 200 5.357552004618084 3.6298146463975085 0.13589632038101343 160 | 明月 196 5.357552004618084 4.461698115330817 0.2009720696427977 161 | 归来 195 5.08746284125034 4.510975805812117 0.4260707923476106 162 | 尊前 190 7.607330313749611 3.7677180601390012 0.1516088400320623 163 | 相逢 179 7.426264754702098 3.729594240735622 0.2827298050139276 164 | 芳草 176 7.409390936137703 4.193709696939418 0.10797973400886637 165 | 多情 175 6.247927513443586 3.8156445316213303 0.3327408912022344 166 | 阑干 167 9.30149619498255 4.1027945328835855 0.17564639607106747 167 | 梅花 159 4.807354922057604 4.829461592976214 0.1725721995566835 168 | 年年 157 
3.8073549220576037 3.401504022650184 0.10157033077180087 169 | 无人 150 2.807354922057604 4.773999920722275 0.35809310100061825 170 | 如今 148 5.7279204545632 2.4554158038937834 0.1780766096169519 171 | 回首 145 7.94251450533924 3.197825274741958 0.20080445544554457 172 | 天涯 142 7.74819284958946 4.087307754334477 0.4339155749636099 173 | 一枝 135 5.20945336562895 3.5111675192832683 0.2674922938432581 174 | 当时 134 6.08746284125034 3.2683525636568564 0.14850198715988994 175 | 流水 132 5.700439718141093 4.024081009656002 0.13549047394111163 176 | 佳人 131 5.20945336562895 3.0918026501936384 0.22896958600345846 177 | 西风 128 4.321928094887363 4.310178372466687 0.19724622030237582 178 | 依旧 125 7.768184324776926 3.8821144630683277 0.1728525980911983 179 | 故人 122 5.392317422778761 2.9526098687901237 0.2363130219610269 180 | 今夜 121 5.554588851677638 3.239568407653533 0.2543231961836613 181 | 少年 120 5.357552004618084 2.8645866477158934 0.23419345103365022 182 | 春色 120 5.129283016944966 4.576389958371988 0.16484505593416485 183 | ``` 184 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | Use this section to tell people about which versions of your project are 6 | currently being supported with security updates. 7 | 8 | | Version | Supported | 9 | | ------- | ------------------ | 10 | | 5.1.x | :white_check_mark: | 11 | | 5.0.x | :x: | 12 | | 4.0.x | :white_check_mark: | 13 | | < 4.0 | :x: | 14 | 15 | ## Reporting a Vulnerability 16 | 17 | Use this section to tell people how to report a vulnerability. 18 | 19 | Tell them where to go, how often they can expect to get an update on a 20 | reported vulnerability, what to expect if the vulnerability is accepted or 21 | declined, etc. 
22 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | apply plugin: 'maven' 3 | apply plugin: 'eclipse' 4 | apply plugin: 'idea' 5 | apply plugin: 'application' 6 | 7 | group = 'build.dict' 8 | version = '0.0.3' 9 | 10 | sourceCompatibility = 1.8 11 | targetCompatibility = 1.8 12 | 13 | mainClassName='dict.build.Main' 14 | 15 | repositories { 16 | mavenLocal() 17 | maven { url 'http://nexus.ufish.io/content/groups/public/' } 18 | mavenCentral() 19 | } 20 | dependencies { 21 | compile('com.google.guava:guava:17.0') 22 | compile('com.google.code.externalsortinginjava:externalsortinginjava:0.1.9') 23 | compile('ch.qos.logback:logback-classic:1.0.13') 24 | compile('ch.qos.logback:logback-core:1.0.13') 25 | compile('org.slf4j:slf4j-api:1.6.4') 26 | compile('commons-logging:commons-logging:1.1.1') 27 | compile('commons-cli:commons-cli:1.2') 28 | compile('com.googlecode.concurrent-trees:concurrent-trees:2.6.0') 29 | } 30 | -------------------------------------------------------------------------------- /dict_build-0.0.3.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sing1ee/dict_build/426368bbfb4cc360c678cc75ab7b3ca4a926e25b/dict_build-0.0.3.tar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Tue Apr 26 11:25:52 CST 2016 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-2.10-bin.zip 7 | -------------------------------------------------------------------------------- /gradlew: 
--------------------------------------------------------------------------------
#!/usr/bin/env bash

##############################################################################
##
##  Gradle start up script for UN*X
##
##  Locates a Java executable, raises the file descriptor limit where
##  possible, converts paths for Cygwin, and finally exec's the Gradle
##  wrapper main class with the caller's arguments.
##
##############################################################################

# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS=""

APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`

# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"

# Print a warning. Diagnostics go to stderr so they cannot be mistaken for
# (or captured together with) regular output on stdout.
warn ( ) {
    echo "$*" >&2
}

# Print an error message framed by blank lines on stderr and abort with status 1.
die ( ) {
    echo >&2
    echo "$*" >&2
    echo >&2
    exit 1
}

# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
case "`uname`" in
  CYGWIN* )
    cygwin=true
    ;;
  Darwin* )
    darwin=true
    ;;
  MINGW* )
    msys=true
    ;;
esac

# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
    ls=`ls -ld "$PRG"`
    link=`expr "$ls" : '.*-> \(.*\)$'`
    if expr "$link" : '/.*' > /dev/null; then
        # Absolute link target: follow it directly.
        PRG="$link"
    else
        # Relative link target: resolve it against the link's own directory.
        PRG=`dirname "$PRG"`"/$link"
    fi
done
# Remember the caller's directory, resolve the script directory physically, then go back.
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null

CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar

# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
        # IBM's JDK on AIX uses strange locations for the executables
        JAVACMD="$JAVA_HOME/jre/sh/java"
    else
        JAVACMD="$JAVA_HOME/bin/java"
    fi
    if [ ! -x "$JAVACMD" ] ; then
        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME

Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
    fi
else
    JAVACMD="java"
    # 'command -v' is the POSIX way to probe PATH; 'which' is not guaranteed to exist.
    command -v java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.

Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi

# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
    MAX_FD_LIMIT=`ulimit -H -n`
    if [ $? -eq 0 ] ; then
        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
            MAX_FD="$MAX_FD_LIMIT"
        fi
        # Quoted so an empty or unusual value cannot be word-split or dropped.
        ulimit -n "$MAX_FD"
        if [ $? -ne 0 ] ; then
            warn "Could not set maximum file descriptor limit: $MAX_FD"
        fi
    else
        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
    fi
fi

# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi

# For Cygwin, switch paths to Windows format before running java
if $cygwin ; then
    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
    JAVACMD=`cygpath --unix "$JAVACMD"`

    # We build the pattern for arguments to be converted via cygpath
    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
    SEP=""
    for dir in $ROOTDIRSRAW ; do
        ROOTDIRS="$ROOTDIRS$SEP$dir"
        SEP="|"
    done
    OURCYGPATTERN="(^($ROOTDIRS))"
    # Add a user-defined pattern to the cygpath arguments
    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
    fi
    # Now convert the arguments - kludge to limit ourselves to /bin/sh
    i=0
    for arg in "$@" ; do
        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
        CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option

        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
        else
            eval `echo args$i`="\"$arg\""
        fi
        i=$((i+1))
    done
    # Rebuild "$@" from the converted values. Only 0-9 arguments are handled;
    # with more than nine, no branch matches and the original "$@" is kept as is.
    case $i in
        (0) set -- ;;
        (1) set -- "$args0" ;;
        (2) set -- "$args0" "$args1" ;;
        (3) set -- "$args0" "$args1" "$args2" ;;
        (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
        (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
        (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
        (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
        (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
        (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
    esac
fi

# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
function splitJvmOpts() {
    JVM_OPTS=("$@")
}
eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
# Append the application name as one more JVM system property.
JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"

# Replace this shell with the JVM running the wrapper main class.
exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"

--------------------------------------------------------------------------------
/gradlew.bat:
--------------------------------------------------------------------------------
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem  Gradle startup script for Windows
@rem
@rem ##########################################################################

@rem Set local scope for the variables with windows NT
@rem shell
if "%OS%"=="Windows_NT" setlocal

@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS=

@rem %~dp0 is the drive and directory of this script; fall back to the current directory if it is empty.
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%

@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome

@rem No JAVA_HOME: probe for java.exe on the PATH by running it once.
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto init

echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:findJavaFromJavaHome
@rem Remove any double quotes from JAVA_HOME before building the executable path.
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe

if exist "%JAVA_EXE%" goto init

echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:init
@rem Get command-line arguments, handling Windows variants

if not "%OS%" == "Windows_NT" goto win9xME_args
if "%@eval[2+2]" == "4" goto 4NT_args

:win9xME_args
@rem Slurp the command line arguments.
set CMD_LINE_ARGS=
@rem NOTE(review): _SKIP is not read anywhere else in the visible script.
set _SKIP=2

:win9xME_args_slurp
@rem With no first argument CMD_LINE_ARGS stays empty; otherwise take the whole command line.
if "x%~1" == "x" goto execute

set CMD_LINE_ARGS=%*
goto execute

:4NT_args
@rem Get arguments from the 4NT Shell from JP Software
set CMD_LINE_ARGS=%$

:execute
@rem Setup the command line

set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar

@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%

:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd

:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1

:mainEnd
if "%OS%"=="Windows_NT" endlocal

:omega

--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | 3 | 4.0.0 4 | build.dict 5 | 0.0.1 6 | 7 | jar 8 | 9 | 10 | UTF-8 11 | 12 | 13 | 14 | junit 15 | junit 16 | 4.13.1 17 | test 18 | 19 | 20 | com.google.guava 21 | guava 22 | 17.0 23 | 24 | 25 | com.google.code.externalsortinginjava 26 | externalsortinginjava 27 | 0.1.9 28 | 29 | 30 | ch.qos.logback 31 | logback-classic 32 | 1.2.0 33 | 34 | 35 | ch.qos.logback 36 | logback-core 37 | 1.2.9 38 | 39 | 40 | org.slf4j 41 | slf4j-api 42 | 1.6.4 43 | 44 | 45 | commons-logging 46 | commons-logging 47 | 1.1.1 48 | 49 | 50 | commons-cli 51 | commons-cli 52 | 1.2 53 | 54 | 55 | com.googlecode.concurrent-trees 56 | concurrent-trees 57 | 2.6.0 58 | 59 | 60 | 61 | 62 | 63 | maven-compiler-plugin 64 | 3.0 65 | 66 | 1.8 67 | 1.8 68 | 69 | 70 | 71 | org.apache.maven.plugins 72 | maven-assembly-plugin 73 | 2.4 74 | 75 | 76 | 
jar-with-dependencies 77 | 78 | 79 | 80 | 81 | 82 | assemble-all 83 | package 84 | 85 | single 86 | 87 | 88 | 89 | 90 | 91 | 92 | dict_build 93 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'dict_build' 2 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/DataReader.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort; 2 | 3 | import java.io.IOException; 4 | 5 | public abstract class DataReader 6 | { 7 | /** 8 | * Method for reading the next data item; will return 9 | * null to indicate end of input, otherwise return a non-null 10 | * item. 11 | */ 12 | public abstract T readNext() throws IOException; 13 | 14 | /** 15 | * Method that should estimate memory usage of given item, for purpose 16 | * of limiting amount of data kept in memory during pre-sorting phase. 17 | */ 18 | public abstract int estimateSizeInBytes(T item); 19 | 20 | /** 21 | * Method for closing the reader. Note that reader needs to ensure 22 | * that it is ok to call close multiple times. Reader may also 23 | * close underlying resources as soon as it has reached end of input. 
24 | */ 25 | public abstract void close() throws IOException; 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/DataReaderFactory.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort; 2 | 3 | import java.io.*; 4 | 5 | public abstract class DataReaderFactory 6 | { 7 | public abstract DataReader constructReader(InputStream in) throws IOException; 8 | } -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/DataWriter.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort; 2 | 3 | import java.io.IOException; 4 | 5 | public abstract class DataWriter 6 | { 7 | public abstract void writeEntry(T item) throws IOException; 8 | 9 | public abstract void close() throws IOException; 10 | } 11 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/DataWriterFactory.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort; 2 | 3 | import java.io.*; 4 | 5 | public abstract class DataWriterFactory 6 | { 7 | public abstract DataWriter constructWriter(OutputStream out) throws IOException; 8 | } 9 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/IterableSorterException.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | /** 7 | * We need an unchecked exception to work with {@link Iterator}, and 8 | * want a specific subtype to catch. 
9 | */ 10 | public class IterableSorterException extends RuntimeException { 11 | private static final long serialVersionUID = 1L; 12 | 13 | public IterableSorterException(IOException cause) { 14 | super(cause); 15 | } 16 | } -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/IteratingSorter.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort; 2 | 3 | import com.fasterxml.sort.util.CastingIterator; 4 | import com.fasterxml.sort.util.SegmentedBuffer; 5 | 6 | import java.io.Closeable; 7 | import java.io.File; 8 | import java.io.IOException; 9 | import java.util.ArrayList; 10 | import java.util.Arrays; 11 | import java.util.Comparator; 12 | import java.util.Iterator; 13 | import java.util.List; 14 | import java.util.NoSuchElementException; 15 | 16 | public class IteratingSorter extends SorterBase implements Closeable 17 | { 18 | // Set iff sort spilled to disk 19 | private List _mergerInputs; 20 | private DataReader _merger; 21 | 22 | 23 | public IteratingSorter(SortConfig config, 24 | DataReaderFactory readerFactory, 25 | DataWriterFactory writerFactory, 26 | Comparator comparator) 27 | { 28 | super(config, readerFactory, writerFactory, comparator); 29 | } 30 | 31 | public IteratingSorter() { 32 | super(); 33 | } 34 | 35 | public IteratingSorter(SortConfig config) { 36 | super(config); 37 | } 38 | 39 | /** 40 | * Method that will perform full sort on input data read using given 41 | * {@link DataReader}. 42 | * 43 | * Conversions to and from intermediate sort files is done 44 | * using {@link DataReaderFactory} and {@link DataWriterFactory} configured 45 | * for this sorter. 46 | * 47 | * The returned Iterator will throw {@link IterableSorterException} if any 48 | * IOException is encountered during calls of {@link Iterator#next()}. 
49 | * 50 | * @return Iterator if sorting complete and output is ready to be written; null if it was cancelled 51 | */ 52 | public Iterator sort(DataReader inputReader) 53 | throws IOException 54 | { 55 | // Clean up any previous sort 56 | close(); 57 | 58 | // First, pre-sort: 59 | _phase = Phase.PRE_SORTING; 60 | boolean inputClosed = false; 61 | 62 | SegmentedBuffer buffer = new SegmentedBuffer(); 63 | _presortFileCount = 0; 64 | _sortRoundCount = -1; 65 | _currentSortRound = -1; 66 | 67 | Iterator iterator = null; 68 | try { 69 | Object[] items = _readMax(inputReader, buffer, _config.getMaxMemoryUsage(), null); 70 | if (_checkForCancel()) { 71 | close(); 72 | return null; 73 | } 74 | Arrays.sort(items, _rawComparator()); 75 | T next = inputReader.readNext(); 76 | /* Minor optimization: in case all entries might fit in 77 | * in-memory sort buffer, avoid writing intermediate file 78 | * and just write results directly. 79 | */ 80 | if (next == null) { 81 | inputClosed = true; 82 | inputReader.close(); 83 | _phase = Phase.SORTING; 84 | iterator = new CastingIterator(Arrays.asList(items).iterator()); 85 | } else { // but if more data than memory-buffer-full, do it right: 86 | List presorted = new ArrayList(); 87 | presorted.add(_writePresorted(items)); 88 | items = null; // it's a big array, clear refs as early as possible 89 | _presort(inputReader, buffer, next, presorted); 90 | inputClosed = true; 91 | inputReader.close(); 92 | _phase = Phase.SORTING; 93 | if (_checkForCancel(presorted)) { 94 | close(); 95 | return null; 96 | } 97 | _mergerInputs = presorted; 98 | _merger = _createMergeReader(merge(presorted)); 99 | iterator = new MergerIterator(_merger); 100 | } 101 | } finally { 102 | if (!inputClosed) { 103 | try { 104 | inputReader.close(); 105 | } catch (IOException e) { 106 | // Ignore 107 | } 108 | } 109 | } 110 | if (_checkForCancel()) { 111 | close(); 112 | return null; 113 | } 114 | _phase = Phase.COMPLETE; 115 | return iterator; 116 | } 117 | 118 | 
119 | /* 120 | /********************************************************************** 121 | /* Closeable API 122 | /********************************************************************** 123 | */ 124 | 125 | @Override 126 | public void close() { 127 | if (_merger != null) { 128 | try { 129 | _merger.close(); 130 | } 131 | catch (IOException e) { 132 | // Ignore 133 | } 134 | } 135 | if (_mergerInputs != null) { 136 | for (File input : _mergerInputs) { 137 | input.delete(); 138 | } 139 | } 140 | _mergerInputs = null; 141 | _merger = null; 142 | } 143 | 144 | /* 145 | /********************************************************************** 146 | /* Iterator implementations 147 | /********************************************************************** 148 | */ 149 | 150 | private static class MergerIterator implements Iterator { 151 | private final DataReader _merger; 152 | private T _next; 153 | 154 | private MergerIterator(DataReader merger) throws IOException { 155 | _merger = merger; 156 | _next = _merger.readNext(); 157 | } 158 | 159 | @Override 160 | public boolean hasNext() { 161 | return (_next != null); 162 | } 163 | 164 | @Override 165 | public T next() { 166 | if (_next == null) { 167 | throw new NoSuchElementException(); 168 | } 169 | T t = _next; 170 | try { 171 | _next = _merger.readNext(); 172 | } catch (IOException e) { 173 | throw new IterableSorterException(e); 174 | } 175 | return t; 176 | } 177 | 178 | @Override 179 | public void remove() { 180 | throw new UnsupportedOperationException(); 181 | } 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/Merger.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort; 2 | 3 | import java.io.IOException; 4 | import java.util.*; 5 | 6 | /** 7 | * Object used to merge items from multiple input sources into one. 
8 | */ 9 | public abstract class Merger 10 | extends DataReader 11 | { 12 | protected final Comparator _comparator; 13 | 14 | /* 15 | /********************************************************************** 16 | /* Construction 17 | /********************************************************************** 18 | */ 19 | 20 | public Merger(Comparator cmp) { 21 | _comparator = cmp; 22 | } 23 | 24 | public static DataReader mergedReader(Comparator cmp, List> inputs) 25 | throws IOException 26 | { 27 | switch (inputs.size()) { 28 | case 0: 29 | throw new IllegalArgumentException("Can not pass empty DataReader array"); 30 | case 1: 31 | return inputs.get(0); 32 | case 2: 33 | return new PairwiseMerger(cmp, inputs.get(0), inputs.get(1)); 34 | } 35 | 36 | // otherwise, divide and conquer 37 | ArrayList> readers = new ArrayList>(1 + (inputs.size() >> 1)); 38 | int i = 0; 39 | final int end = inputs.size()-1; 40 | for (; i < end; i += 2) { 41 | readers.add(new PairwiseMerger(cmp, inputs.get(i), inputs.get(i+1))); 42 | } 43 | // and for odd number of readers, add last one as is without merging 44 | if (i < inputs.size()) { 45 | readers.add(inputs.get(i)); 46 | } 47 | return mergedReader(cmp, readers); 48 | } 49 | 50 | /* 51 | /********************************************************************** 52 | /* Concrete implementations 53 | /********************************************************************** 54 | */ 55 | 56 | protected static class PairwiseMerger 57 | extends Merger 58 | { 59 | protected final DataReader _reader1; 60 | protected final DataReader _reader2; 61 | 62 | protected T _data1; 63 | protected T _data2; 64 | 65 | protected boolean _closed; 66 | 67 | public PairwiseMerger(Comparator comparator, 68 | DataReader reader1, DataReader reader2) 69 | throws IOException 70 | { 71 | super(comparator); 72 | _reader1 = reader1; 73 | _data1 = reader1.readNext(); 74 | _reader2 = reader2; 75 | _data2 = reader2.readNext(); 76 | } 77 | 78 | @Override 79 | public T readNext() 
throws IOException 80 | { 81 | if (_data1 == null) { 82 | if (_data2 == null) { 83 | // [Issue#8]: Should auto-close merged input when there is no more data 84 | close(); 85 | return null; 86 | } 87 | T result = _data2; 88 | _data2 = _reader2.readNext(); 89 | return result; 90 | } 91 | if (_data2 == null) { 92 | T result = _data1; 93 | _data1 = _reader1.readNext(); 94 | return result; 95 | } 96 | // neither is null, compare 97 | T result; 98 | if (_comparator.compare(_data1, _data2) <= 0) { 99 | result = _data1; 100 | _data1 = _reader1.readNext(); 101 | } else { 102 | result = _data2; 103 | _data2 = _reader2.readNext(); 104 | } 105 | return result; 106 | } 107 | 108 | @Override 109 | public int estimateSizeInBytes(T item) { 110 | // should not matter so 111 | return _reader1.estimateSizeInBytes(item); 112 | } 113 | 114 | @Override 115 | public void close() throws IOException 116 | { 117 | if (!_closed) { 118 | _reader1.close(); 119 | _reader2.close(); 120 | _closed = true; 121 | } 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/SortConfig.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort; 2 | 3 | import com.fasterxml.sort.std.StdTempFileProvider; 4 | 5 | /** 6 | * Configuration object used for changing details of sorting 7 | * process. Default settings are usable, so often 8 | * instance is created without arguments and used as is. 9 | */ 10 | public class SortConfig 11 | { 12 | /** 13 | * By default we will use 40 megs for pre-sorting. 
14 | */ 15 | public final static long DEFAULT_MEMORY_USAGE = 40 * 1024 * 1024; 16 | 17 | /** 18 | * Default merge sort is 16-way sort (using 16 input files concurrently) 19 | */ 20 | public final static int DEFAULT_MERGE_FACTOR = 16; 21 | 22 | protected int _mergeFactor; 23 | 24 | protected long _maxMemoryUsage; 25 | 26 | protected TempFileProvider _tempFileProvider; 27 | 28 | /* 29 | /************************************************************************ 30 | /* Construction 31 | /************************************************************************ 32 | */ 33 | 34 | public SortConfig() 35 | { 36 | _mergeFactor = DEFAULT_MERGE_FACTOR; 37 | _maxMemoryUsage = DEFAULT_MEMORY_USAGE; 38 | _tempFileProvider = new StdTempFileProvider(); 39 | } 40 | 41 | protected SortConfig(SortConfig base, int mergeFactor) { 42 | _maxMemoryUsage = base._maxMemoryUsage; 43 | _mergeFactor = mergeFactor; 44 | _tempFileProvider = base._tempFileProvider; 45 | } 46 | 47 | protected SortConfig(SortConfig base, long maxMem) { 48 | _maxMemoryUsage = maxMem; 49 | _mergeFactor = base._mergeFactor; 50 | _tempFileProvider = base._tempFileProvider; 51 | } 52 | 53 | protected SortConfig(SortConfig base, TempFileProvider prov) { 54 | _mergeFactor = base._mergeFactor; 55 | _maxMemoryUsage = base._maxMemoryUsage; 56 | _tempFileProvider = prov; 57 | } 58 | 59 | /* 60 | /************************************************************************ 61 | /* Accessors 62 | /************************************************************************ 63 | */ 64 | 65 | public int getMergeFactor() { return _mergeFactor; } 66 | 67 | public long getMaxMemoryUsage() { return _maxMemoryUsage; } 68 | 69 | public TempFileProvider getTempFileProvider() { return _tempFileProvider; } 70 | 71 | /* 72 | /************************************************************************ 73 | /* Fluent construction methods 74 | /************************************************************************ 75 | */ 76 | 77 | /** 78 | * Method 
for constructing configuration instance that defines that maximum amount 79 | * of memory to use for pre-sorting. This is generally a crude approximation and 80 | * implementations make best effort to honor it. 81 | * 82 | * @param maxMem Maximum memory that pre-sorted should use for in-memory sorting 83 | * @return New 84 | */ 85 | public SortConfig withMaxMemoryUsage(long maxMem) 86 | { 87 | if (maxMem == _maxMemoryUsage) { 88 | return this; 89 | } 90 | return new SortConfig(this, maxMem); 91 | } 92 | 93 | public SortConfig withTempFileProvider(TempFileProvider provider) 94 | { 95 | if (provider == _tempFileProvider) { 96 | return this; 97 | } 98 | return new SortConfig(this, provider); 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/Sorter.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.OutputStream; 6 | import java.util.Comparator; 7 | import java.util.Iterator; 8 | 9 | /** 10 | * Main entry point for sorting functionality; object that drives 11 | * the sorting process from pre-sort to final output. 12 | * Instances are not thread-safe, although they are reusable. 13 | * Since the cost of creating new instances is trivial, there is usually 14 | * no benefit from reusing instances, other than possible convenience. 15 | */ 16 | public class Sorter extends IteratingSorter 17 | { 18 | /** 19 | * @param config Configuration for the sorter 20 | * @param readerFactory Factory used for creating readers for pre-sorted data; 21 | * as well as for input if an {@link InputStream} is passed as source 22 | * @param writerFactory Factory used for creating writers for storing pre-sorted data; 23 | * as well as for results if an {@link OutputStream} is passed as destination. 
24 | */ 25 | public Sorter(SortConfig config, 26 | DataReaderFactory readerFactory, 27 | DataWriterFactory writerFactory, 28 | Comparator comparator) 29 | { 30 | super(config, readerFactory, writerFactory, comparator); 31 | } 32 | 33 | public Sorter() { 34 | super(); 35 | } 36 | 37 | public Sorter(SortConfig config) { 38 | super(config); 39 | } 40 | 41 | protected Sorter withReaderFactory(DataReaderFactory f) { 42 | return new Sorter(_config, f, _writerFactory, _comparator); 43 | } 44 | 45 | protected Sorter withWriterFactory(DataWriterFactory f) { 46 | return new Sorter(_config, _readerFactory, f, _comparator); 47 | } 48 | 49 | protected Sorter withComparator(Comparator cmp) { 50 | return new Sorter(_config, _readerFactory, _writerFactory, cmp); 51 | } 52 | 53 | 54 | /* 55 | /********************************************************************** 56 | /* Main sorting API 57 | /********************************************************************** 58 | */ 59 | 60 | /** 61 | * Method that will perform full sort on specified input, writing results 62 | * into specified destination. Data conversions needed are done 63 | * using {@link DataReaderFactory} and {@link DataWriterFactory} configured 64 | * for this sorter. 65 | */ 66 | public void sort(InputStream source, OutputStream destination) 67 | throws IOException 68 | { 69 | sort(_readerFactory.constructReader(source), 70 | _writerFactory.constructWriter(destination)); 71 | } 72 | 73 | /** 74 | * Method that will perform full sort on input data read using given 75 | * {@link DataReader}, and written out using specified {@link DataWriter}. 76 | * Conversions to and from intermediate sort files is done 77 | * using {@link DataReaderFactory} and {@link DataWriterFactory} configured 78 | * for this sorter. 
79 | * 80 | * @return true if sorting completed successfully; false if it was cancelled 81 | */ 82 | public boolean sort(DataReader inputReader, DataWriter resultWriter) 83 | throws IOException 84 | { 85 | Iterator it = super.sort(inputReader); 86 | if(it == null) { 87 | return false; 88 | } 89 | try { 90 | while(it.hasNext()) { 91 | T value = it.next(); 92 | resultWriter.writeEntry(value); 93 | } 94 | resultWriter.close(); 95 | } finally { 96 | super.close(); 97 | } 98 | return true; 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/SorterBase.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort; 2 | 3 | import java.io.*; 4 | import java.util.*; 5 | import java.util.concurrent.atomic.AtomicBoolean; 6 | 7 | import com.fasterxml.sort.util.SegmentedBuffer; 8 | 9 | public abstract class SorterBase 10 | implements SortingState 11 | { 12 | /* each entry (in buffer) takes about 4 bytes on 32-bit machine; but let's be 13 | * conservative and use 8 as base, plus size of object itself. 14 | */ 15 | private final static long ENTRY_SLOT_SIZE = 8L; 16 | 17 | /* 18 | /********************************************************************** 19 | /* Configuration 20 | /********************************************************************** 21 | */ 22 | 23 | protected final SortConfig _config; 24 | 25 | /** 26 | * Factory used for reading intermediate sorted files. 27 | */ 28 | protected DataReaderFactory _readerFactory; 29 | 30 | /** 31 | * Factory used for writing intermediate sorted files. 
32 | */ 33 | protected DataWriterFactory _writerFactory; 34 | 35 | /** 36 | * Comparator to use for sorting entries; defaults to 'C 37 | */ 38 | protected Comparator _comparator; 39 | 40 | /* 41 | /********************************************************************** 42 | /* State 43 | /********************************************************************** 44 | */ 45 | 46 | protected SortingState.Phase _phase; 47 | 48 | protected int _presortFileCount; 49 | 50 | protected int _sortRoundCount; 51 | 52 | protected int _currentSortRound; 53 | 54 | protected final AtomicBoolean _cancelRequest = new AtomicBoolean(false); 55 | 56 | protected Exception _cancelForException; 57 | 58 | /* 59 | /********************************************************************** 60 | /* Construction 61 | /********************************************************************** 62 | */ 63 | 64 | protected SorterBase(SortConfig config, 65 | DataReaderFactory readerFactory, 66 | DataWriterFactory writerFactory, 67 | Comparator comparator) 68 | { 69 | _config = config; 70 | 71 | _readerFactory = readerFactory; 72 | _writerFactory = writerFactory; 73 | _comparator = comparator; 74 | 75 | _phase = null; 76 | } 77 | 78 | protected SorterBase() { 79 | this(new SortConfig()); 80 | } 81 | 82 | protected SorterBase(SortConfig config) { 83 | this(config, null, null, null); 84 | } 85 | 86 | /* 87 | /********************************************************************** 88 | /* SortingState implementation 89 | /********************************************************************** 90 | */ 91 | 92 | @Override 93 | public void cancel() { 94 | _cancelForException = null; 95 | _cancelRequest.set(true); 96 | } 97 | 98 | @Override 99 | public void cancel(RuntimeException e) { 100 | _cancelForException = e; 101 | _cancelRequest.set(true); 102 | } 103 | 104 | @Override 105 | public void cancel(IOException e) { 106 | _cancelForException = e; 107 | _cancelRequest.set(true); 108 | } 109 | 110 | @Override 111 | 
public Phase getPhase() { 112 | return _phase; 113 | } 114 | 115 | @Override 116 | public int getNumberOfSortRounds() { 117 | return _sortRoundCount; 118 | } 119 | 120 | @Override 121 | public int getNumberOfPreSortFiles() { 122 | return _presortFileCount; 123 | } 124 | 125 | @Override 126 | public int getSortRound() { 127 | return _currentSortRound; 128 | } 129 | 130 | @Override 131 | public boolean isCompleted() { 132 | return (_phase == SortingState.Phase.COMPLETE); 133 | } 134 | 135 | @Override 136 | public boolean isPreSorting() { 137 | return (_phase == SortingState.Phase.PRE_SORTING); 138 | } 139 | 140 | @Override 141 | public boolean isSorting() { 142 | return (_phase == SortingState.Phase.SORTING); 143 | } 144 | 145 | /* 146 | /********************************************************************** 147 | /* Internal methods, pre-sorting 148 | /********************************************************************** 149 | */ 150 | 151 | /** 152 | * Helper method that will fill given buffer with data read using 153 | * given reader, obeying given memory usage constraints. 154 | */ 155 | protected Object[] _readMax(DataReader inputReader, SegmentedBuffer buffer, 156 | long memoryToUse, T firstItem) 157 | throws IOException 158 | { 159 | // how much memory do we expect largest remaining entry to take? 
160 | int ptr = 0; 161 | Object[] segment = buffer.resetAndStart(); 162 | int segmentLength = segment.length; 163 | long minMemoryNeeded; 164 | 165 | if (firstItem != null) { 166 | segment[ptr++] = firstItem; 167 | long firstSize = ENTRY_SLOT_SIZE + inputReader.estimateSizeInBytes(firstItem); 168 | minMemoryNeeded = Math.max(firstSize, 256L); 169 | } else { 170 | minMemoryNeeded = 256L; 171 | } 172 | 173 | // reduce mem amount by buffer cost too: 174 | memoryToUse -= (ENTRY_SLOT_SIZE * segmentLength); 175 | 176 | while (true) { 177 | T value = inputReader.readNext(); 178 | if (value == null) { 179 | break; 180 | } 181 | long size = inputReader.estimateSizeInBytes(value); 182 | if (size > minMemoryNeeded) { 183 | minMemoryNeeded = size; 184 | } 185 | if (ptr >= segmentLength) { 186 | segment = buffer.appendCompletedChunk(segment); 187 | segmentLength = segment.length; 188 | memoryToUse -= (ENTRY_SLOT_SIZE * segmentLength); 189 | ptr = 0; 190 | } 191 | segment[ptr++] = value; 192 | memoryToUse -= size; 193 | if (memoryToUse < minMemoryNeeded) { 194 | break; 195 | } 196 | } 197 | return buffer.completeAndClearBuffer(segment, ptr); 198 | } 199 | 200 | protected void _presort(DataReader inputReader, SegmentedBuffer buffer, T nextValue, 201 | List presorted) 202 | throws IOException 203 | { 204 | do { 205 | Object[] items = _readMax(inputReader, buffer, _config.getMaxMemoryUsage(), nextValue); 206 | Arrays.sort(items, _rawComparator()); 207 | presorted.add(_writePresorted(items)); 208 | nextValue = inputReader.readNext(); 209 | } while (nextValue != null); 210 | } 211 | 212 | @SuppressWarnings("resource") 213 | protected File _writePresorted(Object[] items) throws IOException 214 | { 215 | File tmp = _config.getTempFileProvider().provide(); 216 | @SuppressWarnings("unchecked") 217 | DataWriter writer = (DataWriter) _writerFactory.constructWriter(new FileOutputStream(tmp)); 218 | boolean closed = false; 219 | try { 220 | ++_presortFileCount; 221 | for (int i = 0, end = 
items.length; i < end; ++i) { 222 | writer.writeEntry(items[i]); 223 | // to further reduce transient mem usage, clear out the ref 224 | items[i] = null; 225 | } 226 | closed = true; 227 | writer.close(); 228 | } finally { 229 | if (!closed) { 230 | // better swallow since most likely we are getting an exception already... 231 | try { writer.close(); } catch (IOException e) { } 232 | } 233 | } 234 | return tmp; 235 | } 236 | 237 | /* 238 | /********************************************************************** 239 | /* Internal methods, sorting, output 240 | /********************************************************************** 241 | */ 242 | 243 | /** 244 | * Main-level merge method that sorts the given input and writes to final output. 245 | */ 246 | protected void merge(List presorted, DataWriter resultWriter) 247 | throws IOException 248 | { 249 | List inputs = merge(presorted); 250 | // and then last around to produce the result file 251 | _merge(inputs, resultWriter); 252 | } 253 | 254 | /** 255 | * Main-level merge method that sorts the given input. 256 | * @return List of files that are individually sorted and ready for final merge. 257 | */ 258 | protected List merge(List presorted) 259 | throws IOException 260 | { 261 | // Ok, let's see how many rounds we should have... 
262 | final int mergeFactor = _config.getMergeFactor(); 263 | _sortRoundCount = _calculateRoundCount(presorted.size(), mergeFactor); 264 | _currentSortRound = 0; 265 | 266 | // first intermediate rounds 267 | List inputs = presorted; 268 | while (inputs.size() > mergeFactor) { 269 | ArrayList outputs = new ArrayList(1 + ((inputs.size() + mergeFactor - 1) / mergeFactor)); 270 | for (int offset = 0, end = inputs.size(); offset < end; offset += mergeFactor) { 271 | int localEnd = Math.min(offset + mergeFactor, end); 272 | outputs.add(_merge(inputs.subList(offset, localEnd))); 273 | } 274 | ++_currentSortRound; 275 | // and then switch result files to be input files 276 | inputs = outputs; 277 | } 278 | return inputs; 279 | } 280 | 281 | protected void _writeAll(DataWriter resultWriter, Object[] items) 282 | throws IOException 283 | { 284 | // need to go through acrobatics, due to type erasure... works, if ugly: 285 | @SuppressWarnings("unchecked") 286 | DataWriter writer = (DataWriter) resultWriter; 287 | for (Object item : items) { 288 | writer.writeEntry(item); 289 | } 290 | } 291 | 292 | @SuppressWarnings("resource") 293 | protected File _merge(List inputs) 294 | throws IOException 295 | { 296 | File resultFile = _config.getTempFileProvider().provide(); 297 | _merge(inputs, _writerFactory.constructWriter(new FileOutputStream(resultFile))); 298 | return resultFile; 299 | } 300 | 301 | protected void _merge(List inputs, DataWriter writer) 302 | throws IOException 303 | { 304 | DataReader merger = null; 305 | try { 306 | merger = _createMergeReader(inputs); 307 | T value; 308 | while ((value = merger.readNext()) != null) { 309 | writer.writeEntry(value); 310 | } 311 | merger.close(); // usually not necessary (reader should close on eof) but... 
312 | merger = null; 313 | writer.close(); 314 | } finally { 315 | if (merger != null) { 316 | try { merger.close(); } catch (IOException e) { } 317 | } 318 | for (File input : inputs) { 319 | input.delete(); 320 | } 321 | } 322 | } 323 | 324 | protected DataReader _createMergeReader(List inputs) throws IOException { 325 | ArrayList> readers = new ArrayList>(inputs.size()); 326 | for (File mergedInput : inputs) { 327 | readers.add(_readerFactory.constructReader(new FileInputStream(mergedInput))); 328 | } 329 | return Merger.mergedReader(_comparator, readers); 330 | } 331 | 332 | /* 333 | /********************************************************************** 334 | /* Internal methods, other 335 | /********************************************************************** 336 | */ 337 | 338 | protected static int _calculateRoundCount(int files, int mergeFactor) 339 | { 340 | int count = 1; 341 | while (files > mergeFactor) { 342 | ++count; 343 | files = (files + mergeFactor - 1) / mergeFactor; 344 | } 345 | return count; 346 | } 347 | 348 | protected boolean _checkForCancel() throws IOException 349 | { 350 | return _checkForCancel(null); 351 | } 352 | 353 | protected boolean _checkForCancel(Collection tmpFilesToDelete) throws IOException 354 | { 355 | if (!_cancelRequest.get()) { 356 | return false; 357 | } 358 | if (tmpFilesToDelete != null) { 359 | for (File f : tmpFilesToDelete) { 360 | f.delete(); 361 | } 362 | } 363 | if (_cancelForException != null) { 364 | // can only be an IOException or RuntimeException, so 365 | if (_cancelForException instanceof RuntimeException) { 366 | throw (RuntimeException) _cancelForException; 367 | } 368 | throw (IOException) _cancelForException; 369 | } 370 | return true; 371 | } 372 | 373 | @SuppressWarnings("unchecked") 374 | protected Comparator _rawComparator() { 375 | return (Comparator) _comparator; 376 | } 377 | } 378 | -------------------------------------------------------------------------------- 
/src/main/java/com/fasterxml/sort/SortingState.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort; 2 | 3 | import java.io.IOException; 4 | 5 | /** 6 | * Interface that defines how calling application can interact with a {@link Sorter}; both 7 | * by accessing progress information and by requesting cancellation if necessary. 8 | * It is implemented by {@link Sorter}. 9 | */ 10 | public interface SortingState 11 | { 12 | /** 13 | * Different phases that sorter goes through 14 | */ 15 | public enum Phase { 16 | PRE_SORTING, 17 | SORTING, 18 | COMPLETE 19 | } 20 | 21 | /* 22 | /************************************************************************ 23 | /* Accessors 24 | /************************************************************************ 25 | */ 26 | 27 | public Phase getPhase(); 28 | 29 | /** 30 | * Accessor for determining whether sorter is in its in-memory pre-sorting phase. 31 | */ 32 | public boolean isPreSorting(); 33 | 34 | /** 35 | * Accessor for determining whether sorter is in regular merge-sort phase or not. 36 | */ 37 | public boolean isSorting(); 38 | 39 | /** 40 | * Accessor for determining whether sorting has been successfully completed or not. 41 | */ 42 | public boolean isCompleted(); 43 | 44 | /** 45 | * Accessor for checking how many pre-sort files were created during 46 | * pre-sort phase. Can be zero if the whole data fit in memory during 47 | * pre-sorting. 48 | */ 49 | public int getNumberOfPreSortFiles(); 50 | 51 | /** 52 | * Accessor for checking which sorting round sorter is doing: for pre-sort 53 | * it basically means number of segment (0-based) that is being processed 54 | * in-memory, for regular sort it is number of (0-based) sorting round. 55 | */ 56 | public int getSortRound(); 57 | 58 | /** 59 | * Accessor for figuring out how many regular sorting rounds need to be taken to 60 | * complete sorting, if known. If information is not known, will return -1. 
61 | * This information generally becomes available after pre-sorting round. 62 | */ 63 | public int getNumberOfSortRounds(); 64 | 65 | /* 66 | /************************************************************************ 67 | /* Cancellation 68 | /************************************************************************ 69 | */ 70 | 71 | /** 72 | * Method that can be used to try to cancel executing sort operation. 73 | * No exception will be thrown; sorting will just be stopped as soon as 74 | * sorting thread notices request. 75 | */ 76 | public void cancel(); 77 | 78 | /** 79 | * Method that can be used to try to cancel executing sort operation. 80 | * Exception object can be specified; if non-null instance is given, 81 | * it will be thrown to indicate erroneous result, otherwise sorting is 82 | * just interrupted but execution returns normally. 83 | */ 84 | public void cancel(RuntimeException e); 85 | 86 | /** 87 | * Method that can be used to try to cancel executing sort operation. 88 | * Exception object can be specified; if non-null instance is given, 89 | * it will be thrown to indicate erroneous result, otherwise sorting is 90 | * just interrupted but execution returns normally. 91 | */ 92 | public void cancel(IOException e); 93 | } 94 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/TempFileProvider.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | 6 | /** 7 | * Interface used for object that can handle constructing of temporary files that are 8 | * needed during sort and non-final merge phases. 
9 | * 10 | * @author tatu 11 | * 12 | */ 13 | public interface TempFileProvider 14 | { 15 | public File provide() throws IOException; 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/std/ByteArrayComparator.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort.std; 2 | 3 | import java.util.Comparator; 4 | 5 | /** 6 | * Simple implementation of comparator for byte arrays which 7 | * will compare using unsigned byte values (meaning 8 | * that 0xFF is greater than 0x00, for example). 9 | */ 10 | public class ByteArrayComparator 11 | implements Comparator 12 | { 13 | @Override 14 | public int compare(byte[] o1, byte[] o2) 15 | { 16 | final int len = Math.min(o1.length, o2.length); 17 | for (int i = 0; i < len; ++i) { 18 | // alas, sign extension means we must do masking... 19 | int diff = (o1[i] & 0xFF) - (o2[i] & 0xFF); 20 | if (diff != 0) { 21 | return diff; 22 | } 23 | } 24 | return o1.length - o2.length; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/std/RawTextLineReader.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort.std; 2 | 3 | import java.io.*; 4 | import java.util.Arrays; 5 | 6 | import com.fasterxml.sort.*; 7 | 8 | /** 9 | * Efficient reader for data that consists of text lines, i.e. character 10 | * data separated by one of standard line feeds (CR, LF or CR+LF).
11 | * For efficiency no decoding is done 12 | */ 13 | public class RawTextLineReader 14 | extends DataReader 15 | { 16 | protected final static byte BYTE_CR = (byte) '\r'; 17 | protected final static byte BYTE_LF = (byte) '\n'; 18 | 19 | protected final InputStream _in; 20 | 21 | protected boolean _closed = false; 22 | 23 | protected byte[] _inputBuffer = new byte[16000]; 24 | protected int _inputPtr = 0; 25 | protected int _inputEnd = 0; 26 | 27 | /** 28 | * Marker we set if the last line ended with a CR, since it 29 | * may be followed by a trailing LF as part of two-byte linefeed. 30 | */ 31 | protected boolean _hadCR = false; 32 | 33 | protected ByteArrayOutputStream _tmpBytes; 34 | 35 | public RawTextLineReader(InputStream in) 36 | { 37 | _in = in; 38 | } 39 | 40 | /** 41 | * Convenience method for instantiating factory to create instances of 42 | * this {@link DataReader}. 43 | */ 44 | public static Factory factory() { 45 | return new Factory(); 46 | } 47 | 48 | @Override 49 | public void close() throws IOException 50 | { 51 | if (!_closed) { 52 | _closed = true; 53 | _in.close(); 54 | } 55 | } 56 | 57 | @Override 58 | public int estimateSizeInBytes(byte[] item) 59 | { 60 | // Wild guess: array objects take at least 8 bytes, probably 12 or 16. 61 | // And size of actual array storage rounded up to 4-byte alignment. 
So: 62 | 63 | int bytes = item.length; 64 | bytes = ((bytes + 3) >> 2) << 2; 65 | return 16 + bytes; 66 | } 67 | 68 | @Override 69 | public byte[] readNext() throws IOException 70 | { 71 | if (_closed) { 72 | return null; 73 | } 74 | if (_inputPtr >= _inputEnd) { 75 | if (!_loadMore()) { 76 | close(); 77 | return null; 78 | } 79 | } 80 | 81 | // first thing(s) first: skip a linefeed we might have 82 | if (_hadCR) { 83 | if (!_skipLF()) { 84 | return null; 85 | } 86 | } 87 | 88 | // set the start point after our call to _skipLF() so that if a linefeed is skipped, we also skip it in Arrays.copyOfRange below 89 | final int start = _inputPtr; 90 | 91 | // then common case: we find full row: 92 | final int end = _inputEnd; 93 | while (_inputPtr < end) { 94 | byte b = _inputBuffer[_inputPtr++]; 95 | if (b == BYTE_CR || b == BYTE_LF) { 96 | _hadCR = (b == BYTE_CR); 97 | return Arrays.copyOfRange(_inputBuffer, start, _inputPtr-1); 98 | } 99 | } 100 | // but if not, need to buffer 101 | return _readNextSlow(start); 102 | } 103 | 104 | protected final byte[] _readNextSlow(int start) throws IOException 105 | { 106 | ByteArrayOutputStream bytes = _tmpBytes; 107 | if (bytes == null) { 108 | _tmpBytes = bytes = new ByteArrayOutputStream(); 109 | } else { 110 | bytes.reset(); 111 | } 112 | // add stuff we have seen so far, and... 
113 | bytes.write(_inputBuffer, start, _inputEnd - start); 114 | 115 | main_loop: 116 | while (true) { 117 | if (!_loadMore()) { 118 | close(); 119 | break; 120 | } 121 | for (int i = 0, end = _inputEnd; i < end; ++i) { 122 | byte b = _inputBuffer[_inputPtr++]; 123 | if (b == BYTE_CR || b == BYTE_LF) { 124 | _hadCR = (b == BYTE_CR); 125 | bytes.write(_inputBuffer, 0, _inputPtr-1); 126 | break main_loop; 127 | } 128 | } 129 | // no linefeed in this chunk: append all of it, because next _loadMore() overwrites the buffer 130 | bytes.write(_inputBuffer, 0, _inputEnd); 131 | } 132 | return bytes.toByteArray(); 133 | } 134 | 135 | /* 136 | /********************************************************************** 137 | /* Internal methods 138 | /********************************************************************** 139 | */ 140 | 141 | protected boolean _loadMore() throws IOException 142 | { 143 | int count = _in.read(_inputBuffer); 144 | if (count < 0) { 145 | return false; 146 | } 147 | _inputPtr = 0; 148 | _inputEnd = count; 149 | return true; 150 | } 151 | 152 | protected boolean _skipLF() throws IOException 153 | { 154 | _hadCR = false; 155 | if (_inputBuffer[_inputPtr] == BYTE_LF) { 156 | ++_inputPtr; 157 | if (_inputPtr >= _inputEnd) { 158 | if (!_loadMore()) { 159 | close(); 160 | return false; 161 | } 162 | } 163 | } 164 | return true; 165 | } 166 | 167 | /* 168 | /********************************************************************** 169 | /* Helper classes 170 | /********************************************************************** 171 | */ 172 | 173 | public static class Factory 174 | extends DataReaderFactory 175 | { 176 | @Override 177 | public DataReader constructReader(InputStream in) { 178 | return new RawTextLineReader(in); 179 | } 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/std/RawTextLineWriter.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort.std; 2 | 3 | import java.io.*; 4 | 5 | import com.fasterxml.sort.*; 6 | 7 | public class
RawTextLineWriter 8 | extends DataWriter 9 | { 10 | protected final static byte[] STD_LINEFEED_CR = new byte[] { '\r' }; 11 | protected final static byte[] STD_LINEFEED_LF = new byte[] { '\n' }; 12 | protected final static byte[] STD_LINEFEED_CRLF = new byte[] { '\r', '\n' }; 13 | 14 | protected final static byte[] DEFAULT_LINEFEED = STD_LINEFEED_LF; 15 | 16 | protected final OutputStream _out; 17 | 18 | /** 19 | * Linefeed used after entries 20 | */ 21 | protected final byte[] _lf; 22 | 23 | protected boolean _closed = false; 24 | 25 | public RawTextLineWriter(OutputStream out) { 26 | this(out, DEFAULT_LINEFEED); 27 | } 28 | 29 | public RawTextLineWriter(OutputStream out, byte[] linefeed) 30 | { 31 | _out = out; 32 | _lf = linefeed; 33 | } 34 | 35 | /** 36 | * Convenience method for instantiating factory to create instances of 37 | * this {@link DataWriter}. 38 | */ 39 | public static Factory factory() { 40 | return new Factory(); 41 | } 42 | 43 | /** 44 | * Convenience method for instantiating factory to create instances of 45 | * this {@link DataWriter}. 46 | */ 47 | public static Factory factory(byte[] linefeed) { 48 | return new Factory(linefeed); 49 | } 50 | 51 | @Override 52 | public void close() throws IOException { 53 | if (!_closed) { 54 | _closed = true; 55 | _out.close(); 56 | } 57 | } 58 | 59 | @Override 60 | public void writeEntry(byte[] item) throws IOException 61 | { 62 | if (_closed) { 63 | throw new IOException("Can not write using closed DataWriter"); 64 | } 65 | _out.write(item); 66 | if (_lf != null) { 67 | _out.write(_lf); 68 | } 69 | } 70 | 71 | /* 72 | /********************************************************************** 73 | /* Helper classes 74 | /********************************************************************** 75 | */ 76 | 77 | /** 78 | * Basic factory implementation. The only noteworthy things are: 79 | *
<ul> 80 | * <li>Ability to configure linefeed to use (including none, pass null)</li> 81 | * <li>Writer uses {@link BufferedOutputStream} by default (can be disabled)</li> 82 | *</ul>
83 | */ 84 | public static class Factory 85 | extends DataWriterFactory 86 | { 87 | protected final byte[] _linefeed; 88 | protected final boolean _addBuffering; 89 | 90 | public Factory() { 91 | this(DEFAULT_LINEFEED); 92 | } 93 | 94 | public Factory(byte[] linefeed) { 95 | this(linefeed, true); 96 | } 97 | 98 | public Factory(byte[] linefeed, boolean addBuffering) { 99 | _linefeed = linefeed; 100 | _addBuffering = addBuffering; 101 | } 102 | 103 | @Override 104 | public DataWriter constructWriter(OutputStream out) { 105 | if (_addBuffering) { 106 | if (!(out instanceof BufferedOutputStream)) { 107 | out = new BufferedOutputStream(out); 108 | } 109 | } 110 | return new RawTextLineWriter(out, _linefeed); 111 | } 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/std/StdComparator.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort.std; 2 | 3 | import java.util.Comparator; 4 | 5 | /** 6 | * Basic comparator implementation that works on types that implement 7 | * {@link Comparable}; a null value sorts before any non-null value. 8 | */ 9 | public class StdComparator> implements Comparator 10 | { 11 | @Override 12 | public int compare(T object1, T object2) { 13 | if (object1 == object2) return 0; 14 | if (object1 == null) return -1; 15 | // mirror the null-first rule so that compare(x, null) does not end up in compareTo(null) 16 | if (object2 == null) return 1; 17 | return object1.compareTo(object2); 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/std/StdTempFileProvider.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort.std; 2 | 3 | import java.io.*; 4 | 5 | import com.fasterxml.sort.TempFileProvider; 6 | 7 | /** 8 | * Default {@link TempFileProvider} implementation which uses JDK default 9 | * temporary file generation mechanism.
10 | * 11 | * @author tatu 12 | */ 13 | public class StdTempFileProvider 14 | implements TempFileProvider 15 | { 16 | /** 17 | * Default temporary file prefix to use. 18 | */ 19 | public final static String DEFAULT_PREFIX = "j-merge-sort-"; 20 | 21 | /** 22 | * Default temporary file suffix to use. 23 | */ 24 | public final static String DEFAULT_SUFFIX = ".tmp"; 25 | 26 | protected final String _prefix; 27 | protected final String _suffix; 28 | 29 | public StdTempFileProvider() { this(DEFAULT_PREFIX, DEFAULT_SUFFIX); } 30 | public StdTempFileProvider(String prefix, String suffix) { 31 | _prefix = prefix; 32 | _suffix = suffix; 33 | } 34 | 35 | @Override 36 | public File provide() throws IOException 37 | { 38 | File f = File.createTempFile(_prefix, _suffix); 39 | f.deleteOnExit(); 40 | return f; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/std/TextFileSorter.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort.std; 2 | 3 | import java.io.*; 4 | 5 | import com.fasterxml.sort.*; 6 | 7 | /** 8 | * Basic {@link Sorter} implementation that operates on text line input. 
9 | */ 10 | public class TextFileSorter extends Sorter 11 | { 12 | /** 13 | * Let's limit maximum memory used for pre-sorting when invoked from command-line to be 14 | * 256 megs 15 | */ 16 | public final static long MAX_HEAP_FOR_PRESORT = 256L * 1024 * 1024; 17 | 18 | /** 19 | * Also just in case our calculations are wrong, require 10 megs for pre-sort anyway 20 | * (if invoked from CLI) 21 | */ 22 | public final static long MIN_HEAP_FOR_PRESORT = 10L * 1024 * 1024; 23 | 24 | public TextFileSorter() { 25 | this(new SortConfig()); 26 | } 27 | 28 | public TextFileSorter(SortConfig config) 29 | { 30 | super(config, 31 | RawTextLineReader.factory(), RawTextLineWriter.factory(), 32 | new ByteArrayComparator()); 33 | } 34 | 35 | /* 36 | /********************************************************************** 37 | /* Main method for simple command-line operation for line-based 38 | /* sorting using default ISO-8859-1 collation (i.e. byte-by-byte sorting) 39 | /********************************************************************** 40 | */ 41 | 42 | public static void main(String[] args) throws Exception 43 | { 44 | if (args.length > 1) { 45 | System.err.println("Usage: java "+TextFileSorter.class.getName()+" [input-file]"); 46 | System.err.println("(where input-file is optional; if missing, read from STDIN)"); 47 | System.exit(1); 48 | } 49 | 50 | // One more thing: use 50% of memory (but no more than 256 megs, i.e. MAX_HEAP_FOR_PRESORT) for pre-sort 51 | // minor tweak: consider first 40 megs to go for other overhead...
52 | long availMem = Runtime.getRuntime().maxMemory() - (40 * 1024 * 1024); 53 | long maxMem = (availMem >> 1); 54 | if (maxMem > MAX_HEAP_FOR_PRESORT) { 55 | maxMem = MAX_HEAP_FOR_PRESORT; 56 | } else if (maxMem < MIN_HEAP_FOR_PRESORT) { 57 | maxMem = MIN_HEAP_FOR_PRESORT; 58 | } 59 | final TextFileSorter sorter = new TextFileSorter(new SortConfig().withMaxMemoryUsage(maxMem)); 60 | final InputStream in; 61 | 62 | if (args.length == 0) { 63 | in = System.in; 64 | } else { 65 | File input = new File(args[0]); 66 | if (!input.exists() || input.isDirectory()) { 67 | System.err.println("File '"+input.getAbsolutePath()+"' does not exist (or is not file)"); 68 | System.exit(2); 69 | } 70 | in = new FileInputStream(input); 71 | } 72 | 73 | // To be able to print out progress, need to spin one additional thread... 74 | new Thread(new Runnable() { 75 | @Override 76 | public void run() { 77 | final long start = System.currentTimeMillis(); 78 | try { 79 | while (!sorter.isCompleted()) { 80 | Thread.sleep(5000L); 81 | if (sorter.isPreSorting()) { 82 | System.err.printf(" pre-sorting: %d files written\n", sorter.getNumberOfPreSortFiles()); 83 | } else if (sorter.isSorting()) { 84 | System.err.printf(" sorting, round: %d/%d\n", 85 | sorter.getSortRound(), sorter.getNumberOfSortRounds()); 86 | } 87 | } 88 | double secs = (System.currentTimeMillis() - start) / 1000.0; 89 | System.err.printf("Completed: took %.1f seconds.\n", secs); 90 | } catch (InterruptedException e) { 91 | double secs = (System.currentTimeMillis() - start) / 1000.0; 92 | System.err.printf("[INTERRUPTED] -- took %.1f seconds.\n", secs); 93 | } 94 | } 95 | }).start(); 96 | sorter.sort(in, System.out); 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/util/BlockingQueueReader.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort.util; 2 | 3 | import 
java.io.IOException; 4 | import java.io.InterruptedIOException; 5 | import java.util.concurrent.BlockingQueue; 6 | 7 | import com.fasterxml.sort.DataReader; 8 | 9 | /** 10 | * Base implementation for {@link DataReader} that uses a 11 | * {@link BlockingQueue} for getting input. 12 | * The only missing part is implementation for 13 | * {@link #estimateSizeInBytes(Object)}, since there is no way 14 | * to provide a meaningful estimate without knowing object type. 15 | */ 16 | public abstract class BlockingQueueReader 17 | extends DataReader 18 | { 19 | protected final BlockingQueue _queue; 20 | 21 | protected final E _endMarker; 22 | 23 | protected boolean _closed; 24 | 25 | @Deprecated 26 | public BlockingQueueReader(BlockingQueue q) { 27 | this(q, null); 28 | } 29 | 30 | /** 31 | * @param q Queue to read entries from 32 | * @param endMarker Value that is used to signal end-of-input; when this value 33 | * is gotten from queue, reader assumes that no more input is coming and 34 | * will return null from {@link #readNext}. 
35 | */ 36 | public BlockingQueueReader(BlockingQueue q, E endMarker) { 37 | _queue = q; 38 | _endMarker = endMarker; 39 | } 40 | 41 | @Override 42 | public void close() throws IOException { 43 | _closed = true; 44 | } 45 | 46 | @Override 47 | public abstract int estimateSizeInBytes(E item); 48 | 49 | @Override 50 | public E readNext() throws IOException { 51 | if (_closed) { 52 | return null; 53 | } 54 | try { 55 | E value = _queue.take(); 56 | if (value == _endMarker) { 57 | _closed = true; 58 | return null; 59 | } 60 | return value; 61 | } catch (InterruptedException e) { 62 | // restore interrupt status (cleared when InterruptedException was thrown) so callers can still observe it 63 | Thread.currentThread().interrupt(); 64 | InterruptedIOException ie = new InterruptedIOException(); 65 | ie.initCause(e); 66 | throw ie; 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/util/CastingIterator.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort.util; 2 | 3 | import java.util.Iterator; 4 | 5 | public class CastingIterator implements Iterator { 6 | private final Iterator _it; 7 | 8 | public CastingIterator(Iterator it) { 9 | _it = it; 10 | } 11 | 12 | @Override 13 | public boolean hasNext() { 14 | return _it.hasNext(); 15 | } 16 | 17 | @SuppressWarnings("unchecked") 18 | @Override 19 | public T next() { 20 | return (T)_it.next(); 21 | } 22 | 23 | @Override 24 | public void remove() { 25 | throw new UnsupportedOperationException(); 26 | } 27 | } -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/util/CollectionReader.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort.util; 2 | 3 | import java.io.IOException; 4 | import java.util.*; 5 | 6 | import com.fasterxml.sort.DataReader; 7 | 8 | /** 9 | * Simple {@link DataReader} implementation that can be used to 10 | * serve items from a {@link Collection} (or {@link Iterator}).
11 | * Note that implementation of {@link #estimateSizeInBytes} is 12 | * naive and returns 1 for all items; it must be redefined if 13 | * memory limits are to be enforced, or alternatively 14 | * Sorter should be configured with maximum number of 15 | * items to use as memory limit. 16 | */ 17 | public class CollectionReader extends DataReader 18 | { 19 | protected Iterator _items; 20 | 21 | public CollectionReader(Collection items) { 22 | this(items.iterator()); 23 | } 24 | 25 | public CollectionReader(Iterator items) { 26 | _items = items; 27 | } 28 | 29 | @Override 30 | public T readNext() 31 | { 32 | if (_items == null) { 33 | return null; 34 | } 35 | if (!_items.hasNext()) { 36 | _items = null; 37 | return null; 38 | } 39 | return _items.next(); 40 | } 41 | 42 | @Override 43 | public int estimateSizeInBytes(T item) { 44 | return 1; 45 | } 46 | 47 | @Override 48 | public void close() throws IOException { 49 | // no-op 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/com/fasterxml/sort/util/NaturalComparator.java: -------------------------------------------------------------------------------- 1 | package com.fasterxml.sort.util; 2 | 3 | import java.util.Comparator; 4 | 5 | /** 6 | * Simple {@link Comparator} implementation that can be used 7 | * when items to compare have "natural" sorting order that 8 | * can be used via {@link Comparable} interface. 
/**
 * Helper class used instead of a standard JDK list or buffer,
 * to avoid constant re-allocations.
 * <p>
 * Content is collected as a linked list of completely filled
 * <code>Object[]</code> chunks; the caller fills one chunk at a time and
 * hands each full chunk back via {@link #appendCompletedChunk}. The last
 * chunk appended is retained after a reset so that it can be handed out
 * again by {@link #resetAndStart}.
 */
public class SegmentedBuffer
{
    // // // Config constants

    /**
     * Let's start with relatively small chunks
     */
    final static int INITIAL_CHUNK_SIZE = 1024;

    /**
     * Also: let's expand by doubling up until 16k entry chunks (which is 64k
     * in size for 32-bit machines)
     */
    final static int MAX_CHUNK_SIZE = (1 << 14);

    /**
     * Upper bound used when growing chunk sizes, so that the size
     * computation can never wrap around to a negative array length.
     */
    private final static int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;

    // // // Data storage

    private Node _bufferHead;

    private Node _bufferTail;

    /**
     * Number of total buffered entries in this buffer, counting all instances
     * within linked list formed by following {@link #_bufferHead}.
     */
    private int _bufferedEntryCount;

    // // // Simple reuse

    /**
     * Reusable Object array, stored here after buffer has been released having
     * been used previously.
     */
    private Object[] _freeBuffer;

    /*
    /**********************************************************
    /* Construction
    /**********************************************************
     */

    public SegmentedBuffer() { }

    /*
    /**********************************************************
    /* Public API
    /**********************************************************
     */

    /**
     * Method called to start buffering process. Will ensure that the buffer
     * is empty, and then return an object array to start chunking content on.
     *
     * @return The retained reusable array if there is one, otherwise a
     *   newly allocated array of {@link #INITIAL_CHUNK_SIZE} entries
     */
    public Object[] resetAndStart()
    {
        // _reset() is a no-op when nothing has been appended
        _reset();
        if (_freeBuffer == null) {
            return new Object[INITIAL_CHUNK_SIZE];
        }
        return _freeBuffer;
    }

    /**
     * Method called to add a full Object array as a chunk buffered within
     * this buffer, and to obtain a new array to fill. Caller is not to use
     * the array it gives; but to use the returned array for continued
     * buffering.
     *
     * @param fullChunk Completed chunk that the caller is requesting
     *   to append to this buffer. It is generally chunk that was
     *   returned by an earlier call to {@link #resetAndStart} or
     *   {@link #appendCompletedChunk} (although this is not required or
     *   enforced)
     *
     * @return New chunk buffer for caller to fill; never zero-length
     */
    public Object[] appendCompletedChunk(Object[] fullChunk)
    {
        Node next = new Node(fullChunk);
        if (_bufferHead == null) { // first chunk
            _bufferHead = _bufferTail = next;
        } else { // have something already
            _bufferTail.linkNext(next);
            _bufferTail = next;
        }
        int len = fullChunk.length;
        _bufferedEntryCount += len;
        return new Object[_nextChunkSize(len)];
    }

    /**
     * Method called to indicate that the buffering process is now
     * complete; and to construct a combined exactly-sized result
     * array. Additionally the buffer itself will be reset to
     * reduce memory retention.
     * <p>
     * Resulting array will be of generic <code>Object[]</code> type.
     *
     * @param lastChunk Partially filled chunk the caller was working on
     * @param lastChunkEntries Number of valid entries at the start of
     *   <code>lastChunk</code>
     *
     * @return Newly allocated array holding all buffered entries followed
     *   by the first <code>lastChunkEntries</code> entries of
     *   <code>lastChunk</code>
     */
    public Object[] completeAndClearBuffer(Object[] lastChunk, int lastChunkEntries)
    {
        int totalSize = lastChunkEntries + _bufferedEntryCount;
        Object[] result = new Object[totalSize];
        _copyTo(result, totalSize, lastChunk, lastChunkEntries);
        _reset();
        // [Issue-5]: if the caller was filling the array this buffer keeps
        // for reuse, and it is still the retained one, drop the copied
        // references so the buffer does not pin them in memory.
        if (lastChunk == _freeBuffer) {
            Arrays.fill(lastChunk, 0, lastChunkEntries, null);
        }
        return result;
    }

    /**
     * Helper method that can be used to check how much free capacity
     * will this instance start with. Can be used to choose the best
     * instance to reuse, based on size of reusable object chunk
     * buffer holds reference to.
     *
     * @return Length of the retained reusable array, or 0 if there is none
     */
    public int initialCapacity()
    {
        return (_freeBuffer == null) ? 0 : _freeBuffer.length;
    }

    /**
     * Method that can be used to check how many Objects have been buffered
     * within this buffer.
     */
    public int bufferedSize() { return _bufferedEntryCount; }

    /*
    /**********************************************************************
    /* Internal methods
    /**********************************************************************
     */

    /**
     * Computes the size of the chunk to hand out after a chunk of
     * <code>len</code> entries has been completed: doubling for small
     * chunks, +25% for larger ones (to limit overhead).
     */
    private static int _nextChunkSize(int len)
    {
        if (len <= 0) {
            // growing an empty chunk would yield another empty chunk,
            // and the caller could never make progress
            return INITIAL_CHUNK_SIZE;
        }
        if (len < MAX_CHUNK_SIZE) {
            return len + len;
        }
        // compute in long so that huge chunks can not overflow int
        long grown = (long) len + (len >> 2);
        return (int) Math.min(grown, (long) MAX_ARRAY_SIZE);
    }

    /**
     * Discards all appended chunks, keeping the last (and thereby
     * biggest) one, cleared of contents, for reuse next time.
     */
    private void _reset()
    {
        if (_bufferHead == null) { // nothing appended, nothing to discard
            return;
        }
        Object[] last = _bufferTail.getData();
        // an empty array is useless as a starting chunk; keep what we had
        if (last.length > 0) {
            // let's clear it of contents as well, to avoid retaining entries
            Arrays.fill(last, null);
            _freeBuffer = last;
        }
        // either way, must discard current contents
        _bufferHead = _bufferTail = null;
        _bufferedEntryCount = 0;
    }

    /**
     * Copies all appended chunks, in order, followed by the valid part of
     * the last chunk, into <code>resultArray</code>.
     *
     * @throws IllegalStateException if the number of entries copied does
     *   not match <code>totalSize</code>
     */
    private void _copyTo(Object[] resultArray, int totalSize,
            Object[] lastChunk, int lastChunkEntries)
    {
        int ptr = 0;

        for (Node n = _bufferHead; n != null; n = n.next()) {
            Object[] curr = n.getData();
            int len = curr.length;
            System.arraycopy(curr, 0, resultArray, ptr, len);
            ptr += len;
        }
        System.arraycopy(lastChunk, 0, resultArray, ptr, lastChunkEntries);
        ptr += lastChunkEntries;

        // sanity check (could have failed earlier due to out-of-bounds, too)
        if (ptr != totalSize) {
            throw new IllegalStateException("Should have gotten "+totalSize+" entries, got "+ptr);
        }
    }

    /*
    /**********************************************************************
    /* Helper classes
    /**********************************************************************
     */

    /**
     * Helper class used to store actual data, in a linked list.
     */
    private final static class Node
    {
        /**
         * Data stored in this node. Array is considered to be full.
         */
        private final Object[] _data;

        private Node _next;

        public Node(Object[] data) {
            _data = data;
        }

        public Object[] getData() { return _data; }

        public Node next() { return _next; }

        public void linkNext(Node next)
        {
            if (_next != null) { // sanity check
                throw new IllegalStateException();
            }
            _next = next;
        }
    }
}
* Also just in case our calculations are wrong, require 10 megs for 36 | * pre-sort anyway (if invoked from CLI) 37 | */ 38 | public final static long MIN_HEAP_FOR_PRESORT = 10L * 1024 * 1024; 39 | 40 | private String parse(String filepath) { 41 | 42 | File in = new File(filepath); 43 | File out = new File(in.getParentFile(), "out.data"); 44 | 45 | try (BufferedReader ir = Files.newReader(in, Charsets.UTF_8); 46 | BufferedWriter ow = Files.newWriter(out, Charsets.UTF_8);) { 47 | String line = null; 48 | while (null != (line = ir.readLine())) { 49 | String[] seg = line.split(","); 50 | StringBuilder bui = new StringBuilder(); 51 | for (int i = 6; i < seg.length; ++i) { 52 | bui.append(seg[i]); 53 | } 54 | bui.append("\n"); 55 | ow.write(bui.toString()); 56 | } 57 | } catch (FileNotFoundException e) { 58 | e.printStackTrace(); 59 | } catch (IOException e) { 60 | e.printStackTrace(); 61 | } 62 | 63 | return out.getAbsolutePath(); 64 | } 65 | 66 | private String reverse(String raw) { 67 | StringBuilder bui = new StringBuilder(); 68 | for (int i = raw.length() - 1; i >= 0; --i) 69 | bui.append(raw.charAt(i)); 70 | return bui.toString(); 71 | } 72 | 73 | public void sortFile(File in, File out, Comparator cmp) { 74 | try { 75 | long availMem = Runtime.getRuntime().maxMemory() 76 | - (2048 * 1024 * 1024); 77 | long maxMem = (availMem >> 1); 78 | if (maxMem > MAX_HEAP_FOR_PRESORT) { 79 | maxMem = MAX_HEAP_FOR_PRESORT; 80 | } else if (maxMem < MIN_HEAP_FOR_PRESORT) { 81 | maxMem = MIN_HEAP_FOR_PRESORT; 82 | } 83 | final TextFileSorter sorter = new TextFileSorter( 84 | new SortConfig().withMaxMemoryUsage(maxMem)); 85 | sorter.sort(new FileInputStream(in), new PrintStream(out)); 86 | } catch (IOException e) { 87 | e.printStackTrace(); 88 | } 89 | 90 | } 91 | 92 | public String genLeft(String rawTextFile, int maxLen, int memSize) { 93 | 94 | File rawFile = new File(rawTextFile); 95 | 96 | File dir = rawFile.getParentFile(); 97 | 98 | File ngramFile = new File(dir, 
"ngram_left.data"); 99 | File ngramSort = new File(dir, "sort_ngram_left.data"); 100 | File ngramfreq = new File(dir, "freq_ngram_left.data"); 101 | File ngramFreqSort = new File(dir, "freq_ngram_left_sort.data"); 102 | 103 | try (BufferedReader breader = Files.newReader(rawFile, Charsets.UTF_8); 104 | BufferedWriter writer = Files.newWriter(ngramFile, 105 | Charsets.UTF_8); 106 | BufferedWriter freqWriter = Files.newWriter(ngramfreq, 107 | Charsets.UTF_8);) { 108 | String line = null; 109 | while (null != (line = breader.readLine())) { 110 | line = line.replaceAll("\\p{Punct}", " ") 111 | .replaceAll("\\pP", " ").replaceAll(" ", " ") 112 | .replaceAll("\\p{Blank}", " ") 113 | .replaceAll("\\p{Space}", " ") 114 | .replaceAll("\\p{Cntrl}", " ") 115 | .replaceAll("[的很了么呢是嘛]", " "); 116 | for (String sen : Splitter.on(" ").omitEmptyStrings() 117 | .splitToList(line)) { 118 | sen = reverse(sen.trim()); 119 | sen = "$" + sen + "$"; 120 | System.out.println(sen); 121 | System.out.println(sen.length()); 122 | for (int i = 0; i < sen.length(); ++i) { 123 | for (int j = i + 1; j < i + maxLen + 1 124 | && j <= sen.length(); ++j) { 125 | String w = sen.substring(i, j); 126 | writer.write(w + "\n"); 127 | } 128 | } 129 | } 130 | } 131 | sortFile(ngramFile, ngramSort, new Comparator() { 132 | 133 | @Override 134 | public int compare(String o1, String o2) { 135 | return o1.compareTo(o2); 136 | } 137 | }); 138 | 139 | 140 | try(BufferedReader nsr = Files.newReader(ngramSort, Charsets.UTF_8)) { 141 | 142 | String ngram = null; 143 | String curr = null; 144 | List sameWord = Lists.newLinkedList(); 145 | boolean pause = false; 146 | while (pause || null != (curr = nsr.readLine())) { 147 | if (null == ngram) { 148 | sameWord.add(curr); 149 | ngram = curr; 150 | } else { 151 | if (curr.startsWith(ngram)) { 152 | sameWord.add(curr); 153 | pause = false; 154 | } else { 155 | if (sameWord.isEmpty()) { 156 | pause = false; 157 | sameWord.add(curr); 158 | ngram = curr; 159 | continue; 160 
| } 161 | CounterMap right = new CounterMap(); 162 | int freq = 0; 163 | for (String w : sameWord) { 164 | if (!w.startsWith(ngram)) { 165 | break; 166 | } 167 | if (w.equals(ngram)) { 168 | continue; 169 | } 170 | ++freq; 171 | right.incr(w.substring(ngram.length())); 172 | } 173 | double re = 0.0; 174 | for (String t : right.countAll().keySet()) { 175 | double p = right.get(t) * 1.0 / freq; 176 | re += -1 * p * Math.log(p); 177 | } 178 | freqWriter.write(reverse(ngram) + "\t" + re + "\n"); 179 | List newlist = Lists.newLinkedList(); 180 | for (String w : sameWord) { 181 | if (!w.equals(ngram)) { 182 | newlist.add(w); 183 | } 184 | } 185 | sameWord = newlist; 186 | if (sameWord.isEmpty()) { 187 | pause = false; 188 | sameWord.add(curr); 189 | ngram = curr; 190 | continue; 191 | } 192 | ngram = sameWord.get(0); 193 | if (curr.startsWith(ngram)) { 194 | sameWord.add(curr); 195 | pause = false; 196 | } else { 197 | pause = true; 198 | } 199 | } 200 | } 201 | } 202 | } 203 | sortFile(ngramfreq, ngramFreqSort, new Comparator() { 204 | 205 | @Override 206 | public int compare(String o1, String o2) { 207 | return o1.compareTo(o2); 208 | } 209 | }); 210 | 211 | } catch (FileNotFoundException e) { 212 | e.printStackTrace(); 213 | } catch (IOException e) { 214 | e.printStackTrace(); 215 | } 216 | 217 | return ngramFreqSort.getAbsolutePath(); 218 | } 219 | 220 | public String genFreqRight(String rawTextFile, int maxLen, int memSize) { 221 | 222 | File rawFile = new File(rawTextFile); 223 | 224 | File dir = rawFile.getParentFile(); 225 | 226 | File ngramFile = new File(dir, "ngram.data"); 227 | File ngramSort = new File(dir, "ngram_sort.data"); 228 | File ngramfreq = new File(dir, "freq_ngram.data"); 229 | File ngramfreqSort = new File(dir, "freq_ngram_sort.data"); 230 | 231 | try (BufferedReader breader = Files.newReader(rawFile, Charsets.UTF_8); 232 | BufferedWriter writer = Files.newWriter(ngramFile, 233 | Charsets.UTF_8); 234 | BufferedWriter freqWriter = 
Files.newWriter(ngramfreq, 235 | Charsets.UTF_8);) { 236 | String line = null; 237 | while (null != (line = breader.readLine())) { 238 | line = line.replaceAll("\\p{Punct}", " ") 239 | .replaceAll("\\pP", " ").replaceAll(" ", " ") 240 | .replaceAll("\\p{Blank}", " ") 241 | .replaceAll("\\p{Space}", " ") 242 | .replaceAll("\\p{Cntrl}", " ") 243 | .replaceAll("[的很了么呢是嘛]", " "); 244 | for (String sen : Splitter.on(" ").omitEmptyStrings() 245 | .splitToList(line)) { 246 | sen = sen.trim(); 247 | sen = "$" + sen + "$"; 248 | System.out.println(sen); 249 | System.out.println(sen.length()); 250 | for (int i = 0; i < sen.length(); ++i) { 251 | for (int j = i + 1; j < i + maxLen + 1 && j <= sen.length(); ++j) { 252 | String w = sen.substring(i, j); 253 | writer.write(w + "\n"); 254 | } 255 | } 256 | } 257 | } 258 | System.out.println("gen sorting..."); 259 | sortFile(ngramFile, ngramSort, new Comparator() { 260 | 261 | @Override 262 | public int compare(String o1, String o2) { 263 | return o1.compareTo(o2); 264 | } 265 | }); 266 | 267 | 268 | try(BufferedReader nsr = Files.newReader(ngramSort, Charsets.UTF_8)) { 269 | 270 | String ngram = null; 271 | String curr = null; 272 | List sameWord = Lists.newLinkedList(); 273 | boolean pause = false; 274 | while (pause || null != (curr = nsr.readLine())) { 275 | if (null == ngram) { 276 | sameWord.add(curr); 277 | ngram = curr; 278 | } else { 279 | if (curr.startsWith(ngram)) { 280 | sameWord.add(curr); 281 | } else { 282 | if (sameWord.isEmpty()) { 283 | pause = false; 284 | sameWord.add(curr); 285 | ngram = curr; 286 | continue; 287 | } 288 | CounterMap right = new CounterMap(); 289 | int freq = 0; 290 | for (String w : sameWord) { 291 | if (!w.startsWith(ngram)) { 292 | break; 293 | } 294 | if (w.equals(ngram)) { 295 | continue; 296 | } 297 | ++freq; 298 | right.incr(w.substring(ngram.length())); 299 | } 300 | double re = 0.0; 301 | for (String t : right.countAll().keySet()) { 302 | double p = right.get(t) * 1.0 / freq; 303 | re 
+= -1 * p * Math.log(p); 304 | } 305 | freqWriter.write(ngram + "\t" + freq + "\t" + re + "\n"); 306 | List newlist = Lists.newLinkedList(); 307 | for (String w : sameWord) { 308 | if (!w.equals(ngram)) { 309 | newlist.add(w); 310 | } 311 | } 312 | sameWord = newlist; 313 | if (sameWord.isEmpty()) { 314 | pause = false; 315 | sameWord.add(curr); 316 | ngram = curr; 317 | continue; 318 | } 319 | ngram = sameWord.get(0); 320 | if (curr.startsWith(ngram)) { 321 | sameWord.add(curr); 322 | } else { 323 | pause = true; 324 | } 325 | } 326 | } 327 | } 328 | } 329 | 330 | sortFile(ngramfreq, ngramfreqSort, new Comparator() { 331 | 332 | @Override 333 | public int compare(String o1, String o2) { 334 | return o1.compareTo(o2); 335 | } 336 | }); 337 | } catch (FileNotFoundException e) { 338 | e.printStackTrace(); 339 | } catch (IOException e) { 340 | e.printStackTrace(); 341 | } 342 | 343 | return ngramfreqSort.getAbsolutePath(); 344 | } 345 | 346 | public String mergeEntropy(String freqRight, String left) { 347 | 348 | // Sorter sorter = new TextFileSorter( 349 | // new SortConfig().withMaxMemoryUsage(1024 * 1000 * 1000)); 350 | 351 | File frFile = new File(freqRight); 352 | File lFile = new File(left); 353 | File mergeTmp = new File(frFile.getParentFile(), "merge.tmp"); 354 | File mergeTmp2 = new File(frFile.getParentFile(), "merge.tmp2"); 355 | File mergeFile = new File(frFile.getParentFile(), "merge_entropy.data"); 356 | 357 | try (BufferedReader rr = Files.newReader(frFile, Charsets.UTF_8); 358 | BufferedReader lr = Files.newReader(lFile, Charsets.UTF_8); 359 | BufferedWriter mw = Files.newWriter(mergeTmp, Charsets.UTF_8); 360 | BufferedWriter mf = Files.newWriter(mergeFile, Charsets.UTF_8);) { 361 | String line = null; 362 | while (null != (line = rr.readLine())) { 363 | mw.write(line + "\n"); 364 | } 365 | line = null; 366 | while (null != (line = lr.readLine())) { 367 | mw.write(line + "\n"); 368 | } 369 | 370 | // sorter.sort(new FileInputStream(mergeTmp), new 
FileOutputStream( 371 | // mergeTmp2)); 372 | sortFile(mergeTmp, mergeTmp2, new Comparator() { 373 | 374 | @Override 375 | public int compare(String o1, String o2) { 376 | return o1.compareTo(o2); 377 | } 378 | }); 379 | 380 | BufferedReader br = Files.newReader(mergeTmp2, Charsets.UTF_8); 381 | 382 | String line1 = null; 383 | String line2 = null; 384 | line1 = br.readLine(); 385 | line2 = br.readLine(); 386 | while (true) { 387 | 388 | if (null == line1 || null == line2) 389 | break; 390 | String[] seg1 = line1.split("\t"); 391 | String[] seg2 = line2.split("\t"); 392 | if (!seg1[0].equals(seg2[0])) { 393 | line1 = new String(line2.getBytes()); 394 | line2 = br.readLine(); 395 | continue; 396 | } 397 | if (seg1.length < 2) { 398 | line1 = new String(line2.getBytes()); 399 | line2 = br.readLine(); 400 | continue; 401 | } 402 | double le = seg1.length == 2 ? Double.parseDouble(seg1[1]) 403 | : Double.parseDouble(seg2[1]); 404 | double re = seg1.length == 3 ? Double.parseDouble(seg1[2]) 405 | : Double.parseDouble(seg2[2]); 406 | int freq = seg1.length == 3 ? 
Integer.parseInt(seg1[1]) 407 | : Integer.parseInt(seg2[1]); 408 | double e = Math.min(le, re); 409 | mf.write(seg1[0] + "\t" + freq + "\t" + e + "\n"); 410 | 411 | line1 = br.readLine(); 412 | line2 = br.readLine(); 413 | } 414 | 415 | } catch (FileNotFoundException e) { 416 | e.printStackTrace(); 417 | } catch (IOException e) { 418 | e.printStackTrace(); 419 | } 420 | 421 | return mergeFile.toString(); 422 | } 423 | 424 | public void extractWords(String freqFile, String entropyFile) { 425 | 426 | TreeMap freq = new TreeMap<>(); 427 | 428 | File ffile = new File(freqFile); 429 | File efile = new File(entropyFile); 430 | File wfile = new File(efile.getParentFile(), "words.data"); 431 | 432 | try (BufferedReader fr = Files.newReader(ffile, Charsets.UTF_8); 433 | BufferedReader er = Files.newReader(efile, Charsets.UTF_8); 434 | BufferedWriter ww = Files.newWriter(wfile, Charsets.UTF_8);) { 435 | 436 | String line = null; 437 | while (null != (line = fr.readLine())) { 438 | String[] seg = line.split("\t"); 439 | if (seg.length < 3) continue; 440 | freq.put(seg[0], Integer.parseInt(seg[1])); 441 | } 442 | line = null; 443 | while (null != (line = er.readLine())) { 444 | String[] seg = line.split("\t"); 445 | if (3 != seg.length) 446 | continue; 447 | String w = seg[0]; 448 | int f = Integer.parseInt(seg[1]); 449 | double e = Double.parseDouble(seg[2]); 450 | long max = -1; 451 | for (int s = 1; s < w.length(); ++s) { 452 | String lw = w.substring(0, s); 453 | String rw = w.substring(s); 454 | if (!freq.containsKey(lw) || !freq.containsKey(rw)) 455 | continue; 456 | long ff = freq.get(lw) * freq.get(rw); 457 | if (ff > max) 458 | max = ff; 459 | } 460 | double pf = f * 2000000.0 / max; 461 | if (pf < 10 || e < 2) 462 | continue; 463 | ww.write(w + "\t" + pf + "\t" + e + "\n"); 464 | } 465 | } catch (FileNotFoundException e) { 466 | // TODO Auto-generated catch block 467 | e.printStackTrace(); 468 | } catch (IOException e) { 469 | // TODO Auto-generated catch block 470 | 
/**
 * Simple string-keyed counter: maps each key to an integer count that
 * can be incremented and read back. Keys that were never incremented
 * read as 0.
 *
 * @author Jennifer
 */
public class CounterMap implements Serializable {

    private static final long serialVersionUID = -3903452740943758085L;

    /** Backing store; exposed as-is (not copied) by {@link #countAll()}. */
    private final Map<String, Integer> count;

    /**
     * Creates an empty counter with the default initial capacity.
     */
    public CounterMap() {
        count = new ConcurrentHashMap<String, Integer>();
    }

    /**
     * Creates an empty counter.
     *
     * @param capacitySize initial capacity passed to the backing map
     */
    public CounterMap(int capacitySize) {
        count = new ConcurrentHashMap<String, Integer>(capacitySize);
    }

    /**
     * Adds 1 to the count of <code>key</code>, starting from 0 if the key
     * has not been seen.
     */
    public void incr(String key) {
        incrby(key, 1);
    }

    /**
     * Adds <code>delta</code> to the count of <code>key</code>, starting
     * from 0 if the key has not been seen.
     * <p>
     * This is a read followed by a write, not a single atomic update.
     */
    public void incrby(String key, int delta) {
        Integer current = count.get(key);
        count.put(key, (current == null) ? delta : current + delta);
    }

    /**
     * @return current count of <code>key</code>, or 0 if it has none
     */
    public int get(String key) {
        Integer value = count.get(key);
        return (value == null) ? 0 : value;
    }

    /**
     * @return the live backing map of all keys and their counts
     */
    public Map<String, Integer> countAll() {
        return count;
    }
}
@param a char 47 | * @return boolean 48 | */ 49 | public static boolean isChinese(char a) { 50 | int v = (int)a; 51 | return (v >=19968 && v <= 40869); // [0x4e00, 0x29fa5] 52 | } 53 | 54 | public static boolean allChs(String s){ 55 | if (null == s || "".equals(s.trim())) return false; 56 | for (int i = 0; i < s.length(); i++) { 57 | if (!isChinese(s.charAt(i))) return false; 58 | } 59 | return true; 60 | } 61 | 62 | public TreeMap loadPosprop() { 63 | 64 | TreeMap prop = Maps.newTreeMap(); 65 | try { 66 | System.out.println(FastBuilder.class.getResourceAsStream("/pos_prop.txt")); 67 | BufferedReader br = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream("/pos_prop.txt"),"UTF-8")); 68 | String l = null; 69 | while (null != (l = br.readLine())) { 70 | String[] seg = l.split("\t"); 71 | prop.put(seg[0], new double[]{Double.parseDouble(seg[1]), Double.parseDouble(seg[2]), Double.parseDouble(seg[3])}); 72 | } 73 | } catch (IOException e) { 74 | e.printStackTrace(); 75 | } 76 | return prop; 77 | } 78 | 79 | public String parse(String filepath) { 80 | 81 | File in = new File(filepath); 82 | File out = new File(in.getParentFile(), "out.data"); 83 | 84 | try (BufferedReader ir = Files.newReader(in, Charsets.UTF_8); 85 | BufferedWriter ow = Files.newWriter(out, Charsets.UTF_8);) { 86 | String line = null; 87 | while (null != (line = ir.readLine())) { 88 | String[] seg = line.split(","); 89 | StringBuilder bui = new StringBuilder(); 90 | for (int i = 6; i < seg.length; ++i) { 91 | bui.append(seg[i]); 92 | } 93 | bui.append("\n"); 94 | ow.write(bui.toString()); 95 | } 96 | } catch (FileNotFoundException e) { 97 | e.printStackTrace(); 98 | } catch (IOException e) { 99 | e.printStackTrace(); 100 | } 101 | 102 | return out.getAbsolutePath(); 103 | } 104 | 105 | private String reverse(String raw) { 106 | StringBuilder bui = new StringBuilder(); 107 | for (int i = raw.length() - 1; i >= 0; --i) 108 | bui.append(raw.charAt(i)); 109 | return 
bui.toString(); 110 | } 111 | 112 | public void sortFile(File in, File out) { 113 | try { 114 | long availMem = Runtime.getRuntime().maxMemory() 115 | - (40 * 1024 * 1024); 116 | long maxMem = (availMem >> 1); 117 | if (maxMem > MAX_HEAP_FOR_PRESORT) { 118 | maxMem = MAX_HEAP_FOR_PRESORT; 119 | } else if (maxMem < MIN_HEAP_FOR_PRESORT) { 120 | maxMem = MIN_HEAP_FOR_PRESORT; 121 | } 122 | final TextFileSorter sorter = new TextFileSorter( 123 | new SortConfig().withMaxMemoryUsage(maxMem)); 124 | sorter.sort(new FileInputStream(in), new PrintStream(out)); 125 | } catch (IOException e) { 126 | e.printStackTrace(); 127 | } 128 | } 129 | 130 | public String genLeft(String rawTextFile, int maxLen, int memSize) { 131 | 132 | File rawFile = new File(rawTextFile); 133 | 134 | File dir = rawFile.getParentFile(); 135 | 136 | File ngramFile = new File(dir, "ngram_left.data"); 137 | File ngramSort = new File(dir, "sort_ngram_left.data"); 138 | File ngramfreq = new File(dir, "freq_ngram_left.data"); 139 | File ngramFreqSort = new File(dir, "freq_ngram_left_sort.data"); 140 | 141 | try (BufferedReader breader = Files.newReader(rawFile, Charsets.UTF_8); 142 | BufferedWriter writer = Files.newWriter(ngramFile, 143 | Charsets.UTF_8); 144 | BufferedWriter freqWriter = Files.newWriter(ngramfreq, 145 | Charsets.UTF_8);) { 146 | String line = null; 147 | while (null != (line = breader.readLine())) { 148 | line = line.replaceAll("[" + stopwords + "]", " ") 149 | .replaceAll("\\p{Punct}", " ") 150 | .replaceAll("\\pP", " ") 151 | .replaceAll(" ", " ") 152 | .replaceAll("\\p{Blank}", " ") 153 | .replaceAll("\\p{Space}", " ") 154 | .replaceAll("\\p{Cntrl}", " "); 155 | for (String sen : Splitter.on(" ").omitEmptyStrings() 156 | .splitToList(line)) { 157 | sen = reverse(sen.trim()); 158 | if (!allChs(sen)) continue; 159 | sen = "$" + sen + "$"; 160 | for (int i = 1; i < sen.length() - 1; ++i) { 161 | writer.write(sen.substring(i, Math.min(maxLen + i, sen.length())) + "\n"); 162 | } 163 | } 
164 | } 165 | writer.close(); 166 | sortFile(ngramFile, ngramSort); 167 | 168 | try(BufferedReader nsr = Files.newReader(ngramSort, Charsets.UTF_8)) { 169 | String first = null; 170 | String curr = null; 171 | Map stat = Maps.newHashMap(); 172 | while (null != (curr = nsr.readLine())) { 173 | if (null == first) { 174 | for (int i = 1; i < curr.length(); ++i) { 175 | String w = curr.substring(0, i); 176 | String suffix = curr.substring(i).substring(0, 1); 177 | if (stat.containsKey(w)) { 178 | stat.get(w).incr(suffix); 179 | } else { 180 | CounterMap cm = new CounterMap(); 181 | cm.incr(suffix); 182 | stat.put(w, cm); 183 | } 184 | } 185 | first = curr.substring(0, 1); 186 | } else { 187 | if (!curr.startsWith(first)) { 188 | 189 | StringBuilder builder = new StringBuilder(); 190 | for (String w : stat.keySet()) { 191 | CounterMap cm = stat.get(w); 192 | int freq = 0; 193 | double re = 0; 194 | for (String k : cm.countAll().keySet()) { 195 | freq += cm.get(k); 196 | } 197 | for (String k : cm.countAll().keySet()) { 198 | double p = cm.get(k) * 1.0 / freq; 199 | re += -1 * Math.log(p) / Math.log(2) * p; 200 | } 201 | builder.append(reverse(w)).append("\t").append(re).append("\n"); 202 | } 203 | freqWriter.write(builder.toString()); 204 | stat.clear(); 205 | first = curr.substring(0, 1); 206 | } 207 | for (int i = 1; i < curr.length(); ++i) { 208 | String w = curr.substring(0, i); 209 | String suffix = curr.substring(i).substring(0, 1); 210 | if (stat.containsKey(w)) { 211 | stat.get(w).incr(suffix); 212 | } else { 213 | CounterMap cm = new CounterMap(); 214 | cm.incr(suffix); 215 | stat.put(w, cm); 216 | } 217 | } 218 | } 219 | } 220 | StringBuilder builder = new StringBuilder(); 221 | for (String w : stat.keySet()) { 222 | CounterMap cm = stat.get(w); 223 | int freq = 0; 224 | double re = 0; 225 | for (String k : cm.countAll().keySet()) { 226 | freq += cm.get(k); 227 | } 228 | for (String k : cm.countAll().keySet()) { 229 | double p = cm.get(k) * 1.0 / freq; 230 | 
re += -1 * Math.log(p) / Math.log(2) * p; 231 | } 232 | builder.append(reverse(w)).append("\t").append(re).append("\n"); 233 | } 234 | freqWriter.write(builder.toString()); 235 | stat.clear(); 236 | 237 | freqWriter.close(); 238 | } 239 | 240 | sortFile(ngramfreq, ngramFreqSort); 241 | 242 | } catch (FileNotFoundException e) { 243 | e.printStackTrace(); 244 | } catch (IOException e) { 245 | e.printStackTrace(); 246 | } 247 | 248 | return ngramFreqSort.getAbsolutePath(); 249 | } 250 | 251 | public String genFreqRight(String rawTextFile, int maxLen, int memSize) { 252 | 253 | File rawFile = new File(rawTextFile); 254 | 255 | File dir = rawFile.getParentFile(); 256 | 257 | File ngramFile = new File(dir, "ngram.data"); 258 | File ngramSort = new File(dir, "ngram_sort.data"); 259 | File ngramfreq = new File(dir, "freq_ngram.data"); 260 | File ngramfreqSort = new File(dir, "freq_ngram_sort.data"); 261 | 262 | try (BufferedReader breader = Files.newReader(rawFile, Charsets.UTF_8); 263 | BufferedWriter writer = Files.newWriter(ngramFile, 264 | Charsets.UTF_8); 265 | BufferedWriter freqWriter = Files.newWriter(ngramfreq, 266 | Charsets.UTF_8);) { 267 | String line = null; 268 | while (null != (line = breader.readLine())) { 269 | line = line.replaceAll("["+stopwords+"]", " ") 270 | .replaceAll("\\p{Punct}", " ") 271 | .replaceAll("\\pP", " ") 272 | .replaceAll(" ", " ") 273 | .replaceAll("\\p{Blank}", " ") 274 | .replaceAll("\\p{Space}", " ") 275 | .replaceAll("\\p{Cntrl}", " "); 276 | for (String sen : Splitter.on(" ").omitEmptyStrings() 277 | .splitToList(line)) { 278 | sen = sen.trim(); 279 | if (!allChs(sen)) continue; 280 | sen = "$" + sen + "$"; 281 | for (int i = 1; i < sen.length() - 1; ++i) { 282 | writer.write(sen.substring(i, Math.min(maxLen + i, sen.length())) + "\n"); 283 | } 284 | } 285 | } 286 | writer.close(); 287 | System.out.println("gen sorting..."); 288 | sortFile(ngramFile, ngramSort); 289 | 290 | try(BufferedReader nsr = Files.newReader(ngramSort, 
Charsets.UTF_8)) { 291 | String first = null; 292 | String curr = null; 293 | Map stat = Maps.newHashMap(); 294 | while (null != (curr = nsr.readLine())) { 295 | if (null == first) { 296 | for (int i = 1; i < curr.length(); ++i) { 297 | String w = curr.substring(0, i); 298 | String suffix = curr.substring(i).substring(0, 1); 299 | if (stat.containsKey(w)) { 300 | stat.get(w).incr(suffix); 301 | } else { 302 | CounterMap cm = new CounterMap(); 303 | cm.incr(suffix); 304 | stat.put(w, cm); 305 | } 306 | } 307 | first = curr.substring(0, 1); 308 | } else { 309 | if (!curr.startsWith(first)) { 310 | 311 | StringBuilder builder = new StringBuilder(); 312 | for (String w : stat.keySet()) { 313 | CounterMap cm = stat.get(w); 314 | int freq = 0; 315 | double re = 0; 316 | for (String k : cm.countAll().keySet()) { 317 | freq += cm.get(k); 318 | } 319 | for (String k : cm.countAll().keySet()) { 320 | double p = cm.get(k) * 1.0 / freq; 321 | re += -1 * Math.log(p) / Math.log(2) * p; 322 | } 323 | builder.append(w).append("\t").append(freq).append("\t").append(re).append("\n"); 324 | } 325 | freqWriter.write(builder.toString()); 326 | stat.clear(); 327 | first = curr.substring(0, 1); 328 | } 329 | for (int i = 1; i < curr.length(); ++i) { 330 | String w = curr.substring(0, i); 331 | String suffix = curr.substring(i).substring(0, 1); 332 | if (stat.containsKey(w)) { 333 | stat.get(w).incr(suffix); 334 | } else { 335 | CounterMap cm = new CounterMap(); 336 | cm.incr(suffix); 337 | stat.put(w, cm); 338 | } 339 | } 340 | } 341 | } 342 | StringBuilder builder = new StringBuilder(); 343 | for (String w : stat.keySet()) { 344 | CounterMap cm = stat.get(w); 345 | int freq = 0; 346 | double re = 0; 347 | for (String k : cm.countAll().keySet()) { 348 | freq += cm.get(k); 349 | } 350 | for (String k : cm.countAll().keySet()) { 351 | double p = cm.get(k) * 1.0 / freq; 352 | re += -1 * Math.log(p) / Math.log(2) * p; 353 | } 354 | 
builder.append(w).append("\t").append(freq).append("\t").append(re).append("\n"); 355 | } 356 | freqWriter.write(builder.toString()); 357 | stat.clear(); 358 | freqWriter.close(); 359 | } 360 | 361 | sortFile(ngramfreq, ngramfreqSort); 362 | 363 | } catch (FileNotFoundException e) { 364 | e.printStackTrace(); 365 | } catch (IOException e) { 366 | e.printStackTrace(); 367 | } 368 | 369 | return ngramfreqSort.getAbsolutePath(); 370 | } 371 | 372 | public String mergeEntropy(String freqRight, String left) { 373 | 374 | 375 | File frFile = new File(freqRight); 376 | File lFile = new File(left); 377 | File mergeTmp = new File(frFile.getParentFile(), "merge.tmp"); 378 | File mergeTmp2 = new File(frFile.getParentFile(), "merge.tmp2"); 379 | File mergeFile = new File(frFile.getParentFile(), "merge_entropy.data"); 380 | 381 | try (BufferedReader rr = Files.newReader(frFile, Charsets.UTF_8); 382 | BufferedReader lr = Files.newReader(lFile, Charsets.UTF_8); 383 | BufferedWriter mw = Files.newWriter(mergeTmp, Charsets.UTF_8); 384 | BufferedWriter mf = Files.newWriter(mergeFile, Charsets.UTF_8);) { 385 | String line = null; 386 | while (null != (line = rr.readLine())) { 387 | mw.write(line + "\n"); 388 | } 389 | line = null; 390 | while (null != (line = lr.readLine())) { 391 | mw.write(line + "\n"); 392 | } 393 | mw.close(); 394 | 395 | sortFile(mergeTmp, mergeTmp2); 396 | 397 | BufferedReader br = Files.newReader(mergeTmp2, Charsets.UTF_8); 398 | 399 | String line1 = null; 400 | String line2 = null; 401 | line1 = br.readLine(); 402 | line2 = br.readLine(); 403 | while (true) { 404 | 405 | if (null == line1 || null == line2) 406 | break; 407 | String[] seg1 = line1.split("\t"); 408 | String[] seg2 = line2.split("\t"); 409 | if (!seg1[0].equals(seg2[0])) { 410 | line1 = new String(line2.getBytes()); 411 | line2 = br.readLine(); 412 | continue; 413 | } 414 | if (seg1.length < 2) { 415 | line1 = new String(line2.getBytes()); 416 | line2 = br.readLine(); 417 | continue; 418 | } 419 
| line1 = br.readLine(); 420 | line2 = br.readLine(); 421 | 422 | if (seg1.length < 3 && seg2.length < 3) 423 | continue; 424 | double le = seg1.length == 2 ? Double.parseDouble(seg1[1]) 425 | : Double.parseDouble(seg2[1]); 426 | double re = seg1.length == 3 ? Double.parseDouble(seg1[2]) 427 | : Double.parseDouble(seg2[2]); 428 | int freq = seg1.length == 3 ? Integer.parseInt(seg1[1]) 429 | : Integer.parseInt(seg2[1]); 430 | double e = Math.min(le, re); 431 | mf.write(seg1[0] + "\t" + freq + "\t" + e + "\n"); 432 | 433 | } 434 | mf.close(); 435 | 436 | } catch (FileNotFoundException e) { 437 | e.printStackTrace(); 438 | } catch (IOException e) { 439 | e.printStackTrace(); 440 | } 441 | 442 | return mergeFile.toString(); 443 | } 444 | 445 | public static boolean allLetterOrNumber(String w) { 446 | 447 | for (char c : w.toLowerCase().toCharArray()) { 448 | boolean letter = c >= 'a' && c <= 'z'; 449 | boolean digit = c >= '0' && c <= '9'; 450 | if (!letter && !digit) return false; 451 | } 452 | return true; 453 | } 454 | 455 | public void extractWords(String freqFile, String entropyFile) { 456 | 457 | LOG.info("start to extract words"); 458 | 459 | TreeMap posProp = this.loadPosprop(); 460 | 461 | RadixTree tree = new ConcurrentRadixTree(new DefaultCharArrayNodeFactory()); 462 | 463 | File ffile = new File(freqFile); 464 | File efile = new File(entropyFile); 465 | File wfile = new File(efile.getParentFile(), "words.data"); 466 | File wsfile = new File(efile.getParentFile(), "words_sort.data"); 467 | 468 | try (BufferedReader fr = Files.newReader(ffile, Charsets.UTF_8); 469 | BufferedReader er = Files.newReader(efile, Charsets.UTF_8); 470 | BufferedWriter ww = Files.newWriter(wfile, Charsets.UTF_8);) { 471 | 472 | String line = null; 473 | long total = 0; 474 | long epoch = 0; 475 | while (null != (line = fr.readLine())) { 476 | String[] seg = line.split("\t"); 477 | if (seg.length < 3) continue; 478 | tree.put(seg[0], Integer.parseInt(seg[1])); 479 | epoch += 1; 480 | 
//all single char's frequency 481 | if(seg[0].length()<2) total += Integer.parseInt(seg[1]); 482 | if (epoch % 1000 == 0) { 483 | LOG.info("load freq to radix tree done: " + total); 484 | } 485 | } 486 | LOG.info("build freq TST done!"); 487 | line = null; 488 | int cnt = 0; 489 | while (null != (line = er.readLine())) { 490 | cnt += 1; 491 | if (cnt % 1000 == 0) { 492 | LOG.info("extract words done: " + cnt); 493 | } 494 | String[] seg = line.split("\t"); 495 | if (3 != seg.length) 496 | continue; 497 | String w = seg[0]; 498 | if (allLetterOrNumber(w)) { 499 | continue; 500 | } 501 | int f = Integer.parseInt(seg[1]); 502 | double e = Double.parseDouble(seg[2]); 503 | long max = -1; 504 | for (int s = 1; s < w.length(); ++s) { 505 | String lw = w.substring(0, s); 506 | String rw = w.substring(s); 507 | Integer lfObj = tree.getValueForExactKey(lw); 508 | Integer rfObj = tree.getValueForExactKey(rw); 509 | long lf = -1; 510 | long rf = -1; 511 | if (null != lfObj) { 512 | lf = lfObj.intValue(); 513 | } 514 | if (null != rfObj) { 515 | rf = rfObj.intValue(); 516 | } 517 | if (-1 == lf || -1 == rf) continue; 518 | 519 | long ff = lf * rf; 520 | if (ff > max) 521 | max = ff; 522 | } 523 | double pf = f * total / max; 524 | double pmi = Math.log(pf) / Math.log(2); 525 | if (Double.isNaN(pmi)) continue; 526 | double pp = -1; 527 | if (null != posProp.get(w.subSequence(0, 1)) && null != posProp.get(w.subSequence(w.length() - 1, w.length()))) 528 | pp = Math.min(posProp.get(w.subSequence(0, 1))[0], posProp.get(w.subSequence(w.length() - 1, w.length()))[2]); 529 | if (pmi < 1 || e < 2 || pp < 0.1) 530 | continue; 531 | ww.write(w + "\t" + f + "\t" + pmi + "\t" + e + "\t" + pp + "\n"); 532 | 533 | } 534 | ww.close(); 535 | LOG.info("start to sort extracted words"); 536 | try { 537 | long availMem = Runtime.getRuntime().maxMemory() - (2048 * 1024 * 1024); 538 | long maxMem = (availMem >> 1); 539 | if (maxMem > MAX_HEAP_FOR_PRESORT) { 540 | maxMem = MAX_HEAP_FOR_PRESORT; 541 | 
} else if (maxMem < MIN_HEAP_FOR_PRESORT) { 542 | maxMem = MIN_HEAP_FOR_PRESORT; 543 | } 544 | final SplitFileSorter sorter = new SplitFileSorter(new SortConfig().withMaxMemoryUsage(maxMem)); 545 | sorter.sort(new FileInputStream(wfile), new PrintStream(wsfile)); 546 | } catch (IOException e) { 547 | e.printStackTrace(); 548 | } 549 | 550 | LOG.info("all done"); 551 | 552 | } catch (FileNotFoundException e) { 553 | e.printStackTrace(); 554 | } catch (IOException e) { 555 | e.printStackTrace(); 556 | } 557 | } 558 | } 559 | -------------------------------------------------------------------------------- /src/main/java/dict/build/LineReader.java: -------------------------------------------------------------------------------- 1 | package dict.build; 2 | 3 | import java.io.*; 4 | import java.util.Arrays; 5 | 6 | import com.fasterxml.sort.*; 7 | import com.google.common.base.Charsets; 8 | 9 | /** 10 | * Efficient reader for data that consists of text lines, i.e. character 11 | * data separated by one of standard line feeds (CR, LF or CR+LF). 12 | * For efficiency no decoding is done 13 | */ 14 | public class LineReader 15 | extends DataReader 16 | { 17 | 18 | protected final BufferedReader _br; 19 | 20 | 21 | public LineReader(InputStream in) 22 | { 23 | _br = new BufferedReader(new InputStreamReader(in, Charsets.UTF_8)); 24 | } 25 | 26 | /** 27 | * Convenience method for instantiating factory to create instances of 28 | * this {@link DataReader}. 29 | */ 30 | public static Factory factory() { 31 | return new Factory(); 32 | } 33 | 34 | @Override 35 | public void close() throws IOException 36 | { 37 | _br.close(); 38 | } 39 | 40 | @Override 41 | public int estimateSizeInBytes(String item) 42 | { 43 | // Wild guess: array objects take at least 8 bytes, probably 12 or 16. 44 | // And size of actual array storage rounded up to 4-byte alignment. 
So: 45 | 46 | int bytes = item.getBytes().length; 47 | bytes = ((bytes + 3) >> 2) << 2; 48 | return 16 + bytes; 49 | } 50 | 51 | @Override 52 | public String readNext() throws IOException 53 | { 54 | 55 | return _br.readLine(); 56 | } 57 | 58 | /* 59 | /********************************************************************** 60 | /* Helper classes 61 | /********************************************************************** 62 | */ 63 | 64 | public static class Factory 65 | extends DataReaderFactory 66 | { 67 | @Override 68 | public DataReader constructReader(InputStream in) { 69 | return new LineReader(in); 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/dict/build/LineWriter.java: -------------------------------------------------------------------------------- 1 | package dict.build; 2 | 3 | import java.io.*; 4 | 5 | import com.fasterxml.sort.*; 6 | import com.google.common.base.Charsets; 7 | 8 | public class LineWriter 9 | extends DataWriter 10 | { 11 | 12 | protected final BufferedWriter _out; 13 | 14 | 15 | public LineWriter(OutputStream out) { 16 | _out = new BufferedWriter(new OutputStreamWriter(out,Charsets.UTF_8)); 17 | } 18 | 19 | 20 | public static Factory factory() { 21 | return new Factory(); 22 | } 23 | 24 | 25 | @Override 26 | public void close() throws IOException { 27 | _out.close(); 28 | } 29 | 30 | @Override 31 | public void writeEntry(String item) throws IOException 32 | { 33 | _out.write(item + "\n"); 34 | } 35 | 36 | /* 37 | /********************************************************************** 38 | /* Helper classes 39 | /********************************************************************** 40 | */ 41 | 42 | /** 43 | * Basic factory implementation. The only noteworthy things are: 44 | *

<ul> 45 | * <li>Ability to configure linefeed to use (including none, pass null)</li> 46 | * <li>Writer uses {@link BufferedOutputStream} by default (can be disabled)</li> 47 | * </ul>
48 | */ 49 | public static class Factory 50 | extends DataWriterFactory 51 | { 52 | public Factory() { 53 | } 54 | 55 | 56 | 57 | @Override 58 | public DataWriter constructWriter(OutputStream out) { 59 | return new LineWriter(out); 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/dict/build/Main.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package dict.build; 5 | 6 | /** 7 | * @author zhangcheng 8 | * 9 | */ 10 | public class Main { 11 | 12 | /** 13 | * @param args 14 | */ 15 | public static void main(String[] args) { 16 | 17 | if (args.length == 0) { 18 | System.out.println("rawpath"); 19 | return; 20 | } 21 | 22 | String rawpath = null; 23 | if (args.length > 0) { 24 | rawpath = args[0]; 25 | } 26 | 27 | String left = null; 28 | String right = null; 29 | String entropyfile = null; 30 | 31 | FastBuilder builder = new FastBuilder(); 32 | 33 | if (null == right) 34 | right = builder.genFreqRight(rawpath, 6, 10 * 1024); 35 | if (null == left) 36 | left = builder.genLeft(rawpath, 6, 10 * 1024); 37 | if (null == entropyfile) 38 | entropyfile = builder.mergeEntropy(right, left); 39 | 40 | builder.extractWords(right, entropyfile); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/dict/build/PosProbability.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package dict.build; 5 | 6 | import java.io.BufferedReader; 7 | import java.io.BufferedWriter; 8 | import java.io.File; 9 | import java.io.FileNotFoundException; 10 | import java.io.IOException; 11 | import java.util.Map; 12 | 13 | import com.google.common.base.Charsets; 14 | import com.google.common.collect.Maps; 15 | import com.google.common.io.Files; 16 | 17 | /** 18 | * @author Jennifer 19 | * 20 | */ 21 | public class PosProbability { 22 | 23 | /** 24 | * 
@param args 25 | * @throws IOException 26 | * @throws FileNotFoundException 27 | */ 28 | public static void main(String[] args) throws FileNotFoundException, IOException { 29 | 30 | File dictFile = new File("sogou.dic"); 31 | File ppFile = new File(dictFile.getParentFile(), "dict/build/pos_prop.txt"); 32 | try(BufferedReader br = Files.newReader(dictFile, Charsets.UTF_8); 33 | BufferedWriter pw = Files.newWriter(ppFile, Charsets.UTF_8); 34 | ) { 35 | String line = null; 36 | Map pp = Maps.newHashMap(); 37 | while (null != (line = br.readLine())) { 38 | String[] seg = line.split("\t"); 39 | // int freq = Integer.parseInt(seg[2]); 40 | int freq = 1; 41 | for (int i = 0; i < seg[0].length(); ++i) { 42 | String label = null; 43 | if (0 == i) { 44 | label = "S"; 45 | } else if (seg[0].length() - 1 == i) { 46 | label = "E"; 47 | } else { 48 | label = "M"; 49 | } 50 | String key = seg[0].substring(i, i + 1); 51 | if (pp.containsKey(key)) { 52 | pp.get(key).incrby(label, freq); 53 | } else { 54 | CounterMap cm = new CounterMap(); 55 | cm.incrby(label, freq); 56 | pp.put(key, cm); 57 | } 58 | } 59 | } 60 | String[] labels = new String[]{"S", "M", "E"}; 61 | for (String key : pp.keySet()) { 62 | int total = 0; 63 | for (String l : labels) { 64 | total += pp.get(key).get(l); 65 | } 66 | if (0 == total) 67 | continue; 68 | StringBuilder bui = new StringBuilder(); 69 | bui.append(key); 70 | for (String l : labels) { 71 | bui.append("\t").append(pp.get(key).get(l) * 1.0 / total); 72 | } 73 | bui.append("\n"); 74 | pw.write(bui.toString()); 75 | } 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/main/java/dict/build/SplitFileSorter.java: -------------------------------------------------------------------------------- 1 | package dict.build; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.InputStream; 6 | 7 | import com.fasterxml.sort.SortConfig; 8 | import com.fasterxml.sort.Sorter; 
9 | 10 | /** 11 | * Basic {@link Sorter} implementation that operates on text line input. 12 | */ 13 | public class SplitFileSorter extends Sorter 14 | { 15 | /** 16 | * Let's limit maximum memory used for pre-sorting when invoked from command-line to be 17 | * 256 megs 18 | */ 19 | public final static long MAX_HEAP_FOR_PRESORT = 256L * 1024 * 1024; 20 | 21 | /** 22 | * Also just in case our calculations are wrong, require 10 megs for pre-sort anyway 23 | * (if invoked from CLI) 24 | */ 25 | public final static long MIN_HEAP_FOR_PRESORT = 10L * 1024 * 1024; 26 | 27 | public SplitFileSorter() { 28 | this(new SortConfig()); 29 | } 30 | 31 | public SplitFileSorter(SortConfig config) 32 | { 33 | super(config, 34 | LineReader.factory(), LineWriter.factory(), 35 | new SplitStringComparator()); 36 | } 37 | 38 | /* 39 | /********************************************************************** 40 | /* Main method for simple command-line operation for line-based 41 | /* sorting using default ISO-8859-1 collation (i.e. byte-by-byte sorting) 42 | /********************************************************************** 43 | */ 44 | 45 | public static void main(String[] args) throws Exception 46 | { 47 | if (args.length > 1) { 48 | System.err.println("Usage: java "+SplitFileSorter.class.getName()+" [input-file]"); 49 | System.err.println("(where input-file is optional; if missing, read from STDIN)"); 50 | System.exit(1); 51 | } 52 | 53 | // One more thing: use 50% of memory (but no more than 200 megs) for pre-sort 54 | // minor tweak: consider first 40 megs to go for other overhead... 
55 | long availMem = Runtime.getRuntime().maxMemory() - (40 * 1024 * 1024); 56 | long maxMem = (availMem >> 1); 57 | if (maxMem > MAX_HEAP_FOR_PRESORT) { 58 | maxMem = MAX_HEAP_FOR_PRESORT; 59 | } else if (maxMem < MIN_HEAP_FOR_PRESORT) { 60 | maxMem = MIN_HEAP_FOR_PRESORT; 61 | } 62 | final SplitFileSorter sorter = new SplitFileSorter(new SortConfig().withMaxMemoryUsage(maxMem)); 63 | final InputStream in; 64 | 65 | if (args.length == 0) { 66 | in = System.in; 67 | } else { 68 | File input = new File(args[0]); 69 | if (!input.exists() || input.isDirectory()) { 70 | System.err.println("File '"+input.getAbsolutePath()+"' does not exist (or is not file)"); 71 | System.exit(2); 72 | } 73 | in = new FileInputStream(input); 74 | } 75 | 76 | // To be able to print out progress, need to spin one additional thread... 77 | new Thread(new Runnable() { 78 | @Override 79 | public void run() { 80 | final long start = System.currentTimeMillis(); 81 | try { 82 | while (!sorter.isCompleted()) { 83 | Thread.sleep(5000L); 84 | if (sorter.isPreSorting()) { 85 | System.err.printf(" pre-sorting: %d files written\n", sorter.getNumberOfPreSortFiles()); 86 | } else if (sorter.isSorting()) { 87 | System.err.printf(" sorting, round: %d/%d\n", 88 | sorter.getSortRound(), sorter.getNumberOfSortRounds()); 89 | } 90 | } 91 | double secs = (System.currentTimeMillis() - start) / 1000.0; 92 | System.err.printf("Completed: took %.1f seconds.\n", secs); 93 | } catch (InterruptedException e) { 94 | double secs = (System.currentTimeMillis() - start) / 1000.0; 95 | System.err.printf("[INTERRUPTED] -- took %.1f seconds.\n", secs); 96 | } 97 | } 98 | }).start(); 99 | sorter.sort(in, System.out); 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/java/dict/build/SplitStringComparator.java: -------------------------------------------------------------------------------- 1 | package dict.build; 2 | 3 | import java.util.Comparator; 4 | 5 | /** 6 | 
* Orders tab-separated text records by the numeric value of their second
* field, largest value first.
*
* A record with fewer than four tab-separated fields is considered malformed:
* malformed records sort after every well-formed record and compare equal to
* each other. (The previous implementation returned 1 whenever either side
* was malformed, so compare(a, b) and compare(b, a) could both be positive,
* which violates the {@link Comparator} contract.)
*
* NOTE(review): the type argument of Comparator was missing from the
* extracted source; it is reconstructed from the signature of compare.
*/
public class SplitStringComparator
    implements Comparator<String>
{
    /** Minimum number of tab-separated fields a record needs to be ranked by value. */
    private static final int MIN_FIELDS = 4;

    /**
     * @param o1 first record
     * @param o2 second record
     * @return negative when o1 sorts before o2 (larger second field, or o2 malformed),
     *         positive for the reverse, zero when both rank equally
     * @throws NumberFormatException if the second field of a well-formed record is not a number
     */
    @Override
    public int compare(String o1, String o2)
    {
        String[] seg1 = o1.split("\t");
        String[] seg2 = o2.split("\t");
        boolean bad1 = seg1.length < MIN_FIELDS;
        boolean bad2 = seg2.length < MIN_FIELDS;
        if (bad1 || bad2) {
            if (bad1 && bad2) {
                return 0;
            }
            // exactly one side is malformed: it goes last
            return bad1 ? 1 : -1;
        }
        double d1 = Double.parseDouble(seg1[1]);
        double d2 = Double.parseDouble(seg2[1]);
        // arguments swapped on purpose: descending order
        return Double.compare(d2, d1);
    }

}
-------------------------------------------------------------------------------- /src/main/java/dict/build/TernaryNode.java: -------------------------------------------------------------------------------- 1 | package dict.build; 2 | 3 | @Deprecated 4 | public class TernaryNode { 5 | 6 | /** character of this node. */ 7 | private char splitchar; 8 | 9 | /** whether this character is the end of a word. */ 10 | private int value; 11 | 12 | /** low child of this node. */ 13 | private TernaryNode lokid; 14 | 15 | /** equal child of this node. */ 16 | private TernaryNode eqkid; 17 | 18 | /** high child of this node. */ 19 | private TernaryNode hikid; 20 | 21 | /** 22 | * This will create a new TernaryNode with the supplied 23 | * character. 24 | * 25 | * @param c 26 | * char 27 | */ 28 | public TernaryNode(final char c) { 29 | this.splitchar = c; 30 | } 31 | 32 | /** 33 | * This returns the splitchar of this TernaryNode. 34 | * 35 | * @return char 36 | */ 37 | public char getSplitChar() { 38 | return this.splitchar; 39 | } 40 | 41 | /** 42 | * This sets the splitchar for this TernaryNode. 43 | * 44 | * @param c 45 | * char 46 | */ 47 | public void setSplitChar(final char c) { 48 | this.splitchar = c; 49 | } 50 | 51 | /** 52 | * This returns the endOfWord for this TernaryNode. 
53 | * 54 | * @return boolean 55 | */ 56 | public boolean isEndOfWord() { 57 | return this.value > 0; 58 | } 59 | 60 | public int getValue() { 61 | return this.value; 62 | } 63 | 64 | /** 65 | * This sets the endOfWord for this TernaryNode. 66 | * 67 | * @param b 68 | * boolean 69 | */ 70 | public void setEndOfWord(final int b) { 71 | this.value = b; 72 | } 73 | 74 | /** 75 | * This returns the lokid of this TernaryNode. 76 | * 77 | * @return TernaryNode 78 | */ 79 | public TernaryNode getLokid() { 80 | return this.lokid; 81 | } 82 | 83 | /** 84 | * This sets the lokid of this TernaryNode. 85 | * 86 | * @param node 87 | * TernaryNode 88 | */ 89 | public void setLokid(final TernaryNode node) { 90 | this.lokid = node; 91 | } 92 | 93 | /** 94 | * This returns the eqkid of this TernaryNode. 95 | * 96 | * @return TernaryNode 97 | */ 98 | public TernaryNode getEqkid() { 99 | return this.eqkid; 100 | } 101 | 102 | /** 103 | * This sets the eqkid of this TernaryNode. 104 | * 105 | * @param node 106 | * TernaryNode 107 | */ 108 | public void setEqkid(final TernaryNode node) { 109 | this.eqkid = node; 110 | } 111 | 112 | /** 113 | * This returns the hikid of this TernaryNode. 114 | * 115 | * @return TernaryNode 116 | */ 117 | public TernaryNode getHikid() { 118 | return this.hikid; 119 | } 120 | 121 | /** 122 | * This sets the hikid of this TernaryNode. 
123 | * 124 | * @param node 125 | * TernaryNode 126 | */ 127 | public void setHikid(final TernaryNode node) { 128 | this.hikid = node; 129 | } 130 | } -------------------------------------------------------------------------------- /src/main/java/dict/build/TernaryTree.java: -------------------------------------------------------------------------------- 1 | package dict.build; 2 | 3 | import java.io.IOException; 4 | import java.io.Writer; 5 | import java.util.ArrayList; 6 | import java.util.Collections; 7 | import java.util.List; 8 | import java.util.StringTokenizer; 9 | 10 | /** 11 | * TernaryTree is an implementation of a ternary tree. Methods are 12 | * provided for inserting strings and searching for strings. The algorithms in 13 | * this class are all recursive, and have not been optimized for any particular 14 | * purpose. Data which is inserted is not sorted before insertion, however data 15 | * can be inserted beginning with the median of the supplied data. 16 | * 17 | * @author Middleware Services 18 | * @version $Revision$ $Date$ 19 | */ 20 | @Deprecated 21 | public class TernaryTree { 22 | 23 | /** File system line separator. */ 24 | private static final String LINE_SEPARATOR = System 25 | .getProperty("line.separator"); 26 | 27 | /** root node of the ternary tree. */ 28 | private TernaryNode root; 29 | 30 | /** Default Constructor. */ 31 | public TernaryTree() { 32 | } 33 | 34 | /** 35 | * This will insert the supplied word into the TernaryTree. 36 | * 37 | * @param word 38 | * String to insert 39 | */ 40 | public void insert(final String word, final int value) { 41 | if (word != null) { 42 | this.root = insertNode(this.root, word, 0, value); 43 | } 44 | } 45 | 46 | /** 47 | * This will return true if the supplied word has been inserted into the 48 | * TernaryTree. 
49 | * 50 | * @param word 51 | * String to search for 52 | * 53 | * @return boolean - whether word was found 54 | */ 55 | public int search(final String word) { 56 | return this.searchNode(this.root, word, 0); 57 | } 58 | 59 | /** 60 | * This will return an array of strings which partially match the supplied 61 | * word. word should be of the format '.e.e.e' Where the '.' character 62 | * represents any valid character. Possible results from this query include: 63 | * Helene, delete, or severe Note that no substring matching occurs, results 64 | * only include strings of the same length. If the supplied word does not 65 | * contain the '.' character, then a regular search is performed. 66 | * 67 | * @param word 68 | * String to search for 69 | * 70 | * @return String[] - of matching words 71 | */ 72 | public String[] partialSearch(final String word) { 73 | String[] results = null; 74 | final List matches = this.partialSearchNode(this.root, 75 | new ArrayList(), "", word, 0); 76 | if (matches == null) { 77 | results = new String[] {}; 78 | } else { 79 | results = matches.toArray(new String[matches.size()]); 80 | } 81 | return results; 82 | } 83 | 84 | /** 85 | * This will return an array of strings which are near to the supplied word 86 | * by the supplied distance. For the query nearSearch("fisher", 2): Possible 87 | * results include: cipher, either, fishery, kosher, sister. If the supplied 88 | * distance is not > 0, then a regular search is performed. 
89 | * 90 | * @param word 91 | * String to search for 92 | * @param distance 93 | * int for valid match 94 | * 95 | * @return String[] - of matching words 96 | */ 97 | public String[] nearSearch(final String word, final int distance) { 98 | String[] results = null; 99 | final List matches = this.nearSearchNode(this.root, distance, 100 | new ArrayList(), "", word, 0); 101 | if (matches == null) { 102 | results = new String[] {}; 103 | } else { 104 | results = matches.toArray(new String[matches.size()]); 105 | } 106 | return results; 107 | } 108 | 109 | /** 110 | * This will return a list of all the words in this 111 | * TernaryTree. This is a very expensive operation, every node in the 112 | * tree is traversed. The returned list cannot be modified. 113 | * 114 | * @return String[] - of words 115 | */ 116 | public List getWords() { 117 | final List words = this.traverseNode(this.root, "", 118 | new ArrayList()); 119 | return Collections.unmodifiableList(words); 120 | } 121 | 122 | /** 123 | * This will print an ASCII representation of this TernaryTree 124 | * to the supplied PrintWriter. This is a very expensive 125 | * operation, every node in the tree is traversed. The output produced is 126 | * hard to read, but it should give an indication of whether or not your 127 | * tree is balanced. 128 | * 129 | * @param out 130 | * PrintWriter to print to 131 | * @throws IOException 132 | * if an error occurs 133 | */ 134 | public void print(final Writer out) throws IOException { 135 | out.write(printNode(this.root, "", 0)); 136 | } 137 | 138 | /** 139 | * This will recursively insert a word into the TernaryTree one 140 | * node at a time beginning at the supplied node. 
141 | * 142 | * @param node 143 | * TernaryNode to put character in 144 | * @param word 145 | * String to be inserted 146 | * @param index 147 | * int of character in word 148 | * 149 | * @return TernaryNode - to insert 150 | */ 151 | private TernaryNode insertNode(TernaryNode node, final String word, 152 | final int index, final int value) { 153 | if (index < word.length()) { 154 | final char c = word.charAt(index); 155 | if (node == null) { 156 | node = new TernaryNode(c); 157 | } 158 | 159 | final char split = node.getSplitChar(); 160 | if (c < split) { 161 | node.setLokid(insertNode(node.getLokid(), word, index, value)); 162 | } else if (c == split) { 163 | if (index == word.length() - 1) { 164 | node.setEndOfWord(value); 165 | } 166 | node.setEqkid(insertNode(node.getEqkid(), word, index + 1, 167 | value)); 168 | } else { 169 | node.setHikid(insertNode(node.getHikid(), word, index, value)); 170 | } 171 | } 172 | return node; 173 | } 174 | 175 | /** 176 | * This will recursively search for a word in the TernaryTree 177 | * one node at a time beginning at the supplied node. 
178 | * 179 | * @param node 180 | * TernaryNode to search in 181 | * @param word 182 | * String to search for 183 | * @param index 184 | * int of character in word 185 | * 186 | * @return boolean - whether or not word was found 187 | */ 188 | private int searchNode(final TernaryNode node, final String word, 189 | final int index) { 190 | if (node != null && index < word.length()) { 191 | final char c = word.charAt(index); 192 | final char split = node.getSplitChar(); 193 | if (c < split) { 194 | return searchNode(node.getLokid(), word, index); 195 | } else if (c > split) { 196 | return searchNode(node.getHikid(), word, index); 197 | } else { 198 | if (index == word.length() - 1) { 199 | if (node.isEndOfWord()) { 200 | return node.getValue(); 201 | } 202 | } else { 203 | return searchNode(node.getEqkid(), word, index + 1); 204 | } 205 | } 206 | } 207 | return -1; 208 | } 209 | 210 | /** 211 | * This will recursively search for a partial word in the 212 | * TernaryTree one node at a time beginning at the supplied node. 213 | * 214 | * @param node 215 | * TernaryNode to search in 216 | * @param matches 217 | * ArrayList of partial matches 218 | * @param match 219 | * String the current word being examined 220 | * @param word 221 | * String to search for 222 | * @param index 223 | * int of character in word 224 | * 225 | * @return ArrayList - of matches 226 | */ 227 | private List partialSearchNode(final TernaryNode node, 228 | List matches, final String match, final String word, 229 | final int index) { 230 | if (node != null && index < word.length()) { 231 | final char c = word.charAt(index); 232 | final char split = node.getSplitChar(); 233 | if (c == '.' || c < split) { 234 | matches = partialSearchNode(node.getLokid(), matches, match, 235 | word, index); 236 | } 237 | if (c == '.' 
|| c == split) { 238 | if (index == word.length() - 1) { 239 | if (node.isEndOfWord()) { 240 | matches.add(match + split); 241 | } 242 | } else { 243 | matches = partialSearchNode(node.getEqkid(), matches, match 244 | + split, word, index + 1); 245 | } 246 | } 247 | if (c == '.' || c > split) { 248 | matches = partialSearchNode(node.getHikid(), matches, match, 249 | word, index); 250 | } 251 | } 252 | return matches; 253 | } 254 | 255 | /** 256 | * This will recursively search for a near match word in the 257 | * TernaryTree one node at a time beginning at the supplied node. 258 | * 259 | * @param node 260 | * TernaryNode to search in 261 | * @param distance 262 | * int of a valid match, must be > 0 263 | * @param matches 264 | * ArrayList of near matches 265 | * @param match 266 | * String the current word being examined 267 | * @param word 268 | * String to search for 269 | * @param index 270 | * int of character in word 271 | * 272 | * @return ArrayList - of matches 273 | */ 274 | private List nearSearchNode(final TernaryNode node, 275 | final int distance, List matches, final String match, 276 | final String word, final int index) { 277 | if (node != null && distance >= 0) { 278 | 279 | final char c; 280 | if (index < word.length()) { 281 | c = word.charAt(index); 282 | } else { 283 | c = (char) -1; 284 | } 285 | 286 | final char split = node.getSplitChar(); 287 | 288 | if (distance > 0 || c < split) { 289 | matches = nearSearchNode(node.getLokid(), distance, matches, 290 | match, word, index); 291 | } 292 | 293 | final String newMatch = match + split; 294 | if (c == split) { 295 | 296 | if (node.isEndOfWord() && distance >= 0 297 | && newMatch.length() + distance >= word.length()) { 298 | matches.add(newMatch); 299 | } 300 | 301 | matches = nearSearchNode(node.getEqkid(), distance, matches, 302 | newMatch, word, index + 1); 303 | } else { 304 | 305 | if (node.isEndOfWord() && distance - 1 >= 0 306 | && newMatch.length() + distance - 1 >= word.length()) { 307 | 
matches.add(newMatch); 308 | } 309 | 310 | matches = nearSearchNode(node.getEqkid(), distance - 1, 311 | matches, newMatch, word, index + 1); 312 | } 313 | 314 | if (distance > 0 || c > split) { 315 | matches = nearSearchNode(node.getHikid(), distance, matches, 316 | match, word, index); 317 | } 318 | } 319 | return matches; 320 | } 321 | 322 | /** 323 | * This will recursively traverse every node in the TernaryTree 324 | * one node at a time beginning at the supplied node. The result is a string 325 | * representing every word, which is delimited by the LINE_SEPARATOR 326 | * character. 327 | * 328 | * @param node 329 | * TernaryNode to begin traversing 330 | * @param s 331 | * String of words found at the supplied node 332 | * @param words 333 | * ArrayList which will be returned (recursive 334 | * function) 335 | * 336 | * @return String - containing all words from the supplied node 337 | */ 338 | private List traverseNode(final TernaryNode node, final String s, 339 | List words) { 340 | if (node != null) { 341 | 342 | words = this.traverseNode(node.getLokid(), s, words); 343 | 344 | final String c = String.valueOf(node.getSplitChar()); 345 | if (node.getEqkid() != null) { 346 | words = this.traverseNode(node.getEqkid(), s + c, words); 347 | } 348 | 349 | if (node.isEndOfWord()) { 350 | words.add(s + c); 351 | } 352 | 353 | words = this.traverseNode(node.getHikid(), s, words); 354 | } 355 | return words; 356 | } 357 | 358 | /** 359 | * This will recursively traverse every node in the TernaryTree 360 | * one node at a time beginning at the supplied node. The result is an ASCII 361 | * string representation of the tree beginning at the supplied node. 
362 | * 363 | * @param node 364 | * TernaryNode to begin traversing 365 | * @param s 366 | * String of words found at the supplied node 367 | * @param depth 368 | * int of the current node 369 | * 370 | * @return String - containing all words from the supplied node 371 | */ 372 | private String printNode(final TernaryNode node, final String s, 373 | final int depth) { 374 | final StringBuffer buffer = new StringBuffer(); 375 | if (node != null) { 376 | buffer.append(this.printNode(node.getLokid(), " <-", depth + 1)); 377 | 378 | final String c = String.valueOf(node.getSplitChar()); 379 | final StringBuffer eq = new StringBuffer(); 380 | if (node.getEqkid() != null) { 381 | eq.append(this.printNode(node.getEqkid(), s + c + "--", 382 | depth + 1)); 383 | } else { 384 | int count = (new StringTokenizer(s, "--")).countTokens(); 385 | if (count > 0) { 386 | count--; 387 | } 388 | for (int i = 1; i < depth - count - 1; i++) { 389 | eq.append(" "); 390 | } 391 | eq.append(s).append(c).append(TernaryTree.LINE_SEPARATOR); 392 | } 393 | buffer.append(eq); 394 | 395 | buffer.append(this.printNode(node.getHikid(), " >-", depth + 1)); 396 | } 397 | return buffer.toString(); 398 | } 399 | 400 | public static void main(String[] args) { 401 | TernaryTree tt = new TernaryTree(); 402 | tt.insert("a", 1); 403 | tt.insert("aa", 2); 404 | tt.insert("aaa", 3); 405 | tt.insert("aaaa", 4); 406 | System.out.println(tt.search("aaa")); 407 | } 408 | } -------------------------------------------------------------------------------- /src/main/resources/dict.properties: -------------------------------------------------------------------------------- 1 | HELP_DESCRIPTION=list all cmd 2 | RAW_FILE=raw data file line by line 3 | SORT_MEM_SIZE_IN_MB=memory size in mb use by sorting 4 | MAX_WORD_LENGTH=max length of word 5 | OUTPUT_DICT_FILE=output result dict file -------------------------------------------------------------------------------- /src/main/resources/logback.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | %d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger -%msg%n 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | ERROR 34 | ACCEPT 35 | DENY 36 | 37 | 38 | 39 | 40 | ${log_dir}/%d{yyyy-MM-dd}/error-log.log 41 | 43 | ${maxHistory} 44 | 45 | 46 | 52 | 56 | 57 | 58 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger - %msg%n 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | WARN 69 | ACCEPT 70 | DENY 71 | 72 | 73 | 74 | ${log_dir}/%d{yyyy-MM-dd}/warn-log.log 75 | 76 | 77 | ${maxHistory} 78 | 79 | 80 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger - %msg%n 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | INFO 92 | ACCEPT 93 | DENY 94 | 95 | 96 | 97 | ${log_dir}/%d{yyyy-MM-dd}/info-log.log 98 | 99 | 100 | ${maxHistory} 101 | 102 | 103 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger - %msg%n 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | DEBUG 115 | ACCEPT 116 | DENY 117 | 118 | 119 | 120 | ${log_dir}/%d{yyyy-MM-dd}/debug-log.log 121 | 122 | 123 | ${maxHistory} 124 | 125 | 126 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger - %msg%n 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | TRACE 138 | ACCEPT 139 | DENY 140 | 141 | 142 | 143 | ${log_dir}/%d{yyyy-MM-dd}/trace-log.log 144 | 145 | 146 | ${maxHistory} 147 | 148 | 149 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger - %msg%n 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | --------------------------------------------------------------------------------