├── .github
    └── workflows
    │   └── codeql-analysis.yml
├── .gitignore
├── .settings
    ├── org.eclipse.core.resources.prefs
    └── org.eclipse.m2e.core.prefs
├── LICENSE
├── README.md
├── SECURITY.md
├── build.gradle
├── dict_build-0.0.3.tar
├── gradle
    └── wrapper
    │   └── gradle-wrapper.properties
├── gradlew
├── gradlew.bat
├── pom.xml
├── settings.gradle
└── src
    └── main
        ├── java
            ├── com
            │   └── fasterxml
            │   │   └── sort
            │   │       ├── DataReader.java
            │   │       ├── DataReaderFactory.java
            │   │       ├── DataWriter.java
            │   │       ├── DataWriterFactory.java
            │   │       ├── IterableSorterException.java
            │   │       ├── IteratingSorter.java
            │   │       ├── Merger.java
            │   │       ├── SortConfig.java
            │   │       ├── Sorter.java
            │   │       ├── SorterBase.java
            │   │       ├── SortingState.java
            │   │       ├── TempFileProvider.java
            │   │       ├── std
            │   │           ├── ByteArrayComparator.java
            │   │           ├── RawTextLineReader.java
            │   │           ├── RawTextLineWriter.java
            │   │           ├── StdComparator.java
            │   │           ├── StdTempFileProvider.java
            │   │           └── TextFileSorter.java
            │   │       └── util
            │   │           ├── BlockingQueueReader.java
            │   │           ├── CastingIterator.java
            │   │           ├── CollectionReader.java
            │   │           ├── NaturalComparator.java
            │   │           └── SegmentedBuffer.java
            ├── dict.properties
            ├── dict
            │   └── build
            │   │   ├── Builder.java
            │   │   ├── CounterMap.java
            │   │   ├── FastBuilder.java
            │   │   ├── LineReader.java
            │   │   ├── LineWriter.java
            │   │   ├── Main.java
            │   │   ├── PosProbability.java
            │   │   ├── SplitFileSorter.java
            │   │   ├── SplitStringComparator.java
            │   │   ├── TernaryNode.java
            │   │   └── TernaryTree.java
            └── pos_prop.txt
        └── resources
            ├── dict.properties
            ├── logback.xml
            └── pos_prop.txt


/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
 1 | # For most projects, this workflow file will not need changing; you simply need
 2 | # to commit it to your repository.
 3 | #
 4 | # You may wish to alter this file to override the set of languages analyzed,
 5 | # or to provide custom queries or build logic.
 6 | #
 7 | # ******** NOTE ********
 8 | # We have attempted to detect the languages in your repository. Please check
 9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 | 
14 | on:
15 |   push:
16 |     branches: [ master ]
17 |   pull_request:
18 |     # The branches below must be a subset of the branches above
19 |     branches: [ master ]
20 |   schedule:
21 |     - cron: '42 2 * * 2'
22 | 
23 | jobs:
24 |   analyze:
25 |     name: Analyze
26 |     runs-on: ubuntu-latest
27 | 
28 |     strategy:
29 |       fail-fast: false  # let the remaining matrix entries finish if one fails
30 |       matrix:
31 |         language: [ 'java' ]
32 |         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
33 |         # Learn more:
34 |         # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
35 | 
36 |     steps:
37 |     - name: Checkout repository
38 |       uses: actions/checkout@v4
39 | 
40 |     # Initializes the CodeQL tools for scanning.
41 |     - name: Initialize CodeQL
42 |       uses: github/codeql-action/init@v3
43 |       with:
44 |         languages: ${{ matrix.language }}
45 |         # If you wish to specify custom queries, you can do so here or in a config file.
46 |         # By default, queries listed here will override any specified in a config file.
47 |         # Prefix the list here with "+" to use these queries and those in the config file.
48 |         # queries: ./path/to/local/query, your-org/your-repo/queries@main
49 | 
50 |     # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
51 |     # If this step fails, then you should remove it and run the build manually (see below)
52 |     - name: Autobuild
53 |       uses: github/codeql-action/autobuild@v3
54 | 
55 |     # ℹ️ Command-line programs to run using the OS shell.
56 |     # 📚 https://git.io/JvXDl
57 | 
58 |     # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
59 |     #    and modify them (or add more) to build your code if your project
60 |     #    uses a compiled language
61 | 
62 |     #- run: |
63 |     #   make bootstrap
64 |     #   make release
65 | 
66 |     - name: Perform CodeQL Analysis
67 |       uses: github/codeql-action/analyze@v3  # runs the queries and uploads the results
68 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.class
 2 | 
 3 | # Mobile Tools for Java (J2ME)
 4 | .mtj.tmp/
 5 | 
 6 | # Package Files #
 7 | *.jar
 8 | *.war
 9 | *.ear
10 | 
11 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
12 | hs_err_pid*
13 | /bin
14 | /target
15 | # idea IDE
16 | out
17 | .idea
18 | *.iml
19 | *.ipr
20 | *.iws
21 | *.ids
22 | 
23 | #file
24 | *.pdf
25 | 
26 | # eclipse IDE
27 | .classpath
28 | .project
29 | .settings
30 | 
31 | # build
32 | .gradle
33 | /build
34 | 
35 | #mac
36 | .DS_Store
37 | 
38 | #log
39 | logs
40 | 


--------------------------------------------------------------------------------
/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding/<project>=UTF-8
3 | 


--------------------------------------------------------------------------------
/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
203 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 构建词库
  2 | ==========
  3 | 
  4 | 从原始文本中，自动构建词库，目前只适用于中文。参考：
  5 | 
  6 | http://www.matrix67.com/blog/archives/5044
  7 | 
  8 | ### new in 0.0.3
  9 | 
 10 | 1. 使用radix tree代替ternary search tree，提升性能。
 11 | 2. 加入LOG信息，展示抽取的进度。
 12 | 
 13 | 
 14 | ### new in 0.0.2
 15 | 
 16 | 1. 直接导入[java-merge-sort](https://github.com/cowtowncoder/java-merge-sort)源码, thx[@cowtowncoder](https://github.com/cowtowncoder)
 17 | 2. 将之前的maven项目，转变为一个gradle项目，方便打包使用。
 18 | 
 19 | ### 成词条件
 20 | 
 21 | 1. 互信息
 22 | 2. 左右熵
 23 | 3. 位置成词概率
 24 | 4. ngram 频率
 25 | 
 26 | ### 运行方法
 27 | 
 28 | 1. [下载](https://github.com/sing1ee/dict_build/blob/master/dict_build-0.0.3.tar?raw=true)或者gradle distTar打包程序
 29 | 2. 解压dict_build-x.x.x.tar
 30 | 3. 解压之后,进入bin. 运行:./dict_build 你的数据文件的绝对路径
 31 | 4. 结束之后,在数据文件同目录有文件:words_sort.data
 32 | 5. 五列分别为:词,词频,互信息,左右熵,位置成词概率.
 33 | 
 34 | ### 注意
 35 | 
 36 | - 数据文件一定要是UTF8编码的
 37 | - 如果数据文件较大, 出现out of memory问题，可以尝试如下方式，限mac和linux，其中2G可以根据实际情况调整
 38 | 
 39 | ```shell
 40 | export JAVA_OPTS=-Xmx2G
 41 | ./dict_build 你的数据文件的绝对路径
 42 | ```
 43 | 
 44 | ### 示例
 45 | 
 46 | #### 《金瓶梅》抽取结果
 47 | ```shell
 48 | 西门庆  4754    6.727920454563199   2.0315193024276885  0.17472535684926388
 49 | 月娘    1829    6.491853096329675   2.3714166640957095  0.22135096835144072
 50 | 敬济    906 9.084808387804362   2.554594603718855   0.14485683987274656
 51 | 春梅    799 8.134426320220927   2.7880175589451714  0.16484505593416485
 52 | 玳安    796 8.228818690495881   2.865686193737731   0.11791820110723605
 53 | 后边    617 6.6293566200796095  4.008365154080131   0.2160373686259245
 54 | 玉楼    594 7.977279923499917   2.27346284978306    0.27518689925240297
 55 | 明日    580 6.189824558880018   2.705423396095033   0.1774535638537181
 56 | 两银子  458 6.129283016944967   2.351100547282295   0.3809078896437581
 57 | 小厮    454 7.257387842692652   3.945653525477103   0.16666666666666666
 58 | 打发    444 6.870364719583405   3.694604352707633   0.18409496065046307
 59 | 如今    410 6.643856189774725   2.1460777430093394  0.1780766096169519
 60 | 淫妇    382 7.768184324776926   3.277903508489837   0.2555205047318612
 61 | 桂姐    371 7.584962500721156   2.5922046565140424  0.36255305256284687
 62 | 老婆    331 6.266786540694902   3.5783015008688523  0.3758007117437722
 63 | 衣服    309 8.90388184573618    2.786139685416002   0.13284518828451883
 64 | 丫头    297 7.383704292474053   4.291010086795063   0.21875
 65 | 潘金莲  288 8.276124405274238   2.4955186567189194  0.35333669524289796
 66 | 昨日    285 6.857980995127572   2.6387249970833997  0.1774535638537181
 67 | 王婆    284 7.1799090900149345  2.3129267619188907  0.3758007117437722
 68 | ```
 69 | 
 70 | #### 《西游记》抽取结果
 71 | ```shell
 72 | 八戒    1807    7.88874324889826    2.00952580557629    0.36441586280814575
 73 | 师父    1632    7.507794640198696   3.745294449785798   0.1371395690812608
 74 | 大圣    1270    6.599912842187128   2.7790919785432147  0.13128460061010055
 75 | 唐僧    1003    7.076815597050832   4.350465172292435   0.43277723258096173
 76 | 菩萨    765 9.471675214392045   3.6013747138664756  0.15910495734948696
 77 | 妖精    634 7.199672344836364   3.1817261900583627  0.13134411600669268
 78 | 徒弟    439 8.060695931687555   2.498555429145656   0.15553809897879026
 79 | 兄弟    284 7.845490050944376   2.93037668783551    0.16085578446909668
 80 | 宝贝    283 9.319672120946995   2.616164396748633   0.15108220492589827
 81 | 今日    282 6.714245517666122   2.1303069812971214  0.1774535638537181
 82 | 取经    263 7.539158811108032   2.663944888382171   0.10181178023912565
 83 | 如今    259 6.189824558880018   2.056188859866133   0.1780766096169519
 84 | 认得    223 6.357552004618085   2.9543379335926954  0.2326782564877803
 85 | 东土    212 8.422064766172811   3.326253983395916   0.14745277618775043
 86 | 孙大圣  202 6.022367813028454   2.4886576514017107  0.13128460061010055
 87 | 变作    189 7.554588851677638   3.0713596792578635  0.23452975920036348
 88 | 玉帝    189 8.912889336229961   2.973106046717708   0.27518689925240297
 89 | 土地    179 7.499845887083206   3.1206506190132566  0.2819944064037033
 90 | 欢喜    173 8.861086905995393   2.184918471204895   0.31727272727272726
 91 | 贫僧    170 7.400879436282184   2.0731236036504477  0.43277723258096173
 92 | ```
 93 | 
 94 | #### 拉勾JD语料抽取结果
 95 | ```shell
 96 | 工作	641962	11.645208082774683	4.083574124851783	0.11247281022865935
 97 | 开发	348538	14.031184262140844	4.37645153459778	0.18409496065046307
 98 | 相关	300517	10.477758266443889	5.038915743418073	0.1758213331033888
 99 | 合作	159688	10.397674632948268	3.9963476653135794	0.19498851077798446
100 | 专业	158831	10.712527000439824	3.152041650598071	0.2640750670241287
101 | 测试	158179	13.65362883340751	4.464104436545589	0.18344308560677328
102 | 互联网	148818	16.106992250086762	3.9556191209604314	0.407386403912951
103 | 活动	131099	10.391243589427443	3.9155422678129406	0.20137250696976194
104 | 维护	120316	12.681677655209691	3.2400117935377266	0.1960306406685237
105 | 问题	112116	9.159871336778389	2.314215135279833	0.20283174185051037
106 | 优化	109563	11.324180546618742	4.331660381832997	0.2456782591010779
107 | 营销	105845	14.36850646150769	5.097001962525406	0.14961371773129828
108 | 平台	100783	9.002815015607053	4.443804901153697	0.2877423571272965
109 | 培训	93204	9.041659151637216	3.8898570467819824	0.13345998575160295
110 | 资源	90339	8.651051691178928	4.063430372719874	0.14695817490494298
111 | 相关专业	87545	8.988684686772165	2.4897196388075598	0.2905199904149232
112 | 网站	87182	8.92184093707449	5.465843476701055	0.21266038137095059
113 | 独立	86111	9.074141462752506	3.1456261690072957	0.19050261614079594
114 | 一定	83798	8.335390354693924	2.107303660112154	0.26157299167679793
115 | 流程	83165	9.321928094887362	2.5509378861028074	0.2063141084699957
116 | 网络	82742	9.087462841250339	4.681429111504988	0.21266038137095059
117 | 优秀	74600	9.370687406807217	2.0756995478573135	0.2899855507391353
118 | 信息	71009	9.820178962415188	4.2602697278449755	0.18863532864443658
119 | 媒体	67533	10.556506054671928	4.615376861300178	0.17976710334788937
120 | 编写	64337	7.960001932068081	3.482400585501417	0.265625
121 | 思维	62351	8.741466986401146	2.4320664807326646	0.15396736072031514
122 | 规划	59733	7.851749041416057	2.936854928368285	0.14166201896263245
123 | 移动	59671	10.10459875356437	3.4421932833155653	0.20137250696976194
124 | 渠道	59072	9.513727595952437	4.597891463808354	0.23578595317725753
125 | 关系	58483	8.348728154231077	2.4369558675502927	0.3170022612253688
126 | 积极	57295	9.044394119358454	2.763249521041074	0.1746848469256496
127 | 实施	56645	7.781359713524661	4.371966846513886	0.15944453739334113
128 | 福利	55732	8.475733430966399	2.4036919305145426	0.20908952728378172
129 | 其他	55665	8.434628227636725	2.9614863103296867	0.15943975441289332
130 | 功能	55087	7.787902559391432	4.1663586610392755	0.18097560975609756
131 | 代码	52431	7.88874324889826	3.876917512626917	0.2135697048449972
132 | 微信	49143	8.945443836377912	3.6868130380800643	0.18215857916308253
133 | 企业	48799	9.422064766172813	5.568662443510237	0.2905199904149232
134 | 提升	48446	8.233619676759702	3.7390647282620666	0.29750778816199375
135 | 质量	47918	10.861862340059153	3.391825261582227	0.10921827734437191
136 | 人员	47109	7.774787059601174	5.249783964892326	0.13589632038101343
137 | 数据库	45445	8.290018846932618	4.123423571610193	0.2640569395017794
138 | 商务	44047	8.189824558880018	3.44858516585648	0.12901085044961344
139 | 主动	42628	13.815583433851023	2.5049637884195137	0.1968791796700847
140 | 创意	41768	14.396470993910388	4.115068825929573	0.30544056771141337
141 | 工具	40227	9.927777962082342	2.208874047820781	0.11247281022865935
142 | 等相关	39230	11.919608238603255	3.0330398736413557	0.1758213331033888
143 | 提出	38741	10.179909090014934	4.46446156782086	0.13053040103492886
144 | 各类	38309	8.344295907915816	5.136417986953123	0.3969948596283116
145 | 操作	37061	9.06339508128851	4.676836974292029	0.23452975920036348
146 | 收集	36600	8.800899899920305	2.797691452951563	0.11388512456999896
147 | 过程	36534	8.214319120800766	2.5633950372758565	0.2063141084699957
148 | 数据分析	36081	8.442943495848729	3.5589033442862585	0.2640569395017794
149 | ```
150 | 
151 | #### 全宋词抽取结果
152 | ```shell
153 | 何处	388	6.491853096329675	3.3628674437455617	0.6815015936725298
154 | 东风	286	5.392317422778761	4.458774408044057	0.19724622030237582
155 | 江南	250	6.409390936137703	3.903802705407174	0.10545138034778331
156 | 春风	237	3.5849625007211565	4.927775131630969	0.16484505593416485
157 | 相思	225	6.614709844115209	4.358855443007008	0.242072962836686
158 | 千里	218	6.409390936137703	4.4108660037595	0.2562873368242496
159 | 人间	200	5.357552004618084	3.6298146463975085	0.13589632038101343
160 | 明月	196	5.357552004618084	4.461698115330817	0.2009720696427977
161 | 归来	195	5.08746284125034	4.510975805812117	0.4260707923476106
162 | 尊前	190	7.607330313749611	3.7677180601390012	0.1516088400320623
163 | 相逢	179	7.426264754702098	3.729594240735622	0.2827298050139276
164 | 芳草	176	7.409390936137703	4.193709696939418	0.10797973400886637
165 | 多情	175	6.247927513443586	3.8156445316213303	0.3327408912022344
166 | 阑干	167	9.30149619498255	4.1027945328835855	0.17564639607106747
167 | 梅花	159	4.807354922057604	4.829461592976214	0.1725721995566835
168 | 年年	157	3.8073549220576037	3.401504022650184	0.10157033077180087
169 | 无人	150	2.807354922057604	4.773999920722275	0.35809310100061825
170 | 如今	148	5.7279204545632	2.4554158038937834	0.1780766096169519
171 | 回首	145	7.94251450533924	3.197825274741958	0.20080445544554457
172 | 天涯	142	7.74819284958946	4.087307754334477	0.4339155749636099
173 | 一枝	135	5.20945336562895	3.5111675192832683	0.2674922938432581
174 | 当时	134	6.08746284125034	3.2683525636568564	0.14850198715988994
175 | 流水	132	5.700439718141093	4.024081009656002	0.13549047394111163
176 | 佳人	131	5.20945336562895	3.0918026501936384	0.22896958600345846
177 | 西风	128	4.321928094887363	4.310178372466687	0.19724622030237582
178 | 依旧	125	7.768184324776926	3.8821144630683277	0.1728525980911983
179 | 故人	122	5.392317422778761	2.9526098687901237	0.2363130219610269
180 | 今夜	121	5.554588851677638	3.239568407653533	0.2543231961836613
181 | 少年	120	5.357552004618084	2.8645866477158934	0.23419345103365022
182 | 春色	120	5.129283016944966	4.576389958371988	0.16484505593416485
183 | ```
184 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | # Security Policy
 2 | 
 3 | ## Supported Versions
 4 | 
 5 | Use this section to tell people about which versions of your project are
 6 | currently being supported with security updates.
 7 | 
 8 | | Version | Supported          |
 9 | | ------- | ------------------ |
10 | | 5.1.x   | :white_check_mark: |
11 | | 5.0.x   | :x:                |
12 | | 4.0.x   | :white_check_mark: |
13 | | < 4.0   | :x:                |
14 | 
15 | ## Reporting a Vulnerability
16 | 
17 | Use this section to tell people how to report a vulnerability.
18 | 
19 | Tell them where to go, how often they can expect to get an update on a
20 | reported vulnerability, what to expect if the vulnerability is accepted or
21 | declined, etc.
22 | 


--------------------------------------------------------------------------------
/build.gradle:
--------------------------------------------------------------------------------
 1 | apply plugin: 'java'  // Gradle build script for the dict_build command-line tool
 2 | apply plugin: 'maven'
 3 | apply plugin: 'eclipse'
 4 | apply plugin: 'idea'
 5 | apply plugin: 'application'  // uses mainClassName (set below) as the program entry point
 6 | 
 7 | group = 'build.dict'
 8 | version = '0.0.3'  // same version number as the dict_build-0.0.3.tar linked from the README
 9 | 
10 | sourceCompatibility = 1.8  // Java 8 source level
11 | targetCompatibility = 1.8  // Java 8 class files
12 | 
13 | mainClassName='dict.build.Main'  // src/main/java/dict/build/Main.java
14 | 
15 | repositories {  // repositories are consulted in the order listed
16 |     mavenLocal()
17 |     maven { url 'http://nexus.ufish.io/content/groups/public/' }  // NOTE(review): plain-HTTP repository; switch to HTTPS if the host supports it
18 |     mavenCentral()
19 | }
20 | dependencies {  // NOTE(review): 'compile' fits the Gradle 2.10 wrapper; the configuration was removed in Gradle 7
21 |     compile('com.google.guava:guava:17.0')
22 |     compile('com.google.code.externalsortinginjava:externalsortinginjava:0.1.9')
23 |     compile('ch.qos.logback:logback-classic:1.0.13')
24 |     compile('ch.qos.logback:logback-core:1.0.13')
25 |     compile('org.slf4j:slf4j-api:1.6.4')
26 |     compile('commons-logging:commons-logging:1.1.1')
27 |     compile('commons-cli:commons-cli:1.2')
28 |     compile('com.googlecode.concurrent-trees:concurrent-trees:2.6.0')
29 | }
30 | 


--------------------------------------------------------------------------------
/dict_build-0.0.3.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sing1ee/dict_build/426368bbfb4cc360c678cc75ab7b3ca4a926e25b/dict_build-0.0.3.tar


--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | #Tue Apr 26 11:25:52 CST 2016
2 | distributionBase=GRADLE_USER_HOME
3 | distributionPath=wrapper/dists
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 | distributionUrl=https\://services.gradle.org/distributions/gradle-2.10-bin.zip
7 | 


--------------------------------------------------------------------------------
/gradlew:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | ##############################################################################
  4 | ##
  5 | ##  Gradle start up script for UN*X
  6 | ##
  7 | ##############################################################################
  8 | 
  9 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
 10 | DEFAULT_JVM_OPTS=""
 11 | 
 12 | APP_NAME="Gradle"
 13 | APP_BASE_NAME=`basename "$0"`
 14 | 
 15 | # Use the maximum available, or set MAX_FD != -1 to use that value.
 16 | MAX_FD="maximum"
 17 | 
 18 | warn ( ) {  # print a non-fatal diagnostic
 19 |     echo "$*"
 20 | } >&2  # diagnostics belong on stderr so they never mix with regular output
 21 | 
 22 | die ( ) {
 23 |     echo
 24 |     echo "$*"
 25 |     echo
 26 |     exit 1
 27 | }
 28 | 
 29 | # OS specific support (must be 'true' or 'false').
 30 | cygwin=false
 31 | msys=false
 32 | darwin=false
 33 | case "`uname`" in
 34 |   CYGWIN* )
 35 |     cygwin=true
 36 |     ;;
 37 |   Darwin* )
 38 |     darwin=true
 39 |     ;;
 40 |   MINGW* )
 41 |     msys=true
 42 |     ;;
 43 | esac
 44 | 
 45 | # Attempt to set APP_HOME
 46 | # Resolve links: $0 may be a link
 47 | PRG="$0"
 48 | # Need this for relative symlinks.
 49 | while [ -h "$PRG" ] ; do
 50 |     ls=`ls -ld "$PRG"`
 51 |     link=`expr "$ls" : '.*-> \(.*\)$'`
 52 |     if expr "$link" : '/.*' > /dev/null; then
 53 |         PRG="$link"
 54 |     else
 55 |         PRG=`dirname "$PRG"`"/$link"
 56 |     fi
 57 | done
 58 | SAVED="`pwd`"
 59 | cd "`dirname \"$PRG\"`/" >/dev/null
 60 | APP_HOME="`pwd -P`"
 61 | cd "$SAVED" >/dev/null
 62 | 
 63 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
 64 | 
 65 | # Determine the Java command to use to start the JVM.
 66 | if [ -n "$JAVA_HOME" ] ; then
 67 |     if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
 68 |         # IBM's JDK on AIX uses strange locations for the executables
 69 |         JAVACMD="$JAVA_HOME/jre/sh/java"
 70 |     else
 71 |         JAVACMD="$JAVA_HOME/bin/java"
 72 |     fi
 73 |     if [ ! -x "$JAVACMD" ] ; then
 74 |         die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
 75 | 
 76 | Please set the JAVA_HOME variable in your environment to match the
 77 | location of your Java installation."
 78 |     fi
 79 | else
 80 |     JAVACMD="java"
 81 |     which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
 82 | 
 83 | Please set the JAVA_HOME variable in your environment to match the
 84 | location of your Java installation."
 85 | fi
 86 | 
 87 | # Increase the maximum file descriptors if we can.
 88 | if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
 89 |     MAX_FD_LIMIT=`ulimit -H -n`
 90 |     if [ $? -eq 0 ] ; then
 91 |         if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
 92 |             MAX_FD="$MAX_FD_LIMIT"
 93 |         fi
 94 |         ulimit -n $MAX_FD
 95 |         if [ $? -ne 0 ] ; then
 96 |             warn "Could not set maximum file descriptor limit: $MAX_FD"
 97 |         fi
 98 |     else
 99 |         warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
100 |     fi
101 | fi
102 | 
103 | # For Darwin, add options to specify how the application appears in the dock
104 | if $darwin; then
105 |     GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
106 | fi
107 | 
108 | # For Cygwin, switch paths to Windows format before running java
109 | if $cygwin ; then
110 |     APP_HOME=`cygpath --path --mixed "$APP_HOME"`
111 |     CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
112 |     JAVACMD=`cygpath --unix "$JAVACMD"`
113 | 
114 |     # We build the pattern for arguments to be converted via cygpath
115 |     ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
116 |     SEP=""
117 |     for dir in $ROOTDIRSRAW ; do
118 |         ROOTDIRS="$ROOTDIRS$SEP$dir"
119 |         SEP="|"
120 |     done
121 |     OURCYGPATTERN="(^($ROOTDIRS))"
122 |     # Add a user-defined pattern to the cygpath arguments
123 |     if [ "$GRADLE_CYGPATTERN" != "" ] ; then
124 |         OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
125 |     fi
126 |     # Now convert the arguments - kludge to limit ourselves to /bin/sh
127 |     i=0
128 |     for arg in "$@" ; do
129 |         CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
130 |         CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option
131 | 
132 |         if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
133 |             eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
134 |         else
135 |             eval `echo args$i`="\"$arg\""
136 |         fi
137 |         i=$((i+1))
138 |     done
139 |     case $i in
140 |         (0) set -- ;;
141 |         (1) set -- "$args0" ;;
142 |         (2) set -- "$args0" "$args1" ;;
143 |         (3) set -- "$args0" "$args1" "$args2" ;;
144 |         (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
145 |         (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
146 |         (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
147 |         (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
148 |         (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
149 |         (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
150 |     esac
151 | fi
152 | 
153 | # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
154 | function splitJvmOpts() {
155 |     JVM_OPTS=("$@")  # store the words this function was called with as the JVM_OPTS array
156 | }
157 | eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
158 | JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
159 | 
160 | exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
161 | 


--------------------------------------------------------------------------------
/gradlew.bat:
--------------------------------------------------------------------------------
 1 | @if "%DEBUG%" == "" @echo off
 2 | @rem ##########################################################################
 3 | @rem
 4 | @rem  Gradle startup script for Windows
 5 | @rem
 6 | @rem ##########################################################################
 7 | 
 8 | @rem Set local scope for the variables with windows NT shell
 9 | if "%OS%"=="Windows_NT" setlocal
10 | 
11 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
12 | set DEFAULT_JVM_OPTS=
13 | 
14 | set DIRNAME=%~dp0
15 | if "%DIRNAME%" == "" set DIRNAME=.
16 | set APP_BASE_NAME=%~n0
17 | set APP_HOME=%DIRNAME%
18 | 
19 | @rem Find java.exe
20 | if defined JAVA_HOME goto findJavaFromJavaHome
21 | 
22 | set JAVA_EXE=java.exe
23 | %JAVA_EXE% -version >NUL 2>&1
24 | if "%ERRORLEVEL%" == "0" goto init
25 | 
26 | echo.
27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28 | echo.
29 | echo Please set the JAVA_HOME variable in your environment to match the
30 | echo location of your Java installation.
31 | 
32 | goto fail
33 | 
34 | :findJavaFromJavaHome
35 | set JAVA_HOME=%JAVA_HOME:"=%
36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37 | 
38 | if exist "%JAVA_EXE%" goto init
39 | 
40 | echo.
41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42 | echo.
43 | echo Please set the JAVA_HOME variable in your environment to match the
44 | echo location of your Java installation.
45 | 
46 | goto fail
47 | 
48 | :init
49 | @rem Get command-line arguments, handling Windowz variants
50 | 
51 | if not "%OS%" == "Windows_NT" goto win9xME_args
52 | if "%@eval[2+2]" == "4" goto 4NT_args
53 | 
54 | :win9xME_args
55 | @rem Slurp the command line arguments.
56 | set CMD_LINE_ARGS=
57 | set _SKIP=2
58 | 
59 | :win9xME_args_slurp
60 | if "x%~1" == "x" goto execute
61 | 
62 | set CMD_LINE_ARGS=%*
63 | goto execute
64 | 
65 | :4NT_args
66 | @rem Get arguments from the 4NT Shell from JP Software
67 | set CMD_LINE_ARGS=%$
68 | 
69 | :execute
70 | @rem Setup the command line
71 | 
72 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
73 | 
74 | @rem Execute Gradle
75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
76 | 
77 | :end
78 | @rem End local scope for the variables with windows NT shell
79 | if "%ERRORLEVEL%"=="0" goto mainEnd
80 | 
81 | :fail
82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
83 | rem the _cmd.exe /c_ return code!
84 | if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
85 | exit /b 1
86 | 
87 | :mainEnd
88 | if "%OS%"=="Windows_NT" endlocal
89 | 
90 | :omega
91 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 2 | 	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 3 | 	<modelVersion>4.0.0</modelVersion>
 4 | 	<groupId>build.dict</groupId>
 5 | 	<version>0.0.1</version>
 6 | 
 7 | 	<packaging>jar</packaging>
 8 | 
 9 | 	<properties>
10 | 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
11 | 	</properties>
12 | 	<dependencies>
13 | 		<dependency>
14 | 			<groupId>junit</groupId>
15 | 			<artifactId>junit</artifactId>
16 | 			<version>4.13.1</version>
17 | 			<scope>test</scope>
18 | 		</dependency>
19 | 		<dependency>
20 | 			<groupId>com.google.guava</groupId>
21 | 			<artifactId>guava</artifactId>
22 | 			<version>17.0</version>
23 | 		</dependency>
24 | 		<dependency>
25 | 			<groupId>com.google.code.externalsortinginjava</groupId>
26 | 			<artifactId>externalsortinginjava</artifactId>
27 | 			<version>0.1.9</version>
28 | 		</dependency>
29 | 		<dependency>
30 | 			<groupId>ch.qos.logback</groupId>
31 | 			<artifactId>logback-classic</artifactId>
32 | 			<version>1.2.9</version> <!-- keep identical to the logback-core version -->
33 | 		</dependency>
34 | 		<dependency>
35 | 			<groupId>ch.qos.logback</groupId>
36 | 			<artifactId>logback-core</artifactId>
37 | 			<version>1.2.9</version>
38 | 		</dependency>
39 | 		<dependency>
40 | 			<groupId>org.slf4j</groupId>
41 | 			<artifactId>slf4j-api</artifactId>
42 | 			<version>1.6.4</version>
43 | 		</dependency>
44 | 		<dependency>
45 | 			<groupId>commons-logging</groupId>
46 | 			<artifactId>commons-logging</artifactId>
47 | 			<version>1.1.1</version>
48 | 		</dependency>
49 | 		<dependency>
50 | 			<groupId>commons-cli</groupId>
51 | 			<artifactId>commons-cli</artifactId>
52 | 			<version>1.2</version>
53 | 		</dependency>
54 | 		<dependency>
55 | 			<groupId>com.googlecode.concurrent-trees</groupId>
56 | 			<artifactId>concurrent-trees</artifactId>
57 | 			<version>2.6.0</version>
58 | 		</dependency>
59 | 	</dependencies>
60 | 	<build>
61 | 		<plugins>
62 | 			<plugin>
63 | 				<artifactId>maven-compiler-plugin</artifactId>
64 | 				<version>3.0</version>
65 | 				<configuration>
66 | 					<source>1.8</source>
67 | 					<target>1.8</target>
68 | 				</configuration>
69 | 			</plugin>
70 | 			<plugin>
71 | 				<groupId>org.apache.maven.plugins</groupId>
72 | 				<artifactId>maven-assembly-plugin</artifactId>
73 | 				<version>2.4</version>
74 | 				<configuration>
75 | 					<descriptorRefs>
76 | 						<descriptorRef>jar-with-dependencies</descriptorRef>
77 | 					</descriptorRefs>
78 | 
79 | 				</configuration>
80 | 				<executions>
81 | 					<execution>
82 | 						<id>assemble-all</id>
83 | 						<phase>package</phase>
84 | 						<goals>
85 | 							<goal>single</goal>
86 | 						</goals>
87 | 					</execution>
88 | 				</executions>
89 | 			</plugin>
90 | 		</plugins>
91 | 	</build>
92 | 	<artifactId>dict_build</artifactId>
93 | </project>


--------------------------------------------------------------------------------
/settings.gradle:
--------------------------------------------------------------------------------
1 | rootProject.name = 'dict_build'
2 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/DataReader.java:
--------------------------------------------------------------------------------
 1 | package com.fasterxml.sort;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | public abstract class DataReader<T> // source of items of type T, read one at a time
 6 | {
 7 |     /**
 8 |      * Reads the next data item from the input.
 9 |      *
10 |      * @return the next (non-null) item, or null once end of input is reached
11 |      */
12 |     public abstract T readNext() throws IOException;
13 | 
14 |     /**
15 |      * Estimates memory usage of the given item; the estimate serves to limit
16 |      * the amount of data kept in memory during the pre-sorting phase.
17 |      */
18 |     public abstract int estimateSizeInBytes(T item);
19 |     
20 |     /**
21 |      * Closes the reader. Implementations must tolerate repeated calls, and
22 |      * are allowed to release underlying resources on their own as soon as
23 |      * end of input has been reached.
24 |      */
25 |     public abstract void close() throws IOException;
26 | }
27 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/DataReaderFactory.java:
--------------------------------------------------------------------------------
1 | package com.fasterxml.sort;
2 | 
3 | import java.io.*;
4 | 
5 | public abstract class DataReaderFactory<T> // creates DataReader instances from raw input streams
6 | {
7 |     public abstract DataReader<T> constructReader(InputStream in) throws IOException; // reader of T items backed by 'in'
8 | }


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/DataWriter.java:
--------------------------------------------------------------------------------
 1 | package com.fasterxml.sort;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | public abstract class DataWriter<T> // destination for items of type T, written one at a time
 6 | {
 7 |     public abstract void writeEntry(T item) throws IOException; // writes a single item
 8 | 
 9 |     public abstract void close() throws IOException; // closes the writer
10 | }
11 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/DataWriterFactory.java:
--------------------------------------------------------------------------------
1 | package com.fasterxml.sort;
2 | 
3 | import java.io.*;
4 | 
5 | public abstract class DataWriterFactory<T> // creates DataWriter instances on top of raw output streams
6 | {
7 |     public abstract DataWriter<T> constructWriter(OutputStream out) throws IOException; // writer of T items backed by 'out'
8 | }
9 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/IterableSorterException.java:
--------------------------------------------------------------------------------
 1 | package com.fasterxml.sort;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.Iterator;
 5 | 
 6 | /**
 7 |  * We need an unchecked exception to work with {@link Iterator}, and
 8 |  * want a specific subtype to catch.
 9 |  */
10 | public class IterableSorterException extends RuntimeException {
11 |     private static final long serialVersionUID = 1L;
12 | 
13 |     public IterableSorterException(IOException cause) { // the wrapped IOException stays reachable via getCause()
14 |         super(cause);
15 |     }
16 | }


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/IteratingSorter.java:
--------------------------------------------------------------------------------
  1 | package com.fasterxml.sort;
  2 | 
  3 | import com.fasterxml.sort.util.CastingIterator;
  4 | import com.fasterxml.sort.util.SegmentedBuffer;
  5 | 
  6 | import java.io.Closeable;
  7 | import java.io.File;
  8 | import java.io.IOException;
  9 | import java.util.ArrayList;
 10 | import java.util.Arrays;
 11 | import java.util.Comparator;
 12 | import java.util.Iterator;
 13 | import java.util.List;
 14 | import java.util.NoSuchElementException;
 15 | 
 16 | public class IteratingSorter<T> extends SorterBase<T> implements Closeable
 17 | {
 18 |     // Set iff sort spilled to disk
 19 |     private List<File> _mergerInputs;
 20 |     private DataReader<T> _merger;
 21 | 
 22 | 
 23 |     public IteratingSorter(SortConfig config, // every argument is handed on unchanged to SorterBase
 24 |                            DataReaderFactory<T> readerFactory,
 25 |                            DataWriterFactory<T> writerFactory,
 26 |                            Comparator<T> comparator)
 27 |     {
 28 |         super(config, readerFactory, writerFactory, comparator);
 29 |     }
 30 | 
 31 |     public IteratingSorter() { // relies entirely on SorterBase's no-argument constructor
 32 |         super();
 33 |     }
 34 | 
 35 |     public IteratingSorter(SortConfig config) { // explicit configuration only; the rest is up to SorterBase
 36 |         super(config);
 37 |     }
 38 | 
 39 |     /**
 40 |      * Performs a full sort of the input data read using the given
 41 |      * {@link DataReader}.
 42 |      *
 43 |      * Conversion to and from intermediate sort files is done
 44 |      * using the {@link DataReaderFactory} and {@link DataWriterFactory} configured
 45 |      * for this sorter.
 46 |      *
 47 |      * @param inputReader source of the items to sort; closed by this method
 48 |      * @return Iterator over the sorted items, which throws {@link IterableSorterException}
 49 |      *   if an IOException occurs during {@link Iterator#next()}; null if sorting was cancelled
 50 |      * @throws IOException if reading the input or handling intermediate files fails
 51 |      */
 52 |     public Iterator<T> sort(DataReader<T> inputReader)
 53 |         throws IOException
 54 |     {
 55 |         // Release the merge reader and spill files of any previous sort
 56 |         close();
 57 | 
 58 |         // First, pre-sort:
 59 |         _phase = Phase.PRE_SORTING;
 60 |         boolean inputClosed = false; // tells the finally-block whether the input still needs closing
 61 | 
 62 |         SegmentedBuffer buffer = new SegmentedBuffer();
 63 |         _presortFileCount = 0;
 64 |         _sortRoundCount = -1;
 65 |         _currentSortRound = -1;
 66 | 
 67 |         Iterator<T> iterator = null;
 68 |         try {
 69 |             Object[] items = _readMax(inputReader, buffer, _config.getMaxMemoryUsage(), null);
 70 |             if (_checkForCancel()) {
 71 |                 close();
 72 |                 return null;
 73 |             }
 74 |             Arrays.sort(items, _rawComparator());
 75 |             T next = inputReader.readNext(); // null here means the first batch was the whole input
 76 |             /* Minor optimization: if all entries fit in the
 77 |              * in-memory sort buffer, avoid writing an intermediate file
 78 |              * and iterate over the sorted array directly.
 79 |              */
 80 |             if (next == null) {
 81 |                 inputClosed = true;
 82 |                 inputReader.close();
 83 |                 _phase = Phase.SORTING;
 84 |                 iterator = new CastingIterator<T>(Arrays.asList(items).iterator());
 85 |             } else { // but if more data than memory-buffer-full, do it right:
 86 |                 List<File> presorted = new ArrayList<File>();
 87 |                 presorted.add(_writePresorted(items));
 88 |                 items = null; // it's a big array, clear refs as early as possible
 89 |                 _presort(inputReader, buffer, next, presorted);
 90 |                 inputClosed = true;
 91 |                 inputReader.close();
 92 |                 _phase = Phase.SORTING;
 93 |                 if (_checkForCancel(presorted)) {
 94 |                     close();
 95 |                     return null;
 96 |                 }
 97 |                 _mergerInputs = presorted; // remembered so that close() can delete these files
 98 |                 _merger = _createMergeReader(merge(presorted));
 99 |                 iterator = new MergerIterator<T>(_merger);
100 |             }
101 |         } finally {
102 |             if (!inputClosed) { // reached on early return or exception: still close the input
103 |                 try {
104 |                     inputReader.close();
105 |                 } catch (IOException e) {
106 |                     // Ignore: a failure to close must not replace the original outcome
107 |                 }
108 |             }
109 |         }
110 |         if (_checkForCancel()) {
111 |             close();
112 |             return null;
113 |         }
114 |         _phase = Phase.COMPLETE;
115 |         return iterator;
116 |     }
117 | 
118 | 
119 |     /*
120 |     /**********************************************************************
121 |     /* Closeable API
122 |     /**********************************************************************
123 |     */
124 | 
125 |     @Override
126 |     public void close() {
127 |         if (_merger != null) {
128 |             try {
129 |                 _merger.close();
130 |             }
131 |             catch (IOException e) {
132 |                 // Ignore
133 |             }
134 |         }
135 |         if (_mergerInputs != null) {
136 |             for (File input : _mergerInputs) {
137 |                 input.delete();
138 |             }
139 |         }
140 |         _mergerInputs = null;
141 |         _merger = null;
142 |     }
143 | 
144 |     /*
145 |     /**********************************************************************
146 |     /* Iterator implementations
147 |     /**********************************************************************
148 |     */
149 | 
150 |     private static class MergerIterator<T> implements Iterator<T> {
151 |         private final DataReader<T> _merger;
152 |         private T _next; // look-ahead item; null once the reader has no more data
153 | 
154 |         private MergerIterator(DataReader<T> merger) throws IOException { // primes the look-ahead
155 |             _merger = merger;
156 |             _next = merger.readNext();
157 |         }
158 | 
159 |         @Override
160 |         public boolean hasNext() {
161 |             return _next != null;
162 |         }
163 | 
164 |         @Override
165 |         public T next() { // hands out the look-ahead item, then refills it from the reader
166 |             final T current = _next;
167 |             if (current == null) {
168 |                 throw new NoSuchElementException();
169 |             }
170 |             try {
171 |                 _next = _merger.readNext();
172 |                 return current;
173 |             } catch (IOException e) {
174 |                 throw new IterableSorterException(e);
175 |             }
176 |         }
177 | 
178 |         @Override
179 |         public void remove() {
180 |             throw new UnsupportedOperationException();
181 |         }
182 |     }
183 | }
184 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/Merger.java:
--------------------------------------------------------------------------------
  1 | package com.fasterxml.sort;
  2 | 
  3 | import java.io.IOException;
  4 | import java.util.*;
  5 | 
  6 | /**
  7 |  * Object used to merge items from multiple input sources into one.
  8 |  */
  9 | public abstract class Merger<T>
 10 |     extends DataReader<T>
 11 | {
 12 |     protected final Comparator<T> _comparator;
 13 | 
 14 |     /*
 15 |     /********************************************************************** 
 16 |     /* Construction
 17 |     /********************************************************************** 
 18 |      */
 19 |     
 20 |     public Merger(Comparator<T> cmp) { // keeps the comparator for subclasses to order items with
 21 |         _comparator = cmp;
 22 |     }
 23 |     
 24 |     /**
 25 |      * Combines the given readers into one through a tree of pairwise mergers;
 26 |      * a single input is returned as is, and an empty list is rejected.
 27 |      */
 28 |     public static <T> DataReader<T> mergedReader(Comparator<T> cmp, List<DataReader<T>> inputs)
 29 |         throws IOException
 30 |     {
 31 |         if (inputs.isEmpty()) {
 32 |             throw new IllegalArgumentException("Can not pass empty DataReader array");
 33 |         }
 34 |         // Collapse level by level: each pass pairs up neighbours until at most two readers remain
 35 |         List<DataReader<T>> level = inputs;
 36 |         while (level.size() > 2) {
 37 |             final int count = level.size();
 38 |             List<DataReader<T>> paired = new ArrayList<DataReader<T>>(1 + (count >> 1));
 39 |             for (int ix = 1; ix < count; ix += 2) {
 40 |                 paired.add(new PairwiseMerger<T>(cmp, level.get(ix - 1), level.get(ix)));
 41 |             }
 42 |             if ((count & 1) == 1) { // an odd reader out moves to the next level unmerged
 43 |                 paired.add(level.get(count - 1));
 44 |             }
 45 |             level = paired;
 46 |         }
 47 |         return (level.size() == 1) ? level.get(0) : new PairwiseMerger<T>(cmp, level.get(0), level.get(1));
 48 |     }
 49 | 
 50 |     /*
 51 |     /********************************************************************** 
 52 |     /* Concrete implementations
 53 |     /********************************************************************** 
 54 |      */
 55 | 
 56 |     protected static class PairwiseMerger<T>
 57 |         extends Merger<T>
 58 |     {
 59 |         protected final DataReader<T> _reader1;
 60 |         protected final DataReader<T> _reader2;
 61 | 
 62 |         protected T _data1;
 63 |         protected T _data2;
 64 | 
 65 |         protected boolean _closed;
 66 |         
 67 |         public PairwiseMerger(Comparator<T> comparator,
 68 |                 DataReader<T> reader1, DataReader<T> reader2)
 69 |             throws IOException
 70 |         {
 71 |             super(comparator);
 72 |             _reader1 = reader1;
 73 |             _data1 = reader1.readNext(); // buffer the head item of the first reader (null if it has none)
 74 |             _reader2 = reader2;
 75 |             _data2 = reader2.readNext(); // likewise for the second reader
 76 |         }
 77 | 
 78 |         /**
 79 |          * Returns whichever buffered head item the comparator orders first (the
 80 |          * first reader wins ties) and refills that side from its reader. Once
 81 |          * both sides are drained, the inputs are closed and null is returned.
 82 |          * A side that has run dry is simply skipped.
 83 |          */
 84 |         @Override
 85 |         public T readNext() throws IOException
 86 |         {
 87 |             final boolean has1 = (_data1 != null);
 88 |             final boolean has2 = (_data2 != null);
 89 |             if (!(has1 || has2)) {
 90 |                 // [Issue#8]: Should auto-close merged input when there is no more data
 91 |                 close();
 92 |                 return null;
 93 |             }
 94 |             // The comparator is only consulted while both sides still hold an item
 95 |             final boolean takeFirst = has1
 96 |                 && (!has2 || _comparator.compare(_data1, _data2) <= 0);
 97 |             final T out;
 98 |             if (takeFirst) {
 99 |                 out = _data1;
100 |                 _data1 = _reader1.readNext();
101 |             } else {
102 |                 out = _data2;
103 |                 _data2 = _reader2.readNext();
104 |             }
105 |             return out;
106 |         }
107 | 
108 |         @Override
109 |         public int estimateSizeInBytes(T item) {
110 |             // Delegates to the first reader; presumably it does not matter which reader is asked
111 |             return _reader1.estimateSizeInBytes(item);
112 |         }
113 | 
114 |         @Override
115 |         public void close() throws IOException
116 |         {
117 |             if (!_closed) {
118 |                 _reader1.close();
119 |                 _reader2.close();
120 |                 _closed = true;
121 |             }
122 |         }
123 |     }
124 | }
125 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/SortConfig.java:
--------------------------------------------------------------------------------
  1 | package com.fasterxml.sort;
  2 | 
  3 | import com.fasterxml.sort.std.StdTempFileProvider;
  4 | 
  5 | /**
  6 |  * Configuration object used for changing details of sorting
  7 |  * process. Default settings are usable, so often
  8 |  * instance is created without arguments and used as is.
  9 |  */
 10 | public class SortConfig
 11 | {
 12 |     /**
 13 |      * By default we will use 40 megs for pre-sorting.
 14 |      */
 15 |     public final static long DEFAULT_MEMORY_USAGE = 40 * 1024 * 1024;
 16 | 
 17 |     /**
 18 |      * Default merge sort is 16-way sort (using 16 input files concurrently)
 19 |      */
 20 |     public final static int DEFAULT_MERGE_FACTOR = 16;
 21 | 
 22 |     protected int _mergeFactor;
 23 | 
 24 |     protected long _maxMemoryUsage;
 25 |     
 26 |     protected TempFileProvider _tempFileProvider;
 27 | 
 28 |     /*
 29 |     /************************************************************************
 30 |     /* Construction
 31 |     /************************************************************************
 32 |      */
 33 | 
 34 |     public SortConfig() // every setting starts at its default
 35 |     {
 36 |         _mergeFactor = DEFAULT_MERGE_FACTOR;
 37 |         _maxMemoryUsage = DEFAULT_MEMORY_USAGE;
 38 |         _tempFileProvider = new StdTempFileProvider();
 39 |     }
 40 | 
 41 |     protected SortConfig(SortConfig base, int mergeFactor) { // copy of base with a different merge factor
 42 |         _maxMemoryUsage = base._maxMemoryUsage;
 43 |         _mergeFactor = mergeFactor;
 44 |         _tempFileProvider = base._tempFileProvider;
 45 |     }
 46 |     
 47 |     protected SortConfig(SortConfig base, long maxMem) { // copy of base with a different memory limit
 48 |         _maxMemoryUsage = maxMem;
 49 |         _mergeFactor = base._mergeFactor;
 50 |         _tempFileProvider = base._tempFileProvider;
 51 |     }
 52 | 
 53 |     protected SortConfig(SortConfig base, TempFileProvider prov) { // copy of base with a different temp file provider
 54 |         _mergeFactor = base._mergeFactor;
 55 |         _maxMemoryUsage = base._maxMemoryUsage;
 56 |         _tempFileProvider = prov;
 57 |     }
 58 |     
 59 |     /*
 60 |     /************************************************************************
 61 |     /* Accessors
 62 |     /************************************************************************
 63 |      */
 64 | 
 65 |     public int getMergeFactor() { return _mergeFactor; }
 66 |     
 67 |     public long getMaxMemoryUsage() { return _maxMemoryUsage; }
 68 | 
 69 |     public TempFileProvider getTempFileProvider() { return _tempFileProvider; }
 70 |     
 71 |     /*
 72 |     /************************************************************************
 73 |     /* Fluent construction methods
 74 |     /************************************************************************
 75 |      */
 76 |     
 77 |     /**
 78 |      * Fluent factory for a configuration with a different upper bound on the
 79 |      * memory used for pre-sorting. The limit is a crude approximation that
 80 |      * implementations honor on a best-effort basis.
 81 |      *
 82 |      * @param maxMem Maximum memory that pre-sort should use for in-memory sorting
 83 |      * @return This instance if the limit is unchanged; otherwise a new instance
 84 |      *   that differs only in its memory limit
 85 |      */
 86 |     public SortConfig withMaxMemoryUsage(long maxMem)
 87 |     {
 88 |         return (maxMem == _maxMemoryUsage)
 89 |             ? this
 90 |             : new SortConfig(this, maxMem);
 91 |     }
 92 | 
 93 |     /** Fluent factory: same settings with the given temp file provider; returns this instance if it is unchanged. */
 94 |     public SortConfig withTempFileProvider(TempFileProvider provider)
 95 |     {
 96 |         return (provider == _tempFileProvider)
 97 |             ? this
 98 |             : new SortConfig(this, provider);
 99 |     }
100 | 
101 | }
102 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/Sorter.java:
--------------------------------------------------------------------------------
  1 | package com.fasterxml.sort;
  2 | 
  3 | import java.io.IOException;
  4 | import java.io.InputStream;
  5 | import java.io.OutputStream;
  6 | import java.util.Comparator;
  7 | import java.util.Iterator;
  8 | 
  9 | /**
 10 |  * Main entry point for sorting functionality; object that drives
 11 |  * the sorting process from pre-sort to final output.
 12 |  * Instances are not thread-safe, although they are reusable.
 13 |  * Since the cost of creating new instances is trivial, there is usually
 14 |  * no benefit from reusing instances, other than possible convenience.
 15 |  */
 16 | public class Sorter<T> extends IteratingSorter<T>
 17 | {
 18 |     /**
 19 |      * @param config Configuration for the sorter
 20 |      * @param readerFactory Factory used for creating readers for pre-sorted data;
 21 |      *   as well as for input if an {@link InputStream} is passed as source
 22 |      * @param writerFactory Factory used for creating writers for storing pre-sorted data;
 23 |      *   as well as for results if an {@link OutputStream} is passed as destination.
 24 |      */
 25 |     public Sorter(SortConfig config,
 26 |                   DataReaderFactory<T> readerFactory,
 27 |                   DataWriterFactory<T> writerFactory,
 28 |                   Comparator<T> comparator)
 29 |     {
 30 |         super(config, readerFactory, writerFactory, comparator);
 31 |     }
 32 | 
 33 |     public Sorter() {
 34 |         super();
 35 |     }
 36 | 
 37 |     public Sorter(SortConfig config) {
 38 |         super(config);
 39 |     }
 40 | 
 41 |     protected Sorter<T> withReaderFactory(DataReaderFactory<T> f) {
 42 |         return new Sorter<T>(_config, f, _writerFactory, _comparator);
 43 |     }
 44 | 
 45 |     protected Sorter<T> withWriterFactory(DataWriterFactory<T> f) {
 46 |         return new Sorter<T>(_config, _readerFactory, f, _comparator);
 47 |     }
 48 | 
 49 |     protected Sorter<T> withComparator(Comparator<T> cmp) {
 50 |         return new Sorter<T>(_config, _readerFactory, _writerFactory, cmp);
 51 |     }
 52 | 
 53 | 
 54 |     /*
 55 |     /********************************************************************** 
 56 |     /* Main sorting API
 57 |     /********************************************************************** 
 58 |      */
 59 | 
 60 |     /**
 61 |      * Method that will perform full sort on specified input, writing results
 62 |      * into specified destination. Data conversions needed are done
 63 |      * using {@link DataReaderFactory} and {@link DataWriterFactory} configured
 64 |      * for this sorter.
 65 |      */
 66 |     public void sort(InputStream source, OutputStream destination)
 67 |         throws IOException
 68 |     {
 69 |         sort(_readerFactory.constructReader(source),
 70 |                 _writerFactory.constructWriter(destination));
 71 |     }
 72 | 
 73 |     /**
 74 |      * Method that will perform full sort on input data read using given
 75 |      * {@link DataReader}, and written out using specified {@link DataWriter}.
 76 |      * Conversions to and from intermediate sort files is done
 77 |      * using {@link DataReaderFactory} and {@link DataWriterFactory} configured
 78 |      * for this sorter.
 79 |      * 
 80 |      * @return true if sorting completed successfully; false if it was cancelled
 81 |      */
 82 |     public boolean sort(DataReader<T> inputReader, DataWriter<T> resultWriter)
 83 |         throws IOException
 84 |     {
 85 |         Iterator<T> it = super.sort(inputReader);
 86 |         if(it == null) {
 87 |             return false;
 88 |         }
 89 |         try {
 90 |             while(it.hasNext()) {
 91 |                 T value = it.next();
 92 |                 resultWriter.writeEntry(value);
 93 |             }
 94 |             resultWriter.close();
 95 |         } finally {
 96 |             super.close();
 97 |         }
 98 |         return true;
 99 |     }
100 | }
101 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/SorterBase.java:
--------------------------------------------------------------------------------
  1 | package com.fasterxml.sort;
  2 | 
  3 | import java.io.*;
  4 | import java.util.*;
  5 | import java.util.concurrent.atomic.AtomicBoolean;
  6 | 
  7 | import com.fasterxml.sort.util.SegmentedBuffer;
  8 | 
  9 | public abstract class SorterBase<T>
 10 |     implements SortingState
 11 | {
 12 |     /* each entry (in buffer) takes about 4 bytes on 32-bit machine; but let's be
 13 |      * conservative and use 8 as base, plus size of object itself.
 14 |      */
 15 |     private final static long ENTRY_SLOT_SIZE = 8L;
 16 |     
 17 |     /*
 18 |     /********************************************************************** 
 19 |     /* Configuration
 20 |     /********************************************************************** 
 21 |      */
 22 |     
 23 |     protected final SortConfig _config;
 24 |     
 25 |     /**
 26 |      * Factory used for reading intermediate sorted files.
 27 |      */
 28 |     protected DataReaderFactory<T> _readerFactory;
 29 |     
 30 |     /**
 31 |      * Factory used for writing intermediate sorted files.
 32 |      */
 33 |     protected DataWriterFactory<T> _writerFactory;
 34 | 
 35 |     /**
 36 |      * Comparator to use for sorting entries; defaults to 'C
 37 |      */
 38 |     protected Comparator<T> _comparator;
 39 |     
 40 |     /*
 41 |     /********************************************************************** 
 42 |     /* State
 43 |     /********************************************************************** 
 44 |      */
 45 |     
 46 |     protected SortingState.Phase _phase;
 47 | 
 48 |     protected int _presortFileCount;
 49 |     
 50 |     protected int _sortRoundCount;
 51 | 
 52 |     protected int _currentSortRound;
 53 |     
 54 |     protected final AtomicBoolean _cancelRequest = new AtomicBoolean(false);
 55 |     
 56 |     protected Exception _cancelForException;
 57 |     
 58 |     /*
 59 |     /********************************************************************** 
 60 |     /* Construction
 61 |     /********************************************************************** 
 62 |      */
 63 | 
 64 |     protected SorterBase(SortConfig config,
 65 |                          DataReaderFactory<T> readerFactory,
 66 |                          DataWriterFactory<T> writerFactory,
 67 |                          Comparator<T> comparator)
 68 |     {
 69 |         _config = config;
 70 | 
 71 |         _readerFactory = readerFactory;
 72 |         _writerFactory = writerFactory;
 73 |         _comparator = comparator;
 74 |         
 75 |         _phase = null;
 76 |     }
 77 | 
 78 |     protected SorterBase() {
 79 |         this(new SortConfig());
 80 |     }
 81 |     
 82 |     protected SorterBase(SortConfig config) {
 83 |         this(config, null, null, null);
 84 |     }
 85 | 
 86 |     /*
 87 |     /********************************************************************** 
 88 |     /* SortingState implementation
 89 |     /********************************************************************** 
 90 |      */
 91 |     
 92 |     @Override
 93 |     public void cancel() {
 94 |         _cancelForException = null;
 95 |         _cancelRequest.set(true);
 96 |     }
 97 | 
 98 |     @Override
 99 |     public void cancel(RuntimeException e) {
100 |         _cancelForException = e;
101 |         _cancelRequest.set(true);
102 |     }
103 |     
104 |     @Override
105 |     public void cancel(IOException e) {
106 |         _cancelForException = e;
107 |         _cancelRequest.set(true);
108 |     }
109 | 
110 |     @Override
111 |     public Phase getPhase() {
112 |         return _phase;
113 |     }
114 |     
115 |     @Override
116 |     public int getNumberOfSortRounds() {
117 |         return _sortRoundCount;
118 |     }
119 | 
120 |     @Override
121 |     public int getNumberOfPreSortFiles() {
122 |         return _presortFileCount;
123 |     }
124 |     
125 |     @Override
126 |     public int getSortRound() {
127 |         return _currentSortRound;
128 |     }
129 |     
130 |     @Override
131 |     public boolean isCompleted() {
132 |         return (_phase == SortingState.Phase.COMPLETE);
133 |     }
134 | 
135 |     @Override
136 |     public boolean isPreSorting() {
137 |         return (_phase == SortingState.Phase.PRE_SORTING);
138 |     }
139 | 
140 |     @Override
141 |     public boolean isSorting() {
142 |         return (_phase == SortingState.Phase.SORTING);
143 |     }
144 | 
145 |     /*
146 |     /********************************************************************** 
147 |     /* Internal methods, pre-sorting
148 |     /********************************************************************** 
149 |      */
150 | 
151 |     /**
152 |      * Helper method that will fill given buffer with data read using
153 |      * given reader, obeying given memory usage constraints.
154 |      */
155 |     protected Object[] _readMax(DataReader<T> inputReader, SegmentedBuffer buffer,
156 |             long memoryToUse, T firstItem)
157 |         throws IOException
158 |     {
159 |         // how much memory do we expect largest remaining entry to take?
160 |         int ptr = 0;
161 |         Object[] segment = buffer.resetAndStart();
162 |         int segmentLength = segment.length;
163 |         long minMemoryNeeded;
164 | 
165 |         if (firstItem != null) {
166 |             segment[ptr++] = firstItem;
167 |             long firstSize = ENTRY_SLOT_SIZE + inputReader.estimateSizeInBytes(firstItem);
168 |             minMemoryNeeded = Math.max(firstSize, 256L);
169 |         } else  {
170 |             minMemoryNeeded = 256L;
171 |         }
172 | 
173 |         // reduce mem amount by buffer cost too:
174 |         memoryToUse -= (ENTRY_SLOT_SIZE * segmentLength);
175 |         
176 |         while (true) {
177 |             T value = inputReader.readNext();
178 |             if (value == null) {
179 |                 break;
180 |             }
181 |             long size = inputReader.estimateSizeInBytes(value);
182 |             if (size > minMemoryNeeded) {
183 |                 minMemoryNeeded = size;
184 |             }
185 |             if (ptr >= segmentLength) {
186 |                 segment = buffer.appendCompletedChunk(segment);
187 |                 segmentLength = segment.length;
188 |                 memoryToUse -= (ENTRY_SLOT_SIZE * segmentLength);
189 |                 ptr = 0;
190 |             }
191 |             segment[ptr++] = value;
192 |             memoryToUse -= size;
193 |             if (memoryToUse < minMemoryNeeded) {
194 |                 break;
195 |             }
196 |         }
197 |         return buffer.completeAndClearBuffer(segment, ptr);
198 |     }
199 |     
200 |     protected void _presort(DataReader<T> inputReader, SegmentedBuffer buffer, T nextValue,
201 |             List<File> presorted)
202 |         throws IOException
203 |     {
204 |         do {
205 |             Object[] items = _readMax(inputReader, buffer, _config.getMaxMemoryUsage(), nextValue);
206 |             Arrays.sort(items, _rawComparator());
207 |             presorted.add(_writePresorted(items));
208 |             nextValue = inputReader.readNext();
209 |         } while (nextValue != null);
210 |     }
211 | 
212 |     @SuppressWarnings("resource")
213 |     protected File _writePresorted(Object[] items) throws IOException
214 |     {
215 |         File tmp = _config.getTempFileProvider().provide();
216 |         @SuppressWarnings("unchecked")
217 |         DataWriter<Object> writer = (DataWriter<Object>) _writerFactory.constructWriter(new FileOutputStream(tmp));
218 |         boolean closed = false;
219 |         try {
220 |             ++_presortFileCount;
221 |             for (int i = 0, end = items.length; i < end; ++i) {
222 |                 writer.writeEntry(items[i]);
223 |                 // to further reduce transient mem usage, clear out the ref
224 |                 items[i] = null;
225 |             }
226 |             closed = true;
227 |             writer.close();
228 |         } finally {
229 |             if (!closed) {
230 |                 // better swallow since most likely we are getting an exception already...
231 |                 try { writer.close(); } catch (IOException e) { }
232 |             }
233 |         }
234 |         return tmp;
235 |     }
236 |     
237 |     /*
238 |     /********************************************************************** 
239 |     /* Internal methods, sorting, output
240 |     /********************************************************************** 
241 |      */
242 | 
243 |     /**
244 |      * Main-level merge method that sorts the given input and writes to final output.
245 |      */
246 |     protected void merge(List<File> presorted, DataWriter<T> resultWriter)
247 |         throws IOException
248 |     {
249 |         List<File> inputs = merge(presorted);
250 |         // and then last around to produce the result file
251 |         _merge(inputs, resultWriter);
252 |     }
253 | 
254 |     /**
255 |      * Main-level merge method that sorts the given input.
256 |      * @return List of files that are individually sorted and ready for final merge.
257 |      */
258 |     protected List<File> merge(List<File> presorted)
259 |         throws IOException
260 |     {
261 |         // Ok, let's see how many rounds we should have...
262 |         final int mergeFactor = _config.getMergeFactor();
263 |         _sortRoundCount = _calculateRoundCount(presorted.size(), mergeFactor);
264 |         _currentSortRound = 0;
265 | 
266 |         // first intermediate rounds
267 |         List<File> inputs = presorted;
268 |         while (inputs.size() > mergeFactor) {
269 |             ArrayList<File> outputs = new ArrayList<File>(1 + ((inputs.size() + mergeFactor - 1) / mergeFactor));
270 |             for (int offset = 0, end = inputs.size(); offset < end; offset += mergeFactor) {
271 |                 int localEnd = Math.min(offset + mergeFactor, end);
272 |                 outputs.add(_merge(inputs.subList(offset, localEnd)));
273 |             }
274 |             ++_currentSortRound;
275 |             // and then switch result files to be input files
276 |             inputs = outputs;
277 |         }
278 |         return inputs;
279 |     }
280 | 
281 |     protected void _writeAll(DataWriter<T> resultWriter, Object[] items)
282 |         throws IOException
283 |     {
284 |         // need to go through acrobatics, due to type erasure... works, if ugly:
285 |         @SuppressWarnings("unchecked")
286 |         DataWriter<Object> writer = (DataWriter<Object>) resultWriter;
287 |         for (Object item : items) {
288 |             writer.writeEntry(item);
289 |         }
290 |     }
291 | 
292 |     @SuppressWarnings("resource")
293 |     protected File _merge(List<File> inputs)
294 |         throws IOException
295 |     {
296 |         File resultFile = _config.getTempFileProvider().provide();
297 |         _merge(inputs, _writerFactory.constructWriter(new FileOutputStream(resultFile)));
298 |         return resultFile;
299 |     }
300 | 
301 |     protected void _merge(List<File> inputs, DataWriter<T> writer)
302 |         throws IOException
303 |     {
304 |         DataReader<T> merger = null;
305 |         try {
306 |             merger = _createMergeReader(inputs);
307 |             T value;
308 |             while ((value = merger.readNext()) != null) {
309 |                 writer.writeEntry(value);
310 |             }
311 |             merger.close(); // usually not necessary (reader should close on eof) but...
312 |             merger = null;
313 |             writer.close();
314 |         } finally {
315 |             if (merger != null) {
316 |                 try { merger.close(); } catch (IOException e) { }
317 |             }
318 |             for (File input : inputs) {
319 |                 input.delete();
320 |             }
321 |         }
322 |     }
323 | 
324 |     protected DataReader<T> _createMergeReader(List<File> inputs) throws IOException {
325 |         ArrayList<DataReader<T>> readers = new ArrayList<DataReader<T>>(inputs.size());
326 |         for (File mergedInput : inputs) {
327 |             readers.add(_readerFactory.constructReader(new FileInputStream(mergedInput)));
328 |         }
329 |         return Merger.mergedReader(_comparator, readers);
330 |     }
331 |     
332 |     /*
333 |     /********************************************************************** 
334 |     /* Internal methods, other
335 |     /********************************************************************** 
336 |      */
337 | 
338 |     protected static int _calculateRoundCount(int files, int mergeFactor)
339 |     {
340 |         int count = 1;
341 |         while (files > mergeFactor) {
342 |             ++count;
343 |             files = (files + mergeFactor - 1) / mergeFactor;
344 |         }
345 |         return count;
346 |     }
347 |     
348 |     protected boolean _checkForCancel() throws IOException
349 |     {
350 |         return _checkForCancel(null);
351 |     }
352 | 
353 |     protected boolean _checkForCancel(Collection<File> tmpFilesToDelete) throws IOException
354 |     {
355 |         if (!_cancelRequest.get()) {
356 |             return false;
357 |         }
358 |         if (tmpFilesToDelete != null) {
359 |             for (File f : tmpFilesToDelete) {
360 |                 f.delete();
361 |             }
362 |         }
363 |         if (_cancelForException != null) {
364 |             // can only be an IOException or RuntimeException, so
365 |             if (_cancelForException instanceof RuntimeException) {
366 |                 throw (RuntimeException) _cancelForException;
367 |             }
368 |             throw (IOException) _cancelForException;
369 |         }
370 |         return true;
371 |     }
372 | 
373 |     @SuppressWarnings("unchecked")
374 |     protected Comparator<Object> _rawComparator() {
375 |         return (Comparator<Object>) _comparator;
376 |     }
377 | }
378 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/SortingState.java:
--------------------------------------------------------------------------------
 1 | package com.fasterxml.sort;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | /**
 6 |  * Interface that defines how calling application can interact with a {@link Sorter}; both
 7 |  * by accessing progress information and by requesting cancellation if necessary.
 8 |  * It is implemented by {@link Sorter}.
 9 |  */
10 | public interface SortingState
11 | {
12 |     /**
13 |      * Different phases that sorter goes through
14 |      */
15 |     public enum Phase {
16 |         PRE_SORTING,
17 |         SORTING,
18 |         COMPLETE
19 |     }
20 | 
21 |     /*
22 |     /************************************************************************
23 |     /* Accessors
24 |     /************************************************************************
25 |      */
26 | 
27 |     public Phase getPhase();
28 |     
29 |     /**
30 |      * Accessor for determining whether sorter is in its in-memory pre-sorting phase.
31 |      */
32 |     public boolean isPreSorting();
33 |     
34 |     /**
35 |      * Accessor for determining whether sorter is in regular merge-sort phase or not.
36 |      */
37 |     public boolean isSorting();
38 | 
39 |     /**
40 |      * Accessor for determining whether sorting has been successfully completed or not.
41 |      */
42 |     public boolean isCompleted();
43 | 
44 |     /**
45 |      * Accessor for checking how many pre-sort files were created during
46 |      * pre-sort phase. Can be zero if the whole data fit in memory during
47 |      * pre-sorting.
48 |      */
49 |     public int getNumberOfPreSortFiles();
50 |     
51 |     /**
52 |      * Accessor for checking which sorting round sorter is doing: for pre-sort
53 |      * it basically means number of segment (0-based) that is being processed
54 |      * in-memory, for regular sort it is number of (0-based) sorting round.
55 |      */
56 |     public int getSortRound();
57 | 
58 |     /**
59 |      * Accessor for figuring out how many regular sorting rounds need to be taken to
60 |      * complete sorting, if known. If information is not known, will return -1.
61 |      * This information generally becomes available after pre-sorting round.
62 |      */
63 |     public int getNumberOfSortRounds();
64 |     
65 |     /*
66 |     /************************************************************************
67 |     /* Cancellation
68 |     /************************************************************************
69 |      */
70 | 
71 |     /**
72 |      * Method that can be used to try to cancel executing sort operation.
73 |      * No exception will be thrown; sorting will just be stopped as soon as
74 |      * sorting thread notices request.
75 |      */
76 |     public void cancel();
77 |     
78 |     /**
79 |      * Method that can be used to try to cancel executing sort operation.
80 |      * Exception object can be specified; if non-null instance is given,
81 |      * it will be thrown to indicate erroneous result, otherwise sorting is
82 |      * just interrupted but execution returns normally.
83 |      */
84 |     public void cancel(RuntimeException e);
85 | 
86 |     /**
87 |      * Method that can be used to try to cancel executing sort operation.
88 |      * Exception object can be specified; if non-null instance is given,
89 |      * it will be thrown to indicate erroneous result, otherwise sorting is
90 |      * just interrupted but execution returns normally.
91 |      */
92 |     public void cancel(IOException e);
93 | }
94 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/TempFileProvider.java:
--------------------------------------------------------------------------------
 1 | package com.fasterxml.sort;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | 
 6 | /**
 7 |  * Interface used for object that can handle constructing of temporary files that are
 8 |  * needed during sort and non-final merge phases.
 9 |  * 
10 |  * @author tatu
11 |  *
12 |  */
13 | public interface TempFileProvider
14 | {
15 |     public File provide() throws IOException;
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/std/ByteArrayComparator.java:
--------------------------------------------------------------------------------
 1 | package com.fasterxml.sort.std;
 2 | 
 3 | import java.util.Comparator;
 4 | 
 5 | /**
 6 |  * Simple implementation of comparator for byte arrays which
 7 |  * will compare using <code>unsigned</code> byte values (meaning
 8 |  * that 0xFF is creator than 0x00, for example).
 9 |  */
10 | public class ByteArrayComparator
11 |     implements Comparator<byte[]>
12 | {
13 |     @Override
14 |     public int compare(byte[] o1, byte[] o2)
15 |     {
16 |         final int len = Math.min(o1.length, o2.length);
17 |         for (int i = 0; i < len; ++i) {
18 |             // alas, sign extension means we must do masking...
19 |             int diff = (o1[i] & 0xFF) - (o2[i] & 0xFF);
20 |             if (diff != 0) {
21 |                 return diff;
22 |             }
23 |         }
24 |         return o1.length - o2.length;
25 |     }
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/std/RawTextLineReader.java:
--------------------------------------------------------------------------------
  1 | package com.fasterxml.sort.std;
  2 | 
  3 | import java.io.*;
  4 | import java.util.Arrays;
  5 | 
  6 | import com.fasterxml.sort.*;
  7 | 
  8 | /**
  9 |  * Efficient reader for data that consists of text lines, i.e. character
 10 |  * data separated by one of standard line feeds (CR, LF or CR+LF).
 11 |  * For efficiency no decoding is done
 12 |  */
 13 | public class RawTextLineReader
 14 |     extends DataReader<byte[]>
 15 | {
 16 |     protected final static byte BYTE_CR = (byte) '\r';
 17 |     protected final static byte BYTE_LF = (byte) '\n';
 18 |     
 19 |     protected final InputStream _in;
 20 | 
 21 |     protected boolean _closed = false;
 22 |     
 23 |     protected byte[] _inputBuffer = new byte[16000];
 24 |     protected int _inputPtr = 0;
 25 |     protected int _inputEnd = 0;
 26 |     
 27 |     /**
 28 |      * Marker we set if the last line ended with a CR, since it
 29 |      * may be followed by a trailing LF as part of two-byte linefeed.
 30 |      */
 31 |     protected boolean _hadCR = false;
 32 | 
 33 |     protected ByteArrayOutputStream _tmpBytes;
 34 |     
 35 |     public RawTextLineReader(InputStream in)
 36 |     {
 37 |         _in = in;
 38 |     }
 39 | 
 40 |     /**
 41 |      * Convenience method for instantiating factory to create instances of
 42 |      * this {@link DataReader}.
 43 |      */
 44 |     public static Factory factory() {
 45 |         return new Factory();
 46 |     }    
 47 |     
 48 |     @Override
 49 |     public void close() throws IOException
 50 |     {
 51 |         if (!_closed) {
 52 |             _closed = true;
 53 |             _in.close();
 54 |         }
 55 |     }
 56 | 
 57 |     @Override
 58 |     public int estimateSizeInBytes(byte[] item)
 59 |     {
 60 |         // Wild guess: array objects take at least 8 bytes, probably 12 or 16.
 61 |         // And size of actual array storage rounded up to 4-byte alignment. So:
 62 | 
 63 |         int bytes = item.length;
 64 |         bytes = ((bytes + 3) >> 2) << 2;
 65 |         return 16 + bytes;
 66 |     }
 67 | 
 68 |     @Override
 69 |     public byte[] readNext() throws IOException
 70 |     {
 71 |         if (_closed) {
 72 |             return null;
 73 |         }
 74 |         if (_inputPtr >= _inputEnd) {
 75 |             if (!_loadMore()) {
 76 |                 close();
 77 |                 return null;
 78 |             }
 79 |         }
 80 | 
 81 |         // first thing(s) first: skip a linefeed we might have
 82 |         if (_hadCR) {
 83 |             if (!_skipLF()) {
 84 |                 return null;
 85 |             }
 86 |         }
 87 | 
 88 |         // set the start point after our call to _skipLF() so that if a linefeed is skipped, we also skip it in Arrays.copyOfRange below
 89 |         final int start = _inputPtr;
 90 | 
 91 |         // then common case: we find full row:
 92 |         final int end = _inputEnd;
 93 |         while (_inputPtr < end) {
 94 |             byte b = _inputBuffer[_inputPtr++];
 95 |             if (b == BYTE_CR || b == BYTE_LF) {
 96 |                 _hadCR = (b == BYTE_CR);
 97 |                 return Arrays.copyOfRange(_inputBuffer, start, _inputPtr-1);
 98 |             }
 99 |         }
100 |         // but if not, need to buffer
101 |         return _readNextSlow(start);
102 |     }
103 | 
104 |     protected final byte[] _readNextSlow(int start) throws IOException
105 |     {
106 |         ByteArrayOutputStream bytes = _tmpBytes;
107 |         if (bytes == null) {
108 |             _tmpBytes = bytes = new ByteArrayOutputStream();
109 |         } else {
110 |             bytes.reset();
111 |         }
112 |         // add stuff we have seen so far, and...
113 |         bytes.write(_inputBuffer, start, _inputEnd - start);
114 | 
115 |         main_loop:        
116 |         while (true) {
117 |             if (!_loadMore()) {
118 |                 close();
119 |                 break;
120 |             }
121 |             for (int i = 0, end = _inputEnd; i < end; ++i) {
122 |                 byte b = _inputBuffer[_inputPtr++];
123 |                 if (b == BYTE_CR || b == BYTE_LF) {
124 |                     _hadCR = (b == BYTE_CR);
125 |                     bytes.write(_inputBuffer, 0, _inputPtr-1);
126 |                     break main_loop;
127 |                 }
128 |             }
129 |         }
130 |         return bytes.toByteArray();
131 |     }
132 | 
133 |     /*
134 |     /**********************************************************************
135 |     /* Internal methods
136 |     /**********************************************************************
137 |      */
138 |     
139 |     protected boolean _loadMore() throws IOException
140 |     {
141 |         int count = _in.read(_inputBuffer);
142 |         if (count < 0) {
143 |             return false;
144 |         }
145 |         _inputPtr = 0;
146 |         _inputEnd = count;
147 |         return true;
148 |     }
149 | 
150 |     protected boolean _skipLF() throws IOException
151 |     {
152 |         _hadCR = false;
153 |         if (_inputBuffer[_inputPtr] == BYTE_LF) {
154 |             ++_inputPtr;
155 |             if (_inputPtr >= _inputEnd) {
156 |                 if (!_loadMore()) {
157 |                     close();
158 |                     return false;
159 |                 }
160 |             }
161 |         }
162 |         return true;
163 |     }
164 |     
165 |     /*
166 |     /**********************************************************************
167 |     /* Helper classes
168 |     /**********************************************************************
169 |      */
170 |     
171 |     public static class Factory
172 |         extends DataReaderFactory<byte[]>
173 |     {
174 |         @Override
175 |         public DataReader<byte[]> constructReader(InputStream in) {
176 |             return new RawTextLineReader(in);
177 |         }
178 |     }        
179 | }
180 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/std/RawTextLineWriter.java:
--------------------------------------------------------------------------------
  1 | package com.fasterxml.sort.std;
  2 | 
  3 | import java.io.*;
  4 | 
  5 | import com.fasterxml.sort.*;
  6 | 
  7 | public class RawTextLineWriter
  8 |     extends DataWriter<byte[]>
  9 | {
 10 |     /** Single carriage return. */
 11 |     protected final static byte[] STD_LINEFEED_CR = { '\r' };
 12 |     /** Single line feed. */
 13 |     protected final static byte[] STD_LINEFEED_LF = { '\n' };
 14 |     /** Carriage return followed by line feed. */
 15 |     protected final static byte[] STD_LINEFEED_CRLF = { '\r', '\n' };
 16 | 
 17 |     /** Linefeed used when the caller does not pick one. */
 18 |     protected final static byte[] DEFAULT_LINEFEED = STD_LINEFEED_LF;
 19 | 
 20 |     /** Stream that receives all written bytes; closed by {@link #close}. */
 21 |     protected final OutputStream _out;
 22 | 
 23 |     /**
 24 |      * Bytes appended after every entry; <code>null</code> means nothing is appended.
 25 |      */
 26 |     protected final byte[] _lf;
 27 | 
 28 |     /** Set once {@link #close} has run; later writes are rejected. */
 29 |     protected boolean _closed = false;
 30 | 
 31 |     /** Creates a writer that ends entries with {@link #DEFAULT_LINEFEED}. */
 32 |     public RawTextLineWriter(OutputStream out) {
 33 |         this(out, DEFAULT_LINEFEED);
 34 |     }
 35 | 
 36 |     /**
 37 |      * @param out Stream that receives entries
 38 |      * @param linefeed Bytes written after each entry, or <code>null</code> for none
 39 |      */
 40 |     public RawTextLineWriter(OutputStream out, byte[] linefeed)
 41 |     {
 42 |         _out = out;
 43 |         _lf = linefeed;
 44 |     }
 45 | 
 46 |     /**
 47 |      * Returns a {@link Factory} whose writers use the default linefeed.
 48 |      */
 49 |     public static Factory factory() {
 50 |         return new Factory();
 51 |     }
 52 | 
 53 |     /**
 54 |      * Returns a {@link Factory} whose writers use the given linefeed
 55 |      * (<code>null</code> for none).
 56 |      */
 57 |     public static Factory factory(byte[] linefeed) {
 58 |         return new Factory(linefeed);
 59 |     }
 60 | 
 61 |     /** Closes the underlying stream; second and later calls do nothing. */
 62 |     @Override
 63 |     public void close() throws IOException {
 64 |         if (_closed) {
 65 |             return;
 66 |         }
 67 |         _closed = true;
 68 |         _out.close();
 69 |     }
 70 | 
 71 |     /**
 72 |      * Writes the entry bytes followed by the configured linefeed, if any.
 73 |      *
 74 |      * @throws IOException if this writer was already closed, or the stream fails
 75 |      */
 76 |     @Override
 77 |     public void writeEntry(byte[] item) throws IOException
 78 |     {
 79 |         if (_closed) {
 80 |             throw new IOException("Can not write using closed DataWriter");
 81 |         }
 82 |         _out.write(item);
 83 |         if (_lf == null) {
 84 |             return;
 85 |         }
 86 |         _out.write(_lf);
 87 |     }
 88 | 
 89 |     /*
 90 |     /**********************************************************************
 91 |     /* Helper classes
 92 |     /**********************************************************************
 93 |      */
 94 | 
 95 |     /**
 96 |      * Factory for {@link RawTextLineWriter} instances. It lets callers choose
 97 |      * <ul>
 98 |      * <li>the linefeed to append (<code>null</code> for none)</li>
 99 |      * <li>whether the stream is wrapped in a {@link BufferedOutputStream}
100 |      *   (on by default)</li>
101 |      * </ul>
102 |      */
103 |     public static class Factory
104 |         extends DataWriterFactory<byte[]>
105 |     {
106 |         protected final byte[] _linefeed;
107 |         protected final boolean _addBuffering;
108 | 
109 |         public Factory() {
110 |             this(DEFAULT_LINEFEED);
111 |         }
112 | 
113 |         public Factory(byte[] linefeed) {
114 |             this(linefeed, true);
115 |         }
116 | 
117 |         public Factory(byte[] linefeed, boolean addBuffering) {
118 |             _linefeed = linefeed;
119 |             _addBuffering = addBuffering;
120 |         }
121 | 
122 |         /** Wraps <code>out</code> in a buffer when enabled and not already buffered. */
123 |         @Override
124 |         public DataWriter<byte[]> constructWriter(OutputStream out) {
125 |             OutputStream target = out;
126 |             if (_addBuffering && !(target instanceof BufferedOutputStream)) {
127 |                 target = new BufferedOutputStream(target);
128 |             }
129 |             return new RawTextLineWriter(target, _linefeed);
130 |         }
131 |     }
132 | }
133 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/std/StdComparator.java:
--------------------------------------------------------------------------------
 1 | package com.fasterxml.sort.std;
 2 | 
 3 | import java.util.Comparator;
 4 | 
 5 | /**
 6 |  * Basic comparator implementation that works on types that implement
 7 |  * {@link Comparable}.
 8 |  */
 9 | public class StdComparator<T extends Comparable<T>> implements Comparator<T>
10 | {
11 |     /**
12 |      * Compares two values by their natural order, sorting <code>null</code>
13 |      * before any non-null value.
14 |      *
15 |      * @param object1 First value, may be <code>null</code>
16 |      * @param object2 Second value, may be <code>null</code>
17 |      * @return Negative, zero or positive as <code>object1</code> sorts before,
18 |      *   equal to or after <code>object2</code>
19 |      */
20 |     @Override
21 |     public int compare(T object1, T object2) {
22 |         if (object1 == object2) return 0;
23 |         if (object1 == null) return -1;
24 |         // Mirror of the check above: without it compareTo(null) would be called,
25 |         // which throws for most Comparable types and breaks symmetry.
26 |         if (object2 == null) return 1;
27 |         return object1.compareTo(object2);
28 |     }
29 | 
30 | }
31 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/std/StdTempFileProvider.java:
--------------------------------------------------------------------------------
 1 | package com.fasterxml.sort.std;
 2 | 
 3 | import java.io.*;
 4 | 
 5 | import com.fasterxml.sort.TempFileProvider;
 6 | 
 7 | /**
 8 |  * Default {@link TempFileProvider} implementation which uses JDK default
 9 |  * temporary file generation mechanism.
10 |  * 
11 |  * @author tatu
12 |  */
13 | public class StdTempFileProvider
14 |     implements TempFileProvider
15 | {
16 |     /** Prefix given to temporary files unless another one is configured. */
17 |     public final static String DEFAULT_PREFIX = "j-merge-sort-";
18 | 
19 |     /** Suffix given to temporary files unless another one is configured. */
20 |     public final static String DEFAULT_SUFFIX = ".tmp";
21 | 
22 |     protected final String _prefix;
23 |     protected final String _suffix;
24 | 
25 |     /** Uses {@link #DEFAULT_PREFIX} and {@link #DEFAULT_SUFFIX}. */
26 |     public StdTempFileProvider() {
27 |         this(DEFAULT_PREFIX, DEFAULT_SUFFIX);
28 |     }
29 | 
30 |     /**
31 |      * @param prefix Prefix passed to {@link File#createTempFile(String, String)}
32 |      * @param suffix Suffix passed to {@link File#createTempFile(String, String)}
33 |      */
34 |     public StdTempFileProvider(String prefix, String suffix) {
35 |         _prefix = prefix;
36 |         _suffix = suffix;
37 |     }
38 | 
39 |     /**
40 |      * Creates a new temporary file and registers it for deletion at JVM exit.
41 |      */
42 |     @Override
43 |     public File provide() throws IOException
44 |     {
45 |         final File tempFile = File.createTempFile(_prefix, _suffix);
46 |         tempFile.deleteOnExit();
47 |         return tempFile;
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/std/TextFileSorter.java:
--------------------------------------------------------------------------------
 1 | package com.fasterxml.sort.std;
 2 | 
 3 | import java.io.*;
 4 | 
 5 | import com.fasterxml.sort.*;
 6 | 
 7 | /**
 8 |  * Basic {@link Sorter} implementation that operates on text line input.
 9 |  */
10 | public class TextFileSorter extends Sorter<byte[]>
11 | {
12 |     /**
13 |      * Let's limit maximum memory used for pre-sorting when invoked from command-line to be
14 |      * 256 megs
15 |      */
16 |     public final static long MAX_HEAP_FOR_PRESORT = 256L * 1024 * 1024;
17 | 
18 |     /**
19 |      * Also just in case our calculations are wrong, require 10 megs for pre-sort anyway
20 |      * (if invoked from CLI)
21 |      */
22 |     public final static long MIN_HEAP_FOR_PRESORT = 10L * 1024 * 1024;
23 |     
24 |     public TextFileSorter() {
25 |         this(new SortConfig());
26 |     }
27 |     
28 |     public TextFileSorter(SortConfig config)
29 |     {
30 |         super(config,
31 |                 RawTextLineReader.factory(), RawTextLineWriter.factory(),
32 |                 new ByteArrayComparator());
33 |     }
34 | 
35 |     /*
36 |     /********************************************************************** 
37 |     /* Main method for simple command-line operation for line-based
38 |     /* sorting using default ISO-8859-1 collation (i.e. byte-by-byte sorting)
39 |     /********************************************************************** 
40 |      */
41 |     
42 |     public static void main(String[] args) throws Exception
43 |     {
44 |         if (args.length > 1) {
45 |             System.err.println("Usage: java "+TextFileSorter.class.getName()+" [input-file]");
46 |             System.err.println("(where input-file is optional; if missing, read from STDIN)");
47 |             System.exit(1);
48 |         }
49 |         
50 |         // One more thing: use 50% of memory (but no more than 200 megs) for pre-sort
51 |         // minor tweak: consider first 40 megs to go for other overhead...
52 |         long availMem = Runtime.getRuntime().maxMemory() - (40 * 1024 * 1024);
53 |         long maxMem = (availMem >> 1);
54 |         if (maxMem > MAX_HEAP_FOR_PRESORT) {
55 |             maxMem = MAX_HEAP_FOR_PRESORT;
56 |         } else if (maxMem < MIN_HEAP_FOR_PRESORT) {
57 |             maxMem = MIN_HEAP_FOR_PRESORT;
58 |         }
59 |         final TextFileSorter sorter = new TextFileSorter(new SortConfig().withMaxMemoryUsage(maxMem));
60 |         final InputStream in;
61 |         
62 |         if (args.length == 0) {
63 |             in = System.in;
64 |         } else {
65 |             File input = new File(args[0]);
66 |             if (!input.exists() || input.isDirectory()) {
67 |                 System.err.println("File '"+input.getAbsolutePath()+"' does not exist (or is not file)");
68 |                 System.exit(2);
69 |             }
70 |             in = new FileInputStream(input);
71 |         }
72 | 
73 |         // To be able to print out progress, need to spin one additional thread...
74 |         new Thread(new Runnable() {
75 |             @Override
76 |             public void run() {
77 |                 final long start = System.currentTimeMillis();
78 |                 try {
79 |                     while (!sorter.isCompleted()) {
80 |                         Thread.sleep(5000L);
81 |                         if (sorter.isPreSorting()) {
82 |                             System.err.printf(" pre-sorting: %d files written\n", sorter.getNumberOfPreSortFiles());
83 |                         } else if (sorter.isSorting()) {
84 |                             System.err.printf(" sorting, round: %d/%d\n",
85 |                                     sorter.getSortRound(), sorter.getNumberOfSortRounds());
86 |                         }
87 |                     }
88 |                     double secs = (System.currentTimeMillis() - start) / 1000.0;
89 |                     System.err.printf("Completed: took %.1f seconds.\n", secs);
90 |                 } catch (InterruptedException e) {
91 |                     double secs = (System.currentTimeMillis() - start) / 1000.0;
92 |                     System.err.printf("[INTERRUPTED] -- took %.1f seconds.\n", secs);
93 |                 }
94 |             } 
95 |         }).start();
96 |         sorter.sort(in, System.out);
97 |     }
98 | }
99 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/util/BlockingQueueReader.java:
--------------------------------------------------------------------------------
 1 | package com.fasterxml.sort.util;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InterruptedIOException;
 5 | import java.util.concurrent.BlockingQueue;
 6 | 
 7 | import com.fasterxml.sort.DataReader;
 8 | 
 9 | /**
10 |  * Base implementation for {@link DataReader} that uses a
11 |  * {@link BlockingQueue} for getting input.
12 |  * The only missing part is implementation for
13 |  * {@link #estimateSizeInBytes(Object)}, since there is no way
14 |  * to provide a meaningful estimate without knowing object type.
15 |  */
16 | public abstract class BlockingQueueReader<E>
17 |     extends DataReader<E>
18 | {
19 |     /** Queue that entries are taken from. */
20 |     protected final BlockingQueue<E> _queue;
21 | 
22 |     /** Value (compared by identity) that marks end of input. */
23 |     protected final E _endMarker;
24 | 
25 |     /** Set by {@link #close} or when the end marker has been taken. */
26 |     protected boolean _closed;
27 | 
28 |     /**
29 |      * @deprecated Configures no end marker; prefer the constructor that
30 |      *   takes one so that end of input can be signalled through the queue.
31 |      */
32 |     @Deprecated
33 |     public BlockingQueueReader(BlockingQueue<E> q) {
34 |         this(q, null);
35 |     }
36 | 
37 |     /**
38 |      * @param q Queue to read entries from
39 |      * @param endMarker Value that is used to signal end-of-input; when this value
40 |      *   is gotten from queue, reader assumes that no more input is coming and
41 |      *   will return <code>null</code> from {@link #readNext}.
42 |      */
43 |     public BlockingQueueReader(BlockingQueue<E> q, E endMarker) {
44 |         _queue = q;
45 |         _endMarker = endMarker;
46 |     }
47 | 
48 |     /** Marks this reader closed; the queue itself is left untouched. */
49 |     @Override
50 |     public void close() throws IOException {
51 |         _closed = true;
52 |     }
53 | 
54 |     @Override
55 |     public abstract int estimateSizeInBytes(E item);
56 | 
57 |     /**
58 |      * Blocks until the queue yields an entry.
59 |      *
60 |      * @return Next entry, or <code>null</code> once this reader is closed or
61 |      *   the end marker has been taken
62 |      * @throws InterruptedIOException if the thread is interrupted while
63 |      *   waiting; the thread's interrupt status is set again before throwing
64 |      */
65 |     @Override
66 |     public E readNext() throws IOException {
67 |         if (_closed) {
68 |             return null;
69 |         }
70 |         try {
71 |             E value = _queue.take();
72 |             if (value == _endMarker) {
73 |                 _closed = true;
74 |                 return null;
75 |             }
76 |             return value;
77 |         } catch (InterruptedException e) {
78 |             // take() clears the interrupt status when it throws; restore it so
79 |             // code further up the stack can still observe the interruption
80 |             Thread.currentThread().interrupt();
81 |             InterruptedIOException ie = new InterruptedIOException();
82 |             ie.initCause(e);
83 |             throw ie;
84 |         }
85 |     }
86 | }
87 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/util/CastingIterator.java:
--------------------------------------------------------------------------------
 1 | package com.fasterxml.sort.util;
 2 | 
 3 | import java.util.Iterator;
 4 | 
 5 | /**
 6 |  * Read-only {@link Iterator} view that hands out the elements of an
 7 |  * <code>Iterator&lt;Object&gt;</code> cast (unchecked) to <code>T</code>.
 8 |  */
 9 | public class CastingIterator<T> implements Iterator<T> {
10 |     private final Iterator<Object> _source;
11 | 
12 |     public CastingIterator(Iterator<Object> it) {
13 |         this._source = it;
14 |     }
15 | 
16 |     /** Delegates to the wrapped iterator. */
17 |     @Override
18 |     public boolean hasNext() {
19 |         return _source.hasNext();
20 |     }
21 | 
22 |     /** Returns the wrapped iterator's next element, cast to <code>T</code>. */
23 |     @Override
24 |     @SuppressWarnings("unchecked")
25 |     public T next() {
26 |         final Object element = _source.next();
27 |         return (T) element;
28 |     }
29 | 
30 |     /** Removal is not supported. */
31 |     @Override
32 |     public void remove() {
33 |         throw new UnsupportedOperationException();
34 |     }
35 | }


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/util/CollectionReader.java:
--------------------------------------------------------------------------------
 1 | package com.fasterxml.sort.util;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.*;
 5 | 
 6 | import com.fasterxml.sort.DataReader;
 7 | 
 8 | /**
 9 |  * Simple {@link DataReader} implementation that can be used to
10 |  * serve items from a {@link Collection} (or {@link Iterator}).
11 |  * Note that implementation of {@link #estimateSizeInBytes} is
12 |  * naive and returns 1 for all items; it must be redefined if
13 |  * memory limits are to be enforced, or alternatively
14 |  * <code>Sorter</code> should be configured with maximum number of
15 |  * items to use as memory limit.
16 |  */
17 | public class CollectionReader<T> extends DataReader<T>
18 | {
19 |     /** Remaining items; set to <code>null</code> once exhausted. */
20 |     protected Iterator<T> _items;
21 | 
22 |     /** Serves the items of the given collection in its iteration order. */
23 |     public CollectionReader(Collection<T> items) {
24 |         this(items.iterator());
25 |     }
26 | 
27 |     /** Serves whatever the given iterator still has to offer. */
28 |     public CollectionReader(Iterator<T> items) {
29 |         _items = items;
30 |     }
31 | 
32 |     /**
33 |      * @return Next item, or <code>null</code> when there are no more
34 |      */
35 |     @Override
36 |     public T readNext()
37 |     {
38 |         final Iterator<T> remaining = _items;
39 |         if (remaining != null) {
40 |             if (remaining.hasNext()) {
41 |                 return remaining.next();
42 |             }
43 |             // exhausted: drop the iterator so later calls return right away
44 |             _items = null;
45 |         }
46 |         return null;
47 |     }
48 | 
49 |     /** Naive estimate: every item counts as one byte. */
50 |     @Override
51 |     public int estimateSizeInBytes(T item) {
52 |         return 1;
53 |     }
54 | 
55 |     /** Nothing to release. */
56 |     @Override
57 |     public void close() throws IOException {
58 |         // no-op
59 |     }
60 | 
61 | }
62 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/util/NaturalComparator.java:
--------------------------------------------------------------------------------
 1 | package com.fasterxml.sort.util;
 2 | 
 3 | import java.util.Comparator;
 4 | 
 5 | /**
 6 |  * Simple {@link Comparator} implementation that can be used
 7 |  * when items to compare have "natural" sorting order that
 8 |  * can be used via {@link Comparable} interface.
 9 |  */
10 | public class NaturalComparator<T extends Comparable<T>>
11 |     implements Comparator<T>
12 | {
13 |     /**
14 |      * Delegates to <code>arg0.compareTo(arg1)</code>; a <code>null</code>
15 |      * first argument results in a {@link NullPointerException}.
16 |      */
17 |     @Override
18 |     public int compare(T arg0, T arg1) {
19 |         final int order = arg0.compareTo(arg1);
20 |         return order;
21 |     }
22 | }
23 | 


--------------------------------------------------------------------------------
/src/main/java/com/fasterxml/sort/util/SegmentedBuffer.java:
--------------------------------------------------------------------------------
  1 | package com.fasterxml.sort.util;
  2 | 
  3 | import java.util.Arrays;
  4 | 
  5 | /**
  6 |  * Helper class used instead of a standard JDK list or buffer,
  7 |  * to avoid constant re-allocations.
  8 |  */
  9 | public class SegmentedBuffer
 10 | {
 11 |     // // // Config constants
 12 | 
 13 |     /**
 14 |      * Let's start with relatively small chunks
 15 |      */
 16 |     final static int INITIAL_CHUNK_SIZE = 1024;
 17 | 
 18 |     /**
 19 |      * Also: let's expand by doubling up until 16k entry chunks (which is 64k 
 20 |      * in size for 32-bit machines)
 21 |      */
 22 |     final static int MAX_CHUNK_SIZE = (1 << 14);
 23 | 
 24 |     // // // Data storage
 25 | 
 26 |     private Node _bufferHead;
 27 | 
 28 |     private Node _bufferTail;
 29 | 
 30 |     /**
 31 |      * Number of total buffered entries in this buffer, counting all instances
 32 |      * within linked list formed by following {@link #_bufferHead}.
 33 |      */
 34 |     private int _bufferedEntryCount;
 35 | 
 36 |     // // // Simple reuse
 37 | 
 38 |     /**
 39 |      * Reusable Object array, stored here after buffer has been released having
 40 |      * been used previously.
 41 |      */
 42 |     private Object[] _freeBuffer;
 43 | 
 44 |     /*
 45 |     /**********************************************************
 46 |     /* Construction
 47 |     /**********************************************************
 48 |      */
 49 | 
 50 |     public SegmentedBuffer() { }
 51 | 
 52 |     /*
 53 |     /**********************************************************
 54 |     /* Public API
 55 |     /**********************************************************
 56 |      */
 57 | 
 58 |     /**
 59 |      * Method called to start buffering process. Will ensure that the buffer
 60 |      * is empty, and then return an object array to start chunking content on
 61 |      */
 62 |     public Object[] resetAndStart()
 63 |     {
 64 |         if (_bufferedEntryCount > 0) {
 65 |             _reset();
 66 |         }
 67 |         if (_freeBuffer == null) {
 68 |             return new Object[INITIAL_CHUNK_SIZE];
 69 |         }
 70 |         return _freeBuffer;
 71 |     }
 72 | 
 73 |     /**
 74 |      * Method called to add a full Object array as a chunk buffered within
 75 |      * this buffer, and to obtain a new array to fill. Caller is not to use
 76 |      * the array it gives; but to use the returned array for continued
 77 |      * buffering.
 78 |      *
 79 |      * @param fullChunk Completed chunk that the caller is requesting
 80 |      *   to append to this buffer. It is generally chunk that was
 81 |      *   returned by an earlier call to {@link #resetAndStart} or
 82 |      *   {@link #appendCompletedChunk} (although this is not required or
 83 |      *   enforced)
 84 |      *
 85 |      * @return New chunk buffer for caller to fill
 86 |      */
 87 |     public Object[] appendCompletedChunk(Object[] fullChunk)
 88 |     {
 89 |         Node next = new Node(fullChunk);
 90 |         if (_bufferHead == null) { // first chunk
 91 |             _bufferHead = _bufferTail = next;
 92 |         } else { // have something already
 93 |             _bufferTail.linkNext(next);
 94 |             _bufferTail = next;
 95 |         }
 96 |         int len = fullChunk.length;
 97 |         _bufferedEntryCount += len;
 98 |         // double the size for small chunks
 99 |         if (len < MAX_CHUNK_SIZE) {
100 |             len += len;
101 |         } else { // but by +25% for larger (to limit overhead)
102 |             len += (len >> 2);
103 |         }
104 |         return new Object[len];
105 |     }
106 | 
107 |     /**
108 |      * Method called to indicate that the buffering process is now
109 |      * complete; and to construct a combined exactly-sized result
110 |      * array. Additionally the buffer itself will be reset to
111 |      * reduce memory retention.
112 |      *<p>
113 |      * Resulting array will be of generic <code>Object[]</code> type:
114 |      * if a typed array is needed, use the method with additional
115 |      * type argument.
116 |      */
117 |     public Object[] completeAndClearBuffer(Object[] lastChunk, int lastChunkEntries)
118 |     {
119 |         int totalSize = lastChunkEntries + _bufferedEntryCount;
120 |         Object[] result = new Object[totalSize];
121 |         _copyTo(result, totalSize, lastChunk, lastChunkEntries);
122 |         // [Issue-5]: should reduce mem usage here
123 |         _reset();
124 |         return result;
125 |     }
126 |         
127 |     /**
128 |      * Helper method that can be used to check how much free capacity
129 |      * will this instance start with. Can be used to choose the best
130 |      * instance to reuse, based on size of reusable object chunk
131 |      * buffer holds reference to.
132 |      */
133 |     public int initialCapacity()
134 |     {
135 |         return (_freeBuffer == null) ? 0 : _freeBuffer.length;
136 |     }
137 | 
138 |     /**
139 |      * Method that can be used to check how many Objects have been buffered
140 |      * within this buffer.
141 |      */
142 |     public int bufferedSize() { return _bufferedEntryCount; }
143 | 
144 |     /*
145 |     /**********************************************************************
146 |     /* Internal methods
147 |     /**********************************************************************
148 |      */
149 | 
150 |     private void _reset()
151 |     {
152 |         // can we reuse the last (and thereby biggest) array for next time?
153 |         if (_bufferedEntryCount > 0) {
154 |             if (_bufferTail != null) {
155 |                 Object[] obs = _bufferTail.getData();
156 |                 // also, let's clear it of contents as well, just in case
157 |                 Arrays.fill(obs, null);
158 |                 _freeBuffer = obs;
159 |             }
160 |             // either way, must discard current contents
161 |             _bufferHead = _bufferTail = null;
162 |             _bufferedEntryCount = 0;
163 |         }
164 |     }
165 | 
166 |     private final void _copyTo(Object resultArray, int totalSize,
167 |                                  Object[] lastChunk, int lastChunkEntries)
168 |     {
169 |         int ptr = 0;
170 | 
171 |         for (Node n = _bufferHead; n != null; n = n.next()) {
172 |             Object[] curr = n.getData();
173 |             int len = curr.length;
174 |             System.arraycopy(curr, 0, resultArray, ptr, len);
175 |             ptr += len;
176 |         }
177 |         System.arraycopy(lastChunk, 0, resultArray, ptr, lastChunkEntries);
178 |         ptr += lastChunkEntries;
179 | 
180 |         // sanity check (could have failed earlier due to out-of-bounds, too)
181 |         if (ptr != totalSize) {
182 |             throw new IllegalStateException("Should have gotten "+totalSize+" entries, got "+ptr);
183 |         }
184 |     }
185 | 
186 |     /*
187 |     /**********************************************************************
188 |     /* Helper classes
189 |     /**********************************************************************
190 |      */
191 | 
192 |     /**
193 |      * Helper class used to store actual data, in a linked list.
194 |      */
195 |     private final static class Node
196 |     {
197 |         /**
198 |          * Data stored in this node. Array is considered to be full.
199 |          */
200 |         private final Object[] _data;
201 | 
202 |         private Node _next;
203 | 
204 |         public Node(Object[] data) {
205 |             _data = data;
206 |         }
207 | 
208 |         public Object[] getData() { return _data; }
209 | 
210 |         public Node next() { return _next; }
211 | 
212 |         public void linkNext(Node next)
213 |         {
214 |             if (_next != null) { // sanity check
215 |                 throw new IllegalStateException();
216 |             }
217 |             _next = next;
218 |         }
219 |     }
220 | }
221 | 


--------------------------------------------------------------------------------
/src/main/java/dict.properties:
--------------------------------------------------------------------------------
1 | HELP_DESCRIPTION=list all cmd
2 | RAW_FILE=raw data file line by line
3 | SORT_MEM_SIZE_IN_MB=memory size in mb use by sorting
4 | MAX_WORD_LENGTH=max length of word
5 | OUTPUT_DICT_FILE=output result dict file


--------------------------------------------------------------------------------
/src/main/java/dict/build/Builder.java:
--------------------------------------------------------------------------------
  1 | package dict.build;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.BufferedWriter;
  5 | import java.io.File;
  6 | import java.io.FileInputStream;
  7 | import java.io.FileNotFoundException;
  8 | import java.io.IOException;
  9 | import java.io.PrintStream;
 10 | import java.util.Comparator;
 11 | import java.util.List;
 12 | import java.util.TreeMap;
 13 | 
 14 | import com.fasterxml.sort.SortConfig;
 15 | import com.fasterxml.sort.std.TextFileSorter;
 16 | import com.google.common.base.Charsets;
 17 | import com.google.common.base.Splitter;
 18 | import com.google.common.collect.Lists;
 19 | import com.google.common.io.Files;
 20 | 
 21 | /**
 22 |  * 
 23 |  * @author Jennifer
 24 |  * 
 25 |  */
 26 | public class Builder {
 27 | 
 28 | 	/**
 29 | 	 * Let's limit maximum memory used for pre-sorting when invoked from
 30 | 	 * command-line to be 2048 megs (2 GB)
 31 | 	 */
 32 | 	public final static long MAX_HEAP_FOR_PRESORT = 2048L * 1024 * 1024;
 33 | 
 34 | 	/**
 35 | 	 * Also just in case our calculations are wrong, require 10 megs for
 36 | 	 * pre-sort anyway (if invoked from CLI)
 37 | 	 */
 38 | 	public final static long MIN_HEAP_FOR_PRESORT = 10L * 1024 * 1024;
 39 | 
 40 | 	private String parse(String filepath) {
 41 | 
 42 | 		File in = new File(filepath);
 43 | 		File out = new File(in.getParentFile(), "out.data");
 44 | 
 45 | 		try (BufferedReader ir = Files.newReader(in, Charsets.UTF_8);
 46 | 				BufferedWriter ow = Files.newWriter(out, Charsets.UTF_8);) {
 47 | 			String line = null;
 48 | 			while (null != (line = ir.readLine())) {
 49 | 				String[] seg = line.split(",");
 50 | 				StringBuilder bui = new StringBuilder();
 51 | 				for (int i = 6; i < seg.length; ++i) {
 52 | 					bui.append(seg[i]);
 53 | 				}
 54 | 				bui.append("\n");
 55 | 				ow.write(bui.toString());
 56 | 			}
 57 | 		} catch (FileNotFoundException e) {
 58 | 			e.printStackTrace();
 59 | 		} catch (IOException e) {
 60 | 			e.printStackTrace();
 61 | 		}
 62 | 
 63 | 		return out.getAbsolutePath();
 64 | 	}
 65 | 
 66 | 	private String reverse(String raw) {
 67 | 		StringBuilder bui = new StringBuilder();
 68 | 		for (int i = raw.length() - 1; i >= 0; --i)
 69 | 			bui.append(raw.charAt(i));
 70 | 		return bui.toString();
 71 | 	}
 72 | 
 73 | 	public void sortFile(File in, File out, Comparator<String> cmp) {
 74 | 		try {
 75 | 			long availMem = Runtime.getRuntime().maxMemory()
 76 | 					- (2048 * 1024 * 1024);
 77 | 			long maxMem = (availMem >> 1);
 78 | 			if (maxMem > MAX_HEAP_FOR_PRESORT) {
 79 | 				maxMem = MAX_HEAP_FOR_PRESORT;
 80 | 			} else if (maxMem < MIN_HEAP_FOR_PRESORT) {
 81 | 				maxMem = MIN_HEAP_FOR_PRESORT;
 82 | 			}
 83 | 			final TextFileSorter sorter = new TextFileSorter(
 84 | 					new SortConfig().withMaxMemoryUsage(maxMem));
 85 | 			sorter.sort(new FileInputStream(in), new PrintStream(out));
 86 | 		} catch (IOException e) {
 87 | 			e.printStackTrace();
 88 | 		}
 89 | 
 90 | 	}
 91 | 
 92 | 	/**
 93 | 	 * Writes every substring (1..maxLen chars) of each reversed, "$"-delimited fragment of
 94 | 	 * rawTextFile, sorts them, and writes "ngram<TAB>score" rows where the score is
 95 | 	 * -sum(p * log p) over the continuations seen after the n-gram; memSize is not used.
 96 | 	 * @return absolute path of the sorted result file, freq_ngram_left_sort.data
 97 | 	 */
 98 | 	public String genLeft(String rawTextFile, int maxLen, int memSize) {
 99 | 		File rawFile = new File(rawTextFile);
100 | 		File dir = rawFile.getParentFile();
101 | 		File ngramFile = new File(dir, "ngram_left.data");
102 | 		File ngramSort = new File(dir, "sort_ngram_left.data");
103 | 		File ngramfreq = new File(dir, "freq_ngram_left.data");
104 | 		File ngramFreqSort = new File(dir, "freq_ngram_left_sort.data");
105 | 
106 | 		try (BufferedReader breader = Files.newReader(rawFile, Charsets.UTF_8);
107 | 				BufferedWriter writer = Files.newWriter(ngramFile,
108 | 						Charsets.UTF_8);
109 | 				BufferedWriter freqWriter = Files.newWriter(ngramfreq,
110 | 						Charsets.UTF_8);) {
111 | 			String line = null;
112 | 			while (null != (line = breader.readLine())) {
113 | 				line = line.replaceAll("\\p{Punct}", " ")
114 | 						.replaceAll("\\pP", " ").replaceAll("　", " ")
115 | 						.replaceAll("\\p{Blank}", " ")
116 | 						.replaceAll("\\p{Space}", " ")
117 | 						.replaceAll("\\p{Cntrl}", " ")
118 | 						.replaceAll("[的很了么呢是嘛]", " ");
119 | 				for (String sen : Splitter.on(" ").omitEmptyStrings()
120 | 						.splitToList(line)) {
121 | 					sen = reverse(sen.trim());
122 | 					sen = "$" + sen + "$";
123 | 					System.out.println(sen);
124 | 					System.out.println(sen.length());
125 | 					for (int i = 0; i < sen.length(); ++i) {
126 | 						for (int j = i + 1; j < i + maxLen + 1
127 | 								&& j <= sen.length(); ++j) {
128 | 							String w = sen.substring(i, j);
129 | 							writer.write(w + "\n");
130 | 						}
131 | 					}
132 | 				}
133 | 			}
134 | 			// sortFile reads ngramFile from disk while writer is still open: flush first
135 | 			writer.flush();
136 | 			sortFile(ngramFile, ngramSort, new Comparator<String>() {
137 | 				@Override
138 | 				public int compare(String o1, String o2) {
139 | 					return o1.compareTo(o2);
140 | 				}
141 | 			});
142 | 			try(BufferedReader nsr = Files.newReader(ngramSort, Charsets.UTF_8)) {
143 | 				String ngram = null;
144 | 				String curr = null;
145 | 				List<String> sameWord = Lists.newLinkedList();
146 | 				boolean pause = false;
147 | 				while (pause || null != (curr = nsr.readLine())) {
148 | 					if (null == ngram) {
149 | 						sameWord.add(curr);
150 | 						ngram = curr;
151 | 					} else {
152 | 						if (curr.startsWith(ngram)) {
153 | 							sameWord.add(curr);
154 | 							pause = false;
155 | 						} else {
156 | 							if (sameWord.isEmpty()) {
157 | 								pause = false;
158 | 								sameWord.add(curr);
159 | 								ngram = curr;
160 | 								continue;
161 | 							}
162 | 							CounterMap right = new CounterMap();
163 | 							int freq = 0;
164 | 							for (String w : sameWord) {
165 | 								if (!w.startsWith(ngram)) {
166 | 									break;
167 | 								}
168 | 								if (w.equals(ngram)) {
169 | 									continue;
170 | 								}
171 | 								++freq;
172 | 								right.incr(w.substring(ngram.length()));
173 | 							}
174 | 							double re = 0.0;
175 | 							for (String t : right.countAll().keySet()) {
176 | 								double p = right.get(t) * 1.0 / freq;
177 | 								re += -1 * p * Math.log(p);
178 | 							}
179 | 							freqWriter.write(reverse(ngram) + "\t" + re + "\n");
180 | 							List<String> newlist = Lists.newLinkedList();
181 | 							for (String w : sameWord) {
182 | 								if (!w.equals(ngram)) {
183 | 									newlist.add(w);
184 | 								}
185 | 							}
186 | 							sameWord = newlist;
187 | 							if (sameWord.isEmpty()) {
188 | 								pause = false;
189 | 								sameWord.add(curr);
190 | 								ngram = curr;
191 | 								continue;
192 | 							}
193 | 							ngram = sameWord.get(0);
194 | 							if (curr.startsWith(ngram)) {
195 | 								sameWord.add(curr);
196 | 								pause = false;
197 | 							} else {
198 | 								pause = true;
199 | 							}
200 | 						}
201 | 					}
202 | 				}
203 | 			}
204 | 			// same reason as above: the score rows must be on disk before they are sorted
205 | 			freqWriter.flush();
206 | 			sortFile(ngramfreq, ngramFreqSort, new Comparator<String>() {
207 | 				@Override
208 | 				public int compare(String o1, String o2) {
209 | 					return o1.compareTo(o2);
210 | 				}
211 | 			});
212 | 		} catch (FileNotFoundException e) {
213 | 			e.printStackTrace();
214 | 		} catch (IOException e) {
215 | 			e.printStackTrace();
216 | 		}
217 | 		return ngramFreqSort.getAbsolutePath();
218 | 	}
219 | 
220 | 	public String genFreqRight(String rawTextFile, int maxLen, int memSize) {
221 | 
222 | 		File rawFile = new File(rawTextFile);
223 | 
224 | 		File dir = rawFile.getParentFile();
225 | 
226 | 		File ngramFile = new File(dir, "ngram.data");
227 | 		File ngramSort = new File(dir, "ngram_sort.data");
228 | 		File ngramfreq = new File(dir, "freq_ngram.data");
229 | 		File ngramfreqSort = new File(dir, "freq_ngram_sort.data");
230 | 
231 | 		try (BufferedReader breader = Files.newReader(rawFile, Charsets.UTF_8);
232 | 				BufferedWriter writer = Files.newWriter(ngramFile,
233 | 						Charsets.UTF_8);
234 | 				BufferedWriter freqWriter = Files.newWriter(ngramfreq,
235 | 						Charsets.UTF_8);) {
236 | 			String line = null;
237 | 			while (null != (line = breader.readLine())) {
238 | 				line = line.replaceAll("\\p{Punct}", " ")
239 | 						.replaceAll("\\pP", " ").replaceAll("　", " ")
240 | 						.replaceAll("\\p{Blank}", " ")
241 | 						.replaceAll("\\p{Space}", " ")
242 | 						.replaceAll("\\p{Cntrl}", " ")
243 | 						.replaceAll("[的很了么呢是嘛]", " ");
244 | 				for (String sen : Splitter.on(" ").omitEmptyStrings()
245 | 						.splitToList(line)) {
246 | 					sen = sen.trim();
247 | 					sen = "$" + sen + "$";
248 | 					System.out.println(sen);
249 | 					System.out.println(sen.length());
250 | 					for (int i = 0; i < sen.length(); ++i) {
251 | 						for (int j = i + 1; j < i + maxLen + 1 && j <= sen.length(); ++j) {
252 | 							String w = sen.substring(i, j);
253 | 							writer.write(w + "\n");
254 | 						}
255 | 					}
256 | 				}
257 | 			}
258 | 			System.out.println("gen sorting...");
259 | 			sortFile(ngramFile, ngramSort, new Comparator<String>() {
260 | 
261 | 				@Override
262 | 				public int compare(String o1, String o2) {
263 | 					return o1.compareTo(o2);
264 | 				}
265 | 			});
266 | 			
267 | 			
268 | 			try(BufferedReader nsr = Files.newReader(ngramSort, Charsets.UTF_8)) {
269 | 				
270 | 				String ngram = null;
271 | 				String curr = null;
272 | 				List<String> sameWord = Lists.newLinkedList();
273 | 				boolean pause = false;
274 | 				while (pause || null != (curr = nsr.readLine())) {
275 | 					if (null == ngram) {
276 | 						sameWord.add(curr);
277 | 						ngram = curr;
278 | 					} else {
279 | 						if (curr.startsWith(ngram)) {
280 | 							sameWord.add(curr);
281 | 						} else {
282 | 							if (sameWord.isEmpty()) {
283 | 								pause = false;
284 | 								sameWord.add(curr);
285 | 								ngram = curr;
286 | 								continue;
287 | 							}
288 | 							CounterMap right = new CounterMap();
289 | 							int freq = 0;
290 | 							for (String w : sameWord) {
291 | 								if (!w.startsWith(ngram)) {
292 | 									break;
293 | 								}
294 | 								if (w.equals(ngram)) {
295 | 									continue;
296 | 								}
297 | 								++freq;
298 | 								right.incr(w.substring(ngram.length()));
299 | 							}
300 | 							double re = 0.0;
301 | 							for (String t : right.countAll().keySet()) {
302 | 								double p = right.get(t) * 1.0 / freq;
303 | 								re += -1 * p * Math.log(p);
304 | 							}
305 | 							freqWriter.write(ngram + "\t" + freq + "\t" + re + "\n");
306 | 							List<String> newlist = Lists.newLinkedList();
307 | 							for (String w : sameWord) {
308 | 								if (!w.equals(ngram)) {
309 | 									newlist.add(w);
310 | 								}
311 | 							}
312 | 							sameWord = newlist;
313 | 							if (sameWord.isEmpty()) {
314 | 								pause = false;
315 | 								sameWord.add(curr);
316 | 								ngram = curr;
317 | 								continue;
318 | 							}
319 | 							ngram = sameWord.get(0);
320 | 							if (curr.startsWith(ngram)) {
321 | 								sameWord.add(curr);
322 | 							} else {
323 | 								pause = true;
324 | 							}
325 | 						}
326 | 					}
327 | 				}
328 | 			}
329 | 
330 | 			sortFile(ngramfreq, ngramfreqSort, new Comparator<String>() {
331 | 
332 | 				@Override
333 | 				public int compare(String o1, String o2) {
334 | 					return o1.compareTo(o2);
335 | 				}
336 | 			});
337 | 		} catch (FileNotFoundException e) {
338 | 			e.printStackTrace();
339 | 		} catch (IOException e) {
340 | 			e.printStackTrace();
341 | 		}
342 | 
343 | 		return ngramfreqSort.getAbsolutePath();
344 | 	}
345 | 
346 | 	public String mergeEntropy(String freqRight, String left) {
347 | 
348 | 		// Sorter sorter = new TextFileSorter(
349 | 		// new SortConfig().withMaxMemoryUsage(1024 * 1000 * 1000));
350 | 
351 | 		File frFile = new File(freqRight);
352 | 		File lFile = new File(left);
353 | 		File mergeTmp = new File(frFile.getParentFile(), "merge.tmp");
354 | 		File mergeTmp2 = new File(frFile.getParentFile(), "merge.tmp2");
355 | 		File mergeFile = new File(frFile.getParentFile(), "merge_entropy.data");
356 | 
357 | 		try (BufferedReader rr = Files.newReader(frFile, Charsets.UTF_8);
358 | 				BufferedReader lr = Files.newReader(lFile, Charsets.UTF_8);
359 | 				BufferedWriter mw = Files.newWriter(mergeTmp, Charsets.UTF_8);
360 | 				BufferedWriter mf = Files.newWriter(mergeFile, Charsets.UTF_8);) {
361 | 			String line = null;
362 | 			while (null != (line = rr.readLine())) {
363 | 				mw.write(line + "\n");
364 | 			}
365 | 			line = null;
366 | 			while (null != (line = lr.readLine())) {
367 | 				mw.write(line + "\n");
368 | 			}
369 | 
370 | 			// sorter.sort(new FileInputStream(mergeTmp), new FileOutputStream(
371 | 			// mergeTmp2));
372 | 			sortFile(mergeTmp, mergeTmp2, new Comparator<String>() {
373 | 
374 | 				@Override
375 | 				public int compare(String o1, String o2) {
376 | 					return o1.compareTo(o2);
377 | 				}
378 | 			});
379 | 
380 | 			BufferedReader br = Files.newReader(mergeTmp2, Charsets.UTF_8);
381 | 
382 | 			String line1 = null;
383 | 			String line2 = null;
384 | 			line1 = br.readLine();
385 | 			line2 = br.readLine();
386 | 			while (true) {
387 | 
388 | 				if (null == line1 || null == line2)
389 | 					break;
390 | 				String[] seg1 = line1.split("\t");
391 | 				String[] seg2 = line2.split("\t");
392 | 				if (!seg1[0].equals(seg2[0])) {
393 | 					line1 = new String(line2.getBytes());
394 | 					line2 = br.readLine();
395 | 					continue;
396 | 				}
397 | 				if (seg1.length < 2) {
398 | 					line1 = new String(line2.getBytes());
399 | 					line2 = br.readLine();
400 | 					continue;
401 | 				}
402 | 				double le = seg1.length == 2 ? Double.parseDouble(seg1[1])
403 | 						: Double.parseDouble(seg2[1]);
404 | 				double re = seg1.length == 3 ? Double.parseDouble(seg1[2])
405 | 						: Double.parseDouble(seg2[2]);
406 | 				int freq = seg1.length == 3 ? Integer.parseInt(seg1[1])
407 | 						: Integer.parseInt(seg2[1]);
408 | 				double e = Math.min(le, re);
409 | 				mf.write(seg1[0] + "\t" + freq + "\t" + e + "\n");
410 | 
411 | 				line1 = br.readLine();
412 | 				line2 = br.readLine();
413 | 			}
414 | 
415 | 		} catch (FileNotFoundException e) {
416 | 			e.printStackTrace();
417 | 		} catch (IOException e) {
418 | 			e.printStackTrace();
419 | 		}
420 | 
421 | 		return mergeFile.toString();
422 | 	}
423 | 
424 | 	public void extractWords(String freqFile, String entropyFile) {
425 | 
426 | 		TreeMap<String, Integer> freq = new TreeMap<>();
427 | 
428 | 		File ffile = new File(freqFile);
429 | 		File efile = new File(entropyFile);
430 | 		File wfile = new File(efile.getParentFile(), "words.data");
431 | 
432 | 		try (BufferedReader fr = Files.newReader(ffile, Charsets.UTF_8);
433 | 				BufferedReader er = Files.newReader(efile, Charsets.UTF_8);
434 | 				BufferedWriter ww = Files.newWriter(wfile, Charsets.UTF_8);) {
435 | 
436 | 			String line = null;
437 | 			while (null != (line = fr.readLine())) {
438 | 				String[] seg = line.split("\t");
439 | 				if (seg.length < 3) continue;
440 | 				freq.put(seg[0], Integer.parseInt(seg[1]));
441 | 			}
442 | 			line = null;
443 | 			while (null != (line = er.readLine())) {
444 | 				String[] seg = line.split("\t");
445 | 				if (3 != seg.length)
446 | 					continue;
447 | 				String w = seg[0];
448 | 				int f = Integer.parseInt(seg[1]);
449 | 				double e = Double.parseDouble(seg[2]);
450 | 				long max = -1;
451 | 				for (int s = 1; s < w.length(); ++s) {
452 | 					String lw = w.substring(0, s);
453 | 					String rw = w.substring(s);
454 | 					if (!freq.containsKey(lw) || !freq.containsKey(rw))
455 | 						continue;
456 | 					long ff = freq.get(lw) * freq.get(rw);
457 | 					if (ff > max)
458 | 						max = ff;
459 | 				}
460 | 				double pf = f * 2000000.0 / max;
461 | 				if (pf < 10 || e < 2)
462 | 					continue;
463 | 				ww.write(w + "\t" + pf + "\t" + e + "\n");
464 | 			}
465 | 		} catch (FileNotFoundException e) {
466 | 			// TODO Auto-generated catch block
467 | 			e.printStackTrace();
468 | 		} catch (IOException e) {
469 | 			// TODO Auto-generated catch block
470 | 			e.printStackTrace();
471 | 		}
472 | 	}
473 | 
474 | 	public static void main(String[] args) {
475 | 
476 | 		Builder builder = new Builder();
477 | 
478 | 		String rawpath = builder.parse("/Users/zhangcheng/Downloads/comment/test/all.csv");
479 | //		String rawpath = "/Users/zhangcheng/Documents/workspace/python/meta_search/raw_data.txt";
480 | 		//
481 | 		String freqRight = builder.genFreqRight(rawpath, 5, 1024);
482 | 		String left = builder.genLeft(rawpath, 5, 1024);
483 | 		//
484 | 		// String freqRight =
485 | 		// "/Users/zhangcheng/Documents/workspace/python/meta_search/freq_ngram_sort.data";
486 | 		// String left =
487 | 		// "/Users/zhangcheng/Documents/workspace/python/meta_search/freq_ngram_left_sort.data";
488 | 		
489 | //		String freqRight = "/Users/zhangcheng/Downloads/comment/test/freq_ngram_sort.data";
490 | //		String left = "/Users/zhangcheng/Downloads/comment/test/freq_ngram_left_sort.data";
491 | 
492 | 		String entropyfile = builder.mergeEntropy(freqRight, left);
493 | 
494 | 		builder.extractWords(freqRight, entropyfile);
495 | 
496 | 	}
497 | }
498 | 


--------------------------------------------------------------------------------
/src/main/java/dict/build/CounterMap.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * 
 3 |  */
 4 | package dict.build;
 5 | 
 6 | import java.io.Serializable;
 7 | import java.util.Map;
 8 | import java.util.concurrent.ConcurrentHashMap;
 9 | 
/**
 * A serializable tally of occurrences per string key, backed by a
 * {@link ConcurrentHashMap}.
 *
 * @author Jennifer
 */
public class CounterMap implements Serializable {

	private static final long serialVersionUID = -3903452740943758085L;

	/** Key to current tally. */
	private Map<String, Integer> count;

	/** Creates an empty counter with the map's default capacity. */
	public CounterMap() {
		count = new ConcurrentHashMap<String, Integer>();
	}

	/**
	 * Creates an empty counter.
	 *
	 * @param capacitySize initial capacity handed to the backing map
	 */
	public CounterMap(int capacitySize) {
		count = new ConcurrentHashMap<String, Integer>(capacitySize);
	}

	/** Adds one to the tally of <code>key</code>, starting from zero if absent. */
	public void incr(String key) {
		add(key, 1);
	}

	/** Adds <code>delta</code> to the tally of <code>key</code>, starting from zero if absent. */
	public void incrby(String key, int delta) {
		add(key, delta);
	}

	/**
	 * @return the tally of <code>key</code>, or 0 when the key was never counted
	 */
	public int get(String key) {
		Integer tally = count.get(key);
		return null == tally ? 0 : tally;
	}

	/**
	 * @return the backing map itself (not a copy)
	 */
	public Map<String, Integer> countAll() {
		return count;
	}

	/** Read-then-write update shared by {@link #incr} and {@link #incrby}. */
	private void add(String key, int delta) {
		Integer current = count.get(key);
		count.put(key, null == current ? delta : current + delta);
	}
}
57 | 


--------------------------------------------------------------------------------
/src/main/java/dict/build/FastBuilder.java:
--------------------------------------------------------------------------------
  1 | package dict.build;
  2 | 
  3 | import java.io.*;
  4 | import java.util.Map;
  5 | import java.util.TreeMap;
  6 | 
  7 | import com.fasterxml.sort.SortConfig;
  8 | import com.fasterxml.sort.std.TextFileSorter;
  9 | import com.google.common.base.Charsets;
 10 | import com.google.common.base.Splitter;
 11 | import com.google.common.collect.Maps;
 12 | import com.google.common.io.Files;
 13 | import com.googlecode.concurrenttrees.radix.ConcurrentRadixTree;
 14 | import com.googlecode.concurrenttrees.radix.RadixTree;
 15 | import com.googlecode.concurrenttrees.radix.node.concrete.DefaultCharArrayNodeFactory;
 16 | import org.slf4j.Logger;
 17 | import org.slf4j.LoggerFactory;
 18 | 
 19 | /**
 20 |  * 
 21 |  * @author Jennifer
 22 |  * 
 23 |  */
 24 | public class FastBuilder {
 25 | 
 26 | 	private static final Logger LOG = LoggerFactory.getLogger(FastBuilder.class);
 27 | 
 28 | 	/**
 29 | 	 * Let's limit maximum memory used for pre-sorting when invoked from
 30 | 	 * command-line to be 256 megs
 31 | 	 */
 32 | 	public final static long MAX_HEAP_FOR_PRESORT = 256L * 1024 * 1024;
 33 | 
 34 | 	/**
 35 | 	 * Also just in case our calculations are wrong, require 10 megs for
 36 | 	 * pre-sort anyway (if invoked from CLI)
 37 | 	 */
 38 | 	public final static long MIN_HEAP_FOR_PRESORT = 10L * 1024 * 1024;
 39 | 	
 40 | 	
 41 | 	public final static String stopwords = "的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说";
 42 | 	
 43 | 	
 44 | 	/**
 45 | 	 * 输入的字符是否是汉字
 46 | 	 * @param a char
 47 | 	 * @return boolean
 48 | 	 */
 49 | 	public static boolean isChinese(char a) { 
 50 | 	     int v = (int)a; 
 51 | 	     return (v >=19968 && v <= 40869);  //  [0x4e00, 0x29fa5]	
 52 | 	}
 53 | 	
 54 | 	public static boolean allChs(String s){
 55 | 		if (null == s || "".equals(s.trim())) return false;
 56 | 		for (int i = 0; i < s.length(); i++) {
 57 | 			if (!isChinese(s.charAt(i))) return false;
 58 | 		}
 59 | 		return true;
 60 | 	}
 61 | 	
 62 | 	public TreeMap<String, double[]> loadPosprop() {
 63 | 		
 64 | 		TreeMap<String, double[]> prop = Maps.newTreeMap();
 65 | 		try {
 66 |             System.out.println(FastBuilder.class.getResourceAsStream("/pos_prop.txt"));
 67 |             BufferedReader br = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream("/pos_prop.txt"),"UTF-8"));
 68 |             String l = null;
 69 |             while (null != (l = br.readLine())) {
 70 | 				String[] seg = l.split("\t");
 71 | 				prop.put(seg[0], new double[]{Double.parseDouble(seg[1]), Double.parseDouble(seg[2]), Double.parseDouble(seg[3])});
 72 | 			}
 73 | 		} catch (IOException e) {
 74 | 			e.printStackTrace();
 75 | 		}
 76 | 		return prop;
 77 | 	}
 78 | 
 79 | 	public String parse(String filepath) {
 80 | 
 81 | 		File in = new File(filepath);
 82 | 		File out = new File(in.getParentFile(), "out.data");
 83 | 
 84 | 		try (BufferedReader ir = Files.newReader(in, Charsets.UTF_8);
 85 | 				BufferedWriter ow = Files.newWriter(out, Charsets.UTF_8);) {
 86 | 			String line = null;
 87 | 			while (null != (line = ir.readLine())) {
 88 | 				String[] seg = line.split(",");
 89 | 				StringBuilder bui = new StringBuilder();
 90 | 				for (int i = 6; i < seg.length; ++i) {
 91 | 					bui.append(seg[i]);
 92 | 				}
 93 | 				bui.append("\n");
 94 | 				ow.write(bui.toString());
 95 | 			}
 96 | 		} catch (FileNotFoundException e) {
 97 | 			e.printStackTrace();
 98 | 		} catch (IOException e) {
 99 | 			e.printStackTrace();
100 | 		}
101 | 
102 | 		return out.getAbsolutePath();
103 | 	}
104 | 
105 | 	private String reverse(String raw) {
106 | 		StringBuilder bui = new StringBuilder();
107 | 		for (int i = raw.length() - 1; i >= 0; --i)
108 | 			bui.append(raw.charAt(i));
109 | 		return bui.toString();
110 | 	}
111 | 
112 | 	public void sortFile(File in, File out) {
113 | 		try {
114 | 			long availMem = Runtime.getRuntime().maxMemory()
115 | 					- (40 * 1024 * 1024);
116 | 			long maxMem = (availMem >> 1);
117 | 			if (maxMem > MAX_HEAP_FOR_PRESORT) {
118 | 				maxMem = MAX_HEAP_FOR_PRESORT;
119 | 			} else if (maxMem < MIN_HEAP_FOR_PRESORT) {
120 | 				maxMem = MIN_HEAP_FOR_PRESORT;
121 | 			}
122 | 			final TextFileSorter sorter = new TextFileSorter(
123 | 					new SortConfig().withMaxMemoryUsage(maxMem));
124 | 			sorter.sort(new FileInputStream(in), new PrintStream(out));
125 | 		} catch (IOException e) {
126 | 			e.printStackTrace();
127 | 		}
128 | 	}
129 | 
130 | 	public String genLeft(String rawTextFile, int maxLen, int memSize) {
131 | 
132 | 		File rawFile = new File(rawTextFile);
133 | 
134 | 		File dir = rawFile.getParentFile();
135 | 
136 | 		File ngramFile = new File(dir, "ngram_left.data");
137 | 		File ngramSort = new File(dir, "sort_ngram_left.data");
138 | 		File ngramfreq = new File(dir, "freq_ngram_left.data");
139 | 		File ngramFreqSort = new File(dir, "freq_ngram_left_sort.data");
140 | 
141 | 		try (BufferedReader breader = Files.newReader(rawFile, Charsets.UTF_8);
142 | 				BufferedWriter writer = Files.newWriter(ngramFile,
143 | 						Charsets.UTF_8);
144 | 				BufferedWriter freqWriter = Files.newWriter(ngramfreq,
145 | 						Charsets.UTF_8);) {
146 | 			String line = null;
147 | 			while (null != (line = breader.readLine())) {
148 | 				line = line.replaceAll("[" + stopwords + "]", " ")
149 | 						.replaceAll("\\p{Punct}", " ")
150 | 						.replaceAll("\\pP", " ")
151 | 						.replaceAll("　", " ")
152 | 						.replaceAll("\\p{Blank}", " ")
153 | 						.replaceAll("\\p{Space}", " ")
154 | 						.replaceAll("\\p{Cntrl}", " ");
155 | 				for (String sen : Splitter.on(" ").omitEmptyStrings()
156 | 						.splitToList(line)) {
157 | 					sen = reverse(sen.trim());
158 | 					if (!allChs(sen)) continue;
159 | 					sen = "$" + sen + "$";
160 | 					for (int i = 1; i < sen.length() - 1; ++i) {
161 | 						writer.write(sen.substring(i, Math.min(maxLen + i,  sen.length())) + "\n");
162 | 					}
163 | 				}
164 | 			}
165 | 			writer.close();
166 | 			sortFile(ngramFile, ngramSort);
167 | 
168 | 			try(BufferedReader nsr = Files.newReader(ngramSort, Charsets.UTF_8)) {
169 | 				String first = null;
170 | 				String curr = null;
171 | 				Map<String, CounterMap> stat = Maps.newHashMap();
172 | 				while (null != (curr = nsr.readLine())) {
173 | 					if (null == first) {
174 | 						for (int i = 1; i < curr.length(); ++i) {
175 | 							String w = curr.substring(0, i);
176 | 							String suffix = curr.substring(i).substring(0, 1);
177 | 							if (stat.containsKey(w)) {
178 | 								stat.get(w).incr(suffix);
179 | 							} else {
180 | 								CounterMap cm = new CounterMap();
181 | 								cm.incr(suffix);
182 | 								stat.put(w, cm);
183 | 							}
184 | 						}
185 | 						first = curr.substring(0, 1);
186 | 					} else {
187 | 						if (!curr.startsWith(first)) {
188 | 
189 | 							StringBuilder builder = new StringBuilder();
190 | 							for (String w : stat.keySet()) {
191 | 								CounterMap cm = stat.get(w);
192 | 								int freq = 0;
193 | 								double re = 0;
194 | 								for (String k : cm.countAll().keySet()) {
195 | 									freq += cm.get(k);
196 | 								}
197 | 								for (String k : cm.countAll().keySet()) {
198 | 									double p = cm.get(k) * 1.0 / freq;
199 | 									re += -1 * Math.log(p) / Math.log(2) * p;
200 | 								}
201 | 								builder.append(reverse(w)).append("\t").append(re).append("\n");
202 | 							}
203 | 							freqWriter.write(builder.toString());
204 | 							stat.clear();
205 | 							first = curr.substring(0, 1);
206 | 						}
207 | 						for (int i = 1; i < curr.length(); ++i) {
208 | 							String w = curr.substring(0, i);
209 | 							String suffix = curr.substring(i).substring(0, 1);
210 | 							if (stat.containsKey(w)) {
211 | 								stat.get(w).incr(suffix);
212 | 							} else {
213 | 								CounterMap cm = new CounterMap();
214 | 								cm.incr(suffix);
215 | 								stat.put(w, cm);
216 | 							}
217 | 						}
218 | 					}
219 | 				}
220 | 				StringBuilder builder = new StringBuilder();
221 | 							for (String w : stat.keySet()) {
222 | 								CounterMap cm = stat.get(w);
223 | 								int freq = 0;
224 | 								double re = 0;
225 | 								for (String k : cm.countAll().keySet()) {
226 | 									freq += cm.get(k);
227 | 								}
228 | 								for (String k : cm.countAll().keySet()) {
229 | 									double p = cm.get(k) * 1.0 / freq;
230 | 									re += -1 * Math.log(p) / Math.log(2) * p;
231 | 								}
232 | 								builder.append(reverse(w)).append("\t").append(re).append("\n");
233 | 							}
234 | 							freqWriter.write(builder.toString());
235 | 							stat.clear();
236 | 				
237 | 				freqWriter.close();
238 | 			}
239 | 			
240 | 			sortFile(ngramfreq, ngramFreqSort);
241 | 
242 | 		} catch (FileNotFoundException e) {
243 | 			e.printStackTrace();
244 | 		} catch (IOException e) {
245 | 			e.printStackTrace();
246 | 		}
247 | 
248 | 		return ngramFreqSort.getAbsolutePath();
249 | 	}
250 | 
251 | 	public String genFreqRight(String rawTextFile, int maxLen, int memSize) {
252 | 
253 | 		File rawFile = new File(rawTextFile);
254 | 
255 | 		File dir = rawFile.getParentFile();
256 | 
257 | 		File ngramFile = new File(dir, "ngram.data");
258 | 		File ngramSort = new File(dir, "ngram_sort.data");
259 | 		File ngramfreq = new File(dir, "freq_ngram.data");
260 | 		File ngramfreqSort = new File(dir, "freq_ngram_sort.data");
261 | 
262 | 		try (BufferedReader breader = Files.newReader(rawFile, Charsets.UTF_8);
263 | 				BufferedWriter writer = Files.newWriter(ngramFile,
264 | 						Charsets.UTF_8);
265 | 				BufferedWriter freqWriter = Files.newWriter(ngramfreq,
266 | 						Charsets.UTF_8);) {
267 | 			String line = null;
268 | 			while (null != (line = breader.readLine())) {
269 | 				line = line.replaceAll("["+stopwords+"]", " ")
270 | 						.replaceAll("\\p{Punct}", " ")
271 | 						.replaceAll("\\pP", " ")
272 | 						.replaceAll("　", " ")
273 | 						.replaceAll("\\p{Blank}", " ")
274 | 						.replaceAll("\\p{Space}", " ")
275 | 						.replaceAll("\\p{Cntrl}", " ");
276 | 				for (String sen : Splitter.on(" ").omitEmptyStrings()
277 | 						.splitToList(line)) {
278 | 					sen = sen.trim();
279 | 					if (!allChs(sen)) continue;
280 | 					sen = "$" + sen + "$";
281 | 					for (int i = 1; i < sen.length() - 1; ++i) {
282 | 						writer.write(sen.substring(i, Math.min(maxLen + i,  sen.length())) + "\n");
283 | 					}
284 | 				}
285 | 			}
286 | 			writer.close();
287 | 			System.out.println("gen sorting...");
288 | 			sortFile(ngramFile, ngramSort);
289 | 			
290 | 			try(BufferedReader nsr = Files.newReader(ngramSort, Charsets.UTF_8)) {
291 | 				String first = null;
292 | 				String curr = null;
293 | 				Map<String, CounterMap> stat = Maps.newHashMap();
294 | 				while (null != (curr = nsr.readLine())) {
295 | 					if (null == first) {
296 | 						for (int i = 1; i < curr.length(); ++i) {
297 | 							String w = curr.substring(0, i);
298 | 							String suffix = curr.substring(i).substring(0, 1);
299 | 							if (stat.containsKey(w)) {
300 | 								stat.get(w).incr(suffix);
301 | 							} else {
302 | 								CounterMap cm = new CounterMap();
303 | 								cm.incr(suffix);
304 | 								stat.put(w, cm);
305 | 							}
306 | 						}
307 | 						first = curr.substring(0, 1);
308 | 					} else {
309 | 						if (!curr.startsWith(first)) {
310 | 
311 | 							StringBuilder builder = new StringBuilder();
312 | 							for (String w : stat.keySet()) {
313 | 								CounterMap cm = stat.get(w);
314 | 								int freq = 0;
315 | 								double re = 0;
316 | 								for (String k : cm.countAll().keySet()) {
317 | 									freq += cm.get(k);
318 | 								}
319 | 								for (String k : cm.countAll().keySet()) {
320 | 									double p = cm.get(k) * 1.0 / freq;
321 | 									re += -1 * Math.log(p) / Math.log(2) * p;
322 | 								}
323 | 								builder.append(w).append("\t").append(freq).append("\t").append(re).append("\n");
324 | 							}
325 | 							freqWriter.write(builder.toString());
326 | 							stat.clear();
327 | 							first = curr.substring(0, 1);
328 | 						}
329 | 						for (int i = 1; i < curr.length(); ++i) {
330 | 							String w = curr.substring(0, i);
331 | 							String suffix = curr.substring(i).substring(0, 1);
332 | 							if (stat.containsKey(w)) {
333 | 								stat.get(w).incr(suffix);
334 | 							} else {
335 | 								CounterMap cm = new CounterMap();
336 | 								cm.incr(suffix);
337 | 								stat.put(w, cm);
338 | 							}
339 | 						}
340 | 					}
341 | 				}
342 | 				StringBuilder builder = new StringBuilder();
343 | 							for (String w : stat.keySet()) {
344 | 								CounterMap cm = stat.get(w);
345 | 								int freq = 0;
346 | 								double re = 0;
347 | 								for (String k : cm.countAll().keySet()) {
348 | 									freq += cm.get(k);
349 | 								}
350 | 								for (String k : cm.countAll().keySet()) {
351 | 									double p = cm.get(k) * 1.0 / freq;
352 | 									re += -1 * Math.log(p) / Math.log(2) * p;
353 | 								}
354 | 								builder.append(w).append("\t").append(freq).append("\t").append(re).append("\n");
355 | 							}
356 | 							freqWriter.write(builder.toString());
357 | 							stat.clear();
358 | 				freqWriter.close();
359 | 			}
360 | 			
361 | 			sortFile(ngramfreq, ngramfreqSort);
362 | 
363 | 		} catch (FileNotFoundException e) {
364 | 			e.printStackTrace();
365 | 		} catch (IOException e) {
366 | 			e.printStackTrace();
367 | 		}
368 | 
369 | 		return ngramfreqSort.getAbsolutePath();
370 | 	}
371 | 
372 | 	public String mergeEntropy(String freqRight, String left) {
373 | 
374 | 
375 | 		File frFile = new File(freqRight);
376 | 		File lFile = new File(left);
377 | 		File mergeTmp = new File(frFile.getParentFile(), "merge.tmp");
378 | 		File mergeTmp2 = new File(frFile.getParentFile(), "merge.tmp2");
379 | 		File mergeFile = new File(frFile.getParentFile(), "merge_entropy.data");
380 | 
381 | 		try (BufferedReader rr = Files.newReader(frFile, Charsets.UTF_8);
382 | 				BufferedReader lr = Files.newReader(lFile, Charsets.UTF_8);
383 | 				BufferedWriter mw = Files.newWriter(mergeTmp, Charsets.UTF_8);
384 | 				BufferedWriter mf = Files.newWriter(mergeFile, Charsets.UTF_8);) {
385 | 			String line = null;
386 | 			while (null != (line = rr.readLine())) {
387 | 				mw.write(line + "\n");
388 | 			}
389 | 			line = null;
390 | 			while (null != (line = lr.readLine())) {
391 | 				mw.write(line + "\n");
392 | 			}
393 | 			mw.close();
394 | 
395 | 			sortFile(mergeTmp, mergeTmp2);
396 | 
397 | 			BufferedReader br = Files.newReader(mergeTmp2, Charsets.UTF_8);
398 | 
399 | 			String line1 = null;
400 | 			String line2 = null;
401 | 			line1 = br.readLine();
402 | 			line2 = br.readLine();
403 | 			while (true) {
404 | 
405 | 				if (null == line1 || null == line2)
406 | 					break;
407 | 				String[] seg1 = line1.split("\t");
408 | 				String[] seg2 = line2.split("\t");
409 | 				if (!seg1[0].equals(seg2[0])) {
410 | 					line1 = new String(line2.getBytes());
411 | 					line2 = br.readLine();
412 | 					continue;
413 | 				}
414 | 				if (seg1.length < 2) {
415 | 					line1 = new String(line2.getBytes());
416 | 					line2 = br.readLine();
417 | 					continue;
418 | 				}
419 | 				line1 = br.readLine();
420 | 				line2 = br.readLine();
421 | 				
422 | 				if (seg1.length < 3 && seg2.length < 3) 
423 | 					continue;
424 | 				double le = seg1.length == 2 ? Double.parseDouble(seg1[1])
425 | 						: Double.parseDouble(seg2[1]);
426 | 				double re = seg1.length == 3 ? Double.parseDouble(seg1[2])
427 | 						: Double.parseDouble(seg2[2]);
428 | 				int freq = seg1.length == 3 ? Integer.parseInt(seg1[1])
429 | 						: Integer.parseInt(seg2[1]);
430 | 				double e = Math.min(le, re);
431 | 				mf.write(seg1[0] + "\t" + freq + "\t" + e + "\n");
432 | 
433 | 			}
434 | 			mf.close();
435 | 
436 | 		} catch (FileNotFoundException e) {
437 | 			e.printStackTrace();
438 | 		} catch (IOException e) {
439 | 			e.printStackTrace();
440 | 		}
441 | 
442 | 		return mergeFile.toString();
443 | 	}
444 | 	
445 | 	public static boolean allLetterOrNumber(String w) {
446 | 
447 | 		for (char c : w.toLowerCase().toCharArray()) {
448 | 			boolean letter = c >= 'a' && c <= 'z';
449 | 			boolean digit = c >= '0' && c <= '9';
450 | 			if (!letter && !digit) return false;
451 | 		}
452 | 		return true;
453 | 	}
454 | 
455 | 	public void extractWords(String freqFile, String entropyFile) {
456 | 
457 | 		LOG.info("start to extract words");
458 | 		
459 | 		TreeMap<String, double[]> posProp = this.loadPosprop();
460 | 
461 | 		RadixTree<Integer> tree = new ConcurrentRadixTree<Integer>(new DefaultCharArrayNodeFactory());
462 | 
463 | 		File ffile = new File(freqFile);
464 | 		File efile = new File(entropyFile);
465 | 		File wfile = new File(efile.getParentFile(), "words.data");
466 | 		File wsfile = new File(efile.getParentFile(), "words_sort.data");
467 | 
468 | 		try (BufferedReader fr = Files.newReader(ffile, Charsets.UTF_8);
469 | 				BufferedReader er = Files.newReader(efile, Charsets.UTF_8);
470 | 				BufferedWriter ww = Files.newWriter(wfile, Charsets.UTF_8);) {
471 | 
472 | 			String line = null;
473 | 			long total = 0;
474 | 			long epoch = 0;
475 | 			while (null != (line = fr.readLine())) {
476 | 				String[] seg = line.split("\t");
477 | 				if (seg.length < 3) continue;
478 | 				tree.put(seg[0], Integer.parseInt(seg[1]));
479 | 				epoch += 1;
480 | 				//all single char's frequency
481 | 				if(seg[0].length()<2) total += Integer.parseInt(seg[1]);
482 | 				if (epoch % 1000 == 0) {
483 | 					LOG.info("load freq to radix tree done: " + total);
484 | 				}
485 | 			}
486 | 			LOG.info("build freq TST done!");
487 | 			line = null;
488 | 			int cnt = 0;
489 | 			while (null != (line = er.readLine())) {
490 | 				cnt += 1;
491 | 				if (cnt % 1000 == 0) {
492 | 					LOG.info("extract words done: " + cnt);
493 | 				}
494 | 				String[] seg = line.split("\t");
495 | 				if (3 != seg.length)
496 | 					continue;
497 | 				String w = seg[0];
498 | 				if (allLetterOrNumber(w)) {
499 | 					continue;
500 | 				}
501 | 				int f = Integer.parseInt(seg[1]);
502 | 				double e = Double.parseDouble(seg[2]);
503 | 				long max = -1;
504 | 				for (int s = 1; s < w.length(); ++s) {
505 | 					String lw = w.substring(0, s);
506 | 					String rw = w.substring(s);
507 | 					Integer lfObj = tree.getValueForExactKey(lw);
508 | 					Integer rfObj = tree.getValueForExactKey(rw);
509 | 					long lf = -1;
510 | 					long rf = -1;
511 | 					if (null != lfObj) {
512 | 						lf = lfObj.intValue();
513 | 					}
514 | 					if (null != rfObj) {
515 | 						rf = rfObj.intValue();
516 | 					}
517 | 					if (-1 == lf || -1 == rf) continue;
518 | 					
519 | 					long ff = lf * rf;
520 | 					if (ff > max)
521 | 						max = ff;
522 | 				}
523 | 				double pf = f * total / max;
524 | 				double pmi = Math.log(pf) / Math.log(2);
525 | 				if (Double.isNaN(pmi)) continue;
526 | 				double pp = -1;
527 | 				if (null != posProp.get(w.subSequence(0, 1)) && null != posProp.get(w.subSequence(w.length() - 1, w.length())))
528 | 					pp = Math.min(posProp.get(w.subSequence(0, 1))[0], posProp.get(w.subSequence(w.length() - 1, w.length()))[2]);
529 | 				if (pmi < 1 || e < 2 || pp < 0.1)
530 | 					continue;
531 | 				ww.write(w + "\t" + f + "\t" + pmi + "\t" + e + "\t"  + pp + "\n");
532 | 
533 | 			}
534 | 			ww.close();
535 | 			LOG.info("start to sort extracted words");
536 | 			try {
537 | 				long availMem = Runtime.getRuntime().maxMemory() - (2048 * 1024 * 1024);
538 | 				long maxMem = (availMem >> 1);
539 | 				if (maxMem > MAX_HEAP_FOR_PRESORT) {
540 | 					maxMem = MAX_HEAP_FOR_PRESORT;
541 | 				} else if (maxMem < MIN_HEAP_FOR_PRESORT) {
542 | 					maxMem = MIN_HEAP_FOR_PRESORT;
543 | 				}
544 | 				final SplitFileSorter sorter = new SplitFileSorter(new SortConfig().withMaxMemoryUsage(maxMem));
545 | 				sorter.sort(new FileInputStream(wfile), new PrintStream(wsfile));
546 | 			} catch (IOException e) {
547 | 				e.printStackTrace();
548 | 			}
549 | 
550 | 			LOG.info("all done");
551 | 			
552 | 		} catch (FileNotFoundException e) {
553 | 			e.printStackTrace();
554 | 		} catch (IOException e) {
555 | 			e.printStackTrace();
556 | 		}
557 | 	}
558 | }
559 | 


--------------------------------------------------------------------------------
/src/main/java/dict/build/LineReader.java:
--------------------------------------------------------------------------------
 1 | package dict.build;
 2 | 
 3 | import java.io.*;
 4 | import java.util.Arrays;
 5 | 
 6 | import com.fasterxml.sort.*;
 7 | import com.google.common.base.Charsets;
 8 | 
 9 | /**
10 |  * {@link DataReader} that hands out its input one text line at a time.
11 |  * Input bytes are decoded as UTF-8 and split into lines by
12 |  * {@link BufferedReader#readLine()}; line terminators are not part of the items.
13 |  */
14 | public class LineReader
15 |     extends DataReader<String>
16 | {
17 |     /** Buffered UTF-8 view of the stream passed to the constructor. */
18 |     protected final BufferedReader _br;
19 | 
20 |     /** @param in stream to read lines from; closed by {@link #close()} */
21 |     public LineReader(InputStream in)
22 |     {
23 |         _br = new BufferedReader(new InputStreamReader(in, Charsets.UTF_8));
24 |     }
25 | 
26 |     /**
27 |      * Convenience method for instantiating factory to create instances of
28 |      * this {@link DataReader}.
29 |      */
30 |     public static Factory factory() {
31 |         return new Factory();
32 |     }
33 | 
34 |     @Override
35 |     public void close() throws IOException
36 |     {
37 |         _br.close();
38 |     }
39 | 
40 |     @Override
41 |     public int estimateSizeInBytes(String item)
42 |     {
43 |         // Rough guess: 16 bytes of object overhead plus the encoded length
44 |         // rounded up to a multiple of 4. The charset is named explicitly so
45 |         // the estimate does not vary with the platform default encoding.
46 |         int bytes = item.getBytes(Charsets.UTF_8).length;
47 |         bytes = ((bytes + 3) >> 2) << 2;
48 |         return 16 + bytes;
49 |     }
50 | 
51 |     /** @return the next line without its terminator, or null at end of input */
52 |     @Override
53 |     public String readNext() throws IOException
54 |     {
55 |         return _br.readLine();
56 |     }
57 | 
58 |     /*
59 |     /**********************************************************************
60 |     /* Helper classes
61 |     /**********************************************************************
62 |      */
63 |     /** {@link DataReaderFactory} that wraps each stream in a {@link LineReader}. */
64 |     public static class Factory
65 |         extends DataReaderFactory<String>
66 |     {
67 |         @Override
68 |         public DataReader<String> constructReader(InputStream in) {
69 |             return new LineReader(in);
70 |         }
71 |     }
72 | }
73 | 


--------------------------------------------------------------------------------
/src/main/java/dict/build/LineWriter.java:
--------------------------------------------------------------------------------
 1 | package dict.build;
 2 | 
 3 | import java.io.*;
 4 | 
 5 | import com.fasterxml.sort.*;
 6 | import com.google.common.base.Charsets;
 7 | 
 8 | public class LineWriter
 9 |     extends DataWriter<String>
10 | {
11 |     /** Buffered UTF-8 writer layered over the target stream. */
12 |     protected final BufferedWriter _out;
13 | 
14 |     /** @param out destination stream; closed by {@link #close()} */
15 |     public LineWriter(OutputStream out) {
16 |         OutputStreamWriter encoder = new OutputStreamWriter(out, Charsets.UTF_8);
17 |         _out = new BufferedWriter(encoder);
18 |     }
19 | 
20 |     /** Convenience accessor that creates a new {@link Factory}. */
21 |     public static Factory factory() {
22 |         return new Factory();
23 |     }
24 | 
25 |     @Override
26 |     public void close() throws IOException {
27 |         _out.close();
28 |     }
29 | 
30 |     /** Emits the item followed by a single LF character. */
31 |     @Override
32 |     public void writeEntry(String item) throws IOException
33 |     {
34 |         // valueOf mirrors string concatenation: a null item is written as "null"
35 |         _out.write(String.valueOf(item));
36 |         _out.write('\n');
37 |     }
38 | 
39 |     /*
40 |     /**********************************************************************
41 |     /* Helper classes
42 |     /**********************************************************************
43 |      */
44 | 
45 |     /**
46 |      * Factory that wraps every target stream in a new {@link LineWriter}.
47 |      * It has no configuration of its own: the line feed is always LF and
48 |      * the output is always buffered.
49 |      */
50 |     public static class Factory
51 |         extends DataWriterFactory<String>
52 |     {
53 |         /** Nothing to initialise. */
54 |         public Factory() {
55 |         }
56 | 
57 |         @Override
58 |         public DataWriter<String> constructWriter(OutputStream out) {
59 |             return new LineWriter(out);
60 |         }
61 |     }
62 | }
63 | 


--------------------------------------------------------------------------------
/src/main/java/dict/build/Main.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * 
 3 |  */
 4 | package dict.build;
 5 | 
 6 | /**
 7 |  * Command-line entry point: runs the {@link FastBuilder} steps in order
 8 |  * for one raw input path.
 9 |  *
10 |  * @author zhangcheng
11 |  */
12 | public class Main {
13 | 
14 | 	/**
15 | 	 * Prints the name of the expected argument and returns when called
16 | 	 * without arguments; otherwise feeds <code>args[0]</code> through
17 | 	 * genFreqRight, genLeft, mergeEntropy and extractWords.
18 | 	 *
19 | 	 * @param args <code>args[0]</code> is the raw data path; any further
20 | 	 *            arguments are ignored
21 | 	 */
22 | 	public static void main(String[] args) {
23 | 
24 | 		if (args.length == 0) {
25 | 			System.out.println("rawpath");
26 | 			return;
27 | 		}
28 | 		// Past the guard args[0] always exists, and every intermediate file
29 | 		// has to be generated, so no null checks are needed.
30 | 		final String rawpath = args[0];
31 | 
32 | 		FastBuilder builder = new FastBuilder();
33 | 
34 | 		// NOTE(review): 6 and 10 * 1024 are passed through unchanged; they
35 | 		// look like a max word length and a memory budget -- confirm.
36 | 		String right = builder.genFreqRight(rawpath, 6, 10 * 1024);
37 | 		String left = builder.genLeft(rawpath, 6, 10 * 1024);
38 | 		String entropyfile = builder.mergeEntropy(right, left);
39 | 
40 | 		builder.extractWords(right, entropyfile);
41 | 	}
42 | }
43 | 


--------------------------------------------------------------------------------
/src/main/java/dict/build/PosProbability.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * 
 3 |  */
 4 | package dict.build;
 5 | 
 6 | import java.io.BufferedReader;
 7 | import java.io.BufferedWriter;
 8 | import java.io.File;
 9 | import java.io.FileNotFoundException;
10 | import java.io.IOException;
11 | import java.util.Map;
12 | 
13 | import com.google.common.base.Charsets;
14 | import com.google.common.collect.Maps;
15 | import com.google.common.io.Files;
16 | 
17 | /**
18 |  * Counts, for every character of the words listed in <code>sogou.dic</code>,
19 |  * how often it is the first (S), an inner (M) or the last (E) character of
20 |  * a word, and writes the relative frequencies as tab-separated text.
21 |  *
22 |  * @author Jennifer
23 |  */
24 | public class PosProbability {
25 | 
26 | 	/**
27 | 	 * Reads <code>sogou.dic</code> and writes <code>dict/build/pos_prop.txt</code>.
28 | 	 *
29 | 	 * @param args not used
30 | 	 * @throws IOException if reading or writing fails
31 | 	 * @throws FileNotFoundException if one of the files cannot be opened
32 | 	 */
33 | 	public static void main(String[] args) throws FileNotFoundException, IOException {
34 | 
35 | 		final String[] labels = {"S", "M", "E"};
36 | 		File source = new File("sogou.dic");
37 | 		File target = new File(source.getParentFile(), "dict/build/pos_prop.txt");
38 | 		try (BufferedReader reader = Files.newReader(source, Charsets.UTF_8);
39 | 				BufferedWriter writer = Files.newWriter(target, Charsets.UTF_8)) {
40 | 			Map<String, CounterMap> counters = Maps.newHashMap();
41 | 			// Only the first tab-separated column (the word) is used, and
42 | 			// every dictionary line has weight 1.
43 | 			for (String row = reader.readLine(); row != null; row = reader.readLine()) {
44 | 				final String word = row.split("\t")[0];
45 | 				final int last = word.length() - 1;
46 | 				for (int pos = 0; pos <= last; pos++) {
47 | 					// The first character is always S, even in a one-char word.
48 | 					String label = (pos == 0) ? "S" : (pos == last) ? "E" : "M";
49 | 					String ch = word.substring(pos, pos + 1);
50 | 					CounterMap counter = counters.get(ch);
51 | 					if (null == counter) {
52 | 						counter = new CounterMap();
53 | 						counters.put(ch, counter);
54 | 					}
55 | 					counter.incrby(label, 1);
56 | 				}
57 | 			}
58 | 
59 | 			// One output row per character: the character, then its S, M and E shares.
60 | 			for (Map.Entry<String, CounterMap> entry : counters.entrySet()) {
61 | 				CounterMap counter = entry.getValue();
62 | 				int total = 0;
63 | 				for (String label : labels) {
64 | 					total += counter.get(label);
65 | 				}
66 | 				// Characters without any counts are left out of the result.
67 | 				if (total == 0) {
68 | 					continue;
69 | 				}
70 | 				StringBuilder out = new StringBuilder(entry.getKey());
71 | 				for (String label : labels) {
72 | 					out.append("\t").append(counter.get(label) * 1.0 / total);
73 | 				}
74 | 				writer.write(out.append("\n").toString());
75 | 			}
76 | 		}
77 | 	}
78 | }
79 | 


--------------------------------------------------------------------------------
/src/main/java/dict/build/SplitFileSorter.java:
--------------------------------------------------------------------------------
  1 | package dict.build;
  2 | 
  3 | import java.io.File;
  4 | import java.io.FileInputStream;
  5 | import java.io.InputStream;
  6 | 
  7 | import com.fasterxml.sort.SortConfig;
  8 | import com.fasterxml.sort.Sorter;
  9 | 
 10 | /**
 11 |  * {@link Sorter} for text lines that are read with {@link LineReader},
 12 |  * written with {@link LineWriter} and ordered by {@link SplitStringComparator}.
 13 |  */
 14 | public class SplitFileSorter extends Sorter<String>
 15 | {
 16 |     /**
 17 |      * Upper bound (256 megs) for the pre-sort memory budget chosen when
 18 |      * the sorter is started from the command line.
 19 |      */
 20 |     public final static long MAX_HEAP_FOR_PRESORT = 256L * 1024 * 1024;
 21 | 
 22 |     /**
 23 |      * Lower bound (10 megs) for the same budget, used when the heap-based
 24 |      * calculation comes out smaller.
 25 |      */
 26 |     public final static long MIN_HEAP_FOR_PRESORT = 10L * 1024 * 1024;
 27 | 
 28 |     /** Creates a sorter with a default {@link SortConfig}. */
 29 |     public SplitFileSorter() {
 30 |         this(new SortConfig());
 31 |     }
 32 | 
 33 |     /** Creates a sorter that uses the given configuration. */
 34 |     public SplitFileSorter(SortConfig config)
 35 |     {
 36 |         super(config,
 37 |                 LineReader.factory(), LineWriter.factory(),
 38 |                 new SplitStringComparator());
 39 |     }
 40 | 
 41 |     /*
 42 |     /**********************************************************************
 43 |     /* Command-line entry point: sorts a file (or STDIN) to STDOUT
 44 |     /**********************************************************************
 45 |      */
 46 | 
 47 |     public static void main(String[] args) throws Exception
 48 |     {
 49 |         if (args.length > 1) {
 50 |             System.err.println("Usage: java "+SplitFileSorter.class.getName()+" [input-file]");
 51 |             System.err.println("(where input-file is optional; if missing, read from STDIN)");
 52 |             System.exit(1);
 53 |         }
 54 | 
 55 |         // Pre-sort budget: half of the heap that remains after setting
 56 |         // 40 megs aside for other overhead, clamped to [MIN, MAX].
 57 |         final long spare = Runtime.getRuntime().maxMemory() - (40 * 1024 * 1024);
 58 |         final long budget = Math.max(MIN_HEAP_FOR_PRESORT,
 59 |                 Math.min(MAX_HEAP_FOR_PRESORT, spare >> 1));
 60 |         final SplitFileSorter sorter = new SplitFileSorter(new SortConfig().withMaxMemoryUsage(budget));
 61 | 
 62 |         final InputStream in;
 63 |         if (args.length > 0) {
 64 |             File input = new File(args[0]);
 65 |             if (!(input.exists() && !input.isDirectory())) {
 66 |                 System.err.println("File '"+input.getAbsolutePath()+"' does not exist (or is not file)");
 67 |                 System.exit(2);
 68 |             }
 69 |             in = new FileInputStream(input);
 70 |         } else {
 71 |             in = System.in;
 72 |         }
 73 | 
 74 |         // Progress is reported on STDERR every 5 seconds from a helper thread.
 75 |         Runnable progress = new Runnable() {
 76 |             @Override
 77 |             public void run() {
 78 |                 final long start = System.currentTimeMillis();
 79 |                 String outcome = "Completed: took %.1f seconds.\n";
 80 |                 try {
 81 |                     while (!sorter.isCompleted()) {
 82 |                         Thread.sleep(5000L);
 83 |                         if (sorter.isPreSorting()) {
 84 |                             System.err.printf(" pre-sorting: %d files written\n", sorter.getNumberOfPreSortFiles());
 85 |                         } else if (sorter.isSorting()) {
 86 |                             System.err.printf(" sorting, round: %d/%d\n",
 87 |                                     sorter.getSortRound(), sorter.getNumberOfSortRounds());
 88 |                         }
 89 |                     }
 90 |                 } catch (InterruptedException e) {
 91 |                     outcome = "[INTERRUPTED] -- took %.1f seconds.\n";
 92 |                 }
 93 |                 double secs = (System.currentTimeMillis() - start) / 1000.0;
 94 |                 System.err.printf(outcome, secs);
 95 |             }
 96 |         };
 97 |         new Thread(progress).start();
 98 |         // The sort itself is invoked on the calling thread.
 99 |         sorter.sort(in, System.out);
100 |     }
101 | }
102 | 


--------------------------------------------------------------------------------
/src/main/java/dict/build/SplitStringComparator.java:
--------------------------------------------------------------------------------
 1 | package dict.build;
 2 | 
 3 | import java.util.Comparator;
 4 | 
 5 | /**
 6 |  * Orders tab-separated lines by the number in their second column, largest
 7 |  * first. Lines with fewer than four columns sort after all complete lines
 8 |  * and compare equal to each other, which keeps the ordering consistent.
 9 |  */
10 | public class SplitStringComparator
11 |     implements Comparator<String>
12 | {
13 |     @Override
14 |     public int compare(String o1, String o2)
15 |     {
16 |         String[] seg1 = o1.split("\t");
17 |         String[] seg2 = o2.split("\t");
18 |         boolean short1 = seg1.length < 4, short2 = seg2.length < 4;
19 |         // Answering 1 for both (a, b) and (b, a) would break the Comparator contract.
20 |         if (short1 || short2) return (short1 ? 1 : 0) - (short2 ? 1 : 0);
21 |         // Swapped arguments give the descending order.
22 |         return Double.compare(Double.parseDouble(seg2[1]), Double.parseDouble(seg1[1]));
23 |     }
24 | }
25 | 


--------------------------------------------------------------------------------
/src/main/java/dict/build/TernaryNode.java:
--------------------------------------------------------------------------------
  1 | package dict.build;
  2 | 
  3 | /**
  4 |  * One node of a {@link TernaryTree}: a split character, an integer value
  5 |  * and links to the low, equal and high child nodes.
  6 |  */
  7 | @Deprecated
  8 | public class TernaryNode {
  9 | 
 10 | 	/** Character stored in this node. */
 11 | 	private char splitchar;
 12 | 
 13 | 	/** Value attached to the node; a value above zero marks the end of a word. */
 14 | 	private int value;
 15 | 
 16 | 	/** Low child of this node. */
 17 | 	private TernaryNode lokid;
 18 | 
 19 | 	/** Equal child of this node. */
 20 | 	private TernaryNode eqkid;
 21 | 
 22 | 	/** High child of this node. */
 23 | 	private TernaryNode hikid;
 24 | 
 25 | 	/**
 26 | 	 * Creates a node for the supplied character; the value starts at zero
 27 | 	 * and all three children are unset.
 28 | 	 *
 29 | 	 * @param c
 30 | 	 *            split character of the new node
 31 | 	 */
 32 | 	public TernaryNode(final char c) {
 33 | 		splitchar = c;
 34 | 	}
 35 | 
 36 | 	/** @return the split character of this node */
 37 | 	public char getSplitChar() {
 38 | 		return splitchar;
 39 | 	}
 40 | 
 41 | 	/**
 42 | 	 * @param c
 43 | 	 *            new split character of this node
 44 | 	 */
 45 | 	public void setSplitChar(final char c) {
 46 | 		splitchar = c;
 47 | 	}
 48 | 
 49 | 	/** @return true when the stored value is greater than zero */
 50 | 	public boolean isEndOfWord() {
 51 | 		return value > 0;
 52 | 	}
 53 | 
 54 | 	/** @return the stored value, whether or not it marks a word end */
 55 | 	public int getValue() {
 56 | 		return value;
 57 | 	}
 58 | 
 59 | 	/**
 60 | 	 * Stores the value of this node.
 61 | 	 *
 62 | 	 * @param b
 63 | 	 *            value to store; only values above zero mark a word end
 64 | 	 */
 65 | 	public void setEndOfWord(final int b) {
 66 | 		value = b;
 67 | 	}
 68 | 
 69 | 	/** @return the low child, or null when none was set */
 70 | 	public TernaryNode getLokid() {
 71 | 		return lokid;
 72 | 	}
 73 | 
 74 | 	/**
 75 | 	 * @param node
 76 | 	 *            new low child
 77 | 	 */
 78 | 	public void setLokid(final TernaryNode node) {
 79 | 		lokid = node;
 80 | 	}
 81 | 
 82 | 	/** @return the equal child, or null when none was set */
 83 | 	public TernaryNode getEqkid() {
 84 | 		return eqkid;
 85 | 	}
 86 | 
 87 | 	/**
 88 | 	 * @param node
 89 | 	 *            new equal child
 90 | 	 */
 91 | 	public void setEqkid(final TernaryNode node) {
 92 | 		eqkid = node;
 93 | 	}
 94 | 
 95 | 	/** @return the high child, or null when none was set */
 96 | 	public TernaryNode getHikid() {
 97 | 		return hikid;
 98 | 	}
 99 | 
100 | 	/**
101 | 	 * @param node
102 | 	 *            new high child
103 | 	 */
104 | 	public void setHikid(final TernaryNode node) {
105 | 		hikid = node;
106 | 	}
107 | }


--------------------------------------------------------------------------------
/src/main/java/dict/build/TernaryTree.java:
--------------------------------------------------------------------------------
  1 | package dict.build;
  2 | 
  3 | import java.io.IOException;
  4 | import java.io.Writer;
  5 | import java.util.ArrayList;
  6 | import java.util.Collections;
  7 | import java.util.List;
  8 | import java.util.StringTokenizer;
  9 | 
 10 | /**
 11 |  * <code>TernaryTree</code> is an implementation of a ternary tree. Methods are
 12 |  * provided for inserting strings and searching for strings. The algorithms in
 13 |  * this class are all recursive, and have not been optimized for any particular
 14 |  * purpose. Data which is inserted is not sorted before insertion, however data
 15 |  * can be inserted beginning with the median of the supplied data.
 16 |  * 
 17 |  * @author Middleware Services
 18 |  * @version $Revision$ $Date$
 19 |  */
 20 | @Deprecated
 21 | public class TernaryTree {
 22 | 
 23 | 	/** File system line separator. */
 24 | 	private static final String LINE_SEPARATOR = System
 25 | 			.getProperty("line.separator");
 26 | 
 27 | 	/** root node of the ternary tree. */
 28 | 	private TernaryNode root;
 29 | 
 30 | 	/** Default Constructor. */
 31 | 	public TernaryTree() {
 32 | 	}
 33 | 
 34 | 	/**
 35 | 	 * This will insert the supplied word into the <code>TernaryTree</code>.
 36 | 	 * 
 37 | 	 * @param word
 38 | 	 *            <code>String</code> to insert
 39 | 	 */
 40 | 	public void insert(final String word, final int value) {
 41 | 		// A null word is ignored; the tree is left untouched.
 42 | 		if (null == word) return;
 43 | 		root = insertNode(root, word, 0, value);
 44 | 	}
 45 | 
 46 | 	/**
 47 | 	 * This will return true if the supplied word has been inserted into the
 48 | 	 * <code>TernaryTree</code>.
 49 | 	 * 
 50 | 	 * @param word
 51 | 	 *            <code>String</code> to search for
 52 | 	 * 
 53 | 	 * @return <code>int</code> - value stored for the word, or -1 if it was not found
 54 | 	 */
 55 | 	public int search(final String word) {
 56 | 		return (word == null) ? -1 : searchNode(this.root, word, 0); // insert() ignores null, so it is never found
 57 | 	}
 58 | 
 59 | 	/**
 60 | 	 * This will return an array of strings which partially match the supplied
 61 | 	 * word. word should be of the format '.e.e.e' Where the '.' character
 62 | 	 * represents any valid character. Possible results from this query include:
 63 | 	 * Helene, delete, or severe Note that no substring matching occurs, results
 64 | 	 * only include strings of the same length. If the supplied word does not
 65 | 	 * contain the '.' character, then a regular search is performed.
 66 | 	 * 
 67 | 	 * @param word
 68 | 	 *            <code>String</code> to search for
 69 | 	 * 
 70 | 	 * @return <code>String[]</code> - of matching words
 71 | 	 */
 72 | 	public String[] partialSearch(final String word) {
 73 | 		// The recursive helper appends to this list and hands it back.
 74 | 		final List<String> found = partialSearchNode(root,
 75 | 				new ArrayList<String>(), "", word, 0);
 76 | 
 77 | 		if (null != found) {
 78 | 			return found.toArray(new String[found.size()]);
 79 | 		}
 80 | 		// Defensive: a missing list is reported as no matches.
 81 | 		return new String[] {};
 82 | 	}
 83 | 
 84 | 	/**
 85 | 	 * This will return an array of strings which are near to the supplied word
 86 | 	 * by the supplied distance. For the query nearSearch("fisher", 2): Possible
 87 | 	 * results include: cipher, either, fishery, kosher, sister. If the supplied
 88 | 	 * distance is not > 0, then a regular search is performed.
 89 | 	 * 
 90 | 	 * @param word
 91 | 	 *            <code>String</code> to search for
 92 | 	 * @param distance
 93 | 	 *            <code>int</code> for valid match
 94 | 	 * 
 95 | 	 * @return <code>String[]</code> - of matching words
 96 | 	 */
 97 | 	public String[] nearSearch(final String word, final int distance) {
 98 | 		// Start with an empty result list and an empty prefix at the root.
 99 | 		final List<String> found = nearSearchNode(root, distance,
100 | 				new ArrayList<String>(), "", word, 0);
101 | 
102 | 		// Defensive: a missing list is reported as no matches.
103 | 		if (null == found) {
104 | 			return new String[] {};
105 | 		}
106 | 		return found.toArray(new String[found.size()]);
107 | 	}
108 | 
109 | 	/**
110 | 	 * This will return a list of all the words in this <code>
111 | 	 * TernaryTree</code>. This is a very expensive operation, every node in the
112 | 	 * tree is traversed. The returned list cannot be modified.
113 | 	 * 
114 | 	 * @return <code>List</code> - of words
115 | 	 */
116 | 	public List<String> getWords() {
117 | 		// Collect into a fresh list, then expose it read-only.
118 | 		final List<String> collected = new ArrayList<String>();
119 | 		return Collections.unmodifiableList(traverseNode(root, "", collected));
120 | 	}
121 | 
122 | 	/**
123 | 	 * This will print an ASCII representation of this <code>TernaryTree</code>
124 | 	 * to the supplied <code>PrintWriter</code>. This is a very expensive
125 | 	 * operation, every node in the tree is traversed. The output produced is
126 | 	 * hard to read, but it should give an indication of whether or not your
127 | 	 * tree is balanced.
128 | 	 * 
129 | 	 * @param out
130 | 	 *            <code>PrintWriter</code> to print to
131 | 	 * @throws IOException
132 | 	 *             if an error occurs
133 | 	 */
134 | 	public void print(final Writer out) throws IOException {
135 | 		out.write(printNode(root, "", 0)); // whole tree is rendered to one string first
136 | 	}
137 | 
138 | 	/**
139 | 	 * This will recursively insert a word into the <code>TernaryTree</code> one
140 | 	 * node at a time beginning at the supplied node.
141 | 	 * 
142 | 	 * @param node
143 | 	 *            <code>TernaryNode</code> to put character in
144 | 	 * @param word
145 | 	 *            <code>String</code> to be inserted
146 | 	 * @param index
147 | 	 *            <code>int</code> of character in word
148 | 	 * 
149 | 	 * @return <code>TernaryNode</code> - to insert
150 | 	 */
151 | 	private TernaryNode insertNode(TernaryNode node, final String word,
152 | 			final int index, final int value) {
153 | 		if (index >= word.length()) {
154 | 			return node;
155 | 		}
156 | 		final char ch = word.charAt(index);
157 | 		if (null == node) {
158 | 			node = new TernaryNode(ch);
159 | 		}
160 | 		final int diff = ch - node.getSplitChar();
161 | 		if (diff > 0) {
162 | 			node.setHikid(insertNode(node.getHikid(), word, index, value));
163 | 		} else if (diff < 0) {
164 | 			node.setLokid(insertNode(node.getLokid(), word, index, value));
165 | 		} else {
166 | 			// Same character: store the value on the last one, then move on.
167 | 			if (index + 1 == word.length()) {
168 | 				node.setEndOfWord(value);
169 | 			}
170 | 			node.setEqkid(insertNode(node.getEqkid(), word, index + 1, value));
171 | 		}
172 | 		return node;
173 | 	}
174 | 
175 | 	/**
176 | 	 * This will recursively search for a word in the <code>TernaryTree</code>
177 | 	 * one node at a time beginning at the supplied node.
178 | 	 * 
179 | 	 * @param node
180 | 	 *            <code>TernaryNode</code> to search in
181 | 	 * @param word
182 | 	 *            <code>String</code> to search for
183 | 	 * @param index
184 | 	 *            <code>int</code> of character in word
185 | 	 * 
186 | 	 * @return <code>int</code> - value stored for the word, or -1 if it was not found
187 | 	 */
188 | 	private int searchNode(final TernaryNode node, final String word,
189 | 			final int index) {
190 | 		TernaryNode cur = node;
191 | 		int pos = index;
192 | 		while (cur != null && pos < word.length()) {
193 | 			final char ch = word.charAt(pos);
194 | 			final char split = cur.getSplitChar();
195 | 			if (ch < split) {
196 | 				cur = cur.getLokid();
197 | 			} else if (ch > split) {
198 | 				cur = cur.getHikid();
199 | 			} else if (pos < word.length() - 1) {
200 | 				cur = cur.getEqkid();
201 | 				pos++;
202 | 			} else {
203 | 				// Last character matched: only a word end carries a value.
204 | 				return cur.isEndOfWord() ? cur.getValue() : -1;
205 | 			}
206 | 		}
207 | 		return -1;
208 | 	}
209 | 
210 | 	/**
211 | 	 * This will recursively search for a partial word in the <code>
212 | 	 * TernaryTree</code> one node at a time beginning at the supplied node.
213 | 	 * 
214 | 	 * @param node
215 | 	 *            <code>TernaryNode</code> to search in
216 | 	 * @param matches
217 | 	 *            <code>ArrayList</code> of partial matches
218 | 	 * @param match
219 | 	 *            <code>String</code> the current word being examined
220 | 	 * @param word
221 | 	 *            <code>String</code> to search for
222 | 	 * @param index
223 | 	 *            <code>int</code> of character in word
224 | 	 * 
225 | 	 * @return <code>ArrayList</code> - of matches
226 | 	 */
227 | 	private List<String> partialSearchNode(final TernaryNode node,
228 | 			List<String> matches, final String match, final String word,
229 | 			final int index) {
230 | 		if (node == null || index >= word.length()) {
231 | 			return matches;
232 | 		}
233 | 		final char wanted = word.charAt(index);
234 | 		final char split = node.getSplitChar();
235 | 		// '.' is the wildcard: it descends into all three children.
236 | 		final boolean any = (wanted == '.');
237 | 		if (any || wanted < split) {
238 | 			matches = partialSearchNode(node.getLokid(), matches, match, word, index);
239 | 		}
240 | 		// A matching node either completes the pattern or extends the prefix.
241 | 		if (any || wanted == split) {
242 | 			final String extended = match + split;
243 | 			if (index < word.length() - 1) {
244 | 				matches = partialSearchNode(node.getEqkid(), matches, extended, word, index + 1);
245 | 			} else if (node.isEndOfWord()) {
246 | 				matches.add(extended);
247 | 			}
248 | 		}
249 | 		if (any || wanted > split) {
250 | 			matches = partialSearchNode(node.getHikid(), matches, match, word, index);
251 | 		}
252 | 		return matches;
253 | 	}
254 | 
255 | 	/**
256 | 	 * This will recursively search for a near match word in the <code>
257 | 	 * TernaryTree</code> one node at a time beginning at the supplied node.
258 | 	 * 
259 | 	 * @param node
260 | 	 *            <code>TernaryNode</code> to search in
261 | 	 * @param distance
262 | 	 *            <code>int</code> of a valid match, must be > 0
263 | 	 * @param matches
264 | 	 *            <code>ArrayList</code> of near matches
265 | 	 * @param match
266 | 	 *            <code>String</code> the current word being examined
267 | 	 * @param word
268 | 	 *            <code>String</code> to search for
269 | 	 * @param index
270 | 	 *            <code>int</code> of character in word
271 | 	 * 
272 | 	 * @return <code>ArrayList</code> - of matches
273 | 	 */
274 | 	private List<String> nearSearchNode(final TernaryNode node,
275 | 			final int distance, List<String> matches, final String match,
276 | 			final String word, final int index) {
277 | 		// Nothing to do without a node or once the distance budget is
278 | 		// used up (negative).
279 | 		if (node == null || distance < 0) {
280 | 			return matches;
281 | 		}
282 | 
283 | 		// Past the end of the word the sentinel (char) -1, the largest
284 | 		// char value, stands in for the current character.
285 | 		final char ch;
286 | 		if (index >= word.length()) {
287 | 			ch = (char) -1;
288 | 		} else {
289 | 			ch = word.charAt(index);
290 | 		}
291 | 		final char split = node.getSplitChar();
292 | 
293 | 		// With distance to spare the low side is always explored.
294 | 		if (distance > 0 || ch < split) {
295 | 			matches = nearSearchNode(node.getLokid(), distance, matches,
296 | 					match, word, index);
297 | 		}
298 | 
299 | 		// Taking this node's character costs one unit of distance
300 | 		// unless it equals the character of the word.
301 | 		int remaining = distance;
302 | 		if (ch != split) {
303 | 			remaining--;
304 | 		}
305 | 		final String newMatch = match + split;
306 | 		if (node.isEndOfWord() && remaining >= 0
307 | 				&& newMatch.length() + remaining >= word.length()) {
308 | 			matches.add(newMatch);
309 | 		}
310 | 		matches = nearSearchNode(node.getEqkid(), remaining, matches,
311 | 				newMatch, word, index + 1);
312 | 
313 | 		// With distance to spare the high side is always explored.
314 | 		if (distance > 0 || ch > split) {
315 | 			matches = nearSearchNode(node.getHikid(), distance, matches,
316 | 					match, word, index);
317 | 		}
318 | 
319 | 		return matches;
320 | 	}
321 | 
322 | 	/**
323 | 	 * This will recursively traverse every node in the <code>TernaryTree</code>
324 | 	 * one node at a time beginning at the supplied node. The result is a string
325 | 	 * representing every word, which is delimited by the LINE_SEPARATOR
326 | 	 * character.
327 | 	 * 
328 | 	 * @param node
329 | 	 *            <code>TernaryNode</code> to begin traversing
330 | 	 * @param s
331 | 	 *            <code>String</code> of words found at the supplied node
332 | 	 * @param words
333 | 	 *            <code>ArrayList</code> which will be returned (recursive
334 | 	 *            function)
335 | 	 * 
336 | 	 * @return <code>List</code> - containing all words from the supplied node
337 | 	 */
338 | 	private List<String> traverseNode(final TernaryNode node, final String s,
339 | 			List<String> words) {
340 | 		if (node == null) {
341 | 			return words;
342 | 		}
343 | 		// Order: low subtree, words through this node, high subtree.
344 | 		words = traverseNode(node.getLokid(), s, words);
345 | 		final String prefix = s + node.getSplitChar();
346 | 		final TernaryNode next = node.getEqkid();
347 | 		if (next != null) {
348 | 			words = traverseNode(next, prefix, words);
349 | 		}
350 | 		// Words of the equal subtree are added before the word ending here.
351 | 		if (node.isEndOfWord()) {
352 | 			words.add(prefix);
353 | 		}
354 | 		words = traverseNode(node.getHikid(), s, words);
355 | 		return words;
356 | 	}
357 | 
358 | 	/**
359 | 	 * This will recursively traverse every node in the <code>TernaryTree</code>
360 | 	 * one node at a time beginning at the supplied node. The result is an ASCII
361 | 	 * string representation of the tree beginning at the supplied node.
362 | 	 * 
363 | 	 * @param node
364 | 	 *            <code>TernaryNode</code> to begin traversing
365 | 	 * @param s
366 | 	 *            <code>String</code> of words found at the supplied node
367 | 	 * @param depth
368 | 	 *            <code>int</code> of the current node
369 | 	 * 
370 | 	 * @return <code>String</code> - containing all words from the supplied node
371 | 	 */
372 | 	private String printNode(final TernaryNode node, final String s,
373 | 			final int depth) {
374 | 		final StringBuilder out = new StringBuilder();
375 | 		if (node == null) {
376 | 			return out.toString();
377 | 		}
378 | 		out.append(printNode(node.getLokid(), " <-", depth + 1));
379 | 		final String c = String.valueOf(node.getSplitChar());
380 | 		final TernaryNode eq = node.getEqkid();
381 | 		if (eq == null) {
382 | 			// End of an equal chain: indent, then print the collected path.
383 | 			// StringTokenizer treats "--" as the single delimiter char '-'.
384 | 			int count = new StringTokenizer(s, "--").countTokens();
385 | 			if (count > 0) {
386 | 				count--;
387 | 			}
388 | 			for (int i = 1; i < depth - count - 1; i++) {
389 | 				out.append("   ");
390 | 			}
391 | 			out.append(s).append(c).append(TernaryTree.LINE_SEPARATOR);
392 | 		} else {
393 | 			out.append(printNode(eq, s + c + "--", depth + 1));
394 | 		}
395 | 
396 | 		out.append(printNode(node.getHikid(), " >-", depth + 1));
397 | 		return out.toString();
398 | 	}
399 | 	
400 | 	public static void main(String[] args) {
401 | 		// Smoke test: store "a".."aaaa" as 1..4, then look up "aaa".
402 | 		TernaryTree tree = new TernaryTree();
403 | 		String word = "";
404 | 		for (int value = 1; value <= 4; value++)
405 | 			tree.insert(word += "a", value);
406 | 		System.out.println(tree.search("aaa"));
407 | 	}
408 | }


--------------------------------------------------------------------------------
/src/main/resources/dict.properties:
--------------------------------------------------------------------------------
1 | HELP_DESCRIPTION=list all cmd
2 | RAW_FILE=raw data file line by line
3 | SORT_MEM_SIZE_IN_MB=memory size in mb use by sorting
4 | MAX_WORD_LENGTH=max length of word
5 | OUTPUT_DICT_FILE=output result dict file


--------------------------------------------------------------------------------
/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | 
  3 | <!-- 从高到地低 OFF 、 FATAL 、 ERROR 、 WARN 、 INFO 、 DEBUG 、 TRACE 、 ALL -->
  4 | <!-- 日志输出规则  根据当前ROOT 级别，日志输出时，级别高于root默认的级别时  会输出 -->
  5 | <!-- 以下  每个配置的 filter 是过滤掉输出文件里面，会出现高级别文件，依然出现低级别的日志信息，通过filter 过滤只记录本级别的日志-->
  6 | 
  7 | 
  8 | <!-- 属性描述 scan：性设置为true时，配置文件如果发生改变，将会被重新加载，默认值为true scanPeriod:设置监测配置文件是否有修改的时间间隔，如果没有给出时间单位，默认单位是毫秒。当scan为true时，此属性生效。默认的时间间隔为1分钟。
  9 |     debug:当此属性设置为true时，将打印出logback内部日志信息，实时查看logback运行状态。默认值为false。 -->
 10 | <configuration scan="true" scanPeriod="60 seconds" debug="false">
 11 |     <!-- 定义日志文件 输入位置 -->
 12 |     <property name="log_dir" value="./logs" />
 13 |     <!-- 日志最大的历史 30天 -->
 14 |     <property name="maxHistory" value="30"/>
 15 | 
 16 | 
 17 | 
 18 | 
 19 |     <!-- ConsoleAppender 控制台输出日志 -->
 20 |     <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
 21 |         <!-- 对日志进行格式化 -->
 22 |         <encoder>
 23 |             <pattern>%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger -%msg%n</pattern>
 24 |         </encoder>
 25 |     </appender>
 26 | 
 27 | 
 28 |     <!-- ERROR级别日志 -->
 29 |     <!-- 滚动记录文件，先将日志记录到指定文件，当符合某个条件时，将日志记录到其他文件 RollingFileAppender-->
 30 |     <appender name="ERROR" class="ch.qos.logback.core.rolling.RollingFileAppender">
 31 |         <!-- Filter: only record ERROR-level log events -->
 32 |         <filter class="ch.qos.logback.classic.filter.LevelFilter">
 33 |             <level>ERROR</level>
 34 |             <onMatch>ACCEPT</onMatch>
 35 |             <onMismatch>DENY</onMismatch>
 36 |         </filter>
 37 |         <!-- 最常用的滚动策略，它根据时间来制定滚动策略.既负责滚动也负责出发滚动 -->
 38 |         <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
 39 |             <!--日志输出位置  可相对、和绝对路径 -->
 40 |             <fileNamePattern>${log_dir}/%d{yyyy-MM-dd}/error-log.log</fileNamePattern>
 41 |             <!-- 可选节点，控制保留的归档文件的最大数量，超出数量就删除旧文件假设设置每个月滚动，且<maxHistory>是6，
 42 |             则只保存最近6个月的文件，删除之前的旧文件。注意，删除旧文件是，那些为了归档而创建的目录也会被删除-->
 43 |             <maxHistory>${maxHistory}</maxHistory>
 44 |         </rollingPolicy>
 45 | 
 46 |         <!-- 按照固定窗口模式生成日志文件，当文件大于20MB时，生成新的日志文件。窗口大小是1到3，当保存了3个归档文件后，将覆盖最早的日志。
 47 |         <rollingPolicy class="ch.qos.logback.core.rolling.FixedWindowRollingPolicy">
 48 |           <fileNamePattern>${log_dir}/%d{yyyy-MM-dd}/.log.zip</fileNamePattern>
 49 |           <minIndex>1</minIndex>
 50 |           <maxIndex>3</maxIndex>
 51 |         </rollingPolicy>   -->
 52 |         <!-- 查看当前活动文件的大小，如果超过指定大小会告知RollingFileAppender 触发当前活动文件滚动
 53 |         <triggeringPolicy class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy">
 54 |             <maxFileSize>5MB</maxFileSize>
 55 |         </triggeringPolicy>   -->
 56 | 
 57 |         <encoder>
 58 |             <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger - %msg%n</pattern>
 59 |         </encoder>
 60 |     </appender>
 61 | 
 62 | 
 63 | 
 64 |     <!-- WARN-level log file appender -->
 65 |     <appender name="WARN" class="ch.qos.logback.core.rolling.RollingFileAppender">
 66 |         <!-- Filter: accept only WARN-level events and deny everything else -->
 67 |         <filter class="ch.qos.logback.classic.filter.LevelFilter">
 68 |             <level>WARN</level>
 69 |             <onMatch>ACCEPT</onMatch>
 70 |             <onMismatch>DENY</onMismatch>
 71 |         </filter>
 72 |         <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
 73 |             <!-- Roll over daily: the date pattern gives one directory per day -->
 74 |             <fileNamePattern>${log_dir}/%d{yyyy-MM-dd}/warn-log.log
 75 |             </fileNamePattern>
 76 |             <!-- History to keep, taken from the maxHistory property (30 in this file) -->
 77 |             <maxHistory>${maxHistory}</maxHistory>
 78 |         </rollingPolicy>
 79 |         <encoder>
 80 |             <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger - %msg%n</pattern>
 81 |         </encoder>
 82 |     </appender>
 83 | 
 84 | 
 85 | 
 86 | 
 87 |     <!-- INFO-level log file appender -->
 88 |     <appender name="INFO" class="ch.qos.logback.core.rolling.RollingFileAppender">
 89 |         <!-- Filter: accept only INFO-level events and deny everything else -->
 90 |         <filter class="ch.qos.logback.classic.filter.LevelFilter">
 91 |             <level>INFO</level>
 92 |             <onMatch>ACCEPT</onMatch>
 93 |             <onMismatch>DENY</onMismatch>
 94 |         </filter>
 95 |         <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
 96 |             <!-- Roll over daily: the date pattern gives one directory per day -->
 97 |             <fileNamePattern>${log_dir}/%d{yyyy-MM-dd}/info-log.log
 98 |             </fileNamePattern>
 99 |             <!-- History to keep, taken from the maxHistory property (30 in this file) -->
100 |             <maxHistory>${maxHistory}</maxHistory>
101 |         </rollingPolicy>
102 |         <encoder>
103 |             <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger - %msg%n</pattern>
104 |         </encoder>
105 |     </appender>
106 | 
107 | 
108 | 
109 | 
110 |     <!-- DEBUG-level log file appender. NOTE(review): root is set to INFO in this file, so only loggers explicitly set to DEBUG or lower can deliver events here -->
111 |     <appender name="DEBUG" class="ch.qos.logback.core.rolling.RollingFileAppender">
112 |         <!-- Filter: accept only DEBUG-level events and deny everything else -->
113 |         <filter class="ch.qos.logback.classic.filter.LevelFilter">
114 |             <level>DEBUG</level>
115 |             <onMatch>ACCEPT</onMatch>
116 |             <onMismatch>DENY</onMismatch>
117 |         </filter>
118 |         <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
119 |             <!-- Roll over daily: the date pattern gives one directory per day -->
120 |             <fileNamePattern>${log_dir}/%d{yyyy-MM-dd}/debug-log.log
121 |             </fileNamePattern>
122 |             <!-- History to keep, taken from the maxHistory property (30 in this file) -->
123 |             <maxHistory>${maxHistory}</maxHistory>
124 |         </rollingPolicy>
125 |         <encoder>
126 |             <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger - %msg%n</pattern>
127 |         </encoder>
128 |     </appender>
129 | 
130 | 
131 | 
132 | 
133 |     <!-- TRACE-level log file appender. NOTE(review): root is set to INFO in this file, so only loggers explicitly set to TRACE can deliver events here -->
134 |     <appender name="TRACE" class="ch.qos.logback.core.rolling.RollingFileAppender">
135 |         <!-- Filter: accept only TRACE-level events and deny everything else -->
136 |         <filter class="ch.qos.logback.classic.filter.LevelFilter">
137 |             <level>TRACE</level>
138 |             <onMatch>ACCEPT</onMatch>
139 |             <onMismatch>DENY</onMismatch>
140 |         </filter>
141 |         <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
142 |             <!-- Roll over daily: the date pattern gives one directory per day -->
143 |             <fileNamePattern>${log_dir}/%d{yyyy-MM-dd}/trace-log.log
144 |             </fileNamePattern>
145 |             <!-- History to keep, taken from the maxHistory property (30 in this file) -->
146 |             <maxHistory>${maxHistory}</maxHistory>
147 |         </rollingPolicy>
148 |         <encoder>
149 |             <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger - %msg%n</pattern>
150 |         </encoder>
151 |     </appender>
152 |     <!-- Per-logger levels for JDBC and iBATIS classes. The level is set with the "level" attribute; a "value" attribute on a logger element is not applied. -->
153 |     <logger name="java.sql.PreparedStatement" level="DEBUG" />
154 |     <logger name="java.sql.Connection" level="DEBUG" />
155 |     <logger name="java.sql.Statement" level="DEBUG" />
156 |     <logger name="com.ibatis" level="DEBUG" />
157 |     <logger name="com.ibatis.common.jdbc.SimpleDataSource" level="DEBUG" />
158 |     <logger name="com.ibatis.common.jdbc.ScriptRunner" level="DEBUG"/>
159 |     <logger name="com.ibatis.sqlmap.engine.impl.SqlMapClientDelegate" level="DEBUG" />
160 | 
161 | 
162 | 
163 |     <!-- Root logger, level INFO; events below INFO are dropped unless a more specific logger sets a lower level -->
164 |     <root level="INFO">
165 |         <!-- Console output -->
166 |         <appender-ref ref="STDOUT" />
167 |         <!-- File output, one appender per level -->
168 |         <appender-ref ref="ERROR" />
169 |         <appender-ref ref="INFO" />
170 |         <appender-ref ref="WARN" />
171 |         <appender-ref ref="DEBUG" />
172 |         <appender-ref ref="TRACE" />
173 |     </root>
174 | </configuration>


--------------------------------------------------------------------------------