├── .gitignore
├── LICENSE
├── README.md
├── WebRoot
├── META-INF
│ └── MANIFEST.MF
├── WEB-INF
│ ├── classes
│ │ └── hanlp.properties
│ ├── lib
│ │ ├── hanlp.properties
│ │ └── lexicon
│ │ │ ├── lex-admin.lex
│ │ │ ├── lex-autoload.todo
│ │ │ ├── lex-cemixed.lex
│ │ │ ├── lex-chars.lex
│ │ │ ├── lex-cn-mz.lex
│ │ │ ├── lex-cn-place.lex
│ │ │ ├── lex-company.lex
│ │ │ ├── lex-dname-1.lex
│ │ │ ├── lex-dname-2.lex
│ │ │ ├── lex-ecmixed.lex
│ │ │ ├── lex-en-pun.lex
│ │ │ ├── lex-en.lex
│ │ │ ├── lex-festival.lex
│ │ │ ├── lex-fname.lex
│ │ │ ├── lex-food.lex
│ │ │ ├── lex-lang.lex
│ │ │ ├── lex-ln-adorn.lex
│ │ │ ├── lex-lname.lex
│ │ │ ├── lex-main.lex
│ │ │ ├── lex-nation.lex
│ │ │ ├── lex-net.lex
│ │ │ ├── lex-org.lex
│ │ │ ├── lex-sname.lex
│ │ │ ├── lex-stopword.lex
│ │ │ ├── lex-touris.lex
│ │ │ └── lex-units.lex
│ └── web.xml
├── css
│ └── detail.css
├── index.jsp
└── jsp
│ └── detail.jsp
└── src
├── com
└── chenxb
│ ├── biz
│ ├── ArticleBiz.java
│ ├── ColumnBiz.java
│ ├── RotationImageBiz.java
│ └── UploadRandomImage.java
│ ├── common
│ └── StreamTool.java
│ ├── dao
│ ├── ArticleDao.java
│ ├── ColumnDao.java
│ ├── RotationImageDao.java
│ ├── SearchDao.java
│ └── SummaryDao.java
│ ├── jpush
│ └── TestJpush.java
│ ├── model
│ ├── ArticleItem.java
│ ├── RotationItem.java
│ └── SimpleArticleItem.java
│ ├── news
│ ├── HelloLucene.java
│ ├── LoadRotation.java
│ ├── ReUploadImage.java
│ ├── ReloadAcademic.java
│ ├── ReloadAll.java
│ ├── ReloadBachelor.java
│ ├── ReloadJob.java
│ ├── ReloadLatest.java
│ ├── ReloadMaster.java
│ ├── ReloadNotific.java
│ ├── Test.java
│ ├── Test4.java
│ └── TestJcseg.java
│ ├── servlet
│ ├── ArticleWithSql.java
│ ├── ColumnArticlesWithSql.java
│ ├── MoreArticlesWithSql.java
│ ├── ParseArticleById.java
│ ├── RotationWithSql.java
│ └── SearchArticle.java
│ ├── test
│ ├── JobScheduler.java
│ ├── TestHanlp.java
│ ├── TestJob.java
│ └── TestTimeAgo.java
│ └── util
│ ├── ColumnType.java
│ ├── Constant.java
│ ├── GetTimeAgo.java
│ ├── HttpTool.java
│ ├── ImageTool.java
│ ├── JobScheduler.java
│ ├── MailTool.java
│ ├── MysqlTool.java
│ ├── StreamTool.java
│ ├── StringTool.java
│ ├── TableName.java
│ ├── TimeTool.java
│ └── UrlTool.java
└── hanlp.properties
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 |
3 | # Mobile Tools for Java (J2ME)
4 | .mtj.tmp/
5 |
6 | # Package Files #
7 | *.jar
8 | *.war
9 | *.ear
10 |
11 | # eclipse specific git ignore
12 | *.pydevproject
13 | .project
14 | .metadata
15 | bin/**
16 | tmp/**
17 | tmp/**/*
18 | *.tmp
19 | *.bak
20 | *.swp
21 | *~.nib
22 | local.properties
23 | .classpath
24 | .settings/
25 | .loadpath
26 |
27 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
28 | hs_err_pid*
29 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SeeNewsServer
2 |
3 | Server side of personal news APP, Java Servlet + Mysql implementation
4 |
5 | The first version was hosted on Sina Cloud and later transferred to Alibaba Cloud.
6 | Pictures are stored in Qiniuyun CDN
7 |
8 | JavaServlet+Mysql
9 | ## Development records
10 | Online log monitoring system
11 | Yesterday's updated news data will be sent to your mailbox at 10 o'clock every day
12 | Fix the `split` method returning a single-element array `[""]` for an empty list `[]`
13 | Initialization method: crawl the news from the first page to the last page, 53 records per page
14 | If crawling is interrupted midway, it has to be resumed from the breakpoint. The method is:
15 | Get the smallest id from the database, then find out which page of the website that id is on
16 | Crawl the news records below that position
17 |
18 | ## Random Image API
19 |
20 | [http://7xr4g8.com1.z0.glb.clouddn.com/671](http://7xr4g8.com1.z0.glb.clouddn.com/671) Get pictures
21 |
22 | 
23 |
24 | 671 is the image ID. Currently, the valid image IDs are 0 to 964. A random picture can be obtained by generating a random ID.
25 |
26 | ```
27 | Random random = new Random(47);
28 | String url = "http://7xr4g8.com1.z0.glb.clouddn.com/" + random.nextInt(964 + 1);
29 | ```
30 |
31 | ## mysql Create table statement
32 |
33 | Adjust the column types and lengths according to `Exception: Data too long for column`
34 | Example of a long title:
35 | "Intelligent Perception and Image Understanding" Key Laboratory of the Ministry of Education The 15th Academic Week and Brain-like Computing and Big Data Deep Learning Frontier Forum
36 | Example of a long source: Key Laboratory of Antenna and Microwave Technology
37 | The final table field type and length are as follows:
38 |
39 | ```
40 | CREATE TABLE `rotation` (
41 | `id` int(11) NOT NULL,
42 | `image_urls` text,
43 | `title` varchar(100) DEFAULT NULL,
44 | `publish_date` date NOT NULL,
45 | `read_times` int(11) NOT NULL,
46 | `source` varchar(50) DEFAULT NULL,
47 | `body` longtext,
48 | UNIQUE KEY `id_UNIQUE` (`id`)
49 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
50 |
51 | ```
52 |
53 | ## Feature list
54 |
55 | ### Crawling exception email notification
56 | Based on JavaMail, send email notifications for abnormal pictures and abnormal URLs
57 |
58 | #### Abnormal image url
59 | >Normal path of the picture `/uploads/image/20160109/20160109***.jpg`
60 | Old path `/uploads/old/201152**.jpg`
61 |
62 |
63 | | News id | Abnormal image link | Description |
64 | | ------------- |-------------| -----|
65 | | 7798 | `src="/Public/kindeditor/php/`
`../../../uploads/image/2015**.jpg"`| Contains an extra `/Public/kindeditor/php/`
`http://see.xidian.edu.cn` must be prepended |
66 | | 7302 | `
`| Image resource does not exist
Ignored |
67 | | 7017 | `src="http://see.xidian.edu.cn/`
`uploads/image/20141021/20**.jpg"`| Starts with an absolute path |
68 |
69 |
70 | ### Reused file download icon
71 | | Icon | Original address | Qiniu key value |
72 | | ------------- |------------| -----|
73 | |
| http://rsc.xidian.edu.cn/plus/img/addon.gif
http://see.xidian.edu.cn/uploads/old/ico/zip.jpg
http://xgc.xidian.edu.cn/images/mid.gif
http://jwc.xidian.edu.cn/images/ico/rar.jpg
http://202.117.120.88/images/download.gif
The resource does not exist, use the above gif instead| `912720f605b84070e223d0dab690a114`
`3949a245e521f81ffd18e5d01347a20d`
`2a8eac72c3697a837dd66e9e5243a089`
`bc87e43d342b380a2145ee1bb8298759`
`f7324b0d360946315ac83fb8f2703044`
The key for each link |
74 | |
| http://see.xidian.edu.cn/uploads/old/file/doc.gif
http://jwc.xidian.edu.cn/images/ico/doc.jpg
http://see.xidian.edu.cn/uploads/old/ico/doc.jpg | `b5805b46ce8cf9c634b3820a23d64ca6`
`f8d0fc587a7c7295835e8094af094d2d`
`ad5d0e0cf63834756dde3dc5e9629d8` |
75 | |
| http://see.xidian.edu.cn/uploads/old/file/xls.gif
http://jwc.xidian.edu.cn/images/ico/xls.jpg
http://zzb.xidian.edu.cn/new/WebEdit/sysimage/icon16/xls.gif | `84b7028179e09614540cea8dd0122c3c`
`d72210a72c0e174245a65e8755f6eaa`
`1323ef50b1457274c914413b067e9192`|
76 |
77 |
78 | #### Collected exception href:
79 |
80 | | News id | Dirty data | Description |
81 | |------------- |-------------| -----|
82 | | - | `href="Electronic Academy"`| href is Chinese |
83 | | 7837 | `/uploads/file/20151202/20151202101309_73187.zip` | The same href appears multiple times
resulting in multiple substitutions
`http://see.xidian.edu.cnhtt`
` p://see.xidian.edu.cn/**.zip`|
84 | | 7710 | `href="Cultivation project application related documents" ` | href is Chinese|
85 | | - | `href="601240943@qq.com"`| Only email address
without the preceding "mailto:" |
86 | | - | `kb.xidian.cc `|Does not start with http|
87 | | 6283 | `https://mail.google.com/mail/h/**`| Starts with https |
88 | | 6206 | `ftp://linux.xidian.edu.cn`| Starts with ftp |
89 |
90 |
91 | Note: A regular href starts with http or https
92 |
93 |
94 | ### Upload pictures to Qiniu Cloud
95 |
96 | Asynchronously upload pictures to Qiniu Cloud
97 |
98 | ### Thanks to open source, dependent class libraries
99 | - Java crawler [Jsoup](https://github.com/jhy/jsoup)
100 | - json serialization [gson](https://github.com/google/gson)
101 | - Processing arrays [commons-lang](https://github.com/apache/commons-lang)
102 | - javamail [javamail](https://java.net/projects/javamail/pages/Home)
103 | - Chinese word segmentation [jcseg](http://www.oschina.net/p/jcseg)
104 | - Full-text search engine toolkit [lucene](http://lucene.apache.org/)
105 | - Random image API [unsplash](https://unsplash.it/)
106 |
--------------------------------------------------------------------------------
/WebRoot/META-INF/MANIFEST.MF:
--------------------------------------------------------------------------------
1 | Manifest-Version: 1.0
2 | Class-Path:
3 |
4 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/classes/hanlp.properties:
--------------------------------------------------------------------------------
1 | #本配置文件中的路径的根目录,根目录+其他路径=绝对路径
2 | #Windows用户请注意,路径分隔符统一使用/
3 | root=./
4 | #核心词典路径
5 | CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt
6 | #2元语法词典路径
7 | BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt
8 | #停用词词典路径
9 | CoreStopWordDictionaryPath=data/dictionary/stopwords.txt
10 | #同义词词典路径
11 | CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt
12 | #人名词典路径
13 | PersonDictionaryPath=data/dictionary/person/nr.txt
14 | #人名词典转移矩阵路径
15 | PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt
16 | #繁简词典路径
17 | TraditionalChineseDictionaryPath=data/dictionary/tc/TraditionalChinese.txt
18 | #自定义词典路径,用;隔开多个自定义词典,空格开头表示在同一个目录,使用“文件名 词性”形式则表示这个词典的词性默认是该词性。优先级递减。
19 | #另外data/dictionary/custom/CustomDictionary.txt是个高质量的词库,请不要删除
20 | CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 现代汉语补充词库.txt; 全国地名大全.txt ns; 人名词典.txt; 机构名词典.txt; 上海地名.txt ns;data/dictionary/person/nrf.txt nrf
21 | #CRF分词模型路径
22 | CRFSegmentModelPath=data/model/segment/CRFSegmentModel.txt
23 | #HMM分词模型
24 | HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin
25 | #分词结果是否展示词性
26 | ShowTermNature=true
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/hanlp.properties:
--------------------------------------------------------------------------------
1 | #本配置文件中的路径的根目录,根目录+其他路径=绝对路径
2 | #Windows用户请注意,路径分隔符统一使用/
3 | root=./
4 | #核心词典路径
5 | CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt
6 | #2元语法词典路径
7 | BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt
8 | #停用词词典路径
9 | CoreStopWordDictionaryPath=data/dictionary/stopwords.txt
10 | #同义词词典路径
11 | CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt
12 | #人名词典路径
13 | PersonDictionaryPath=data/dictionary/person/nr.txt
14 | #人名词典转移矩阵路径
15 | PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt
16 | #繁简词典路径
17 | TraditionalChineseDictionaryPath=data/dictionary/tc/TraditionalChinese.txt
18 | #自定义词典路径,用;隔开多个自定义词典,空格开头表示在同一个目录,使用“文件名 词性”形式则表示这个词典的词性默认是该词性。优先级递减。
19 | #另外data/dictionary/custom/CustomDictionary.txt是个高质量的词库,请不要删除
20 | CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 现代汉语补充词库.txt; 全国地名大全.txt ns; 人名词典.txt; 机构名词典.txt; 上海地名.txt ns;data/dictionary/person/nrf.txt nrf
21 | #CRF分词模型路径
22 | CRFSegmentModelPath=data/model/segment/CRFSegmentModel.txt
23 | #HMM分词模型
24 | HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin
25 | #分词结果是否展示词性
26 | ShowTermNature=true
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-admin.lex:
--------------------------------------------------------------------------------
1 | CJK_WORDS
2 | 人事部/nt/ren shi bu/人事管理部门,人事管理部
3 | 人事管理部/nt/ren shi guan li bu/人事管理部门,人事部
4 | 信息产业部/nt/xin xi chan ye bu/null
5 | 农业部/nt/nong ye bu/null
6 | 医管局/nt/yi guan ju/医疗管理部门,医疗管理部,医疗管理局
7 | 医疗管理部/nt/yi liao guan li bu/医疗管理部门,医管局
8 | 医疗管理部门/nt/yi liao guan li bu men/医管局,医疗管理部
9 | 发改委/nt/fa gai wei/null
10 | 国土资源部/nt/guo tu zi yuan bu/null
11 | 国防部/nt/guo fang bu/人民武装力量部,军事部,防卫厅
12 | 军事部/nt/jun shi bu/人民武装力量部,防卫厅
13 | 外交部/nt/wai jiao bu/国务院,政治部,对外关系部,外务省
14 | 外交部长/r/wai jiao bu zhang/null
15 | 教育部/nt/jiao yu bu/null
16 | 文化部/nt/wen hua bu/null
17 | 民政部/nt/min zheng bu/null
18 | 能源部/nt/neng yuan bu/null
19 | 财政部/nt/cai zheng bu/null
20 | 铁道部/nt/tie dao bu/null
21 | 防卫厅/nt/fang wei ting/null
22 | 防卫省/nt/fang wei sheng/null
23 | 革命委员会/nt/ge ming wei yuan hui/null
24 | 交通运输部/nt/jiao tong yun shu bu/null
25 | 对外经济贸易部/nt/dui wai jing ji mao yi bu/null
26 | 技术部/nt/ji shu bu/null
27 | 财务部/nt/cai wu bu/null
28 | 总装备部/nt/zong zhuang bei bu/null
29 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-autoload.todo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/studychen/SeeNewsServer/3f2ea5ee974e0dd40d735d55fd33334f2efd23c3/WebRoot/WEB-INF/lib/lexicon/lex-autoload.todo
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-cemixed.lex:
--------------------------------------------------------------------------------
1 | CE_MIXED_WORD
2 | #中文英文混合词词库
3 | 卡拉ok/nz/ka la ok/null
4 | 漂亮mm/nz/piao liang mm/null
5 | 拳皇ova/nz/quan huang ova/拳皇动漫
6 | 奇都ktv/nz/qi du ktv/null
7 | 哆啦a梦/nz/duo la a meng/null
8 | 高3/n/gao san/高三
9 | 高2/n/gao er/高二
10 | 高1/n/gao yi/高一
11 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-cn-mz.lex:
--------------------------------------------------------------------------------
1 | CJK_WORDS
2 | 汉族/nz/han zu/null
3 | 汉族人/nz/han zu ren/null
4 | 汉族语/nz/han zu yu/null
5 | 蒙古族/nz/meng gu zu/null
6 | 蒙古族人/nz/meng gu zu ren/null
7 | 蒙古族语/nz/meng gu zu yu/null
8 | 满族/nz/man zu/null
9 | 满族人/nz/man zu ren/null
10 | 满族语/nz/man zu yu/null
11 | 朝鲜族/nz/chao xian zu/null
12 | 朝鲜族人/nz/chao xian zu ren/null
13 | 朝鲜族语/nz/chao xian zu yu/null
14 | 赫哲族/nz/he zhe zu/null
15 | 赫哲族人/nz/he zhe zu ren/null
16 | 赫哲族语/nz/he zhe zu yu/null
17 | 达斡尔族/nz/da wo er zu/null
18 | 达斡尔族人/nz/da wo er zu ren/null
19 | 达斡尔族语/nz/da wo er zu yu/null
20 | 鄂温克族/nz/e wen ke zu/null
21 | 鄂温克族人/nz/e wen ke zu ren/null
22 | 鄂温克族语/nz/e wen ke zu yu/null
23 | 鄂伦春族/nz/e lun chun zu/null
24 | 鄂伦春族人/nz/e lun chun zu ren/null
25 | 鄂伦春族语/nz/e lun chun zu yu/null
26 | 回族/nz/hui zu/null
27 | 回族人/nz/hui zu ren/null
28 | 回族语/nz/hui zu yu/null
29 | 东乡族/nz/dong xiang zu/null
30 | 东乡族人/nz/dong xiang zu ren/null
31 | 东乡族语/nz/dong xiang zu yu/null
32 | 土族/nz/tu zu/null
33 | 土族人/nz/tu zu ren/null
34 | 土族语/nz/tu zu yu/null
35 | 撒拉族/nz/sa la zu/null
36 | 撒拉族人/nz/sa la zu ren/null
37 | 撒拉族语/nz/sa la zu yu/null
38 | 保安族/nz/bao an zu/null
39 | 保安族人/nz/bao an zu ren/null
40 | 保安族语/nz/bao an zu yu/null
41 | 裕固族/nz/yu gu zu/null
42 | 裕固族人/nz/yu gu zu ren/null
43 | 裕固族语/nz/yu gu zu yu/null
44 | 维吾尔族/nz/wei wu er zu/null
45 | 维吾尔族人/nz/wei wu er zu ren/null
46 | 维吾尔族语/nz/wei wu er zu yu/null
47 | 哈萨克族/nz/ha sa ke zu/null
48 | 哈萨克族人/nz/ha sa ke zu ren/null
49 | 哈萨克族语/nz/ha sa ke zu yu/null
50 | 柯尔克孜族/nz/ke er ke zi zu/null
51 | 柯尔克孜族人/nz/ke er ke zi zu ren/null
52 | 柯尔克孜族语/nz/ke er ke zi zu yu/null
53 | 锡伯族/nz/xi bo zu/null
54 | 锡伯族人/nz/xi bo zu ren/null
55 | 锡伯族语/nz/xi bo zu yu/null
56 | 塔吉克族/nz/ta ji ke zu/null
57 | 塔吉克族人/nz/ta ji ke zu ren/null
58 | 塔吉克族语/nz/ta ji ke zu yu/null
59 | 乌孜别克族/nz/wu zi bie ke zu/null
60 | 乌孜别克族人/nz/wu zi bie ke zu ren/null
61 | 乌孜别克族语/nz/wu zi bie ke zu yu/null
62 | 俄罗斯族/nz/e luo si zu/null
63 | 俄罗斯族人/nz/e luo si zu ren/null
64 | 俄罗斯族语/nz/e luo si zu yu/null
65 | 塔塔尔族/nz/ta ta er zu/null
66 | 塔塔尔族人/nz/ta ta er zu ren/null
67 | 塔塔尔族语/nz/ta ta er zu yu/null
68 | 藏族/nz/zang zu/null
69 | 藏族人/nz/zang zu ren/null
70 | 藏族语/nz/zang zu yu/null
71 | 门巴族/nz/men ba zu/null
72 | 门巴族人/nz/men ba zu ren/null
73 | 门巴族语/nz/men ba zu yu/null
74 | 珞巴族/nz/luo ba zu/null
75 | 珞巴族人/nz/luo ba zu ren/null
76 | 珞巴族语/nz/luo ba zu yu/null
77 | 羌族/nz/qiang zu/null
78 | 羌族人/nz/qiang zu ren/null
79 | 羌族语/nz/qiang zu yu/null
80 | 彝族/nz/yi zu/null
81 | 彝族人/nz/yi zu ren/null
82 | 彝族语/nz/yi zu yu/null
83 | 白族/nz/bai zu/null
84 | 白族人/nz/bai zu ren/null
85 | 白族语/nz/bai zu yu/null
86 | 哈尼族/nz/ha ni zu/null
87 | 哈尼族人/nz/ha ni zu ren/null
88 | 哈尼族语/nz/ha ni zu yu/null
89 | 傣族/nz/dai zu/null
90 | 傣族人/nz/dai zu ren/null
91 | 傣族语/nz/dai zu yu/null
92 | 僳僳族/nz/su su zu/null
93 | 僳僳族人/nz/su su zu ren/null
94 | 僳僳族语/nz/su su zu yu/null
95 | 佤族/nz/wa zu/null
96 | 佤族人/nz/wa zu ren/null
97 | 佤族语/nz/wa zu yu/null
98 | 拉祜族/nz/la hu zu/null
99 | 拉祜族人/nz/la hu zu ren/null
100 | 拉祜族语/nz/la hu zu yu/null
101 | 纳西族/nz/na xi zu/null
102 | 纳西族人/nz/na xi zu ren/null
103 | 纳西族语/nz/na xi zu yu/null
104 | 景颇族/nz/jing po zu/null
105 | 景颇族人/nz/jing po zu ren/null
106 | 景颇族语/nz/jing po zu yu/null
107 | 布朗族/nz/bu lang zu/null
108 | 布朗族人/nz/bu lang zu ren/null
109 | 布朗族语/nz/bu lang zu yu/null
110 | 阿昌族/nz/a chang zu/null
111 | 阿昌族人/nz/a chang zu ren/null
112 | 阿昌族语/nz/a chang zu yu/null
113 | 普米族/nz/pu mi zu/null
114 | 普米族人/nz/pu mi zu ren/null
115 | 普米族语/nz/pu mi zu yu/null
116 | 怒族/nz/nu zu/null
117 | 怒族人/nz/nu zu ren/null
118 | 怒族语/nz/nu zu yu/null
119 | 德昂族/nz/de ang zu/null
120 | 德昂族人/nz/de ang zu ren/null
121 | 德昂族语/nz/de ang zu yu/null
122 | 独龙族/nz/du long zu/null
123 | 独龙族人/nz/du long zu ren/null
124 | 独龙族语/nz/du long zu yu/null
125 | 基诺族/nz/ji nuo zu/null
126 | 基诺族人/nz/ji nuo zu ren/null
127 | 基诺族语/nz/ji nuo zu yu/null
128 | 苗族/nz/miao zu/null
129 | 苗族人/nz/miao zu ren/null
130 | 苗族语/nz/miao zu yu/null
131 | 布依族/nz/bu yi zu/null
132 | 布依族人/nz/bu yi zu ren/null
133 | 布依族语/nz/bu yi zu yu/null
134 | 侗族/nz/dong zu/null
135 | 侗族人/nz/dong zu ren/null
136 | 侗族语/nz/dong zu yu/null
137 | 水族/nz/shui zu/null
138 | 水族人/nz/shui zu ren/null
139 | 水族语/nz/shui zu yu/null
140 | 仡佬族/nz/ge lao zu/null
141 | 仡佬族人/nz/ge lao zu ren/null
142 | 仡佬族语/nz/ge lao zu yu/null
143 | 壮族/nz/zhuang zu/null
144 | 壮族人/nz/zhuang zu ren/null
145 | 壮族语/nz/zhuang zu yu/null
146 | 瑶族/nz/yao zu/null
147 | 瑶族人/nz/yao zu ren/null
148 | 瑶族语/nz/yao zu yu/null
149 | 仫佬族/nz/mu lao zu/null
150 | 仫佬族人/nz/mu lao zu ren/null
151 | 仫佬族语/nz/mu lao zu yu/null
152 | 毛南族/nz/mao nan zu/null
153 | 毛南族人/nz/mao nan zu ren/null
154 | 毛南族语/nz/mao nan zu yu/null
155 | 京族/nz/jing zu/null
156 | 京族人/nz/jing zu ren/null
157 | 京族语/nz/jing zu yu/null
158 | 土家族/nz/tu jia zu/null
159 | 土家族人/nz/tu jia zu ren/null
160 | 土家族语/nz/tu jia zu yu/null
161 | 黎族/nz/li zu/null
162 | 黎族人/nz/li zu ren/null
163 | 黎族语/nz/li zu yu/null
164 | 畲族/nz/she zu/null
165 | 畲族人/nz/she zu ren/null
166 | 畲族语/nz/she zu yu/null
167 | 高山族/nz/gao shan zu/null
168 | 高山族人/nz/gao shan zu ren/null
169 | 高山族语/nz/gao shan zu yu/null
170 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-company.lex:
--------------------------------------------------------------------------------
1 | CJK_WORDS
2 | 央视/nt/yang shi/null
3 | 电信/nt/dian xin/null
4 | 移动/nt/yi dong/null
5 | 网通/nt/wang tong/null
6 | 联通/nt/lian tong/null
7 | 铁通/nt/tie tong/null
8 | 百度/nt/bai du/null
9 | 环球网/nt/huan qiu wang/null
10 | 长城网/nt/chang cheng wang/null
11 | 新浪/nt/xin lang/null
12 | 腾讯/nt/teng xun/null
13 | 搜搜/nt/so so/soso
14 | 谷歌/nt/gu ge/null
15 | 雅虎/nt/ya hu/null
16 | 微软/nt/wei ruan/null
17 | 中关村/nt/zhong guan cun/null
18 | 搜狐/nt/sou hu/null
19 | 网易/nt/wang yi/null
20 | 硅谷/nt/gui gu/null
21 | 维基百科/nt/wei ji bai ke/null
22 | 巨人网络/nt/ju ren wang luo/null
23 | 阿里巴巴/nt/a li ba ba/null
24 | 阿里旺旺/nt/a li wang wang/旺旺
25 | 旺旺/n/wang wang/null
26 | 淘宝/nt/tao bao/null
27 | 赶集网/nt/gan ji wang/null
28 | 猪八戒网/nt/zhu ba jie wang/null
29 | 唯你英语/nt/wei ni ying yu/null
30 | 拉手网/nt/la shou wang/null
31 | 百贯福泰/nt/bai guan fu tai/null
32 | 汇划算/nt/hui hua suan/null
33 | 汇划算网/nt/hui hua suan wang/null
34 | 聚划算/nt/ju hua suan/null
35 | 天猫/nt/tian mao/null
36 | 天猫网/nt/tian mao wang/null
37 | 亚马逊/nt/ya ma xun/null
38 | 亚马逊网/nt/ya ma xun wang/null
39 | 拍拍/nt/pai pai/null
40 | 拍拍网/nt/pai pai wang/null
41 | 京东/nt/jing dong/null
42 | 京东商城/nt/jing dong shang cheng/null
43 | 返利网/nt/fan li wang/null
44 | 支付宝/nt/zhi fu bao/null
45 | 支付宝担保/nt/zhi fu bao dan bao/null
46 | 支付宝及时到帐/nt/zhi fu bao ji shi dao zhang/null
47 | 支付宝双工能/nt/zhi fu bao shuang gong neng/null
48 | 财付通/nt/cai fu tong/null
49 | 财付通及时到帐/nt/cai fu tong ji shi dao zhang/null
50 | 网银在线/nt/wang yin zai xian/null
51 | 苏宁易购/nt/su ning yi gou/null
52 | 苏宁电器/nt/su ning dian qi/null
53 | 仙童公司/nt/xian tong gong si/null
54 | 开源中国/nt/kai yuan zhong guo/null
55 | 畅想网络/nt/chang xiang wang luo/null
56 | 快乐大本营/nt/kuai le da ben ying/null
57 | 越策越开心/nt/yue ce yue kai xin/null
58 | 超级男声/nt/chao ji nan sheng/null
59 | 超男/nt/chao nan/null
60 | 超级女声/nt/chao ji nu sheng/超女
61 | 超女/nt/chao nu/超级女声
62 | 好声音/nt/hao sheng yin/null
63 | 快乐男声/nt/kuai le nan sheng/快男
64 | 快男/nt/kuai nan/快乐男声
65 | 快乐女声/nt/kuai le nu sheng/null
66 | 快女/nt/kuai nu/null
67 | 德克士/nt/de ke shi/null
68 | 肯德基/nt/ken de ji/null
69 | 奥利奥/nt/ao li ao/null
70 | 回头客/nt/hui tou ke/null
71 | 苏波尔/nt/su bo er/null
72 | 苏宁/nt/su ning/null
73 | 苏宁电器/nt/su ning dian qi/null
74 | 苏宁易购/nt/su ning yi gou/null
75 | 中央银行/nt/zhong yang yin hang/null
76 | 人民银行/nt/ren min yin hang/null
77 | 工商银行/nt/gong shang yin hang/null
78 | 农业银行/nt/nong ye yin hang/null
79 | 中国银行/nt/zhong guo yin hang/null
80 | 建设银行/nt/jian she yin hang/null
81 | 交通银行/nt/jiao tong yin hang/null
82 | 华夏银行/nt/hua xia yin hang/null
83 | 光大银行/nt/guang da yin hang/null
84 | 招商银行/nt/zhao shang yin hang/null
85 | 中信银行/nt/zhong xin yin hang/null
86 | 兴业银行/nt/xing ye yin hang/null
87 | 民生银行/nt/min sheng yin hang/null
88 | 深圳发展银行/nt/shen zhen fa zhan yin hang/null
89 | 广东发展银行/nt/guang dong fa zhan yin hang/null
90 | 上海浦东发展银行/nt/shang hai pu dong fa zhan yin hang/null
91 | 恒丰银行/nt/heng feng yin hang/null
92 | 农业发展银行/nt/nong ye fa zhan yin hang/null
93 | 国家进出口信贷银行/nt/guo jia jin chu kou xin dai yin hang/null
94 | 国家开发银行/nt/guo jia kai fa yin hang/null
95 | 北京商业银行/nt/bei jing shang ye yin hang/null
96 | 上海银行/nt/shang hai yin hang/null
97 | 济南商业银行/nt/ji nan shang ye yin hang/null
98 | 信用社/nt/xin yong she/null
99 | 农村信用社/nt/nong cun xin yong she/null
100 | 邮政局/nt/you zheng ju/null
101 | 邮政储蓄银行/nt/you zheng chu xu yin hang/null
102 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-dname-1.lex:
--------------------------------------------------------------------------------
1 | CN_DNAME_1
2 | #双姓名首字词库
3 | 建
4 | 小
5 | 晓
6 | 文
7 | 志
8 | 国
9 | 玉
10 | 丽
11 | 永
12 | 海
13 | 春
14 | 金
15 | 明
16 | 新
17 | 德
18 | 秀
19 | 红
20 | 亚
21 | 伟
22 | 雪
23 | 俊
24 | 桂
25 | 爱
26 | 美
27 | 世
28 | 正
29 | 庆
30 | 学
31 | 家
32 | 立
33 | 淑
34 | 振
35 | 云
36 | 华
37 | 光
38 | 惠
39 | 兴
40 | 天
41 | 长
42 | 艳
43 | 慧
44 | 利
45 | 宏
46 | 佳
47 | 瑞
48 | 凤
49 | 荣
50 | 秋
51 | 继
52 | 嘉
53 | 卫
54 | 燕
55 | 思
56 | 维
57 | 少
58 | 福
59 | 忠
60 | 宝
61 | 子
62 | 成
63 | 月
64 | 洪
65 | 东
66 | 一
67 | 泽
68 | 林
69 | 大
70 | 素
71 | 旭
72 | 宇
73 | 智
74 | 锦
75 | 冬
76 | 玲
77 | 雅
78 | 伯
79 | 翠
80 | 传
81 | 启
82 | 剑
83 | 安
84 | 树
85 | 良
86 | 中
87 | 梦
88 | 广
89 | 昌
90 | 元
91 | 万
92 | 清
93 | 静
94 | 友
95 | 宗
96 | 兆
97 | 丹
98 | 克
99 | 彩
100 | 绍
101 | 喜
102 | 远
103 | 朝
104 | 敏
105 | 培
106 | 胜
107 | 祖
108 | 先
109 | 菊
110 | 士
111 | 向
112 | 有
113 | 连
114 | 军
115 | 健
116 | 巧
117 | 耀
118 | 莉
119 | 英
120 | 方
121 | 和
122 | 仁
123 | 孝
124 | 梅
125 | 汉
126 | 兰
127 | 松
128 | 水
129 | 江
130 | 益
131 | 开
132 | 景
133 | 运
134 | 贵
135 | 祥
136 | 青
137 | 芳
138 | 碧
139 | 婷
140 | 龙
141 | 鹏
142 | 自
143 | 顺
144 | 双
145 | 书
146 | 生
147 | 义
148 | 跃
149 | 银
150 | 佩
151 | 雨
152 | 保
153 | 贤
154 | 仲
155 | 鸿
156 | 浩
157 | 加
158 | 定
159 | 炳
160 | 飞
161 | 锡
162 | 柏
163 | 发
164 | 超
165 | 道
166 | 怀
167 | 进
168 | 其
169 | 富
170 | 平
171 | 全
172 | 阳
173 | 吉
174 | 茂
175 | 彦
176 | 诗
177 | 洁
178 | 润
179 | 承
180 | 治
181 | 焕
182 | 如
183 | 君
184 | 增
185 | 善
186 | 希
187 | 根
188 | 应
189 | 勇
190 | 宜
191 | 守
192 | 会
193 | 凯
194 | 育
195 | 湘
196 | 凌
197 | 本
198 | 敬
199 | 博
200 | 延
201 | 乐
202 | 三
203 | 高
204 | 熙
205 | 逸
206 | 幸
207 | 灵
208 | 宣
209 | 才
210 | 述
211 | 化
212 | 那
213 | 紫
214 | 莎
215 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-dname-2.lex:
--------------------------------------------------------------------------------
1 | CN_DNAME_2
2 | #双姓名尾字词库
3 | 华
4 | 平
5 | 明
6 | 英
7 | 军
8 | 林
9 | 萍
10 | 芳
11 | 玲
12 | 红
13 | 生
14 | 霞
15 | 梅
16 | 文
17 | 荣
18 | 珍
19 | 兰
20 | 娟
21 | 峰
22 | 琴
23 | 云
24 | 辉
25 | 东
26 | 龙
27 | 敏
28 | 伟
29 | 强
30 | 丽
31 | 春
32 | 杰
33 | 燕
34 | 民
35 | 君
36 | 波
37 | 国
38 | 芬
39 | 清
40 | 祥
41 | 斌
42 | 婷
43 | 飞
44 | 良
45 | 忠
46 | 新
47 | 凤
48 | 锋
49 | 成
50 | 勇
51 | 刚
52 | 玉
53 | 元
54 | 宇
55 | 海
56 | 兵
57 | 安
58 | 庆
59 | 涛
60 | 鹏
61 | 亮
62 | 青
63 | 阳
64 | 艳
65 | 松
66 | 江
67 | 莲
68 | 娜
69 | 兴
70 | 光
71 | 德
72 | 武
73 | 香
74 | 俊
75 | 秀
76 | 慧
77 | 雄
78 | 才
79 | 宏
80 | 群
81 | 琼
82 | 胜
83 | 超
84 | 彬
85 | 莉
86 | 中
87 | 山
88 | 富
89 | 花
90 | 宁
91 | 利
92 | 贵
93 | 福
94 | 发
95 | 义
96 | 蓉
97 | 喜
98 | 娥
99 | 昌
100 | 仁
101 | 志
102 | 全
103 | 宝
104 | 权
105 | 美
106 | 琳
107 | 建
108 | 金
109 | 贤
110 | 星
111 | 丹
112 | 根
113 | 和
114 | 珠
115 | 康
116 | 菊
117 | 琪
118 | 坤
119 | 泉
120 | 秋
121 | 静
122 | 佳
123 | 顺
124 | 源
125 | 珊
126 | 达
127 | 欣
128 | 如
129 | 莹
130 | 章
131 | 浩
132 | 勤
133 | 芹
134 | 容
135 | 友
136 | 芝
137 | 豪
138 | 洁
139 | 鑫
140 | 惠
141 | 洪
142 | 旺
143 | 虎
144 | 远
145 | 妮
146 | 森
147 | 妹
148 | 南
149 | 雯
150 | 奇
151 | 健
152 | 卿
153 | 虹
154 | 娇
155 | 媛
156 | 怡
157 | 铭
158 | 川
159 | 进
160 | 博
161 | 智
162 | 来
163 | 琦
164 | 学
165 | 聪
166 | 洋
167 | 乐
168 | 年
169 | 翔
170 | 然
171 | 栋
172 | 凯
173 | 颖
174 | 鸣
175 | 丰
176 | 瑞
177 | 奎
178 | 立
179 | 堂
180 | 威
181 | 雪
182 | 鸿
183 | 晶
184 | 桂
185 | 凡
186 | 娣
187 | 先
188 | 洲
189 | 毅
190 | 雅
191 | 月
192 | 旭
193 | 田
194 | 晖
195 | 方
196 | 恒
197 | 亚
198 | 泽
199 | 风
200 | 银
201 | 高
202 | 贞
203 | 九
204 | 薇
205 | 钰
206 | 城
207 | 宜
208 | 厚
209 | 耐
210 | 声
211 | 腾
212 | 宸
213 | 勋
214 | 曲
215 | 轩
216 | 棋
217 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-ecmixed.lex:
--------------------------------------------------------------------------------
1 | EC_MIXED_WORD
2 | #英文中文混合字, 注意英文字符均为小写
3 | a咖/n/a ga/主角
4 | a片/n/a pian/毛片,av
5 | a座/f/a zuo/null
6 | a股/n/a gu/股票
7 | a型/n/a xing/null
8 | a杯/n/a bei/a罩杯
9 | a罩杯/n/a zhao bei/a杯
10 | a计划/n/a ji hua/null
11 | aa制/I/aa zhi/null
12 | ab型/n/ab xing/null
13 | ab档案/n/ab dang an/null
14 | a美a/n/null/null
15 | a梦/a/null/null
16 | x-射线/n/null/null
17 | #
18 | b座/f/b zuo/null
19 | b股/n/b gu/null
20 | b型/n/b xing/null
21 | b树/n/b shu/null
22 | b计划/n/b ji hua/null
23 | b超/n/b chao/null
24 | b杯/n/b bei/b罩杯
25 | b罩杯/n/b zhao bei/b杯
26 | bb机/n/bb ji/call机
27 | bb仔/n/bb zai/null
28 | bp机/n/bp ji/null
29 | b型/n/b xing/null
30 | b型肝炎/n/b xing gan yan/乙型肝炎
31 | #
32 | c盘/n/c pan/null
33 | c座/f/c zuo/null
34 | c语言/n/c yu yan/null
35 | c杯/n/c bei/c罩杯
36 | c罩杯/n/c zhao bei/c杯
37 | cd盒/n/cd he/null
38 | cd机/n/cd ji/null
39 | call机/n/call ji/bb机
40 | #
41 | d盘/n/d pan/null
42 | d座/f/d zuo/null
43 | d版/n/d ban/null
44 | d杯/n/d bei/d罩杯
45 | d罩杯/n/d zhao bei/d杯
46 | dna鉴定/n/dna jian ding/null
47 | #
48 | e盘/n/e pan/null
49 | e座/f/e zuo/null
50 | e化/n/e hua/null
51 | e通/n/e tong/null
52 | e仔/n/e zai/null
53 | e语言/n/e yu yan/易语言
54 | e杯/n/e bei/e罩杯
55 | e罩杯/n/e zhao bei/e杯
56 | #
57 | f盘/n/f pan/null
58 | f座/f/f zuo/null
59 | f杯/n/f bei/f罩杯
60 | f罩杯/b/f zhao bei/f杯
61 | #
62 | g盘/n/g pan/null
63 | g点/n/g dian/null
64 | g杯/n/g bei/g罩杯
65 | g罩杯/n/g zhao bei/g杯
66 | #
67 | h盘/n/h pan/null
68 | h股/n/h gu/null
69 | h杯/n/h bei/h罩杯
70 | h罩杯/n/h zhao bei/h杯
71 | #
72 | i盘/n/i pan/null
73 | ic卡/n/ic ka/null
74 | ip卡/n/ip ka/null
75 | ip段/n/ip duan/null
76 | ip电话/n/ip dian hua/null
77 | ip地址/n/ip di zhi/null
78 | it行业/n/it hang ye/null
79 | it民工/n/it min gong/码农
80 | it男/n/it nan/null
81 | #
82 | j盘/n/j pan/null
83 | #
84 | k仔/n/k zai/null
85 | k盘/n/k pan/null
86 | k党/n/k dang/null
87 | k书/v/k shu/看书,搞学习
88 | k粉/n/k fen/氯胺酮
89 | k歌/v/k ge/唱歌,嗨歌
90 | k他命/n/k ta ming/null
91 | k歌之王/n/k ge zhi wang/null
92 | #
93 | n年/n/n nian/很久
94 | #
95 | o型/n/o xing/null
96 | #
97 | pc机/n/pc ji/null
98 | ph值/n/ph zhi/null
99 | #
100 | sim卡/n/sim ka/null
101 | #
102 | u盘/n/u pan/null
103 | u形/n/u xing/null
104 | usb手指/n/usb shou zhi/null
105 | usb接口/n/usb jie kou/null
106 | usb插口/n/usb cha kou/null
107 | usb记忆棒/n/usb ji yi bang/null
108 | #
109 | visa卡/n/visa ka/null
110 | v沟/n/v gou/null
111 | #
112 | z盘/n/z pan/null
113 | #
114 | q版/n/q ban/null
115 | qq号/n/qq hao/null
116 | q立方/n/q li fang/null
117 | q币/n/q bi/null
118 | #
119 | rss订阅/n/rss ding yue/null
120 | #
121 | t盘/n/t pan/null
122 | #
123 | x光/n/x guang/null
124 | x光线/n/x guang xian/x射线
125 | x射线/n/x she xian/x光线
126 | γ射线/n/γ she xian/null
127 | #
128 | t恤衫/n/t xu shan/t恤
129 | t恤/n/t xu/t恤衫
130 | t字帐/n/t zi zhang/null
131 | t型台/n/t xing tai/null
132 | #
133 | 250g硬盘/n/250g ying pan/null
134 | 160g硬盘/n/160g ying pan/null
135 | 500g硬盘/n/500g ying pan/null
136 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-en-pun.lex:
--------------------------------------------------------------------------------
1 | EN_PUN_WORDS
2 | #英文和标点组合成的词,英文字母统一使用小写。
3 | c++
4 | g++
5 | c#
6 | i++
7 | x-
8 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-en.lex:
--------------------------------------------------------------------------------
1 | EN_WORD
2 | #英文词条, 做英文词语同义词追加用
3 | decimal/n/null/decimals,fraction
4 | spirit/n/null/mind
5 | admire/v/null/appreciate,like,love,enjoy
6 | chenxin12/n/null/chenxin,lionsoul
7 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-festival.lex:
--------------------------------------------------------------------------------
1 | CJK_WORDS
2 | 七七纪念日/t/qi qi ji nian ri/null
3 | 七夕/t/qi xi/七夕情人节,情人节,中国情人节
4 | 七夕情人节/t/qi xi qing ren jie/七夕,中国情人节,情人节
5 | 七夕节/t/qi xi jie/七夕,情人节,中国情人节
6 | 万圣节/t/wan sheng jie/鬼节
7 | 世界人权日/t/shi jie ren quan ri/null
8 | 世界儿歌节/t/shi jie er ge jie/null
9 | 世界儿童节/t/shi jie er tong jie/null
10 | 世界动物日/t/shi jie dong wu ri/null
11 | 世界卫生日/t/shi jie wei sheng ri/null
12 | 世界地球日/t/shi jie di qiu ri/null
13 | 世界教师日/t/shi jie jiao shi ri/null
14 | 世界无烟日/t/shi jie wu yan ri/null
15 | 世界无童工日/t/shi jie wu tong gong ri/null
16 | 世界林业节/t/shi jie lin ye jie/null
17 | 世界森林日/t/shi jie sen lin ri/null
18 | 世界水日/t/shi jie shui ri/null
19 | 世界海洋日/t/shi jie hai yang ri/null
20 | 世界湿地日/t/shi jie shi di ri/null
21 | 世界献血日/t/shi jie xian xie ri/null
22 | 世界环境日/t/shi jie huan jing ri/null
23 | 世界电视日/t/shi jie dian shi ri/null
24 | 世界睡眠日/t/shi jie shui mian ri/null
25 | 世界粮食日/t/shi jie liang shi ri/null
26 | 世界精神卫生日/t/shi jie jing shen wei sheng ri/null
27 | 世界红十字日/t/shi jie hong shi zi ri/null
28 | 世界问候日/t/shi jie wen hou ri/null
29 | 中国人民抗日战争纪念日/t/zhong guo ren min kang ri zhan zheng ji nian ri/null
30 | 抗日战争纪念日/t/kang ri zhan zheng ji nian ri/null
31 | 中国国耻日/t/zhong guo guo chi ri/null
32 | 中国学生营养日/t/zhong guo xue sheng ying yang ri/null
33 | 中国爱牙日/t/zhong guo ai ya ri/null
34 | 中国爱耳日/t/zhong guo ai er ri/null
35 | 中国青年志愿者服务日/t/zhong guo qing nian zhi yuan zhe fu wu ri/null
36 | 中国青年节/t/zhong guo qing nian jie/null
37 | 中秋/t/zhong qiu/null
38 | 中秋节/t/zhong qiu jie/null
39 | 人口日/t/ren kou ri/null
40 | 人权日/t/ren quan ri/null
41 | 儿歌节/t/er ge jie/null
42 | 儿童节/t/er tong jie/null
43 | 元宵/t/yuan xiao/null
44 | 元宵节/t/yuan xiao jie/null
45 | 元旦/t/yuan dan/null
46 | 元旦节/t/yuan dan jie/null
47 | 党生日/t/dang sheng ri/null
48 | 全国中小学生安全教育日/t/quan guo zhong xiao xue sheng an quan jiao yu ri/null
49 | 全国助残日/t/quan guo zhu can ri/null
50 | 全国爱眼日/t/quan guo ai yan ri/null
51 | 全国爱耳日/t/quan guo ai er ri/null
52 | 六十亿人口日/t/liu shi yi ren kou ri/null
53 | 六四纪念日/t/liu si ji nian ri/null
54 | 冬至/t/dong zhi/null
55 | 减轻自然灾害日/t/jian qing zi ran zai hai ri/null
56 | 动物日/t/dong wu ri/null
57 | 助残日/t/zhu can ri/null
58 | 劳动妇女节/t/lao dong fu nu: jie/null
59 | 劳动节/t/lao dong jie/null
60 | 博物馆日/t/bo wu guan ri/null
61 | 卫生日/t/wei sheng ri/null
62 | 和平日/t/he ping ri/null
63 | 国庆/t/guo qing/null
64 | 国庆节/t/guo qing jie/null
65 | 国耻日/t/guo chi ri/null
66 | 国际儿童节/t/guo ji er tong jie/null
67 | 国际减轻自然灾害日/t/guo ji jian qing zi ran zai hai ri/null
68 | 国际劳动妇女节/t/guo ji lao dong fu nu: jie/null
69 | 国际劳动节/t/guo ji lao dong jie/null
70 | 国际博物馆日/t/guo ji bo wu guan ri/null
71 | 国际和平日/t/guo ji he ping ri/null
72 | 国际奥林匹克日/t/guo ji ao lin pi ke ri/null
73 | 国际妇女节/t/guo ji fu nu: jie/null
74 | 国际容忍日/t/guo ji rong ren ri/null
75 | 国际左撇子日/t/guo ji zuo pie zi ri/null
76 | 国际志愿者日/t/guo ji zhi yuan zhe ri/null
77 | 国际护士节/t/guo ji hu shi jie/null
78 | 国际无车日/t/guo ji wu che ri/null
79 | 国际残疾人日/t/guo ji can ji ren ri/null
80 | 国际母语日/t/guo ji mu yu ri/null
81 | 国际气象节/t/guo ji qi xiang jie/null
82 | 国际消费者权益日/t/guo ji xiao fei zhe quan yi ri/null
83 | 国际牛奶日/t/guo ji niu nai ri/null
84 | 国际盲人节/t/guo ji mang ren jie/null
85 | 国际禁毒日/t/guo ji jin du ri/null
86 | 国际老人日/t/guo ji lao ren ri/null
87 | 国际臭氧层保护日/t/guo ji chou yang ceng bao hu ri/null
88 | 国际非洲儿童日/t/guo ji fei zhou er tong ri/null
89 | 国际音乐日/t/guo ji yin yue ri/null
90 | 国际麻风日/t/guo ji ma feng ri/null
91 | 圣诞节/t/sheng dan jie/null
92 | 地球日/t/di qiu ri/null
93 | 处暑/t/chu shu/null
94 | 复活节/t/fu huo jie/null
95 | 夏至/t/xia zhi/null
96 | 大寒/t/da han/null
97 | 大暑/t/da shu/null
98 | 大雪/t/da xue/null
99 | 奥林匹克日/t/ao lin pi ke ri/null
100 | 妇女节/t/fu nv jie/null
101 | 三八节/t/san ba jie/null
102 | 三八妇女节/t/san ba fu nu: jie/null
103 | 学生营养日/t/xue sheng ying yang ri/null
104 | 安全教育日/t/an quan jiao yu ri/null
105 | 安全日/t/an quan ri/null
106 | 容忍日/t/rong ren ri/null
107 | 寒露/t/han lu/null
108 | 小寒/t/xiao han/null
109 | 小年/t/xiao nian/null
110 | 小暑/t/xiao shu/null
111 | 小满/t/xiao man/null
112 | 小雪/t/xiao xue/null
113 | 左撇子日/t/zuo pie zi ri/null
114 | 平安夜/t/ping an ye/null
115 | 建党日/t/jian dang ri/null
116 | 建军节/t/jian jun jie/null
117 | 志愿人员日/t/zhi yuan ren yuan ri/null
118 | 志愿者日/t/zhi yuan zhe ri/null
119 | 情人节/t/qing ren jie/null
120 | 惊蛰/t/jing zhe/null
121 | 愚人节/t/yu ren jie/null
122 | 感恩节/t/gan en jie/null
123 | 扫房日/t/sao fang ri/null
124 | 抗日战争纪念日/t/kang ri zhan zheng ji nian ri/null
125 | 抗日纪念日/t/kang ri ji nian ri/null
126 | 护士节/t/hu shi jie/null
127 | 教师日/t/jiao shi ri/null
128 | 教师节/t/jiao shi jie/null
129 | 文化遗产日/t/wen hua yi chan ri/null
130 | 无烟日/t/wu yan ri/null
131 | 无童工日/t/wu tong gong ri/null
132 | 无车日/t/wu che ri/null
133 | 春分/t/chun fen/null
134 | 春节/t/chun jie/null
135 | 植树节/t/zhi shu jie/null
136 | 残疾人日/t/can ji ren ri/null
137 | 母亲节/t/mu qin jie/null
138 | 母语日/t/mu yu ri/null
139 | 气象节/t/qi xiang jie/null
140 | 水日/t/shui ri/null
141 | 海洋日/t/hai yang ri/null
142 | 消费者权益日/t/xiao fei zhe quan yi ri/null
143 | 清明/t/qing ming/null
144 | 清明节/t/qing ming jie/null
145 | 湿地日/t/shi di ri/null
146 | 爱牙日/t/ai ya ri/null
147 | 爱眼日/t/ai yan ri/null
148 | 爱耳日/t/ai er ri/null
149 | 父亲节/t/fu qin jie/null
150 | 牛奶日/t/niu nai ri/null
151 | 独立日/t/du li ri/null
152 | 献血日/t/xian xie ri/null
153 | 环境日/t/huan jing ri/null
154 | 电视日/t/dian shi ri/null
155 | 白露/t/bai lu/null
156 | 盲人节/t/mang ren jie/null
157 | 睡眠日/t/shui mian ri/null
158 | 秋分/t/qiu fen/null
159 | 立冬/t/li dong/null
160 | 立夏/t/li xia/null
161 | 立春/t/li chun/null
162 | 立秋/t/li qiu/null
163 | 端午节/t/duan wu jie/null
164 | 粮食日/t/liang shi ri/null
165 | 精神卫生日/t/jing shen wei sheng ri/null
166 | 红十字日/t/hong shi zi ri/null
167 | 老人日/t/lao ren ri/null
168 | 联合国日/t/lian he guo ri/null
169 | 腊八节/t/la ba jie/null
170 | 腊日/t/la ri/null
171 | 臭氧保护日/t/chou yang bao hu ri/null
172 | 臭氧层保护日/t/chou yang ceng bao hu ri/null
173 | 芒种/t/mang zhong/null
174 | 营养日/t/ying yang ri/null
175 | 谷雨/t/gu yu/null
176 | 重阳/t/chong yang/null
177 | 重阳节/t/chong yang jie/null
178 | 问候日/t/wen hou ri/null
179 | 除夕/t/chu xi/null
180 | 雨水/t/yu shui/null
181 | 霜降/t/shuang jiang/null
182 | 青年志愿者服务日/t/qing nian zhi yuan zhe fu wu ri/null
183 | 青年节/t/qing nian jie/null
184 | 非洲儿童日/t/fei zhou er tong ri/null
185 | 音乐日/t/yin yue ri/null
186 | 麻风日/t/ma feng ri/null
187 | 龙头节/t/long tou jie/null
188 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-fname.lex:
--------------------------------------------------------------------------------
1 | CJK_WORDS
2 | #西方姓氏词库
3 | 亚历山大/nr/ya li shan da/null
4 | 克林顿/nr/ke lin dun/null
5 | 克里斯汀/nr/ke li si ting/null
6 | 布什/nr/bu shi/null
7 | 布莱尔/nr/bu lai er/null
8 | 科特勒/nr/ke te le/null
9 | 约翰/nr/yue han/null
10 | 约翰逊/nr/yue han xun/null
11 | 蒂娜/nr/di na/null
12 | 安妮/nr/an ni/null
13 | 咪咪/nr/mi mi/null
14 | 妮可/nr/ni ke/null
15 | 凯蒂/nr/kai di/null
16 | #外人翻译名字#
17 | 阿汤哥/nr/a tang ge/汤姆·克鲁斯
18 | 汤姆·克鲁斯/nr/tang mu ke lu si/阿汤哥
19 | 咪咪·罗杰斯/nr/mi mi luo jie si/null
20 | 妮可·基德曼/nr/ni ke ji de man/null
21 | 凯蒂·赫尔墨斯/nr/kai di he er mo si/null
22 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-food.lex:
--------------------------------------------------------------------------------
1 | CJK_WORDS
2 | 雪碧/n/xue bi/null
3 | 可口可乐/n/ke kou ke le/null
4 | 冰红茶/n/bing hong cha/null
5 | 奶茶/n/nai cha/null
6 | 花生奶/n/hua sheng nai/null
7 | 芬达/n/fen da/null
8 | 珍珠奶茶/n/zhen zhu nai cha/null
9 | 达利源/n/da li yuan/null
10 | 肯德鸡/n/ken de ji/null
11 | 炸薯条/n/zha shu tiao/null
12 | 麻辣烫/n/ma la tang/null
13 | 麻辣干锅/n/ma la gan guo/null
14 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-lang.lex:
--------------------------------------------------------------------------------
1 | CJK_WORDS
2 | 中文/n/zhong wen/国语
3 | 国语/n/guo yu/null
4 | 台湾话/n/tai wan hua/台语
5 | 台语/n/tai yu/台湾话
6 | 客家话/n/ke jia hua/null
7 | 汉字/n/han zi/null
8 | 汉语/n/han yu/国语,中文
9 | 法文/n/fa wen/法语
10 | 法语/n/fa yu/法文
11 | 福建话/n/fu jian hua/null
12 | 粤语/n/yue yu/广东话
13 | 美语/n/mei yu/英语,英文
14 | 英文/n/ying wen/英语
15 | 英语/n/ying yu/英文
16 | 西班牙语/n/xi ban ya yu/null
17 | 闽南语/n/min nan yu/null
18 | 泰语/n/tai yu/null
19 | 西班牙语/n/xi ban ya yu/null
20 | 俄罗斯语/n/e luo si yu/null
21 | 拉丁语/n/la ding yu/null
22 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-ln-adorn.lex:
--------------------------------------------------------------------------------
1 | CN_LNAME_ADORN
2 | #姓氏修饰,例如:老陈,小陈,中的老,小
3 | #如果他已经是姓氏(lex-lname.lex中的词),则无须放在这里。
4 | 老
5 | 小
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-lname.lex:
--------------------------------------------------------------------------------
1 | CN_LNAME
2 | #中文姓氏词库
3 | #单姓
4 | 王
5 | 李
6 | 张
7 | 刘
8 | 陈
9 | 杨
10 | 周
11 | 黄
12 | 孙
13 | 吴
14 | 徐
15 | 赵
16 | 林
17 | 胡
18 | 朱
19 | 梁
20 | 郭
21 | 高
22 | 何
23 | 马
24 | 郑
25 | 罗
26 | 宋
27 | 唐
28 | 谢
29 | 叶
30 | 韩
31 | 任
32 | 潘
33 | 于
34 | 冯
35 | 蒋
36 | 董
37 | 吕
38 | 邓
39 | 许
40 | 曹
41 | 曾
42 | 袁
43 | 汪
44 | 程
45 | 田
46 | 彭
47 | 钟
48 | 蔡
49 | 魏
50 | 沈
51 | 方
52 | 卢
53 | 余
54 | 杜
55 | 丁
56 | 苏
57 | 贾
58 | 姚
59 | 姜
60 | 陆
61 | 戴
62 | 傅
63 | 夏
64 | 廖
65 | 萧
66 | 石
67 | 江
68 | 范
69 | 今
70 | 谭
71 | 邹
72 | 崔
73 | 薛
74 | 邱
75 | 康
76 | 史
77 | 侯
78 | 邵
79 | 熊
80 | 秦
81 | 雷
82 | 孟
83 | 庞
84 | 白
85 | 毛
86 | 郝
87 | 钱
88 | 段
89 | 俞
90 | 洪
91 | 汤
92 | 顾
93 | 贺
94 | 龚
95 | 尹
96 | 万
97 | 龙
98 | 赖
99 | 章
100 | 孔
101 | 武
102 | 邢
103 | 颜
104 | 梅
105 | 阮
106 | 黎
107 | 常
108 | 倪
109 | 施
110 | 乔
111 | 樊
112 | 严
113 | 齐
114 | 陶
115 | #向
116 | 温
117 | 文
118 | 易
119 | 兰
120 | 闫
121 | 芦
122 | 牛
123 | 尚
124 | 安
125 | 管
126 | 殷
127 | 霍
128 | 翟
129 | 佘
130 | 葛
131 | 庄
132 | 伍
133 | 辛
134 | 练
135 | 申
136 | 付
137 | 曲
138 | 焦
139 | 项
140 | 代
141 | 鲁
142 | 季
143 | 覃
144 | 覃
145 | 毕
146 | 麦
147 | 阳
148 | 耿
149 | 舒
150 | 聂
151 | 盛
152 | 童
153 | 祝
154 | 柳
155 | 单
156 | 单
157 | 岳
158 | 骆
159 | 纪
160 | 欧
161 | 房
162 | 左
163 | 尤
164 | 凌
165 | 韦
166 | 景
167 | 詹
168 | 莫
169 | 郎
170 | 路
171 | 宁
172 | 宁
173 | 关
174 | 丛
175 | 翁
176 | 容
177 | 亢
178 | 柯
179 | 鲍
180 | 蒲
181 | 苗
182 | 牟
183 | 谷
184 | 裴
185 | 商
186 | 初
187 | 屈
188 | 成
189 | 包
190 | 游
191 | 司
192 | 祁
193 | 强
194 | 靳
195 | 甘
196 | 席
197 | 瞿
198 | 卜
199 | 褚
200 | 解
201 | 臧
202 | 时
203 | 费
204 | 班
205 | 华
206 | 全
207 | 涂
208 | 卓
209 | 党
210 | 饶
211 | 应
212 | 卫
213 | 丘
214 | 隋
215 | 米
216 | 闵
217 | 畅
218 | 喻
219 | 冉
220 | 宫
221 | 甄
222 | 宣
223 | 穆
224 | 谈
225 | 匡
226 | 帅
227 | 车
228 | 母
229 | 查
230 | 戚
231 | 符
232 | 缪
233 | 昌
234 | 娄
235 | 滕
236 | 位
237 | 奚
238 | 边
239 | 卞
240 | 桂
241 | 邝
242 | 苟
243 | 柏
244 | 井
245 | 冀
246 | 邬
247 | 吉
248 | 敖
249 | 桑
250 | 池
251 | 简
252 | 蔺
253 | 连
254 | 艾
255 | 蓝
256 | 窦
257 | 刚
258 | 封
259 | 占
260 | 迟
261 | 姬
262 | 刁
263 | 栾
264 | 冷
265 | 杭
266 | 植
267 | 郁
268 | 晋
269 | 虞
270 | 佟
271 | 苑
272 | 屠
273 | 藏
274 | 蒙
275 | 占
276 | 辜
277 | 廉
278 | 巩
279 | 麻
280 | 晏
281 | 相
282 | 师
283 | 鄢
284 | 泮
285 | 燕
286 | 岑
287 | 官
288 | 仲
289 | 羊
290 | 揭
291 | 仇
292 | 邸
293 | 宗
294 | 荆
295 | 盖
296 | 盖
297 | 粱
298 | 原
299 | 茅
300 | 荣
301 | 沙
302 | 郜
303 | 巫
304 | 鞠
305 | 罡
306 | 未
307 | 来
308 | 劳
309 | 诸
310 | 计
311 | 乐
312 | 乐
313 | 双
314 | 花
315 | 冼
316 | 尉
317 | 木
318 | 丰
319 | 寇
320 | 栗
321 | 况
322 | 干
323 | 楼
324 | 满
325 | 桑
326 | 湛
327 | 谌
328 | 储
329 | 邦
330 | 皮
331 | 楚
332 | 胥
333 | 明
334 | 平
335 | 腾
336 | 厉
337 | 仉
338 | 励
339 | 竺
340 | 闻
341 | 宇
342 | 支
343 | 都
344 | 折
345 | 旷
346 | 南
347 | 战
348 | 嵇
349 | 化
350 | 糜
351 | 衣
352 | 国
353 | 逄
354 | 门
355 | 崇
356 | 裘
357 | 薄
358 | 束
359 | 宿
360 | 东
361 | 降
362 | 逯
363 | 伊
364 | 修
365 | 粟
366 | 漆
367 | 阙
368 | 禹
369 | 先
370 | 银
371 | 台
372 | #和
373 | 祖
374 | 惠
375 | 伦
376 | 候
377 | 阚
378 | 慕
379 | 戈
380 | 富
381 | 伏
382 | 僧
383 | 习
384 | 云
385 | 元
386 | 狄
387 | 危
388 | 雍
389 | 蔚
390 | 索
391 | 居
392 | 浦
393 | 权
394 | 税
395 | 谯
396 | 於
397 | 芮
398 | 濮
399 | 基
400 | 寿
401 | 凡
402 | 卿
403 | 酆
404 | 苻
405 | 保
406 | 郗
407 | 渠
408 | 琚
409 | 淡
410 | 由
411 | 豆
412 | 扈
413 | 仁
414 | 呼
415 | 矫
416 | 巢
417 | 盘
418 | 敬
419 | 巴
420 | 茆
421 | 鱼
422 | 戎
423 | 缠
424 | 区
425 | 幸
426 | 海
427 | 弓
428 | 阴
429 | 住
430 | 晁
431 | 菅
432 | 印
433 | 汝
434 | 历
435 | 么
436 | 乌
437 | 贡
438 | 妙
439 | 禤
440 | 荀
441 | 鹿
442 | 邰
443 | 随
444 | 雒
445 | 贝
446 | 录
447 | 鲜
448 | 茹
449 | 种
450 | 农
451 | 佐
452 | 赫
453 | 字
454 | 油
455 | #但
456 | 綦
457 | 美
458 | 利
459 | 钮
460 | 信
461 | 勾
462 | 火
463 | 昝
464 | 圣
465 | 颉
466 | 从
467 | 靖
468 | 开
469 | 公
470 | 那
471 | 山
472 | 智
473 | 补
474 | 虎
475 | 才
476 | 布
477 | 亓
478 | 药
479 | 造
480 | 普
481 | 五
482 | 仝
483 | 扆
484 | 暴
485 | 咸
486 | 庚
487 | 奕
488 | 锺
489 | 问
490 | 招
491 | 贵
492 | 巨
493 | 檀
494 | 厚
495 | 恽
496 | 过
497 | 达
498 | 邴
499 | 洛
500 | 忻
501 | 展
502 | 户
503 | 毋
504 | 暨
505 | 金
506 | #复姓
507 | 欧阳
508 | 上官
509 | 司徒
510 | 刘付
511 | 皇甫
512 | 长孙
513 | 相里
514 | 令狐
515 | 诸葛
516 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-nation.lex:
--------------------------------------------------------------------------------
1 | CJK_WORDS
2 | 东非/ns/dong fei/null
3 | 中华/ns/zhong hua/null
4 | 中华/ns/zhong hua/null
5 | 中华人民共和国/ns/zhong hua ren min gong he guo/null
6 | 中华民国/ns/zhong hua min guo/null
7 | 中国/ns/zhong guo/null
8 | 中國/nz/zhong guo/null
9 | 中非/ns/zhong fei/null
10 | 乌克兰/ns/wu ke lan/null
11 | 也门/ns/ye men/null
12 | 以色列/ns/yi se lie/null
13 | 伊拉克/ns/yi la ke/null
14 | 伊朗/ns/yi lang/null
15 | 俄罗斯/ns/e luo si/null
16 | 分类/ns/fen lei/null
17 | 加拿大/ns/jia na da/null
18 | 南非/ns/nan fei/null
19 | 古巴/ns/gu ba/null
20 | 台湾/ns/tai wan/null
21 | 埃及/ns/ai ji/null
22 | 塞尔维亚/ns/sai er wei ya/null
23 | 墨西哥/ns/mo xi ge/null
24 | 威尔士/ns/wei er shi/null
25 | 尼日利亚/ns/ni ri li ya/null
26 | 巴比伦/ns/ba bi lun/null
27 | 希腊/ns/xi la/null
28 | 德国/ns/de guo/null
29 | 德意志/ns/de yi zhi/null
30 | 意大利/ns/yi da li/null
31 | 捷克/ns/jie ke/null
32 | 日本/ns/ri ben/null
33 | 朝鲜/ns/chao xian/null
34 | 比利时/ns/bi li shi/null
35 | 法兰西/ns/fa lan xi/null
36 | 法国/ns/fa guo/null
37 | 波兰/ns/bo lan/null
38 | 波黑/ns/bo hei/null
39 | 瑞典/ns/rui dian/null
40 | 瑞士/ns/rui shi/null
41 | 白俄罗斯/ns/bai e luo si/null
42 | 缅甸/ns/mian dian/null
43 | 美利坚/ns/mei li jian/null
44 | 美利坚合众国/ns/mei li jian he zhong guo/null
45 | 美国/ns/mei guo/null
46 | 老挝/ns/lao wo/null
47 | 苏格兰/ns/su ge lan/null
48 | 苏联/ns/su lian/null
49 | 英国/ns/ying guo/null
50 | 英格兰/ns/ying ge lan/null
51 | 葡萄牙/ns/pu tao ya/null
52 | 蒙古/ns/meng gu/null
53 | 西班牙/ns/xi ban ya/null
54 | 越南/ns/yue nan/null
55 | 韩国/ns/han guo/null
56 |
57 | #added at 2015-10-23
58 | 中国/ns/zhong guo/null
59 | 蒙古/ns/meng gu/null
60 | 朝鲜/ns/chao xian/null
61 | 韩国/ns/han guo/null
62 | 日本/ns/ri ben/null
63 | 菲律宾/ns/fei lv bin/null
64 | 越南/ns/yue nan/null
65 | 老挝/ns/lao wo/null
66 | 柬埔寨/ns/jian pu zhai/null
67 | 缅甸/ns/mian dian/null
68 | 泰国/ns/tai guo/null
69 | 马来西亚/ns/ma lai xi ya/null
70 | 文莱/ns/wen lai/null
71 | 新加坡/ns/xin jia po/null
72 | 印度尼西亚/ns/yin du ni xi ya/null
73 | 东帝汶/ns/dong di wen/null
74 | 尼泊尔/ns/ni bo er/null
75 | 不丹/ns/bu dan/null
76 | 孟加拉国/ns/meng jia la guo/null
77 | 印度/ns/yin du/null
78 | 巴基斯坦/ns/ba ji si tan/null
79 | 斯里兰卡/ns/si li lan ka/null
80 | 马尔代夫/ns/ma er dai fu/null
81 | 哈萨克斯坦/ns/ha sa ke si tan/null
82 | 吉尔吉斯斯坦/ns/ji er ji si si tan/null
83 | 塔吉克斯坦/ns/ta ji ke si tan/null
84 | 乌兹别克斯坦/ns/wu zi bie ke si tan/null
85 | 土库曼斯坦/ns/tu ku man si tan/null
86 | 阿富汗/ns/a fu han/null
87 | 伊拉克/ns/yi la ke/null
88 | 伊朗/ns/yi lang/null
89 | 叙利亚/ns/xu li ya/null
90 | 约旦/ns/yue dan/null
91 | 黎巴嫩/ns/li ba nen/null
92 | 以色列/ns/yi se lie/null
93 | 巴勒斯坦/ns/ba le si tan/null
94 | 沙特阿拉伯/ns/sha te a la bo/null
95 | 巴林/ns/ba lin/null
96 | 卡塔尔/ns/ka ta er/null
97 | 科威特/ns/ke wei te/null
98 | 阿拉伯联合酋长国/ns/a la bo lian he qiu zhang guo/null
99 | 阿曼/ns/a man/null
100 | 也门/ns/ye men/null
101 | 格鲁吉亚/ns/ge lu ji ya/null
102 | 亚美尼亚/ns/ya mei ni ya/null
103 | 阿塞拜疆/ns/a sai bai jiang/null
104 | 土耳其/ns/tu er qi/null
105 | 塞浦路斯/ns/sai pu lu si/null
106 | 芬兰/ns/fen lan/null
107 | 瑞典/ns/rui dian/null
108 | 挪威/ns/nuo wei/null
109 | 冰岛/ns/bing dao/null
110 | 丹麦 法罗群岛/ns/dan mai fa luo qun dao/null
111 | 爱沙尼亚/ns/ai sha ni ya/null
112 | 拉脱维亚/ns/la tuo wei ya/null
113 | 立陶宛/ns/li tao wan/null
114 | 白俄罗斯/ns/bai e luo si/null
115 | 俄罗斯/ns/e luo si/null
116 | 乌克兰/ns/wu ke lan/null
117 | 摩尔多瓦/ns/mo er duo wa/null
118 | 波兰/ns/bo lan/null
119 | 捷克/ns/jie ke/null
120 | 斯洛伐克/ns/si luo fa ke/null
121 | 匈牙利/ns/xiong ya li/null
122 | 德国/ns/de guo/null
123 | 奥地利/ns/ao di li/null
124 | 瑞士/ns/rui shi/null
125 | 列支敦士登/ns/lie zhi dun shi deng/null
126 | 英国/ns/ying guo/null
127 | 爱尔兰/ns/ai er lan/null
128 | 荷兰/ns/he lan/null
129 | 比利时/ns/bi li shi/null
130 | 卢森堡/ns/lu sen bao/null
131 | 法国/ns/fa guo/null
132 | 摩纳哥/ns/mo na ge/null
133 | 罗马尼亚/ns/luo ma ni ya/null
134 | 保加利亚/ns/bao jia li ya/null
135 | 塞尔维亚/ns/sai er wei ya/null
136 | 马其顿/ns/ma qi dun/null
137 | 阿尔巴尼亚/ns/a er ba ni ya/null
138 | 希腊/ns/xi la/null
139 | 斯洛文尼亚/ns/si luo wen ni ya/null
140 | 克罗地亚/ns/ke luo di ya/null
141 | 波斯尼亚和墨塞哥维那/ns/bo si ni ya he mo sai ge wei na/null
142 | 意大利/ns/yi da li/null
143 | 梵蒂冈/ns/fan di gang/null
144 | 圣马力诺/ns/sheng ma li nuo/null
145 | 马耳他/ns/ma er ta/null
146 | 西班牙/ns/xi ban ya/null
147 | 葡萄牙/ns/pu tao ya/null
148 | 安道尔/ns/an dao er/null
149 | 埃及/ns/ai ji/null
150 | 利比亚/ns/li bi ya/null
151 | 苏丹/ns/su dan/null
152 | 突尼斯/ns/tu ni si/null
153 | 阿尔及利亚/ns/a er ji li ya/null
154 | 摩洛哥/ns/mo luo ge/null
155 | 亚速尔群岛/ns/ya su er qun dao/null
156 | 马德拉群岛/ns/ma de la qun dao/null
157 | 埃塞俄比亚/ns/ai sai e bi ya/null
158 | 厄立特里亚/ns/e li te li ya/null
159 | 索马里/ns/suo ma li/null
160 | 吉布提/ns/ji bu ti/null
161 | 肯尼亚/ns/ken ni ya/null
162 | 坦桑尼亚/ns/tan sang ni ya/null
163 | 乌干达/ns/wu gan da/null
164 | 卢旺达/ns/lu wang da/null
165 | 布隆迪/ns/bu long di/null
166 | 塞舌尔 刚果/ns/sai she er gang guo/null
167 | 圣多美及普林西比/ns/sheng duo mei ji pu lin xi bi/null
168 | 塞内加尔/ns/sai nei jia er/null
169 | 冈比亚/ns/gang bi ya/null
170 | 马里/ns/ma li/null
171 | 布基纳法索/ns/bu ji na fa suo/null
172 | 几内亚/ns/ji nei ya/null
173 | 几内亚比绍/ns/ji nei ya bi shao/null
174 | 佛得角/ns/fo de jiao/null
175 | 塞拉利昂/ns/sai la li ang/null
176 | 利比里亚/ns/li bi li ya/null
177 | 科特迪瓦/ns/ke te di wa/null
178 | 加纳/ns/jia na/null
179 | 多哥/ns/duo ge/null
180 | 贝宁/ns/bei ning/null
181 | 尼日尔/ns/ni ri er/null
182 | 加那利群岛/ns/jia na li qun dao/null
183 | 赞比亚/ns/zan bi ya/null
184 | 安哥拉/ns/an ge la/null
185 | 津巴布韦/ns/jin ba bu wei/null
186 | 马拉维/ns/ma la wei/null
187 | 莫桑比克/ns/mo sang bi ke/null
188 | 博茨瓦纳/ns/bo ci wa na/null
189 | 纳米比亚/ns/na mi bi ya/null
190 | 南非/ns/nan fei/null
191 | 斯威士兰/ns/si wei shi lan/null
192 | 莱索托/ns/lai suo tuo/null
193 | 马达加斯加/ns/ma da jia si jia/null
194 | 科摩罗/ns/ke mo luo/null
195 | 毛里求斯/ns/mao li qiu si/null
196 | 留尼旺/ns/liu ni wang/null
197 | 圣赫勒拿/ns/sheng he le na/null
198 | 澳大利亚/ns/ao da li ya/null
199 | 新西兰/ns/xin xi lan/null
200 | 巴布亚新几内亚/ns/ba bu ya xin ji nei ya/null
201 | 所罗门群岛/ns/suo luo men qun dao/null
202 | 瓦努阿图/ns/wa nu a tu/null
203 | 密克罗尼西亚/ns/mi ke luo ni xi ya/null
204 | 马绍尔群岛/ns/ma shao er qun dao/null
205 | 帕劳/ns/pa lao/null
206 | 瑙鲁/ns/nao lu/null
207 | 基里巴斯/ns/ji li ba si/null
208 | 图瓦卢/ns/tu wa lu/null
209 | 萨摩亚/ns/sa mo ya/null
210 | 斐济群岛/ns/fei ji qun dao/null
211 | 汤加/ns/tang jia/null
212 | 库克群岛/ns/ku ke qun dao/null
213 | 关岛/ns/guan dao/null
214 | 新喀里多尼亚/ns/xin ka li duo ni ya/null
215 | 法属波利尼西亚/ns/fa shu bo li ni xi ya/null
216 | 皮特凯恩岛/ns/pi te kai en dao/null
217 | 瓦利斯与富图纳/ns/wa li si yu fu tu na/null
218 | 纽埃/ns/niu ai/null
219 | 托克劳/ns/tuo ke lao/null
220 | 美属萨摩亚/ns/mei shu sa mo ya/null
221 | 北马里亚纳/ns/bei ma li ya na/null
222 | 加拿大/ns/jia na da/null
223 | 美国/ns/mei guo/null
224 | 墨西哥/ns/mo xi ge/null
225 | 格陵兰/ns/ge ling lan/null
226 | 危地马拉/ns/wei di ma la/null
227 | 伯利兹/ns/bo li zi/null
228 | 萨尔瓦多/ns/sa er wa duo/null
229 | 洪都拉斯/ns/hong du la si/null
230 | 尼加拉瓜/ns/ni jia la gua/null
231 | 哥斯达黎加/ns/ge si da li jia/null
232 | 巴拿马/ns/ba na ma/null
233 | 巴哈马/ns/ba ha ma/null
234 | 古巴/ns/gu ba/null
235 | 牙买加/ns/ya mai jia/null
236 | 海地/ns/hai di/null
237 | 多米尼加共和国/ns/duo mi ni jia gong he guo/null
238 | 安提瓜和巴布达/ns/an ti gua he ba bu da/null
239 | 圣基茨和尼维斯/ns/sheng ji ci he ni wei si/null
240 | 多米尼克/ns/duo mi ni ke/null
241 | 圣卢西亚/ns/sheng lu xi ya/null
242 | 圣文森特和格林纳丁斯/ns/sheng wen sen te he ge lin na ding si/null
243 | 格林纳达/ns/ge lin na da/null
244 | 巴巴多斯/ns/ba ba duo si/null
245 | 特立尼达和多巴哥/ns/te li ni da he duo ba ge/null
246 | 波多黎各/ns/bo duo li ge/null
247 | 英属维尔京群岛/ns/ying shu wei er jing qun dao/null
248 | 美属维尔京群岛/ns/mei shu wei er jing qun dao/null
249 | 安圭拉/ns/an gui la/null
250 | 蒙特塞拉特/ns/meng te sai la te/null
251 | 瓜德罗普/ns/gua de luo pu/null
252 | 马提尼克/ns/ma ti ni ke/null
253 | 荷属安的列斯/ns/he shu an di lie si/null
254 | 阿鲁巴/ns/a lu ba/null
255 | 特克斯和凯科斯群岛/ns/te ke si he kai ke si qun dao/null
256 | 开曼群岛/ns/kai man qun dao/null
257 | 百慕大/ns/bai mu da/null
258 | 哥伦比亚/ns/ge lun bi ya/null
259 | 委内瑞拉/ns/wei nei rui la/null
260 | 圭亚那/ns/gui ya na/null
261 | 法属圭亚那/ns/fa shu gui ya na/null
262 | 苏里南/ns/su li nan/null
263 | 厄瓜多尔/ns/e gua duo er/null
264 | 秘鲁/ns/bi lu/null
265 | 玻利维亚/ns/bo li wei ya/null
266 | 巴西/ns/ba xi/null
267 | 智利/ns/zhi li/null
268 | 阿根廷/ns/a gen ting/null
269 | 乌拉圭/ns/wu la gui/null
270 | 巴拉圭/ns/ba la gui/null
271 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-net.lex:
--------------------------------------------------------------------------------
1 | CJK_WORDS
2 | 油条哥/n/you tiao ge/null
3 | 活雷锋/n/huo lei feng/null
4 | 夕阳红/n/xi yang hong/null
5 | 帮扶村/n/bang fu cun/null
6 | 后援会/n/hou yuan hui/null
7 | 复炸油/n/fu zha you/null
8 | 献血哥/n/xian xie ge/null
9 | 放心姐/n/fang xin jie/null
10 | 啃老族/n/ken lao zu/null
11 | 特训班/n/te xun ban/null
12 | 平头男/n/ping tou nan/null
13 | 爆头哥/n/bao tou ge/null
14 | 楼主/n/lou zhu/null
15 | 有两把刷子/a/you liang ba shua zi/null
16 | 非典/n/fei dian/null
17 | 微信/n/wei xin/null
18 | 微博/n/wei bo/null
19 | 吊丝/n/diao si/null
20 | 高富帅/n/gao fu shuai/null
21 | 矮穷挫/n/ai qiong cuo/null
22 | 白富美/n/bai fu mei/null
23 | 狮子的魂/nz/shi zi de hun/null
24 | 仓老师/nz/cang lao shi/仓井空
25 | 菇凉/n/gu liang/null
26 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-org.lex:
--------------------------------------------------------------------------------
1 | CJK_WORDS
2 | 上海合作组织/nt/shang hai he zuo zu zhi/null
3 | 世卫/nt/shi wei/null
4 | 世界卫生组织/nt/shi jie wei sheng zu zhi/null
5 | 世界银行/nt/shi jie yin hang/null
6 | 东盟/nt/dong meng/null
7 | 亚太经合组织/nt/ya tai jing he zu zhi/null
8 | 人权理事会/nt/ren quan li shi hui/null
9 | 六方会谈/nt/liu fang hui tan/null
10 | 北约/nt/bei yue/null
11 | 哈马斯/nt/ha ma si/null
12 | 安全理事会/nt/an quan li shi hui/null
13 | 安理会/nt/an li hui/null
14 | 欧佩克/nt/ou pei ke/null
15 | 红十字会/nt/hong shi zi hui/null
16 | 联合国/nt/lian he guo/null
17 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-sname.lex:
--------------------------------------------------------------------------------
1 | CN_SNAME
2 | #中文单名词库
3 | 敏
4 | 伟
5 | 勇
6 | 军
7 | 斌
8 | 静
9 | 丽
10 | 涛
11 | 芳
12 | 杰
13 | 萍
14 | 强
15 | 俊
16 | 明
17 | 燕
18 | 磊
19 | 玲
20 | 华
21 | 平
22 | 鹏
23 | 健
24 | 波
25 | 红
26 | 丹
27 | 辉
28 | 超
29 | 艳
30 | 莉
31 | 刚
32 | 娟
33 | 峰
34 | 婷
35 | 亮
36 | 洁
37 | 颖
38 | 琳
39 | 英
40 | 慧
41 | 飞
42 | 霞
43 | 浩
44 | 凯
45 | 宇
46 | 毅
47 | 林
48 | 佳
49 | 云
50 | 莹
51 | 娜
52 | 晶
53 | 洋
54 | 文
55 | 鑫
56 | 欣
57 | 琴
58 | 宁
59 | 琼
60 | 兵
61 | 青
62 | 琦
63 | 翔
64 | 彬
65 | 锋
66 | 阳
67 | 璐
68 | 旭
69 | 蕾
70 | 剑
71 | 虹
72 | 蓉
73 | 建
74 | 倩
75 | 梅
76 | 宏
77 | 威
78 | 博
79 | 君
80 | 力
81 | 龙
82 | 晨
83 | 薇
84 | 雪
85 | 琪
86 | 欢
87 | 荣
88 | 江
89 | 炜
90 | 成
91 | 庆
92 | 冰
93 | 东
94 | 帆
95 | 雷
96 | 楠
97 | 锐
98 | 进
99 | 海
100 | 凡
101 | 巍
102 | 维
103 | 迪
104 | 媛
105 | 玮
106 | 杨
107 | 群
108 | 瑛
109 | 悦
110 | 春
111 | 瑶
112 | 婧
113 | 兰
114 | 茜
115 | 松
116 | 爽
117 | 立
118 | 瑜
119 | 睿
120 | 晖
121 | 聪
122 | 帅
123 | 瑾
124 | 骏
125 | 雯
126 | 晓
127 | 昊
128 | 勤
129 | 新
130 | 瑞
131 | 岩
132 | 星
133 | 忠
134 | 志
135 | 怡
136 | 坤
137 | 康
138 | 航
139 | 利
140 | 畅
141 | 坚
142 | 雄
143 | 智
144 | 萌
145 | 哲
146 | 岚
147 | 洪
148 | 捷
149 | 珊
150 | 恒
151 | 靖
152 | 清
153 | 扬
154 | 昕
155 | 乐
156 | 武
157 | 玉
158 | 诚
159 | 菲
160 | 锦
161 | 凤
162 | 珍
163 | 晔
164 | 妍
165 | 璇
166 | 胜
167 | 菁
168 | 科
169 | 芬
170 | 露
171 | 越
172 | 彤
173 | 曦
174 | 义
175 | 良
176 | 鸣
177 | 芸
178 | 方
179 | 月
180 | 铭
181 | 光
182 | 震
183 | 冬
184 | 源
185 | 政
186 | 虎
187 | 莎
188 | 彪
189 | 蓓
190 | 钢
191 | 凌
192 | 奇
193 | 卫
194 | 彦
195 | 烨
196 | 可
197 | 黎
198 | 川
199 | 淼
200 | 惠
201 | 祥
202 | 然
203 | 三
204 | 逗
205 | 高
206 | 潇
207 | 正
208 | 硕
209 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-stopword.lex:
--------------------------------------------------------------------------------
1 | STOP_WORDS
2 | #en-punctuation
3 | !
4 | "
5 | #
6 | $
7 | %
8 | &
9 | '
10 | (
11 | )
12 | *
13 | +
14 | ,
15 | -
16 | .
17 | /
18 | #0
19 | #1
20 | #2
21 | #3
22 | #4
23 | #5
24 | #6
25 | #7
26 | #8
27 | #9
28 | :
29 | ;
30 | <
31 | =
32 | >
33 | ?
34 | @
35 | [
36 | \
37 | ]
38 | ^
39 | _
40 | `
41 | #a
42 | #b
43 | #c
44 | #d
45 | #e
46 | #f
47 | #g
48 | #h
49 | #i
50 | #j
51 | #k
52 | #l
53 | #m
54 | #n
55 | #o
56 | #p
57 | #q
58 | #r
59 | #s
60 | #t
61 | #u
62 | #v
63 | #w
64 | #x
65 | #y
66 | #z
67 | {
68 | |
69 | }
70 | ~
71 | !
72 | #fullwidth
73 | !
74 | "
75 | #
76 | $
77 | %
78 | &
79 | '
80 | (
81 | )
82 | *
83 | +
84 | ,
85 | -
86 | .
87 | /
88 | :
89 | ;
90 | <
91 | =
92 | >
93 | ?
94 | @
95 | [
96 | \
97 | ]
98 | ^
99 | _
100 | `
101 | {
102 | |
103 | }
104 | ~
105 | ⦅
106 | ⦆
107 | 。
108 | 「
109 | 」
110 | 、
111 | ・
112 | #cn-punctuation
113 | 、
114 | 。
115 | 〃
116 | 〄
117 | 々
118 | 〆
119 | 〇
120 | 〈
121 | 〉
122 | 《
123 | 》
124 | 「
125 | 」
126 | 『
127 | 』
128 | 【
129 | 】
130 | 〒
131 | 〓
132 | 〔
133 | 〕
134 | 〖
135 | 〗
136 | 〘
137 | 〙
138 | 〚
139 | 〛
140 | 〜
141 | 〝
142 | 〞
143 | 〟
144 | #中文
145 | 的
146 | 吗
147 | 不
148 | 我
149 | 们
150 | 起
151 | 就
152 | 最
153 | 在
154 | 人
155 | 有
156 | 是
157 | 为
158 | 以
159 | 于
160 | 上
161 | 他
162 | 而
163 | 后
164 | 之
165 | 来
166 | 由
167 | 及
168 | 了
169 | 下
170 | 可
171 | 到
172 | 这
173 | 与
174 | 也
175 | 因
176 | 此
177 | 但
178 | 并
179 | 个
180 | 其
181 | 已
182 | 无
183 | 小
184 | 今
185 | 去
186 | 再
187 | 好
188 | 只
189 | 又
190 | 或
191 | 很
192 | 亦
193 | 某
194 | 把
195 | 那
196 | 你
197 | 乃
198 | 它
199 | 吧
200 | 被
201 | 比
202 | 别
203 | 趁
204 | 当
205 | 从
206 | 到
207 | 得
208 | 打
209 | 凡
210 | 儿
211 | 尔
212 | 该
213 | 各
214 | 给
215 | 跟
216 | 和
217 | 何
218 | 还
219 | 即
220 | 几
221 | 既
222 | 看
223 | 据
224 | 距
225 | 靠
226 | 啦
227 | 了
228 | 另
229 | 么
230 | 每
231 | 们
232 | 嘛
233 | 拿
234 | 哪
235 | 那
236 | 您
237 | 凭
238 | 且
239 | 却
240 | 让
241 | 仍
242 | 啥
243 | 如
244 | 若
245 | 使
246 | 谁
247 | 虽
248 | 随
249 | 同
250 | 所
251 | 她
252 | 哇
253 | 嗡
254 | 往
255 | 哪
256 | 些
257 | 向
258 | 沿
259 | 哟
260 | 用
261 | 于
262 | 咱
263 | 则
264 | 怎
265 | 曾
266 | 至
267 | 致
268 | 着
269 | 诸
270 | 自
271 | 啊
272 | #英文
273 | to
274 | can
275 | could
276 | dare
277 | do
278 | did
279 | does
280 | may
281 | might
282 | would
283 | should
284 | must
285 | will
286 | ought
287 | shall
288 | need
289 | is
290 | a
291 | am
292 | are
293 | about
294 | according
295 | after
296 | against
297 | all
298 | almost
299 | also
300 | although
301 | among
302 | an
303 | and
304 | another
305 | any
306 | anything
307 | approximately
308 | as
309 | asked
310 | at
311 | back
312 | because
313 | before
314 | besides
315 | between
316 | both
317 | but
318 | by
319 | call
320 | called
321 | currently
322 | despite
323 | did
324 | do
325 | dr
326 | during
327 | each
328 | earlier
329 | eight
330 | even
331 | eventually
332 | every
333 | everything
334 | five
335 | for
336 | four
337 | from
338 | he
339 | her
340 | here
341 | his
342 | how
343 | however
344 | i
345 | if
346 | in
347 | indeed
348 | instead
349 | it
350 | its
351 | just
352 | last
353 | like
354 | major
355 | many
356 | may
357 | maybe
358 | meanwhile
359 | more
360 | moreover
361 | most
362 | mr
363 | mrs
364 | ms
365 | much
366 | my
367 | neither
368 | net
369 | never
370 | nevertheless
371 | nine
372 | no
373 | none
374 | not
375 | nothing
376 | now
377 | of
378 | on
379 | once
380 | one
381 | only
382 | or
383 | other
384 | our
385 | over
386 | partly
387 | perhaps
388 | prior
389 | regarding
390 | separately
391 | seven
392 | several
393 | she
394 | should
395 | similarly
396 | since
397 | six
398 | so
399 | some
400 | somehow
401 | still
402 | such
403 | ten
404 | that
405 | the
406 | their
407 | then
408 | there
409 | therefore
410 | these
411 | they
412 | this
413 | those
414 | though
415 | three
416 | to
417 | two
418 | under
419 | unless
420 | unlike
421 | until
422 | volume
423 | we
424 | what
425 | whatever
426 | whats
427 | when
428 | where
429 | which
430 | while
431 | why
432 | with
433 | without
434 | yesterday
435 | yet
436 | you
437 | your
438 | aboard
439 | about
440 | above
441 | according to
442 | across
443 | afore
444 | after
445 | against
446 | agin
447 | along
448 | alongside
449 | amid
450 | amidst
451 | among
452 | amongst
453 | anent
454 | around
455 | as
456 | aslant
457 | astride
458 | at
459 | athwart
460 | bar
461 | because of
462 | before
463 | behind
464 | below
465 | beneath
466 | beside
467 | besides
468 | between
469 | betwixt
470 | beyond
471 | but
472 | by
473 | circa
474 | despite
475 | down
476 | during
477 | due to
478 | ere
479 | except
480 | for
481 | from
482 | in
483 | inside
484 | into
485 | less
486 | like
487 | mid
488 | midst
489 | minus
490 | near
491 | next
492 | nigh
493 | nigher
494 | nighest
495 | notwithstanding
496 | of
497 | off
498 | on
499 | on to
500 | onto
501 | out
502 | out of
503 | outside
504 | over
505 | past
506 | pending
507 | per
508 | plus
509 | qua
510 | re
511 | round
512 | sans
513 | save
514 | since
515 | through
516 | throughout
517 | thru
518 | till
519 | to
520 | toward
521 | towards
522 | under
523 | underneath
524 | unlike
525 | until
526 | unto
527 | up
528 | upon
529 | versus
530 | via
531 | vice
532 | with
533 | within
534 | without
535 | he
536 | her
537 | herself
538 | hers
539 | him
540 | himself
541 | his
542 | I
543 | it
544 | its
545 | itself
546 | me
547 | mine
548 | my
549 | myself
550 | ours
551 | she
552 | their
553 | theirs
554 | them
555 | themselves
556 | they
557 | us
558 | we
559 | our
560 | ourselves
561 | you
562 | your
563 | yours
564 | yourselves
565 | yourself
566 | this
567 | that
568 | these
569 | those
570 | a
571 | about
572 | above
573 | across
574 | after
575 | afterwards
576 | again
577 | against
578 | all
579 | almost
580 | alone
581 | along
582 | already
583 | also
584 | although
585 | always
586 | am
587 | among
588 | amongst
589 | amoungst
590 | amount
591 | an
592 | and
593 | another
594 | any
595 | anyhow
596 | anyone
597 | anything
598 | anyway
599 | anywhere
600 | are
601 | around
602 | as
603 | at
604 | back
605 | be
606 | became
607 | because
608 | become
609 | becomes
610 | becoming
611 | been
612 | before
613 | beforehand
614 | behind
615 | being
616 | below
617 | beside
618 | besides
619 | between
620 | beyond
621 | bill
622 | both
623 | bottom
624 | but
625 | by
626 | call
627 | can
628 | cannot
629 | cant
630 | co
631 | computer
632 | con
633 | could
634 | couldnt
635 | cry
636 | de
637 | describe
638 | detail
639 | do
640 | done
641 | down
642 | due
643 | during
644 | each
645 | eg
646 | eight
647 | either
648 | eleven
649 | else
650 | elsewhere
651 | empty
652 | enough
653 | etc
654 | even
655 | ever
656 | every
657 | everyone
658 | everything
659 | everywhere
660 | except
661 | few
662 | fifteen
663 | fify
664 | fill
665 | find
666 | fire
667 | first
668 | five
669 | for
670 | former
671 | formerly
672 | forty
673 | found
674 | four
675 | from
676 | front
677 | full
678 | further
679 | get
680 | give
681 | go
682 | had
683 | has
684 | hasnt
685 | have
686 | he
687 | hence
688 | her
689 | here
690 | hereafter
691 | hereby
692 | herein
693 | hereupon
694 | hers
695 | herself
696 | him
697 | himself
698 | his
699 | how
700 | however
701 | hundred
702 | i
703 | ie
704 | if
705 | in
706 | inc
707 | indeed
708 | interest
709 | into
710 | is
711 | it
712 | its
713 | itself
714 | keep
715 | last
716 | latter
717 | latterly
718 | least
719 | less
720 | ltd
721 | made
722 | many
723 | may
724 | me
725 | meanwhile
726 | might
727 | mill
728 | mine
729 | more
730 | moreover
731 | most
732 | mostly
733 | move
734 | much
735 | must
736 | my
737 | myself
738 | name
739 | namely
740 | neither
741 | never
742 | nevertheless
743 | next
744 | nine
745 | no
746 | nobody
747 | none
748 | noone
749 | nor
750 | not
751 | nothing
752 | now
753 | nowhere
754 | of
755 | off
756 | often
757 | on
758 | once
759 | one
760 | only
761 | onto
762 | or
763 | other
764 | others
765 | otherwise
766 | our
767 | ours
768 | ourselves
769 | out
770 | over
771 | own
772 | part
773 | per
774 | perhaps
775 | please
776 | put
777 | rather
778 | re
779 | same
780 | see
781 | seem
782 | seemed
783 | seeming
784 | seems
785 | serious
786 | several
787 | she
788 | should
789 | show
790 | side
791 | since
792 | sincere
793 | six
794 | sixty
795 | so
796 | some
797 | somehow
798 | someone
799 | something
800 | sometime
801 | sometimes
802 | somewhere
803 | still
804 | such
805 | take
806 | ten
807 | than
808 | that
809 | the
810 | their
811 | them
812 | themselves
813 | then
814 | thence
815 | there
816 | thereafter
817 | thereby
818 | therefore
819 | therein
820 | thereupon
821 | these
822 | they
823 | thick
824 | thin
825 | third
826 | this
827 | those
828 | though
829 | three
830 | through
831 | throughout
832 | thru
833 | thus
834 | to
835 | together
836 | too
837 | top
838 | toward
839 | towards
840 | twelve
841 | twenty
842 | two
843 | un
844 | under
845 | until
846 | up
847 | upon
848 | us
849 | very
850 | via
851 | was
852 | we
853 | well
854 | were
855 | what
856 | whatever
857 | when
858 | whence
859 | whenever
860 | where
861 | whereafter
862 | whereas
863 | whereby
864 | wherein
865 | whereupon
866 | wherever
867 | whether
868 | which
869 | while
870 | whither
871 | who
872 | whoever
873 | whole
874 | whom
875 | whose
876 | why
877 | will
878 | with
879 | within
880 | without
881 | would
882 | yet
883 | you
884 | your
885 | yours
886 | yourself
887 | yourselves
888 | #chenxin12
889 | #other number
890 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-touris.lex:
--------------------------------------------------------------------------------
1 | CJK_WORDS
2 | 世博园/ns/shi bo yuan/null
3 | 世博会/ns/shi bo hui/null
4 | 长城/ns/chang cheng/null
5 | 黄山/ns/huang shan/null
6 | 衡山/ns/heng shan/null
7 | 华山/ns/hua shan/null
8 | 泰山/ns/tai shan/null
9 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/lib/lexicon/lex-units.lex:
--------------------------------------------------------------------------------
1 | CJK_UNITS
2 | #中文单字单位词库
3 | #长度
4 | 米
5 | 寸
6 | 尺
7 | 丈
8 | 里
9 | #时间
10 | 年
11 | 月
12 | 日
13 | 时
14 | #分
15 | 秒
16 | #币
17 | 元
18 | 角
19 | #容量
20 | 升
21 | 斗
22 | 石
23 | 瓶
24 | 袋
25 | 盒
26 | #重量
27 | 吨
28 | 克
29 | 斤
30 | 两
31 | 担
32 | #地积
33 | 亩
34 | 顷
35 | #其他
36 | 折
37 | 件
38 | 番
39 | ℃
40 | ℉
41 |
--------------------------------------------------------------------------------
/WebRoot/WEB-INF/web.xml:
--------------------------------------------------------------------------------
1 |
2 |
6 |
7 |
8 | index.jsp
9 |
10 |
11 | parseArticleById
12 | com.chenxb.servlet.ParseArticleById
13 |
14 |
15 | parseArticleById
16 | /parseArticle
17 |
18 |
19 |
20 |
21 | ArticleWithSql
22 | com.chenxb.servlet.ArticleWithSql
23 |
24 |
25 | ArticleWithSql
26 | /articleWithSql
27 |
28 |
29 |
30 |
31 | parseArticleByColumn
32 | com.chenxb.servlet.ColumnArticlesWithSql
33 |
34 |
35 | parseArticleByColumn
36 | /columnWithSql
37 |
38 |
39 |
40 |
41 | moreArticlesWithSql
42 | com.chenxb.servlet.MoreArticlesWithSql
43 |
44 |
45 | moreArticlesWithSql
46 | /columnMore
47 |
48 |
49 |
50 |
51 | parseRotationWithSql
52 | com.chenxb.servlet.RotationWithSql
53 |
54 |
55 | parseRotationWithSql
56 | /rotationWithSql
57 |
58 |
59 |
60 |
61 |
62 | searchArticle
63 | com.chenxb.servlet.SearchArticle
64 |
65 |
66 | searchArticle
67 | /searchArticle
68 |
69 |
--------------------------------------------------------------------------------
/WebRoot/css/detail.css:
--------------------------------------------------------------------------------
1 | @charset "utf-8";
2 | /* p {color:#FF3366;text-indent:2em;line-height:24px;} */
3 | #article_title {
4 | text-align: center;
5 | width: 100%;
6 | font-size: 20px;
7 | margin: 0 0 10px 0;
8 | }
9 | #article_title h1 {
10 | font-size: 20px;
11 | line-height: 30px;
12 | font-weight: normal;
13 | }
14 |
15 |
16 | #article_detail {
17 | font-size: 14px;
18 | color: #888888;
19 | margin: 10px 0px;
20 | text-align: center;
21 | border-bottom: 1px solid #f0f0f0;
22 | line-height: 25px;
23 | }
24 | #article_detail span{
25 | margin: 0px 5px;
26 | }
27 |
28 | #article_content p{
29 | text-indent: 2em;
30 | margin-bottom: 20px;
31 | font-size: 15px;
32 | }
33 |
34 | #article_content a:hover {
35 | text-decoration: underline;
36 | }
37 | #article_content a {
38 | color: #428bca;
39 | }
40 | #original_post {
41 | margin-top: 15px;
42 | border-top: 1px dashed #ccc;
43 | color: #777;
44 | text-align: center;
45 | }
46 | #original_post p{
47 | margin: 5px 0;
48 | color: #777;
49 | font-size: 12px;
50 | }
51 | /* a{
52 | text-decoration: none;
53 | color: #111;
54 | }
55 | a:hover {
56 | color:#BD0800;
57 | } */
58 | /* a {color:#3E62A6;} */
59 | img {max-width:310px;display:table-cell;vertical-align:middle;margin-left:1em;}
60 |
61 | img.alignleft {float:left;max-width:120px;margin:0 10px 5px 0;border:1px solid #ccc;background:#fff;padding:2px;}
62 | pre {font-size:9pt;line-height:12pt;font-family:Courier New,Arial;border:1px solid #ddd;border-left:5px solid #6CE26C;background:#f6f6f6;padding:5px;overflow: auto;}
63 | a.tag {font-size:15px;text-decoration:none;background-color:#bbd6f3;border-bottom:2px solid #3E6D8E;border-right:2px solid #7F9FB6;color:#284a7b;margin:2px 2px 2px 0;padding:2px 4px;white-space:nowrap;}
--------------------------------------------------------------------------------
/WebRoot/index.jsp:
--------------------------------------------------------------------------------
1 | <%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%>
2 | <%
3 | String path = request.getContextPath();
4 | String num = request.getQueryString(); //获得新闻详情的num
5 | String basePath = request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort()
6 | + path + "/";
7 | %>
8 |
9 |
10 |
11 |
12 |
14 |
15 |
16 | My JSP 'index.jsp' starting page
17 |
18 |
19 |
20 |
21 |
22 |
25 |
26 |
27 |
28 | This is my JSP page.
29 |
30 | <%
31 | if (num == null || num.equals("")) {
32 | %>
33 | 请使用index后面更7000参数,例如?index=7000
34 | <%
35 | }
36 | %>
37 |
38 |
39 |
--------------------------------------------------------------------------------
/WebRoot/jsp/detail.jsp:
--------------------------------------------------------------------------------
1 | <%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%>
2 | <%@ page language="java" import="com.chenxb.news.*"%>
3 | <%@ page language="java" import="com.chenxb.model.*"%>
4 | <%@ page language="java" import="com.chenxb.biz.*"%>
5 | <%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
6 | <%
7 | String num = request.getParameter("num");
8 | String original = "http://see.xidian.edu.cn/html/news/" + num + ".html";
9 | ArticleItem detail = ArticleBiz.parseNewsItem(Integer.parseInt(num));
10 | request.setAttribute("detail", detail);
11 | request.setAttribute("original", original);
12 | %>
13 |
14 |
15 |
16 |
17 |
19 | <%--显示为可以拨号的连接 --%>
20 |
21 |
22 | ${detail.title }
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
${detail.title }
32 |
33 |
34 |
35 | ${detail.publishDate }
36 | 浏览次数:${detail.readTimes }
37 |
38 |
39 | ${detail.body }
40 |
41 |
42 |
43 | SeeNews已优化原网页方便移动设备查看
44 |
45 |
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/src/com/chenxb/biz/ArticleBiz.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.biz;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.commons.lang3.ArrayUtils;
6 | import org.jsoup.Jsoup;
7 | import org.jsoup.nodes.Document;
8 | import org.jsoup.nodes.Element;
9 | import org.jsoup.select.Elements;
10 |
11 | import com.chenxb.model.ArticleItem;
12 | import com.chenxb.util.Constant;
13 | import com.chenxb.util.HttpTool;
14 | import com.chenxb.util.ImageTool;
15 | import com.chenxb.util.UrlTool;
16 |
17 | /**
18 | * 根据指定的新闻 id 爬取新闻数据
19 | * 将获得的标题、发布时间、内容拼接成 javabean
20 | * @author tomchen
21 | *
22 | */
23 | public class ArticleBiz {
24 |
25 | // 统计点击次数的 url
26 | private static final String COUNT_BASE_URL = "http://see.xidian.edu.cn/index.php/news/click/id/";
27 |
28 | private static final String SOURCE_PREFIX = "来源:";
29 |
30 | /**
31 | * 新闻的 url 格式为 http://see.xidian.edu.cn/html/news/7928.html
32 | *
33 | * @param id
34 | * 某个新闻页面的序号
35 | * @return 爬取该页面上的新闻信息,提取相应的信息,存到新闻bean里。如果没有爬取到新闻返回null
36 | * @throws Exception
37 | */
38 | public static ArticleItem parseNewsItem(int id) throws Exception {
39 | // 根据后缀的数字,拼接新闻 url
40 | String urlStr = Constant.ARTICLE_BASE_URL + id + ".html";
41 |
42 | // 利用get请求获取字符串再解析会有小部分乱码
43 | // String htmlStr = HttpTool.doGet(urlStr);
44 | // Document doc = Jsoup.parse(htmlStr);
45 | // try {
46 | Document doc = Jsoup.connect(urlStr).timeout(10000).get();
47 | // 去掉jsoup对html字符串加的"\n",方便json字符串返回
48 | doc.outputSettings().prettyPrint(false);
49 |
50 | Element articleEle = doc.getElementById("article");
51 | // 标题
52 | Element titleEle = articleEle.getElementById("article_title");
53 | String titleStr = titleEle.text();
54 |
55 | // article_detail包括了 2016-01-15 来源: 浏览次数:177
56 | Element detailEle = articleEle.getElementById("article_detail");
57 | Elements details = detailEle.getElementsByTag("span");
58 |
59 | // 发布时间
60 | String dateStr = details.get(0).text();
61 |
62 | // 新闻来源
63 | String sourceStr = details.get(1).text();
64 |
65 | // 去掉"来源:"
66 | if (SOURCE_PREFIX.equals(sourceStr.trim())) {
67 | sourceStr = "SeeNews";
68 | } else {
69 | sourceStr = sourceStr.substring(3).trim();
70 | }
71 |
72 | // 访问这个新闻页面,浏览次数会+1,次数是 JS 渲染的
73 | String jsStr = HttpTool.doGet(COUNT_BASE_URL + id);
74 | int readTimes = Integer.parseInt(jsStr.replaceAll("\\D+", ""));
75 | // 或者使用下面这个正则方法
76 | // String readTimesStr = jsStr.replaceAll("[^0-9]", "");
77 |
78 | Element contentEle = articleEle.getElementById("article_content");
79 | // 新闻主体内容
80 |
81 | String contentStr = contentEle.toString();
82 |
83 | // 如果用 text()方法,新闻主体内容的 html 标签会丢失
84 | // 为了在 Android 上用 WebView 显示 html,用toString()
85 | // String contentStr = contentEle.text();
86 | Elements images = contentEle.getElementsByTag("img");
87 | String[] imageUrls = new String[images.size()];
88 |
89 | // 图片上传到七牛
90 | // 将body中的图片地址替换为七牛的地址
91 | for (int i = 0; i < imageUrls.length; i++) {
92 | String origin = images.get(i).attr("src");
93 | imageUrls[i] = ImageTool.convertUrl(id, origin);
94 | if (!origin.equals(imageUrls[i])) {
95 | // 只有上传图片到七牛,url 才会变化
96 | // 不相等,才替换为七牛的url
97 | contentStr = contentStr.replace(Constant.SRC_PREFIX + origin,
98 | Constant.SRC_PREFIX + Constant.BUCKET_HOST_NAME + imageUrls[i]);
99 | }
100 | }
101 |
102 | // 处理相对路径 url,不和上面的 image url 冲突
103 | Elements hrefs = contentEle.getElementsByTag("a");
104 | for (int i = 0; i < hrefs.size(); i++) {
105 | String origin = hrefs.get(i).attr("href");
106 | if (Constant.DEBUG) {
107 | System.out.println("原始 href=" + origin);
108 | }
109 | String newUrl = UrlTool.dealAttachmentUrl(id, origin);
110 |
111 | // 防止页面的附件 重复出现,替换多次
112 | // 出现这种
113 | // http://see.xidian.edu.cnhttp://see.xidian.edu.cn/uploads/file
114 | if (!origin.equals(newUrl)) {
115 | // 不相等,才替换为新的url 且url未被替换过
116 | contentStr = contentStr.replace(Constant.HREF_PREFIX + origin, Constant.HREF_PREFIX + newUrl);
117 | }
118 | }
119 |
120 | return new ArticleItem(id, imageUrls, titleStr, dateStr, readTimes, sourceStr, contentStr);
121 | }
122 |
123 | /**
124 | * 根据 id 得到这条新闻属于哪个栏目
125 | * NOTIFIC = 1;// 校园通知
126 | * BACHELOR = 2;// 本科教学 学士
127 | * MASTER = 3;// 研究生 硕士
128 | * ACADEMIC = 5;// 学术交流
129 | * 选取了电院新闻的部分栏目
130 | * JOB = 8;// 就业招聘
131 | * @param id
132 | * @return
133 | * @throws IOException
134 | */
135 | public static int getType(int id) throws IOException {
136 | // 根据后缀的数字,拼接新闻 url
137 | String urlStr = Constant.ARTICLE_BASE_URL + id + ".html";
138 |
139 | Document doc = Jsoup.connect(urlStr).timeout(10000).get();
140 | Element ele = doc.getElementById("position_guide");
141 | // href 类似http://see.xidian.edu.cn/html/category/2.html
142 | // 取出最后的数字2作为 type
143 | String href = ele.getElementsByTag("a").get(1).attr("href");
144 | return Integer.valueOf(href.replaceAll("\\D+", ""));
145 |
146 | }
147 |
148 | }
149 |
--------------------------------------------------------------------------------
/src/com/chenxb/biz/ColumnBiz.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.biz;
2 |
3 | import java.io.IOException;
4 | import java.util.regex.Matcher;
5 | import java.util.regex.Pattern;
6 |
7 | import org.jsoup.Jsoup;
8 | import org.jsoup.nodes.Document;
9 | import org.jsoup.nodes.Element;
10 | import org.jsoup.select.Elements;
11 |
12 | import com.chenxb.util.ColumnType;
13 | import com.chenxb.util.UrlTool;
14 |
15 | public class ColumnBiz {
16 |
17 | private static Pattern regexCountPage = Pattern.compile("\\d+/(\\d+)");
18 |
19 | /**
20 | * 爬取本科教学、研究生、就业招聘等栏目
21 | *
22 | * @param type
23 | * 栏目
24 | * @param currentPage
25 | * 当前页码
26 | * @return 返回新闻的id数组
27 | * @throws IOException
28 | */
29 | public static int[] parseColumn(int type, int currentPage) throws IOException {
30 |
31 | String columnUrl = UrlTool.generateUrl(type, currentPage);
32 | Document doc = Jsoup.connect(columnUrl).timeout(10000).get();
33 | Elements eles = doc.getElementById("list_area").getElementsByTag("a");
34 | int[] articleIds = new int[eles.size()];
35 | for (int i = 0; i < eles.size(); i++) {
36 | String url = eles.get(i).attr("href");
37 | articleIds[i] = Integer.parseInt(url.replaceAll("\\D+", ""));
38 | }
39 | return articleIds;
40 | }
41 |
42 | /**
43 | * 根据栏目类型获取 本栏目共有几页
44 | *
45 | * @param type
46 | * 栏目类型
47 | * @return 总页数
48 | * @throws CommonException
49 | * @throws IOException
50 | */
51 | public static int getTotalPage(int type) throws IOException {
52 | // 最新消息栏目特殊,只有1页,没有下一页
53 | if (type == ColumnType.LATEST)
54 | return 1;
55 | String columnUrl = UrlTool.generateUrl(type, 1);
56 |
57 | // String htmlStr = HttpTool.doGet(columnUrl);
58 |
59 | Document doc = Jsoup.connect(columnUrl).timeout(10000).get();
60 | // 正则匹配 1262 条记录 1/26 页
61 | Element page = doc.getElementById("div_page");
62 |
63 | Matcher matcher = regexCountPage.matcher(page.text());
64 | if (matcher.find()) {
65 | return Integer.parseInt(matcher.group(1));
66 | } else {
67 | // 根据经验值,一个栏目至少有5页
68 | return 5;
69 | }
70 | }
71 |
72 | /**
73 | * 获取某页码的新闻个数
74 | * @param type
75 | * @param indexPage 从第1页开始
76 | * @return
77 | * @throws IOException
78 | */
79 | public static int countArticles(int type,int indexPage) throws IOException {
80 | return parseColumn(type, indexPage).length;
81 | }
82 |
83 | }
--------------------------------------------------------------------------------
/src/com/chenxb/biz/RotationImageBiz.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.biz;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.List;
6 |
7 | import org.jsoup.Jsoup;
8 | import org.jsoup.nodes.Document;
9 | import org.jsoup.nodes.Element;
10 | import org.jsoup.select.Elements;
11 |
12 | import com.chenxb.model.RotationItem;
13 | import com.chenxb.util.Constant;
14 | import com.chenxb.util.ImageTool;
15 |
16 | /**
17 | * 首页轮播图片
18 | * @author tomchen
19 | *
20 | */
21 | public class RotationImageBiz {
22 |
23 | /**
24 | * 爬取主页的轮播图片
25 | * @throws Exception
26 | *
27 | */
28 | public static List parseHomeRotaions() throws Exception {
29 | Document doc = Jsoup.connect(Constant.SEE_URL).timeout(10000).get();
30 | Elements eles = doc.getElementsByClass("rotaion_list").get(0).getElementsByTag("a");
31 | List rotaions = new ArrayList(eles.size());
32 |
33 | for (Element e : eles) {
34 |
35 | String articleUrl = e.attr("href");
36 |
37 | int id = Integer.parseInt(articleUrl.replaceAll("\\D+", ""));
38 |
39 | Element imgEle = e.getElementsByTag("img").get(0);
40 |
41 | String imageUrl = imgEle.attr("src");
42 |
43 | String[] key = { ImageTool.convertUrl(id, imageUrl) };
44 |
45 | String title = imgEle.attr("alt");
46 |
47 | // 该 id 的新闻属于哪个栏目
48 | int type = ArticleBiz.getType(id);
49 |
50 | rotaions.add(new RotationItem(id, key, title, type));
51 | }
52 | return rotaions;
53 | }
54 |
55 | }
56 |
--------------------------------------------------------------------------------
/src/com/chenxb/biz/UploadRandomImage.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.biz;
2 |
3 | import java.io.IOException;
4 | import java.net.HttpURLConnection;
5 | import java.net.URL;
6 | import java.util.Random;
7 |
8 | import com.qiniu.storage.BucketManager;
9 | import com.qiniu.util.Auth;
10 |
11 | public class UploadRandomImage {
12 |
13 | private static final String RANDOM_URL = "https://unsplash.it/640/427/?image=";
14 | private static final String ACCESS_KEY = "***-***"; // 你的access_key
15 | private static final String SECRET_KEY = "***-***"; // 你的secret_key
16 | private static final String BUCKET_NAME = "***-***"; // 你的空间名称
17 |
18 | public static void main(String[] args) throws InterruptedException, IOException {
19 | Auth auth = Auth.create(ACCESS_KEY, SECRET_KEY);
20 | // 获取空间管理
21 | BucketManager bucketManager = new BucketManager(auth);
22 |
23 | int key = 0;
24 | for (int i = 0; i < 1025; i++) {
25 | // 如果 i 对应的图片存在,上传七牛
26 | if (exists(i)) {
27 | try {
28 | bucketManager.fetch(RANDOM_URL + i, BUCKET_NAME, key + "");
29 | System.out.println("i = " + i + " to key = " + key);
30 | key++;
31 | } catch (Exception e) {
32 | bucketManager.fetch(RANDOM_URL + i, BUCKET_NAME, key + "");
33 | System.out.println("Exception i = " + i + " to key = " + key);
34 | // 只有上传了七牛,key 才+1,保证七牛的 key 连续
35 | key++;
36 | }
37 | // sleep一段时间,免得对网站负载过大
38 | } else {
39 | System.out.println(i + "不存在");
40 | }
41 | }
42 | System.out.println(key + "最终图片数目");
43 |
44 | }
45 |
46 | /**
47 | * 判断地址对于的图片是否存在
48 | * @param id
49 | * @return
50 | */
51 | public static boolean exists(int id) {
52 | try {
53 | HttpURLConnection.setFollowRedirects(false);
54 | HttpURLConnection con = (HttpURLConnection) new URL(RANDOM_URL + id).openConnection();
55 | con.setRequestMethod("HEAD");
56 | return (con.getResponseCode() == HttpURLConnection.HTTP_OK);
57 | } catch (Exception e) {
58 | e.printStackTrace();
59 | return false;
60 | }
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/src/com/chenxb/common/StreamTool.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.common;
2 |
3 | import java.io.ByteArrayOutputStream;
4 | import java.io.InputStream;
5 |
/**
 * Helper for draining input streams.
 */
public class StreamTool {

    /**
     * Reads the stream to its end and returns everything that was read.
     * The stream is closed afterwards, also when reading fails.
     *
     * @param inputStr stream to drain; closed by this method
     * @return all bytes read from the stream, empty when the stream is empty
     * @throws Exception if reading from or closing the stream fails
     */
    public static byte[] read(InputStream inputStr) throws Exception {
        ByteArrayOutputStream outStr = new ByteArrayOutputStream();
        try {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = inputStr.read(buffer)) != -1) {
                outStr.write(buffer, 0, len);
            }
        } finally {
            // Close on the failure path too, so a failed read does not leak the stream
            inputStr.close();
        }
        return outStr.toByteArray();
    }

}
20 |
--------------------------------------------------------------------------------
/src/com/chenxb/dao/ArticleDao.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.dao;
2 |
3 | import java.sql.Connection;
4 | import java.sql.Date;
5 | import java.sql.PreparedStatement;
6 | import java.sql.ResultSet;
7 | import java.sql.SQLException;
8 | import java.util.Arrays;
9 |
10 | import org.apache.commons.lang3.ArrayUtils;
11 |
12 | import com.chenxb.model.ArticleItem;
13 | import com.chenxb.util.Constant;
14 | import com.chenxb.util.MysqlTool;
15 | import com.chenxb.util.TableName;
16 |
17 | /**
18 | * 插入新闻纪录到 mysql
19 | * 从 mysql 获取某条新闻
20 | * @author tomchen
21 | *
22 | */
23 | public class ArticleDao {
24 | private Connection connection;
25 |
26 | public ArticleDao() throws Exception {
27 | connection = new MysqlTool().getConnection();
28 | }
29 |
30 | /**
31 | * 根据 type 找到数据库表名称
32 | * 再从该表里找出 id 对应的新闻
33 | * @param type
34 | * @param id
35 | * @return
36 | * @throws SQLException
37 | */
38 | public ArticleItem getArticleByTypeId(int type, int id) throws SQLException {
39 | // 根据 type 找出对应的 table 名称
40 | String tableName = TableName.getTableByType(type);
41 |
42 | // the mysql select statement
43 | String query = "select * from " + tableName + " where id = ?";
44 |
45 | // create the mysql preparedstatement
46 | PreparedStatement preparedStmt = connection.prepareStatement(query);
47 | preparedStmt.setInt(1, id);
48 |
49 | ResultSet rs = preparedStmt.executeQuery();
50 | while (rs.next()) {
51 | String[] imageUrls = {};
52 | String urls = rs.getString(2);
53 | // split 最少也是返回一个元素 [] 返回 [""s]
54 | if (!urls.equals("[]")) {
55 | imageUrls = urls.replace("[", "").replace("]", "").split(", ");
56 | for (String url : Constant.USELESS_IMAGE_URL) {
57 | // 删除所有出现的元素
58 | imageUrls = ArrayUtils.removeAllOccurences(imageUrls, url);
59 | }
60 | }
61 | String title = rs.getString(3);
62 | String date = rs.getDate(4).toString();
63 | int readTimes = rs.getInt(5);
64 | String source = rs.getString(6);
65 | String body = rs.getString(7);
66 | ArticleItem article = new ArticleItem(id, imageUrls, title, date, readTimes, source, body);
67 | return article;
68 | }
69 | return null;
70 | }
71 |
72 | /**
73 | * 将记录插入到数据库中
74 | * @param tableNmae 数据库名称,从 TableName 类中选取
75 | * @param article
76 | * @return
77 | * @throws SQLException
78 | */
79 | public int insertArticle(String tableName, ArticleItem article) throws SQLException {
80 | // the mysql insert statement
81 | String query = " insert into " + tableName + " (id, image_urls, title, publish_date, read_times,source,body)"
82 | + " values (?, ?, ?, ?, ?,?,?)";
83 |
84 | // create the mysql insert preparedstatement
85 | PreparedStatement preparedStmt = connection.prepareStatement(query);
86 | preparedStmt.setInt(1, article.getId());
87 | preparedStmt.setString(2, Arrays.toString(article.getImageUrls()));
88 | preparedStmt.setString(3, article.getTitle());
89 | preparedStmt.setDate(4, Date.valueOf(article.getPublishDate()));
90 | preparedStmt.setInt(5, article.getReadTimes());
91 | preparedStmt.setString(6, article.getSource());
92 | preparedStmt.setString(7, article.getBody());
93 | return preparedStmt.executeUpdate();
94 | }
95 |
96 | public Connection getConnection() {
97 | return connection;
98 | }
99 |
100 | public void setConnection(Connection connection) {
101 | this.connection = connection;
102 | }
103 |
104 | }
105 |
--------------------------------------------------------------------------------
/src/com/chenxb/dao/ColumnDao.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.dao;
2 |
3 | import static com.chenxb.util.Constant.DEBUG;
4 |
5 | import java.sql.Connection;
6 | import java.sql.Date;
7 | import java.sql.PreparedStatement;
8 | import java.sql.ResultSet;
9 | import java.sql.SQLException;
10 | import java.util.ArrayList;
11 | import java.util.Arrays;
12 | import java.util.List;
13 |
14 | import org.apache.commons.lang3.ArrayUtils;
15 | import org.apache.commons.lang3.StringUtils;
16 |
17 | import com.chenxb.biz.ArticleBiz;
18 | import com.chenxb.biz.ColumnBiz;
19 | import com.chenxb.model.ArticleItem;
20 | import com.chenxb.model.SimpleArticleItem;
21 | import com.chenxb.util.Constant;
22 | import com.chenxb.util.MysqlTool;
23 | import com.chenxb.util.TableName;
24 | import com.chenxb.util.TimeTool;
25 |
26 | /**
27 | * 获取某个栏目 多页新闻记录,插入到 Mysql 中
28 | * 为了简便,只获取前20页新闻
29 | * @author tomchen
30 | *
31 | */
32 | public class ColumnDao {
33 | private static final int MAX_COLUMN_NUM = 30;
34 | private Connection connection;
35 |
36 | public ColumnDao() throws Exception {
37 | connection = new MysqlTool().getConnection();
38 | }
39 |
40 | /**
41 | * @throws Exception
42 | *
43 | */
44 | public void initArticles(int type) throws Exception {
45 | int total = ColumnBiz.getTotalPage(type);
46 | // 该栏目的页数若过多,只爬取20页
47 | if (total > MAX_COLUMN_NUM) {
48 | total = MAX_COLUMN_NUM;
49 | }
50 |
51 | String tableName = TableName.getTableByType(type);
52 |
53 | // 注意从第1页开始
54 | for (int i = 1; i <= total; i++) {
55 | int[] ids = ColumnBiz.parseColumn(type, i);
56 | for (int id : ids) {
57 | ArticleItem article = ArticleBiz.parseNewsItem(id);
58 | if (DEBUG) {
59 | System.out.println(TimeTool.getCurrentTime() + " insert " + id + " " + article.getTitle() + " into "
60 | + tableName);
61 | }
62 | insertArticle(tableName, article);
63 | // 等待时间,避免对被爬取的网站负载过大
64 | TimeTool.sleepSomeTime();
65 | }
66 | }
67 | }
68 |
69 | /**
70 | * @throws Exception
71 | * 只爬取新闻 为了图片上传失败 重新上传
72 | */
73 | public static void justParseArticles(int type) throws Exception {
74 | int total = ColumnBiz.getTotalPage(type);
75 | // 该栏目的页数若过多,只爬取20页
76 | if (total > MAX_COLUMN_NUM) {
77 | total = MAX_COLUMN_NUM;
78 | }
79 |
80 | String tableName = TableName.getTableByType(type);
81 |
82 | // 注意从第1页开始
83 | for (int i = 1; i <= total; i++) {
84 | int[] ids = ColumnBiz.parseColumn(type, i);
85 | for (int id : ids) {
86 | ArticleItem article = ArticleBiz.parseNewsItem(id);
87 | if (DEBUG) {
88 | System.out.println(TimeTool.getCurrentTime() + " insert " + id + " " + article.getTitle() + " into "
89 | + tableName);
90 | }
91 | // 等待时间,避免对被爬取的网站负载过大
92 | TimeTool.sleepSomeTime();
93 | }
94 |
95 | }
96 | }
97 |
98 | /**
99 | * 重新抓取新闻,比较当前最小的 id
100 | * 把最小的 id 之前的新闻插入 mysql
101 | * @throws Exception
102 | *
103 | */
104 | public void reInitArticles(int type) throws Exception {
105 | int total = ColumnBiz.getTotalPage(type);
106 | // 该栏目的页数若过多,只爬取20页
107 | if (total > MAX_COLUMN_NUM) {
108 | total = MAX_COLUMN_NUM;
109 | }
110 |
111 | int minId = getMinId(type);
112 |
113 | if (Constant.DEBUG) {
114 | System.out.println(TimeTool.getCurrentTime() + " get minId " + minId);
115 | }
116 |
117 | String tableName = TableName.getTableByType(type);
118 |
119 | boolean find = false;
120 |
121 | // 注意从第1页开始
122 | for (int i = 1; i <= total; i++) {
123 | int[] ids = ColumnBiz.parseColumn(type, i);
124 |
125 | if (!find) {
126 | // 数组最后一个数,是否在 mysql
127 | boolean isExist = isIdExist(type, ids[ids.length - 1]);
128 |
129 | // 如果最后一个数在 mysql 中,该页所有记录都已爬取
130 | if (isExist) {
131 | continue;
132 | } else {
133 | // 爬取到这一页中断
134 | int pre = 0;
135 | for (; pre < ids.length; pre++) {
136 | // 如果记录存在,继续向下寻找
137 | if (isIdExist(type, ids[pre])) {
138 | continue;
139 | } else {
140 | if (DEBUG) {
141 | // pre 不存在,pre-1存在
142 | System.out.println(TimeTool.getCurrentTime() + " " + ids[pre] + " 第一个不存在 in 第 " + (i)
143 | + " 页,第 " + (pre + 1) + " 个");
144 | }
145 | // 不存在,跳槽循环
146 | break;
147 | }
148 | }
149 |
150 | for (int re = pre; re < ids.length; re++) {
151 | ArticleItem article = ArticleBiz.parseNewsItem(ids[re]);
152 | if (DEBUG) {
153 | System.out.println(TimeTool.getCurrentTime() + " insert " + ids[re] + " "
154 | + article.getTitle() + " into " + tableName);
155 | }
156 | insertArticle(tableName, article);
157 | // 等待时间,避免对被爬取的网站负载过大
158 | TimeTool.sleepSomeTime();
159 | }
160 | }
161 | } else {
162 | // 已经找到了上次的断点 id,把往后页面全部新闻插入 mysql
163 | if (find) {
164 | for (int id : ids) {
165 | ArticleItem article = ArticleBiz.parseNewsItem(id);
166 | insertArticle(tableName, article);
167 | if (DEBUG) {
168 | System.out.println(TimeTool.getCurrentTime() + " insert " + id + " " + article.getTitle()
169 | + " into " + tableName);
170 | }
171 | // 等待时间,避免对被爬取的网站负载过大
172 | TimeTool.sleepSomeTime();
173 | }
174 | }
175 | }
176 | }
177 |
178 | }
179 |
180 | /**
181 | * 对比网站和Mysql
182 | * 新增数据到 Mysql
183 | * @param type
184 | * @throws Exception
185 | */
186 | public void addArticles(int type) throws Exception {
187 | String tableName = TableName.getTableByType(type);
188 |
189 | int topId = getMaxId(type);
190 |
191 | int currentPage = 1;
192 |
193 | while (currentPage < MAX_COLUMN_NUM) {
194 | int[] ids = ColumnBiz.parseColumn(type, currentPage);
195 |
196 | int index = ArrayUtils.indexOf(ids, topId);
197 |
198 | // 如果当前数据库最新记录 == 网站最新记录
199 | if (index == 0) {
200 | return;
201 | }
202 |
203 | // 网站当前页包含 mysql 里的最新id
204 | // 把更新的记录插入到 mysql 中
205 | if (index > 0) {
206 | for (int i = 0; i < index; i++) {
207 | ArticleItem article = ArticleBiz.parseNewsItem(ids[i]);
208 | if (DEBUG) {
209 | System.out.println("insert " + ids[i] + " " + article.getTitle() + " into mysql");
210 | }
211 | insertArticle(tableName, article);
212 | // 等待时间,避免对被爬取的网站负载过大
213 | TimeTool.sleepSomeTime();
214 | }
215 | return;
216 | }
217 |
218 | // 最新的 id 不在当前页里
219 | // 需要全部更新数据
220 | if (index < 0) {
221 | for (int id : ids) {
222 | ArticleItem article = ArticleBiz.parseNewsItem(id);
223 | if (DEBUG) {
224 | System.out.println("insert " + id + " " + article.getTitle() + " into mysql");
225 | }
226 | insertArticle(tableName, article);
227 | // 等待时间,避免对被爬取的网站负载过大
228 | TimeTool.sleepSomeTime();
229 | }
230 | currentPage++;
231 | }
232 | }
233 | }
234 |
235 | /**
236 | * 返回某个表 最新的Constant.EACH_AMOUNT条新闻
237 | * 只是 listview 展示
238 | * 分页展示,需要 type 和偏移id
239 | * 修改了数据库中图片数组
240 | * 对于附件图标、doc 图标等不返回给手机端
241 | * @param type
242 | * @param threshold
243 | * @return
244 | * @throws SQLException
245 | */
246 | public List getTopSimpleArticles(int type, int offset) throws SQLException {
247 | String tableName = TableName.getTableByType(type);
248 |
249 | String selectColumns = "select id,image_urls,title,publish_date,read_times,summary from " + tableName;
250 | String limitCount = " order by id desc limit " + Constant.EACH_AMOUNT;
251 |
252 | // 这儿两个 sql 语句要同步修改
253 | String query = selectColumns + " where id < ? " + limitCount;
254 |
255 | PreparedStatement preparedStmt = connection.prepareStatement(query);
256 |
257 | preparedStmt.setInt(1, offset);
258 |
259 | // 如果是首页 这是很少的情况
260 | if (offset == -1) {
261 | query = selectColumns + limitCount;
262 | preparedStmt = connection.prepareStatement(query);
263 | }
264 |
265 | ResultSet rs = preparedStmt.executeQuery();
266 |
267 | List articles = new ArrayList(Constant.EACH_AMOUNT);
268 |
269 | while (rs.next()) {
270 | int id = rs.getInt(1);
271 | String[] imageUrls = {};
272 | String urls = rs.getString(2);
273 | // split 最少也是返回一个元素 [] 返回 [""s]
274 | if (!urls.equals("[]")) {
275 | imageUrls = urls.replace("[", "").replace("]", "").split(", ");
276 | for (String url : Constant.USELESS_IMAGE_URL) {
277 | // 删除所有出现的元素
278 | imageUrls = ArrayUtils.removeAllOccurences(imageUrls, url);
279 | }
280 | }
281 | String title = rs.getString(3);
282 | String date = rs.getDate(4).toString();
283 | int readTimes = rs.getInt(5);
284 | String summary = rs.getString(6);
285 | SimpleArticleItem article = new SimpleArticleItem(id, imageUrls, title, date, readTimes, summary);
286 | articles.add(article);
287 | }
288 | return articles;
289 | }
290 |
291 | /**
292 | * 返回某个表 最新的Constant.EACH_AMOUNT条新闻
293 | * 只是 listview 展示
294 | * 分页展示,需要 type
295 | * 大于某个给定的id
296 | * 修改了数据库中图片数组
297 | * 对于附件图标、doc 图标等不返回给手机端
298 | * @param type
299 | * @param threshold
300 | * @return
301 | * @throws SQLException
302 | */
303 | public List moreArticles(int type, int morethan) throws SQLException {
304 | String tableName = TableName.getTableByType(type);
305 |
306 | String selectColumns = "select id,image_urls,title,publish_date,read_times,summary from " + tableName;
307 | String limitCount = " order by id desc limit " + Constant.EACH_AMOUNT;
308 |
309 | // 这儿两个 sql 语句要同步修改
310 | String query = selectColumns + " where id > ? " + limitCount;
311 |
312 | PreparedStatement preparedStmt = connection.prepareStatement(query);
313 |
314 | preparedStmt.setInt(1, morethan);
315 |
316 | // 如果是首页 这是很少的情况
317 | if (morethan == -1) {
318 | query = selectColumns + limitCount;
319 | preparedStmt = connection.prepareStatement(query);
320 | }
321 |
322 | ResultSet rs = preparedStmt.executeQuery();
323 |
324 | List articles = new ArrayList(Constant.EACH_AMOUNT);
325 |
326 | while (rs.next()) {
327 | int id = rs.getInt(1);
328 | String[] imageUrls = {};
329 | String urls = rs.getString(2);
330 | // split 最少也是返回一个元素 [] 返回 [""s]
331 | if (!urls.equals("[]")) {
332 | imageUrls = urls.replace("[", "").replace("]", "").split(", ");
333 | for (String url : Constant.USELESS_IMAGE_URL) {
334 | // 删除所有出现的元素
335 | imageUrls = ArrayUtils.removeAllOccurences(imageUrls, url);
336 | }
337 | }
338 | String title = rs.getString(3);
339 | String date = rs.getDate(4).toString();
340 | int readTimes = rs.getInt(5);
341 | String summary = rs.getString(6);
342 | SimpleArticleItem article = new SimpleArticleItem(id, imageUrls, title, date, readTimes, summary);
343 | articles.add(article);
344 | }
345 | return articles;
346 | }
347 |
348 | public int insertArticle(String tableName, ArticleItem article) throws SQLException {
349 | // the mysql insert statement
350 | String query = " insert ignore into " + tableName
351 | + " (id, image_urls, title, publish_date, read_times,source,body)" + " values (?, ?, ?, ?, ?,?,?)";
352 |
353 | // create the mysql insert preparedstatement
354 | PreparedStatement preparedStmt = connection.prepareStatement(query);
355 | preparedStmt.setInt(1, article.getId());
356 | preparedStmt.setString(2, Arrays.toString(article.getImageUrls()));
357 | preparedStmt.setString(3, article.getTitle());
358 | preparedStmt.setDate(4, Date.valueOf(article.getPublishDate()));
359 | preparedStmt.setInt(5, article.getReadTimes());
360 | preparedStmt.setString(6, article.getSource());
361 | preparedStmt.setString(7, article.getBody());
362 | return preparedStmt.executeUpdate();
363 | }
364 |
365 | /**
366 | * 判断某个栏目对应的table是否空
367 | * @param type
368 | * @return
369 | * @throws SQLException
370 | */
371 | public boolean isTableEmpty(int type) throws SQLException {
372 | String tableName = TableName.getTableByType(type);
373 |
374 | String query = "select count(*) from " + tableName;
375 |
376 | PreparedStatement preparedStmt = connection.prepareStatement(query);
377 |
378 | ResultSet rs = preparedStmt.executeQuery();
379 |
380 | if (rs.next()) {
381 | if (rs.getInt(1) > 0) {
382 | return false;
383 | }
384 | }
385 | return true;
386 | }
387 |
388 | /**
389 | * 获取某个表中的最小 id,也就是最新的新闻 id
390 | * @param type
391 | * @return
392 | * @throws SQLException
393 | */
394 | public int getMinId(int type) throws SQLException {
395 |
396 | String tableName = TableName.getTableByType(type);
397 | // 取出最新的新闻 id 表名不能用PreparedStatement
398 | String query = "select min(id) from " + tableName;
399 | // create the mysql preparedstatement
400 | PreparedStatement preparedStmt = connection.prepareStatement(query);
401 |
402 | ResultSet rs = preparedStmt.executeQuery();
403 | // 空记录 null 会返回 0
404 | if (rs.next()) {
405 | return rs.getInt(1);
406 | }
407 | // 如果数据库没最大的 id,返回 -1
408 | return -1;
409 | }
410 |
411 | /**
412 | * 获取某个表中的最大 id,也就是最新的新闻 id
413 | * @param type
414 | * @return
415 | * @throws SQLException
416 | */
417 | public int getMaxId(int type) throws SQLException {
418 |
419 | String tableName = TableName.getTableByType(type);
420 | // 取出最新的新闻 id 表名不能用PreparedStatement
421 | String query = "select max(id) from " + tableName;
422 | // create the mysql preparedstatement
423 | PreparedStatement preparedStmt = connection.prepareStatement(query);
424 |
425 | ResultSet rs = preparedStmt.executeQuery();
426 | // 空记录 null 会返回 0
427 | if (rs.next()) {
428 | return rs.getInt(1);
429 | }
430 | // 如果数据库没最大的 id,返回 -1
431 | return -1;
432 | }
433 |
434 | /**
435 | * 判断某个记录是否存在
436 | * @param type
437 | * @return
438 | * @throws SQLException
439 | */
440 | public boolean isIdExist(int type, int id) throws SQLException {
441 |
442 | String tableName = TableName.getTableByType(type);
443 | // 取出最新的新闻 id 表名不能用PreparedStatement
444 |
445 | String query = "select exists(select 1 from " + tableName + " where id =?)";
446 | // create the mysql preparedstatement
447 | PreparedStatement preparedStmt = connection.prepareStatement(query);
448 | preparedStmt.setInt(1, id);
449 | ResultSet rs = preparedStmt.executeQuery();
450 | // 空记录 null 会返回 0
451 | if (rs.next()) {
452 | if (rs.getInt(1) == 1)
453 | return true;
454 | else
455 | return false;
456 | }
457 | // 如果数据库没最大的 id,返回 -1
458 | return false;
459 | }
460 | }
--------------------------------------------------------------------------------
/src/com/chenxb/dao/RotationImageDao.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.dao;
2 |
3 | import static com.chenxb.util.Constant.DEBUG;
4 |
5 | import java.sql.Connection;
6 | import java.sql.Date;
7 | import java.sql.PreparedStatement;
8 | import java.sql.ResultSet;
9 | import java.sql.SQLException;
10 | import java.util.ArrayList;
11 | import java.util.Arrays;
12 | import java.util.List;
13 |
14 | import org.apache.commons.lang3.ArrayUtils;
15 |
16 | import com.chenxb.biz.ArticleBiz;
17 | import com.chenxb.biz.ColumnBiz;
18 | import com.chenxb.biz.RotationImageBiz;
19 | import com.chenxb.model.ArticleItem;
20 | import com.chenxb.model.RotationItem;
21 | import com.chenxb.util.Constant;
22 | import com.chenxb.util.MysqlTool;
23 | import com.chenxb.util.TableName;
24 | import com.chenxb.util.TimeTool;
25 |
26 | /**
27 | * 将首页轮播图片新闻,插入到 Mysql 中
28 | * @author tomchen
29 | *
30 | */
31 | public class RotationImageDao {
32 | private Connection connection;
33 | private static final String TABLE_RATATION = "rotation";
34 |
35 | public RotationImageDao() throws Exception {
36 | connection = new MysqlTool().getConnection();
37 | }
38 |
39 | /**
40 | * @throws Exception
41 | *
42 | */
43 | public void initRotations() throws Exception {
44 |
45 | List rotations = RotationImageBiz.parseHomeRotaions();
46 |
47 | if (rotations == null || rotations.isEmpty())
48 | return;
49 |
50 | for (RotationItem rotation : rotations) {
51 | if (DEBUG) {
52 | System.out.println(TimeTool.getCurrentTime() + " insert " + rotation.getId() + " " + rotation.getTitle()
53 | + " type " + rotation.getType() + " into " + TABLE_RATATION);
54 | }
55 | insertRotationItem(rotation);
56 | // 等待时间,避免对被爬取的网站负载过大
57 | TimeTool.sleepSomeTime();
58 | }
59 | }
60 |
61 | /**
62 | * 从数据库中获取 多条 轮播图片记录
63 | * @param type
64 | * @return
65 | * @throws SQLException
66 | */
67 | public List getTopRotations() throws SQLException {
68 |
69 | String query = "select * from " + TABLE_RATATION + " order by id desc limit " + Constant.ROTATION_AMOUNT;
70 |
71 | PreparedStatement preparedStmt = connection.prepareStatement(query);
72 |
73 | ResultSet rs = preparedStmt.executeQuery();
74 |
75 | List rotations = new ArrayList(Constant.ROTATION_AMOUNT);
76 |
77 | while (rs.next()) {
78 | int id = rs.getInt(1);
79 | String[] imageUrl = { rs.getString(2).replace("[", "").replace("]", "") };
80 |
81 | String title = rs.getString(3);
82 | int type = rs.getInt(4);
83 | rotations.add(new RotationItem(id, imageUrl, title, type));
84 | }
85 | return rotations;
86 | }
87 |
88 | public int insertRotationItem(RotationItem rotation) throws SQLException {
89 | // the mysql insert statement
90 | // 根据 type 找到某条新闻属于的栏目
91 | String query = " insert ignore into " + TABLE_RATATION + " (id, image_url, title, type)"
92 | + " values (?, ?, ?, ?)";
93 |
94 | // create the mysql insert preparedstatement
95 | PreparedStatement preparedStmt = connection.prepareStatement(query);
96 | preparedStmt.setInt(1, rotation.getId());
97 | preparedStmt.setString(2, Arrays.toString(rotation.getImageUrl()));
98 | preparedStmt.setString(3, rotation.getTitle());
99 | preparedStmt.setInt(4, rotation.getType());
100 | return preparedStmt.executeUpdate();
101 | }
102 |
103 | }
--------------------------------------------------------------------------------
/src/com/chenxb/dao/SearchDao.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.dao;
2 |
3 | import java.nio.file.FileSystems;
4 | import java.sql.Connection;
5 | import java.sql.DriverManager;
6 | import java.sql.ResultSet;
7 | import java.sql.Statement;
8 |
9 | import org.apache.lucene.analysis.Analyzer;
10 | import org.apache.lucene.document.Document;
11 | import org.apache.lucene.document.Field;
12 | import org.apache.lucene.document.FieldType;
13 | import org.apache.lucene.document.IntField;
14 | import org.apache.lucene.document.StringField;
15 | import org.apache.lucene.document.TextField;
16 | import org.apache.lucene.index.DirectoryReader;
17 | import org.apache.lucene.index.IndexReader;
18 | import org.apache.lucene.index.IndexWriter;
19 | import org.apache.lucene.index.IndexWriterConfig;
20 | import org.apache.lucene.index.IndexWriterConfig.OpenMode;
21 | import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
22 | import org.apache.lucene.search.IndexSearcher;
23 | import org.apache.lucene.search.Query;
24 | import org.apache.lucene.search.Sort;
25 | import org.apache.lucene.search.SortField;
26 | import org.apache.lucene.search.SortField.Type;
27 | import org.apache.lucene.search.TopDocs;
28 | import org.apache.lucene.store.Directory;
29 | import org.apache.lucene.store.FSDirectory;
30 | import org.lionsoul.jcseg.analyzer.v5x.JcsegAnalyzer5X;
31 | import org.lionsoul.jcseg.tokenizer.core.JcsegTaskConfig;
32 |
33 | /**
34 | * 利用 Lucene 搜索 mysql 里的记录
35 | * 全文搜索
36 | *
37 | */
38 | public class SearchDao {
39 | /**
40 | * this is index directory path where all index file will be stored which lucene uses internally.
41 | */
42 |
43 | /**
44 | * to create index on simple database table
45 | */
46 | public void createIndex() {
47 |
48 | System.out.println("-- Indexing --");
49 |
50 | try {
51 | /** JDBC Section */
52 | Class.forName("com.mysql.jdbc.Driver").newInstance();
53 |
54 | // 后面unicode和utf8设置防止中文乱码
55 | String url = "jdbc:mysql://127.0.0.1:3306/see_news?useSSL=false&useUnicode=true&characterEncoding=utf-8";
56 | String name = "root";
57 | String password = "chenxb123";
58 |
59 | Connection conn = DriverManager.getConnection(url, name, password);
60 |
61 | Statement stmt = conn.createStatement();
62 | String sql = "select * from bachelor order by id desc limit 1000";
63 | ResultSet rs = stmt.executeQuery(sql);
64 |
65 | /** defining Analyzer */
66 |
67 | // 1. create the index
68 | Directory directory = FSDirectory.open(FileSystems.getDefault().getPath("./index2222"));
69 |
70 | // 创建标准文本分析器, 标准的是可以支持的中文的
71 |
72 | // StandardAnalyzer luceneAnalyzer = new StandardAnalyzer();
73 |
74 | Analyzer luceneAnalyzer = new JcsegAnalyzer5X(JcsegTaskConfig.COMPLEX_MODE);
75 |
76 | /** preparing config for indexWriter */
77 | IndexWriterConfig writerConfig = new IndexWriterConfig(luceneAnalyzer);
78 |
79 | /** Create a new index in the directory, removing any previously indexed documents */
80 | writerConfig.setOpenMode(OpenMode.CREATE);
81 | /**
82 | * Optional: for better indexing performance, if you are indexing many documents,
83 | * increase the RAM buffer. But if you do this, increase the max heap size to the JVM (eg add -Xmx512m or -Xmx1g):
84 | */
85 | // writerConfig.setRAMBufferSizeMB(256.0);
86 |
87 | IndexWriter iWriter = new IndexWriter(directory, writerConfig);
88 |
89 | int count = 0;
90 | Document doc = null;
91 | Field field = null;
92 |
93 | /** declaring string type */
94 | FieldType stringType = new FieldType();
95 | stringType.setTokenized(true);
96 |
97 | /** Looping through resultset and adding data to index file */
98 | while (rs.next()) {
99 | doc = new Document();
100 |
101 | /** adding id in document */
102 | field = new IntField("id", rs.getInt("id"), Field.Store.YES);
103 | doc.add(field);
104 |
105 | /** adding name in document */
106 | field = new TextField("title", rs.getString("title"), Field.Store.YES);
107 | doc.add(field);
108 |
109 | /** adding details in document */
110 | field = new TextField("body", rs.getString("body"), Field.Store.YES);
111 | doc.add(field);
112 |
113 | /** Adding doc to iWriter */
114 | iWriter.addDocument(doc);
115 | count++;
116 | }
117 |
118 | System.out.println(count + " record indexed");
119 |
120 | /** Closing iWriter */
121 | iWriter.commit();
122 | iWriter.close();
123 |
124 | /** Closing JDBC connection */
125 | rs.close();
126 | stmt.close();
127 | conn.close();
128 |
129 | } catch (Exception e) {
130 | e.printStackTrace();
131 | }
132 |
133 | }
134 |
135 | /**
136 | * to search the keywords
137 | *
138 | * @param keyword
139 | */
140 | public void search(String keyword) {
141 |
142 | System.out.println("-- Seaching --");
143 |
144 | try {
145 | /** Searching */
146 | Directory index = FSDirectory.open(FileSystems.getDefault().getPath("./index2222"));
147 |
148 | IndexReader directoryReader = DirectoryReader.open(index);
149 |
150 | // IndexReader directoryReader = DirectoryReader
151 | // .open(FSDirectory.open(FileSystems.getDefault().getPath("./index2222")));
152 |
153 | IndexSearcher searcher = new IndexSearcher(directoryReader);
154 | // StandardAnalyzer keywordAnalyzer = new StandardAnalyzer();
155 | Analyzer luceneAnalyzer = new JcsegAnalyzer5X(JcsegTaskConfig.COMPLEX_MODE);
156 |
157 | /** MultiFieldQueryParser is used to search multiple fields */
158 | String[] filesToSearch = { "id", "title", "body" };
159 | MultiFieldQueryParser mqp = new MultiFieldQueryParser(filesToSearch, luceneAnalyzer);
160 |
161 | /** search the given keyword */
162 | Query query = mqp.parse(keyword);
163 | System.out.println("query >> " + query);
164 | //
165 | // /** defining the sorting on filed "name" */
166 | Sort nameSort = new Sort(new SortField("id", Type.STRING));
167 |
168 | /** run the query */
169 | TopDocs hits = searcher.search(query, 1000);
170 | System.out.println("Results found >> " + hits.totalHits);
171 |
172 | Document doc = null;
173 | for (int i = 0; i < hits.totalHits; i++) {
174 | /** get the next document */
175 | doc = searcher.doc(hits.scoreDocs[i].doc);
176 | System.out.println("==========" + (i + 1) + " : Start Record=========\nId :: " + doc.get("id")
177 | + "\ntitle :: " + doc.get("title") + "\n==========End Record=========\n");
178 | }
179 | } catch (Exception e) {
180 | e.printStackTrace();
181 | }
182 |
183 | }
184 |
185 | /**
186 | * main method to check the output
187 | *
188 | * @param args
189 | */
190 | public static void main(String[] args) {
191 |
192 | SearchDao obj = new SearchDao();
193 |
194 | /** creating index */
195 | // obj.createIndex();
196 |
197 | /** searching simple keyword */
198 | System.out.println("==================searching simple keyword===========================");
199 | obj.search("电院课表");
200 |
201 | // /** searching simple keyword */
202 | // System.out.println("==================searching simple
203 | // keyword===========================");
204 | // obj.search("褚");
205 | //
206 | // /** searching using wild card */
207 | // System.out.println("==================searching using wild
208 | // card===========================");
209 | // obj.search("791");
210 | //
211 | // /** searching using logical OR operator */
212 | // System.out.println("==================searching using logical OR
213 | // operator===========================");
214 | // obj.search("院");
215 | }
216 |
217 | }
--------------------------------------------------------------------------------
/src/com/chenxb/dao/SummaryDao.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.dao;
2 |
3 | import java.sql.Connection;
4 | import java.sql.PreparedStatement;
5 | import java.sql.ResultSet;
6 | import java.sql.SQLException;
7 | import java.util.List;
8 |
9 | import org.jsoup.Jsoup;
10 | import org.jsoup.nodes.Document;
11 |
12 | import com.chenxb.model.ArticleItem;
13 | import com.chenxb.util.MysqlTool;
14 | import com.chenxb.util.TableName;
15 | import com.hankcs.hanlp.HanLP;
16 |
17 | /**
18 | * 增加文章的摘要
19 | * 可以先把数据存入 mysql,再读取mysql 的数据增加摘要
20 | * 也可以爬虫的时候就把摘要存入 mysql
21 | * @author tomchen
22 | *
23 | */
24 | public class SummaryDao {
25 | private Connection connection;
26 |
27 | public SummaryDao() throws Exception {
28 | connection = new MysqlTool().getConnection();
29 | }
30 |
31 | /**
32 | * 根据 type 找到数据库表名称
33 | * 再从该表里找出 id 对应的新闻
34 | * @param type
35 | * @param id
36 | * @return
37 | * @throws SQLException
38 | */
39 | public int updateSummary(int type, int id) throws SQLException {
40 | // 根据 type 找出对应的 table 名称
41 | String tableName = TableName.getTableByType(type);
42 |
43 | String body = getArticleBody(type, id);
44 | // body是 html 表示的
45 | Document doc = Jsoup.parse(body);
46 | List sentenceList = HanLP.extractSummary(doc.text(), 3);
47 |
48 | // 如果摘要是空,不采取任何操作
49 | if (sentenceList.isEmpty()) {
50 |
51 | String update = "update " + tableName + " set summary = title WHERE id= ?";
52 |
53 | // create the mysql preparedstatement
54 | PreparedStatement preparedStmt = connection.prepareStatement(update);
55 | preparedStmt.setInt(1, id);
56 |
57 | return preparedStmt.executeUpdate();
58 | } else {
59 | String summary = sentenceList.toString();
60 | // 去掉 list 首尾的[ 和 ]
61 | summary = summary.substring(1, summary.length() - 1);
62 | summary = summary.replaceAll("&" + "nbsp;", "");
63 | // unicode 空格是160
64 | summary = summary.replaceAll(String.valueOf((char) 160), "");
65 | // 将多个空格替换为1个空格
66 | summary = summary.trim().replaceAll("\\s+", " ") + "。";
67 |
68 | String update = "update " + tableName + " set summary = ? WHERE id= ?";
69 |
70 | // create the mysql preparedstatement
71 | PreparedStatement preparedStmt = connection.prepareStatement(update);
72 | preparedStmt.setString(1, summary);
73 | preparedStmt.setInt(2, id);
74 | return preparedStmt.executeUpdate();
75 | }
76 |
77 | }
78 |
79 | /**
80 | * 找出新闻的主体内容
81 | * @param type
82 | * @param id
83 | * @return
84 | * @throws SQLException
85 | */
86 | public String getArticleBody(int type, int id) throws SQLException {
87 | // 根据 type 找出对应的 table 名称
88 | String tableName = TableName.getTableByType(type);
89 |
90 | // the mysql select statement
91 | String query = "select body from " + tableName + " where id = ?";
92 |
93 | // create the mysql preparedstatement
94 | PreparedStatement preparedStmt = connection.prepareStatement(query);
95 | preparedStmt.setInt(1, id);
96 |
97 | ResultSet rs = preparedStmt.executeQuery();
98 | if (rs.next()) {
99 | return rs.getString(1);
100 | }
101 | return "";
102 | }
103 |
104 | }
105 |
--------------------------------------------------------------------------------
/src/com/chenxb/jpush/TestJpush.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.jpush;
2 |
3 | import java.sql.SQLException;
4 | import java.util.HashMap;
5 | import java.util.Map;
6 |
7 | import org.slf4j.Logger;
8 | import org.slf4j.LoggerFactory;
9 |
10 | import com.chenxb.dao.ArticleDao;
11 | import com.chenxb.model.ArticleItem;
12 |
13 | import cn.jpush.api.JPushClient;
14 | import cn.jpush.api.common.ClientConfig;
15 | import cn.jpush.api.common.resp.APIConnectionException;
16 | import cn.jpush.api.common.resp.APIRequestException;
17 | import cn.jpush.api.push.PushResult;
18 | import cn.jpush.api.push.model.Platform;
19 | import cn.jpush.api.push.model.PushPayload;
20 | import cn.jpush.api.push.model.SMS;
21 | import cn.jpush.api.push.model.audience.Audience;
22 | import cn.jpush.api.push.model.notification.Notification;
23 |
24 | public class TestJpush {
25 |
26 | private static final String appKey = "8c4911096188db2e7f2b370c";
27 | private static final String masterSecret = "1cd48b15285f5c6f100f46d4";
28 | public static final String ALERT = "救助郭燕-电院2000级校友,参与互联网众筹,通过网络传递爱心!";
29 |
30 | public static final String TITLE = "电院最新资讯";
31 |
32 | protected static final Logger LOG = LoggerFactory.getLogger(TestJpush.class);
33 |
34 | public static void main(String[] args) {
35 |
36 | JPushClient jpushClient = new JPushClient(masterSecret, appKey, 3);
37 | // For push, all you need do is to build PushPayload object.
38 | PushPayload payload = buildPushObject_android_tag_alertWithTitle();
39 |
40 | System.out.println("PushPayload 信息" + payload.toString());
41 |
42 | try {
43 | PushResult result = jpushClient.sendPush(payload);
44 | LOG.info("Got result - " + result);
45 |
46 | } catch (APIConnectionException e) {
47 | // Connection error, should retry later
48 | LOG.error("Connection error, should retry later", e);
49 |
50 | } catch (APIRequestException e) {
51 | // Should review the error, and fix the request
52 | LOG.error("Should review the error, and fix the request", e);
53 | LOG.info("HTTP Status: " + e.getStatus());
54 | LOG.info("Error Code: " + e.getErrorCode());
55 | LOG.info("Error Message: " + e.getErrorMessage());
56 | }
57 |
58 | }
59 |
60 | public static PushPayload buildPushObject_android_tag_alertWithTitle() {
61 | try {
62 | return PushPayload.newBuilder().setPlatform(Platform.android()).setAudience(Audience.all())
63 | .setNotification(Notification.android(ALERT, TITLE, getArticleExtraInfo())).build();
64 | } catch (SQLException e) {
65 | e.printStackTrace();
66 | } catch (Exception e) {
67 | e.printStackTrace();
68 | }
69 | return null;
70 | }
71 |
72 | public static Map getArticleExtraInfo() throws SQLException, Exception {
73 | ArticleItem article = new ArticleDao().getArticleByTypeId(0, 7948);
74 | Map extras = new HashMap();
75 |
76 | extras.put("type", "0");
77 | extras.put("id", article.getId() + "");
78 | extras.put("publishDate", article.getPublishDate());
79 | extras.put("readTimes", article.getReadTimes() + "");
80 |
81 | return extras;
82 | }
83 |
84 | }
85 |
--------------------------------------------------------------------------------
/src/com/chenxb/model/ArticleItem.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.model;
2 |
3 | import java.util.Arrays;
4 |
5 | /**
6 | * model/ItemNews.java 新闻详情页面用到的完整实体类 新闻实体类 包括标题,发布日期,阅读次数,新闻主体内容等
7 | * @author tomchen
8 | *
9 | */
10 |
11 | public class ArticleItem extends SimpleArticleItem {
12 | // 图片资源不是必须的
13 | private String source;
14 | private String body;
15 |
16 | public ArticleItem(int id, String[] imageUrls, String title, String publishDate, int readTimes, String source,
17 | String body) {
18 | super(id,imageUrls, title, publishDate, readTimes);
19 | this.source = source;
20 | this.body = body;
21 | }
22 |
23 | public String getSource() {
24 | return source;
25 | }
26 |
27 | public void setSource(String source) {
28 | this.source = source;
29 | }
30 |
31 | public String getBody() {
32 | return body;
33 | }
34 |
35 | public void setBody(String body) {
36 | this.body = body;
37 | }
38 |
39 | @Override
40 | public String toString() {
41 | return "ArticleItem [id=" + getId() + ",\n imageUrls=" + Arrays.toString(getImageUrls()) + ",\n title="
42 | + getTitle() + ",\n publishDate=" + getPublishDate() + ",\n source=" + source + ",\n readTimes="
43 | + getReadTimes() + ",\n body=" + body + "]";
44 | }
45 |
46 | }
--------------------------------------------------------------------------------
/src/com/chenxb/model/RotationItem.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.model;
2 |
/**
 * JavaBean for one rotation (carousel) image shown on the home page.
 *
 * @author tomchen
 */
public class RotationItem {

    int id;
    // the home page entry only carries a single image
    String[] imageUrls;
    String title;
    // numeric column type, e.g. 1 = news/notifications, 2 = undergraduate teaching; see ColumnType
    int type;

    /**
     * Creates a rotation item.
     *
     * @param id id of the article the image belongs to
     * @param imageUrls image urls of the item
     * @param title title shown with the image
     * @param type numeric column type (see ColumnType)
     */
    public RotationItem(int id, String[] imageUrls, String title, int type) {
        this.id = id;
        this.imageUrls = imageUrls;
        this.title = title;
        this.type = type;
    }

    /**
     * Returns a readable dump of all fields.
     *
     * <p>The url array is rendered element by element; concatenating the array itself
     * would only print its type and identity hash.
     */
    @Override
    public String toString() {
        return "RotationItem [id=" + id + ", imageUrls=" + java.util.Arrays.toString(imageUrls) + ", title="
                + title + ", type=" + type + "]";
    }

    /** @return the item id */
    public int getId() {
        return id;
    }

    /** @param id the item id to store */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the image url array (the same array instance, not a copy) */
    public String[] getImageUrl() {
        return imageUrls;
    }

    /** @param imageUrls the image url array to store */
    public void setImageUrl(String[] imageUrls) {
        this.imageUrls = imageUrls;
    }

    /** @return the title */
    public String getTitle() {
        return title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the numeric column type */
    public int getType() {
        return type;
    }

    /** @param type the numeric column type to store */
    public void setType(int type) {
        this.type = type;
    }

}
62 |
--------------------------------------------------------------------------------
/src/com/chenxb/model/SimpleArticleItem.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.model;
2 |
3 | import java.util.Arrays;
4 |
5 | /**
6 | * listview 用到的简单实体类
7 | * 只包括 id,标题,发布日期,阅读次数
8 | * 没有新闻主体内容等
9 | * @author tomchen
10 | *
11 | */
12 |
13 | public class SimpleArticleItem {
14 |
15 | private int id;
16 | private String[] imageUrls;
17 | // 图片资源不是必须的
18 | private String title;
19 | private String publishDate;
20 | private int readTimes;
21 | private String summary;
22 |
23 | public SimpleArticleItem(int id, String[] imageUrls, String title, String publishDate, int readTimes) {
24 | this.id = id;
25 | this.imageUrls = imageUrls;
26 | this.title = title;
27 | this.publishDate = publishDate;
28 | this.readTimes = readTimes;
29 | }
30 |
31 | public SimpleArticleItem(int id, String[] imageUrls, String title, String publishDate, int readTimes,
32 | String summary) {
33 | this.id = id;
34 | this.imageUrls = imageUrls;
35 | this.title = title;
36 | this.publishDate = publishDate;
37 | this.readTimes = readTimes;
38 | this.summary = summary;
39 | }
40 |
41 | public int getId() {
42 | return id;
43 | }
44 |
45 | public void setId(int id) {
46 | this.id = id;
47 | }
48 |
49 | public String[] getImageUrls() {
50 | return imageUrls;
51 | }
52 |
53 | public void setImageUrls(String[] imageUrls) {
54 | this.imageUrls = imageUrls;
55 | }
56 |
57 | public String getTitle() {
58 | return title;
59 | }
60 |
61 | public void setTitle(String title) {
62 | this.title = title;
63 | }
64 |
65 | public String getPublishDate() {
66 | return publishDate;
67 | }
68 |
69 | public void setPublishDate(String publishDate) {
70 | this.publishDate = publishDate;
71 | }
72 |
73 | public int getReadTimes() {
74 | return readTimes;
75 | }
76 |
77 | public void setReadTimes(int readTimes) {
78 | this.readTimes = readTimes;
79 | }
80 |
81 | public String getSummary() {
82 | return summary;
83 | }
84 |
85 | public void setSummary(String summary) {
86 | this.summary = summary;
87 | }
88 |
89 | @Override
90 | public String toString() {
91 | return "SimpleArticleItem [id=" + id + ", imageUrls=" + Arrays.toString(imageUrls) + ", title=" + title
92 | + ", publishDate=" + publishDate + ", readTimes=" + readTimes + ", summary=" + summary + "]";
93 | }
94 | }
--------------------------------------------------------------------------------
/src/com/chenxb/news/HelloLucene.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.news;
2 |
3 | import java.io.IOException;
4 | import java.nio.file.FileSystems;
5 | import java.sql.Connection;
6 | import java.sql.DriverManager;
7 | import java.sql.ResultSet;
8 | import java.sql.Statement;
9 |
10 | import org.apache.lucene.analysis.Analyzer;
11 | import org.apache.lucene.document.Document;
12 | import org.apache.lucene.document.Field;
13 | import org.apache.lucene.document.StringField;
14 | import org.apache.lucene.document.TextField;
15 | import org.apache.lucene.index.DirectoryReader;
16 | import org.apache.lucene.index.IndexReader;
17 | import org.apache.lucene.index.IndexWriter;
18 | import org.apache.lucene.index.IndexWriterConfig;
19 | import org.apache.lucene.queryparser.classic.QueryParser;
20 | import org.apache.lucene.search.IndexSearcher;
21 | import org.apache.lucene.search.Query;
22 | import org.apache.lucene.search.ScoreDoc;
23 | import org.apache.lucene.search.TopScoreDocCollector;
24 | import org.apache.lucene.store.Directory;
25 | import org.apache.lucene.store.FSDirectory;
26 | import org.lionsoul.jcseg.analyzer.v5x.JcsegAnalyzer5X;
27 | import org.lionsoul.jcseg.tokenizer.core.JcsegTaskConfig;
28 |
29 | public class HelloLucene {
30 | public static void main(String[] args) throws Exception {
31 |
32 | // 0. Specify the analyzer for tokenizing text.
33 | // The same analyzer should be used for indexing and searching
34 | Analyzer analyzer = new JcsegAnalyzer5X(JcsegTaskConfig.COMPLEX_MODE);
35 |
36 |
37 | // 1. create the index
38 | Directory index = FSDirectory.open(FileSystems.getDefault().getPath("./index22"));
39 |
40 | System.out.println(index.toString());
41 | IndexWriterConfig config = new IndexWriterConfig(analyzer);
42 |
43 | IndexWriter w = new IndexWriter(index, config);
44 |
45 | /** JDBC Section */
46 | Class.forName("com.mysql.jdbc.Driver").newInstance();
47 |
48 | // 后面unicode和utf8设置防止中文乱码
49 | String url = "jdbc:mysql://127.0.0.1:3306/see_news?useSSL=false&useUnicode=true&characterEncoding=utf-8";
50 | String name = "root";
51 | String password = "chenxb123";
52 |
53 | Connection conn = DriverManager.getConnection(url, name, password);
54 |
55 | Statement stmt = conn.createStatement();
56 | String sql = "select * from academic order by id desc limit 10";
57 | ResultSet rs = stmt.executeQuery(sql);
58 |
59 | while (rs.next()) {
60 | addDoc(w, rs.getString("title"), rs.getInt("id") + "");
61 | }
62 |
63 | addDoc(w, "Lucene in Action", "193398817");
64 | addDoc(w, "Lucene for Dummies", "55320055Z");
65 | addDoc(w, "Managing Gigabytes", "55063554A");
66 | addDoc(w, "The Art of Computer Science", "9900333X");
67 | w.close();
68 |
69 | // 2. query
70 | String querystr = args.length > 0 ? args[0] : "新加坡";
71 |
72 | // the "title" arg specifies the default field to use
73 | // when no field is explicitly specified in the query.
74 | Query q = new QueryParser("title", analyzer).parse(querystr);
75 |
76 | // 3. search
77 | int hitsPerPage = 10;
78 | IndexReader reader = DirectoryReader.open(index);
79 | IndexSearcher searcher = new IndexSearcher(reader);
80 | TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage);
81 | searcher.search(q, collector);
82 | ScoreDoc[] hits = collector.topDocs().scoreDocs;
83 |
84 | // 4. display results
85 | System.out.println("Found " + hits.length + " hits.");
86 | for (int i = 0; i < hits.length; ++i) {
87 | int docId = hits[i].doc;
88 | Document d = searcher.doc(docId);
89 | System.out.println((i + 1) + ". " + d.get("isbn") + "\t" + d.get("title"));
90 | }
91 |
92 | // reader can only be closed when there
93 | // is no need to access the documents any more.
94 | reader.close();
95 | }
96 |
97 | private static void addDoc(IndexWriter w, String title, String isbn) throws IOException {
98 | Document doc = new Document();
99 | doc.add(new TextField("title", title, Field.Store.YES));
100 |
101 | // use a string field for isbn because we don't want it tokenized
102 | doc.add(new StringField("isbn", isbn, Field.Store.YES));
103 | w.addDocument(doc);
104 | }
105 | }
--------------------------------------------------------------------------------
/src/com/chenxb/news/LoadRotation.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.news;
2 |
3 | import com.chenxb.dao.RotationImageDao;
4 |
5 | public class LoadRotation {
6 | public static void main(String[] args) throws Exception {
7 | new RotationImageDao().initRotations();
8 | }
9 |
10 | }
11 |
--------------------------------------------------------------------------------
/src/com/chenxb/news/ReUploadImage.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.news;
2 |
3 | import com.chenxb.util.ImageTool;
4 |
5 | public class ReUploadImage {
6 | public static void main(String[] args) {
7 | String origin = "http://rsc.xidian.edu.cn/plus/img/addon.gif";
8 | System.out.println(ImageTool.convertUrl(0000, origin,"a2f5daa62be22c5a07ea60d8db6741f"));
9 | }
10 |
11 | }
12 |
--------------------------------------------------------------------------------
/src/com/chenxb/news/ReloadAcademic.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.news;
2 |
3 | import com.chenxb.dao.ColumnDao;
4 | import com.chenxb.util.ColumnType;
5 |
6 | public class ReloadAcademic {
7 | public static void main(String[] args) {
8 | new Thread() {
9 | public void run() {
10 | try {
11 | new ColumnDao().reInitArticles(ColumnType.ACADEMIC);
12 | } catch (Exception e) {
13 | e.printStackTrace();
14 | }
15 |
16 | }
17 | }.start();
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/com/chenxb/news/ReloadAll.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.news;
2 |
3 | import java.sql.Connection;
4 | import java.util.ArrayList;
5 | import java.util.Arrays;
6 |
7 | import org.apache.commons.lang3.ArrayUtils;
8 |
9 | import com.chenxb.biz.ColumnBiz;
10 | import com.chenxb.dao.ColumnDao;
11 | import com.chenxb.util.ColumnType;
12 | import com.chenxb.util.MysqlTool;
13 | import com.chenxb.util.TableName;
14 |
15 | public class ReloadAll {
16 | public static void main(String arg[]) throws Exception {
17 | new Thread() {
18 | public void run() {
19 | try {
20 | new ColumnDao().reInitArticles(ColumnType.BACHELOR);
21 | } catch (Exception e) {
22 | e.printStackTrace();
23 | }
24 |
25 | }
26 | }.start();
27 |
28 | new Thread() {
29 | public void run() {
30 | try {
31 | new ColumnDao().reInitArticles(ColumnType.MASTER);
32 | } catch (Exception e) {
33 | e.printStackTrace();
34 | }
35 |
36 | }
37 | }.start();
38 |
39 | new Thread() {
40 | public void run() {
41 | try {
42 | new ColumnDao().reInitArticles(ColumnType.JOB);
43 | } catch (Exception e) {
44 | e.printStackTrace();
45 | }
46 |
47 | }
48 | }.start();
49 |
50 | new Thread() {
51 | public void run() {
52 | try {
53 | new ColumnDao().reInitArticles(ColumnType.ACADEMIC);
54 | } catch (Exception e) {
55 | e.printStackTrace();
56 | }
57 |
58 | }
59 | }.start();
60 |
61 | new Thread() {
62 | public void run() {
63 | try {
64 | new ColumnDao().reInitArticles(ColumnType.LATEST);
65 | } catch (Exception e) {
66 | e.printStackTrace();
67 | }
68 |
69 | }
70 | }.start();
71 |
72 | new Thread() {
73 | public void run() {
74 | try {
75 | new ColumnDao().reInitArticles(ColumnType.NOTIFIC);
76 | } catch (Exception e) {
77 | e.printStackTrace();
78 | }
79 |
80 | }
81 | }.start();
82 |
83 | // System.out.println(new
84 | // new ColumnDao()().isTableEmpty(ColumnType.BACHELOR));
85 | // Connection connection = MysqlTool.getConnection();
86 | // String tableName = TableName.getTableByType(ColumnType.LATEST);
87 | //
88 | // int[] ids = ColumnBiz.parseColumn(ColumnType.LATEST, 1);
89 | //
90 | // System.out.println(ArrayUtils.indexOf(ids, ids[0]));
91 | // System.out.println(ArrayUtils.indexOf(ids, 6531));
92 | // System.out.println(ids[0]);
93 | // Arrays.binarySearch(ids, 3);
94 | // System.out.println(ArrayUtils.contains(ids, 7948));
95 | // System.out.println(ArrayUtils.contains(ids, 4));
96 | // System.out.println(ArrayUtils.contains(ids, 7945));
97 | // // Arrays.asList(ids);
98 | // // System.out.println(list);
99 |
100 | }
101 |
102 | }
103 |
--------------------------------------------------------------------------------
/src/com/chenxb/news/ReloadBachelor.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.news;
2 |
3 | import com.chenxb.dao.ColumnDao;
4 | import com.chenxb.util.ColumnType;
5 |
6 | public class ReloadBachelor {
7 | public static void main(String[] args) {
8 | new Thread() {
9 | public void run() {
10 | try {
11 | new ColumnDao().reInitArticles(ColumnType.BACHELOR);
12 | } catch (Exception e) {
13 | e.printStackTrace();
14 | }
15 |
16 | }
17 | }.start();
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/com/chenxb/news/ReloadJob.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.news;
2 |
3 | import java.util.ArrayList;
4 |
5 | import com.chenxb.dao.ColumnDao;
6 | import com.chenxb.util.ColumnType;
7 |
8 | public class ReloadJob {
9 | public static void main(String[] args) {
10 | ArrayList a = new ArrayList();
11 | a.add(11);
12 | a.add(112);
13 | System.out.println("array: " + a);
14 | A aaa = new A(a);
15 | System.out.println("A: " + aaa);
16 | a.add(33);
17 | System.out.println("array: " + a);
18 | System.out.println("A: " + aaa);
19 |
20 | }
21 |
22 | }
23 |
24 | class A {
25 | ArrayList arr;
26 |
27 | A(ArrayList arr) {
28 | this.arr = arr;
29 | }
30 |
31 | @Override
32 | public String toString() {
33 | return arr + "";
34 | }
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/src/com/chenxb/news/ReloadLatest.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.news;
2 |
3 | import com.chenxb.dao.ColumnDao;
4 | import com.chenxb.util.ColumnType;
5 |
6 | public class ReloadLatest {
7 | public static void main(String[] args) {
8 | new Thread() {
9 | public void run() {
10 | try {
11 | new ColumnDao().reInitArticles(ColumnType.LATEST);
12 | } catch (Exception e) {
13 | e.printStackTrace();
14 | }
15 |
16 | }
17 | }.start();
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/com/chenxb/news/ReloadMaster.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.news;
2 |
3 | import com.chenxb.dao.ColumnDao;
4 | import com.chenxb.util.ColumnType;
5 |
6 | public class ReloadMaster {
7 | public static void main(String[] args) {
8 | new Thread() {
9 | public void run() {
10 | try {
11 | new ColumnDao().reInitArticles(ColumnType.MASTER);
12 | } catch (Exception e) {
13 | e.printStackTrace();
14 | }
15 |
16 | }
17 | }.start();
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/com/chenxb/news/ReloadNotific.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.news;
2 |
3 | import com.chenxb.dao.ColumnDao;
4 | import com.chenxb.util.ColumnType;
5 |
6 | public class ReloadNotific {
7 | public static void main(String[] args) {
8 | new Thread() {
9 | public void run() {
10 | try {
11 | new ColumnDao().reInitArticles(ColumnType.NOTIFIC);
12 | } catch (Exception e) {
13 | e.printStackTrace();
14 | }
15 |
16 | }
17 | }.start();
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/com/chenxb/news/Test.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.news;
2 |
3 | import java.lang.reflect.Type;
4 | import java.sql.Connection;
5 | import java.sql.DriverManager;
6 | import java.util.ArrayList;
7 | import java.util.Arrays;
8 | import java.util.List;
9 | import java.util.Random;
10 |
11 | import org.apache.commons.lang3.ArrayUtils;
12 | import org.apache.commons.lang3.StringUtils;
13 | import org.jsoup.Jsoup;
14 | import org.jsoup.nodes.Document;
15 | import org.jsoup.nodes.Element;
16 | import org.jsoup.select.Elements;
17 |
18 | import com.chenxb.biz.ArticleBiz;
19 | import com.chenxb.biz.ColumnBiz;
20 | import com.chenxb.dao.ColumnDao;
21 | import com.chenxb.model.ArticleItem;
22 | import com.chenxb.model.SimpleArticleItem;
23 | import com.chenxb.util.ColumnType;
24 | import com.chenxb.util.Constant;
25 | import com.chenxb.util.MysqlTool;
26 | import com.chenxb.util.TableName;
27 | import com.google.gson.Gson;
28 | import com.google.gson.GsonBuilder;
29 | import com.google.gson.reflect.TypeToken;
30 | import com.sina.sae.util.SaeUserInfo;
31 |
/**
 * Scratch class: prints a seeded pseudo-random number and exposes the format strings
 * of the column and article endpoints for the currently selected server.
 */
public class Test {
    private String x;

    /** Selects which base address the url helpers use. */
    public static final boolean isCloud = false;
    // Sina cloud (SAE) address, used from the public internet
    public static final String saeIP = "http://javanews.applinzi.com/";
    // local LAN address, used for testing
    public static final String localIP = "http://192.168.199.133/";

    /**
     * @param arg unused
     * @throws Exception declared by the signature
     */
    public static void main(String arg[]) throws Exception {
        test(7937);
    }

    /**
     * Prints one number in {@code [0, 965)} drawn from a generator seeded with {@code id},
     * so the same id always prints the same number.
     *
     * @param id seed for the generator
     */
    public static void test(int id) {
        System.out.println(new Random(id).nextInt(965));
    }

    /** @return format string of the column endpoint, with two {@code %d} placeholders (column, offset) */
    public static String columnUrl() {
        return baseAddress() + "columnWithSql?column=%d&offset=%d";
    }

    /** @return format string of the article endpoint, with two {@code %d} placeholders (column, id) */
    public static String articleUrl() {
        return baseAddress() + "articleWithSql?column=%d&id=%d";
    }

    /** Picks the cloud or the local base address depending on {@link #isCloud}. */
    private static String baseAddress() {
        return isCloud ? saeIP : localIP;
    }

}
67 |
--------------------------------------------------------------------------------
/src/com/chenxb/news/Test4.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.news;
2 |
3 | import java.sql.Time;
4 | import java.util.ArrayList;
5 | import java.util.List;
6 |
7 | import org.jsoup.Jsoup;
8 | import org.jsoup.nodes.Document;
9 | import org.jsoup.nodes.Element;
10 | import org.jsoup.select.Elements;
11 |
12 | import com.chenxb.biz.ArticleBiz;
13 | import com.chenxb.biz.ColumnBiz;
14 | import com.chenxb.biz.RotationImageBiz;
15 | import com.chenxb.dao.ColumnDao;
16 | import com.chenxb.model.ArticleItem;
17 | import com.chenxb.util.ColumnType;
18 | import com.chenxb.util.Constant;
19 | import com.chenxb.util.ImageTool;
20 | import com.chenxb.util.TableName;
21 | import com.chenxb.util.TimeTool;
22 | import com.chenxb.model.RotationItem;
23 |
/** Scratch program that prints the length of one fixed hexadecimal string. */
public class Test4 {

    /**
     * @param args unused
     * @throws Exception declared by the signature
     */
    public static void main(String[] args) throws Exception {
        final String sample = "ca4b9dadbc73ccfd4c995d7c0a179f95";
        System.out.println(sample.length());
    }

}
59 |
--------------------------------------------------------------------------------
/src/com/chenxb/news/TestJcseg.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.news;
2 |
3 | import org.apache.lucene.analysis.Analyzer;
4 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
5 | import org.apache.lucene.document.Document;
6 | import org.apache.lucene.document.Field;
7 | import org.apache.lucene.document.TextField;
8 | import org.apache.lucene.index.DirectoryReader;
9 | import org.apache.lucene.index.IndexWriter;
10 | import org.apache.lucene.index.IndexWriterConfig;
11 | import org.apache.lucene.queryparser.classic.QueryParser;
12 | import org.apache.lucene.search.IndexSearcher;
13 | import org.apache.lucene.search.Query;
14 | import org.apache.lucene.search.ScoreDoc;
15 | import org.apache.lucene.store.Directory;
16 | import org.apache.lucene.store.RAMDirectory;
17 | import org.apache.lucene.util.Version;
18 |
19 | public class TestJcseg {
20 | public static void main(String arg[]) throws Exception {
21 | Analyzer analyzer = new StandardAnalyzer();
22 |
23 | // Store the index in memory:
24 | Directory directory = new RAMDirectory();
25 | // To store an index on disk, use this instead:
26 | //Directory directory = FSDirectory.open("/tmp/testindex");
27 | IndexWriterConfig config = new IndexWriterConfig(analyzer);
28 | IndexWriter iwriter = new IndexWriter(directory, config);
29 | Document doc = new Document();
30 | String text = "This is the text to be indexed.";
31 | doc.add(new Field("fieldname", text, TextField.TYPE_STORED));
32 | iwriter.addDocument(doc);
33 | iwriter.close();
34 |
35 | // Now search the index:
36 | DirectoryReader ireader = DirectoryReader.open(directory);
37 | IndexSearcher isearcher = new IndexSearcher(ireader);
38 | // Parse a simple query that searches for "text":
39 | QueryParser parser = new QueryParser("fieldname", analyzer);
40 | Query query = parser.parse("text");
41 | ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
42 | // Iterate through the results:
43 | for (int i = 0; i < hits.length; i++) {
44 | Document hitDoc = isearcher.doc(hits[i].doc);
45 | }
46 | ireader.close();
47 | directory.close();
48 | // //lucene 5.x版本
49 | // Analyzer analyzer = new JcsegAnalyzer5X(JcsegTaskConfig.COMPLEX_MODE);
50 | // //非必须(用于修改默认配置): 获取分词任务配置实例
51 | // JcsegAnalyzer5X jcseg = (JcsegAnalyzer5X) analyzer;
52 | // JcsegTaskConfig config = jcseg.getTaskConfig();
53 | // //追加同义词, 需要在 jcseg.properties中配置jcseg.loadsyn=1
54 | // config.setAppendCJKSyn(true);
55 | // //追加拼音, 需要在jcseg.properties中配置jcseg.loadpinyin=1
56 | // config.setAppendCJKPinyin(false);
57 | // //更多配置, 请查看 org.lionsoul.jcseg.tokenizer.core.JcsegTaskConfig
58 | }
59 |
60 | }
61 |
--------------------------------------------------------------------------------
/src/com/chenxb/servlet/ArticleWithSql.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.servlet;
2 |
3 | import java.io.IOException;
4 | import java.io.PrintWriter;
5 | import java.io.StringWriter;
6 | import java.sql.SQLException;
7 |
8 | import javax.servlet.ServletException;
9 | import javax.servlet.http.HttpServlet;
10 | import javax.servlet.http.HttpServletRequest;
11 | import javax.servlet.http.HttpServletResponse;
12 |
13 | import com.chenxb.biz.ArticleBiz;
14 | import com.chenxb.dao.ArticleDao;
15 | import com.chenxb.model.ArticleItem;
16 | import com.chenxb.util.TableName;
17 | import com.google.gson.Gson;
18 | import com.google.gson.GsonBuilder;
19 |
20 | /**
21 | * 从 mysql 中根据 id 和 column 获取新闻详情
22 | * 先获取 colunm,再到对应的表里查询数据
23 | * @author tomchen
24 | *
25 | */
26 | public class ArticleWithSql extends HttpServlet {
27 |
28 | private ArticleDao dao;
29 |
30 | public ArticleWithSql() {
31 | super();
32 | try {
33 | dao = new ArticleDao();
34 | } catch (Exception e) {
35 | e.printStackTrace();
36 | }
37 | }
38 |
39 | @Override
40 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
41 | doPost(req, resp);
42 | }
43 |
44 | @Override
45 | protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
46 | // 目的是为了控制浏览器的行为,即控制浏览器用UTF-8进行解码;
47 | resp.setContentType("text/html;charset=UTF-8");
48 |
49 | // 用于response.getWriter()输出的字符流的乱码问题,response.getOutputStream()是不需要此种解决方案的
50 | // 因为这句话的意思是为了将response对象中的数据以UTF-8解码后发向浏览器;
51 | resp.setCharacterEncoding("UTF-8");
52 |
53 | PrintWriter out = resp.getWriter();
54 | if (req.getParameter("id") == null || req.getParameter("column") == null) {
55 | out.write("usage: http://localhost:8080/test/articleWithSql?column=1&id=7000");
56 | return;
57 | }
58 |
59 | try {
60 | if (dao == null || dao.getConnection().isClosed()) {
61 | out.write("mysql is null or closed\n");
62 | return;
63 | }
64 | } catch (SQLException e) {
65 | StringWriter errors = new StringWriter();
66 | e.printStackTrace(new PrintWriter(errors));
67 | out.write("mysql is null or closed\n");
68 | out.print(errors.toString());
69 | }
70 |
71 | // 获取哪个栏目的表
72 | int type = Integer.parseInt(req.getParameter("column"));
73 |
74 | int id = Integer.parseInt(req.getParameter("id"));
75 |
76 | ArticleItem article;
77 | try {
78 | article = dao.getArticleByTypeId(type, id);
79 | Gson gson = new GsonBuilder().disableHtmlEscaping().create();
80 | String result = gson.toJson(article);
81 | out.write(result);
82 | } catch (Exception e) {
83 | StringWriter errors = new StringWriter();
84 | e.printStackTrace(new PrintWriter(errors));
85 | out.write("ArticleDao getArticleByTypeId error\n");
86 | out.print(errors.toString());
87 | } finally {
88 | out.flush();
89 | out.close();
90 | }
91 | }
92 |
93 | }
94 |
--------------------------------------------------------------------------------
/src/com/chenxb/servlet/ColumnArticlesWithSql.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.servlet;
2 |
3 | import java.io.IOException;
4 | import java.io.PrintWriter;
5 | import java.io.StringWriter;
6 | import java.util.List;
7 |
8 | import javax.servlet.ServletException;
9 | import javax.servlet.http.HttpServlet;
10 | import javax.servlet.http.HttpServletRequest;
11 | import javax.servlet.http.HttpServletResponse;
12 |
13 | import com.chenxb.dao.ColumnDao;
14 | import com.chenxb.model.SimpleArticleItem;
15 | import com.google.gson.Gson;
16 | import com.google.gson.GsonBuilder;
17 |
18 | /**
19 | * 查找某个栏目的多条新闻
20 | * 根据栏目、偏移值
21 | * 分页返回
22 | * @author tomchen
23 | *
24 | */
25 | public class ColumnArticlesWithSql extends HttpServlet {
26 |
27 | private static final long serialVersionUID = 1L;
28 |
29 | private ColumnDao colDao;
30 |
31 | public ColumnArticlesWithSql() {
32 | super();
33 | try {
34 | colDao = new ColumnDao();
35 | } catch (Exception e) {
36 | e.printStackTrace();
37 | }
38 | }
39 |
40 | @Override
41 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
42 | doPost(req, resp);
43 | }
44 |
45 | @Override
46 | protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
47 | // 目的是为了控制浏览器的行为,即控制浏览器用UTF-8进行解码;
48 | resp.setContentType("text/html;charset=UTF-8");
49 |
50 | // 用于response.getWriter()输出的字符流的乱码问题,response.getOutputStream()是不需要此种解决方案的
51 | // 因为这句话的意思是为了将response对象中的数据以UTF-8解码后发向浏览器;
52 | resp.setCharacterEncoding("UTF-8");
53 | PrintWriter out = resp.getWriter();
54 |
55 | if (req.getParameter("column") == null || req.getParameter("offset") == null) {
56 | out.write("usage: http://localhost:8080/test/columnWithSql?column=1&offset=7916");
57 | return;
58 | }
59 |
60 | int type = Integer.parseInt(req.getParameter("column"));
61 | int offset = Integer.parseInt(req.getParameter("offset"));
62 |
63 | //用-1表示首页的数据
64 | //下面几页就是根据偏移量
65 | try {
66 | List articles = colDao.getTopSimpleArticles(type,offset);
67 | Gson gson = new GsonBuilder().disableHtmlEscaping().create();
68 | String result = gson.toJson(articles);
69 | out.write(result);
70 | } catch (Exception e) {
71 | StringWriter errors = new StringWriter();
72 | e.printStackTrace(new PrintWriter(errors));
73 | out.write("ColumnDao getTopSimpleArticles error\n");
74 | out.print(errors.toString());
75 | } finally {
76 | out.flush();
77 | out.close();
78 | }
79 | }
80 |
81 | }
82 |
--------------------------------------------------------------------------------
/src/com/chenxb/servlet/MoreArticlesWithSql.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.servlet;
2 |
3 | import java.io.IOException;
4 | import java.io.PrintWriter;
5 | import java.io.StringWriter;
6 | import java.util.List;
7 |
8 | import javax.servlet.ServletException;
9 | import javax.servlet.http.HttpServlet;
10 | import javax.servlet.http.HttpServletRequest;
11 | import javax.servlet.http.HttpServletResponse;
12 |
13 | import com.chenxb.dao.ColumnDao;
14 | import com.chenxb.model.SimpleArticleItem;
15 | import com.google.gson.Gson;
16 | import com.google.gson.GsonBuilder;
17 |
18 | /**
19 | * 某栏目,查找大于某个id的新数据
20 | * 分页返回
21 | * @author tomchen
22 | *
23 | */
24 | public class MoreArticlesWithSql extends HttpServlet {
25 |
26 | private static final long serialVersionUID = 1L;
27 |
28 | private ColumnDao colDao;
29 |
30 | public MoreArticlesWithSql() {
31 | super();
32 | try {
33 | colDao = new ColumnDao();
34 | } catch (Exception e) {
35 | e.printStackTrace();
36 | }
37 | }
38 |
39 | @Override
40 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
41 | doPost(req, resp);
42 | }
43 |
44 | @Override
45 | protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
46 | // 目的是为了控制浏览器的行为,即控制浏览器用UTF-8进行解码;
47 | resp.setContentType("text/html;charset=UTF-8");
48 |
49 | // 用于response.getWriter()输出的字符流的乱码问题,response.getOutputStream()是不需要此种解决方案的
50 | // 因为这句话的意思是为了将response对象中的数据以UTF-8解码后发向浏览器;
51 | resp.setCharacterEncoding("UTF-8");
52 | PrintWriter out = resp.getWriter();
53 |
54 | if (req.getParameter("column") == null || req.getParameter("morethan") == null) {
55 | out.write("usage: http://localhost:8080/test/columnWithSql?column=1&morethan=7916");
56 | return;
57 | }
58 |
59 | int type = Integer.parseInt(req.getParameter("column"));
60 | int morethan = Integer.parseInt(req.getParameter("morethan"));
61 |
62 | // 下面几页就是根据偏移量
63 | try {
64 | List articles = colDao.moreArticles(type, morethan);
65 | Gson gson = new GsonBuilder().disableHtmlEscaping().create();
66 | String result = gson.toJson(articles);
67 | out.write(result);
68 | } catch (Exception e) {
69 | StringWriter errors = new StringWriter();
70 | e.printStackTrace(new PrintWriter(errors));
71 | out.write("ColumnDao getTopSimpleArticles error\n");
72 | out.print(errors.toString());
73 | } finally {
74 | out.flush();
75 | out.close();
76 | }
77 | }
78 |
79 | }
80 |
--------------------------------------------------------------------------------
/src/com/chenxb/servlet/ParseArticleById.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.servlet;
2 |
3 | import java.io.IOException;
4 | import java.io.PrintWriter;
5 |
6 | import javax.servlet.ServletException;
7 | import javax.servlet.http.HttpServlet;
8 | import javax.servlet.http.HttpServletRequest;
9 | import javax.servlet.http.HttpServletResponse;
10 |
11 | import com.chenxb.biz.ArticleBiz;
12 | import com.chenxb.model.ArticleItem;
13 | import com.google.gson.Gson;
14 | import com.google.gson.GsonBuilder;
15 |
16 | /**
17 | * 使用示例 http://localhost:8080/seenews/parseArticle?id=7938
18 | * 根据给定的id从电院http://see.xidian.edu.cn/html/news/7938.html爬取数据 返回 json 字符串
19 | *
20 | * @author tomchen
21 | *
22 | */
23 | public class ParseArticleById extends HttpServlet {
24 |
25 | private static final long serialVersionUID = 1L;
26 |
27 | @Override
28 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
29 | doPost(req, resp);
30 | }
31 |
32 | @Override
33 | protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
34 | // 目的是为了控制浏览器的行为,即控制浏览器用UTF-8进行解码;
35 | resp.setContentType("text/html;charset=UTF-8");
36 |
37 | // 用于response.getWriter()输出的字符流的乱码问题,response.getOutputStream()是不需要此种解决方案的
38 | // 因为这句话的意思是为了将response对象中的数据以UTF-8解码后发向浏览器;
39 | resp.setCharacterEncoding("UTF-8");
40 |
41 | int id = Integer.parseInt(req.getParameter("id"));
42 |
43 | PrintWriter out = resp.getWriter();
44 | try {
45 | ArticleItem article = ArticleBiz.parseNewsItem(id);
46 | Gson gson = new GsonBuilder().disableHtmlEscaping().create();
47 | String result = gson.toJson(article);
48 | out.write(result);
49 | } catch (Exception e) {
50 | e.printStackTrace();
51 | } finally {
52 | out.flush();
53 | out.close();
54 | }
55 | }
56 |
57 | }
58 |
--------------------------------------------------------------------------------
/src/com/chenxb/servlet/RotationWithSql.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.servlet;
2 |
3 | import java.io.IOException;
4 | import java.io.PrintWriter;
5 | import java.io.StringWriter;
6 | import java.util.List;
7 |
8 | import javax.servlet.ServletException;
9 | import javax.servlet.http.HttpServlet;
10 | import javax.servlet.http.HttpServletRequest;
11 | import javax.servlet.http.HttpServletResponse;
12 |
13 | import com.chenxb.dao.RotationImageDao;
14 | import com.chenxb.model.RotationItem;
15 | import com.google.gson.Gson;
16 | import com.google.gson.GsonBuilder;
17 |
18 | public class RotationWithSql extends HttpServlet {
19 |
20 | private RotationImageDao dao;
21 |
22 | public RotationWithSql() {
23 | super();
24 | try {
25 | dao = new RotationImageDao();
26 | } catch (Exception e) {
27 | e.printStackTrace();
28 | }
29 | }
30 |
31 | @Override
32 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
33 | doPost(req, resp);
34 | }
35 |
36 | @Override
37 | protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
38 | // 目的是为了控制浏览器的行为,即控制浏览器用UTF-8进行解码;
39 | resp.setContentType("text/html;charset=UTF-8");
40 |
41 | // 用于response.getWriter()输出的字符流的乱码问题,response.getOutputStream()是不需要此种解决方案的
42 | // 因为这句话的意思是为了将response对象中的数据以UTF-8解码后发向浏览器;
43 | resp.setCharacterEncoding("UTF-8");
44 |
45 | PrintWriter out = resp.getWriter();
46 |
47 | try {
48 | List rotations = dao.getTopRotations();
49 | Gson gson = new GsonBuilder().disableHtmlEscaping().create();
50 | String result = gson.toJson(rotations);
51 | out.write(result);
52 |
53 | } catch (Exception e) {
54 | StringWriter errors = new StringWriter();
55 | e.printStackTrace(new PrintWriter(errors));
56 | out.print(errors.toString());
57 | e.printStackTrace();
58 | }
59 |
60 | }
61 |
62 | }
63 |
--------------------------------------------------------------------------------
/src/com/chenxb/servlet/SearchArticle.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.servlet;
2 |
3 | import java.io.IOException;
4 | import java.io.PrintWriter;
5 | import java.io.StringWriter;
6 |
7 | import javax.servlet.ServletException;
8 | import javax.servlet.http.HttpServlet;
9 | import javax.servlet.http.HttpServletRequest;
10 | import javax.servlet.http.HttpServletResponse;
11 |
12 | import com.chenxb.biz.ArticleBiz;
13 | import com.chenxb.dao.ArticleDao;
14 | import com.chenxb.model.ArticleItem;
15 | import com.chenxb.util.TableName;
16 | import com.google.gson.Gson;
17 | import com.google.gson.GsonBuilder;
18 |
19 | /**
20 | * 搜索新闻
21 | * 根据关键词全文搜索
22 | * @author tomchen
23 | *
24 | */
25 | public class SearchArticle extends HttpServlet {
26 |
27 | private ArticleDao dao;
28 |
29 | public SearchArticle() {
30 | super();
31 | try {
32 | dao = new ArticleDao();
33 | } catch (Exception e) {
34 | e.printStackTrace();
35 | }
36 | }
37 |
38 | @Override
39 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
40 | doPost(req, resp);
41 | }
42 |
43 | @Override
44 | protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
45 | // 目的是为了控制浏览器的行为,即控制浏览器用UTF-8进行解码;
46 | resp.setContentType("text/html;charset=UTF-8");
47 |
48 | // 用于response.getWriter()输出的字符流的乱码问题,response.getOutputStream()是不需要此种解决方案的
49 | // 因为这句话的意思是为了将response对象中的数据以UTF-8解码后发向浏览器;
50 | resp.setCharacterEncoding("UTF-8");
51 |
52 | PrintWriter out = resp.getWriter();
53 | if ( req.getParameter("keyword") == null) {
54 | out.write("usage:http://localhost:8080/test/searchArticle?keyword=电院");
55 | return;
56 | }
57 |
58 | // 获取哪个栏目的表
59 | String word = req.getParameter("keyword");
60 |
61 |
62 | try {
63 | out.write(word);
64 | } catch (Exception e) {
65 | e.printStackTrace();
66 | }
67 | }
68 |
69 | }
70 |
--------------------------------------------------------------------------------
/src/com/chenxb/test/JobScheduler.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.test;
2 |
3 | import org.quartz.JobBuilder;
4 | import org.quartz.JobDetail;
5 | import org.quartz.Scheduler;
6 | import org.quartz.SchedulerFactory;
7 | import org.quartz.SimpleScheduleBuilder;
8 | import org.quartz.Trigger;
9 | import org.quartz.TriggerBuilder;
10 | import org.quartz.impl.StdSchedulerFactory;
11 |
12 | public class JobScheduler {
13 | public static void main(String[] args) throws Exception {
14 | JobDetail job = JobBuilder.newJob(TestJob.class).withIdentity("ttt").build();
15 | Trigger trigger = TriggerBuilder.newTrigger()
16 | .withSchedule(SimpleScheduleBuilder.simpleSchedule().withIntervalInSeconds(30).repeatForever()).build();
17 |
18 | SchedulerFactory factory = new StdSchedulerFactory();
19 |
20 | Scheduler scheduler = factory.getScheduler();
21 | scheduler.start();
22 | scheduler.scheduleJob(job, trigger);
23 | }
24 |
25 | }
26 |
27 | // class MyTrigger extend Trig
28 |
--------------------------------------------------------------------------------
/src/com/chenxb/test/TestHanlp.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.test;
2 |
3 | import java.sql.SQLException;
4 | import java.util.Arrays;
5 | import java.util.List;
6 |
7 | import org.apache.commons.lang3.StringUtils;
8 | import org.jsoup.Jsoup;
9 | import org.jsoup.nodes.Document;
10 |
11 | import com.chenxb.dao.ArticleDao;
12 | import com.chenxb.dao.SummaryDao;
13 | import com.hankcs.hanlp.HanLP;
14 |
15 | /**
16 | * 中文分词
17 | * 提取文章摘要
18 | * @author tomchen
19 | *
20 | */
21 | public class TestHanlp {
22 | public static void main(String[] args) throws SQLException, Exception {
23 |
24 | for (int i = 7948; i >= 7896; i--) {
25 | System.out.println(new SummaryDao().updateSummary(0, i));
26 | }
27 |
28 | }
29 |
30 | public static String get(int id) throws SQLException, Exception {
31 | String document = new ArticleDao().getArticleByTypeId(0, id).getBody();
32 | Document doc = Jsoup.parse(document);
33 | List sentenceList = HanLP.extractSummary(doc.text(), 2);
34 |
35 | if (!sentenceList.isEmpty()) {
36 | String summary = sentenceList.toString();
37 | String temp = summary.substring(1, summary.length() - 1);
38 | temp = temp.replaceAll("&" + "nbsp;", "");
39 | // unicode 空格是160
40 | temp = temp.replaceAll(String.valueOf((char) 160), "");
41 | System.out.println("util===" + temp.trim());
42 | // 将多个空格替换为1个空格
43 | return temp.trim().replaceAll("\\s+", " ") + "。";
44 | } else {
45 | return "";
46 | }
47 | }
48 |
49 | }
50 |
--------------------------------------------------------------------------------
/src/com/chenxb/test/TestJob.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.test;
2 |
3 | import java.io.IOException;
4 |
5 | import org.jsoup.Jsoup;
6 | import org.jsoup.nodes.Document;
7 | import org.jsoup.nodes.Element;
8 | import org.jsoup.select.Elements;
9 | import org.quartz.Job;
10 | import org.quartz.JobExecutionContext;
11 | import org.quartz.JobExecutionException;
12 |
13 | import com.chenxb.biz.ArticleBiz;
14 | import com.chenxb.util.Constant;
15 | import com.chenxb.util.UrlTool;
16 |
17 | public class TestJob implements Job {
18 |
19 | private int articleId;
20 |
21 | public TestJob(int articleId) {
22 | this.articleId = articleId;
23 | }
24 |
25 | @Override
26 | public void execute(JobExecutionContext arg0) throws JobExecutionException {
27 |
28 | System.out.println("========== articleId " + articleId + " ==========");
29 | System.out.println("TestJob running");
30 | }
31 |
32 | public static void main(String[] args) throws IOException {
33 | // 根据后缀的数字,拼接新闻 url
34 | Document doc = Jsoup.connect(Constant.SEE_URL).timeout(10000).get();
35 | // 去掉jsoup对html字符串加的"\n",方便json字符串返回
36 | doc.outputSettings().prettyPrint(false);
37 |
38 | Elements eles = doc.getElementsByClass("rotaion_list");
39 | System.out.println(eles.get(0));
40 | //
62 |
63 | // Element contentEle = articleEle.getElementById("article_content");
64 | // // 处理相对路径 url,不和上面的 image url 冲突
65 | // Elements hrefs = contentEle.getElementsByTag("a");
66 | // for (int i = 0; i < hrefs.size(); i++) {
67 | // String origin = hrefs.get(i).attr("href");
68 | // System.out.println("origin: " + origin.length());
69 | // String newUrl = UrlTool.dealAttachmentUrl(id, origin);
70 | // System.out.println("newUrl: " + origin);
71 | }
72 |
73 | }
74 |
--------------------------------------------------------------------------------
/src/com/chenxb/test/TestTimeAgo.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.test;
2 |
3 | import com.chenxb.util.GetTimeAgo;
4 |
5 | public class TestTimeAgo {
6 | public static void main(String[] args) {
7 | long time = System.currentTimeMillis();
8 | System.out.println(time);
9 |
10 | System.out.println(time / 1000 < 1000000000000L);
11 |
12 | try {
13 | Thread.sleep(1000 * 280);
14 | } catch (InterruptedException e) {
15 | e.printStackTrace();
16 | }
17 |
18 | System.out.println(GetTimeAgo.getTimeAgo(time / 1000));
19 |
20 | }
21 |
22 | }
23 |
--------------------------------------------------------------------------------
/src/com/chenxb/util/ColumnType.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.util;
2 |
/**
 * Numeric ids of the news columns handled by the crawler. Only part of the
 * site's columns are covered.
 */
public class ColumnType {

    /** Latest news: the most recent articles picked from the columns below. */
    public static final int LATEST = 0;

    /** Campus notifications. */
    public static final int NOTIFIC = 1;

    /** Undergraduate (bachelor) teaching. */
    public static final int BACHELOR = 2;

    /** Postgraduate (master) news. */
    public static final int MASTER = 3;

    /** Academic exchange. */
    public static final int ACADEMIC = 5;

    /** Jobs and recruitment. */
    public static final int JOB = 8;

}
15 |
--------------------------------------------------------------------------------
/src/com/chenxb/util/Constant.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.util;
2 |
/**
 * Shared constants of the news crawler: site URLs, URL/attribute prefixes,
 * known attachment icon names and paging sizes.
 */
public class Constant {

    /** Base URL of the crawled site; prepended to site-relative resource paths. */
    public static final String SEE_URL = "http://**********";

    /** Prefix of an article page URL; the article id and ".html" are appended to it. */
    public static final String ARTICLE_BASE_URL = "http://**********";

    /** URL scheme prefixes. */
    public static final String HTTP_PREFIX = "http://";
    public static final String HTTPS_PREFIX = "https://";
    public static final String FTP_PREFIX = "ftp://";

    /** Start of the {@code src} and {@code href} attributes inside HTML text. */
    public static final String SRC_PREFIX = "src=\"";
    public static final String HREF_PREFIX = "href=\"";

    public static final String WWW_PREFIX = "www";
    public static final String WEBSITE_NAME = "电院";
    public static final String JS_PREFIX = "javascript";

    /** Prefix of mail links, e.g. {@code mailto:lzli@see.xidian.edu.cn}. */
    public static final String MAILTO_PREFIX = "mailto:";

    /** File names of the attachment icons (the resources no longer exist on the site). */
    public static final String DOC_JPG_SUFFIX = "doc.jpg";
    public static final String XLS_JPG_SUFFIX = "xls.jpg";
    public static final String RAR_JPG_SUFFIX = "rar.jpg";
    public static final String ZIP_JPG_SUFFIX = "zip.jpg";

    /** Keys of useless images such as attachment icons. */
    public static final String[] USELESS_IMAGE_URL = { "912720f605b84070e223d0dab690a114",
            "b5805b46ce8cf9c634b3820a23d64ca6", "84b7028179e09614540cea8dd0122c3c" };

    /** Host name of the Qiniu bucket that serves the images. */
    public static final String BUCKET_HOST_NAME = "http://**********";

    public static final boolean DEBUG = true;

    /** Number of news items per list page on the mobile client. */
    public static final int EACH_AMOUNT = 10;

    /** Number of rotation (carousel) images. */
    public static final int ROTATION_AMOUNT = 7;

}
44 |
--------------------------------------------------------------------------------
/src/com/chenxb/util/GetTimeAgo.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.util;
2 |
/**
 * Formats a timestamp as a short Chinese "time ago" label.
 *
 * Created by tomchen on 2/26/16.
 */
public class GetTimeAgo {

    private static final int SECOND_MILLIS = 1000;
    private static final int MINUTE_MILLIS = 60 * SECOND_MILLIS;
    private static final int HOUR_MILLIS = 60 * MINUTE_MILLIS;
    private static final int DAY_MILLIS = 24 * HOUR_MILLIS;

    /**
     * Describes how long ago the given moment was, relative to the current time.
     *
     * @param time a timestamp in seconds or milliseconds; values below 1000000000000
     *             are treated as seconds and multiplied by 1000
     * @return the label for an unknown time when the timestamp is not positive or
     *         lies in the future, otherwise a label that gets coarser as the
     *         distance grows (just now, minutes, hours, yesterday, days)
     */
    public static String getTimeAgo(long time) {
        // Timestamps this small cannot be milliseconds, so they are taken as seconds.
        long millis = time < 1000000000000L ? time * 1000 : time;

        long now = System.currentTimeMillis();
        if (millis <= 0 || millis > now) {
            return "未知时间";
        }

        final long elapsed = now - millis;

        if (elapsed < MINUTE_MILLIS) {
            return "刚刚";
        }
        if (elapsed < 2 * MINUTE_MILLIS) {
            return "1分钟前";
        }
        if (elapsed < 50 * MINUTE_MILLIS) {
            return elapsed / MINUTE_MILLIS + "分钟前";
        }
        if (elapsed < 90 * MINUTE_MILLIS) {
            return "1小时前";
        }
        if (elapsed < 24 * HOUR_MILLIS) {
            return elapsed / HOUR_MILLIS + "小时前";
        }
        if (elapsed < 48 * HOUR_MILLIS) {
            return "昨天";
        }
        return elapsed / DAY_MILLIS + "天前";
    }
}
45 |
--------------------------------------------------------------------------------
/src/com/chenxb/util/HttpTool.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.util;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.net.HttpURLConnection;
6 | import java.net.URL;
7 |
8 |
9 | public class HttpTool {
10 | /**
11 | *
12 | * @param urlStr
13 | * 网页链接
14 | * @return 网页的 html 源码
15 | * @throws Exception
16 | * @throws CommonException
17 | * @throws IOException
18 | */
19 | public static String doGet(String urlStr) throws Exception {
20 | URL url;
21 | String html = "";
22 | url = new URL(urlStr);
23 | HttpURLConnection connection = (HttpURLConnection) url.openConnection();
24 | connection.setRequestMethod("GET");
25 | connection.setConnectTimeout(5000);
26 | connection.setDoInput(true);
27 | connection.setDoOutput(true);
28 | if (connection.getResponseCode() == 200) {
29 | InputStream in = connection.getInputStream();
30 | html = StreamTool.inToStringByByte(in);
31 | in.close();
32 | } else {
33 | throw new Exception("新闻服务器返回值不为200");
34 | }
35 | return html;
36 | }
37 |
38 | }
39 |
--------------------------------------------------------------------------------
/src/com/chenxb/util/ImageTool.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.util;
2 |
3 | import java.io.PrintWriter;
4 | import java.io.StringWriter;
5 |
6 | import org.apache.commons.lang3.StringUtils;
7 |
8 | import com.qiniu.common.QiniuException;
9 | import com.qiniu.storage.BucketManager;
10 | import com.qiniu.util.Auth;
11 |
12 | public class ImageTool {
13 |
14 | // 附件下载的图标,忽略
15 | private static final String IMAGE_BASE = "/uploads/image";
16 | private static final String IMAGE_OLD__BASE = "/uploads/old";
17 | private static final String IMAGE_OLD__NEWS = "/news/Images";
18 | private static final String IMAGE_OLD__GRAPH = "/graph";
19 |
20 | /**
21 | * 图片上传到七牛,和原来的 imageUrl 不相等
22 | * 否则还是返回原来的 url
23 | * @param currentPage
24 | * @param origin
25 | * @return
26 | */
27 | public static String convertUrl(int currentPage, String origin) {
28 | // 图片资源不一定都是在 uploads 文件夹下面
29 | // 也有可能外链到其他网站的图片
30 | if (origin.startsWith(Constant.HTTP_PREFIX)) {
31 | // 以绝对路径开头,最前面是网站域名
32 | // 比如 http://see.xidian.edu.cn/uploads/image/20141120/201411**.png
33 | // http://imgtec.eetrend.com/sites/***
34 | String imageKey = StringTool.createMD5(origin);
35 | uploadByUrl(currentPage, origin, imageKey);
36 | return imageKey;
37 | } else if (origin.contains(IMAGE_BASE)) {
38 | // 相对路径,比如/uploads/image/20141120/20141120**.jpg
39 | // /Public/kindeditor/php/../../../uploads/image/20151116/20151116114927_39484.jpg
40 | // 把图片上传给七牛
41 | // if 的先后顺序,先判断是否是全路径,再判断是不是相对路径
42 | String wholeURl = Constant.SEE_URL + origin;
43 |
44 | String imageKey = StringTool.createMD5(origin);
45 |
46 | uploadByUrl(currentPage, wholeURl, imageKey);
47 | return imageKey;
48 | } else if (origin.startsWith(IMAGE_OLD__BASE)) {
49 | // 老图片路径 /uploads/old
50 | String wholeURl = Constant.SEE_URL + origin;
51 |
52 | String imageKey = StringTool.createMD5(origin);
53 |
54 | uploadByUrl(currentPage, wholeURl, imageKey);
55 | return imageKey;
56 |
57 | } else if (StringUtils.startsWithAny(origin, IMAGE_OLD__NEWS, IMAGE_OLD__GRAPH)) {
58 | // 资源已被删除 返回原地址
59 | // /news/Images/2006060515215782024.jpg
60 | // /graph/jpg.gif
61 | return Constant.SEE_URL + origin;
62 | } else {
63 | // 这部分 todo,识别其他格式的图片
64 | // 或者试图访问这个图片,但失败了,则不是完整的 url
65 | StringBuilder builder = new StringBuilder();
66 | builder.append("ImageTool.convertUrl() 无法解析图片
");
67 | builder.append("图片 src = " + origin + "
");
68 | MailTool.sendException(builder.toString(), currentPage, MailTool.IMAGE_UNUSUAL);
69 | return origin;
70 | }
71 |
72 | }
73 |
74 | /** imageKey 为输入参数
75 | *
76 | * @param currentPage
77 | * @param origin
78 | * @param imageKey
79 | * @return
80 | */
81 | public static String convertUrl(int currentPage, String origin, String imageKey) {
82 | // 图片资源不一定都是在 uploads 文件夹下面
83 | // 也有可能外链到其他网站的图片
84 | if (origin.startsWith(Constant.HTTP_PREFIX)) {
85 | // 以绝对路径开头,最前面是网站域名
86 | // 比如 http://see.xidian.edu.cn/uploads/image/20141120/201411**.png
87 | // http://imgtec.eetrend.com/sites/***
88 | uploadByUrl(currentPage, origin, imageKey);
89 | return imageKey;
90 | } else if (origin.startsWith(IMAGE_BASE)) {
91 | // 相对路径,比如/uploads/image/20141120/20141120**.jpg
92 | // /Public/kindeditor/php/../../../uploads/image/20151116/20151116114927_39484.jpg
93 | // 把图片上传给七牛
94 | // if 的先后顺序,先判断是否是全路径,再判断是不是相对路径
95 | String wholeURl = Constant.SEE_URL + origin;
96 |
97 | uploadByUrl(currentPage, wholeURl, imageKey);
98 | return imageKey;
99 | } else if (origin.startsWith(IMAGE_OLD__BASE)) {
100 | // 老图片路径 /uploads/old
101 | String wholeURl = Constant.SEE_URL + origin;
102 | uploadByUrl(currentPage, wholeURl, imageKey);
103 | return imageKey;
104 | } else {
105 | // 这部分 todo,识别其他格式的图片
106 | // 或者试图访问这个图片,但失败了,则不是完整的 url
107 | StringBuilder builder = new StringBuilder();
108 | builder.append("ImageTool.convertUrl() 无法解析图片
");
109 | builder.append("图片 url = " + origin + "
");
110 | MailTool.sendException(builder.toString(), currentPage, MailTool.IMAGE_UNUSUAL);
111 | return origin;
112 | }
113 |
114 | }
115 |
116 | /**
117 | *
118 | * @param url
119 | * 给定图片的 url
120 | * @return 将图片上传至七牛,返回七牛上图片的 url
121 | * @throws QiniuException
122 | */
123 | private static void uploadByUrl(int currentPage, String originalUrl, String key) {
124 | FetchRunnable f = new FetchRunnable(currentPage, originalUrl, key);
125 | new Thread(f).start();
126 | }
127 |
128 | }
129 |
130 | /**
131 | * 图片上传使用多线程
132 | * 有问题?上传失败如何回滚?
133 | * 失败的概率很小,暂时不考虑
134 | * 或者失败了发邮件通知
135 | * @author tomchen
136 | *
137 | */
138 | class FetchRunnable implements Runnable {
139 | private static final String ACCESS_KEY = "**-*********"; // 你的access_key
140 | private static final String SECRET_KEY = "**-*********"; // 你的secret_key
141 | private static final String BUCKET_NAME = "*****"; // 你的secret_key
142 |
143 | private int currentPage;
144 | private String url;
145 | private String key;
146 |
147 | public FetchRunnable(int currentPage, String url, String key) {
148 | this.currentPage = currentPage;
149 | this.url = url;
150 | this.key = key;
151 | }
152 |
153 | @Override
154 | public void run() {
155 | // 获取到 Access Key 和 Secret Key 之后,您可以按照如下方式进行密钥配置
156 | Auth auth = Auth.create(ACCESS_KEY, SECRET_KEY);
157 | // 获取空间管理器
158 | BucketManager bucketManager = new BucketManager(auth);
159 | try {
160 | // 要求url可公网正常访问BucketManager.fetch(url, bucketName, key);
161 | // @param url 网络上一个资源文件的URL
162 | // @param bucketName 空间名称
163 | // @param key 空间内文件的key[唯一的]
164 | bucketManager.fetch(url, BUCKET_NAME, key);
165 | } catch (QiniuException e) {
166 | // 处理已知的部分资源不存在
167 | if (StringUtils.endsWithAny(url, Constant.DOC_JPG_SUFFIX, Constant.XLS_JPG_SUFFIX, Constant.RAR_JPG_SUFFIX,
168 | Constant.ZIP_JPG_SUFFIX)) {
169 | // 已经手工上传了这几种图标
170 | return;
171 | }
172 |
173 | StringWriter errors = new StringWriter();
174 | e.printStackTrace(new PrintWriter(errors));
175 |
176 | StringBuilder builder = new StringBuilder(errors.toString());
177 | builder.append(" ImageTool.uploadByUrl(url, key)发生异常!
");
178 | builder.append(" url = " + url + "
");
179 | builder.append(" key = " + key + "
");
180 | MailTool.sendException(builder.toString(), currentPage, MailTool.ARTICLE_ITEM_BIZ);
181 |
182 | // to do 失败了邮件通知
183 | }
184 | }
185 | }
186 |
--------------------------------------------------------------------------------
/src/com/chenxb/util/JobScheduler.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.util;
2 |
3 | import java.util.LinkedList;
4 | import java.util.List;
5 | import java.util.Random;
6 |
7 | import com.chenxb.biz.ArticleBiz;
8 | import com.chenxb.biz.ColumnBiz;
9 |
10 | public class JobScheduler {
11 |
12 | public static void main(String[] args) throws Exception {
13 | int[] ids = ColumnBiz.parseColumn(2, 3);
14 | List datas = new LinkedList();
15 | for (int i = 0; i < ids.length; i++) {
16 | datas.add(ids[i]);
17 | }
18 |
19 | while (datas.size() > 0) {
20 | Random r = new Random();
21 | int id = datas.remove(0);
22 | System.out.println(ArticleBiz.parseNewsItem(id));
23 | System.out.println("deal id = " + id);
24 | Thread.sleep(100 * 1000 + r.nextInt(50 * 1000) + r.nextInt(20 * 1000));
25 |
26 | }
27 |
28 | // JobDetail job =
29 | // JobBuilder.newJob(TestJob.class).withIdentity("ttt").build();
30 | // Trigger trigger = TriggerBuilder.newTrigger()
31 | // .withSchedule(SimpleScheduleBuilder.simpleSchedule()
32 | // .withIntervalInSeconds(30).repeatForever()).build();
33 | //
34 | // SchedulerFactory factory = new StdSchedulerFactory();
35 | //
36 | // Scheduler scheduler = factory.getScheduler();
37 | // scheduler.start();
38 | // scheduler.scheduleJob(job, trigger);
39 |
40 | }
41 |
42 | }
43 |
--------------------------------------------------------------------------------
/src/com/chenxb/util/MailTool.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.util;
2 |
3 | import java.security.GeneralSecurityException;
4 | import java.util.Properties;
5 |
6 | import javax.mail.Address;
7 | import javax.mail.Message;
8 | import javax.mail.MessagingException;
9 | import javax.mail.Session;
10 | import javax.mail.Transport;
11 | import javax.mail.internet.InternetAddress;
12 | import javax.mail.internet.MimeMessage;
13 |
14 | import com.sun.mail.util.MailSSLSocketFactory;
15 |
16 | public class MailTool {
17 | /**
18 | * 将爬虫抛出异常的url、堆栈信息发送邮件
19 | *
20 | * @param content
21 | * 邮件类型
22 | * @param type
23 | * 爬虫错误代码
24 | *
25 | * @return 邮件是否发送成功
26 | */
27 |
28 | // 图片异常,不是以 /uploads 开头
29 | public static final int IMAGE_UNUSUAL = 0;
30 | public static final int ARTICLE_ITEM_BIZ = 1;
31 | public static final int HREF_UNUSUAL = 2;
32 |
33 | public static boolean sendException(String content, int currentPage, int type) {
34 | // 配置信息支持从文件读取 props.load(InputStream inStream);
35 | Properties props = new Properties();
36 |
37 | // 调试的时候需开启debug调试
38 | props.setProperty("mail.debug", "false");
39 | // 发送服务器需要身份验证
40 | props.setProperty("mail.smtp.auth", "true");
41 | // 设置邮件服务器主机名
42 | props.setProperty("mail.host", "smtp.qq.com");
43 | // 发送邮件协议名称
44 | props.setProperty("mail.transport.protocol", "smtp");
45 |
46 | MailSSLSocketFactory sf;
47 | try {
48 | sf = new MailSSLSocketFactory();
49 | sf.setTrustAllHosts(true);
50 | props.put("mail.smtp.ssl.enable", "true");
51 | props.put("mail.smtp.ssl.socketFactory", sf);
52 | } catch (GeneralSecurityException e) {
53 | e.printStackTrace();
54 | return false;
55 | }
56 |
57 | // 根据配置文件生成一个 session 对象
58 | Session session = Session.getInstance(props);
59 |
60 | // 发件人邮箱用户名、密码,连接到邮件服务器,
61 | Transport transport;
62 | try {
63 | transport = session.getTransport();
64 | transport.connect("smtp.qq.com", "905073281@qq.com", "*********");
65 |
66 | } catch (Exception e) {
67 | e.printStackTrace();
68 | return false;
69 | }
70 |
71 | // 创建邮件
72 | Message msg = new MimeMessage(session);
73 | // 邮件主题,也就是标题
74 | try {
75 | msg.setSubject("seenews 错误 type " + type);
76 | // 邮件内容,支持 html 格式
77 | StringBuilder builder = new StringBuilder(content);
78 | builder.append("新闻页面 url = " + Constant.ARTICLE_BASE_URL + currentPage + ".html" + "
");
79 | builder.append("异常发生时间 " + TimeTool.getCurrentTime() + "
");
80 | // /* 设置Content 浏览器解析编码和格式等 */
81 | msg.setContent(builder.toString(), "text/html;charset=utf-8");
82 | // 设置发件人的邮箱
83 | msg.setFrom(new InternetAddress("905073281@qq.com"));
84 |
85 | // 给收件人的地址发送上面的 Message
86 | transport.sendMessage(msg, new Address[] { new InternetAddress("studychen@foxmail.com") });
87 | transport.close();
88 | } catch (MessagingException e) {
89 | e.printStackTrace();
90 | return false;
91 | }
92 |
93 | // 无异常抛出,表示发送成功
94 | return true;
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
/src/com/chenxb/util/MysqlTool.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.util;
2 |
3 | import java.sql.Connection;
4 | import java.sql.DriverManager;
5 | import java.sql.SQLException;
6 |
7 | import com.sina.sae.util.SaeUserInfo;
8 |
9 | public class MysqlTool {
10 |
11 | // 是否使用新浪云
12 | private static boolean isCloud = false;
13 |
14 | public MysqlTool() {
15 | // JDBC驱动程序
16 | try {
17 | Class.forName("com.mysql.jdbc.Driver").newInstance();
18 | } catch (Exception e) {
19 | e.printStackTrace();
20 | }
21 | }
22 |
23 | // 为了方便分析错误,将异常全部抛出到最顶层
24 | public Connection getConnection() throws Exception {
25 |
26 | // 后面unicode和utf8设置防止中文乱码
27 | String url = "jdbc:mysql://127.0.0.1:3306/see_news?useSSL=false&useUnicode=true&characterEncoding=utf-8";
28 | String name = "root";
29 | String password = "chenxb123";
30 |
31 | if (isCloud) {
32 | String appName = SaeUserInfo.getAppName();
33 | String mysqlName = "app_" + appName;
34 | url = "jdbc:mysql://w.rdc.sae.sina.com.cn:3307/" + mysqlName + "?autoReconnect=true";
35 | name = SaeUserInfo.getAccessKey();
36 | password = SaeUserInfo.getSecretKey();
37 | }
38 | Connection con = DriverManager.getConnection(url, name, password);
39 |
40 | return con;
41 |
42 | }
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/src/com/chenxb/util/StreamTool.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.util;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.ByteArrayOutputStream;
5 | import java.io.InputStream;
6 | import java.io.InputStreamReader;
7 |
/**
 * Helpers that read an {@link InputStream} completely into a string.
 * Neither method closes the stream; that stays the caller's job.
 */
public class StreamTool {

    /**
     * Reads the whole stream and decodes it as UTF-8.
     *
     * <p>All bytes are collected before decoding. Decoding each 1024-byte chunk on
     * its own would corrupt any multi-byte character that straddles a chunk
     * boundary.
     *
     * @param in
     *            the stream to read
     * @return the decoded text, empty if the stream has no data
     * @throws Exception
     *             if reading fails
     */
    public static String inToStringByByte(InputStream in) throws Exception {
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024];
        int len;
        while ((len = in.read(buffer)) != -1) {
            collected.write(buffer, 0, len);
        }
        return collected.toString("UTF-8");
    }

    /**
     * Reads the stream line by line as UTF-8 text and concatenates the lines
     * without their line terminators. Each line is also echoed to stdout.
     *
     * @param in
     *            the stream to read
     * @return the concatenated lines
     * @throws Exception
     *             if reading fails
     */
    public static String inToStringByReader(InputStream in) throws Exception {
        // Explicit charset, consistent with inToStringByByte, instead of the platform default.
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        StringBuilder content = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            System.out.println(line);
            content.append(line);
        }
        return content.toString();
    }

}
52 |
--------------------------------------------------------------------------------
/src/com/chenxb/util/StringTool.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.util;
2 |
3 | import java.math.BigInteger;
4 | import java.security.MessageDigest;
5 | import java.security.NoSuchAlgorithmException;
6 | import java.util.UUID;
7 |
/**
 * String helpers for generating identifiers.
 */
public class StringTool {

    /**
     * Creates a random identifier.
     *
     * @return a random UUID without the dashes (32 hexadecimal characters)
     */
    public static String createUUID() {
        String s = UUID.randomUUID().toString();
        return s.replace("-", "");
    }

    /**
     * Computes the MD5 digest of the UTF-8 bytes of the text.
     *
     * <p>The result is always 32 lower-case hexadecimal characters. Rendering the
     * digest with {@code BigInteger.toString(16)} alone drops leading zeros, which
     * gives shorter, non-standard values for about one input in sixteen.
     *
     * @param plaintext text to hash, not {@code null}
     * @return the digest as 32 hexadecimal characters
     * @throws IllegalStateException if the runtime offers no MD5 implementation
     */
    public static String createMD5(String plaintext) {
        MessageDigest m;
        try {
            m = MessageDigest.getInstance("MD5");
        } catch (NoSuchAlgorithmException e) {
            throw new IllegalStateException("MD5 digest is not available", e);
        }
        // Fixed charset so the same text hashes identically on every platform.
        byte[] digest = m.digest(plaintext.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        // %032x left-pads with zeros to the full digest width.
        return String.format("%032x", new BigInteger(1, digest));
    }
}
29 |
--------------------------------------------------------------------------------
/src/com/chenxb/util/TableName.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.util;
2 |
3 | public class TableName {
4 | public static final String LATEST = "latest";// 最新消息
5 | public static final String NOTIFIC = "notific";// 校园通知
6 | public static final String BACHELOR = "bachelor";// 本科教学 学士
7 | public static final String MASTER = "master";// 研究生 硕士
8 | public static final String ACADEMIC = "academic";// 学术交流
9 | public static final String JOB = "job";// 就业招聘
10 |
11 | /**
12 | * 这儿的设计能不能更优雅
13 | * 0是最新消息表,1是校园通知表
14 | * @param type 获取对应的表名称
15 | * @return
16 | */
17 | public static String getTableByType(int type) {
18 | switch (type) {
19 | case ColumnType.LATEST:
20 | return LATEST;
21 | case ColumnType.NOTIFIC:
22 | return NOTIFIC;
23 | case ColumnType.BACHELOR:
24 | return BACHELOR;
25 | case ColumnType.MASTER:
26 | return MASTER;
27 | case ColumnType.ACADEMIC:
28 | return ACADEMIC;
29 | case ColumnType.JOB:
30 | return JOB;
31 | default:
32 | return LATEST;
33 | }
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/com/chenxb/util/TimeTool.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.util;
2 |
3 | import java.text.SimpleDateFormat;
4 | import java.util.Date;
5 | import java.util.Random;
6 |
/**
 * Time helpers for the crawler: a timestamp formatter and a politeness delay.
 */
public class TimeTool {
    private static final String DATE_PATTERN = "yyyy-MM-dd HH:mm:ss E";

    private static final Random random = new Random();
    private static final int WAIT_TIME = 60 * 1000; // milliseconds

    /**
     * Formats the current date and time.
     *
     * <p>A formatter is created per call because {@link SimpleDateFormat} is not
     * thread-safe and this method is reached from the image upload threads through
     * the failure mails.
     *
     * @return the current time, formatted as {@code yyyy-MM-dd HH:mm:ss E}
     */
    public static String getCurrentTime() {
        return new SimpleDateFormat(DATE_PATTERN).format(new Date());
    }

    /**
     * Sleeps for a random time between one and two minutes so the crawled site is
     * not put under too much load. If the thread is interrupted the method returns
     * early and restores the interrupt flag for the caller.
     */
    public static void sleepSomeTime() {
        try {
            Thread.sleep(random.nextInt(WAIT_TIME) + WAIT_TIME);
        } catch (InterruptedException e) {
            e.printStackTrace();
            Thread.currentThread().interrupt();
        }
    }

}
33 |
--------------------------------------------------------------------------------
/src/com/chenxb/util/UrlTool.java:
--------------------------------------------------------------------------------
1 | package com.chenxb.util;
2 |
3 | import java.util.regex.Pattern;
4 |
5 | import org.apache.commons.lang3.StringUtils;
6 |
7 | public class UrlTool {
8 | // LATEST,//最新消息
9 | // NOTIFIC, //校园通知
10 | // BACHELOR, //本科教学 学士
11 | // MASTER, //研究生 硕士
12 | // RESEARCH, //科研
13 | // ACADEMIC //学术交流
14 |
15 | private static final String LATEST_URL = "http://see.xidian.edu.cn/index.php/index/more";
16 | // 格式为http://see.xidian.edu.cn/html/category/5/2.html
17 | private static final String NOTIFIC_URL = "http://see.xidian.edu.cn/html/category/";
18 |
19 | public static final Pattern VALID_EMAIL_REGEX = Pattern.compile("^[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,6}$",
20 | Pattern.CASE_INSENSITIVE);
21 |
22 | /**
23 | *
24 | * @param type
25 | * @param currentPage
26 | * 不是无限大,有一定范围
27 | * @return
28 | */
29 | public static String generateUrl(int type, int currentPage) {
30 | currentPage = currentPage > 0 ? currentPage : 1;
31 | switch (type) {
32 | case ColumnType.LATEST:
33 | return LATEST_URL;
34 | case ColumnType.NOTIFIC:
35 | case ColumnType.BACHELOR:
36 | case ColumnType.MASTER:
37 | case ColumnType.ACADEMIC:
38 | case ColumnType.JOB:
39 | return NOTIFIC_URL + type + "/" + currentPage + ".html";
40 | default:
41 | return LATEST_URL;
42 | }
43 | }
44 |
45 | /**
46 | * 这儿的tag 不和 冲突
47 | *
48 | * 处理文章 body 里的 url
49 | * /uploads/file/20150706/20150706094631_73253.doc
50 | *
51 | * 相对路径全部转化为绝对路径
52 | * @param originTrim
53 | * @return
54 | */
55 | public static String dealAttachmentUrl(int currentPage, String origin) {
56 | // 去掉首尾的空格
57 | String originTrim = origin.trim();
58 | // 附件不一定都是在 uploads 文件夹下面
59 | // 也有可能外链到其他网站的图片/uploads/image/20141120/20141120**.jpg
60 | // /news/Upload/2006051811250740787.xls
61 | if (StringUtils.startsWithAny(originTrim, "/uploads", "/news/Upload","/news/Images")) {
62 | // 相对路径,比如
63 | return Constant.SEE_URL + originTrim;
64 | } else if (StringUtils.startsWithAny(originTrim, Constant.HTTP_PREFIX, Constant.HTTPS_PREFIX,
65 | Constant.FTP_PREFIX, Constant.JS_PREFIX)) {
66 | // http https ftp 开头
67 | // 先后顺序,先判断是不是 http 开头,再判断是不是 www 开头
68 | if (Constant.DEBUG) {
69 | System.out.println("in dealAttachmentUrl 全路径");
70 | }
71 | return originTrim;
72 | } else if (originTrim.length() == 0 || originTrim.equals("")) {
73 | // 无效标签,获得的href为""
74 | return originTrim;
75 | } else if (originTrim.startsWith(Constant.MAILTO_PREFIX)) {
76 | return originTrim;
77 | } else if (VALID_EMAIL_REGEX.matcher(originTrim).find()) {
78 | // 只是邮件名,加上 mailto
79 | // 注意前后顺序
80 | return Constant.MAILTO_PREFIX + originTrim;
81 | } else if (originTrim.startsWith(Constant.WWW_PREFIX)) {
82 | // www开头的,加上 http://
83 | return Constant.HTTP_PREFIX + originTrim;
84 | } else if (originTrim.equals(Constant.WEBSITE_NAME)) {
85 | return Constant.SEE_URL;
86 | } else {
87 | // 链接未在考虑范围内,发邮件通知
88 | StringBuilder builder = new StringBuilder();
89 | builder.append("UrlTool.dealAttachmentUrl() 无法解析url
");
90 | builder.append("异常 href = " + originTrim + "
");
91 | MailTool.sendException(builder.toString(), currentPage, MailTool.HREF_UNUSUAL);
92 | return originTrim;
93 | }
94 | }
95 |
96 | }
97 |
--------------------------------------------------------------------------------
/src/hanlp.properties:
--------------------------------------------------------------------------------
1 | #本配置文件中的路径的根目录,根目录+其他路径=绝对路径
2 | #Windows用户请注意,路径分隔符统一使用/
3 | root=./
4 | #核心词典路径
5 | CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt
6 | #2元语法词典路径
7 | BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt
8 | #停用词词典路径
9 | CoreStopWordDictionaryPath=data/dictionary/stopwords.txt
10 | #同义词词典路径
11 | CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt
12 | #人名词典路径
13 | PersonDictionaryPath=data/dictionary/person/nr.txt
14 | #人名词典转移矩阵路径
15 | PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt
16 | #繁简词典路径
17 | TraditionalChineseDictionaryPath=data/dictionary/tc/TraditionalChinese.txt
18 | #自定义词典路径,用;隔开多个自定义词典,空格开头表示在同一个目录,使用“文件名 词性”形式则表示这个词典的词性默认是该词性。优先级递减。
19 | #另外data/dictionary/custom/CustomDictionary.txt是个高质量的词库,请不要删除
20 | CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 现代汉语补充词库.txt; 全国地名大全.txt ns; 人名词典.txt; 机构名词典.txt; 上海地名.txt ns;data/dictionary/person/nrf.txt nrf
21 | #CRF分词模型路径
22 | CRFSegmentModelPath=data/model/segment/CRFSegmentModel.txt
23 | #HMM分词模型
24 | HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin
25 | #分词结果是否展示词性
26 | ShowTermNature=true
--------------------------------------------------------------------------------