├── .gitignore ├── LICENSE ├── README.md ├── WebRoot ├── META-INF │ └── MANIFEST.MF ├── WEB-INF │ ├── classes │ │ └── hanlp.properties │ ├── lib │ │ ├── hanlp.properties │ │ └── lexicon │ │ │ ├── lex-admin.lex │ │ │ ├── lex-autoload.todo │ │ │ ├── lex-cemixed.lex │ │ │ ├── lex-chars.lex │ │ │ ├── lex-cn-mz.lex │ │ │ ├── lex-cn-place.lex │ │ │ ├── lex-company.lex │ │ │ ├── lex-dname-1.lex │ │ │ ├── lex-dname-2.lex │ │ │ ├── lex-ecmixed.lex │ │ │ ├── lex-en-pun.lex │ │ │ ├── lex-en.lex │ │ │ ├── lex-festival.lex │ │ │ ├── lex-fname.lex │ │ │ ├── lex-food.lex │ │ │ ├── lex-lang.lex │ │ │ ├── lex-ln-adorn.lex │ │ │ ├── lex-lname.lex │ │ │ ├── lex-main.lex │ │ │ ├── lex-nation.lex │ │ │ ├── lex-net.lex │ │ │ ├── lex-org.lex │ │ │ ├── lex-sname.lex │ │ │ ├── lex-stopword.lex │ │ │ ├── lex-touris.lex │ │ │ └── lex-units.lex │ └── web.xml ├── css │ └── detail.css ├── index.jsp └── jsp │ └── detail.jsp └── src ├── com └── chenxb │ ├── biz │ ├── ArticleBiz.java │ ├── ColumnBiz.java │ ├── RotationImageBiz.java │ └── UploadRandomImage.java │ ├── common │ └── StreamTool.java │ ├── dao │ ├── ArticleDao.java │ ├── ColumnDao.java │ ├── RotationImageDao.java │ ├── SearchDao.java │ └── SummaryDao.java │ ├── jpush │ └── TestJpush.java │ ├── model │ ├── ArticleItem.java │ ├── RotationItem.java │ └── SimpleArticleItem.java │ ├── news │ ├── HelloLucene.java │ ├── LoadRotation.java │ ├── ReUploadImage.java │ ├── ReloadAcademic.java │ ├── ReloadAll.java │ ├── ReloadBachelor.java │ ├── ReloadJob.java │ ├── ReloadLatest.java │ ├── ReloadMaster.java │ ├── ReloadNotific.java │ ├── Test.java │ ├── Test4.java │ └── TestJcseg.java │ ├── servlet │ ├── ArticleWithSql.java │ ├── ColumnArticlesWithSql.java │ ├── MoreArticlesWithSql.java │ ├── ParseArticleById.java │ ├── RotationWithSql.java │ └── SearchArticle.java │ ├── test │ ├── JobScheduler.java │ ├── TestHanlp.java │ ├── TestJob.java │ └── TestTimeAgo.java │ └── util │ ├── ColumnType.java │ ├── Constant.java │ ├── GetTimeAgo.java │ ├── 
HttpTool.java │ ├── ImageTool.java │ ├── JobScheduler.java │ ├── MailTool.java │ ├── MysqlTool.java │ ├── StreamTool.java │ ├── StringTool.java │ ├── TableName.java │ ├── TimeTool.java │ └── UrlTool.java └── hanlp.properties /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Mobile Tools for Java (J2ME) 4 | .mtj.tmp/ 5 | 6 | # Package Files # 7 | *.jar 8 | *.war 9 | *.ear 10 | 11 | # eclipse specific git ignore 12 | *.pydevproject 13 | .project 14 | .metadata 15 | bin/** 16 | tmp/** 17 | tmp/**/* 18 | *.tmp 19 | *.bak 20 | *.swp 21 | *~.nib 22 | local.properties 23 | .classpath 24 | .settings/ 25 | .loadpath 26 | 27 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 28 | hs_err_pid* 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SeeNewsServer 2 | 3 | Server side of personal news APP, Java Servlet + Mysql implementation 4 | 5 | The first version was hosted on Sina Cloud and later transferred to Alibaba Cloud. 6 | Pictures are stored in Qiniuyun CDN 7 | 8 | JavaServlet+Mysql 9 | ## Development records 10 | Online log monitoring system 11 | Yesterday's updated news data will be sent to your mailbox at 10 o'clock every day 12 | Modify the problem of split method returning a single element for [][""] 13 | Initialization method: From the first page to the last page, 53 records per page, crawling news
14 | If the crawl is interrupted midway, it has to be resumed from the point where it stopped. The method is:
15 | Get the smallest id from the database, then find out which page of the website that id is on
16 | Resume crawling the news records that come after that position
17 | 18 | ## Random Image API 19 | 20 | [http://7xr4g8.com1.z0.glb.clouddn.com/671](http://7xr4g8.com1.z0.glb.clouddn.com/671) Get pictures 21 | 22 | ![blog.csdn.net/never_cxb](http://7xr4g8.com1.z0.glb.clouddn.com/671) 23 | 24 | 671 is a numeric image ID. Currently, the valid image IDs range from 0 to 964. A random picture can be obtained by generating a random ID in that range. 25 | 26 | ``` 27 | Random randrom = new Random(47); 28 | String url = "http://7xr4g8.com1.z0.glb.clouddn.com/" +randrom.nextInt(964+1); 29 | ``` 30 | 31 | ## MySQL CREATE TABLE statement 32 | 33 | Adjust the column types and lengths whenever `Exception: Data too long for column` is raised
34 | Example of a long `title` value:
35 | "Intelligent Perception and Image Understanding" Key Laboratory of the Ministry of Education The 15th Academic Week and Brain-like Computing and Big Data Deep Learning Frontier Forum 36 | source Longer example: Key Laboratory of Antenna and Microwave Technology 37 | The final table field type and length are as follows: 38 | 39 | ``` 40 | CREATE TABLE `rotation` ( 41 | `id` int(11) NOT NULL, 42 | `image_urls` text, 43 | `title` varchar(100) DEFAULT NULL, 44 | `publish_date` date NOT NULL, 45 | `read_times` int(11) NOT NULL, 46 | `source` varchar(50) DEFAULT NULL, 47 | `body` longtext, 48 | UNIQUE KEY `id_UNIQUE` (`id`) 49 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 50 | 51 | ``` 52 | 53 | ## Feature list 54 | 55 | ### Crawling exception email notification 56 | Based on JavaMail, send email notifications for abnormal pictures and abnormal URLs 57 | 58 | #### Abnormal image url 59 | >Normal path of the picture `/uploads/image/20160109/20160109***.jpg` 60 | Old path `/uploads/old/201152**.jpg` 61 | 62 | 63 | | News id | Abnormal image link | Description | 64 | | ------------- |-------------| -----| 65 | | 7798 | `src="/Public/kindeditor/php/`
`../../../uploads/image/2015**.jpg"`| Contains an extra `/Public/kindeditor/php/`
Must be prefixed with `http://see.xidian.edu.cn` | 66 | | 7302 | ``| Image resource does not exist
Ignored | 67 | | 7017 | `src="http://see.xidian.edu.cn/`
`uploads/image/20141021/20**.jpg"`| Starts with an absolute URL | 68 | 69 | 70 | ### Reused file download icon 71 | | Icon | Original address | Qiniu key value | 72 | | ------------- |------------| -----| 73 | | | http://rsc.xidian.edu.cn/plus/img/addon.gif
http://see.xidian.edu.cn/uploads/old/ico/zip.jpg
http://xgc.xidian.edu.cn/images/mid.gif
http://jwc.xidian.edu.cn/images/ico/rar.jpg
http://202.117.120.88/images/download.gif
The resource does not exist, use the above gif instead| `912720f605b84070e223d0dab690a114`
`3949a245e521f81ffd18e5d01347a20d`
`2a8eac72c3697a837dd66e9e5243a089`
`bc87e43d342b380a2145ee1bb8298759`
`f7324b0d360946315ac83fb8f2703044`
The key for each link | 74 | | | http://see.xidian.edu.cn/uploads/old/file/doc.gif
http://jwc.xidian.edu.cn/images/ico/doc.jpg
http://see.xidian.edu.cn/uploads/old/ico/doc.jpg | `b5805b46ce8cf9c634b3820a23d64ca6`
`f8d0fc587a7c7295835e8094af094d2d`
`ad5d0e0cf63834756dde3dc5e9629d8` | 75 | | | http://see.xidian.edu.cn/uploads/old/file/xls.gif
http://jwc.xidian.edu.cn/images/ico/xls.jpg
http://zzb.xidian.edu.cn/new/WebEdit/sysimage/icon16/xls.gif | `84b7028179e09614540cea8dd0122c3c`
`d72210a72c0e174245a65e8755f6eaa`
`1323ef50b1457274c914413b067e9192`| 76 | 77 | 78 | #### Collected exception href: 79 | 80 | | News id | Dirty data | Description | 81 | |------------- |-------------| -----| 82 | | - | `href="Electronic Academy"`| href is Chinese | 83 | | 7837 | `/uploads/file/20151202/20151202101309_73187.zip` | The same href appears multiple times
so the host prefix is substituted in more than once, e.g.
`http://see.xidian.edu.cnhtt`
` p://see.xidian.edu.cn/**.zip`| 84 | | 7710 | `href="Cultivation project application related documents" ` | href is Chinese| 85 | | - | `href="601240943@qq.com"`| Only email address
without the preceding "mailto:" 86 | | - | `kb.xidian.cc `|Does not start with http| 87 | | 6283 | `https://mail.google.com/mail/h/**`| https starts with| 88 | | 6206 | `ftp://linux.xidian.edu.cn`| ftp starts with | 89 | 90 | 91 | Note: Regular href starts with http https 92 | 93 | 94 | ### Upload pictures to Qiniu Cloud 95 | 96 | Asynchronously upload pictures to Qiniu Cloud 97 | 98 | ### Thanks to open source, dependent class libraries 99 | - Java crawler [Jsoup](https://github.com/jhy/jsoup) 100 | - json serialization [gson](https://github.com/google/gson) 101 | - Processing arrays [commons-lang](https://github.com/apache/commons-lang) 102 | - javamail [javamail](https://java.net/projects/javamail/pages/Home) 103 | - Chinese word segmentation [jcseg](http://www.oschina.net/p/jcseg) 104 | - Full-text search engine toolkit [lucene](http://lucene.apache.org/) 105 | - Random image API [unsplash](https://unsplash.it/) 106 | -------------------------------------------------------------------------------- /WebRoot/META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | Class-Path: 3 | 4 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/classes/hanlp.properties: -------------------------------------------------------------------------------- 1 | #本配置文件中的路径的根目录,根目录+其他路径=绝对路径 2 | #Windows用户请注意,路径分隔符统一使用/ 3 | root=./ 4 | #核心词典路径 5 | CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt 6 | #2元语法词典路径 7 | BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt 8 | #停用词词典路径 9 | CoreStopWordDictionaryPath=data/dictionary/stopwords.txt 10 | #同义词词典路径 11 | CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt 12 | #人名词典路径 13 | PersonDictionaryPath=data/dictionary/person/nr.txt 14 | #人名词典转移矩阵路径 15 | PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt 16 | #繁简词典路径 17 | 
TraditionalChineseDictionaryPath=data/dictionary/tc/TraditionalChinese.txt 18 | #自定义词典路径,用;隔开多个自定义词典,空格开头表示在同一个目录,使用“文件名 词性”形式则表示这个词典的词性默认是该词性。优先级递减。 19 | #另外data/dictionary/custom/CustomDictionary.txt是个高质量的词库,请不要删除 20 | CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 现代汉语补充词库.txt; 全国地名大全.txt ns; 人名词典.txt; 机构名词典.txt; 上海地名.txt ns;data/dictionary/person/nrf.txt nrf 21 | #CRF分词模型路径 22 | CRFSegmentModelPath=data/model/segment/CRFSegmentModel.txt 23 | #HMM分词模型 24 | HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin 25 | #分词结果是否展示词性 26 | ShowTermNature=true -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/hanlp.properties: -------------------------------------------------------------------------------- 1 | #本配置文件中的路径的根目录,根目录+其他路径=绝对路径 2 | #Windows用户请注意,路径分隔符统一使用/ 3 | root=./ 4 | #核心词典路径 5 | CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt 6 | #2元语法词典路径 7 | BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt 8 | #停用词词典路径 9 | CoreStopWordDictionaryPath=data/dictionary/stopwords.txt 10 | #同义词词典路径 11 | CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt 12 | #人名词典路径 13 | PersonDictionaryPath=data/dictionary/person/nr.txt 14 | #人名词典转移矩阵路径 15 | PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt 16 | #繁简词典路径 17 | TraditionalChineseDictionaryPath=data/dictionary/tc/TraditionalChinese.txt 18 | #自定义词典路径,用;隔开多个自定义词典,空格开头表示在同一个目录,使用“文件名 词性”形式则表示这个词典的词性默认是该词性。优先级递减。 19 | #另外data/dictionary/custom/CustomDictionary.txt是个高质量的词库,请不要删除 20 | CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 现代汉语补充词库.txt; 全国地名大全.txt ns; 人名词典.txt; 机构名词典.txt; 上海地名.txt ns;data/dictionary/person/nrf.txt nrf 21 | #CRF分词模型路径 22 | CRFSegmentModelPath=data/model/segment/CRFSegmentModel.txt 23 | #HMM分词模型 24 | HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin 25 | #分词结果是否展示词性 26 | ShowTermNature=true 
-------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-admin.lex: -------------------------------------------------------------------------------- 1 | CJK_WORDS 2 | 人事部/nt/ren shi bu/人事管理部门,人事管理部 3 | 人事管理部/nt/ren shi guan li bu/人事管理部门,人事部 4 | 信息产业部/nt/xin xi chan ye bu/null 5 | 农业部/nt/nong ye bu/null 6 | 医管局/nt/yi guan ju/医疗管理部门,医疗管理部,医疗管理局 7 | 医疗管理部/nt/yi liao guan li bu/医疗管理部门,医管局 8 | 医疗管理部门/nt/yi liao guan li bu men/医管局,医疗管理部 9 | 发改委/nt/fa gai wei/null 10 | 国土资源部/nt/guo tu zi yuan bu/null 11 | 国防部/nt/guo fang bu/人民武装力量部,军事部,防卫厅 12 | 军事部/nt/jun shi bu/人民武装力量部,防卫厅 13 | 外交部/nt/wai jiao bu/国务院,政治部,对外关系部,外务省 14 | 外交部长/r/wai jiao bu zhang/null 15 | 教育部/nt/jiao yu bu/null 16 | 文化部/nt/wen hua bu/null 17 | 民政部/nt/min zheng bu/null 18 | 能源部/nt/neng yuan bu/null 19 | 财政部/nt/cai zheng bu/null 20 | 铁道部/nt/tie dao bu/null 21 | 防卫厅/nt/fang wei ting/null 22 | 防卫省/nt/fang wei sheng/null 23 | 革命委员会/nt/ge ming wei yuan hui/null 24 | 交通运输部/nt/jiao tong yun shu bu/null 25 | 对外经济贸易部/nt/dui wai jing ji mao yi bu/null 26 | 技术部/nt/ji shu bu/null 27 | 财务部/nt/cai wu bu/null 28 | 总装备部/nt/zong zhuang bei bu/null 29 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-autoload.todo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studychen/SeeNewsServer/3f2ea5ee974e0dd40d735d55fd33334f2efd23c3/WebRoot/WEB-INF/lib/lexicon/lex-autoload.todo -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-cemixed.lex: -------------------------------------------------------------------------------- 1 | CE_MIXED_WORD 2 | #中文英文混合词词库 3 | 卡拉ok/nz/ka la ok/null 4 | 漂亮mm/nz/piao lian mm/null 5 | 拳皇ova/nz/quan huang ova/拳皇动漫 6 | 奇都ktv/nz/qi du ktv/null 7 | 哆啦a梦/nz/duo la a meng/null 8 | 高3/n/gao san/高三 9 | 高2/n/gao er/高二 10 | 高1/n/gao yi/高一 11 | 
-------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-cn-mz.lex: -------------------------------------------------------------------------------- 1 | CJK_WORDS 2 | 汉族/nz/han zu/null 3 | 汉族人/nz/han zu ren/null 4 | 汉族语/nz/han zu yu/null 5 | 蒙古族/nz/meng gu zu/null 6 | 蒙古族人/nz/meng gu zu ren/null 7 | 蒙古族语/nz/meng gu zu yu/null 8 | 满族/nz/man zu/null 9 | 满族人/nz/man zu ren/null 10 | 满族语/nz/man zu yu/null 11 | 朝鲜族/nz/chao xian zu/null 12 | 朝鲜族人/nz/zhao xian zu ren/null 13 | 朝鲜族语/nz/zhao xian zu yu/null 14 | 赫哲族/nz/he zhe zu/null 15 | 赫哲族人/nz/he zhe zu ren/null 16 | 赫哲族语/nz/he zhe zu yu/null 17 | 达斡尔族/nz/da wo er zu/null 18 | 达斡尔族人/nz/da wo er zu ren/null 19 | 达斡尔族语/nz/da wo er zu yu/null 20 | 鄂温克族/nz/e wen ke zu/null 21 | 鄂温克族人/nz/e wen ke zu ren/null 22 | 鄂温克族语/nz/e wen ke zu yu/null 23 | 鄂伦春族/nz/e lun chun zu/null 24 | 鄂伦春族人/nz/e lun chun zu ren/null 25 | 鄂伦春族语/nz/e lun chun zu yu/null 26 | 回族/nz/hui zu/null 27 | 回族人/nz/hui zu ren/null 28 | 回族语/nz/hui zu yu/null 29 | 东乡族/nz/dong xiang zu/null 30 | 东乡族人/nz/dong xiang zu ren/null 31 | 东乡族语/nz/dong xiang zu yu/null 32 | 土族/nz/tu zu/null 33 | 土族人/nz/tu zu ren/null 34 | 土族语/nz/tu zu yu/null 35 | 撒拉族/nz/sa la zu/null 36 | 撒拉族人/nz/sa la zu ren/null 37 | 撒拉族语/nz/sa la zu yu/null 38 | 保安族/nz/bao an zu/null 39 | 保安族人/nz/bao an zu ren/null 40 | 保安族语/nz/bao an zu yu/null 41 | 裕固族/nz/yu gu zu/null 42 | 裕固族人/nz/yu gu zu ren/null 43 | 裕固族语/nz/yu gu zu yu/null 44 | 维吾尔族/nz/wei wu er zu/null 45 | 维吾尔族人/nz/wei wu er zu ren/null 46 | 维吾尔族语/nz/wei wu er zu yu/null 47 | 哈萨克族/nz/ha sa ke zu/null 48 | 哈萨克族人/nz/ha sa ke zu ren/null 49 | 哈萨克族语/nz/ha sa ke zu yu/null 50 | 柯尔克孜族/nz/ke er ke zi zu/null 51 | 柯尔克孜族人/nz/ke er ke zi zu ren/null 52 | 柯尔克孜族语/nz/ke er ke zi zu yu/null 53 | 锡伯族/nz/xi bo zu/null 54 | 锡伯族人/nz/xi bo zu ren/null 55 | 锡伯族语/nz/xi bo zu yu/null 56 | 塔吉克族/nz/ta ji ke zu/null 57 | 塔吉克族人/nz/ta ji ke zu ren/null 58 | 塔吉克族语/nz/ta ji ke zu yu/null 59 | 乌孜别克族/nz/wu zi bie ke 
zu/null 60 | 乌孜别克族人/nz/wu zi bie ke zu ren/null 61 | 乌孜别克族语/nz/wu zi bie ke zu yu/null 62 | 俄罗斯族/nz/e luo si zu/null 63 | 俄罗斯族人/nz/e luo si zu ren/null 64 | 俄罗斯族语/nz/e luo si zu yu/null 65 | 塔塔尔族/nz/ta ta er zu/null 66 | 塔塔尔族人/nz/ta ta er zu ren/null 67 | 塔塔尔族语/nz/ta ta er zu yu/null 68 | 藏族/nz/zang zu/null 69 | 藏族人/nz/zang zu ren/null 70 | 藏族语/nz/zang zu yu/null 71 | 门巴族/nz/men ba zu/null 72 | 门巴族人/nz/men ba zu ren/null 73 | 门巴族语/nz/men ba zu yu/null 74 | 珞巴族/nz/luo ba zu/null 75 | 珞巴族人/nz/luo ba zu ren/null 76 | 珞巴族语/nz/luo ba zu yu/null 77 | 羌族/nz/qiang zu/null 78 | 羌族人/nz/qiang zu ren/null 79 | 羌族语/nz/qiang zu yu/null 80 | 彝族/nz/yi zu/null 81 | 彝族人/nz/yi zu ren/null 82 | 彝族语/nz/yi zu yu/null 83 | 白族/nz/bai zu/null 84 | 白族人/nz/bai zu ren/null 85 | 白族语/nz/bai zu yu/null 86 | 哈尼族/nz/ha ni zu/null 87 | 哈尼族人/nz/ha ni zu ren/null 88 | 哈尼族语/nz/ha ni zu yu/null 89 | 傣族/nz/dai zu/null 90 | 傣族人/nz/dai zu ren/null 91 | 傣族语/nz/dai zu yu/null 92 | 僳僳族/nz/su su zu/null 93 | 僳僳族人/nz/su su zu ren/null 94 | 僳僳族语/nz/su su zu yu/null 95 | 佤族/nz/wa zu/null 96 | 佤族人/nz/wa zu ren/null 97 | 佤族语/nz/wa zu yu/null 98 | 拉祜族/nz/la hu zu/null 99 | 拉祜族人/nz/la hu zu ren/null 100 | 拉祜族语/nz/la hu zu yu/null 101 | 纳西族/nz/na xi zu/null 102 | 纳西族人/nz/na xi zu ren/null 103 | 纳西族语/nz/na xi zu yu/null 104 | 景颇族/nz/jing po zu/null 105 | 景颇族人/nz/jing po zu ren/null 106 | 景颇族语/nz/jing po zu yu/null 107 | 布朗族/nz/bu lang zu/null 108 | 布朗族人/nz/bu lang zu ren/null 109 | 布朗族语/nz/bu lang zu yu/null 110 | 阿昌族/nz/a chang zu/null 111 | 阿昌族人/nz/a chang zu ren/null 112 | 阿昌族语/nz/a chang zu yu/null 113 | 普米族/nz/pu mi zu/null 114 | 普米族人/nz/pu mi zu ren/null 115 | 普米族语/nz/pu mi zu yu/null 116 | 怒族/nz/nu zu/null 117 | 怒族人/nz/nu zu ren/null 118 | 怒族语/nz/nu zu yu/null 119 | 德昂族/nz/de ang zu/null 120 | 德昂族人/nz/de ang zu ren/null 121 | 德昂族语/nz/de ang zu yu/null 122 | 独龙族/nz/du long zu/null 123 | 独龙族人/nz/du long zu ren/null 124 | 独龙族语/nz/du long zu yu/null 125 | 基诺族/nz/ji nuo zu/null 126 | 基诺族人/nz/ji nuo zu ren/null 127 | 
基诺族语/nz/ji nuo zu yu/null 128 | 苗族/nz/miao zu/null 129 | 苗族人/nz/miao zu ren/null 130 | 苗族语/nz/miao zu yu/null 131 | 布依族/nz/bu yi zu/null 132 | 布依族人/nz/bu yi zu ren/null 133 | 布依族语/nz/bu yi zu yu/null 134 | 侗族/nz/dong zu/null 135 | 侗族人/nz/dong zu ren/null 136 | 侗族语/nz/dong zu yu/null 137 | 水族/nz/shui zu/null 138 | 水族人/nz/shui zu ren/null 139 | 水族语/nz/shui zu yu/null 140 | 仡佬族/nz/ge lao zu/null 141 | 仡佬族人/nz/ge lao zu ren/null 142 | 仡佬族语/nz/ge lao zu yu/null 143 | 壮族/nz/zhuang zu/null 144 | 壮族人/nz/zhuang zu ren/null 145 | 壮族语/nz/zhuang zu yu/null 146 | 瑶族/nz/yao zu/null 147 | 瑶族人/nz/yao zu ren/null 148 | 瑶族语/nz/yao zu yu/null 149 | 仫佬族/nz/mu lao zu/null 150 | 仫佬族人/nz/mu lao zu ren/null 151 | 仫佬族语/nz/mu lao zu yu/null 152 | 毛南族/nz/mao nan zu/null 153 | 毛南族人/nz/mao nan zu ren/null 154 | 毛南族语/nz/mao nan zu yu/null 155 | 京族/nz/jing zu/null 156 | 京族人/nz/jing zu ren/null 157 | 京族语/nz/jing zu yu/null 158 | 土家族/nz/tu jia zu/null 159 | 土家族人/nz/tu jia zu ren/null 160 | 土家族语/nz/tu jia zu yu/null 161 | 黎族/nz/li zu/null 162 | 黎族人/nz/li zu ren/null 163 | 黎族语/nz/li zu yu/null 164 | 畲族/nz/she zu/null 165 | 畲族人/nz/yu zu ren/null 166 | 畲族语/nz/yu zu yu/null 167 | 高山族/nz/gao shan zu/null 168 | 高山族人/nz/gao shan zu ren/null 169 | 高山族语/nz/gao shan zu yu/null 170 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-company.lex: -------------------------------------------------------------------------------- 1 | CJK_WORDS 2 | 央视/nt/yang shi/null 3 | 电信/nt/dian xin/null 4 | 移动/nt/yi dong/null 5 | 网通/nt/wang tong/null 6 | 联通/nt/lian tong/null 7 | 铁通/nt/tie tong/null 8 | 百度/nt/bai du/null 9 | 环球网/nt/huan qiu wang/null 10 | 长城网/nt/chang cheng wang/null 11 | 新浪/nt/xin lang/null 12 | 腾讯/nt/teng xun/null 13 | 搜搜/nt/so so/soso 14 | 谷歌/nt/gu ge/null 15 | 雅虎/nt/ya hu/null 16 | 微软/nt/wei ruan/null 17 | 中关村/nt/zhong guan cun/null 18 | 搜狐/nt/sou hu/null 19 | 网易/nt/wang yi/null 20 | 硅谷/nt/gui gu/null 21 | 维基百科/nt/wei ji bai ke/null 22 | 
巨人网络/nt/ju ren wang luo/null 23 | 阿里巴巴/nt/a li ba ba/null 24 | 阿里旺旺/nt/a li ba ba/旺旺 25 | 旺旺/n/wang wang/null 26 | 淘宝/nt/tao bao/null 27 | 赶集网/nt/gan ji wang/null 28 | 猪八戒网/nt/zhu ba jie wang/null 29 | 唯你英语/nt/wei ni ying yu/null 30 | 拉手网/nt/la shou wang/null 31 | 百贯福泰/nt/bai guan fu tai/null 32 | 汇划算/nt/hui hua suan/null 33 | 汇划算网/nt/hui hua suan wang/null 34 | 聚划算/nt/ju hua suan/null 35 | 天猫/nt/tian mao/null 36 | 天猫网/nt/tian mao wang/null 37 | 亚马逊/nt/ya ma xun/null 38 | 亚马逊网/nt/ya ma xun wang/null 39 | 拍拍/nt/pai pai/null 40 | 拍拍网/nt/pai pai wang/null 41 | 京东/nt/jing dong/null 42 | 京东商城/nt/jing dong shang cheng/null 43 | 返利网/nt/fan li wang/null 44 | 支付宝/nt/zhi fu bao/null 45 | 支付宝担保/nt/zhi fu bao dan bao/null 46 | 支付宝及时到帐/nt/zhi fu bao ji shi dao zhang/null 47 | 支付宝双工能/nt/zhi fu bao shuang gong neng/null 48 | 财付通/nt/cai fu tong/null 49 | 财付通及时到帐/nt/cai fu tong ji shi dao zhang/null 50 | 网银在线/nt/wang yin zai xian/null 51 | 苏宁易购/nt/su ning yi gou/null 52 | 苏宁电器/nt/su ning dian qi/null 53 | 仙童公司/nt/xian tong gong si/null 54 | 开源中国/nt/kai yuan zhong guo/null 55 | 畅想网络/nt/chang xiang wang luo/null 56 | 快乐大本营/nt/kuai yue da ben ying/null 57 | 越策越开心/nt/yue ce yue kai xin/null 58 | 超级男声/nt/chao ji nan sheng/null 59 | 超男/nt/chao nan/null 60 | 超级女声/nt/chao ji nu sheng/超女 61 | 超女/nt/chao nu/超级女声 62 | 好声音/nt/hao sheng yin/null 63 | 快乐男声/nt/kuai yue nan sheng/快男 64 | 快男/nt/kuai nan/快乐男声 65 | 快乐女声/nt/kuai yue nu sheng/null 66 | 快女/nt/kuai nu/null 67 | 德克士/nt/de ke shi/null 68 | 肯德基/nt/ken de ji/null 69 | 奥利奥/nt/ao li ao/null 70 | 回头客/nt/hui tou ke/null 71 | 苏波尔/nt/su bo er/null 72 | 苏宁/nt/su ning/null 73 | 苏宁电器/nt/su ning dian qi/null 74 | 苏宁易购/nt/su ning yi gou/null 75 | 中央银行/nt/zhong yang yin hang/null 76 | 人民银行/nt/ren min yin hang/null 77 | 工商银行/nt/gong shang yin hang/null 78 | 农业银行/nt/nong ye yin xing/null 79 | 中国银行/nt/zhong guo yin hang/null 80 | 建设银行/nt/jian she yin xing/null 81 | 交通银行/nt/jiao tong yin hang/null 82 | 华夏银行/nt/hua xia yin hang/null 83 | 光大银行/nt/guang da yin 
xing/null 84 | 招商银行/nt/zhao shang yin xing/null 85 | 中信银行/nt/zhong xin yin hang/null 86 | 兴业银行/nt/xing ye yin hang/null 87 | 民生银行/nt/min sheng yin xing/null 88 | 深圳发展银行/nt/shen zhen fa zhan yin xing/null 89 | 广东发展银行/nt/guang dong fa zhan yin xing/null 90 | 上海浦东发展银行/nt/shang hai pu dong fa zhan yin hang/null 91 | 恒丰银行/nt/heng feng yin xing/null 92 | 农业发展银行/nt/nong ye fa zhan yin xing/null 93 | 国家进出口信贷银行/nt/guo jia jin chu kou xin dai yin xing/null 94 | 国家开发银行/nt/guo jia kai fa yin hang/null 95 | 北京商业银行/nt/bei jing shang ye yin xing/null 96 | 上海银行/nt/shang hai yin xing/null 97 | 济南商业银行/nt/ji nan shang ye yin xing/null 98 | 信用社/nt/xin yong she/null 99 | 农村信用社/nt/nong cun xin yong she/null 100 | 邮政局/nt/you zheng ju/null 101 | 邮政储蓄银行/nt/you zheng chu xu yin xing/null 102 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-dname-1.lex: -------------------------------------------------------------------------------- 1 | CN_DNAME_1 2 | #双姓名首字词库 3 | 建 4 | 小 5 | 晓 6 | 文 7 | 志 8 | 国 9 | 玉 10 | 丽 11 | 永 12 | 海 13 | 春 14 | 金 15 | 明 16 | 新 17 | 德 18 | 秀 19 | 红 20 | 亚 21 | 伟 22 | 雪 23 | 俊 24 | 桂 25 | 爱 26 | 美 27 | 世 28 | 正 29 | 庆 30 | 学 31 | 家 32 | 立 33 | 淑 34 | 振 35 | 云 36 | 华 37 | 光 38 | 惠 39 | 兴 40 | 天 41 | 长 42 | 艳 43 | 慧 44 | 利 45 | 宏 46 | 佳 47 | 瑞 48 | 凤 49 | 荣 50 | 秋 51 | 继 52 | 嘉 53 | 卫 54 | 燕 55 | 思 56 | 维 57 | 少 58 | 福 59 | 忠 60 | 宝 61 | 子 62 | 成 63 | 月 64 | 洪 65 | 东 66 | 一 67 | 泽 68 | 林 69 | 大 70 | 素 71 | 旭 72 | 宇 73 | 智 74 | 锦 75 | 冬 76 | 玲 77 | 雅 78 | 伯 79 | 翠 80 | 传 81 | 启 82 | 剑 83 | 安 84 | 树 85 | 良 86 | 中 87 | 梦 88 | 广 89 | 昌 90 | 元 91 | 万 92 | 清 93 | 静 94 | 友 95 | 宗 96 | 兆 97 | 丹 98 | 克 99 | 彩 100 | 绍 101 | 喜 102 | 远 103 | 朝 104 | 敏 105 | 培 106 | 胜 107 | 祖 108 | 先 109 | 菊 110 | 士 111 | 向 112 | 有 113 | 连 114 | 军 115 | 健 116 | 巧 117 | 耀 118 | 莉 119 | 英 120 | 方 121 | 和 122 | 仁 123 | 孝 124 | 梅 125 | 汉 126 | 兰 127 | 松 128 | 水 129 | 江 130 | 益 131 | 开 132 | 景 133 | 运 134 | 贵 135 | 祥 136 | 青 137 | 芳 138 | 碧 
139 | 婷 140 | 龙 141 | 鹏 142 | 自 143 | 顺 144 | 双 145 | 书 146 | 生 147 | 义 148 | 跃 149 | 银 150 | 佩 151 | 雨 152 | 保 153 | 贤 154 | 仲 155 | 鸿 156 | 浩 157 | 加 158 | 定 159 | 炳 160 | 飞 161 | 锡莎 162 | 柏 163 | 发 164 | 超 165 | 道 166 | 怀 167 | 进 168 | 其 169 | 富 170 | 平 171 | 全 172 | 阳 173 | 吉 174 | 茂 175 | 彦 176 | 诗 177 | 洁 178 | 润 179 | 承 180 | 治 181 | 焕 182 | 如 183 | 君 184 | 增 185 | 善 186 | 希 187 | 根 188 | 应 189 | 勇 190 | 宜 191 | 守 192 | 会 193 | 凯 194 | 育 195 | 湘 196 | 凌 197 | 本 198 | 敬 199 | 博 200 | 延 201 | 乐 202 | 三 203 | 高 204 | 熙 205 | 逸 206 | 幸 207 | 灵 208 | 宣 209 | 才 210 | 述 211 | 化 212 | 那 213 | 紫 214 | 莎 215 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-dname-2.lex: -------------------------------------------------------------------------------- 1 | CN_DNAME_2 2 | #双姓名尾字词库 3 | 华 4 | 平 5 | 明 6 | 英 7 | 军 8 | 林 9 | 萍 10 | 芳 11 | 玲 12 | 红 13 | 生 14 | 霞 15 | 梅 16 | 文 17 | 荣 18 | 珍 19 | 兰 20 | 娟 21 | 峰 22 | 琴 23 | 云 24 | 辉 25 | 东 26 | 龙 27 | 敏 28 | 伟 29 | 强 30 | 丽 31 | 春 32 | 杰 33 | 燕 34 | 民 35 | 君 36 | 波 37 | 国 38 | 芬 39 | 清 40 | 祥 41 | 斌 42 | 婷 43 | 飞 44 | 良 45 | 忠 46 | 新 47 | 凤 48 | 锋 49 | 成 50 | 勇 51 | 刚 52 | 玉 53 | 元 54 | 宇 55 | 海 56 | 兵 57 | 安 58 | 庆 59 | 涛 60 | 鹏 61 | 亮 62 | 青 63 | 阳 64 | 艳 65 | 松 66 | 江 67 | 莲 68 | 娜 69 | 兴 70 | 光 71 | 德 72 | 武 73 | 香 74 | 俊 75 | 秀 76 | 慧 77 | 雄 78 | 才 79 | 宏 80 | 群 81 | 琼 82 | 胜 83 | 超 84 | 彬 85 | 莉 86 | 中 87 | 山 88 | 富 89 | 花 90 | 宁 91 | 利 92 | 贵 93 | 福 94 | 发 95 | 义 96 | 蓉 97 | 喜 98 | 娥 99 | 昌 100 | 仁 101 | 志 102 | 全 103 | 宝 104 | 权 105 | 美 106 | 琳 107 | 建 108 | 金 109 | 贤 110 | 星 111 | 丹 112 | 根 113 | 和 114 | 珠 115 | 康 116 | 菊 117 | 琪 118 | 坤 119 | 泉 120 | 秋 121 | 静 122 | 佳 123 | 顺 124 | 源 125 | 珊 126 | 达 127 | 欣 128 | 如 129 | 莹 130 | 章 131 | 浩 132 | 勤 133 | 芹 134 | 容 135 | 友 136 | 芝 137 | 豪 138 | 洁 139 | 鑫 140 | 惠 141 | 洪 142 | 旺 143 | 虎 144 | 远 145 | 妮 146 | 森 147 | 妹 148 | 南 149 | 雯 150 | 奇 151 | 健 152 | 卿 153 | 虹 154 | 娇 155 | 媛 156 | 怡 157 | 铭 158 | 川 159 
| 进 160 | 博 161 | 智 162 | 来 163 | 琦 164 | 学 165 | 聪 166 | 洋 167 | 乐 168 | 年 169 | 翔 170 | 然 171 | 栋 172 | 凯 173 | 颖 174 | 鸣 175 | 丰 176 | 瑞 177 | 奎 178 | 立 179 | 堂 180 | 威 181 | 雪 182 | 鸿 183 | 晶 184 | 桂 185 | 凡 186 | 娣 187 | 先 188 | 洲 189 | 毅 190 | 雅 191 | 月 192 | 旭 193 | 田 194 | 晖 195 | 方 196 | 恒 197 | 亚 198 | 泽 199 | 风 200 | 银 201 | 高 202 | 贞 203 | 九 204 | 薇 205 | 钰 206 | 城 207 | 宜 208 | 厚 209 | 耐 210 | 声 211 | 腾 212 | 宸 213 | 勋 214 | 曲 215 | 轩 216 | 棋 217 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-ecmixed.lex: -------------------------------------------------------------------------------- 1 | EC_MIXED_WORD 2 | #英文中文混合字, 注意英文字符均为小写 3 | a咖/n/a ga/主角 4 | a片/n/a pian/毛片,av 5 | a座/f/a zuo/null 6 | a股/n/a gu/股票 7 | a型/n/a xing/null 8 | a杯/n/a bei/a罩杯 9 | a罩杯/n/a zhao bei/a杯 10 | a计划/n/a ji hua/null 11 | aa制/I/aa zhi/null 12 | ab型/n/ab xing/null 13 | ab档案/n/ab dang an/null 14 | a美a/n/null/null 15 | a梦/a/null/null 16 | x-射线/n/null/null 17 | # 18 | b座/f/b zuo/null 19 | b股/n/b gu/null 20 | b型/n/b xing/null 21 | b树/n/b shu/null 22 | b计划/n/b ji hua/null 23 | b超/n/b chao/null 24 | b杯/n/b bei/b罩杯 25 | b罩杯/n/b zhao bei/b杯 26 | bb机/n/bb ji/call机 27 | bb仔/n/bb zai/null 28 | bp机/n/bp ji/null 29 | b型/n/b xing/null 30 | b型肝炎/n/b xing gan yan/乙型肝炎 31 | # 32 | c盘/n/c pan/null 33 | c座/f/c zuo/null 34 | c语言/n/c yu yan/null 35 | c杯/n/c bei/c罩杯 36 | c罩杯/n/c zhao bei/c杯 37 | cd盒/n/cd he/null 38 | cd机/n/cd ji/null 39 | call机/n/call ji/bb机 40 | # 41 | d盘/n/d pan/null 42 | d座/f/d zuo/null 43 | d版/n/d ban/null 44 | d杯/n/d bei/d罩杯 45 | d罩杯/n/d zhao bei/d杯 46 | dna鉴定/n/dna jian ding/null 47 | # 48 | e盘/n/e pan/null 49 | e座/f/e zuo/null 50 | e化/n/e hua/null 51 | e通/n/e tong/null 52 | e仔/n/e zai/null 53 | e语言/n/e yu yan/易语言 54 | e杯/n/e bei/e罩杯 55 | e罩杯/n/e zhao bei/e杯 56 | # 57 | f盘/n/f pan/null 58 | f座/f/f zuo/null 59 | f杯/n/f bei/f罩杯 60 | f罩杯/b/f zhao bei/f杯 61 | # 62 | g盘/n/g pan/null 63 | g点/n/g dian/null 64 | g杯/n/g 
bei/g罩杯 65 | g罩杯/n/g zhao bei/g杯 66 | # 67 | h盘/n/h pan/null 68 | h股/n/h gu/null 69 | h杯/n/h bei/h罩杯 70 | h罩杯/n/h zhao bei/h杯 71 | # 72 | i盘/n/i pan/null 73 | ic卡/n/ic ka/null 74 | ip卡/n/ip ka/null 75 | ip段/n/ip duan/null 76 | ip电话/n/ip dian hua/null 77 | ip地址/n/ip di zhi/null 78 | it行业/n/it hang ye/null 79 | it民工/n/it ming gong/码农 80 | it男/n/it nan/null 81 | # 82 | j盘/n/j pan/null 83 | # 84 | k仔/n/k zai/null 85 | k盘/n/k pan/null 86 | k党/n/k dang/null 87 | k书/v/k shu/看书,搞学习 88 | k粉/n/k fen/氯胺酮 89 | k歌/v/k ge/唱歌,嗨歌 90 | k他命/n/k ta ming/null 91 | k歌之王/n/k ge zhi wang/null 92 | # 93 | n年/n/n nian/很久 94 | # 95 | o型/n/o xing/null 96 | # 97 | pc机/n/pc ji/null 98 | ph值/n/ph zhi/null 99 | # 100 | sim卡/n/sim ka/null 101 | # 102 | u盘/n/u pan/null 103 | u形/n/u xing/null 104 | usb手指/n/usb shou zhi/null 105 | usb接口/n/usb jie kou/null 106 | usb插口/n/usb cha kou/null 107 | usb记忆棒/n/usb ji yi bang/null 108 | # 109 | visa卡/n/visa ka/null 110 | v沟/n/v gou/null 111 | # 112 | z盘/n/z pan/null 113 | # 114 | q版/n/q ban/null 115 | qq号/n/qq hao/null 116 | q立方/n/q li fang/null 117 | q币/n/q bi/null 118 | # 119 | rss订阅/n/rss ding yue/null 120 | # 121 | t盘/n/t pan/null 122 | # 123 | x光/n/x guan/null 124 | x光线/n/x guan xian/x射线 125 | x射线/n/x she xian/x光线 126 | γ射线/n/γ she xian/null 127 | # 128 | t恤衫/n/t xue shan/t恤 129 | t恤/n/t xue/t恤衫 130 | t字帐/n/t zi zhang/null 131 | t型台/n/t xing tai/null 132 | # 133 | 250g硬盘/n/250g ying pan/null 134 | 160g硬盘/n/160g ying pan/null 135 | 500g硬盘/n/500g ying pan/null 136 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-en-pun.lex: -------------------------------------------------------------------------------- 1 | EN_PUN_WORDS 2 | #英文和标点组合成的词,英文字母统一使用小写。 3 | c++ 4 | g++ 5 | c# 6 | i++ 7 | x- 8 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-en.lex: -------------------------------------------------------------------------------- 1 | 
EN_WORD 2 | #英文词条, 做英文词语同义词追加用 3 | decimal/n/null/decimals,fraction 4 | spirit/n/null/mind 5 | admire/v/null/appreciate,like,love,enjoy 6 | chenxin12/n/null/chenxin,lionsoul 7 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-festival.lex: -------------------------------------------------------------------------------- 1 | CJK_WORDS 2 | 七七纪念日/t/qi qi ji nian ri/null 3 | 七夕/t/qi xi/七夕情人节,情人节,中国情人节 4 | 七夕情人节/t/qi xi qing ren jie/七夕,中国情人节,情人节 5 | 七夕节/t/qi xi jie/七夕,情人节,中国情人节 6 | 万圣节/t/wan sheng jie/鬼节 7 | 世界人权日/t/shi jie ren quan ri/null 8 | 世界儿歌节/t/shi jie r ge jie/null 9 | 世界儿童节/t/shi jie r tong jie/null 10 | 世界动物日/t/shi jie dong wu ri/null 11 | 世界卫生日/t/shi jie wei sheng ri/null 12 | 世界地球日/t/shi jie di qiu ri/null 13 | 世界教师日/t/shi jie jiao shi ri/null 14 | 世界无烟日/t/shi jie wu yan ri/null 15 | 世界无童工日/t/shi jie wu tong gong ri/null 16 | 世界林业节/t/shi jie lin ye jie/null 17 | 世界森林日/t/shi jie sen lin ri/null 18 | 世界水日/t/shi jie shui ri/null 19 | 世界海洋日/t/shi jie hai yang ri/null 20 | 世界湿地日/t/shi jie shi di ri/null 21 | 世界献血日/t/shi jie xian xie ri/null 22 | 世界环境日/t/shi jie huan jing ri/null 23 | 世界电视日/t/shi jie dian shi ri/null 24 | 世界睡眠日/t/shi jie shui mian ri/null 25 | 世界粮食日/t/shi jie liang shi ri/null 26 | 世界精神卫生日/t/shi jie jing shen wei sheng ri/null 27 | 世界红十字日/t/shi jie hong shi zi ri/null 28 | 世界问候日/t/shi jie wen hou ri/null 29 | 中国人民抗日战争纪念日/t/zhong guo ren min kang ri zhan zheng ji nian ri/null 30 | 抗日战争纪念日/t/kang ri zhan zheng ji nian ri/null 31 | 中国国耻日/t/zhong guo guo chi ri/null 32 | 中国学生营养日/t/zhong guo xue sheng ying yang ri/null 33 | 中国爱牙日/t/zhong guo ai ya ri/null 34 | 中国爱耳日/t/zhong guo ai er ri/null 35 | 中国青年志愿者服务日/t/zhong guo qing nian zhi yuan zhe fu wu ri/null 36 | 中国青年节/t/zhong guo qing nian jie/null 37 | 中秋/t/zhong qiu/null 38 | 中秋节/t/zhong qiu jie/null 39 | 人口日/t/ren kou ri/null 40 | 人权日/t/ren quan ri/null 41 | 儿歌节/t/er ge jie/null 42 | 儿童节/t/er tong jie/null 43 | 元宵/t/yuan xiao/null 44 | 
元宵节/t/yuan xiao jie/null 45 | 元旦/t/yuan dan/null 46 | 元旦节/t/yuan dan jie/null 47 | 党生日/t/dang sheng ri/null 48 | 全国中小学生安全教育日/t/quan guo zhong xiao xue sheng an quan jiao yu ri/null 49 | 全国助残日/t/quan guo zhu can ri/null 50 | 全国爱眼日/t/quan guo ai yan ri/null 51 | 全国爱耳日/t/quan guo ai er ri/null 52 | 六十亿人口日/t/liu shi yi ren kou ri/null 53 | 六四纪念日/t/liu si ji nian ri/null 54 | 冬至/t/dong zhi/null 55 | 减轻自然灾害日/t/jian qing zi ran zai hai ri/null 56 | 动物日/t/dong wu ri/null 57 | 助残日/t/zhu can ri/null 58 | 劳动妇女节/t/lao dong fu nu: jie/null 59 | 劳动节/t/lao dong jie/null 60 | 博物馆日/t/bo wu guan ri/null 61 | 卫生日/t/wei sheng ri/null 62 | 和平日/t/he ping ri/null 63 | 国庆/t/guo qing/null 64 | 国庆节/t/guo qing jie/null 65 | 国耻日/t/guo chi ri/null 66 | 国际儿童节/t/guo ji er tong jie/null 67 | 国际减轻自然灾害日/t/guo ji jian qing zi ran zai hai ri/null 68 | 国际劳动妇女节/t/guo ji lao dong fu nu: jie/null 69 | 国际劳动节/t/guo ji lao dong jie/null 70 | 国际博物馆日/t/guo ji bo wu guan ri/null 71 | 国际和平日/t/guo ji he ping ri/null 72 | 国际奥林匹克日/t/guo ji ao lin pi ke ri/null 73 | 国际妇女节/t/guo ji fu nu: jie/null 74 | 国际容忍日/t/guo ji rong ren ri/null 75 | 国际左撇子日/t/guo ji zuo pie zi ri/null 76 | 国际志愿者日/t/guo ji zhi yuan zhe ri/null 77 | 国际护士节/t/guo ji hu shi jie/null 78 | 国际无车日/t/guo ji wu che ri/null 79 | 国际残疾人日/t/guo ji can ji ren ri/null 80 | 国际母语日/t/guo ji mu yu ri/null 81 | 国际气象节/t/guo ji qi xiang jie/null 82 | 国际消费者权益日/t/guo ji xiao fei zhe quan yi ri/null 83 | 国际牛奶日/t/guo ji niu nai ri/null 84 | 国际盲人节/t/guo ji mang ren jie/null 85 | 国际禁毒日/t/guo ji jin du ri/null 86 | 国际老人日/t/guo ji lao ren ri/null 87 | 国际臭氧层保护日/t/guo ji chou yang ceng bao hu ri/null 88 | 国际非洲儿童日/t/guo ji fei zhou r tong ri/null 89 | 国际音乐日/t/guo ji yin yue ri/null 90 | 国际麻风日/t/guo ji ma feng ri/null 91 | 圣诞节/t/sheng dan jie/null 92 | 地球日/t/di qiu ri/null 93 | 处暑/t/chu shu/null 94 | 复活节/t/fu huo jie/null 95 | 夏至/t/xia zhi/null 96 | 大寒/t/da han/null 97 | 大暑/t/da shu/null 98 | 大雪/t/da xue/null 99 | 奥林匹克日/t/ao lin pi ke ri/null 100 | 妇女节/t/fu nv jie/null 101 | 
三八节/t/san ba jie/null 102 | 三八妇女节/t/san ba fu nu: jie/null 103 | 学生营养日/t/xue sheng ying yang ri/null 104 | 安全教育日/t/an quan jiao yu ri/null 105 | 安全日/t/an quan ri/null 106 | 容忍日/t/rong ren ri/null 107 | 寒露/t/han lu/null 108 | 小寒/t/xiao han/null 109 | 小年/t/xiao nian/null 110 | 小暑/t/xiao shu/null 111 | 小满/t/xiao man/null 112 | 小雪/t/xiao xue/null 113 | 左撇子日/t/zuo pie zi ri/null 114 | 平安夜/t/ping an ye/null 115 | 建党日/t/jian dang ri/null 116 | 建军节/t/jian jun jie/null 117 | 志愿人员日/t/zhi yuan ren yuan ri/null 118 | 志愿者日/t/zhi yuan zhe ri/null 119 | 情人节/t/qing ren jie/null 120 | 惊蛰/t/jing zhe/null 121 | 愚人节/t/yu ren jie/null 122 | 感恩节/t/gan en jie/null 123 | 扫房日/t/sao fang ri/null 124 | 抗日战争纪念日/t/kang ri zhan zheng ji nian ri/null 125 | 抗日纪念日/t/kang ri ji nian ri/null 126 | 护士节/t/hu shi jie/null 127 | 教师日/t/jiao shi ri/null 128 | 教师节/t/jiao shi jie/null 129 | 文化遗产日/t/wen hua yi chan ri/null 130 | 无烟日/t/wu yan ri/null 131 | 无童工日/t/wu tong gong ri/null 132 | 无车日/t/wu che ri/null 133 | 春分/t/chun fen/null 134 | 春节/t/chun jie/null 135 | 植树节/t/zhi shu jie/null 136 | 残疾人日/t/can ji ren ri/null 137 | 母亲节/t/mu qin jie/null 138 | 母语日/t/mu yu ri/null 139 | 气象节/t/qi xiang jie/null 140 | 水日/t/shui ri/null 141 | 海洋日/t/hai yang ri/null 142 | 消费者权益日/t/xiao fei zhe quan yi ri/null 143 | 清明/t/qing ming/null 144 | 清明节/t/qing ming jie/null 145 | 湿地日/t/shi di ri/null 146 | 爱牙日/t/ai ya ri/null 147 | 爱眼日/t/ai yan ri/null 148 | 爱耳日/t/ai er ri/null 149 | 父亲节/t/fu qin jie/null 150 | 牛奶日/t/niu nai ri/null 151 | 独立日/t/du li ri/null 152 | 献血日/t/xian xie ri/null 153 | 环境日/t/huan jing ri/null 154 | 电视日/t/dian shi ri/null 155 | 白露/t/bai lu/null 156 | 盲人节/t/mang ren jie/null 157 | 睡眠日/t/shui mian ri/null 158 | 秋分/t/qiu fen/null 159 | 立冬/t/li dong/null 160 | 立夏/t/li xia/null 161 | 立春/t/li chun/null 162 | 立秋/t/li qiu/null 163 | 端午节/t/duan wu jie/null 164 | 粮食日/t/liang shi ri/null 165 | 精神卫生日/t/jing shen wei sheng ri/null 166 | 红十字日/t/hong shi zi ri/null 167 | 老人日/t/lao ren ri/null 168 | 联合国日/t/lian he guo 
ri/null 169 | 腊八节/t/la ba jie/null 170 | 腊日/t/la ri/null 171 | 臭氧保护日/t/chou yang bao hu ri/null 172 | 臭氧层保护日/t/chou yang ceng bao hu ri/null 173 | 芒种/t/mang zhong/null 174 | 营养日/t/ying yang ri/null 175 | 谷雨/t/gu yu/null 176 | 重阳/t/chong yang/null 177 | 重阳节/t/chong yang jie/null 178 | 问候日/t/wen hou ri/null 179 | 除夕/t/chu xi/null 180 | 雨水/t/yu shui/null 181 | 霜降/t/shuang jiang/null 182 | 青年志愿者服务日/t/qing nian zhi yuan zhe fu wu ri/null 183 | 青年节/t/qing nian jie/null 184 | 非洲儿童日/t/fei zhou r tong ri/null 185 | 音乐日/t/yin yue ri/null 186 | 麻风日/t/ma feng ri/null 187 | 龙头节/t/long tou jie/null 188 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-fname.lex: -------------------------------------------------------------------------------- 1 | CJK_WORDS 2 | #西方姓氏词库 3 | 亚历山大/nr/ya li shan da/null 4 | 克林顿/nr/ke ling dun/null 5 | 克里斯汀/nr/ke li si ding/null 6 | 布什/nr/bu shi/null 7 | 布莱尔/nr/bu lai er/null 8 | 科特勒/nr/ke te lei/null 9 | 约翰/nr/yue han/null 10 | 约翰逊/nr/yue han xun/null 11 | 蒂娜/nr/di na/null 12 | 安妮/nr/an ni/null 13 | 咪咪/nr/mi mi/null 14 | 妮可/nr/ni ke/null 15 | 凯蒂/nr/kai di/null 16 | #外人翻译名字# 17 | 阿汤哥/nr/a tang ge/汤姆·克鲁斯 18 | 汤姆·克鲁斯/nr/tang mu ke lu si/阿汤哥 19 | 咪咪·罗杰斯/nr/mi mi luo jie si/null 20 | 妮可·基德曼/nr/ni ke ji de man/null 21 | 凯蒂·赫尔墨斯/nr/ka di he er mo si/null 22 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-food.lex: -------------------------------------------------------------------------------- 1 | CJK_WORDS 2 | 雪碧/n/xue bi/null 3 | 可口可乐/n/ke kou ke le/null 4 | 冰红茶/n/bing hong cha/null 5 | 奶茶/n/nai cha/null 6 | 花生奶/n/hua sheng nai/null 7 | 芬达/n/fen da/null 8 | 珍珠奶茶/n/zhen zhu nai cha/null 9 | 达利源/n/da li yuan/null 10 | 肯德鸡/n/ken de ji/null 11 | 炸薯条/n/zha shu tiao/null 12 | 麻辣烫/n/ma la tang/null 13 | 麻辣干锅/n/ma la gan guo/null 14 | -------------------------------------------------------------------------------- 
/WebRoot/WEB-INF/lib/lexicon/lex-lang.lex: -------------------------------------------------------------------------------- 1 | CJK_WORDS 2 | 中文/n/zhong wen/国语 3 | 国语/n/guo yu/null 4 | 台湾话/n/tai wan hua/台语 5 | 台语/n/tai yu/台湾话 6 | 客家话/n/ke jia hua/null 7 | 汉字/n/han zi/null 8 | 汉语/n/han yu/国语,中文 9 | 法文/n/fa wen/法文 10 | 法语/n/fa yu/法语 11 | 福建话/n/fu jian hua/null 12 | 粤语/n/yue yu/广东话 13 | 美语/n/mei yu/英语,英文 14 | 英文/n/ying wen/英语 15 | 英语/n/ying yu/英文 16 | 西班牙语/n/xi ban ya yu/null 17 | 闽南语/n/min nan yu/null 18 | 泰语/n/tai yu/null 19 | 西班牙语/n/xi ban ya yu/null 20 | 俄罗斯语/n/e luo si yu/null 21 | 拉丁语/n/la ding yu/null 22 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-ln-adorn.lex: -------------------------------------------------------------------------------- 1 | CN_LNAME_ADORN 2 | #姓氏修饰,例如:老陈,小陈,中的老,小 3 | #如果他已经是姓氏(lex-lname.lex中的词),则无须放在这里。 4 | 老 5 | 小 -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-lname.lex: -------------------------------------------------------------------------------- 1 | CN_LNAME 2 | #中文姓氏词库 3 | #单姓 4 | 王 5 | 李 6 | 张 7 | 刘 8 | 陈 9 | 杨 10 | 周 11 | 黄 12 | 孙 13 | 吴 14 | 徐 15 | 赵 16 | 林 17 | 胡 18 | 朱 19 | 梁 20 | 郭 21 | 高 22 | 何 23 | 马 24 | 郑 25 | 罗 26 | 宋 27 | 唐 28 | 谢 29 | 叶 30 | 韩 31 | 任 32 | 潘 33 | 于 34 | 冯 35 | 蒋 36 | 董 37 | 吕 38 | 邓 39 | 许 40 | 曹 41 | 曾 42 | 袁 43 | 汪 44 | 程 45 | 田 46 | 彭 47 | 钟 48 | 蔡 49 | 魏 50 | 沈 51 | 方 52 | 卢 53 | 余 54 | 杜 55 | 丁 56 | 苏 57 | 贾 58 | 姚 59 | 姜 60 | 陆 61 | 戴 62 | 傅 63 | 夏 64 | 廖 65 | 萧 66 | 石 67 | 江 68 | 范 69 | 今 70 | 谭 71 | 邹 72 | 崔 73 | 薛 74 | 邱 75 | 康 76 | 史 77 | 侯 78 | 邵 79 | 熊 80 | 秦 81 | 雷 82 | 孟 83 | 庞 84 | 白 85 | 毛 86 | 郝 87 | 钱 88 | 段 89 | 俞 90 | 洪 91 | 汤 92 | 顾 93 | 贺 94 | 龚 95 | 尹 96 | 万 97 | 龙 98 | 赖 99 | 章 100 | 孔 101 | 武 102 | 邢 103 | 颜 104 | 梅 105 | 阮 106 | 黎 107 | 常 108 | 倪 109 | 施 110 | 乔 111 | 樊 112 | 严 113 | 齐 114 | 陶 115 | #向 116 | 温 117 | 文 118 | 易 119 | 兰 120 | 闫 
121 | 芦 122 | 牛 123 | 尚 124 | 安 125 | 管 126 | 殷 127 | 霍 128 | 翟 129 | 佘 130 | 葛 131 | 庄 132 | 伍 133 | 辛 134 | 练 135 | 申 136 | 付 137 | 曲 138 | 焦 139 | 项 140 | 代 141 | 鲁 142 | 季 143 | 覃 144 | 覃 145 | 毕 146 | 麦 147 | 阳 148 | 耿 149 | 舒 150 | 聂 151 | 盛 152 | 童 153 | 祝 154 | 柳 155 | 单 156 | 单 157 | 岳 158 | 骆 159 | 纪 160 | 欧 161 | 房 162 | 左 163 | 尤 164 | 凌 165 | 韦 166 | 景 167 | 詹 168 | 莫 169 | 郎 170 | 路 171 | 宁 172 | 宁 173 | 关 174 | 丛 175 | 翁 176 | 容 177 | 亢 178 | 柯 179 | 鲍 180 | 蒲 181 | 苗 182 | 牟 183 | 谷 184 | 裴 185 | 商 186 | 初 187 | 屈 188 | 成 189 | 包 190 | 游 191 | 司 192 | 祁 193 | 强 194 | 靳 195 | 甘 196 | 席 197 | 瞿 198 | 卜 199 | 褚 200 | 解 201 | 臧 202 | 时 203 | 费 204 | 班 205 | 华 206 | 全 207 | 涂 208 | 卓 209 | 党 210 | 饶 211 | 应 212 | 卫 213 | 丘 214 | 隋 215 | 米 216 | 闵 217 | 畅 218 | 喻 219 | 冉 220 | 宫 221 | 甄 222 | 宣 223 | 穆 224 | 谈 225 | 匡 226 | 帅 227 | 车 228 | 母 229 | 查 230 | 戚 231 | 符 232 | 缪 233 | 昌 234 | 娄 235 | 滕 236 | 位 237 | 奚 238 | 边 239 | 卞 240 | 桂 241 | 邝 242 | 苟 243 | 柏 244 | 井 245 | 冀 246 | 邬 247 | 吉 248 | 敖 249 | 桑 250 | 池 251 | 简 252 | 蔺 253 | 连 254 | 艾 255 | 蓝 256 | 窦 257 | 刚 258 | 封 259 | 占 260 | 迟 261 | 姬 262 | 刁 263 | 栾 264 | 冷 265 | 杭 266 | 植 267 | 郁 268 | 晋 269 | 虞 270 | 佟 271 | 苑 272 | 屠 273 | 藏 274 | 蒙 275 | 占 276 | 辜 277 | 廉 278 | 巩 279 | 麻 280 | 晏 281 | 相 282 | 师 283 | 鄢 284 | 泮 285 | 燕 286 | 岑 287 | 官 288 | 仲 289 | 羊 290 | 揭 291 | 仇 292 | 邸 293 | 宗 294 | 荆 295 | 盖 296 | 盖 297 | 粱 298 | 原 299 | 茅 300 | 荣 301 | 沙 302 | 郜 303 | 巫 304 | 鞠 305 | 罡 306 | 未 307 | 来 308 | 劳 309 | 诸 310 | 计 311 | 乐 312 | 乐 313 | 双 314 | 花 315 | 冼 316 | 尉 317 | 木 318 | 丰 319 | 寇 320 | 栗 321 | 况 322 | 干 323 | 楼 324 | 满 325 | 桑 326 | 湛 327 | 谌 328 | 储 329 | 邦 330 | 皮 331 | 楚 332 | 胥 333 | 明 334 | 平 335 | 腾 336 | 厉 337 | 仉 338 | 励 339 | 竺 340 | 闻 341 | 宇 342 | 支 343 | 都 344 | 折 345 | 旷 346 | 南 347 | 战 348 | 嵇 349 | 化 350 | 糜 351 | 衣 352 | 国 353 | 逄 354 | 门 355 | 崇 356 | 裘 357 | 薄 358 | 束 359 | 宿 360 | 东 361 | 降 362 | 逯 363 | 伊 364 | 修 365 | 粟 366 | 漆 367 | 阙 368 | 禹 369 | 先 370 | 银 
371 | 台 372 | #和 373 | 祖 374 | 惠 375 | 伦 376 | 候 377 | 阚 378 | 慕 379 | 戈 380 | 富 381 | 伏 382 | 僧 383 | 习 384 | 云 385 | 元 386 | 狄 387 | 危 388 | 雍 389 | 蔚 390 | 索 391 | 居 392 | 浦 393 | 权 394 | 税 395 | 谯 396 | 於 397 | 芮 398 | 濮 399 | 基 400 | 寿 401 | 凡 402 | 卿 403 | 酆 404 | 苻 405 | 保 406 | 郗 407 | 渠 408 | 琚 409 | 淡 410 | 由 411 | 豆 412 | 扈 413 | 仁 414 | 呼 415 | 矫 416 | 巢 417 | 盘 418 | 敬 419 | 巴 420 | 茆 421 | 鱼 422 | 戎 423 | 缠 424 | 区 425 | 幸 426 | 海 427 | 弓 428 | 阴 429 | 住 430 | 晁 431 | 菅 432 | 印 433 | 汝 434 | 历 435 | 么 436 | 乌 437 | 贡 438 | 妙 439 | 禤 440 | 荀 441 | 鹿 442 | 邰 443 | 随 444 | 雒 445 | 贝 446 | 录 447 | 鲜 448 | 茹 449 | 种 450 | 农 451 | 佐 452 | 赫 453 | 字 454 | 油 455 | #但 456 | 綦 457 | 美 458 | 利 459 | 钮 460 | 信 461 | 勾 462 | 火 463 | 昝 464 | 圣 465 | 颉 466 | 从 467 | 靖 468 | 开 469 | 公 470 | 那 471 | 山 472 | 智 473 | 补 474 | 虎 475 | 才 476 | 布 477 | 亓 478 | 药 479 | 造 480 | 普 481 | 五 482 | 仝 483 | 扆 484 | 暴 485 | 咸 486 | 庚 487 | 奕 488 | 锺 489 | 问 490 | 招 491 | 贵 492 | 巨 493 | 檀 494 | 厚 495 | 恽 496 | 过 497 | 达 498 | 邴 499 | 洛 500 | 忻 501 | 展 502 | 户 503 | 毋 504 | 暨 505 | 金 506 | #复姓 507 | 欧阳 508 | 上官 509 | 司徒 510 | 刘付 511 | 皇甫 512 | 长孙 513 | 相里 514 | 令狐 515 | 诸葛 516 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-nation.lex: -------------------------------------------------------------------------------- 1 | CJK_WORDS 2 | 东非/ns/dong fei/null 3 | 中华/ns/zhong hua/null 4 | 中华/ns/zhong hua/null 5 | 中华人民共和国/ns/zhong hua ren min gong he guo/null 6 | 中华民国/ns/zhong hua min guo/null 7 | 中国/ns/zhong guo/null 8 | 中國/nz/zhong guo/null 9 | 中非/ns/zhong fei/null 10 | 乌克兰/ns/wu ke lan/null 11 | 也门/ns/ye men/null 12 | 以色列/ns/yi se lie/null 13 | 伊拉克/ns/yi la ke/null 14 | 伊朗/ns/yi lang/null 15 | 俄罗斯/ns/e luo si/null 16 | 分类/ns/fen lei/null 17 | 加拿大/ns/jia na da/null 18 | 南非/ns/nan fei/null 19 | 古巴/ns/gu ba/null 20 | 台湾/ns/tai wan/null 21 | 埃及/ns/ai ji/null 22 | 塞尔维亚/ns/sai er wei ya/null 23 | 墨西哥/ns/mo xi ge/null 24 | 
威尔士/ns/wei er shi/null 25 | 尼日利亚/ns/ni ri li ya/null 26 | 巴比伦/ns/ba bi lun/null 27 | 希腊/ns/xi la/null 28 | 德国/ns/de guo/null 29 | 德意志/ns/de yi zhi/null 30 | 意大利/ns/yi da li/null 31 | 捷克/ns/jie ke/null 32 | 日本/ns/ri ben/null 33 | 朝鲜/ns/chao xian/null 34 | 比利时/ns/bi li shi/null 35 | 法兰西/ns/fa lan xi/null 36 | 法国/ns/fa guo/null 37 | 波兰/ns/bo lan/null 38 | 波黑/ns/bo hei/null 39 | 瑞典/ns/rui dian/null 40 | 瑞士/ns/rui shi/null 41 | 白俄罗斯/ns/bai e luo si/null 42 | 缅甸/ns/mian dian/null 43 | 美利坚/ns/mei li jian/null 44 | 美利坚合众国/ns/mei li jian he zhong guo/null 45 | 美国/ns/mei guo/null 46 | 老挝/ns/lao wo/null 47 | 苏格兰/ns/su ge lan/null 48 | 苏联/ns/su lian/null 49 | 英国/ns/ying guo/null 50 | 英格兰/ns/ying ge lan/null 51 | 葡萄牙/ns/pu tao ya/null 52 | 蒙古/ns/meng gu/null 53 | 西班牙/ns/xi ban ya/null 54 | 越南/ns/yue nan/null 55 | 韩国/ns/han guo/null 56 | 57 | #added at 2015-10-23 58 | 中国/ns/zhong guo/null 59 | 蒙古/ns/meng gu/null 60 | 朝鲜/ns/chao xian/null 61 | 韩国/ns/han guo/null 62 | 日本/ns/ri ben/null 63 | 菲律宾/ns/fei lv bin/null 64 | 越南/ns/yue nan/null 65 | 老挝/ns/lao wo/null 66 | 柬埔寨/ns/jian pu zhai/null 67 | 缅甸/ns/mian dian/null 68 | 泰国/ns/tai guo/null 69 | 马来西亚/ns/ma lai xi ya/null 70 | 文莱/ns/wen lai/null 71 | 新加坡/ns/xin jia po/null 72 | 印度尼西亚/ns/yi se lie/null 73 | 东帝汶 /ns/yi se lie/null 74 | 尼泊尔/ns/yi se lie/null 75 | 不丹/ns/yi se lie/null 76 | 孟加拉国/ns/yi se lie/null 77 | 印度/ns/yi se lie/null 78 | 巴基斯坦/ns/yi se lie/null 79 | 斯里兰卡/ns/yi se lie/null 80 | 马尔代夫 /ns/yi se lie/null 81 | 哈萨克斯坦/ns/yi se lie/null 82 | 吉尔吉斯斯坦/ns/yi se lie/null 83 | 塔吉克斯坦/ns/yi se lie/null 84 | 乌兹别克斯坦/ns/yi se lie/null 85 | 土库曼斯坦 /ns/yi se lie/null 86 | 阿富汗/ns/yi se lie/null 87 | 伊拉克/ns/yi se lie/null 88 | 伊朗/ns/yi se lie/null 89 | 叙利亚/ns/yi se lie/null 90 | 约旦/ns/yi se lie/null 91 | 黎巴嫩/ns/yi se lie/null 92 | 以色列/ns/yi se lie/null 93 | 巴勒斯坦/ns/yi se lie/null 94 | 沙特阿拉伯/ns/yi se lie/null 95 | 巴林/ns/yi se lie/null 96 | 卡塔尔/ns/yi se lie/null 97 | 科威特/ns/yi se lie/null 98 | 阿拉伯联合酋长国/ns/yi se lie/null 99 | 阿曼/ns/yi se 
lie/null 100 | 也门/ns/yi se lie/null 101 | 格鲁吉亚/ns/yi se lie/null 102 | 亚美尼亚/ns/yi se lie/null 103 | 阿塞拜疆/ns/yi se lie/null 104 | 土耳其/ns/yi se lie/null 105 | 塞浦路斯 /ns/yi se lie/null 106 | 芬兰/ns/yi se lie/null 107 | 瑞典/ns/yi se lie/null 108 | 挪威/ns/yi se lie/null 109 | 冰岛/ns/yi se lie/null 110 | 丹麦 法罗群岛/ns/yi se lie/null 111 | 爱沙尼亚/ns/yi se lie/null 112 | 拉脱维亚/ns/yi se lie/null 113 | 立陶宛/ns/yi se lie/null 114 | 白俄罗斯/ns/yi se lie/null 115 | 俄罗斯/ns/yi se lie/null 116 | 乌克兰/ns/yi se lie/null 117 | 摩尔多瓦 /ns/yi se lie/null 118 | 波兰/ns/yi se lie/null 119 | 捷克/ns/yi se lie/null 120 | 斯洛伐克/ns/yi se lie/null 121 | 匈牙利/ns/yi se lie/null 122 | 德国/ns/yi se lie/null 123 | 奥地利/ns/yi se lie/null 124 | 瑞士/ns/yi se lie/null 125 | 列支敦士登 /ns/yi se lie/null 126 | 英国/ns/yi se lie/null 127 | 爱尔兰/ns/yi se lie/null 128 | 荷兰/ns/yi se lie/null 129 | 比利时/ns/yi se lie/null 130 | 卢森堡/ns/yi se lie/null 131 | 法国/ns/yi se lie/null 132 | 摩纳哥 /ns/yi se lie/null 133 | 罗马尼亚/ns/yi se lie/null 134 | 保加利亚/ns/yi se lie/null 135 | 塞尔维亚/ns/yi se lie/null 136 | 马其顿/ns/yi se lie/null 137 | 阿尔巴尼亚/ns/yi se lie/null 138 | 希腊/ns/yi se lie/null 139 | 斯洛文尼亚/ns/yi se lie/null 140 | 克罗地亚/ns/yi se lie/null 141 | 波斯尼亚和墨塞哥维那/ns/yi se lie/null 142 | 意大利/ns/yi se lie/null 143 | 梵蒂冈/ns/yi se lie/null 144 | 圣马力诺/ns/yi se lie/null 145 | 马耳他/ns/yi se lie/null 146 | 西班牙/ns/yi se lie/null 147 | 葡萄牙/ns/yi se lie/null 148 | 安道尔 /ns/yi se lie/null 149 | 埃及/ns/yi se lie/null 150 | 利比亚/ns/yi se lie/null 151 | 苏丹/ns/yi se lie/null 152 | 突尼斯/ns/yi se lie/null 153 | 阿尔及利亚/ns/yi se lie/null 154 | 摩洛哥/ns/yi se lie/null 155 | 亚速尔群岛/ns/yi se lie/null 156 | 马德拉群岛/ns/yi se lie/null 157 | 埃塞俄比亚/ns/yi se lie/null 158 | 厄立特里亚/ns/yi se lie/null 159 | 索马里/ns/yi se lie/null 160 | 吉布提/ns/yi se lie/null 161 | 肯尼亚/ns/yi se lie/null 162 | 坦桑尼亚/ns/yi se lie/null 163 | 乌干达/ns/yi se lie/null 164 | 卢旺达/ns/yi se lie/null 165 | 布隆迪/ns/yi se lie/null 166 | 塞舌尔 刚果/ns/yi se lie/null 167 | 圣多美及普林西比/ns/yi se lie/null 168 | 塞内加尔/ns/yi se lie/null 169 | 冈比亚/ns/yi 
se lie/null 170 | 马里/ns/yi se lie/null 171 | 布基纳法索/ns/yi se lie/null 172 | 几内亚/ns/yi se lie/null 173 | 几内亚比绍/ns/yi se lie/null 174 | 佛得角/ns/yi se lie/null 175 | 塞拉利昂/ns/yi se lie/null 176 | 利比里亚/ns/yi se lie/null 177 | 科特迪瓦/ns/yi se lie/null 178 | 加纳/ns/yi se lie/null 179 | 多哥/ns/yi se lie/null 180 | 贝宁/ns/yi se lie/null 181 | 尼日尔/ns/yi se lie/null 182 | 加那利群岛/ns/yi se lie/null 183 | 赞比亚/ns/yi se lie/null 184 | 安哥拉/ns/yi se lie/null 185 | 津巴布韦/ns/yi se lie/null 186 | 马拉维/ns/yi se lie/null 187 | 莫桑比克/ns/yi se lie/null 188 | 博茨瓦纳/ns/yi se lie/null 189 | 纳米比亚/ns/yi se lie/null 190 | 南非/ns/yi se lie/null 191 | 斯威士兰/ns/yi se lie/null 192 | 莱索托/ns/yi se lie/null 193 | 马达加斯加/ns/yi se lie/null 194 | 科摩罗/ns/yi se lie/null 195 | 毛里求斯/ns/yi se lie/null 196 | 留尼旺/ns/yi se lie/null 197 | 圣赫勒拿 /ns/yi se lie/null 198 | 澳大利亚/ns/yi se lie/null 199 | 新西兰/ns/yi se lie/null 200 | 巴布亚新几内亚/ns/yi se lie/null 201 | 所罗门群岛/ns/yi se lie/null 202 | 瓦努阿图/ns/yi se lie/null 203 | 密克罗尼西亚/ns/yi se lie/null 204 | 马绍尔群岛/ns/yi se lie/null 205 | 帕劳/ns/yi se lie/null 206 | 瑙鲁/ns/yi se lie/null 207 | 基里巴斯/ns/yi se lie/null 208 | 图瓦卢/ns/yi se lie/null 209 | 萨摩亚/ns/yi se lie/null 210 | 斐济群岛/ns/yi se lie/null 211 | 汤加/ns/yi se lie/null 212 | 库克群岛/ns/yi se lie/null 213 | 关岛/ns/yi se lie/null 214 | 新喀里多尼亚/ns/yi se lie/null 215 | 法属波利尼西亚/ns/yi se lie/null 216 | 皮特凯恩岛/ns/yi se lie/null 217 | 瓦利斯与富图纳/ns/yi se lie/null 218 | 纽埃/ns/yi se lie/null 219 | 托克劳/ns/yi se lie/null 220 | 美属萨摩亚/ns/yi se lie/null 221 | 北马里亚纳/ns/yi se lie/null 222 | 加拿大/ns/yi se lie/null 223 | 美国/ns/yi se lie/null 224 | 墨西哥/ns/yi se lie/null 225 | 格陵兰 /ns/yi se lie/null 226 | 危地马拉/ns/yi se lie/null 227 | 伯利兹/ns/yi se lie/null 228 | 萨尔瓦多/ns/yi se lie/null 229 | 洪都拉斯/ns/yi se lie/null 230 | 尼加拉瓜/ns/yi se lie/null 231 | 哥斯达黎加/ns/yi se lie/null 232 | 巴拿马 /ns/yi se lie/null 233 | 巴哈马/ns/yi se lie/null 234 | 古巴/ns/yi se lie/null 235 | 牙买加/ns/yi se lie/null 236 | 海地/ns/yi se lie/null 237 | 多米尼加共和国/ns/yi se lie/null 238 | 安提瓜和巴布达/ns/yi se lie/null 
239 | 圣基茨和尼维斯/ns/yi se lie/null 240 | 多米尼克/ns/yi se lie/null 241 | 圣卢西亚/ns/yi se lie/null 242 | 圣文森特和格林纳丁斯/ns/yi se lie/null 243 | 格林纳达/ns/yi se lie/null 244 | 巴巴多斯/ns/yi se lie/null 245 | 特立尼达和多巴哥/ns/yi se lie/null 246 | 波多黎各/ns/yi se lie/null 247 | 英属维尔京群岛/ns/yi se lie/null 248 | 美属维尔京群岛/ns/yi se lie/null 249 | 安圭拉/ns/yi se lie/null 250 | 蒙特塞拉特/ns/yi se lie/null 251 | 瓜德罗普/ns/yi se lie/null 252 | 马提尼克/ns/yi se lie/null 253 | 荷属安的列斯/ns/yi se lie/null 254 | 阿鲁巴/ns/yi se lie/null 255 | 特克斯和凯科斯群岛/ns/yi se lie/null 256 | 开曼群岛/ns/yi se lie/null 257 | 百慕大 /ns/yi se lie/null 258 | 哥伦比亚/ns/yi se lie/null 259 | 委内瑞拉/ns/yi se lie/null 260 | 圭亚那/ns/yi se lie/null 261 | 法属圭亚那/ns/yi se lie/null 262 | 苏里南 /ns/yi se lie/null 263 | 厄瓜多尔/ns/yi se lie/null 264 | 秘鲁/ns/yi se lie/null 265 | 玻利维亚/ns/yi se lie/null 266 | 巴西/ns/yi se lie/null 267 | 智利/ns/yi se lie/null 268 | 阿根廷/ns/yi se lie/null 269 | 乌拉圭/ns/yi se lie/null 270 | 巴拉圭/ns/yi se lie/null 271 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-net.lex: -------------------------------------------------------------------------------- 1 | CJK_WORDS 2 | 油条哥/n/you tiao ge/null 3 | 活雷锋/n/huo lei feng/null 4 | 夕阳红/n/xi yang hong/null 5 | 帮扶村/n/bang fu cun/null 6 | 后援会/n/hou yuan hui/null 7 | 复炸油/n/fu zha you/null 8 | 献血哥/n/xian xie ge/null 9 | 放心姐/n/fang xin jie/null 10 | 啃老族/n/ken lao zu/null 11 | 特训班/n/te xun ban/null 12 | 平头男/n/ping tou nan/null 13 | 爆头哥/n/bao tou ge/null 14 | 楼主/n/lou zhu/null 15 | 有两把刷子/a/you liang ba shua zi/null 16 | 非典/n/fei dian/null 17 | 微信/n/wei xin/null 18 | 微博/n/wei bo/null 19 | 吊丝/n/diao si/null 20 | 高富帅/n/gao fu shuai/null 21 | 矮穷挫/n/ai qiong cuo/null 22 | 白富美/n/bai fu mei/null 23 | 狮子的魂/nz/shi zi de hun/null 24 | 仓老师/nz/cang lao shi/仓井空 25 | 菇凉/n/gu liang/null 26 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-org.lex: 
-------------------------------------------------------------------------------- 1 | CJK_WORDS 2 | 上海合作组织/nt/shang hai he zuo zu zhi/null 3 | 世卫/nt/shi wei/null 4 | 世界卫生组织/nt/shi jie wei sheng zu zhi/null 5 | 世界银行/nt/shi jie yin hang/null 6 | 东盟/nt/dong meng/null 7 | 亚太经合组织/nt/ya tai jing he zu zhi/null 8 | 人权理事会/nt/ren quan li shi hui/null 9 | 六方会谈/nt/liu fang hui tan/null 10 | 北约/nt/bei yue/null 11 | 哈马斯/nt/ha ma si/null 12 | 安全理事会/nt/an quan li shi hui/null 13 | 安理会/nt/an li hui/null 14 | 欧佩克/nt/ou pei ke/null 15 | 红十字会/nt/hong shi zi hui/null 16 | 联合国/nt/lian he guo/null 17 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-sname.lex: -------------------------------------------------------------------------------- 1 | CN_SNAME 2 | #中文单名词库 3 | 敏 4 | 伟 5 | 勇 6 | 军 7 | 斌 8 | 静 9 | 丽 10 | 涛 11 | 芳 12 | 杰 13 | 萍 14 | 强 15 | 俊 16 | 明 17 | 燕 18 | 磊 19 | 玲 20 | 华 21 | 平 22 | 鹏 23 | 健 24 | 波 25 | 红 26 | 丹 27 | 辉 28 | 超 29 | 艳 30 | 莉 31 | 刚 32 | 娟 33 | 峰 34 | 婷 35 | 亮 36 | 洁 37 | 颖 38 | 琳 39 | 英 40 | 慧 41 | 飞 42 | 霞 43 | 浩 44 | 凯 45 | 宇 46 | 毅 47 | 林 48 | 佳 49 | 云 50 | 莹 51 | 娜 52 | 晶 53 | 洋 54 | 文 55 | 鑫 56 | 欣 57 | 琴 58 | 宁 59 | 琼 60 | 兵 61 | 青 62 | 琦 63 | 翔 64 | 彬 65 | 锋 66 | 阳 67 | 璐 68 | 旭 69 | 蕾 70 | 剑 71 | 虹 72 | 蓉 73 | 建 74 | 倩 75 | 梅 76 | 宏 77 | 威 78 | 博 79 | 君 80 | 力 81 | 龙 82 | 晨 83 | 薇 84 | 雪 85 | 琪 86 | 欢 87 | 荣 88 | 江 89 | 炜 90 | 成 91 | 庆 92 | 冰 93 | 东 94 | 帆 95 | 雷 96 | 楠 97 | 锐 98 | 进 99 | 海 100 | 凡 101 | 巍 102 | 维 103 | 迪 104 | 媛 105 | 玮 106 | 杨 107 | 群 108 | 瑛 109 | 悦 110 | 春 111 | 瑶 112 | 婧 113 | 兰 114 | 茜 115 | 松 116 | 爽 117 | 立 118 | 瑜 119 | 睿 120 | 晖 121 | 聪 122 | 帅 123 | 瑾 124 | 骏 125 | 雯 126 | 晓 127 | 昊 128 | 勤 129 | 新 130 | 瑞 131 | 岩 132 | 星 133 | 忠 134 | 志 135 | 怡 136 | 坤 137 | 康 138 | 航 139 | 利 140 | 畅 141 | 坚 142 | 雄 143 | 智 144 | 萌 145 | 哲 146 | 岚 147 | 洪 148 | 捷 149 | 珊 150 | 恒 151 | 靖 152 | 清 153 | 扬 154 | 昕 155 | 乐 156 | 武 157 | 玉 158 | 诚 159 | 菲 160 | 锦 161 | 凤 162 | 珍 163 | 
晔 164 | 妍 165 | 璇 166 | 胜 167 | 菁 168 | 科 169 | 芬 170 | 露 171 | 越 172 | 彤 173 | 曦 174 | 义 175 | 良 176 | 鸣 177 | 芸 178 | 方 179 | 月 180 | 铭 181 | 光 182 | 震 183 | 冬 184 | 源 185 | 政 186 | 虎 187 | 莎 188 | 彪 189 | 蓓 190 | 钢 191 | 凌 192 | 奇 193 | 卫 194 | 彦 195 | 烨 196 | 可 197 | 黎 198 | 川 199 | 淼 200 | 惠 201 | 祥 202 | 然 203 | 三 204 | 逗 205 | 高 206 | 潇 207 | 正 208 | 硕 209 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-stopword.lex: -------------------------------------------------------------------------------- 1 | STOP_WORDS 2 | #en-punctuation 3 | ! 4 | " 5 | # 6 | $ 7 | % 8 | & 9 | ' 10 | ( 11 | ) 12 | * 13 | + 14 | , 15 | - 16 | . 17 | / 18 | #0 19 | #1 20 | #2 21 | #3 22 | #4 23 | #5 24 | #6 25 | #7 26 | #8 27 | #9 28 | : 29 | ; 30 | < 31 | = 32 | > 33 | ? 34 | @ 35 | [ 36 | \ 37 | ] 38 | ^ 39 | _ 40 | ` 41 | #a 42 | #b 43 | #c 44 | #d 45 | #e 46 | #f 47 | #g 48 | #h 49 | #i 50 | #j 51 | #k 52 | #l 53 | #m 54 | #n 55 | #o 56 | #p 57 | #q 58 | #r 59 | #s 60 | #t 61 | #u 62 | #v 63 | #w 64 | #x 65 | #y 66 | #z 67 | { 68 | | 69 | } 70 | ~ 71 | ! 72 | #fullwidth 73 | ! 74 | " 75 | # 76 | $ 77 | % 78 | & 79 | ' 80 | ( 81 | ) 82 | * 83 | + 84 | , 85 | - 86 | . 87 | / 88 | : 89 | ; 90 | < 91 | = 92 | > 93 | ? 
94 | @ 95 | [ 96 | \ 97 | ] 98 | ^ 99 | _ 100 | ` 101 | { 102 | | 103 | } 104 | ~ 105 | ⦅ 106 | ⦆ 107 | 。 108 | 「 109 | 」 110 | 、 111 | ・ 112 | #cn-punctuation 113 | 、 114 | 。 115 | 〃 116 | 〄 117 | 々 118 | 〆 119 | 〇 120 | 〈 121 | 〉 122 | 《 123 | 》 124 | 「 125 | 」 126 | 『 127 | 』 128 | 【 129 | 】 130 | 〒 131 | 〓 132 | 〔 133 | 〕 134 | 〖 135 | 〗 136 | 〘 137 | 〙 138 | 〚 139 | 〛 140 | 〜 141 | 〝 142 | 〞 143 | 〟 144 | #中文 145 | 的 146 | 吗 147 | 不 148 | 我 149 | 们 150 | 起 151 | 就 152 | 最 153 | 在 154 | 人 155 | 有 156 | 是 157 | 为 158 | 以 159 | 于 160 | 上 161 | 他 162 | 而 163 | 后 164 | 之 165 | 来 166 | 由 167 | 及 168 | 了 169 | 下 170 | 可 171 | 到 172 | 这 173 | 与 174 | 也 175 | 因 176 | 此 177 | 但 178 | 并 179 | 个 180 | 其 181 | 已 182 | 无 183 | 小 184 | 今 185 | 去 186 | 再 187 | 好 188 | 只 189 | 又 190 | 或 191 | 很 192 | 亦 193 | 某 194 | 把 195 | 那 196 | 你 197 | 乃 198 | 它 199 | 吧 200 | 被 201 | 比 202 | 别 203 | 趁 204 | 当 205 | 从 206 | 到 207 | 得 208 | 打 209 | 凡 210 | 儿 211 | 尔 212 | 该 213 | 各 214 | 给 215 | 跟 216 | 和 217 | 何 218 | 还 219 | 即 220 | 几 221 | 既 222 | 看 223 | 据 224 | 距 225 | 靠 226 | 啦 227 | 了 228 | 另 229 | 么 230 | 每 231 | 们 232 | 嘛 233 | 拿 234 | 哪 235 | 那 236 | 您 237 | 凭 238 | 且 239 | 却 240 | 让 241 | 仍 242 | 啥 243 | 如 244 | 若 245 | 使 246 | 谁 247 | 虽 248 | 随 249 | 同 250 | 所 251 | 她 252 | 哇 253 | 嗡 254 | 往 255 | 哪 256 | 些 257 | 向 258 | 沿 259 | 哟 260 | 用 261 | 于 262 | 咱 263 | 则 264 | 怎 265 | 曾 266 | 至 267 | 致 268 | 着 269 | 诸 270 | 自 271 | 啊 272 | #英文 273 | to 274 | can 275 | could 276 | dare 277 | do 278 | did 279 | does 280 | may 281 | might 282 | would 283 | should 284 | must 285 | will 286 | ought 287 | shall 288 | need 289 | is 290 | a 291 | am 292 | are 293 | about 294 | according 295 | after 296 | against 297 | all 298 | almost 299 | also 300 | although 301 | among 302 | an 303 | and 304 | another 305 | any 306 | anything 307 | approximately 308 | as 309 | asked 310 | at 311 | back 312 | because 313 | before 314 | besides 315 | between 316 | both 317 | but 318 | by 319 | call 320 | called 
321 | currently 322 | despite 323 | did 324 | do 325 | dr 326 | during 327 | each 328 | earlier 329 | eight 330 | even 331 | eventually 332 | every 333 | everything 334 | five 335 | for 336 | four 337 | from 338 | he 339 | her 340 | here 341 | his 342 | how 343 | however 344 | i 345 | if 346 | in 347 | indeed 348 | instead 349 | it 350 | its 351 | just 352 | last 353 | like 354 | major 355 | many 356 | may 357 | maybe 358 | meanwhile 359 | more 360 | moreover 361 | most 362 | mr 363 | mrs 364 | ms 365 | much 366 | my 367 | neither 368 | net 369 | never 370 | nevertheless 371 | nine 372 | no 373 | none 374 | not 375 | nothing 376 | now 377 | of 378 | on 379 | once 380 | one 381 | only 382 | or 383 | other 384 | our 385 | over 386 | partly 387 | perhaps 388 | prior 389 | regarding 390 | separately 391 | seven 392 | several 393 | she 394 | should 395 | similarly 396 | since 397 | six 398 | so 399 | some 400 | somehow 401 | still 402 | such 403 | ten 404 | that 405 | the 406 | their 407 | then 408 | there 409 | therefore 410 | these 411 | they 412 | this 413 | those 414 | though 415 | three 416 | to 417 | two 418 | under 419 | unless 420 | unlike 421 | until 422 | volume 423 | we 424 | what 425 | whatever 426 | whats 427 | when 428 | where 429 | which 430 | while 431 | why 432 | with 433 | without 434 | yesterday 435 | yet 436 | you 437 | your 438 | aboard 439 | about 440 | above 441 | according to 442 | across 443 | afore 444 | after 445 | against 446 | agin 447 | along 448 | alongside 449 | amid 450 | amidst 451 | among 452 | amongst 453 | anent 454 | around 455 | as 456 | aslant 457 | astride 458 | at 459 | athwart 460 | bar 461 | because of 462 | before 463 | behind 464 | below 465 | beneath 466 | beside 467 | besides 468 | between 469 | betwixt 470 | beyond 471 | but 472 | by 473 | circa 474 | despite 475 | down 476 | during 477 | due to 478 | ere 479 | except 480 | for 481 | from 482 | in 483 | inside 484 | into 485 | less 486 | like 487 | mid 488 | midst 489 | 
minus 490 | near 491 | next 492 | nigh 493 | nigher 494 | nighest 495 | notwithstanding 496 | of 497 | off 498 | on 499 | on to 500 | onto 501 | out 502 | out of 503 | outside 504 | over 505 | past 506 | pending 507 | per 508 | plus 509 | qua 510 | re 511 | round 512 | sans 513 | save 514 | since 515 | through 516 | throughout 517 | thru 518 | till 519 | to 520 | toward 521 | towards 522 | under 523 | underneath 524 | unlike 525 | until 526 | unto 527 | up 528 | upon 529 | versus 530 | via 531 | vice 532 | with 533 | within 534 | without 535 | he 536 | her 537 | herself 538 | hers 539 | him 540 | himself 541 | his 542 | I 543 | it 544 | its 545 | itself 546 | me 547 | mine 548 | my 549 | myself 550 | ours 551 | she 552 | their 553 | theirs 554 | them 555 | themselves 556 | they 557 | us 558 | we 559 | our 560 | ourselves 561 | you 562 | your 563 | yours 564 | yourselves 565 | yourself 566 | this 567 | that 568 | these 569 | those 570 | a 571 | about 572 | above 573 | across 574 | after 575 | afterwards 576 | again 577 | against 578 | all 579 | almost 580 | alone 581 | along 582 | already 583 | also 584 | although 585 | always 586 | am 587 | among 588 | amongst 589 | amoungst 590 | amount 591 | an 592 | and 593 | another 594 | any 595 | anyhow 596 | anyone 597 | anything 598 | anyway 599 | anywhere 600 | are 601 | around 602 | as 603 | at 604 | back 605 | be 606 | became 607 | because 608 | become 609 | becomes 610 | becoming 611 | been 612 | before 613 | beforehand 614 | behind 615 | being 616 | below 617 | beside 618 | besides 619 | between 620 | beyond 621 | bill 622 | both 623 | bottom 624 | but 625 | by 626 | call 627 | can 628 | cannot 629 | cant 630 | co 631 | computer 632 | con 633 | could 634 | couldnt 635 | cry 636 | de 637 | describe 638 | detail 639 | do 640 | done 641 | down 642 | due 643 | during 644 | each 645 | eg 646 | eight 647 | either 648 | eleven 649 | else 650 | elsewhere 651 | empty 652 | enough 653 | etc 654 | even 655 | ever 656 | every 657 
| everyone 658 | everything 659 | everywhere 660 | except 661 | few 662 | fifteen 663 | fify 664 | fill 665 | find 666 | fire 667 | first 668 | five 669 | for 670 | former 671 | formerly 672 | forty 673 | found 674 | four 675 | from 676 | front 677 | full 678 | further 679 | get 680 | give 681 | go 682 | had 683 | has 684 | hasnt 685 | have 686 | he 687 | hence 688 | her 689 | here 690 | hereafter 691 | hereby 692 | herein 693 | hereupon 694 | hers 695 | herself 696 | him 697 | himself 698 | his 699 | how 700 | however 701 | hundred 702 | i 703 | ie 704 | if 705 | in 706 | inc 707 | indeed 708 | interest 709 | into 710 | is 711 | it 712 | its 713 | itself 714 | keep 715 | last 716 | latter 717 | latterly 718 | least 719 | less 720 | ltd 721 | made 722 | many 723 | may 724 | me 725 | meanwhile 726 | might 727 | mill 728 | mine 729 | more 730 | moreover 731 | most 732 | mostly 733 | move 734 | much 735 | must 736 | my 737 | myself 738 | name 739 | namely 740 | neither 741 | never 742 | nevertheless 743 | next 744 | nine 745 | no 746 | nobody 747 | none 748 | noone 749 | nor 750 | not 751 | nothing 752 | now 753 | nowhere 754 | of 755 | off 756 | often 757 | on 758 | once 759 | one 760 | only 761 | onto 762 | or 763 | other 764 | others 765 | otherwise 766 | our 767 | ours 768 | ourselves 769 | out 770 | over 771 | own 772 | part 773 | per 774 | perhaps 775 | please 776 | put 777 | rather 778 | re 779 | same 780 | see 781 | seem 782 | seemed 783 | seeming 784 | seems 785 | serious 786 | several 787 | she 788 | should 789 | show 790 | side 791 | since 792 | sincere 793 | six 794 | sixty 795 | so 796 | some 797 | somehow 798 | someone 799 | something 800 | sometime 801 | sometimes 802 | somewhere 803 | still 804 | such 805 | take 806 | ten 807 | than 808 | that 809 | the 810 | their 811 | them 812 | themselves 813 | then 814 | thence 815 | there 816 | thereafter 817 | thereby 818 | therefore 819 | therein 820 | thereupon 821 | these 822 | they 823 | thick 824 | thin 825 
| third 826 | this 827 | those 828 | though 829 | three 830 | through 831 | throughout 832 | thru 833 | thus 834 | to 835 | together 836 | too 837 | top 838 | toward 839 | towards 840 | twelve 841 | twenty 842 | two 843 | un 844 | under 845 | until 846 | up 847 | upon 848 | us 849 | very 850 | via 851 | was 852 | we 853 | well 854 | were 855 | what 856 | whatever 857 | when 858 | whence 859 | whenever 860 | where 861 | whereafter 862 | whereas 863 | whereby 864 | wherein 865 | whereupon 866 | wherever 867 | whether 868 | which 869 | while 870 | whither 871 | who 872 | whoever 873 | whole 874 | whom 875 | whose 876 | why 877 | will 878 | with 879 | within 880 | without 881 | would 882 | yet 883 | you 884 | your 885 | yours 886 | yourself 887 | yourselves 888 | #chenxin12 889 | #other number 890 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-touris.lex: -------------------------------------------------------------------------------- 1 | CJK_WORDS 2 | 世博园/ns/shi bo yuan/null 3 | 世博会/ns/shi bo hui/null 4 | 长城/ns/chang cheng/null 5 | 黄山/ns/huang shan/null 6 | 衡山/ns/heng shan/null 7 | 华山/ns/hua shan/null 8 | 泰山/ns/tai shan/null 9 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/lib/lexicon/lex-units.lex: -------------------------------------------------------------------------------- 1 | CJK_UNITS 2 | #中文单字单位词库 3 | #长度 4 | 米 5 | 寸 6 | 尺 7 | 丈 8 | 里 9 | #时间 10 | 年 11 | 月 12 | 日 13 | 时 14 | #分 15 | 秒 16 | #币 17 | 元 18 | 角 19 | #容量 20 | 升 21 | 斗 22 | 石 23 | 瓶 24 | 袋 25 | 盒 26 | #重量 27 | 吨 28 | 克 29 | 斤 30 | 两 31 | 担 32 | #地积 33 | 亩 34 | 顷 35 | #其他 36 | 折 37 | 件 38 | 番 39 | ℃ 40 | ℉ 41 | -------------------------------------------------------------------------------- /WebRoot/WEB-INF/web.xml: -------------------------------------------------------------------------------- 1 | 2 | 6 | 7 | 8 | index.jsp 9 | 10 | 11 | parseArticleById 12 | 
com.chenxb.servlet.ParseArticleById 13 | 14 | 15 | parseArticleById 16 | /parseArticle 17 | 18 | 19 | 20 | 21 | ArticleWithSql 22 | com.chenxb.servlet.ArticleWithSql 23 | 24 | 25 | ArticleWithSql 26 | /articleWithSql 27 | 28 | 29 | 30 | 31 | parseArticleByColumn 32 | com.chenxb.servlet.ColumnArticlesWithSql 33 | 34 | 35 | parseArticleByColumn 36 | /columnWithSql 37 | 38 | 39 | 40 | 41 | moreArticlesWithSql 42 | com.chenxb.servlet.MoreArticlesWithSql 43 | 44 | 45 | moreArticlesWithSql 46 | /columnMore 47 | 48 | 49 | 50 | 51 | parseRotationWithSql 52 | com.chenxb.servlet.RotationWithSql 53 | 54 | 55 | parseRotationWithSql 56 | /rotationWithSql 57 | 58 | 59 | 60 | 61 | 62 | searchArticle 63 | com.chenxb.servlet.SearchArticle 64 | 65 | 66 | searchArticle 67 | /searchArticle 68 | 69 | -------------------------------------------------------------------------------- /WebRoot/css/detail.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | /* p {color:#FF3366;text-indent:2em;line-height:24px;} */ 3 | #article_title { 4 | text-align: center; 5 | width: 100%; 6 | font-size: 20px; 7 | margin: 0 0 10px 0; 8 | } 9 | #article_title h1 { 10 | font-size: 20px; 11 | line-height: 30px; 12 | font-weight: normal; 13 | } 14 | 15 | 16 | #article_detail { 17 | font-size: 14px; 18 | color: #888888; 19 | margin: 10px 0px; 20 | text-align: center; 21 | border-bottom: 1px solid #f0f0f0; 22 | line-height: 25px; 23 | } 24 | #article_detail span{ 25 | margin: 0px 5px; 26 | } 27 | 28 | #article_content p{ 29 | text-indent: 2em; 30 | margin-bottom: 20px; 31 | font-size: 15px; 32 | } 33 | 34 | #article_content a:hover { 35 | text-decoration: underline; 36 | } 37 | #article_content a { 38 | color: #428bca; 39 | } 40 | #original_post { 41 | margin-top: 15px; 42 | border-top: 1px dashed #ccc; 43 | color: #777; 44 | text-align: center; 45 | } 46 | #original_post p{ 47 | margin: 5px 0; 48 | color: #777; 49 | font-size: 12px; 50 | } 51 | /* a{ 52 
| text-decoration: none; 53 | color: #111; 54 | } 55 | a:hover { 56 | color:#BD0800; 57 | } */ 58 | /* a {color:#3E62A6;} */ 59 | img {max-width:310px;display:table-cell;vertical-align:middle;margin-left:1em;} 60 | 61 | img.alignleft {float:left;max-width:120px;margin:0 10px 5px 0;border:1px solid #ccc;background:#fff;padding:2px;} 62 | pre {font-size:9pt;line-height:12pt;font-family:Courier New,Arial;border:1px solid #ddd;border-left:5px solid #6CE26C;background:#f6f6f6;padding:5px;overflow: auto;} 63 | a.tag {font-size:15px;text-decoration:none;background-color:#bbd6f3;border-bottom:2px solid #3E6D8E;border-right:2px solid #7F9FB6;color:#284a7b;margin:2px 2px 2px 0;padding:2px 4px;white-space:nowrap;} -------------------------------------------------------------------------------- /WebRoot/index.jsp: -------------------------------------------------------------------------------- 1 | <%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%> 2 | <% 3 | String path = request.getContextPath(); 4 | String num = request.getQueryString(); //获得新闻详情的num 5 | String basePath = request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort() 6 | + path + "/"; 7 | %> 8 | 9 | 10 | 11 | 12 | 14 | 15 | 16 | My JSP 'index.jsp' starting page 17 | 18 | 19 | 20 | 21 | 22 | 25 | 26 | 27 | 28 | This is my JSP page. 29 |
30 | <% 31 | if (num == null || num.equals("")) { 32 | %> 33 |

请使用index后面跟7000参数,例如?index=7000

34 | <% 35 | } 36 | %> 37 | 38 | 39 | -------------------------------------------------------------------------------- /WebRoot/jsp/detail.jsp: -------------------------------------------------------------------------------- 1 | <%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%> 2 | <%@ page language="java" import="com.chenxb.news.*"%> 3 | <%@ page language="java" import="com.chenxb.model.*"%> 4 | <%@ page language="java" import="com.chenxb.biz.*"%> 5 | <%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%> 6 | <% 7 | String num = request.getParameter("num"); 8 | String original = "http://see.xidian.edu.cn/html/news/" + num + ".html"; 9 | ArticleItem detail = ArticleBiz.parseNewsItem(Integer.parseInt(num)); 10 | request.setAttribute("detail", detail); 11 | request.setAttribute("original", original); 12 | %> 13 | 14 | 15 | 16 | 17 | 19 | <%--显示为可以拨号的连接 --%> 20 | 21 | 22 | ${detail.title } 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 |
31 |

${detail.title }

32 |
33 | 34 |
35 | ${detail.publishDate } 36 | 浏览次数:${detail.readTimes } 37 |
38 | 39 |
${detail.body }
40 | 41 |
42 |

43 | SeeNews已优化原网页方便移动设备查看 44 |

45 |
46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /src/com/chenxb/biz/ArticleBiz.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.biz; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.commons.lang3.ArrayUtils; 6 | import org.jsoup.Jsoup; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | import org.jsoup.select.Elements; 10 | 11 | import com.chenxb.model.ArticleItem; 12 | import com.chenxb.util.Constant; 13 | import com.chenxb.util.HttpTool; 14 | import com.chenxb.util.ImageTool; 15 | import com.chenxb.util.UrlTool; 16 | 17 | /** 18 | * 根据指定的新闻 id 爬取新闻数据 19 | * 将获得的标题、发布时间、内容拼接成 javabean 20 | * @author tomchen 21 | * 22 | */ 23 | public class ArticleBiz { 24 | 25 | // 统计点击次数的 url 26 | private static final String COUNT_BASE_URL = "http://see.xidian.edu.cn/index.php/news/click/id/"; 27 | 28 | private static final String SOURCE_PREFIX = "来源:"; 29 | 30 | /** 31 | * 新闻的 url 格式为 http://see.xidian.edu.cn/html/news/7928.html 32 | * 33 | * @param id 34 | * 某个新闻页面的序号 35 | * @return 爬取该页面上的新闻信息,提取相应的信息,存到新闻bean里。如果没有爬取到新闻返回null 36 | * @throws Exception 37 | */ 38 | public static ArticleItem parseNewsItem(int id) throws Exception { 39 | // 根据后缀的数字,拼接新闻 url 40 | String urlStr = Constant.ARTICLE_BASE_URL + id + ".html"; 41 | 42 | // 利用get请求获取字符串再解析会有小部分乱码 43 | // String htmlStr = HttpTool.doGet(urlStr); 44 | // Document doc = Jsoup.parse(htmlStr); 45 | // try { 46 | Document doc = Jsoup.connect(urlStr).timeout(10000).get(); 47 | // 去掉jsoup对html字符串加的"\n",方便json字符串返回 48 | doc.outputSettings().prettyPrint(false); 49 | 50 | Element articleEle = doc.getElementById("article"); 51 | // 标题 52 | Element titleEle = articleEle.getElementById("article_title"); 53 | String titleStr = titleEle.text(); 54 | 55 | // article_detail包括了 2016-01-15 来源: 浏览次数:177 56 | Element detailEle = articleEle.getElementById("article_detail"); 57 | Elements 
details = detailEle.getElementsByTag("span"); 58 | 59 | // 发布时间 60 | String dateStr = details.get(0).text(); 61 | 62 | // 新闻来源 63 | String sourceStr = details.get(1).text(); 64 | 65 | // 去掉"来源:" 66 | if (SOURCE_PREFIX.equals(sourceStr.trim())) { 67 | sourceStr = "SeeNews"; 68 | } else { 69 | sourceStr = sourceStr.substring(3).trim(); 70 | } 71 | 72 | // 访问这个新闻页面,浏览次数会+1,次数是 JS 渲染的 73 | String jsStr = HttpTool.doGet(COUNT_BASE_URL + id); 74 | int readTimes = Integer.parseInt(jsStr.replaceAll("\\D+", "")); 75 | // 或者使用下面这个正则方法 76 | // String readTimesStr = jsStr.replaceAll("[^0-9]", ""); 77 | 78 | Element contentEle = articleEle.getElementById("article_content"); 79 | // 新闻主体内容 80 | 81 | String contentStr = contentEle.toString(); 82 | 83 | // 如果用 text()方法,新闻主体内容的 html 标签会丢失 84 | // 为了在 Android 上用 WebView 显示 html,用toString() 85 | // String contentStr = contentEle.text(); 86 | Elements images = contentEle.getElementsByTag("img"); 87 | String[] imageUrls = new String[images.size()]; 88 | 89 | // 图片上传到七牛 90 | // 将body中的图片地址替换为七牛的地址 91 | for (int i = 0; i < imageUrls.length; i++) { 92 | String origin = images.get(i).attr("src"); 93 | imageUrls[i] = ImageTool.convertUrl(id, origin); 94 | if (!origin.equals(imageUrls[i])) { 95 | // 只有上传图片到七牛,url 才会变化 96 | // 不相等,才替换为七牛的url 97 | contentStr = contentStr.replace(Constant.SRC_PREFIX + origin, 98 | Constant.SRC_PREFIX + Constant.BUCKET_HOST_NAME + imageUrls[i]); 99 | } 100 | } 101 | 102 | // 处理相对路径 url,不和上面的 image url 冲突 103 | Elements hrefs = contentEle.getElementsByTag("a"); 104 | for (int i = 0; i < hrefs.size(); i++) { 105 | String origin = hrefs.get(i).attr("href"); 106 | if (Constant.DEBUG) { 107 | System.out.println("原始 href=" + origin); 108 | } 109 | String newUrl = UrlTool.dealAttachmentUrl(id, origin); 110 | 111 | // 防止页面的附件 重复出现,替换多次 112 | // 出现这种 113 | // http://see.xidian.edu.cnhttp://see.xidian.edu.cn/uploads/file 114 | if (!origin.equals(newUrl)) { 115 | // 不相等,才替换为新的url 且url未被替换过 116 | contentStr = 
contentStr.replace(Constant.HREF_PREFIX + origin, Constant.HREF_PREFIX + newUrl); 117 | } 118 | } 119 | 120 | return new ArticleItem(id, imageUrls, titleStr, dateStr, readTimes, sourceStr, contentStr); 121 | } 122 | 123 | /** 124 | * 根据 id 得到这条新闻属于哪个栏目 125 | * NOTIFIC = 1;// 校园通知 126 | * BACHELOR = 2;// 本科教学 学士 127 | * MASTER = 3;// 研究生 硕士 128 | * ACADEMIC = 5;// 学术交流 129 | * 选取了电院新闻的部分栏目 130 | * JOB = 8;// 就业招聘 131 | * @param id 132 | * @return 133 | * @throws IOException 134 | */ 135 | public static int getType(int id) throws IOException { 136 | // 根据后缀的数字,拼接新闻 url 137 | String urlStr = Constant.ARTICLE_BASE_URL + id + ".html"; 138 | 139 | Document doc = Jsoup.connect(urlStr).timeout(10000).get(); 140 | Element ele = doc.getElementById("position_guide"); 141 | // href 类似http://see.xidian.edu.cn/html/category/2.html 142 | // 取出最后的数字2作为 type 143 | String href = ele.getElementsByTag("a").get(1).attr("href"); 144 | return Integer.valueOf(href.replaceAll("\\D+", "")); 145 | 146 | } 147 | 148 | } 149 | -------------------------------------------------------------------------------- /src/com/chenxb/biz/ColumnBiz.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.biz; 2 | 3 | import java.io.IOException; 4 | import java.util.regex.Matcher; 5 | import java.util.regex.Pattern; 6 | 7 | import org.jsoup.Jsoup; 8 | import org.jsoup.nodes.Document; 9 | import org.jsoup.nodes.Element; 10 | import org.jsoup.select.Elements; 11 | 12 | import com.chenxb.util.ColumnType; 13 | import com.chenxb.util.UrlTool; 14 | 15 | public class ColumnBiz { 16 | 17 | private static Pattern regexCountPage = Pattern.compile("\\d+/(\\d+)"); 18 | 19 | /** 20 | * 爬取本科教学、研究生、就业招聘等栏目 21 | * 22 | * @param type 23 | * 栏目 24 | * @param currentPage 25 | * 当前页码 26 | * @return 返回新闻的id数组 27 | * @throws IOException 28 | */ 29 | public static int[] parseColumn(int type, int currentPage) throws IOException { 30 | 31 | String columnUrl = 
UrlTool.generateUrl(type, currentPage); 32 | Document doc = Jsoup.connect(columnUrl).timeout(10000).get(); 33 | Elements eles = doc.getElementById("list_area").getElementsByTag("a"); 34 | int[] articleIds = new int[eles.size()]; 35 | for (int i = 0; i < eles.size(); i++) { 36 | String url = eles.get(i).attr("href"); 37 | articleIds[i] = Integer.parseInt(url.replaceAll("\\D+", "")); 38 | } 39 | return articleIds; 40 | } 41 | 42 | /** 43 | * 根据栏目类型获取 本栏目共有几页 44 | * 45 | * @param type 46 | * 栏目类型 47 | * @return 总页数 48 | * @throws CommonException 49 | * @throws IOException 50 | */ 51 | public static int getTotalPage(int type) throws IOException { 52 | // 最新消息栏目特殊,只有1页,没有下一页 53 | if (type == ColumnType.LATEST) 54 | return 1; 55 | String columnUrl = UrlTool.generateUrl(type, 1); 56 | 57 | // String htmlStr = HttpTool.doGet(columnUrl); 58 | 59 | Document doc = Jsoup.connect(columnUrl).timeout(10000).get(); 60 | // 正则匹配 1262 条记录 1/26 页 61 | Element page = doc.getElementById("div_page"); 62 | 63 | Matcher matcher = regexCountPage.matcher(page.text()); 64 | if (matcher.find()) { 65 | return Integer.parseInt(matcher.group(1)); 66 | } else { 67 | // 根据经验值,一个栏目至少有5页 68 | return 5; 69 | } 70 | } 71 | 72 | /** 73 | * 获取某页码的新闻个数 74 | * @param type 75 | * @param indexPage 从第1页开始 76 | * @return 77 | * @throws IOException 78 | */ 79 | public static int countArticles(int type,int indexPage) throws IOException { 80 | return parseColumn(type, indexPage).length; 81 | } 82 | 83 | } -------------------------------------------------------------------------------- /src/com/chenxb/biz/RotationImageBiz.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.biz; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import org.jsoup.Jsoup; 8 | import org.jsoup.nodes.Document; 9 | import org.jsoup.nodes.Element; 10 | import org.jsoup.select.Elements; 11 | 12 | import 
com.chenxb.model.RotationItem; 13 | import com.chenxb.util.Constant; 14 | import com.chenxb.util.ImageTool; 15 | 16 | /** 17 | * 首页轮播图片 18 | * @author tomchen 19 | * 20 | */ 21 | public class RotationImageBiz { 22 | 23 | /** 24 | * 爬取主页的轮播图片 25 | * @throws Exception 26 | * 27 | */ 28 | public static List parseHomeRotaions() throws Exception { 29 | Document doc = Jsoup.connect(Constant.SEE_URL).timeout(10000).get(); 30 | Elements eles = doc.getElementsByClass("rotaion_list").get(0).getElementsByTag("a"); 31 | List rotaions = new ArrayList(eles.size()); 32 | 33 | for (Element e : eles) { 34 | 35 | String articleUrl = e.attr("href"); 36 | 37 | int id = Integer.parseInt(articleUrl.replaceAll("\\D+", "")); 38 | 39 | Element imgEle = e.getElementsByTag("img").get(0); 40 | 41 | String imageUrl = imgEle.attr("src"); 42 | 43 | String[] key = { ImageTool.convertUrl(id, imageUrl) }; 44 | 45 | String title = imgEle.attr("alt"); 46 | 47 | // 该 id 的新闻属于哪个栏目 48 | int type = ArticleBiz.getType(id); 49 | 50 | rotaions.add(new RotationItem(id, key, title, type)); 51 | } 52 | return rotaions; 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/com/chenxb/biz/UploadRandomImage.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.biz; 2 | 3 | import java.io.IOException; 4 | import java.net.HttpURLConnection; 5 | import java.net.URL; 6 | import java.util.Random; 7 | 8 | import com.qiniu.storage.BucketManager; 9 | import com.qiniu.util.Auth; 10 | 11 | public class UploadRandomImage { 12 | 13 | private static final String RANDOM_URL = "https://unsplash.it/640/427/?image="; 14 | private static final String ACCESS_KEY = "***-***"; // 你的access_key 15 | private static final String SECRET_KEY = "***-***"; // 你的secret_key 16 | private static final String BUCKET_NAME = "***-***"; // 你的空间名称 17 | 18 | public static void main(String[] args) throws InterruptedException, IOException { 19 
/**
 * Helper for draining an {@link InputStream} into memory.
 */
public class StreamTool {

    // Size of the chunk copied per read call
    private static final int BUFFER_SIZE = 1024;

    /**
     * Reads the stream to its end and returns every byte that was read.
     * The stream is closed before returning, also when reading fails.
     *
     * @param inputStr
     *            stream to drain; closed by this method
     * @return all bytes of the stream, an empty array for an empty stream
     * @throws Exception
     *             when reading or closing the stream fails
     */
    public static byte[] read(InputStream inputStr) throws Exception {
        try {
            ByteArrayOutputStream outStr = new ByteArrayOutputStream();
            byte[] buffer = new byte[BUFFER_SIZE];
            int len;
            while ((len = inputStr.read(buffer)) != -1) {
                outStr.write(buffer, 0, len);
            }
            return outStr.toByteArray();
        } finally {
            // Release the stream even when read() threw half way through
            inputStr.close();
        }
    }

}
-------------------------------------------------------------------------------- /src/com/chenxb/dao/ArticleDao.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.dao; 2 | 3 | import java.sql.Connection; 4 | import java.sql.Date; 5 | import java.sql.PreparedStatement; 6 | import java.sql.ResultSet; 7 | import java.sql.SQLException; 8 | import java.util.Arrays; 9 | 10 | import org.apache.commons.lang3.ArrayUtils; 11 | 12 | import com.chenxb.model.ArticleItem; 13 | import com.chenxb.util.Constant; 14 | import com.chenxb.util.MysqlTool; 15 | import com.chenxb.util.TableName; 16 | 17 | /** 18 | * 插入新闻纪录到 mysql 19 | * 从 mysql 获取某条新闻 20 | * @author tomchen 21 | * 22 | */ 23 | public class ArticleDao { 24 | private Connection connection; 25 | 26 | public ArticleDao() throws Exception { 27 | connection = new MysqlTool().getConnection(); 28 | } 29 | 30 | /** 31 | * 根据 type 找到数据库表名称 32 | * 再从该表里找出 id 对应的新闻 33 | * @param type 34 | * @param id 35 | * @return 36 | * @throws SQLException 37 | */ 38 | public ArticleItem getArticleByTypeId(int type, int id) throws SQLException { 39 | // 根据 type 找出对应的 table 名称 40 | String tableName = TableName.getTableByType(type); 41 | 42 | // the mysql select statement 43 | String query = "select * from " + tableName + " where id = ?"; 44 | 45 | // create the mysql preparedstatement 46 | PreparedStatement preparedStmt = connection.prepareStatement(query); 47 | preparedStmt.setInt(1, id); 48 | 49 | ResultSet rs = preparedStmt.executeQuery(); 50 | while (rs.next()) { 51 | String[] imageUrls = {}; 52 | String urls = rs.getString(2); 53 | // split 最少也是返回一个元素 [] 返回 [""s] 54 | if (!urls.equals("[]")) { 55 | imageUrls = urls.replace("[", "").replace("]", "").split(", "); 56 | for (String url : Constant.USELESS_IMAGE_URL) { 57 | // 删除所有出现的元素 58 | imageUrls = ArrayUtils.removeAllOccurences(imageUrls, url); 59 | } 60 | } 61 | String title = rs.getString(3); 62 | String date = 
rs.getDate(4).toString(); 63 | int readTimes = rs.getInt(5); 64 | String source = rs.getString(6); 65 | String body = rs.getString(7); 66 | ArticleItem article = new ArticleItem(id, imageUrls, title, date, readTimes, source, body); 67 | return article; 68 | } 69 | return null; 70 | } 71 | 72 | /** 73 | * 将记录插入到数据库中 74 | * @param tableNmae 数据库名称,从 TableName 类中选取 75 | * @param article 76 | * @return 77 | * @throws SQLException 78 | */ 79 | public int insertArticle(String tableName, ArticleItem article) throws SQLException { 80 | // the mysql insert statement 81 | String query = " insert into " + tableName + " (id, image_urls, title, publish_date, read_times,source,body)" 82 | + " values (?, ?, ?, ?, ?,?,?)"; 83 | 84 | // create the mysql insert preparedstatement 85 | PreparedStatement preparedStmt = connection.prepareStatement(query); 86 | preparedStmt.setInt(1, article.getId()); 87 | preparedStmt.setString(2, Arrays.toString(article.getImageUrls())); 88 | preparedStmt.setString(3, article.getTitle()); 89 | preparedStmt.setDate(4, Date.valueOf(article.getPublishDate())); 90 | preparedStmt.setInt(5, article.getReadTimes()); 91 | preparedStmt.setString(6, article.getSource()); 92 | preparedStmt.setString(7, article.getBody()); 93 | return preparedStmt.executeUpdate(); 94 | } 95 | 96 | public Connection getConnection() { 97 | return connection; 98 | } 99 | 100 | public void setConnection(Connection connection) { 101 | this.connection = connection; 102 | } 103 | 104 | } 105 | -------------------------------------------------------------------------------- /src/com/chenxb/dao/ColumnDao.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.dao; 2 | 3 | import static com.chenxb.util.Constant.DEBUG; 4 | 5 | import java.sql.Connection; 6 | import java.sql.Date; 7 | import java.sql.PreparedStatement; 8 | import java.sql.ResultSet; 9 | import java.sql.SQLException; 10 | import java.util.ArrayList; 11 | import 
java.util.Arrays; 12 | import java.util.List; 13 | 14 | import org.apache.commons.lang3.ArrayUtils; 15 | import org.apache.commons.lang3.StringUtils; 16 | 17 | import com.chenxb.biz.ArticleBiz; 18 | import com.chenxb.biz.ColumnBiz; 19 | import com.chenxb.model.ArticleItem; 20 | import com.chenxb.model.SimpleArticleItem; 21 | import com.chenxb.util.Constant; 22 | import com.chenxb.util.MysqlTool; 23 | import com.chenxb.util.TableName; 24 | import com.chenxb.util.TimeTool; 25 | 26 | /** 27 | * 获取某个栏目 多页新闻记录,插入到 Mysql 中 28 | * 为了简便,只获取前20页新闻 29 | * @author tomchen 30 | * 31 | */ 32 | public class ColumnDao { 33 | private static final int MAX_COLUMN_NUM = 30; 34 | private Connection connection; 35 | 36 | public ColumnDao() throws Exception { 37 | connection = new MysqlTool().getConnection(); 38 | } 39 | 40 | /** 41 | * @throws Exception 42 | * 43 | */ 44 | public void initArticles(int type) throws Exception { 45 | int total = ColumnBiz.getTotalPage(type); 46 | // 该栏目的页数若过多,只爬取20页 47 | if (total > MAX_COLUMN_NUM) { 48 | total = MAX_COLUMN_NUM; 49 | } 50 | 51 | String tableName = TableName.getTableByType(type); 52 | 53 | // 注意从第1页开始 54 | for (int i = 1; i <= total; i++) { 55 | int[] ids = ColumnBiz.parseColumn(type, i); 56 | for (int id : ids) { 57 | ArticleItem article = ArticleBiz.parseNewsItem(id); 58 | if (DEBUG) { 59 | System.out.println(TimeTool.getCurrentTime() + " insert " + id + " " + article.getTitle() + " into " 60 | + tableName); 61 | } 62 | insertArticle(tableName, article); 63 | // 等待时间,避免对被爬取的网站负载过大 64 | TimeTool.sleepSomeTime(); 65 | } 66 | } 67 | } 68 | 69 | /** 70 | * @throws Exception 71 | * 只爬取新闻 为了图片上传失败 重新上传 72 | */ 73 | public static void justParseArticles(int type) throws Exception { 74 | int total = ColumnBiz.getTotalPage(type); 75 | // 该栏目的页数若过多,只爬取20页 76 | if (total > MAX_COLUMN_NUM) { 77 | total = MAX_COLUMN_NUM; 78 | } 79 | 80 | String tableName = TableName.getTableByType(type); 81 | 82 | // 注意从第1页开始 83 | for (int i = 1; i <= total; i++) { 84 
| int[] ids = ColumnBiz.parseColumn(type, i); 85 | for (int id : ids) { 86 | ArticleItem article = ArticleBiz.parseNewsItem(id); 87 | if (DEBUG) { 88 | System.out.println(TimeTool.getCurrentTime() + " insert " + id + " " + article.getTitle() + " into " 89 | + tableName); 90 | } 91 | // 等待时间,避免对被爬取的网站负载过大 92 | TimeTool.sleepSomeTime(); 93 | } 94 | 95 | } 96 | } 97 | 98 | /** 99 | * 重新抓取新闻,比较当前最小的 id 100 | * 把最小的 id 之前的新闻插入 mysql 101 | * @throws Exception 102 | * 103 | */ 104 | public void reInitArticles(int type) throws Exception { 105 | int total = ColumnBiz.getTotalPage(type); 106 | // 该栏目的页数若过多,只爬取20页 107 | if (total > MAX_COLUMN_NUM) { 108 | total = MAX_COLUMN_NUM; 109 | } 110 | 111 | int minId = getMinId(type); 112 | 113 | if (Constant.DEBUG) { 114 | System.out.println(TimeTool.getCurrentTime() + " get minId " + minId); 115 | } 116 | 117 | String tableName = TableName.getTableByType(type); 118 | 119 | boolean find = false; 120 | 121 | // 注意从第1页开始 122 | for (int i = 1; i <= total; i++) { 123 | int[] ids = ColumnBiz.parseColumn(type, i); 124 | 125 | if (!find) { 126 | // 数组最后一个数,是否在 mysql 127 | boolean isExist = isIdExist(type, ids[ids.length - 1]); 128 | 129 | // 如果最后一个数在 mysql 中,该页所有记录都已爬取 130 | if (isExist) { 131 | continue; 132 | } else { 133 | // 爬取到这一页中断 134 | int pre = 0; 135 | for (; pre < ids.length; pre++) { 136 | // 如果记录存在,继续向下寻找 137 | if (isIdExist(type, ids[pre])) { 138 | continue; 139 | } else { 140 | if (DEBUG) { 141 | // pre 不存在,pre-1存在 142 | System.out.println(TimeTool.getCurrentTime() + " " + ids[pre] + " 第一个不存在 in 第 " + (i) 143 | + " 页,第 " + (pre + 1) + " 个"); 144 | } 145 | // 不存在,跳槽循环 146 | break; 147 | } 148 | } 149 | 150 | for (int re = pre; re < ids.length; re++) { 151 | ArticleItem article = ArticleBiz.parseNewsItem(ids[re]); 152 | if (DEBUG) { 153 | System.out.println(TimeTool.getCurrentTime() + " insert " + ids[re] + " " 154 | + article.getTitle() + " into " + tableName); 155 | } 156 | insertArticle(tableName, article); 157 | // 
等待时间,避免对被爬取的网站负载过大 158 | TimeTool.sleepSomeTime(); 159 | } 160 | } 161 | } else { 162 | // 已经找到了上次的断点 id,把往后页面全部新闻插入 mysql 163 | if (find) { 164 | for (int id : ids) { 165 | ArticleItem article = ArticleBiz.parseNewsItem(id); 166 | insertArticle(tableName, article); 167 | if (DEBUG) { 168 | System.out.println(TimeTool.getCurrentTime() + " insert " + id + " " + article.getTitle() 169 | + " into " + tableName); 170 | } 171 | // 等待时间,避免对被爬取的网站负载过大 172 | TimeTool.sleepSomeTime(); 173 | } 174 | } 175 | } 176 | } 177 | 178 | } 179 | 180 | /** 181 | * 对比网站和Mysql 182 | * 新增数据到 Mysql 183 | * @param type 184 | * @throws Exception 185 | */ 186 | public void addArticles(int type) throws Exception { 187 | String tableName = TableName.getTableByType(type); 188 | 189 | int topId = getMaxId(type); 190 | 191 | int currentPage = 1; 192 | 193 | while (currentPage < MAX_COLUMN_NUM) { 194 | int[] ids = ColumnBiz.parseColumn(type, currentPage); 195 | 196 | int index = ArrayUtils.indexOf(ids, topId); 197 | 198 | // 如果当前数据库最新记录 == 网站最新记录 199 | if (index == 0) { 200 | return; 201 | } 202 | 203 | // 网站当前页包含 mysql 里的最新id 204 | // 把更新的记录插入到 mysql 中 205 | if (index > 0) { 206 | for (int i = 0; i < index; i++) { 207 | ArticleItem article = ArticleBiz.parseNewsItem(ids[i]); 208 | if (DEBUG) { 209 | System.out.println("insert " + ids[i] + " " + article.getTitle() + " into mysql"); 210 | } 211 | insertArticle(tableName, article); 212 | // 等待时间,避免对被爬取的网站负载过大 213 | TimeTool.sleepSomeTime(); 214 | } 215 | return; 216 | } 217 | 218 | // 最新的 id 不在当前页里 219 | // 需要全部更新数据 220 | if (index < 0) { 221 | for (int id : ids) { 222 | ArticleItem article = ArticleBiz.parseNewsItem(id); 223 | if (DEBUG) { 224 | System.out.println("insert " + id + " " + article.getTitle() + " into mysql"); 225 | } 226 | insertArticle(tableName, article); 227 | // 等待时间,避免对被爬取的网站负载过大 228 | TimeTool.sleepSomeTime(); 229 | } 230 | currentPage++; 231 | } 232 | } 233 | } 234 | 235 | /** 236 | * 返回某个表 最新的Constant.EACH_AMOUNT条新闻 237 | * 只是 
listview 展示 238 | * 分页展示,需要 type 和偏移id 239 | * 修改了数据库中图片数组 240 | * 对于附件图标、doc 图标等不返回给手机端 241 | * @param type 242 | * @param threshold 243 | * @return 244 | * @throws SQLException 245 | */ 246 | public List getTopSimpleArticles(int type, int offset) throws SQLException { 247 | String tableName = TableName.getTableByType(type); 248 | 249 | String selectColumns = "select id,image_urls,title,publish_date,read_times,summary from " + tableName; 250 | String limitCount = " order by id desc limit " + Constant.EACH_AMOUNT; 251 | 252 | // 这儿两个 sql 语句要同步修改 253 | String query = selectColumns + " where id < ? " + limitCount; 254 | 255 | PreparedStatement preparedStmt = connection.prepareStatement(query); 256 | 257 | preparedStmt.setInt(1, offset); 258 | 259 | // 如果是首页 这是很少的情况 260 | if (offset == -1) { 261 | query = selectColumns + limitCount; 262 | preparedStmt = connection.prepareStatement(query); 263 | } 264 | 265 | ResultSet rs = preparedStmt.executeQuery(); 266 | 267 | List articles = new ArrayList(Constant.EACH_AMOUNT); 268 | 269 | while (rs.next()) { 270 | int id = rs.getInt(1); 271 | String[] imageUrls = {}; 272 | String urls = rs.getString(2); 273 | // split 最少也是返回一个元素 [] 返回 [""s] 274 | if (!urls.equals("[]")) { 275 | imageUrls = urls.replace("[", "").replace("]", "").split(", "); 276 | for (String url : Constant.USELESS_IMAGE_URL) { 277 | // 删除所有出现的元素 278 | imageUrls = ArrayUtils.removeAllOccurences(imageUrls, url); 279 | } 280 | } 281 | String title = rs.getString(3); 282 | String date = rs.getDate(4).toString(); 283 | int readTimes = rs.getInt(5); 284 | String summary = rs.getString(6); 285 | SimpleArticleItem article = new SimpleArticleItem(id, imageUrls, title, date, readTimes, summary); 286 | articles.add(article); 287 | } 288 | return articles; 289 | } 290 | 291 | /** 292 | * 返回某个表 最新的Constant.EACH_AMOUNT条新闻 293 | * 只是 listview 展示 294 | * 分页展示,需要 type 295 | * 大于某个给定的id 296 | * 修改了数据库中图片数组 297 | * 对于附件图标、doc 图标等不返回给手机端 298 | * @param type 299 | * @param threshold 
300 | * @return 301 | * @throws SQLException 302 | */ 303 | public List moreArticles(int type, int morethan) throws SQLException { 304 | String tableName = TableName.getTableByType(type); 305 | 306 | String selectColumns = "select id,image_urls,title,publish_date,read_times,summary from " + tableName; 307 | String limitCount = " order by id desc limit " + Constant.EACH_AMOUNT; 308 | 309 | // 这儿两个 sql 语句要同步修改 310 | String query = selectColumns + " where id > ? " + limitCount; 311 | 312 | PreparedStatement preparedStmt = connection.prepareStatement(query); 313 | 314 | preparedStmt.setInt(1, morethan); 315 | 316 | // 如果是首页 这是很少的情况 317 | if (morethan == -1) { 318 | query = selectColumns + limitCount; 319 | preparedStmt = connection.prepareStatement(query); 320 | } 321 | 322 | ResultSet rs = preparedStmt.executeQuery(); 323 | 324 | List articles = new ArrayList(Constant.EACH_AMOUNT); 325 | 326 | while (rs.next()) { 327 | int id = rs.getInt(1); 328 | String[] imageUrls = {}; 329 | String urls = rs.getString(2); 330 | // split 最少也是返回一个元素 [] 返回 [""s] 331 | if (!urls.equals("[]")) { 332 | imageUrls = urls.replace("[", "").replace("]", "").split(", "); 333 | for (String url : Constant.USELESS_IMAGE_URL) { 334 | // 删除所有出现的元素 335 | imageUrls = ArrayUtils.removeAllOccurences(imageUrls, url); 336 | } 337 | } 338 | String title = rs.getString(3); 339 | String date = rs.getDate(4).toString(); 340 | int readTimes = rs.getInt(5); 341 | String summary = rs.getString(6); 342 | SimpleArticleItem article = new SimpleArticleItem(id, imageUrls, title, date, readTimes, summary); 343 | articles.add(article); 344 | } 345 | return articles; 346 | } 347 | 348 | public int insertArticle(String tableName, ArticleItem article) throws SQLException { 349 | // the mysql insert statement 350 | String query = " insert ignore into " + tableName 351 | + " (id, image_urls, title, publish_date, read_times,source,body)" + " values (?, ?, ?, ?, ?,?,?)"; 352 | 353 | // create the mysql insert 
preparedstatement 354 | PreparedStatement preparedStmt = connection.prepareStatement(query); 355 | preparedStmt.setInt(1, article.getId()); 356 | preparedStmt.setString(2, Arrays.toString(article.getImageUrls())); 357 | preparedStmt.setString(3, article.getTitle()); 358 | preparedStmt.setDate(4, Date.valueOf(article.getPublishDate())); 359 | preparedStmt.setInt(5, article.getReadTimes()); 360 | preparedStmt.setString(6, article.getSource()); 361 | preparedStmt.setString(7, article.getBody()); 362 | return preparedStmt.executeUpdate(); 363 | } 364 | 365 | /** 366 | * 判断某个栏目对应的table是否空 367 | * @param type 368 | * @return 369 | * @throws SQLException 370 | */ 371 | public boolean isTableEmpty(int type) throws SQLException { 372 | String tableName = TableName.getTableByType(type); 373 | 374 | String query = "select count(*) from " + tableName; 375 | 376 | PreparedStatement preparedStmt = connection.prepareStatement(query); 377 | 378 | ResultSet rs = preparedStmt.executeQuery(); 379 | 380 | if (rs.next()) { 381 | if (rs.getInt(1) > 0) { 382 | return false; 383 | } 384 | } 385 | return true; 386 | } 387 | 388 | /** 389 | * 获取某个表中的最小 id,也就是最新的新闻 id 390 | * @param type 391 | * @return 392 | * @throws SQLException 393 | */ 394 | public int getMinId(int type) throws SQLException { 395 | 396 | String tableName = TableName.getTableByType(type); 397 | // 取出最新的新闻 id 表名不能用PreparedStatement 398 | String query = "select min(id) from " + tableName; 399 | // create the mysql preparedstatement 400 | PreparedStatement preparedStmt = connection.prepareStatement(query); 401 | 402 | ResultSet rs = preparedStmt.executeQuery(); 403 | // 空记录 null 会返回 0 404 | if (rs.next()) { 405 | return rs.getInt(1); 406 | } 407 | // 如果数据库没最大的 id,返回 -1 408 | return -1; 409 | } 410 | 411 | /** 412 | * 获取某个表中的最大 id,也就是最新的新闻 id 413 | * @param type 414 | * @return 415 | * @throws SQLException 416 | */ 417 | public int getMaxId(int type) throws SQLException { 418 | 419 | String tableName = 
TableName.getTableByType(type); 420 | // 取出最新的新闻 id 表名不能用PreparedStatement 421 | String query = "select max(id) from " + tableName; 422 | // create the mysql preparedstatement 423 | PreparedStatement preparedStmt = connection.prepareStatement(query); 424 | 425 | ResultSet rs = preparedStmt.executeQuery(); 426 | // 空记录 null 会返回 0 427 | if (rs.next()) { 428 | return rs.getInt(1); 429 | } 430 | // 如果数据库没最大的 id,返回 -1 431 | return -1; 432 | } 433 | 434 | /** 435 | * 判断某个记录是否存在 436 | * @param type 437 | * @return 438 | * @throws SQLException 439 | */ 440 | public boolean isIdExist(int type, int id) throws SQLException { 441 | 442 | String tableName = TableName.getTableByType(type); 443 | // 取出最新的新闻 id 表名不能用PreparedStatement 444 | 445 | String query = "select exists(select 1 from " + tableName + " where id =?)"; 446 | // create the mysql preparedstatement 447 | PreparedStatement preparedStmt = connection.prepareStatement(query); 448 | preparedStmt.setInt(1, id); 449 | ResultSet rs = preparedStmt.executeQuery(); 450 | // 空记录 null 会返回 0 451 | if (rs.next()) { 452 | if (rs.getInt(1) == 1) 453 | return true; 454 | else 455 | return false; 456 | } 457 | // 如果数据库没最大的 id,返回 -1 458 | return false; 459 | } 460 | } -------------------------------------------------------------------------------- /src/com/chenxb/dao/RotationImageDao.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.dao; 2 | 3 | import static com.chenxb.util.Constant.DEBUG; 4 | 5 | import java.sql.Connection; 6 | import java.sql.Date; 7 | import java.sql.PreparedStatement; 8 | import java.sql.ResultSet; 9 | import java.sql.SQLException; 10 | import java.util.ArrayList; 11 | import java.util.Arrays; 12 | import java.util.List; 13 | 14 | import org.apache.commons.lang3.ArrayUtils; 15 | 16 | import com.chenxb.biz.ArticleBiz; 17 | import com.chenxb.biz.ColumnBiz; 18 | import com.chenxb.biz.RotationImageBiz; 19 | import com.chenxb.model.ArticleItem; 20 | import 
com.chenxb.model.RotationItem; 21 | import com.chenxb.util.Constant; 22 | import com.chenxb.util.MysqlTool; 23 | import com.chenxb.util.TableName; 24 | import com.chenxb.util.TimeTool; 25 | 26 | /** 27 | * 将首页轮播图片新闻,插入到 Mysql 中 28 | * @author tomchen 29 | * 30 | */ 31 | public class RotationImageDao { 32 | private Connection connection; 33 | private static final String TABLE_RATATION = "rotation"; 34 | 35 | public RotationImageDao() throws Exception { 36 | connection = new MysqlTool().getConnection(); 37 | } 38 | 39 | /** 40 | * @throws Exception 41 | * 42 | */ 43 | public void initRotations() throws Exception { 44 | 45 | List rotations = RotationImageBiz.parseHomeRotaions(); 46 | 47 | if (rotations == null || rotations.isEmpty()) 48 | return; 49 | 50 | for (RotationItem rotation : rotations) { 51 | if (DEBUG) { 52 | System.out.println(TimeTool.getCurrentTime() + " insert " + rotation.getId() + " " + rotation.getTitle() 53 | + " type " + rotation.getType() + " into " + TABLE_RATATION); 54 | } 55 | insertRotationItem(rotation); 56 | // 等待时间,避免对被爬取的网站负载过大 57 | TimeTool.sleepSomeTime(); 58 | } 59 | } 60 | 61 | /** 62 | * 从数据库中获取 多条 轮播图片记录 63 | * @param type 64 | * @return 65 | * @throws SQLException 66 | */ 67 | public List getTopRotations() throws SQLException { 68 | 69 | String query = "select * from " + TABLE_RATATION + " order by id desc limit " + Constant.ROTATION_AMOUNT; 70 | 71 | PreparedStatement preparedStmt = connection.prepareStatement(query); 72 | 73 | ResultSet rs = preparedStmt.executeQuery(); 74 | 75 | List rotations = new ArrayList(Constant.ROTATION_AMOUNT); 76 | 77 | while (rs.next()) { 78 | int id = rs.getInt(1); 79 | String[] imageUrl = { rs.getString(2).replace("[", "").replace("]", "") }; 80 | 81 | String title = rs.getString(3); 82 | int type = rs.getInt(4); 83 | rotations.add(new RotationItem(id, imageUrl, title, type)); 84 | } 85 | return rotations; 86 | } 87 | 88 | public int insertRotationItem(RotationItem rotation) throws SQLException { 89 | 
// the mysql insert statement 90 | // 根据 type 找到某条新闻属于的栏目 91 | String query = " insert ignore into " + TABLE_RATATION + " (id, image_url, title, type)" 92 | + " values (?, ?, ?, ?)"; 93 | 94 | // create the mysql insert preparedstatement 95 | PreparedStatement preparedStmt = connection.prepareStatement(query); 96 | preparedStmt.setInt(1, rotation.getId()); 97 | preparedStmt.setString(2, Arrays.toString(rotation.getImageUrl())); 98 | preparedStmt.setString(3, rotation.getTitle()); 99 | preparedStmt.setInt(4, rotation.getType()); 100 | return preparedStmt.executeUpdate(); 101 | } 102 | 103 | } -------------------------------------------------------------------------------- /src/com/chenxb/dao/SearchDao.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.dao; 2 | 3 | import java.nio.file.FileSystems; 4 | import java.sql.Connection; 5 | import java.sql.DriverManager; 6 | import java.sql.ResultSet; 7 | import java.sql.Statement; 8 | 9 | import org.apache.lucene.analysis.Analyzer; 10 | import org.apache.lucene.document.Document; 11 | import org.apache.lucene.document.Field; 12 | import org.apache.lucene.document.FieldType; 13 | import org.apache.lucene.document.IntField; 14 | import org.apache.lucene.document.StringField; 15 | import org.apache.lucene.document.TextField; 16 | import org.apache.lucene.index.DirectoryReader; 17 | import org.apache.lucene.index.IndexReader; 18 | import org.apache.lucene.index.IndexWriter; 19 | import org.apache.lucene.index.IndexWriterConfig; 20 | import org.apache.lucene.index.IndexWriterConfig.OpenMode; 21 | import org.apache.lucene.queryparser.classic.MultiFieldQueryParser; 22 | import org.apache.lucene.search.IndexSearcher; 23 | import org.apache.lucene.search.Query; 24 | import org.apache.lucene.search.Sort; 25 | import org.apache.lucene.search.SortField; 26 | import org.apache.lucene.search.SortField.Type; 27 | import org.apache.lucene.search.TopDocs; 28 | import 
org.apache.lucene.store.Directory; 29 | import org.apache.lucene.store.FSDirectory; 30 | import org.lionsoul.jcseg.analyzer.v5x.JcsegAnalyzer5X; 31 | import org.lionsoul.jcseg.tokenizer.core.JcsegTaskConfig; 32 | 33 | /** 34 | * 利用 Lucene 搜索 mysql 里的记录 35 | * 全文搜索 36 | * 37 | */ 38 | public class SearchDao { 39 | /** 40 | * this is index directory path where all index file will be stored which lucene uses internally. 41 | */ 42 | 43 | /** 44 | * to create index on simple database table 45 | */ 46 | public void createIndex() { 47 | 48 | System.out.println("-- Indexing --"); 49 | 50 | try { 51 | /** JDBC Section */ 52 | Class.forName("com.mysql.jdbc.Driver").newInstance(); 53 | 54 | // 后面unicode和utf8设置防止中文乱码 55 | String url = "jdbc:mysql://127.0.0.1:3306/see_news?useSSL=false&useUnicode=true&characterEncoding=utf-8"; 56 | String name = "root"; 57 | String password = "chenxb123"; 58 | 59 | Connection conn = DriverManager.getConnection(url, name, password); 60 | 61 | Statement stmt = conn.createStatement(); 62 | String sql = "select * from bachelor order by id desc limit 1000"; 63 | ResultSet rs = stmt.executeQuery(sql); 64 | 65 | /** defining Analyzer */ 66 | 67 | // 1. create the index 68 | Directory directory = FSDirectory.open(FileSystems.getDefault().getPath("./index2222")); 69 | 70 | // 创建标准文本分析器, 标准的是可以支持的中文的 71 | 72 | // StandardAnalyzer luceneAnalyzer = new StandardAnalyzer(); 73 | 74 | Analyzer luceneAnalyzer = new JcsegAnalyzer5X(JcsegTaskConfig.COMPLEX_MODE); 75 | 76 | /** preparing config for indexWriter */ 77 | IndexWriterConfig writerConfig = new IndexWriterConfig(luceneAnalyzer); 78 | 79 | /** Create a new index in the directory, removing any previously indexed documents */ 80 | writerConfig.setOpenMode(OpenMode.CREATE); 81 | /** 82 | * Optional: for better indexing performance, if you are indexing many documents,
83 | * increase the RAM buffer. But if you do this, increase the max heap size to the JVM (eg add -Xmx512m or -Xmx1g): 84 | */ 85 | // writerConfig.setRAMBufferSizeMB(256.0); 86 | 87 | IndexWriter iWriter = new IndexWriter(directory, writerConfig); 88 | 89 | int count = 0; 90 | Document doc = null; 91 | Field field = null; 92 | 93 | /** declaring string type */ 94 | FieldType stringType = new FieldType(); 95 | stringType.setTokenized(true); 96 | 97 | /** Looping through resultset and adding data to index file */ 98 | while (rs.next()) { 99 | doc = new Document(); 100 | 101 | /** adding id in document */ 102 | field = new IntField("id", rs.getInt("id"), Field.Store.YES); 103 | doc.add(field); 104 | 105 | /** adding name in document */ 106 | field = new TextField("title", rs.getString("title"), Field.Store.YES); 107 | doc.add(field); 108 | 109 | /** adding details in document */ 110 | field = new TextField("body", rs.getString("body"), Field.Store.YES); 111 | doc.add(field); 112 | 113 | /** Adding doc to iWriter */ 114 | iWriter.addDocument(doc); 115 | count++; 116 | } 117 | 118 | System.out.println(count + " record indexed"); 119 | 120 | /** Closing iWriter */ 121 | iWriter.commit(); 122 | iWriter.close(); 123 | 124 | /** Closing JDBC connection */ 125 | rs.close(); 126 | stmt.close(); 127 | conn.close(); 128 | 129 | } catch (Exception e) { 130 | e.printStackTrace(); 131 | } 132 | 133 | } 134 | 135 | /** 136 | * to search the keywords 137 | * 138 | * @param keyword 139 | */ 140 | public void search(String keyword) { 141 | 142 | System.out.println("-- Seaching --"); 143 | 144 | try { 145 | /** Searching */ 146 | Directory index = FSDirectory.open(FileSystems.getDefault().getPath("./index2222")); 147 | 148 | IndexReader directoryReader = DirectoryReader.open(index); 149 | 150 | // IndexReader directoryReader = DirectoryReader 151 | // .open(FSDirectory.open(FileSystems.getDefault().getPath("./index2222"))); 152 | 153 | IndexSearcher searcher = new 
IndexSearcher(directoryReader); 154 | // StandardAnalyzer keywordAnalyzer = new StandardAnalyzer(); 155 | Analyzer luceneAnalyzer = new JcsegAnalyzer5X(JcsegTaskConfig.COMPLEX_MODE); 156 | 157 | /** MultiFieldQueryParser is used to search multiple fields */ 158 | String[] filesToSearch = { "id", "title", "body" }; 159 | MultiFieldQueryParser mqp = new MultiFieldQueryParser(filesToSearch, luceneAnalyzer); 160 | 161 | /** search the given keyword */ 162 | Query query = mqp.parse(keyword); 163 | System.out.println("query >> " + query); 164 | // 165 | // /** defining the sorting on filed "name" */ 166 | Sort nameSort = new Sort(new SortField("id", Type.STRING)); 167 | 168 | /** run the query */ 169 | TopDocs hits = searcher.search(query, 1000); 170 | System.out.println("Results found >> " + hits.totalHits); 171 | 172 | Document doc = null; 173 | for (int i = 0; i < hits.totalHits; i++) { 174 | /** get the next document */ 175 | doc = searcher.doc(hits.scoreDocs[i].doc); 176 | System.out.println("==========" + (i + 1) + " : Start Record=========\nId :: " + doc.get("id") 177 | + "\ntitle :: " + doc.get("title") + "\n==========End Record=========\n"); 178 | } 179 | } catch (Exception e) { 180 | e.printStackTrace(); 181 | } 182 | 183 | } 184 | 185 | /** 186 | * main method to check the output 187 | * 188 | * @param args 189 | */ 190 | public static void main(String[] args) { 191 | 192 | SearchDao obj = new SearchDao(); 193 | 194 | /** creating index */ 195 | // obj.createIndex(); 196 | 197 | /** searching simple keyword */ 198 | System.out.println("==================searching simple keyword==========================="); 199 | obj.search("电院课表"); 200 | 201 | // /** searching simple keyword */ 202 | // System.out.println("==================searching simple 203 | // keyword==========================="); 204 | // obj.search("褚"); 205 | // 206 | // /** searching using wild card */ 207 | // System.out.println("==================searching using wild 208 | // 
card==========================="); 209 | // obj.search("791"); 210 | // 211 | // /** searching using logical OR operator */ 212 | // System.out.println("==================searching using logical OR 213 | // operator==========================="); 214 | // obj.search("院"); 215 | } 216 | 217 | } -------------------------------------------------------------------------------- /src/com/chenxb/dao/SummaryDao.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.dao; 2 | 3 | import java.sql.Connection; 4 | import java.sql.PreparedStatement; 5 | import java.sql.ResultSet; 6 | import java.sql.SQLException; 7 | import java.util.List; 8 | 9 | import org.jsoup.Jsoup; 10 | import org.jsoup.nodes.Document; 11 | 12 | import com.chenxb.model.ArticleItem; 13 | import com.chenxb.util.MysqlTool; 14 | import com.chenxb.util.TableName; 15 | import com.hankcs.hanlp.HanLP; 16 | 17 | /** 18 | * 增加文章的摘要 19 | * 可以先把数据存入 mysql,再读取mysql 的数据增加摘要 20 | * 也可以爬虫的时候就把摘要存入 mysql 21 | * @author tomchen 22 | * 23 | */ 24 | public class SummaryDao { 25 | private Connection connection; 26 | 27 | public SummaryDao() throws Exception { 28 | connection = new MysqlTool().getConnection(); 29 | } 30 | 31 | /** 32 | * 根据 type 找到数据库表名称 33 | * 再从该表里找出 id 对应的新闻 34 | * @param type 35 | * @param id 36 | * @return 37 | * @throws SQLException 38 | */ 39 | public int updateSummary(int type, int id) throws SQLException { 40 | // 根据 type 找出对应的 table 名称 41 | String tableName = TableName.getTableByType(type); 42 | 43 | String body = getArticleBody(type, id); 44 | // body是 html 表示的 45 | Document doc = Jsoup.parse(body); 46 | List sentenceList = HanLP.extractSummary(doc.text(), 3); 47 | 48 | // 如果摘要是空,不采取任何操作 49 | if (sentenceList.isEmpty()) { 50 | 51 | String update = "update " + tableName + " set summary = title WHERE id= ?"; 52 | 53 | // create the mysql preparedstatement 54 | PreparedStatement preparedStmt = connection.prepareStatement(update); 55 | 
preparedStmt.setInt(1, id); 56 | 57 | return preparedStmt.executeUpdate(); 58 | } else { 59 | String summary = sentenceList.toString(); 60 | // 去掉 list 首尾的[ 和 ] 61 | summary = summary.substring(1, summary.length() - 1); 62 | summary = summary.replaceAll("&" + "nbsp;", ""); 63 | // unicode 空格是160 64 | summary = summary.replaceAll(String.valueOf((char) 160), ""); 65 | // 将多个空格替换为1个空格 66 | summary = summary.trim().replaceAll("\\s+", " ") + "。"; 67 | 68 | String update = "update " + tableName + " set summary = ? WHERE id= ?"; 69 | 70 | // create the mysql preparedstatement 71 | PreparedStatement preparedStmt = connection.prepareStatement(update); 72 | preparedStmt.setString(1, summary); 73 | preparedStmt.setInt(2, id); 74 | return preparedStmt.executeUpdate(); 75 | } 76 | 77 | } 78 | 79 | /** 80 | * 找出新闻的主体内容 81 | * @param type 82 | * @param id 83 | * @return 84 | * @throws SQLException 85 | */ 86 | public String getArticleBody(int type, int id) throws SQLException { 87 | // 根据 type 找出对应的 table 名称 88 | String tableName = TableName.getTableByType(type); 89 | 90 | // the mysql select statement 91 | String query = "select body from " + tableName + " where id = ?"; 92 | 93 | // create the mysql preparedstatement 94 | PreparedStatement preparedStmt = connection.prepareStatement(query); 95 | preparedStmt.setInt(1, id); 96 | 97 | ResultSet rs = preparedStmt.executeQuery(); 98 | if (rs.next()) { 99 | return rs.getString(1); 100 | } 101 | return ""; 102 | } 103 | 104 | } 105 | -------------------------------------------------------------------------------- /src/com/chenxb/jpush/TestJpush.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.jpush; 2 | 3 | import java.sql.SQLException; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | import com.chenxb.dao.ArticleDao; 11 | import com.chenxb.model.ArticleItem; 12 | 13 | import 
cn.jpush.api.JPushClient; 14 | import cn.jpush.api.common.ClientConfig; 15 | import cn.jpush.api.common.resp.APIConnectionException; 16 | import cn.jpush.api.common.resp.APIRequestException; 17 | import cn.jpush.api.push.PushResult; 18 | import cn.jpush.api.push.model.Platform; 19 | import cn.jpush.api.push.model.PushPayload; 20 | import cn.jpush.api.push.model.SMS; 21 | import cn.jpush.api.push.model.audience.Audience; 22 | import cn.jpush.api.push.model.notification.Notification; 23 | 24 | public class TestJpush { 25 | 26 | private static final String appKey = "8c4911096188db2e7f2b370c"; 27 | private static final String masterSecret = "1cd48b15285f5c6f100f46d4"; 28 | public static final String ALERT = "救助郭燕-电院2000级校友,参与互联网众筹,通过网络传递爱心!"; 29 | 30 | public static final String TITLE = "电院最新资讯"; 31 | 32 | protected static final Logger LOG = LoggerFactory.getLogger(TestJpush.class); 33 | 34 | public static void main(String[] args) { 35 | 36 | JPushClient jpushClient = new JPushClient(masterSecret, appKey, 3); 37 | // For push, all you need do is to build PushPayload object. 
38 | PushPayload payload = buildPushObject_android_tag_alertWithTitle(); 39 | 40 | System.out.println("PushPayload 信息" + payload.toString()); 41 | 42 | try { 43 | PushResult result = jpushClient.sendPush(payload); 44 | LOG.info("Got result - " + result); 45 | 46 | } catch (APIConnectionException e) { 47 | // Connection error, should retry later 48 | LOG.error("Connection error, should retry later", e); 49 | 50 | } catch (APIRequestException e) { 51 | // Should review the error, and fix the request 52 | LOG.error("Should review the error, and fix the request", e); 53 | LOG.info("HTTP Status: " + e.getStatus()); 54 | LOG.info("Error Code: " + e.getErrorCode()); 55 | LOG.info("Error Message: " + e.getErrorMessage()); 56 | } 57 | 58 | } 59 | 60 | public static PushPayload buildPushObject_android_tag_alertWithTitle() { 61 | try { 62 | return PushPayload.newBuilder().setPlatform(Platform.android()).setAudience(Audience.all()) 63 | .setNotification(Notification.android(ALERT, TITLE, getArticleExtraInfo())).build(); 64 | } catch (SQLException e) { 65 | e.printStackTrace(); 66 | } catch (Exception e) { 67 | e.printStackTrace(); 68 | } 69 | return null; 70 | } 71 | 72 | public static Map getArticleExtraInfo() throws SQLException, Exception { 73 | ArticleItem article = new ArticleDao().getArticleByTypeId(0, 7948); 74 | Map extras = new HashMap(); 75 | 76 | extras.put("type", "0"); 77 | extras.put("id", article.getId() + ""); 78 | extras.put("publishDate", article.getPublishDate()); 79 | extras.put("readTimes", article.getReadTimes() + ""); 80 | 81 | return extras; 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /src/com/chenxb/model/ArticleItem.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.model; 2 | 3 | import java.util.Arrays; 4 | 5 | /** 6 | * model/ItemNews.java 新闻详情页面用到的完整实体类 新闻实体类 包括标题,发布日期,阅读次数,新闻主体内容等 7 | * @author tomchen 8 | * 9 | */ 10 | 11 | 
/**
 * JavaBean describing one entry of the home-page image rotation (carousel).
 *
 * @author tomchen
 */
public class RotationItem {

    int id;
    // The home page shows only one image per entry.
    String[] imageUrls;
    String title;
    // Numeric column type, e.g. 1 = news/notifications, 2 = undergraduate teaching; see ColumnType.
    int type;

    /**
     * @param id        article id
     * @param imageUrls image URLs for this entry; stored as given, not copied
     * @param title     display title
     * @param type      numeric column type
     */
    public RotationItem(int id, String[] imageUrls, String title, int type) {
        this.id = id;
        this.imageUrls = imageUrls;
        this.title = title;
        this.type = type;
    }

    /**
     * Returns a readable description of this item.
     *
     * <p>The array is rendered with {@code Arrays.toString}; concatenating it directly would
     * print its identity hash instead of the URLs. The class is referenced by its qualified
     * name so that no new import is required.
     */
    @Override
    public String toString() {
        return "RotationItem [id=" + id + ", imageUrls=" + java.util.Arrays.toString(imageUrls)
                + ", title=" + title + ", type=" + type + "]";
    }

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    /** Returns the stored URL array itself (not a copy). */
    public String[] getImageUrl() {
        return imageUrls;
    }

    public void setImageUrl(String[] imageUrls) {
        this.imageUrls = imageUrls;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public int getType() {
        return type;
    }

    public void setType(int type) {
        this.type = type;
    }

}
setTitle(String title) { 62 | this.title = title; 63 | } 64 | 65 | public String getPublishDate() { 66 | return publishDate; 67 | } 68 | 69 | public void setPublishDate(String publishDate) { 70 | this.publishDate = publishDate; 71 | } 72 | 73 | public int getReadTimes() { 74 | return readTimes; 75 | } 76 | 77 | public void setReadTimes(int readTimes) { 78 | this.readTimes = readTimes; 79 | } 80 | 81 | public String getSummary() { 82 | return summary; 83 | } 84 | 85 | public void setSummary(String summary) { 86 | this.summary = summary; 87 | } 88 | 89 | @Override 90 | public String toString() { 91 | return "SimpleArticleItem [id=" + id + ", imageUrls=" + Arrays.toString(imageUrls) + ", title=" + title 92 | + ", publishDate=" + publishDate + ", readTimes=" + readTimes + ", summary=" + summary + "]"; 93 | } 94 | } -------------------------------------------------------------------------------- /src/com/chenxb/news/HelloLucene.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.news; 2 | 3 | import java.io.IOException; 4 | import java.nio.file.FileSystems; 5 | import java.sql.Connection; 6 | import java.sql.DriverManager; 7 | import java.sql.ResultSet; 8 | import java.sql.Statement; 9 | 10 | import org.apache.lucene.analysis.Analyzer; 11 | import org.apache.lucene.document.Document; 12 | import org.apache.lucene.document.Field; 13 | import org.apache.lucene.document.StringField; 14 | import org.apache.lucene.document.TextField; 15 | import org.apache.lucene.index.DirectoryReader; 16 | import org.apache.lucene.index.IndexReader; 17 | import org.apache.lucene.index.IndexWriter; 18 | import org.apache.lucene.index.IndexWriterConfig; 19 | import org.apache.lucene.queryparser.classic.QueryParser; 20 | import org.apache.lucene.search.IndexSearcher; 21 | import org.apache.lucene.search.Query; 22 | import org.apache.lucene.search.ScoreDoc; 23 | import org.apache.lucene.search.TopScoreDocCollector; 24 | import 
org.apache.lucene.store.Directory; 25 | import org.apache.lucene.store.FSDirectory; 26 | import org.lionsoul.jcseg.analyzer.v5x.JcsegAnalyzer5X; 27 | import org.lionsoul.jcseg.tokenizer.core.JcsegTaskConfig; 28 | 29 | public class HelloLucene { 30 | public static void main(String[] args) throws Exception { 31 | 32 | // 0. Specify the analyzer for tokenizing text. 33 | // The same analyzer should be used for indexing and searching 34 | Analyzer analyzer = new JcsegAnalyzer5X(JcsegTaskConfig.COMPLEX_MODE); 35 | 36 | 37 | // 1. create the index 38 | Directory index = FSDirectory.open(FileSystems.getDefault().getPath("./index22")); 39 | 40 | System.out.println(index.toString()); 41 | IndexWriterConfig config = new IndexWriterConfig(analyzer); 42 | 43 | IndexWriter w = new IndexWriter(index, config); 44 | 45 | /** JDBC Section */ 46 | Class.forName("com.mysql.jdbc.Driver").newInstance(); 47 | 48 | // 后面unicode和utf8设置防止中文乱码 49 | String url = "jdbc:mysql://127.0.0.1:3306/see_news?useSSL=false&useUnicode=true&characterEncoding=utf-8"; 50 | String name = "root"; 51 | String password = "chenxb123"; 52 | 53 | Connection conn = DriverManager.getConnection(url, name, password); 54 | 55 | Statement stmt = conn.createStatement(); 56 | String sql = "select * from academic order by id desc limit 10"; 57 | ResultSet rs = stmt.executeQuery(sql); 58 | 59 | while (rs.next()) { 60 | addDoc(w, rs.getString("title"), rs.getInt("id") + ""); 61 | } 62 | 63 | addDoc(w, "Lucene in Action", "193398817"); 64 | addDoc(w, "Lucene for Dummies", "55320055Z"); 65 | addDoc(w, "Managing Gigabytes", "55063554A"); 66 | addDoc(w, "The Art of Computer Science", "9900333X"); 67 | w.close(); 68 | 69 | // 2. query 70 | String querystr = args.length > 0 ? args[0] : "新加坡"; 71 | 72 | // the "title" arg specifies the default field to use 73 | // when no field is explicitly specified in the query. 74 | Query q = new QueryParser("title", analyzer).parse(querystr); 75 | 76 | // 3. 
search 77 | int hitsPerPage = 10; 78 | IndexReader reader = DirectoryReader.open(index); 79 | IndexSearcher searcher = new IndexSearcher(reader); 80 | TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage); 81 | searcher.search(q, collector); 82 | ScoreDoc[] hits = collector.topDocs().scoreDocs; 83 | 84 | // 4. display results 85 | System.out.println("Found " + hits.length + " hits."); 86 | for (int i = 0; i < hits.length; ++i) { 87 | int docId = hits[i].doc; 88 | Document d = searcher.doc(docId); 89 | System.out.println((i + 1) + ". " + d.get("isbn") + "\t" + d.get("title")); 90 | } 91 | 92 | // reader can only be closed when there 93 | // is no need to access the documents any more. 94 | reader.close(); 95 | } 96 | 97 | private static void addDoc(IndexWriter w, String title, String isbn) throws IOException { 98 | Document doc = new Document(); 99 | doc.add(new TextField("title", title, Field.Store.YES)); 100 | 101 | // use a string field for isbn because we don't want it tokenized 102 | doc.add(new StringField("isbn", isbn, Field.Store.YES)); 103 | w.addDocument(doc); 104 | } 105 | } -------------------------------------------------------------------------------- /src/com/chenxb/news/LoadRotation.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.news; 2 | 3 | import com.chenxb.dao.RotationImageDao; 4 | 5 | public class LoadRotation { 6 | public static void main(String[] args) throws Exception { 7 | new RotationImageDao().initRotations(); 8 | } 9 | 10 | } 11 | -------------------------------------------------------------------------------- /src/com/chenxb/news/ReUploadImage.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.news; 2 | 3 | import com.chenxb.util.ImageTool; 4 | 5 | public class ReUploadImage { 6 | public static void main(String[] args) { 7 | String origin = "http://rsc.xidian.edu.cn/plus/img/addon.gif"; 8 | 
System.out.println(ImageTool.convertUrl(0000, origin,"a2f5daa62be22c5a07ea60d8db6741f")); 9 | } 10 | 11 | } 12 | -------------------------------------------------------------------------------- /src/com/chenxb/news/ReloadAcademic.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.news; 2 | 3 | import com.chenxb.dao.ColumnDao; 4 | import com.chenxb.util.ColumnType; 5 | 6 | public class ReloadAcademic { 7 | public static void main(String[] args) { 8 | new Thread() { 9 | public void run() { 10 | try { 11 | new ColumnDao().reInitArticles(ColumnType.ACADEMIC); 12 | } catch (Exception e) { 13 | e.printStackTrace(); 14 | } 15 | 16 | } 17 | }.start(); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/com/chenxb/news/ReloadAll.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.news; 2 | 3 | import java.sql.Connection; 4 | import java.util.ArrayList; 5 | import java.util.Arrays; 6 | 7 | import org.apache.commons.lang3.ArrayUtils; 8 | 9 | import com.chenxb.biz.ColumnBiz; 10 | import com.chenxb.dao.ColumnDao; 11 | import com.chenxb.util.ColumnType; 12 | import com.chenxb.util.MysqlTool; 13 | import com.chenxb.util.TableName; 14 | 15 | public class ReloadAll { 16 | public static void main(String arg[]) throws Exception { 17 | new Thread() { 18 | public void run() { 19 | try { 20 | new ColumnDao().reInitArticles(ColumnType.BACHELOR); 21 | } catch (Exception e) { 22 | e.printStackTrace(); 23 | } 24 | 25 | } 26 | }.start(); 27 | 28 | new Thread() { 29 | public void run() { 30 | try { 31 | new ColumnDao().reInitArticles(ColumnType.MASTER); 32 | } catch (Exception e) { 33 | e.printStackTrace(); 34 | } 35 | 36 | } 37 | }.start(); 38 | 39 | new Thread() { 40 | public void run() { 41 | try { 42 | new ColumnDao().reInitArticles(ColumnType.JOB); 43 | } catch (Exception e) { 44 | e.printStackTrace(); 45 | } 46 | 47 | } 
48 | }.start(); 49 | 50 | new Thread() { 51 | public void run() { 52 | try { 53 | new ColumnDao().reInitArticles(ColumnType.ACADEMIC); 54 | } catch (Exception e) { 55 | e.printStackTrace(); 56 | } 57 | 58 | } 59 | }.start(); 60 | 61 | new Thread() { 62 | public void run() { 63 | try { 64 | new ColumnDao().reInitArticles(ColumnType.LATEST); 65 | } catch (Exception e) { 66 | e.printStackTrace(); 67 | } 68 | 69 | } 70 | }.start(); 71 | 72 | new Thread() { 73 | public void run() { 74 | try { 75 | new ColumnDao().reInitArticles(ColumnType.NOTIFIC); 76 | } catch (Exception e) { 77 | e.printStackTrace(); 78 | } 79 | 80 | } 81 | }.start(); 82 | 83 | // System.out.println(new 84 | // new ColumnDao()().isTableEmpty(ColumnType.BACHELOR)); 85 | // Connection connection = MysqlTool.getConnection(); 86 | // String tableName = TableName.getTableByType(ColumnType.LATEST); 87 | // 88 | // int[] ids = ColumnBiz.parseColumn(ColumnType.LATEST, 1); 89 | // 90 | // System.out.println(ArrayUtils.indexOf(ids, ids[0])); 91 | // System.out.println(ArrayUtils.indexOf(ids, 6531)); 92 | // System.out.println(ids[0]); 93 | // Arrays.binarySearch(ids, 3); 94 | // System.out.println(ArrayUtils.contains(ids, 7948)); 95 | // System.out.println(ArrayUtils.contains(ids, 4)); 96 | // System.out.println(ArrayUtils.contains(ids, 7945)); 97 | // // Arrays.asList(ids); 98 | // // System.out.println(list); 99 | 100 | } 101 | 102 | } 103 | -------------------------------------------------------------------------------- /src/com/chenxb/news/ReloadBachelor.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.news; 2 | 3 | import com.chenxb.dao.ColumnDao; 4 | import com.chenxb.util.ColumnType; 5 | 6 | public class ReloadBachelor { 7 | public static void main(String[] args) { 8 | new Thread() { 9 | public void run() { 10 | try { 11 | new ColumnDao().reInitArticles(ColumnType.BACHELOR); 12 | } catch (Exception e) { 13 | e.printStackTrace(); 14 | } 15 | 16 
/**
 * Small demonstration that an object holding a reference to a list sees later changes to it:
 * the list is printed, wrapped in an {@link A}, modified, and both are printed again.
 */
public class ReloadJob {

    /**
     * @param args ignored
     */
    public static void main(String[] args) {
        ArrayList<Integer> numbers = new ArrayList<Integer>();
        numbers.add(11);
        numbers.add(112);
        System.out.println("array: " + numbers);

        A holder = new A(numbers);
        System.out.println("A: " + holder);

        // The holder shares the list, so this addition shows up in its output too.
        numbers.add(33);
        System.out.println("array: " + numbers);
        System.out.println("A: " + holder);
    }

}

/** Holds a reference to the list it is given; deliberately does not copy it. */
class A {
    ArrayList<Integer> arr;

    A(ArrayList<Integer> arr) {
        this.arr = arr;
    }

    @Override
    public String toString() {
        return arr + "";
    }

}
{ 11 | new ColumnDao().reInitArticles(ColumnType.MASTER); 12 | } catch (Exception e) { 13 | e.printStackTrace(); 14 | } 15 | 16 | } 17 | }.start(); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/com/chenxb/news/ReloadNotific.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.news; 2 | 3 | import com.chenxb.dao.ColumnDao; 4 | import com.chenxb.util.ColumnType; 5 | 6 | public class ReloadNotific { 7 | public static void main(String[] args) { 8 | new Thread() { 9 | public void run() { 10 | try { 11 | new ColumnDao().reInitArticles(ColumnType.NOTIFIC); 12 | } catch (Exception e) { 13 | e.printStackTrace(); 14 | } 15 | 16 | } 17 | }.start(); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/com/chenxb/news/Test.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.news; 2 | 3 | import java.lang.reflect.Type; 4 | import java.sql.Connection; 5 | import java.sql.DriverManager; 6 | import java.util.ArrayList; 7 | import java.util.Arrays; 8 | import java.util.List; 9 | import java.util.Random; 10 | 11 | import org.apache.commons.lang3.ArrayUtils; 12 | import org.apache.commons.lang3.StringUtils; 13 | import org.jsoup.Jsoup; 14 | import org.jsoup.nodes.Document; 15 | import org.jsoup.nodes.Element; 16 | import org.jsoup.select.Elements; 17 | 18 | import com.chenxb.biz.ArticleBiz; 19 | import com.chenxb.biz.ColumnBiz; 20 | import com.chenxb.dao.ColumnDao; 21 | import com.chenxb.model.ArticleItem; 22 | import com.chenxb.model.SimpleArticleItem; 23 | import com.chenxb.util.ColumnType; 24 | import com.chenxb.util.Constant; 25 | import com.chenxb.util.MysqlTool; 26 | import com.chenxb.util.TableName; 27 | import com.google.gson.Gson; 28 | import com.google.gson.GsonBuilder; 29 | import com.google.gson.reflect.TypeToken; 30 | import 
/**
 * Scratch class: prints a seeded pseudo-random number and builds the URL templates of the
 * column and article endpoints.
 */
public class Test {
    private String x;

    public static final boolean isCloud = false;
    // Sina cloud address, used from the public internet.
    public static final String saeIP = "http://javanews.applinzi.com/";
    // Local LAN address, used for testing.
    public static final String localIP = "http://192.168.199.133/";

    public static void main(String arg[]) throws Exception {
        test(7937);
    }

    /**
     * Prints the first value below 965 produced by a generator seeded with {@code id}.
     *
     * @param id seed for the generator
     */
    public static void test(int id) {
        Random seeded = new Random(id);
        int pick = seeded.nextInt(965);
        System.out.println(pick);
    }

    /** @return format template for the column endpoint, with {@code %d} for column and offset */
    public static String columnUrl() {
        return baseUrl() + "columnWithSql?column=%d&offset=%d";
    }

    /** @return format template for the article endpoint, with {@code %d} for column and id */
    public static String articleUrl() {
        return baseUrl() + "articleWithSql?column=%d&id=%d";
    }

    /** Picks the host prefix selected by {@link #isCloud}. */
    private static String baseUrl() {
        return isCloud ? saeIP : localIP;
    }

}
System.out.println("ca4b9dadbc73ccfd4c995d7c0a179f95".length()); 30 | // Elements eles = 31 | // doc.getElementsByClass("rotaion_list").get(0).getElementsByTag("a"); 32 | // 33 | // 34 | // 35 | // List rotaions = new 36 | // ArrayList(eles.size()); 37 | // 38 | // for (Element e : eles) { 39 | // 40 | // String articleUrl = e.attr("href"); 41 | // 42 | // int id = Integer.parseInt(articleUrl.replaceAll("\\D+", "")); 43 | // 44 | // String imageUrl = e.getElementsByTag("img").get(0).attr("src"); 45 | // 46 | // String key = ImageTool.convertUrl(id, imageUrl); 47 | // 48 | // String title = e.getElementsByTag("img").get(0).attr("alt"); 49 | // 50 | // String body = ArticleBiz.parseNewsItem(id).getBody(); 51 | // 52 | // rotaions.add(new RotationItem(id, key, title, body)); 53 | // } 54 | // 55 | // System.out.println(rotaions); 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/com/chenxb/news/TestJcseg.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.news; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 5 | import org.apache.lucene.document.Document; 6 | import org.apache.lucene.document.Field; 7 | import org.apache.lucene.document.TextField; 8 | import org.apache.lucene.index.DirectoryReader; 9 | import org.apache.lucene.index.IndexWriter; 10 | import org.apache.lucene.index.IndexWriterConfig; 11 | import org.apache.lucene.queryparser.classic.QueryParser; 12 | import org.apache.lucene.search.IndexSearcher; 13 | import org.apache.lucene.search.Query; 14 | import org.apache.lucene.search.ScoreDoc; 15 | import org.apache.lucene.store.Directory; 16 | import org.apache.lucene.store.RAMDirectory; 17 | import org.apache.lucene.util.Version; 18 | 19 | public class TestJcseg { 20 | public static void main(String arg[]) throws Exception { 21 | Analyzer analyzer = new StandardAnalyzer(); 
22 | 23 | // Store the index in memory: 24 | Directory directory = new RAMDirectory(); 25 | // To store an index on disk, use this instead: 26 | //Directory directory = FSDirectory.open("/tmp/testindex"); 27 | IndexWriterConfig config = new IndexWriterConfig(analyzer); 28 | IndexWriter iwriter = new IndexWriter(directory, config); 29 | Document doc = new Document(); 30 | String text = "This is the text to be indexed."; 31 | doc.add(new Field("fieldname", text, TextField.TYPE_STORED)); 32 | iwriter.addDocument(doc); 33 | iwriter.close(); 34 | 35 | // Now search the index: 36 | DirectoryReader ireader = DirectoryReader.open(directory); 37 | IndexSearcher isearcher = new IndexSearcher(ireader); 38 | // Parse a simple query that searches for "text": 39 | QueryParser parser = new QueryParser("fieldname", analyzer); 40 | Query query = parser.parse("text"); 41 | ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; 42 | // Iterate through the results: 43 | for (int i = 0; i < hits.length; i++) { 44 | Document hitDoc = isearcher.doc(hits[i].doc); 45 | } 46 | ireader.close(); 47 | directory.close(); 48 | // //lucene 5.x版本 49 | // Analyzer analyzer = new JcsegAnalyzer5X(JcsegTaskConfig.COMPLEX_MODE); 50 | // //非必须(用于修改默认配置): 获取分词任务配置实例 51 | // JcsegAnalyzer5X jcseg = (JcsegAnalyzer5X) analyzer; 52 | // JcsegTaskConfig config = jcseg.getTaskConfig(); 53 | // //追加同义词, 需要在 jcseg.properties中配置jcseg.loadsyn=1 54 | // config.setAppendCJKSyn(true); 55 | // //追加拼音, 需要在jcseg.properties中配置jcseg.loadpinyin=1 56 | // config.setAppendCJKPinyin(false); 57 | // //更多配置, 请查看 org.lionsoul.jcseg.tokenizer.core.JcsegTaskConfig 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/com/chenxb/servlet/ArticleWithSql.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.servlet; 2 | 3 | import java.io.IOException; 4 | import java.io.PrintWriter; 5 | import java.io.StringWriter; 
6 | import java.sql.SQLException; 7 | 8 | import javax.servlet.ServletException; 9 | import javax.servlet.http.HttpServlet; 10 | import javax.servlet.http.HttpServletRequest; 11 | import javax.servlet.http.HttpServletResponse; 12 | 13 | import com.chenxb.biz.ArticleBiz; 14 | import com.chenxb.dao.ArticleDao; 15 | import com.chenxb.model.ArticleItem; 16 | import com.chenxb.util.TableName; 17 | import com.google.gson.Gson; 18 | import com.google.gson.GsonBuilder; 19 | 20 | /** 21 | * 从 mysql 中根据 id 和 column 获取新闻详情 22 | * 先获取 colunm,再到对应的表里查询数据 23 | * @author tomchen 24 | * 25 | */ 26 | public class ArticleWithSql extends HttpServlet { 27 | 28 | private ArticleDao dao; 29 | 30 | public ArticleWithSql() { 31 | super(); 32 | try { 33 | dao = new ArticleDao(); 34 | } catch (Exception e) { 35 | e.printStackTrace(); 36 | } 37 | } 38 | 39 | @Override 40 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { 41 | doPost(req, resp); 42 | } 43 | 44 | @Override 45 | protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { 46 | // 目的是为了控制浏览器的行为,即控制浏览器用UTF-8进行解码; 47 | resp.setContentType("text/html;charset=UTF-8"); 48 | 49 | // 用于response.getWriter()输出的字符流的乱码问题,response.getOutputStream()是不需要此种解决方案的 50 | // 因为这句话的意思是为了将response对象中的数据以UTF-8解码后发向浏览器; 51 | resp.setCharacterEncoding("UTF-8"); 52 | 53 | PrintWriter out = resp.getWriter(); 54 | if (req.getParameter("id") == null || req.getParameter("column") == null) { 55 | out.write("usage: http://localhost:8080/test/articleWithSql?column=1&id=7000"); 56 | return; 57 | } 58 | 59 | try { 60 | if (dao == null || dao.getConnection().isClosed()) { 61 | out.write("mysql is null or closed\n"); 62 | return; 63 | } 64 | } catch (SQLException e) { 65 | StringWriter errors = new StringWriter(); 66 | e.printStackTrace(new PrintWriter(errors)); 67 | out.write("mysql is null or closed\n"); 68 | out.print(errors.toString()); 69 | } 70 | 71 
| // 获取哪个栏目的表 72 | int type = Integer.parseInt(req.getParameter("column")); 73 | 74 | int id = Integer.parseInt(req.getParameter("id")); 75 | 76 | ArticleItem article; 77 | try { 78 | article = dao.getArticleByTypeId(type, id); 79 | Gson gson = new GsonBuilder().disableHtmlEscaping().create(); 80 | String result = gson.toJson(article); 81 | out.write(result); 82 | } catch (Exception e) { 83 | StringWriter errors = new StringWriter(); 84 | e.printStackTrace(new PrintWriter(errors)); 85 | out.write("ArticleDao getArticleByTypeId error\n"); 86 | out.print(errors.toString()); 87 | } finally { 88 | out.flush(); 89 | out.close(); 90 | } 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /src/com/chenxb/servlet/ColumnArticlesWithSql.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.servlet; 2 | 3 | import java.io.IOException; 4 | import java.io.PrintWriter; 5 | import java.io.StringWriter; 6 | import java.util.List; 7 | 8 | import javax.servlet.ServletException; 9 | import javax.servlet.http.HttpServlet; 10 | import javax.servlet.http.HttpServletRequest; 11 | import javax.servlet.http.HttpServletResponse; 12 | 13 | import com.chenxb.dao.ColumnDao; 14 | import com.chenxb.model.SimpleArticleItem; 15 | import com.google.gson.Gson; 16 | import com.google.gson.GsonBuilder; 17 | 18 | /** 19 | * 查找某个栏目的多条新闻 20 | * 根据栏目、偏移值 21 | * 分页返回 22 | * @author tomchen 23 | * 24 | */ 25 | public class ColumnArticlesWithSql extends HttpServlet { 26 | 27 | private static final long serialVersionUID = 1L; 28 | 29 | private ColumnDao colDao; 30 | 31 | public ColumnArticlesWithSql() { 32 | super(); 33 | try { 34 | colDao = new ColumnDao(); 35 | } catch (Exception e) { 36 | e.printStackTrace(); 37 | } 38 | } 39 | 40 | @Override 41 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { 42 | doPost(req, resp); 43 | } 44 | 45 | 
@Override 46 | protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { 47 | // 目的是为了控制浏览器的行为,即控制浏览器用UTF-8进行解码; 48 | resp.setContentType("text/html;charset=UTF-8"); 49 | 50 | // 用于response.getWriter()输出的字符流的乱码问题,response.getOutputStream()是不需要此种解决方案的 51 | // 因为这句话的意思是为了将response对象中的数据以UTF-8解码后发向浏览器; 52 | resp.setCharacterEncoding("UTF-8"); 53 | PrintWriter out = resp.getWriter(); 54 | 55 | if (req.getParameter("column") == null || req.getParameter("offset") == null) { 56 | out.write("usage: http://localhost:8080/test/columnWithSql?column=1&offset=7916"); 57 | return; 58 | } 59 | 60 | int type = Integer.parseInt(req.getParameter("column")); 61 | int offset = Integer.parseInt(req.getParameter("offset")); 62 | 63 | //用-1表示首页的数据 64 | //下面几页就是根据偏移量 65 | try { 66 | List articles = colDao.getTopSimpleArticles(type,offset); 67 | Gson gson = new GsonBuilder().disableHtmlEscaping().create(); 68 | String result = gson.toJson(articles); 69 | out.write(result); 70 | } catch (Exception e) { 71 | StringWriter errors = new StringWriter(); 72 | e.printStackTrace(new PrintWriter(errors)); 73 | out.write("ColumnDao getTopSimpleArticles error\n"); 74 | out.print(errors.toString()); 75 | } finally { 76 | out.flush(); 77 | out.close(); 78 | } 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /src/com/chenxb/servlet/MoreArticlesWithSql.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.servlet; 2 | 3 | import java.io.IOException; 4 | import java.io.PrintWriter; 5 | import java.io.StringWriter; 6 | import java.util.List; 7 | 8 | import javax.servlet.ServletException; 9 | import javax.servlet.http.HttpServlet; 10 | import javax.servlet.http.HttpServletRequest; 11 | import javax.servlet.http.HttpServletResponse; 12 | 13 | import com.chenxb.dao.ColumnDao; 14 | import com.chenxb.model.SimpleArticleItem; 15 | import 
com.google.gson.Gson; 16 | import com.google.gson.GsonBuilder; 17 | 18 | /** 19 | * 某栏目,查找大于某个id的新数据 20 | * 分页返回 21 | * @author tomchen 22 | * 23 | */ 24 | public class MoreArticlesWithSql extends HttpServlet { 25 | 26 | private static final long serialVersionUID = 1L; 27 | 28 | private ColumnDao colDao; 29 | 30 | public MoreArticlesWithSql() { 31 | super(); 32 | try { 33 | colDao = new ColumnDao(); 34 | } catch (Exception e) { 35 | e.printStackTrace(); 36 | } 37 | } 38 | 39 | @Override 40 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { 41 | doPost(req, resp); 42 | } 43 | 44 | @Override 45 | protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { 46 | // 目的是为了控制浏览器的行为,即控制浏览器用UTF-8进行解码; 47 | resp.setContentType("text/html;charset=UTF-8"); 48 | 49 | // 用于response.getWriter()输出的字符流的乱码问题,response.getOutputStream()是不需要此种解决方案的 50 | // 因为这句话的意思是为了将response对象中的数据以UTF-8解码后发向浏览器; 51 | resp.setCharacterEncoding("UTF-8"); 52 | PrintWriter out = resp.getWriter(); 53 | 54 | if (req.getParameter("column") == null || req.getParameter("morethan") == null) { 55 | out.write("usage: http://localhost:8080/test/columnWithSql?column=1&morethan=7916"); 56 | return; 57 | } 58 | 59 | int type = Integer.parseInt(req.getParameter("column")); 60 | int morethan = Integer.parseInt(req.getParameter("morethan")); 61 | 62 | // 下面几页就是根据偏移量 63 | try { 64 | List articles = colDao.moreArticles(type, morethan); 65 | Gson gson = new GsonBuilder().disableHtmlEscaping().create(); 66 | String result = gson.toJson(articles); 67 | out.write(result); 68 | } catch (Exception e) { 69 | StringWriter errors = new StringWriter(); 70 | e.printStackTrace(new PrintWriter(errors)); 71 | out.write("ColumnDao getTopSimpleArticles error\n"); 72 | out.print(errors.toString()); 73 | } finally { 74 | out.flush(); 75 | out.close(); 76 | } 77 | } 78 | 79 | } 80 | 
-------------------------------------------------------------------------------- /src/com/chenxb/servlet/ParseArticleById.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.servlet; 2 | 3 | import java.io.IOException; 4 | import java.io.PrintWriter; 5 | 6 | import javax.servlet.ServletException; 7 | import javax.servlet.http.HttpServlet; 8 | import javax.servlet.http.HttpServletRequest; 9 | import javax.servlet.http.HttpServletResponse; 10 | 11 | import com.chenxb.biz.ArticleBiz; 12 | import com.chenxb.model.ArticleItem; 13 | import com.google.gson.Gson; 14 | import com.google.gson.GsonBuilder; 15 | 16 | /** 17 | * 使用示例 http://localhost:8080/seenews/parseArticle?id=7938 18 | * 根据给定的id从电院http://see.xidian.edu.cn/html/news/7938.html爬取数据 返回 json 字符串 19 | * 20 | * @author tomchen 21 | * 22 | */ 23 | public class ParseArticleById extends HttpServlet { 24 | 25 | private static final long serialVersionUID = 1L; 26 | 27 | @Override 28 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { 29 | doPost(req, resp); 30 | } 31 | 32 | @Override 33 | protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { 34 | // 目的是为了控制浏览器的行为,即控制浏览器用UTF-8进行解码; 35 | resp.setContentType("text/html;charset=UTF-8"); 36 | 37 | // 用于response.getWriter()输出的字符流的乱码问题,response.getOutputStream()是不需要此种解决方案的 38 | // 因为这句话的意思是为了将response对象中的数据以UTF-8解码后发向浏览器; 39 | resp.setCharacterEncoding("UTF-8"); 40 | 41 | int id = Integer.parseInt(req.getParameter("id")); 42 | 43 | PrintWriter out = resp.getWriter(); 44 | try { 45 | ArticleItem article = ArticleBiz.parseNewsItem(id); 46 | Gson gson = new GsonBuilder().disableHtmlEscaping().create(); 47 | String result = gson.toJson(article); 48 | out.write(result); 49 | } catch (Exception e) { 50 | e.printStackTrace(); 51 | } finally { 52 | out.flush(); 53 | out.close(); 54 | } 55 | } 56 | 57 | } 58 | 
-------------------------------------------------------------------------------- /src/com/chenxb/servlet/RotationWithSql.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.servlet; 2 | 3 | import java.io.IOException; 4 | import java.io.PrintWriter; 5 | import java.io.StringWriter; 6 | import java.util.List; 7 | 8 | import javax.servlet.ServletException; 9 | import javax.servlet.http.HttpServlet; 10 | import javax.servlet.http.HttpServletRequest; 11 | import javax.servlet.http.HttpServletResponse; 12 | 13 | import com.chenxb.dao.RotationImageDao; 14 | import com.chenxb.model.RotationItem; 15 | import com.google.gson.Gson; 16 | import com.google.gson.GsonBuilder; 17 | 18 | public class RotationWithSql extends HttpServlet { 19 | 20 | private RotationImageDao dao; 21 | 22 | public RotationWithSql() { 23 | super(); 24 | try { 25 | dao = new RotationImageDao(); 26 | } catch (Exception e) { 27 | e.printStackTrace(); 28 | } 29 | } 30 | 31 | @Override 32 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { 33 | doPost(req, resp); 34 | } 35 | 36 | @Override 37 | protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { 38 | // 目的是为了控制浏览器的行为,即控制浏览器用UTF-8进行解码; 39 | resp.setContentType("text/html;charset=UTF-8"); 40 | 41 | // 用于response.getWriter()输出的字符流的乱码问题,response.getOutputStream()是不需要此种解决方案的 42 | // 因为这句话的意思是为了将response对象中的数据以UTF-8解码后发向浏览器; 43 | resp.setCharacterEncoding("UTF-8"); 44 | 45 | PrintWriter out = resp.getWriter(); 46 | 47 | try { 48 | List rotations = dao.getTopRotations(); 49 | Gson gson = new GsonBuilder().disableHtmlEscaping().create(); 50 | String result = gson.toJson(rotations); 51 | out.write(result); 52 | 53 | } catch (Exception e) { 54 | StringWriter errors = new StringWriter(); 55 | e.printStackTrace(new PrintWriter(errors)); 56 | out.print(errors.toString()); 57 | 
e.printStackTrace(); 58 | } 59 | 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /src/com/chenxb/servlet/SearchArticle.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.servlet; 2 | 3 | import java.io.IOException; 4 | import java.io.PrintWriter; 5 | import java.io.StringWriter; 6 | 7 | import javax.servlet.ServletException; 8 | import javax.servlet.http.HttpServlet; 9 | import javax.servlet.http.HttpServletRequest; 10 | import javax.servlet.http.HttpServletResponse; 11 | 12 | import com.chenxb.biz.ArticleBiz; 13 | import com.chenxb.dao.ArticleDao; 14 | import com.chenxb.model.ArticleItem; 15 | import com.chenxb.util.TableName; 16 | import com.google.gson.Gson; 17 | import com.google.gson.GsonBuilder; 18 | 19 | /** 20 | * 搜索新闻 21 | * 根据关键词全文搜索 22 | * @author tomchen 23 | * 24 | */ 25 | public class SearchArticle extends HttpServlet { 26 | 27 | private ArticleDao dao; 28 | 29 | public SearchArticle() { 30 | super(); 31 | try { 32 | dao = new ArticleDao(); 33 | } catch (Exception e) { 34 | e.printStackTrace(); 35 | } 36 | } 37 | 38 | @Override 39 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { 40 | doPost(req, resp); 41 | } 42 | 43 | @Override 44 | protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { 45 | // 目的是为了控制浏览器的行为,即控制浏览器用UTF-8进行解码; 46 | resp.setContentType("text/html;charset=UTF-8"); 47 | 48 | // 用于response.getWriter()输出的字符流的乱码问题,response.getOutputStream()是不需要此种解决方案的 49 | // 因为这句话的意思是为了将response对象中的数据以UTF-8解码后发向浏览器; 50 | resp.setCharacterEncoding("UTF-8"); 51 | 52 | PrintWriter out = resp.getWriter(); 53 | if ( req.getParameter("keyword") == null) { 54 | out.write("usage:http://localhost:8080/test/searchArticle?keyword=电院"); 55 | return; 56 | } 57 | 58 | // 获取哪个栏目的表 59 | String word = req.getParameter("keyword"); 60 | 61 | 62 
| try { 63 | out.write(word); 64 | } catch (Exception e) { 65 | e.printStackTrace(); 66 | } 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/com/chenxb/test/JobScheduler.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.test; 2 | 3 | import org.quartz.JobBuilder; 4 | import org.quartz.JobDetail; 5 | import org.quartz.Scheduler; 6 | import org.quartz.SchedulerFactory; 7 | import org.quartz.SimpleScheduleBuilder; 8 | import org.quartz.Trigger; 9 | import org.quartz.TriggerBuilder; 10 | import org.quartz.impl.StdSchedulerFactory; 11 | 12 | public class JobScheduler { 13 | public static void main(String[] args) throws Exception { 14 | JobDetail job = JobBuilder.newJob(TestJob.class).withIdentity("ttt").build(); 15 | Trigger trigger = TriggerBuilder.newTrigger() 16 | .withSchedule(SimpleScheduleBuilder.simpleSchedule().withIntervalInSeconds(30).repeatForever()).build(); 17 | 18 | SchedulerFactory factory = new StdSchedulerFactory(); 19 | 20 | Scheduler scheduler = factory.getScheduler(); 21 | scheduler.start(); 22 | scheduler.scheduleJob(job, trigger); 23 | } 24 | 25 | } 26 | 27 | // class MyTrigger extend Trig 28 | -------------------------------------------------------------------------------- /src/com/chenxb/test/TestHanlp.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.test; 2 | 3 | import java.sql.SQLException; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import org.apache.commons.lang3.StringUtils; 8 | import org.jsoup.Jsoup; 9 | import org.jsoup.nodes.Document; 10 | 11 | import com.chenxb.dao.ArticleDao; 12 | import com.chenxb.dao.SummaryDao; 13 | import com.hankcs.hanlp.HanLP; 14 | 15 | /** 16 | * 中文分词 17 | * 提取文章摘要 18 | * @author tomchen 19 | * 20 | */ 21 | public class TestHanlp { 22 | public static void main(String[] args) throws SQLException, Exception { 23 | 
24 | for (int i = 7948; i >= 7896; i--) { 25 | System.out.println(new SummaryDao().updateSummary(0, i)); 26 | } 27 | 28 | } 29 | 30 | public static String get(int id) throws SQLException, Exception { 31 | String document = new ArticleDao().getArticleByTypeId(0, id).getBody(); 32 | Document doc = Jsoup.parse(document); 33 | List sentenceList = HanLP.extractSummary(doc.text(), 2); 34 | 35 | if (!sentenceList.isEmpty()) { 36 | String summary = sentenceList.toString(); 37 | String temp = summary.substring(1, summary.length() - 1); 38 | temp = temp.replaceAll("&" + "nbsp;", ""); 39 | // unicode 空格是160 40 | temp = temp.replaceAll(String.valueOf((char) 160), ""); 41 | System.out.println("util===" + temp.trim()); 42 | // 将多个空格替换为1个空格 43 | return temp.trim().replaceAll("\\s+", " ") + "。"; 44 | } else { 45 | return ""; 46 | } 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/com/chenxb/test/TestJob.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.test; 2 | 3 | import java.io.IOException; 4 | 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.nodes.Document; 7 | import org.jsoup.nodes.Element; 8 | import org.jsoup.select.Elements; 9 | import org.quartz.Job; 10 | import org.quartz.JobExecutionContext; 11 | import org.quartz.JobExecutionException; 12 | 13 | import com.chenxb.biz.ArticleBiz; 14 | import com.chenxb.util.Constant; 15 | import com.chenxb.util.UrlTool; 16 | 17 | public class TestJob implements Job { 18 | 19 | private int articleId; 20 | 21 | public TestJob(int articleId) { 22 | this.articleId = articleId; 23 | } 24 | 25 | @Override 26 | public void execute(JobExecutionContext arg0) throws JobExecutionException { 27 | 28 | System.out.println("========== articleId " + articleId + " =========="); 29 | System.out.println("TestJob running"); 30 | } 31 | 32 | public static void main(String[] args) throws IOException { 33 | // 根据后缀的数字,拼接新闻 url 34 | 
Document doc = Jsoup.connect(Constant.SEE_URL).timeout(10000).get(); 35 | // 去掉jsoup对html字符串加的"\n",方便json字符串返回 36 | doc.outputSettings().prettyPrint(false); 37 | 38 | Elements eles = doc.getElementsByClass("rotaion_list"); 39 | System.out.println(eles.get(0)); 40 | //
  • 我院刘宏伟教授团队科研成果获2015年度国家技术发明二等奖
  • 我校海外名师新加坡南洋理工大学Lin Weisi教授应邀来我校进行学术交流与访问
  • 日本东北大学陈强教授应邀到电子工程学院作报告
  • 2015省电赛暨陕西教师电赛颁奖我院获最高奖“TI”杯
  • 电院学生在全国电赛总结暨颁奖大会上被授予6项全国奖
  • 电院大二学生边畅获外研社杯全国英语演讲大赛总决赛二等奖
  • 【电院校友】专访78级校友中科院院士包为民
62 | 63 | // Element contentEle = articleEle.getElementById("article_content"); 64 | // // 处理相对路径 url,不和上面的 image url 冲突 65 | // Elements hrefs = contentEle.getElementsByTag("a"); 66 | // for (int i = 0; i < hrefs.size(); i++) { 67 | // String origin = hrefs.get(i).attr("href"); 68 | // System.out.println("origin: " + origin.length()); 69 | // String newUrl = UrlTool.dealAttachmentUrl(id, origin); 70 | // System.out.println("newUrl: " + origin); 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /src/com/chenxb/test/TestTimeAgo.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.test; 2 | 3 | import com.chenxb.util.GetTimeAgo; 4 | 5 | public class TestTimeAgo { 6 | public static void main(String[] args) { 7 | long time = System.currentTimeMillis(); 8 | System.out.println(time); 9 | 10 | System.out.println(time / 1000 < 1000000000000L); 11 | 12 | try { 13 | Thread.sleep(1000 * 280); 14 | } catch (InterruptedException e) { 15 | e.printStackTrace(); 16 | } 17 | 18 | System.out.println(GetTimeAgo.getTimeAgo(time / 1000)); 19 | 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/com/chenxb/util/ColumnType.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.util; 2 | 3 | public class ColumnType { 4 | 5 | // LATEST是选取的下面几个栏目里的最近新闻 6 | public static final int LATEST = 0;// 最新消息 7 | public static final int NOTIFIC = 1;// 校园通知 8 | public static final int BACHELOR = 2;// 本科教学 学士 9 | public static final int MASTER = 3;// 研究生 硕士 10 | public static final int ACADEMIC = 5;// 学术交流 11 | // 选取了电院新闻的部分栏目 12 | public static final int JOB = 8;// 就业招聘 13 | 14 | } 15 | -------------------------------------------------------------------------------- /src/com/chenxb/util/Constant.java: -------------------------------------------------------------------------------- 
/**
 * Turns a past timestamp into a short Chinese "time ago" phrase.
 * Created by tomchen on 2/26/16.
 */
public class GetTimeAgo {

    private static final long ONE_SECOND = 1000L;
    private static final long ONE_MINUTE = 60 * ONE_SECOND;
    private static final long ONE_HOUR = 60 * ONE_MINUTE;
    private static final long ONE_DAY = 24 * ONE_HOUR;

    // Inputs below this value are treated as seconds and scaled to milliseconds.
    private static final long SECONDS_THRESHOLD = 1000000000000L;

    /**
     * Describes how long ago {@code time} was, relative to the current system time.
     *
     * @param time timestamp; values below 1000000000000 are multiplied by 1000 first
     * @return the phrase for the elapsed time, or "未知时间" when the (scaled) timestamp
     *         is not positive or lies in the future
     */
    public static String getTimeAgo(long time) {
        long millis = (time < SECONDS_THRESHOLD) ? time * 1000 : time;

        long now = System.currentTimeMillis();
        if (millis <= 0 || millis > now) {
            return "未知时间";
        }

        long elapsed = now - millis;

        // Checked from the largest bucket down to the smallest.
        if (elapsed >= 48 * ONE_HOUR) {
            return elapsed / ONE_DAY + "天前";
        }
        if (elapsed >= 24 * ONE_HOUR) {
            return "昨天";
        }
        if (elapsed >= 90 * ONE_MINUTE) {
            return elapsed / ONE_HOUR + "小时前";
        }
        if (elapsed >= 50 * ONE_MINUTE) {
            return "1小时前";
        }
        if (elapsed >= 2 * ONE_MINUTE) {
            return elapsed / ONE_MINUTE + "分钟前";
        }
        if (elapsed >= ONE_MINUTE) {
            return "1分钟前";
        }
        return "刚刚";
    }
}
28 | if (connection.getResponseCode() == 200) { 29 | InputStream in = connection.getInputStream(); 30 | html = StreamTool.inToStringByByte(in); 31 | in.close(); 32 | } else { 33 | throw new Exception("新闻服务器返回值不为200"); 34 | } 35 | return html; 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/com/chenxb/util/ImageTool.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.util; 2 | 3 | import java.io.PrintWriter; 4 | import java.io.StringWriter; 5 | 6 | import org.apache.commons.lang3.StringUtils; 7 | 8 | import com.qiniu.common.QiniuException; 9 | import com.qiniu.storage.BucketManager; 10 | import com.qiniu.util.Auth; 11 | 12 | public class ImageTool { 13 | 14 | // 附件下载的图标,忽略 15 | private static final String IMAGE_BASE = "/uploads/image"; 16 | private static final String IMAGE_OLD__BASE = "/uploads/old"; 17 | private static final String IMAGE_OLD__NEWS = "/news/Images"; 18 | private static final String IMAGE_OLD__GRAPH = "/graph"; 19 | 20 | /** 21 | * 图片上传到七牛,和原来的 imageUrl 不相等 22 | * 否则还是返回原来的 url 23 | * @param currentPage 24 | * @param origin 25 | * @return 26 | */ 27 | public static String convertUrl(int currentPage, String origin) { 28 | // 图片资源不一定都是在 uploads 文件夹下面 29 | // 也有可能外链到其他网站的图片 30 | if (origin.startsWith(Constant.HTTP_PREFIX)) { 31 | // 以绝对路径开头,最前面是网站域名 32 | // 比如 http://see.xidian.edu.cn/uploads/image/20141120/201411**.png 33 | // http://imgtec.eetrend.com/sites/*** 34 | String imageKey = StringTool.createMD5(origin); 35 | uploadByUrl(currentPage, origin, imageKey); 36 | return imageKey; 37 | } else if (origin.contains(IMAGE_BASE)) { 38 | // 相对路径,比如/uploads/image/20141120/20141120**.jpg 39 | // /Public/kindeditor/php/../../../uploads/image/20151116/20151116114927_39484.jpg 40 | // 把图片上传给七牛 41 | // if 的先后顺序,先判断是否是全路径,再判断是不是相对路径 42 | String wholeURl = Constant.SEE_URL + origin; 43 | 44 | String imageKey = StringTool.createMD5(origin); 45 | 
46 | uploadByUrl(currentPage, wholeURl, imageKey); 47 | return imageKey; 48 | } else if (origin.startsWith(IMAGE_OLD__BASE)) { 49 | // 老图片路径 /uploads/old 50 | String wholeURl = Constant.SEE_URL + origin; 51 | 52 | String imageKey = StringTool.createMD5(origin); 53 | 54 | uploadByUrl(currentPage, wholeURl, imageKey); 55 | return imageKey; 56 | 57 | } else if (StringUtils.startsWithAny(origin, IMAGE_OLD__NEWS, IMAGE_OLD__GRAPH)) { 58 | // 资源已被删除 返回原地址 59 | // /news/Images/2006060515215782024.jpg 60 | // /graph/jpg.gif 61 | return Constant.SEE_URL + origin; 62 | } else { 63 | // 这部分 todo,识别其他格式的图片 64 | // 或者试图访问这个图片,但失败了,则不是完整的 url 65 | StringBuilder builder = new StringBuilder(); 66 | builder.append("

ImageTool.convertUrl() 无法解析图片

"); 67 | builder.append("

图片 src = " + origin + "

"); 68 | MailTool.sendException(builder.toString(), currentPage, MailTool.IMAGE_UNUSUAL); 69 | return origin; 70 | } 71 | 72 | } 73 | 74 | /** imageKey 为输入参数 75 | * 76 | * @param currentPage 77 | * @param origin 78 | * @param imageKey 79 | * @return 80 | */ 81 | public static String convertUrl(int currentPage, String origin, String imageKey) { 82 | // 图片资源不一定都是在 uploads 文件夹下面 83 | // 也有可能外链到其他网站的图片 84 | if (origin.startsWith(Constant.HTTP_PREFIX)) { 85 | // 以绝对路径开头,最前面是网站域名 86 | // 比如 http://see.xidian.edu.cn/uploads/image/20141120/201411**.png 87 | // http://imgtec.eetrend.com/sites/*** 88 | uploadByUrl(currentPage, origin, imageKey); 89 | return imageKey; 90 | } else if (origin.startsWith(IMAGE_BASE)) { 91 | // 相对路径,比如/uploads/image/20141120/20141120**.jpg 92 | // /Public/kindeditor/php/../../../uploads/image/20151116/20151116114927_39484.jpg 93 | // 把图片上传给七牛 94 | // if 的先后顺序,先判断是否是全路径,再判断是不是相对路径 95 | String wholeURl = Constant.SEE_URL + origin; 96 | 97 | uploadByUrl(currentPage, wholeURl, imageKey); 98 | return imageKey; 99 | } else if (origin.startsWith(IMAGE_OLD__BASE)) { 100 | // 老图片路径 /uploads/old 101 | String wholeURl = Constant.SEE_URL + origin; 102 | uploadByUrl(currentPage, wholeURl, imageKey); 103 | return imageKey; 104 | } else { 105 | // 这部分 todo,识别其他格式的图片 106 | // 或者试图访问这个图片,但失败了,则不是完整的 url 107 | StringBuilder builder = new StringBuilder(); 108 | builder.append("

ImageTool.convertUrl() 无法解析图片

"); 109 | builder.append("

图片 url = " + origin + "

"); 110 | MailTool.sendException(builder.toString(), currentPage, MailTool.IMAGE_UNUSUAL); 111 | return origin; 112 | } 113 | 114 | } 115 | 116 | /** 117 | * 118 | * @param url 119 | * 给定图片的 url 120 | * @return 将图片上传至七牛,返回七牛上图片的 url 121 | * @throws QiniuException 122 | */ 123 | private static void uploadByUrl(int currentPage, String originalUrl, String key) { 124 | FetchRunnable f = new FetchRunnable(currentPage, originalUrl, key); 125 | new Thread(f).start(); 126 | } 127 | 128 | } 129 | 130 | /** 131 | * 图片上传使用多线程 132 | * 有问题?上传失败如何回滚? 133 | * 失败的概率很小,暂时不考虑 134 | * 或者失败了发邮件通知 135 | * @author tomchen 136 | * 137 | */ 138 | class FetchRunnable implements Runnable { 139 | private static final String ACCESS_KEY = "**-*********"; // 你的access_key 140 | private static final String SECRET_KEY = "**-*********"; // 你的secret_key 141 | private static final String BUCKET_NAME = "*****"; // 你的secret_key 142 | 143 | private int currentPage; 144 | private String url; 145 | private String key; 146 | 147 | public FetchRunnable(int currentPage, String url, String key) { 148 | this.currentPage = currentPage; 149 | this.url = url; 150 | this.key = key; 151 | } 152 | 153 | @Override 154 | public void run() { 155 | // 获取到 Access Key 和 Secret Key 之后,您可以按照如下方式进行密钥配置 156 | Auth auth = Auth.create(ACCESS_KEY, SECRET_KEY); 157 | // 获取空间管理器 158 | BucketManager bucketManager = new BucketManager(auth); 159 | try { 160 | // 要求url可公网正常访问BucketManager.fetch(url, bucketName, key); 161 | // @param url 网络上一个资源文件的URL 162 | // @param bucketName 空间名称 163 | // @param key 空间内文件的key[唯一的] 164 | bucketManager.fetch(url, BUCKET_NAME, key); 165 | } catch (QiniuException e) { 166 | // 处理已知的部分资源不存在 167 | if (StringUtils.endsWithAny(url, Constant.DOC_JPG_SUFFIX, Constant.XLS_JPG_SUFFIX, Constant.RAR_JPG_SUFFIX, 168 | Constant.ZIP_JPG_SUFFIX)) { 169 | // 已经手工上传了这几种图标 170 | return; 171 | } 172 | 173 | StringWriter errors = new StringWriter(); 174 | e.printStackTrace(new PrintWriter(errors)); 175 | 176 | 
StringBuilder builder = new StringBuilder(errors.toString()); 177 | builder.append("

ImageTool.uploadByUrl(url, key)发生异常!

"); 178 | builder.append("

url = " + url + "

"); 179 | builder.append("

key = " + key + "

"); 180 | MailTool.sendException(builder.toString(), currentPage, MailTool.ARTICLE_ITEM_BIZ); 181 | 182 | // to do 失败了邮件通知 183 | } 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /src/com/chenxb/util/JobScheduler.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.util; 2 | 3 | import java.util.LinkedList; 4 | import java.util.List; 5 | import java.util.Random; 6 | 7 | import com.chenxb.biz.ArticleBiz; 8 | import com.chenxb.biz.ColumnBiz; 9 | 10 | public class JobScheduler { 11 | 12 | public static void main(String[] args) throws Exception { 13 | int[] ids = ColumnBiz.parseColumn(2, 3); 14 | List datas = new LinkedList(); 15 | for (int i = 0; i < ids.length; i++) { 16 | datas.add(ids[i]); 17 | } 18 | 19 | while (datas.size() > 0) { 20 | Random r = new Random(); 21 | int id = datas.remove(0); 22 | System.out.println(ArticleBiz.parseNewsItem(id)); 23 | System.out.println("deal id = " + id); 24 | Thread.sleep(100 * 1000 + r.nextInt(50 * 1000) + r.nextInt(20 * 1000)); 25 | 26 | } 27 | 28 | // JobDetail job = 29 | // JobBuilder.newJob(TestJob.class).withIdentity("ttt").build(); 30 | // Trigger trigger = TriggerBuilder.newTrigger() 31 | // .withSchedule(SimpleScheduleBuilder.simpleSchedule() 32 | // .withIntervalInSeconds(30).repeatForever()).build(); 33 | // 34 | // SchedulerFactory factory = new StdSchedulerFactory(); 35 | // 36 | // Scheduler scheduler = factory.getScheduler(); 37 | // scheduler.start(); 38 | // scheduler.scheduleJob(job, trigger); 39 | 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/com/chenxb/util/MailTool.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.util; 2 | 3 | import java.security.GeneralSecurityException; 4 | import java.util.Properties; 5 | 6 | import javax.mail.Address; 7 | import javax.mail.Message; 8 | 
import javax.mail.MessagingException; 9 | import javax.mail.Session; 10 | import javax.mail.Transport; 11 | import javax.mail.internet.InternetAddress; 12 | import javax.mail.internet.MimeMessage; 13 | 14 | import com.sun.mail.util.MailSSLSocketFactory; 15 | 16 | public class MailTool { 17 | /** 18 | * 将爬虫抛出异常的url、堆栈信息发送邮件 19 | * 20 | * @param content 21 | * 邮件类型 22 | * @param type 23 | * 爬虫错误代码 24 | * 25 | * @return 邮件是否发送成功 26 | */ 27 | 28 | // 图片异常,不是以 /uploads 开头 29 | public static final int IMAGE_UNUSUAL = 0; 30 | public static final int ARTICLE_ITEM_BIZ = 1; 31 | public static final int HREF_UNUSUAL = 2; 32 | 33 | public static boolean sendException(String content, int currentPage, int type) { 34 | // 配置信息支持从文件读取 props.load(InputStream inStream); 35 | Properties props = new Properties(); 36 | 37 | // 调试的时候需开启debug调试 38 | props.setProperty("mail.debug", "false"); 39 | // 发送服务器需要身份验证 40 | props.setProperty("mail.smtp.auth", "true"); 41 | // 设置邮件服务器主机名 42 | props.setProperty("mail.host", "smtp.qq.com"); 43 | // 发送邮件协议名称 44 | props.setProperty("mail.transport.protocol", "smtp"); 45 | 46 | MailSSLSocketFactory sf; 47 | try { 48 | sf = new MailSSLSocketFactory(); 49 | sf.setTrustAllHosts(true); 50 | props.put("mail.smtp.ssl.enable", "true"); 51 | props.put("mail.smtp.ssl.socketFactory", sf); 52 | } catch (GeneralSecurityException e) { 53 | e.printStackTrace(); 54 | return false; 55 | } 56 | 57 | // 根据配置文件生成一个 session 对象 58 | Session session = Session.getInstance(props); 59 | 60 | // 发件人邮箱用户名、密码,连接到邮件服务器, 61 | Transport transport; 62 | try { 63 | transport = session.getTransport(); 64 | transport.connect("smtp.qq.com", "905073281@qq.com", "*********"); 65 | 66 | } catch (Exception e) { 67 | e.printStackTrace(); 68 | return false; 69 | } 70 | 71 | // 创建邮件 72 | Message msg = new MimeMessage(session); 73 | // 邮件主题,也就是标题 74 | try { 75 | msg.setSubject("seenews 错误 type " + type); 76 | // 邮件内容,支持 html 格式 77 | StringBuilder builder = new StringBuilder(content); 78 | 
builder.append("

新闻页面 url = " + Constant.ARTICLE_BASE_URL + currentPage + ".html" + "

"); 79 | builder.append("

异常发生时间 " + TimeTool.getCurrentTime() + "

"); 80 | // /* 设置Content 浏览器解析编码和格式等 */ 81 | msg.setContent(builder.toString(), "text/html;charset=utf-8"); 82 | // 设置发件人的邮箱 83 | msg.setFrom(new InternetAddress("905073281@qq.com")); 84 | 85 | // 给收件人的地址发送上面的 Message 86 | transport.sendMessage(msg, new Address[] { new InternetAddress("studychen@foxmail.com") }); 87 | transport.close(); 88 | } catch (MessagingException e) { 89 | e.printStackTrace(); 90 | return false; 91 | } 92 | 93 | // 无异常抛出,表示发送成功 94 | return true; 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/com/chenxb/util/MysqlTool.java: -------------------------------------------------------------------------------- 1 | package com.chenxb.util; 2 | 3 | import java.sql.Connection; 4 | import java.sql.DriverManager; 5 | import java.sql.SQLException; 6 | 7 | import com.sina.sae.util.SaeUserInfo; 8 | 9 | public class MysqlTool { 10 | 11 | // 是否使用新浪云 12 | private static boolean isCloud = false; 13 | 14 | public MysqlTool() { 15 | // JDBC驱动程序 16 | try { 17 | Class.forName("com.mysql.jdbc.Driver").newInstance(); 18 | } catch (Exception e) { 19 | e.printStackTrace(); 20 | } 21 | } 22 | 23 | // 为了方便分析错误,将异常全部抛出到最顶层 24 | public Connection getConnection() throws Exception { 25 | 26 | // 后面unicode和utf8设置防止中文乱码 27 | String url = "jdbc:mysql://127.0.0.1:3306/see_news?useSSL=false&useUnicode=true&characterEncoding=utf-8"; 28 | String name = "root"; 29 | String password = "chenxb123"; 30 | 31 | if (isCloud) { 32 | String appName = SaeUserInfo.getAppName(); 33 | String mysqlName = "app_" + appName; 34 | url = "jdbc:mysql://w.rdc.sae.sina.com.cn:3307/" + mysqlName + "?autoReconnect=true"; 35 | name = SaeUserInfo.getAccessKey(); 36 | password = SaeUserInfo.getSecretKey(); 37 | } 38 | Connection con = DriverManager.getConnection(url, name, password); 39 | 40 | return con; 41 | 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/com/chenxb/util/StreamTool.java: 
/**
 * Helpers that drain an {@link InputStream} into a {@link String}.
 * Neither method closes the stream it is given; that stays the caller's job.
 */
public class StreamTool {

    /**
     * Reads the stream to its end and decodes the bytes as UTF-8.
     * <p>
     * All bytes are collected first and decoded in one step. Decoding each
     * 1024-byte chunk on its own would corrupt any multi-byte character
     * (for example a 3-byte Chinese character) that straddles two chunks.
     *
     * @param in
     *            the stream to read; must not be null
     * @return the decoded text, empty if the stream has no data
     * @throws Exception
     *             if reading from the stream fails
     */
    public static String inToStringByByte(InputStream in) throws Exception {
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024];
        int len;
        while ((len = in.read(buffer)) != -1) {
            collected.write(buffer, 0, len);
        }
        // Decode once, after every byte has arrived
        return collected.toString("UTF-8");
    }

    /**
     * Reads the stream line by line as UTF-8 text and concatenates the lines.
     * <p>
     * Line terminators are not kept: {@code "a\nb"} comes back as {@code "ab"}.
     *
     * @param in
     *            the stream to read; must not be null
     * @return all lines joined without separators, empty if the stream has no data
     * @throws Exception
     *             if reading from the stream fails
     */
    public static String inToStringByReader(InputStream in) throws Exception {
        // Explicit charset so the result does not depend on the platform default
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        StringBuilder content = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            content.append(line);
        }
        return content.toString();
    }

}
/**
 * Time helpers: a formatted "now" timestamp and a randomized pause used to
 * keep the crawler from loading the target site too heavily.
 */
public class TimeTool {
    // Date, 24-hour time, then the day-of-week name
    private static final String TIME_PATTERN = "yyyy-MM-dd HH:mm:ss E";

    private static final Random random = new Random();
    private static final int WAIT_TIME = 60 * 1000;// milliseconds

    /**
     * Formats the current date and time with the pattern
     * {@code yyyy-MM-dd HH:mm:ss E}, using the default locale and time zone.
     * <p>
     * Safe to call from several threads at once.
     *
     * @return the current date and time as text
     */
    public static String getCurrentTime() {
        // SimpleDateFormat keeps mutable state and is not thread-safe, so a
        // shared static instance can produce corrupted output under
        // concurrent use; a fresh formatter per call avoids that.
        return new SimpleDateFormat(TIME_PATTERN).format(new Date());
    }

    /**
     * Blocks the calling thread for a random time of at least
     * {@code WAIT_TIME} and less than twice {@code WAIT_TIME} milliseconds,
     * so that the crawled site is not put under too much load.
     * <p>
     * If the thread is interrupted the method returns early and leaves the
     * thread's interrupt flag set, so callers can notice the request to stop.
     */
    public static void sleepSomeTime() {
        try {
            Thread.sleep(random.nextInt(WAIT_TIME) + WAIT_TIME);
        } catch (InterruptedException e) {
            // Thread.sleep cleared the flag when it threw; put it back
            Thread.currentThread().interrupt();
        }
    }

}
    /**
     * Normalizes a link (href/src value) taken from an article body so that it
     * can be used outside the original site, e.g.
     * /uploads/file/20150706/20150706094631_73253.doc
     * <p>
     * The trimmed value is checked against the cases below in this order, and
     * the first match wins:
     * <ol>
     * <li>starts with "/uploads", "/news/Upload" or "/news/Images": site-relative
     * path, {@code Constant.SEE_URL} is prepended;</li>
     * <li>starts with one of {@code Constant.HTTP_PREFIX}, {@code HTTPS_PREFIX},
     * {@code FTP_PREFIX}, {@code JS_PREFIX}: returned unchanged;</li>
     * <li>empty: returned unchanged;</li>
     * <li>starts with {@code Constant.MAILTO_PREFIX}: returned unchanged;</li>
     * <li>matches {@code VALID_EMAIL_REGEX}: {@code Constant.MAILTO_PREFIX} is prepended;</li>
     * <li>starts with {@code Constant.WWW_PREFIX}: {@code Constant.HTTP_PREFIX} is prepended;</li>
     * <li>equals {@code Constant.WEBSITE_NAME}: {@code Constant.SEE_URL} is returned;</li>
     * <li>anything else: a notification is sent through
     * {@code MailTool.sendException} and the value is returned unchanged.</li>
     * </ol>
     * NOTE(review): the original comment also said that a tag used here does not
     * conflict with another one; the tag names are missing from this copy of the
     * file. The two message literals in the last branch likewise appear to have
     * lost their markup -- confirm against the real source.
     *
     * @param currentPage
     *            only forwarded to {@code MailTool.sendException} when the link
     *            cannot be classified
     * @param origin
     *            the raw link text; must not be null ({@code trim()} is called on it)
     * @return the normalized link, or the trimmed input when no rule changes it
     */
    public static String dealAttachmentUrl(int currentPage, String origin) {
        // Strip leading and trailing whitespace
        String originTrim = origin.trim();
        // Attachments are not always under the uploads folder;
        // they may also be images linked from elsewhere, e.g. /uploads/image/20141120/20141120**.jpg
        // or /news/Upload/2006051811250740787.xls
        if (StringUtils.startsWithAny(originTrim, "/uploads", "/news/Upload","/news/Images")) {
            // Site-relative path: make it absolute
            return Constant.SEE_URL + originTrim;
        } else if (StringUtils.startsWithAny(originTrim, Constant.HTTP_PREFIX, Constant.HTTPS_PREFIX,
                Constant.FTP_PREFIX, Constant.JS_PREFIX)) {
            // Already starts with a scheme (http, https, ftp, ...)
            // Order matters: the scheme check comes before the "www" check below
            if (Constant.DEBUG) {
                System.out.println("in dealAttachmentUrl 全路径");
            }
            return originTrim;
        } else if (originTrim.length() == 0 || originTrim.equals("")) {
            // Invalid tag: the href obtained was ""
            return originTrim;
        } else if (originTrim.startsWith(Constant.MAILTO_PREFIX)) {
            return originTrim;
        } else if (VALID_EMAIL_REGEX.matcher(originTrim).find()) {
            // Bare e-mail address: prepend the mailto prefix
            // (must come after the mailto check above)
            return Constant.MAILTO_PREFIX + originTrim;
        } else if (originTrim.startsWith(Constant.WWW_PREFIX)) {
            // Starts with www: prepend the http prefix
            return Constant.HTTP_PREFIX + originTrim;
        } else if (originTrim.equals(Constant.WEBSITE_NAME)) {
            return Constant.SEE_URL;
        } else {
            // Link form was not anticipated: report it by e-mail
            StringBuilder builder = new StringBuilder();
            builder.append("

UrlTool.dealAttachmentUrl() 无法解析url

");
            builder.append("

异常 href = " + originTrim + "

");
            MailTool.sendException(builder.toString(), currentPage, MailTool.HREF_UNUSUAL);
            return originTrim;
        }
    }