├── .gitignore
├── .travis.yml
├── LICENSE.txt
├── README.md
├── mvnw
├── mvnw.cmd
├── pom.xml
└── src
    └── main
        ├── java
            └── org
            │   └── seo
            │       └── rank
            │           ├── CopyChecker.java
            │           ├── Ranker.java
            │           ├── SimilarChecker.java
            │           ├── api
            │               ├── GetArticle.java
            │               ├── GetListRank.java
            │               └── GetRank.java
            │           ├── impl
            │               ├── BaiduCopyChecker.java
            │               ├── BaiduRanker.java
            │               ├── GenericWebPageSimilarChecker.java
            │               ├── ITEYEBlogSimilarChecker.java
            │               └── WordBasedGenericWebPageSimilarChecker.java
            │           ├── list
            │               ├── Parser.java
            │               ├── UrlTools.java
            │               └── impl
            │               │   └── DefaultParser.java
            │           ├── model
            │               ├── Article.java
            │               └── Rank.java
            │           └── tools
            │               ├── DynamicIp.java
            │               ├── ProxyIp.java
            │               └── VoteRanker.java
        ├── resources
            ├── logback.xml
            └── proxy_ips_excellent.txt
        └── webapp
            ├── META-INF
                └── context.xml
            ├── WEB-INF
                └── web.xml
            ├── index.jsp
            ├── rank.jsp
            └── ranks.jsp


/.gitignore:
--------------------------------------------------------------------------------
 1 | .settings/
 2 | .classpath
 3 | .project
 4 | target/
 5 | logs/
 6 | data/
 7 | .idea/
 8 | Rank.iml
 9 | Rank.ipr
10 | Rank.iws
11 | .gradle/
12 | build/
13 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: java
2 | 
3 | jdk:
4 |   - oraclejdk8
5 | 
6 | install:
7 |   - mvn -N io.takari:maven:wrapper


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ### rank是一个seo工具，用于分析网站的搜索引擎收录排名。
 2 | 
 3 | ### [捐赠致谢](https://github.com/ysc/QuestionAnsweringSystem/wiki/donation)
 4 | 
 5 | ### 1、指定一个栏目入口页面，配置标题的CSS路径，配置下一页的CSS路径，配置下一页的标签文本，返回所有的文章标题和URL列表
 6 | ### 2、指定一个URL和关键词，返回使用关键词在搜索引擎中搜索的结果中URL的排名
 7 | 
 8 | ### [一种通用的网页相似度检测算法](http://my.oschina.net/apdplat/blog/398361)
 9 | ### [一种防止用户生成内容站点出现商业广告以及非法有害等垃圾信息的方法](http://my.oschina.net/apdplat/blog/398338)
10 | ### [计算OSCHINA博文在百度的收录与排名情况](http://my.oschina.net/apdplat/blog/395810)
11 | ### [计算ITEYE博文在百度的收录与排名情况](http://my.oschina.net/apdplat/blog/395970)
12 | ### [OSCHINA博文抄袭检查](http://my.oschina.net/apdplat/blog/396414)
13 | ### [ITEYE博文抄袭检查](http://my.oschina.net/apdplat/blog/396411)
14 | ### [我的ITEYE和OSCHINA博客的异同](http://my.oschina.net/apdplat/blog/395494)
15 | 
16 | [https://travis-ci.org/ysc/rank](https://travis-ci.org/ysc/rank)
17 | 


--------------------------------------------------------------------------------
/mvnw:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | # ----------------------------------------------------------------------------
  3 | # Licensed to the Apache Software Foundation (ASF) under one
  4 | # or more contributor license agreements.  See the NOTICE file
  5 | # distributed with this work for additional information
  6 | # regarding copyright ownership.  The ASF licenses this file
  7 | # to you under the Apache License, Version 2.0 (the
  8 | # "License"); you may not use this file except in compliance
  9 | # with the License.  You may obtain a copy of the License at
 10 | #
 11 | #    http://www.apache.org/licenses/LICENSE-2.0
 12 | #
 13 | # Unless required by applicable law or agreed to in writing,
 14 | # software distributed under the License is distributed on an
 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 16 | # KIND, either express or implied.  See the License for the
 17 | # specific language governing permissions and limitations
 18 | # under the License.
 19 | # ----------------------------------------------------------------------------
 20 | 
 21 | # ----------------------------------------------------------------------------
 22 | # Maven2 Start Up Batch script
 23 | #
 24 | # Required ENV vars:
 25 | # ------------------
 26 | #   JAVA_HOME - location of a JDK home dir
 27 | #
 28 | # Optional ENV vars
 29 | # -----------------
 30 | #   M2_HOME - location of maven2's installed home dir
 31 | #   MAVEN_OPTS - parameters passed to the Java VM when running Maven
 32 | #     e.g. to debug Maven itself, use
 33 | #       set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
 34 | #   MAVEN_SKIP_RC - flag to disable loading of mavenrc files
 35 | # ----------------------------------------------------------------------------
 36 | 
 37 | if [ -z "$MAVEN_SKIP_RC" ] ; then
 38 | 
 39 |   if [ -f /etc/mavenrc ] ; then
 40 |     . /etc/mavenrc
 41 |   fi
 42 | 
 43 |   if [ -f "$HOME/.mavenrc" ] ; then
 44 |     . "$HOME/.mavenrc"
 45 |   fi
 46 | 
 47 | fi
 48 | 
 49 | # OS specific support.  $var _must_ be set to either true or false.
 50 | cygwin=false;
 51 | darwin=false;
 52 | mingw=false
 53 | case "`uname`" in
 54 |   CYGWIN*) cygwin=true ;;
 55 |   MINGW*) mingw=true;;
 56 |   Darwin*) darwin=true
 57 |            #
 58 |            # Look for the Apple JDKs first to preserve the existing behaviour, and then look
 59 |            # for the new JDKs provided by Oracle.
 60 |            #
 61 |            if [ -z "$JAVA_HOME" ] && [ -L /System/Library/Frameworks/JavaVM.framework/Versions/CurrentJDK ] ; then
 62 |              #
 63 |              # Apple JDKs
 64 |              #
 65 |              export JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/CurrentJDK/Home
 66 |            fi
 67 | 
 68 |            if [ -z "$JAVA_HOME" ] && [ -L /System/Library/Java/JavaVirtualMachines/CurrentJDK ] ; then
 69 |              #
 70 |              # Apple JDKs
 71 |              #
 72 |              export JAVA_HOME=/System/Library/Java/JavaVirtualMachines/CurrentJDK/Contents/Home
 73 |            fi
 74 | 
 75 |            if [ -z "$JAVA_HOME" ] && [ -L "/Library/Java/JavaVirtualMachines/CurrentJDK" ] ; then
 76 |              #
 77 |              # Oracle JDKs
 78 |              #
 79 |              export JAVA_HOME=/Library/Java/JavaVirtualMachines/CurrentJDK/Contents/Home
 80 |            fi
 81 | 
 82 |            if [ -z "$JAVA_HOME" ] && [ -x "/usr/libexec/java_home" ]; then
 83 |              #
 84 |              # Apple JDKs
 85 |              #
 86 |              export JAVA_HOME=`/usr/libexec/java_home`
 87 |            fi
 88 |            ;;
 89 | esac
 90 | 
 91 | if [ -z "$JAVA_HOME" ] ; then
 92 |   if [ -r /etc/gentoo-release ] ; then
 93 |     JAVA_HOME=`java-config --jre-home`
 94 |   fi
 95 | fi
 96 | 
 97 | if [ -z "$M2_HOME" ] ; then
 98 |   ## resolve links - $0 may be a link to maven's home
 99 |   PRG="$0"
100 | 
101 |   # need this for relative symlinks
102 |   while [ -h "$PRG" ] ; do
103 |     ls=`ls -ld "$PRG"`
104 |     link=`expr "$ls" : '.*-> \(.*\)$'`
105 |     if expr "$link" : '/.*' > /dev/null; then
106 |       PRG="$link"
107 |     else
108 |       PRG="`dirname "$PRG"`/$link"
109 |     fi
110 |   done
111 | 
112 |   saveddir=`pwd`
113 | 
114 |   M2_HOME=`dirname "$PRG"`/..
115 | 
116 |   # make it fully qualified
117 |   M2_HOME=`cd "$M2_HOME" && pwd`
118 | 
119 |   cd "$saveddir"
120 |   # echo Using m2 at $M2_HOME
121 | fi
122 | 
123 | # For Cygwin, ensure paths are in UNIX format before anything is touched
124 | if $cygwin ; then
125 |   [ -n "$M2_HOME" ] &&
126 |     M2_HOME=`cygpath --unix "$M2_HOME"`
127 |   [ -n "$JAVA_HOME" ] &&
128 |     JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
129 |   [ -n "$CLASSPATH" ] &&
130 |     CLASSPATH=`cygpath --path --unix "$CLASSPATH"`
131 | fi
132 | 
133 | # For Mingw, ensure paths are in UNIX format before anything is touched
134 | if $mingw ; then
135 |   [ -n "$M2_HOME" ] &&
136 |     M2_HOME="`(cd "$M2_HOME"; pwd)`"
137 |   [ -n "$JAVA_HOME" ] &&
138 |     JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`"
139 |   # TODO classpath?
140 | fi
141 | 
142 | if [ -z "$JAVA_HOME" ]; then
143 |   javaExecutable="`which javac`"
144 |   if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then
145 |     # readlink(1) is not available as standard on Solaris 10.
146 |     readLink=`which readlink`
147 |     if [ ! `expr "$readLink" : '\([^ ]*\)'` = "no" ]; then
148 |       if $darwin ; then
149 |         javaHome="`dirname \"$javaExecutable\"`"
150 |         javaExecutable="`cd \"$javaHome\" && pwd -P`/javac"
151 |       else
152 |         javaExecutable="`readlink -f \"$javaExecutable\"`"
153 |       fi
154 |       javaHome="`dirname \"$javaExecutable\"`"
155 |       javaHome=`expr "$javaHome" : '\(.*\)/bin'`
156 |       JAVA_HOME="$javaHome"
157 |       export JAVA_HOME
158 |     fi
159 |   fi
160 | fi
161 | 
162 | if [ -z "$JAVACMD" ] ; then
163 |   if [ -n "$JAVA_HOME"  ] ; then
164 |     if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
165 |       # IBM's JDK on AIX uses strange locations for the executables
166 |       JAVACMD="$JAVA_HOME/jre/sh/java"
167 |     else
168 |       JAVACMD="$JAVA_HOME/bin/java"
169 |     fi
170 |   else
171 |     JAVACMD="`which java`"
172 |   fi
173 | fi
174 | 
175 | if [ ! -x "$JAVACMD" ] ; then
176 |   echo "Error: JAVA_HOME is not defined correctly." >&2
177 |   echo "  We cannot execute $JAVACMD" >&2
178 |   exit 1
179 | fi
180 | 
181 | if [ -z "$JAVA_HOME" ] ; then
182 |   echo "Warning: JAVA_HOME environment variable is not set."
183 | fi
184 | 
185 | CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher
186 | 
187 | # For Cygwin, switch paths to Windows format before running java
188 | if $cygwin; then
189 |   [ -n "$M2_HOME" ] &&
190 |     M2_HOME=`cygpath --path --windows "$M2_HOME"`
191 |   [ -n "$JAVA_HOME" ] &&
192 |     JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"`
193 |   [ -n "$CLASSPATH" ] &&
194 |     CLASSPATH=`cygpath --path --windows "$CLASSPATH"`
195 | fi
196 | 
197 | # Walks from the current working directory up towards the filesystem root and
198 | # prints the first directory that has a .mvn subdirectory (else the working directory).
199 | find_maven_basedir() {
200 |   local basedir wdir   # declared separately: some sh word-split `local v=$(cmd)`
201 |   basedir="$(pwd)"; wdir="$basedir"
202 |   while [ "$wdir" != '/' ] ; do
203 |     if [ -d "$wdir"/.mvn ] ; then
204 |       basedir=$wdir
205 |       break
206 |     fi
207 |     wdir=$(cd "$wdir/.." && pwd) || break   # stop instead of looping if the parent is unreachable
208 |   done
209 |   echo "${basedir}"
210 | }
211 | 
212 | # concatenates all lines of a file
213 | concat_lines() {
214 |   if [ -f "$1" ]; then
215 |     echo "$(tr -s '\n' ' ' < "$1")"
216 |   fi
217 | }
218 | 
219 | export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-$(find_maven_basedir)}
220 | MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS"
221 | 
222 | # Provide a "standardized" way to retrieve the CLI args that will
223 | # work with both Windows and non-Windows executions.
224 | MAVEN_CMD_LINE_ARGS="$MAVEN_CONFIG $@"
225 | export MAVEN_CMD_LINE_ARGS
226 | 
227 | WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain
228 | 
229 | exec "$JAVACMD" \
230 |   $MAVEN_OPTS \
231 |   -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \
232 |   "-Dmaven.home=${M2_HOME}" "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \
233 |   ${WRAPPER_LAUNCHER} "$@"
234 | 


--------------------------------------------------------------------------------
/mvnw.cmd:
--------------------------------------------------------------------------------
  1 | @REM ----------------------------------------------------------------------------
  2 | @REM Licensed to the Apache Software Foundation (ASF) under one
  3 | @REM or more contributor license agreements.  See the NOTICE file
  4 | @REM distributed with this work for additional information
  5 | @REM regarding copyright ownership.  The ASF licenses this file
  6 | @REM to you under the Apache License, Version 2.0 (the
  7 | @REM "License"); you may not use this file except in compliance
  8 | @REM with the License.  You may obtain a copy of the License at
  9 | @REM
 10 | @REM    http://www.apache.org/licenses/LICENSE-2.0
 11 | @REM
 12 | @REM Unless required by applicable law or agreed to in writing,
 13 | @REM software distributed under the License is distributed on an
 14 | @REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 15 | @REM KIND, either express or implied.  See the License for the
 16 | @REM specific language governing permissions and limitations
 17 | @REM under the License.
 18 | @REM ----------------------------------------------------------------------------
 19 | 
 20 | @REM ----------------------------------------------------------------------------
 21 | @REM Maven2 Start Up Batch script
 22 | @REM
 23 | @REM Required ENV vars:
 24 | @REM JAVA_HOME - location of a JDK home dir
 25 | @REM
 26 | @REM Optional ENV vars
 27 | @REM M2_HOME - location of maven2's installed home dir
 28 | @REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands
 29 | @REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a key stroke before ending
 30 | @REM MAVEN_OPTS - parameters passed to the Java VM when running Maven
 31 | @REM     e.g. to debug Maven itself, use
 32 | @REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
 33 | @REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files
 34 | @REM ----------------------------------------------------------------------------
 35 | 
 36 | @REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on'
 37 | @echo off
 38 | @REM enable echoing by setting MAVEN_BATCH_ECHO to 'on'
 39 | @if "%MAVEN_BATCH_ECHO%" == "on"  echo %MAVEN_BATCH_ECHO%
 40 | 
 41 | @REM set %HOME% to equivalent of $HOME
 42 | if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%")
 43 | 
 44 | @REM Execute a user defined script before this one
 45 | if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre
 46 | @REM check for pre script, once with legacy .bat ending and once with .cmd ending
 47 | if exist "%HOME%\mavenrc_pre.bat" call "%HOME%\mavenrc_pre.bat"
 48 | if exist "%HOME%\mavenrc_pre.cmd" call "%HOME%\mavenrc_pre.cmd"
 49 | :skipRcPre
 50 | 
 51 | @setlocal
 52 | 
 53 | set ERROR_CODE=0
 54 | 
 55 | @REM To isolate internal variables from possible post scripts, we use another setlocal
 56 | @setlocal
 57 | 
 58 | @REM ==== START VALIDATION ====
 59 | if not "%JAVA_HOME%" == "" goto OkJHome
 60 | 
 61 | echo.
 62 | echo Error: JAVA_HOME not found in your environment. >&2
 63 | echo Please set the JAVA_HOME variable in your environment to match the >&2
 64 | echo location of your Java installation. >&2
 65 | echo.
 66 | goto error
 67 | 
 68 | :OkJHome
 69 | if exist "%JAVA_HOME%\bin\java.exe" goto init
 70 | 
 71 | echo.
 72 | echo Error: JAVA_HOME is set to an invalid directory. >&2
 73 | echo JAVA_HOME = "%JAVA_HOME%" >&2
 74 | echo Please set the JAVA_HOME variable in your environment to match the >&2
 75 | echo location of your Java installation. >&2
 76 | echo.
 77 | goto error
 78 | 
 79 | @REM ==== END VALIDATION ====
 80 | 
 81 | :init
 82 | 
 83 | set MAVEN_CMD_LINE_ARGS=%*
 84 | 
 85 | @REM Find the project base dir, i.e. the directory that contains the folder ".mvn".
 86 | @REM Fallback to current working directory if not found.
 87 | 
 88 | set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR%
 89 | IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir
 90 | 
 91 | set EXEC_DIR=%CD%
 92 | set WDIR=%EXEC_DIR%
 93 | :findBaseDir
 94 | IF EXIST "%WDIR%"\.mvn goto baseDirFound
 95 | cd ..
 96 | IF "%WDIR%"=="%CD%" goto baseDirNotFound
 97 | set WDIR=%CD%
 98 | goto findBaseDir
 99 | 
100 | :baseDirFound
101 | set MAVEN_PROJECTBASEDIR=%WDIR%
102 | cd "%EXEC_DIR%"
103 | goto endDetectBaseDir
104 | 
105 | :baseDirNotFound
106 | set MAVEN_PROJECTBASEDIR=%EXEC_DIR%
107 | cd "%EXEC_DIR%"
108 | 
109 | :endDetectBaseDir
110 | 
111 | IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig
112 | 
113 | @setlocal EnableExtensions EnableDelayedExpansion
114 | for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! %%a
115 | @endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS%
116 | 
117 | :endReadAdditionalConfig
118 | 
119 | SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe"
120 | @REM Resolve the wrapper jar against the detected project base dir rather than the current directory, so the script also works when started from a subdirectory.
121 | set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar"
122 | set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain
123 | 
124 | %MAVEN_JAVA_EXE% %JVM_CONFIG_MAVEN_PROPS% %MAVEN_OPTS% %MAVEN_DEBUG_OPTS% -classpath %WRAPPER_JAR% "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" %WRAPPER_LAUNCHER% %MAVEN_CMD_LINE_ARGS%
125 | if ERRORLEVEL 1 goto error
126 | goto end
127 | 
128 | :error
129 | set ERROR_CODE=1
130 | 
131 | :end
132 | @endlocal & set ERROR_CODE=%ERROR_CODE%
133 | 
134 | if not "%MAVEN_SKIP_RC%" == "" goto skipRcPost
135 | @REM check for post script, once with legacy .bat ending and once with .cmd ending
136 | if exist "%HOME%\mavenrc_post.bat" call "%HOME%\mavenrc_post.bat"
137 | if exist "%HOME%\mavenrc_post.cmd" call "%HOME%\mavenrc_post.cmd"
138 | :skipRcPost
139 | 
140 | @REM pause the script if MAVEN_BATCH_PAUSE is set to 'on'
141 | if "%MAVEN_BATCH_PAUSE%" == "on" pause
142 | 
143 | if "%MAVEN_TERMINATE_CMD%" == "on" exit %ERROR_CODE%
144 | 
145 | exit /B %ERROR_CODE%


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 |     <modelVersion>4.0.0</modelVersion>
  4 |     
  5 |     <groupId>org.seo</groupId>
  6 |     <artifactId>rank</artifactId>
  7 |     <version>1.0</version>
  8 |     <packaging>war</packaging>
  9 | 
 10 |     <name>rank</name>
 11 |     <url>https://github.com/ysc/rank</url>
 12 |     <description>
 13 |         rank是一个seo工具，用于分析网站的搜索引擎收录排名。
 14 |     </description>
 15 |     <organization>
 16 |         <name>APDPlat</name>
 17 |         <url>http://apdplat.org/</url>
 18 |     </organization>
 19 |     <licenses>
 20 |         <license>
 21 |             <name>GNU GENERAL PUBLIC LICENSE, Version 3</name>
 22 |             <url>http://www.gnu.org/licenses/gpl.html</url>
 23 |         </license>
 24 |     </licenses>
 25 |     <inceptionYear>2014</inceptionYear>
 26 |     <scm>
 27 |         <url>https://github.com/ysc/rank</url>
 28 |         <connection>scm:git:git://github.com/ysc/rank.git</connection>
 29 |         <developerConnection>scm:git:ssh://git@github.com/ysc/rank.git</developerConnection>
 30 |         <tag>HEAD</tag>
 31 |     </scm>
 32 |     <developers>
 33 |         <developer>
 34 |             <name>杨尚川</name>
 35 |             <email>ysc@apdplat.org</email>
 36 |             <url>http://yangshangchuan.iteye.com</url>
 37 |         </developer>
 38 |     </developers>
 39 |     
 40 |     <properties>
 41 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 42 |         <java.version>1.8</java.version>
 43 |         
 44 |         <javaee-web-api.version>6.0</javaee-web-api.version>
 45 |         <junit.version>4.11</junit.version>
 46 |         <slf4j-api.version>1.6.4</slf4j-api.version>
 47 |         <logback-classic.version>0.9.28</logback-classic.version>
 48 |         <jsoup.version>1.7.2</jsoup.version>
 49 |         <commons-httpclient.version>3.1</commons-httpclient.version>
 50 |         <commons-lang.version>2.6</commons-lang.version>
 51 |         <jackson.version>1.9.13</jackson.version>
 52 |         <htmlunit.version>2.14</htmlunit.version>
 53 |         <word.version>1.3</word.version>
 54 |         <jcl-over-slf4j.version>1.6.4</jcl-over-slf4j.version>
 55 |         
 56 |         <maven-compiler-plugin.version>3.0</maven-compiler-plugin.version>
 57 |         <maven-jar-plugin.version>2.4</maven-jar-plugin.version>
 58 |         <maven-surefire-plugin.version>2.14</maven-surefire-plugin.version>
 59 |         <maven-resources-plugin.version>2.6</maven-resources-plugin.version>
 60 |         <maven-source-plugin.version>2.2.1</maven-source-plugin.version>
 61 |         <maven-jetty-plugin.version>6.1.26</maven-jetty-plugin.version>
 62 |         <sonar-maven3-plugin.version>3.5</sonar-maven3-plugin.version>
 63 |     </properties>
 64 |        
 65 |     <build>
 66 |         <plugins>
 67 |             <!-- Compiler plugin: sets the JDK version -->
 68 |             <plugin>
 69 |                 <groupId>org.apache.maven.plugins</groupId>
 70 |                 <artifactId>maven-compiler-plugin</artifactId>
 71 |                 <version>${maven-compiler-plugin.version}</version>
 72 |                 <configuration>
 73 |                     <encoding>${project.build.sourceEncoding}</encoding>
 74 |                     <source>${java.version}</source>
 75 |                     <target>${java.version}</target>
 76 |                     <showDeprecation>true</showDeprecation>
 77 |                     <showWarnings>true</showWarnings>
 78 |                     <debug>true</debug>
 79 |                 </configuration>
 80 |             </plugin>
 81 |             <!-- Packaging plugin -->
 82 |             <plugin>
 83 |                 <groupId>org.apache.maven.plugins</groupId>
 84 |                 <artifactId>maven-jar-plugin</artifactId>
 85 |                 <version>${maven-jar-plugin.version}</version>
 86 |             </plugin>
 87 |             <!-- Unit test plugin -->
 88 |             <plugin>
 89 |                 <groupId>org.apache.maven.plugins</groupId>
 90 |                 <artifactId>maven-surefire-plugin</artifactId>
 91 |                 <version>${maven-surefire-plugin.version}</version>
 92 |                 <configuration>
 93 |                     <testFailureIgnore>true</testFailureIgnore>
 94 |                 </configuration>
 95 |             </plugin>
 96 |             <!-- Resources plugin: sets the encoding -->
 97 |             <plugin>
 98 |                 <groupId>org.apache.maven.plugins</groupId>
 99 |                 <artifactId>maven-resources-plugin</artifactId>
100 |                 <version>${maven-resources-plugin.version}</version>
101 |                 <configuration>
102 |                     <encoding>${project.build.sourceEncoding}</encoding>
103 |                 </configuration>
104 |             </plugin>
105 |             <!-- Source plugin: packages the source code -->
106 |             <plugin>
107 |                 <artifactId>maven-source-plugin</artifactId>
108 |                 <version>${maven-source-plugin.version}</version>
109 |                 <executions>
110 |                     <execution>
111 |                         <id>attach-sources</id>
112 |                         <goals>
113 |                             <goal>jar</goal>
114 |                         </goals>
115 |                     </execution>
116 |                 </executions>
117 |             </plugin>
118 |             <plugin>
119 |                 <groupId>org.mortbay.jetty</groupId>
120 |                 <artifactId>maven-jetty-plugin</artifactId>
121 |                 <version>${maven-jetty-plugin.version}</version>
122 |             </plugin>
123 |             <!-- Run mvn sonar:sonar to publish the project to the quality management platform -->
124 |             <!-- Software environment: jdk1.7.0_51 apache-maven-3.0.4 sonar-3.6 -->
125 |             <plugin>
126 |                 <groupId>org.codehaus.sonar</groupId>
127 |                 <artifactId>sonar-maven3-plugin</artifactId>
128 |                 <version>${sonar-maven3-plugin.version}</version>
129 |             </plugin>
130 |         </plugins>
131 |     </build>
132 | 
133 |     <dependencies>
134 |         <dependency>
135 |             <groupId>javax</groupId>
136 |             <artifactId>javaee-web-api</artifactId>
137 |             <version>${javaee-web-api.version}</version>
138 |             <scope>provided</scope>
139 |         </dependency>
140 |         <dependency>
141 |             <groupId>junit</groupId>
142 |             <artifactId>junit</artifactId>
143 |             <version>${junit.version}</version>
144 |             <scope>test</scope>
145 |         </dependency>
146 |         <!-- SLF4J logging framework API -->
147 |         <dependency>
148 |             <groupId>org.slf4j</groupId>
149 |             <artifactId>slf4j-api</artifactId>
150 |             <version>${slf4j-api.version}</version>
151 |         </dependency>
152 |         <!-- Logback: the logging implementation provider -->
153 |         <dependency>
154 |             <groupId>ch.qos.logback</groupId>
155 |             <artifactId>logback-classic</artifactId>
156 |             <version>${logback-classic.version}</version>
157 |             <exclusions>
158 |                 <exclusion>
159 |                     <groupId>commons-logging</groupId>
160 |                     <artifactId>commons-logging</artifactId>
161 |                 </exclusion>
162 |             </exclusions>
163 |             <scope>runtime</scope>
164 |         </dependency>
165 |         <!-- Intercepts Apache Commons Logging -->
166 |         <dependency>
167 |             <groupId>org.slf4j</groupId>
168 |             <artifactId>jcl-over-slf4j</artifactId>
169 |             <version>${jcl-over-slf4j.version}</version>
170 |         </dependency>
171 |         <dependency>
172 |             <groupId>org.jsoup</groupId>
173 |             <artifactId>jsoup</artifactId>
174 |             <version>${jsoup.version}</version>
175 |         </dependency>
176 |         <dependency>
177 |             <groupId>commons-httpclient</groupId>
178 |             <artifactId>commons-httpclient</artifactId>
179 |             <version>${commons-httpclient.version}</version>
180 |             <exclusions>
181 |                 <exclusion>
182 |                     <groupId>commons-logging</groupId>
183 |                     <artifactId>commons-logging</artifactId>
184 |                 </exclusion>
185 |             </exclusions>
186 |         </dependency>
187 |         <dependency>
188 |             <groupId>commons-lang</groupId>
189 |             <artifactId>commons-lang</artifactId>
190 |             <version>${commons-lang.version}</version>
191 |         </dependency>
192 |         <dependency>
193 |             <groupId>org.codehaus.jackson</groupId>
194 |             <artifactId>jackson-mapper-asl</artifactId>
195 |             <version>${jackson.version}</version>
196 |         </dependency>
197 |         <dependency>
198 |             <groupId>net.sourceforge.htmlunit</groupId>
199 |             <artifactId>htmlunit</artifactId>
200 |             <version>${htmlunit.version}</version>
201 |             <exclusions>
202 |                 <exclusion>
203 |                     <groupId>commons-logging</groupId>
204 |                     <artifactId>commons-logging</artifactId>
205 |                 </exclusion>
206 |             </exclusions>
207 |         </dependency>
208 |         <!-- Distributed Chinese word segmentation component -->
209 |         <dependency>
210 |             <groupId>org.apdplat</groupId>
211 |             <artifactId>word</artifactId>
212 |             <version>${word.version}</version>
213 |         </dependency>
214 |     </dependencies>
215 | </project>


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/CopyChecker.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  *
 3 |  * APDPlat - Application Product Development Platform
 4 |  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
 5 |  *
 6 |  * This program is free software: you can redistribute it and/or modify
 7 |  * it under the terms of the GNU General Public License as published by
 8 |  * the Free Software Foundation, either version 3 of the License, or
 9 |  * (at your option) any later version.
10 |  *
11 |  * This program is distributed in the hope that it will be useful,
12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 |  * GNU General Public License for more details.
15 |  *
16 |  * You should have received a copy of the GNU General Public License
17 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 |  *
19 |  */
20 | 
21 | package org.seo.rank;
22 | 
23 | import org.seo.rank.model.Article;
24 | 
25 | import java.util.List;
26 | import java.util.Map;
27 | import java.util.Set;
28 | 
29 | /**
30 |  * Plagiarism check for articles.
31 |  * Example: after writing an article such as "Building your own personalized
32 |  * search engine with Java 8", the author wants to know which websites have
33 |  * republished it; a search engine can be queried to find out.
34 |  * @author 杨尚川
35 |  */
36 | public interface CopyChecker {
37 |     /**
38 |      * Looks for copies of the given articles.
39 |      * @param titles the articles to check
40 |      * @return for each article, the set of URLs of the articles that copied it
41 |      */
42 |     Map<Article, Set<String>> check(List<Article> titles);
43 | }
44 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/Ranker.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * 
 3 |  * APDPlat - Application Product Development Platform
 4 |  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
 5 |  * 
 6 |  * This program is free software: you can redistribute it and/or modify
 7 |  * it under the terms of the GNU General Public License as published by
 8 |  * the Free Software Foundation, either version 3 of the License, or
 9 |  * (at your option) any later version.
10 |  * 
11 |  * This program is distributed in the hope that it will be useful,
12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 |  * GNU General Public License for more details.
15 |  * 
16 |  * You should have received a copy of the GNU General Public License
17 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 |  * 
19 |  */
20 | 
21 | package org.seo.rank;
22 | 
23 | import java.util.List;
24 | import org.seo.rank.model.Rank;
25 | 
26 | /**
27 |  * Checks how web pages rank in a search engine and whether they are indexed.
28 |  * @author 杨尚川
29 |  */
30 | public interface Ranker {
31 |     void rank(List<Rank> ranks); // batch variant: handles a list of entries in one call
32 |     void rank(Rank rank);        // single-entry variant
33 | }
34 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/SimilarChecker.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  *
 3 |  * APDPlat - Application Product Development Platform
 4 |  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
 5 |  *
 6 |  * This program is free software: you can redistribute it and/or modify
 7 |  * it under the terms of the GNU General Public License as published by
 8 |  * the Free Software Foundation, either version 3 of the License, or
 9 |  * (at your option) any later version.
10 |  *
11 |  * This program is distributed in the hope that it will be useful,
12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 |  * GNU General Public License for more details.
15 |  *
16 |  * You should have received a copy of the GNU General Public License
17 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 |  *
19 |  */
20 | 
21 | package org.seo.rank;
22 | 
23 | /**
24 |  * Similarity detection between two articles, each identified by its URL.
25 |  * @author 杨尚川
26 |  */
27 | public interface SimilarChecker {
28 |     double similarScore(String url1, String url2); // numeric similarity score for the two URLs
29 |     boolean isSimilar(String url1, String url2);   // yes/no similarity verdict for the two URLs
30 | }
31 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/api/GetArticle.java:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * 
  3 |  * APDPlat - Application Product Development Platform
  4 |  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
  5 |  * 
  6 |  * This program is free software: you can redistribute it and/or modify
  7 |  * it under the terms of the GNU General Public License as published by
  8 |  * the Free Software Foundation, either version 3 of the License, or
  9 |  * (at your option) any later version.
 10 |  * 
 11 |  * This program is distributed in the hope that it will be useful,
 12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 |  * GNU General Public License for more details.
 15 |  * 
 16 |  * You should have received a copy of the GNU General Public License
 17 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 18 |  * 
 19 |  */
 20 | 
 21 | package org.seo.rank.api;
 22 | 
 23 | import java.io.IOException;
 24 | import java.io.PrintWriter;
 25 | import java.util.List;
 26 | import javax.servlet.ServletException;
 27 | import javax.servlet.annotation.WebServlet;
 28 | import javax.servlet.http.HttpServlet;
 29 | import javax.servlet.http.HttpServletRequest;
 30 | import javax.servlet.http.HttpServletResponse;
 31 | import org.codehaus.jackson.map.ObjectMapper;
 32 | import org.seo.rank.list.Parser;
 33 | import org.seo.rank.list.impl.DefaultParser;
 34 | import org.seo.rank.model.Article;
 35 | import org.slf4j.Logger;
 36 | import org.slf4j.LoggerFactory;
 37 | 
 38 | /**
 39 |  * Servlet that parses an article list page and returns the articles as JSON.
 40 |  * @author 杨尚川
 41 |  */
 42 | @WebServlet(name = "GetArticle", urlPatterns = {"/GetArticle"})
 43 | public class GetArticle extends HttpServlet {
 44 |     private static final Logger LOGGER = LoggerFactory.getLogger(GetArticle.class);
 45 |     private static final ObjectMapper MAPPER = new ObjectMapper();
 46 |     private static final Parser PARSER = new DefaultParser();
 47 | 
 48 |     /**
 49 |      * Processes requests for both HTTP <code>GET</code> and <code>POST</code>
 50 |      * methods: passes the <code>url</code>, <code>nextPageCssQuery</code>,
 51 |      * <code>nextPageText</code> and <code>titleCssQuery</code> request parameters
 52 |      * to the parser and writes the resulting articles to the response as JSON.
 53 |      *
 54 |      * @param request servlet request
 55 |      * @param response servlet response
 56 |      * @throws ServletException if a servlet-specific error occurs
 57 |      * @throws IOException if an I/O error occurs
 58 |      */
 59 |     protected void processRequest(HttpServletRequest request, HttpServletResponse response)
 60 |             throws ServletException, IOException {
 61 |         long start = System.currentTimeMillis();
 62 |         request.setCharacterEncoding("UTF-8");
 63 |         response.setContentType("application/json;charset=UTF-8");
 64 |         String url = request.getParameter("url");
 65 |         String nextPageCssQuery = request.getParameter("nextPageCssQuery");
 66 |         String nextPageText = request.getParameter("nextPageText");
 67 |         String titleCssQuery = request.getParameter("titleCssQuery");
 68 |         // Only the parameters the parser consumes are read from the request.
 69 |         LOGGER.info("url:{}", url);
 70 |         LOGGER.info("nextPageCssQuery:{}", nextPageCssQuery);
 71 |         LOGGER.info("nextPageText:{}", nextPageText);
 72 |         LOGGER.info("titleCssQuery:{}", titleCssQuery);
 73 |         List<Article> articles = PARSER.parse(url, nextPageCssQuery, nextPageText, titleCssQuery);
 74 | 
 75 |         // try-with-resources closes the writer once the JSON has been written
 76 |         try (PrintWriter out = response.getWriter()) {
 77 |             out.println(MAPPER.writeValueAsString(articles));
 78 |         }
 79 |         long cost = System.currentTimeMillis() - start;
 80 |         LOGGER.info("GetArticle 耗时 {} 毫秒", cost);
 81 |     }
 82 | 
 83 |     // <editor-fold defaultstate="collapsed" desc="HttpServlet methods. Click on the + sign on the left to edit the code.">
 84 |     /**
 85 |      * Handles the HTTP <code>GET</code> method by delegating to processRequest.
 86 |      *
 87 |      * @param request servlet request
 88 |      * @param response servlet response
 89 |      * @throws ServletException if a servlet-specific error occurs
 90 |      * @throws IOException if an I/O error occurs
 91 |      */
 92 |     @Override
 93 |     protected void doGet(HttpServletRequest request, HttpServletResponse response)
 94 |             throws ServletException, IOException {
 95 |         processRequest(request, response);
 96 |     }
 97 | 
 98 |     /**
 99 |      * Handles the HTTP <code>POST</code> method by delegating to processRequest.
100 |      *
101 |      * @param request servlet request
102 |      * @param response servlet response
103 |      * @throws ServletException if a servlet-specific error occurs
104 |      * @throws IOException if an I/O error occurs
105 |      */
106 |     @Override
107 |     protected void doPost(HttpServletRequest request, HttpServletResponse response)
108 |             throws ServletException, IOException {
109 |         processRequest(request, response);
110 |     }
111 | 
112 |     /**
113 |      * Returns a short description of the servlet.
114 |      *
115 |      * @return a String containing servlet description
116 |      */
117 |     @Override
118 |     public String getServletInfo() {
119 |         return "Short description";
120 |     }// </editor-fold>
121 | 
122 | }
123 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/api/GetListRank.java:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * 
  3 |  * APDPlat - Application Product Development Platform
  4 |  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
  5 |  * 
  6 |  * This program is free software: you can redistribute it and/or modify
  7 |  * it under the terms of the GNU General Public License as published by
  8 |  * the Free Software Foundation, either version 3 of the License, or
  9 |  * (at your option) any later version.
 10 |  * 
 11 |  * This program is distributed in the hope that it will be useful,
 12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 |  * GNU General Public License for more details.
 15 |  * 
 16 |  * You should have received a copy of the GNU General Public License
 17 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 18 |  * 
 19 |  */
 20 | 
 21 | package org.seo.rank.api;
 22 | 
 23 | import java.io.IOException;
 24 | import java.io.PrintWriter;
 25 | import java.util.ArrayList;
 26 | import java.util.List;
 27 | import javax.servlet.ServletException;
 28 | import javax.servlet.annotation.WebServlet;
 29 | import javax.servlet.http.HttpServlet;
 30 | import javax.servlet.http.HttpServletRequest;
 31 | import javax.servlet.http.HttpServletResponse;
 32 | import org.codehaus.jackson.map.ObjectMapper;
 33 | import org.seo.rank.Ranker;
 34 | import org.seo.rank.impl.BaiduRanker;
 35 | import org.seo.rank.list.Parser;
 36 | import org.seo.rank.list.impl.DefaultParser;
 37 | import org.seo.rank.model.Article;
 38 | import org.seo.rank.model.Rank;
 39 | import org.slf4j.Logger;
 40 | import org.slf4j.LoggerFactory;
 41 | 
 42 | /**
 43 |  * Servlet that parses an article list page, ranks every article and returns the ranks as JSON.
 44 |  * @author 杨尚川
 45 |  */
 46 | @WebServlet(name = "GetListRank", urlPatterns = {"/GetListRank"})
 47 | public class GetListRank extends HttpServlet {
 48 |     private static final Logger LOGGER = LoggerFactory.getLogger(GetListRank.class);
 49 |     private static final ObjectMapper MAPPER = new ObjectMapper();
 50 |     private static final Parser PARSER = new DefaultParser();
 51 |     private static final Ranker RANKER = new BaiduRanker();
 52 | 
 53 |     /**
 54 |      * Processes requests for both HTTP <code>GET</code> and <code>POST</code>
 55 |      * methods: parses the article list page given by the <code>url</code>
 56 |      * parameter, turns every article into a Rank (title as keyword, article
 57 |      * URL as url), passes the list to the ranker and writes it out as JSON.
 58 |      *
 59 |      * @param request servlet request
 60 |      * @param response servlet response
 61 |      * @throws ServletException if a servlet-specific error occurs
 62 |      * @throws IOException if an I/O error occurs
 63 |      */
 64 |     protected void processRequest(HttpServletRequest request, HttpServletResponse response)
 65 |             throws ServletException, IOException {
 66 |         request.setCharacterEncoding("UTF-8");
 67 |         response.setContentType("application/json;charset=UTF-8");
 68 |         long start = System.currentTimeMillis();
 69 |         // Fetch the section's articles and their links.
 70 |         // Only the parameters the parser consumes are read from the request.
 71 |         String url = request.getParameter("url");
 72 |         String nextPageCssQuery = request.getParameter("nextPageCssQuery");
 73 |         String nextPageText = request.getParameter("nextPageText");
 74 |         String titleCssQuery = request.getParameter("titleCssQuery");
 75 |         LOGGER.info("url:{}", url);
 76 |         LOGGER.info("nextPageCssQuery:{}", nextPageCssQuery);
 77 |         LOGGER.info("nextPageText:{}", nextPageText);
 78 |         LOGGER.info("titleCssQuery:{}", titleCssQuery);
 79 |         List<Article> articles = PARSER.parse(url, nextPageCssQuery, nextPageText, titleCssQuery);
 80 |         LOGGER.info("文章数目：{}", articles.size());
 81 |         // Convert the articles and their links into rank objects.
 82 |         List<Rank> ranks = new ArrayList<>();
 83 |         for(Article article : articles){
 84 |             Rank rank = new Rank();
 85 |             rank.setKeyword(article.getTitle());
 86 |             rank.setUrl(article.getUrl());
 87 |             ranks.add(rank);
 88 |         }
 89 |         // Obtain the rankings.
 90 |         RANKER.rank(ranks);
 91 |         LOGGER.info("排名数目：{}", ranks.size());
 92 |         try (PrintWriter out = response.getWriter()) {
 93 |             String json = MAPPER.writeValueAsString(ranks);
 94 |             out.println(json);
 95 |         }
 96 |         long cost = System.currentTimeMillis() - start;
 97 |         LOGGER.info("GetListRank 耗时 {} 毫秒", cost);
 98 |     }
 99 | 
100 |     // <editor-fold defaultstate="collapsed" desc="HttpServlet methods. Click on the + sign on the left to edit the code.">
101 |     /**
102 |      * Handles the HTTP <code>GET</code> method by delegating to processRequest.
103 |      *
104 |      * @param request servlet request
105 |      * @param response servlet response
106 |      * @throws ServletException if a servlet-specific error occurs
107 |      * @throws IOException if an I/O error occurs
108 |      */
109 |     @Override
110 |     protected void doGet(HttpServletRequest request, HttpServletResponse response)
111 |             throws ServletException, IOException {
112 |         processRequest(request, response);
113 |     }
114 | 
115 |     /**
116 |      * Handles the HTTP <code>POST</code> method by delegating to processRequest.
117 |      *
118 |      * @param request servlet request
119 |      * @param response servlet response
120 |      * @throws ServletException if a servlet-specific error occurs
121 |      * @throws IOException if an I/O error occurs
122 |      */
123 |     @Override
124 |     protected void doPost(HttpServletRequest request, HttpServletResponse response)
125 |             throws ServletException, IOException {
126 |         processRequest(request, response);
127 |     }
128 | 
129 |     /**
130 |      * Returns a short description of the servlet.
131 |      *
132 |      * @return a String containing servlet description
133 |      */
134 |     @Override
135 |     public String getServletInfo() {
136 |         return "Short description";
137 |     }// </editor-fold>
138 | 
139 | }
140 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/api/GetRank.java:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * 
  3 |  * APDPlat - Application Product Development Platform
  4 |  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
  5 |  * 
  6 |  * This program is free software: you can redistribute it and/or modify
  7 |  * it under the terms of the GNU General Public License as published by
  8 |  * the Free Software Foundation, either version 3 of the License, or
  9 |  * (at your option) any later version.
 10 |  * 
 11 |  * This program is distributed in the hope that it will be useful,
 12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 |  * GNU General Public License for more details.
 15 |  * 
 16 |  * You should have received a copy of the GNU General Public License
 17 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 18 |  * 
 19 |  */
 20 | 
 21 | package org.seo.rank.api;
 22 | 
 23 | import java.io.IOException;
 24 | import java.io.PrintWriter;
 25 | import javax.servlet.ServletException;
 26 | import javax.servlet.annotation.WebServlet;
 27 | import javax.servlet.http.HttpServlet;
 28 | import javax.servlet.http.HttpServletRequest;
 29 | import javax.servlet.http.HttpServletResponse;
 30 | import org.codehaus.jackson.map.ObjectMapper;
 31 | import org.seo.rank.Ranker;
 32 | import org.seo.rank.impl.BaiduRanker;
 33 | import org.seo.rank.model.Rank;
 34 | import org.slf4j.Logger;
 35 | import org.slf4j.LoggerFactory;
 36 | 
 37 | /**
 38 |  * Servlet that ranks a single URL for a keyword and returns the result as JSON.
 39 |  * @author 杨尚川
 40 |  */
 41 | @WebServlet(name = "GetRank", urlPatterns = {"/GetRank"})
 42 | public class GetRank extends HttpServlet {
 43 |     // The logger is bound to this class so that log output is attributed to GetRank.
 44 |     private static final Logger LOGGER = LoggerFactory.getLogger(GetRank.class);
 45 |     private static final ObjectMapper MAPPER = new ObjectMapper();
 46 |     private static final Ranker RANKER = new BaiduRanker();
 47 | 
 48 |     /**
 49 |      * Processes requests for both HTTP <code>GET</code> and <code>POST</code> methods:
 50 |      * builds a Rank from the <code>url</code> and <code>keyword</code> parameters, ranks it and writes it as JSON.
 51 |      *
 52 |      * @param request servlet request
 53 |      * @param response servlet response
 54 |      * @throws ServletException if a servlet-specific error occurs
 55 |      * @throws IOException if an I/O error occurs
 56 |      */
 57 |     protected void processRequest(HttpServletRequest request, HttpServletResponse response)
 58 |             throws ServletException, IOException {
 59 |         long start = System.currentTimeMillis();
 60 |         request.setCharacterEncoding("UTF-8");
 61 |         response.setContentType("application/json;charset=UTF-8");
 62 |         String url = request.getParameter("url");
 63 |         String keyword = request.getParameter("keyword");
 64 |         LOGGER.info("url:{}", url);
 65 |         LOGGER.info("keyword:{}", keyword);
 66 |         Rank rank = new Rank();
 67 |         rank.setUrl(url);
 68 |         rank.setKeyword(keyword);
 69 |         RANKER.rank(rank);
 70 | 
 71 |         try (PrintWriter out = response.getWriter()) {
 72 |             out.println(MAPPER.writeValueAsString(rank));
 73 |         }
 74 |         long cost = System.currentTimeMillis() - start;
 75 |         LOGGER.info("GetRank 耗时 {} 毫秒", cost);
 76 |     }
 77 | 
 78 |     // <editor-fold defaultstate="collapsed" desc="HttpServlet methods. Click on the + sign on the left to edit the code.">
 79 |     /**
 80 |      * Handles the HTTP <code>GET</code> method by delegating to processRequest.
 81 |      *
 82 |      * @param request servlet request
 83 |      * @param response servlet response
 84 |      * @throws ServletException if a servlet-specific error occurs
 85 |      * @throws IOException if an I/O error occurs
 86 |      */
 87 |     @Override
 88 |     protected void doGet(HttpServletRequest request, HttpServletResponse response)
 89 |             throws ServletException, IOException {
 90 |         processRequest(request, response);
 91 |     }
 92 | 
 93 |     /**
 94 |      * Handles the HTTP <code>POST</code> method by delegating to processRequest.
 95 |      *
 96 |      * @param request servlet request
 97 |      * @param response servlet response
 98 |      * @throws ServletException if a servlet-specific error occurs
 99 |      * @throws IOException if an I/O error occurs
100 |      */
101 |     @Override
102 |     protected void doPost(HttpServletRequest request, HttpServletResponse response)
103 |             throws ServletException, IOException {
104 |         processRequest(request, response);
105 |     }
106 | 
107 |     /**
108 |      * Returns a short description of the servlet.
109 |      *
110 |      * @return a String containing servlet description
111 |      */
112 |     @Override
113 |     public String getServletInfo() {
114 |         return "Short description";
115 |     }// </editor-fold>
116 | 
117 | }
118 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/impl/BaiduCopyChecker.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * *
  3 |  *  *
  4 |  *  * APDPlat - Application Product Development Platform
  5 |  *  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
  6 |  *  *
  7 |  *  * This program is free software: you can redistribute it and/or modify
  8 |  *  * it under the terms of the GNU General Public License as published by
  9 |  *  * the Free Software Foundation, either version 3 of the License, or
 10 |  *  * (at your option) any later version.
 11 |  *  *
 12 |  *  * This program is distributed in the hope that it will be useful,
 13 |  *  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 |  *  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15 |  *  * GNU General Public License for more details.
 16 |  *  *
 17 |  *  * You should have received a copy of the GNU General Public License
 18 |  *  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 19 |  *  *
 20 |  *
 21 |  */
 22 | 
 23 | package org.seo.rank.impl;
 24 | 
 25 | import org.apache.commons.lang.StringUtils;
 26 | import org.jsoup.Connection;
 27 | import org.jsoup.Jsoup;
 28 | import org.jsoup.nodes.Document;
 29 | import org.jsoup.nodes.Element;
 30 | import org.jsoup.select.Elements;
 31 | import org.seo.rank.CopyChecker;
 32 | import org.seo.rank.tools.DynamicIp;
 33 | import org.seo.rank.list.UrlTools;
 34 | import org.seo.rank.list.impl.DefaultParser;
 35 | import org.seo.rank.model.Article;
 36 | import org.slf4j.Logger;
 37 | import org.slf4j.LoggerFactory;
 38 | 
 39 | import java.io.UnsupportedEncodingException;
 40 | import java.net.URL;
 41 | import java.net.URLEncoder;
 42 | import java.util.*;
 43 | import java.util.concurrent.atomic.AtomicInteger;
 44 | import java.util.stream.Collectors;
 45 | 
 46 | /**
 47 |  * Detects plagiarism of articles: every article title is searched on Baidu and
 48 |  * results whose title contains it, but whose domain differs from the article's
 49 |  * own domain, are reported as suspected copies.
 50 |  * @author 杨尚川
 51 |  */
 52 | public class BaiduCopyChecker implements CopyChecker {
 53 |     private static final Logger LOGGER = LoggerFactory.getLogger(BaiduCopyChecker.class);
 54 |     // Request headers sent with every Baidu request
 55 |     private static final String ACCEPT = "text/html, */*; q=0.01";
 56 |     private static final String ENCODING = "gzip, deflate";
 57 |     private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
 58 |     private static final String CONNECTION = "keep-alive";
 59 |     private static final String HOST = "www.baidu.com";
 60 |     private static final String REFERER = "http://www.baidu.com";
 61 |     private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0";
 62 |     // Prefix of the redirect links found on Baidu result pages
 63 |     private static final String BAIDU_LINK_PREFIX = "http://www.baidu.com/link?url=";
 64 | 
 65 |     // Number of result pages fetched per article
 66 |     private static final int PAGE = 15;
 67 |     // Step of the pn query parameter between two consecutive pages
 68 |     private static final int PAGESIZE = 10;
 69 | 
 70 |     /**
 71 |      * Checks every article and maps it to the suspected copy URLs found for it.
 72 |      * @param articles articles to check; null is treated as an empty list
 73 |      * @return map from each article to its (possibly empty) set of suspected copy URLs
 74 |      */
 75 |     @Override
 76 |     public Map<Article, Set<String>> check(List<Article> articles) {
 77 |         Map<Article, Set<String>> data = new HashMap<>();
 78 |         if (articles == null) {
 79 |             return data;
 80 |         }
 81 |         for (Article article : articles) {
 82 |             data.put(article, doCheck(article));
 83 |         }
 84 |         return data;
 85 |     }
 86 | 
 87 |     /**
 88 |      * Searches Baidu for the article title, PAGE result pages in a row, and
 89 |      * collects the suspected copy URLs found on them.
 90 |      * @param article article to check
 91 |      * @return suspected copy URLs; empty when the article, its title or its URL is blank
 92 |      */
 93 |     public Set<String> doCheck(Article article){
 94 |         Set<String> data = new HashSet<>();
 95 |         if(article == null
 96 |                 || StringUtils.isBlank(article.getTitle())
 97 |                 || StringUtils.isBlank(article.getUrl())){
 98 |             return data;
 99 |         }
100 |         String query;
101 |         try {
102 |             query = URLEncoder.encode(article.getTitle(), "UTF-8");
103 |         } catch (UnsupportedEncodingException e) {
104 |             LOGGER.error("url构造失败", e);
105 |             return data;
106 |         }
107 |         if(StringUtils.isBlank(query)){
108 |             return data;
109 |         }
110 |         for (int i = 0; i < PAGE; i++) {
111 |             String url = "http://www.baidu.com/s?tn=monline_5_dg&ie=utf-8&wd=" + query+"&oq="+query+"&usm=3&f=8&bs="+query+"&rsv_bp=1&rsv_sug3=1&rsv_sug4=141&rsv_sug1=1&rsv_sug=1&pn=" + i * PAGESIZE;
112 |             LOGGER.debug(url);
113 |             data.addAll(doCheck(url, article));
114 |         }
115 |         return data;
116 |     }
117 | 
118 |     /**
119 |      * Scans one Baidu result page for links that look like copies of the article.
120 |      * @param url Baidu search URL of the result page
121 |      * @param article article being checked
122 |      * @return resolved URLs of the results whose title contains the article title
123 |      *         and whose domain differs from the article's domain
124 |      */
125 |     private Set<String> doCheck(String url, Article article) {
126 |         Set<String> data = new HashSet<>();
127 |         try {
128 |             // Loop-invariant: the article's own domain is resolved once per page
129 |             String sourceDomain = domainOf(new URL(article.getUrl()).getHost());
130 |             Document document = Jsoup.connect(url)
131 |                     .header("Accept", ACCEPT)
132 |                     .header("Accept-Encoding", ENCODING)
133 |                     .header("Accept-Language", LANGUAGE)
134 |                     .header("Connection", CONNECTION)
135 |                     .header("Host", HOST)
136 |                     .header("Referer", REFERER)
137 |                     .header("User-Agent", USER_AGENT)
138 |                     .get();
139 |             Elements elements = document.select("html body div div div div div h3.t a");
140 |             int i=0;
141 |             for(Element element : elements){
142 |                 String title = element.text();
143 |                 if(StringUtils.isBlank(title)){
144 |                     continue;
145 |                 }
146 |                 i++;
147 |                 LOGGER.debug("{}:{}", i, title);
148 |                 if(title.contains("百度翻译")
149 |                         || !contains(title, article.getTitle())){
150 |                     LOGGER.debug("搜索结果检查通过");
151 |                     continue;
152 |                 }
153 |                 String href = UrlTools.normalizeUrl(url, element.attr("href"));
154 |                 String realUrl = urlConvert(href);
155 |                 LOGGER.debug("url:{}", url);
156 |                 LOGGER.debug("realUrl:{}", realUrl);
157 |                 String targetDomain;
158 |                 try {
159 |                     targetDomain = domainOf(new URL(realUrl).getHost());
160 |                 } catch (java.net.MalformedURLException e) {
161 |                     // One unparsable link must not discard the rest of the page
162 |                     LOGGER.debug("URL解析失败：" + realUrl, e);
163 |                     continue;
164 |                 }
165 |                 if(sourceDomain != null
166 |                         && targetDomain != null
167 |                         && !targetDomain.equals(sourceDomain)) {
168 |                     data.add(realUrl);
169 |                 }
170 |             }
171 |         } catch (Exception ex) {
172 |             LOGGER.error("搜索出错",ex);
173 |         }
174 |         return data;
175 |     }
176 |     /**
177 |      * Reduces a host name to its last two labels joined by a dot, lower-cased,
178 |      * e.g. "blog.Example.com" becomes "example.com".
179 |      * @param host host name, may be null
180 |      * @return the two-label domain, or null when the host has fewer than two labels
181 |      */
182 |     private static String domainOf(String host){
183 |         String[] labels = host == null ? new String[0] : host.split("\\.");
184 |         if(labels.length < 2){
185 |             return null;
186 |         }
187 |         return (labels[labels.length-2] + "." + labels[labels.length-1]).toLowerCase(Locale.ROOT);
188 |     }
189 |     /**
190 |      * Tells whether title2 contains title1 after every non-letter character has
191 |      * been removed from both titles.
192 |      * @param title2 title of a search result
193 |      * @param title1 title of the article being checked
194 |      * @return true when the reduced title2 contains the reduced title1; false when
195 |      *         title1 has no letter at all (an empty needle would match every result)
196 |      */
197 |     private static boolean contains(String title2, String title1){
198 |         String str2 = lettersOnly(title2);
199 |         String str1 = lettersOnly(title1);
200 |         LOGGER.debug("转换标题前：{}", title2);
201 |         LOGGER.debug("转换标题后：{}", str2);
202 |         LOGGER.debug("转换标题前：{}", title1);
203 |         LOGGER.debug("转换标题后：{}", str1);
204 |         if(!str1.isEmpty() && str2.contains(str1)){
205 |             LOGGER.debug("{} 【包含】 {}", title2, title1);
206 |             return true;
207 |         }
208 |         LOGGER.debug("{} 【不包含】 {}", title2, title1);
209 |         return false;
210 |     }
211 |     /** Keeps only the characters accepted by Character.isLetter. */
212 |     private static String lettersOnly(String text){
213 |         StringBuilder letters = new StringBuilder(text.length());
214 |         for(char c : text.toCharArray()){
215 |             if(Character.isLetter(c)){
216 |                 letters.append(c);
217 |             }
218 |         }
219 |         return letters.toString();
220 |     }
221 |     /**
222 |      * Resolves a Baidu redirect link to the URL of the page it points to.
223 |      * @param url link taken from a Baidu result page
224 |      * @return the Location header answered by Baidu, or the unchanged url when it
225 |      *         is not a Baidu redirect link or could not be resolved
226 |      */
227 |     private static String urlConvert(String url){
228 |         try{
229 |             if(!url.startsWith(BAIDU_LINK_PREFIX)){
230 |                 // Not a redirect link, nothing to convert
231 |                 return url;
232 |             }
233 |             LOGGER.debug("转换前的URL：{}", url);
234 |             Connection.Response response = getResponse(url);
235 |             // Crawler restriction: on failure, block page or captcha, switch IP and retry once
236 |             if(response==null || response.body().contains("请您点击按钮解除封锁")
237 |                     || response.body().contains("请输入以下验证码")){
238 |                 DynamicIp.toNewIp();
239 |                 response = getResponse(url);
240 |             }
241 |             // The retry may fail too, and the answer may carry no Location header
242 |             String realUrl = response == null ? null : response.header("Location");
243 |             LOGGER.debug("转换后的URL：{}", realUrl);
244 |             // Following a further redirect of the target page was disabled on purpose
245 |             // by the original author: slow, and only needed for very unusual sites
246 |             return StringUtils.isBlank(realUrl) ? url : realUrl;
247 |         }catch(Exception e){
248 |             LOGGER.error("URL转换异常", e);
249 |         }
250 |         return url;
251 |     }
252 |     /**
253 |      * Requests url with the Baidu request headers, without following redirects.
254 |      * @param url address to request
255 |      * @return the response, or null when the request failed
256 |      */
257 |     private static Connection.Response getResponse(String url) {
258 |         try{
259 |             return Jsoup.connect(url)
260 |                     .header("Accept", ACCEPT)
261 |                     .header("Accept-Encoding", ENCODING)
262 |                     .header("Accept-Language", LANGUAGE)
263 |                     .header("Connection", CONNECTION)
264 |                     .header("Host", HOST)
265 |                     .header("Referer", REFERER)
266 |                     .header("User-Agent", USER_AGENT)
267 |                     .ignoreContentType(true)
268 |                     .timeout(30000)
269 |                     .followRedirects(false)
270 |                     .execute();
271 |         } catch (Exception e){
272 |             LOGGER.debug("获取页面失败：", e);
273 |         }
274 |         return null;
275 |     }
276 |     /**
277 |      * Command-line entry point: checks the ITEYE blog posts returned by
278 |      * DefaultParser.iteyeBlog() and logs an HTML report, most copied posts first.
279 |      * @param args not used
280 |      */
281 |     public static void main(String[] args){
282 |         CopyChecker copyChecker = new BaiduCopyChecker();
283 |         // Use DefaultParser.oschinaBlog() instead to check the OSCHINA posts
284 |         List<Article> all = DefaultParser.iteyeBlog();
285 |         // Title fragments of the posts that are left out of the statistics
286 |         List<String> excluded = Arrays.asList(
287 |                 "idioms",
288 |                 "分布式内存文件系统：Tachyon",
289 |                 "Nutch视频",
290 |                 "如何解决BUG？",
291 |                 "采集电子报纸",
292 |                 "汉英双语的差异",
293 |                 "分布式搜索算法",
294 |                 "The Future of Compass & ElasticSearch",
295 |                 "1208个合成词",
296 |                 "Java远程调试",
297 |                 "What a Wonderful Code",
298 |                 "代码评审脚本",
299 |                 "Linux Netcat command – The swiss army knife of net",
300 |                 "common prefix different suffix");
301 |         List<Article> articles = all.stream()
302 |                 .filter(article -> excluded.stream().noneMatch(fragment -> article.getTitle().contains(fragment)))
303 |                 .collect(Collectors.toList());
304 |         // Run the check
305 |         Map<Article, Set<String>> result = copyChecker.check(articles);
306 |         // Write the report
307 |         LOGGER.info("<h4>检查博文数目：" + articles.size()+"</h4>");
308 |         AtomicInteger i = new AtomicInteger();
309 |         result.entrySet().stream()
310 |                 .sorted((a,b)->Integer.compare(b.getValue().size(), a.getValue().size()))
311 |                 .forEach(e -> {
312 |             String query;
313 |             try {
314 |                 query = URLEncoder.encode(e.getKey().getTitle(), "UTF-8");
315 |             } catch (UnsupportedEncodingException ex) {
316 |                 LOGGER.error("url构造失败", ex);
317 |                 return;
318 |             }
319 |             String originURL = e.getKey().getUrl();
320 |             if(e.getValue().size()>0) {
321 |                 LOGGER.info("<h4>"+i.incrementAndGet()+"、<a target=\"_blank\" href=\"http://www.baidu.com/s?wd=" + query + "\">" + e.getKey().getTitle() + "</a>  抄袭链接有("+e.getValue().size()+")个</h4>");
322 |                 LOGGER.info("原文链接：<a target=\"_blank\" href=\"" + originURL + "\">" + originURL + "</a><br/>");
323 |                 LOGGER.info("抄袭链接：");
324 |                 LOGGER.info("<ol>");
325 |                 e.getValue().stream().sorted().forEach(url-> LOGGER.info("<li><a target=\"_blank\" href=\"" + url + "\">" + url + "</a></li>"));
326 |                 LOGGER.info("</ol>");
327 |             }else{
328 |                 LOGGER.info(i.incrementAndGet()+"、<a target=\"_blank\" href=\"http://www.baidu.com/s?wd=" + query + "\">" + e.getKey().getTitle() + "</a><br/>");
329 |                 LOGGER.info("原文链接：<a target=\"_blank\" href=\"" + originURL + "\">" + originURL + "</a>    无抄袭链接<br/>");
330 |             }
331 |         });
332 |     }
333 | }
334 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/impl/BaiduRanker.java:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * 
  3 |  * APDPlat - Application Product Development Platform
  4 |  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
  5 |  * 
  6 |  * This program is free software: you can redistribute it and/or modify
  7 |  * it under the terms of the GNU General Public License as published by
  8 |  * the Free Software Foundation, either version 3 of the License, or
  9 |  * (at your option) any later version.
 10 |  * 
 11 |  * This program is distributed in the hope that it will be useful,
 12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 |  * GNU General Public License for more details.
 15 |  * 
 16 |  * You should have received a copy of the GNU General Public License
 17 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 18 |  * 
 19 |  */
 20 | 
 21 | package org.seo.rank.impl;
 22 | 
 23 | import java.io.IOException;
 24 | import java.io.UnsupportedEncodingException;
 25 | import java.net.URLEncoder;
 26 | import java.util.ArrayList;
 27 | import java.util.HashMap;
 28 | import java.util.List;
 29 | import java.util.Map;
 30 | import org.apache.commons.lang.StringUtils;
 31 | import org.jsoup.Connection;
 32 | import org.jsoup.Jsoup;
 33 | import org.jsoup.nodes.Document;
 34 | import org.jsoup.nodes.Element;
 35 | import org.jsoup.select.Elements;
 36 | import org.seo.rank.Ranker;
 37 | import org.seo.rank.tools.DynamicIp;
 38 | import org.seo.rank.list.UrlTools;
 39 | import org.seo.rank.list.impl.DefaultParser;
 40 | import org.seo.rank.model.Article;
 41 | import org.seo.rank.model.Rank;
 42 | import org.slf4j.Logger;
 43 | import org.slf4j.LoggerFactory;
 44 | 
 45 | /**
 46 |  * Tells whether a page is indexed by Baidu and, when it is, at which position
 47 |  * it appears in the Baidu results for its keyword.
 48 |  * @author 杨尚川
 49 |  */
 50 | public class BaiduRanker implements Ranker{
 51 |     private static final Logger LOGGER = LoggerFactory.getLogger(BaiduRanker.class);
 52 |     // Request headers sent with the Baidu requests
 53 |     private static final String ACCEPT = "text/html, */*; q=0.01";
 54 |     private static final String ENCODING = "gzip, deflate";
 55 |     private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
 56 |     private static final String CONNECTION = "keep-alive";
 57 |     private static final String HOST = "www.baidu.com";
 58 |     private static final String REFERER = "http://www.baidu.com";
 59 |     private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0";
 60 |     // Prefix of the redirect links found on Baidu result pages
 61 |     private static final String BAIDU_LINK_PREFIX = "http://www.baidu.com/link?url=";
 62 | 
 63 |     // Number of result pages searched per keyword
 64 |     private static final int PAGE = 15;
 65 |     // Results counted per page; also the step of the pn query parameter
 66 |     private static final int PAGESIZE = 10;
 67 | 
 68 |     /**
 69 |      * Ranks every request of the list in place.
 70 |      * @param ranks requests to fill in; null is ignored
 71 |      */
 72 |     @Override
 73 |     public void rank(List<Rank> ranks) {
 74 |         if(ranks == null){
 75 |             return;
 76 |         }
 77 |         for(Rank rank : ranks){
 78 |             rank(rank);
 79 |         }
 80 |     }
 81 |     /**
 82 |      * Ranks a single request in place.
 83 |      * @param rank request to fill in
 84 |      */
 85 |     @Override
 86 |     public void rank(Rank rank){
 87 |         doRank(rank);
 88 |     }
 89 |     /**
 90 |      * Looks up the Baidu position of rank.getUrl() for rank.getKeyword() and
 91 |      * stores it with rank.setRank. Nothing is stored when the keyword or URL is
 92 |      * blank, the page is not indexed, or it is not found within PAGE pages.
 93 |      * @param rank request to fill in
 94 |      */
 95 |     public void doRank(Rank rank){
 96 |         if(rank == null || StringUtils.isBlank(rank.getKeyword()) || StringUtils.isBlank(rank.getUrl())){
 97 |             return ;
 98 |         }
 99 |         // A page that Baidu has not indexed cannot have a position
100 |         searchBaiduIndex(rank);
101 |         if(!rank.isIndex()){
102 |             return;
103 |         }
104 |         String query;
105 |         try {
106 |             query = URLEncoder.encode(rank.getKeyword(), "UTF-8");
107 |         } catch (UnsupportedEncodingException e) {
108 |             LOGGER.error("url构造失败", e);
109 |             return ;
110 |         }
111 |         if(StringUtils.isBlank(query)){
112 |             return ;
113 |         }
114 |         for (int i = 0; i < PAGE; i++) {
115 |             String path = "http://www.baidu.com/s?tn=monline_5_dg&ie=utf-8&wd=" + query+"&oq="+query+"&usm=3&f=8&bs="+query+"&rsv_bp=1&rsv_sug3=1&rsv_sug4=141&rsv_sug1=1&rsv_sug=1&pn=" + i * PAGESIZE;
116 |             LOGGER.debug(path);
117 |             int r = searchBaiduRank(path, rank);
118 |             if (r > 0){
119 |                 // Position inside the page plus the results of the pages before it
120 |                 rank.setRank(r + i * PAGESIZE);
121 |                 return;
122 |             }
123 |         }
124 |     }
125 |     /**
126 |      * Checks whether Baidu has indexed rank.getUrl() by searching for "url:" + URL.
127 |      * Calls rank.setIndex(false) when Baidu reports no match and rank.setIndex(true)
128 |      * when it reports exactly one result; otherwise the flag is left untouched.
129 |      * @param rank request whose index flag is updated
130 |      */
131 |     private void searchBaiduIndex(Rank rank) {
132 |         try {
133 |             // Encode the query value so that '&', '#' or '?' inside the page URL
134 |             // cannot cut the wd parameter short
135 |             String url = "http://www.baidu.com/s?wd=" + URLEncoder.encode("url:" + rank.getUrl(), "UTF-8");
136 |             LOGGER.debug(url);
137 |             Document document = Jsoup.connect(url)
138 |                     .header("Accept", ACCEPT)
139 |                     .header("Accept-Encoding", ENCODING)
140 |                     .header("Accept-Language", LANGUAGE)
141 |                     .header("Connection", CONNECTION)
142 |                     .header("User-Agent", USER_AGENT)
143 |                     .header("Host", HOST)
144 |                     .get();
145 |             // The "nothing found" notice of Baidu means the page is not indexed
146 |             for(Element element : document.select("html body div div div div div p")){
147 |                 String text = element.text();
148 |                 if(text.contains("抱歉，没有找到与") && text.contains("相关的网页。")){
149 |                     LOGGER.debug("未被百度收录");
150 |                     rank.setIndex(false);
151 |                     return;
152 |                 }
153 |             }
154 |             // Exactly one reported result means the page is indexed
155 |             for(Element element : document.select("html body div div div div.nums")){
156 |                 if(element.text().equals("百度为您找到相关结果约1个")){
157 |                     LOGGER.debug("被百度收录");
158 |                     rank.setIndex(true);
159 |                     return;
160 |                 }
161 |             }
162 |         } catch (IOException ex) {
163 |             LOGGER.error("搜索出错",ex);
164 |         }
165 |         LOGGER.debug("未被百度收录");
166 |     }
167 |     /**
168 |      * Looks for rank.getUrl() among the results of one Baidu result page.
169 |      * @param url Baidu search URL of the result page
170 |      * @param rank request holding the keyword and the target URL
171 |      * @return 1-based position among the results with a non-blank title, or -1
172 |      *         when the page does not list the target URL or could not be processed
173 |      */
174 |     private int searchBaiduRank(String url, Rank rank) {
175 |         String targetUrl = rank.getUrl();
176 |         try {
177 |             Document document = Jsoup.connect(url)
178 |                     .header("Accept", ACCEPT)
179 |                     .header("Accept-Encoding", ENCODING)
180 |                     .header("Accept-Language", LANGUAGE)
181 |                     .header("Connection", CONNECTION)
182 |                     .header("Host", HOST)
183 |                     .header("Referer", REFERER)
184 |                     .header("User-Agent", USER_AGENT)
185 |                     .get();
186 |             int i=0;
187 |             for(Element element : document.select("html body div div div div div h3.t a")){
188 |                 String title = element.text();
189 |                 if(StringUtils.isBlank(title)){
190 |                     continue;
191 |                 }
192 |                 i++;
193 |                 LOGGER.debug("{}:{}", i, title);
194 |                 if(!title.contains(rank.getKeyword())){
195 |                     LOGGER.debug("搜索结果标题不包括关键词，忽略");
196 |                     continue;
197 |                 }
198 |                 String href = UrlTools.normalizeUrl(url, element.attr("href"));
199 |                 String realUrl = urlConvert(href);
200 |                 LOGGER.debug("url:{}", url);
201 |                 LOGGER.debug("realUrl:{}", realUrl);
202 |                 LOGGER.debug("targetUrl:{}", targetUrl);
203 |                 if(targetUrl.equals(realUrl)){
204 |                     return i;
205 |                 }
206 |             }
207 |         } catch (Exception ex) {
208 |             LOGGER.error("搜索出错",ex);
209 |         }
210 |         return -1;
211 |     }
212 |     /**
213 |      * Resolves a Baidu redirect link to the URL of the page it points to.
214 |      * @param url link taken from a Baidu result page
215 |      * @return the Location header answered by Baidu, or the unchanged url when it
216 |      *         is not a Baidu redirect link or could not be resolved
217 |      */
218 |     private static String urlConvert(String url){
219 |         try{
220 |             if(!url.startsWith(BAIDU_LINK_PREFIX)){
221 |                 // Not a redirect link, nothing to convert
222 |                 return url;
223 |             }
224 |             LOGGER.debug("转换前的URL：{}", url);
225 |             Connection.Response response = getResponse(url);
226 |             // Crawler restriction: on failure, block page or captcha, switch IP and retry once
227 |             if(response==null || response.body().contains("请您点击按钮解除封锁")
228 |                     || response.body().contains("请输入以下验证码")){
229 |                 DynamicIp.toNewIp();
230 |                 response = getResponse(url);
231 |             }
232 |             // The retry may fail too, and the answer may carry no Location header
233 |             String realUrl = response == null ? null : response.header("Location");
234 |             LOGGER.debug("转换后的URL：{}", realUrl);
235 |             // Following a further redirect of the target page was disabled on purpose
236 |             // by the original author: slow, and only needed for very unusual sites
237 |             return StringUtils.isBlank(realUrl) ? url : realUrl;
238 |         }catch(Exception e){
239 |             LOGGER.error("URL转换异常", e);
240 |         }
241 |         return url;
242 |     }
243 |     /**
244 |      * Requests url with the Baidu request headers, without following redirects.
245 |      * @param url address to request
246 |      * @return the response, or null when the request failed
247 |      */
248 |     private static Connection.Response getResponse(String url) {
249 |         try{
250 |             return Jsoup.connect(url)
251 |                     .header("Accept", ACCEPT)
252 |                     .header("Accept-Encoding", ENCODING)
253 |                     .header("Accept-Language", LANGUAGE)
254 |                     .header("Connection", CONNECTION)
255 |                     .header("Host", HOST)
256 |                     .header("Referer", REFERER)
257 |                     .header("User-Agent", USER_AGENT)
258 |                     .ignoreContentType(true)
259 |                     .timeout(30000)
260 |                     .followRedirects(false)
261 |                     .execute();
262 |         } catch (Exception e){
263 |             LOGGER.debug("获取页面失败：", e);
264 |         }
265 |         return null;
266 |     }
267 |     /**
268 |      * Command-line entry point: ranks the ITEYE blog posts returned by
269 |      * DefaultParser.iteyeBlog(), using each post title as keyword, and logs an
270 |      * HTML list ordered by ascending rank value.
271 |      * @param args not used
272 |      */
273 |     public static void main(String[] args){
274 |         BaiduRanker ranker = new BaiduRanker();
275 |         // Use DefaultParser.oschinaBlog() instead to rank the OSCHINA posts
276 |         List<Article> articles = DefaultParser.iteyeBlog();
277 |         // One ranking request per post: title as keyword, post URL as target
278 |         List<Rank> ranks = new ArrayList<>();
279 |         for(Article blog : articles){
280 |             Rank rank = new Rank();
281 |             rank.setKeyword(blog.getTitle());
282 |             rank.setUrl(blog.getUrl());
283 |             ranks.add(rank);
284 |         }
285 |         ranker.rank(ranks);
286 |         LOGGER.info("排名博文数目：" + ranks.size());
287 |         LOGGER.info("<ol>");
288 |         // The requests themselves are sorted: a map keyed by title would merge
289 |         // posts that share a title into a single entry
290 |         ranks.stream()
291 |                 .sorted((a, b) -> Integer.compare(a.getRank(), b.getRank()))
292 |                 .forEach(rank -> {
293 |             String query;
294 |             try {
295 |                 query = URLEncoder.encode(rank.getKeyword(), "UTF-8");
296 |             } catch (UnsupportedEncodingException ex) {
297 |                 LOGGER.error("url构造失败", ex);
298 |                 return ;
299 |             }
300 |             LOGGER.info("<li><a target=\"_blank\" href=\"http://www.baidu.com/s?wd=" + query + "\">" + rank.getKeyword() + "(" + rank.getRank() + ")</a></li>");
301 |         });
302 |         LOGGER.info("</ol>");
303 |     }
304 | }
305 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/impl/GenericWebPageSimilarChecker.java:
--------------------------------------------------------------------------------
  1 | /**
  2 |  *
  3 |  * APDPlat - Application Product Development Platform
  4 |  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
  5 |  *
  6 |  * This program is free software: you can redistribute it and/or modify
  7 |  * it under the terms of the GNU General Public License as published by
  8 |  * the Free Software Foundation, either version 3 of the License, or
  9 |  * (at your option) any later version.
 10 |  *
 11 |  * This program is distributed in the hope that it will be useful,
 12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 |  * GNU General Public License for more details.
 15 |  *
 16 |  * You should have received a copy of the GNU General Public License
 17 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 18 |  *
 19 |  */
 20 | 
 21 | package org.seo.rank.impl;
 22 | 
 23 | import org.apdplat.word.WordSegmenter;
 24 | import org.apdplat.word.segmentation.Word;
 25 | import org.jsoup.Jsoup;
 26 | import org.jsoup.nodes.Document;
 27 | import org.seo.rank.SimilarChecker;
 28 | import org.seo.rank.tools.DynamicIp;
 29 | import org.seo.rank.list.impl.DefaultParser;
 30 | import org.seo.rank.model.Article;
 31 | import org.slf4j.Logger;
 32 | import org.slf4j.LoggerFactory;
 33 | 
 34 | import java.math.BigDecimal;
 35 | import java.net.URL;
 36 | import java.util.*;
 37 | import java.util.concurrent.atomic.AtomicInteger;
 38 | import java.util.stream.Collectors;
 39 | 
 40 | /**
 41 |  * 通用的网页相似度检测算法
 42 |  * @author 杨尚川
 43 |  */
 44 | public class GenericWebPageSimilarChecker implements SimilarChecker{
 45 |     private static final Logger LOGGER = LoggerFactory.getLogger(GenericWebPageSimilarChecker.class);
 46 |     private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
 47 |     private static final String ENCODING = "gzip, deflate";
 48 |     private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
 49 |     private static final String CONNECTION = "keep-alive";
 50 |     private static final String REFERER = "http://www.baidu.com";
 51 |     private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0";
 52 |     private static final float THRESHOLD_RATE = 0.5F;
 53 | 
 54 |     @Override // two pages count as similar once their score reaches THRESHOLD_RATE
 55 |     public boolean isSimilar(String url1, String url2) {
 56 |         return THRESHOLD_RATE <= similarScore(url1, url2);
 57 |     }
 58 |     /** Scores two pages, rounded to two decimals; 0 when getWebPage yields null for either URL. */
 59 |     @Override
 60 |     public double similarScore(String url1, String url2) {
 61 |         WebPage first = getWebPage(url1);
 62 |         if (first == null) {
 63 |             return 0;
 64 |         }
 65 |         WebPage second = getWebPage(url2);
 66 |         if (second == null) {
 67 |             return 0;
 68 |         }
 69 |         // Keep two decimals: scale by 100, add 0.5, truncate, scale back
 70 |         return (int) (score(first, second) * 100 + 0.5) / (double) 100;
 71 |     }
 72 | 
 73 |     private double score(WebPage webPage1, WebPage webPage2){
 74 |         //分词
 75 |         List<Word> webPage1Words = WordSegmenter.seg(webPage1.getTitle()+"\n"+webPage1.getContent());
 76 |         List<Word> webPage2Words = WordSegmenter.seg(webPage2.getTitle()+"\n"+webPage2.getContent());
 77 |         //词频统计
 78 |         Map<Word, AtomicInteger> webPage1WordsFre = frequence(webPage1Words);
 79 |         Map<Word, AtomicInteger> webPage2WordsFre = frequence(webPage2Words);
 80 |         //输出详细信息
 81 |         if(LOGGER.isDebugEnabled()){
 82 |             showDetail(webPage1, webPage1Words, webPage1WordsFre);
 83 |             showDetail(webPage2, webPage2Words, webPage2WordsFre);
 84 |         }
 85 |         //使用简单共有词判定
 86 |         return simpleScore(webPage1WordsFre, webPage2WordsFre);
 87 |         //使用余弦相似度判定
 88 |         //return cosScore(webPage1WordsFre, webPage2WordsFre);
 89 |     }
 90 | 
 91 |     /**
 92 |      * Similarity measure one: share of common words, i.e. the number of distinct
 93 |      * words found on both pages divided by the smaller distinct-word count.
 94 |      * @param webPage1WordsFre word frequencies of page 1
 95 |      * @param webPage2WordsFre word frequencies of page 2
 96 |      * @return score in [0, 1]; 0 when either page has no words (instead of NaN)
 97 |      */
 98 |     private double simpleScore(Map<Word, AtomicInteger> webPage1WordsFre, Map<Word, AtomicInteger> webPage2WordsFre){
 99 |         // Distinct words of page 1 that also occur on page 2
100 |         long shared = webPage1WordsFre.keySet().stream()
101 |                 .filter(webPage2WordsFre::containsKey)
102 |                 .count();
103 |         int smaller = Math.min(webPage1WordsFre.size(), webPage2WordsFre.size());
104 |         LOGGER.info("网页1有的词数：" + webPage1WordsFre.size());
105 |         LOGGER.info("网页2有的词数：" + webPage2WordsFre.size());
106 |         LOGGER.info("网页1和2共有的词数：" + shared);
107 |         // Guard the division: 0/0 would otherwise produce NaN
108 |         double score = smaller == 0 ? 0 : shared/(double)smaller;
109 |         LOGGER.info("相似度分值="+shared+"/(double)Math.min("+webPage1WordsFre.size()+", "+webPage2WordsFre.size()+")="+score);
110 |         return score;
111 |     }
112 | 
113 |     /**
114 |      * Similarity measure two: cosine similarity of the word-frequency vectors.
115 |      * Every distinct word of either page is one dimension; the coordinate of a
116 |      * page in that dimension is the frequency of the word on that page, 0 when
117 |      * the page lacks the word.
118 |      * For two vectors a=(x1,y1) and b=(x2,y2):
119 |      *   a.b = x1*x2 + y1*y2
120 |      *   |a| = sqrt(x1^2 + y1^2), |b| = sqrt(x2^2 + y2^2)
121 |      *   cos = a.b / (|a| * |b|)
122 |      * @param webPage1WordsFre word frequencies of page 1
123 |      * @param webPage2WordsFre word frequencies of page 2
124 |      * @return cosine of the angle between both vectors; 0 when either page has no
125 |      *         words, a case in which the quotient would otherwise be NaN
126 |      */
127 |     private double cosScore(Map<Word, AtomicInteger> webPage1WordsFre, Map<Word, AtomicInteger> webPage2WordsFre){
128 |         // The union of the words of both pages spans the vector space (the original
129 |         // author notes that stop words were already dropped during segmentation)
130 |         Set<Word> words = new HashSet<>();
131 |         words.addAll(webPage1WordsFre.keySet());
132 |         words.addAll(webPage2WordsFre.keySet());
133 | 
134 |         // a.b; long because sums of frequency products may exceed the int range
135 |         long ab = 0;
136 |         // |a|^2; the square root is taken after the loop
137 |         long aa = 0;
138 |         // |b|^2
139 |         long bb = 0;
140 | 
141 |         for(Word word : words){
142 |             AtomicInteger x1 = webPage1WordsFre.get(word);
143 |             AtomicInteger x2 = webPage2WordsFre.get(word);
144 |             if(x1!=null && x2!=null) {
145 |                 // A word missing on one page contributes nothing to a.b
146 |                 ab += (long) x1.get() * x2.get();
147 |             }
148 |             if(x1!=null){
149 |                 aa += (long) x1.get() * x1.get();
150 |             }
151 |             if(x2!=null){
152 |                 bb += (long) x2.get() * x2.get();
153 |             }
154 |         }
155 | 
156 |         if(aa == 0 || bb == 0){
157 |             // Zero vector (page without words): avoid dividing 0 by 0
158 |             return 0;
159 |         }
160 | 
161 |         double aaa = Math.sqrt(aa);
162 |         double bbb = Math.sqrt(bb);
163 |         // The two norms are multiplied as BigDecimal, kept from the original code
164 |         BigDecimal aabb = BigDecimal.valueOf(aaa).multiply(BigDecimal.valueOf(bbb));
165 |         double cos = ab/aabb.doubleValue();
166 |         return cos;
167 |     }
168 | 
169 |     private void showDetail(WebPage webPage, List<Word> webPageWords, Map<Word, AtomicInteger> webPageWordsFre){
170 |         LOGGER.debug("网页URL：");
171 |         LOGGER.debug("\t"+webPage.getUrl());
172 |         LOGGER.debug("网页标题：");
173 |         LOGGER.debug("\t"+webPage.getTitle());
174 |         LOGGER.debug("网页内容：");
175 |         LOGGER.debug("\t"+webPage.getContent());
176 |         LOGGER.debug("网页长度："+webPage.getContent().length());
177 |         LOGGER.debug("网页分词结果：");
178 |         LOGGER.debug("\t"+webPageWords);
179 |         LOGGER.debug("网页词频统计：");
180 |         AtomicInteger c = new AtomicInteger();
181 |         webPageWordsFre
182 |                 .entrySet()
183 |                 .stream()
184 |                 .sorted((a,b)->b.getValue().get()-a.getValue().get())
185 |                 .forEach(e->LOGGER.debug("\t"+c.incrementAndGet()+"、"+e.getKey()+"="+e.getValue()));
186 |     }
187 | 
188 |     private Map<Word, AtomicInteger> frequence(List<Word> words){
189 |         Map<Word, AtomicInteger> fre =new HashMap<>();
190 |         words.forEach(word->{
191 |             fre.putIfAbsent(word, new AtomicInteger());
192 |             fre.get(word).incrementAndGet();
193 |         });
194 |         return fre;
195 |     }
196 | 
197 |     private WebPage getWebPage(String url){
198 |         WebPage webPage = getWebPageInternal(url);
199 |         int times = 1;
200 |         while (webPage==null && times<4){
201 |             times++;
202 |             //使用新的IP地址
203 |             DynamicIp.toNewIp();
204 |             webPage = getWebPageInternal(url);
205 |         }
206 |         if(webPage==null){
207 |             return null;
208 |         }
209 |         times = 1;
210 |         //LOGGER.debug("获取到的HTML：" +html);
211 |         while((webPage.getContent().contains("非常抱歉，来自您ip的请求异常频繁")
212 |                 || webPage.getContent().contains("请您点击按钮解除封锁")
213 |                 || webPage.getContent().contains("请输入以下验证码"))
214 |                 && times<4){
215 |             times++;
216 |             //使用新的IP地址
217 |             DynamicIp.toNewIp();
218 |             webPage = getWebPageInternal(url);
219 |         }
220 |         return webPage;
221 |     }
222 |     private WebPage getWebPageInternal(String url) {
223 |         try {
224 |             Document doc = Jsoup.connect(url)
225 |                     .header("Accept", ACCEPT)
226 |                     .header("Accept-Encoding", ENCODING)
227 |                     .header("Accept-Language", LANGUAGE)
228 |                     .header("Connection", CONNECTION)
229 |                     .header("Referer", "http://"+new URL(url).getHost())
230 |                     .header("Host", new URL(url).getHost())
231 |                     .header("User-Agent", USER_AGENT)
232 |                     .header("X-Forwarded-For", getRandomIp())
233 |                     .header("Proxy-Client-IP", getRandomIp())
234 |                     .header("WL-Proxy-Client-IP", getRandomIp())
235 |                     .ignoreContentType(true)
236 |                     .timeout(30000)
237 |                     .get();
238 |             WebPage webPage = new WebPage();
239 |             webPage.setUrl(url);
240 |             webPage.setContent(doc.text());
241 |             webPage.setTitle(doc.title());
242 |             return webPage;
243 |         } catch (Exception e) {
244 |             LOGGER.error("获取网页失败", e);
245 |         }
246 |         return null;
247 |     }
248 |     private String getRandomIp(){
249 |         int first = new Random().nextInt(254)+1;
250 |         //排除A类私有地址0.0.0.0--10.255.255.255
251 |         while(first==10){
252 |             first = new Random().nextInt(254)+1;
253 |         }
254 |         int second = new Random().nextInt(254)+1;
255 |         //排除B类私有地址172.16.0.0--172.31.255.255
256 |         while(first==172 && (second>=16 && second<=31)){
257 |             first = new Random().nextInt(254)+1;
258 |             second = new Random().nextInt(254)+1;
259 |         }
260 |         //排除C类私有地址192.168.0.0--192.168.255.255
261 |         while(first==192 && second==168){
262 |             first = new Random().nextInt(254)+1;
263 |             second = new Random().nextInt(254)+1;
264 |         }
265 |         int third = new Random().nextInt(254)+1;
266 |         int forth = new Random().nextInt(254)+1;
267 |         return first+"."+second+"."+second+"."+forth;
268 |     }
269 |     private static class WebPage{
270 |         private String url;
271 |         private String title;
272 |         private String content;
273 | 
274 |         public String getUrl() {
275 |             return url;
276 |         }
277 | 
278 |         public void setUrl(String url) {
279 |             this.url = url;
280 |         }
281 | 
282 |         public String getTitle() {
283 |             return title;
284 |         }
285 | 
286 |         public void setTitle(String title) {
287 |             this.title = title;
288 |         }
289 | 
290 |         public String getContent() {
291 |             return content;
292 |         }
293 | 
294 |         public void setContent(String content) {
295 |             this.content = content;
296 |         }
297 |     }
298 | 
299 |     /**
300 |      * 我的ITEYE和OSCHINA博客有很多同样的博文，主要目的是备份
301 |      * 这里刚好用来测试相似性检测算法的效果
302 |      * http://yangshangchuan.iteye.com
303 |      * http://my.oschina.net/apdplat/blog
304 |      */
305 |     private void verifyYscBlog(){
306 |         List<Article> ob = DefaultParser.oschinaBlog();
307 |         List<Article> ib = DefaultParser.iteyeBlog();
308 |         Map<String, String> om = new HashMap<>();
309 |         Map<String, String> im = new HashMap<>();
310 |         ob.stream().forEach(b->om.put(b.getTitle(), b.getUrl()));
311 |         ib.stream().forEach(b->im.put(b.getTitle(), b.getUrl()));
312 |         List<String> oschinaBlog = ob.stream().map(b -> b.getTitle()).sorted().collect(Collectors.toList());
313 |         List<String> iteyeBlog   = ib.stream().map(b -> b.getTitle()).sorted().collect(Collectors.toList());
314 | 
315 |         List<String> commons = oschinaBlog.stream().filter(b -> iteyeBlog.contains(b)).collect(Collectors.toList());
316 |         commons.remove("自动更改IP地址反爬虫封锁，支持多线程");
317 |         Map<String, Double> result = new HashMap<>();
318 |         AtomicInteger similarCount = new AtomicInteger();
319 |         commons.forEach(title -> {
320 |             double score = similarScore(om.get(title), im.get(title));
321 |             result.put(title, score);
322 |             if (score >= THRESHOLD_RATE) {
323 |                 similarCount.incrementAndGet();
324 |             }
325 |         });
326 |         LOGGER.info("<h4>检查的博文数：" + commons.size() + "，相似度大于等于" + THRESHOLD_RATE + "的博文数：" + similarCount.get() + "，相似度小于" + THRESHOLD_RATE + "的博文数：" + (commons.size() - similarCount.get())+"</h4>");
327 |         AtomicInteger i = new AtomicInteger();
328 |         result
329 |             .entrySet()
330 |             .stream()
331 |             .sorted((a, b) -> b.getValue().compareTo(a.getValue()))
332 |             .forEach(e -> {
333 |                 LOGGER.info("");
334 |                 LOGGER.info("<h4>"+i.incrementAndGet() + "、检查博文" + "：" + e.getKey()+"，相似度分值："+e.getValue().doubleValue()+"</h4>");
335 |             LOGGER.info("\t博文地址1：<a target=\"_blank\" href=\""+om.get(e.getKey())+"\">"+om.get(e.getKey())+"</a><br/>");
336 |             LOGGER.info("\t博文地址2：<a target=\"_blank\" href=\""+im.get(e.getKey())+"\">"+im.get(e.getKey())+"</a><br/>");
337 |             });
338 |     }
339 |     public static void main(String[] args) throws Exception{
340 |         GenericWebPageSimilarChecker genericWebPageSimilarChecker = new GenericWebPageSimilarChecker();
341 |         genericWebPageSimilarChecker.verifyYscBlog();
342 |     }
343 | }
344 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/impl/ITEYEBlogSimilarChecker.java:
--------------------------------------------------------------------------------
  1 | /**
  2 |  *
  3 |  * APDPlat - Application Product Development Platform
  4 |  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
  5 |  *
  6 |  * This program is free software: you can redistribute it and/or modify
  7 |  * it under the terms of the GNU General Public License as published by
  8 |  * the Free Software Foundation, either version 3 of the License, or
  9 |  * (at your option) any later version.
 10 |  *
 11 |  * This program is distributed in the hope that it will be useful,
 12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 |  * GNU General Public License for more details.
 15 |  *
 16 |  * You should have received a copy of the GNU General Public License
 17 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 18 |  *
 19 |  */
 20 | 
 21 | package org.seo.rank.impl;
 22 | 
 23 | import org.apdplat.word.WordSegmenter;
 24 | import org.apdplat.word.segmentation.Word;
 25 | import org.jsoup.Jsoup;
 26 | import org.jsoup.nodes.Document;
 27 | import org.jsoup.nodes.Element;
 28 | import org.jsoup.select.Elements;
 29 | import org.seo.rank.SimilarChecker;
 30 | import org.seo.rank.tools.DynamicIp;
 31 | import org.slf4j.Logger;
 32 | import org.slf4j.LoggerFactory;
 33 | 
 34 | import java.math.BigDecimal;
 35 | import java.util.*;
 36 | import java.util.concurrent.atomic.AtomicInteger;
 37 | 
 38 | /**
 39 |  * ITEYE博文相似性检测
 40 |  * @author 杨尚川
 41 |  */
 42 | public class ITEYEBlogSimilarChecker implements SimilarChecker{
 43 |     private static final Logger LOGGER = LoggerFactory.getLogger(ITEYEBlogSimilarChecker.class);
 44 |     private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
 45 |     private static final String ENCODING = "gzip, deflate";
 46 |     private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
 47 |     private static final String CONNECTION = "keep-alive";
 48 |     private static final String REFERER = "http://yangshangchuan.iteye.com";
 49 |     private static final String HOST = "yangshangchuan.iteye.com";
 50 |     private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0";
 51 |     private static final String BLOG_CSS_PATH = "html body div#page div#content.clearfix div#main div.blog_main";
 52 |     private static final String BLOG_TITLE_CSS_PATH = "div.blog_title";
 53 |     private static final String BLOG_CONTENT_CSS_PATH = "div#blog_content.blog_content";
 54 |     private static final float THRESHOLD_RATE = 0.8F;
 55 | 
 56 |     @Override
 57 |     public boolean isSimilar(String url1, String url2) {
 58 |         return similarScore(url1, url2)>=THRESHOLD_RATE;
 59 |     }
 60 |     @Override
 61 |     public double similarScore(String url1, String url2) {
 62 |         Blog blog1 = getBlog(url1);
 63 |         if(blog1!=null) {
 64 |             Blog blog2 = getBlog(url2);
 65 |             if(blog2!=null) {
 66 |                 double score = score(blog1, blog2);
 67 |                 //取两位小数
 68 |                 score = (int)(score*100)/(double)100;
 69 |                 return score;
 70 |             }
 71 |         }
 72 |         return 0;
 73 |     }
 74 | 
 75 |     private double score(Blog blog1, Blog blog2){
 76 |         //分词
 77 |         List<Word> blog1Words = WordSegmenter.seg(blog1.getTitle()+"\n"+blog1.getContent());
 78 |         List<Word> blog2Words = WordSegmenter.seg(blog2.getTitle()+"\n"+blog2.getContent());
 79 |         //词频统计
 80 |         Map<Word, AtomicInteger> blog1WordsFre = frequence(blog1Words);
 81 |         Map<Word, AtomicInteger> blog2WordsFre = frequence(blog2Words);
 82 |         //输出详细信息
 83 |         if(LOGGER.isDebugEnabled()){
 84 |             showDetail(blog1, blog1Words, blog1WordsFre);
 85 |             showDetail(blog2, blog2Words, blog2WordsFre);
 86 |         }
 87 |         //使用简单共有词判定
 88 |         return simpleScore(blog1WordsFre, blog2WordsFre);
 89 |         //使用余弦相似度判定
 90 |         //return cosScore(blog1WordsFre, blog2WordsFre);
 91 |     }
 92 | 
 93 |     /**
 94 |      * 判定相似性的方式一：简单共有词
 95 |      * @param blog1WordsFre
 96 |      * @param blog2WordsFre
 97 |      * @return
 98 |      */
 99 |     private double simpleScore(Map<Word, AtomicInteger> blog1WordsFre, Map<Word, AtomicInteger> blog2WordsFre){
100 |         //判断有几个相同的词
101 |         AtomicInteger intersectionLength = new AtomicInteger();
102 |         blog1WordsFre.keySet().forEach(word -> {
103 |             if (blog2WordsFre.keySet().contains(word)) {
104 |                 intersectionLength.incrementAndGet();
105 |             }
106 |         });
107 |         LOGGER.info("网页1有的词数：" + blog1WordsFre.size());
108 |         LOGGER.info("网页2有的词数：" + blog2WordsFre.size());
109 |         LOGGER.info("网页1和2共有的词数：" + intersectionLength.get());
110 |         double score = intersectionLength.get()/(double)Math.min(blog1WordsFre.size(), blog2WordsFre.size());
111 |         LOGGER.info("相似度分值="+intersectionLength.get()+"/(double)Math.min("+blog1WordsFre.size()+", "+blog2WordsFre.size()+")="+score);
112 |         return score;
113 |     }
114 | 
115 |     /**
116 |      *
117 |      * 判定相似性的方式二：余弦相似度
118 |      * 余弦夹角原理：
119 |      * 向量a=(x1,y1),向量b=(x2,y2)
120 |      * a.b=x1x2+y1y2
121 |      * |a|=根号[(x1)^2+(y1)^2],|b|=根号[(x2)^2+(y2)^2]
122 |      * a,b的夹角的余弦cos=a.b/|a|*|b|=(x1x2+y1y2)/根号[(x1)^2+(y1)^2]*根号[(x2)^2+(y2)^2]
123 |      * @param blog1WordsFre
124 |      * @param blog2WordsFre
125 |      */
126 |     private double cosScore(Map<Word, AtomicInteger> blog1WordsFre, Map<Word, AtomicInteger> blog2WordsFre){
127 |         Set<Word> words = new HashSet<>();
128 |         words.addAll(blog1WordsFre.keySet());
129 |         words.addAll(blog2WordsFre.keySet());
130 |         //向量的维度为words的大小，每一个维度的权重是词频，注意的是，中文分词的时候已经去了停用词
131 |         //a.b
132 |         AtomicInteger ab = new AtomicInteger();
133 |         //|a|
134 |         AtomicInteger aa = new AtomicInteger();
135 |         //|b|
136 |         AtomicInteger bb = new AtomicInteger();
137 |         //计算
138 |         words
139 |             .stream()
140 |             .forEach(word -> {
141 |                 AtomicInteger x1 = blog1WordsFre.get(word);
142 |                 AtomicInteger x2 = blog2WordsFre.get(word);
143 |                 if(x1!=null && x2!=null) {
144 |                     //x1x2
145 |                     int oneOfTheDimension = x1.get() * x2.get();
146 |                     //+
147 |                     ab.addAndGet(oneOfTheDimension);
148 |                 }
149 |                 if(x1!=null){
150 |                     //(x1)^2
151 |                     int oneOfTheDimension = x1.get() * x1.get();
152 |                     //+
153 |                     aa.addAndGet(oneOfTheDimension);
154 |                 }
155 |                 if(x2!=null){
156 |                     //(x2)^2
157 |                     int oneOfTheDimension = x2.get() * x2.get();
158 |                     //+
159 |                     bb.addAndGet(oneOfTheDimension);
160 |                 }
161 |             });
162 | 
163 |         double aaa = Math.sqrt(aa.get());
164 |         double bbb = Math.sqrt(bb.get());
165 |         //使用BigDecimal保证精确计算浮点数
166 |         BigDecimal aabb = BigDecimal.valueOf(aaa).multiply(BigDecimal.valueOf(bbb));
167 |         double cos = ab.get()/aabb.doubleValue();
168 |         return cos;
169 |     }
170 | 
171 |     private void showDetail(Blog blog, List<Word> blogWords, Map<Word, AtomicInteger> blogWordsFre){
172 |         LOGGER.debug("博文URL：");
173 |         LOGGER.debug("\t"+blog.getUrl());
174 |         LOGGER.debug("博文标题：");
175 |         LOGGER.debug("\t"+blog.getTitle());
176 |         LOGGER.debug("博文内容：");
177 |         LOGGER.debug("\t"+blog.getContent());
178 |         LOGGER.debug("博文长度："+blog.getContent().length());
179 |         LOGGER.debug("博文分词结果：");
180 |         LOGGER.debug("\t" + blogWords);
181 |         LOGGER.debug("博文词频统计：");
182 |         AtomicInteger c = new AtomicInteger();
183 |         blogWordsFre
184 |                 .entrySet()
185 |                 .stream()
186 |                 .sorted((a,b)->b.getValue().get()-a.getValue().get())
187 |                 .forEach(e->LOGGER.debug("\t"+c.incrementAndGet()+"、"+e.getKey()+"="+e.getValue()));
188 |     }
189 | 
190 |     private Map<Word, AtomicInteger> frequence(List<Word> words){
191 |         Map<Word, AtomicInteger> fre =new HashMap<>();
192 |         words.forEach(word->{
193 |             fre.putIfAbsent(word, new AtomicInteger());
194 |             fre.get(word).incrementAndGet();
195 |         });
196 |         return fre;
197 |     }
198 | 
199 |     private Blog getBlog(String url) {
200 |         try {
201 |             String html = getHtml(url);
202 |             Document doc = Jsoup.parse(html);
203 |             Elements elements = doc.select(BLOG_CSS_PATH);
204 |             String title = null;
205 |             String content = null;
206 |             for(Element element : elements){
207 |                 Elements ts = element.select(BLOG_TITLE_CSS_PATH);
208 |                 if(ts.size()==1){
209 |                     title = ts.get(0).text();
210 |                 }
211 |                 ts = element.select(BLOG_CONTENT_CSS_PATH);
212 |                 if(ts.size()==1){
213 |                     content = ts.get(0).text();
214 |                 }
215 |             }
216 |             if(title!=null && content!=null){
217 |                 Blog blog = new Blog();
218 |                 blog.setUrl(url);
219 |                 blog.setTitle(title);
220 |                 blog.setContent(content);
221 |                 return blog;
222 |             }
223 |         } catch (Exception e) {
224 |             LOGGER.error("获取博文失败", e);
225 |         }
226 |         return null;
227 |     }
228 |     private String getHtml(String url){
229 |         String html = getHtmlInternal(url);
230 |         int times = 1;
231 |         while (html==null && times<4){
232 |             times++;
233 |             //使用新的IP地址
234 |             DynamicIp.toNewIp();
235 |             html = getHtmlInternal(url);
236 |         }
237 |         times = 1;
238 |         //LOGGER.debug("获取到的HTML：" +html);
239 |         while((html.contains("非常抱歉，来自您ip的请求异常频繁")
240 |                 || html.contains("请您点击按钮解除封锁")
241 |                 || html.contains("请输入以下验证码"))
242 |                 && times<4){
243 |             times++;
244 |             //使用新的IP地址
245 |             DynamicIp.toNewIp();
246 |             html = getHtmlInternal(url);
247 |         }
248 |         return html;
249 |     }
250 |     private String getHtmlInternal(String url) {
251 |         try {
252 |             return Jsoup.connect(url)
253 |                     .header("Accept", ACCEPT)
254 |                     .header("Accept-Encoding", ENCODING)
255 |                     .header("Accept-Language", LANGUAGE)
256 |                     .header("Connection", CONNECTION)
257 |                     .header("Referer", REFERER)
258 |                     .header("Host", HOST)
259 |                     .header("User-Agent", USER_AGENT)
260 |                     .header("X-Forwarded-For", getRandomIp())
261 |                     .header("Proxy-Client-IP", getRandomIp())
262 |                     .header("WL-Proxy-Client-IP", getRandomIp())
263 |                     .ignoreContentType(true)
264 |                     .timeout(30000)
265 |                     .get().html();
266 |         } catch (Exception e) {
267 |             LOGGER.error("获取博文失败", e);
268 |         }
269 |         return null;
270 |     }
271 |     private String getRandomIp(){
272 |         int first = new Random().nextInt(254)+1;
273 |         //排除A类私有地址0.0.0.0--10.255.255.255
274 |         while(first==10){
275 |             first = new Random().nextInt(254)+1;
276 |         }
277 |         int second = new Random().nextInt(254)+1;
278 |         //排除B类私有地址172.16.0.0--172.31.255.255
279 |         while(first==172 && (second>=16 && second<=31)){
280 |             first = new Random().nextInt(254)+1;
281 |             second = new Random().nextInt(254)+1;
282 |         }
283 |         //排除C类私有地址192.168.0.0--192.168.255.255
284 |         while(first==192 && second==168){
285 |             first = new Random().nextInt(254)+1;
286 |             second = new Random().nextInt(254)+1;
287 |         }
288 |         int third = new Random().nextInt(254)+1;
289 |         int forth = new Random().nextInt(254)+1;
290 |         return first+"."+second+"."+second+"."+forth;
291 |     }
292 |     private static class Blog{
293 |         private String url;
294 |         private String title;
295 |         private String content;
296 | 
297 |         public String getUrl() {
298 |             return url;
299 |         }
300 | 
301 |         public void setUrl(String url) {
302 |             this.url = url;
303 |         }
304 | 
305 |         public String getTitle() {
306 |             return title;
307 |         }
308 | 
309 |         public void setTitle(String title) {
310 |             this.title = title;
311 |         }
312 | 
313 |         public String getContent() {
314 |             return content;
315 |         }
316 | 
317 |         public void setContent(String content) {
318 |             this.content = content;
319 |         }
320 |     }
321 | 
322 |     public static void main(String[] args) {
323 |         SimilarChecker similarChecker = new ITEYEBlogSimilarChecker();
324 |         double score = similarChecker.similarScore("http://baidu-27233181.iteye.com/blog/2200707",
325 |                 "http://baidu-27233181.iteye.com/blog/2200706");
326 |         LOGGER.info("相似度分值："+score);
327 |     }
328 | }
329 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/impl/WordBasedGenericWebPageSimilarChecker.java:
--------------------------------------------------------------------------------
  1 | /**
  2 |  *
  3 |  * APDPlat - Application Product Development Platform
  4 |  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
  5 |  *
  6 |  * This program is free software: you can redistribute it and/or modify
  7 |  * it under the terms of the GNU General Public License as published by
  8 |  * the Free Software Foundation, either version 3 of the License, or
  9 |  * (at your option) any later version.
 10 |  *
 11 |  * This program is distributed in the hope that it will be useful,
 12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 |  * GNU General Public License for more details.
 15 |  *
 16 |  * You should have received a copy of the GNU General Public License
 17 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 18 |  *
 19 |  */
 20 | 
 21 | package org.seo.rank.impl;
 22 | 
 23 | import org.apdplat.word.analysis.*;
 24 | import org.jsoup.Jsoup;
 25 | import org.jsoup.nodes.Document;
 26 | import org.seo.rank.SimilarChecker;
 27 | import org.seo.rank.list.impl.DefaultParser;
 28 | import org.seo.rank.model.Article;
 29 | import org.seo.rank.tools.DynamicIp;
 30 | import org.seo.rank.tools.ProxyIp;
 31 | import org.slf4j.Logger;
 32 | import org.slf4j.LoggerFactory;
 33 | 
 34 | import java.math.BigDecimal;
 35 | import java.net.URL;
 36 | import java.util.*;
 37 | import java.util.concurrent.atomic.AtomicInteger;
 38 | import java.util.stream.Collectors;
 39 | 
 40 | /**
 41 |  * 基于word分词提供的文本相似度算法来实现通用的网页相似度检测
 42 |  * @author 杨尚川
 43 |  */
 44 | public class WordBasedGenericWebPageSimilarChecker implements SimilarChecker {
    private static final Logger LOGGER = LoggerFactory.getLogger(WordBasedGenericWebPageSimilarChecker.class);
    // Values of the Accept, Accept-Encoding, Accept-Language, Connection and
    // User-Agent request headers sent by getWebPageInternal.
    private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    private static final String ENCODING = "gzip, deflate";
    private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    private static final String CONNECTION = "keep-alive";
    private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0";
    // isSimilar reports two pages as similar when their score is at least this value.
    private static final float THRESHOLD_RATE = 0.5F;
    // Algorithm used by similarScore; replaceable through the constructor or setter.
    private TextSimilarity textSimilarity = new EditDistanceTextSimilarity();
 53 | 
    /** Creates a checker that keeps the field's initial algorithm, {@link EditDistanceTextSimilarity}. */
    public WordBasedGenericWebPageSimilarChecker(){}

    /**
     * Creates a checker that scores pages with the given algorithm.
     *
     * @param textSimilarity algorithm used by {@link #similarScore(String, String)}
     */
    public WordBasedGenericWebPageSimilarChecker(TextSimilarity textSimilarity){
        this.textSimilarity = textSimilarity;
    }
 59 | 
    /**
     * Replaces the algorithm used by {@link #similarScore(String, String)}.
     *
     * @param textSimilarity the algorithm to use from now on
     */
    public void setTextSimilarity(TextSimilarity textSimilarity) {
        this.textSimilarity = textSimilarity;
    }
 63 | 
 64 |     @Override
 65 |     public boolean isSimilar(String url1, String url2) {
 66 |         return similarScore(url1, url2)>=THRESHOLD_RATE;
 67 |     }
 68 | 
 69 |     @Override
 70 |     public double similarScore(String url1, String url2) {
 71 |         WebPage webPage1 = getWebPage(url1);
 72 |         if(webPage1!=null) {
 73 |             WebPage webPage2 = getWebPage(url2);
 74 |             if(webPage2!=null) {
 75 |                 double score = textSimilarity.similarScore(webPage1.getContent(), webPage2.getContent());
 76 |                 return score;
 77 |             }
 78 |         }
 79 |         return 0.0;
 80 |     }
 81 | 
 82 |     public String contrastSimilarScore(String url1, String url2, List<TextSimilarity> textSimilarities) {
 83 |         StringBuilder result = new StringBuilder();
 84 |         WebPage webPage1 = getWebPage(url1);
 85 |         if(webPage1!=null) {
 86 |             WebPage webPage2 = getWebPage(url2);
 87 |             if(webPage2!=null) {
 88 |                 textSimilarities.forEach(textSimilarity -> {
 89 |                     double score = textSimilarity.similarScore(webPage1.getContent(), webPage2.getContent());
 90 |                     result.append(textSimilarity.getClass().getSimpleName().replace("TextSimilarity", ""))
 91 |                             .append("=")
 92 |                             .append(BigDecimal.valueOf(score).toString())
 93 |                             .append(" ");
 94 |                 });
 95 |             }
 96 |         }
 97 |         return result.toString();
 98 |     }
 99 | 
100 |     private WebPage getWebPage(String url){
101 |         WebPage webPage = getWebPageInternal(url);
102 |         int times = 1;
103 |         while (webPage==null && times<4){
104 |             times++;
105 |             //使用新的IP地址
106 |             DynamicIp.toNewIp();
107 |             webPage = getWebPageInternal(url);
108 |         }
109 |         if(webPage==null){
110 |             return null;
111 |         }
112 |         times = 1;
113 |         //LOGGER.debug("获取到的HTML：" +html);
114 |         while((webPage.getContent().contains("非常抱歉，来自您ip的请求异常频繁")
115 |                 || webPage.getContent().contains("请您点击按钮解除封锁")
116 |                 || webPage.getContent().contains("请输入以下验证码"))
117 |                 && times<4){
118 |             times++;
119 |             //使用新的IP地址
120 |             ProxyIp.toNewIp();
121 |             webPage = getWebPageInternal(url);
122 |         }
123 |         return webPage;
124 |     }
125 |     private WebPage getWebPageInternal(String url) {
126 |         try {
127 |             Document doc = Jsoup.connect(url)
128 |                     .header("Accept", ACCEPT)
129 |                     .header("Accept-Encoding", ENCODING)
130 |                     .header("Accept-Language", LANGUAGE)
131 |                     .header("Connection", CONNECTION)
132 |                     .header("Referer", "http://"+new URL(url).getHost())
133 |                     .header("Host", new URL(url).getHost())
134 |                     .header("User-Agent", USER_AGENT)
135 |                     .header("X-Forwarded-For", getRandomIp())
136 |                     .header("Proxy-Client-IP", getRandomIp())
137 |                     .header("WL-Proxy-Client-IP", getRandomIp())
138 |                     .ignoreContentType(true)
139 |                     .timeout(30000)
140 |                     .get();
141 |             WebPage webPage = new WebPage();
142 |             webPage.setUrl(url);
143 |             webPage.setContent(doc.text());
144 |             webPage.setTitle(doc.title());
145 |             return webPage;
146 |         } catch (Exception e) {
147 |             LOGGER.error("获取网页失败", e);
148 |         }
149 |         return null;
150 |     }
151 |     private String getRandomIp(){
152 |         int first = new Random().nextInt(254)+1;
153 |         //排除A类私有地址0.0.0.0--10.255.255.255
154 |         while(first==10){
155 |             first = new Random().nextInt(254)+1;
156 |         }
157 |         int second = new Random().nextInt(254)+1;
158 |         //排除B类私有地址172.16.0.0--172.31.255.255
159 |         while(first==172 && (second>=16 && second<=31)){
160 |             first = new Random().nextInt(254)+1;
161 |             second = new Random().nextInt(254)+1;
162 |         }
163 |         //排除C类私有地址192.168.0.0--192.168.255.255
164 |         while(first==192 && second==168){
165 |             first = new Random().nextInt(254)+1;
166 |             second = new Random().nextInt(254)+1;
167 |         }
168 |         int third = new Random().nextInt(254)+1;
169 |         int forth = new Random().nextInt(254)+1;
170 |         return first+"."+second+"."+second+"."+forth;
171 |     }
172 |     private static class WebPage{
173 |         private String url;
174 |         private String title;
175 |         private String content;
176 | 
177 |         public String getUrl() {
178 |             return url;
179 |         }
180 | 
181 |         public void setUrl(String url) {
182 |             this.url = url;
183 |         }
184 | 
185 |         public String getTitle() {
186 |             return title;
187 |         }
188 | 
189 |         public void setTitle(String title) {
190 |             this.title = title;
191 |         }
192 | 
193 |         public String getContent() {
194 |             return content;
195 |         }
196 | 
197 |         public void setContent(String content) {
198 |             this.content = content;
199 |         }
200 |     }
201 | 
202 |     /**
203 |      * 我的ITEYE和OSCHINA博客有很多同样的博文，主要目的是备份
204 |      * 这里刚好用来测试相似性检测算法的效果
205 |      * http://yangshangchuan.iteye.com
206 |      * http://my.oschina.net/apdplat/blog
207 |      */
208 |     private void verifyYscBlog(List<TextSimilarity> textSimilarities){
209 |         List<Article> ob = DefaultParser.oschinaBlog();
210 |         List<Article> ib = DefaultParser.iteyeBlog();
211 |         Map<String, String> om = new HashMap<>();
212 |         Map<String, String> im = new HashMap<>();
213 |         ob.stream().forEach(b->om.put(b.getTitle(), b.getUrl()));
214 |         ib.stream().forEach(b->im.put(b.getTitle(), b.getUrl()));
215 |         List<String> oschinaBlog = ob.stream().map(b -> b.getTitle()).sorted().collect(Collectors.toList());
216 |         List<String> iteyeBlog   = ib.stream().map(b -> b.getTitle()).sorted().collect(Collectors.toList());
217 | 
218 |         List<String> commons = oschinaBlog.stream().filter(b -> iteyeBlog.contains(b)).collect(Collectors.toList());
219 |         commons.remove("自动更改IP地址反爬虫封锁，支持多线程");
220 |         Map<String, String> result = new HashMap<>();
221 |         AtomicInteger similarCount = new AtomicInteger();
222 |         AtomicInteger j = new AtomicInteger();
223 |         commons.forEach(title -> {
224 |             String contrastResult = contrastSimilarScore(om.get(title), im.get(title), textSimilarities);
225 |             LOGGER.info(contrastResult+" "+title+" "+om.get(title)+" "+im.get(title));
226 |             result.put(title, contrastResult);
227 |             LOGGER.info("进度：" + commons.size() + "/" + j.incrementAndGet());
228 |         });
229 |         LOGGER.info("<h4>检查的博文数：" + commons.size() + "</h4>");
230 |         AtomicInteger i = new AtomicInteger();
231 |         result
232 |             .entrySet()
233 |             .stream()
234 |             .forEach(e -> {
235 |                 LOGGER.info("");
236 |                 LOGGER.info("<h4>"+i.incrementAndGet() + "、检查博文" + "：" + e.getKey()+"，相似度分值："+e.getValue()+"</h4>");
237 |                 LOGGER.info("\t博文地址1：<a target=\"_blank\" href=\""+om.get(e.getKey())+"\">"+om.get(e.getKey())+"</a><br/>");
238 |                 LOGGER.info("\t博文地址2：<a target=\"_blank\" href=\""+im.get(e.getKey())+"\">"+im.get(e.getKey())+"</a><br/>");
239 |             });
240 |     }
241 |     public static void main(String[] args) throws Exception{
242 |         List<TextSimilarity> textSimilarities = Arrays.asList(new SimpleTextSimilarity(),
243 |                 new CosineTextSimilarity(),
244 |                 new EditDistanceTextSimilarity(),
245 |                 new EuclideanDistanceTextSimilarity(),
246 |                 new ManhattanDistanceTextSimilarity(),
247 |                 new JaccardTextSimilarity(),
248 |                 new JaroDistanceTextSimilarity(),
249 |                 new JaroWinklerDistanceTextSimilarity(),
250 |                 new SørensenDiceCoefficientTextSimilarity(),
251 |                 new SimHashPlusHammingDistanceTextSimilarity());
252 |         WordBasedGenericWebPageSimilarChecker similarChecker = new WordBasedGenericWebPageSimilarChecker();
253 |         similarChecker.verifyYscBlog(textSimilarities);
254 |     }
255 | }
256 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/list/Parser.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * 
 3 |  * APDPlat - Application Product Development Platform
 4 |  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
 5 |  * 
 6 |  * This program is free software: you can redistribute it and/or modify
 7 |  * it under the terms of the GNU General Public License as published by
 8 |  * the Free Software Foundation, either version 3 of the License, or
 9 |  * (at your option) any later version.
10 |  * 
11 |  * This program is distributed in the hope that it will be useful,
12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 |  * GNU General Public License for more details.
15 |  * 
16 |  * You should have received a copy of the GNU General Public License
17 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 |  * 
19 |  */
20 | 
21 | package org.seo.rank.list;
22 | 
23 | import java.util.List;
24 | import org.seo.rank.model.Article;
25 | 
26 | /**
27 |  * 解析所有的列表页面
28 |  * 获取文章的标题和URL
29 |  * @author 杨尚川
30 |  */
31 | public interface Parser {
32 |     /**
33 |      * 解析列表页面
34 |      * @param url 列表页面第一页
35 |      * @param nextPageCssQuery 获取下一页的CSS路径
36 |      * @param nextPageText 获取下一页的CSS路径元素中的文本值
37 |      * @param titleCssQuery 提取文章标题的CSS路径
38 |      * @return 
39 |      */
40 |     public List<Article> parse(String url, String nextPageCssQuery, String nextPageText, String titleCssQuery);
41 | }
42 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/list/UrlTools.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * 
 3 |  * APDPlat - Application Product Development Platform
 4 |  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
 5 |  * 
 6 |  * This program is free software: you can redistribute it and/or modify
 7 |  * it under the terms of the GNU General Public License as published by
 8 |  * the Free Software Foundation, either version 3 of the License, or
 9 |  * (at your option) any later version.
10 |  * 
11 |  * This program is distributed in the hope that it will be useful,
12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 |  * GNU General Public License for more details.
15 |  * 
16 |  * You should have received a copy of the GNU General Public License
17 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 |  * 
19 |  */
20 | 
21 | package org.seo.rank.list;
22 | 
23 | import java.io.IOException;
24 | import java.io.OutputStream;
25 | import java.net.MalformedURLException;
26 | import java.net.URL;
27 | import java.net.URLConnection;
28 | 
29 | /**
30 |  *
31 |  * @author 杨尚川
32 |  */
33 | public class UrlTools {
34 |     private static final String ACCEPT = "text/html, */*; q=0.01";
35 |     private static final String ENCODING = "gzip, deflate";
36 |     private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
37 |     private static final String CONNECTION = "keep-alive";
38 |     private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0";
39 |     
40 |     private UrlTools(){}
41 |     /**
42 |      * 将本页的非完整URL转换为完整的URL
43 |      * @param url 本页的URL
44 |      * @param href 本页上的相对或绝对非完整URL
45 |      * @return 完整的URL
46 |      * @throws MalformedURLException 
47 |      */
48 |     public static String normalizeUrl(String url, String href) throws MalformedURLException {
49 |         URL u = new URL(url);
50 |         String port = "";
51 |         if(u.getPort() > 0){
52 |             port = ":"+port;
53 |         }
54 |         String host = u.getProtocol()+"://"+u.getHost()+port;
55 |         if (!href.startsWith("http")) {
56 |             //处理非完整路径
57 |             if (href.startsWith("//")) {
58 |                 //处理绝对路径
59 |                 href = "http:" + href;
60 |             }else if (href.startsWith("/")) {
61 |                 //处理绝对路径
62 |                 href = host + href;
63 |             }else if(href.startsWith("?")) {
64 |                 //处理页面参数
65 |                 int index = url.indexOf("?");
66 |                 if(index > 0){
67 |                     String temp = url.substring(0, index);
68 |                     href = temp + href;
69 |                 }else{
70 |                     href = url + href;
71 |                 }
72 |             } else {
73 |                 //处理相对路径
74 |                 String temp = url;
75 |                 int index = url.lastIndexOf("/");
76 |                 if (index > 7) {
77 |                     //非协议后面的//
78 |                     //如：http://yangshangchuan.iteye.com/
79 |                     temp = url.substring(0, index + 1);
80 |                 } else {
81 |                     temp += "/";
82 |                 }
83 |                 href = temp + href;
84 |             }
85 |         }
86 |         return href;
87 |     }
88 | }
89 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/list/impl/DefaultParser.java:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * 
  3 |  * APDPlat - Application Product Development Platform
  4 |  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
  5 |  * 
  6 |  * This program is free software: you can redistribute it and/or modify
  7 |  * it under the terms of the GNU General Public License as published by
  8 |  * the Free Software Foundation, either version 3 of the License, or
  9 |  * (at your option) any later version.
 10 |  * 
 11 |  * This program is distributed in the hope that it will be useful,
 12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 |  * GNU General Public License for more details.
 15 |  * 
 16 |  * You should have received a copy of the GNU General Public License
 17 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 18 |  * 
 19 |  */
 20 | 
 21 | package org.seo.rank.list.impl;
 22 | 
 23 | import java.io.UnsupportedEncodingException;
 24 | import java.net.URLEncoder;
 25 | import java.util.*;
 26 | import java.util.concurrent.atomic.AtomicInteger;
 27 | import java.util.stream.Collectors;
 28 | 
 29 | import org.apache.commons.lang.StringUtils;
 30 | import org.jsoup.Jsoup;
 31 | import org.jsoup.nodes.Document;
 32 | import org.jsoup.nodes.Element;
 33 | import org.jsoup.select.Elements;
 34 | import org.seo.rank.list.Parser;
 35 | import org.seo.rank.list.UrlTools;
 36 | import org.seo.rank.model.Article;
 37 | import org.slf4j.Logger;
 38 | import org.slf4j.LoggerFactory;
 39 | 
 40 | /**
 41 |  *
 42 |  * @author 杨尚川
 43 |  */
 44 | public class DefaultParser implements Parser{
 45 |     private static final Logger LOGGER = LoggerFactory.getLogger(DefaultParser.class);
 46 |     private static final String ACCEPT = "text/html, */*; q=0.01";
 47 |     private static final String ENCODING = "gzip, deflate";
 48 |     private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
 49 |     private static final String CONNECTION = "keep-alive";
 50 |     private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0";
 51 |     
 52 |     @Override
 53 |     public List<Article> parse(String url, String nextPageCssQuery, String nextPageText, String titleCssQuery) {
 54 |         List<Article> articles = new ArrayList<>();
 55 |         try{
 56 |             Document document = Jsoup.connect(url)
 57 |                         .header("Accept", ACCEPT)
 58 |                         .header("Accept-Encoding", ENCODING)
 59 |                         .header("Accept-Language", LANGUAGE)
 60 |                         .header("Connection", CONNECTION)
 61 |                         .header("User-Agent", USER_AGENT)
 62 |                         .get();
 63 |             Elements elements = document.select(titleCssQuery);
 64 |             for(Element element : elements){
 65 |                 String title = element.text();
 66 |                 String href = element.attr("href");
 67 |                 if(!StringUtils.isBlank(title) && !StringUtils.isBlank(href)){
 68 |                     href = UrlTools.normalizeUrl(url, href);
 69 |                     Article article = new Article();
 70 |                     article.setTitle(title);
 71 |                     article.setUrl(href);
 72 |                     articles.add(article);
 73 |                 }else{
 74 |                     LOGGER.info("解析列表页出错："+url+" title:"+title+", href:"+href);
 75 |                 }
 76 |             }
 77 |             //获取下一页链接地址
 78 |             String nextPageUrl = getNextPageUrl(document, nextPageCssQuery, nextPageText);
 79 |             LOGGER.debug("下一页链接："+nextPageUrl);
 80 |             if(nextPageUrl != null){
 81 |                 nextPageUrl = UrlTools.normalizeUrl(url, nextPageUrl);
 82 |                 LOGGER.debug("规范化后的下一页链接："+nextPageUrl);
 83 |                 //解析下一页
 84 |                 List<Article> result = parse(nextPageUrl, nextPageCssQuery, nextPageText, titleCssQuery);
 85 |                 articles.addAll(result);
 86 |             }else{
 87 |                 LOGGER.info("列表页解析完毕，最后一页："+url);
 88 |             }
 89 |         }catch(Exception e){
 90 |             LOGGER.error("解析列表页出错："+url, e);
 91 |         }
 92 |         return articles;
 93 |     }
 94 |     /**
 95 |      * 获取下一页链接地址
 96 |      * @param document 本页文档对象
 97 |      * @param nextPageCssQuery 获取下一页的CSS路径
 98 |      * @param nextPageText 下一页CSS路径对应的元素的文本值
 99 |      * @return 下一页链接地址
100 |      */
101 |     private String getNextPageUrl(Document document, String nextPageCssQuery, String nextPageText){
102 |         Elements elements = document.select(nextPageCssQuery);
103 |         for(Element element : elements){
104 |             String text = element.text();
105 |             LOGGER.debug(text);
106 |             if(text != null && nextPageText.trim().equals(text.trim())){
107 |                 String href = element.attr("href");
108 |                 return href;
109 |             }
110 |         }
111 |         return null;
112 |     }
113 |     public static List<Article> run(String url, String nextPageCssQuery, String nextPageText, String titleCssQuery){
114 |         Parser parser = new DefaultParser();
115 |         long start = System.currentTimeMillis();
116 |         List<Article> articles = parser.parse(url, nextPageCssQuery, nextPageText, titleCssQuery);
117 |         long cost = System.currentTimeMillis() - start;
118 |         int i=1;
119 |         for(Article article : articles){
120 |             LOGGER.info((i++) + "、" + article.getTitle() + " : " + article.getUrl());
121 |         }
122 |         LOGGER.info("采集文章 " + articles.size() + " 篇耗时：" + cost / 1000.0 + " 秒");
123 |         return articles;
124 |     }
125 |     public static List<Article> iteyeBlog(){
126 |         String url = "http://yangshangchuan.iteye.com/";
127 |         String nextPageCssQuery = "html body div#page div#content.clearfix div#main div.pagination a.next_page";
128 |         String nextPageText = "下一页 »";
129 |         String titleCssQuery = "html body div#page div#content.clearfix div#main div.blog_main div.blog_title h3 a";
130 |         return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
131 |     }
132 |     public static List<Article> iteyeNews(){
133 |         String url = "http://www.iteye.com/news";
134 |         String nextPageCssQuery = "html body div#page div#content.clearfix div#main div#index_main div.pagination a.next_page";
135 |         String nextPageText = "下一页 »";
136 |         //h3 > a表示h3后直接跟着a，这样 h3 span.category a 就不会被选择
137 |         String titleCssQuery = "html body div#page div#content.clearfix div#main div#index_main div.news.clearfix div.content h3 > a";
138 |         return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
139 |     }
140 |     public static List<Article> iteyeMagazines(){
141 |         String url = "http://www.iteye.com/magazines";
142 |         String nextPageCssQuery = "html body div#page div#content.clearfix div#main div#index_main div.pagination a.next_page";
143 |         String nextPageText = "下一页 »";
144 |         String titleCssQuery = "html body div#page div#content.clearfix div#main div#index_main div.news.clearfix div.content h3 a";
145 |         return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
146 |     }
147 |     public static List<Article> csdnBlog(){
148 |         String url = "http://blog.csdn.net/iispring";
149 |         String nextPageCssQuery = "html body div#container div#body div#main div.main div#papelist.pagelist a";
150 |         String titleCssQuery = "html body div#container div#body div#main div.main div#article_list.list div.list_item.article_item div.article_title h1 span.link_title a";
151 |         String nextPageText = "下一页";
152 |         return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
153 |     }
154 |     public static List<Article> oschinaNews(){
155 |         String url = "http://www.oschina.net/news";
156 |         String nextPageCssQuery = "html body div#OSC_Screen div#OSC_Content.CenterDiv div#NewsChannel.Channel div#NewsList.ListPanel div#RecentNewsList.panel ul.pager li.page.next a";
157 |         String titleCssQuery = "html body div#OSC_Screen div#OSC_Content.CenterDiv div#NewsChannel.Channel div#NewsList.ListPanel div#RecentNewsList.panel ul.List li h2 a";
158 |         String nextPageText = ">";
159 |         return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
160 |     }
161 |     public static List<Article> oschinaBlog(){
162 |         String url = "http://my.oschina.net/apdplat/blog";
163 |         String nextPageCssQuery = "html body div#OSC_Screen div#OSC_Content div.blog-user div.container div.flex-item div#search_list.flex-item-9.flex-item-md-9.content div#list.list.blog-list div.pages.sm-hide ul li a";
164 |         String titleCssQuery = "html body div#OSC_Screen div#OSC_Content div.blog-user div.container div.flex-item div#search_list.flex-item-9.flex-item-md-9.content div#list.list.blog-list div.list-item div.layout div.layout-column div.title a";
165 |         String nextPageText = "下一页";
166 |         return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
167 |     }
168 |     public static List<Article> baidu(String query){
169 |         //对查询词进行编码
170 |         try {
171 |             query = URLEncoder.encode(query, "UTF-8");
172 |         } catch (UnsupportedEncodingException e) {
173 |             LOGGER.error("url构造失败", e);
174 |             return Collections.emptyList();
175 |         }
176 |         if(StringUtils.isBlank(query)){
177 |             return Collections.emptyList();
178 |         }
179 |         String url = "http://www.baidu.com/s?wd=" + query;
180 |         String nextPageCssQuery = "html body div div div p#page a.n";
181 |         String titleCssQuery = "html body div div div div div h3.t a";
182 |         String nextPageText = "下一页>";
183 |         return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
184 |     }
185 | 
186 |     /**
187 |      * 比较我的OSCHINA博客和ITEYE博客的异同
188 |      */
189 |     public static void blogCompare(){
190 |         List<Article> ob = oschinaBlog();
191 |         List<Article> ib = iteyeBlog();
192 |         Map<String, String> om = new HashMap<>();
193 |         Map<String, String> im = new HashMap<>();
194 |         ob.stream().forEach(b->om.put(b.getTitle(), b.getUrl()));
195 |         ib.stream().forEach(b->im.put(b.getTitle(), b.getUrl()));
196 |         List<String> iteyeBlog   = ib.stream().map(b -> b.getTitle().replace("[置顶]", "").trim()).sorted().collect(Collectors.toList());
197 |         List<String> oschinaBlog = ob.stream().map(b -> b.getTitle()).sorted().collect(Collectors.toList());
198 | 
199 |         List<String> commons = oschinaBlog.stream().filter(b -> iteyeBlog.contains(b)).collect(Collectors.toList());
200 |         LOGGER.info("<h4>oschina和iteye都有("+commons.size()+")：</h4>");
201 |         AtomicInteger j = new AtomicInteger();
202 |         commons.forEach(item -> LOGGER.info(j.incrementAndGet()+"、"+item+"    <a target=\"_blank\" href=\""+om.get(item)+"\">oschina</a>    <a target=\"_blank\" href=\""+im.get(item)+"\">iteye</a><br/>"));
203 | 
204 |         List<String> oschina = oschinaBlog.stream().filter(i -> !iteyeBlog.contains(i)).collect(Collectors.toList());
205 |         LOGGER.info("<h4>oschina独有("+oschina.size()+")：</h4>");
206 |         AtomicInteger l = new AtomicInteger();
207 |         oschina.forEach(item -> LOGGER.info(l.incrementAndGet()+"、<a target=\"_blank\" href=\""+om.get(item)+"\">"+item+"</a><br/>"));
208 | 
209 |         List<String> iteye = iteyeBlog.stream().filter(i -> !oschinaBlog.contains(i)).collect(Collectors.toList());
210 |         LOGGER.info("<h4>iteye独有("+iteye.size()+")：</h4>");
211 |         AtomicInteger k = new AtomicInteger();
212 |         iteye.forEach(item -> LOGGER.info(k.incrementAndGet()+"、<a target=\"_blank\" href=\""+im.get(item)+"\">"+item+"</a><br/>"));
213 |     }
214 |     public static void main(String[] args){
215 |         //iteyeBlog();
216 |         //iteyeNews();
217 |         //iteyeMagazines();
218 |         //csdnBlog();
219 |         //oschinaNews();
220 |         //oschinaBlog();
221 |         //baidu("Java应用级产品开发平台APDPlat作者杨尚川专访");
222 |         blogCompare();
223 |     }
224 | }
225 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/model/Article.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * 
 3 |  * APDPlat - Application Product Development Platform
 4 |  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
 5 |  * 
 6 |  * This program is free software: you can redistribute it and/or modify
 7 |  * it under the terms of the GNU General Public License as published by
 8 |  * the Free Software Foundation, either version 3 of the License, or
 9 |  * (at your option) any later version.
10 |  * 
11 |  * This program is distributed in the hope that it will be useful,
12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 |  * GNU General Public License for more details.
15 |  * 
16 |  * You should have received a copy of the GNU General Public License
17 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 |  * 
19 |  */
20 | 
21 | package org.seo.rank.model;
22 | 
23 | /**
24 |  *
25 |  * @author 杨尚川
26 |  */
27 | public class Article {
28 |     private String title;
29 |     private String url;
30 | 
31 |     public Article(){}
32 |     public Article(String title, String url) {
33 |         this.title = title;
34 |         this.url = url;
35 |     }
36 | 
37 |     public String getTitle() {
38 |         return title;
39 |     }
40 | 
41 |     public void setTitle(String title) {
42 |         this.title = title;
43 |     }
44 | 
45 |     public String getUrl() {
46 |         return url;
47 |     }
48 | 
49 |     public void setUrl(String url) {
50 |         this.url = url;
51 |     }
52 | }
53 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/model/Rank.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * 
 3 |  * APDPlat - Application Product Development Platform
 4 |  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
 5 |  * 
 6 |  * This program is free software: you can redistribute it and/or modify
 7 |  * it under the terms of the GNU General Public License as published by
 8 |  * the Free Software Foundation, either version 3 of the License, or
 9 |  * (at your option) any later version.
10 |  * 
11 |  * This program is distributed in the hope that it will be useful,
12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 |  * GNU General Public License for more details.
15 |  * 
16 |  * You should have received a copy of the GNU General Public License
17 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 |  * 
19 |  */
20 | 
21 | package org.seo.rank.model;
22 | 
/**
 * Result of a ranking lookup for one page and one keyword.
 *
 * @author 杨尚川
 */
public class Rank {
    /** Address of the page whose rank is looked up */
    private String url;
    /** Keyword used for the query */
    private String keyword;
    /** Whether the page is indexed */
    private boolean index;
    /** Position of the page in the query results */
    private int rank;

    public String getUrl() { return this.url; }

    public void setUrl(String url) { this.url = url; }

    public String getKeyword() { return this.keyword; }

    public void setKeyword(String keyword) { this.keyword = keyword; }

    public boolean isIndex() { return this.index; }

    public void setIndex(boolean index) { this.index = index; }

    /**
     * @return the stored rank when the page is indexed, otherwise -1
     */
    public int getRank() {
        return isIndex() ? this.rank : -1;
    }

    public void setRank(int rank) { this.rank = rank; }

    /**
     * @return the four fields as "name=value" pairs, one per line; the rank shown
     *         is the stored value, whether or not the page is indexed
     */
    @Override
    public String toString() {
        return String.join("\n",
                "url=" + this.url,
                "keyword=" + this.keyword,
                "index=" + this.index,
                "rank=" + this.rank);
    }
}
85 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/tools/DynamicIp.java:
--------------------------------------------------------------------------------
  1 | /**
  2 |  *
  3 |  * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川,
  4 |  * yang-shangchuan@qq.com
  5 |  *
  6 |  * This program is free software: you can redistribute it and/or modify it under
  7 |  * the terms of the GNU General Public License as published by the Free Software
  8 |  * Foundation, either version 3 of the License, or (at your option) any later
  9 |  * version.
 10 |  *
 11 |  * This program is distributed in the hope that it will be useful, but WITHOUT
 12 |  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 13 |  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 14 |  * details.
 15 |  *
 16 |  * You should have received a copy of the GNU General Public License along with
 17 |  * this program. If not, see <http://www.gnu.org/licenses/>.
 18 |  *
 19 |  */
 20 | 
 21 | package org.seo.rank.tools;
 22 | 
 23 | 
 24 | import org.jsoup.Connection;
 25 | import org.jsoup.Jsoup;
 26 | import org.jsoup.nodes.Document;
 27 | import org.slf4j.Logger;
 28 | import org.slf4j.LoggerFactory;
 29 | 
 30 | import java.util.Collections;
 31 | import java.util.HashMap;
 32 | import java.util.Map;
 33 | 
 34 | /**
 35 |  *
 36 |  * 自动更改IP地址反爬虫封锁，支持多线程
 37 |  *
 38 |  * ADSL拨号上网使用动态IP地址，每一次拨号得到的IP都不一样
 39 |  *
 40 |  * 使用腾达300M无线路由器，型号：N302 v2
 41 |  * 路由器设置中最好设置一下：上网设置 -》请根据需要选择连接模式 -》手动连接，由用户手动进行连接。
 42 |  * 其他的路由器使用方法类似，参照本类替换相应的登录地址、断开连接及建立连接地址即可
 43 |  *
 44 |  * @author 杨尚川
 45 |  */
 46 | public class DynamicIp {
    // Private constructor: the class is used through its static methods only
    private DynamicIp(){}
    private static final Logger LOGGER = LoggerFactory.getLogger(DynamicIp.class);
    // Request header values sent with the HTTP requests made by this class
    private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    private static final String ENCODING = "gzip, deflate";
    private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    private static final String CONNECTION = "keep-alive";
    // Host and Referer headers for requests to http://192.168.0.1 (see execute)
    private static final String HOST = "192.168.0.1";
    private static final String REFERER = "http://192.168.0.1/login.asp";
    private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0";
    // true while one thread is inside the re-dial sequence of toNewIp
    private static volatile boolean isDialing = false;
    // System.currentTimeMillis() taken after the last successful re-dial; 0 until the first one
    private static volatile long lastDialTime = 0l;
 58 | 
    /**
     * Command-line entry point: performs one IP change through toNewIp.
     * The boolean result of toNewIp is not evaluated.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        toNewIp();
    }
 62 | 
 63 |     /**
 64 |      * 假设有10个线程在跑，大家都正常的跑，跑着跑着达到限制了，
 65 |      * 于是大家争先恐后（几乎是同时）请求拨号，
 66 |      * 这个时候同步的作用就显示出来了，只会有一个线程能拨号，
 67 |      * 在他结束之前其他线程都在等，等他拨号成功之后，
 68 |      * 其他线程会被唤醒并返回
 69 |      *
 70 |      * 算法描述：
 71 |      * 1、假设总共有N个线程抓取网页，发现被封锁之后依次排队请求锁，注意：可以想象成是同时请求。
 72 |      * 2、线程1抢先获得锁，并且设置isDialing = true后开始拨号，注意：线程1设置isDialing = true后其他线程才可能获得锁。
 73 |      * 3、其他线程（2-N）依次获得锁，发现isDialing = true，于是wait。注意：获得锁并判断一个布尔值，跟后面的拨号操作比起来，时间可以忽略。
 74 |      * 4、线程1拨号完毕isDialing = false。注意：这个时候可以断定，其他所有线程必定是处于wait状态等待唤醒。
 75 |      * 5、线程1唤醒其他线程，其他线程和线程1返回开始抓取网页。
 76 |      * 6、抓了一会儿之后，又会被封锁，于是回到步骤1。
 77 |      * 注意：在本场景中，3和4的断定是没问题的，就算是出现“不可能”的情况，
 78 |      * 即线程1已经拨号完成了，可2-N还没获得锁（汗），也不会重复拨号的情况，
 79 |      * 因为算法考虑了请求拨号时间和上一次成功拨号时间。
 80 |      * @return 更改IP是否成功
 81 |      */
 82 |     public static boolean toNewIp() {
 83 |         long requestDialTime = System.currentTimeMillis();
 84 |         LOGGER.info(Thread.currentThread()+"请求重新拨号");
 85 |         synchronized (DynamicIp.class) {
 86 |             if (isDialing) {
 87 |                 LOGGER.info(Thread.currentThread()+"已经有其他线程在进行拨号了，我睡觉等待吧，其他线程拨号完毕会叫醒我的");
 88 |                 try {
 89 |                     DynamicIp.class.wait();
 90 |                 } catch (InterruptedException e) {
 91 |                     LOGGER.error(e.getMessage(), e);
 92 |                 }
 93 |                 LOGGER.info(Thread.currentThread()+"其他线程已经拨完号了，我可以返回了");
 94 |                 return true;
 95 |             }
 96 |             isDialing = true;
 97 |         }
 98 |         //保险起见，这里再判断一下
 99 |         //如果请求拨号的时间小于上次成功拨号的时间，则说明这个请求来的【太迟了】，则返回。
100 |         if(requestDialTime <= lastDialTime){
101 |             LOGGER.info("请求来的太迟了");
102 |             isDialing = false;
103 |             return true;
104 |         }
105 |         LOGGER.info(Thread.currentThread()+"开始重新拨号");
106 |         long start = System.currentTimeMillis();
107 |         Map<String, String> cookies = login("username***", "password***", "phonenumber***");
108 |         if("true".equals(cookies.get("success"))) {
109 |             LOGGER.info(Thread.currentThread()+"登陆成功");
110 |             cookies.remove("success");
111 |             while (!disConnect(cookies)) {
112 |                 LOGGER.info(Thread.currentThread()+"断开连接失败，重试！");
113 |             }
114 |             LOGGER.info(Thread.currentThread()+"断开连接成功");
115 |             while (!connect(cookies)) {
116 |                 LOGGER.info(Thread.currentThread()+"建立连接失败，重试！");
117 |             }
118 |             LOGGER.info(Thread.currentThread()+"建立连接成功");
119 |             LOGGER.info(Thread.currentThread()+"自动更改IP地址成功！");
120 |             LOGGER.info(Thread.currentThread()+"拨号耗时："+(System.currentTimeMillis()-start)+"毫秒");
121 |             //通知其他线程结束等待
122 |             synchronized (DynamicIp.class) {
123 |                 DynamicIp.class.notifyAll();
124 |             }
125 |             isDialing = false;
126 |             lastDialTime = System.currentTimeMillis();
127 |             return true;
128 |         }
129 |         LOGGER.info(Thread.currentThread()+"登陆无线路由器失败，检查用户名和密码！");
130 |         //通知其他线程结束等待
131 |         synchronized (DynamicIp.class) {
132 |             DynamicIp.class.notifyAll();
133 |         }
134 |         isDialing = false;
135 |         return false;
136 |     }
137 | 
    /**
     * Runs execute with action "3".
     *
     * @param cookies cookies passed through to execute
     * @return the result of execute
     */
    public static boolean connect(Map<String, String> cookies){
        return execute(cookies, "3");
    }
    /**
     * Runs execute with action "4".
     *
     * @param cookies cookies passed through to execute
     * @return the result of execute
     */
    public static boolean disConnect(Map<String, String> cookies){
        return execute(cookies, "4");
    }
144 |     public static boolean execute(Map<String, String> cookies, String action){
145 |         String url = "http://192.168.0.1/goform/SysStatusHandle";
146 |         Map<String, String> map = new HashMap<>();
147 |         map.put("action", action);
148 |         map.put("CMD", "WAN_CON");
149 |         map.put("GO", "system_status.asp");
150 |         Connection conn = Jsoup.connect(url)
151 |                 .header("Accept", ACCEPT)
152 |                 .header("Accept-Encoding", ENCODING)
153 |                 .header("Accept-Language", LANGUAGE)
154 |                 .header("Connection", CONNECTION)
155 |                 .header("Host", HOST)
156 |                 .header("Referer", REFERER)
157 |                 .header("User-Agent", USER_AGENT)
158 |                 .ignoreContentType(true)
159 |                 .timeout(30000);
160 |         for(String cookie : cookies.keySet()){
161 |             conn.cookie(cookie, cookies.get(cookie));
162 |         }
163 | 
164 |         String title = null;
165 |         try {
166 |             Connection.Response response = conn.method(Connection.Method.POST).data(map).execute();
167 |             String html = response.body();
168 |             Document doc = Jsoup.parse(html);
169 |             title = doc.title();
170 |             LOGGER.info("操作连接页面标题："+title);
171 |             Thread.sleep(10000);
172 |         }catch (Exception e){
173 |             LOGGER.error(e.getMessage());
174 |         }
175 |         if("LAN | LAN Settings".equals(title)){
176 |             if(("3".equals(action) && isConnected())
177 |                     || ("4".equals(action) && !isConnected())){
178 |                 return true;
179 |             }
180 |         }
181 |         return false;
182 |     }
183 |     public static boolean isConnected(){
184 |         try {
185 |             Document doc = Jsoup.connect("http://www.baidu.com/s?wd=杨尚川&t=" + System.currentTimeMillis())
186 |                     .header("Accept", ACCEPT)
187 |                     .header("Accept-Encoding", ENCODING)
188 |                     .header("Accept-Language", LANGUAGE)
189 |                     .header("Connection", CONNECTION)
190 |                     .header("Referer", "https://www.baidu.com")
191 |                     .header("Host", "www.baidu.com")
192 |                     .header("User-Agent", USER_AGENT)
193 |                     .ignoreContentType(true)
194 |                     .timeout(30000)
195 |                     .get();
196 |             LOGGER.info("搜索结果页面标题："+doc.title());
197 |             if(doc.title() != null && doc.title().contains("杨尚川")){
198 |                 return true;
199 |             }
200 |         }catch (Exception e){
201 |             if("Network is unreachable".equals(e.getMessage())){
202 |                 return false;
203 |             }else{
204 |                 LOGGER.error("状态检查失败:"+e.getMessage());
205 |             }
206 |         }
207 |         return false;
208 |     }
209 |     public static Map<String, String> login(String userName, String password, String verify){
210 |         try {
211 |             Map<String, String> map = new HashMap<>();
212 |             map.put("Username", userName);
213 |             map.put("Password", password);
214 |             map.put("checkEn", "0");
215 |             Connection conn = Jsoup.connect("http://192.168.0.1/LoginCheck")
216 |                     .header("Accept", ACCEPT)
217 |                     .header("Accept-Encoding", ENCODING)
218 |                     .header("Accept-Language", LANGUAGE)
219 |                     .header("Connection", CONNECTION)
220 |                     .header("Referer", REFERER)
221 |                     .header("Host", HOST)
222 |                     .header("User-Agent", USER_AGENT)
223 |                     .ignoreContentType(true)
224 |                     .timeout(30000);
225 | 
226 |             Connection.Response response = conn.method(Connection.Method.POST).data(map).execute();
227 |             String html = response.body();
228 |             Document doc = Jsoup.parse(html);
229 |             LOGGER.info("登陆页面标题："+doc.title());
230 |             Map<String, String> cookies = response.cookies();
231 |             if(html.contains(verify)){
232 |                 cookies.put("success", Boolean.TRUE.toString());
233 |             }
234 |             LOGGER.info("*******************************************************cookies start:");
235 |             cookies.keySet().stream().forEach((cookie) -> {
236 |                 LOGGER.info(cookie + ":" + cookies.get(cookie));
237 |             });
238 |             LOGGER.info("*******************************************************cookies end:");
239 |             return cookies;
240 |         }catch (Exception e){
241 |             LOGGER.error(e.getMessage(), e);
242 |         }
243 |         return Collections.emptyMap();
244 |     }
245 | }
246 | 


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/tools/ProxyIp.java:
--------------------------------------------------------------------------------
  1 | /**
  2 |  *
  3 |  * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川,
  4 |  * yang-shangchuan@qq.com
  5 |  *
  6 |  * This program is free software: you can redistribute it and/or modify it under
  7 |  * the terms of the GNU General Public License as published by the Free Software
  8 |  * Foundation, either version 3 of the License, or (at your option) any later
  9 |  * version.
 10 |  *
 11 |  * This program is distributed in the hope that it will be useful, but WITHOUT
 12 |  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 13 |  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 14 |  * details.
 15 |  *
 16 |  * You should have received a copy of the GNU General Public License along with
 17 |  * this program. If not, see <http://www.gnu.org/licenses/>.
 18 |  *
 19 |  */
 20 | 
 21 | package org.seo.rank.tools;
 22 | 
 23 | import com.gargoylesoftware.htmlunit.BrowserVersion;
 24 | import com.gargoylesoftware.htmlunit.WebClient;
 25 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
 26 | import org.apache.commons.lang.StringUtils;
 27 | import org.jsoup.Jsoup;
 28 | import org.jsoup.nodes.Document;
 29 | import org.jsoup.nodes.Element;
 30 | import org.jsoup.select.Elements;
 31 | import org.slf4j.Logger;
 32 | import org.slf4j.LoggerFactory;
 33 | 
 34 | import java.io.BufferedReader;
 35 | import java.io.InputStreamReader;
 36 | import java.net.HttpURLConnection;
 37 | import java.net.InetSocketAddress;
 38 | import java.net.Proxy;
 39 | import java.net.URL;
 40 | import java.nio.file.Files;
 41 | import java.nio.file.Path;
 42 | import java.nio.file.Paths;
 43 | import java.util.*;
 44 | import java.util.concurrent.ConcurrentSkipListSet;
 45 | import java.util.concurrent.atomic.AtomicInteger;
 46 | import java.util.regex.Matcher;
 47 | import java.util.regex.Pattern;
 48 | import java.util.stream.Collectors;
 49 | 
 50 | /**
 51 |  *
 52 |  * 自动更改IP地址反爬虫封锁，支持多线程
 53 |  * 使用代理服务器的方式
 54 |  *
 55 |  * @author 杨尚川
 56 |  */
 57 | public class ProxyIp {
 58 |     private ProxyIp(){}
 59 |     private static final Logger LOGGER = LoggerFactory.getLogger(ProxyIp.class);
 60 |     private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
 61 |     private static final String ENCODING = "gzip, deflate";
 62 |     private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
 63 |     private static final String CONNECTION = "keep-alive";
 64 |     private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0";
 65 |     private static volatile boolean isSwitching = false;
 66 |     private static volatile long lastSwitchTime = 0l;
 67 |     private static final WebClient WEB_CLIENT = new WebClient(BrowserVersion.INTERNET_EXPLORER_11);
 68 |     private static final Pattern IP_PATTERN = Pattern.compile("((?:(?:25[0-5]|2[0-4]\\d|((1\\d{2})|([1-9]?\\d)))\\.){3}(?:25[0-5]|2[0-4]\\d|((1\\d{2})|([1-9]?\\d))))");
 69 |     //可用代理IP列表
 70 |     private static final List<String> IPS = new Vector<>();
 71 |     private static volatile int currentIpIndex = 0;
 72 |     private static volatile boolean detect = true;
 73 |     //五分钟
 74 |     private static volatile int detectInterval = 300000;
 75 |     private static final Path PROXY_IPS_FILE = Paths.get("src/main/resources/proxy_ips.txt");
 76 |     //自身IP地址
 77 |     private static String previousIp = getCurrentIp();
 78 |     //能隐藏自己IP的代理
 79 |     private static final Set<String> EXCELLENT_IPS = new ConcurrentSkipListSet<>();
 80 |     private static final Set<String> EXCELLENT_USA_IPS = new ConcurrentSkipListSet<>();
 81 |     //不能隐藏自己IP的代理
 82 |     private static final Set<String> NORMAL_IPS = new ConcurrentSkipListSet<>();
 83 |     private static final Path EXCELLENT_PROXY_IPS_FILE = Paths.get("src/main/resources/proxy_ips_excellent.txt");;
 84 |     private static final Path EXCELLENT_USA_PROXY_IPS_FILE = Paths.get("src/main/resources/proxy_ips_excellent_usa.txt");
 85 |     private static final Path NORMAL_PROXY_IPS_FILE = Paths.get("src/main/resources/proxy_ips_normal.txt");
 86 |     static {
 87 |         Set<String> ipSet = new HashSet<>();
 88 |         //如果本地有则读取
 89 |         try {
 90 |             if(Files.notExists(PROXY_IPS_FILE.getParent())){
 91 |                 PROXY_IPS_FILE.getParent().toFile().mkdirs();
 92 |             }
 93 |             if(Files.notExists(PROXY_IPS_FILE)){
 94 |                 PROXY_IPS_FILE.toFile().createNewFile();
 95 |             }
 96 |             if(Files.notExists(EXCELLENT_PROXY_IPS_FILE)){
 97 |                 EXCELLENT_PROXY_IPS_FILE.toFile().createNewFile();
 98 |             }
 99 |             if(Files.notExists(EXCELLENT_USA_PROXY_IPS_FILE)){
100 |                 EXCELLENT_USA_PROXY_IPS_FILE.toFile().createNewFile();
101 |             }
102 |             if(Files.notExists(NORMAL_PROXY_IPS_FILE)){
103 |                 NORMAL_PROXY_IPS_FILE.toFile().createNewFile();
104 |             }
105 |             LOGGER.info("代理IP存放路径："+PROXY_IPS_FILE.toAbsolutePath().toString());
106 |             ipSet.addAll(Files.readAllLines(PROXY_IPS_FILE));
107 |             ipSet.addAll(Files.readAllLines(EXCELLENT_PROXY_IPS_FILE));
108 |         }catch (Exception e){
109 |             LOGGER.error("读取本地代理IP失败", e);
110 |         }
111 |         if(ipSet.isEmpty()){
112 |             //从已知的网站获取代理IP和端口
113 |             ipSet.addAll(getProxyIps());
114 |         }
115 |         IPS.addAll(ipSet);
116 |         LOGGER.info("所有IP列表("+IPS.size()+")：");
117 |         AtomicInteger i = new AtomicInteger();
118 |         IPS.forEach(ip->LOGGER.info(i.incrementAndGet()+"、"+ip));
119 | 
120 |         new Thread(()->{
121 |             //检查次数
122 |             int count=0;
123 |             while(detect) {
124 |                 try {
125 |                     save();
126 |                     if(count%10==9){
127 |                         //也要防止被更新IP站点封锁
128 |                         toNewIp();
129 |                     }
130 |                     Thread.sleep(detectInterval);
131 |                     //检查网站是否有新IP
132 |                     getProxyIps().forEach(ip -> {
133 |                         if (!IPS.contains(ip)) {
134 |                             IPS.add(ip);
135 |                             LOGGER.info("发现新代理IP：" + ip);
136 |                         }
137 |                     });
138 |                     count++;
139 |                 } catch (Exception e) {
140 |                     LOGGER.error("更新代理IP出错", e);
141 |                 }
142 |             }
143 |         }).start();
144 |     }
145 |     public static void stopDetect(){
146 |         detect = false;
147 |     }
148 |     public static void startDetect(){
149 |         detect = true;
150 |     }
151 |     private static void save(){
152 |         try {
153 |             //将本地的和新发现的代理IP进行合并保存到本地
154 |             Set<String> ips = new ConcurrentSkipListSet<>();
155 |             ips.addAll(Files.readAllLines(PROXY_IPS_FILE));
156 |             ips.addAll(IPS);
157 |             //移除不能隐藏自己的IP
158 |             ips.removeAll(NORMAL_IPS);
159 |             Files.write(PROXY_IPS_FILE, toVerify(ips));
160 |             LOGGER.info("将" + ips.size() + "条代理IP地址写入本地");
161 |             Set<String> excellentIps = new HashSet<>();
162 |             excellentIps.addAll(Files.readAllLines(EXCELLENT_PROXY_IPS_FILE));
163 |             excellentIps.addAll(EXCELLENT_IPS);
164 |             Files.write(EXCELLENT_PROXY_IPS_FILE, toVerify(excellentIps));
165 |             LOGGER.info("将" + excellentIps.size() + "条能隐藏自己的代理IP地址写入本地");
166 |             Set<String> excellentUsaIps = new HashSet<>();
167 |             excellentUsaIps.addAll(Files.readAllLines(EXCELLENT_USA_PROXY_IPS_FILE));
168 |             excellentUsaIps.addAll(EXCELLENT_USA_IPS);
169 |             Files.write(EXCELLENT_USA_PROXY_IPS_FILE, toVerify(excellentUsaIps));
170 |             LOGGER.info("将" + excellentUsaIps.size() + "条能隐藏自己的美国代理IP地址写入本地");
171 |             Set<String> normalIps = new HashSet<>();
172 |             normalIps.addAll(Files.readAllLines(NORMAL_PROXY_IPS_FILE));
173 |             normalIps.addAll(NORMAL_IPS);
174 |             Files.write(NORMAL_PROXY_IPS_FILE, toVerify(normalIps));
175 |             LOGGER.info("将" + normalIps.size() + "条不能隐藏自己的代理IP地址写入本地");
176 |         }catch (Exception e){
177 |             LOGGER.error("保存失败", e);
178 |         }
179 |     }
180 | 
181 |     private static List<String> toVerify(Set<String> ips){
182 |         AtomicInteger i = new AtomicInteger();
183 |         AtomicInteger f = new AtomicInteger();
184 |         List<String> list = ips.parallelStream().filter(ip->{
185 |             LOGGER.info("验证进度："+ips.size()+"/"+i.incrementAndGet());
186 |             String[] attr = ip.split(":");
187 |             if(verify(attr[0], Integer.parseInt(attr[1]))){
188 |                 return true;
189 |             }
190 |             IPS.remove(ip);
191 |             f.incrementAndGet();
192 |             return false;
193 |         }).sorted().collect(Collectors.toList());
194 |         LOGGER.info("验证成功的IP数："+(ips.size()-f.get()));
195 |         LOGGER.info("验证失败的IP数："+f.get());
196 |         return list;
197 |     }
198 | 
199 |     private static String getNextProxyIp(){
200 |         int index = currentIpIndex%IPS.size();
201 |         currentIpIndex++;
202 |         return IPS.get(index);
203 |     }
204 | 
205 |     public static boolean toNewIp() {
206 |         long requestSwitchTime = System.currentTimeMillis();
207 |         LOGGER.info(Thread.currentThread()+"请求重新更换代理");
208 |         synchronized (ProxyIp.class) {
209 |             if (isSwitching) {
210 |                 LOGGER.info(Thread.currentThread()+"已经有其他线程在进行更换代理了，我睡觉等待吧，其他线程更换代理完毕会叫醒我的");
211 |                 try {
212 |                     ProxyIp.class.wait();
213 |                 } catch (InterruptedException e) {
214 |                     LOGGER.error(e.getMessage(), e);
215 |                 }
216 |                 LOGGER.info(Thread.currentThread()+"其他线程已经更换完代理了，我可以返回了");
217 |                 return true;
218 |             }
219 |             isSwitching = true;
220 |         }
221 |         //保险起见，这里再判断一下
222 |         //如果请求更换代理的时间小于上次成功更换代理的时间，则说明这个请求来的【太迟了】，则返回。
223 |         if(requestSwitchTime <= lastSwitchTime){
224 |             LOGGER.info("请求来的太迟了");
225 |             isSwitching = false;
226 |             return true;
227 |         }
228 |         LOGGER.info(Thread.currentThread()+"开始重新更换代理");
229 |         long start = System.currentTimeMillis();
230 |         String proxyIp = useNewProxyIp();
231 |         String currentIp = null;
232 |         int times=0;
233 |         //如果当前IP没有变化，还是等于之前的IP，则继续设置下一个代理IP
234 |         //为了防止无休止重试，设置限制次数
235 |         while((currentIp=getCurrentIp()).equals(previousIp)
236 |                 && (times++)<Integer.MAX_VALUE){
237 |             NORMAL_IPS.add(proxyIp);
238 |             IPS.remove(proxyIp);
239 |             proxyIp = useNewProxyIp();
240 |         }
241 |         if(!currentIp.equals(previousIp)) {
242 |             previousIp =currentIp;
243 |             EXCELLENT_IPS.add(proxyIp);
244 |             LOGGER.info(Thread.currentThread()+"自动更换代理成功！");
245 |             LOGGER.info(Thread.currentThread()+"更换代理耗时："+(System.currentTimeMillis()-start)+"毫秒");
246 |             //通知其他线程结束等待
247 |             synchronized (ProxyIp.class) {
248 |                 ProxyIp.class.notifyAll();
249 |             }
250 |             isSwitching = false;
251 |             lastSwitchTime = System.currentTimeMillis();
252 |             return true;
253 |         }
254 |         NORMAL_IPS.add(proxyIp);
255 |         IPS.remove(proxyIp);
256 |         LOGGER.info(Thread.currentThread()+"自动更换代理失败！");
257 |         LOGGER.info(Thread.currentThread()+"更换代理耗时："+(System.currentTimeMillis()-start)+"毫秒");
258 |         //通知其他线程结束等待
259 |         synchronized (ProxyIp.class) {
260 |             ProxyIp.class.notifyAll();
261 |         }
262 |         isSwitching = false;
263 |         return false;
264 |     }
265 |     private static String useNewProxyIp(){
266 |         String newProxy = getNextProxyIp();
267 |         String[] attr = newProxy.split(":");
268 |         System.setProperty("proxySet", "true");
269 |         System.setProperty("http.proxyHost", attr[0]);
270 |         System.setProperty("http.proxyPort", attr[1]);
271 |         LOGGER.info("尝试使用新的代理："+newProxy);
272 |         return newProxy;
273 |     }
274 |     /**
275 |      * 验证代理IP是否能工作，能工作不代表能向目标网站隐藏自己的IP
276 |      * @param host
277 |      * @param port
278 |      * @return
279 |      */
280 |     public static boolean verify(String host, int port){
281 |         try {
282 |             String url = "http://apdplat.org";
283 |             Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(host, port));
284 |             HttpURLConnection connection = (HttpURLConnection)new URL(url).openConnection(proxy);
285 |             connection.setConnectTimeout(10000);
286 |             connection.setReadTimeout(10000);
287 |             connection.setUseCaches(false);
288 |             BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
289 |             StringBuilder html = new StringBuilder();
290 |             String line = null;
291 |             while ((line=reader.readLine()) != null){
292 |                 html.append(line);
293 |             }
294 |             LOGGER.info("HTML："+html);
295 |             if(html.toString().contains("APDPlat应用级产品开发平台")){
296 |                 LOGGER.info("代理IP验证成功："+host+":"+port);
297 |                 return true;
298 |             }
299 |         }catch (Exception e){
300 |             LOGGER.error(e.getMessage());
301 |         }
302 |         LOGGER.info("代理IP验证失败："+host+":"+port);
303 |         return false;
304 |     }
305 |     /**
306 |      * 看看在ip138的眼中，自己的IP是多少
307 |      * @return
308 |      */
309 |     public static String getCurrentIp(){
310 |         try {
311 |             String url = "http://1212.ip138.com/ic.asp?timestamp="+System.nanoTime();
312 |             String text = Jsoup.connect(url)
313 |                     .header("Accept", ACCEPT)
314 |                     .header("Accept-Encoding", ENCODING)
315 |                     .header("Accept-Language", LANGUAGE)
316 |                     .header("Connection", CONNECTION)
317 |                     .header("Host", "1111.ip138.com")
318 |                     .header("Referer", "http://ip138.com/")
319 |                     .header("User-Agent", USER_AGENT)
320 |                     .ignoreContentType(true)
321 |                     .timeout(5000)
322 |                     .get()
323 |                     .text();
324 |             LOGGER.info("检查自身IP地址："+text);
325 |             Matcher matcher = IP_PATTERN.matcher(text);
326 |             if(matcher.find()){
327 |                 String ip = matcher.group();
328 |                 LOGGER.info("自身IP地址："+ip);
329 |                 if(text.contains("美国")){
330 |                     EXCELLENT_USA_IPS.add(System.getProperty("http.proxyHost") + ":" + System.getProperty("http.proxyPort"));
331 |                 }
332 |                 return ip;
333 |             }
334 |         }catch (Exception e){
335 |             LOGGER.error(e.getMessage());
336 |         }
337 |         LOGGER.info("检查自身IP地址失败，返回之前的IP地址："+ previousIp);
338 |         return previousIp;
339 |     }
340 |     private static Set<String> getProxyIps(){
341 |         Set<String> ips = new HashSet<>();
342 |         ips.addAll(getProxyIpOne());
343 |         ips.addAll(getProxyIpTwo());
344 |         ips.addAll(getProxyIpThree());
345 |         ips.addAll(getProxyIpFour());
346 |         return ips;
347 |     }
348 |     private static List<String> getProxyIpOne(){
349 |         String url = "http://proxy.goubanjia.com/?timestamp="+System.nanoTime();
350 |         String cssPath = "html body div.wrap.fullwidth div#content div#post-2.post-2.page.type-page.status-publish.hentry div.entry.entry-content div#list table.table tbody tr";
351 |         return getProxyIp(url, cssPath);
352 |     }
353 |     private static List<String> getProxyIpTwo(){
354 |         String url = "http://ip.qiaodm.com/?timestamp="+System.nanoTime();
355 |         String cssPath = "html body div#main_container div.inner table.iplist tbody tr";
356 |         return getProxyIp(url, cssPath);
357 |     }
358 |     private static List<String> getProxyIp(String url, String cssPath){
359 |         List<String> ips = new ArrayList<>();
360 |         try {
361 |             String html = ((HtmlPage)WEB_CLIENT.getPage(url)).getBody().asXml();
362 |             //LOGGER.info("html："+html);
363 |             Document doc = Jsoup.parse(html);
364 |             Elements elements = doc.select(cssPath);
365 |             elements
366 |                     .forEach(element -> {
367 |                         try {
368 |                             Elements tds = element.children();
369 |                             String ip = null;
370 |                             int port = 0;
371 |                             if (tds.size() > 1) {
372 |                                 Element ele = tds.get(0);
373 |                                 ip = getIps(ele);
374 |                                 String text = tds.get(1).text();
375 |                                 LOGGER.info("端口："+text+" -> "+tds.get(1).outerHtml());
376 |                                 port = Integer.parseInt(text);
377 |                             }
378 |                             if(ip != null && port > 0){
379 |                                 LOGGER.info("解析出IP："+ip+"，端口："+port);
380 |                                 if(verify(ip, port)){
381 |                                     LOGGER.info("IP："+ip+"，端口："+port+"可以使用");
382 |                                     ips.add(ip + ":" + port);
383 |                                 }else {
384 |                                     LOGGER.info("IP："+ip+"，端口："+port+"不能使用");
385 |                                 }
386 |                             }
387 |                         }catch (Exception e){
388 |                             LOGGER.error("解析IP出错", e);
389 |                         }
390 |                     });
391 |         }catch (Exception e){
392 |             LOGGER.error("解析IP出错", e);
393 |         }
394 |         return ips;
395 |     }
396 |     private static List<String> getProxyIpThree(){
397 |         List<String> ips = new ArrayList<>();
398 |         for(int i=1; i<=10; i++){
399 |             ips.addAll(getProxyIpThree(i));
400 |         }
401 |         return ips;
402 |     }
403 |     private static List<String> getProxyIpThree(int page){
404 |         List<String> ips = new ArrayList<>();
405 |         try {
406 |             String url = "http://www.kuaidaili.com/proxylist/"+page;
407 |             String html = ((HtmlPage)WEB_CLIENT.getPage(url)).getBody().asXml();
408 |             //LOGGER.info("html："+html);
409 |             Document doc = Jsoup.parse(html);
410 |             Elements elements = doc.select("html body div#container div#list table.table.table-bordered.table-striped tbody tr");
411 |             elements
412 |                     .forEach(element -> {
413 |                         try {
414 |                             Elements tds = element.children();
415 |                             String ip = null;
416 |                             int port = 0;
417 |                             if (tds.size() > 1) {
418 |                                 ip = tds.get(0).text();
419 |                                 String text = tds.get(1).text();
420 |                                 LOGGER.info("IP："+ip);
421 |                                 LOGGER.info("端口："+text);
422 |                                 Matcher matcher = IP_PATTERN.matcher(ip.toString());
423 |                                 if(matcher.find()){
424 |                                     ip = matcher.group();
425 |                                     LOGGER.info("ip地址验证通过："+ip);
426 |                                 }else{
427 |                                     LOGGER.info("ip地址验证失败："+ip);
428 |                                     ip = null;
429 |                                 }
430 |                                 try{
431 |                                     port = Integer.parseInt(text);
432 |                                     LOGGER.info("端口验证通过："+port);
433 |                                 }catch (Exception e){
434 |                                     LOGGER.info("端口验证失败："+port);
435 |                                 }
436 |                             }
437 |                             if(ip != null && port > 0){
438 |                                 LOGGER.info("解析出IP："+ip+"，端口："+port);
439 |                                 if(verify(ip, port)){
440 |                                     LOGGER.info("IP："+ip+"，端口："+port+"可以使用");
441 |                                     ips.add(ip + ":" + port);
442 |                                 }else {
443 |                                     LOGGER.info("IP："+ip+"，端口："+port+"不能使用");
444 |                                 }
445 |                             }
446 |                         }catch (Exception e){
447 |                             LOGGER.error("解析IP出错", e);
448 |                         }
449 |                     });
450 |         }catch (Exception e){
451 |             LOGGER.error("解析IP出错", e);
452 |         }
453 |         return ips;
454 |     }
455 |     private static List<String> getProxyIpFour(){
456 |         List<String> ips = new ArrayList<>();
457 |         for(int i=1; i<=10; i++){
458 |             ips.addAll(getProxyIpFour(i));
459 |         }
460 |         return ips;
461 |     }
462 |     private static List<String> getProxyIpFour(int page){
463 |         List<String> ips = new ArrayList<>();
464 |         try {
465 |             String url = "http://www.kxdaili.com/ipList/"+page+".html";
466 |             String html = ((HtmlPage)WEB_CLIENT.getPage(url)).getBody().asXml();
467 |             //LOGGER.info("html："+html);
468 |             Document doc = Jsoup.parse(html);
469 |             Elements elements = doc.select("html body#nav_btn01 div.tab_c_box.buy_tab_box table.ui.table.segment tbody tr");
470 |             elements
471 |                     .forEach(element -> {
472 |                         try {
473 |                             Elements tds = element.children();
474 |                             String ip = null;
475 |                             int port = 0;
476 |                             if (tds.size() > 1) {
477 |                                 ip = tds.get(0).text();
478 |                                 String text = tds.get(1).text();
479 |                                 LOGGER.info("IP："+ip);
480 |                                 LOGGER.info("端口："+text);
481 |                                 Matcher matcher = IP_PATTERN.matcher(ip.toString());
482 |                                 if(matcher.find()){
483 |                                     ip = matcher.group();
484 |                                     LOGGER.info("ip地址验证通过："+ip);
485 |                                 }else{
486 |                                     LOGGER.info("ip地址验证失败："+ip);
487 |                                     ip = null;
488 |                                 }
489 |                                 try{
490 |                                     port = Integer.parseInt(text);
491 |                                     LOGGER.info("端口验证通过："+port);
492 |                                 }catch (Exception e){
493 |                                     LOGGER.info("端口验证失败："+port);
494 |                                 }
495 |                             }
496 |                             if(ip != null && port > 0){
497 |                                 LOGGER.info("解析出IP："+ip+"，端口："+port);
498 |                                 if(verify(ip, port)){
499 |                                     LOGGER.info("IP："+ip+"，端口："+port+"可以使用");
500 |                                     ips.add(ip + ":" + port);
501 |                                 }else {
502 |                                     LOGGER.info("IP："+ip+"，端口："+port+"不能使用");
503 |                                 }
504 |                             }
505 |                         }catch (Exception e){
506 |                             LOGGER.error("解析IP出错", e);
507 |                         }
508 |                     });
509 |         }catch (Exception e){
510 |             LOGGER.error("解析IP出错", e);
511 |         }
512 |         return ips;
513 |     }
514 |     private static String getIps(Element element){
515 |         StringBuilder ip = new StringBuilder();
516 |         Elements all = element.children();
517 |         LOGGER.info("");
518 |         LOGGER.info("开始解析IP地址，机器读到的文本："+element.text());
519 |         AtomicInteger count = new AtomicInteger();
520 |         all.forEach(ele -> {
521 |             String html = ele.outerHtml();
522 |             LOGGER.info(count.incrementAndGet() + "、" + "原始HTML："+html.replaceAll("[\n\r]", ""));
523 |             String text = ele.text();
524 |             if(ele.hasAttr("style")
525 |                     && (ele.attr("style").equals("display: none;")
526 |                     || ele.attr("style").equals("display:none;"))) {
527 |                 LOGGER.info("忽略不显示的文本："+text);
528 |             }else{
529 |                 if(StringUtils.isNotBlank(text)){
530 |                     LOGGER.info("需要的文本："+text);
531 |                     ip.append(text);
532 |                 }else{
533 |                     LOGGER.info("忽略空文本");
534 |                 }
535 |             }
536 |         });
537 |         LOGGER.info("----------------------------------------------------------------");
538 |         LOGGER.info("解析到的ip: "+ip);
539 |         LOGGER.info("----------------------------------------------------------------");
540 |         Matcher matcher = IP_PATTERN.matcher(ip.toString());
541 |         if(matcher.find()){
542 |             String _ip = matcher.group();
543 |             LOGGER.info("ip地址验证通过："+_ip);
544 |             return _ip;
545 |         }else{
546 |             LOGGER.info("ip地址验证失败："+ip);
547 |         }
548 |         return null;
549 |     }
550 |     public static void main(String[] args) {
551 |         //如果只是想收集IP，则一直运行此程序即可，更新时间改为1秒钟。
552 |         detectInterval=1000;
553 |         while(true){
554 |             toNewIp();
555 |         }
556 |     }
557 | }


--------------------------------------------------------------------------------
/src/main/java/org/seo/rank/tools/VoteRanker.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * APDPlat - Application Product Development Platform
 3 |  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
 4 |  *
 5 |  *  This program is free software: you can redistribute it and/or modify
 6 |  *  it under the terms of the GNU General Public License as published by
 7 |  *  the Free Software Foundation, either version 3 of the License, or
 8 |  *  (at your option) any later version.
 9 |  *
10 |  *  This program is distributed in the hope that it will be useful,
11 |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 |  *  GNU General Public License for more details.
14 |  *
15 |  *  You should have received a copy of the GNU General Public License
16 |  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 |  */
18 | package org.seo.rank.tools;
19 | 
20 | import org.jsoup.Jsoup;
21 | import org.jsoup.nodes.Element;
22 | 
23 | import java.net.URL;
24 | import java.util.HashMap;
25 | import java.util.Map;
26 | import java.util.concurrent.atomic.AtomicInteger;
27 | 
28 | /**
29 |  * 投票排名分析工具
30 |  * @author 杨尚川
31 |  */
32 | public class VoteRanker {
33 | 
34 |     private VoteRanker(){}
35 | 
36 |     private static final String WORKS = "html body div.vote-container.block div.container div.vote-project";
37 |     private static final String PROJECT_NAME = "div.project-detail a.project-name";
38 |     private static final String PROJECT_DES = "div.project-detail div.project-description";
39 |     private static final String PROJECT_OWNER = "div.project-detail div.project-owner";
40 |     private static final String VOTE_COUNT = "div.vote-action div.vote-button span";
41 |     public static Map<String, Integer> getRank(){
42 |         String url = "http://i.100offer.com/projects?page=";
43 |         Map<String, Integer> map = new HashMap<>();
44 |         for(int i=1; i<24; i++) {
45 |             System.out.println("get page "+(url+i));
46 |             try {
47 |                 for (Element element : Jsoup.parse(new URL(url + i), 60000).select(WORKS)) {
48 |                     String projectName = element.select(PROJECT_NAME).text();
49 |                     String voteCount = element.select(VOTE_COUNT).text();
50 |                     String des = element.select(PROJECT_DES).text().replace("故事", "");
51 |                     String owner = element.select(PROJECT_OWNER).text().replace("Hot", "").replace("故事", "").replace("by&nbsp", "").replace("by ", "");
52 |                     map.put(projectName+"_"+owner+"_"+des, Integer.parseInt(voteCount));
53 |                 }
54 |             } catch (Exception e) {
55 |                 e.printStackTrace();
56 |             }
57 |         }
58 |         return map;
59 |     }
60 |     public static void main(String[] args){
61 |         Map<String, Integer> data = getRank();
62 |         AtomicInteger i = new AtomicInteger();
63 |         System.out.println("<table>");
64 |         System.out.println("<tr><td>排名</td><td>票数</td><td>项目名称</td><td>项目作者</td><td>项目描述</td></tr>");
65 |         data.entrySet().stream().sorted((a, b) -> b.getValue().compareTo(a.getValue())).forEach(e -> {
66 |             String[] value=e.getKey().split("_");
67 |             String projectName = value[0];
68 |             String owner = value[1];
69 |             String des = value[2];
70 |             System.out.println("<tr><td>" + i.incrementAndGet() + "</td><td>" + e.getValue() + "</td><td>" + projectName + "</td><td>" + owner + "</td><td>" + des + "</td></tr>");
71 |         });
72 |         System.out.println("</table>");
73 |     }
74 | }
75 | 


--------------------------------------------------------------------------------
/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | 
 3 | <configuration>
 4 |   <appender name="stdout" class="ch.qos.logback.core.ConsoleAppender">
 5 |     <layout class="ch.qos.logback.classic.PatternLayout">
 6 |       <Pattern>%d [%t] %-5p [%c] - %m%n</Pattern>
 7 |     </layout>
 8 |   </appender>
 9 |   <appender name="logfile" class="ch.qos.logback.core.rolling.RollingFileAppender">
10 |     <File>logs/logback.log</File>
11 |     <rollingPolicy class="ch.qos.logback.core.rolling.FixedWindowRollingPolicy">  
12 |       <FileNamePattern>logs/logback_%i.log</FileNamePattern>
13 |       <MinIndex>1</MinIndex>  
14 |       <MaxIndex>10000</MaxIndex>
15 |     </rollingPolicy>  
16 |     <triggeringPolicy class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy">  
17 |       <MaxFileSize>5MB</MaxFileSize> 
18 |     </triggeringPolicy> 
19 |     <layout class="ch.qos.logback.classic.PatternLayout">
20 |       <Pattern>%d [%t] %-5p [%c] - %m%n</Pattern>
21 |     </layout>
22 |   </appender>
23 |   <root level="INFO">
24 |     <!--
25 |     <appender-ref ref="logfile"/>
26 |     -->
27 |     <appender-ref ref="stdout"/>
28 |   </root>
29 | </configuration>
30 | 


--------------------------------------------------------------------------------
/src/main/resources/proxy_ips_excellent.txt:
--------------------------------------------------------------------------------
  1 | 1.202.56.194:9000
  2 | 101.226.249.237:80
  3 | 101.230.8.69:8000
  4 | 101.4.136.101:9999
  5 | 101.4.136.103:9999
  6 | 101.4.136.104:9999
  7 | 101.71.27.120:80
  8 | 103.27.24.236:80
  9 | 106.38.251.231:8088
 10 | 106.38.251.61:8088
 11 | 106.38.251.63:8088
 12 | 106.38.251.65:8088
 13 | 111.1.36.133:80
 14 | 111.1.36.6:80
 15 | 111.11.184.110:80
 16 | 111.13.136.58:80
 17 | 111.13.136.59:80
 18 | 111.13.136.59:843
 19 | 111.161.126.100:80
 20 | 111.161.126.98:80
 21 | 111.161.65.79:80
 22 | 111.40.196.68:80
 23 | 111.78.8.27:9797
 24 | 112.102.50.208:8118
 25 | 112.84.130.11:80
 26 | 112.84.130.14:80
 27 | 112.84.130.18:80
 28 | 112.84.130.2:80
 29 | 112.84.130.31:80
 30 | 113.107.57.76:3128
 31 | 113.109.110.153:9797
 32 | 113.140.25.4:80
 33 | 113.78.19.207:3128
 34 | 113.97.118.105:8888
 35 | 113.97.118.105:9999
 36 | 114.112.91.97:90
 37 | 114.215.108.155:9999
 38 | 114.252.245.183:8118
 39 | 114.255.183.173:8080
 40 | 114.255.183.189:8080
 41 | 115.182.62.99:10080
 42 | 115.231.100.253:3128
 43 | 115.231.96.120:80
 44 | 115.238.55.91:8088
 45 | 115.29.202.148:8888
 46 | 115.44.114.45:8118
 47 | 116.20.224.174:9999
 48 | 117.146.116.2:443
 49 | 117.146.116.2:80
 50 | 117.146.116.2:8080
 51 | 117.146.116.2:8088
 52 | 117.146.116.2:8888
 53 | 117.146.116.2:91
 54 | 117.162.116.91:8123
 55 | 117.177.240.43:80
 56 | 118.126.142.209:3128
 57 | 118.193.166.141:80
 58 | 118.26.150.161:8088
 59 | 119.123.180.250:8118
 60 | 119.147.91.21:80
 61 | 119.52.63.218:9999
 62 | 120.193.146.93:80
 63 | 120.193.146.93:81
 64 | 120.193.146.93:843
 65 | 120.193.146.93:86
 66 | 120.197.234.166:80
 67 | 120.198.243.83:80
 68 | 120.203.148.5:8118
 69 | 120.203.148.6:8118
 70 | 120.203.148.7:8001
 71 | 120.203.149.104:8001
 72 | 120.203.149.110:8118
 73 | 120.203.149.162:8118
 74 | 120.203.149.163:8118
 75 | 120.203.214.206:3128
 76 | 120.236.148.113:3128
 77 | 120.24.79.70:80
 78 | 120.8.208.207:9999
 79 | 121.101.214.160:80
 80 | 121.14.138.56:81
 81 | 121.40.72.148:9527
 82 | 121.41.32.4:80
 83 | 121.52.229.156:8088
 84 | 122.115.49.85:8088
 85 | 122.136.46.151:80
 86 | 122.224.169.90:8080
 87 | 122.225.117.26:80
 88 | 122.228.92.103:80
 89 | 122.228.92.103:8000
 90 | 122.228.92.103:8080
 91 | 122.228.92.73:3128
 92 | 122.228.92.73:80
 93 | 122.228.92.73:8000
 94 | 122.228.92.73:8080
 95 | 122.96.59.106:80
 96 | 122.96.59.106:81
 97 | 122.96.59.106:82
 98 | 122.96.59.106:83
 99 | 122.96.59.106:843
100 | 123.121.95.61:8118
101 | 123.58.129.48:80
102 | 124.126.126.105:80
103 | 124.161.94.8:80
104 | 124.192.101.254:8001
105 | 124.202.168.74:8118
106 | 124.202.169.226:8118
107 | 124.202.169.2:8118
108 | 124.202.169.50:8118
109 | 124.202.169.98:8118
110 | 124.202.170.178:8001
111 | 124.202.170.206:8118
112 | 124.202.170.242:8118
113 | 124.202.171.2:8118
114 | 124.202.173.234:8118
115 | 124.202.175.70:8118
116 | 124.202.177.26:8118
117 | 124.202.178.22:8001
118 | 124.202.179.150:8118
119 | 124.202.180.6:8118
120 | 124.202.181.110:8118
121 | 124.202.181.146:8118
122 | 124.202.181.18:8118
123 | 124.202.181.230:8001
124 | 124.202.181.230:8118
125 | 124.202.182.182:8118
126 | 124.202.182.214:8001
127 | 124.202.182.214:8118
128 | 124.202.182.22:8118
129 | 124.202.183.222:8118
130 | 124.202.183.74:8001
131 | 124.202.183.74:8118
132 | 124.202.192.150:8118
133 | 124.202.208.18:8001
134 | 124.202.208.18:8118
135 | 124.202.217.134:8001
136 | 124.202.217.134:8118
137 | 124.202.221.26:8001
138 | 124.202.221.26:8118
139 | 14.153.113.243:9999
140 | 14.20.6.175:9797
141 | 14.218.220.216:9797
142 | 162.208.49.45:8089
143 | 163.125.195.170:9999
144 | 171.118.183.111:9999
145 | 175.143.151.16:8080
146 | 181.39.32.98:3130
147 | 182.92.240.197:8080
148 | 183.131.144.204:443
149 | 183.131.144.204:80
150 | 183.203.208.168:8118
151 | 183.207.224.13:80
152 | 183.207.224.14:80
153 | 183.207.224.15:80
154 | 183.207.224.43:80
155 | 183.207.224.44:80
156 | 183.207.224.45:80
157 | 183.207.228.115:80
158 | 183.207.228.51:80
159 | 183.207.228.6:89
160 | 183.207.229.146:80
161 | 183.207.237.11:80
162 | 183.207.237.11:81
163 | 183.207.237.11:83
164 | 183.207.237.11:84
165 | 183.207.237.11:85
166 | 183.207.237.11:86
167 | 183.216.255.18:8118
168 | 183.218.63.102:8118
169 | 183.218.63.168:8118
170 | 183.218.63.179:8118
171 | 183.218.63.59:8118
172 | 183.230.53.26:8123
173 | 183.60.156.17:8888
174 | 186.92.172.126:8080
175 | 202.104.208.0:80
176 | 202.104.208.1:80
177 | 202.106.169.142:80
178 | 202.108.35.151:80
179 | 202.119.25.227:9999
180 | 202.119.25.69:9999
181 | 202.119.25.72:9999
182 | 202.119.25.73:9999
183 | 202.39.175.6:3128
184 | 203.192.12.146:80
185 | 203.75.167.252:8000
186 | 210.14.158.122:80
187 | 211.141.130.105:8118
188 | 211.141.130.106:8118
189 | 211.141.130.108:8118
190 | 211.141.130.109:8118
191 | 211.141.130.112:8118
192 | 211.141.130.114:8118
193 | 211.141.130.186:8118
194 | 211.141.130.245:8118
195 | 211.141.130.252:8118
196 | 211.141.130.253:8001
197 | 211.141.130.253:8118
198 | 211.141.130.254:8118
199 | 211.141.130.56:8118
200 | 211.141.130.96:8118
201 | 211.141.81.212:8118
202 | 211.141.81.215:8118
203 | 211.141.82.246:8118
204 | 211.141.82.247:8118
205 | 211.144.81.69:18000
206 | 211.150.65.29:80
207 | 218.204.140.104:8118
208 | 218.204.140.106:8118
209 | 218.204.140.212:8001
210 | 218.204.140.97:8118
211 | 218.204.141.92:8118
212 | 218.204.143.132:8118
213 | 218.204.143.137:8118
214 | 218.204.143.4:8118
215 | 218.204.143.85:8118
216 | 218.204.143.87:8001
217 | 218.204.143.87:8118
218 | 218.205.65.186:3128
219 | 218.240.156.82:80
220 | 218.241.158.250:3128
221 | 218.28.96.39:3128
222 | 218.4.236.117:80
223 | 218.75.26.44:808
224 | 218.78.210.190:8080
225 | 218.78.210.54:8080
226 | 218.89.170.75:8888
227 | 218.90.174.167:3128
228 | 218.97.194.200:81
229 | 218.97.194.201:80
230 | 218.97.194.202:80
231 | 218.97.194.212:80
232 | 218.97.194.216:80
233 | 218.97.194.218:80
234 | 218.97.194.219:80
235 | 218.97.194.220:80
236 | 218.97.194.221:80
237 | 218.97.194.222:80
238 | 218.97.194.223:80
239 | 218.97.195.38:81
240 | 218.97.195.39:80
241 | 218.97.195.40:80
242 | 218.97.195.43:80
243 | 221.10.102.203:80
244 | 221.10.102.203:81
245 | 221.10.102.203:83
246 | 221.10.102.203:843
247 | 221.5.69.51:80
248 | 222.45.195.34:8118
249 | 222.45.196.17:8118
250 | 222.45.196.46:8118
251 | 222.45.212.130:8118
252 | 222.45.85.210:8118
253 | 222.45.85.53:8118
254 | 223.100.98.44:8000
255 | 223.94.145.126:8123
256 | 27.115.75.114:8080
257 | 39.190.108.152:8123
258 | 39.190.109.100:8123
259 | 42.159.193.148:80
260 | 5.9.190.133:8080
261 | 58.134.102.3:13530
262 | 58.220.2.130:80
263 | 58.220.2.132:80
264 | 58.220.2.133:80
265 | 58.220.2.134:80
266 | 58.220.2.136:80
267 | 58.220.2.137:80
268 | 58.220.2.138:80
269 | 58.220.2.139:80
270 | 58.220.2.140:80
271 | 58.220.2.141:80
272 | 58.220.2.142:80
273 | 58.220.2.145:80
274 | 58.220.2.148:80
275 | 58.220.2.153:80
276 | 58.220.2.156:80
277 | 58.221.85.182:3128
278 | 58.240.227.204:80
279 | 58.251.132.181:8888
280 | 58.251.78.71:8088
281 | 58.253.238.242:80
282 | 58.253.238.243:80
283 | 59.61.79.124:8118
284 | 60.194.67.230:8118
285 | 60.195.3.180:8118
286 | 60.206.153.177:8118
287 | 60.216.225.157:8118
288 | 61.133.51.6:9797
289 | 61.144.14.67:8085
290 | 61.154.127.136:10001
291 | 61.156.3.166:80
292 | 61.162.223.41:9797
293 | 61.184.192.42:80
294 | 61.237.162.119:8118
295 | 61.54.221.200:3128
296 | 


--------------------------------------------------------------------------------
/src/main/webapp/META-INF/context.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <Context antiJARLocking="true" path="/rank"/>
3 | 


--------------------------------------------------------------------------------
/src/main/webapp/WEB-INF/web.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <web-app version="3.0" xmlns="http://java.sun.com/xml/ns/javaee" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://java.sun.com/xml/ns/javaee http://java.sun.com/xml/ns/javaee/web-app_3_0.xsd">
3 |     <session-config>
4 |         <session-timeout>
5 |             30
6 |         </session-timeout>
7 |     </session-config>
8 | </web-app>
9 | 


--------------------------------------------------------------------------------
/src/main/webapp/index.jsp:
--------------------------------------------------------------------------------
 1 | <%--
 2 |    APDPlat - Application Product Development Platform
 3 |    Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
 4 |    
 5 |    This program is free software: you can redistribute it and/or modify
 6 |    it under the terms of the GNU General Public License as published by
 7 |    the Free Software Foundation, either version 3 of the License, or
 8 |    (at your option) any later version.
 9 |    
10 |    This program is distributed in the hope that it will be useful,
11 |    but WITHOUT ANY WARRANTY; without even the implied warranty of
12 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 |    GNU General Public License for more details.
14 |    
15 |    You should have received a copy of the GNU General Public License
16 |    along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 | --%>
18 | 
19 | <%@page contentType="text/html" pageEncoding="UTF-8"%>
20 | <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
21 | <html>
22 |     <head>
23 |         <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
24 |         <title>获取标题和链接API调用演示</title>
25 |     </head>
26 |     <body>
27 |         <h2><font color="blue">获取标题和链接API调用演示</font></h2>
28 |         
29 |         <form action="GetArticle" method="post">
30 |             栏目入口URL地址：<input name="url" size="150" maxlength="150"><br/>
31 |             下一页CSS路径：<input name="nextPageCssQuery" size="150" maxlength="150"><br/>
32 |             下一页标签文本：<input name="nextPageText" size="150" maxlength="150"><br/>
33 |             标题CSS路径：<input name="titleCssQuery" size="150" maxlength="150"><br/>
34 |             <input type="submit" value="获取标题和链接"/>
35 |         </form>
36 |     </body>
37 | </html>


--------------------------------------------------------------------------------
/src/main/webapp/rank.jsp:
--------------------------------------------------------------------------------
 1 | <%--
 2 |    APDPlat - Application Product Development Platform
 3 |    Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
 4 |    
 5 |    This program is free software: you can redistribute it and/or modify
 6 |    it under the terms of the GNU General Public License as published by
 7 |    the Free Software Foundation, either version 3 of the License, or
 8 |    (at your option) any later version.
 9 |    
10 |    This program is distributed in the hope that it will be useful,
11 |    but WITHOUT ANY WARRANTY; without even the implied warranty of
12 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 |    GNU General Public License for more details.
14 |    
15 |    You should have received a copy of the GNU General Public License
16 |    along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 | --%>
18 | 
19 | <%@page contentType="text/html" pageEncoding="UTF-8"%>
20 | <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
21 | <html>
22 |     <head>
23 |         <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
24 |         <title>获取关键词和链接的百度排名API调用演示</title>
25 |     </head>
26 |     <body>
27 |         <h2><font color="blue">获取关键词和链接的百度排名API调用演示</font></h2>
28 |         
29 |         <form action="GetRank" method="post">
30 |             URL地址：<input name="url" size="150" maxlength="150"><br/>
31 |             关键词：<input name="keyword" size="150" maxlength="150"><br/>
32 |             <input type="submit" value="获取关键词和链接的百度排名"/>
33 |         </form>
34 |     </body>
35 | </html>


--------------------------------------------------------------------------------
/src/main/webapp/ranks.jsp:
--------------------------------------------------------------------------------
 1 | <%--
 2 |    APDPlat - Application Product Development Platform
 3 |    Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
 4 |    
 5 |    This program is free software: you can redistribute it and/or modify
 6 |    it under the terms of the GNU General Public License as published by
 7 |    the Free Software Foundation, either version 3 of the License, or
 8 |    (at your option) any later version.
 9 |    
10 |    This program is distributed in the hope that it will be useful,
11 |    but WITHOUT ANY WARRANTY; without even the implied warranty of
12 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 |    GNU General Public License for more details.
14 |    
15 |    You should have received a copy of the GNU General Public License
16 |    along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 | --%>
18 | 
19 | <%@page contentType="text/html" pageEncoding="UTF-8"%>
20 | <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
21 | <html>
22 |     <head>
23 |         <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
24 |         <title>对栏目下的所有文章计算百度排名API调用演示</title>
25 |     </head>
26 |     <body>
27 |         <h2><font color="blue">对栏目下的所有文章计算百度排名API调用演示</font></h2>
28 |         
29 |         <form action="GetListRank" method="post">
30 |             栏目入口URL地址：<input name="url" size="150" maxlength="150"><br/>
31 |             下一页CSS路径：<input name="nextPageCssQuery" size="150" maxlength="150"><br/>
32 |             下一页标签文本：<input name="nextPageText" size="150" maxlength="150"><br/>
33 |             标题CSS路径：<input name="titleCssQuery" size="150" maxlength="150"><br/>
34 |             <input type="submit" value="获取栏目下所有文章的排名"/>
35 |         </form>
36 |     </body>
37 | </html>


--------------------------------------------------------------------------------