├── src
└── main
│ ├── resources
│ ├── generator.properties
│ ├── jdbc.properties
│ ├── applicationContext.xml
│ ├── log4j.properties
│ ├── quartz.xml
│ ├── mybatis-config.xml
│ ├── spring-mvc.xml
│ ├── mybatis-mapper
│ │ ├── PostKeywordMapper.xml
│ │ └── WechatPostMapper.xml
│ ├── spring-mybatis.xml
│ └── generatorConfig.xml
│ ├── webapp
│ ├── index.jsp
│ ├── weekly_posts.jsp
│ └── WEB-INF
│ │ └── web.xml
│ ├── java
│ └── com
│ │ └── wechat
│ │ └── auto
│ │ ├── model
│ │ ├── PostJSP.java
│ │ ├── PostKeyword.java
│ │ └── WechatPost.java
│ │ ├── util
│ │ ├── SimHash.java
│ │ └── Utils.java
│ │ ├── mapper
│ │ ├── PostKeywordMapper.java
│ │ └── WechatPostMapper.java
│ │ ├── controller
│ │ └── WechatController.java
│ │ └── spider
│ │ ├── KeyWordProcessor.java
│ │ ├── SpiderKeyword.java
│ │ ├── PostProcessor.java
│ │ └── SpiderTimer.java
│ └── main.iml
├── README.md
├── WechatSpider.iml
├── rule_default_js
├── rule_default_before20190512.js
└── rule_default_after20190512.js
└── pom.xml
/src/main/resources/generator.properties:
--------------------------------------------------------------------------------
1 | jdbc.driverLocation=/Users/chenwenguan/.m2/repository/mysql/mysql-connector-java/5.1.30/mysql-connector-java-5.1.30.jar
2 | jdbc.driverClass=com.mysql.jdbc.Driver
3 | jdbc.connectionURL=jdbc:mysql://localhost:3306/WechatPost?useUnicode=true&characterEncoding=utf-8
4 | jdbc.userId=root
5 | jdbc.password=*************
--------------------------------------------------------------------------------
/src/main/webapp/index.jsp:
--------------------------------------------------------------------------------
1 | <%@ page language="java" contentType="text/html; charset=utf-8"
2 | pageEncoding="utf-8"%>
3 |
4 |
5 |
6 |
7 | Insert title here
8 |
9 |
10 | 456
11 |
12 |
--------------------------------------------------------------------------------
/src/main/resources/jdbc.properties:
--------------------------------------------------------------------------------
1 | driver=com.mysql.jdbc.Driver
2 |
3 | url=jdbc:mysql://localhost:3306/WechatPost?useUnicode=true&characterEncoding=utf-8
4 | #url=jdbc:mysql://localhost:3306/newsinfo?useUnicode=true&characterEncoding=utf-8
5 | username=root
6 | password=*************
7 |
8 | #定义初始连接数
9 | initialSize=0
10 | #定义最大连接数
11 | maxActive=20
12 | #定义最大空闲
13 | maxIdle=20
14 | #定义最小空闲
15 | minIdle=1
16 | #定义最长等待时间
17 | maxWait=60000
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WechatSpider
2 |
3 | 微信公众号文章爬取JavaWeb服务端实现源码
4 |
5 | 相应的环境配置和源码说明参考个人博客:[微信公众号爬虫:服务端公众号文章数据采集](https://www.chenwenguan.com/wechat-spider-server/)
6 |
7 | 微信公众号文章自动浏览实现参考:[微信公众号爬虫:微信公众号浏览自动化](https://www.chenwenguan.com/wechat-browse-automation/)
8 |
9 |
10 | TIPS:
11 |
12 | 1、generator.properties文件中的jdbc.driverLocation改成自己电脑的connector实际路径,jdbc.userId和jdbc.password改成自己数据库的用户名和密码。
13 |
14 | 2、jdbc.properties文件中的数据库用户名和密码参数也改成自己配置的值。
15 |
16 |
--------------------------------------------------------------------------------
/src/main/java/com/wechat/auto/model/PostJSP.java:
--------------------------------------------------------------------------------
1 | package com.wechat.auto.model;
2 |
/**
 * View model for the JSP page: holds the already-formatted title and digest text so the
 * page does not run into display problems caused by concatenating special strings.
 */
public class PostJSP {

    /** Digest (summary) text shown under the title. */
    private String digest;

    /** Title text shown for the post. */
    private String title;

    /**
     * @return the title text, or {@code null} if none has been set
     */
    public String getTitle() {
        return this.title;
    }

    /**
     * @param title the title text to display
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * @return the digest text, or {@code null} if none has been set
     */
    public String getDigest() {
        return this.digest;
    }

    /**
     * @param digest the digest text to display
     */
    public void setDigest(String digest) {
        this.digest = digest;
    }
}
28 |
--------------------------------------------------------------------------------
/WechatSpider.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/src/main/resources/applicationContext.xml:
--------------------------------------------------------------------------------
1 |
2 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #定义LOG输出级别
2 | log4j.rootLogger=INFO,Console,File
3 | #定义日志输出目的地为控制台
4 | log4j.appender.Console=org.apache.log4j.ConsoleAppender
5 | log4j.appender.Console.Target=System.out
6 | #可以灵活地指定日志输出格式,下面一行是指定具体的格式
7 | log4j.appender.Console.layout = org.apache.log4j.PatternLayout
8 | log4j.appender.Console.layout.ConversionPattern=[%c] - %m%n
9 |
10 | #文件大小到达指定尺寸的时候产生一个新的文件
11 | log4j.appender.File = org.apache.log4j.RollingFileAppender
12 | #指定输出目录
13 | log4j.appender.File.File = logs/ssm.log
14 | #定义文件最大大小
15 | log4j.appender.File.MaxFileSize = 10MB
16 | # 输出所有日志,如果换成DEBUG表示输出DEBUG以上级别日志
17 | log4j.appender.File.Threshold = ALL
18 | log4j.appender.File.layout = org.apache.log4j.PatternLayout
19 | log4j.appender.File.layout.ConversionPattern =[%p] [%d{yyyy-MM-dd HH\:mm\:ss}][%c]%m%n
20 |
21 | log4j.logger.org.mongodb.driver=OFF
--------------------------------------------------------------------------------
/src/main/main.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/src/main/webapp/weekly_posts.jsp:
--------------------------------------------------------------------------------
1 | <%@ taglib prefix="c" uri="http://java.sun.com/jsp/jstl/core"%>
2 | <%@ page language="java" contentType="text/html; charset=utf-8"
3 | pageEncoding="utf-8" %>
4 |
5 |
6 | <%
7 | String path = request.getContextPath();
8 | %>
9 |
10 |
11 |
12 |
13 | Auto技术周报
14 |
15 |
16 |
17 | ##行业新闻
18 |
19 |
20 |
21 | ${postnew.title}
22 | ${postnew.digest}
23 |
24 |
25 |
26 |
27 | ##Android开发
28 |
29 |
30 |
31 | ${androidpost.title}
32 | ${androidpost.digest}
33 |
34 |
35 |
36 |
37 | ##技术纵横
38 |
39 |
40 |
41 | ${extendpost.title}
42 | ${extendpost.digest}
43 |
44 |
45 |
46 |
47 |
48 |
--------------------------------------------------------------------------------
/src/main/resources/quartz.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 |
7 |
8 |
9 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 | 0 0 9,0,0,0 * * ?
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/src/main/resources/mybatis-config.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/src/main/resources/spring-mvc.xml:
--------------------------------------------------------------------------------
1 |
2 |
12 |
13 |
14 |
15 |
16 |
17 |
19 |
20 |
21 | text/html;charset=UTF-8
22 |
23 |
24 |
25 |
26 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/src/main/webapp/WEB-INF/web.xml:
--------------------------------------------------------------------------------
1 |
2 |
6 |
7 | WechatSpider
8 |
9 |
10 |
11 | contextConfigLocation
12 |
13 | classpath:spring-mybatis.xml,
14 | classpath:quartz.xml
15 |
16 |
17 |
18 |
19 | encodingFilter
20 | org.springframework.web.filter.CharacterEncodingFilter
21 | true
22 |
23 | encoding
24 | UTF-8
25 |
26 |
27 |
28 | encodingFilter
29 | /*
30 |
31 |
32 |
33 | org.springframework.web.context.ContextLoaderListener
34 |
35 |
36 |
37 | org.springframework.web.util.IntrospectorCleanupListener
38 |
39 |
40 |
41 |
42 | SpringMVC
43 | org.springframework.web.servlet.DispatcherServlet
44 |
45 | contextConfigLocation
46 | classpath:spring-mvc.xml
47 |
48 | 1
49 | true
50 |
51 |
52 | SpringMVC
53 |
54 | /
55 | *.action
56 |
57 |
58 |
59 | index.html
60 | index.htm
61 | index.jsp
62 | default.html
63 | default.htm
64 | default.jsp
65 |
66 |
67 |
68 |
--------------------------------------------------------------------------------
/src/main/resources/mybatis-mapper/PostKeywordMapper.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
21 | id, wordtext, wordfrequency, wordtype
22 |
23 |
24 |
28 | insert into keywordTable
29 |
30 |
31 | id,
32 |
33 |
34 | wordtext,
35 |
36 |
37 | wordfrequency,
38 |
39 |
40 | wordtype,
41 |
42 |
43 |
44 |
45 | #{id,jdbcType=INTEGER},
46 |
47 |
48 | #{wordtext,jdbcType=VARCHAR},
49 |
50 |
51 | #{wordfrequency,jdbcType=INTEGER},
52 |
53 |
54 | #{wordtype,jdbcType=INTEGER},
55 |
56 |
57 |
58 |
59 |
63 | update keywordTable
64 |
65 |
66 | wordtext = #{wordtext,jdbcType=VARCHAR},
67 |
68 |
69 | wordfrequency = #{wordfrequency,jdbcType=INTEGER},
70 |
71 |
72 | wordtype = #{wordtype,jdbcType=INTEGER},
73 |
74 |
75 | where id = #{id,jdbcType=INTEGER}
76 |
77 |
78 |
--------------------------------------------------------------------------------
/src/main/java/com/wechat/auto/util/SimHash.java:
--------------------------------------------------------------------------------
1 | package com.wechat.auto.util;
2 |
3 | import java.math.BigInteger;
4 | import java.util.HashMap;
5 |
6 |
/**
 * Computes a {@value #HASH_BITS}-bit SimHash fingerprint from weighted tokens and the
 * Hamming distance between two fingerprints.
 * Reference blog: https://www.cnblogs.com/shaosks/p/9121774.html
 */
public class SimHash {

    /** Number of bits in a fingerprint. */
    public static final int HASH_BITS = 128;

    /** Bit mask with the lowest {@link #HASH_BITS} bits set (2^HASH_BITS - 1). */
    private static final BigInteger HASH_MASK =
            BigInteger.ONE.shiftLeft(HASH_BITS).subtract(BigInteger.ONE);

    /** Multiplier used by the per-token string hash. */
    private static final BigInteger HASH_MULTIPLIER = BigInteger.valueOf(1000003L);

    /** Token text mapped to its weight. */
    private final HashMap<String, Integer> mTokens;

    /** Fingerprint computed once at construction time. */
    private final BigInteger mSimHashValue;

    /**
     * Builds the fingerprint for the given weighted tokens.
     *
     * @param tokens token text mapped to its weight; must not be {@code null} and must not
     *               contain {@code null} weights
     */
    public SimHash(HashMap<String, Integer> tokens) {
        this.mTokens = tokens;
        this.mSimHashValue = simHash();
    }

    /**
     * @return the fingerprint computed from the tokens passed to the constructor
     */
    public BigInteger getSimHashValue() {
        return mSimHashValue;
    }

    /**
     * Accumulates, per bit position, +weight when the token hash has that bit set and
     * -weight otherwise; a fingerprint bit is set when the accumulated value is not negative.
     */
    private BigInteger simHash() {
        int[] v = new int[HASH_BITS];

        for (String key : mTokens.keySet()) {
            int weight = mTokens.get(key);
            BigInteger tokenHash = hash(key);
            for (int i = 0; i < HASH_BITS; i++) {
                if (tokenHash.testBit(i)) {
                    v[i] += weight; // weighted vote for a 1 bit
                } else {
                    v[i] -= weight; // weighted vote for a 0 bit
                }
            }
        }

        BigInteger fingerprint = BigInteger.ZERO;
        for (int i = 0; i < HASH_BITS; i++) {
            if (v[i] >= 0) {
                fingerprint = fingerprint.setBit(i);
            }
        }
        return fingerprint;
    }

    /**
     * Hashes one token into a non-negative value of at most {@link #HASH_BITS} bits.
     * Returns zero for a {@code null} or empty token.
     */
    private BigInteger hash(String source) {
        if (source == null || source.length() == 0) {
            return BigInteger.ZERO;
        }
        char[] sourceArray = source.toCharArray();
        BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) << 7);
        for (char item : sourceArray) {
            x = x.multiply(HASH_MULTIPLIER).xor(BigInteger.valueOf((long) item)).and(HASH_MASK);
        }
        // x is masked to a non-negative value and the length is non-negative, so the result
        // can never be -1; the former "-1 becomes -2" branch was unreachable and is removed.
        return x.xor(BigInteger.valueOf(source.length()));
    }

    /**
     * Computes the Hamming distance between two fingerprints, i.e. the number of differing
     * bits among the lowest {@link #HASH_BITS} bits.
     *
     * @param leftSimHash  first fingerprint to compare
     * @param rightSimHash second fingerprint to compare
     * @return number of bit positions in which the two fingerprints differ
     */
    public static int hammingDistance(BigInteger leftSimHash, BigInteger rightSimHash) {
        // Masking yields a non-negative value, for which bitCount() is the number of set bits.
        return leftSimHash.xor(rightSimHash).and(HASH_MASK).bitCount();
    }
}
95 |
--------------------------------------------------------------------------------
/src/main/java/com/wechat/auto/model/PostKeyword.java:
--------------------------------------------------------------------------------
1 | package com.wechat.auto.model;
2 |
3 | import javax.persistence.criteria.CriteriaBuilder;
4 |
public class PostKeyword {

    /**
     * Value of the database column keywordTable.id (generated by MyBatis Generator).
     *
     * @mbggenerated
     */
    private Integer id;

    /**
     * Value of the database column keywordTable.wordtext (generated by MyBatis Generator).
     *
     * @mbggenerated
     */
    private String wordtext;

    /**
     * Value of the database column keywordTable.wordfrequency (generated by MyBatis Generator).
     *
     * @mbggenerated
     */
    private Integer wordfrequency;

    /**
     * Value of the database column keywordTable.wordtype (generated by MyBatis Generator).
     *
     * @mbggenerated
     */
    private Integer wordtype;

    /**
     * Creates a record for the database table keywordTable (generated by MyBatis Generator).
     *
     * @param id            value of keywordTable.id
     * @param wordtext      value of keywordTable.wordtext
     * @param wordfrequency value of keywordTable.wordfrequency
     * @param wordtype      value of keywordTable.wordtype
     *
     * @mbggenerated
     */
    public PostKeyword(Integer id, String wordtext, Integer wordfrequency, Integer wordtype) {
        this.wordtype = wordtype;
        this.wordfrequency = wordfrequency;
        this.wordtext = wordtext;
        this.id = id;
    }

    /**
     * @return the value of keywordTable.id
     *
     * @mbggenerated
     */
    public Integer getId() {
        return this.id;
    }

    /**
     * @return the value of keywordTable.wordtext
     *
     * @mbggenerated
     */
    public String getWordtext() {
        return this.wordtext;
    }

    /**
     * @return the value of keywordTable.wordfrequency
     *
     * @mbggenerated
     */
    public Integer getWordfrequency() {
        return this.wordfrequency;
    }

    /**
     * @return the value of keywordTable.wordtype
     *
     * @mbggenerated
     */
    public Integer getWordtype() {
        return this.wordtype;
    }

    /**
     * Overwrites the word frequency; despite its name this setter writes the
     * {@code wordfrequency} field.
     *
     * @param wordfrequency the new frequency value
     */
    public void setWeight(Integer wordfrequency) {
        this.wordfrequency = wordfrequency;
    }
}
--------------------------------------------------------------------------------
/src/main/resources/spring-mybatis.xml:
--------------------------------------------------------------------------------
1 |
2 |
12 |
13 |
14 |
15 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 | classpath:mybatis-mapper/*.xml
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
59 |
60 |
61 |
62 |
--------------------------------------------------------------------------------
/src/main/java/com/wechat/auto/mapper/PostKeywordMapper.java:
--------------------------------------------------------------------------------
1 | package com.wechat.auto.mapper;
2 |
3 | import com.wechat.auto.model.PostKeyword;
4 | import org.apache.ibatis.annotations.*;
5 | import org.springframework.stereotype.Repository;
6 |
7 | import java.util.List;
8 |
9 | @Repository
10 | public interface PostKeywordMapper {
11 | /**
12 | * This method was generated by MyBatis Generator.
13 | * This method corresponds to the database table keywordTable
14 | *
15 | * @mbggenerated
16 | */
17 | @Delete({
18 | "delete from keywordTable",
19 | "where id = #{id,jdbcType=INTEGER}"
20 | })
21 | int deleteByPrimaryKey(Integer id);
22 |
23 | /**
24 | * This method was generated by MyBatis Generator.
25 | * This method corresponds to the database table keywordTable
26 | *
27 | * @mbggenerated
28 | */
29 | @Insert({
30 | "insert into keywordTable (id, wordtext, ",
31 | "wordfrequency, wordtype)",
32 | "values (#{id,jdbcType=INTEGER}, #{wordtext,jdbcType=VARCHAR}, ",
33 | "#{wordfrequency,jdbcType=INTEGER}, #{wordtype,jdbcType=INTEGER})"
34 | })
35 | int insert(PostKeyword record);
36 |
37 | /**
38 | * This method was generated by MyBatis Generator.
39 | * This method corresponds to the database table keywordTable
40 | *
41 | * @mbggenerated
42 | */
43 | int insertSelective(PostKeyword record);
44 |
45 | /**
46 | * This method was generated by MyBatis Generator.
47 | * This method corresponds to the database table keywordTable
48 | *
49 | * @mbggenerated
50 | */
51 | @Select({
52 | "select",
53 | "id, wordtext, wordfrequency, wordtype",
54 | "from keywordTable",
55 | "where id = #{id,jdbcType=INTEGER}"
56 | })
57 | @ResultMap("BaseResultMap")
58 | PostKeyword selectByPrimaryKey(Integer id);
59 |
60 | /**
61 | * This method was generated by MyBatis Generator.
62 | * This method corresponds to the database table keywordTable
63 | *
64 | * @mbggenerated
65 | */
66 | int updateByPrimaryKeySelective(PostKeyword record);
67 |
68 | /**
69 | * This method was generated by MyBatis Generator.
70 | * This method corresponds to the database table keywordTable
71 | *
72 | * @mbggenerated
73 | */
74 | @Update({
75 | "update keywordTable",
76 | "set wordtext = #{wordtext,jdbcType=VARCHAR},",
77 | "wordfrequency = #{wordfrequency,jdbcType=INTEGER},",
78 | "wordtype = #{wordtype,jdbcType=INTEGER}",
79 | "where id = #{id,jdbcType=INTEGER}"
80 | })
81 | int updateByPrimaryKey(PostKeyword record);
82 |
83 | @Select({
84 | "select",
85 | "id, wordtext, wordfrequency, wordtype",
86 | "from keywordTable",
87 | "where wordtext = #{wordtext,jdbcType=INTEGER}"
88 | })
89 | @ResultMap("BaseResultMap")
90 | PostKeyword selectKeyWord(@Param("wordtext") String wordtext);
91 |
92 | @Select({
93 | "select",
94 | "id, wordtext, wordfrequency, wordtype",
95 | "from keywordTable",
96 | "where wordtype = #{wordtype,jdbcType=INTEGER}",
97 | "order by wordfrequency DESC"
98 | })
99 | @ResultMap("BaseResultMap")
100 | List selectAllKeyWord(@Param("wordtype") Integer wordtype);
101 |
102 | @Delete({
103 | "delete from keywordTable"
104 | })
105 | int deleteAll();
106 | }
--------------------------------------------------------------------------------
/src/main/java/com/wechat/auto/controller/WechatController.java:
--------------------------------------------------------------------------------
1 | package com.wechat.auto.controller;
2 |
3 | import com.wechat.auto.mapper.WechatPostMapper;
4 | import com.wechat.auto.model.PostJSP;
5 | import com.wechat.auto.model.WechatPost;
6 | import com.wechat.auto.util.Utils;
7 | import org.springframework.beans.factory.annotation.Autowired;
8 | import org.springframework.stereotype.Controller;
9 | import org.springframework.web.bind.annotation.RequestMapping;
10 | import org.springframework.web.servlet.ModelAndView;
11 |
12 | import javax.servlet.http.HttpServletRequest;
13 | import javax.servlet.http.HttpServletResponse;
14 | import java.net.URLDecoder;
15 | import java.util.ArrayList;
16 | import java.util.Calendar;
17 | import java.util.Date;
18 | import java.util.List;
19 |
20 | @Controller
21 | @RequestMapping("/getData")
22 | public class WechatController {
23 |
24 | @Autowired
25 | private WechatPostMapper wechatPostMapper;
26 |
27 | private final String HOST = "https://mp.weixin.qq.com/s?";
28 |
29 | @RequestMapping("/getWxPost")
30 | public void getWxPost(HttpServletRequest request, HttpServletResponse response){
31 | String url = request.getParameter("url");
32 | try{
33 | url = URLDecoder.decode(url, "utf-8");
34 | System.out.println("=================url : "+ url);
35 | WechatPost post;
36 | post = wechatPostMapper.getPostByUrl(HOST + url);
37 | System.out.println("=================post : "+ post);
38 | if(post == null){
39 | post = new WechatPost();
40 | post.setContenturl(HOST + url);
41 | post.setIsspider(0);
42 | wechatPostMapper.insert(post);
43 | }
44 | }catch(Exception e){
45 | e.printStackTrace();
46 | System.out.println("=================insert exception : "+ e.getMessage());
47 | }
48 | }
49 |
50 | @RequestMapping(value="/weekly_posts")
51 | public ModelAndView weeklyPosts(){
52 | Calendar c = Calendar.getInstance();
53 | c.setTime(new Date());
54 | c.add(Calendar.DATE, - 7);
55 | Date weekBefore = c.getTime();
56 |
57 | System.out.println("================weekbefore : " + weekBefore.getTime());
58 | List newsPost = wechatPostMapper.getPostByTypeInWeek(Utils.KEYWORD_TYPE_NEWS, weekBefore);
59 |
60 | List androidPost = wechatPostMapper.getPostByTypeInWeek(Utils.KEYWORD_TYPE_ANDROID, weekBefore);
61 |
62 | List extendPost = wechatPostMapper.getPostByTypeInWeek(Utils.KEYWORD_TYPE_EXTEND, weekBefore);
63 |
64 | List newsJSP = new ArrayList();
65 | List androidsJSP = new ArrayList();
66 | List extendsJSP = new ArrayList();
67 |
68 | initJspData(newsJSP, newsPost);
69 | initJspData(androidsJSP, androidPost);
70 | initJspData(extendsJSP, extendPost);
71 |
72 | ModelAndView mdl = new ModelAndView();
73 |
74 | mdl.setViewName("weekly_posts");
75 | mdl.addObject("newspost", newsJSP);
76 | mdl.addObject("androidspost", androidsJSP);
77 | mdl.addObject("extendspost", extendsJSP);
78 |
79 | return mdl;
80 | }
81 |
82 | private void initJspData(List jspArray, List postArray){
83 | if(postArray != null && postArray.size() > 0){
84 | String title = null;
85 | WechatPost itemPost = null;
86 | PostJSP itemJsp = null;
87 | int arraySize = postArray.size();
88 | for (int i=0;i< arraySize;i++){
89 | itemPost = postArray.get(i);
90 | title = "###" + String.valueOf(i+1) + ".["+itemPost.getTitle() + "](" + itemPost.getContenturl() +")";
91 | itemJsp = new PostJSP();
92 | itemJsp.setTitle(title);
93 | itemJsp.setDigest(itemPost.getDigest());
94 | jspArray.add(i, itemJsp);
95 | }
96 | }
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
/src/main/resources/generatorConfig.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
61 |
62 |
63 |
71 |
72 |
76 |
77 |
78 |
81 |
82 |
83 |
84 |
85 |
90 |
91 |
96 |
97 |
98 |
--------------------------------------------------------------------------------
/src/main/java/com/wechat/auto/spider/KeyWordProcessor.java:
--------------------------------------------------------------------------------
1 | package com.wechat.auto.spider;
2 |
3 | import com.hankcs.hanlp.HanLP;
4 | import com.hankcs.hanlp.mining.word.WordInfo;
5 | import com.wechat.auto.mapper.PostKeywordMapper;
6 | import com.wechat.auto.model.PostKeyword;
7 | import com.wechat.auto.util.Utils;
8 | import org.apache.http.util.TextUtils;
9 | import us.codecraft.webmagic.Page;
10 | import us.codecraft.webmagic.Site;
11 | import us.codecraft.webmagic.Spider;
12 | import us.codecraft.webmagic.downloader.HttpClientDownloader;
13 | import us.codecraft.webmagic.monitor.SpiderMonitor;
14 | import us.codecraft.webmagic.processor.PageProcessor;
15 |
16 | import javax.management.JMException;
17 | import java.util.*;
18 |
19 | public class KeyWordProcessor implements PageProcessor {
20 |
21 | // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
22 | private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
23 |
24 | private static PostKeywordMapper mKeywordMapper;
25 |
26 | private HashMap mKeyWordMap = new HashMap();
27 |
28 | private PostKeywordMapper postKeywordMapper;
29 |
30 | private int mUrlType = -1;
31 |
32 | public void setUrlType(int type){
33 | this.mUrlType = type;
34 | }
35 |
36 | public void setPostMapper(PostKeywordMapper keywordMapper){
37 | postKeywordMapper = keywordMapper;
38 | }
39 |
40 | @Override
41 | public void process(Page page) {
42 | // TODO Auto-generated method stub
43 | String content = page.getHtml().xpath("//div[@id='js_content']").get();
44 |
45 | if(TextUtils.isEmpty(content)){
46 | System.out.println("文章已和谐!");
47 | return;
48 | }
49 |
50 | String contentTxt = Utils.stripHtml(content).trim();//纯文本内容
51 |
52 | List keyWords = HanLP.extractWords(contentTxt, 30);
53 |
54 | WordInfo[] wordInfos = new WordInfo[keyWords.size()];
55 | for(int i=0;i< keyWords.size();i++){
56 | wordInfos[i] = keyWords.get(i);
57 | }
58 | if (keyWords != null && keyWords.size()> 0){
59 | int size = keyWords.size();
60 | WordInfo word;
61 | // StringBuilder keyWordSb = new StringBuilder();
62 | Arrays.sort(wordInfos, new Comparator() {
63 |
64 | @Override
65 | public int compare(WordInfo o1, WordInfo o2) {
66 | return -(o1.frequency - o2.frequency);
67 | }
68 | });
69 |
70 | for(int i=0;i listdata = postKeywordMapper.selectAllKeyWord(Utils.KEYWORD_TYPE_ANDROID);
96 |
97 | //PostKeyword word = postKeywordMapper.selectByPrimaryKey(10);
98 |
99 | //System.out.println("================word text : " + word.getWordtext() + word.getWordfrequency());
100 |
101 |
102 | // System.out.println("================result size: " + listdata.size());
103 | //
104 | // StringBuilder sb = new StringBuilder();
105 | // PostKeyword tempData;
106 | // for (int i=0;i< listdata.size();i++){
107 | // tempData = listdata.get(i);
108 | // sb.append(tempData.getWordtext() + " : " + tempData.getWordfrequency() + " , ");
109 | // }
110 | //
111 | // System.out.println("================result : " + sb.toString());
112 |
113 | // int deleteResult = postKeywordMapper.deleteAll();
114 | // System.out.println("======================delete result : " + deleteResult);
115 | }
116 |
117 |
118 | }
119 |
--------------------------------------------------------------------------------
/src/main/java/com/wechat/auto/spider/PostProcessor.java:
--------------------------------------------------------------------------------
1 | package com.wechat.auto.spider;
2 |
3 | import com.wechat.auto.mapper.WechatPostMapper;
4 | import com.wechat.auto.model.WechatPost;
5 | import com.wechat.auto.util.Utils;
6 | import org.apache.commons.lang3.time.DateFormatUtils;
7 | import org.apache.http.util.TextUtils;
8 | import us.codecraft.webmagic.Page;
9 | import us.codecraft.webmagic.Site;
10 | import us.codecraft.webmagic.Spider;
11 | import us.codecraft.webmagic.downloader.HttpClientDownloader;
12 | import us.codecraft.webmagic.monitor.SpiderMonitor;
13 | import us.codecraft.webmagic.processor.PageProcessor;
14 |
15 | import javax.management.JMException;
16 | import java.math.BigInteger;
17 | import java.util.Date;
18 |
19 | public class PostProcessor implements PageProcessor {
20 |
21 | // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
22 | private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
23 |
24 | private WechatPostMapper wechatPostMapper;
25 |
26 | private BigInteger mAndroidSimHash;
27 |
28 | private BigInteger mExtendSimHash;
29 |
30 | public void setPostMapper(WechatPostMapper postMapper) {
31 | this.wechatPostMapper = postMapper;
32 | }
33 |
34 | public void setKeyWordMap(BigInteger androidSimHash, BigInteger extendSimHash){
35 | this.mAndroidSimHash = androidSimHash;
36 | this.mExtendSimHash = extendSimHash;
37 | }
38 |
39 | @Override
40 | public void process(Page page) {
41 | //System.out.println("============page : " + page.getHtml().toString());
42 |
43 | // TODO Auto-generated method stub
44 | String content = page.getHtml().xpath("//div[@id='js_content']").get();
45 | String requestUrl = page.getUrl().toString();
46 | System.out.println("===========request Url : "+ requestUrl);
47 | if(TextUtils.isEmpty(content)){
48 | System.out.println("文章已和谐!");
49 | wechatPostMapper.deleteByRequestUrl(requestUrl);
50 | return;
51 | }
52 |
53 | String contentTxt = Utils.stripHtml(content).trim();//纯文本内容
54 | //System.out.println("===========content : "+ contentTxt);
55 | String htmlStr = page.getHtml().toString();
56 |
57 | String biz = Utils.stripVarValue(htmlStr, Utils.VAR_BIZ);
58 | String msglink = Utils.stripVarValue(htmlStr, Utils.VAR_MSG_LINK);
59 | String msgId = Utils.stripVarValue(msglink, Utils.MSG_ID);
60 | String title= Utils.stripVarValue(htmlStr, Utils.VAR_TITLE);
61 | String digest = Utils.stripVarValue(htmlStr, Utils.VAR_DIGEST);
62 | String contentDesc = "";
63 | if(!TextUtils.isEmpty(contentTxt.trim())){
64 | contentDesc = Utils.stripDesc(contentTxt);
65 | }
66 | String time = Utils.stripVarValue(htmlStr, Utils.VAR_TIME);
67 | //String svrTime = Utils.stripVarValue(htmlStr, Utils.SVR_TIME);
68 | //Date dateTime = new Date(Long.parseLong(svrTime) * 1000);
69 |
70 | Date dateTime = Utils.strToDate(time);
71 |
72 | System.out.println("=================== svrTime : "+ time + " ; " + DateFormatUtils.format(dateTime,"yyyy-MM-dd HH:mm:ss"));
73 |
74 | String nickName = Utils.stripVarValue(htmlStr, Utils.VAR_NICKNAME);
75 | String coverUrl = Utils.stripVarValue(htmlStr, Utils.VAR_COVER_URL);
76 | String msgSourceUrl = Utils.stripVarValue(htmlStr, Utils.VAR_MSG_SOURCE_URL);
77 | String author = Utils.stripVarValue(htmlStr, Utils.AUTHOR);
78 |
79 | String articleType = Utils.stripVarValue(htmlStr, Utils.VAR_ARTICLE_TYPE);
80 | if(TextUtils.isEmpty(digest) && !TextUtils.isEmpty(contentDesc.trim())){
81 | digest = contentDesc;
82 | }
83 |
84 | digest.replaceAll("\n","");
85 |
86 | System.out.println("===========contentTxt length: "+ contentTxt.length());
87 | System.out.println("===========biz : "+biz);
88 | System.out.println("===========msglink : "+msglink);
89 | System.out.println("===========msgid : "+msgId);
90 | System.out.println("===========title : "+title);
91 | System.out.println("===========desc : "+digest);
92 | System.out.println("===========time : "+ time + " ; date time : " + dateTime.toString());
93 | System.out.println("===========nickName : "+nickName);
94 | System.out.println("===========coverUrl : "+coverUrl);
95 | System.out.println("===========msgSourceUrl : "+msgSourceUrl);
96 | System.out.println("===========autor : "+author);
97 | System.out.println("===========article : "+articleType);
98 |
99 | int postType = Utils.analysisType(nickName, title, contentTxt);
100 | int weight = Utils.analysisWeight(nickName, title, contentTxt);
101 | System.out.println("===========postType : "+postType);
102 | System.out.println("===========weight : "+weight);
103 |
104 | WechatPost post = wechatPostMapper.getPostByUrl(requestUrl);
105 | System.out.println("===========get post by url : "+post);
106 | post.setBiz(biz);
107 | post.setAppmsgid(msgId);
108 | post.setTitle(title);
109 | post.setDigest(digest);
110 | post.setSourceurl(msgSourceUrl);
111 | post.setConver(coverUrl);
112 | post.setDatetime(dateTime);
113 | post.setIsspider(1);
114 | post.setAuthor(author);
115 | post.setNickname(nickName);
116 | post.setWeight(weight);
117 | post.setPosttype(postType);
118 | post.setContent(contentTxt);
119 | /**
120 | * 更新文章的实际链接地址参数,msglink链接更短
121 | */
122 | post.setContenturl(msglink);
123 | try {
124 | int updateResult = wechatPostMapper.updateByPrimaryKeySelective(post);
125 | System.out.println("===========update result : " + updateResult);
126 | } catch (Exception e) {
127 | /**
128 | * 文章内容的特殊字符会导致写入数据库失败,置空处理
129 | */
130 | post.setContent("");
131 | int updateResult = wechatPostMapper.updateByPrimaryKeySelective(post);
132 | System.out.println("===========update result exception : " + e.getMessage());
133 | }
134 |
135 |
136 | }
137 |
138 | @Override
139 | public Site getSite() {
140 | // TODO Auto-generated method stub
141 | return this.site;
142 | }
143 |
144 | public static void startSpider(WechatPostMapper wechatPostMapper,
145 | String... urls){//PostMapper myPostMapper
146 | HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
147 | PostProcessor spiderModel = new PostProcessor();
148 | spiderModel.setPostMapper(wechatPostMapper);
149 | Spider mySpider = Spider.create(spiderModel).addUrl(urls);
150 | mySpider.setDownloader(httpClientDownloader);
151 | try {
152 | SpiderMonitor.instance().register(mySpider);
153 | mySpider.thread(1).run();
154 | } catch (JMException e) {
155 | e.printStackTrace();
156 | }
157 | }
158 |
159 | private void simHashTest(){
160 | /*List keyWords = HanLP.extractWords(contentTxt, 30);
161 |
162 | HashMap postMap = new HashMap();
163 |
164 | WordInfo tempInfo;
165 | for(int i=0;i= 0){
137 | req.anyproxy_map_local = item.local;
138 | return false;
139 | }
140 | });
141 |
142 | return !!req.anyproxy_map_local;
143 | },
144 |
145 | dealLocalResponse : function(req,reqBody,callback){
146 | if(req.replaceLocalFile){
147 | callback(200, {"content-type":"image/png"}, img);
148 | } else if(req.anyproxy_map_local){
149 | fs.readFile(req.anyproxy_map_local,function(err,buffer){
150 | if(err){
151 | callback(200, {}, "[AnyProxy failed to load local file] " + err);
152 | }else{
153 | var header = {
154 | 'Content-Type': utils.contentType(req.anyproxy_map_local)
155 | };
156 | callback(200, header, buffer);
157 | }
158 | });
159 | }
160 | },
161 |
162 | replaceRequestProtocol:function(req,protocol){
163 | },
164 |
165 | replaceRequestOption : function(req,option){
166 | var newOption = option;
167 | //这里面的正则可以替换成自己不希望访问的网址特征字符串,这里面的btrace是一个腾讯视频的域名,经过实践发现特别容易导致浏览器崩溃,所以加在里面了,继续添加可以使用|分割。
168 | if(/google|btrace/i.test(newOption.headers.host)){
169 | newOption.hostname = "127.0.0.1";//这个ip也可以替换成其他的
170 | newOption.port = "80";
171 | }
172 | return newOption;
173 | },
174 |
175 | replaceRequestData: function(req,data){
176 | },
177 |
178 | replaceResponseStatusCode: function(req,res,statusCode){
179 | },
180 |
181 | replaceResponseHeader: function(req,res,header){
182 | },
183 |
184 | // Deprecated
185 | // replaceServerResData: function(req,res,serverResData){
186 | // return serverResData;
187 | // },
188 |
189 | replaceServerResDataAsync: function(req,res,serverResData,callback){
190 | //把微信文章URL发送到服务器
191 | if(/s\?__biz/i.test(req.url)){
192 | try {
193 | HttpPost(req.url, "http://localhost:8080/WechatSpider/getData/getWxPost");
194 | }catch(e){
195 |
196 | }
197 | }
198 | callback(serverResData);
199 | },
200 |
201 | pauseBeforeSendingResponse: function(req,res){
202 | },
203 |
204 | shouldInterceptHttpsReq:function(req){
205 | return interceptFlag;
206 | },
207 |
208 | //[beta]
209 | //fetch entire traffic data
210 | fetchTrafficData: function(id,info){},
211 |
212 | setInterceptFlag: function(flag){
213 | interceptFlag = flag && isRootCAFileExists;
214 | },
215 |
216 | _plugIntoWebinterface: function(app,cb){
217 |
218 | app.get("/filetree",function(req,res){
219 | try{
220 | var root = req.query.root || utils.getUserHome() || "/";
221 | utils.filewalker(root,function(err, info){
222 | res.json(info);
223 | });
224 | }catch(e){
225 | res.end(e);
226 | }
227 | });
228 |
229 | app.use(bodyParser.json());
230 | app.get("/getMapConfig",function(req,res){
231 | res.json(mapConfig);
232 | });
233 | app.post("/setMapConfig",function(req,res){
234 | mapConfig = req.body;
235 | res.json(mapConfig);
236 |
237 | saveMapConfig(mapConfig);
238 | });
239 |
240 | cb();
241 | },
242 |
243 | _getCustomMenu : function(){
244 | return [
245 | // {
246 | // name:"test",
247 | // icon:"uk-icon-lemon-o",
248 | // url :"http://anyproxy.io"
249 | // }
250 | ];
251 | }
252 | };
253 |
254 |
255 |
--------------------------------------------------------------------------------
/rule_default_js/rule_default_after20190512.js:
--------------------------------------------------------------------------------
1 | var utils = require("./util"),
2 | bodyParser = require("body-parser"),
3 | path = require("path"),
4 | fs = require("fs"),
5 | img = fs.readFileSync("/Users/chenwenguan/.nvm/versions/node/v8.9.3/lib/node_modules/anyproxy/lib/one_pixel.png"),//代码绝对路径替换成自己的
6 | Promise = require("promise");
7 |
8 | var isRootCAFileExists = require("./certMgr.js").isRootCAFileExists(),
9 | interceptFlag = false;
10 |
11 | //e.g. [ { keyword: 'aaa', local: '/Users/Stella/061739.pdf' } ]
12 | var mapConfig = [],
13 | configFile = "mapConfig.json";
/**
 * Writes the map configuration to `configFile` inside the AnyProxy home directory.
 *
 * Objects are serialised with JSON.stringify; any other value is written as given.
 * The optional callback is invoked exactly once: with no arguments on success, or
 * with the error on failure. (The previous `.catch(...).done(...)` chain invoked it
 * twice on failure: first with the error, then again without arguments.)
 *
 * @param {Object|string} content configuration to persist
 * @param {Function} [cb] node-style completion callback
 */
function saveMapConfig(content,cb){
    new Promise(function(resolve,reject){
        var anyproxyHome = utils.getAnyProxyHome(),
            mapCfgPath   = path.join(anyproxyHome,configFile);

        if(typeof content == "object"){
            content = JSON.stringify(content);
        }

        fs.writeFile(mapCfgPath, content, function(e){
            if(e){
                reject(e);
            }else{
                resolve();
            }
        });
    })
    .done(function(){
        cb && cb();
    }, function(e){
        cb && cb(e);
    });
}
/**
 * Reads `configFile` from the AnyProxy home directory and parses it as JSON.
 *
 * The optional callback is invoked exactly once: `cb(null, parsed)` on success or
 * `cb(err)` when reading or parsing fails. (The previous `.catch(...).done(...)`
 * chain called it a second time as `cb(null, undefined)` after every failure.)
 *
 * @param {Function} [cb] node-style callback receiving (err, config)
 */
function getMapConfig(cb){
    var read = Promise.denodeify(fs.readFile);

    new Promise(function(resolve,reject){
        var anyproxyHome = utils.getAnyProxyHome(),
            mapCfgPath   = path.join(anyproxyHome,configFile);

        resolve(mapCfgPath);
    })
    .then(read)
    .then(function(content){
        return JSON.parse(content);
    })
    .done(function(obj){
        cb && cb(null,obj);
    }, function(e){
        cb && cb(e);
    });
}
65 |
// One second after this module is loaded, read the saved config file and, if it
// yielded a value, use it as the in-memory mapConfig. Errors are ignored.
setTimeout(function(){
    getMapConfig(function(err, savedConfig){
        if(!savedConfig){
            return;
        }
        mapConfig = savedConfig;
    });
},1000);
74 |
75 | //wexin begin
/**
 * Forwards a value to the local collector as a form-encoded POST.
 *
 * The value is sent as the `url` form field to localhost:8080 at `path`.
 * Response chunks and request errors are only logged; nothing is returned.
 * NOTE(review): the value is passed through encodeURIComponent and then through
 * querystring.stringify, so it reaches the server percent-encoded twice. This is
 * kept as is because the receiving endpoint presumably expects it.
 *
 * @param {string} url     value placed in the `url` form field
 * @param {string} path    request path of the receiving endpoint
 * @param {string} [referer] unused; kept for call compatibility
 */
function HttpPost(url, path, referer) {
    console.log("开始执行转发操作");
    console.log("开始执行转发操作url"+url);
    console.log("开始执行转发操作path"+path);
    try{
        var http = require('http');
        var data = require('querystring').stringify({
            url: encodeURIComponent(url)
        });
        var options = {
            method: "POST",
            host: "localhost", // host name only, without the "http://" scheme
            port: 8080,
            path: path,
            headers: {
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                // Content-Length counts bytes, not UTF-16 code units.
                "Content-Length": Buffer.byteLength(data)
            }
        };
        var req = http.request(options, function (res) {
            res.setEncoding('utf8');
            res.on('data', function (chunk) {
                console.log('BODY-xs: ' + chunk);
            });
        });
        req.on('error', function (e) {
            console.log('problem with request: ' + e.message);
        });

        req.write(data);
        req.end();
    }catch(e){
        console.log("错误信息:"+e);
    }
    console.log("转发操作结束-xs");
}
115 | //wexin end
116 |
117 | module.exports = {
118 | token: Date.now(),
119 | summary:function(){
120 | var tip = "the default rule for AnyProxy.";
121 | if(!isRootCAFileExists){
122 | tip += "\nRoot CA does not exist, will not intercept any https requests.";
123 | }
124 | return tip;
125 | },
126 |
127 | shouldUseLocalResponse : function(req,reqBody){
128 | console.log("====================shouldUseLocalResponse req url: "+req.url);
129 | console.log("====================shouldUseLocalResponse reqBody: "+reqBody);
130 | if(/r=0/i.test(reqBody) && /mp\/getappmsgext\?/i.test(req.url)){
131 | try {
132 | HttpPost(reqBody, "http://localhost:8080/WechatSpider/getData/getWxPost");
133 | }catch(e){
134 |
135 | }
136 | }
137 | if(/mmbiz\.qpic\.cn|wx\.qlogo\.cn/i.test(req.url)){
138 | req.replaceLocalFile = true;
139 | return true;
140 | }
141 | //intercept all options request
142 | var simpleUrl = (req.headers.host || "") + (req.url || "");
143 | mapConfig.map(function(item){
144 | var key = item.keyword;
145 | if(simpleUrl.indexOf(key) >= 0){
146 | req.anyproxy_map_local = item.local;
147 | return false;
148 | }
149 | });
150 |
151 |
152 | return !!req.anyproxy_map_local;
153 | },
154 |
155 | dealLocalResponse : function(req,reqBody,callback){
156 | if(req.replaceLocalFile){
157 | callback(200, {"content-type":"image/png"}, img);
158 | } else if(req.anyproxy_map_local){
159 | fs.readFile(req.anyproxy_map_local,function(err,buffer){
160 | if(err){
161 | callback(200, {}, "[AnyProxy failed to load local file] " + err);
162 | }else{
163 | var header = {
164 | 'Content-Type': utils.contentType(req.anyproxy_map_local)
165 | };
166 | callback(200, header, buffer);
167 | }
168 | });
169 | }
170 | },
171 |
172 | replaceRequestProtocol:function(req,protocol){
173 | },
174 |
175 | replaceRequestOption : function(req,option){
176 | var newOption = option;
177 | //这里面的正则可以替换成自己不希望访问的网址特征字符串,这里面的btrace是一个腾讯视频的域名,经过实践发现特别容易导致浏览器崩溃,所以加在里面了,继续添加可以使用|分割。
178 | if(/google|btrace/i.test(newOption.headers.host)){
179 | newOption.hostname = "127.0.0.1";//这个ip也可以替换成其他的
180 | newOption.port = "80";
181 | }
182 | return newOption;
183 | },
184 |
185 | replaceRequestData: function(req,data){
186 | },
187 |
188 | replaceResponseStatusCode: function(req,res,statusCode){
189 | },
190 |
191 | replaceResponseHeader: function(req,res,header){
192 |
193 | },
194 |
195 | // Deprecated
196 | // replaceServerResData: function(req,res,serverResData){
197 | // return serverResData;
198 | // },
199 |
200 | replaceServerResDataAsync: function(req,res,serverResData,callback){
201 | callback(serverResData);
202 | },
203 |
204 | pauseBeforeSendingResponse: function(req,res){
205 | },
206 |
207 | shouldInterceptHttpsReq:function(req){
208 | return interceptFlag;
209 | },
210 |
211 | //[beta]
212 | //fetch entire traffic data
213 | fetchTrafficData: function(id,info){},
214 |
215 | setInterceptFlag: function(flag){
216 | interceptFlag = flag && isRootCAFileExists;
217 | },
218 |
219 | _plugIntoWebinterface: function(app,cb){
220 |
221 | app.get("/filetree",function(req,res){
222 | try{
223 | var root = req.query.root || utils.getUserHome() || "/";
224 | utils.filewalker(root,function(err, info){
225 | res.json(info);
226 | });
227 | }catch(e){
228 | res.end(e);
229 | }
230 | });
231 |
232 | app.use(bodyParser.json());
233 | app.get("/getMapConfig",function(req,res){
234 | res.json(mapConfig);
235 | });
236 | app.post("/setMapConfig",function(req,res){
237 | mapConfig = req.body;
238 | res.json(mapConfig);
239 |
240 | saveMapConfig(mapConfig);
241 | });
242 |
243 | cb();
244 | },
245 |
246 | _getCustomMenu : function(){
247 | return [
248 | // {
249 | // name:"test",
250 | // icon:"uk-icon-lemon-o",
251 | // url :"http://anyproxy.io"
252 | // }
253 | ];
254 | }
255 | };
--------------------------------------------------------------------------------
/src/main/java/com/wechat/auto/mapper/WechatPostMapper.java:
--------------------------------------------------------------------------------
1 | package com.wechat.auto.mapper;
2 |
3 | import com.wechat.auto.model.WechatPost;
4 | import org.apache.ibatis.annotations.*;
5 | import org.springframework.stereotype.Repository;
6 |
7 | import java.util.Date;
8 | import java.util.List;
9 |
10 | @Repository
11 | public interface WechatPostMapper {
12 | /**
13 | * This method was generated by MyBatis Generator.
14 | * This method corresponds to the database table postTable
15 | *
16 | * @mbggenerated
17 | */
18 | @Delete({
19 | "delete from postTable",
20 | "where id = #{id,jdbcType=INTEGER}"
21 | })
22 | int deleteByPrimaryKey(Integer id);
23 |
24 | /**
25 | * This method was generated by MyBatis Generator.
26 | * This method corresponds to the database table postTable
27 | *
28 | * @mbggenerated
29 | */
30 | @Insert({
31 | "insert into postTable (id, biz, ",
32 | "appmsgid, title, ",
33 | "digest, contenturl, ",
34 | "sourceurl, cover, ",
35 | "datetime, readnum, ",
36 | "likenum, isspider, ",
37 | "author, nickname, ",
38 | "weight, posttype, ",
39 | "content)",
40 | "values (#{id,jdbcType=INTEGER}, #{biz,jdbcType=VARCHAR}, ",
41 | "#{appmsgid,jdbcType=VARCHAR}, #{title,jdbcType=VARCHAR}, ",
42 | "#{digest,jdbcType=VARCHAR}, #{contenturl,jdbcType=VARCHAR}, ",
43 | "#{sourceurl,jdbcType=VARCHAR}, #{cover,jdbcType=VARCHAR}, ",
44 | "#{datetime,jdbcType=TIMESTAMP}, #{readnum,jdbcType=INTEGER}, ",
45 | "#{likenum,jdbcType=INTEGER}, #{isspider,jdbcType=INTEGER}, ",
46 | "#{author,jdbcType=VARCHAR}, #{nickname,jdbcType=VARCHAR}, ",
47 | "#{weight,jdbcType=INTEGER}, #{posttype,jdbcType=INTEGER}, ",
48 | "#{content,jdbcType=LONGVARCHAR})"
49 | })
50 | int insert(WechatPost record);
51 |
52 | /**
53 | * This method was generated by MyBatis Generator.
54 | * This method corresponds to the database table postTable
55 | *
56 | * @mbggenerated
57 | */
58 | int insertSelective(WechatPost record);
59 |
60 | /**
61 | * This method was generated by MyBatis Generator.
62 | * This method corresponds to the database table postTable
63 | *
64 | * @mbggenerated
65 | */
66 | @Select({
67 | "select",
68 | "id, biz, appmsgid, title, digest, contenturl, sourceurl, cover, datetime, readnum, ",
69 | "likenum, isspider, author, nickname, weight, posttype, content",
70 | "from postTable",
71 | "where id = #{id,jdbcType=INTEGER}"
72 | })
73 | @ResultMap("ResultMapWithBLOBs")
74 | WechatPost selectByPrimaryKey(Integer id);
75 |
76 | /**
77 | * This method was generated by MyBatis Generator.
78 | * This method corresponds to the database table postTable
79 | *
80 | * @mbggenerated
81 | */
82 | int updateByPrimaryKeySelective(WechatPost record);
83 |
84 | /**
85 | * This method was generated by MyBatis Generator.
86 | * This method corresponds to the database table postTable
87 | *
88 | * @mbggenerated
89 | */
90 | @Update({
91 | "update postTable",
92 | "set biz = #{biz,jdbcType=VARCHAR},",
93 | "appmsgid = #{appmsgid,jdbcType=VARCHAR},",
94 | "title = #{title,jdbcType=VARCHAR},",
95 | "digest = #{digest,jdbcType=VARCHAR},",
96 | "contenturl = #{contenturl,jdbcType=VARCHAR},",
97 | "sourceurl = #{sourceurl,jdbcType=VARCHAR},",
98 | "cover = #{cover,jdbcType=VARCHAR},",
99 | "datetime = #{datetime,jdbcType=TIMESTAMP},",
100 | "readnum = #{readnum,jdbcType=INTEGER},",
101 | "likenum = #{likenum,jdbcType=INTEGER},",
102 | "isspider = #{isspider,jdbcType=INTEGER},",
103 | "author = #{author,jdbcType=VARCHAR},",
104 | "nickname = #{nickname,jdbcType=VARCHAR},",
105 | "weight = #{weight,jdbcType=INTEGER},",
106 | "posttype = #{posttype,jdbcType=INTEGER},",
107 | "content = #{content,jdbcType=LONGVARCHAR}",
108 | "where id = #{id,jdbcType=INTEGER}"
109 | })
110 | int updateByPrimaryKeyWithBLOBs(WechatPost record);
111 |
112 | /**
113 | * This method was generated by MyBatis Generator.
114 | * This method corresponds to the database table postTable
115 | *
116 | * @mbggenerated
117 | */
118 | @Update({
119 | "update postTable",
120 | "set biz = #{biz,jdbcType=VARCHAR},",
121 | "appmsgid = #{appmsgid,jdbcType=VARCHAR},",
122 | "title = #{title,jdbcType=VARCHAR},",
123 | "digest = #{digest,jdbcType=VARCHAR},",
124 | "contenturl = #{contenturl,jdbcType=VARCHAR},",
125 | "sourceurl = #{sourceurl,jdbcType=VARCHAR},",
126 | "cover = #{cover,jdbcType=VARCHAR},",
127 | "datetime = #{datetime,jdbcType=TIMESTAMP},",
128 | "readnum = #{readnum,jdbcType=INTEGER},",
129 | "likenum = #{likenum,jdbcType=INTEGER},",
130 | "isspider = #{isspider,jdbcType=INTEGER},",
131 | "author = #{author,jdbcType=VARCHAR},",
132 | "nickname = #{nickname,jdbcType=VARCHAR},",
133 | "weight = #{weight,jdbcType=INTEGER},",
134 | "posttype = #{posttype,jdbcType=INTEGER}",
135 | "where id = #{id,jdbcType=INTEGER}"
136 | })
137 | int updateByPrimaryKey(WechatPost record);
138 |
139 | @Select({
140 | "select",
141 | "id, biz, appmsgid, title, digest, contenturl, sourceurl, cover, datetime, readnum, ",
142 | "likenum, isspider, author, nickname, weight, posttype, content",
143 | "from postTable",
144 | "where posttype = #{posttype,jdbcType=INTEGER}"
145 | })
146 | @ResultMap("ResultMapWithBLOBs")
147 | List getPostByType(@Param("posttype") Integer posttype);
148 |
149 | @Select({
150 | "select",
151 | "id, biz, appmsgid, title, digest, contenturl, sourceurl, cover, datetime, readnum, ",
152 | "likenum, isspider, author, nickname, weight, posttype, content",
153 | "from postTable",
154 | "where posttype = #{posttype,jdbcType=INTEGER} and datetime >=#{datetime,jdbcType=TIMESTAMP}",
155 | "order by weight DESC"
156 | })
157 | @ResultMap("ResultMapWithBLOBs")
158 | List getPostByTypeInWeek(@Param("posttype") Integer posttype, @Param("datetime") Date time);
159 |
160 |
161 | @Select({
162 | "select",
163 | "id, biz, appmsgid, title, digest, contenturl, sourceurl, cover, datetime, readnum, ",
164 | "likenum, isspider, author, nickname, weight, posttype, content",
165 | "from postTable",
166 | "where contenturl = #{contenturl,jdbcType=VARCHAR}"
167 | })
168 | @ResultMap("ResultMapWithBLOBs")
169 | WechatPost getPostByUrl(@Param("contenturl") String contenturl);
170 |
171 | @Select({
172 | "select",
173 | "id, biz, appmsgid, title, digest, contenturl, sourceurl, cover, datetime, readnum, ",
174 | "likenum, isspider, author, nickname, weight, posttype, content",
175 | "from postTable",
176 | "where appmsgid = #{appmsgid,jdbcType=VARCHAR}"
177 | })
178 | @ResultMap("ResultMapWithBLOBs")
179 | WechatPost getPostByAppMsId(@Param("appmsgid") String appmsgid);
180 |
181 | @Select({
182 | "select",
183 | "id, biz, appmsgid, title, digest, contenturl, sourceurl, cover, datetime, readnum, ",
184 | "likenum, isspider, author, nickname, weight, posttype, content",
185 | "from postTable"
186 | })
187 | @ResultMap("ResultMapWithBLOBs")
188 | List getAllPost();
189 |
190 | @Select({
191 | "select",
192 | "id, biz, appmsgid, title, digest, contenturl, sourceurl, cover, datetime, readnum, ",
193 | "likenum, isspider, author, nickname, weight, posttype, content",
194 | "from postTable",
195 | "where isspider = #{isspider,jdbcType=INTEGER}"
196 | })
197 | @ResultMap("ResultMapWithBLOBs")
198 | List getAllUnSpiderPost(@Param("isspider") Integer isspider);
199 |
200 | @Delete({
201 | "delete from postTable"
202 | })
203 | int deleteAllData();
204 |
205 | @Delete({
206 | "delete from postTable",
207 | "where contenturl = #{contenturl,jdbcType=VARCHAR}"
208 | })
209 | int deleteByRequestUrl(@Param("contenturl") String contenturl);
210 |
211 |
212 | @Select({""
223 | })
224 | @ResultMap("ResultMapWithBLOBs")
225 | List getATAPosts(@Param("nickname") List nickname, @Param("datetime") Date time);
226 | }
--------------------------------------------------------------------------------
/src/main/resources/mybatis-mapper/WechatPostMapper.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
58 | id, biz, appmsgid, title, digest, contenturl, sourceurl, cover, datetime, readnum,
59 | likenum, isspider, author, nickname, weight, posttype
60 |
61 |
62 |
66 | content
67 |
68 |
69 |
73 | insert into postTable
74 |
75 |
76 | id,
77 |
78 |
79 | biz,
80 |
81 |
82 | appmsgid,
83 |
84 |
85 | title,
86 |
87 |
88 | digest,
89 |
90 |
91 | contenturl,
92 |
93 |
94 | sourceurl,
95 |
96 |
97 | cover,
98 |
99 |
100 | datetime,
101 |
102 |
103 | readnum,
104 |
105 |
106 | likenum,
107 |
108 |
109 | isspider,
110 |
111 |
112 | author,
113 |
114 |
115 | nickname,
116 |
117 |
118 | weight,
119 |
120 |
121 | posttype,
122 |
123 |
124 | content,
125 |
126 |
127 |
128 |
129 | #{id,jdbcType=INTEGER},
130 |
131 |
132 | #{biz,jdbcType=VARCHAR},
133 |
134 |
135 | #{appmsgid,jdbcType=VARCHAR},
136 |
137 |
138 | #{title,jdbcType=VARCHAR},
139 |
140 |
141 | #{digest,jdbcType=VARCHAR},
142 |
143 |
144 | #{contenturl,jdbcType=VARCHAR},
145 |
146 |
147 | #{sourceurl,jdbcType=VARCHAR},
148 |
149 |
150 | #{cover,jdbcType=VARCHAR},
151 |
152 |
153 | #{datetime,jdbcType=TIMESTAMP},
154 |
155 |
156 | #{readnum,jdbcType=INTEGER},
157 |
158 |
159 | #{likenum,jdbcType=INTEGER},
160 |
161 |
162 | #{isspider,jdbcType=INTEGER},
163 |
164 |
165 | #{author,jdbcType=VARCHAR},
166 |
167 |
168 | #{nickname,jdbcType=VARCHAR},
169 |
170 |
171 | #{weight,jdbcType=INTEGER},
172 |
173 |
174 | #{posttype,jdbcType=INTEGER},
175 |
176 |
177 | #{content,jdbcType=LONGVARCHAR},
178 |
179 |
180 |
181 |
182 |
186 | update postTable
187 |
188 |
189 | biz = #{biz,jdbcType=VARCHAR},
190 |
191 |
192 | appmsgid = #{appmsgid,jdbcType=VARCHAR},
193 |
194 |
195 | title = #{title,jdbcType=VARCHAR},
196 |
197 |
198 | digest = #{digest,jdbcType=VARCHAR},
199 |
200 |
201 | contenturl = #{contenturl,jdbcType=VARCHAR},
202 |
203 |
204 | sourceurl = #{sourceurl,jdbcType=VARCHAR},
205 |
206 |
207 | cover = #{cover,jdbcType=VARCHAR},
208 |
209 |
210 | datetime = #{datetime,jdbcType=TIMESTAMP},
211 |
212 |
213 | readnum = #{readnum,jdbcType=INTEGER},
214 |
215 |
216 | likenum = #{likenum,jdbcType=INTEGER},
217 |
218 |
219 | isspider = #{isspider,jdbcType=INTEGER},
220 |
221 |
222 | author = #{author,jdbcType=VARCHAR},
223 |
224 |
225 | nickname = #{nickname,jdbcType=VARCHAR},
226 |
227 |
228 | weight = #{weight,jdbcType=INTEGER},
229 |
230 |
231 | posttype = #{posttype,jdbcType=INTEGER},
232 |
233 |
234 | content = #{content,jdbcType=LONGVARCHAR},
235 |
236 |
237 | where id = #{id,jdbcType=INTEGER}
238 |
239 |
--------------------------------------------------------------------------------
/src/main/java/com/wechat/auto/util/Utils.java:
--------------------------------------------------------------------------------
1 | package com.wechat.auto.util;
2 |
3 | import org.apache.http.util.TextUtils;
4 |
5 | import java.math.BigInteger;
6 | import java.text.ParseException;
7 | import java.text.SimpleDateFormat;
8 | import java.util.Date;
9 | import java.util.List;
10 |
11 | public class Utils {
12 |
13 | public static final String VAR_BIZ = "var appuin = \"";
14 | public static final String VAR_TITLE = "var msg_title = '";
15 | public static final String VAR_DIGEST = "var msg_desc = \"";
16 | public static final String VAR_TIME = "\",i=\"";
17 | public static final String SVR_TIME ="var svr_time = \"";
18 | public static final String VAR_NICKNAME = "var nickname = \"";
19 | public static final String VAR_COVER_URL = "var msg_cdn_url = \"";
20 | public static final String VAR_MSG_SOURCE_URL = "var msg_source_url = '";
21 | public static final String VAR_MSG_LINK = "var msg_link = \"";
22 | public static final String VAR_ARTICLE_TYPE = "var _ori_article_type = \"";
23 | public static final String MSG_ID = "mid=";
24 | public static final String AUTHOR = "";
25 |
26 | public static String[] tagArray = new String[]{"背景" , "前言"};
27 |
28 | public static String[] fieldNews = new String[]{"车云", "InfoQ"};
29 |
30 | public static String[] androidDevelop = new String[]{"Android", "View", "Binder", "C++", "APK", "App", "移动端",
31 | "Gradle", "Fragment", "Java", "JDK", "Gson", "JVM", "Handler"};
32 |
33 | public static String[] cppDevelop = new String[]{"C++"};
34 |
35 | public static String[] techExtend = new String[]{"Node.js", "架构", "算法", "MySQL", "优化实践", "高可用", "React",
36 | "人工智能", "Go", "机器学习", "优化实战", "区块链", "开源", "优化实战","深度学习", "框架"};
37 |
38 | public static final int KEYWORD_TYPE_NEWS = 1;
39 |
40 | public static final int KEYWORD_TYPE_ANDROID = 2;
41 |
42 | public static final int KEYWORD_TYPE_CPP = 3;
43 |
44 | public static final int KEYWORD_TYPE_EXTEND = 4;
45 |
46 | public static String[] ATA_NICK_LIST = new String[]{"机器之心","量子位", "AI前线", "新智元", "36氪", "雷锋网", "InfoQ", "CSDN"};
47 |
48 | /**
49 | * 解析文章类型
50 | *
51 | * @param nickName
52 | * @return 文章类型:1、行业新闻 2、Android开发 3、C++开发 4、技术扩展
53 | */
54 | public static int analysisType(String nickName, String title, String contentTxt){
55 | if("InfoQ".equals(nickName) && title.contains("Q资讯")){
56 | return 1;
57 | }
58 | if("AI前线".equals(nickName) && title.contains("一周热闻")){
59 | return 1;
60 | }
61 | if("车云".equals(nickName)){
62 | return 1;
63 | }
64 |
65 | if("前端之巅".equals(nickName) && title.contains("前端周报")){
66 | return 1;
67 | }
68 |
69 | for (int i=0;i 0){
231 | if(fromIndex -1 > 0){
232 | /**
233 | * 避免在正文内容中出现的标签文本内容导致内容截取错误
234 | */
235 | if(!TextUtils.isEmpty(sourceContent.substring(fromIndex-1, fromIndex))){
236 | return null;
237 | }
238 | }
239 | return sourceContent.substring(fromIndex + tag.length(), getPreferDotIndex(sourceContent, fromIndex) + 1);
240 | }
241 | return null;
242 | }
243 |
244 | /**
245 | * 最多截取三句话
246 | *
247 | * @param sourceContent
248 | * @param fromIndex
249 | * @return
250 | */
251 | public static int getPreferDotIndex(String sourceContent, int fromIndex){
252 | int firstDotIndex = sourceContent.indexOf("。", fromIndex);
253 | int secondDotIndex = sourceContent.indexOf("。", firstDotIndex + 1);
254 | int thirdDotIndex = sourceContent.indexOf("。", secondDotIndex + 1);
255 | if(thirdDotIndex > 0){
256 | return thirdDotIndex;
257 | }
258 | if(secondDotIndex > 0){
259 | return secondDotIndex;
260 | }
261 | return firstDotIndex;
262 | }
263 |
264 | public static String stripHtml(String content) {
265 | if(content == null){
266 | return "";
267 | }
268 | //去掉script
269 | content = content.replaceAll("