├── .gitignore
├── README.md
├── file
├── read.txt
└── 对应分类.txt
├── pom.xml
└── src
├── main
└── java
│ └── com
│ └── blogchong
│ └── webmite
│ ├── company
│ ├── AnalyUrl.java
│ ├── AnalyzeData.java
│ ├── CompanyStart.java
│ ├── GetCompanyFromZhilian.java
│ └── GetUrlFrom360.java
│ └── util
│ ├── JudgeEmail.java
│ ├── MacroDef.java
│ └── OptFile.java
└── test
└── java
└── com
└── blogchong
└── webmite
└── AppTest.java
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | /.classpath
3 | /.project
4 | /.cache
5 | /.settings
6 | /logs
7 | /.idea
8 | /.file/ot
9 | /.file/qo
10 | /.file/qq
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # webmite
2 |
3 | 网页虫,定向抓取互联网中有价值的数据。
4 | -------------------------------
5 |
6 | ### 相关信息:
7 | 作者:blogchong
8 | 邮箱:blogchong@163.com //有事可以邮箱
9 | QQ: 874450476 //一起交流技术
10 | storm群:191321336 //国内最大最最活跃的storm群,你找到组织了
11 | [更多资料技术文章,欢迎访问博客虫网站:www.blogchong.com](http://www.blogchong.com)
12 |
13 | ### 项目说明:
14 | 1 以智联招聘为源头,分析网页,抓取中小型公司的名称(<=49人)。
15 | 2 通过名称,以360好搜为依据,进行检索探测,判断该企业是否有企业官网(独立域名)。
16 | 3 筛选出没有企业官网的企业,从360的检索页面中,查找企业邮箱(包括邮箱正则匹配)。
17 |
--------------------------------------------------------------------------------
/file/read.txt:
--------------------------------------------------------------------------------
1 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=210500%3B160400%3B160000%3B160500%3B160200%3B300100%3B160100%3B160600&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1
2 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=180000%3B180100%3B300500%3B300900&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1
3 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=140000%3B140100%3B140200&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1
4 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=200300%3B200302%3B201400%3B201300%3B300300&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1
5 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=120400%3B120200%3B170500%3B170000%3B300700&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1
6 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=201100%3B120800&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1
7 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=121000%3B129900%3B121100%3B121200%3B210600%3B120700%3B121300%3B121500%3B300000&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1
8 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=150000%3B301100&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1
9 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=121400%3B200600%3B200800&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1
10 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=210300%3B200700&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1
11 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=130000%3B120500%3B130100%3B201200&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1
12 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=200100%3B120600&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1
13 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=100000%3B100100%3B990000&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1
--------------------------------------------------------------------------------
/file/对应分类.txt:
--------------------------------------------------------------------------------
1 | 1 IT|通信|电子|互联网
2 | 2 金融业
3 | 3 房地产|建筑业
4 | 4 商业服务
5 | 5 贸易|批发|零售|租赁业
6 | 6 文体教育|工艺美术
7 | 7 生产|加工|制造
8 | 8 交通|运输|物流|仓储
9 | 9 服务业
10 | 10 文化|传媒|娱乐|体育
11 | 11 能源|矿产|环保
12 | 12 政府|非营利性机构
13 | 13 农|林|畜|渔|其他
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.blogchong
6 | webmite
7 | 0.0.1-SNAPSHOT
8 | jar
9 |
10 | webmite
11 | http://maven.apache.org
12 |
13 |
14 | UTF-8
15 |
16 |
17 |
18 |
19 |
20 | org.json
21 | json
22 | 20140107
23 |
24 |
25 |
26 | net.sourceforge.htmlunit
27 | htmlunit
28 | 2.15
29 |
30 |
31 |
32 | junit
33 | junit
34 | 3.8.1
35 | test
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/src/main/java/com/blogchong/webmite/company/AnalyUrl.java:
--------------------------------------------------------------------------------
1 | package com.blogchong.webmite.company;
2 |
3 | import java.util.Set;
4 | import org.json.JSONArray;
5 | import org.json.JSONObject;
6 | import com.blogchong.webmite.util.MacroDef;
7 |
8 | /**
9 | * @Author: blogchong
10 | * @Blog: www.blogchong.com
11 | * @Mailbox: blogchong@163.com
12 | * @QQGroup: 191321336
13 | * @Weixin: blogchong
14 | * @CreateTime:2015年1月14日 下午3:09:01
15 | * @Description: 过滤出无企业网站的公司
16 | */
17 |
18 | public class AnalyUrl {
19 |
20 | @SuppressWarnings("unchecked")
21 | public static JSONObject analyUrl(String url) {
22 |
23 | JSONObject obj_ret = new JSONObject();
24 |
25 | Set set = GetCompanyFromZhilian.getCompanyFromZl(url);
26 |
27 | for (String str : set) {
28 |
29 | GetUrlFrom360 getUrlFrom360 = new GetUrlFrom360();
30 |
31 | JSONObject obj = getUrlFrom360.getUrlFrom360(str);
32 |
33 | Set keys = obj.keySet();
34 |
35 | // 是否有首页的标识
36 | boolean index_flag = false;
37 | // 其他是否为空标识
38 | boolean other_flag = false;
39 |
40 | // 存储非首页网址
41 | JSONArray jar_not = new JSONArray();
42 |
43 | for (String key : keys) {
44 |
45 | JSONArray jar = obj.getJSONArray(key);
46 |
47 | if (key.equals(MacroDef.INDEX_FLAG) && jar.length() == 0) {
48 | index_flag = true;
49 | } else if (key.equals(MacroDef.OTHER_FLAG) && jar.length() == 0) {
50 | other_flag = true;
51 | }
52 |
53 | for (int i = 0; i < jar.length(); i++) {
54 |
55 | if (key.equals(MacroDef.OTHER_FLAG)) {
56 | jar_not.put(jar.getString(i));
57 | }
58 |
59 | }
60 | }
61 |
62 | if (index_flag && !other_flag) {
63 | obj_ret.put(str, jar_not);
64 | }
65 |
66 | }
67 |
68 | return obj_ret;
69 | }
70 |
71 | @SuppressWarnings("unchecked")
72 | public static void main(String[] args) {
73 | String url = "http://sou.zhaopin.com/jobs/searchresult.ashx?in=210500%3B160400%3B160000%3B160500%3B160200%3B300100%3B160100%3B160600&jl=%E5%8C%97%E4%BA%AC&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1";
74 | JSONObject obj = analyUrl(url);
75 |
76 | Set keys = obj.keySet();
77 |
78 | for (String key : keys) {
79 | System.out.println(key + ":(" + obj.get(key) + ")");
80 | }
81 | }
82 |
83 | }
84 |
--------------------------------------------------------------------------------
/src/main/java/com/blogchong/webmite/company/AnalyzeData.java:
--------------------------------------------------------------------------------
1 | package com.blogchong.webmite.company;
2 |
3 | import java.io.IOException;
4 | import java.net.MalformedURLException;
5 | import java.util.regex.Matcher;
6 | import java.util.regex.Pattern;
7 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
8 | import com.gargoylesoftware.htmlunit.WebClient;
9 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
10 |
11 | /**
12 | * @Author: blogchong
13 | * @Blog: www.blogchong.com
14 | * @Mailbox: blogchong@163.com
15 | * @QQGroup: 191321336
16 | * @Weixin: blogchong
17 | * @CreateTime:2015年1月14日 下午3:00:02
18 | * @Description: 从网页源码中解析邮箱
19 | */
20 |
21 | public class AnalyzeData {
22 |
23 | // 从源码中分析地址、电话、手机、电子邮箱
24 | public static String analyzeData(String url) {
25 |
26 | WebClient webClient = new WebClient();
27 |
28 | webClient.getOptions().setCssEnabled(false);
29 | webClient.getOptions().setJavaScriptEnabled(false);
30 |
31 | try {
32 |
33 | HtmlPage page = webClient.getPage(url);
34 | String content = page.asText();
35 |
36 | String regex_email = "\\s*(\\w+@\\S+\\.\\S*com|cn|net|org|gov|edu|int|biz|info|name|tv|cc|pro|coop|aero|museum|CC|TV|Club).*";
37 | Matcher mt_email = Pattern.compile(regex_email).matcher(content);
38 |
39 | if (mt_email.find()) {
40 | return mt_email.group(1);
41 | } else {
42 | return null;
43 | }
44 |
45 | } catch (FailingHttpStatusCodeException e) {
46 | // e.printStackTrace();
47 | } catch (MalformedURLException e) {
48 | // e.printStackTrace();
49 | } catch (IOException e) {
50 | // e.printStackTrace();
51 | } finally {
52 | webClient.closeAllWindows();
53 | }
54 |
55 | return null;
56 | }
57 |
58 | public static void main(String[] args) {
59 | String url = "http://shanghai.myjob.com/company/11-2378094572.html";
60 | System.out.println(analyzeData(url));
61 | }
62 |
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/java/com/blogchong/webmite/company/CompanyStart.java:
--------------------------------------------------------------------------------
1 | package com.blogchong.webmite.company;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 | import java.util.Set;
6 |
7 | import org.json.JSONArray;
8 | import org.json.JSONObject;
9 |
10 | import com.blogchong.webmite.util.JudgeEmail;
11 | import com.blogchong.webmite.util.OptFile;
12 |
13 | /**
14 | * @Author: blogchong
15 | * @Blog: www.blogchong.com
16 | * @Mailbox: blogchong@163.com
17 | * @QQGroup: 191321336
18 | * @Weixin: blogchong
19 | * @CreateTime:2015年1月14日 下午5:40:20
20 | * @Description: 起始类
21 | */
22 |
23 | public class CompanyStart {
24 |
25 | @SuppressWarnings("unchecked")
26 | public static void main(String[] args) {
27 |
28 | List listUrl = OptFile
29 | .readFile("D:\\projects\\webmite\\file\\read.txt");
30 |
31 | // 统计
32 | int count = 1;
33 |
34 | for (String url : listUrl) {
35 |
36 | List listEmail = new ArrayList();
37 | List listQqEmail = new ArrayList();
38 | List listQqOtherEmail = new ArrayList();
39 |
40 | JSONObject obj = AnalyUrl.analyUrl(url);
41 |
42 | Set keys = obj.keySet();
43 |
44 | for (String key : keys) {
45 |
46 | String companyName = key;
47 | try {
48 |
49 | JSONArray jar_url = obj.getJSONArray(key);
50 |
51 | JSONArray jarOhter = new JSONArray();
52 | JSONArray jarQq = new JSONArray();
53 |
54 | for (int i = 0; i < jar_url.length(); i++) {
55 |
56 | String email_tmp = AnalyzeData.analyzeData(jar_url
57 | .getString(i));
58 |
59 | if (email_tmp != null
60 | && JudgeEmail.judgeEmail(email_tmp)) {
61 | if (JudgeEmail.judgeEmail2(email_tmp)) {
62 | jarQq.put(email_tmp);
63 | } else {
64 | jarOhter.put(email_tmp);
65 | }
66 | }
67 |
68 | }
69 |
70 | if (jarOhter.length() != 0 && jarQq.length() == 0) {
71 | listEmail
72 | .add(companyName + "==>" + jarOhter.toString());
73 | } else if (jarOhter.length() != 0 && jarQq.length() != 0) {
74 | listQqEmail.add(companyName + "==>" + jarQq.toString());
75 | listQqOtherEmail.add(companyName + "==>"
76 | + jarOhter.toString());
77 | } else if (jarOhter.length() == 0 && jarQq.length() != 0) {
78 | listQqEmail.add(companyName + "==>" + jarQq.toString());
79 | }
80 |
81 | } catch (Exception e) {
82 | }
83 |
84 | }
85 |
86 | // 写入
87 | OptFile.writeFile("D:\\projects\\webmite\\file\\ot\\ot_email_"
88 | + count + ".log", listEmail);
89 | // 写入
90 | OptFile.writeFile("D:\\projects\\webmite\\file\\qq\\qq_email_"
91 | + count + ".log", listQqEmail);
92 |
93 | // 写入
94 | OptFile.writeFile("D:\\projects\\webmite\\file\\qo\\qo_email_"
95 | + count + ".log", listQqOtherEmail);
96 |
97 | count++;
98 | }
99 |
100 | }
101 |
102 | }
103 |
--------------------------------------------------------------------------------
/src/main/java/com/blogchong/webmite/company/GetCompanyFromZhilian.java:
--------------------------------------------------------------------------------
1 | package com.blogchong.webmite.company;
2 |
3 | import java.io.IOException;
4 | import java.net.MalformedURLException;
5 | import java.util.HashSet;
6 | import java.util.Set;
7 | import java.util.regex.Matcher;
8 | import java.util.regex.Pattern;
9 | import com.blogchong.webmite.util.MacroDef;
10 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
11 | import com.gargoylesoftware.htmlunit.WebClient;
12 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
13 |
14 | /**
15 | * @Author: blogchong
16 | * @Blog: www.blogchong.com
17 | * @Mailbox: blogchong@163.com
18 | * @QQGroup: 191321336
19 | * @Weixin: blogchong
20 | * @CreateTime:2015年1月14日 下午1:43:28
21 | * @Description: 从智联招聘中解析出公司名
22 | */
23 |
24 | public class GetCompanyFromZhilian {
25 |
26 | // 从智联解析公司
27 | public static Set getCompanyFromZl(String urls) {
28 |
29 | Set set = new HashSet();
30 |
31 | WebClient webClient = new WebClient();
32 |
33 | webClient.getOptions().setCssEnabled(false);
34 | webClient.getOptions().setJavaScriptEnabled(false);
35 |
36 | String content = null;
37 |
38 | int count = 0;
39 |
40 | String url = null;
41 |
42 | while (true) {
43 |
44 | // 构造url
45 | if (count == 0) {
46 | url = urls;
47 | } else {
48 | // 解析后面页面的url
49 | url = packUrl(urls, count);
50 | }
51 |
52 | // 若是url不符合跳出
53 | if (url == null) {
54 | break;
55 | }
56 |
57 | try {
58 |
59 | HtmlPage page = webClient.getPage(url);
60 |
61 | content = page.asText();
62 |
63 | // 判断前一页跟后一页是否相同,若相同,说明已经到最后,跳出
64 | if (content.equals(MacroDef.ZL_SRC)) {
65 | break;
66 | } else {
67 | MacroDef.ZL_SRC = content;
68 | count++;
69 | }
70 |
71 | String[] lines = content.split("\n");
72 |
73 | for (String line : lines) {
74 |
75 | // 匹配该行模板
76 | String regex = "^\\s+\\S+\\s+\\S+\\s+\\S+\\s+.*(\\d{1,2}-\\d{1,2})$";
77 |
78 | Pattern pattern = Pattern.compile(regex);
79 | Matcher mt = pattern.matcher(line);
80 |
81 | // 首先将不符合基本格式的过滤
82 | if (mt.find()) {
83 |
84 | String note = mt.group();
85 |
86 | String[] fields = note.split("\t");
87 | if (fields.length >= 2) {
88 | set.add(fields[1]);
89 | // System.out.println(fields[1]);
90 | }
91 | }
92 |
93 | }
94 |
95 | } catch (FailingHttpStatusCodeException e) {
96 | e.printStackTrace();
97 | } catch (MalformedURLException e) {
98 | e.printStackTrace();
99 | } catch (IOException e) {
100 | e.printStackTrace();
101 | } finally {
102 | webClient.closeAllWindows();
103 | }
104 | }
105 |
106 | // 最后将标识置为原先默认值
107 | MacroDef.ZL_SRC = MacroDef.ZL_FLAG;
108 |
109 | return set;
110 | }
111 |
112 | // url拆分
113 | public static String packUrl(String url_tmp, int count) {
114 |
115 | String url = null;
116 |
117 | String[] url_tmps = url_tmp.split("sm=0&");
118 |
119 | if (url_tmps.length >= 2) {
120 |
121 | String[] url_tmps2 = url_tmps[0].split("jl=");
122 |
123 | if (url_tmps2.length == 2) {
124 | url = url_tmps2[0].toLowerCase()
125 | + "jl="
126 | + url_tmps2[1]
127 | + "sm=0&"
128 | + "sf=0&st=99999&cs=1&isadv=1&sg=a021204b5ef145448301208cd5b38c68&p="
129 | + (count + 1);
130 | }
131 | }
132 |
133 | return url;
134 | }
135 |
136 | public static void main(String[] args) {
137 | String url = "http://sou.zhaopin.com/jobs/searchresult.ashx?in=140000%3B140100%3B140200%3B201400%3B120200%3B170000%3B120800%3B121100%3B210600%3B120700&jl=%E5%8C%97%E4%BA%AC&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1";
138 | Set set = GetCompanyFromZhilian.getCompanyFromZl(url);
139 | for (String str : set) {
140 | System.out.println(str);
141 | }
142 | }
143 |
144 | }
145 |
--------------------------------------------------------------------------------
/src/main/java/com/blogchong/webmite/company/GetUrlFrom360.java:
--------------------------------------------------------------------------------
1 | package com.blogchong.webmite.company;
2 |
3 | import java.io.IOException;
4 | import java.net.MalformedURLException;
5 | import java.util.List;
6 | import java.util.Set;
7 | import java.util.regex.Matcher;
8 | import java.util.regex.Pattern;
9 |
10 | import org.json.JSONArray;
11 | import org.json.JSONObject;
12 |
13 | import com.blogchong.webmite.util.MacroDef;
14 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
15 | import com.gargoylesoftware.htmlunit.WebClient;
16 | import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
17 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
18 |
19 | /**
20 | * @Author: blogchong
21 | * @Blog: www.blogchong.com
22 | * @Mailbox: blogchong@163.com
23 | * @QQGroup: 191321336
24 | * @Weixin: blogchong
25 | * @CreateTime:2015年1月14日 下午12:50:17
26 | * @Description:从360搜索解析url
27 | */
28 |
29 | public class GetUrlFrom360 {
30 |
31 | /*
32 | * 入参为公司名,返回一个json
33 | */
34 | @SuppressWarnings("deprecation")
35 | public JSONObject getUrlFrom360(String companyName) {
36 |
37 | // JSON格式数据解析对象
38 | JSONObject obj = new JSONObject();
39 |
40 | String str = java.net.URLEncoder.encode(companyName);
41 |
42 | String url = "http://www.haosou.com/s?ie=utf-8&shb=1&src=360sou_newhome&q="
43 | + str;
44 |
45 | WebClient webClient = new WebClient();
46 |
47 | webClient.getOptions().setCssEnabled(false);
48 | webClient.getOptions().setJavaScriptEnabled(false);
49 |
50 | // JSON格式数据解析对象
51 | JSONArray jar_is = new JSONArray();
52 | JSONArray jar_not = new JSONArray();
53 |
54 | try {
55 |
56 | HtmlPage page = webClient.getPage(url);
57 |
58 | List list = page.getAnchors();
59 |
60 | for (HtmlAnchor htmlAnchor : list) {
61 |
62 | String url_tmp = htmlAnchor.getHrefAttribute();
63 |
64 | // 匹配该行模板
65 | String regex = "http://.*";
66 |
67 | Pattern pattern = Pattern.compile(regex);
68 | Matcher line = pattern.matcher(url_tmp);
69 |
70 | // 首先将不符合基本格式的过滤
71 | if (line.find()) {
72 |
73 | // 匹配该行模板
74 | String regex2 = "http://.*haoso|360|baidu|google|bing|leidian.*";
75 |
76 | Pattern pattern2 = Pattern.compile(regex2);
77 | Matcher line2 = pattern2.matcher(line.group());
78 |
79 | // 将带搜索引擎专有页面过滤
80 | if (!line2.find()) {
81 |
82 | // 匹配该行模板
83 | String regex3 = "http://www\\..*\\.((cn|com|net|org)/)$";
84 |
85 | Pattern pattern3 = Pattern.compile(regex3);
86 | Matcher line3 = pattern3.matcher(line.group());
87 |
88 | // 判断是否为首页
89 | if (line3.find()) {
90 | jar_is.put(line.group());
91 | } else {
92 | jar_not.put(line.group());
93 | }
94 |
95 | }
96 | }
97 | }
98 |
99 | page.cleanUp();
100 |
101 | } catch (FailingHttpStatusCodeException e) {
102 | // e.printStackTrace();
103 | } catch (MalformedURLException e) {
104 | // e.printStackTrace();
105 | } catch (IOException e) {
106 | // e.printStackTrace();
107 | } finally {
108 | webClient.closeAllWindows();
109 | }
110 |
111 | obj.put(MacroDef.INDEX_FLAG, jar_is);
112 | obj.put(MacroDef.OTHER_FLAG, jar_not);
113 |
114 | return obj;
115 | }
116 |
117 | @SuppressWarnings("unchecked")
118 | public static void main(String[] args) {
119 |
120 | GetUrlFrom360 getUrlFrom360 = new GetUrlFrom360();
121 |
122 | JSONObject obj = getUrlFrom360.getUrlFrom360("北京场道市政工程有限公司");
123 |
124 | Set keys = obj.keySet();
125 |
126 | for (String key : keys) {
127 |
128 | JSONArray jar = obj.getJSONArray(key);
129 |
130 | for (int i = 0; i < jar.length(); i++) {
131 |
132 | System.out.println(key + ":" + jar.getString(i));
133 |
134 | }
135 | }
136 | }
137 |
138 | }
139 |
--------------------------------------------------------------------------------
/src/main/java/com/blogchong/webmite/util/JudgeEmail.java:
--------------------------------------------------------------------------------
1 | package com.blogchong.webmite.util;
2 |
3 | import java.util.regex.Matcher;
4 | import java.util.regex.Pattern;
5 |
6 | /**
7 | * @Author: blogchong
8 | * @Blog: www.blogchong.com
9 | * @Mailbox: blogchong@163.com
10 | * @QQGroup: 191321336
11 | * @Weixin: blogchong
12 | * @version: 2015年1月14日 下午8:46:02
13 | * @Des: 判断邮箱格式
14 | */
15 |
public class JudgeEmail {

	/**
	 * General address shape: local part of letters, digits, '.' and '_'; a
	 * domain of at least two characters; one of the listed suffixes. The
	 * character classes accept exactly the same strings as the former
	 * "(x|y)+" groups.
	 */
	private static final Pattern EMAIL_PATTERN = Pattern
			.compile("^[a-zA-Z0-9._]+@[a-zA-Z0-9_-]+[a-zA-Z0-9._]+\\.(com|cn|net|org|gov|edu|int|biz|info|name|tv|cc|pro|coop|aero|museum|CC|TV|Club)$");

	/** Same local part, with the domain fixed to qq.com. */
	private static final Pattern QQ_EMAIL_PATTERN = Pattern
			.compile("^[a-zA-Z0-9._]+@qq\\.com$");

	/**
	 * Tells whether a string has the shape of an accepted e-mail address.
	 *
	 * @param email candidate string; may be {@code null}
	 * @return {@code true} when it matches the accepted shape, {@code false}
	 *         otherwise, including for {@code null}
	 */
	public static boolean judgeEmail(String email) {
		return email != null && EMAIL_PATTERN.matcher(email).find();
	}

	/**
	 * Tells whether a string is a qq.com e-mail address.
	 *
	 * @param email candidate string; may be {@code null}
	 * @return {@code true} for an address at qq.com, {@code false} otherwise,
	 *         including for {@code null}
	 */
	public static boolean judgeEmail2(String email) {
		return email != null && QQ_EMAIL_PATTERN.matcher(email).find();
	}

	/**
	 * Manual smoke test: prints OK or NO for one hard-coded address.
	 */
	public static void main(String[] args) {
		String email = "dkj_dkdl@126.dd.com";
		if (judgeEmail(email)) {
			System.out.println("OK");
		} else {
			System.out.println("NO");
		}
	}

}
61 |
--------------------------------------------------------------------------------
/src/main/java/com/blogchong/webmite/util/MacroDef.java:
--------------------------------------------------------------------------------
1 | package com.blogchong.webmite.util;
2 |
3 | /**
4 | * @Author: blogchong
5 | * @Blog: www.blogchong.com
6 | * @Mailbox: blogchong@163.com
7 | * @QQGroup: 191321336
8 | * @Weixin: blogchong
9 | * @CreateTime:2015年1月14日 下午1:00:57
10 | * @Description:宏定义
11 | */
12 |
/**
 * Shared constants. Not meant to be instantiated or subclassed.
 */
public final class MacroDef {

	// Keys of the two URL arrays in a search result: home-page links and all
	// other links.
	public static final String INDEX_FLAG = "isHome";
	public static final String OTHER_FLAG = "notHome";

	// Text of the Zhilian page fetched last. Mutable shared state that starts
	// out equal to ZL_FLAG.
	public static String ZL_SRC = "hcy";
	// Sentinel meaning "no page fetched yet".
	public static final String ZL_FLAG = "hcy";

	// Names of company attributes.
	public static final String COM_NAME = "name";
	public static final String COM_PHONE = "phone";
	public static final String COM_EMAIL = "email";

	/** Constants holder: no instances. */
	private MacroDef() {
	}

}
29 |
--------------------------------------------------------------------------------
/src/main/java/com/blogchong/webmite/util/OptFile.java:
--------------------------------------------------------------------------------
1 | package com.blogchong.webmite.util;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.BufferedWriter;
5 | import java.io.File;
6 | import java.io.FileReader;
7 | import java.io.FileWriter;
8 | import java.util.ArrayList;
9 | import java.util.List;
10 |
11 | /**
12 | * @Author: blogchong
13 | * @Blog: www.blogchong.com
14 | * @Mailbox: blogchong@163.com
15 | * @QQGroup: 191321336
16 | * @Weixin: blogchong
17 | * @CreateTime:2015年1月14日 下午1:00:57
18 | * @Description: 生成的文件处理
19 | */
20 |
public class OptFile {

	/**
	 * Reads a text file into a list, one entry per line, using the platform
	 * default charset.
	 *
	 * @param path location of the file to read
	 * @return the lines read so far; on any failure the stack trace is printed
	 *         and whatever was read before the failure is returned (possibly
	 *         an empty list)
	 */
	public static List<String> readFile(String path) {
		List<String> lines = new ArrayList<String>();

		BufferedReader br = null;
		try {

			br = new BufferedReader(new FileReader(new File(path)));

			String line;
			while ((line = br.readLine()) != null) {
				lines.add(line);
			}

		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Close even when reading failed half-way.
			close(br);
		}

		return lines;
	}

	/**
	 * Writes the given lines to a file, replacing any previous content and
	 * ending every line with the platform line separator. Uses the platform
	 * default charset. Failures are reported by printing the stack trace.
	 *
	 * @param path location of the file to write
	 * @param list lines to write, in order
	 */
	public static void writeFile(String path, List<String> list) {

		BufferedWriter bw = null;
		try {

			bw = new BufferedWriter(new FileWriter(new File(path)));

			for (String line : list) {
				bw.write(line);
				bw.newLine();
			}

		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Closing also flushes the buffer, so a failure here is reported.
			close(bw);
		}
	}

	/** Closes a stream if it was opened, printing any failure. */
	private static void close(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Manual smoke test: copies one hard-coded file to another.
	 */
	public static void main(String[] args) {
		List<String> list = readFile("E:\\Java_Web\\projects\\webmite\\file\\read.txt");
		writeFile("E:\\Java_Web\\projects\\webmite\\file\\email.txt", list);
	}

}
71 |
--------------------------------------------------------------------------------
/src/test/java/com/blogchong/webmite/AppTest.java:
--------------------------------------------------------------------------------
1 | package com.blogchong.webmite;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Unit test for simple App.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Create the test case
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return the suite of tests being tested
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Rigourous Test :-)
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------