├── .gitignore ├── README.md ├── file ├── read.txt └── 对应分类.txt ├── pom.xml └── src ├── main └── java │ └── com │ └── blogchong │ └── webmite │ ├── company │ ├── AnalyUrl.java │ ├── AnalyzeData.java │ ├── CompanyStart.java │ ├── GetCompanyFromZhilian.java │ └── GetUrlFrom360.java │ └── util │ ├── JudgeEmail.java │ ├── MacroDef.java │ └── OptFile.java └── test └── java └── com └── blogchong └── webmite └── AppTest.java /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /.classpath 3 | /.project 4 | /.cache 5 | /.settings 6 | /logs 7 | /.idea 8 | /.file/ot 9 | /.file/qo 10 | /.file/qq 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # webmite 2 | 3 | 网页虫,定向抓取互联网中有价值的数据。 4 | ------------------------------- 5 | 6 | ### 相关信息: 7 | 作者:blogchong 8 | 邮箱:blogchong@163.com //有事可以邮箱 9 | QQ: 874450476 //一起交流技术 10 | storm群:191321336 //国内最大最最活跃的storm群,你找到组织了 11 | [更多资料技术文章,欢迎访问博客虫网站:www.blogchong.com](http://www.blogchong.com)
12 | 13 | ### 项目说明: 14 | 1 以智联招聘为源头,分析网页,抓取中小型公司的名称(<=49人)。 15 | 2 通过名称,以360好搜为依据,进行检索探测,判断该企业是否有企业官网(独立域名)。 16 | 3 筛选出没有企业的企业,从360的检索页面中,查找企业邮箱(包括邮箱正则匹配)。 17 | -------------------------------------------------------------------------------- /file/read.txt: -------------------------------------------------------------------------------- 1 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=210500%3B160400%3B160000%3B160500%3B160200%3B300100%3B160100%3B160600&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1 2 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=180000%3B180100%3B300500%3B300900&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1 3 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=140000%3B140100%3B140200&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1 4 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=200300%3B200302%3B201400%3B201300%3B300300&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1 5 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=120400%3B120200%3B170500%3B170000%3B300700&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1 6 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=201100%3B120800&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1 7 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=121000%3B129900%3B121100%3B121200%3B210600%3B120700%3B121300%3B121500%3B300000&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1 8 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=150000%3B301100&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1 9 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=121400%3B200600%3B200800&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1 10 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=210300%3B200700&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1 11 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=130000%3B120500%3B130100%3B201200&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1 12 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=200100%3B120600&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1 13 | http://sou.zhaopin.com/jobs/searchresult.ashx?in=100000%3B100100%3B990000&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1 -------------------------------------------------------------------------------- /file/对应分类.txt: -------------------------------------------------------------------------------- 1 | 1 IT|通信|电子|互联网 2 | 2 金融业 3 | 3 房地产|建筑业 4 | 4 商业服务 5 | 5 贸易|批发|零售|租赁业 6 | 6 文体教育|工艺美术 7 | 7 生产|加工|制造 8 | 8 交通|运输|物流|仓储 9 | 9 服务业 10 | 10 文化|传媒|娱乐|体育 11 | 11 能源|矿产|环保 12 | 12 政府|非营利性机构 13 | 13 农|林|畜|渔|其他 -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.blogchong 6 | webmite 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | webmite 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | 20 | org.json 21 | json 22 | 20140107 23 | 24 | 25 | 26 | net.sourceforge.htmlunit 27 | htmlunit 28 | 2.15 29 | 30 | 31 | 32 | junit 33 | junit 34 | 3.8.1 35 | test 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /src/main/java/com/blogchong/webmite/company/AnalyUrl.java: -------------------------------------------------------------------------------- 1 | package com.blogchong.webmite.company; 2 | 3 | import java.util.Set; 4 | import org.json.JSONArray; 5 | import org.json.JSONObject; 6 | import com.blogchong.webmite.util.MacroDef; 7 | 8 | /** 9 | * @Author: blogchong 10 | * @Blog: www.blogchong.com 11 | * @Mailbox: blogchong@163.com 12 | * @QQGroup: 191321336 13 | * @Weixin: blogchong 14 | * @CreateTime:2015年1月14日 下午3:09:01 15 | * @Description: 过滤出无企业网站的公司 16 | */ 17 | 18 | public class AnalyUrl { 19 | 20 | @SuppressWarnings("unchecked") 21 | public static JSONObject analyUrl(String url) { 22 | 23 | JSONObject obj_ret = new JSONObject(); 24 | 25 | Set set = GetCompanyFromZhilian.getCompanyFromZl(url); 26 | 27 | for (String str : set) { 28 | 29 | GetUrlFrom360 getUrlFrom360 = new GetUrlFrom360(); 30 | 31 | JSONObject obj = getUrlFrom360.getUrlFrom360(str); 32 | 33 | Set keys = obj.keySet(); 34 | 35 | // 是否有首页的标识 36 | boolean index_flag = false; 37 | // 其他是否为空标识 38 | boolean other_flag = false; 39 | 40 | // 存储非首页网址 41 | JSONArray jar_not = new JSONArray(); 42 | 43 | for (String key : keys) { 44 | 45 | JSONArray jar = obj.getJSONArray(key); 46 | 47 | if (key.equals(MacroDef.INDEX_FLAG) && jar.length() == 0) { 48 | index_flag = true; 49 | } else if (key.equals(MacroDef.OTHER_FLAG) && jar.length() == 0) { 50 | other_flag = true; 51 | } 52 | 53 | for (int i = 0; i < jar.length(); i++) { 54 | 55 | if (key.equals(MacroDef.OTHER_FLAG)) { 56 | jar_not.put(jar.getString(i)); 57 | } 58 | 59 | } 60 | } 61 | 62 | if (index_flag && !other_flag) { 63 | obj_ret.put(str, jar_not); 64 | } 65 | 66 | } 67 | 68 | return obj_ret; 69 | } 70 | 71 | @SuppressWarnings("unchecked") 72 | public static void main(String[] args) { 73 | String url = "http://sou.zhaopin.com/jobs/searchresult.ashx?in=210500%3B160400%3B160000%3B160500%3B160200%3B300100%3B160100%3B160600&jl=%E5%8C%97%E4%BA%AC&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1"; 74 | JSONObject obj = analyUrl(url); 75 | 76 | Set keys = obj.keySet(); 77 | 78 | for (String key : keys) { 79 | System.out.println(key + ":(" + obj.get(key) + ")"); 80 | } 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/com/blogchong/webmite/company/AnalyzeData.java: -------------------------------------------------------------------------------- 1 | package com.blogchong.webmite.company; 2 | 3 | import java.io.IOException; 4 | import java.net.MalformedURLException; 5 | import java.util.regex.Matcher; 6 | import java.util.regex.Pattern; 7 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; 8 | import com.gargoylesoftware.htmlunit.WebClient; 9 | import com.gargoylesoftware.htmlunit.html.HtmlPage; 10 | 11 | /** 12 | * @Author: blogchong 13 | * @Blog: www.blogchong.com 14 | * @Mailbox: blogchong@163.com 15 | * @QQGroup: 191321336 16 | * @Weixin: blogchong 17 | * @CreateTime:2015年1月14日 下午3:00:02 18 | * @Description: 从网页源码中解析邮箱 19 | */ 20 | 21 | public class AnalyzeData { 22 | 23 | // 从源码中分析地址、电话、手机、电子邮箱 24 | public static String analyzeData(String url) { 25 | 26 | WebClient webClient = new WebClient(); 27 | 28 | webClient.getOptions().setCssEnabled(false); 29 | webClient.getOptions().setJavaScriptEnabled(false); 30 | 31 | try { 32 | 33 | HtmlPage page = webClient.getPage(url); 34 | String content = page.asText(); 35 | 36 | String regex_email = "\\s*(\\w+@\\S+\\.\\S*com|cn|net|org|gov|edu|int|biz|info|name|tv|cc|pro|coop|aero|museum|CC|TV|Club).*"; 37 | Matcher mt_email = Pattern.compile(regex_email).matcher(content); 38 | 39 | if (mt_email.find()) { 40 | return mt_email.group(1); 41 | } else { 42 | return null; 43 | } 44 | 45 | } catch (FailingHttpStatusCodeException e) { 46 | // e.printStackTrace(); 47 | } catch (MalformedURLException e) { 48 | // e.printStackTrace(); 49 | } catch (IOException e) { 50 | // e.printStackTrace(); 51 | } finally { 52 | webClient.closeAllWindows(); 53 | } 54 | 55 | return null; 56 | } 57 | 58 | public static void main(String[] args) { 59 | String url = "http://shanghai.myjob.com/company/11-2378094572.html"; 60 | System.out.println(analyzeData(url)); 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/com/blogchong/webmite/company/CompanyStart.java: -------------------------------------------------------------------------------- 1 | package com.blogchong.webmite.company; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import java.util.Set; 6 | 7 | import org.json.JSONArray; 8 | import org.json.JSONObject; 9 | 10 | import com.blogchong.webmite.util.JudgeEmail; 11 | import com.blogchong.webmite.util.OptFile; 12 | 13 | /** 14 | * @Author: blogchong 15 | * @Blog: www.blogchong.com 16 | * @Mailbox: blogchong@163.com 17 | * @QQGroup: 191321336 18 | * @Weixin: blogchong 19 | * @CreateTime:2015年1月14日 下午5:40:20 20 | * @Description: 起始类 21 | */ 22 | 23 | public class CompanyStart { 24 | 25 | @SuppressWarnings("unchecked") 26 | public static void main(String[] args) { 27 | 28 | List listUrl = OptFile 29 | .readFile("D:\\projects\\webmite\\file\\read.txt"); 30 | 31 | // 统计 32 | int count = 1; 33 | 34 | for (String url : listUrl) { 35 | 36 | List listEmail = new ArrayList(); 37 | List listQqEmail = new ArrayList(); 38 | List listQqOtherEmail = new ArrayList(); 39 | 40 | JSONObject obj = AnalyUrl.analyUrl(url); 41 | 42 | Set keys = obj.keySet(); 43 | 44 | for (String key : keys) { 45 | 46 | String companyName = key; 47 | try { 48 | 49 | JSONArray jar_url = obj.getJSONArray(key); 50 | 51 | JSONArray jarOhter = new JSONArray(); 52 | JSONArray jarQq = new JSONArray(); 53 | 54 | for (int i = 0; i < jar_url.length(); i++) { 55 | 56 | String email_tmp = AnalyzeData.analyzeData(jar_url 57 | .getString(i)); 58 | 59 | if (email_tmp != null 60 | && JudgeEmail.judgeEmail(email_tmp)) { 61 | if (JudgeEmail.judgeEmail2(email_tmp)) { 62 | jarQq.put(email_tmp); 63 | } else { 64 | jarOhter.put(email_tmp); 65 | } 66 | } 67 | 68 | } 69 | 70 | if (jarOhter.length() != 0 && jarQq.length() == 0) { 71 | listEmail 72 | .add(companyName + "==>" + jarOhter.toString()); 73 | } else if (jarOhter.length() != 0 && jarQq.length() != 0) { 74 | listQqEmail.add(companyName + "==>" + jarQq.toString()); 75 | listQqOtherEmail.add(companyName + "==>" 76 | + jarOhter.toString()); 77 | } else if (jarOhter.length() == 0 && jarQq.length() != 0) { 78 | listQqEmail.add(companyName + "==>" + jarQq.toString()); 79 | } 80 | 81 | } catch (Exception e) { 82 | } 83 | 84 | } 85 | 86 | // 写入 87 | OptFile.writeFile("D:\\projects\\webmite\\file\\ot\\ot_email_" 88 | + count + ".log", listEmail); 89 | // 写入 90 | OptFile.writeFile("D:\\projects\\webmite\\file\\qq\\qq_email_" 91 | + count + ".log", listQqEmail); 92 | 93 | // 写入 94 | OptFile.writeFile("D:\\projects\\webmite\\file\\qo\\qo_email_" 95 | + count + ".log", listQqOtherEmail); 96 | 97 | count++; 98 | } 99 | 100 | } 101 | 102 | } 103 | -------------------------------------------------------------------------------- /src/main/java/com/blogchong/webmite/company/GetCompanyFromZhilian.java: -------------------------------------------------------------------------------- 1 | package com.blogchong.webmite.company; 2 | 3 | import java.io.IOException; 4 | import java.net.MalformedURLException; 5 | import java.util.HashSet; 6 | import java.util.Set; 7 | import java.util.regex.Matcher; 8 | import java.util.regex.Pattern; 9 | import com.blogchong.webmite.util.MacroDef; 10 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; 11 | import com.gargoylesoftware.htmlunit.WebClient; 12 | import com.gargoylesoftware.htmlunit.html.HtmlPage; 13 | 14 | /** 15 | * @Author: blogchong 16 | * @Blog: www.blogchong.com 17 | * @Mailbox: blogchong@163.com 18 | * @QQGroup: 191321336 19 | * @Weixin: blogchong 20 | * @CreateTime:2015年1月14日 下午1:43:28 21 | * @Description: 从智联招聘中解析出公司名 22 | */ 23 | 24 | public class GetCompanyFromZhilian { 25 | 26 | // 从智联解析公司 27 | public static Set getCompanyFromZl(String urls) { 28 | 29 | Set set = new HashSet(); 30 | 31 | WebClient webClient = new WebClient(); 32 | 33 | webClient.getOptions().setCssEnabled(false); 34 | webClient.getOptions().setJavaScriptEnabled(false); 35 | 36 | String content = null; 37 | 38 | int count = 0; 39 | 40 | String url = null; 41 | 42 | while (true) { 43 | 44 | // 构造url 45 | if (count == 0) { 46 | url = urls; 47 | } else { 48 | // 解析后面页面的url 49 | url = packUrl(urls, count); 50 | } 51 | 52 | // 若是url不符合跳出 53 | if (url == null) { 54 | break; 55 | } 56 | 57 | try { 58 | 59 | HtmlPage page = webClient.getPage(url); 60 | 61 | content = page.asText(); 62 | 63 | // 判断前一页跟后一页是否相同,若相同,说明已经到最后,跳出 64 | if (content.equals(MacroDef.ZL_SRC)) { 65 | break; 66 | } else { 67 | MacroDef.ZL_SRC = content; 68 | count++; 69 | } 70 | 71 | String[] lines = content.split("\n"); 72 | 73 | for (String line : lines) { 74 | 75 | // 匹配该行模板 76 | String regex = "^\\s+\\S+\\s+\\S+\\s+\\S+\\s+.*(\\d{1,2}-\\d{1,2})$"; 77 | 78 | Pattern pattern = Pattern.compile(regex); 79 | Matcher mt = pattern.matcher(line); 80 | 81 | // 首先将不符合基本格式的过滤 82 | if (mt.find()) { 83 | 84 | String note = mt.group(); 85 | 86 | String[] fields = note.split("\t"); 87 | if (fields.length >= 2) { 88 | set.add(fields[1]); 89 | // System.out.println(fields[1]); 90 | } 91 | } 92 | 93 | } 94 | 95 | } catch (FailingHttpStatusCodeException e) { 96 | e.printStackTrace(); 97 | } catch (MalformedURLException e) { 98 | e.printStackTrace(); 99 | } catch (IOException e) { 100 | e.printStackTrace(); 101 | } finally { 102 | webClient.closeAllWindows(); 103 | } 104 | } 105 | 106 | // 最后将标识置为原先默认值 107 | MacroDef.ZL_SRC = MacroDef.ZL_FLAG; 108 | 109 | return set; 110 | } 111 | 112 | // url拆分 113 | public static String packUrl(String url_tmp, int count) { 114 | 115 | String url = null; 116 | 117 | String[] url_tmps = url_tmp.split("sm=0&"); 118 | 119 | if (url_tmps.length >= 2) { 120 | 121 | String[] url_tmps2 = url_tmps[0].split("jl="); 122 | 123 | if (url_tmps2.length == 2) { 124 | url = url_tmps2[0].toLowerCase() 125 | + "jl=" 126 | + url_tmps2[1] 127 | + "sm=0&" 128 | + "sf=0&st=99999&cs=1&isadv=1&sg=a021204b5ef145448301208cd5b38c68&p=" 129 | + (count + 1); 130 | } 131 | } 132 | 133 | return url; 134 | } 135 | 136 | public static void main(String[] args) { 137 | String url = "http://sou.zhaopin.com/jobs/searchresult.ashx?in=140000%3B140100%3B140200%3B201400%3B120200%3B170000%3B120800%3B121100%3B210600%3B120700&jl=%E5%8C%97%E4%BA%AC&sm=0&p=1&sf=0&st=99999&cs=1&isadv=1"; 138 | Set set = GetCompanyFromZhilian.getCompanyFromZl(url); 139 | for (String str : set) { 140 | System.out.println(str); 141 | } 142 | } 143 | 144 | } 145 | -------------------------------------------------------------------------------- /src/main/java/com/blogchong/webmite/company/GetUrlFrom360.java: -------------------------------------------------------------------------------- 1 | package com.blogchong.webmite.company; 2 | 3 | import java.io.IOException; 4 | import java.net.MalformedURLException; 5 | import java.util.List; 6 | import java.util.Set; 7 | import java.util.regex.Matcher; 8 | import java.util.regex.Pattern; 9 | 10 | import org.json.JSONArray; 11 | import org.json.JSONObject; 12 | 13 | import com.blogchong.webmite.util.MacroDef; 14 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; 15 | import com.gargoylesoftware.htmlunit.WebClient; 16 | import com.gargoylesoftware.htmlunit.html.HtmlAnchor; 17 | import com.gargoylesoftware.htmlunit.html.HtmlPage; 18 | 19 | /** 20 | * @Author: blogchong 21 | * @Blog: www.blogchong.com 22 | * @Mailbox: blogchong@163.com 23 | * @QQGroup: 191321336 24 | * @Weixin: blogchong 25 | * @CreateTime:2015年1月14日 下午12:50:17 26 | * @Description:从360搜索解析url 27 | */ 28 | 29 | public class GetUrlFrom360 { 30 | 31 | /* 32 | * 入参为公司名,返回一个json 33 | */ 34 | @SuppressWarnings("deprecation") 35 | public JSONObject getUrlFrom360(String companyName) { 36 | 37 | // JSON格式数据解析对象 38 | JSONObject obj = new JSONObject(); 39 | 40 | String str = java.net.URLEncoder.encode(companyName); 41 | 42 | String url = "http://www.haosou.com/s?ie=utf-8&shb=1&src=360sou_newhome&q=" 43 | + str; 44 | 45 | WebClient webClient = new WebClient(); 46 | 47 | webClient.getOptions().setCssEnabled(false); 48 | webClient.getOptions().setJavaScriptEnabled(false); 49 | 50 | // JSON格式数据解析对象 51 | JSONArray jar_is = new JSONArray(); 52 | JSONArray jar_not = new JSONArray(); 53 | 54 | try { 55 | 56 | HtmlPage page = webClient.getPage(url); 57 | 58 | List list = page.getAnchors(); 59 | 60 | for (HtmlAnchor htmlAnchor : list) { 61 | 62 | String url_tmp = htmlAnchor.getHrefAttribute(); 63 | 64 | // 匹配该行模板 65 | String regex = "http://.*"; 66 | 67 | Pattern pattern = Pattern.compile(regex); 68 | Matcher line = pattern.matcher(url_tmp); 69 | 70 | // 首先将不符合基本格式的过滤 71 | if (line.find()) { 72 | 73 | // 匹配该行模板 74 | String regex2 = "http://.*haoso|360|baidu|google|bing|leidian.*"; 75 | 76 | Pattern pattern2 = Pattern.compile(regex2); 77 | Matcher line2 = pattern2.matcher(line.group()); 78 | 79 | // 将带搜索引擎专有页面过滤 80 | if (!line2.find()) { 81 | 82 | // 匹配该行模板 83 | String regex3 = "http://www\\..*\\.((cn|com|net|org)/)$"; 84 | 85 | Pattern pattern3 = Pattern.compile(regex3); 86 | Matcher line3 = pattern3.matcher(line.group()); 87 | 88 | // 判断是否为首页 89 | if (line3.find()) { 90 | jar_is.put(line.group()); 91 | } else { 92 | jar_not.put(line.group()); 93 | } 94 | 95 | } 96 | } 97 | } 98 | 99 | page.cleanUp(); 100 | 101 | } catch (FailingHttpStatusCodeException e) { 102 | // e.printStackTrace(); 103 | } catch (MalformedURLException e) { 104 | // e.printStackTrace(); 105 | } catch (IOException e) { 106 | // e.printStackTrace(); 107 | } finally { 108 | webClient.closeAllWindows(); 109 | } 110 | 111 | obj.put(MacroDef.INDEX_FLAG, jar_is); 112 | obj.put(MacroDef.OTHER_FLAG, jar_not); 113 | 114 | return obj; 115 | } 116 | 117 | @SuppressWarnings("unchecked") 118 | public static void main(String[] args) { 119 | 120 | GetUrlFrom360 getUrlFrom360 = new GetUrlFrom360(); 121 | 122 | JSONObject obj = getUrlFrom360.getUrlFrom360("北京场道市政工程有限公司"); 123 | 124 | Set keys = obj.keySet(); 125 | 126 | for (String key : keys) { 127 | 128 | JSONArray jar = obj.getJSONArray(key); 129 | 130 | for (int i = 0; i < jar.length(); i++) { 131 | 132 | System.out.println(key + ":" + jar.getString(i)); 133 | 134 | } 135 | } 136 | } 137 | 138 | } 139 | -------------------------------------------------------------------------------- /src/main/java/com/blogchong/webmite/util/JudgeEmail.java: -------------------------------------------------------------------------------- 1 | package com.blogchong.webmite.util; 2 | 3 | import java.util.regex.Matcher; 4 | import java.util.regex.Pattern; 5 | 6 | /** 7 | * @Author: blogchong 8 | * @Blog: www.blogchong.com 9 | * @Mailbox: blogchong@163.com 10 | * @QQGroup: 191321336 11 | * @Weixin: blogchong 12 | * @version: 2015年1月14日 下午8:46:02 13 | * @Des: 判断邮箱格式 14 | */ 15 | 16 | public class JudgeEmail { 17 | 18 | public static boolean judgeEmail(String email) { 19 | 20 | // 匹配该行模板 21 | String regex = "^([a-zA-Z0-9]|[._])+@([a-zA-Z0-9_-])+([a-zA-Z0-9]|[._])+\\.(com|cn|net|org|gov|edu|int|biz|info|name|tv|cc|pro|coop|aero|museum|CC|TV|Club)$"; 22 | 23 | Pattern pattern = Pattern.compile(regex); 24 | Matcher line = pattern.matcher(email); 25 | 26 | if (line.find()) { 27 | return true; 28 | } else { 29 | return false; 30 | } 31 | 32 | } 33 | 34 | // 判断是否为qq邮箱 35 | public static boolean judgeEmail2(String email) { 36 | 37 | // 匹配该行模板 38 | String regex = "^([a-zA-Z0-9]|[._])+@qq\\.com$"; 39 | 40 | Pattern pattern = Pattern.compile(regex); 41 | Matcher line = pattern.matcher(email); 42 | 43 | if (line.find()) { 44 | return true; 45 | } else { 46 | return false; 47 | } 48 | 49 | } 50 | 51 | public static void main(String[] args) { 52 | String email = "dkj_dkdl@126.dd.com"; 53 | if (judgeEmail(email)) { 54 | System.out.println("OK"); 55 | } else { 56 | System.out.println("NO"); 57 | } 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/com/blogchong/webmite/util/MacroDef.java: -------------------------------------------------------------------------------- 1 | package com.blogchong.webmite.util; 2 | 3 | /** 4 | * @Author: blogchong 5 | * @Blog: www.blogchong.com 6 | * @Mailbox: blogchong@163.com 7 | * @QQGroup: 191321336 8 | * @Weixin: blogchong 9 | * @CreateTime:2015年1月14日 下午1:00:57 10 | * @Description:宏定义 11 | */ 12 | 13 | public class MacroDef { 14 | 15 | //首页标识 16 | public static final String INDEX_FLAG = "isHome"; 17 | public static final String OTHER_FLAG = "notHome"; 18 | 19 | //存储上一次智联招聘获取信息的页面源码 20 | public static String ZL_SRC = "hcy"; 21 | public static final String ZL_FLAG = "hcy"; 22 | 23 | //公司属性 24 | public static final String COM_NAME = "name"; 25 | public static final String COM_PHONE = "phone"; 26 | public static final String COM_EMAIL = "email"; 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/com/blogchong/webmite/util/OptFile.java: -------------------------------------------------------------------------------- 1 | package com.blogchong.webmite.util; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.File; 6 | import java.io.FileReader; 7 | import java.io.FileWriter; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | 11 | /** 12 | * @Author: blogchong 13 | * @Blog: www.blogchong.com 14 | * @Mailbox: blogchong@163.com 15 | * @QQGroup: 191321336 16 | * @Weixin: blogchong 17 | * @CreateTime:2015年1月14日 下午1:00:57 18 | * @Description: 生成的文件处理 19 | */ 20 | 21 | public class OptFile { 22 | 23 | // 读文件 24 | public static List readFile(String path) { 25 | List list = new ArrayList(); 26 | 27 | File fd = new File(path); 28 | String line = ""; 29 | try { 30 | 31 | FileReader reader = new FileReader(fd); 32 | BufferedReader br = new BufferedReader(reader); 33 | while ((line = br.readLine()) != null) { 34 | list.add(line); 35 | } 36 | 37 | br.close(); 38 | } catch (Exception e) { 39 | e.printStackTrace(); 40 | } 41 | 42 | return list; 43 | } 44 | 45 | // 写文件 46 | public static void writeFile(String path, List list) { 47 | File fd = new File(path); 48 | try { 49 | 50 | FileWriter writer = new FileWriter(fd); 51 | BufferedWriter bw = new BufferedWriter(writer); 52 | 53 | for (String line : list) { 54 | bw.write(line); 55 | bw.newLine(); 56 | } 57 | 58 | bw.close(); 59 | 60 | } catch (Exception e) { 61 | e.printStackTrace(); 62 | } 63 | } 64 | 65 | public static void main(String[] args) { 66 | List list = readFile("E:\\Java_Web\\projects\\webmite\\file\\read.txt"); 67 | writeFile("E:\\Java_Web\\projects\\webmite\\file\\email.txt", list); 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/test/java/com/blogchong/webmite/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.blogchong.webmite; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | --------------------------------------------------------------------------------