Nickname | Gender | Age | Height (cm) | Weight (kg) | Income | Marital status | Address
---|---|---|---|---|---|---|---
{{.Payload.Name}} | {{with .Payload}}{{.Gender}} | {{.Age}} | {{.Height}} | {{.Weight}} | {{.Income}} | {{.Marriage}} | {{.Address}}{{end}}
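The row above uses Go text/template syntax ({{.Payload.Name}}, {{with .Payload}} … {{end}}) to fill the table from a crawled item. The sketch below shows how such a row could be rendered; the Profile and Item types here are stand-ins for the crawler's actual payload struct and engine.Item, which are not part of this excerpt.

```go
package main

import (
	"os"
	"text/template"
)

// Profile mirrors the fields referenced in the table above; the real payload
// struct used by the crawler is not shown here, so these names are assumptions.
type Profile struct {
	Name, Gender, Age, Height, Weight, Income, Marriage, Address string
}

// Item is a minimal stand-in for the engine's item type with a Payload field.
type Item struct {
	Payload Profile
}

// row repeats the data row of the table above as a template string.
const row = "{{.Payload.Name}} | {{with .Payload}}{{.Gender}} | {{.Age}} | " +
	"{{.Height}} | {{.Weight}} | {{.Income}} | {{.Marriage}} | {{.Address}}{{end}}\n"

func main() {
	t := template.Must(template.New("row").Parse(row))
	item := Item{Payload: Profile{
		Name: "example", Gender: "female", Age: "30", Height: "165",
		Weight: "50", Income: "8k-10k", Marriage: "single", Address: "Jinan",
	}}
	if err := t.Execute(os.Stdout, item); err != nil {
		panic(err)
	}
}
```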
([^<]+)([^<]+)([^<]+)`)

var positionUrl = regexp.MustCompile(`([^<]+)`)

// Matches the whole HTML block of company information for a listing
var positionRe = regexp.MustCompile(
	`target="_blank">[^<]+([^p]+)`)

// Used to count how many data fields the company block contains
var isIntegrityRe = regexp.MustCompile(``)

// Match used when the data is complete
var integrityRe = regexp.MustCompile(
	`([^<]+)([^<]+)([^<]+)`)

// Match used when the data is incomplete (financing info missing)
var unIntegrityRe = regexp.MustCompile(
	`([^<]+)([^<]+)`)

// Regex for the next-page link
var nextPageRe = regexp.MustCompile(
	``)

// Parse the job listings of each area
func ParsePositionList(bytes []byte) engine.ParseResult {

	//defer func() {
	//	err := recover()
	//	if err, ok := err.(error); ok {
	//		log.Println("Error occurred:", err)
	//	} else {
	//		panic(fmt.Sprintf("I don't know what to do %v", err))
	//	}
	//}()

	content := util.RemoveSpace(bytes)
	position := model.Position{}

	nameMatch := nameRe.FindAllSubmatch(content, -1)
	paymentMatch := paymentRe.FindAllSubmatch(content, -1)
	requireMatch := requireRe.FindAllSubmatch(content, -1)
	positionUrl := positionUrl.FindAllSubmatch(content, -1)
	fimeNameMatch := fimeNameRe.FindAllSubmatch(content, -1)

	submatch := positionRe.FindAllSubmatch(content, -1)
	nextPageUrl := nextPageRe.FindSubmatch(content) // parse the next-page URL

	items := []engine.Item{}
	requests := []engine.Request{}

	if len(nameMatch) == len(requireMatch) {
		for k, item := range nameMatch {
			position.Name = string(item[1])
			position.Payment = string(paymentMatch[k][1])
			position.Address = string(requireMatch[k][1])
			position.Experience = string(requireMatch[k][2])
			position.Education = string(requireMatch[k][3])
			position.PosiUrl = "http://www.zhipin.com" + string(positionUrl[k][1])

			position.FimeUrl = "http://www.zhipin.com" + string(fimeNameMatch[k][1]) + "?ka=" + string(fimeNameMatch[k][2])
			position.FirmName = string(fimeNameMatch[k][3])

			position.FirmType, position.FirmFinancing, position.FirmSize = extractFirmMsg(submatch[k])

			items = append(items, engine.Item{
				Url:     "",
				Type:    "zhipin_jinan",
				Id:      "",
				Payload: position,
			})
		}
	}

	// Request the next page
	if len(nextPageUrl) != 0 {
		requests = append(requests, engine.Request{
			Url:       "http://www.zhipin.com" + string(nextPageUrl[1]),
			ParseFunc: ParsePositionList,
		})
	}
	// Data parsed from the current page
	result := engine.ParseResult{
		Items:    items,
		Requests: requests,
	}

	return result
}

// Takes the whole block of company information and extracts the company type,
// financing stage and company size
func extractFirmMsg(item [][]byte) (string, string, string) {
	dataNum := isIntegrityRe.FindAllSubmatch(item[0], -1)
	if len(dataNum) == 1 { // financing info is missing
		submatch := unIntegrityRe.FindSubmatch(item[0])
		return string(submatch[1]), "", string(submatch[2])
	} else { // financing info is present
		submatch := integrityRe.FindSubmatch(item[0])
		return string(submatch[1]), string(submatch[2]), string(submatch[3])
	}
}
--------------------------------------------------------------------------------
/zhipin/parser/area_test.go:
--------------------------------------------------------------------------------
package parser

import (
	"crawler/zhipin/util"
	"fmt"
	"io/ioutil"
	"regexp"
	"testing"
)

func TestParsePositionList(t *testing.T) {
	bytes, err := ioutil.ReadFile("area_test_data.html")
	if err != nil {
		panic(err)
	}
	parseResult := ParsePositionList(bytes)

	for _, item := range parseResult.Items {
		fmt.Println(item.Payload)
	}
	for _, item := range parseResult.Requests {
		fmt.Println("Next page: " + item.Url)
	}
	fmt.Printf("%d records in total\n", len(parseResult.Items))
}

// Tests the regex for the job requirement fields
func TestPositionMsgRe(t *testing.T) {
	var postionRe = regexp.MustCompile(
		`([^<]+)([^<]+)([^<]+)`)
	bytes, err := ioutil.ReadFile("area_test_data.html")
	if err != nil {
		panic(err)
	}
	content := util.RemoveSpace(bytes)
	//fmt.Println(string(content))
	submatch := postionRe.FindAllSubmatch(content, -1)
	for _, item := range submatch {
		fmt.Printf("Address: %s Experience: %s Education: %s\n", string(item[1]), string(item[2]), string(item[3]))
	}
	fmt.Printf("%d records in total\n", len(submatch))
}

// Tests the regexes for the company information
func TestFimeMsgRe(t *testing.T) {
	// Matches the whole HTML block of company information
	var postionRe = regexp.MustCompile(
		`target="_blank">[^<]+([^p]+)`)
	// Used to count how many data fields the company block contains
	var isIntegrityRe = regexp.MustCompile(``)
	// Match used when the data is complete
	var integrityRe = regexp.MustCompile(
		`([^<]+)([^<]+)([^<]+)`)
	// Match used when the data is incomplete (financing info missing)
	var unIntegrityRe = regexp.MustCompile(
		`([^<]+)([^<]+)`)

	bytes, err := ioutil.ReadFile("area_test_data.html")
	if err != nil {
		panic(err)
	}
	content := util.RemoveSpace(bytes)

	submatch := postionRe.FindAllSubmatch(content, -1)
	for _, item := range submatch {
		dataNum := isIntegrityRe.FindAllSubmatch(item[0], -1)
		if len(dataNum) == 1 { // financing info is missing
			submatch := unIntegrityRe.FindSubmatch(item[0])
			fmt.Printf("Company type: %s Company size: %s\n",
				string(submatch[1]), string(submatch[2]))
		} else { // financing info is present
			submatch := integrityRe.FindSubmatch(item[0])
			fmt.Printf("Company type: %s Financing stage: %s Company size: %s\n",
				string(submatch[1]), string(submatch[2]), string(submatch[3]))
		}
	}
	fmt.Printf("%d records in total\n", len(submatch))
}

func TestNextPageRe(t *testing.T) {
	var nextPageRe = regexp.MustCompile(``)
	bytes, err := ioutil.ReadFile("area_test_data.html")
	if err != nil {
		panic(err)
	}
	content := util.RemoveSpace(bytes)
	submatch := nextPageRe.FindSubmatch(content)
	fmt.Println("http://www.zhipin.com" + string(submatch[1]))
}
--------------------------------------------------------------------------------
/zhipin/parser/business.go:
--------------------------------------------------------------------------------
package parser

import (
	"crawler/engine"
	"crawler/zhipin/util"
	"regexp"
)

// Parse the area list within each district
func ParseAreaList(bytes []byte) engine.ParseResult {
	// Regex matching an area entry
	var areaRe = regexp.MustCompile(`([^<]+)`)
	// Regex matching the URL of the current page
	var pageUrlRe = regexp.MustCompile(
		`[^<]+`)

	// Strip whitespace from the content
	content := util.RemoveSpace(bytes)
	// Match the area entries
	submatch := areaRe.FindAllSubmatch(content, -1)

	result := engine.ParseResult{}

	if len(submatch) == 0 {
		// If no area entries are found within the current district, parse the
		// job listings directly on this page
		urlMatch := pageUrlRe.FindSubmatch(content)
		//fmt.Printf("%s %s\n", string(urlMatch[1]), string(urlMatch[2]))
		result.Requests = append(result.Requests, engine.Request{
			Url:       "http://www.zhipin.com" + string(urlMatch[1]) + "?ka=" + string(urlMatch[2]),
			ParseFunc: ParsePositionList,
		})
	} else {
		// Otherwise follow the next level of area links
		for _, item := range submatch {
			result.Requests = append(result.Requests, engine.Request{
				Url:       "https://www.zhipin.com" + string(item[1]) + "?ka=" + string(item[2]),
				ParseFunc: ParsePositionList,
			})
		}
	}
	return result
}
--------------------------------------------------------------------------------
/zhipin/parser/business_test.go:
--------------------------------------------------------------------------------
package parser

import (
	"crawler/zhipin/util"
	"fmt"
	"io/ioutil"
	"regexp"
	"testing"
)

func TestParseAreaList(t *testing.T) {
	bytes, err := ioutil.ReadFile("business_test_data.html")
	if err != nil {
		panic(err)
	}
	content := util.RemoveSpace(bytes)
	parseResult := ParseAreaList(content)
	for _, item := range parseResult.Requests {
		fmt.Println(item.Url)
	}
}

func TestPageUrlRe(t *testing.T) {
	var pageUrlRe = regexp.MustCompile(
		`[^<]+`)
	bytes, err := ioutil.ReadFile("business_test_data.html")
	if err != nil {
		panic(err)
	}
	content := util.RemoveSpace(bytes)
	submatch := pageUrlRe.FindSubmatch(content)
	fmt.Println(string(submatch[0]))
}
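All of these parsers and tests lean on the same regexp convention, so a small, self-contained illustration may help: FindAllSubmatch returns one slice per match, where index 0 is the entire match and indexes 1..n are the parenthesized capture groups, which is exactly how item[1] and item[2] are indexed above. The pattern and HTML in this sketch are invented for the example and are not the site's real markup.

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Illustrative pattern only: not the real zhipin markup.
	re := regexp.MustCompile(`<a href="([^"]+)" ka="([^"]+)">([^<]+)</a>`)
	page := []byte(`<a href="/c101120100/b_area-a/" ka="sel-biz-1">Area A</a>` +
		`<a href="/c101120100/b_area-b/" ka="sel-biz-2">Area B</a>`)

	// Each element m has m[0] as the full match and m[1..3] as the capture groups.
	for _, m := range re.FindAllSubmatch(page, -1) {
		fmt.Printf("url=%s ka=%s text=%s\n", m[1], m[2], m[3])
	}
}
```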
--------------------------------------------------------------------------------
/zhipin/parser/city.go:
--------------------------------------------------------------------------------
package parser

import (
	"crawler/engine"
	"crawler/zhipin/util"
	"regexp"
)

// Parse all the districts contained in a city
func ParseBusinessList(bytes []byte) engine.ParseResult {
	var businessRe = regexp.MustCompile(
		`([^<]+)`)

	content := util.RemoveSpace(bytes)
	submatch := businessRe.FindAllSubmatch(content, -1)

	result := engine.ParseResult{}
	for _, item := range submatch {
		result.Requests = append(result.Requests, engine.Request{
			Url:       "https://www.zhipin.com" + string(item[1]) + "?ka=" + string(item[2]),
			ParseFunc: ParseAreaList,
		})
	}
	return result
}
--------------------------------------------------------------------------------
/zhipin/parser/city_test.go:
--------------------------------------------------------------------------------
package parser

import (
	"fmt"
	"io/ioutil"
	"testing"
)

func TestParseBusinessList(t *testing.T) {
	bytes, err := ioutil.ReadFile("city_test_data.html")
	if err != nil {
		panic(err)
	}
	parseResult := ParseBusinessList(bytes)
	for _, item := range parseResult.Requests {
		fmt.Println(item.Url)
	}
}
--------------------------------------------------------------------------------
/zhipin/parser/home.go:
--------------------------------------------------------------------------------
package parser

import (
	"crawler/engine"
	"regexp"
)

// Parse the city list on the home page
func PraseCityList(bytes []byte) engine.ParseResult {
	cityListRe := regexp.MustCompile(`([^<]+)`)
	submatch := cityListRe.FindAllSubmatch(bytes, -1)
	result := engine.ParseResult{}
	for _, item := range submatch {
		result.Requests = append(result.Requests, engine.Request{
			Url:       "https://www.zhipin.com" + string(item[1]) + "?ka=" + string(item[2]),
			ParseFunc: ParseBusinessList,
		})
	}
	return result
}
--------------------------------------------------------------------------------
/zhipin/parser/home_test.go:
--------------------------------------------------------------------------------
package parser

import (
	"fmt"
	"io/ioutil"
	"testing"
)

func TestPraseCityList(t *testing.T) {
	bytes, err := ioutil.ReadFile("home_test_data.html")
	if err != nil {
		panic(err)
	}
	parseResult := PraseCityList(bytes)
	for _, item := range parseResult.Requests {
		fmt.Println(item.Url)
	}
}
--------------------------------------------------------------------------------
/zhipin/util/util.go:
--------------------------------------------------------------------------------
package util

import "regexp"

func RemoveSpace(bytes []byte) []byte {
	var removeSpace = regexp.MustCompile(`[\s]+`)
	// Collapse runs of whitespace into a single space
	content := removeSpace.ReplaceAllString(string(bytes), " ")
	return []byte(content)
}
--------------------------------------------------------------------------------
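Taken together, these parsers form a chain: PraseCityList emits requests handled by ParseBusinessList, which emits requests for ParseAreaList, which in turn schedules ParsePositionList, each handing follow-up URLs back to the engine through the ParseFunc field. The crawler/engine package itself is not part of this excerpt, so the following is only a minimal, single-threaded sketch of the worklist pattern those signatures imply; the type definitions, the fetch step and the seed below are assumptions rather than the project's actual engine API.

```go
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// ParseResult and Request are stand-ins shaped like the values the parsers
// above return; the real types live in the crawler/engine package.
type ParseResult struct {
	Items    []interface{}
	Requests []Request
}

type Request struct {
	Url       string
	ParseFunc func([]byte) ParseResult
}

// fetch downloads a page body; a real engine would add headers, rate limiting
// and retries.
func fetch(url string) ([]byte, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	return ioutil.ReadAll(resp.Body)
}

// run drains the worklist: every ParseFunc returns items to report plus
// follow-up requests to enqueue.
func run(seed ...Request) {
	work := append([]Request{}, seed...)
	for len(work) > 0 {
		r := work[0]
		work = work[1:]
		body, err := fetch(r.Url)
		if err != nil {
			continue // a real engine would log the failure
		}
		result := r.ParseFunc(body)
		work = append(work, result.Requests...)
		for _, item := range result.Items {
			fmt.Println(item)
		}
	}
}

func main() {
	// With the real engine, the seed would use parser.PraseCityList and the
	// chain above would take over from there.
	run(Request{Url: "https://www.zhipin.com", ParseFunc: func(b []byte) ParseResult {
		fmt.Printf("fetched %d bytes\n", len(b))
		return ParseResult{} // stand-in parser: no items, no follow-ups
	}})
}
```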