└── crawler.go

/crawler.go:
--------------------------------------------------------------------------------
package main

import (
	"fmt"
	"net/http"
	"runtime"
	"strings"
	"sync"

	"github.com/PuerkitoBio/goquery"
)

type Page struct {
	url        string
	statusCode int
}

// ErrorPages extends Page with the URL of the page it was linked from.
type ErrorPages struct {
	originUrl string
	page      Page
}

// GetFirstPage issues an HTTP GET for url and checks the response code.
// On a 200 it extracts the href of every anchor in the body and records
// each unseen same-domain link in crawledPages.
func GetFirstPage(url string, crawledPages map[string]int) {
	fmt.Println("start crawl: " + url)
	res, err := http.Get(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		fmt.Println(res.StatusCode)
		return
	}

	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		fmt.Println(err)
		return
	}
	doc.Find("a").Each(func(_ int, s *goquery.Selection) {
		nextUrl, _ := s.Attr("href")
		if isMatchDomain(nextUrl) {
			if _, ok := crawledPages[nextUrl]; !ok {
				crawledPages[nextUrl] = 200
			}
		}
	})
}

// GoGetPages crawls urls concurrently, one goroutine per URL. Each
// goroutine collects links into its own map, and the results are merged
// under a mutex, since Go maps are not safe for concurrent writes.
func GoGetPages(urls []string, crawledPages map[string]int) {
	var wg sync.WaitGroup
	var mu sync.Mutex
	for _, url := range urls {
		wg.Add(1)
		go func(url string) {
			defer wg.Done()
			found := map[string]int{}
			GetFirstPage(url, found)
			mu.Lock()
			for u, code := range found {
				crawledPages[u] = code
			}
			mu.Unlock()
		}(url)
	}
	wg.Wait()
}

// isMatchDomain reports whether url points at the hair.cm domain.
func isMatchDomain(url string) bool {
	return strings.Contains(url, "://hair.cm")
}

func main() {
	fmt.Println(runtime.NumCPU())
	// Redundant since Go 1.5, where GOMAXPROCS already defaults to NumCPU.
	runtime.GOMAXPROCS(runtime.NumCPU())

	url := "https://hair.cm/tu/article-27066/"
	crawledPages := map[string]int{}
	GetFirstPage(url, crawledPages)

	// Crawl every link found so far. Go permits adding map entries while
	// ranging over the map, but entries added mid-loop may be skipped.
	for url := range crawledPages {
		GetFirstPage(url, crawledPages)
	}
	fmt.Println(crawledPages)
}
--------------------------------------------------------------------------------
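
Note: crawler.go depends on github.com/PuerkitoBio/goquery, so the repository needs Go module metadata alongside the file before `go run crawler.go` will work. A minimal go.mod sketch, assuming a hypothetical module name crawler (the real module path is not shown in this dump); running `go mod tidy` will resolve and pin the exact dependency versions:

    module crawler

    go 1.21

    require github.com/PuerkitoBio/goquery v1.8.1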
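
GoGetPages lets main fetch the collected links concurrently instead of one at a time. A minimal sketch of such a loop, using only the functions defined in crawler.go above:

    // Sketch: crawl the seed page, then fetch its links concurrently.
    seed := "https://hair.cm/tu/article-27066/"
    crawledPages := map[string]int{}
    GetFirstPage(seed, crawledPages)

    // Snapshot the keys into a slice first, so no goroutine ever ranges
    // over a map that is being written to.
    urls := make([]string, 0, len(crawledPages))
    for u := range crawledPages {
        urls = append(urls, u)
    }
    GoGetPages(urls, crawledPages) // one goroutine per URL
    fmt.Println(crawledPages)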