├── .gitignore ├── README.md ├── docs └── moive.sql ├── main.go ├── model └── model.go └── parse └── douban.go /.gitignore: -------------------------------------------------------------------------------- 1 | # ---> Go 2 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 3 | *.o 4 | *.a 5 | *.so 6 | 7 | # Folders 8 | _obj 9 | _test 10 | 11 | # Architecture specific extensions/prefixes 12 | *.[568vq] 13 | [568vq].out 14 | 15 | *.cgo1.go 16 | *.cgo2.c 17 | _cgo_defun.c 18 | _cgo_gotypes.go 19 | _cgo_export.* 20 | 21 | _testmain.go 22 | 23 | *.exe 24 | *.test 25 | *.prof 26 | deploypkg/* 27 | .idea/ 28 | .DS_Store 29 | .vscode 30 | debug -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 爬取豆瓣电影 Top250 2 | 3 | 爬虫是标配了,第一个就从最最最简单的爬虫开始写起吧 4 | 5 | ## 目标 6 | 7 | 我们的目标站点是 [豆瓣电影 Top250](https://movie.douban.com/top250),估计大家都很眼熟了 8 | 9 | 本次爬取8个字段,用于简单的概括分析。具体的字段如下: 10 | 11 | ![image](https://i.loli.net/2018/03/20/5ab11596b8810.png) 12 | 13 | 简单的分析一下目标源 14 | - 一页共25条 15 | - 含分页(共10页)且分页规则是正常的 16 | - 每一项的数据字段排序都是规则且不变 17 | 18 | ## 开始 19 | 20 | 由于量不大,我们的爬取步骤如下 21 | - 分析页面,获取所有的分页 22 | - 分析页面,循环爬取所有页面的电影信息 23 | - 爬取的电影信息入库 24 | 25 | ### 安装 26 | ``` 27 | $ go get -u github.com/PuerkitoBio/goquery 28 | ``` 29 | 30 | ### 运行 31 | ``` 32 | $ go run main.go 33 | ``` 34 | 35 | ### 代码片段 36 | 37 | #### 1、获取所有分页 38 | ``` 39 | func ParsePages(doc *goquery.Document) (pages []Page) { 40 | pages = append(pages, Page{Page: 1, Url: ""}) 41 | doc.Find("#content > div > div.article > div.paginator > a").Each(func(i int, s *goquery.Selection) { 42 | page, _ := strconv.Atoi(s.Text()) 43 | url, _ := s.Attr("href") 44 | 45 | pages = append(pages, Page{ 46 | Page: page, 47 | Url: url, 48 | }) 49 | }) 50 | 51 | return pages 52 | } 53 | ``` 54 | 55 | #### 2、分析豆瓣电影信息 56 | ``` 57 | func ParseMovies(doc *goquery.Document) (movies []Movie) { 58 | doc.Find("#content 
> div > div.article > ol > li").Each(func(i int, s *goquery.Selection) {
        title := s.Find(".hd a span").Eq(0).Text()

        ...

        movieDesc := strings.Split(DescInfo[1], "/")
        year := strings.TrimSpace(movieDesc[0])
        area := strings.TrimSpace(movieDesc[1])
        tag := strings.TrimSpace(movieDesc[2])

        star := s.Find(".bd .star .rating_num").Text()

        comment := strings.TrimSpace(s.Find(".bd .star span").Eq(3).Text())
        compile := regexp.MustCompile("[0-9]")
        comment = strings.Join(compile.FindAllString(comment, -1), "")

        quote := s.Find(".quote .inq").Text()

        ...

        log.Printf("i: %d, movie: %v", i, movie)

        movies = append(movies, movie)
    })

    return movies
}
```


### 数据
![image](https://i.loli.net/2018/03/21/5ab1309594741.png)

![image](https://i.loli.net/2018/03/21/5ab131ca582f8.png)

![image](https://i.loli.net/2018/03/21/5ab130d3a00d9.png)
--------------------------------------------------------------------------------
/docs/moive.sql:
--------------------------------------------------------------------------------
-- Storage for the entries scraped from the Douban Movie Top 250 list.
-- One row per movie; the columns mirror the fields of parse.DoubanMovie.
--
-- The text columns are sized generously because the scraped subtitle, alias,
-- description and quote strings routinely exceed 20-30 characters, which makes
-- MySQL truncate the value (or reject the row in strict mode).
CREATE TABLE `sp_douban_movie` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `title` varchar(255) DEFAULT '' COMMENT '标题',
  `subtitle` varchar(255) DEFAULT '' COMMENT '副标题',
  `other` varchar(255) DEFAULT '' COMMENT '其他',
  `desc` varchar(255) DEFAULT '' COMMENT '简述',
  `year` int(10) unsigned DEFAULT '0' COMMENT '年份',
  `area` varchar(255) DEFAULT '' COMMENT '地区',
  `tag` varchar(255) DEFAULT '' COMMENT '标签',
  -- The rating is fractional (e.g. 9.7); an integer column would round it.
  `star` decimal(3,1) unsigned DEFAULT '0.0' COMMENT '评分',
  -- Number of users who rated the movie.
  `comment` int(10) unsigned DEFAULT '0' COMMENT '评价人数',
  `quote` varchar(255) DEFAULT '' COMMENT '引用',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='豆瓣电影Top250';

--------------------------------------------------------------------------------
/main.go:
-------------------------------------------------------------------------------- 1 | // 爬取豆瓣电影 TOP250 2 | package main 3 | 4 | import ( 5 | "log" 6 | "strings" 7 | 8 | "github.com/PuerkitoBio/goquery" 9 | 10 | "github.com/go-crawler/douban-movie/model" 11 | "github.com/go-crawler/douban-movie/parse" 12 | ) 13 | 14 | var ( 15 | BaseUrl = "https://movie.douban.com/top250" 16 | ) 17 | 18 | // 新增数据 19 | func Add(movies []parse.DoubanMovie) { 20 | for index, movie := range movies { 21 | if err := model.DB.Create(&movie).Error; err != nil { 22 | log.Printf("db.Create index: %s, err : %v", index, err) 23 | } 24 | } 25 | } 26 | 27 | // 开始爬取 28 | func Start() { 29 | var movies []parse.DoubanMovie 30 | 31 | pages := parse.GetPages(BaseUrl) 32 | for _, page := range pages { 33 | doc, err := goquery.NewDocument(strings.Join([]string{BaseUrl, page.Url}, "")) 34 | if err != nil { 35 | log.Println(err) 36 | } 37 | 38 | movies = append(movies, parse.ParseMovies(doc)...) 39 | } 40 | 41 | Add(movies) 42 | } 43 | 44 | func main() { 45 | Start() 46 | 47 | defer model.DB.Close() 48 | } 49 | -------------------------------------------------------------------------------- /model/model.go: -------------------------------------------------------------------------------- 1 | package model 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | 7 | "github.com/jinzhu/gorm" 8 | _ "github.com/jinzhu/gorm/dialects/mysql" 9 | ) 10 | 11 | var ( 12 | DB *gorm.DB 13 | 14 | username string = "root" 15 | password string = "rootroot" 16 | dbName string = "spiders" 17 | ) 18 | 19 | func init() { 20 | var err error 21 | DB, err = gorm.Open("mysql", fmt.Sprintf("%s:%s@/%s?charset=utf8&parseTime=True&loc=Local", username, password, dbName)) 22 | if err != nil { 23 | log.Fatalf(" gorm.Open.err: %v", err) 24 | } 25 | 26 | DB.SingularTable(true) 27 | gorm.DefaultTableNameHandler = func(db *gorm.DB, defaultTableName string) string { 28 | return "sp_" + defaultTableName 29 | } 30 | } 31 | 
-------------------------------------------------------------------------------- /parse/douban.go: -------------------------------------------------------------------------------- 1 | package parse 2 | 3 | import ( 4 | "log" 5 | "regexp" 6 | "strconv" 7 | "strings" 8 | 9 | "github.com/PuerkitoBio/goquery" 10 | ) 11 | 12 | type DoubanMovie struct { 13 | Title string 14 | Subtitle string 15 | Other string 16 | Desc string 17 | Year string 18 | Area string 19 | Tag string 20 | Star string 21 | Comment string 22 | Quote string 23 | } 24 | 25 | type Page struct { 26 | Page int 27 | Url string 28 | } 29 | 30 | // 获取分页 31 | func GetPages(url string) []Page { 32 | doc, err := goquery.NewDocument(url) 33 | if err != nil { 34 | log.Fatal(err) 35 | } 36 | 37 | return ParsePages(doc) 38 | } 39 | 40 | // 分析分页 41 | func ParsePages(doc *goquery.Document) (pages []Page) { 42 | pages = append(pages, Page{Page: 1, Url: ""}) 43 | doc.Find("#content > div > div.article > div.paginator > a").Each(func(i int, s *goquery.Selection) { 44 | page, _ := strconv.Atoi(s.Text()) 45 | url, _ := s.Attr("href") 46 | 47 | pages = append(pages, Page{ 48 | Page: page, 49 | Url: url, 50 | }) 51 | }) 52 | 53 | return pages 54 | } 55 | 56 | // 分析电影数据 57 | func ParseMovies(doc *goquery.Document) (movies []DoubanMovie) { 58 | doc.Find("#content > div > div.article > ol > li").Each(func(i int, s *goquery.Selection) { 59 | title := s.Find(".hd a span").Eq(0).Text() 60 | 61 | subtitle := s.Find(".hd a span").Eq(1).Text() 62 | subtitle = strings.TrimLeft(subtitle, "  / ") 63 | 64 | other := s.Find(".hd a span").Eq(2).Text() 65 | other = strings.TrimLeft(other, "  / ") 66 | 67 | desc := strings.TrimSpace(s.Find(".bd p").Eq(0).Text()) 68 | DescInfo := strings.Split(desc, "\n") 69 | desc = DescInfo[0] 70 | 71 | movieDesc := strings.Split(DescInfo[1], "/") 72 | year := strings.TrimSpace(movieDesc[0]) 73 | area := strings.TrimSpace(movieDesc[1]) 74 | tag := strings.TrimSpace(movieDesc[2]) 75 | 76 | star := 
s.Find(".bd .star .rating_num").Text() 77 | 78 | comment := strings.TrimSpace(s.Find(".bd .star span").Eq(3).Text()) 79 | compile := regexp.MustCompile("[0-9]") 80 | comment = strings.Join(compile.FindAllString(comment, -1), "") 81 | 82 | quote := s.Find(".quote .inq").Text() 83 | 84 | movie := DoubanMovie{ 85 | Title: title, 86 | Subtitle: subtitle, 87 | Other: other, 88 | Desc: desc, 89 | Year: year, 90 | Area: area, 91 | Tag: tag, 92 | Star: star, 93 | Comment: comment, 94 | Quote: quote, 95 | } 96 | 97 | log.Printf("i: %d, movie: %v", i, movie) 98 | 99 | movies = append(movies, movie) 100 | }) 101 | 102 | return movies 103 | } 104 | --------------------------------------------------------------------------------