├── .gitignore ├── README.md ├── catch └── catch.go ├── doneDB └── doneDB.go ├── go.mod ├── mailConfig.yaml ├── mailSend └── mailSend.go ├── main.go ├── proxyConfig.yaml ├── score ├── ownValue.txt ├── score.go └── wordValue.txt └── update.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spider91 2 | 3 | #### 介绍 4 | 91视频网站爬虫工具,可以批量或单独爬取视频。 5 | 不带参数运行程序时,进入日常爬取模式,固定每天8点爬取24小时内发布的30个评分最高的视频,评分由关键字、视频时长、作者分三项评分组成(score下的两个txt定义了关键词评分和作者评分,分数范围[-∞,100])。每周六9点会爬取本周评分最高的30个最热视频并把当周的视频整理到一个文件夹下。程序有去重机制不会重复下载同一个视频。 6 | 7 | #### 软件架构 8 | 基于go1.15编写,依赖chrome浏览器、python下的m3_dl、pysocks。 9 | 10 | 11 | #### 安装教程 12 | 13 | 1. 安装chrome浏览器。 14 | 2. 安装python、m3_dl、pysocks 15 | pip3 install m3_dl 16 | pip3 install pysocks 17 | 3. 编译代码 18 | 工程根目录下执行go build 19 | 20 | 21 | #### 使用说明 22 | 23 | 1. 参数说明 24 | -c 爬取页面 25 | -u 爬取的网页 可以是单个视频的页面也可以是类似首页的多个视频的页面。 26 | -o 视频存储路径 27 | -p 代理地址 28 | -t 同时爬取的视频个数 29 | -now 爬取前X天的视频 30 | -n 与-now一起使用,表示存评分前X个视频 31 | 32 | 2. 示例 33 | **单个视频爬取** 34 | ./spider91 -c -u "http://91porn.com/view_video.php?viewkey=8cd0148b3fe08d4a4c2f" -p "http://127.0.0.1:10808" 35 | **单页多个视频爬取** 36 | ./spider91 -c -u "http://91porn.com/v.php?category=rf&viewtype=basic&page=2" -p "http://127.0.0.1:10808" 37 | **爬取前3天评分前100的视频** 38 | ./spider91 -now 3 -n 100 39 | 40 | 3. 
新增docker版本 41 | https://hub.docker.com/repository/docker/templelv/spider91 42 | 43 | ``` 44 | docker run --restart=always -it --name spider91 \ 45 | -v /dir_to_save/:/root/spider91/save \ 46 | templelv/spider91 sh -c 'service supervisor start && /bin/bash' 47 | ``` 48 | /dir_to_save为用户设置的视频保存路径 49 | docker attach spider91 命令可以进入容器终端 50 | 容器终端中执行/root/spider91/update.sh 将更新代码为github上最新并编译重新执行。 51 | 代理列表通过修改配置文件更新,配置文件路径/root/spider91/proxyConfig.yaml 52 | 推荐使用passwall建立多个代理。 53 | -------------------------------------------------------------------------------- /catch/catch.go: -------------------------------------------------------------------------------- 1 | package catch 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "fmt" 7 | "github.com/PuerkitoBio/goquery" 8 | "github.com/chromedp/cdproto/network" 9 | "github.com/chromedp/chromedp" 10 | "io/ioutil" 11 | "net/http" 12 | "net/url" 13 | "os" 14 | "os/exec" 15 | "path/filepath" 16 | "regexp" 17 | "strconv" 18 | "strings" 19 | "sync" 20 | "time" 21 | ) 22 | 23 | type VideoInfo struct { 24 | Title string 25 | ViewKey string 26 | Owner string 27 | UpTime time.Time 28 | DlAddr string 29 | Vdurat float64 30 | Watch int 31 | Collect int 32 | Score float64 33 | } 34 | 35 | type ViSlice []*VideoInfo 36 | 37 | func (v ViSlice) Len() int { return len(v) } 38 | 39 | func (v ViSlice) Swap(i, j int) { v[i], v[j] = v[j], v[i] } 40 | 41 | func (v ViSlice) Less(i, j int) bool { return v[i].Score < v[j].Score } 42 | 43 | func (v ViSlice) String() string { 44 | str := "" 45 | for _, v := range v { 46 | str += fmt.Sprintf("VideoInfo: (%.1f %.1f)%s %s %d %d %s %s\n", 47 | v.Score, v.Vdurat, v.Title, v.ViewKey, v.Watch, v.Collect, v.UpTime.Format("2006-01-02 15:04:05"), v.DlAddr) 48 | } 49 | 50 | return str 51 | } 52 | 53 | func (v VideoInfo) String() string { 54 | return fmt.Sprintf("VideoInfo: (%.1f %.1f)%s %s %d %d %s %s", 55 | v.Score, v.Vdurat, v.Title, v.ViewKey, v.Watch, v.Collect, v.UpTime.Format("2006-01-02 15:04:05"), 
v.DlAddr) 56 | } 57 | 58 | func (v *VideoInfo) updateDlAddr(proxy string) (err error) { 59 | v.DlAddr = "" 60 | options := []chromedp.ExecAllocatorOption{ 61 | chromedp.Flag("hide-scrollbars", false), 62 | chromedp.Flag("mute-audio", false), 63 | chromedp.Flag("blink-settings", "imagesEnabled=false"), 64 | chromedp.ProxyServer(proxy), 65 | //chromedp.Flag("headless", false), 66 | chromedp.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"), 67 | } 68 | options = append(chromedp.DefaultExecAllocatorOptions[:], options...) 69 | 70 | allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), options...) 71 | defer cancel() 72 | ctx, cancel := chromedp.NewContext(allocCtx) 73 | defer cancel() 74 | ctx, _ = context.WithTimeout(ctx, time.Second*10) 75 | 76 | htmlText := "" 77 | fullUrl := "https://www.91porn.com/view_video.php?viewkey=" + v.ViewKey 78 | if err = chromedp.Run(ctx, sourHtml(fullUrl, "#player_one_html5_api > source", &htmlText)); err != nil { 79 | if err = chromedp.Run(ctx, sourHtml_click(fullUrl, "#player_one_html5_api > source", &htmlText)); err != nil { 80 | fmt.Println("DlAddr", fullUrl, err) 81 | return 82 | } else { 83 | fmt.Println(fullUrl, "DlAddr done!") 84 | } 85 | } else { 86 | fmt.Println(fullUrl, "DlAddr done!") 87 | } 88 | regAddr := regexp.MustCompile(` 0 { 101 | //strCmd := fmt.Sprintf(" -p \"%s\" -t %d -w -o %s \"%s\"", proxy, numThread, savePath, v.DlAddr) 102 | ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5+time.Second*30) 103 | defer cancel() 104 | cmd := exec.CommandContext(ctx, "curl", "-x", proxy, "-o", savePath, "-O", v.DlAddr) 105 | 106 | //cmd := exec.Command("m3_dl", "-p", proxy, "-t", strconv.Itoa(numThread), "-w", "-o", savePath, v.DlAddr) 107 | //fmt.Println(cmd) 108 | out, ierr := cmd.CombinedOutput() 109 | if ierr != nil { 110 | //fmt.Println(string(out)) 111 | _ = out 112 | err = ierr 113 | 
fmt.Println(v.Title, "download fail!")
		} else {
			fmt.Println(v.Title, "download success!")
		}
	} else {
		fmt.Println(v.Title, "DlAddr not set!")
		// The title is data, not a format string: pass it as an argument so a
		// '%' inside a title cannot garble the error text.
		err = fmt.Errorf("%s DlAddr not set!", v.Title)
	}

	return
}

// navigateTasks returns the steps shared by the task builders below: enable
// the network domain, send an Accept-Language header preferring zh-CN, and
// navigate to urlstr.
func navigateTasks(urlstr string) chromedp.Tasks {
	return chromedp.Tasks{
		network.Enable(),
		network.SetExtraHTTPHeaders(network.Headers(map[string]interface{}{
			"Accept-Language": "zh-CN,zh;q=0.9",
		})),
		chromedp.Navigate(urlstr),
	}
}

// sourHtml builds the tasks that load urlstr and store the outer HTML of the
// node matching sel in *html.
func sourHtml(urlstr, sel string, html *string) chromedp.Tasks {
	return append(navigateTasks(urlstr), chromedp.OuterHTML(sel, html))
}

// sourHtml_click is sourHtml with one extra step: after navigating it clicks
// the first element matching "body > table > tbody > tr > td > a" before
// reading the outer HTML of sel.
func sourHtml_click(urlstr, sel string, html *string) chromedp.Tasks {
	return append(navigateTasks(urlstr),
		chromedp.Click("body > table > tbody > tr > td > a", chromedp.ByQuery),
		chromedp.OuterHTML(sel, html),
	)
}

// sourManyHtml builds the tasks that load urlstr and store the outer HTML of
// every selector sel[i] in html[i].
//
// sel and html must have the same length; when they do not, the returned
// tasks only navigate and capture nothing.
func sourManyHtml(urlstr string, sel, html []string) chromedp.Tasks {
	task := navigateTasks(urlstr)
	if len(sel) != len(html) {
		return task
	}
	for i := range sel {
		task = append(task, chromedp.OuterHTML(sel[i], &html[i]))
	}

	return task
}

// nopCrawHtml builds the tasks that load urlstr and store the outer HTML of
// the node matching sel in *html.
func nopCrawHtml(urlstr string, sel string, html
*string) chromedp.Tasks {
	// Navigate, then capture the selected node; no click is performed here.
	return chromedp.Tasks{
		network.Enable(),
		network.SetExtraHTTPHeaders(network.Headers(map[string]interface{}{
			"Accept-Language": "zh-CN,zh;q=0.9",
		})),
		chromedp.Navigate(urlstr),
		chromedp.OuterHTML(sel, html),
	}
}

// nopCrawHtml_click builds the same task list as nopCrawHtml, with one extra
// step: after navigating it clicks the first element matching
// "body > table > tbody > tr > td > a" before reading the outer HTML of sel
// into *html.
func nopCrawHtml_click(urlstr string, sel string, html *string) chromedp.Tasks {
	headers := network.Headers(map[string]interface{}{
		"Accept-Language": "zh-CN,zh;q=0.9",
	})

	return chromedp.Tasks{
		network.Enable(),
		network.SetExtraHTTPHeaders(headers),
		chromedp.Navigate(urlstr),
		chromedp.Click("body > table > tbody > tr > td > a", chromedp.ByQuery),
		chromedp.OuterHTML(sel, html),
	}
}

// PageCrawlOne loads the single video page dstUrl in a browser that uses
// proxyUrl as its proxy server and fills vi from the page (download address,
// title and owner).
func PageCrawlOne(dstUrl, proxyUrl string) (vi VideoInfo, err error) {
	vi.DlAddr = ""

	// chromedp's default allocator options come first, followed by the
	// overrides for this crawl.
	options := append(chromedp.DefaultExecAllocatorOptions[:],
		chromedp.Flag("hide-scrollbars", false),
		chromedp.Flag("mute-audio", false),
		chromedp.Flag("blink-settings", "imagesEnabled=true"),
		chromedp.ProxyServer(proxyUrl),
		chromedp.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"),
	)

	allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), options...)
217 | defer cancel() 218 | ctx, cancel := chromedp.NewContext(allocCtx) 219 | defer cancel() 220 | ctx, _ = context.WithTimeout(ctx, time.Second*25) 221 | 222 | sels := [...]string{"#player_one_html5_api > source", "#videodetails > h4", "#videodetails-content > div:nth-child(2) > span.title-yakov > a:nth-child(1) > span"} 223 | htmlText := [len(sels)]string{} 224 | if err = chromedp.Run(ctx, sourManyHtml(dstUrl, sels[:], htmlText[:])); err != nil { 225 | fmt.Println(err) 226 | return 227 | } 228 | regAddr := regexp.MustCompile(`(?s:(.*?))`) 230 | regOwner := regexp.MustCompile(`">(?s:(.*?))`) 231 | dlAddr := regAddr.FindAllStringSubmatch(htmlText[0], 1) 232 | title := regTitle.FindAllStringSubmatch(htmlText[1], 1) 233 | owner := regOwner.FindAllStringSubmatch(htmlText[2], 1) 234 | if len(dlAddr) > 0 && len(title) > 0 && len(owner) > 0 { 235 | vi.DlAddr = dlAddr[0][1] 236 | //将vi.DlAddr中的&转换为& 237 | vi.DlAddr = strings.ReplaceAll(vi.DlAddr, "&", "&") 238 | //去掉title[0][1]中的空格和换行符 239 | vi.Title = strings.ReplaceAll(title[0][1], " ", "") 240 | vi.Title = strings.ReplaceAll(vi.Title, "\n", "") 241 | vi.Owner = owner[0][1] 242 | } 243 | 244 | return 245 | } 246 | 247 | func PageCrawl_chromedp(dstUrl, proxyUrl string) (viAll []*VideoInfo) { 248 | options := []chromedp.ExecAllocatorOption{ 249 | chromedp.Flag("hide-scrollbars", false), 250 | chromedp.Flag("mute-audio", false), 251 | chromedp.Flag("blink-settings", "imagesEnabled=false"), 252 | chromedp.ProxyServer(proxyUrl), 253 | //chromedp.Flag("headless", false), 254 | chromedp.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"), 255 | } 256 | options = append(chromedp.DefaultExecAllocatorOptions[:], options...) 257 | 258 | allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), options...) 
259 | defer cancel() 260 | ctx, cancel := chromedp.NewContext(allocCtx) 261 | defer cancel() 262 | ctx, _ = context.WithTimeout(ctx, time.Second*25) 263 | 264 | sel := "#wrapper" 265 | htmlText := "" 266 | if err := chromedp.Run(ctx, nopCrawHtml(dstUrl, sel, &htmlText)); err != nil { 267 | if err := chromedp.Run(ctx, nopCrawHtml_click(dstUrl, sel, &htmlText)); err != nil { 268 | fmt.Println("Crawl", dstUrl, err) 269 | return 270 | } else { 271 | fmt.Println(dstUrl, "Crawl done!") 272 | } 273 | } else { 274 | fmt.Println(dstUrl, "Crawl done!") 275 | } 276 | 277 | doc, err := goquery.NewDocumentFromReader(bytes.NewBufferString(htmlText)) 278 | if err != nil { 279 | fmt.Println(err) 280 | return 281 | } 282 | 283 | doc.Find("#wrapper > div.container.container-minheight > div.row > div > div > div > div").Each(func(i int, selection *goquery.Selection) { 284 | textStr := selection.Text() 285 | 286 | title := selection.Find("a").Find("span.video-title").Text() 287 | videoUrl, urlOk := selection.Find("a").Attr("href") 288 | 289 | regViewKey := regexp.MustCompile(`viewkey=(?s:(.*))&page`) 290 | regAddTime := regexp.MustCompile(`添加时间:(?s:(.*?))\n`) 291 | regWatch := regexp.MustCompile(`热度:(?s:(.*?))\n`) 292 | regCollect := regexp.MustCompile(`收藏:(?s:(.*?))\n`) 293 | regOwner := regexp.MustCompile(`作者: \n(?s:(.*?))\n`) 294 | 295 | viewkey := regViewKey.FindAllStringSubmatch(videoUrl, 1) 296 | addTime := regAddTime.FindAllStringSubmatch(textStr, 1) 297 | watch := regWatch.FindAllStringSubmatch(textStr, 1) 298 | collect := regCollect.FindAllStringSubmatch(textStr, 1) 299 | owner := regOwner.FindAllStringSubmatch(textStr, 1) 300 | 301 | if len(viewkey) > 0 && len(addTime) > 0 && len(watch) > 0 && len(collect) > 0 && len(owner) > 0 && urlOk { 302 | 303 | vi := new(VideoInfo) 304 | 305 | //title = "1" 306 | strs := strings.Fields(addTime[0][1]) 307 | 308 | if len(strs) == 3 { 309 | switch strs[1] { 310 | case "分钟": 311 | duration, _ := time.ParseDuration("-" + strs[0] + "m") 312 
| vi.UpTime = time.Now().Add(duration) 313 | case "小时": 314 | duration, _ := time.ParseDuration("-" + strs[0] + "h") 315 | vi.UpTime = time.Now().Add(duration) 316 | case "天": 317 | hourDay, _ := strconv.Atoi(strs[0]) 318 | hourDay = hourDay * 24 319 | duration, _ := time.ParseDuration("-" + strconv.Itoa(hourDay) + "h") 320 | vi.UpTime = time.Now().Add(duration) 321 | } 322 | } 323 | vi.Title = title 324 | vi.ViewKey = viewkey[0][1] 325 | strs = strings.Fields(watch[0][1]) 326 | vi.Watch, _ = strconv.Atoi(strs[0]) 327 | strs = strings.Fields(collect[0][1]) 328 | if len(strs) > 0 { 329 | vi.Collect, _ = strconv.Atoi(strs[0]) 330 | } 331 | 332 | if len(strings.Fields(owner[0][1])) > 0 { 333 | vi.Owner = strings.Fields(owner[0][1])[0] 334 | } else { 335 | vi.Owner = "unknown" 336 | } 337 | 338 | vMinute := 0 339 | vSecond := 0 340 | fmt.Sscanf(selection.Find("span.duration").Text(), "%d:%d\n", &vMinute, &vSecond) 341 | vi.Vdurat = float64(vMinute) + float64(vSecond)/60.0 342 | 343 | viAll = append(viAll, vi) 344 | 345 | //fmt.Println(vi) 346 | } 347 | 348 | }) 349 | 350 | return 351 | } 352 | 353 | func PageCrawl(dstUrl, proxyUrl string) (viAll []*VideoInfo) { 354 | req, err := http.NewRequest("GET", dstUrl, nil) 355 | if err != nil { 356 | fmt.Println(err) 357 | return 358 | } 359 | 360 | req.Header.Add("Connection", "keep-alive") 361 | req.Header.Add("Accept-Language", "zh-CN,zh;q=0.9") 362 | req.Header.Add("sec-ch-ua", "\" Not;A Brand\";v=\"99\", \"Google Chrome\";v=\"91\", \"Chromium\";v=\"91\"") 363 | req.Header.Add("sec-ch-ua-mobile", "?0") 364 | req.Header.Add("Upgrade-Insecure-Requests", "1") 365 | req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") 366 | req.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") 367 | req.Header.Add("Sec-Fetch-Site", "none") 368 
| req.Header.Add("Sec-Fetch-Mode", "navigate") 369 | req.Header.Add("Sec-Fetch-User", "?1") 370 | req.Header.Add("Sec-Fetch-Dest", "document") 371 | req.Header.Add("Accept-Encoding", " gzip, deflate, br") 372 | //req.Header.Add("", "") 373 | 374 | //req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0") 375 | 376 | //req.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") 377 | //req.Header.Add("Accept-Encoding", "gzip, deflate") 378 | //req.Header.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3") 379 | 380 | //req.Header.Add("Upgrade-Insecure-Requests", "1") 381 | //req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") 382 | 383 | client := &http.Client{} 384 | 385 | if len(proxyUrl) > 0 { 386 | proxy := func(_ *http.Request) (*url.URL, error) { 387 | return url.Parse(proxyUrl) 388 | } 389 | transport := &http.Transport{Proxy: proxy} 390 | client = &http.Client{Transport: transport} 391 | } 392 | 393 | resp, err := client.Do(req) 394 | if err != nil { 395 | fmt.Println(err) 396 | return 397 | } 398 | 399 | defer resp.Body.Close() 400 | 401 | doc, err := goquery.NewDocumentFromReader(resp.Body) 402 | if err != nil { 403 | fmt.Println(err) 404 | return 405 | } 406 | 407 | doc.Find("#wrapper > div.container.container-minheight > div.row > div > div > div > div").Each(func(i int, selection *goquery.Selection) { 408 | textStr := selection.Text() 409 | 410 | title := selection.Find("a").Find("span.video-title").Text() 411 | videoUrl, urlOk := selection.Find("a").Attr("href") 412 | 413 | regViewKey := regexp.MustCompile(`91porn.com/view_video.php\?viewkey=(?s:(.*?))&page`) 414 | regAddTime := regexp.MustCompile(`添加时间:(?s:(.*?))\n`) 415 | regWatch := regexp.MustCompile(`查看:(?s:(.*?))\n`) 416 | regCollect := 
regexp.MustCompile(`收藏:(?s:(.*?))\n`)
		regOwner := regexp.MustCompile(`作者: \n(?s:(.*?))\n`)

		viewkey := regViewKey.FindAllStringSubmatch(videoUrl, 1)
		addTime := regAddTime.FindAllStringSubmatch(textStr, 1)
		watch := regWatch.FindAllStringSubmatch(textStr, 1)
		collect := regCollect.FindAllStringSubmatch(textStr, 1)
		owner := regOwner.FindAllStringSubmatch(textStr, 1)

		// Only entries where every field was found (and the link has an href)
		// are turned into a VideoInfo; anything else is skipped.
		if len(viewkey) > 0 && len(addTime) > 0 && len(watch) > 0 && len(collect) > 0 && len(owner) > 0 && urlOk {

			vi := new(VideoInfo)

			// The upload time is only converted when it splits into exactly
			// three fields: a count, a unit (分钟 = minutes, 小时 = hours,
			// 天 = days) and one more word. It is turned into an absolute
			// time relative to now. An unparsable count yields a zero
			// duration, i.e. "now".
			strs := strings.Fields(addTime[0][1])

			if len(strs) == 3 {
				switch strs[1] {
				case "分钟":
					duration, _ := time.ParseDuration("-" + strs[0] + "m")
					vi.UpTime = time.Now().Add(duration)
				case "小时":
					duration, _ := time.ParseDuration("-" + strs[0] + "h")
					vi.UpTime = time.Now().Add(duration)
				case "天":
					hourDay, _ := strconv.Atoi(strs[0])
					hourDay = hourDay * 24
					duration, _ := time.ParseDuration("-" + strconv.Itoa(hourDay) + "h")
					vi.UpTime = time.Now().Add(duration)
				}
			}
			vi.Title = title
			vi.ViewKey = viewkey[0][1]

			// The captures are lazy and may be empty or whitespace-only, so
			// never index the Fields result without checking its length.
			if strs = strings.Fields(watch[0][1]); len(strs) > 0 {
				vi.Watch, _ = strconv.Atoi(strs[0])
			}
			if strs = strings.Fields(collect[0][1]); len(strs) > 0 {
				vi.Collect, _ = strconv.Atoi(strs[0])
			}
			// Same fallback as PageCrawl_chromedp when no owner name is found.
			if strs = strings.Fields(owner[0][1]); len(strs) > 0 {
				vi.Owner = strs[0]
			} else {
				vi.Owner = "unknown"
			}

			// The duration text is read as "minutes:seconds" and stored as
			// fractional minutes; a failed scan leaves both parts at zero.
			vMinute := 0
			vSecond := 0
			fmt.Sscanf(selection.Find("span.duration").Text(), "%d:%d\n", &vMinute, &vSecond)
			vi.Vdurat = float64(vMinute) + float64(vSecond)/60.0

			viAll = append(viAll, vi)
		}

	})
	return
}

// OrgPageSave fetches dstUrl, going through proxyUrl when it is non-empty,
// and writes the response body to fileName. Errors are printed, not returned.
func OrgPageSave(dstUrl, proxyUrl, fileName string) {
	req, err := http.NewRequest("GET", dstUrl, nil)
	if err != nil {
		fmt.Println(err)
		return
	}

	req.Header.Add("Accept-Language", "zh-CN,zh;q=0.9")
478 | 479 | client := &http.Client{} 480 | 481 | if len(proxyUrl) > 0 { 482 | fmt.Println("use proxy") 483 | proxy := func(_ *http.Request) (*url.URL, error) { 484 | return url.Parse(proxyUrl) 485 | } 486 | transport := &http.Transport{Proxy: proxy} 487 | client = &http.Client{Transport: transport} 488 | } 489 | 490 | resp, err := client.Do(req) 491 | if err != nil { 492 | fmt.Println(err) 493 | return 494 | } 495 | 496 | file, err := os.Create(fileName) 497 | if err != nil { 498 | fmt.Println(err) 499 | return 500 | } 501 | defer file.Close() 502 | buf, _ := ioutil.ReadAll(resp.Body) 503 | file.Write(buf) 504 | } 505 | 506 | func DownloadMany(viAll []*VideoInfo, numThread int, proxyUrl, savePath string) (failVi, succsVi []*VideoInfo) { 507 | ch := make(chan int, len(viAll)) 508 | chq := make(chan int, numThread) 509 | fmt.Print("DownladMany:len([]*VideoInfo)=", len(viAll), "\n") 510 | var mutex sync.Mutex 511 | for i, vi := range viAll { 512 | go func(info *VideoInfo, cnt int) { 513 | chq <- 1 514 | info.updateDlAddr(proxyUrl) 515 | savePath := filepath.Join(savePath, fmt.Sprintf("%s(%s)_%d.mp4", info.Title, info.Owner, cnt)) 516 | err := info.Download(savePath, 15, proxyUrl) 517 | if err != nil { 518 | mutex.Lock() 519 | failVi = append(failVi, info) 520 | mutex.Unlock() 521 | os.Remove(savePath) 522 | } else { 523 | mutex.Lock() 524 | succsVi = append(succsVi, info) 525 | mutex.Unlock() 526 | } 527 | 528 | <-chq 529 | ch <- 1 530 | }(vi, i) 531 | time.Sleep(time.Second * 3) 532 | } 533 | 534 | for range viAll { 535 | <-ch 536 | } 537 | return 538 | } 539 | -------------------------------------------------------------------------------- /doneDB/doneDB.go: -------------------------------------------------------------------------------- 1 | package doneDB 2 | 3 | import ( 4 | "database/sql" 5 | "fmt" 6 | _ "github.com/mattn/go-sqlite3" 7 | "spider91/catch" 8 | "strconv" 9 | "strings" 10 | "time" 11 | ) 12 | 13 | type VideoDB struct { 14 | db *sql.DB 15 | } 16 | 17 
| func (v *VideoDB) Close() error { 18 | return v.db.Close() 19 | } 20 | 21 | func (v *VideoDB) AddDone(vis []*catch.VideoInfo) (fails []*catch.VideoInfo) { 22 | 23 | for _, vi := range vis { 24 | stmt, err := v.db.Prepare("INSERT INTO done(viewkey, title, owner, UpTime) values(?, ?, ?, ?)") 25 | if err == nil { 26 | _, err := stmt.Exec(vi.ViewKey, vi.Title, vi.Owner, vi.UpTime) 27 | //fmt.Println(err) 28 | stmt.Close() 29 | if err == nil { 30 | continue 31 | } 32 | } 33 | 34 | fails = append(fails, vi) 35 | } 36 | 37 | return 38 | } 39 | 40 | func (v *VideoDB) ClearDone(before time.Time) (err error) { 41 | 42 | stmt, err := v.db.Prepare("delete from done where UpTime 0 && err == nil { 80 | failcount := 0 81 | err := v.db.QueryRow("select failcount FROM undone WHERE viewkey=?", vi.ViewKey).Scan(&failcount) 82 | if err == nil { 83 | if failcount >= 3 { 84 | //del item 85 | stmt, err := v.db.Prepare("delete from undone where viewkey=?") 86 | if err == nil { 87 | stmt.Exec(vi.ViewKey) 88 | stmt.Close() 89 | continue 90 | } 91 | } else { 92 | //update item 93 | stmt, err := v.db.Prepare("update undone set failcount=? 
where viewkey=?") 94 | if err == nil { 95 | stmt.Exec(failcount+1, vi.ViewKey) 96 | stmt.Close() 97 | continue 98 | } 99 | } 100 | } 101 | } 102 | 103 | fails = append(fails, vi) 104 | } 105 | 106 | if len(done) > 0 { 107 | var keys []string 108 | for _, vi := range done { 109 | keys = append(keys, strconv.Quote(vi.ViewKey)) 110 | } 111 | 112 | sql := fmt.Sprintf("DELETE from undone WHERE viewkey IN (%s)", strings.Join(keys, ",")) 113 | 114 | v.db.Exec(sql) 115 | } 116 | 117 | return 118 | } 119 | 120 | func (v *VideoDB) GetUD() (undone []*catch.VideoInfo) { 121 | 122 | rows, err := v.db.Query("SELECT viewkey, title, owner, UpTime FROM undone") 123 | if err != nil { 124 | return 125 | } 126 | defer rows.Close() 127 | for rows.Next() { 128 | vi := catch.VideoInfo{} 129 | err := rows.Scan(&vi.ViewKey, &vi.Title, &vi.Owner, &vi.UpTime) 130 | if err == nil { 131 | undone = append(undone, &vi) 132 | } 133 | } 134 | 135 | return 136 | } 137 | 138 | func OpenVDB(filename string) (vdb *VideoDB, err error) { 139 | vdb = new(VideoDB) 140 | vdb.db, err = sql.Open("sqlite3", filename) 141 | if err == nil { 142 | 143 | sql_table := ` 144 | CREATE TABLE IF NOT EXISTS "done" ( 145 | "viewkey" VARCHAR(64) PRIMARY KEY, 146 | "title" VARCHAR(64) NULL, 147 | "owner" VARCHAR(64) NULL, 148 | "UpTime" TIMESTAMP default (datetime('now', 'localtime')) 149 | ); 150 | CREATE TABLE IF NOT EXISTS "undone" ( 151 | "viewkey" VARCHAR(64) PRIMARY KEY, 152 | "title" VARCHAR(64) NULL, 153 | "owner" VARCHAR(64) NULL, 154 | "UpTime" TIMESTAMP default (datetime('now', 'localtime')), 155 | "failcount" INTEGER NOT NULL DEFAULT 0 156 | ); 157 | ` 158 | _, err = vdb.db.Exec(sql_table) 159 | return 160 | } 161 | 162 | return nil, err 163 | } 164 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module spider91 2 | 3 | go 1.15 4 | 5 | require ( 6 | github.com/PuerkitoBio/goquery 
v1.6.1 7 | github.com/chromedp/cdproto v0.0.0-20210713064928-7d28b402946a 8 | github.com/chromedp/chromedp v0.7.4 9 | github.com/mattn/go-sqlite3 v1.14.6 10 | github.com/robfig/cron/v3 v3.0.1 11 | github.com/yanyiwu/gojieba v1.1.2 12 | gopkg.in/yaml.v2 v2.4.0 13 | ) 14 | -------------------------------------------------------------------------------- /mailConfig.yaml: -------------------------------------------------------------------------------- 1 | user : sender@126.com 2 | password : XXXXXXXXXXXX 3 | host : smtp.126.com:25 4 | addr : receiver@qq.com -------------------------------------------------------------------------------- /mailSend/mailSend.go: -------------------------------------------------------------------------------- 1 | package mailSend 2 | 3 | import ( 4 | "fmt" 5 | yaml "gopkg.in/yaml.v2" 6 | "io/ioutil" 7 | "log" 8 | "net/smtp" 9 | "strings" 10 | ) 11 | 12 | type MailInfo struct { 13 | User string `yaml:"user"` 14 | Password string `yaml:"password"` 15 | Host string `yaml:"host"` 16 | Addr string `yaml:"addr"` 17 | } 18 | 19 | func SendMailByYaml(subject, content, mailtype string) error { 20 | 21 | conf := new(MailInfo) 22 | yamlFile, err := ioutil.ReadFile("./save/mailConfig.yaml") 23 | if err != nil { 24 | yamlFile, err = ioutil.ReadFile("mailConfig.yaml") 25 | } 26 | err = yaml.Unmarshal(yamlFile, conf) 27 | // err = yaml.Unmarshal(yamlFile, &resultMap) 28 | if err != nil { 29 | log.Println("can't get mail config!!!") 30 | return err 31 | } 32 | 33 | hp := strings.Split(conf.Host, ":") 34 | auth := smtp.PlainAuth("", conf.User, conf.Password, hp[0]) 35 | var content_type string 36 | if mailtype == "html" { 37 | content_type = "Content-Type: text/" + mailtype + "; charset=UTF-8" 38 | } else { 39 | content_type = "Content-Type: text/plain" + "; charset=UTF-8" 40 | } 41 | send_to := strings.Split(conf.Addr, ";") 42 | rfc822_to := strings.Join(send_to, ",") 43 | 44 | body := ` 45 | 46 | 47 |

48 | "%s" 49 |

50 | 51 | 52 | ` 53 | body = fmt.Sprintf(body, content) 54 | 55 | msg := []byte("To: " + rfc822_to + "\r\nFrom: " + conf.User + ">\r\nSubject: " + subject + "\r\n" + content_type + "\r\n\r\n" + body) 56 | 57 | err = smtp.SendMail(conf.Host, auth, conf.User, send_to, msg) 58 | return err 59 | } 60 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "github.com/robfig/cron/v3" 7 | "gopkg.in/yaml.v2" 8 | "io" 9 | "io/ioutil" 10 | "log" 11 | "math" 12 | "os" 13 | "path/filepath" 14 | "spider91/catch" 15 | "spider91/doneDB" 16 | "spider91/mailSend" 17 | "spider91/score" 18 | "strconv" 19 | "strings" 20 | "time" 21 | ) 22 | 23 | type proxyInfo struct { 24 | ProxyUrls []string `yaml:"ProxyUrls,flow"` 25 | } 26 | 27 | func weeklyFunc(proxyUrls []string) func() { 28 | proxyUrls = append([]string{}, proxyUrls...) 29 | return func() { 30 | 31 | log.Println("Start weekly download and organize!!") 32 | 33 | var viAll []*catch.VideoInfo 34 | s := score.NewScore("./score/wordValue.txt", "./score/ownValue.txt") 35 | defer s.Free() 36 | 37 | for i := 1; i < 6; i++ { 38 | var vis []*catch.VideoInfo 39 | for _, pu := range proxyUrls { 40 | vis = catch.PageCrawl_chromedp("http://91porn.com/v.php?category=rf&viewtype=basic&page="+strconv.Itoa(i), pu) 41 | if len(vis) > 0 { 42 | break 43 | } 44 | } 45 | 46 | for _, vi := range vis { 47 | if time.Now().Sub(vi.UpTime) < time.Hour*24*7+time.Minute*10 { 48 | viAll = append(viAll, vi) 49 | } 50 | } 51 | } 52 | 53 | if len(viAll) > 0 { 54 | ddb, err := doneDB.OpenVDB("./save/videoDB.db") 55 | if err != nil { 56 | log.Println("videoDB.db open fail!!!", err) 57 | return 58 | } 59 | defer ddb.Close() 60 | 61 | viAll = ddb.DelRepeat(viAll) 62 | s.GradeSort(viAll) 63 | viAll = s.Above(viAll, 0) 64 | length := int(math.Min(20, float64(len(viAll)))) 65 | pickVi := 
append(viAll[:length], ddb.GetUD()...) 66 | savePath := time.Now().Format("./save/weekly_top_060102") 67 | 68 | path, _ := filepath.Abs(savePath) 69 | 70 | _, err = os.Stat(path) 71 | if os.IsNotExist(err) { 72 | if err = os.MkdirAll(path, os.ModePerm); err != nil { 73 | log.Println("savePath create failed!", err) 74 | return 75 | } 76 | } 77 | 78 | failVi := pickVi 79 | var succsVi []*catch.VideoInfo 80 | for _, pu := range proxyUrls { 81 | var ssc []*catch.VideoInfo 82 | failVi, ssc = catch.DownloadMany(failVi, 2, pu, path) 83 | succsVi = append(succsVi, ssc...) 84 | if len(failVi) == 0 { 85 | break 86 | } else { 87 | log.Printf("proxy:%s left %d items\n", pu, len(failVi)) 88 | } 89 | } 90 | ddb.AddDone(pickVi) 91 | ddb.UpdateUD(failVi, succsVi) 92 | log.Printf("Download weekly top total:%d, success %d, fail %d.\n", len(pickVi), len(succsVi), len(failVi)) 93 | for _, vi := range failVi { 94 | log.Println("Download Fail!", vi.Title, vi.ViewKey) 95 | } 96 | 97 | if len(failVi) > 5 { 98 | 99 | subject := fmt.Sprintf("Download total:%d, success %d, fail %d.\n", len(pickVi), len(pickVi)-len(failVi), len(failVi)) 100 | content := fmt.Sprintf("Download total:%d, success %d, fail %d.\n", len(pickVi), len(pickVi)-len(failVi), len(failVi)) 101 | 102 | err := mailSend.SendMailByYaml(subject, content, "html") 103 | if err != nil { 104 | log.Println("Send mail error!") 105 | log.Println(err) 106 | } else { 107 | log.Println("Send mail success!") 108 | } 109 | } 110 | 111 | } else { 112 | log.Println("No top page was crawled!!!") 113 | 114 | subject := "No page was crawled!!!" 115 | content := "No page was crawled!!!" 
116 | 117 | err := mailSend.SendMailByYaml(subject, content, "html") 118 | if err != nil { 119 | log.Println("Send mail error!") 120 | log.Println(err) 121 | } else { 122 | log.Println("Send mail success!") 123 | } 124 | } 125 | 126 | savePath := time.Now().Format("./save/weekly_060102") 127 | 128 | path, _ := filepath.Abs(savePath) 129 | 130 | _, err := os.Stat(path) 131 | if os.IsNotExist(err) { 132 | if err = os.MkdirAll(path, os.ModePerm); err != nil { 133 | log.Println("savePath create failed!", err) 134 | return 135 | } 136 | } 137 | fi, _ := ioutil.ReadDir("./save") 138 | 139 | for _, f := range fi { 140 | if f.IsDir() && (strings.Contains(f.Name(), "daily") || strings.Contains(f.Name(), "top")) { 141 | os.Rename(filepath.Join("./save", f.Name()), filepath.Join(path, f.Name())) 142 | } 143 | } 144 | } 145 | } 146 | 147 | func dailyFunc(proxyUrls []string) func() { 148 | proxyUrls = append([]string{}, proxyUrls...) 149 | return func() { 150 | log.Println("Start daily Download!!") 151 | var viAll []*catch.VideoInfo 152 | s := score.NewScore("./score/wordValue.txt", "./score/ownValue.txt") 153 | defer s.Free() 154 | CRAWL: 155 | for i := 1; i < 50; i++ { 156 | var vis []*catch.VideoInfo 157 | for _, pu := range proxyUrls { 158 | vis = catch.PageCrawl_chromedp("http://91porn.com/v.php?next=watch&page="+strconv.Itoa(i), pu) 159 | if len(vis) > 0 { 160 | break 161 | } 162 | } 163 | 164 | for _, vi := range vis { 165 | if time.Now().Sub(vi.UpTime) < time.Hour*30+time.Minute*10 { 166 | viAll = append(viAll, vi) 167 | } else { 168 | break CRAWL 169 | } 170 | } 171 | } 172 | 173 | if len(viAll) > 0 { 174 | ddb, err := doneDB.OpenVDB("./save/videoDB.db") 175 | if err != nil { 176 | log.Println("videoDB.db open fail!!!", err) 177 | return 178 | } 179 | defer ddb.Close() 180 | 181 | ddb.ClearDone(time.Now().Add(-time.Hour * 24 * 28)) 182 | 183 | viAll = ddb.DelRepeat(viAll) 184 | s.GradeSort(viAll) 185 | viAll = s.Above(viAll, 0) 186 | length := int(math.Min(30, 
float64(len(viAll)))) 187 | pickVi := append(viAll[:length], ddb.GetUD()...) 188 | savePath := time.Now().Format("./save/daily_060102") 189 | 190 | path, _ := filepath.Abs(savePath) 191 | 192 | _, err = os.Stat(path) 193 | if os.IsNotExist(err) { 194 | if err = os.MkdirAll(path, os.ModePerm); err != nil { 195 | log.Println("savePath create failed!", err) 196 | return 197 | } 198 | } 199 | 200 | failVi := pickVi 201 | var succsVi []*catch.VideoInfo 202 | for _, pu := range proxyUrls { 203 | var ssc []*catch.VideoInfo 204 | failVi, ssc = catch.DownloadMany(failVi, 2, pu, path) 205 | succsVi = append(succsVi, ssc...) 206 | if len(failVi) == 0 { 207 | break 208 | } else { 209 | log.Printf("proxy:%s left %d items\n", pu, len(failVi)) 210 | } 211 | } 212 | ddb.AddDone(pickVi) 213 | ddb.UpdateUD(failVi, succsVi) 214 | log.Printf("Download total:%d, success %d, fail %d.\n", len(pickVi), len(succsVi), len(failVi)) 215 | for _, vi := range failVi { 216 | log.Println("Download Fail!", vi.Title, vi.ViewKey) 217 | } 218 | 219 | if len(failVi) > 5 { 220 | 221 | subject := fmt.Sprintf("Download total:%d, success %d, fail %d.\n", len(pickVi), len(pickVi)-len(failVi), len(failVi)) 222 | content := fmt.Sprintf("Download total:%d, success %d, fail %d.\n", len(pickVi), len(pickVi)-len(failVi), len(failVi)) 223 | 224 | err := mailSend.SendMailByYaml(subject, content, "html") 225 | if err != nil { 226 | log.Println("Send mail error!") 227 | log.Println(err) 228 | } else { 229 | log.Println("Send mail success!") 230 | } 231 | } 232 | } else { 233 | log.Println("No page was crawled!!!") 234 | 235 | subject := "No page was crawled!!!" 236 | content := "No page was crawled!!!" 
237 | 238 | err := mailSend.SendMailByYaml(subject, content, "html") 239 | if err != nil { 240 | log.Println("Send mail error!") 241 | log.Println(err) 242 | } else { 243 | log.Println("Send mail success!") 244 | } 245 | 246 | } 247 | 248 | } 249 | } 250 | 251 | func nowFunc(days, count int, proxyUrls []string) func() { 252 | proxyUrls = append([]string{}, proxyUrls...) 253 | return func() { 254 | log.Printf("Start %d days Download, count %d!!\n", days, count) 255 | var viAll []*catch.VideoInfo 256 | s := score.NewScore("./score/wordValue.txt", "./score/ownValue.txt") 257 | defer s.Free() 258 | CRAWL: 259 | for i := 1; i < days*20; i++ { 260 | var vis []*catch.VideoInfo 261 | for _, pu := range proxyUrls { 262 | vis = catch.PageCrawl_chromedp("http://91porn.com/v.php?next=watch&page="+strconv.Itoa(i), pu) 263 | if len(vis) > 0 { 264 | break 265 | } 266 | } 267 | 268 | for _, vi := range vis { 269 | if time.Now().Sub(vi.UpTime) < time.Hour*24*time.Duration(days) { 270 | viAll = append(viAll, vi) 271 | } else { 272 | break CRAWL 273 | } 274 | } 275 | } 276 | 277 | if len(viAll) > 0 { 278 | ddb, err := doneDB.OpenVDB("./save/videoDB.db") 279 | if err != nil { 280 | log.Println("videoDB.db open fail!!!", err) 281 | return 282 | } 283 | defer ddb.Close() 284 | 285 | ddb.ClearDone(time.Now().Add(-time.Hour * 24 * 28)) 286 | 287 | viAll = ddb.DelRepeat(viAll) 288 | s.GradeSort(viAll) 289 | viAll = s.Above(viAll, 0) 290 | length := int(math.Min(float64(count), float64(len(viAll)))) 291 | pickVi := append(viAll[:length], ddb.GetUD()...) 
292 | savePath := time.Now().Format("./save/ext_060102") + fmt.Sprintf("_%ddays", days) 293 | 294 | path, _ := filepath.Abs(savePath) 295 | 296 | _, err = os.Stat(path) 297 | if os.IsNotExist(err) { 298 | if err = os.MkdirAll(path, os.ModePerm); err != nil { 299 | log.Println("savePath create failed!", err) 300 | return 301 | } 302 | } 303 | 304 | failVi := pickVi 305 | var succsVi []*catch.VideoInfo 306 | for _, pu := range proxyUrls { 307 | var ssc []*catch.VideoInfo 308 | failVi, ssc = catch.DownloadMany(failVi, 3, pu, path) 309 | succsVi = append(succsVi, ssc...) 310 | if len(failVi) == 0 { 311 | break 312 | } else { 313 | log.Printf("proxy:%s left %d items\n", pu, len(failVi)) 314 | } 315 | } 316 | ddb.AddDone(pickVi) 317 | ddb.UpdateUD(failVi, succsVi) 318 | log.Printf("Download total:%d, success %d, fail %d.\n", len(pickVi), len(succsVi), len(failVi)) 319 | for _, vi := range failVi { 320 | log.Println("Download Fail!", vi.Title, vi.ViewKey) 321 | } 322 | 323 | if len(failVi) > 5 { 324 | 325 | subject := fmt.Sprintf("Download total:%d, success %d, fail %d.\n", len(pickVi), len(pickVi)-len(failVi), len(failVi)) 326 | content := fmt.Sprintf("Download total:%d, success %d, fail %d.\n", len(pickVi), len(pickVi)-len(failVi), len(failVi)) 327 | 328 | err := mailSend.SendMailByYaml(subject, content, "html") 329 | if err != nil { 330 | log.Println("Send mail error!") 331 | log.Println(err) 332 | } else { 333 | log.Println("Send mail success!") 334 | } 335 | } 336 | } else { 337 | log.Println("No page was crawled!!!") 338 | 339 | subject := "No page was crawled!!!" 340 | content := "No page was crawled!!!" 341 | 342 | err := mailSend.SendMailByYaml(subject, content, "html") 343 | if err != nil { 344 | log.Println("Send mail error!") 345 | log.Println(err) 346 | } else { 347 | log.Println("Send mail success!") 348 | } 349 | 350 | } 351 | 352 | } 353 | } 354 | 355 | func dbFunc(proxyUrls []string) func() { 356 | proxyUrls = append([]string{}, proxyUrls...) 
357 | return func() { 358 | var viAll []*catch.VideoInfo 359 | 360 | ddb, err := doneDB.OpenVDB("./save/videoDB.db") 361 | if err != nil { 362 | log.Println("videoDB.db open fail!!!", err) 363 | return 364 | } 365 | defer ddb.Close() 366 | 367 | ddb.ClearDone(time.Now().Add(-time.Hour * 24 * 28)) 368 | 369 | viAll = ddb.DelRepeat(viAll) 370 | length := int(math.Min(30, float64(len(viAll)))) 371 | pickVi := append(viAll[:length], ddb.GetUD()...) 372 | savePath := time.Now().Format("./save/daily_060102") 373 | 374 | path, _ := filepath.Abs(savePath) 375 | 376 | _, err = os.Stat(path) 377 | if os.IsNotExist(err) { 378 | if err = os.MkdirAll(path, os.ModePerm); err != nil { 379 | log.Println("savePath create failed!", err) 380 | return 381 | } 382 | } 383 | 384 | failVi := pickVi 385 | var succsVi []*catch.VideoInfo 386 | for _, pu := range proxyUrls { 387 | var ssc []*catch.VideoInfo 388 | failVi, ssc = catch.DownloadMany(failVi, 3, pu, path) 389 | succsVi = append(succsVi, ssc...) 390 | if len(failVi) == 0 { 391 | break 392 | } else { 393 | log.Printf("proxy:%s left %d items\n", pu, len(failVi)) 394 | } 395 | } 396 | ddb.AddDone(pickVi) 397 | ddb.UpdateUD(failVi, succsVi) 398 | log.Printf("Download total:%d, success %d, fail %d.\n", len(pickVi), len(succsVi), len(failVi)) 399 | for _, vi := range failVi { 400 | log.Println("Download Fail!", vi.Title, vi.ViewKey) 401 | } 402 | 403 | if len(failVi) > 5 { 404 | 405 | subject := fmt.Sprintf("Download total:%d, success %d, fail %d.\n", len(pickVi), len(pickVi)-len(failVi), len(failVi)) 406 | content := fmt.Sprintf("Download total:%d, success %d, fail %d.\n", len(pickVi), len(pickVi)-len(failVi), len(failVi)) 407 | 408 | err := mailSend.SendMailByYaml(subject, content, "html") 409 | if err != nil { 410 | log.Println("Send mail error!") 411 | log.Println(err) 412 | } else { 413 | log.Println("Send mail success!") 414 | } 415 | } 416 | } 417 | } 418 | 419 | func main() { 420 | 421 | //ddb, err1 := 
doneDB.OpenVDB("./save/videoDB.db") 422 | //if err1 != nil { 423 | // panic(err1) 424 | //} 425 | //defer ddb.Close() 426 | // 427 | //viAll := catch.PageCrawl("http://91porn.com/index.php", "") 428 | ////ddb.AddDone(viAll) 429 | //ddb.UpdateUD(viAll, viAll[:10]) 430 | ////viAll = ddb.DelRepeat(viAll) 431 | //viAll = ddb.GetUD() 432 | //ddb.ClearDone(time.Now().Add(-time.Hour*24*2 + time.Hour*10)) 433 | //return 434 | 435 | //catch.PageCrawlOne("http://91porn.com/view_video.php?viewkey=8cd0148b3fe08d4a4c2f&page=3&viewtype=basic&category=rf", "http://192.168.4.66:10808") 436 | //return 437 | 438 | //subject := "No page was crawled!!!" 439 | //content := "No page was crawled!!!" 440 | // 441 | // mailSend.SendMailByYaml(subject, content, "html") 442 | // 443 | //return 444 | 445 | proxyUrl := "" 446 | pageUrl := "" 447 | savePath := "" 448 | threadNum := 5 449 | cpage := false 450 | now := -1 451 | nCount := -1 452 | db_left := false 453 | week := false 454 | 455 | flag.StringVar(&proxyUrl, "p", "", "proxy") 456 | flag.StringVar(&pageUrl, "u", "http://91porn.com/index.php", "page to crawl") 457 | flag.StringVar(&savePath, "o", "./save", "path to output") 458 | flag.IntVar(&threadNum, "t", 5, "threadcount") 459 | flag.BoolVar(&cpage, "c", false, "crawl whole page") 460 | flag.IntVar(&now, "now", -1, "n days favourite porn") 461 | flag.IntVar(&nCount, "n", -1, "Download quantity, used with now.") 462 | flag.BoolVar(&db_left, "db", false, "download left db porn") 463 | flag.BoolVar(&week, "week", false, "week favourite porn") 464 | 465 | flag.Parse() 466 | 467 | conf := new(proxyInfo) 468 | yamlFile, err := ioutil.ReadFile("./save/proxyConfig.yaml") 469 | if err != nil { 470 | yamlFile, err = ioutil.ReadFile("proxyConfig.yaml") 471 | } 472 | err = yaml.Unmarshal(yamlFile, conf) 473 | // err = yaml.Unmarshal(yamlFile, &resultMap) 474 | if err != nil { 475 | log.Println("can't get proxy config!!!") 476 | } 477 | 478 | proxyUrls := conf.ProxyUrls 479 | 480 | if cpage == 
true { 481 | path, _ := filepath.Abs(savePath) 482 | 483 | _, err := os.Stat(path) 484 | if os.IsNotExist(err) { 485 | if err = os.MkdirAll(path, os.ModePerm); err != nil { 486 | fmt.Println(err) 487 | } 488 | } 489 | if strings.Contains(pageUrl, "viewkey") { 490 | 491 | vi, err := catch.PageCrawlOne(pageUrl, proxyUrl) 492 | if err == nil { 493 | fmt.Println("Crawled one page, DownLoading", fmt.Sprintf("%s(%s).mp4", vi.Title, vi.Owner)) 494 | savePath := filepath.Join(path, fmt.Sprintf("%s(%s).mp4", vi.Title, vi.Owner)) 495 | vi.Download(savePath, threadNum, proxyUrl) 496 | } 497 | 498 | } else { 499 | viAll := catch.PageCrawl_chromedp(pageUrl, proxyUrl) 500 | 501 | catch.DownloadMany(viAll, threadNum, proxyUrl, path) 502 | } 503 | 504 | return 505 | } else if db_left == true { 506 | 507 | dbFunc(proxyUrls)() 508 | 509 | return 510 | } else if now > 0 && nCount > 0 { 511 | 512 | nowFunc(now, nCount, proxyUrls)() 513 | 514 | return 515 | } else if week == true { 516 | 517 | weeklyFunc(proxyUrls)() 518 | 519 | return 520 | } 521 | 522 | path, _ := filepath.Abs("./save/") 523 | 524 | _, err = os.Stat(path) 525 | if os.IsNotExist(err) { 526 | if err = os.MkdirAll(path, os.ModePerm); err != nil { 527 | log.Println("log path create failed!", err) 528 | return 529 | } 530 | } 531 | 532 | logFile, err := os.OpenFile("./save/spider91.log", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0777) 533 | if err != nil { 534 | panic(err) 535 | } 536 | defer logFile.Close() 537 | 538 | logW := io.MultiWriter(logFile, os.Stdout) 539 | log.SetOutput(logW) 540 | 541 | log.Println("spider91 startup!") 542 | 543 | c := cron.New(cron.WithSeconds()) 544 | 545 | c.AddFunc("00 00 09 * * 6", weeklyFunc(proxyUrls)) 546 | c.AddFunc("00 00 08 * * *", dailyFunc(proxyUrls)) 547 | 548 | c.Start() 549 | defer c.Stop() 550 | 551 | select {} 552 | } 553 | -------------------------------------------------------------------------------- /proxyConfig.yaml: 
-------------------------------------------------------------------------------- 1 | ProxyUrls : 2 | - "" 3 | - "socks5://192.168.3.254:1081" 4 | - "socks5://192.168.3.254:1082" 5 | - "socks5://192.168.3.254:1083" 6 | - "socks5://192.168.3.254:1084" 7 | - "socks5://192.168.3.254:1085" 8 | - "socks5://192.168.3.254:1086" 9 | - "socks5://192.168.3.254:1087" 10 | - "socks5://192.168.3.254:1088" 11 | - "socks5://192.168.3.254:1089" 12 | - "socks5://192.168.3.254:1090" 13 | - "socks5://192.168.3.254:1091" 14 | 15 | -------------------------------------------------------------------------------- /score/ownValue.txt: -------------------------------------------------------------------------------- 1 | ykqgg9988 -1000 2 | superlakersjames -1000 3 | taitailala -1000 4 | 舌尖上的生活 -100 5 | COUPLEOFMINS -50 6 | pppooozxc -1000 7 | RNG虎哥 -1000 8 | xMassageLovex -1000 9 | Silky -1000 10 | Abyss22b -1000 11 | nulixuexi -20 12 | asiangirlspick -50 13 | 色霸哥 -100 14 | 舌尖上的生活 -100 15 | foreverplayer -100 16 | ykqgg9988 -100 17 | Zyusn99_ -1000 18 | tfboss58 -1000 19 | lufeifei123123 -1000 20 | 露丝13 -1000 21 | feigeshaofu1987 -1000 22 | jijimax -1000 23 | 狼君~及时雨 -1000 24 | 三新途友 -1000 25 | youxirensheng369 -1000 26 | mu1nan3 -100 27 | ykqgg9988 -150 28 | jbxwz2580 -1000 29 | LovELolita -1000 30 | jbxwz2580 -1000 31 | 酒酿sama -1000 32 | uutrg.rr58 -1000 33 | hugoair147 -1000 34 | ccy1871 -1000 35 | 91有个好大哥 -100 36 | Jing_Yun -1000 37 | hyvr -1000 38 | fei771558401 -1000 39 | Kyara -1000 40 | Deca -1000 41 | asdfasdf168 99 42 | 萌神小玥 -1000 43 | mmmn753 -1000 44 | 粉红兔 -1000 45 | strauswang -1000 46 | winforu3535 -300 47 | 麻豆传媒映画 -1000 48 | aczz02081 -1000 49 | zhouxin827 -1000 50 | hongyanyixiao -1000 51 | jueshi5921 99 52 | lmzzyn568 -1000 53 | zhou8878 -1000 54 | swfbxn106 -1000 55 | 韦小宝呀 -200 56 | mmmn7534 -1000 57 | 何先生0618 -500 58 | 盛世美景 -300 59 | 玩物上志 -300 60 | 郁金香没有你的浴巾香 -500 61 | 西门庆在91 -500 62 | 李沫生的春天 -500 63 | 拆二代Cc -500 64 | 91探花花总 100 65 | s2752 -500 66 | 91大杨哥 100 67 | 失眠飞行 -500 
68 | 甜心奈奈 -500 69 | 星球大战三百回合 -500 70 | caramel2020 -500 71 | foottoofiob -500 72 | 蜜桃影像传媒 -500 73 | OF杂货铺 -500 74 | xbb2022 -500 75 | 杨五郎 -500 76 | ai美乳 -500 77 | 深情小吕布 -500 78 | xgqbyl -1000 79 | 匿名 -1000 80 | 利达LiD -1000 81 | Lewd_b -1000 82 | Kboy_Lee -1000 83 | Vritra -1000 84 | 叉叉酱 -2000 85 | wexlerarcheyl -2000 86 | 91探花泽泽 -1000 87 | 布隆的爱 -1000 88 | Timepasserby -1000 89 | 是性奴小雨喔 -1000 90 | 痞幼哥 -1000 91 | 91大奶牛 -1000 92 | 超爱丝袜奶茶 -1000 93 | ASSFUCKASS -1000 94 | jiklme31kaosi -200 95 | wshinibaba2022 -200 96 | Miki002002 -500 97 | eminembai -500 98 | 隔壁男同学 -500 99 | 岛城先生 -200 100 | woainigr001 -500 101 | 美乳小艾 -500 102 | 宝贝今天还要么 -500 103 | 驹驹表哥 -500 104 | wodezipai -500 105 | 海绵体Baby -500 106 | 色魔老实人 -300 107 | 狂射丝袜脚 -500 108 | kyqipai123 -300 109 | 小小八戒 -300 110 | 91隔壁老W -500 111 | 丝袜主任 -500 112 | 九九叔叔i -500 113 | Sqtrayer -500 114 | 91斯文禽兽 -500 115 | xxuss33 -500 116 | lolalol -300 117 | woshidadiao -500 118 | LuckyBOBO -500 119 | lhc1802 -500 120 | hopoe710 -500 121 | Strawberry-doll -500 122 | 喜欢与多人性交 -500 123 | qwq17717 -500 124 | MetteTalon -500 125 | 橘酱Misa -500 126 | 91JACK船长 -500 127 | 八十万涩逼总教头 -500 128 | Sexbaby1 -500 129 | GG陪玩 -500 130 | Dark -500 131 | 我叫山雞雞巴的雞 -500 132 | Alox -500 133 | Sextoy -500 134 | mxl0333 -500 135 | 长战将军 -500 136 | 91小帅BBB -500 137 | liguangzivipa -500 138 | pprlv -200 139 | 二师兄八戒哥 -200 140 | yangyangfuqi -500 -------------------------------------------------------------------------------- /score/score.go: -------------------------------------------------------------------------------- 1 | package score 2 | 3 | import ( 4 | "fmt" 5 | "github.com/yanyiwu/gojieba" 6 | "io/ioutil" 7 | "sort" 8 | "spider91/catch" 9 | "strconv" 10 | "strings" 11 | ) 12 | 13 | type Score struct { 14 | jieba *gojieba.Jieba 15 | keyValue map[string]int 16 | ownValue map[string]int 17 | } 18 | 19 | func NewScore(keyFile, ownFile string) *Score { 20 | mapKv := map[string]int{} 21 | mapOv := map[string]int{} 22 | x := gojieba.NewJieba() 23 | 24 | 
data, err := ioutil.ReadFile(keyFile) 25 | if err != nil { 26 | fmt.Println("keyFile read fail:", err) 27 | } 28 | 29 | owndata, err := ioutil.ReadFile(ownFile) 30 | if err != nil { 31 | fmt.Println("ownFile read fail:", err) 32 | } 33 | 34 | for _, line := range strings.Split(string(data), "\n") { 35 | strs := strings.Fields(line) 36 | if len(strs) == 2 { 37 | v, err := strconv.Atoi(strs[1]) 38 | if err != nil || v > 100 { 39 | fmt.Println("wrong key value format!", strs) 40 | continue 41 | } 42 | mapKv[strs[0]] = v 43 | x.AddWord(strs[0]) 44 | } else { 45 | fmt.Println("wrong key value format!", strs) 46 | } 47 | 48 | } 49 | 50 | for _, line := range strings.Split(string(owndata), "\n") { 51 | strs := strings.Fields(line) 52 | if len(strs) == 2 { 53 | v, err := strconv.Atoi(strs[1]) 54 | if err != nil || v > 100 { 55 | fmt.Println("wrong own value format!", strs) 56 | continue 57 | } 58 | mapOv[strs[0]] = v 59 | } else { 60 | fmt.Println("wrong own value format!", strs) 61 | } 62 | } 63 | 64 | return &Score{x, mapKv, mapOv} 65 | } 66 | 67 | func (s *Score) Free() { 68 | s.jieba.Free() 69 | } 70 | 71 | func (s *Score) Grade(info *catch.VideoInfo) float64 { 72 | 73 | words := s.jieba.Cut(info.Title, true) 74 | var titleScore, duraScore, ownScore float64 75 | for _, w := range words { 76 | titleScore += float64(s.keyValue[w]) 77 | } 78 | 79 | duraScore = 10.0 * info.Vdurat 80 | if duraScore > 100 { 81 | duraScore = 100 82 | } 83 | 84 | ownScore = float64(s.ownValue[info.Owner]) 85 | 86 | finalScore := 0.4*titleScore + 0.4*duraScore + 0.2*ownScore 87 | 88 | return finalScore 89 | } 90 | 91 | func (s *Score) GradeSort(vis []*catch.VideoInfo) { 92 | for _, vi := range vis { 93 | vi.Score = s.Grade(vi) 94 | } 95 | sort.Sort(sort.Reverse(catch.ViSlice(vis))) 96 | } 97 | 98 | func (s *Score) Above(vis []*catch.VideoInfo, score float64) (above []*catch.VideoInfo) { 99 | for _, vi := range vis { 100 | if vi.Score > score { 101 | above = append(above, vi) 102 | } 103 | } 104 
| return 105 | } 106 | -------------------------------------------------------------------------------- /score/wordValue.txt: -------------------------------------------------------------------------------- 1 | 露脸 100 2 | 内射 100 3 | 无套 100 4 | 無套 100 5 | 强推 30 6 | 中出 100 7 | 舞蹈 35 8 | 幼教 35 9 | 矜持 20 10 | 嫖 20 11 | 少婦 20 12 | 外圍 20 13 | 女神 45 14 | 极品 50 15 | 少妇 20 16 | 制服 33 17 | 腿 50 18 | 美乳 50 19 | 性感 50 20 | 沙發 20 21 | 極品 50 22 | 美女 50 23 | 口爆 20 24 | 白皙 60 25 | 尤物 60 26 | 高潮 50 27 | 大奶 50 28 | 紅 25 29 | 妹 60 30 | 巨乳 60 31 | 完美 60 32 | 酒店 20 33 | 身材 50 34 | E奶 65 35 | 貌美 45 36 | 学妹 50 37 | 騎乘 15 38 | 呻吟 25 39 | 真实 70 40 | 偷情 50 41 | 萌妹 70 42 | 对白 65 43 | 人妻 50 44 | 辣妹 50 45 | 姐 60 46 | 美腿 65 47 | 粉嫩 65 48 | 粉红 65 49 | 漂亮 30 50 | 女上位 60 51 | 喜欢 40 52 | 高三 65 53 | 挑逗 40 54 | 学生妹 65 55 | 白虎 70 56 | 援交 65 57 | 連幹 25 58 | 高颜值 60 59 | 颜值 60 60 | 探花 60 61 | 淫荡 50 62 | 大学生 60 63 | 女大学生 60 64 | 大奶妹 65 65 | 騷貨 40 66 | 做爱 40 67 | 马尾 70 68 | 双马尾 70 69 | 可爱 50 70 | 丰乳 60 71 | 魔鬼身材 50 72 | 過夜 40 73 | 抬腿 40 74 | 合集 40 75 | 陰毛 40 76 | 兩炮 40 77 | 长腿 60 78 | 骚货 40 79 | 主动 55 80 | 萌妹 60 81 | 一字 65 82 | 內射 100 83 | 正妹 60 84 | 校花 55 85 | 反差 55 86 | 闺蜜 60 87 | 清纯 60 88 | 高顏值 60 89 | 学生 80 90 | 出轨 50 91 | 风骚 40 92 | 小美女 50 93 | 勾引 55 94 | 粉 70 95 | 微露脸 90 96 | 害羞 50 97 | 按摩 70 98 | 健身 70 99 | 对话 70 100 | 大一 70 101 | 大二 70 102 | 大三 70 103 | 出差 45 104 | 短发 60 105 | 姐姐 65 106 | 學妹 65 107 | 双飞 50 108 | 护士 65 109 | 打电话 60 110 | 乖巧 50 111 | 空姐 65 112 | 嫂子 50 113 | 里面 78 114 | 年轻 68 115 | 蜂腰 70 116 | 忍不住 70 117 | 同事 65 118 | 高挑 65 119 | 娇喘 40 120 | 性欲 35 121 | 幼师 75 122 | 熟妇 35 123 | 约炮 30 124 | 已婚 40 125 | 小护士 50 126 | 爆乳 80 127 | 处女 90 128 | 分手 56 129 | 怀孕 93 130 | 高中生 90 131 | 绝美 30 132 | 高考 85 133 | 满足 30 134 | 偷情 30 135 | 顶不住 35 136 | 文静 60 137 | 受不了 50 138 | 初次 50 139 | 小姨 50 140 | 表妹 85 141 | 娇妻 65 142 | 初中生 88 143 | 毕业 88 144 | 同意 50 145 | 子宫 75 146 | 破处 90 147 | 受孕 90 148 | 大四 85 149 | 女同事 75 150 | 射满 80 151 | 博士 85 152 | 无毛 80 153 | 小姐 65 154 | 排卵期 99 155 | 排卵 99 156 | 产后 85 157 | 
#!/bin/sh
# Update the checkout in /root/spider91 to the latest upstream revision,
# rebuild the binary and restart the supervised spider91 process.
#
# Stop at the first failing step: a failed cd must not run git pull in the
# wrong directory, and a failed pull or build must not trigger a restart.
set -e

cd /root/spider91/
git pull
go build
supervisorctl restart spider91