├── url.txt
├── config.toml
├── go.mod
├── makefile
├── README.md
├── .github
    └── workflows
    │   └── go.yml
├── template
    ├── template1.html
    └── template2.html
├── LICENSE
├── .gitignore
├── template.go
├── main.go
├── go.sum
├── post_processing.go
├── fetch.go
├── parse_test.go
├── type.go
└── parse.go


/url.txt:
--------------------------------------------------------------------------------
1 | https://tieba.baidu.com/p/7201761174
2 | http://tieba.baidu.com/mo/m?kz=6212415344
3 | file:///example/test0.json?tid=6212415344


--------------------------------------------------------------------------------
/config.toml:
--------------------------------------------------------------------------------
 1 | numFetcher = 10
 2 | numParser = 50
 3 | numRenderer = 5
 4 | # "template1.html": 最简输出模板
 5 | # "template2.html": 替换为高分辨率图片
 6 | templateName = "template1.html"
 7 | retryPeriod = 10
 8 | highResImage = true
 9 | storeExternalResource = true
10 | # 人工处理百度安全认证
11 | userAgent = ""
12 | cookieString = ""
13 | 
14 | # 显示用户昵称
15 | showNickname = true
16 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/hjhee/tiebaSpider
 2 | 
 3 | go 1.23
 4 | 
 5 | toolchain go1.23.1
 6 | 
 7 | require (
 8 | 	github.com/PuerkitoBio/goquery v1.10.0
 9 | 	github.com/fsnotify/fsnotify v1.7.0
10 | 	github.com/pelletier/go-toml v1.9.5
11 | 	golang.org/x/net v0.30.0
12 | )
13 | 
14 | require (
15 | 	github.com/andybalholm/cascadia v1.3.2 // indirect
16 | 	golang.org/x/sys v0.26.0 // indirect
17 | )
18 | 


--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
 1 | .DEFAULT_GOAL := build
 2 |
 3 | # GITVER is expanded by the shell inside each recipe (note the backticks).
 4 | GITVER = `git describe --tags HEAD`
 5 |
 6 | # None of these targets produces a file named after itself.
 7 | .PHONY: build clean win git-tree-check
 8 |
 9 | build:
10 | 	@go build -ldflags "-X main.version=${GITVER}"
11 |
12 | clean:
13 | 	@go clean
14 |
15 | # $(shell ...) is needed to actually run git: a bare $(git diff --stat) is a
16 | # reference to an undefined variable and therefore always empty.
17 | git-tree-check:
18 | ifneq ($(shell git diff --stat),)
19 | 	$(warning "git tree is not clean")
20 | endif
21 |
22 | win: git-tree-check
23 | 	@echo ver: ${GITVER}
24 | 	@GOOS="windows" go build -ldflags "-X main.version=${GITVER}"
25 | 	@zip win64.zip template/*.html tiebaSpider.exe LICENSE README.md url.txt config.toml
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # tiebaSpider
 2 | 
 3 | 程序获取百度贴吧帖子的所有评论，包括所有楼中楼，以HTML和JSON为格式保存到本地，同时合并所有楼层连续、发帖人相同帖子方便阅读。
 4 | 
 5 | 需要获取的帖子在`url.txt`中逐行指定。程序读取程序所在目录下的文件`url.txt`获取贴吧URL，逐行爬取URL指向的帖子。除了http协议的URL之外还支持file协议，file协议格式参考`url.txt`已有的URL。此功能主要用于验证程序功能或者调整HTML模板样式。所有已提取的帖子将命名为`file_{帖子主题}.{json,html}`保存至程序所在目录下的`output`文件夹。若开启了本地保存图片功能，程序会把已获取的资源保存到`res_{帖子主题}`文件夹下。
 6 | 
 7 | ## 特点
 8 | 
 9 | 程序采用Go语言编写，利用goroutine同时获取、解析和渲染页面，各类goroutine的数量可以在`config.toml`文件调整。
10 | 
11 | - 支持所有楼中楼评论
12 | - 支持访问WAP版贴吧链接
13 | 
14 | 此外还可以通过配置文件开启如下功能:
15 | 
16 | - 切换模板以设定输出HTML样式
17 | - 图片链接替换为高清原图
18 | - 本地保存图片
19 | - 设定Cookie和User-Agent处理安全认证
20 | 
21 | ## 模板
22 | 
23 | - 保存的HTML格式文件由`template/template1.html`的HTML模板定义。可以改写该文件以调整生成的HTML文件，从而美化界面或者嵌入Javascript脚本实现根据发帖人筛选帖子，比如只看楼主等自定义功能。模板的所有可指定的数据参考`type.go`的`TemplateField`定义，模板语法参考go官方文档。
24 | 
25 | - `template/template2.html`演示了如何利用模板文件通过javascript程序替换缩略图为高分辨率图片的链接。
26 | 


--------------------------------------------------------------------------------
/.github/workflows/go.yml:
--------------------------------------------------------------------------------
 1 | name: Go
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ master ]
 6 |   pull_request:
 7 |     branches: [ master ]
 8 | 
 9 | jobs:
10 | 
11 |   build:
12 |     if: github.repository == 'hjhee/tiebaSpider'
13 |     name: Build
14 |     runs-on: ubuntu-latest
15 |     steps:
16 | 
17 |       - name: Set up Go
18 |         uses: actions/setup-go@v5
19 |         with:
20 |           go-version: '>=1.23.1'
21 | 
22 |       - name: Check out code into the Go module directory
23 |         uses: actions/checkout@v4
24 | 
25 |       - name: Get dependencies
26 |         run: |
27 |           go get -v -t -d ./...
28 |           if [ -f Gopkg.toml ]; then
29 |               curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh
30 |               dep ensure
31 |           fi
32 | 
33 |       - name: Build
34 |         run: GOOS=windows GOARCH=amd64 go build -v .
35 | 
36 |       - name: Create build artifacts
37 |         uses: actions/upload-artifact@v4
38 |         with:
39 |           name: win64
40 |           path: |
41 |             template/
42 |             LICENSE
43 |             README.md
44 |             url.txt
45 |             tiebaSpider.exe
46 |             config.toml


--------------------------------------------------------------------------------
/template/template1.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="zh">
 3 | 
 4 | <head>
 5 |   <title>{{.Title}}</title>
 6 |   <meta charset="UTF-8">
 7 |   <script>
 8 |     function toggleLzl(thread_id) {
 9 |       let x = document.getElementById('lzl' + thread_id);
10 |       if (x.style.display === 'none') {
11 |         x.style.display = 'block';
12 |       } else {
13 |         x.style.display = 'none';
14 |       }
15 |     }
16 |   </script>
17 |   <style>
18 |     .lzl {
19 |       border-style: solid;
20 |       border-width: thin;
21 |       border-color: #000000;
22 |     }
23 |   </style>
24 | </head>
25 | 
26 | <body>
27 |   <h1>{{.Title}}</h1>
28 |   <div><a href="{{.Url}}">{{.Url}}</a></div>
29 |   <hr />
30 |   {{range .Comments}}
31 |   <div>
32 |     <div>
33 |       <div>{{.Time}} #{{.PostNO}}: <b>{{.UserName}}</b></div>
34 |       <div>{{.Content}}</div>
35 |     </div>
36 |     {{if index $.Lzls .PostID}}
37 |     <button onclick="toggleLzl({{ .PostID }})">收起回复</button>
38 |     <div id="lzl{{.PostID}}" class="lzl">
39 |       {{$lzl := index $.Lzls .PostID }}
40 |       {{range $lzl.Info}}
41 |       <div>{{.Time}} <b>{{.UserName}}</b>: {{.Content}}</div>
42 |       {{end}}
43 |     </div>
44 |     {{end}}
45 |     <hr />
46 |   </div>
47 |   {{end}}
48 | </body>
49 | 
50 | </html>


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 2-Clause License
 2 | 
 3 | Copyright (c) 2017, 
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 


--------------------------------------------------------------------------------
/template/template2.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="zh">
 3 | 
 4 | <head>
 5 |   <title>{{.Title}}</title>
 6 |   <meta charset="UTF-8">
 7 |   <script>
 8 |     function toggleLzl(thread_id) {
 9 |       let x = document.getElementById('lzl' + thread_id);
10 |       if (x.style.display === 'none') {
11 |         x.style.display = 'block';
12 |       } else {
13 |         x.style.display = 'none';
14 |       }
15 |     }
16 |   </script>
17 |   <style>
18 |     .lzl {
19 |       border-style: solid;
20 |       border-width: thin;
21 |       border-color: #000000;
22 |     }
23 |   </style>
24 | </head>
25 | 
26 | <body>
27 |   <h1>{{.Title}}</h1>
28 |   <div><a href="{{.Url}}">{{.Url}}</a></div>
29 |   <hr />
30 |   {{range .Comments}}
31 |   <div>
32 |     <div>
33 |       <div>{{.Time}} #{{.PostNO}}: <b>{{.UserName}}</b></div>
34 |       <div>{{.Content}}</div>
35 |     </div>
36 |     {{if index $.Lzls .PostID}}
37 |     <button onclick="toggleLzl({{ .PostID }})">收起回复</button>
38 |     <div id="lzl{{.PostID}}" class="lzl">
39 |       {{$lzl := index $.Lzls .PostID }}
40 |       {{range $lzl.Info}}
41 |       <div>{{.Time}} <b>{{.UserName}}</b>: {{.Content}}</div>
42 |       {{end}}
43 |     </div>
44 |     {{end}}
45 |     <hr />
46 |   </div>
47 |   {{end}}
48 |   <script>
49 |     function substituteRawImageSrc() {
50 |       let imgs = document.getElementsByTagName('img');
51 |       for (let img of imgs) {
52 |         if (img.src.startsWith('file://')) {
53 |           img.src = img.src.replace("file://","https://");
54 |         }
55 |         let src = new URL(img.getAttribute('src'));
56 |         // http://imgsa.baidu.com/forum/w%3D580/sign=fa9bff75584a20a4311e3ccfa0539847/2ca3b6096b63f624d2ed9e649044ebf81b4ca366.jpg
57 |         if (src.hostname !== 'imgsa.baidu.com') {
58 |           continue;
59 |         }
60 |         let imageName = src.pathname.substring(src.pathname.lastIndexOf('/') + 1);
61 |         img.onerror = () => {
62 |           console.log(`image loading failed: ${imageName}`);
63 |         };
64 |         // console.log(imageName);
65 |         src.host = 'imgsrc.baidu.com';
66 |         src.pathname = '/forum/pic/item/' + imageName;
67 |         img.setAttribute('src', src.toString());
68 | 
69 |       }
70 |     }
71 |     substituteRawImageSrc();
72 |   </script>
73 | </body>
74 | 
75 | </html>


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Project specified
  2 | /.devcontainer/
  3 | /output
  4 | /tiebaSpider
  5 | /tiebaSpider.exe
  6 | /url.txt
  7 | /*.zip
  8 | 
  9 | ### VisualStudioCode template
 10 | .vscode/*
 11 | !.vscode/settings.json
 12 | !.vscode/tasks.json
 13 | !.vscode/launch.json
 14 | !.vscode/extensions.json
 15 | *.code-workspace
 16 | 
 17 | # Local History for Visual Studio Code
 18 | .history/
 19 | 
 20 | ### JetBrains template
 21 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
 22 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
 23 | 
 24 | # User-specific stuff
 25 | .idea/**/workspace.xml
 26 | .idea/**/tasks.xml
 27 | .idea/**/usage.statistics.xml
 28 | .idea/**/dictionaries
 29 | .idea/**/shelf
 30 | 
 31 | # Generated files
 32 | .idea/**/contentModel.xml
 33 | 
 34 | # Sensitive or high-churn files
 35 | .idea/**/dataSources/
 36 | .idea/**/dataSources.ids
 37 | .idea/**/dataSources.local.xml
 38 | .idea/**/sqlDataSources.xml
 39 | .idea/**/dynamic.xml
 40 | .idea/**/uiDesigner.xml
 41 | .idea/**/dbnavigator.xml
 42 | 
 43 | # Gradle
 44 | .idea/**/gradle.xml
 45 | .idea/**/libraries
 46 | 
 47 | # Gradle and Maven with auto-import
 48 | # When using Gradle or Maven with auto-import, you should exclude module files,
 49 | # since they will be recreated, and may cause churn.  Uncomment if using
 50 | # auto-import.
 51 | # .idea/artifacts
 52 | # .idea/compiler.xml
 53 | # .idea/jarRepositories.xml
 54 | # .idea/modules.xml
 55 | # .idea/*.iml
 56 | # .idea/modules
 57 | # *.iml
 58 | # *.ipr
 59 | 
 60 | # CMake
 61 | cmake-build-*/
 62 | 
 63 | # Mongo Explorer plugin
 64 | .idea/**/mongoSettings.xml
 65 | 
 66 | # File-based project format
 67 | *.iws
 68 | 
 69 | # IntelliJ
 70 | out/
 71 | 
 72 | # mpeltonen/sbt-idea plugin
 73 | .idea_modules/
 74 | 
 75 | # JIRA plugin
 76 | atlassian-ide-plugin.xml
 77 | 
 78 | # Cursive Clojure plugin
 79 | .idea/replstate.xml
 80 | 
 81 | # Crashlytics plugin (for Android Studio and IntelliJ)
 82 | com_crashlytics_export_strings.xml
 83 | crashlytics.properties
 84 | crashlytics-build.properties
 85 | fabric.properties
 86 | 
 87 | # Editor-based Rest Client
 88 | .idea/httpRequests
 89 | 
 90 | # Android studio 3.1+ serialized cache file
 91 | .idea/caches/build_file_checksums.ser
 92 | 
 93 | ### Go template
 94 | # Binaries for programs and plugins
 95 | *.exe
 96 | *.exe~
 97 | *.dll
 98 | *.so
 99 | *.dylib
100 | 
101 | # Test binary, built with `go test -c`
102 | *.test
103 | 
104 | # Output of the go coverage tool, specifically when used with LiteIDE
105 | *.out
106 | 
107 | # Dependency directories (remove the comment below to include it)
108 | # vendor/
109 | 


--------------------------------------------------------------------------------
/template.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"encoding/json"
  6 | 	"fmt"
  7 | 	"html/template"
  8 | 	"os"
  9 | 	"sort"
 10 | 	"sync"
 11 | )
 12 | 
 13 | // writeOutput creates (or truncates) filename and passes a buffered writer for
 14 | // it to callback. The buffer is flushed and the file closed before returning.
 15 | //
 16 | // The returned error wraps the underlying cause, so callers can inspect it
 17 | // with errors.Is / errors.As. It is non-nil when the file cannot be opened,
 18 | // when callback fails, or when flushing or closing the file fails.
 19 | func writeOutput(filename string, callback func(w *bufio.Writer) error) (err error) {
 20 | 	f, err := os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
 21 | 	if err != nil {
 22 | 		return fmt.Errorf("error creating output file %s: %w", filename, err)
 23 | 	}
 24 | 	// A failed Close can mean the data never reached the disk, so report it
 25 | 	// unless an earlier error is already being returned.
 26 | 	defer func() {
 27 | 		if cerr := f.Close(); cerr != nil && err == nil {
 28 | 			err = fmt.Errorf("error closing output file %s: %w", filename, cerr)
 29 | 		}
 30 | 	}()
 31 | 	w := bufio.NewWriter(f)
 32 | 	if err = callback(w); err != nil {
 33 | 		return fmt.Errorf("error writing to bufio %s, %w", filename, err)
 34 | 	}
 35 | 	if err = w.Flush(); err != nil {
 36 | 		return fmt.Errorf("error flushing output file %s: %w", filename, err)
 37 | 	}
 38 | 	return nil
 39 | }
 30 | 
 31 | // renderHTML starts config.NumRenderer workers that consume TemplateFields
 32 | // from tempc. Each field is first handed to postProcessing; when that reports
 33 | // an update the field's send flag is cleared and nothing is written for it in
 34 | // this pass. Otherwise the field is written to output/file_<name>.json and
 35 | // output/file_<name>.html, the latter rendered with tmpl.
 36 | //
 37 | // The first returned channel carries the path of every HTML file written, the
 38 | // second carries per-field errors. Both are closed after all workers have
 39 | // exited, which happens when tempc is closed or done is closed.
 40 | func renderHTML(done <-chan struct{}, pc *PageChannel, tempc <-chan *TemplateField, tmpl *template.Template) (chan string, chan error) {
 41 | 	outputc := make(chan string)
 42 | 	errc := make(chan error)
 43 |
 44 | 	// report delivers err to the consumer. It returns false when done is
 45 | 	// closed first, so a worker never blocks forever on an abandoned channel.
 46 | 	report := func(err error) bool {
 47 | 		select {
 48 | 		case errc <- err:
 49 | 			return true
 50 | 		case <-done:
 51 | 			return false
 52 | 		}
 53 | 	}
 54 |
 55 | 	// spawn renderers
 56 | 	var wg sync.WaitGroup
 57 | 	wg.Add(config.NumRenderer)
 58 | 	for i := 0; i < config.NumRenderer; i++ {
 59 | 		go func() {
 60 | 			defer wg.Done()
 61 | 			for {
 62 | 				select {
 63 | 				case <-done:
 64 | 					return
 65 |
 66 | 				case t, ok := <-tempc:
 67 | 					if !ok {
 68 | 						return // no new task from parser, exit
 69 | 					}
 70 |
 71 | 					updated, err := postProcessing(done, pc, t)
 72 | 					if err != nil {
 73 | 						if !report(err) {
 74 | 							return
 75 | 						}
 76 | 						continue
 77 | 					}
 78 | 					if updated {
 79 | 						t.mutex.Lock()
 80 | 						t.send = false
 81 | 						t.mutex.Unlock()
 82 | 						continue
 83 | 					}
 84 |
 85 | 					filename, err := renderTemplateField(t, tmpl)
 86 | 					if err != nil {
 87 | 						if !report(err) {
 88 | 							return
 89 | 						}
 90 | 						continue
 91 | 					}
 92 |
 93 | 					select {
 94 | 					case outputc <- filename: // report finished task
 95 | 					case <-done:
 96 | 						return
 97 | 					}
 98 | 					t.SetRendered(true)
 99 |
100 | 					if config.StoreExternalResource {
101 | 						pc.Del(1)
102 | 					}
103 | 				}
104 | 			}
105 | 		}()
106 | 	}
107 | 	go func() {
108 | 		wg.Wait()
109 | 		close(errc)
110 | 		close(outputc)
111 | 	}()
112 | 	return outputc, errc
113 | }
114 |
115 | // renderTemplateField sorts t.Comments by PostNO and every Lzl list by Index,
116 | // then writes t as JSON to output/file_<name>.json and renders tmpl to
117 | // output/file_<name>.html. It returns the path of the HTML file.
118 | func renderTemplateField(t *TemplateField, tmpl *template.Template) (string, error) {
119 | 	sort.Slice(t.Comments, func(a, b int) bool {
120 | 		return t.Comments[a].PostNO < t.Comments[b].PostNO
121 | 	})
122 | 	for _, v := range t.Lzls.Map {
123 | 		sort.Slice(v.Info, func(a, b int) bool {
124 | 			return v.Info[a].Index < v.Info[b].Index
125 | 		})
126 | 	}
127 |
128 | 	// no longer merge as requested in issue #8
129 | 	// t.Merge()
130 | 	// t.Unique()
131 |
132 | 	jsonName := fmt.Sprintf("output/file_%s.json", t.FileName())
133 | 	b, err := json.Marshal(t)
134 | 	if err != nil {
135 | 		return "", err
136 | 	}
137 | 	err = writeOutput(jsonName, func(w *bufio.Writer) error {
138 | 		_, err := w.Write(b)
139 | 		return err
140 | 	})
141 | 	if err != nil {
142 | 		return "", err
143 | 	}
144 |
145 | 	htmlName := fmt.Sprintf("output/file_%s.html", t.FileName())
146 | 	page := struct {
147 | 		Title    string
148 | 		Url      string
149 | 		Comments []*OutputField
150 | 		Lzls     map[uint64]*LzlComment
151 | 	}{Title: t.Title, Url: t.Url, Comments: t.Comments, Lzls: t.Lzls.Map}
152 | 	err = writeOutput(htmlName, func(w *bufio.Writer) error {
153 | 		if err := tmpl.Execute(w, page); err != nil {
154 | 			return fmt.Errorf("error executing template %s: %w", htmlName, err)
155 | 		}
156 | 		return nil
157 | 	})
158 | 	if err != nil {
159 | 		return "", err
160 | 	}
161 | 	return htmlName, nil
162 | }
131 | 


--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"html/template"
  6 | 	"log"
  7 | 	"math/rand"
  8 | 	"os"
  9 | 	"time"
 10 | )
 11 | 
 12 | // Package-level state shared by the pipeline stages.
 13 | var (
 14 | 	// config holds the settings that init loads from config.toml.
 15 | 	config *Config
 16 |
 17 | 	// version defaults to "debug"; the makefile overrides it at link time
 18 | 	// with -ldflags "-X main.version=...".
 19 | 	version = "debug"
 20 |
 21 | 	// outputTemplate is the parsed HTML template that main passes to
 22 | 	// renderHTML.
 23 | 	outputTemplate *template.Template
 24 | )
 17 | 
 18 | // logWriter is the output that init installs on the standard logger. It
 19 | // prefixes every message with a UTC timestamp and prints it to standard
 20 | // output.
 21 | type logWriter struct {
 22 | }
 23 |
 24 | // Write prints bytes to standard output, preceded by the current UTC time in
 25 | // "2006-01-02 15:04:05 " layout. On success it returns len(bytes), as the
 26 | // io.Writer contract requires, rather than the length of the decorated line.
 27 | func (writer logWriter) Write(bytes []byte) (int, error) {
 28 | 	stamp := time.Now().UTC().Format("2006-01-02 15:04:05 ")
 29 | 	if _, err := fmt.Print(stamp + string(bytes)); err != nil {
 30 | 		return 0, err
 31 | 	}
 32 | 	return len(bytes), nil
 33 | }
 24 | 
 25 | // init prepares the process-wide state before main runs: it routes the
 26 | // standard logger through logWriter, loads config.toml into config, makes sure
 27 | // the output directory exists and parses template/<config.TemplateName> into
 28 | // outputTemplate. A failure in any step terminates the program.
 29 | func init() {
 30 | 	// setup log time format
 31 | 	// https://stackoverflow.com/a/36140590/6091246
 32 | 	log.SetFlags(0)
 33 | 	log.SetOutput(new(logWriter))
 34 |
 35 | 	config = &Config{}
 36 | 	if err := config.Parse("config.toml"); err != nil {
 37 | 		log.Fatal(err)
 38 | 	}
 39 |
 40 | 	// MkdirAll succeeds when the directory already exists and, unlike a
 41 | 	// Stat + IsNotExist check, also reports every other failure (permissions,
 42 | 	// a regular file occupying the name, ...).
 43 | 	if err := os.MkdirAll("output", 0755); err != nil {
 44 | 		log.Fatalf("Error creating output folder: %v", err)
 45 | 	}
 46 |
 47 | 	// The trailing newline keeps the banner printed by main on its own line.
 48 | 	fmt.Fprintf(os.Stderr, "templateName: %s\n", config.TemplateName)
 49 |
 50 | 	rand.Seed(41)
 51 |
 52 | 	// outputTemplate is used to render output
 53 | 	outputTemplate = template.Must(template.New(config.TemplateName).Funcs(
 54 | 		template.FuncMap{"convertTime": func(ts int64) string {
 55 | 			// convertTime converts unix timestamp to the following format
 56 | 			// How do I format an unix timestamp to RFC3339 - golang?
 57 | 			// https://stackoverflow.com/a/21814954/6091246
 58 | 			// Convert UTC to "local" time - Go
 59 | 			// https://stackoverflow.com/a/45137855/6091246
 60 | 			// Using Functions Inside Go Templates
 61 | 			// https://www.calhoun.io/using-functions-inside-go-templates/
 62 | 			// Go template function
 63 | 			// https://stackoverflow.com/a/20872724/6091246
 64 | 			return time.Unix(ts, 0).In(time.Local).Format("2006-01-02 15:04")
 65 | 		},
 66 | 		}).ParseFiles("template/" + config.TemplateName))
 67 | }
 65 | 
 66 | // main wires the fetch -> parse -> render pipeline together and prints the
 67 | // errors and finished files reported by the stages. It returns once the error
 68 | // channels of all three stages have been closed.
 69 | func main() {
 70 | 	println("tiebaSpider")
 71 | 	println("version:", version)
 72 | 	println("project url: https://github.com/hjhee/tiebaSpider")
 73 |
 74 | 	// closing done to force all goroutines to quit
 75 | 	// Go Concurrency Patterns: Pipelines and cancellation
 76 | 	// https://blog.golang.org/pipelines
 77 | 	done := make(chan struct{})
 78 | 	defer close(done)
 79 |
 80 | 	err, errcConfig := config.Watch()
 81 | 	if err != nil {
 82 | 		panic(err)
 83 | 	}
 84 |
 85 | 	pc, errcFetch := fetchHTMLList(done, "url.txt")
 86 | 	tempc, errcParse := parseHTML(done, pc)
 87 | 	outputc, errcRender := renderHTML(done, pc, tempc, outputTemplate)
 88 |
 89 | 	// programme exits when all error channels are closed:
 90 | 	// breaking out of a select statement when all channels are closed
 91 | 	// https://stackoverflow.com/a/13666733/6091246
 92 | 	// A closed channel is replaced by nil so that select ignores it.
 93 | 	for errcFetch != nil || errcParse != nil || errcRender != nil {
 94 | 		select {
 95 | 		case err, ok := <-errcConfig:
 96 | 			if !ok {
 97 | 				log.Fatalf("[Config] Config watcher encountered error")
 98 | 			}
 99 | 			fmt.Fprintf(os.Stderr, "[Config] Config watcher encountered error: %v\n", err)
100 | 		case err, ok := <-errcFetch:
101 | 			if !ok {
102 | 				errcFetch = nil
103 | 				log.Printf("[Fetch] job done")
104 | 				continue
105 | 			}
106 | 			fmt.Fprintf(os.Stderr, "[Fetch] error: %v\n", err)
107 | 		case err, ok := <-errcParse:
108 | 			if !ok {
109 | 				errcParse = nil
110 | 				log.Printf("[Parse] job done")
111 | 				continue
112 | 			}
113 | 			fmt.Fprintf(os.Stderr, "[Parse] error: %v\n", err)
114 | 		case err, ok := <-errcRender:
115 | 			if !ok {
116 | 				errcRender = nil
117 | 				log.Printf("[Template] job done")
118 | 				continue
119 | 			}
120 | 			fmt.Fprintf(os.Stderr, "[Template] error: %v\n", err)
121 | 		case file, ok := <-outputc:
122 | 			if !ok {
123 | 				// A closed channel is always ready to receive; drop it so
124 | 				// the loop does not spin while other stages are running.
125 | 				outputc = nil
126 | 				continue
127 | 			}
128 | 			log.Printf("[Template] %s done\n", file)
129 | 		}
130 | 	}
131 | 	log.Printf("Job done!\n")
132 | }
131 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4=
 2 | github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4=
 3 | github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
 4 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
 5 | github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
 6 | github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
 7 | github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8=
 8 | github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c=
 9 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
10 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
11 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
12 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
13 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
14 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
15 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
16 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
17 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
18 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
19 | golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4=
20 | golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU=
21 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
22 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
23 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
24 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
25 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
26 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
27 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
28 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
29 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
30 | golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
31 | golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo=
32 | golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
33 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
34 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
35 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
36 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
37 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
38 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
39 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
40 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
41 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
42 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
43 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
44 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
45 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
46 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
47 | 


--------------------------------------------------------------------------------
/post_processing.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"fmt"
  6 | 	"html/template"
  7 | 	"log"
  8 | 	"net/url"
  9 | 	"path"
 10 | 	"sync/atomic"
 11 | 
 12 | 	"golang.org/x/net/html"
 13 | )
 14 | 
 15 | // postProcessing rewrites the src attribute of every <img> found in the
 16 | // comments of t, in up to two passes selected by the configuration:
 17 | //
 18 | //   - config.HighResImage: cvtHighResImageURL is applied to each src;
 19 | //   - config.StoreExternalResource: cvtLocalURL is applied to each src.
 20 | //
 21 | // It returns true as soon as a pass reports an update (a later pass is then
 22 | // not run in this call), and false when nothing changed or done is already
 23 | // closed. Errors from processTemplateContent are returned unchanged.
 24 | func postProcessing(done <-chan struct{}, pc *PageChannel, t *TemplateField) (bool, error) {
 25 | 	select {
 26 | 	case <-done:
 27 | 		return false, nil
 28 | 	default:
 29 | 	}
 30 | 	if config.HighResImage {
 31 | 		updated, err := processTemplateContent(done, pc, t, imgSrcWalker(cvtHighResImageURL))
 32 | 		if err != nil {
 33 | 			return false, err
 34 | 		}
 35 | 		if updated {
 36 | 			return true, nil
 37 | 		}
 38 | 	}
 39 | 	if config.StoreExternalResource {
 40 | 		updated, err := processTemplateContent(done, pc, t, imgSrcWalker(cvtLocalURL))
 41 | 		if err != nil {
 42 | 			return false, err
 43 | 		}
 44 | 		if updated {
 45 | 			return true, nil
 46 | 		}
 47 | 	}
 48 | 	return false, nil
 49 | }
 50 |
 51 | // imgSrcWalker returns a processTemplateContent callback that visits every
 52 | // <img> element below root, depth first, and replaces each src attribute with
 53 | // the value returned by convert. The callback reports whether convert flagged
 54 | // any attribute as updated; once done is closed it stops and reports false.
 55 | func imgSrcWalker(convert func(done <-chan struct{}, pc *PageChannel, t *TemplateField, src, prefix string) (string, bool)) func(done <-chan struct{}, pc *PageChannel, t *TemplateField, root *html.Node) bool {
 56 | 	var walk func(done <-chan struct{}, pc *PageChannel, t *TemplateField, root *html.Node) bool
 57 | 	walk = func(done <-chan struct{}, pc *PageChannel, t *TemplateField, root *html.Node) bool {
 58 | 		updated := false
 59 | 		for c := root.FirstChild; c != nil; c = c.NextSibling {
 60 | 			select {
 61 | 			case <-done:
 62 | 				return false
 63 | 			default:
 64 | 			}
 65 | 			if c.Type == html.ElementNode && c.Data == "img" {
 66 | 				for i, a := range c.Attr {
 67 | 					if a.Key == "src" {
 68 | 						newSrc, ud := convert(done, pc, t, a.Val, t.FileName())
 69 | 						updated = updated || ud
 70 | 						c.Attr[i].Val = newSrc
 71 | 					}
 72 | 				}
 73 | 			}
 74 | 			// Always descend, even when an update was already recorded.
 75 | 			ud := walk(done, pc, t, c)
 76 | 			updated = updated || ud
 77 | 		}
 78 | 		return updated
 79 | 	}
 80 | 	return walk
 81 | }
 85 | 
 86 | // processTemplateContent runs callback over the HTML of every comment and
 87 | // every Lzl entry of t and stores the re-rendered HTML back into the entry.
 88 | // t.mutex and t.Lzls.lock are held for the whole call.
 89 | //
 90 | // The returned bool is true when callback reported an update for at least one
 91 | // entry. An entry that cannot be processed is logged and keeps its previous
 92 | // content. If done is closed the function returns (false, nil) early. A
 93 | // non-nil error is returned only when the content template itself cannot be
 94 | // parsed.
 95 | func processTemplateContent(done <-chan struct{}, pc *PageChannel, t *TemplateField, callback func(done <-chan struct{}, pc *PageChannel, t *TemplateField, root *html.Node) bool) (bool, error) {
 96 | 	t.mutex.Lock()
 97 | 	t.Lzls.lock.Lock()
 98 | 	defer t.mutex.Unlock()
 99 | 	defer t.Lzls.lock.Unlock()
100 |
101 | 	// Every entry is rendered with the same one-field template: parse it once.
102 | 	contentTmpl, err := template.New("comment").Parse(`{{.Content}}`)
103 | 	if err != nil {
104 | 		return false, fmt.Errorf("failed to parse content template: %w", err)
105 | 	}
106 |
107 | 	// traverse renders data's Content, parses the result into a node tree,
108 | 	// lets callback edit the tree and returns the tree rendered back to HTML
109 | 	// together with callback's verdict.
110 | 	traverse := func(data any) (template.HTML, bool, error) {
111 | 		var buf bytes.Buffer
112 | 		if err := contentTmpl.Execute(&buf, data); err != nil {
113 | 			return "", false, fmt.Errorf("failed to execute content template: %w", err)
114 | 		}
115 | 		node, err := html.Parse(&buf)
116 | 		if err != nil {
117 | 			return "", false, fmt.Errorf("failed to parse html node: %w", err)
118 | 		}
119 | 		buf.Reset()
120 | 		ud := callback(done, pc, t, node)
121 | 		if err := html.Render(&buf, node); err != nil {
122 | 			return "", false, fmt.Errorf("failed to render html template: %w", err)
123 | 		}
124 | 		return template.HTML(buf.String()), ud, nil
125 | 	}
126 |
127 | 	updated := false
128 | 	for k := range t.Comments {
129 | 		select {
130 | 		case <-done:
131 | 			return false, nil
132 | 		default:
133 | 		}
134 | 		content, ud, err := traverse(&t.Comments[k])
135 | 		if err != nil {
136 | 			// Keep the previous content instead of replacing it with "".
137 | 			log.Printf("[parseExternalResource] failed to parse comment (ThreadID: %d, PostID: %d): %v", t.ThreadID, t.Comments[k].PostID, err)
138 | 			continue
139 | 		}
140 | 		t.Comments[k].Content = content
141 | 		updated = updated || ud
142 | 	}
143 | 	for k := range t.Lzls.Map {
144 | 		select {
145 | 		case <-done:
146 | 			return false, nil
147 | 		default:
148 | 		}
149 | 		for i := range t.Lzls.Map[k].Info {
150 | 			content, ud, err := traverse(&t.Lzls.Map[k].Info[i])
151 | 			if err != nil {
152 | 				log.Printf("[parseExternalResource] failed to parse lzlComment (ThreadID: %d, PostID: %d, Index: %d): %v", t.ThreadID, k, t.Lzls.Map[k].Info[i].Index, err)
153 | 				continue
154 | 			}
155 | 			t.Lzls.Map[k].Info[i].Content = content
156 | 			updated = updated || ud
157 | 		}
158 | 	}
159 | 	return updated, nil
160 | }
147 | 
148 | // cvtLocalURL maps the resource URL src to the relative path
149 | // res_<prefix>/<host>/<path>. When t.resMap.Put accepts that path (presumably
150 | // the first time it is seen for t - confirm against resMap), the original URL
151 | // is queued on pc.send so it gets downloaded to the path.
152 | //
153 | // It returns the local path and true, or src and false when src cannot be
154 | // parsed or has no scheme, which is how an already converted path looks.
155 | func cvtLocalURL(done <-chan struct{}, pc *PageChannel, t *TemplateField, src, prefix string) (string, bool) {
156 | 	u, err := url.Parse(src)
157 | 	if err != nil {
158 | 		return src, false
159 | 	}
160 | 	// url already converted
161 | 	if u.Scheme == "" {
162 | 		return src, false
163 | 	}
164 | 	// src comes from scraped HTML. Cleaning the rooted path resolves "." and
165 | 	// ".." segments, so the result can never climb out of res_<prefix>/; the
166 | 	// outer Clean does the same for an odd host such as "..".
167 | 	// NOTE(review): backslashes are not treated as separators here - confirm
168 | 	// whether that matters for the Windows build.
169 | 	rel := path.Clean("/" + u.Host + path.Clean("/"+u.Path))[1:]
170 | 	dst := fmt.Sprintf("res_%s/%s", prefix, rel)
171 | 	if t.resMap.Put(dst) {
172 | 		atomic.AddInt64(&t.resLeft, 1)
173 | 		pc.Add(1)
174 | 		// Send from a separate goroutine so the caller is not blocked by a
175 | 		// full channel; done aborts a send that is still pending.
176 | 		go func() {
177 | 			select {
178 | 			case pc.send <- &HTMLPage{URL: u, Type: HTMLExternalResource, Path: dst, ThreadID: t.ThreadID}:
179 | 			case <-done:
180 | 				return
181 | 			}
182 | 		}()
183 | 	}
184 | 	return dst, true
185 | }
175 | 
176 | func cvtHighResImageURL(done <-chan struct{}, pc *PageChannel, t *TemplateField, src, prefix string) (string, bool) {
177 | 	u, err := url.Parse(src)
178 | 	if err != nil {
179 | 		return src, false
180 | 	}
181 | 	if u.Host != "c.hiphotos.baidu.com" {
182 | 		return src, false
183 | 	}
184 | 	imageName := path.Base(u.Path)
185 | 	u.Host = "imgsrc.baidu.com"
186 | 	u.Path = fmt.Sprintf("/forum/pic/item/%s", imageName)
187 | 	dst := u.String()
188 | 	newSrc := dst
189 | 	return newSrc, true
190 | }
191 | 


--------------------------------------------------------------------------------
/fetch.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"fmt"
  6 | 	"io"
  7 | 	"log"
  8 | 	"net/http"
  9 | 	"net/url"
 10 | 	"os"
 11 | 	"regexp"
 12 | 	"strings"
 13 | 	"sync"
 14 | 	"sync/atomic"
 15 | 	"time"
 16 | )
 17 | 
 18 | func fetchHTMLList(done <-chan struct{}, filename string) (*PageChannel, <-chan error) {
 19 | 	feed := make(chan *HTMLPage, config.NumFetcher)
 20 | 	ret, retErr := spawnFetcher(done, feed)
 21 | 
 22 | 	pc := &PageChannel{send: feed, rec: ret}
 23 | 
 24 | 	errc := make(chan error)
 25 | 	go func() {
 26 | 		defer close(errc)
 27 | 		in, err := os.OpenFile(filename, os.O_RDONLY, 0644)
 28 | 		if err != nil {
 29 | 			errc <- fmt.Errorf("error reading url list: %v", err)
 30 | 			return
 31 | 		}
 32 | 		defer in.Close()
 33 | 		reader := bufio.NewReader(in)
 34 | 
 35 | 		validURL := regexp.MustCompile(`^/p/([0-9]+)$`) // example: ^/p/7201761174$
 36 | 		wapURL := regexp.MustCompile(`^/mo/m$`)         // example: "/mo/m?kz=7201761174"
 37 | 
 38 | 		// reading file line by line in go
 39 | 		// https://stackoverflow.com/a/41741702/6091246
 40 | 		// case:
 41 | 		// If you don't mind that the line could be very long (i.e. use a lot of RAM). It keeps the \n at the end of the string returned.
 42 | 		var line string
 43 | 		for isEOF := false; !isEOF; {
 44 | 			line, err = reader.ReadString('\n')
 45 | 			if err != nil {
 46 | 				isEOF = true
 47 | 			}
 48 | 			line = strings.TrimSpace(line)
 49 | 			if line == "" {
 50 | 				continue
 51 | 			}
 52 | 			u, err := url.Parse(strings.TrimSpace(line))
 53 | 			if err != nil {
 54 | 				log.Printf("[Fetch] Error parsing %s, skipping\n", line)
 55 | 				continue
 56 | 			}
 57 | 
 58 | 			var pageType HTMLType
 59 | 
 60 | 			if u.Scheme == "file" {
 61 | 				pageType = HTMLLocal
 62 | 				q := u.Query()
 63 | 				tid := q.Get("tid") // get file tid for TemplateMap key later
 64 | 				if tid == "" {
 65 | 					log.Printf("[Fetch] file path %s is missing tid field, skipping", u)
 66 | 					continue
 67 | 				}
 68 | 			} else {
 69 | 				if u.Host != "tieba.baidu.com" {
 70 | 					log.Printf("[Fetch] URL host %s is not Tieba, skipping", u)
 71 | 					continue
 72 | 				}
 73 | 
 74 | 				if match := validURL.MatchString(u.Path); match {
 75 | 					pageType = HTMLWebHomepage
 76 | 					// strip query from url
 77 | 					// URL Builder/Query builder in Go
 78 | 					// https://stackoverflow.com/a/26987017/6091246
 79 | 					u.RawQuery = ""
 80 | 				} else if match = wapURL.MatchString(u.Path); match {
 81 | 					pageType = HTMLWebWAPHomepage
 82 | 				} else {
 83 | 					log.Printf("[Fetch] %s is not a valid Tieba post URL, skipping", u)
 84 | 					continue
 85 | 				}
 86 | 			}
 87 | 
 88 | 			// log.Printf("[Fetch] Got new url from list: %v\n", u)
 89 | 
 90 | 			if config.StoreExternalResource {
 91 | 				pc.Add(1)
 92 | 			}
 93 | 
 94 | 			pc.Add(1)
 95 | 			select {
 96 | 			case pc.send <- &HTMLPage{URL: u, Type: pageType}:
 97 | 			case <-done:
 98 | 				return
 99 | 			}
100 | 		}
101 | 		pc.Inited()
102 | 	}()
103 | 
104 | 	// merge error chans
105 | 	// Go Concurrency Patterns: Pipelines and cancellation
106 | 	// https://blog.golang.org/pipelines
107 | 	errChan := make(chan error)
108 | 	go func() {
109 | 		defer close(errChan)
110 | 		for {
111 | 			if errc == nil && retErr == nil {
112 | 				return
113 | 			}
114 | 			select {
115 | 			case err, ok := <-errc:
116 | 				if !ok {
117 | 					errc = nil
118 | 					continue
119 | 				}
120 | 				errChan <- err
121 | 				return
122 | 
123 | 			case err, ok := <-retErr:
124 | 				if !ok {
125 | 					retErr = nil
126 | 					continue
127 | 				}
128 | 				errChan <- err
129 | 				return
130 | 			}
131 | 		}
132 | 	}()
133 | 
134 | 	return pc, errChan
135 | }
136 | 
137 | func fetcher(done <-chan struct{}, wg *sync.WaitGroup, jobsLeft *int64, ret chan<- *HTMLPage, jobs chan *HTMLPage) {
138 | 	defer wg.Done()
139 | 	for {
140 | 		select {
141 | 		case <-done:
142 | 			return
143 | 		case page, ok := <-jobs:
144 | 			if !ok {
145 | 				return
146 | 			}
147 | 			var err error
148 | 			switch page.Type {
149 | 			case HTMLLocal:
150 | 				err = fetchHTMLFromFile(page)
151 | 			default:
152 | 				err = fetchHTMLFromURL(page)
153 | 			}
154 | 			if err != nil {
155 | 				go func(page *HTMLPage) {
156 | 					select {
157 | 					case <-done:
158 | 						return
159 | 					case <-time.After(3 * time.Second):
160 | 						jobs <- page // add failed task back to jobs
161 | 					}
162 | 				}(page)
163 | 				log.Printf("[Fetch] error fetching %s, pause for 3s: %s\n", page.URL, err)
164 | 			} else {
165 | 				select {
166 | 				case ret <- page:
167 | 					atomic.AddInt64(jobsLeft, -1) // task done
168 | 				case <-done:
169 | 					return
170 | 				}
171 | 			}
172 | 		}
173 | 	}
174 | }
175 | 
176 | func spawnFetcher(done <-chan struct{}, jobs <-chan *HTMLPage) (<-chan *HTMLPage, <-chan error) {
177 | 	in := make(chan *HTMLPage, config.NumFetcher) // fetcher get tasks from in
178 | 	ret := make(chan *HTMLPage, config.NumParser) // send HTML content to parser
179 | 	errc := make(chan error)
180 | 
181 | 	jobsLeft := new(int64)
182 | 	chClosed := false
183 | 
184 | 	var wg sync.WaitGroup
185 | 	wg.Add(1)
186 | 	go func() {
187 | 		defer wg.Done()
188 | 		defer close(in)
189 | 		defer close(ret)
190 | 		for {
191 | 			if chClosed {
192 | 				if atomic.LoadInt64(jobsLeft) <= 0 {
193 | 					return // no more task left in channel in, exit
194 | 				}
195 | 				time.Sleep(time.Second) // check every second
196 | 				continue
197 | 			}
198 | 			select {
199 | 			case <-done:
200 | 				return
201 | 			case p, ok := <-jobs:
202 | 				if !ok {
203 | 					chClosed = true // parser sends no more jobs, time to exit
204 | 					continue
205 | 				}
206 | 				atomic.AddInt64(jobsLeft, 1) // add job to channel in
207 | 				in <- p
208 | 			}
209 | 		}
210 | 	}()
211 | 	for i := 0; i < config.NumFetcher; i++ {
212 | 		wg.Add(1)
213 | 		go fetcher(done, &wg, jobsLeft, ret, in)
214 | 	}
215 | 	go func() {
216 | 		wg.Wait()
217 | 		close(errc)
218 | 	}()
219 | 	return ret, errc
220 | }
221 | 
222 | func fetchHTMLFromURL(page *HTMLPage) error {
223 | 	req, err := http.NewRequest("GET", page.URL.String(), nil)
224 | 	if err != nil {
225 | 		return err
226 | 	}
227 | 	if config.UserAgent != "" {
228 | 		req.Header.Add("User-Agent", config.UserAgent)
229 | 	}
230 | 	if config.CookieString != "" {
231 | 		req.Header.Add("Cookie", config.CookieString)
232 | 	}
233 | 	client := &http.Client{}
234 | 	resp, err := client.Do(req)
235 | 	if err != nil {
236 | 		return err
237 | 	}
238 | 	bytes, err := io.ReadAll(resp.Body)
239 | 	if err != nil {
240 | 		return err
241 | 	}
242 | 	page.Content = bytes
243 | 	// page.Response = resp
244 | 	resp.Body.Close()
245 | 	return nil
246 | }
247 | 
248 | func fetchHTMLFromFile(page *HTMLPage) error {
249 | 	in, err := os.OpenFile(page.URL.Path, os.O_RDONLY, 0644)
250 | 	if err != nil {
251 | 		return fmt.Errorf("error reading file path from %s: %v", page.URL.Path, err)
252 | 	}
253 | 	defer in.Close()
254 | 	reader := bufio.NewReader(in)
255 | 	bytes, err := io.ReadAll(reader)
256 | 	if err != nil {
257 | 		return err
258 | 	}
259 | 	page.Content = bytes
260 | 	return nil
261 | }
262 | 


--------------------------------------------------------------------------------
/parse_test.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"net/url"
 5 | 	"strings"
 6 | 	"testing"
 7 | 
 8 | 	"github.com/PuerkitoBio/goquery"
 9 | )
10 | 
11 | var lzlTotalCommentTestString = `<li class="lzl_single_post j_lzl_s_p first_no_border" data-field='{&quot;spid&quot;:132237789491,&quot;user_name&quot;:&quot;\u4ed6\u4eec\u597d\u5435\u554a&quot;,&quot;portrait&quot;:&quot;tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA&quot;}' ><a rel="noopener" name="132237789491"></a>  <a rel="noopener" data-field='{&quot;un&quot;:&quot;\u4ed6\u4eec\u597d\u5435\u554a&quot;,&quot;id&quot;:&quot;tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?un=%E4%BB%96%E4%BB%AC%E5%A5%BD%E5%90%B5%E5%95%8A&ie=utf-8&id=tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA&fr=pb" username="他们好吵啊"><img  src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA"/></a><div class="lzl_cnt" data-field='{&quot;iconArr&quot;:[],&quot;free_flag&quot;:null}'><a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u4ed6\u4eec\u597d\u5435\u554a&quot;,&quot;id&quot;:&quot;tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA&quot;}' href="/home/main/?un=%E4%BB%96%E4%BB%AC%E5%A5%BD%E5%90%B5%E5%95%8A&ie=utf-8&id=tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA&fr=pb" target="_blank" username="他们好吵啊">终极闪耀赛罗✨</a>:<span class="lzl_content_main" data-username="">        回复 <a href="http://tieba.baidu.com/i/sys/jump?un=" onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username="" portrait="tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw" target="_blank" class="at">阿比酱最棒啦💖</a> :说起来纸片人搞饭圈这一套就nm离谱<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon16.png" >        </span><div class="lzl_content_reply"><span class="lzl_jb" style="display:none;"></span><span class="lzl_op_list j_lzl_o_l" style="display:none;"></span><span class="lzl_time">2020-5-17 22:37</span><a rel="noopener" href="#" 
class="lzl_s_r">回复</a></div></div></li><li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:132237796500,&quot;user_name&quot;:&quot;\u4ed6\u4eec\u597d\u5435\u554a&quot;,&quot;portrait&quot;:&quot;tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA&quot;}' ><a rel="noopener" name="132237796500"></a>  <a rel="noopener" data-field='{&quot;un&quot;:&quot;\u4ed6\u4eec\u597d\u5435\u554a&quot;,&quot;id&quot;:&quot;tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?un=%E4%BB%96%E4%BB%AC%E5%A5%BD%E5%90%B5%E5%95%8A&ie=utf-8&id=tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA&fr=pb" username="他们好吵啊"><img  src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA"/></a><div class="lzl_cnt" data-field='{&quot;iconArr&quot;:[],&quot;free_flag&quot;:null}'><a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u4ed6\u4eec\u597d\u5435\u554a&quot;,&quot;id&quot;:&quot;tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA&quot;}' href="/home/main/?un=%E4%BB%96%E4%BB%AC%E5%A5%BD%E5%90%B5%E5%95%8A&ie=utf-8&id=tb.1.479b0e40.FbTmCIoclxiBD5JouhBQMA&fr=pb" target="_blank" username="他们好吵啊">终极闪耀赛罗✨</a>:<span class="lzl_content_main" data-username="">        回复 <a href="http://tieba.baidu.com/i/sys/jump?un=" onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username="" portrait="tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw" target="_blank" class="at">阿比酱最棒啦💖</a> :算了，大佬打架我这萌新还是稍稍吧<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png" >        </span><div class="lzl_content_reply"><span class="lzl_jb" style="display:none;"></span><span class="lzl_op_list j_lzl_o_l" style="display:none;"></span><span class="lzl_time">2020-5-17 22:37</span><a rel="noopener" href="#" class="lzl_s_r">回复</a></div></div></li><li 
class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:132240827250,&quot;user_name&quot;:&quot;lonelyrangers&quot;,&quot;portrait&quot;:&quot;tb.1.41c90085.pOmHdZ2UOe-_Na778rOFhA&quot;}' ><a rel="noopener" name="132240827250"></a>  <a rel="noopener" data-field='{&quot;un&quot;:&quot;lonelyrangers&quot;,&quot;id&quot;:&quot;tb.1.41c90085.pOmHdZ2UOe-_Na778rOFhA&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?un=lonelyrangers&ie=utf-8&id=tb.1.41c90085.pOmHdZ2UOe-_Na778rOFhA&fr=pb" username="lonelyrangers"><img  src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.41c90085.pOmHdZ2UOe-_Na778rOFhA"/></a><div class="lzl_cnt" data-field='{&quot;iconArr&quot;:{&quot;all_level&quot;:{&quot;2&quot;:{&quot;end_time&quot;:&quot;1608785198&quot;,&quot;level&quot;:2,&quot;pic_url&quot;:&quot;https:\/\/imgsa.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg&quot;,&quot;score_limit&quot;:8000}},&quot;level&quot;:{&quot;end_time&quot;:&quot;1608785198&quot;,&quot;pic_url&quot;:&quot;https:\/\/imgsa.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg&quot;,&quot;props_id&quot;:2}},&quot;free_flag&quot;:null}'><a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;lonelyrangers&quot;,&quot;id&quot;:&quot;tb.1.41c90085.pOmHdZ2UOe-_Na778rOFhA&quot;}' href="/home/main/?un=lonelyrangers&ie=utf-8&id=tb.1.41c90085.pOmHdZ2UOe-_Na778rOFhA&fr=pb" target="_blank" username="lonelyrangers">lonelyrangers</a>:<span class="lzl_content_main" data-username="">        你这形容得太有画面感了<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png" >        </span><div class="lzl_content_reply"><span class="lzl_jb" style="display:none;"></span><span class="lzl_op_list j_lzl_o_l" style="display:none;"></span><span class="lzl_time">2020-5-18 02:12</span><a rel="noopener" href="#" 
class="lzl_s_r">回复</a></div></div></li><li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:132252322102,&quot;user_name&quot;:&quot;\u73b0\u4ee3\u6dd1\u5973\u96be\u6c42\u554a&quot;,&quot;portrait&quot;:&quot;tb.1.a95746d1.CnZDCm-zX0WTKmf_9-iJYA&quot;}' ><a rel="noopener" name="132252322102"></a>  <a rel="noopener" data-field='{&quot;un&quot;:&quot;\u73b0\u4ee3\u6dd1\u5973\u96be\u6c42\u554a&quot;,&quot;id&quot;:&quot;tb.1.a95746d1.CnZDCm-zX0WTKmf_9-iJYA&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?un=%E7%8E%B0%E4%BB%A3%E6%B7%91%E5%A5%B3%E9%9A%BE%E6%B1%82%E5%95%8A&ie=utf-8&id=tb.1.a95746d1.CnZDCm-zX0WTKmf_9-iJYA&fr=pb" username="现代淑女难求啊"><img  src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.a95746d1.CnZDCm-zX0WTKmf_9-iJYA"/></a><div class="lzl_cnt" data-field='{&quot;iconArr&quot;:[],&quot;free_flag&quot;:null}'><a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u73b0\u4ee3\u6dd1\u5973\u96be\u6c42\u554a&quot;,&quot;id&quot;:&quot;tb.1.a95746d1.CnZDCm-zX0WTKmf_9-iJYA&quot;}' href="/home/main/?un=%E7%8E%B0%E4%BB%A3%E6%B7%91%E5%A5%B3%E9%9A%BE%E6%B1%82%E5%95%8A&ie=utf-8&id=tb.1.a95746d1.CnZDCm-zX0WTKmf_9-iJYA&fr=pb" target="_blank" username="现代淑女难求啊">现代淑女难求啊</a>:<span class="lzl_content_main" data-username="">        回复 <a href="http://tieba.baidu.com/i/sys/jump?un=" onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username="" portrait="tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw" target="_blank" class="at">阿比酱最棒啦💖</a> :是秦武阳<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png" >        </span><div class="lzl_content_reply"><span class="lzl_jb" style="display:none;"></span><span class="lzl_op_list j_lzl_o_l" style="display:none;"></span><span class="lzl_time">2020-5-18 17:06</span><a 
rel="noopener" href="#" class="lzl_s_r">回复</a></div></div></li><li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:132252333701,&quot;user_name&quot;:&quot;470355389&quot;,&quot;portrait&quot;:&quot;tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw&quot;}' ><a rel="noopener" name="132252333701"></a>  <a rel="noopener" data-field='{&quot;un&quot;:&quot;470355389&quot;,&quot;id&quot;:&quot;tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?un=470355389&ie=utf-8&id=tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw&fr=pb" username="470355389"><img  src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw"/></a><div class="lzl_cnt" data-field='{&quot;iconArr&quot;:{&quot;all_level&quot;:{&quot;2&quot;:{&quot;end_time&quot;:&quot;1617015024&quot;,&quot;level&quot;:2,&quot;pic_url&quot;:&quot;https:\/\/imgsa.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg&quot;,&quot;score_limit&quot;:8000}},&quot;level&quot;:{&quot;end_time&quot;:&quot;1617015024&quot;,&quot;pic_url&quot;:&quot;https:\/\/imgsa.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg&quot;,&quot;props_id&quot;:2}},&quot;free_flag&quot;:null}'><a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;470355389&quot;,&quot;id&quot;:&quot;tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw&quot;}' href="/home/main/?un=470355389&ie=utf-8&id=tb.1.fdadc6ce.2U-v9D4pWAY7LIrg8E-EOw&fr=pb" target="_blank" username="470355389">阿比酱最棒啦💖</a>:<span class="lzl_content_main" data-username="">        回复 <a href="http://tieba.baidu.com/i/sys/jump?un=" onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username="" portrait="tb.1.a95746d1.CnZDCm-zX0WTKmf_9-iJYA" target="_blank" class="at">现代淑女难求啊</a> :啊这，丢人<img class="BDE_Smiley" width="30" height="30" changedsize="false" 
src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon11.png" >        </span><div class="lzl_content_reply"><span class="lzl_jb" style="display:none;"></span><span class="lzl_op_list j_lzl_o_l" style="display:none;"></span><span class="lzl_time">2020-5-18 17:07</span><a rel="noopener" href="#" class="lzl_s_r">回复</a></div></div></li><li class="lzl_li_pager j_lzl_l_p lzl_li_pager_s" data-field='{&quot;total_num&quot;:15,&quot;total_page&quot;:2}' ><a rel="noopener" class="j_lzl_p btn-sub btn-small pull-right" href="##"><i class="icon-reply"></i>我也说一句</a>                                <p class="j_pager l_pager pager_theme_2">      <a href="#1">首页</a>
12 | <a href="#1">上一页</a>
13 | <a href="#1">1</a>
14 | <span class="tP">2</span>
15 | </p>    </li>`
16 | 
17 | func TestTotalCommentParserFcn(t *testing.T) {
18 | 	u := &url.URL{}
19 | 	body := lzlTotalCommentTestString
20 | 	commentParserFcn(u, body, HTMLLzl, func(key uint64, value *LzlContent) {
21 | 		// special rule: remove username ahref in ": 回复 ", as requested in #4
22 | 		strContent := string(value.Content)
23 | 		content := strings.Trim(strContent, " ")
24 | 		if strings.HasPrefix(content, "回复") {
25 | 			doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
26 | 			if err != nil {
27 | 				t.Errorf("failed to parse comment data: %v, reason: %s", content, err)
28 | 			}
29 | 			bodyDOM := doc.Find("body")
30 | 			s := doc.Find("a.at").First()
31 | 			userNameHtml, _ := s.Html()
32 | 			// t.Errorf(userNameHtml)
33 | 			s.ReplaceWithHtml(userNameHtml)
34 | 			t.Logf(bodyDOM.Html())
35 | 		}
36 | 	}, func(string, string, string) {})
37 | }
38 | 


--------------------------------------------------------------------------------
/type.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"encoding/json"
  5 | 	"errors"
  6 | 	"html/template"
  7 | 	"net/url"
  8 | 	"os"
  9 | 	"regexp"
 10 | 	"sync"
 11 | 	"sync/atomic"
 12 | 
 13 | 	"github.com/fsnotify/fsnotify"
 14 | 	"github.com/pelletier/go-toml"
 15 | )
 16 | 
// Config stores the user specified configuration read from config.toml.
type Config struct {
	// NumFetcher is the number of fetcher goroutines; it is also used as the
	// buffer size of the channels that carry pages to them.
	NumFetcher   int    `toml:"numFetcher"`
	// NumParser is used as the buffer size of the channel carrying fetched
	// pages to the parser (presumably also the number of parsers - confirm).
	NumParser    int    `toml:"numParser"`
	// NumRenderer: presumably the number of renderer goroutines - confirm.
	NumRenderer  int    `toml:"numRenderer"`
	// TemplateName is the file name of the output template, e.g.
	// "template1.html".
	TemplateName string `toml:"templateName"`
	// RetryPeriod: NOTE(review) unit and use are not visible here - confirm.
	RetryPeriod  int    `toml:"retryPeriod"`

	// HighResImage: presumably enables rewriting image URLs to their high
	// resolution variant - confirm.
	HighResImage          bool `toml:"highResImage"`
	// StoreExternalResource makes thread completion additionally depend on
	// the outstanding external resources (see TemplateField.IsDone).
	StoreExternalResource bool `toml:"storeExternalResource"`

	// UserAgent and CookieString, when non-empty, are sent as the User-Agent
	// and Cookie headers of every HTTP request. They allow the Baidu security
	// verification to be handled manually and are the only fields refreshed
	// by Watch.
	UserAgent    string `toml:"userAgent"`
	CookieString string `toml:"cookieString"`

	// ShowNickName: show the nickname of users.
	ShowNickName bool `toml:"showNickname"`

	// watcher is set by Watch.
	watcher *fsnotify.Watcher
}
 35 | 
 36 | func (c *Config) Parse(path string) error {
 37 | 	dataStr, _ := os.ReadFile(path)
 38 | 	err := toml.Unmarshal(dataStr, c)
 39 | 	return err
 40 | }
 41 | 
 42 | func (c *Config) Watch() (error, <-chan error) {
 43 | 	watcher, err := fsnotify.NewWatcher()
 44 | 	if err != nil {
 45 | 		return err, nil
 46 | 	}
 47 | 	c.watcher = watcher
 48 | 	err = c.watcher.Add(".")
 49 | 	if err != nil {
 50 | 		return err, nil
 51 | 	}
 52 | 
 53 | 	errChan := make(chan error)
 54 | 
 55 | 	// Start listening for events.
 56 | 	go func() {
 57 | 		defer c.watcher.Close()
 58 | 		for {
 59 | 			select {
 60 | 			case event, ok := <-c.watcher.Events:
 61 | 				if !ok {
 62 | 					return
 63 | 				}
 64 | 				if event.Has(fsnotify.Write) {
 65 | 					if event.Name == "./config.toml" {
 66 | 						cc := &Config{}
 67 | 						if err := cc.Parse("config.toml"); err == nil {
 68 | 							c.UserAgent = cc.UserAgent
 69 | 							c.CookieString = cc.CookieString
 70 | 						} else {
 71 | 							errChan <- errors.Join(errors.New("failed to parse config.toml"), err)
 72 | 						}
 73 | 					}
 74 | 				}
 75 | 			case err, ok := <-c.watcher.Errors:
 76 | 				if !ok {
 77 | 					return
 78 | 				}
 79 | 				errChan <- err
 80 | 			}
 81 | 		}
 82 | 	}()
 83 | 
 84 | 	return nil, errChan
 85 | }
 86 | 
 87 | // PageChannel share HTML task between fetcher and parser
 88 | type PageChannel struct {
 89 | 	// parser get HTML pages from rec
 90 | 	rec <-chan *HTMLPage
 91 | 
 92 | 	// fetcher get URL from send
 93 | 	send chan<- *HTMLPage
 94 | 
 95 | 	// number of URL to be fetched and parsed
 96 | 	ref int64
 97 | 
 98 | 	// flag, whether all URLs from list are added to fetcher
 99 | 	init int64
100 | }
101 | 
102 | // Add task number
103 | func (p *PageChannel) Add(n int64) {
104 | 	if n <= 0 {
105 | 		return
106 | 	}
107 | 	atomic.AddInt64(&p.ref, n)
108 | }
109 | 
110 | // Del task number
111 | func (p *PageChannel) Del(n int64) {
112 | 	if n <= 0 {
113 | 		return
114 | 	}
115 | 	atomic.AddInt64(&p.ref, -n)
116 | }
117 | 
118 | // Ref returns task number
119 | func (p *PageChannel) Ref() int64 {
120 | 	return atomic.LoadInt64(&p.ref)
121 | }
122 | 
123 | // Inited returns whether all URLs are read from url.txt
124 | func (p *PageChannel) Inited() {
125 | 	atomic.StoreInt64(&p.init, 1)
126 | }
127 | 
128 | // IsDone returns whether all HTML page are fetched
129 | func (p *PageChannel) IsDone() bool {
130 | 	return atomic.LoadInt64(&p.ref) <= 0 && atomic.LoadInt64(&p.init) != 0
131 | }
132 | 
// HTMLType tells the fetcher and the parser what kind of content an HTMLPage
// holds and therefore how it has to be handled.
type HTMLType int

// The values are assigned by iota in declaration order, starting at 0.
const (
	// HTMLWebHomepage is the first page of a Tieba post
	// (URL path of the form /p/<id>).
	HTMLWebHomepage HTMLType = iota

	// HTMLWebPage is a page of a Tieba post.
	HTMLWebPage

	// HTMLJSON is the Lzl totalComment in JSON format.
	HTMLJSON

	// HTMLLzlHome is the Lzl Comment of a comment in page 2 in JSON format.
	HTMLLzlHome

	// HTMLLzl is the Lzl Comment of a comment in JSON format.
	HTMLLzl

	// HTMLLocal is a local HTML or JSON file; it is read from disk instead of
	// being downloaded.
	HTMLLocal

	// HTMLWebWAPHomepage is the first page of a wap post
	// (URL path /mo/m).
	HTMLWebWAPHomepage

	// HTMLWebWAPPage supports fetching wap posts.
	HTMLWebWAPPage

	// HTMLExternalResource contains external resources (i.e. images).
	HTMLExternalResource
)
164 | 
// HTMLPage is a job for fetcher and parser: the fetcher fills in Content for
// the given URL and passes the page on.
type HTMLPage struct {
	// URL of the Page. For HTMLLocal pages only its Path is used to locate
	// the file.
	URL *url.URL

	// Content is the raw data fetched for the Page (the HTML code, or the
	// bytes of the file or response body).
	Content []byte

	// Type indicates different types of Tieba data.
	Type HTMLType

	// Close http.Response when finished parsing
	// Response *http.Response

	// Path where downloaded external resources are saved
	// (set for HTMLExternalResource pages).
	Path string
	// ThreadID links external resources to corresponding TemplateField.
	ThreadID uint64
}
184 | 
// TiebaField is the decoded form of the JSON found in the "data-field"
// attribute of each post of a thread.
type TiebaField struct {
	// Author describes the user who wrote the post.
	Author struct {
		UserID   uint64 `json:"user_id"`
		UserName string `json:"user_name"` // user name
		// Props string `json:"props"`
	} `json:"author"`
	// Content describes the post itself.
	Content struct {
		PostID uint64 `json:"post_id"`
		// IsAnonym bool `json:"is_anonym"`
		ForumID  uint64 `json:"forum_id"`
		ThreadID uint64 `json:"thread_id"`
		Content  string `json:"content"` // body text of the post
		PostNO   uint64 `json:"post_no"` // floor number of the post
		// Type string `json:"type"`
		// CommentNum uint16 `json:"comment_num"`
		// Props string `json:"props"`
		// PostIndex uint64 `json:"post_index"`
		// PbTpoint *uint64 `json:"pb_tpoint"`
	} `json:"content"`
}
206 | 
// LzlField is the top level of the Lzl JSON data: an error number, an error
// message and the payload.
type LzlField struct {
	ErrNO  int64                      `json:"errno"`
	ErrMsg string                     `json:"errmsg"`
	// Data keeps every entry undecoded so that it can be decoded later,
	// depending on its key.
	Data   map[string]json.RawMessage `json:"data"`
}
213 | 
// LzlContent is a comment of Lzl from totalComment.
type LzlContent struct {
	// 	ThreadID  uint64        `json:"thread_id,string"`
	// 	PostID    uint64        `json:"post_id,string"`
	// CommentID uint64        `json:"comment_id,string"`

	// Index has no JSON tag; presumably the position of the comment, assigned
	// by the parser - confirm.
	Index        int64
	UserName     string        `json:"username"`
	UserNickname string        `json:"show_nickname,omitempty"`
	// Content is the HTML of the comment; being template.HTML it is emitted
	// by html/template without escaping.
	Content      template.HTML `json:"content"`
	// Timestamp is decoded from "now_time"; presumably Unix seconds - confirm.
	Timestamp    int64         `json:"now_time"`
	// Time has no JSON tag; presumably the formatted Timestamp - confirm.
	Time         string
}
226 | 
// LzlComment holds the Lzl comments attached to one Tieba post together with
// the counters reported for them.
type LzlComment struct {
	// Num and ListNum are decoded from "comment_num" and "comment_list_num";
	// Merge treats a post as carrying Lzl comments only when both are
	// non-zero.
	Num     uint64        `json:"comment_num"`
	ListNum uint64        `json:"comment_list_num"`
	// Info lists the individual comments.
	Info    []*LzlContent `json:"comment_info"`
	// Info []json.RawMessage `json:"comment_info"`
}
234 | 
// LzlPageComment indicates the total number of LzlComments in a single
// comment and the number of pages they are spread over, as decoded from the
// "total_num" and "total_page" JSON keys.
type LzlPageComment struct {
	TotalNum  uint64 `json:"total_num"`
	TotalPage uint64 `json:"total_page"`
}
240 | 
// OutputField is one Tieba post in the form handed to the template.
type OutputField struct {
	// UserName and Content are template.HTML, so html/template emits them
	// without escaping.
	UserName template.HTML
	Content  template.HTML
	// PostNO is the key Unique uses to detect duplicate posts.
	PostNO   uint64
	// PostID is the key under which the Lzl comments of the post are stored
	// in TemplateField.Lzls.
	PostID   uint64
	Time     string
}
249 | 
250 | // LzlMap provides a thread safe map insert method
251 | type LzlMap struct {
252 | 	Map  map[uint64]*LzlComment
253 | 	lock *sync.Mutex
254 | }
255 | 
256 | // Append LzlComment to Map with synchronization
257 | func (lzl *LzlMap) Append(k uint64, c *LzlContent) {
258 | 	lzl.lock.Lock()
259 | 	lzl.Map[k].Info = append(lzl.Map[k].Info, c)
260 | 	lzl.lock.Unlock()
261 | }
262 | 
263 | // Insert LzlComment to Map with synchronization
264 | func (lzl *LzlMap) Insert(k uint64, v *LzlComment) {
265 | 	lzl.lock.Lock()
266 | 	lzl.Map[k] = v
267 | 	lzl.lock.Unlock()
268 | }
269 | 
270 | // IsExist returns true if key is already in Map
271 | func (lzl *LzlMap) IsExist(k uint64) bool {
272 | 	lzl.lock.Lock()
273 | 	_, ok := lzl.Map[k]
274 | 	lzl.lock.Unlock()
275 | 	return ok
276 | }
277 | 
// ExternalResourceMap is a mutex protected set of resource keys. Only the
// presence of a key matters; the stored value is always nil.
type ExternalResourceMap struct {
	Map  map[string]interface{}
	lock *sync.Mutex
}

// Get reports whether k has been recorded.
func (erm *ExternalResourceMap) Get(k string) bool {
	erm.lock.Lock()
	_, known := erm.Map[k]
	erm.lock.Unlock()
	return known
}

// Set records k.
func (erm *ExternalResourceMap) Set(k string) {
	erm.lock.Lock()
	erm.Map[k] = nil
	erm.lock.Unlock()
}

// Put records k and reports whether it was new, i.e. it returns true only for
// the first Put (or Set) of a given key.
func (erm *ExternalResourceMap) Put(k string) bool {
	erm.lock.Lock()
	_, known := erm.Map[k]
	erm.Map[k] = nil
	erm.lock.Unlock()
	return !known
}
309 | 
// TemplateField stores all necessary information to render a HTML page for
// one thread, plus the bookkeeping that tells when the thread is complete.
type TemplateField struct {
	Title     string
	Url       string
	ThreadID  uint64
	// Comments are the posts of the thread; Append adds to it under mutex.
	Comments  []*OutputField
	// pagesLeft counts the pages still to be parsed; accessed atomically.
	pagesLeft int64
	Lzls      *LzlMap // Key is PostID
	// lzlsLeft counts the Lzls still to be parsed; accessed atomically.
	lzlsLeft  int64
	// resLeft counts the scheduled external resources that are still
	// outstanding; accessed atomically.
	resLeft   int64
	// mutex guards Comments (in Append) and send (in Send).
	mutex     *sync.RWMutex
	// send is set once the TemplateField has been handed to the renderer.
	send      bool
	// rendered: NOTE(review) its use lies outside this view; presumably set
	// once the output file is written - confirm.
	rendered  int64
	// resMap records the local paths of the external resources that were
	// already scheduled for this thread.
	resMap    *ExternalResourceMap
}
325 | 
326 | // NetTemplateField returns a initialized struct
327 | func NewTemplateField(threadID uint64) *TemplateField {
328 | 	tf := &TemplateField{
329 | 		ThreadID: threadID,
330 | 		Comments: make([]*OutputField, 0, 30),
331 | 		Lzls: &LzlMap{
332 | 			Map:  make(map[uint64]*LzlComment),
333 | 			lock: &sync.Mutex{},
334 | 		},
335 | 		mutex: &sync.RWMutex{},
336 | 		resMap: &ExternalResourceMap{
337 | 			Map:  make(map[string]interface{}),
338 | 			lock: &sync.Mutex{},
339 | 		},
340 | 	}
341 | 	return tf
342 | }
343 | 
344 | // Send parsed Tieba posts to render
345 | // https://misfra.me/optimizing-concurrent-map-access-in-go/
346 | func (t *TemplateField) Send(c chan *TemplateField) {
347 | 	t.mutex.RLock()
348 | 	if !t.send {
349 | 		t.mutex.RUnlock()
350 | 		t.mutex.Lock()
351 | 		if !t.send {
352 | 			c <- t
353 | 			t.send = true
354 | 		}
355 | 		t.mutex.Unlock()
356 | 	} else {
357 | 		t.mutex.RUnlock()
358 | 	}
359 | }
360 | 
// AddPage atomically adds n to the number of pages still to be parsed.
// IsDone treats a count of zero or less as "no page outstanding".
func (t *TemplateField) AddPage(n int64) {
	atomic.AddInt64(&t.pagesLeft, n)
}
365 | 
// AddLzl atomically adds n to the number of Lzls still to be parsed.
// IsDone treats a count of zero or less as "no Lzl outstanding".
func (t *TemplateField) AddLzl(n int64) {
	atomic.AddInt64(&t.lzlsLeft, n)
}
370 | 
371 | // Append a new post to TemplateField
372 | func (t *TemplateField) Append(post *OutputField) {
373 | 	t.mutex.Lock()
374 | 	// l := len(t.Comments)
375 | 	// n := l + 1
376 | 	// if n > cap(t.Comments) {
377 | 	// 	newSlice := make([]*OutputField, 30*10+n+1)
378 | 	// 	copy(newSlice, t.Comments)
379 | 	// 	t.Comments = newSlice
380 | 	// }
381 | 	// t.Comments = t.Comments[0:n]
382 | 	// copy(t.Comments[n:n+1], post)
383 | 	t.Comments = append(t.Comments, post)
384 | 	t.mutex.Unlock()
385 | }
386 | 
387 | // IsDone returns whether TemplateField is ready to be rendered
388 | func (t *TemplateField) IsDone() bool {
389 | 	pagesLeft := atomic.LoadInt64(&t.pagesLeft)
390 | 	lzlsLeft := atomic.LoadInt64(&t.lzlsLeft)
391 | 	ret := pagesLeft <= 0 && lzlsLeft <= 0
392 | 	if config.StoreExternalResource {
393 | 		resLeft := atomic.LoadInt64(&t.resLeft)
394 | 		// log.Printf("%d: resLeft (%d)", t.ThreadID, resLeft)
395 | 		ret = ret && (resLeft <= 0)
396 | 	}
397 | 	return ret
398 | }
399 | 
400 | // Merge consecutive posts whose Useaname is the same
401 | func (t *TemplateField) Merge() {
402 | 	l := len(t.Comments)
403 | 	for i := 0; i+1 < l; i++ {
404 | 		if t.Comments[i+1].UserName != t.Comments[i].UserName {
405 | 			continue
406 | 		}
407 | 		v, ok := t.Lzls.Map[t.Comments[i+1].PostID]
408 | 		if ok && v.ListNum != 0 && v.Num != 0 {
409 | 			continue
410 | 		}
411 | 		v, ok = t.Lzls.Map[t.Comments[i].PostID]
412 | 		if ok && v.ListNum != 0 && v.Num != 0 {
413 | 			continue
414 | 		}
415 | 		// How to efficiently concatenate strings in Go?
416 | 		// https://stackoverflow.com/a/43675122/6091246
417 | 		bs := make([]byte, len(t.Comments[i].Content)+len(t.Comments[i+1].Content)+1)
418 | 		bl := 0
419 | 		bl += copy(bs[bl:], t.Comments[i].Content)
420 | 		bs[bl] = '\n'
421 | 		bl++
422 | 		bl += copy(bs[bl:], t.Comments[i+1].Content)
423 | 		t.Comments[i].Content = template.HTML(bs)
424 | 		// t.Comments[i].Content = t.Comments[i].Content + "\n" + t.Comments[i+1].Content
425 | 		// removes duplicate values in given slice
426 | 		// https://gist.github.com/alioygur/16c66b4249cb42715091fe010eec7e33#file-unique_slice-go-L13
427 | 		t.Comments = append(t.Comments[:i+1], t.Comments[i+2:]...)
428 | 		i--
429 | 		l--
430 | 	}
431 | }
432 | 
433 | // Unique removes any duplicate posts using PoseNO
434 | func (t *TemplateField) Unique() {
435 | 	// Idiomatic way to remove duplicates in a slice
436 | 	// https://www.reddit.com/r/golang/comments/5ia523/idiomatic_way_to_remove_duplicates_in_a_slice/db6qa2e/
437 | 	seen := make(map[uint64]struct{}, len(t.Comments))
438 | 	j := 0
439 | 	for _, v := range t.Comments {
440 | 		if _, ok := seen[v.PostNO]; ok {
441 | 			continue
442 | 		}
443 | 		seen[v.PostNO] = struct{}{}
444 | 		t.Comments[j] = v
445 | 		j++
446 | 	}
447 | 	t.Comments = t.Comments[:j]
448 | }
449 | 
450 | // Rendered returns true if the template is written to the output file
451 | func (t *TemplateField) Rendered() bool {
452 | 	return atomic.LoadInt64(&t.rendered) != 0
453 | }
454 | 
455 | // SetRendered could be used to change rendered status
456 | func (t *TemplateField) SetRendered(status bool) {
457 | 	if status {
458 | 		atomic.StoreInt64(&t.rendered, 1)
459 | 	} else {
460 | 		atomic.StoreInt64(&t.rendered, 0)
461 | 	}
462 | }
463 | 
464 | func (t *TemplateField) FileName() string {
465 | 	// #6: remove illegal character in title
466 | 	// ref: https://www.codeproject.com/tips/758861/removing-characters-which-are-not-allowed-in-windo
467 | 	filenameRegex := regexp.MustCompile(`[\\/:*?""<>|]`)
468 | 	validFilename := filenameRegex.ReplaceAllLiteralString(t.Title, "")
469 | 	return validFilename
470 | }
471 | 
// TemplateMap holds the TemplateField of every Tieba thread handled by the
// parsers and the channel on which finished threads are handed over.
type TemplateMap struct {
	Map     map[uint64]*TemplateField // Key is ThreadID
	lock    *sync.RWMutex             // taken by Get and Sweep around accesses to Map
	Channel chan *TemplateField       // Sweep sends threads that are ready for rendering here
}
478 | 
479 | // Get returns a value from Map with synchronization
480 | // see: https://misfra.me/optimizing-concurrent-map-access-in-go/ for more detail
481 | func (tm *TemplateMap) Get(k uint64) *TemplateField {
482 | 	var val *TemplateField
483 | 	var ok bool
484 | 	tm.lock.RLock()
485 | 	if val, ok = tm.Map[k]; !ok {
486 | 		tm.lock.RUnlock()
487 | 		tm.lock.Lock()
488 | 		if val, ok = tm.Map[k]; !ok {
489 | 			val = NewTemplateField(k)
490 | 			tm.Map[k] = val
491 | 		}
492 | 		tm.lock.Unlock()
493 | 	} else {
494 | 		tm.lock.RUnlock()
495 | 	}
496 | 	return val
497 | }
498 | 
499 | // Sweep search threadIDs for elements ready for rendering
500 | func (tm *TemplateMap) Sweep(pc *PageChannel) {
501 | 	tm.lock.RLock()
502 | 	for k := range tm.Map {
503 | 		tf := tm.Map[k]
504 | 		if tf.IsDone() && !tf.Rendered() {
505 | 			go tf.Send(tm.Channel)
506 | 		}
507 | 	}
508 | 	tm.lock.RUnlock()
509 | 
510 | 	// TODO: delete rendered threads from TemplateMap
511 | }
512 | 


--------------------------------------------------------------------------------
/parse.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"bytes"
  6 | 	"encoding/json"
  7 | 	"errors"
  8 | 	"fmt"
  9 | 	"html/template"
 10 | 	"log"
 11 | 	"math/rand"
 12 | 	"net/url"
 13 | 	"os"
 14 | 	"path/filepath"
 15 | 	"regexp"
 16 | 	"strconv"
 17 | 	"strings"
 18 | 	"sync"
 19 | 	"sync/atomic"
 20 | 	"time"
 21 | 
 22 | 	"github.com/PuerkitoBio/goquery"
 23 | )
 24 | 
 25 | var letterRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
 26 | 
 27 | func randStringRunes(n int) string {
 28 | 	b := make([]rune, n)
 29 | 	for i := range b {
 30 | 		b[i] = letterRunes[rand.Intn(len(letterRunes))]
 31 | 	}
 32 | 	return string(b)
 33 | }
 34 | 
 35 | func htmlParseWrapperFcn(done <-chan struct{}, pc *PageChannel, page *HTMLPage, tmMap *TemplateMap, querySelector, threadSelector string, callback func(tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) error) error {
 36 | 	defer pc.Del(1)
 37 | 	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(page.Content))
 38 | 	if err != nil {
 39 | 		// network error, retry request
 40 | 		pc.Add(1)
 41 | 		go addPageToFetchQueue(done, pc, time.Duration(config.RetryPeriod)*time.Second, page.URL, page.Type)
 42 | 		return fmt.Errorf("error parsing %s(title: %s): %v", page.URL, findTitle(doc), err)
 43 | 	}
 44 | 
 45 | 	posts := doc.Find(querySelector)
 46 | 	threadRegex := regexp.MustCompile(threadSelector)
 47 | 	match := threadRegex.FindStringSubmatch(string(page.Content))
 48 | 	if len(match) < 1 {
 49 | 		// network error, retry request
 50 | 		pc.Add(1)
 51 | 		go addPageToFetchQueue(done, pc, time.Duration(config.RetryPeriod)*time.Second, page.URL, page.Type)
 52 | 		return fmt.Errorf("unable to parse page(title: %s), possibly a network error, readding url to queue %s", findTitle(doc), page.URL)
 53 | 	}
 54 | 	strInt, _ := strconv.ParseInt(match[1], 10, 64)
 55 | 	threadID := uint64(strInt)
 56 | 	// TODO: wrap tf.Add method in order to substitute image with html embedded one
 57 | 	tf := tmMap.Get(threadID)
 58 | 	err = callback(tf, doc, posts)
 59 | 	// page.Response.Body.Close()
 60 | 	if err != nil {
 61 | 		pc.Add(1)
 62 | 		go addPageToFetchQueue(done, pc, time.Duration(config.RetryPeriod)*time.Second, page.URL, page.Type)
 63 | 	} else {
 64 | 		tf.AddPage(-1)
 65 | 	}
 66 | 	return err
 67 | }
 68 | 
 69 | func htmlParse(done <-chan struct{}, pc *PageChannel, page *HTMLPage, tmMap *TemplateMap, callback func(tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) error) error {
 70 | 	// posts := doc.Find("div.l_post.j_l_post.l_post_bright")
 71 | 	// threadRegex := regexp.MustCompile(`\b"?thread_id"?:"?(\d+)"?\b`)
 72 | 	return htmlParseWrapperFcn(done, pc, page, tmMap, "div.l_post.j_l_post.l_post_bright", `\b"?thread_id"?:"?(\d+)"?\b`, callback)
 73 | }
 74 | 
 75 | func homePageParserFcn(done <-chan struct{}, pc *PageChannel, tf *TemplateField, doc *goquery.Document, posts *goquery.Selection, page *HTMLPage, pageTitleFinder func(doc *goquery.Document) string, pageNumFinder func(doc *goquery.Document) (int64, error), pageType HTMLType, parserFcn func(page *HTMLPage, tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) error) error {
 76 | 	tf.Title = pageTitleFinder(doc)
 77 | 	log.Printf("[homepage] Title: %s", tf.Title)
 78 | 	// issue #11 add url to page content
 79 | 	tf.Url = page.URL.String()
 80 | 
 81 | 	pageNum, err := pageNumFinder(doc)
 82 | 	if err != nil {
 83 | 		return fmt.Errorf("error parsing total number of pages: %v", err)
 84 | 	}
 85 | 
 86 | 	atomic.StoreInt64(&tf.pagesLeft, pageNum)
 87 | 	atomic.StoreInt64(&tf.lzlsLeft, pageNum)
 88 | 	// fetch all comments and lzls, excluding comments in the first page
 89 | 	// pageNum - 1: html page 2~pageNum
 90 | 	// pageNum + 1: lzl page 0~pageNum
 91 | 	pc.Add(pageNum - 1 + pageNum + 1)
 92 | 	go addPageToFetchQueueFromHomePage(done, pc, page.URL, tf.ThreadID, pageNum, pageType)
 93 | 
 94 | 	return parserFcn(page, tf, doc, posts)
 95 | }
 96 | 
 97 | func homepageParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap) error {
 98 | 	return htmlParse(done, pc, page, tmMap, func(tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) error {
 99 | 		return homePageParserFcn(done, pc, tf, doc, posts, page, findTitle, func(doc *goquery.Document) (int64, error) {
100 | 			var pageNum int64
101 | 			if s := doc.Find("span.red").Eq(1); s.Text() == "" {
102 | 				pageNum = 1 // Could not find total number of pages, default to 1
103 | 			} else {
104 | 				n, err := strconv.Atoi(s.Text())
105 | 				if err != nil {
106 | 					return 0, fmt.Errorf("error parsing total number of pages: %v", err)
107 | 				}
108 | 				pageNum = int64(n)
109 | 			}
110 | 			return pageNum, nil
111 | 		}, HTMLWebPage, pageParserFcn)
112 | 	})
113 | }
114 | 
115 | func pageParserFcn(page *HTMLPage, tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) error {
116 | 	posts.Each(func(i int, s *goquery.Selection) {
117 | 		// filter elements that has more than 4 class (maybe an advertisement, commit 9c82d4e381d1bcd3f801bf5f6c07960fb7d829be)
118 | 		classStr, _ := s.Attr("class") // get class string
119 | 		if len(strings.Fields(classStr)) > 4 {
120 | 			return
121 | 		}
122 | 
123 | 		dataField, ok := s.Attr("data-field")
124 | 		if !ok {
125 | 			// maybe not an error, but an older version of data-field
126 | 			fmt.Fprintf(os.Stderr, "#%d data-field not found: %s\n", i, page.URL) // there's a error on the page, maybe Tieba updated the syntax
127 | 			return
128 | 		}
129 | 
130 | 		var tiebaPost TiebaField
131 | 		var res OutputField
132 | 		err := json.Unmarshal([]byte(dataField), &tiebaPost)
133 | 		if err != nil {
134 | 			fmt.Fprintf(os.Stderr, "#%d data-field unmarshal failed: %v, url: %s\n", i, err, page.URL) // there's a error on the page, maybe Tieba updated the syntax
135 | 			return
136 | 		}
137 | 		if content, err := s.Find("div.d_author ul.p_author li.d_name a.p_author_name.j_user_card").Html(); err != nil {
138 | 			fmt.Fprintf(os.Stderr, "#%d Error parsing username from %s\n", i, page.URL)
139 | 			return
140 | 		} else {
141 | 			res.UserName = template.HTML(handleUserNameEmojiURL(content))
142 | 		}
143 | 
144 | 		res.Content = template.HTML(tiebaPost.Content.Content)
145 | 		res.PostNO = tiebaPost.Content.PostNO
146 | 		res.PostID = tiebaPost.Content.PostID
147 | 
148 | 		if res.Content == "" {
149 | 			// data-field does not contain content
150 | 			// infer an old version of posts
151 | 			postID := fmt.Sprintf("#post_content_%d", res.PostID)
152 | 			content, err := posts.Find(postID).Html()
153 | 			if err != nil {
154 | 				log.Printf("#%d: post_content_%d parse failed, %s", i, res.PostID, err)
155 | 			} else {
156 | 				res.Content = template.HTML(content)
157 | 			}
158 | 		}
159 | 
160 | 		// get post time
161 | 		// Jquery过滤选择器，选择前几个元素，后几个元素，内容过滤选择器等
162 | 		// http://www.cnblogs.com/alone2015/p/4962687.html
163 | 		for _, elem := range s.Find("span.tail-info").EachIter() {
164 | 			if tm, err := time.Parse(`2006-01-02 15:04`, elem.Text()); err == nil {
165 | 				res.Time = tm.Format("2006-01-02 15:04")
166 | 			}
167 | 		}
168 | 
169 | 		tf.Append(&res)
170 | 		// log.Printf("#%d data-field found: %v\n", i, tiebaPost)
171 | 		// log.Printf("#%d data-field found:\nauthor: %s\ncontent: %s\n",
172 | 		// 	tiebaPost.Content.PostNo,
173 | 		// 	tiebaPost.Author.UserName,
174 | 		// 	tiebaPost.Content.Content)
175 | 
176 | 		// result.Posts <- &res
177 | 	})
178 | 	return nil
179 | }
180 | 
181 | func pageParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap) error {
182 | 	// log.Printf("[Parse] parsing %s", page.URL.String())
183 | 	return htmlParse(done, pc, page, tmMap, func(tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) (err error) {
184 | 		return pageParserFcn(page, tf, doc, posts)
185 | 	})
186 | }
187 | 
188 | func wapParse(done <-chan struct{}, pc *PageChannel, page *HTMLPage, tmMap *TemplateMap, callback func(tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) error) error {
189 | 	// posts := doc.Find("div.i")
190 | 	// threadRegex := regexp.MustCompile(`kz=(\d+)`)
191 | 	return htmlParseWrapperFcn(done, pc, page, tmMap, "div.i", `kz=(\d+)`, callback)
192 | }
193 | 
194 | func wapHomePageParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap) error {
195 | 	return wapParse(done, pc, page, tmMap, func(tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) error {
196 | 		return homePageParserFcn(done, pc, tf, doc, posts, page, findWapTitle, func(doc *goquery.Document) (int64, error) {
197 | 			var pageNum int64
198 | 			pageNumMatcher := regexp.MustCompile(`第\d+/(\d+)页`)
199 | 			matches := pageNumMatcher.FindStringSubmatch(doc.Find("div.h").Text())
200 | 			if len(matches) > 1 {
201 | 				n, err := strconv.Atoi(matches[1])
202 | 				if err != nil {
203 | 					return 0, fmt.Errorf("error parsing total number of pages: %v", err)
204 | 				}
205 | 				pageNum = int64(n)
206 | 			} else {
207 | 				pageNum = 1 // Could not find total number of pages, default to 1
208 | 			}
209 | 			return pageNum, nil
210 | 		}, HTMLWebWAPPage, wapParserFcn)
211 | 	})
212 | }
213 | 
214 | func wapParserFcn(page *HTMLPage, tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) error {
215 | 	postMatcher := regexp.MustCompile(`^(\d+)楼. (.*)<br/>`)
216 | 	posts.Each(func(i int, s *goquery.Selection) {
217 | 		var res OutputField
218 | 		if content, err := s.Find(".g > a").Html(); err != nil {
219 | 			fmt.Fprintf(os.Stderr, "#%d Error parsing username from %s\n", i, page.URL)
220 | 			return
221 | 		} else {
222 | 			res.UserName = template.HTML(handleUserNameEmojiURL(content))
223 | 		}
224 | 
225 | 		sBody, _ := s.Html()
226 | 		sTable, _ := s.Find("table").Html()
227 | 		sContent := strings.ReplaceAll(sBody, fmt.Sprintf("<table>%s</table>", sTable), "")
228 | 		if matches := postMatcher.FindStringSubmatch(string(sContent)); len(matches) < 3 {
229 | 			fmt.Fprintf(os.Stderr, "#%d Error parsing post content from %s\n", i, page.URL)
230 | 			return
231 | 		} else {
232 | 			n, err := strconv.Atoi(matches[1])
233 | 			if err != nil {
234 | 				fmt.Fprintf(os.Stderr, "error parsing post number: %v", err)
235 | 				return
236 | 			}
237 | 			res.PostNO = uint64(n)
238 | 			res.Content = template.HTML(matches[2])
239 | 		}
240 | 		// res.PostID = tiebaPost.Content.PostID
241 | 		if sReply, ok := s.Find(".r>a").Attr("href"); ok {
242 | 			if replyUrl, err := url.Parse(sReply); err == nil {
243 | 				pid := replyUrl.Query().Get("pid")
244 | 				if n, err := strconv.Atoi(pid); err == nil {
245 | 					res.PostID = uint64(n)
246 | 				}
247 | 			}
248 | 		}
249 | 		res.Time = s.Find(".b").Text()
250 | 		if tm, err := time.Parse(`1-2 15:04`, res.Time); err == nil {
251 | 			// rewrite time from "1-22 13:07" to "2021-01-22 13:07" for consistency
252 | 			tm = tm.AddDate(time.Now().Year(), 0, 0)
253 | 			res.Time = tm.Format("2006-01-02 15:04")
254 | 		}
255 | 
256 | 		tf.Append(&res)
257 | 	})
258 | 	return nil
259 | }
260 | 
261 | func wapPageParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap) error {
262 | 	// log.Printf("[Parse] parsing %s", page.URL.String())
263 | 	return wapParse(done, pc, page, tmMap, func(tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) (err error) {
264 | 		return wapParserFcn(page, tf, doc, posts)
265 | 	})
266 | }
267 | 
268 | // parse lzl comment, JSON formatted
269 | func jsonParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap, callback func(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap, tf *TemplateField) error) error {
270 | 	defer pc.Del(1)
271 | 	u := page.URL
272 | 	q := u.Query()
273 | 	tid := q.Get("tid")
274 | 	if tid == "" {
275 | 		return fmt.Errorf("error parsing getting tid from %s", page.URL) // skip illegal URL
276 | 	}
277 | 	ret, _ := strconv.Atoi(tid)
278 | 	threadID := uint64(ret)
279 | 	// TODO: wrap tf.Add method in order to substitute image with html embedded one
280 | 	tf := tmMap.Get(threadID)
281 | 	defer tf.AddLzl(-1)
282 | 	err := callback(done, page, pc, tmMap, tf)
283 | 	if err != nil {
284 | 		pc.Add(1)
285 | 		tf.AddLzl(1)
286 | 		go addPageToFetchQueue(done, pc, time.Duration(config.RetryPeriod)*time.Second, page.URL, page.Type)
287 | 	}
288 | 	return err
289 | }
290 | 
291 | func requestLzlComment(tid string, pid string, pn string, tp HTMLType, pc *PageChannel) {
292 | 	// there are more lzls to fetch
293 | 	// url syntax:
294 | 	// url example: https://tieba.baidu.com/p/comment?tid=7201761174&pn=4
295 | 	u := &url.URL{
296 | 		Scheme: "http",
297 | 		Host:   "tieba.baidu.com",
298 | 		Path:   "/p/comment",
299 | 	}
300 | 	q := u.Query()
301 | 	// q.Set("t", strconv.Itoa(int(time.Now().UnixNano()/1000000)))
302 | 	q.Set("tid", tid)
303 | 	// q.Set("pid", pid)
304 | 	q.Set("pn", pn) // start fetching additional comment from page 2
305 | 	u.RawQuery = q.Encode()
306 | 
307 | 	// log.Printf("requesting %s", u)
308 | 
309 | 	pc.send <- &HTMLPage{
310 | 		URL:  u,
311 | 		Type: tp,
312 | 	}
313 | }
314 | 
315 | func totalCommentParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap) error {
316 | 	return jsonParser(done, page, pc, tmMap, func(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap, tf *TemplateField) error {
317 | 		url := page.URL.String()
318 | 		body := string(page.Content)
319 | 		var lzl LzlField
320 | 		var err error
321 | 		contentCandidates := make(chan string, 10)
322 | 		contentCandidates <- body
323 | 		for len(contentCandidates) > 0 {
324 | 			contentBuffer := <-contentCandidates
325 | 			err := json.Unmarshal([]byte(contentBuffer), &lzl)
326 | 			if err != nil {
327 | 				switch err := err.(type) {
328 | 				default:
329 | 					if len(contentCandidates) == 0 {
330 | 						return fmt.Errorf("error parsing content file %s: %v", url, err)
331 | 					}
332 | 				case *json.SyntaxError:
333 | 					// handle corrupted json data, as in #12
334 | 					// example: https://tieba.baidu.com/p/totalComment?fid=572638&pn=0&t=1617364074015&tid=6212415344&red_tag=3017655123
335 | 					if contentBuffer[:err.Offset-1] != `{"errno":null,"errmsg":null}` {
336 | 						fmt.Fprintf(os.Stderr, "[Parser] warning: lzl data corrupted: %s: %s, trying to reparse strings between offset %d", url, contentBuffer, err.Offset)
337 | 						contentCandidates <- contentBuffer[:err.Offset-1]
338 | 					}
339 | 					contentCandidates <- contentBuffer[err.Offset-1:]
340 | 				}
341 | 			}
342 | 			if lzl.ErrMsg == "success" {
343 | 				break
344 | 			}
345 | 		}
346 | 		if lzl.ErrMsg != "success" {
347 | 			return fmt.Errorf("unable to find json lzl with ErrMsg(\"success\"), last message was: %s", body)
348 | 		}
349 | 		if lzl.ErrNO != 0 {
350 | 			return fmt.Errorf("error getting data: %s, %s", url, lzl.ErrMsg)
351 | 		}
352 | 		commentList, ok := lzl.Data["comment_list"]
353 | 		if !ok {
354 | 			return fmt.Errorf("error getting comment_list: %s", url)
355 | 		}
356 | 		if string(commentList) == "" || string(commentList) == "[]" {
357 | 			return nil // comment list empty, stop
358 | 		}
359 | 		comments := make(map[uint64]*LzlComment)
360 | 		err = json.Unmarshal([]byte(string(commentList)), &comments)
361 | 		if err != nil {
362 | 			return fmt.Errorf("error parsing comment_list from %s: %v\ncomment_list:\n%s", url, err, commentList)
363 | 		}
364 | 
365 | 		if len(comments) == 0 {
366 | 			return nil // does not have any comments, stop
367 | 		}
368 | 
369 | 		for pid, v := range comments {
370 | 			if tf.Lzls.IsExist(pid) {
371 | 				// totalComment contains lzls in different pages, which are duplicate
372 | 				continue
373 | 			}
374 | 			// normalize
375 | 			for i, comment := range v.Info {
376 | 				comment.Index = int64(i)
377 | 				comment.Time = time.Unix(comment.Timestamp, 0).In(time.Local).Format("2006-01-02 15:04")
378 | 				comment.UserName = handleUserNameEmojiURL(comment.UserName)
379 | 				if config.ShowNickName && comment.UserNickname != "" {
380 | 					comment.UserName = comment.UserNickname
381 | 				}
382 | 				comment.Content = template.HTML(reformatLzlUsername(string(comment.Content)))
383 | 			}
384 | 			// merge maps
385 | 			// Getting the union of two maps in go
386 | 			// https://stackoverflow.com/a/22621838/6091246
387 | 			numLeft := int64(v.Num) - int64(v.ListNum)
388 | 			if numLeft > 0 {
389 | 				// extend Lzl slice if needed
390 | 				if n := len(v.Info); uint64(n) < v.Num {
391 | 					// extend slice
392 | 					newSlice := make([]*LzlContent, n, v.Num+1)
393 | 					copy(newSlice, v.Info)
394 | 					v.Info = newSlice
395 | 				}
396 | 				pc.Add(1)
397 | 				tf.AddLzl(1)
398 | 				go requestLzlComment(strconv.Itoa(int(tf.ThreadID)), strconv.Itoa(int(pid)), "2", HTMLLzlHome, pc)
399 | 			}
400 | 			tf.Lzls.Insert(pid, v) // merge maps
401 | 		}
402 | 		return nil
403 | 	})
404 | }
405 | 
406 | func commentParserFcn(url *url.URL, body string, pageType HTMLType, appendLzl func(key uint64, value *LzlContent), requestLzlCommentFcn func(tid, pid, pageNum string)) error {
407 | 	doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
408 | 	if err != nil {
409 | 		return fmt.Errorf("error parsing %s: %v", url.String(), err)
410 | 	}
411 | 	q := url.Query()
412 | 	tid := q.Get("tid")
413 | 	pid := q.Get("pid")
414 | 	pn := q.Get("pn")
415 | 	if pageType == HTMLLzlHome {
416 | 		s := doc.Find("li.lzl_li_pager_s")
417 | 		dataField, ok := s.Attr("data-field")
418 | 		if !ok {
419 | 			return fmt.Errorf("error parsing %s: total number of pages is not determinable", url)
420 | 		}
421 | 		var lzlPage LzlPageComment
422 | 		err := json.Unmarshal([]byte(dataField), &lzlPage)
423 | 		if err != nil {
424 | 			return fmt.Errorf("LzlPageComment data-field unmarshal failed: %v, url: %s", err, url)
425 | 		}
426 | 		// tf.AddLzl(int64(lzlPage.TotalPage - 2))
427 | 		for i := uint64(3); i <= lzlPage.TotalPage; i++ {
428 | 			requestLzlCommentFcn(tid, pid, strconv.Itoa(int(i)))
429 | 			// requestLzlComment(tid, pid, strconv.Itoa(int(i)), HTMLLzl, pc)
430 | 		}
431 | 	}
432 | 	exLzls := doc.Find(".lzl_single_post.j_lzl_s_p")
433 | 	exLzls.Each(func(i int, s *goquery.Selection) {
434 | 		pageNum, _ := strconv.Atoi(pn)
435 | 		key, _ := strconv.Atoi(pid)
436 | 		content, err := s.Find(".lzl_content_main").Html()
437 | 		if err != nil {
438 | 			return
439 | 		}
440 | 		user := s.Find("div.lzl_cnt a.at.j_user_card")
441 | 		userName := user.Text()
442 | 		// userName, ok := user.Attr("username")
443 | 		// if !ok {
444 | 		// 	// userName not found
445 | 		// 	log.Printf("ExLzl: cannot find username for pid=%s, index=%d", pid, i+pageNum*10)
446 | 		// 	return
447 | 		// } else
448 | 		if userName == "" {
449 | 			// user name is empty, try another method
450 | 			log.Printf("ExLzl: please check url: %s", url)
451 | 			return
452 | 		}
453 | 		c := &LzlContent{
454 | 			Index:    int64(i + pageNum*10),
455 | 			UserName: handleUserNameEmojiURL(userName),
456 | 			Content:  template.HTML(reformatLzlUsername(content)),
457 | 			Time:     s.Find(".lzl_time").Text(),
458 | 		}
459 | 		appendLzl(uint64(key), c)
460 | 		// tf.Lzls.Append(uint64(key), c)
461 | 	})
462 | 	return nil
463 | }
464 | 
465 | func commentParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap) error {
466 | 	return jsonParser(done, page, pc, tmMap, func(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap, tf *TemplateField) error {
467 | 		return commentParserFcn(page.URL, string(page.Content), page.Type, func(key uint64, value *LzlContent) {
468 | 			tf.Lzls.Append(uint64(key), value)
469 | 		}, func(tid, pid, pageNum string) {
470 | 			pc.Add(1)
471 | 			tf.AddLzl(1)
472 | 			go requestLzlComment(tid, pid, pageNum, HTMLLzl, pc)
473 | 		})
474 | 	})
475 | }
476 | 
477 | // parse templateField from local file, JSON formatted
478 | func templateParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap) error {
479 | 	defer pc.Del(1)
480 | 	var threadID uint64
481 | 
482 | 	u := page.URL
483 | 	q := u.Query()
484 | 	tid := q.Get("tid")
485 | 	if tid == "" {
486 | 		return fmt.Errorf("error parsing getting tid from %s", page.URL.String()) // skip illegal URL
487 | 	}
488 | 	ret, _ := strconv.Atoi(tid)
489 | 	threadID = uint64(ret)
490 | 
491 | 	var tf = tmMap.Get(threadID)
492 | 
493 | 	tf.mutex.Lock()
494 | 	err := json.Unmarshal(page.Content, tf)
495 | 	tf.mutex.Unlock()
496 | 	if err != nil {
497 | 		return fmt.Errorf("error parsing template file %s: %v", page.URL.String(), err)
498 | 	}
499 | 
500 | 	return nil
501 | }
502 | 
503 | func externalResourceParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap) error {
504 | 	defer pc.Del(1)
505 | 	threadID := page.ThreadID
506 | 	var tf = tmMap.Get(threadID)
507 | 
508 | 	dstPath := fmt.Sprintf("output/%s", page.Path)
509 | 	// log.Printf("writing %s", dstPath)
510 | 	outputPath := filepath.Dir(dstPath)
511 | 	if err := os.MkdirAll(outputPath, 0755); err != nil && !os.IsNotExist(err) {
512 | 		return fmt.Errorf("error creating external resource folder (%s): %v", outputPath, err)
513 | 	}
514 | 	data := page.Content
515 | 	if err := writeOutput(dstPath, func(w *bufio.Writer) error {
516 | 		for len(data) > 0 {
517 | 			n, err := w.Write(data)
518 | 			if err != nil {
519 | 				return err
520 | 			}
521 | 			data = data[n:]
522 | 		}
523 | 		return nil
524 | 	}); err != nil {
525 | 		return fmt.Errorf("error writing external resource (%s): %v", dstPath, err)
526 | 	}
527 | 	atomic.AddInt64(&tf.resLeft, -1)
528 | 
529 | 	return nil
530 | }
531 | 
532 | func parser(done <-chan struct{}, errc chan<- error, wg *sync.WaitGroup, pc *PageChannel, tmMap *TemplateMap) {
533 | 	defer wg.Done()
534 | 	var err error
535 | 	ticker := time.NewTicker(1 * time.Second)
536 | 	for {
537 | 		select {
538 | 		case <-done:
539 | 			return
540 | 		case p, ok := <-pc.rec:
541 | 			if !ok {
542 | 				return // quit when pc.rec is closed
543 | 			}
544 | 			switch p.Type {
545 | 			case HTMLWebHomepage:
546 | 				err = homepageParser(done, p, pc, tmMap)
547 | 				if err != nil {
548 | 					errc <- err
549 | 				}
550 | 			case HTMLWebPage:
551 | 				err = pageParser(done, p, pc, tmMap)
552 | 				if err != nil {
553 | 					errc <- err
554 | 				}
555 | 			case HTMLJSON:
556 | 				err = totalCommentParser(done, p, pc, tmMap)
557 | 				if err != nil {
558 | 					errc <- err
559 | 				}
560 | 			case HTMLLzlHome, HTMLLzl:
561 | 				err = commentParser(done, p, pc, tmMap)
562 | 				if err != nil {
563 | 					errc <- err
564 | 				}
565 | 			case HTMLLocal:
566 | 				err = templateParser(done, p, pc, tmMap)
567 | 				if err != nil {
568 | 					errc <- err
569 | 				}
570 | 			case HTMLWebWAPHomepage:
571 | 				err = wapHomePageParser(done, p, pc, tmMap)
572 | 				if err != nil {
573 | 					errc <- err
574 | 				}
575 | 			case HTMLWebWAPPage:
576 | 				err = wapPageParser(done, p, pc, tmMap)
577 | 				if err != nil {
578 | 					errc <- err
579 | 				}
580 | 			case HTMLExternalResource:
581 | 				err = externalResourceParser(done, p, pc, tmMap)
582 | 				if err != nil {
583 | 					errc <- err
584 | 				}
585 | 			default:
586 | 				errc <- errors.New("unkonwn HTMLPage Type")
587 | 			}
588 | 		case <-ticker.C:
589 | 		}
590 | 		go tmMap.Sweep(pc)
591 | 	}
592 | }
593 | 
594 | func parseHTML(done <-chan struct{}, pc *PageChannel) (<-chan *TemplateField, <-chan error) {
595 | 	tmMap := &TemplateMap{
596 | 		Map:     make(map[uint64]*TemplateField),
597 | 		lock:    &sync.RWMutex{},
598 | 		Channel: make(chan *TemplateField, config.NumRenderer),
599 | 	}
600 | 	errc := make(chan error)
601 | 
602 | 	var wg sync.WaitGroup
603 | 	wg.Add(config.NumParser)
604 | 	for i := 0; i < config.NumParser; i++ {
605 | 		go parser(done, errc, &wg, pc, tmMap)
606 | 	}
607 | 	go func() {
608 | 		for {
609 | 			log.Printf("[pc] jobs: %d", pc.Ref()) // status report
610 | 			if pc.IsDone() {
611 | 				close(pc.send) // no more task, tell fetcher to exit
612 | 				break
613 | 			}
614 | 			time.Sleep(time.Second) // check task number every second
615 | 		}
616 | 		wg.Wait() // wait parser finish all remaining tasks
617 | 		close(errc)
618 | 		close(tmMap.Channel) // all page parsed, tell renderer to exit
619 | 	}()
620 | 	return tmMap.Channel, errc
621 | }
622 | 
623 | func findTitle(doc *goquery.Document) string {
624 | 	var title string
625 | 	if s := doc.Find("title"); s.Text() == "" {
626 | 		title = randStringRunes(15) // Could not find title, default to random
627 | 	} else {
628 | 		title = s.Text()
629 | 	}
630 | 	return title
631 | }
632 | 
633 | func findWapTitle(doc *goquery.Document) string {
634 | 	var title string
635 | 	if s := doc.Find(".bc > strong:nth-child(1)"); s.Text() == "" {
636 | 		title = randStringRunes(15) // Could not find title, default to random
637 | 	} else {
638 | 		title = s.Text()
639 | 	}
640 | 	if s := doc.Find("div.d.h ~ a").First(); s.Text() != "" {
641 | 		title = fmt.Sprintf("%s_%s_wap", title, s.Text())
642 | 	}
643 | 	return title
644 | }
645 | 
646 | func addPageToFetchQueue(done <-chan struct{}, pc *PageChannel, delay time.Duration, url *url.URL, pageType HTMLType) {
647 | 	if delay > 0 {
648 | 		select {
649 | 		case <-done:
650 | 			return
651 | 		case <-time.After(delay):
652 | 		}
653 | 	}
654 | 	// add failed task back to jobs
655 | 	select {
656 | 	case <-done:
657 | 		return
658 | 	case pc.send <- &HTMLPage{
659 | 		URL:  url,
660 | 		Type: pageType,
661 | 	}:
662 | 	}
663 | }
664 | 
665 | func reformatLzlUsername(content string) string {
666 | 	// special rule: remove username ahref in ": 回复 ", as requested in #4
667 | 	content = strings.Trim(content, " ")
668 | 	// fmt.Fprintf(os.Stderr, "before: %s\n", content)
669 | 	if strings.HasPrefix(content, "回复") {
670 | 		doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
671 | 		if err == nil {
672 | 			bodyDOM := doc.Find("body")
673 | 			s := doc.Find("a.at").First()
674 | 			userNameHtml, _ := s.Html()
675 | 			// t.Errorf(userNameHtml)
676 | 			s.ReplaceWithHtml(userNameHtml)
677 | 			content, _ = bodyDOM.Html()
678 | 			// t.Errorf("failed to parse comment data: %v, reason: %s", content, err)
679 | 			// fmt.Fprintf(os.Stderr, "after: %s\n", content)
680 | 		}
681 | 	}
682 | 	return content
683 | }
684 | 
685 | func handleUserNameEmojiURL(userName string) string {
686 | 	doc, err := goquery.NewDocumentFromReader(strings.NewReader(userName))
687 | 	if err != nil {
688 | 		fmt.Fprintf(os.Stderr, "[handleUserNameEmojiURL] error handling user: %s", userName)
689 | 		return userName
690 | 	}
691 | 	doc.Find("img.nicknameEmoji").Each(func(i int, s *goquery.Selection) {
692 | 		if url, ex := s.Attr("src"); ex {
693 | 			if strings.HasPrefix(url, "//") {
694 | 				// the url needs to add the protocol type
695 | 				s.SetAttr("src", "https:"+url)
696 | 			}
697 | 		}
698 | 	})
699 | 	if content, err := doc.Find("body").Html(); err == nil {
700 | 		return content
701 | 	}
702 | 	content, _ := doc.Html()
703 | 	return content
704 | }
705 | 
706 | func addPageToFetchQueueFromHomePage(done <-chan struct{}, pc *PageChannel, urlRef *url.URL, tid uint64, pageNum int64, pageType HTMLType) {
707 | 	for i := int64(2); i <= pageNum; i++ {
708 | 		u := &url.URL{}
709 | 		*u = *urlRef
710 | 		q := u.Query()
711 | 		switch pageType {
712 | 		case HTMLWebPage:
713 | 			q.Set("pn", strconv.Itoa(int(i)))
714 | 		case HTMLWebWAPPage:
715 | 			q.Set("pnum", strconv.Itoa(int(i)))
716 | 		}
717 | 		u.RawQuery = q.Encode()
718 | 		newPage := &HTMLPage{
719 | 			URL:  u, // example: http://tieba.baidu.com/mo/m?kz=7201761174&pnum=2
720 | 			Type: pageType,
721 | 		}
722 | 		select {
723 | 		case <-done:
724 | 			return
725 | 		case pc.send <- newPage: // add all other pages to fetcher
726 | 		}
727 | 	}
728 | 
729 | 	// forumRegex := regexp.MustCompile(`\b"?forum_id"?:"?(\d+)"?\b`)
730 | 	// match := forumRegex.FindStringSubmatch(string(page.Content))
731 | 	// strInt, _ := strconv.ParseInt(match[1], 10, 64)
732 | 	// forumID := uint64(strInt)
733 | 	// fetch lzl comments
734 | 	// syntax:
735 | 	// http://tieba.baidu.com/p/totalComment?t=15769421323&tid=7201761174&fid=572638&pn=2&see_lz=0
736 | 	// python爬取贴吧楼中楼
737 | 	// https://mrxin.github.io/2015/09/19/tieba-louzhonglou/
738 | 	for i := int64(0); i <= pageNum; i++ {
739 | 		u := &url.URL{
740 | 			Scheme: "http",
741 | 			Host:   "tieba.baidu.com",
742 | 			Path:   "/p/totalComment",
743 | 		}
744 | 		q := u.Query()
745 | 		// Go by Example: Epoch
746 | 		// https://gobyexample.com/epoch
747 | 		// q.Set("t", strconv.Itoa(int(time.Now().UnixNano()/1000000)))
748 | 		q.Set("tid", strconv.Itoa(int(tid)))
749 | 		// q.Set("fid", strconv.Itoa(int(forumID)))
750 | 		q.Set("pn", strconv.Itoa(int(i)))
751 | 		u.RawQuery = q.Encode()
752 | 		// log.Printf("requesting totalComment: %s", u)
753 | 		select {
754 | 		case <-done:
755 | 			return
756 | 		case pc.send <- &HTMLPage{
757 | 			URL:  u,
758 | 			Type: HTMLJSON,
759 | 		}:
760 | 		}
761 | 	}
762 | }
763 | 


--------------------------------------------------------------------------------