├── url.txt ├── config.toml ├── go.mod ├── makefile ├── README.md ├── .github └── workflows │ └── go.yml ├── template ├── template1.html └── template2.html ├── LICENSE ├── .gitignore ├── template.go ├── main.go ├── go.sum ├── post_processing.go ├── fetch.go ├── parse_test.go ├── type.go └── parse.go /url.txt: -------------------------------------------------------------------------------- 1 | https://tieba.baidu.com/p/7201761174 2 | http://tieba.baidu.com/mo/m?kz=6212415344 3 | file:///example/test0.json?tid=6212415344 -------------------------------------------------------------------------------- /config.toml: -------------------------------------------------------------------------------- 1 | numFetcher = 10 2 | numParser = 50 3 | numRenderer = 5 4 | # "template1.html": 最简输出模板 5 | # "template2.html": 替换为高分辨率图片 6 | templateName = "template1.html" 7 | retryPeriod = 10 8 | highResImage = true 9 | storeExternalResource = true 10 | # 人工处理百度安全认证 11 | userAgent = "" 12 | cookieString = "" 13 | 14 | # 显示用户昵称 15 | showNickname = true 16 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/hjhee/tiebaSpider 2 | 3 | go 1.23 4 | 5 | toolchain go1.23.1 6 | 7 | require ( 8 | github.com/PuerkitoBio/goquery v1.10.0 9 | github.com/fsnotify/fsnotify v1.7.0 10 | github.com/pelletier/go-toml v1.9.5 11 | golang.org/x/net v0.30.0 12 | ) 13 | 14 | require ( 15 | github.com/andybalholm/cascadia v1.3.2 // indirect 16 | golang.org/x/sys v0.26.0 // indirect 17 | ) 18 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := build 2 | 3 | GITVER = `git describe --tags HEAD` 4 | 5 | build: 6 | @go build -ldflags "-X main.version=${GITVER}" 7 | 8 | clean: 9 | @go clean 10 | 11 | .PHONY: git-tree-check 12 | 
git-tree-check: 13 | ifneq ($(git diff --stat),) 14 | $(warning "git tree is not clean") 15 | endif 16 | 17 | win: git-tree-check 18 | @echo ver: ${GITVER} 19 | @GOOS="windows" go build -ldflags "-X main.version=${GITVER}" 20 | @zip win64.zip template/*.html tiebaSpider.exe LICENSE README.md url.txt config.toml 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tiebaSpider 2 | 3 | 程序获取百度贴吧帖子的所有评论,包括所有楼中楼,以HTML和JSON为格式保存到本地,同时合并所有楼层连续、发帖人相同帖子方便阅读。 4 | 5 | 需要获取的帖子在`url.txt`中逐行指定。程序读取程序所在目录下的文件`url.txt`获取贴吧URL,逐行爬取URL指向的帖子。除了http协议的URL之外还支持file协议,file协议格式参考`url.txt`已有的URL。此功能主要用于验证程序功能或者调整HTML模板样式。所有已提取的帖子将命名为`file_{帖子主题}.{json,html}`保存至程序所在目录下的`output`文件夹。若开启了本地保存图片功能,程序会把已获取的资源保存到`res_{帖子主题}`文件夹下。 6 | 7 | ## 特点 8 | 9 | 程序采用Go语言编写,利用goroutine同时获取、解析和渲染页面,各类goroutine的数量可以在`config.toml`文件调整。 10 | 11 | - 支持所有楼中楼评论 12 | - 支持访问WAP版贴吧链接 13 | 14 | 此外还可以通过配置文件开启如下功能: 15 | 16 | - 切换模板以设定输出HTML样式 17 | - 图片链接替换为高清原图 18 | - 本地保存图片 19 | - 设定Cookie和User-Agent处理安全认证 20 | 21 | ## 模板 22 | 23 | - 保存的HTML格式文件由`template/template1.html`的HTML模板定义。可以改写该文件以调整生成的HTML文件,从而美化界面或者嵌入Javascript脚本实现根据发帖人筛选帖子,比如只看楼主等自定义功能。模板的所有可指定的数据参考`type.go`的`TemplateField`定义,模板语法参考go官方文档。 24 | 25 | - `template/template2.html`演示了如何利用模板文件通过javascript程序替换缩略图为高分辨率图片的链接。 26 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | 11 | build: 12 | if: github.repository == 'hjhee/tiebaSpider' 13 | name: Build 14 | runs-on: ubuntu-latest 15 | steps: 16 | 17 | - name: Set up Go 18 | uses: actions/setup-go@v5 19 | with: 20 | go-version: '>=1.23.1' 21 | 22 | - name: Check out code into the Go module directory 23 | uses: actions/checkout@v4 
24 | 25 | - name: Get dependencies 26 | run: | 27 | go get -v -t -d ./... 28 | if [ -f Gopkg.toml ]; then 29 | curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh 30 | dep ensure 31 | fi 32 | 33 | - name: Build 34 | run: GOOS=windows GOARCH=amd64 go build -v . 35 | 36 | - name: Create build artifacts 37 | uses: actions/upload-artifact@v4 38 | with: 39 | name: win64 40 | path: | 41 | template/ 42 | LICENSE 43 | README.md 44 | url.txt 45 | tiebaSpider.exe 46 | config.toml -------------------------------------------------------------------------------- /template/template1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{.Title}} 6 | 7 | 17 | 24 | 25 | 26 | 27 |

{{.Title}}

28 |
{{.Url}}
29 |
30 | {{range .Comments}} 31 |
32 |
33 |
{{.Time}} #{{.PostNO}}: {{.UserName}}
34 |
{{.Content}}
35 |
36 | {{if index $.Lzls .PostID}} 37 | 38 |
39 | {{$lzl := index $.Lzls .PostID }} 40 | {{range $lzl.Info}} 41 |
{{.Time}} {{.UserName}}: {{.Content}}
42 | {{end}} 43 |
44 | {{end}} 45 |
46 |
47 | {{end}} 48 | 49 | 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2017, 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /template/template2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{.Title}} 6 | 7 | 17 | 24 | 25 | 26 | 27 |

{{.Title}}

28 |
{{.Url}}
29 |
30 | {{range .Comments}} 31 |
32 |
33 |
{{.Time}} #{{.PostNO}}: {{.UserName}}
34 |
{{.Content}}
35 |
36 | {{if index $.Lzls .PostID}} 37 | 38 |
39 | {{$lzl := index $.Lzls .PostID }} 40 | {{range $lzl.Info}} 41 |
{{.Time}} {{.UserName}}: {{.Content}}
42 | {{end}} 43 |
44 | {{end}} 45 |
46 |
47 | {{end}} 48 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Project specified 2 | /.devcontainer/ 3 | /output 4 | /tiebaSpider 5 | /tiebaSpider.exe 6 | /url.txt 7 | /*.zip 8 | 9 | ### VisualStudioCode template 10 | .vscode/* 11 | !.vscode/settings.json 12 | !.vscode/tasks.json 13 | !.vscode/launch.json 14 | !.vscode/extensions.json 15 | *.code-workspace 16 | 17 | # Local History for Visual Studio Code 18 | .history/ 19 | 20 | ### JetBrains template 21 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 22 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 23 | 24 | # User-specific stuff 25 | .idea/**/workspace.xml 26 | .idea/**/tasks.xml 27 | .idea/**/usage.statistics.xml 28 | .idea/**/dictionaries 29 | .idea/**/shelf 30 | 31 | # Generated files 32 | .idea/**/contentModel.xml 33 | 34 | # Sensitive or high-churn files 35 | .idea/**/dataSources/ 36 | .idea/**/dataSources.ids 37 | .idea/**/dataSources.local.xml 38 | .idea/**/sqlDataSources.xml 39 | .idea/**/dynamic.xml 40 | .idea/**/uiDesigner.xml 41 | .idea/**/dbnavigator.xml 42 | 43 | # Gradle 44 | .idea/**/gradle.xml 45 | .idea/**/libraries 46 | 47 | # Gradle and Maven with auto-import 48 | # When using Gradle or Maven with auto-import, you should exclude module files, 49 | # since they will be recreated, and may cause churn. Uncomment if using 50 | # auto-import. 
51 | # .idea/artifacts 52 | # .idea/compiler.xml 53 | # .idea/jarRepositories.xml 54 | # .idea/modules.xml 55 | # .idea/*.iml 56 | # .idea/modules 57 | # *.iml 58 | # *.ipr 59 | 60 | # CMake 61 | cmake-build-*/ 62 | 63 | # Mongo Explorer plugin 64 | .idea/**/mongoSettings.xml 65 | 66 | # File-based project format 67 | *.iws 68 | 69 | # IntelliJ 70 | out/ 71 | 72 | # mpeltonen/sbt-idea plugin 73 | .idea_modules/ 74 | 75 | # JIRA plugin 76 | atlassian-ide-plugin.xml 77 | 78 | # Cursive Clojure plugin 79 | .idea/replstate.xml 80 | 81 | # Crashlytics plugin (for Android Studio and IntelliJ) 82 | com_crashlytics_export_strings.xml 83 | crashlytics.properties 84 | crashlytics-build.properties 85 | fabric.properties 86 | 87 | # Editor-based Rest Client 88 | .idea/httpRequests 89 | 90 | # Android studio 3.1+ serialized cache file 91 | .idea/caches/build_file_checksums.ser 92 | 93 | ### Go template 94 | # Binaries for programs and plugins 95 | *.exe 96 | *.exe~ 97 | *.dll 98 | *.so 99 | *.dylib 100 | 101 | # Test binary, built with `go test -c` 102 | *.test 103 | 104 | # Output of the go coverage tool, specifically when used with LiteIDE 105 | *.out 106 | 107 | # Dependency directories (remove the comment below to include it) 108 | # vendor/ 109 | -------------------------------------------------------------------------------- /template.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "encoding/json" 6 | "fmt" 7 | "html/template" 8 | "os" 9 | "sort" 10 | "sync" 11 | ) 12 | 13 | func writeOutput(filename string, callback func(w *bufio.Writer) error) error { 14 | f, err := os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) 15 | if err != nil { 16 | return fmt.Errorf("error creating output file %s: %v", filename, err) 17 | } 18 | defer f.Close() 19 | w := bufio.NewWriter(f) 20 | err = callback(w) 21 | if err != nil { 22 | return fmt.Errorf("error writing to bufio %s, %v", 
filename, err) 23 | } 24 | err = w.Flush() 25 | if err != nil { 26 | return err 27 | } 28 | return nil 29 | } 30 | 31 | func renderHTML(done <-chan struct{}, pc *PageChannel, tempc <-chan *TemplateField, tmpl *template.Template) (chan string, chan error) { 32 | outputc := make(chan string) 33 | errc := make(chan error) 34 | 35 | // spawn renderers 36 | var wg sync.WaitGroup 37 | wg.Add(config.NumRenderer) 38 | for i := 0; i < config.NumRenderer; i++ { 39 | go func() { 40 | defer wg.Done() 41 | for { 42 | select { 43 | case <-done: 44 | return 45 | 46 | case t, ok := <-tempc: 47 | if !ok { 48 | return // no new task from parser, exit 49 | } 50 | 51 | if ret, err := postProcessing(done, pc, t); err != nil { 52 | errc <- err 53 | continue 54 | } else if ret { 55 | t.mutex.Lock() 56 | if t.send { 57 | t.send = false 58 | } 59 | t.mutex.Unlock() 60 | continue 61 | } 62 | 63 | sort.Slice(t.Comments, func(a, b int) bool { 64 | return t.Comments[a].PostNO < t.Comments[b].PostNO 65 | }) 66 | 67 | for _, v := range t.Lzls.Map { 68 | sort.Slice(v.Info, func(a, b int) bool { 69 | return v.Info[a].Index < v.Info[b].Index 70 | }) 71 | } 72 | 73 | // no longer merge as requested in issue #8 74 | // t.Merge() 75 | // t.Unique() 76 | 77 | // log.Printf("writing file output/file_%s.json", t.FileName()) 78 | filename := fmt.Sprintf("output/file_%s.json", t.FileName()) 79 | 80 | b, err := json.Marshal(t) 81 | if err != nil { 82 | errc <- err 83 | continue 84 | } 85 | 86 | err = writeOutput(filename, func(w *bufio.Writer) error { 87 | _, err := w.Write(b) 88 | return err 89 | }) 90 | 91 | if err != nil { 92 | errc <- err 93 | continue 94 | } 95 | 96 | filename = fmt.Sprintf("output/file_%s.html", t.FileName()) 97 | err = writeOutput(filename, func(w *bufio.Writer) error { 98 | if err := tmpl.Execute(w, struct { 99 | Title string 100 | Url string 101 | Comments []*OutputField 102 | Lzls map[uint64]*LzlComment 103 | }{Title: t.Title, Url: t.Url, Comments: t.Comments, Lzls: t.Lzls.Map}); 
err != nil { 104 | return fmt.Errorf("error executing template %s: %v", filename, err) 105 | } 106 | return nil 107 | }) 108 | 109 | if err != nil { 110 | errc <- err 111 | continue 112 | } 113 | 114 | outputc <- filename // report finished task 115 | t.SetRendered(true) 116 | 117 | if config.StoreExternalResource { 118 | pc.Del(1) 119 | } 120 | } 121 | } 122 | }() 123 | } 124 | go func() { 125 | wg.Wait() 126 | close(errc) 127 | close(outputc) 128 | }() 129 | return outputc, errc 130 | } 131 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "html/template" 6 | "log" 7 | "math/rand" 8 | "os" 9 | "time" 10 | ) 11 | 12 | var config *Config 13 | 14 | var version = "debug" 15 | 16 | var outputTemplate *template.Template 17 | 18 | type logWriter struct { 19 | } 20 | 21 | func (writer logWriter) Write(bytes []byte) (int, error) { 22 | return fmt.Print(time.Now().UTC().Format("2006-01-02 15:04:05 ") + string(bytes)) 23 | } 24 | 25 | func init() { 26 | // setup log time format 27 | // https://stackoverflow.com/a/36140590/6091246 28 | log.SetFlags(0) 29 | log.SetOutput(new(logWriter)) 30 | 31 | config = &Config{} 32 | if err := config.Parse("config.toml"); err != nil { 33 | log.Fatal(err) 34 | } 35 | 36 | outputPath := "output" 37 | if _, err := os.Stat(outputPath); os.IsNotExist(err) { 38 | err = os.Mkdir(outputPath, 0755) 39 | if err != nil { 40 | log.Fatalf("Error creating output folder: %v", err) 41 | } 42 | } 43 | 44 | fmt.Fprintf(os.Stderr, "templateName: %s", config.TemplateName) 45 | 46 | rand.Seed(41) 47 | 48 | // outputTemplate is used to render output 49 | outputTemplate = template.Must(template.New(config.TemplateName).Funcs( 50 | template.FuncMap{"convertTime": func(ts int64) string { 51 | // convertTime converts unix timestamp to the following format 52 | // How do I format an unix timestamp 
to RFC3339 - golang? 53 | // https://stackoverflow.com/a/21814954/6091246 54 | // Convert UTC to “local” time - Go 55 | // https://stackoverflow.com/a/45137855/6091246 56 | // Using Functions Inside Go Templates 57 | // https://www.calhoun.io/using-functions-inside-go-templates/ 58 | // Go template function 59 | // https://stackoverflow.com/a/20872724/6091246 60 | return time.Unix(ts, 0).In(time.Local).Format("2006-01-02 15:04") 61 | }, 62 | }).ParseFiles("template/" + config.TemplateName)) 63 | 64 | } 65 | 66 | func main() { 67 | println("tiebaSpider") 68 | println("version:", version) 69 | println("project url: https://github.com/hjhee/tiebaSpider") 70 | 71 | // closing done to force all goroutines to quit 72 | // Go Concurrency Patterns: Pipelines and cancellation 73 | // https://blog.golang.org/pipelines 74 | done := make(chan struct{}) 75 | defer close(done) 76 | 77 | err, errcConfig := config.Watch() 78 | if err != nil { 79 | panic(err) 80 | } 81 | 82 | pc, errcFetch := fetchHTMLList(done, "url.txt") 83 | tempc, errcParse := parseHTML(done, pc) 84 | outputc, errcRender := renderHTML(done, pc, tempc, outputTemplate) 85 | 86 | for { 87 | // programme exits when all error channels are closed: 88 | // breaking out of a select statement when all channels are closed 89 | // https://stackoverflow.com/a/13666733/6091246 90 | if errcFetch == nil && errcParse == nil && errcRender == nil { 91 | log.Printf("Job done!\n") 92 | break 93 | } 94 | parseSelect: 95 | select { 96 | case <-done: 97 | break parseSelect 98 | case err, ok := <-errcConfig: 99 | if !ok { 100 | log.Fatalf("[Cofnig] Config watcher encountered error") 101 | } 102 | fmt.Fprintf(os.Stderr, "[Config] Config watcher encountered error: %v\n", err) 103 | case err, ok := <-errcFetch: 104 | if !ok { 105 | errcFetch = nil 106 | log.Printf("[Fetch] job done") 107 | continue 108 | } 109 | fmt.Fprintf(os.Stderr, "[Fetch] error: %v\n", err) 110 | case err, ok := <-errcParse: 111 | if !ok { 112 | errcParse = nil 113 
| log.Printf("[Parse] job done") 114 | continue 115 | } 116 | fmt.Fprintf(os.Stderr, "[Parse] error: %v\n", err) 117 | case err, ok := <-errcRender: 118 | if !ok { 119 | errcRender = nil 120 | log.Printf("[Template] job done") 121 | continue 122 | } 123 | fmt.Fprintf(os.Stderr, "[Template] error: %v\n", err) 124 | case file, ok := <-outputc: 125 | if ok { 126 | log.Printf("[Template] %s done\n", file) 127 | } 128 | } 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4= 2 | github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4= 3 | github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= 4 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= 5 | github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= 6 | github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= 7 | github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8= 8 | github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c= 9 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 10 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 11 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 12 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 13 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 14 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 15 | 
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 16 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 17 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 18 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= 19 | golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4= 20 | golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= 21 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 22 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 23 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 24 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 25 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 26 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 27 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 28 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 29 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 30 | golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 31 | golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= 32 | golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 33 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 34 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 35 | 
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 36 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= 37 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 38 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 39 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 40 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 41 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 42 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 43 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 44 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 45 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 46 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 47 | -------------------------------------------------------------------------------- /post_processing.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "html/template" 7 | "log" 8 | "net/url" 9 | "path" 10 | "sync/atomic" 11 | 12 | "golang.org/x/net/html" 13 | ) 14 | 15 | func postProcessing(done <-chan struct{}, pc *PageChannel, t *TemplateField) (bool, error) { 16 | select { 17 | case <-done: 18 | return false, nil 19 | default: 20 | } 21 | if config.HighResImage { 22 | var searchImageURL func(done <-chan struct{}, pc *PageChannel, t *TemplateField, root *html.Node) bool 23 | searchImageURL = func(done <-chan struct{}, pc *PageChannel, t *TemplateField, root *html.Node) bool { 24 | updated := false 25 | for c := root.FirstChild; c != nil; c = c.NextSibling { 26 | select { 27 | case 
<-done: 28 | return false 29 | default: 30 | } 31 | if c.Type == html.ElementNode && c.Data == "img" { 32 | for i, a := range c.Attr { 33 | if a.Key == "src" { 34 | // log.Printf("img: %s", a.Val) 35 | localSrc, ud := cvtHighResImageURL(done, pc, t, a.Val, t.FileName()) 36 | updated = updated || ud 37 | c.Attr[i].Val = localSrc 38 | } 39 | } 40 | } 41 | ud := searchImageURL(done, pc, t, c) 42 | updated = updated || ud 43 | } 44 | return updated 45 | } 46 | if ret, err := processTemplateContent(done, pc, t, searchImageURL); err != nil { 47 | return false, err 48 | } else if ret { 49 | return true, nil 50 | } 51 | } 52 | if config.StoreExternalResource { 53 | var searchExternalResource func(done <-chan struct{}, pc *PageChannel, t *TemplateField, root *html.Node) bool 54 | searchExternalResource = func(done <-chan struct{}, pc *PageChannel, t *TemplateField, root *html.Node) bool { 55 | updated := false 56 | for c := root.FirstChild; c != nil; c = c.NextSibling { 57 | select { 58 | case <-done: 59 | return false 60 | default: 61 | } 62 | if c.Type == html.ElementNode && c.Data == "img" { 63 | for i, a := range c.Attr { 64 | if a.Key == "src" { 65 | localSrc, ud := cvtLocalURL(done, pc, t, a.Val, t.FileName()) 66 | updated = updated || ud 67 | c.Attr[i].Val = localSrc 68 | } 69 | } 70 | } 71 | ud := searchExternalResource(done, pc, t, c) 72 | updated = updated || ud 73 | } 74 | return updated 75 | } 76 | 77 | if ret, err := processTemplateContent(done, pc, t, searchExternalResource); err != nil { 78 | return false, err 79 | } else if ret { 80 | return true, nil 81 | } 82 | } 83 | return false, nil 84 | } 85 | 86 | func processTemplateContent(done <-chan struct{}, pc *PageChannel, t *TemplateField, callback func(done <-chan struct{}, pc *PageChannel, t *TemplateField, root *html.Node) bool) (bool, error) { 87 | t.mutex.Lock() 88 | t.Lzls.lock.Lock() 89 | defer t.mutex.Unlock() 90 | defer t.Lzls.lock.Unlock() 91 | updated := false 92 | traverse := func(tpStr string, 
data interface{}) (template.HTML, bool, error) { 93 | executor := template.New("comment") 94 | executor.Parse(tpStr) 95 | var buf bytes.Buffer 96 | executor.Execute(&buf, data) 97 | node, err := html.Parse(&buf) 98 | if err != nil { 99 | return "", false, fmt.Errorf("failed to parse html node: %v", err) 100 | } 101 | buf.Reset() 102 | ud := callback(done, pc, t, node) 103 | if err := html.Render(&buf, node); err != nil { 104 | return "", false, fmt.Errorf("failed to render html template: %v", err) 105 | } 106 | return template.HTML(buf.String()), ud, nil 107 | } 108 | for k := range t.Comments { 109 | select { 110 | case <-done: 111 | return false, nil 112 | default: 113 | } 114 | var ud bool 115 | var err error 116 | // log.Printf("content (before): %s", t.Comments[k].Content) 117 | t.Comments[k].Content, ud, err = traverse(`{{.Content}}`, &t.Comments[k]) 118 | updated = updated || ud 119 | // if err != nil { 120 | // return false, fmt.Errorf("[parseExternalResource] failed to parse comment (PostID: %s): %v", t.Comments[k].PostID, err) 121 | // } 122 | if err != nil { 123 | log.Printf("[parseExternalResource] failed to parse comment (ThreadID: %d, PostID: %d): %v", t.ThreadID, k, err) 124 | } 125 | // log.Printf("content (after): %s", t.Comments[k].Content) 126 | } 127 | for k := range t.Lzls.Map { 128 | select { 129 | case <-done: 130 | return false, nil 131 | default: 132 | } 133 | var ud bool 134 | var err error 135 | for i := range t.Lzls.Map[k].Info { 136 | // log.Printf("lzl content (before): %s", t.Lzls.Map[k].Info[i].Content) 137 | t.Lzls.Map[k].Info[i].Content, ud, err = traverse(`{{.Content}}`, &t.Lzls.Map[k].Info[i]) 138 | // log.Printf("lzl content (after): %s", t.Lzls.Map[k].Info[i].Content) 139 | updated = updated || ud 140 | if err != nil { 141 | log.Printf("[parseExternalResource] failed to parse lzlComment (ThreadID: %d, PostID: %d, Index: %d): %v", t.ThreadID, k, t.Lzls.Map[k].Info[i].Index, err) 142 | } 143 | } 144 | } 145 | return updated, nil 
146 | } 147 | 148 | func cvtLocalURL(done <-chan struct{}, pc *PageChannel, t *TemplateField, src, prefix string) (string, bool) { 149 | u, err := url.Parse(src) 150 | if err != nil { 151 | return src, false 152 | } 153 | uOrig := *u 154 | // url already converted 155 | if u.Scheme == "" { 156 | return src, false 157 | } 158 | u.Scheme = "" 159 | u.Path = fmt.Sprintf("%s%s", u.Host, u.Path) 160 | dst := fmt.Sprintf("res_%s/%s", prefix, u.Path) 161 | newSrc := dst 162 | if t.resMap.Put(newSrc) { 163 | atomic.AddInt64(&t.resLeft, 1) 164 | pc.Add(1) 165 | go func() { 166 | select { 167 | case pc.send <- &HTMLPage{URL: &uOrig, Type: HTMLExternalResource, Path: dst, ThreadID: t.ThreadID}: 168 | case <-done: 169 | return 170 | } 171 | }() 172 | } 173 | return newSrc, true 174 | } 175 | 176 | func cvtHighResImageURL(done <-chan struct{}, pc *PageChannel, t *TemplateField, src, prefix string) (string, bool) { 177 | u, err := url.Parse(src) 178 | if err != nil { 179 | return src, false 180 | } 181 | if u.Host != "c.hiphotos.baidu.com" { 182 | return src, false 183 | } 184 | imageName := path.Base(u.Path) 185 | u.Host = "imgsrc.baidu.com" 186 | u.Path = fmt.Sprintf("/forum/pic/item/%s", imageName) 187 | dst := u.String() 188 | newSrc := dst 189 | return newSrc, true 190 | } 191 | -------------------------------------------------------------------------------- /fetch.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "log" 8 | "net/http" 9 | "net/url" 10 | "os" 11 | "regexp" 12 | "strings" 13 | "sync" 14 | "sync/atomic" 15 | "time" 16 | ) 17 | 18 | func fetchHTMLList(done <-chan struct{}, filename string) (*PageChannel, <-chan error) { 19 | feed := make(chan *HTMLPage, config.NumFetcher) 20 | ret, retErr := spawnFetcher(done, feed) 21 | 22 | pc := &PageChannel{send: feed, rec: ret} 23 | 24 | errc := make(chan error) 25 | go func() { 26 | defer close(errc) 27 | in, err := 
os.OpenFile(filename, os.O_RDONLY, 0644) 28 | if err != nil { 29 | errc <- fmt.Errorf("error reading url list: %v", err) 30 | return 31 | } 32 | defer in.Close() 33 | reader := bufio.NewReader(in) 34 | 35 | validURL := regexp.MustCompile(`^/p/([0-9]+)$`) // example: ^/p/7201761174$ 36 | wapURL := regexp.MustCompile(`^/mo/m$`) // example: "/mo/m?kz=7201761174" 37 | 38 | // reading file line by line in go 39 | // https://stackoverflow.com/a/41741702/6091246 40 | // case: 41 | // If you don't mind that the line could be very long (i.e. use a lot of RAM). It keeps the \n at the end of the string returned. 42 | var line string 43 | for isEOF := false; !isEOF; { 44 | line, err = reader.ReadString('\n') 45 | if err != nil { 46 | isEOF = true 47 | } 48 | line = strings.TrimSpace(line) 49 | if line == "" { 50 | continue 51 | } 52 | u, err := url.Parse(strings.TrimSpace(line)) 53 | if err != nil { 54 | log.Printf("[Fetch] Error parsing %s, skipping\n", line) 55 | continue 56 | } 57 | 58 | var pageType HTMLType 59 | 60 | if u.Scheme == "file" { 61 | pageType = HTMLLocal 62 | q := u.Query() 63 | tid := q.Get("tid") // get file tid for TemplateMap key later 64 | if tid == "" { 65 | log.Printf("[Fetch] file path %s is missing tid field, skipping", u) 66 | continue 67 | } 68 | } else { 69 | if u.Host != "tieba.baidu.com" { 70 | log.Printf("[Fetch] URL host %s is not Tieba, skipping", u) 71 | continue 72 | } 73 | 74 | if match := validURL.MatchString(u.Path); match { 75 | pageType = HTMLWebHomepage 76 | // strip query from url 77 | // URL Builder/Query builder in Go 78 | // https://stackoverflow.com/a/26987017/6091246 79 | u.RawQuery = "" 80 | } else if match = wapURL.MatchString(u.Path); match { 81 | pageType = HTMLWebWAPHomepage 82 | } else { 83 | log.Printf("[Fetch] %s is not a valid Tieba post URL, skipping", u) 84 | continue 85 | } 86 | } 87 | 88 | // log.Printf("[Fetch] Got new url from list: %v\n", u) 89 | 90 | if config.StoreExternalResource { 91 | pc.Add(1) 92 | } 93 | 94 
| pc.Add(1) 95 | select { 96 | case pc.send <- &HTMLPage{URL: u, Type: pageType}: 97 | case <-done: 98 | return 99 | } 100 | } 101 | pc.Inited() 102 | }() 103 | 104 | // merge error chans 105 | // Go Concurrency Patterns: Pipelines and cancellation 106 | // https://blog.golang.org/pipelines 107 | errChan := make(chan error) 108 | go func() { 109 | defer close(errChan) 110 | for { 111 | if errc == nil && retErr == nil { 112 | return 113 | } 114 | select { 115 | case err, ok := <-errc: 116 | if !ok { 117 | errc = nil 118 | continue 119 | } 120 | errChan <- err 121 | return 122 | 123 | case err, ok := <-retErr: 124 | if !ok { 125 | retErr = nil 126 | continue 127 | } 128 | errChan <- err 129 | return 130 | } 131 | } 132 | }() 133 | 134 | return pc, errChan 135 | } 136 | 137 | func fetcher(done <-chan struct{}, wg *sync.WaitGroup, jobsLeft *int64, ret chan<- *HTMLPage, jobs chan *HTMLPage) { 138 | defer wg.Done() 139 | for { 140 | select { 141 | case <-done: 142 | return 143 | case page, ok := <-jobs: 144 | if !ok { 145 | return 146 | } 147 | var err error 148 | switch page.Type { 149 | case HTMLLocal: 150 | err = fetchHTMLFromFile(page) 151 | default: 152 | err = fetchHTMLFromURL(page) 153 | } 154 | if err != nil { 155 | go func(page *HTMLPage) { 156 | select { 157 | case <-done: 158 | return 159 | case <-time.After(3 * time.Second): 160 | jobs <- page // add failed task back to jobs 161 | } 162 | }(page) 163 | log.Printf("[Fetch] error fetching %s, pause for 3s: %s\n", page.URL, err) 164 | } else { 165 | select { 166 | case ret <- page: 167 | atomic.AddInt64(jobsLeft, -1) // task done 168 | case <-done: 169 | return 170 | } 171 | } 172 | } 173 | } 174 | } 175 | 176 | func spawnFetcher(done <-chan struct{}, jobs <-chan *HTMLPage) (<-chan *HTMLPage, <-chan error) { 177 | in := make(chan *HTMLPage, config.NumFetcher) // fetcher get tasks from in 178 | ret := make(chan *HTMLPage, config.NumParser) // send HTML content to parser 179 | errc := make(chan error) 180 | 181 | 
jobsLeft := new(int64) 182 | chClosed := false 183 | 184 | var wg sync.WaitGroup 185 | wg.Add(1) 186 | go func() { 187 | defer wg.Done() 188 | defer close(in) 189 | defer close(ret) 190 | for { 191 | if chClosed { 192 | if atomic.LoadInt64(jobsLeft) <= 0 { 193 | return // no more task left in channel in, exit 194 | } 195 | time.Sleep(time.Second) // check every second 196 | continue 197 | } 198 | select { 199 | case <-done: 200 | return 201 | case p, ok := <-jobs: 202 | if !ok { 203 | chClosed = true // parser sends no more jobs, time to exit 204 | continue 205 | } 206 | atomic.AddInt64(jobsLeft, 1) // add job to channel in 207 | in <- p 208 | } 209 | } 210 | }() 211 | for i := 0; i < config.NumFetcher; i++ { 212 | wg.Add(1) 213 | go fetcher(done, &wg, jobsLeft, ret, in) 214 | } 215 | go func() { 216 | wg.Wait() 217 | close(errc) 218 | }() 219 | return ret, errc 220 | } 221 | 222 | func fetchHTMLFromURL(page *HTMLPage) error { 223 | req, err := http.NewRequest("GET", page.URL.String(), nil) 224 | if err != nil { 225 | return err 226 | } 227 | if config.UserAgent != "" { 228 | req.Header.Add("User-Agent", config.UserAgent) 229 | } 230 | if config.CookieString != "" { 231 | req.Header.Add("Cookie", config.CookieString) 232 | } 233 | client := &http.Client{} 234 | resp, err := client.Do(req) 235 | if err != nil { 236 | return err 237 | } 238 | bytes, err := io.ReadAll(resp.Body) 239 | if err != nil { 240 | return err 241 | } 242 | page.Content = bytes 243 | // page.Response = resp 244 | resp.Body.Close() 245 | return nil 246 | } 247 | 248 | func fetchHTMLFromFile(page *HTMLPage) error { 249 | in, err := os.OpenFile(page.URL.Path, os.O_RDONLY, 0644) 250 | if err != nil { 251 | return fmt.Errorf("error reading file path from %s: %v", page.URL.Path, err) 252 | } 253 | defer in.Close() 254 | reader := bufio.NewReader(in) 255 | bytes, err := io.ReadAll(reader) 256 | if err != nil { 257 | return err 258 | } 259 | page.Content = bytes 260 | return nil 261 | } 262 | 
-------------------------------------------------------------------------------- /parse_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "net/url" 5 | "strings" 6 | "testing" 7 | 8 | "github.com/PuerkitoBio/goquery" 9 | ) 10 | 11 | var lzlTotalCommentTestString = `
  • 终极闪耀赛罗✨: 回复 阿比酱最棒啦💖 :说起来纸片人搞饭圈这一套就nm离谱
    2020-5-17 22:37回复
  • 终极闪耀赛罗✨: 回复 阿比酱最棒啦💖 :算了,大佬打架我这萌新还是稍稍吧
    2020-5-17 22:37回复
  • lonelyrangers: 你这形容得太有画面感了
    2020-5-18 02:12回复
  • 现代淑女难求啊: 回复 阿比酱最棒啦💖 :是秦武阳
    2020-5-18 17:06回复
  • 阿比酱最棒啦💖: 回复 现代淑女难求啊 :啊这,丢人
    2020-5-18 17:07回复
  • 我也说一句

    首页 12 | 上一页 13 | 1 14 | 2 15 |

  • ` 16 | 17 | func TestTotalCommentParserFcn(t *testing.T) { 18 | u := &url.URL{} 19 | body := lzlTotalCommentTestString 20 | commentParserFcn(u, body, HTMLLzl, func(key uint64, value *LzlContent) { 21 | // special rule: remove username ahref in ": 回复 ", as requested in #4 22 | strContent := string(value.Content) 23 | content := strings.Trim(strContent, " ") 24 | if strings.HasPrefix(content, "回复") { 25 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(content)) 26 | if err != nil { 27 | t.Errorf("failed to parse comment data: %v, reason: %s", content, err) 28 | } 29 | bodyDOM := doc.Find("body") 30 | s := doc.Find("a.at").First() 31 | userNameHtml, _ := s.Html() 32 | // t.Errorf(userNameHtml) 33 | s.ReplaceWithHtml(userNameHtml) 34 | t.Logf(bodyDOM.Html()) 35 | } 36 | }, func(string, string, string) {}) 37 | } 38 | -------------------------------------------------------------------------------- /type.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "html/template" 7 | "net/url" 8 | "os" 9 | "regexp" 10 | "sync" 11 | "sync/atomic" 12 | 13 | "github.com/fsnotify/fsnotify" 14 | "github.com/pelletier/go-toml" 15 | ) 16 | 17 | // Config stores user specified configurations in config.toml 18 | type Config struct { 19 | NumFetcher int `toml:"numFetcher"` 20 | NumParser int `toml:"numParser"` 21 | NumRenderer int `toml:"numRenderer"` 22 | TemplateName string `toml:"templateName"` 23 | RetryPeriod int `toml:"retryPeriod"` 24 | 25 | HighResImage bool `toml:"highResImage"` 26 | StoreExternalResource bool `toml:"storeExternalResource"` 27 | 28 | UserAgent string `toml:"userAgent"` 29 | CookieString string `toml:"cookieString"` 30 | 31 | ShowNickName bool `toml:"showNickname"` 32 | 33 | watcher *fsnotify.Watcher 34 | } 35 | 36 | func (c *Config) Parse(path string) error { 37 | dataStr, _ := os.ReadFile(path) 38 | err := toml.Unmarshal(dataStr, c) 39 | return 
err 40 | } 41 | 42 | func (c *Config) Watch() (error, <-chan error) { 43 | watcher, err := fsnotify.NewWatcher() 44 | if err != nil { 45 | return err, nil 46 | } 47 | c.watcher = watcher 48 | err = c.watcher.Add(".") 49 | if err != nil { 50 | return err, nil 51 | } 52 | 53 | errChan := make(chan error) 54 | 55 | // Start listening for events. 56 | go func() { 57 | defer c.watcher.Close() 58 | for { 59 | select { 60 | case event, ok := <-c.watcher.Events: 61 | if !ok { 62 | return 63 | } 64 | if event.Has(fsnotify.Write) { 65 | if event.Name == "./config.toml" { 66 | cc := &Config{} 67 | if err := cc.Parse("config.toml"); err == nil { 68 | c.UserAgent = cc.UserAgent 69 | c.CookieString = cc.CookieString 70 | } else { 71 | errChan <- errors.Join(errors.New("failed to parse config.toml"), err) 72 | } 73 | } 74 | } 75 | case err, ok := <-c.watcher.Errors: 76 | if !ok { 77 | return 78 | } 79 | errChan <- err 80 | } 81 | } 82 | }() 83 | 84 | return nil, errChan 85 | } 86 | 87 | // PageChannel share HTML task between fetcher and parser 88 | type PageChannel struct { 89 | // parser get HTML pages from rec 90 | rec <-chan *HTMLPage 91 | 92 | // fetcher get URL from send 93 | send chan<- *HTMLPage 94 | 95 | // number of URL to be fetched and parsed 96 | ref int64 97 | 98 | // flag, whether all URLs from list are added to fetcher 99 | init int64 100 | } 101 | 102 | // Add task number 103 | func (p *PageChannel) Add(n int64) { 104 | if n <= 0 { 105 | return 106 | } 107 | atomic.AddInt64(&p.ref, n) 108 | } 109 | 110 | // Del task number 111 | func (p *PageChannel) Del(n int64) { 112 | if n <= 0 { 113 | return 114 | } 115 | atomic.AddInt64(&p.ref, -n) 116 | } 117 | 118 | // Ref returns task number 119 | func (p *PageChannel) Ref() int64 { 120 | return atomic.LoadInt64(&p.ref) 121 | } 122 | 123 | // Inited returns whether all URLs are read from url.txt 124 | func (p *PageChannel) Inited() { 125 | atomic.StoreInt64(&p.init, 1) 126 | } 127 | 128 | // IsDone returns whether all HTML 
// HTMLType tells parser how to parse the HTMLPage
type HTMLType int

const (
	// HTMLWebHomepage is the first page of a Tieba post
	HTMLWebHomepage HTMLType = iota

	// HTMLWebPage is a page of a Tieba post
	HTMLWebPage

	// HTMLJSON is the Lzl totalComment in JSON format
	HTMLJSON

	// HTMLLzlHome is the Lzl Comment of a comment in page 2 in JSON format
	HTMLLzlHome

	// HTMLLzl is the Lzl Comment of a comment in JSON format
	HTMLLzl

	// HTMLLocal is a local HTML or JSON file
	HTMLLocal

	// HTMLWebWAPHomepage is the first page of a wap post
	HTMLWebWAPHomepage

	// HTMLWebWAPPage supports fetching wap posts
	HTMLWebWAPPage

	// HTMLExternalResource contains external resources (i.e. images)
	HTMLExternalResource
)

// HTMLPage is a job for fetcher and parser
type HTMLPage struct {
	// URL of the Page
	URL *url.URL

	// Content is the HTML code of the Page
	Content []byte

	// Type indicates different types of Tieba data
	Type HTMLType

	// Path where downloaded external resources are saved
	Path string
	// ThreadID links external resources to corresponding TemplateField
	ThreadID uint64
}

// TiebaField parse "data-field" of each thread
type TiebaField struct {
	Author struct {
		UserID   uint64 `json:"user_id"`
		UserName string `json:"user_name"` // user name
	} `json:"author"`
	Content struct {
		PostID   uint64 `json:"post_id"`
		ForumID  uint64 `json:"forum_id"`
		ThreadID uint64 `json:"thread_id"`
		Content  string `json:"content"` // post body
		PostNO   uint64 `json:"post_no"` // floor number
	} `json:"content"`
}

// LzlField parse Lzl JSON data
type LzlField struct {
	ErrNO  int64                      `json:"errno"`
	ErrMsg string                     `json:"errmsg"`
	Data   map[string]json.RawMessage `json:"data"`
}

// LzlContent is a comment of Lzl from totalComment
type LzlContent struct {
	Index        int64
	UserName     string        `json:"username"`
	UserNickname string        `json:"show_nickname,omitempty"`
	Content      template.HTML `json:"content"`
	Timestamp    int64         `json:"now_time"`
	Time         string
}

// LzlComment indicates the relationship between a Tieba posts and the attached Lzl comment
type LzlComment struct {
	Num     uint64        `json:"comment_num"`
	ListNum uint64        `json:"comment_list_num"`
	Info    []*LzlContent `json:"comment_info"`
}

// LzlPageComment indicates the total number of LzlComments in a single comment
type LzlPageComment struct {
	TotalNum  uint64 `json:"total_num"`
	TotalPage uint64 `json:"total_page"`
}

// OutputField render Tieba post in template
type OutputField struct {
	UserName template.HTML
	Content  template.HTML
	PostNO   uint64
	PostID   uint64
	Time     string
}

// LzlMap provides a thread safe map insert method
type LzlMap struct {
	Map  map[uint64]*LzlComment
	lock *sync.Mutex
}

// Append adds c to the comment list stored under k, with synchronization.
// If no entry exists for k yet (or the stored entry is nil) an empty
// LzlComment is created first; previously this case dereferenced a nil
// pointer and panicked.
func (lzl *LzlMap) Append(k uint64, c *LzlContent) {
	lzl.lock.Lock()
	defer lzl.lock.Unlock()
	entry := lzl.Map[k]
	if entry == nil {
		entry = &LzlComment{}
		lzl.Map[k] = entry
	}
	entry.Info = append(entry.Info, c)
}

// Insert stores v under k with synchronization, replacing any previous entry.
func (lzl *LzlMap) Insert(k uint64, v *LzlComment) {
	lzl.lock.Lock()
	lzl.Map[k] = v
	lzl.lock.Unlock()
}

// IsExist returns true if key is already in Map
func (lzl *LzlMap) IsExist(k uint64) bool {
	lzl.lock.Lock()
	_, ok := lzl.Map[k]
	lzl.lock.Unlock()
	return ok
}

// ExternalResourceMap keeps records of fetched external resources
type ExternalResourceMap struct {
	Map  map[string]interface{}
	lock *sync.Mutex
}

// Get reports whether k has been recorded.
func (erm *ExternalResourceMap) Get(k string) bool {
	erm.lock.Lock()
	defer erm.lock.Unlock()
	_, ok := erm.Map[k]
	return ok
}

// Set records k.
func (erm *ExternalResourceMap) Set(k string) {
	erm.lock.Lock()
	defer erm.lock.Unlock()
	erm.Map[k] = nil
}

// Put records k and reports whether it was new, i.e. it returns true only for
// the first Put/Set of a given key.
func (erm *ExternalResourceMap) Put(k string) bool {
	erm.lock.Lock()
	defer erm.lock.Unlock()
	_, seen := erm.Map[k]
	erm.Map[k] = nil
	return !seen
}

// TemplateField stores all necessary information to render a HTML page
type TemplateField struct {
	Title     string
	Url       string
	ThreadID  uint64
	Comments  []*OutputField
	pagesLeft int64
	Lzls      *LzlMap // Key is PostID
	lzlsLeft  int64
	resLeft   int64
	mutex     *sync.RWMutex
	send      bool
	rendered  int64
	resMap    *ExternalResourceMap
}

// NewTemplateField returns an initialized TemplateField for threadID, with
// empty comment storage and ready-to-use maps and locks.
func NewTemplateField(threadID uint64) *TemplateField {
	return &TemplateField{
		ThreadID: threadID,
		Comments: make([]*OutputField, 0, 30),
		Lzls: &LzlMap{
			Map:  make(map[uint64]*LzlComment),
			lock: &sync.Mutex{},
		},
		mutex: &sync.RWMutex{},
		resMap: &ExternalResourceMap{
			Map:  make(map[string]interface{}),
			lock: &sync.Mutex{},
		},
	}
}
331 | Lzls: &LzlMap{ 332 | Map: make(map[uint64]*LzlComment), 333 | lock: &sync.Mutex{}, 334 | }, 335 | mutex: &sync.RWMutex{}, 336 | resMap: &ExternalResourceMap{ 337 | Map: make(map[string]interface{}), 338 | lock: &sync.Mutex{}, 339 | }, 340 | } 341 | return tf 342 | } 343 | 344 | // Send parsed Tieba posts to render 345 | // https://misfra.me/optimizing-concurrent-map-access-in-go/ 346 | func (t *TemplateField) Send(c chan *TemplateField) { 347 | t.mutex.RLock() 348 | if !t.send { 349 | t.mutex.RUnlock() 350 | t.mutex.Lock() 351 | if !t.send { 352 | c <- t 353 | t.send = true 354 | } 355 | t.mutex.Unlock() 356 | } else { 357 | t.mutex.RUnlock() 358 | } 359 | } 360 | 361 | // AddPage adds the number of Page to be parsed 362 | func (t *TemplateField) AddPage(n int64) { 363 | atomic.AddInt64(&t.pagesLeft, n) 364 | } 365 | 366 | // AddLzl adds the number of Lzls to be parsed 367 | func (t *TemplateField) AddLzl(n int64) { 368 | atomic.AddInt64(&t.lzlsLeft, n) 369 | } 370 | 371 | // Append a new post to TemplateField 372 | func (t *TemplateField) Append(post *OutputField) { 373 | t.mutex.Lock() 374 | // l := len(t.Comments) 375 | // n := l + 1 376 | // if n > cap(t.Comments) { 377 | // newSlice := make([]*OutputField, 30*10+n+1) 378 | // copy(newSlice, t.Comments) 379 | // t.Comments = newSlice 380 | // } 381 | // t.Comments = t.Comments[0:n] 382 | // copy(t.Comments[n:n+1], post) 383 | t.Comments = append(t.Comments, post) 384 | t.mutex.Unlock() 385 | } 386 | 387 | // IsDone returns whether TemplateField is ready to be rendered 388 | func (t *TemplateField) IsDone() bool { 389 | pagesLeft := atomic.LoadInt64(&t.pagesLeft) 390 | lzlsLeft := atomic.LoadInt64(&t.lzlsLeft) 391 | ret := pagesLeft <= 0 && lzlsLeft <= 0 392 | if config.StoreExternalResource { 393 | resLeft := atomic.LoadInt64(&t.resLeft) 394 | // log.Printf("%d: resLeft (%d)", t.ThreadID, resLeft) 395 | ret = ret && (resLeft <= 0) 396 | } 397 | return ret 398 | } 399 | 400 | // Merge consecutive posts 
whose Useaname is the same 401 | func (t *TemplateField) Merge() { 402 | l := len(t.Comments) 403 | for i := 0; i+1 < l; i++ { 404 | if t.Comments[i+1].UserName != t.Comments[i].UserName { 405 | continue 406 | } 407 | v, ok := t.Lzls.Map[t.Comments[i+1].PostID] 408 | if ok && v.ListNum != 0 && v.Num != 0 { 409 | continue 410 | } 411 | v, ok = t.Lzls.Map[t.Comments[i].PostID] 412 | if ok && v.ListNum != 0 && v.Num != 0 { 413 | continue 414 | } 415 | // How to efficiently concatenate strings in Go? 416 | // https://stackoverflow.com/a/43675122/6091246 417 | bs := make([]byte, len(t.Comments[i].Content)+len(t.Comments[i+1].Content)+1) 418 | bl := 0 419 | bl += copy(bs[bl:], t.Comments[i].Content) 420 | bs[bl] = '\n' 421 | bl++ 422 | bl += copy(bs[bl:], t.Comments[i+1].Content) 423 | t.Comments[i].Content = template.HTML(bs) 424 | // t.Comments[i].Content = t.Comments[i].Content + "\n" + t.Comments[i+1].Content 425 | // removes duplicate values in given slice 426 | // https://gist.github.com/alioygur/16c66b4249cb42715091fe010eec7e33#file-unique_slice-go-L13 427 | t.Comments = append(t.Comments[:i+1], t.Comments[i+2:]...) 
428 | i-- 429 | l-- 430 | } 431 | } 432 | 433 | // Unique removes any duplicate posts using PoseNO 434 | func (t *TemplateField) Unique() { 435 | // Idiomatic way to remove duplicates in a slice 436 | // https://www.reddit.com/r/golang/comments/5ia523/idiomatic_way_to_remove_duplicates_in_a_slice/db6qa2e/ 437 | seen := make(map[uint64]struct{}, len(t.Comments)) 438 | j := 0 439 | for _, v := range t.Comments { 440 | if _, ok := seen[v.PostNO]; ok { 441 | continue 442 | } 443 | seen[v.PostNO] = struct{}{} 444 | t.Comments[j] = v 445 | j++ 446 | } 447 | t.Comments = t.Comments[:j] 448 | } 449 | 450 | // Rendered returns true if the template is written to the output file 451 | func (t *TemplateField) Rendered() bool { 452 | return atomic.LoadInt64(&t.rendered) != 0 453 | } 454 | 455 | // SetRendered could be used to change rendered status 456 | func (t *TemplateField) SetRendered(status bool) { 457 | if status { 458 | atomic.StoreInt64(&t.rendered, 1) 459 | } else { 460 | atomic.StoreInt64(&t.rendered, 0) 461 | } 462 | } 463 | 464 | func (t *TemplateField) FileName() string { 465 | // #6: remove illegal character in title 466 | // ref: https://www.codeproject.com/tips/758861/removing-characters-which-are-not-allowed-in-windo 467 | filenameRegex := regexp.MustCompile(`[\\/:*?""<>|]`) 468 | validFilename := filenameRegex.ReplaceAllLiteralString(t.Title, "") 469 | return validFilename 470 | } 471 | 472 | // TemplateMap manipulate a Tieba thread in parser 473 | type TemplateMap struct { 474 | Map map[uint64]*TemplateField // Key is ThreadID 475 | lock *sync.RWMutex 476 | Channel chan *TemplateField 477 | } 478 | 479 | // Get returns a value from Map with synchronization 480 | // see: https://misfra.me/optimizing-concurrent-map-access-in-go/ for more detail 481 | func (tm *TemplateMap) Get(k uint64) *TemplateField { 482 | var val *TemplateField 483 | var ok bool 484 | tm.lock.RLock() 485 | if val, ok = tm.Map[k]; !ok { 486 | tm.lock.RUnlock() 487 | tm.lock.Lock() 488 | if 
val, ok = tm.Map[k]; !ok { 489 | val = NewTemplateField(k) 490 | tm.Map[k] = val 491 | } 492 | tm.lock.Unlock() 493 | } else { 494 | tm.lock.RUnlock() 495 | } 496 | return val 497 | } 498 | 499 | // Sweep search threadIDs for elements ready for rendering 500 | func (tm *TemplateMap) Sweep(pc *PageChannel) { 501 | tm.lock.RLock() 502 | for k := range tm.Map { 503 | tf := tm.Map[k] 504 | if tf.IsDone() && !tf.Rendered() { 505 | go tf.Send(tm.Channel) 506 | } 507 | } 508 | tm.lock.RUnlock() 509 | 510 | // TODO: delete rendered threads from TemplateMap 511 | } 512 | -------------------------------------------------------------------------------- /parse.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "encoding/json" 7 | "errors" 8 | "fmt" 9 | "html/template" 10 | "log" 11 | "math/rand" 12 | "net/url" 13 | "os" 14 | "path/filepath" 15 | "regexp" 16 | "strconv" 17 | "strings" 18 | "sync" 19 | "sync/atomic" 20 | "time" 21 | 22 | "github.com/PuerkitoBio/goquery" 23 | ) 24 | 25 | var letterRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") 26 | 27 | func randStringRunes(n int) string { 28 | b := make([]rune, n) 29 | for i := range b { 30 | b[i] = letterRunes[rand.Intn(len(letterRunes))] 31 | } 32 | return string(b) 33 | } 34 | 35 | func htmlParseWrapperFcn(done <-chan struct{}, pc *PageChannel, page *HTMLPage, tmMap *TemplateMap, querySelector, threadSelector string, callback func(tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) error) error { 36 | defer pc.Del(1) 37 | doc, err := goquery.NewDocumentFromReader(bytes.NewReader(page.Content)) 38 | if err != nil { 39 | // network error, retry request 40 | pc.Add(1) 41 | go addPageToFetchQueue(done, pc, time.Duration(config.RetryPeriod)*time.Second, page.URL, page.Type) 42 | return fmt.Errorf("error parsing %s(title: %s): %v", page.URL, findTitle(doc), err) 43 | } 44 | 45 | posts := 
doc.Find(querySelector) 46 | threadRegex := regexp.MustCompile(threadSelector) 47 | match := threadRegex.FindStringSubmatch(string(page.Content)) 48 | if len(match) < 1 { 49 | // network error, retry request 50 | pc.Add(1) 51 | go addPageToFetchQueue(done, pc, time.Duration(config.RetryPeriod)*time.Second, page.URL, page.Type) 52 | return fmt.Errorf("unable to parse page(title: %s), possibly a network error, readding url to queue %s", findTitle(doc), page.URL) 53 | } 54 | strInt, _ := strconv.ParseInt(match[1], 10, 64) 55 | threadID := uint64(strInt) 56 | // TODO: wrap tf.Add method in order to substitute image with html embedded one 57 | tf := tmMap.Get(threadID) 58 | err = callback(tf, doc, posts) 59 | // page.Response.Body.Close() 60 | if err != nil { 61 | pc.Add(1) 62 | go addPageToFetchQueue(done, pc, time.Duration(config.RetryPeriod)*time.Second, page.URL, page.Type) 63 | } else { 64 | tf.AddPage(-1) 65 | } 66 | return err 67 | } 68 | 69 | func htmlParse(done <-chan struct{}, pc *PageChannel, page *HTMLPage, tmMap *TemplateMap, callback func(tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) error) error { 70 | // posts := doc.Find("div.l_post.j_l_post.l_post_bright") 71 | // threadRegex := regexp.MustCompile(`\b"?thread_id"?:"?(\d+)"?\b`) 72 | return htmlParseWrapperFcn(done, pc, page, tmMap, "div.l_post.j_l_post.l_post_bright", `\b"?thread_id"?:"?(\d+)"?\b`, callback) 73 | } 74 | 75 | func homePageParserFcn(done <-chan struct{}, pc *PageChannel, tf *TemplateField, doc *goquery.Document, posts *goquery.Selection, page *HTMLPage, pageTitleFinder func(doc *goquery.Document) string, pageNumFinder func(doc *goquery.Document) (int64, error), pageType HTMLType, parserFcn func(page *HTMLPage, tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) error) error { 76 | tf.Title = pageTitleFinder(doc) 77 | log.Printf("[homepage] Title: %s", tf.Title) 78 | // issue #11 add url to page content 79 | tf.Url = page.URL.String() 80 | 81 | 
pageNum, err := pageNumFinder(doc) 82 | if err != nil { 83 | return fmt.Errorf("error parsing total number of pages: %v", err) 84 | } 85 | 86 | atomic.StoreInt64(&tf.pagesLeft, pageNum) 87 | atomic.StoreInt64(&tf.lzlsLeft, pageNum) 88 | // fetch all comments and lzls, excluding comments in the first page 89 | // pageNum - 1: html page 2~pageNum 90 | // pageNum + 1: lzl page 0~pageNum 91 | pc.Add(pageNum - 1 + pageNum + 1) 92 | go addPageToFetchQueueFromHomePage(done, pc, page.URL, tf.ThreadID, pageNum, pageType) 93 | 94 | return parserFcn(page, tf, doc, posts) 95 | } 96 | 97 | func homepageParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap) error { 98 | return htmlParse(done, pc, page, tmMap, func(tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) error { 99 | return homePageParserFcn(done, pc, tf, doc, posts, page, findTitle, func(doc *goquery.Document) (int64, error) { 100 | var pageNum int64 101 | if s := doc.Find("span.red").Eq(1); s.Text() == "" { 102 | pageNum = 1 // Could not find total number of pages, default to 1 103 | } else { 104 | n, err := strconv.Atoi(s.Text()) 105 | if err != nil { 106 | return 0, fmt.Errorf("error parsing total number of pages: %v", err) 107 | } 108 | pageNum = int64(n) 109 | } 110 | return pageNum, nil 111 | }, HTMLWebPage, pageParserFcn) 112 | }) 113 | } 114 | 115 | func pageParserFcn(page *HTMLPage, tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) error { 116 | posts.Each(func(i int, s *goquery.Selection) { 117 | // filter elements that has more than 4 class (maybe an advertisement, commit 9c82d4e381d1bcd3f801bf5f6c07960fb7d829be) 118 | classStr, _ := s.Attr("class") // get class string 119 | if len(strings.Fields(classStr)) > 4 { 120 | return 121 | } 122 | 123 | dataField, ok := s.Attr("data-field") 124 | if !ok { 125 | // maybe not an error, but an older version of data-field 126 | fmt.Fprintf(os.Stderr, "#%d data-field not found: %s\n", i, page.URL) // there's 
a error on the page, maybe Tieba updated the syntax 127 | return 128 | } 129 | 130 | var tiebaPost TiebaField 131 | var res OutputField 132 | err := json.Unmarshal([]byte(dataField), &tiebaPost) 133 | if err != nil { 134 | fmt.Fprintf(os.Stderr, "#%d data-field unmarshal failed: %v, url: %s\n", i, err, page.URL) // there's a error on the page, maybe Tieba updated the syntax 135 | return 136 | } 137 | if content, err := s.Find("div.d_author ul.p_author li.d_name a.p_author_name.j_user_card").Html(); err != nil { 138 | fmt.Fprintf(os.Stderr, "#%d Error parsing username from %s\n", i, page.URL) 139 | return 140 | } else { 141 | res.UserName = template.HTML(handleUserNameEmojiURL(content)) 142 | } 143 | 144 | res.Content = template.HTML(tiebaPost.Content.Content) 145 | res.PostNO = tiebaPost.Content.PostNO 146 | res.PostID = tiebaPost.Content.PostID 147 | 148 | if res.Content == "" { 149 | // data-field does not contain content 150 | // infer an old version of posts 151 | postID := fmt.Sprintf("#post_content_%d", res.PostID) 152 | content, err := posts.Find(postID).Html() 153 | if err != nil { 154 | log.Printf("#%d: post_content_%d parse failed, %s", i, res.PostID, err) 155 | } else { 156 | res.Content = template.HTML(content) 157 | } 158 | } 159 | 160 | // get post time 161 | // Jquery过滤选择器,选择前几个元素,后几个元素,内容过滤选择器等 162 | // http://www.cnblogs.com/alone2015/p/4962687.html 163 | for _, elem := range s.Find("span.tail-info").EachIter() { 164 | if tm, err := time.Parse(`2006-01-02 15:04`, elem.Text()); err == nil { 165 | res.Time = tm.Format("2006-01-02 15:04") 166 | } 167 | } 168 | 169 | tf.Append(&res) 170 | // log.Printf("#%d data-field found: %v\n", i, tiebaPost) 171 | // log.Printf("#%d data-field found:\nauthor: %s\ncontent: %s\n", 172 | // tiebaPost.Content.PostNo, 173 | // tiebaPost.Author.UserName, 174 | // tiebaPost.Content.Content) 175 | 176 | // result.Posts <- &res 177 | }) 178 | return nil 179 | } 180 | 181 | func pageParser(done <-chan struct{}, page 
*HTMLPage, pc *PageChannel, tmMap *TemplateMap) error { 182 | // log.Printf("[Parse] parsing %s", page.URL.String()) 183 | return htmlParse(done, pc, page, tmMap, func(tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) (err error) { 184 | return pageParserFcn(page, tf, doc, posts) 185 | }) 186 | } 187 | 188 | func wapParse(done <-chan struct{}, pc *PageChannel, page *HTMLPage, tmMap *TemplateMap, callback func(tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) error) error { 189 | // posts := doc.Find("div.i") 190 | // threadRegex := regexp.MustCompile(`kz=(\d+)`) 191 | return htmlParseWrapperFcn(done, pc, page, tmMap, "div.i", `kz=(\d+)`, callback) 192 | } 193 | 194 | func wapHomePageParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap) error { 195 | return wapParse(done, pc, page, tmMap, func(tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) error { 196 | return homePageParserFcn(done, pc, tf, doc, posts, page, findWapTitle, func(doc *goquery.Document) (int64, error) { 197 | var pageNum int64 198 | pageNumMatcher := regexp.MustCompile(`第\d+/(\d+)页`) 199 | matches := pageNumMatcher.FindStringSubmatch(doc.Find("div.h").Text()) 200 | if len(matches) > 1 { 201 | n, err := strconv.Atoi(matches[1]) 202 | if err != nil { 203 | return 0, fmt.Errorf("error parsing total number of pages: %v", err) 204 | } 205 | pageNum = int64(n) 206 | } else { 207 | pageNum = 1 // Could not find total number of pages, default to 1 208 | } 209 | return pageNum, nil 210 | }, HTMLWebWAPPage, wapParserFcn) 211 | }) 212 | } 213 | 214 | func wapParserFcn(page *HTMLPage, tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) error { 215 | postMatcher := regexp.MustCompile(`^(\d+)楼. (.*)
    `) 216 | posts.Each(func(i int, s *goquery.Selection) { 217 | var res OutputField 218 | if content, err := s.Find(".g > a").Html(); err != nil { 219 | fmt.Fprintf(os.Stderr, "#%d Error parsing username from %s\n", i, page.URL) 220 | return 221 | } else { 222 | res.UserName = template.HTML(handleUserNameEmojiURL(content)) 223 | } 224 | 225 | sBody, _ := s.Html() 226 | sTable, _ := s.Find("table").Html() 227 | sContent := strings.ReplaceAll(sBody, fmt.Sprintf("%s
    ", sTable), "") 228 | if matches := postMatcher.FindStringSubmatch(string(sContent)); len(matches) < 3 { 229 | fmt.Fprintf(os.Stderr, "#%d Error parsing post content from %s\n", i, page.URL) 230 | return 231 | } else { 232 | n, err := strconv.Atoi(matches[1]) 233 | if err != nil { 234 | fmt.Fprintf(os.Stderr, "error parsing post number: %v", err) 235 | return 236 | } 237 | res.PostNO = uint64(n) 238 | res.Content = template.HTML(matches[2]) 239 | } 240 | // res.PostID = tiebaPost.Content.PostID 241 | if sReply, ok := s.Find(".r>a").Attr("href"); ok { 242 | if replyUrl, err := url.Parse(sReply); err == nil { 243 | pid := replyUrl.Query().Get("pid") 244 | if n, err := strconv.Atoi(pid); err == nil { 245 | res.PostID = uint64(n) 246 | } 247 | } 248 | } 249 | res.Time = s.Find(".b").Text() 250 | if tm, err := time.Parse(`1-2 15:04`, res.Time); err == nil { 251 | // rewrite time from "1-22 13:07" to "2021-01-22 13:07" for consistency 252 | tm = tm.AddDate(time.Now().Year(), 0, 0) 253 | res.Time = tm.Format("2006-01-02 15:04") 254 | } 255 | 256 | tf.Append(&res) 257 | }) 258 | return nil 259 | } 260 | 261 | func wapPageParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap) error { 262 | // log.Printf("[Parse] parsing %s", page.URL.String()) 263 | return wapParse(done, pc, page, tmMap, func(tf *TemplateField, doc *goquery.Document, posts *goquery.Selection) (err error) { 264 | return wapParserFcn(page, tf, doc, posts) 265 | }) 266 | } 267 | 268 | // parse lzl comment, JSON formatted 269 | func jsonParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap, callback func(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap, tf *TemplateField) error) error { 270 | defer pc.Del(1) 271 | u := page.URL 272 | q := u.Query() 273 | tid := q.Get("tid") 274 | if tid == "" { 275 | return fmt.Errorf("error parsing getting tid from %s", page.URL) // skip illegal URL 276 | } 277 | ret, _ := strconv.Atoi(tid) 278 
| threadID := uint64(ret) 279 | // TODO: wrap tf.Add method in order to substitute image with html embedded one 280 | tf := tmMap.Get(threadID) 281 | defer tf.AddLzl(-1) 282 | err := callback(done, page, pc, tmMap, tf) 283 | if err != nil { 284 | pc.Add(1) 285 | tf.AddLzl(1) 286 | go addPageToFetchQueue(done, pc, time.Duration(config.RetryPeriod)*time.Second, page.URL, page.Type) 287 | } 288 | return err 289 | } 290 | 291 | func requestLzlComment(tid string, pid string, pn string, tp HTMLType, pc *PageChannel) { 292 | // there are more lzls to fetch 293 | // url syntax: 294 | // url example: https://tieba.baidu.com/p/comment?tid=7201761174&pn=4 295 | u := &url.URL{ 296 | Scheme: "http", 297 | Host: "tieba.baidu.com", 298 | Path: "/p/comment", 299 | } 300 | q := u.Query() 301 | // q.Set("t", strconv.Itoa(int(time.Now().UnixNano()/1000000))) 302 | q.Set("tid", tid) 303 | // q.Set("pid", pid) 304 | q.Set("pn", pn) // start fetching additional comment from page 2 305 | u.RawQuery = q.Encode() 306 | 307 | // log.Printf("requesting %s", u) 308 | 309 | pc.send <- &HTMLPage{ 310 | URL: u, 311 | Type: tp, 312 | } 313 | } 314 | 315 | func totalCommentParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap) error { 316 | return jsonParser(done, page, pc, tmMap, func(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap, tf *TemplateField) error { 317 | url := page.URL.String() 318 | body := string(page.Content) 319 | var lzl LzlField 320 | var err error 321 | contentCandidates := make(chan string, 10) 322 | contentCandidates <- body 323 | for len(contentCandidates) > 0 { 324 | contentBuffer := <-contentCandidates 325 | err := json.Unmarshal([]byte(contentBuffer), &lzl) 326 | if err != nil { 327 | switch err := err.(type) { 328 | default: 329 | if len(contentCandidates) == 0 { 330 | return fmt.Errorf("error parsing content file %s: %v", url, err) 331 | } 332 | case *json.SyntaxError: 333 | // handle corrupted json data, as in #12 
334 | // example: https://tieba.baidu.com/p/totalComment?fid=572638&pn=0&t=1617364074015&tid=6212415344&red_tag=3017655123 335 | if contentBuffer[:err.Offset-1] != `{"errno":null,"errmsg":null}` { 336 | fmt.Fprintf(os.Stderr, "[Parser] warning: lzl data corrupted: %s: %s, trying to reparse strings between offset %d", url, contentBuffer, err.Offset) 337 | contentCandidates <- contentBuffer[:err.Offset-1] 338 | } 339 | contentCandidates <- contentBuffer[err.Offset-1:] 340 | } 341 | } 342 | if lzl.ErrMsg == "success" { 343 | break 344 | } 345 | } 346 | if lzl.ErrMsg != "success" { 347 | return fmt.Errorf("unable to find json lzl with ErrMsg(\"success\"), last message was: %s", body) 348 | } 349 | if lzl.ErrNO != 0 { 350 | return fmt.Errorf("error getting data: %s, %s", url, lzl.ErrMsg) 351 | } 352 | commentList, ok := lzl.Data["comment_list"] 353 | if !ok { 354 | return fmt.Errorf("error getting comment_list: %s", url) 355 | } 356 | if string(commentList) == "" || string(commentList) == "[]" { 357 | return nil // comment list empty, stop 358 | } 359 | comments := make(map[uint64]*LzlComment) 360 | err = json.Unmarshal([]byte(string(commentList)), &comments) 361 | if err != nil { 362 | return fmt.Errorf("error parsing comment_list from %s: %v\ncomment_list:\n%s", url, err, commentList) 363 | } 364 | 365 | if len(comments) == 0 { 366 | return nil // does not have any comments, stop 367 | } 368 | 369 | for pid, v := range comments { 370 | if tf.Lzls.IsExist(pid) { 371 | // totalComment contains lzls in different pages, which are duplicate 372 | continue 373 | } 374 | // normalize 375 | for i, comment := range v.Info { 376 | comment.Index = int64(i) 377 | comment.Time = time.Unix(comment.Timestamp, 0).In(time.Local).Format("2006-01-02 15:04") 378 | comment.UserName = handleUserNameEmojiURL(comment.UserName) 379 | if config.ShowNickName && comment.UserNickname != "" { 380 | comment.UserName = comment.UserNickname 381 | } 382 | comment.Content = 
template.HTML(reformatLzlUsername(string(comment.Content))) 383 | } 384 | // merge maps 385 | // Getting the union of two maps in go 386 | // https://stackoverflow.com/a/22621838/6091246 387 | numLeft := int64(v.Num) - int64(v.ListNum) 388 | if numLeft > 0 { 389 | // extend Lzl slice if needed 390 | if n := len(v.Info); uint64(n) < v.Num { 391 | // extend slice 392 | newSlice := make([]*LzlContent, n, v.Num+1) 393 | copy(newSlice, v.Info) 394 | v.Info = newSlice 395 | } 396 | pc.Add(1) 397 | tf.AddLzl(1) 398 | go requestLzlComment(strconv.Itoa(int(tf.ThreadID)), strconv.Itoa(int(pid)), "2", HTMLLzlHome, pc) 399 | } 400 | tf.Lzls.Insert(pid, v) // merge maps 401 | } 402 | return nil 403 | }) 404 | } 405 | 406 | func commentParserFcn(url *url.URL, body string, pageType HTMLType, appendLzl func(key uint64, value *LzlContent), requestLzlCommentFcn func(tid, pid, pageNum string)) error { 407 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(body)) 408 | if err != nil { 409 | return fmt.Errorf("error parsing %s: %v", url.String(), err) 410 | } 411 | q := url.Query() 412 | tid := q.Get("tid") 413 | pid := q.Get("pid") 414 | pn := q.Get("pn") 415 | if pageType == HTMLLzlHome { 416 | s := doc.Find("li.lzl_li_pager_s") 417 | dataField, ok := s.Attr("data-field") 418 | if !ok { 419 | return fmt.Errorf("error parsing %s: total number of pages is not determinable", url) 420 | } 421 | var lzlPage LzlPageComment 422 | err := json.Unmarshal([]byte(dataField), &lzlPage) 423 | if err != nil { 424 | return fmt.Errorf("LzlPageComment data-field unmarshal failed: %v, url: %s", err, url) 425 | } 426 | // tf.AddLzl(int64(lzlPage.TotalPage - 2)) 427 | for i := uint64(3); i <= lzlPage.TotalPage; i++ { 428 | requestLzlCommentFcn(tid, pid, strconv.Itoa(int(i))) 429 | // requestLzlComment(tid, pid, strconv.Itoa(int(i)), HTMLLzl, pc) 430 | } 431 | } 432 | exLzls := doc.Find(".lzl_single_post.j_lzl_s_p") 433 | exLzls.Each(func(i int, s *goquery.Selection) { 434 | pageNum, _ := 
strconv.Atoi(pn) 435 | key, _ := strconv.Atoi(pid) 436 | content, err := s.Find(".lzl_content_main").Html() 437 | if err != nil { 438 | return 439 | } 440 | user := s.Find("div.lzl_cnt a.at.j_user_card") 441 | userName := user.Text() 442 | // userName, ok := user.Attr("username") 443 | // if !ok { 444 | // // userName not found 445 | // log.Printf("ExLzl: cannot find username for pid=%s, index=%d", pid, i+pageNum*10) 446 | // return 447 | // } else 448 | if userName == "" { 449 | // user name is empty, try another method 450 | log.Printf("ExLzl: please check url: %s", url) 451 | return 452 | } 453 | c := &LzlContent{ 454 | Index: int64(i + pageNum*10), 455 | UserName: handleUserNameEmojiURL(userName), 456 | Content: template.HTML(reformatLzlUsername(content)), 457 | Time: s.Find(".lzl_time").Text(), 458 | } 459 | appendLzl(uint64(key), c) 460 | // tf.Lzls.Append(uint64(key), c) 461 | }) 462 | return nil 463 | } 464 | 465 | func commentParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap) error { 466 | return jsonParser(done, page, pc, tmMap, func(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap, tf *TemplateField) error { 467 | return commentParserFcn(page.URL, string(page.Content), page.Type, func(key uint64, value *LzlContent) { 468 | tf.Lzls.Append(uint64(key), value) 469 | }, func(tid, pid, pageNum string) { 470 | pc.Add(1) 471 | tf.AddLzl(1) 472 | go requestLzlComment(tid, pid, pageNum, HTMLLzl, pc) 473 | }) 474 | }) 475 | } 476 | 477 | // parse templateField from local file, JSON formatted 478 | func templateParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap) error { 479 | defer pc.Del(1) 480 | var threadID uint64 481 | 482 | u := page.URL 483 | q := u.Query() 484 | tid := q.Get("tid") 485 | if tid == "" { 486 | return fmt.Errorf("error parsing getting tid from %s", page.URL.String()) // skip illegal URL 487 | } 488 | ret, _ := strconv.Atoi(tid) 489 | threadID = uint64(ret) 490 
| 491 | var tf = tmMap.Get(threadID) 492 | 493 | tf.mutex.Lock() 494 | err := json.Unmarshal(page.Content, tf) 495 | tf.mutex.Unlock() 496 | if err != nil { 497 | return fmt.Errorf("error parsing template file %s: %v", page.URL.String(), err) 498 | } 499 | 500 | return nil 501 | } 502 | 503 | func externalResourceParser(done <-chan struct{}, page *HTMLPage, pc *PageChannel, tmMap *TemplateMap) error { 504 | defer pc.Del(1) 505 | threadID := page.ThreadID 506 | var tf = tmMap.Get(threadID) 507 | 508 | dstPath := fmt.Sprintf("output/%s", page.Path) 509 | // log.Printf("writing %s", dstPath) 510 | outputPath := filepath.Dir(dstPath) 511 | if err := os.MkdirAll(outputPath, 0755); err != nil && !os.IsNotExist(err) { 512 | return fmt.Errorf("error creating external resource folder (%s): %v", outputPath, err) 513 | } 514 | data := page.Content 515 | if err := writeOutput(dstPath, func(w *bufio.Writer) error { 516 | for len(data) > 0 { 517 | n, err := w.Write(data) 518 | if err != nil { 519 | return err 520 | } 521 | data = data[n:] 522 | } 523 | return nil 524 | }); err != nil { 525 | return fmt.Errorf("error writing external resource (%s): %v", dstPath, err) 526 | } 527 | atomic.AddInt64(&tf.resLeft, -1) 528 | 529 | return nil 530 | } 531 | 532 | func parser(done <-chan struct{}, errc chan<- error, wg *sync.WaitGroup, pc *PageChannel, tmMap *TemplateMap) { 533 | defer wg.Done() 534 | var err error 535 | ticker := time.NewTicker(1 * time.Second) 536 | for { 537 | select { 538 | case <-done: 539 | return 540 | case p, ok := <-pc.rec: 541 | if !ok { 542 | return // quit when pc.rec is closed 543 | } 544 | switch p.Type { 545 | case HTMLWebHomepage: 546 | err = homepageParser(done, p, pc, tmMap) 547 | if err != nil { 548 | errc <- err 549 | } 550 | case HTMLWebPage: 551 | err = pageParser(done, p, pc, tmMap) 552 | if err != nil { 553 | errc <- err 554 | } 555 | case HTMLJSON: 556 | err = totalCommentParser(done, p, pc, tmMap) 557 | if err != nil { 558 | errc <- err 559 | } 
560 | case HTMLLzlHome, HTMLLzl: 561 | err = commentParser(done, p, pc, tmMap) 562 | if err != nil { 563 | errc <- err 564 | } 565 | case HTMLLocal: 566 | err = templateParser(done, p, pc, tmMap) 567 | if err != nil { 568 | errc <- err 569 | } 570 | case HTMLWebWAPHomepage: 571 | err = wapHomePageParser(done, p, pc, tmMap) 572 | if err != nil { 573 | errc <- err 574 | } 575 | case HTMLWebWAPPage: 576 | err = wapPageParser(done, p, pc, tmMap) 577 | if err != nil { 578 | errc <- err 579 | } 580 | case HTMLExternalResource: 581 | err = externalResourceParser(done, p, pc, tmMap) 582 | if err != nil { 583 | errc <- err 584 | } 585 | default: 586 | errc <- errors.New("unkonwn HTMLPage Type") 587 | } 588 | case <-ticker.C: 589 | } 590 | go tmMap.Sweep(pc) 591 | } 592 | } 593 | 594 | func parseHTML(done <-chan struct{}, pc *PageChannel) (<-chan *TemplateField, <-chan error) { 595 | tmMap := &TemplateMap{ 596 | Map: make(map[uint64]*TemplateField), 597 | lock: &sync.RWMutex{}, 598 | Channel: make(chan *TemplateField, config.NumRenderer), 599 | } 600 | errc := make(chan error) 601 | 602 | var wg sync.WaitGroup 603 | wg.Add(config.NumParser) 604 | for i := 0; i < config.NumParser; i++ { 605 | go parser(done, errc, &wg, pc, tmMap) 606 | } 607 | go func() { 608 | for { 609 | log.Printf("[pc] jobs: %d", pc.Ref()) // status report 610 | if pc.IsDone() { 611 | close(pc.send) // no more task, tell fetcher to exit 612 | break 613 | } 614 | time.Sleep(time.Second) // check task number every second 615 | } 616 | wg.Wait() // wait parser finish all remaining tasks 617 | close(errc) 618 | close(tmMap.Channel) // all page parsed, tell renderer to exit 619 | }() 620 | return tmMap.Channel, errc 621 | } 622 | 623 | func findTitle(doc *goquery.Document) string { 624 | var title string 625 | if s := doc.Find("title"); s.Text() == "" { 626 | title = randStringRunes(15) // Could not find title, default to random 627 | } else { 628 | title = s.Text() 629 | } 630 | return title 631 | } 632 | 633 
| func findWapTitle(doc *goquery.Document) string { 634 | var title string 635 | if s := doc.Find(".bc > strong:nth-child(1)"); s.Text() == "" { 636 | title = randStringRunes(15) // Could not find title, default to random 637 | } else { 638 | title = s.Text() 639 | } 640 | if s := doc.Find("div.d.h ~ a").First(); s.Text() != "" { 641 | title = fmt.Sprintf("%s_%s_wap", title, s.Text()) 642 | } 643 | return title 644 | } 645 | 646 | func addPageToFetchQueue(done <-chan struct{}, pc *PageChannel, delay time.Duration, url *url.URL, pageType HTMLType) { 647 | if delay > 0 { 648 | select { 649 | case <-done: 650 | return 651 | case <-time.After(delay): 652 | } 653 | } 654 | // add failed task back to jobs 655 | select { 656 | case <-done: 657 | return 658 | case pc.send <- &HTMLPage{ 659 | URL: url, 660 | Type: pageType, 661 | }: 662 | } 663 | } 664 | 665 | func reformatLzlUsername(content string) string { 666 | // special rule: remove username ahref in ": 回复 ", as requested in #4 667 | content = strings.Trim(content, " ") 668 | // fmt.Fprintf(os.Stderr, "before: %s\n", content) 669 | if strings.HasPrefix(content, "回复") { 670 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(content)) 671 | if err == nil { 672 | bodyDOM := doc.Find("body") 673 | s := doc.Find("a.at").First() 674 | userNameHtml, _ := s.Html() 675 | // t.Errorf(userNameHtml) 676 | s.ReplaceWithHtml(userNameHtml) 677 | content, _ = bodyDOM.Html() 678 | // t.Errorf("failed to parse comment data: %v, reason: %s", content, err) 679 | // fmt.Fprintf(os.Stderr, "after: %s\n", content) 680 | } 681 | } 682 | return content 683 | } 684 | 685 | func handleUserNameEmojiURL(userName string) string { 686 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(userName)) 687 | if err != nil { 688 | fmt.Fprintf(os.Stderr, "[handleUserNameEmojiURL] error handling user: %s", userName) 689 | return userName 690 | } 691 | doc.Find("img.nicknameEmoji").Each(func(i int, s *goquery.Selection) { 692 | if url, 
ex := s.Attr("src"); ex { 693 | if strings.HasPrefix(url, "//") { 694 | // the url needs to add the protocol type 695 | s.SetAttr("src", "https:"+url) 696 | } 697 | } 698 | }) 699 | if content, err := doc.Find("body").Html(); err == nil { 700 | return content 701 | } 702 | content, _ := doc.Html() 703 | return content 704 | } 705 | 706 | func addPageToFetchQueueFromHomePage(done <-chan struct{}, pc *PageChannel, urlRef *url.URL, tid uint64, pageNum int64, pageType HTMLType) { 707 | for i := int64(2); i <= pageNum; i++ { 708 | u := &url.URL{} 709 | *u = *urlRef 710 | q := u.Query() 711 | switch pageType { 712 | case HTMLWebPage: 713 | q.Set("pn", strconv.Itoa(int(i))) 714 | case HTMLWebWAPPage: 715 | q.Set("pnum", strconv.Itoa(int(i))) 716 | } 717 | u.RawQuery = q.Encode() 718 | newPage := &HTMLPage{ 719 | URL: u, // example: http://tieba.baidu.com/mo/m?kz=7201761174&pnum=2 720 | Type: pageType, 721 | } 722 | select { 723 | case <-done: 724 | return 725 | case pc.send <- newPage: // add all other pages to fetcher 726 | } 727 | } 728 | 729 | // forumRegex := regexp.MustCompile(`\b"?forum_id"?:"?(\d+)"?\b`) 730 | // match := forumRegex.FindStringSubmatch(string(page.Content)) 731 | // strInt, _ := strconv.ParseInt(match[1], 10, 64) 732 | // forumID := uint64(strInt) 733 | // fetch lzl comments 734 | // syntax: 735 | // http://tieba.baidu.com/p/totalComment?t=15769421323&tid=7201761174&fid=572638&pn=2&see_lz=0 736 | // python爬取贴吧楼中楼 737 | // https://mrxin.github.io/2015/09/19/tieba-louzhonglou/ 738 | for i := int64(0); i <= pageNum; i++ { 739 | u := &url.URL{ 740 | Scheme: "http", 741 | Host: "tieba.baidu.com", 742 | Path: "/p/totalComment", 743 | } 744 | q := u.Query() 745 | // Go by Example: Epoch 746 | // https://gobyexample.com/epoch 747 | // q.Set("t", strconv.Itoa(int(time.Now().UnixNano()/1000000))) 748 | q.Set("tid", strconv.Itoa(int(tid))) 749 | // q.Set("fid", strconv.Itoa(int(forumID))) 750 | q.Set("pn", strconv.Itoa(int(i))) 751 | u.RawQuery = q.Encode() 
752 | // log.Printf("requesting totalComment: %s", u) 753 | select { 754 | case <-done: 755 | return 756 | case pc.send <- &HTMLPage{ 757 | URL: u, 758 | Type: HTMLJSON, 759 | }: 760 | } 761 | } 762 | } 763 | --------------------------------------------------------------------------------