├── .gitignore ├── LICENSE ├── README.md ├── answer.go ├── collection.go ├── examples ├── config-example.json └── example.go ├── log.go ├── log_test.go ├── question.go ├── question_test.go ├── session.go ├── session_test.go ├── topic.go ├── user.go ├── util.go └── util_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Go template 3 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 4 | *.o 5 | *.a 6 | *.so 7 | 8 | # Folders 9 | _obj 10 | _test 11 | 12 | # Architecture specific extensions/prefixes 13 | *.[568vq] 14 | [568vq].out 15 | 16 | *.cgo1.go 17 | *.cgo2.c 18 | _cgo_defun.c 19 | _cgo_gotypes.go 20 | _cgo_export.* 21 | 22 | _testmain.go 23 | 24 | *.exe 25 | *.test 26 | *.prof 27 | 28 | verify.gif 29 | examples/config.json -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Yangliang Li 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | zhihu-go:知乎非官方 API 库 with Go 2 | ================================= 3 | 4 | [![GoDoc](https://godoc.org/github.com/DeanThompson/zhihu-go?status.svg)](https://godoc.org/github.com/DeanThompson/zhihu-go) 5 | 6 | 这是一个非官方的 [知乎](https://www.zhihu.com/) API 库,用 Go 实现。 7 | 8 | 本项目基本上是把 [zhihu-python](https://github.com/egrcc/zhihu-python) 和 [zhihu-py3](https://github.com/7sDream/zhihu-py3) 从 Python 移植到了 Go. 相比之下,比 zhihu-python 的 API 更丰富,比 zhihu-py3 少了活动相关的 API. 9 | 10 | **注意:知乎的 API、前端等都可能随时会更新,所以本项目的接口可能会有过时的情况。如果遇到此类问题,欢迎提交 issue 或 pull requests.** 11 | 12 | ## Table of Contents 13 | 14 | * [Table of Contents](#table-of-contents) 15 | * [Install](#install) 16 | * [Documentation](#documentation) 17 | * [Usage](#usage) 18 | * [Login:登录](#login) 19 | * [User:获取用户信息](#user) 20 | * [Question:获取问题信息](#question) 21 | * [Answer:获取答案信息](#answer) 22 | * [Collection:获取收藏夹信息](#collection) 23 | * [Topic:获取话题信息](#topic) 24 | * [Known Issues](#known-issues) 25 | * [TODO](#todo) 26 | * [LICENSE](#license) 27 | 28 | ## Install 29 | 30 | 直接使用 `go get`: 31 | 32 | ```bash 33 | go get github.com/DeanThompson/zhihu-go 34 | ``` 35 | 36 | 依赖以下第三方库: 37 | 38 | * [goquery](https://github.com/PuerkitoBio/goquery): 用于解析 HTML,语法操作类似 jQuery 39 | * [color](https://github.com/fatih/color):用于输出带颜色的日志 40 | * [persistent-cookiejar](https://github.com/juju/persistent-cookiejar):用于维护一个持久化的 cookiejar,实现保持登录 41 | 42 | ## Documentation 43 | 44 | 请点击链接前往 GoDoc 查看:[zhihu-go](https://godoc.org/github.com/DeanThompson/zhihu-go) 45 | 46 | ## Usage 47 | 48 | 
目前已经实现了用户(User),问题(Question),回答(Answer),收藏夹(Collection),话题(Topic)相关的 API,都是信息获取类的,暂无操作类的。 49 | 50 | zhihu-go 包名为 `zhihu`,使用前需要先 import: 51 | 52 | ```go 53 | import "github.com/DeanThompson/zhihu-go" 54 | ``` 55 | 56 | ### Login 57 | 58 | 调用 API 之前需要先登录。在 zhihu-go 内部,使用一个全局的 session 来访问所有页面,并自动处理 cookies. 59 | 60 | 创建一个 JSON 格式的配置文件,提供一个账号和密码,格式如 [config-example.json](examples/config-example.json). 61 | 62 | 登录(初始化 session): 63 | 64 | ```go 65 | zhihu.Init("/path/to/config.json") 66 | ``` 67 | 68 | 第一次登录会调用图像界面打开验证码文件,需要手动输入验证码到控制台。如果登录成功,后续的请求会沿用此次登录的 cookie, 不需要重复登录。 69 | 70 | ### User 71 | 72 | `zhihu.User` 表示一个知乎用户,可以用于获取一个用户的各种数据。 73 | 74 | 创建一个 `User` 对象需要传入用户主页的 URL 及其知乎 ID(用户名),如: 75 | 76 | ```go 77 | link := "https://www.zhihu.com/people/jixin" 78 | userID := "黄继新" 79 | user := zhihu.NewUser(link, userID) 80 | ``` 81 | 82 | 获取用户的数据(代码见:[example.go](examples/example.go#L159)): 83 | 84 | ```go 85 | func showUser(user *zhihu.User) { 86 | logger.Info("User fields:") 87 | logger.Info(" is anonymous: %v", user.IsAnonymous()) // 是否匿名用户:false 88 | logger.Info(" userId: %s", user.GetUserID()) // 知乎ID:黄继新 89 | logger.Info(" dataId: %s", user.GetDataID()) // hash ID:b6f80220378c8b0b78175dd6a0b9c680 90 | logger.Info(" bio: %s", user.GetBio()) // BIO:和知乎在一起 91 | logger.Info(" location: %s", user.GetLocation()) // 位置:北京 92 | logger.Info(" business: %s", user.GetBusiness()) // 行业:互联网 93 | logger.Info(" gender: %s", user.GetGender()) // 性别:male 94 | logger.Info(" education: %s", user.GetEducation()) // 学校:北京第二外国语学院 95 | logger.Info(" followers num: %d", user.GetFollowersNum()) // 粉丝数:756632 96 | logger.Info(" followees num: %d", user.GetFolloweesNum()) // 关注的人数: 9249 97 | logger.Info(" followed columns num: %d", user.GetFollowedColumnsNum()) // 关注的专栏数:631 98 | logger.Info(" followed topics num: %d", user.GetFollowedTopicsNum()) // 关注的话题数:131 99 | logger.Info(" agree num: %d", user.GetAgreeNum()) // 获得的赞同数:68557 100 | logger.Info(" thanks num: %d", user.GetThanksNum()) // 
获得的感谢数:17651 101 | logger.Info(" asks num: %d", user.GetAsksNum()) // 提问数:1336 102 | logger.Info(" answers num: %d", user.GetAnswersNum()) // 回答数:785 103 | logger.Info(" posts num: %d", user.GetPostsNum()) // 专栏文章数:92 104 | logger.Info(" collections num: %d", user.GetCollectionsNum()) // 收藏夹数量:44 105 | logger.Info(" logs num: %d", user.GetLogsNum()) // 公共编辑数:51596 106 | 107 | // 108 | // 109 | // 110 | // 111 | // 112 | for i, topic := range user.GetFollowedTopicsN(5) { 113 | logger.Info(" top followed topic-%d: %s", i+1, topic.String()) 114 | } 115 | 116 | // 117 | // 118 | // 119 | // 120 | // 121 | for i, follower := range user.GetFollowersN(5) { 122 | logger.Info(" top follower-%d: %s", i+1, follower.String()) 123 | } 124 | 125 | // 126 | // 127 | // 128 | // 129 | // 130 | for i, followee := range user.GetFolloweesN(5) { 131 | logger.Info(" top followee-%d: %s", i+1, followee.String()) 132 | } 133 | 134 | // 135 | // 136 | // 137 | // 138 | // 139 | for i, ask := range user.GetAsksN(5) { 140 | logger.Info(" top ask-%d: %s", i+1, ask.String()) 141 | } 142 | 143 | // - https://www.zhihu.com/question/40394171/answer/86692178> 144 | // - https://www.zhihu.com/question/19952708/answer/84561308> 145 | // - https://www.zhihu.com/question/35987345/answer/72981016> 146 | // - https://www.zhihu.com/question/24980451/answer/29789141> 147 | // - https://www.zhihu.com/question/24816698/answer/29229733> 148 | for i, answer := range user.GetAnswersN(5) { 149 | logger.Info(" top answer-%d: %s", i+1, answer.String()) 150 | } 151 | 152 | // 153 | // 154 | // 155 | // 156 | // 157 | for i, collection := range user.GetCollectionsN(5) { 158 | logger.Info(" top collection-%d: %s", i+1, collection.String()) 159 | } 160 | 161 | for i, like := range user.GetLikes() { 162 | logger.Info(" like-%d: %s", i+1, like.String()) 163 | } 164 | } 165 | ``` 166 | 167 | ### Question 168 | 169 | `zhihu.Question` 表示一个知乎问题,用于获取问题相关的数据。初始化需要提供 url 和标题(可为空): 170 | 171 | ```go 172 | link := 
"https://www.zhihu.com/question/28966220" 173 | title := "Python 编程,应该养成哪些好的习惯?" 174 | question := zhihu.NewQuestion(link, title) 175 | ``` 176 | 177 | 获取问题数据:(代码见:[example.go](examples/example.go#L51)) 178 | 179 | ```go 180 | func showQuestion(question *zhihu.Question) { 181 | logger.Info("Question fields:") 182 | 183 | // 链接:https://www.zhihu.com/question/28966220 184 | logger.Info(" url: %s", question.Link) 185 | 186 | // 标题:Python 编程,应该养成哪些好的习惯? 187 | logger.Info(" title: %s", question.GetTitle()) 188 | 189 | // 描述:我以为编程习惯很重要的,一开始就养成这些习惯,不仅可以提高编程速度,还可以减少 bug 出现的概率。希望各位分享好的编程习惯。 190 | logger.Info(" detail: %s", question.GetDetail()) 191 | 192 | 193 | logger.Info(" answers num: %d", question.GetAnswersNum()) // 回答数:15 194 | logger.Info(" followers num: %d", question.GetFollowersNum()) // 关注者数量:1473 195 | 196 | // 197 | // 198 | // 199 | // 200 | for i, topic := range question.GetTopics() { 201 | logger.Info(" topic-%d: %s", i+1, topic.String()) 202 | } 203 | 204 | // 205 | // 206 | // 207 | // 208 | // 209 | for i, follower := range question.GetFollowersN(5) { 210 | logger.Info(" top follower-%d: %s", i+1, follower.String()) 211 | } 212 | 213 | for i, follower := range question.GetFollowers() { // 关注者列表 214 | logger.Info(" follower-%d: %s", i+1, follower.String()) 215 | if i >= 10 { 216 | logger.Info(" %d followers not shown.", question.GetFollowersNum()-i-1) 217 | break 218 | } 219 | } 220 | 221 | allAnswers := question.GetAllAnswers() // 所有回答 222 | for i, answer := range allAnswers { 223 | logger.Info(" answer-%d: %s", i+1, answer.String()) 224 | filename := fmt.Sprintf("/tmp/%s-%s的回答.html", question.GetTitle(), answer.GetAuthor().GetUserID()) 225 | dumpAnswerHTML(filename, answer) 226 | if i >= 10 { 227 | logger.Info(" %d answers not shown.", len(allAnswers)-i-1) 228 | break 229 | } 230 | } 231 | 232 | topXAnswers := question.GetTopXAnswers(25) // 前 25 个回答 233 | for i, answer := range topXAnswers { 234 | logger.Info(" top-%d answer: %s", i+1, answer.String()) 
235 | } 236 | 237 | // 排名第一的回答 238 | // - https://www.zhihu.com/question/28966220/answer/43346747> 239 | logger.Info(" top-1 answer: %s", question.GetTopAnswer().String()) 240 | 241 | logger.Info(" visit times: %d", question.GetVisitTimes()) // 查看次数:32942 242 | } 243 | ``` 244 | 245 | ### Answer 246 | 247 | `zhihu.Answer` 表示一个知乎答案,初始化时需要指定页面链接,也支持指定对应的问题(`*Question`,可以为 `nil`)和作者(`*User`,可以为 `nil`): 248 | 249 | ```go 250 | // 龙有九个儿子,是跟谁生的?为什么「龙生九子,各不成龙」?豆子 的答案 251 | answer := zhihu.NewAnswer("https://www.zhihu.com/question/23759686/answer/41997389", nil, nil) 252 | ``` 253 | 254 | 获取回答数据:(代码见:[example.go](examples/example.go#L95)) 255 | 256 | ```go 257 | func showAnswer(answer *zhihu.Answer) { 258 | logger.Info("Answer fields:") 259 | 260 | // 链接:https://www.zhihu.com/question/23759686/answer/41997389 261 | logger.Info(" url: %s", answer.Link) 262 | 263 | // 所属问题 264 | // 链接:https://www.zhihu.com/question/23759686 265 | // 标题:龙有九个儿子,是跟谁生的?为什么「龙生九子,各不成龙」? 266 | question := answer.GetQuestion() 267 | logger.Info(" question url: %s", question.Link) 268 | logger.Info(" question title: %s", question.GetTitle()) 269 | 270 | // 作者: 271 | logger.Info(" author: %s", answer.GetAuthor().String()) 272 | 273 | logger.Info(" upvote num: %d", answer.GetUpvote()) // 赞同数:26486 274 | logger.Info(" comments num: %d", answer.GetCommentsNum()) // 评论数:20 275 | logger.Info(" collected num: %d", answer.GetCollectedNum()) // 被收藏次数:22929 276 | logger.Info(" data ID: %d", answer.GetID()) // 数字 ID:12191779 277 | 278 | // 点赞的用户 279 | voters := answer.GetVoters() 280 | for i, voter := range voters { 281 | logger.Info(" voter-%d: %s", i+1, voter.String()) 282 | if i >= 10 { 283 | remain := len(voters) - i - 1 284 | logger.Info(" %d votes not shown.", remain) 285 | break 286 | } 287 | } 288 | } 289 | ``` 290 | 291 | ### Collection 292 | 293 | `zhihu.Collection` 表示一个收藏夹,初始化时必须指定页面 url,支持指定名称(`string` 可以为 `""`)和创建者(`creator *User`,可以为 `nil`): 294 | 295 | ```go 296 | // 黄继新 A4U 297 | collection := 
zhihu.NewCollection("https://www.zhihu.com/collection/19677733", "", nil) 298 | ``` 299 | 300 | 获取收藏夹数据:(代码见:[example.go](examples/example.go#L124)) 301 | 302 | ```go 303 | func showCollection(collection *zhihu.Collection) { 304 | logger.Info("Collection fields:") 305 | 306 | // 链接:https://www.zhihu.com/collection/19677733 307 | logger.Info(" url: %s", collection.Link) 308 | 309 | // 名称:A4U 310 | logger.Info(" name: %s", collection.GetName()) 311 | 312 | // 作者: 313 | logger.Info(" creator: %s", collection.GetCreator().String()) 314 | logger.Info(" followers num: %d", collection.GetFollowersNum()) // 关注者数量:29 315 | 316 | // 获取 5 个关注者 317 | for i, follower := range collection.GetFollowersN(5) { 318 | logger.Info(" top follower-%d: %s", i+1, follower.String()) 319 | } 320 | 321 | // 获取 5 个问题 322 | for i, question := range collection.GetQuestionsN(5) { 323 | logger.Info(" top question-%d: %s", i+1, question.String()) 324 | } 325 | 326 | // 获取 5 个回答 327 | for i, answer := range collection.GetAnswersN(5) { 328 | logger.Info(" top answer-%d: %s", i+1, answer.String()) 329 | } 330 | } 331 | ``` 332 | 333 | ### Topic 334 | 335 | `zhihu.Topic` 表示一个话题,初始化时必须指定页面 url,支持指定名称(`string` 可以为 `""`): 336 | 337 | ```go 338 | // Python 339 | topic := zhihu.NewTopic("https://www.zhihu.com/topic/19552832", "") 340 | ``` 341 | 342 | 获取话题数据:(代码见:[example.go](examples/example.go#L237)) 343 | 344 | ```go 345 | func showTopic(topic *zhihu.Topic) { 346 | logger.Info("Topic fields:") 347 | 348 | // 链接:https://www.zhihu.com/topic/19552832 349 | logger.Info(" url: %s", topic.Link) 350 | 351 | // 名称:Python 352 | logger.Info(" name: %s", topic.GetName()) 353 | 354 | // 描述:Python 是一种面向对象的解释型计算机程序设计语言,在设计中注重代码的可读性,同时也是一种功能强大的通用型语言。 355 | logger.Info(" description: %s", topic.GetDescription()) 356 | 357 | // 关注者数量:82805 358 | logger.Info(" followers num: %d", topic.GetFollowersNum()) 359 | 360 | // 最佳答主,一般为 5 个 361 | // 362 | // 363 | // 364 | // 365 | // 366 | for i, author := range
topic.GetTopAuthors() { 367 | logger.Info(" top-%d author: %s", i+1, author.String()) 368 | } 369 | } 370 | ``` 371 | 372 | ## Known Issues 373 | 374 | 无,欢迎 [提交 issues](https://github.com/DeanThompson/zhihu-go/issues) 375 | 376 | ## TODO 377 | 378 | 按优先级降序排列: 379 | 380 | * [X] 获取回答的收藏数 381 | * [X] 获取收藏夹的答案数量 382 | * [X] 获取用户的头像 383 | * [X] 获取用户的微博地址 384 | * [ ] 把答案导出到 markdown 文件 385 | * [ ] 更多的登录方式,不需要依赖图形界面打开验证码文件 386 | * [ ] 增加评论相关的 API 387 | * [ ] 增加活动相关的 API 388 | * [ ] 增加专栏相关的 API 389 | * [ ] test(暂时没想好怎么做) 390 | 391 | 很可能不会做: 392 | 393 | * [ ] 增加用户的操作,如点赞、关注等 394 | 395 | 欢迎 [提交 pull requests](https://github.com/DeanThompson/zhihu-go/pulls) 396 | 397 | ## LICENSE 398 | 399 | [The MIT license](LICENSE). 400 | -------------------------------------------------------------------------------- /answer.go: -------------------------------------------------------------------------------- 1 | package zhihu 2 | 3 | import ( 4 | "fmt" 5 | "net/url" 6 | "strconv" 7 | "strings" 8 | 9 | "github.com/PuerkitoBio/goquery" 10 | ) 11 | 12 | // Answer 是一个知乎的答案 13 | type Answer struct { 14 | *Page 15 | 16 | // question 是该答案对应的问题 17 | question *Question 18 | 19 | // author 是该答案的作者 20 | author *User 21 | } 22 | 23 | // NewAnswer 用于创建一个 Answer 对象,其中 link 是必传的,question, author 可以为 nil 24 | func NewAnswer(link string, question *Question, author *User) *Answer { 25 | return &Answer{ 26 | Page: newZhihuPage(link), 27 | question: question, 28 | author: author, 29 | } 30 | } 31 | 32 | // GetID 返回该答案的数字 ID 33 | func (a *Answer) GetID() int { 34 | if got, ok := a.getIntField("data-aid"); ok { 35 | return got 36 | } 37 | 38 | doc := a.Doc() 39 | text, _ := doc.Find("div.zm-item-answer.zm-item-expanded").Attr("data-aid") 40 | aid, _ := strconv.Atoi(text) 41 | a.setField("data-aid", aid) 42 | return aid 43 | } 44 | 45 | // GetQuestion 返回该回答所属的问题,如果 NewAnswer 时 question 不为 nil,则直接返回该值; 46 | // 否则会抓取页面并分析得到问题的链接和标题,再新建一个 Question 对象 47 | func (a *Answer) GetQuestion() *Question { 48 | if 
a.question != nil { 49 | return a.question 50 | } 51 | 52 | doc := a.Doc() 53 | href, _ := doc.Find("h2.zm-item-title>a").Attr("href") 54 | link := makeZhihuLink(href) 55 | title := strip(doc.Find("h2.zm-item-title").First().Text()) 56 | return NewQuestion(link, title) 57 | } 58 | 59 | // Author 返回该答案的作者 60 | func (a *Answer) GetAuthor() *User { 61 | if a.author != nil { 62 | return a.author 63 | } 64 | 65 | doc := a.Doc() 66 | sel := doc.Find("div.zm-item-answer-author-info").First() 67 | return newUserFromAnswerAuthorTag(sel) 68 | } 69 | 70 | // GetUpvote 返回赞同数 71 | func (a *Answer) GetUpvote() int { 72 | if got, ok := a.getIntField("upvote"); ok { 73 | return got 74 | } 75 | 76 | doc := a.Doc() 77 | text := strip(doc.Find("span.count").First().Text()) 78 | upvote := upvoteTextToNum(text) 79 | a.setField("upvote", upvote) 80 | return upvote 81 | } 82 | 83 | // ToMarkdown 把回答导出到 markdown 文件 84 | func (a *Answer) ToMarkdown(filename string) error { 85 | if !strings.HasSuffix(filename, ".md") && !strings.HasSuffix(filename, ".markdown") { 86 | filename += ".md" 87 | } 88 | 89 | // TODO convert to markdown 90 | md := "" 91 | 92 | return saveString(filename, md) 93 | } 94 | 95 | // ToHtml 把网页源码导出到 html 文件 96 | func (a *Answer) ToHtml(filename string) error { 97 | if !strings.HasSuffix(filename, ".html") { 98 | filename += ".html" 99 | } 100 | 101 | html, err := a.Doc().Html() 102 | if err != nil { 103 | return err 104 | } 105 | return saveString(filename, html) 106 | } 107 | 108 | // GetContent 返回回答的内容,HTML 格式 109 | func (a *Answer) GetContent() string { 110 | if got, ok := a.getStringField("content"); ok { 111 | return got 112 | } 113 | 114 | sel := a.Doc().Find("div#zh-question-answer-wrap").Find("div.zm-editable-content") 115 | content, err := answerSelectionToHtml(sel) 116 | if err != nil { 117 | logger.Error("导出 HTML 失败:%s", err.Error()) 118 | return "" 119 | } 120 | a.setField("content", content) 121 | return content 122 | } 123 | 124 | // GetVotersN 返回 n 
个点赞的用户,如果 n < 0,返回所有点赞的用户 125 | func (a *Answer) GetVotersN(n int) []*User { 126 | if n == 0 { 127 | return nil 128 | } 129 | 130 | querystring := fmt.Sprintf(`params={"answer_id":"%d"}`, a.GetID()) 131 | url := makeZhihuLink("/node/AnswerFullVoteInfoV2" + "?" + querystring) 132 | doc, err := newDocumentFromURL(url) 133 | if err != nil { 134 | return nil 135 | } 136 | 137 | sel := doc.Find(".voters span") 138 | capacity := n 139 | if capacity < 0 || capacity > sel.Length() { 140 | capacity = sel.Length() 141 | } 142 | voters := make([]*User, 0, capacity) 143 | 144 | sel.EachWithBreak(func(index int, span *goquery.Selection) bool { 145 | userId := strings.Trim(strip(span.Text()), "、") 146 | var userLink string 147 | if !(userId == "匿名用户" || userId == "知乎用户") { 148 | path, _ := span.Find("a").Attr("href") 149 | userLink = makeZhihuLink(path) 150 | } 151 | voters = append(voters, NewUser(userLink, userId)) 152 | if n > 0 && len(voters) == n { 153 | return false 154 | } 155 | return true 156 | }) 157 | 158 | return voters 159 | } 160 | 161 | // GetVoters 返回点赞的用户 162 | func (a *Answer) GetVoters() []*User { 163 | return a.GetVotersN(-1) 164 | } 165 | 166 | // GetCommentsNum 返回评论数量 167 | func (a *Answer) GetCommentsNum() int { 168 | if value, ok := a.getIntField("comment-num"); ok { 169 | return value 170 | } 171 | 172 | doc := a.Doc() 173 | text := strip(doc.Find("a.meta-item.toggle-comment").Text()) 174 | rv := reMatchInt(text) 175 | a.setField("comment-num", rv) 176 | return rv 177 | } 178 | 179 | // GetCollectedNum 返回被收藏次数 180 | func (a *Answer) GetCollectedNum() int { 181 | if value, ok := a.getIntField("collected-num"); ok { 182 | return value 183 | } 184 | 185 | text := strip(a.Doc().Find(`a[data-za-l="sidebar_answer_collected_count"]`).Text()) 186 | value, _ := strconv.Atoi(text) 187 | a.setField("collected-num", value) 188 | return value 189 | } 190 | 191 | func (a *Answer) String() string { 192 | return fmt.Sprintf("", a.GetAuthor().String(), a.Link) 193 | } 
// upvoteTextToNum converts the upvote text shown for an answer into a number.
//
// Three forms are recognised:
//   - "<n>K": n thousand, e.g. "2K" -> 2000
//   - "<n>W": n ten-thousand, e.g. "3W" -> 30000
//   - plain decimal digits, e.g. "15" -> 15
//
// Text that is empty or not numeric yields 0.
func upvoteTextToNum(text string) int {
	multiplier := 1
	switch {
	case strings.HasSuffix(text, "K"):
		multiplier = 1000
		text = strings.TrimSuffix(text, "K")
	case strings.HasSuffix(text, "W"):
		// The unit trails the digits just like "K" does. Testing for a "W"
		// prefix while stripping the last character could never produce a
		// correct value for input such as "3W".
		multiplier = 10000
		text = strings.TrimSuffix(text, "W")
	}

	// A parse failure is deliberately ignored: unparsable text counts as 0.
	num, _ := strconv.Atoi(text)
	return num * multiplier
}
") 246 | } 247 | }) 248 | 249 | // 修复 a 标签的 href,因为知乎的外链都是这种形式:https://link.zhihu.com/?target=xxx 250 | sel.Find("a").Each(func(_ int, tag *goquery.Selection) { 251 | href, _ := tag.Attr("href") 252 | if strings.Contains(href, "target=") { 253 | link, err := url.Parse(href) 254 | if err != nil { 255 | return 256 | } 257 | target := link.Query().Get("target") 258 | tag.SetAttr("href", target) 259 | } 260 | }) 261 | 262 | wrapper := `` 263 | doc, _ := goquery.NewDocumentFromReader(strings.NewReader(wrapper)) 264 | doc.Find("body").AppendSelection(sel) 265 | 266 | return doc.Html() 267 | } 268 | 269 | func newUserFromAnswerAuthorTag(sel *goquery.Selection) *User { 270 | if strip(sel.Text()) == "匿名用户" { 271 | return ANONYMOUS 272 | } 273 | 274 | node := sel.Find("a.author-link") 275 | userId := strip(node.Text()) 276 | urlPath, _ := node.Attr("href") 277 | userLink := makeZhihuLink(urlPath) 278 | return NewUser(userLink, userId) 279 | } 280 | -------------------------------------------------------------------------------- /collection.go: -------------------------------------------------------------------------------- 1 | package zhihu 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "net/url" 7 | "strconv" 8 | "strings" 9 | 10 | "github.com/PuerkitoBio/goquery" 11 | ) 12 | 13 | // Collection 是一个知乎的收藏夹页面 14 | type Collection struct { 15 | *Page 16 | 17 | // creator 是该收藏夹的创建者 18 | creator *User 19 | 20 | // name 是该收藏夹的名称 21 | name string 22 | } 23 | 24 | // NewCollection 创建一个收藏夹对象,返回 *Collection 25 | func NewCollection(link string, name string, creator *User) *Collection { 26 | if !validCollectionURL(link) { 27 | panic("收藏夹链接不正确:" + link) 28 | } 29 | 30 | return &Collection{ 31 | Page: newZhihuPage(link), 32 | creator: creator, 33 | name: name, 34 | } 35 | } 36 | 37 | // GetName 返回收藏夹的名字 38 | func (c *Collection) GetName() string { 39 | if c.name != "" { 40 | return c.name 41 | } 42 | 43 | doc := c.Doc() 44 | 45 | //

46 | // 恩恩恩 大力一点,不要停~ 47 | //

48 | c.name = strip(doc.Find("h2#zh-fav-head-title").Text()) 49 | return c.name 50 | } 51 | 52 | // GetCreator 返回收藏夹的创建者 53 | func (c *Collection) GetCreator() *User { 54 | if c.creator != nil { 55 | return c.creator 56 | } 57 | 58 | doc := c.Doc() 59 | 60 | //

61 | // 李阳良 62 | //

63 | sel := doc.Find("h2.zm-list-content-title a") 64 | userId := strip(sel.Text()) 65 | linkPath, _ := sel.Attr("href") 66 | c.creator = NewUser(makeZhihuLink(linkPath), userId) 67 | return c.creator 68 | } 69 | 70 | // GetFollowersNum 返回收藏夹的关注者数量 71 | func (c *Collection) GetFollowersNum() int { 72 | if got, ok := c.getIntField("followers-num"); ok { 73 | return got 74 | } 75 | 76 | doc := c.Doc() 77 | 78 | // 79 | // 7516 80 | // 81 | text := strip(doc.Find(`a[data-za-a="visit_collection_followers"]`).Text()) 82 | num, _ := strconv.Atoi(text) 83 | c.setField("followers-num", num) 84 | return num 85 | } 86 | 87 | // GetFollowersN 返回 n 个关注该收藏夹的用户,如果 n < 0,返回所有关注者 88 | func (c *Collection) GetFollowersN(n int) []*User { 89 | var ( 90 | link = urlJoin(c.Link, "/followers") 91 | xsrf = c.GetXSRF() 92 | ) 93 | users, err := ajaxGetFollowers(link, xsrf, n) 94 | if err != nil { 95 | return nil 96 | } 97 | return users 98 | } 99 | 100 | // GetFollowers 返回关注该收藏夹的用户 101 | func (c *Collection) GetFollowers() []*User { 102 | return c.GetFollowersN(c.GetFollowersNum()) 103 | } 104 | 105 | // GetQuestionsN 返回前 n 个问题,如果 n < 0,返回所有问题 106 | func (c *Collection) GetQuestionsN(n int) []*Question { 107 | if n == 0 { 108 | return nil 109 | } 110 | 111 | // 先获取第一页的问题 112 | questions := getQuestionsFromDoc(c.Doc()) 113 | 114 | totalPages := c.totalPages() 115 | if totalPages == 1 { 116 | if n < 0 || n > len(questions) { 117 | return questions 118 | } 119 | return questions[0:n] 120 | } 121 | 122 | // 再分页查询其他问题 123 | currentPage := 2 124 | for currentPage <= totalPages { 125 | link := fmt.Sprintf("%s?page=%d", c.Link, currentPage) 126 | doc, err := newDocumentFromURL(link) 127 | if err != nil { 128 | logger.Error("解析页面失败:%s, %s", link, err.Error()) 129 | return nil 130 | } 131 | 132 | newQuestions := getQuestionsFromDoc(doc) 133 | questions = append(questions, newQuestions...) 
134 | if n > 0 && len(questions) >= n { 135 | return questions[0:n] 136 | } 137 | currentPage++ 138 | } 139 | 140 | return questions 141 | } 142 | 143 | // GetQuestions 返回收藏夹里所有的问题 144 | func (c *Collection) GetQuestions() []*Question { 145 | return c.GetQuestionsN(-1) 146 | } 147 | 148 | // GetAnswersN 返回 n 个回答,如果 n < 0,返回所有回答 149 | func (c *Collection) GetAnswersN(n int) []*Answer { 150 | if n == 0 { 151 | return nil 152 | } 153 | 154 | // 先获取第一页的回答 155 | answers := getAnswersFromDoc(c.Doc()) 156 | 157 | totalPages := c.totalPages() 158 | if totalPages == 1 { 159 | if n < 0 || n > len(answers) { 160 | return answers 161 | } 162 | return answers[0:n] 163 | } 164 | 165 | // 在分页查询 166 | currentPage := 2 167 | for currentPage <= totalPages { 168 | link := fmt.Sprintf("%s?page=%d", c.Link, currentPage) 169 | doc, err := newDocumentFromURL(link) 170 | if err != nil { 171 | logger.Error("解析页面失败:%s, %s", link, err.Error()) 172 | return nil 173 | } 174 | 175 | newAnswers := getAnswersFromDoc(doc) 176 | answers = append(answers, newAnswers...) 
177 | if n > 0 && len(answers) >= n { 178 | return answers[0:n] 179 | } 180 | currentPage++ 181 | } 182 | return answers 183 | } 184 | 185 | // GetAnswers 返回收藏夹里所有的回答 186 | func (c *Collection) GetAnswers() []*Answer { 187 | return c.GetAnswersN(-1) 188 | } 189 | 190 | // GetQuestionsNum 返回收藏夹的问题数量 191 | func (c *Collection) GetQuestionsNum() int { 192 | if value, ok := c.getIntField("question-num"); ok { 193 | return value 194 | } 195 | 196 | // 根据分页情况来计算问题数量 197 | // 收藏夹页面,每一页固定 10 个问题,每个问题下可能有多个答案; 198 | totalPages := c.totalPages() 199 | lastPage := c.Doc() 200 | 201 | if totalPages > 1 { 202 | lp, err := newDocumentFromURL(fmt.Sprintf("%s?page=%d", c.Link, totalPages)) 203 | if err != nil { 204 | logger.Error("获取收藏夹最后一页失败:%s", err.Error()) 205 | return 0 206 | } 207 | lastPage = lp 208 | } 209 | 210 | numOnLastPage := lastPage.Find("#zh-list-answer-wrap h2.zm-item-title").Size() 211 | rv := (totalPages-1)*10 + numOnLastPage 212 | c.setField("question-num", rv) 213 | return rv 214 | } 215 | 216 | // GetAnswersNum 返回收藏夹的答案数量 217 | // 获取答案数量有这几种方式: 218 | // 1. 在收藏夹页面(/collections/1234567),遍历每一页,累计每页的回答数量。总请求数等于分页数。 219 | // 2. 
在收藏夹创建者的个人主页,收藏夹栏目(people/xxyy/collections),有每个收藏夹的简介, 220 | // 其中就有回答数。遍历每一页(20个/页),找到对应的收藏夹,然后获取回答数。 221 | // 总请求数不确定,最好情况下 1 次;但考虑到每个用户的收藏夹并不会很多(如达到100个),可以认为最坏情况下需要 5 次。 222 | // 最终的方案可以综合以上两种方式,以收藏夹页面分页数做依据: 223 | // 如果页数大于 3(经验值),则采用方法 2;否则用方法 1 224 | // 希望能通过这样的方式来减少请求数,获得更好的性能。 225 | func (c *Collection) GetAnswersNum() int { 226 | if value, ok := c.getIntField("answer-num"); ok { 227 | return value 228 | } 229 | 230 | rv := 0 231 | totalPages := c.totalPages() 232 | if totalPages > 3 { 233 | // 从个人主页上获取 234 | page := 1 235 | linkFmt := urlJoin(c.GetCreator().Link, "/collections?page=%d") 236 | collectionHref := strings.Split(c.Link, "zhihu.com")[1] 237 | selector := fmt.Sprintf(`a.zm-profile-fav-item-title[href="%s"]`, collectionHref) 238 | for { 239 | creatorCollectionLink := fmt.Sprintf(linkFmt, page) 240 | doc, err := newDocumentFromURL(creatorCollectionLink) 241 | if err != nil { 242 | logger.Error("获取用户的收藏夹主页失败:%s", err.Error()) 243 | return 0 244 | } 245 | titleTag := doc.Find(selector).First() 246 | if titleTag.Size() == 1 { 247 | rv = reMatchInt(titleTag.Parent().Next().Contents().Eq(0).Text()) 248 | break 249 | } else { 250 | // 本页没找到,下一页 251 | if doc.Find("div.border-pager").Size() == 0 { 252 | return 0 253 | } else { 254 | pages := getTotalPages(doc) 255 | if page == pages { 256 | return 0 257 | } 258 | page++ 259 | } 260 | } 261 | } 262 | } else { 263 | selector := "#zh-list-answer-wrap div.zm-item-fav" 264 | rv = c.Doc().Find(selector).Size() 265 | currentPage := 2 266 | for currentPage <= totalPages { 267 | link := fmt.Sprintf("%s?page=%d", c.Link, currentPage) 268 | doc, err := newDocumentFromURL(link) 269 | if err != nil { 270 | logger.Error("解析页面失败:%s, %s", link, err.Error()) 271 | return 0 272 | } 273 | rv += doc.Find(selector).Size() 274 | currentPage++ 275 | } 276 | } 277 | c.setField("answer-num", rv) 278 | return rv 279 | } 280 | 281 | // GetCommentsNum 返回评论数量 282 | func (c *Collection) GetCommentsNum() int { 283 | if value, ok := 
c.getIntField("comment-num"); ok { 284 | return value 285 | } 286 | 287 | doc := c.Doc() 288 | text := strip(doc.Find("div#zh-list-meta-wrap a.toggle-comment").Text()) 289 | rv := reMatchInt(text) 290 | c.setField("comment-num", rv) 291 | return rv 292 | } 293 | 294 | func (c *Collection) String() string { 295 | return fmt.Sprintf("", c.GetName(), c.Link) 296 | } 297 | 298 | func ajaxGetFollowers(link string, xsrf string, total int) ([]*User, error) { 299 | if total == 0 { 300 | return nil, nil 301 | } 302 | 303 | var ( 304 | offset = 0 305 | gotDataNum = pageSize 306 | initCap = total 307 | ) 308 | 309 | if initCap < 0 { 310 | initCap = pageSize 311 | } 312 | users := make([]*User, 0, initCap) 313 | 314 | form := url.Values{} 315 | form.Set("_xsrf", xsrf) 316 | 317 | for gotDataNum == pageSize { 318 | form.Set("offset", strconv.Itoa(offset)) 319 | doc, dataNum, err := newDocByNormalAjax(link, form) 320 | if err != nil { 321 | return nil, err 322 | } 323 | 324 | doc.Find("div.zm-profile-card").Each(func(index int, sel *goquery.Selection) { 325 | thisUser := newUserFromSelector(sel) 326 | users = append(users, thisUser) 327 | }) 328 | 329 | if total > 0 && len(users) >= total { 330 | return users[:total], nil 331 | } 332 | 333 | gotDataNum = dataNum 334 | offset += gotDataNum 335 | } 336 | return users, nil 337 | } 338 | 339 | func newDocByNormalAjax(link string, form url.Values) (*goquery.Document, int, error) { 340 | gotDataNum := 0 341 | body := strings.NewReader(form.Encode()) 342 | resp, err := gSession.Ajax(link, body, link) 343 | if err != nil { 344 | logger.Error("查询关注的话题失败, 链接:%s, 参数:%s,错误:%s", link, form.Encode(), err.Error()) 345 | return nil, gotDataNum, err 346 | } 347 | 348 | defer resp.Body.Close() 349 | result := normalAjaxResult{} 350 | err = json.NewDecoder(resp.Body).Decode(&result) 351 | if err != nil { 352 | logger.Error("解析返回值 json 失败:%s", err.Error()) 353 | return nil, gotDataNum, err 354 | } 355 | 356 | topicsHtml := result.Msg[1].(string) 
357 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(topicsHtml)) 358 | if err != nil { 359 | logger.Error("解析返回的 HTML 失败:%s", err.Error()) 360 | return nil, gotDataNum, err 361 | } 362 | gotDataNum = int(result.Msg[0].(float64)) 363 | return doc, gotDataNum, err 364 | } 365 | 366 | func getQuestionsFromDoc(doc *goquery.Document) []*Question { 367 | questions := make([]*Question, 0, pageSize) 368 | items := doc.Find("div#zh-list-answer-wrap").Find("h2.zm-item-title") 369 | items.Each(func(index int, sel *goquery.Selection) { 370 | a := sel.Find("a") 371 | qTitle := strip(a.Text()) 372 | qHref, _ := a.Attr("href") 373 | thisQuestion := NewQuestion(makeZhihuLink(qHref), qTitle) 374 | questions = append(questions, thisQuestion) 375 | }) 376 | return questions 377 | } 378 | 379 | func getAnswersFromDoc(doc *goquery.Document) []*Answer { 380 | var answers []*Answer 381 | var lastQuestion *Question 382 | 383 | doc.Find("div.zm-item").Each(func(index int, sel *goquery.Selection) { 384 | // 回答 385 | contentTag := sel.Find("div.zm-item-rich-text") 386 | if contentTag.Size() == 0 { 387 | // 回答被建议修改 388 | reason := strip(sel.Find("div.answer-status").Text()) 389 | logger.Warn("忽略一个问题,原因:%s", reason) 390 | return 391 | } 392 | 393 | // 获取问题,如果同一个问题下收藏了多个回答,则除了第一个外,后面的回答的 HTML 部分, 394 | // 也就是 div.zm-item 里面不会有该问题的链接(a 标签),所以用 lastQuestion 标记 395 | // 最近的一个问题 396 | var thisQuestion *Question 397 | if qTag := sel.Find("h2.zm-item-title").Find("a"); qTag.Size() > 0 { 398 | qTitle := strip(qTag.Text()) 399 | qHref, _ := qTag.Attr("href") 400 | thisQuestion = NewQuestion(makeZhihuLink(qHref), qTitle) 401 | lastQuestion = thisQuestion 402 | } else { 403 | thisQuestion = lastQuestion 404 | } 405 | 406 | // 答主 407 | author := newUserFromAnswerAuthorTag(sel.Find("div.zm-item-answer-author-info")) 408 | 409 | answerHref, _ := contentTag.Attr("data-entry-url") 410 | voteText, _ := sel.Find("a.zm-item-vote-count").Attr("data-votecount") 411 | vote, _ := strconv.Atoi(voteText) 
412 | thisAnswer := NewAnswer(makeZhihuLink(answerHref), thisQuestion, author) 413 | thisAnswer.setUpvote(vote) 414 | 415 | answers = append(answers, thisAnswer) 416 | }) 417 | return answers 418 | } 419 | -------------------------------------------------------------------------------- /examples/config-example.json: -------------------------------------------------------------------------------- 1 | { 2 | "account": "email-or-phonenum", 3 | "password": "your-password-here" 4 | } -------------------------------------------------------------------------------- /examples/example.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "io/ioutil" 6 | 7 | "github.com/DeanThompson/zhihu-go" 8 | ) 9 | 10 | var ( 11 | logger = zhihu.Logger{true} 12 | ) 13 | 14 | func main() { 15 | zhihu.Init("./config.json") 16 | 17 | // 黄继新,和知乎在一起 18 | user := zhihu.NewUser("https://www.zhihu.com/people/jixin", "") 19 | showUser(user) 20 | 21 | logger.Success("========== split ==========") 22 | 23 | // Python 编程,应该养成哪些好的习惯? 24 | questionUrl := "https://www.zhihu.com/question/28966220" 25 | question := zhihu.NewQuestion(questionUrl, "") 26 | showQuestion(question) 27 | 28 | logger.Success("========== split ==========") 29 | 30 | // 龙有九个儿子,是跟谁生的?为什么「龙生九子,各不成龙」?豆子 的答案 31 | answer := zhihu.NewAnswer("https://www.zhihu.com/question/23759686/answer/41997389", nil, nil) 32 | showAnswer(answer) 33 | 34 | logger.Success("========== split ==========") 35 | 36 | // 程序员为了期权加入创业公司,值得吗? 
匿名用户的答案 37 | answer2 := zhihu.NewAnswer("https://www.zhihu.com/question/28023819/answer/49723406", nil, nil) 38 | showAnswer(answer2) 39 | 40 | logger.Success("========== split ==========") 41 | 42 | // 黄继新 A4U 43 | collection := zhihu.NewCollection("https://www.zhihu.com/collection/19677733", "", nil) 44 | showCollection(collection) 45 | 46 | // Python 47 | topic := zhihu.NewTopic("https://www.zhihu.com/topic/19552832", "") 48 | showTopic(topic) 49 | } 50 | 51 | func showQuestion(question *zhihu.Question) { 52 | logger.Info("Question fields:") 53 | logger.Info(" url: %s", question.Link) 54 | logger.Info(" title: %s", question.GetTitle()) 55 | logger.Info(" detail: %s", question.GetDetail()) 56 | logger.Info(" answers num: %d", question.GetAnswersNum()) 57 | logger.Info(" followers num: %d", question.GetFollowersNum()) 58 | logger.Info(" comments num: %d", question.GetCommentsNum()) 59 | 60 | for i, topic := range question.GetTopics() { 61 | logger.Info(" topic-%d: %s", i+1, topic.String()) 62 | } 63 | 64 | for i, follower := range question.GetFollowersN(5) { 65 | logger.Info(" top follower-%d: %s", i+1, follower.String()) 66 | } 67 | 68 | for i, follower := range question.GetFollowers() { 69 | logger.Info(" follower-%d: %s", i+1, follower.String()) 70 | if i >= 10 { 71 | logger.Info(" %d followers not shown.", question.GetFollowersNum()-i-1) 72 | break 73 | } 74 | } 75 | 76 | allAnswers := question.GetAllAnswers() 77 | for i, answer := range allAnswers { 78 | logger.Info(" answer-%d: %s", i+1, answer.String()) 79 | filename := fmt.Sprintf("/tmp/%s-%s的回答.html", question.GetTitle(), answer.GetAuthor().GetUserID()) 80 | dumpAnswerHTML(filename, answer) 81 | if i >= 10 { 82 | logger.Info(" %d answers not shown.", len(allAnswers)-i-1) 83 | break 84 | } 85 | } 86 | 87 | topXAnswers := question.GetTopXAnswers(25) 88 | for i, answer := range topXAnswers { 89 | logger.Info(" top-%d answer: %s", i+1, answer.String()) 90 | } 91 | 92 | logger.Info(" top-1 answer: %s", 
question.GetTopAnswer().String()) 93 | logger.Info(" visit times: %d", question.GetVisitTimes()) 94 | } 95 | 96 | func showAnswer(answer *zhihu.Answer) { 97 | logger.Info("Answer fields:") 98 | logger.Info(" url: %s", answer.Link) 99 | 100 | question := answer.GetQuestion() 101 | logger.Info(" question url: %s", question.Link) 102 | logger.Info(" question title: %s", question.GetTitle()) 103 | 104 | logger.Info(" author: %s", answer.GetAuthor().String()) 105 | logger.Info(" upvote num: %d", answer.GetUpvote()) 106 | logger.Info(" comments num: %d", answer.GetCommentsNum()) 107 | logger.Info(" collected num: %d", answer.GetCollectedNum()) 108 | logger.Info(" data ID: %d", answer.GetID()) 109 | 110 | // dump content 111 | filename := fmt.Sprintf("/tmp/answer_%d.html", answer.GetID()) 112 | dumpAnswerHTML(filename, answer) 113 | 114 | voters := answer.GetVoters() 115 | for i, voter := range voters { 116 | logger.Info(" voter-%d: %s", i+1, voter.String()) 117 | if i >= 10 { 118 | remain := len(voters) - i - 1 119 | logger.Info(" %d votes not shown.", remain) 120 | break 121 | } 122 | } 123 | } 124 | 125 | func showCollection(collection *zhihu.Collection) { 126 | logger.Info("Collection fields:") 127 | logger.Info(" url: %s", collection.Link) 128 | logger.Info(" name: %s", collection.GetName()) 129 | logger.Info(" creator: %s", collection.GetCreator().String()) 130 | logger.Info(" followers num: %d", collection.GetFollowersNum()) 131 | logger.Info(" comments num: %d", collection.GetCommentsNum()) 132 | logger.Info(" questions num: %d", collection.GetQuestionsNum()) 133 | logger.Info(" answers num: %d", collection.GetAnswersNum()) 134 | 135 | for i, follower := range collection.GetFollowersN(5) { 136 | logger.Info(" top follower-%d: %s", i+1, follower.String()) 137 | } 138 | 139 | for i, follower := range collection.GetFollowers() { 140 | logger.Info(" follower-%d: %s", i+1, follower.String()) 141 | } 142 | 143 | for i, question := range collection.GetQuestionsN(5) { 144 
| logger.Info(" top question-%d: %s", i+1, question.String()) 145 | } 146 | 147 | for i, question := range collection.GetQuestions() { 148 | logger.Info(" question-%d: %s", i+1, question.String()) 149 | } 150 | 151 | for i, answer := range collection.GetAnswersN(5) { 152 | logger.Info(" top answer-%d: %s", i+1, answer.String()) 153 | } 154 | 155 | for i, answer := range collection.GetAnswers() { 156 | logger.Info(" answer-%d: %s", i+1, answer.String()) 157 | } 158 | } 159 | 160 | func showUser(user *zhihu.User) { 161 | logger.Info("User fields:") 162 | logger.Info(" is anonymous: %v", user.IsAnonymous()) 163 | logger.Info(" userId: %s", user.GetUserID()) 164 | logger.Info(" dataId: %s", user.GetDataID()) 165 | logger.Info(" avatar: %s", user.GetAvatar()) 166 | logger.Info(" avatar with size hd: %s", user.GetAvatarWithSize("hd")) 167 | logger.Info(" bio: %s", user.GetBio()) 168 | logger.Info(" location: %s", user.GetLocation()) 169 | logger.Info(" business: %s", user.GetBusiness()) 170 | logger.Info(" education: %s", user.GetEducation()) 171 | logger.Info(" gender: %s", user.GetGender()) 172 | logger.Info(" weibo: %s", user.GetWeiboURL()) 173 | logger.Info(" followers num: %d", user.GetFollowersNum()) 174 | logger.Info(" followees num: %d", user.GetFolloweesNum()) 175 | logger.Info(" followed columns num: %d", user.GetFollowedColumnsNum()) 176 | logger.Info(" followed topics num: %d", user.GetFollowedTopicsNum()) 177 | logger.Info(" agree num: %d", user.GetAgreeNum()) 178 | logger.Info(" thanks num: %d", user.GetThanksNum()) 179 | logger.Info(" asks num: %d", user.GetAsksNum()) 180 | logger.Info(" answers num: %d", user.GetAnswersNum()) 181 | logger.Info(" posts num: %d", user.GetPostsNum()) 182 | logger.Info(" collections num: %d", user.GetCollectionsNum()) 183 | logger.Info(" logs num: %d", user.GetLogsNum()) 184 | 185 | for i, topic := range user.GetFollowedTopicsN(5) { 186 | logger.Info(" top followed topic-%d: %s", i+1, topic.String()) 187 | } 188 | 189 | // 
for i, topic := range user.GetFollowedTopics() { 190 | // logger.Info(" followed topic-%d: %s", i+1, topic.String()) 191 | // } 192 | 193 | for i, follower := range user.GetFollowersN(5) { 194 | logger.Info(" top follower-%d: %s", i+1, follower.String()) 195 | } 196 | 197 | // for i, follower := range user.GetFollowers() { 198 | // logger.Info(" follower-%d: %s", i+1, follower.String()) 199 | // } 200 | 201 | for i, followee := range user.GetFolloweesN(5) { 202 | logger.Info(" top followee-%d: %s", i+1, followee.String()) 203 | } 204 | 205 | // for i, followee := range user.GetFollowees() { 206 | // logger.Info(" followee-%d: %s", i+1, followee.String()) 207 | // } 208 | 209 | for i, ask := range user.GetAsksN(5) { 210 | logger.Info(" top ask-%d: %s", i+1, ask.String()) 211 | } 212 | 213 | // for i, ask := range user.GetAsks() { 214 | // logger.Info(" ask-%d: %s", i+1, ask.String()) 215 | // } 216 | 217 | for i, answer := range user.GetAnswersN(5) { 218 | logger.Info(" top answer-%d: %s", i+1, answer.String()) 219 | } 220 | 221 | // for i, answer := range user.GetAnswers() { 222 | // logger.Info(" answer-%d: %s", i+1, answer.String()) 223 | // } 224 | 225 | for i, collection := range user.GetCollectionsN(5) { 226 | logger.Info(" top collection-%d: %s", i+1, collection.String()) 227 | } 228 | 229 | // for i, collection := range user.GetCollections() { 230 | // logger.Info(" collection-%d: %s", i+1, collection.String()) 231 | // } 232 | 233 | for i, like := range user.GetLikes() { 234 | logger.Info(" like-%d: %s", i+1, like.String()) 235 | } 236 | } 237 | 238 | func showTopic(topic *zhihu.Topic) { 239 | logger.Info("Topic fields:") 240 | logger.Info(" url: %s", topic.Link) 241 | logger.Info(" name: %s", topic.GetName()) 242 | logger.Info(" description: %s", topic.GetDescription()) 243 | logger.Info(" followers num: %d", topic.GetFollowersNum()) 244 | 245 | for i, author := range topic.GetTopAuthors() { 246 | logger.Info(" top-%d author: %s", i+1, author.String()) 247 
| } 248 | } 249 | 250 | func dumpAnswerHTML(filename string, answer *zhihu.Answer) error { 251 | err := ioutil.WriteFile(filename, []byte(answer.GetContent()), 0666) 252 | if err == nil { 253 | logger.Info(" content dumped to %s", filename) 254 | } 255 | return err 256 | } 257 | -------------------------------------------------------------------------------- /log.go: -------------------------------------------------------------------------------- 1 | package zhihu 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/fatih/color" 7 | ) 8 | 9 | // Logger 是一个简单的输出工具,可以输出不同颜色的信息 10 | // TODO simple level 11 | type Logger struct { 12 | Enabled bool 13 | } 14 | 15 | func (logger *Logger) log(a ...interface{}) { 16 | if logger.Enabled { 17 | fmt.Println(a...) 18 | } 19 | } 20 | 21 | // Error 输出 error 级别的日志 22 | func (logger *Logger) Error(msg string, a ...interface{}) { 23 | logger.log(color.RedString("ERROR: "+msg, a...)) 24 | } 25 | 26 | // Warn 输出 warning 级别的日志 27 | func (logger *Logger) Warn(msg string, a ...interface{}) { 28 | logger.log(color.YellowString("WARN: "+msg, a...)) 29 | } 30 | 31 | // Warning 是 Warn 的别名 32 | func (logger *Logger) Warning(msg string, a ...interface{}) { 33 | logger.Warn(msg, a...) 
34 | } 35 | 36 | // Info 输出 info 级别的日志 37 | func (logger *Logger) Info(msg string, a ...interface{}) { 38 | logger.log(color.BlueString("INFO: "+msg, a...)) 39 | } 40 | 41 | // Debug 输出 debug 级别的日志 42 | func (logger *Logger) Debug(msg string, a ...interface{}) { 43 | logger.log(color.WhiteString("DEBUG: "+msg, a...)) 44 | } 45 | 46 | // Success 输出 success 的日志,基本上与 info 一样,除了使用了绿色 47 | func (logger *Logger) Success(msg string, a ...interface{}) { 48 | logger.log(color.GreenString("SUCCESS: "+msg, a...)) 49 | } 50 | -------------------------------------------------------------------------------- /log_test.go: -------------------------------------------------------------------------------- 1 | package zhihu 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func Test_Error(t *testing.T) { 8 | var logger = Logger{Enabled: true} 9 | logger.Error("测试:输出一条 ERROR 的信息") 10 | logger.Error("测试:从 1 到 5 分别是:%d, %d, %d, %d, %d", 1, 2, 3, 4, 5) 11 | } 12 | 13 | func Test_Info(t *testing.T) { 14 | var logger = Logger{Enabled: true} 15 | logger.Info("测试:输出一条 INFO 的信息") 16 | logger.Info("测试:从 1 到 5 分别是:%d, %d, %d, %d, %d", 1, 2, 3, 4, 5) 17 | } 18 | -------------------------------------------------------------------------------- /question.go: -------------------------------------------------------------------------------- 1 | package zhihu 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "fmt" 7 | "net/url" 8 | "strconv" 9 | "strings" 10 | 11 | "github.com/PuerkitoBio/goquery" 12 | ) 13 | 14 | // Question 表示一个知乎问题,可以用于获取其标题、详情、答案等信息 15 | type Question struct { 16 | *Page 17 | 18 | // title 是该问题的标题 19 | title string 20 | } 21 | 22 | // NewQuestion 通过给定的 URL 创建一个 Question 对象 23 | func NewQuestion(link string, title string) *Question { 24 | if !validQuestionURL(link) { 25 | panic("问题链接不正确: " + link) 26 | } 27 | 28 | return &Question{ 29 | Page: newZhihuPage(link), 30 | title: title, 31 | } 32 | } 33 | 34 | // GetTitle 获取问题标题 35 | func (q *Question) GetTitle() string { 36 | if q.title != 
"" { 37 | return q.title 38 | } 39 | 40 | doc := q.Doc() 41 | q.title = strip(doc.Find("h2.zm-item-title").First().Text()) 42 | return q.title 43 | } 44 | 45 | // GetDetail 获取问题描述 46 | func (q *Question) GetDetail() string { 47 | if got, ok := q.getStringField("detail"); ok { 48 | return got 49 | } 50 | 51 | doc := q.Doc() 52 | detail := strip(doc.Find("div#zh-question-detail").First().Text()) 53 | q.setField("detail", detail) 54 | return detail 55 | } 56 | 57 | // GetAnswersNum 获取问题回答数量 58 | func (q *Question) GetAnswersNum() int { 59 | if got, ok := q.getIntField("answers-num"); ok { 60 | return got 61 | } 62 | 63 | doc := q.Doc() 64 | data, exists := doc.Find("h3#zh-question-answer-num").Attr("data-num") 65 | answerNum := 0 66 | if exists { 67 | answerNum, _ = strconv.Atoi(data) 68 | } 69 | q.setField("answers-num", answerNum) 70 | return answerNum 71 | } 72 | 73 | // GetFollowersNum 获取问题关注数量 74 | func (q *Question) GetFollowersNum() int { 75 | if got, ok := q.getIntField("followers-num"); ok { 76 | return got 77 | } 78 | 79 | doc := q.Doc() 80 | text := doc.Find("div.zg-gray-normal>a>strong").Text() 81 | followersNum, _ := strconv.Atoi(text) 82 | q.setField("followers-num", followersNum) 83 | return followersNum 84 | } 85 | 86 | // GetTopics 获取问题的话题列表 87 | func (q *Question) GetTopics() []*Topic { 88 | var topics []*Topic 89 | q.Doc().Find("a.zm-item-tag").Each(func(index int, sel *goquery.Selection) { 90 | name := strip(sel.Text()) 91 | href, _ := sel.Attr("href") 92 | thisTopic := NewTopic(makeZhihuLink(href), name) 93 | topics = append(topics, thisTopic) 94 | }) 95 | return topics 96 | } 97 | 98 | // GetFollowersN 返回 n 个关注者,如果 n < 0,返回所有关注者 99 | func (q *Question) GetFollowersN(n int) []*User { 100 | var ( 101 | link = urlJoin(q.Link, "/followers") 102 | xsrf = q.GetXSRF() 103 | ) 104 | users, err := ajaxGetFollowers(link, xsrf, n) 105 | if err != nil { 106 | return nil 107 | } 108 | return users 109 | } 110 | 111 | // GetFollowers 获取关注该问题的用户 112 | func (q 
*Question) GetFollowers() []*User { 113 | return q.GetFollowersN(q.GetFollowersNum()) 114 | } 115 | 116 | // GetAllAnswers 获取问题的所有答案 117 | func (q *Question) GetAllAnswers() []*Answer { 118 | return q.GetTopXAnswers(q.GetAnswersNum()) 119 | } 120 | 121 | // GetTopXAnswers 获取问题 Top X 的答案 122 | func (q *Question) GetTopXAnswers(x int) []*Answer { 123 | if x < 0 || x > q.GetAnswersNum() { 124 | x = q.GetAnswersNum() 125 | } 126 | 127 | // 1. 首页的回答 128 | answers := q.getAnswersOnIndex() 129 | 130 | if x < len(answers) { 131 | return answers[:x] 132 | } 133 | 134 | // 2. "更多",调用 Ajax 接口 135 | moreCount := x - pageSize 136 | if moreCount > 0 { 137 | answers = append(answers, q.getMoreAnswers(moreCount)...) 138 | } 139 | 140 | return answers 141 | } 142 | 143 | // GetTopAnswer 获取问题排名第一的答案 144 | func (q *Question) GetTopAnswer() *Answer { 145 | topAnswers := q.GetTopXAnswers(1) 146 | if len(topAnswers) >= 1 { 147 | return topAnswers[0] 148 | } 149 | return nil 150 | } 151 | 152 | // GetCommentsNum 返回问题的评论数量 153 | func (q *Question) GetCommentsNum() int { 154 | if value, ok := q.getIntField("comment-num"); ok { 155 | return value 156 | } 157 | 158 | doc := q.Doc() 159 | text := doc.Find("div.zm-meta-panel a.toggle-comment").Text() 160 | rv := reMatchInt(strip(text)) 161 | q.setField("comment-num", rv) 162 | return rv 163 | } 164 | 165 | // GetVisitTimes 获取问题的访问次数 166 | func (q *Question) GetVisitTimes() int { 167 | if got, ok := q.getIntField("visit-times"); ok { 168 | return got 169 | } 170 | 171 | doc := q.Doc() 172 | content, exists := doc.Find(`meta[itemprop="visitsCount"]`).Attr("content") 173 | visitTimes := 0 174 | if exists { 175 | visitTimes, _ = strconv.Atoi(content) 176 | } 177 | q.setField("visit-times", visitTimes) 178 | return visitTimes 179 | } 180 | 181 | func (q *Question) String() string { 182 | return fmt.Sprintf("", q.GetTitle(), q.Link) 183 | } 184 | 185 | // getAnswersOnIndex 解析问题页面,返回第一页的回答 186 | func (q *Question) getAnswersOnIndex() []*Answer { 187 
| totalNum := q.GetAnswersNum() 188 | answers := make([]*Answer, 0, minInt(pageSize, totalNum)) 189 | 190 | doc := q.Doc() 191 | 192 | doc.Find("div.zm-item-answer").Each(func(index int, sel *goquery.Selection) { 193 | answers = append(answers, q.processSingleAnswer(sel)) 194 | }) 195 | return answers 196 | } 197 | 198 | // getAnswersByAjax 处理 “更多” 回答,调用 Ajax 接口 199 | func (q *Question) getAnswersByAjax(page int) ([]*Answer, error) { 200 | offset := page * pageSize 201 | if offset > q.GetAnswersNum() { 202 | return nil, errors.New("No more answers.") 203 | } 204 | 205 | // 如果 URL 是 https://www.zhihu.com/question/23759686,则 urlToken 是 23759686 206 | urlToken, _ := strconv.Atoi(q.Link[len(q.Link)-8 : len(q.Link)]) 207 | 208 | form := url.Values{} 209 | form.Set("_xsrf", q.GetXSRF()) 210 | form.Set("method", "next") 211 | form.Set("params", fmt.Sprintf(`{"url_token":%d,"pagesize":%d,"offset":%d}`, urlToken, pageSize, offset)) 212 | 213 | link := makeZhihuLink("/node/QuestionAnswerListV2") 214 | body := strings.NewReader(form.Encode()) 215 | resp, err := gSession.Ajax(link, body, q.Link) 216 | if err != nil { 217 | return nil, err 218 | } 219 | 220 | defer resp.Body.Close() 221 | result := nodeListResult{} 222 | err = json.NewDecoder(resp.Body).Decode(&result) 223 | if err != nil { 224 | return nil, err 225 | } 226 | 227 | answers := make([]*Answer, 0, len(result.Msg)) 228 | for _, answerHtml := range result.Msg { 229 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(answerHtml)) 230 | if err != nil { 231 | return nil, err 232 | } 233 | thisAnswer := q.processSingleAnswer(doc.Selection) 234 | answers = append(answers, thisAnswer) 235 | } 236 | 237 | return answers, nil 238 | } 239 | 240 | // getMoreAnswers 执行多次“更多” 241 | func (q *Question) getMoreAnswers(limit int) []*Answer { 242 | answers := make([]*Answer, 0, limit) 243 | index := 0 244 | totalPage := (limit + pageSize - 1) / pageSize 245 | for index < totalPage { 246 | page := index + 1 247 | 
moreAnswers, err := q.getAnswersByAjax(page) 248 | if err != nil { 249 | logger.Error("加载第 %d 页回答失败,问题:%s,错误:%s", page, q.Link, err.Error()) 250 | } else { 251 | answers = append(answers, moreAnswers...) 252 | } 253 | index++ 254 | } 255 | return answers 256 | } 257 | 258 | // processSingleAnswer 处理一个回答的 HTML 片段, 259 | // 这段 HTML 可能来自问题页面,也可能来自 Ajax 接口 260 | func (q *Question) processSingleAnswer(sel *goquery.Selection) *Answer { 261 | // 1. 获取链接 262 | answerHref, _ := sel.Find("a.answer-date-link").Attr("href") 263 | answerLink := makeZhihuLink(answerHref) 264 | 265 | // 2. 获取作者 266 | authorSel := sel.Find("div.zm-item-answer-author-info") 267 | var author *User 268 | if authorSel.Find("a.author-link").Size() == 0 { 269 | // 匿名用户 270 | author = ANONYMOUS 271 | } else { 272 | // 具名用户 273 | x := authorSel.Find("a.author-link") 274 | userID := strip(x.Text()) 275 | userHref, _ := x.Attr("href") 276 | author = NewUser(makeZhihuLink(userHref), userID) 277 | } 278 | 279 | answer := NewAnswer(answerLink, q, author) 280 | 281 | // 3. 获取赞同数 282 | dataIsOwner, _ := sel.Attr("data-isowner") 283 | isOwner := dataIsOwner == "1" // 判断是否本人的回答 284 | var voteText string 285 | if isOwner { 286 | voteText = strip(sel.Find("a.zm-item-vote-count").Text()) 287 | } else { 288 | voteText = strip(sel.Find("div.zm-votebar").Find("span.count").Text()) 289 | } 290 | answer.setUpvote(upvoteTextToNum(voteText)) 291 | 292 | // 4. 
获取内容 293 | content, _ := answerSelectionToHtml(sel.Find("div.zm-editable-content")) 294 | answer.setContent(content) 295 | 296 | return answer 297 | } 298 | 299 | func (q *Question) setFollowersNum(value int) { 300 | q.setField("followers-num", value) 301 | } 302 | 303 | func (q *Question) setAnswersNum(value int) { 304 | q.setField("answers-num", value) 305 | } 306 | 307 | func (q *Question) setVisitTimes(value int) { 308 | q.setField("visit-times", value) 309 | } 310 | -------------------------------------------------------------------------------- /question_test.go: -------------------------------------------------------------------------------- 1 | package zhihu 2 | 3 | import "testing" 4 | 5 | func init_session() { 6 | Init("./examples/config.json") 7 | } 8 | 9 | func Test_GetTitle(t *testing.T) { 10 | init_session() 11 | 12 | question := NewQuestion("https://www.zhihu.com/question/41171543", "") 13 | got := question.GetTitle() 14 | want := "如何评价第一局比赛 AlphaGo 战胜李世石?" 15 | logger.Info("got title: %s", got) 16 | logger.Info("expected title: %s", want) 17 | if got != want { 18 | t.Error("GetTitle() returns error result") 19 | } 20 | } 21 | 22 | func Test_GetDetail(t *testing.T) { 23 | init_session() 24 | 25 | question := NewQuestion("https://www.zhihu.com/question/41171543", "") 26 | got := question.GetDetail() 27 | want := "本题已收录至知乎圆桌 » 对弈人工智能,更多关于李世石对战人工智能的解读欢迎关注讨论。" 28 | logger.Info("got detail: %s", got) 29 | logger.Info("expected detail: %s", want) 30 | if got != want { 31 | t.Error("GetDetail() returns error result") 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /session.go: -------------------------------------------------------------------------------- 1 | package zhihu 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "io" 7 | "io/ioutil" 8 | "net/http" 9 | "net/url" 10 | "os" 11 | "path/filepath" 12 | "regexp" 13 | "strconv" 14 | "strings" 15 | "time" 16 | 17 | "github.com/juju/persistent-cookiejar" 
18 | ) 19 | 20 | // Auth 是用于登录的信息,保存了用户名和密码 21 | type Auth struct { 22 | Account string `json:"account"` 23 | Password string `json:"password"` 24 | 25 | loginType string // phone_num 或 email 26 | loginURL string // 通过 Account 判断 27 | } 28 | 29 | // isEmail 判断是否通过邮箱登录 30 | func (auth *Auth) isEmail() bool { 31 | return isEmail(auth.Account) 32 | } 33 | 34 | // isPhone 判断是否通过手机号登录 35 | func (auth *Auth) isPhone() bool { 36 | return regexp.MustCompile(`^1[0-9]{10}$`).MatchString(auth.Account) 37 | } 38 | 39 | func (auth *Auth) toForm() url.Values { 40 | if auth.isEmail() { 41 | auth.loginType = "email" 42 | auth.loginURL = makeZhihuLink("/login/email") 43 | } else if auth.isPhone() { 44 | auth.loginType = "phone_num" 45 | auth.loginURL = makeZhihuLink("/login/phone_num") 46 | } else { 47 | panic("无法判断登录类型: " + auth.Account) 48 | } 49 | values := url.Values{} 50 | logger.Info("登录类型:%s, 登录地址:%s", auth.loginType, auth.loginURL) 51 | values.Set(auth.loginType, auth.Account) 52 | values.Set("password", auth.Password) 53 | values.Set("remember_me", "true") // import! 
54 | return values 55 | } 56 | 57 | // Session 保持和知乎服务器的会话,用于向服务器发起请求获取 HTML 或 JSON 数据 58 | type Session struct { 59 | auth *Auth 60 | client *http.Client 61 | } 62 | 63 | type loginResult struct { 64 | R int `json:"r"` 65 | Msg string `json:"msg"` 66 | ErrorCode int `json:"errcode"` 67 | Data interface{} `json:"data"` 68 | } 69 | 70 | // NewSession 创建并返回一个 *Session 对象, 71 | // 这里没有初始化登录账号信息,账号信息用 `LoadConfig` 通过配置文件进行设置 72 | func NewSession() *Session { 73 | s := new(Session) 74 | cookieJar, _ := cookiejar.New(nil) 75 | s.client = &http.Client{ 76 | Jar: cookieJar, 77 | } 78 | return s 79 | } 80 | 81 | // LoadConfig 从配置文件中读取账号信息 82 | // 配置文件 是 JSON 格式: 83 | // { 84 | // "account": "xyz@example.com", 85 | // "password": "p@ssw0rd" 86 | // } 87 | func (s *Session) LoadConfig(cfg string) { 88 | fd, err := os.Open(cfg) 89 | if err != nil { 90 | panic("无法打开配置文件 config.json: " + err.Error()) 91 | } 92 | defer fd.Close() 93 | 94 | auth := new(Auth) 95 | err = json.NewDecoder(fd).Decode(&auth) 96 | if err != nil { 97 | panic("解析配置文件出错: " + err.Error()) 98 | } 99 | 100 | s.auth = auth 101 | // TODO 如果设置了与上一次不一样的账号,最好把 cookies 重置 102 | } 103 | 104 | // Login 登录并保存 cookies 105 | func (s *Session) Login() error { 106 | if s.authenticated() { 107 | logger.Success("已经是登录状态,不需要重复登录") 108 | return nil 109 | } 110 | 111 | form := s.buildLoginForm().Encode() 112 | body := strings.NewReader(form) 113 | req, err := http.NewRequest("POST", s.auth.loginURL, body) 114 | if err != nil { 115 | logger.Error("构造登录请求失败:%s", err.Error()) 116 | return err 117 | } 118 | 119 | headers := newHTTPHeaders(true) 120 | headers.Set("Content-Length", strconv.Itoa(len(form))) 121 | headers.Set("Content-Type", "application/x-www-form-urlencoded") 122 | headers.Set("Referer", baseZhihuURL) 123 | req.Header = headers 124 | 125 | logger.Info("登录中,用户名:%s", s.auth.Account) 126 | 127 | resp, err := s.client.Do(req) 128 | if err != nil { 129 | logger.Error("登录失败:%s", err.Error()) 130 | return err 131 | } 132 | 
133 | if strings.ToLower(resp.Header.Get("Content-Type")) != "application/json" { 134 | logger.Error("服务器没有返回 json 数据") 135 | return fmt.Errorf("未知的 Content-Type: %s", resp.Header.Get("Content-Type")) 136 | } 137 | 138 | defer resp.Body.Close() 139 | result := loginResult{} 140 | content, err := ioutil.ReadAll(resp.Body) 141 | if err != nil { 142 | logger.Error("读取响应内容失败:%s", err.Error()) 143 | } 144 | 145 | logger.Info("登录响应内容:%s", strings.Replace(string(content), "\n", "", -1)) 146 | 147 | err = json.Unmarshal(content, &result) 148 | if err != nil { 149 | logger.Error("JSON 解析失败:%s", err.Error()) 150 | return err 151 | } 152 | 153 | if result.R == 0 { 154 | logger.Success("登录成功!") 155 | s.client.Jar.(*cookiejar.Jar).Save() 156 | return nil 157 | } 158 | if result.R == 1 { 159 | logger.Warn("登录失败!原因:%s", result.Msg) 160 | return fmt.Errorf("登录失败!原因:%s", result.Msg) 161 | } 162 | 163 | logger.Error("登录出现未知错误:%s", string(content)) 164 | return fmt.Errorf("登录失败,未知错误:%s", string(content)) 165 | } 166 | 167 | // Get 发起一个 GET 请求,自动处理 cookies 168 | func (s *Session) Get(url string) (*http.Response, error) { 169 | logger.Info("GET %s", url) 170 | req, err := http.NewRequest("GET", url, nil) 171 | if err != nil { 172 | logger.Error("NewRequest failed with URL: %s", url) 173 | return nil, err 174 | } 175 | 176 | req.Header = newHTTPHeaders(false) 177 | return s.client.Do(req) 178 | } 179 | 180 | // Post 发起一个 POST 请求,自动处理 cookies 181 | func (s *Session) Post(url string, bodyType string, body io.Reader) (*http.Response, error) { 182 | logger.Info("POST %s, %s", url, bodyType) 183 | req, err := http.NewRequest("POST", url, body) 184 | if err != nil { 185 | return nil, err 186 | } 187 | 188 | headers := newHTTPHeaders(false) 189 | headers.Set("Content-Type", bodyType) 190 | req.Header = headers 191 | return s.client.Do(req) 192 | } 193 | 194 | // Ajax 发起一个 Ajax 请求,自动处理 cookies 195 | func (s *Session) Ajax(url string, body io.Reader, referer string) (*http.Response, error) { 196 
| logger.Info("AJAX %s, referrer %s", url, referer) 197 | req, err := http.NewRequest("POST", url, body) 198 | if err != nil { 199 | return nil, err 200 | } 201 | 202 | headers := newHTTPHeaders(true) 203 | headers.Set("Content-Type", "application/x-www-form-urlencoded") 204 | headers.Set("Referer", referer) 205 | req.Header = headers 206 | return s.client.Do(req) 207 | } 208 | 209 | // authenticated 检查是否已经登录(cookies 没有失效) 210 | func (s *Session) authenticated() bool { 211 | originURL := makeZhihuLink("/settings/profile") 212 | resp, err := s.Get(originURL) 213 | if err != nil { 214 | logger.Error("访问 profile 页面出错: %s", err.Error()) 215 | return false 216 | } 217 | 218 | // 如果没有登录,会跳转到 http://www.zhihu.com/?next=%2Fsettings%2Fprofile 219 | lastURL := resp.Request.URL.String() 220 | logger.Info("获取 profile 的请求,跳转到了:%s", lastURL) 221 | return lastURL == originURL 222 | } 223 | 224 | func (s *Session) buildLoginForm() url.Values { 225 | values := s.auth.toForm() 226 | values.Set("_xsrf", s.searchXSRF()) 227 | values.Set("captcha", s.downloadCaptcha()) 228 | return values 229 | } 230 | 231 | // 从 cookies 获取 _xsrf 用于 POST 请求 232 | func (s *Session) searchXSRF() string { 233 | resp, err := s.Get(baseZhihuURL) 234 | if err != nil { 235 | panic("获取 _xsrf 失败:" + err.Error()) 236 | } 237 | 238 | // retrieve from cookies 239 | for _, cookie := range resp.Cookies() { 240 | if cookie.Name == "_xsrf" { 241 | return cookie.Value 242 | } 243 | } 244 | 245 | return "" 246 | } 247 | 248 | // downloadCaptcha 获取验证码,用于登录 249 | func (s *Session) downloadCaptcha() string { 250 | url := makeZhihuLink(fmt.Sprintf("/captcha.gif?r=%d&type=login", 1000*time.Now().Unix())) 251 | logger.Info("获取验证码:%s", url) 252 | resp, err := s.Get(url) 253 | if err != nil { 254 | panic("获取验证码失败:" + err.Error()) 255 | } 256 | if resp.StatusCode != http.StatusOK { 257 | panic(fmt.Sprintf("获取验证码失败,StatusCode = %d", resp.StatusCode)) 258 | } 259 | 260 | defer resp.Body.Close() 261 | 262 | fileExt := 
strings.Split(resp.Header.Get("Content-Type"), "/")[1] 263 | verifyImg := filepath.Join(getCwd(), "verify."+fileExt) 264 | fd, err := os.OpenFile(verifyImg, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0777) 265 | if err != nil { 266 | panic("打开验证码文件失败:" + err.Error()) 267 | } 268 | defer fd.Close() 269 | 270 | io.Copy(fd, resp.Body) // 保存验证码文件 271 | openCaptchaFile(verifyImg) // 调用外部程序打开 272 | captcha := readCaptchaInput() // 读取用户输入 273 | 274 | return captcha 275 | } 276 | 277 | var ( 278 | gSession = NewSession() // 全局的 Session,调用 Init() 初始化 279 | ) 280 | 281 | // Init 用于传入配置文件,配置全局的 Session 282 | func Init(cfgFile string) { 283 | // 配置账号信息 284 | gSession.LoadConfig(cfgFile) 285 | 286 | // 登录 287 | gSession.Login() 288 | } 289 | 290 | // SetSession 用于替换默认的 session 291 | func SetSession(s *Session) { 292 | gSession = s 293 | } 294 | -------------------------------------------------------------------------------- /session_test.go: -------------------------------------------------------------------------------- 1 | package zhihu 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | const cfgFile = "./examples/config.json" 8 | 9 | func Test_searchXsrf(t *testing.T) { 10 | s := NewSession() 11 | logger.Debug("_xsrf: %s", s.searchXSRF()) 12 | } 13 | 14 | //func Test_downloadCaptcha(t *testing.T) { 15 | // s := NewSession("./example/config.json") 16 | // s.downloadCaptcha() 17 | //} 18 | 19 | //func Test_buildLoginForm(t *testing.T) { 20 | // s := &Session{} 21 | // s.LoadConfig() 22 | // values := s.buildLoginForm() 23 | // fmt.Println(values.Encode()) 24 | //} 25 | -------------------------------------------------------------------------------- /topic.go: -------------------------------------------------------------------------------- 1 | package zhihu 2 | 3 | import ( 4 | "fmt" 5 | "strconv" 6 | 7 | "github.com/PuerkitoBio/goquery" 8 | ) 9 | 10 | type Topic struct { 11 | *Page 12 | 13 | // name 是改话题的名称 14 | name string 15 | } 16 | 17 | func NewTopic(link string, name string) *Topic { 
18 | if !validTopicURL(link) { 19 | panic("非法的 Topic 链接:%s" + link) 20 | } 21 | return &Topic{ 22 | Page: newZhihuPage(link), 23 | name: name, 24 | } 25 | } 26 | 27 | // GetName 返回话题名称 28 | func (t *Topic) GetName() string { 29 | if t.name != "" { 30 | return t.name 31 | } 32 | 33 | //

Python

34 | t.name = strip(t.Doc().Find("h1.zm-editable-content").Text()) 35 | return t.name 36 | } 37 | 38 | // GetDescription 返回话题的描述 39 | func (t *Topic) GetDescription() string { 40 | if got, ok := t.getStringField("description"); ok { 41 | return got 42 | } 43 | 44 | //
45 | // Python 是一种面向对象的解释型计算机程序设计语言,在设计中注重代码的可读性,同时也是一种功能强大的通用型语言。 46 | // 47 | // 修改 48 | // 49 | //
50 | description := strip(t.Doc().Find("div.zm-editable-content").Text()) 51 | t.setField("description", description) 52 | return description 53 | } 54 | 55 | // GetFollowersNum 返回关注者数量 56 | func (t *Topic) GetFollowersNum() int { 57 | if got, ok := t.getIntField("followers-num"); ok { 58 | return got 59 | } 60 | 61 | //
62 | // 63 | // 82155 64 | // 人关注了该话题 65 | //
66 | text := strip(t.Doc().Find("div.zm-topic-side-followers-info strong").Text()) 67 | num, _ := strconv.Atoi(text) 68 | t.setField("followers-num", num) 69 | return num 70 | } 71 | 72 | // GetTopAuthors 返回最佳回答者,一般来说是 5 个 73 | func (t *Topic) GetTopAuthors() []*User { 74 | authors := make([]*User, 0, 5) 75 | div := t.Doc().Find("div#zh-topic-top-answerer") 76 | div.Find("div.zm-topic-side-person-item-content").Each(func(index int, sel *goquery.Selection) { 77 | tag := sel.Find("a").First() 78 | uHref, _ := tag.Attr("href") 79 | uId := strip(tag.Text()) 80 | 81 | thisAuthor := NewUser(makeZhihuLink(uHref), uId) 82 | 83 | bio, _ := sel.Find("div.zm-topic-side-bio").Attr("title") 84 | thisAuthor.setBio(bio) 85 | 86 | authors = append(authors, thisAuthor) 87 | }) 88 | return authors 89 | } 90 | 91 | func (t *Topic) String() string { 92 | return fmt.Sprintf("", t.GetName(), t.Link) 93 | } 94 | -------------------------------------------------------------------------------- /user.go: -------------------------------------------------------------------------------- 1 | package zhihu 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "net/url" 7 | "strconv" 8 | "strings" 9 | 10 | "github.com/PuerkitoBio/goquery" 11 | ) 12 | 13 | var ( 14 | ANONYMOUS = NewUser("", "匿名用户") 15 | ) 16 | 17 | // User 表示一个知乎用户 18 | type User struct { 19 | *Page 20 | 21 | // userId 表示用户的知乎 ID(用户名) 22 | userID string 23 | } 24 | 25 | // NewUser 创建一个用户对象。 26 | // link 为空的时候表示匿名用户,此时 userId 仅允许 "匿名用户" 或 "知乎用户"; 27 | // userId 可以为空,这种情况下调用 GetUserID 会去解析用户主页 28 | func NewUser(link string, userID string) *User { 29 | if link == "" && !isAnonymous(userID) { 30 | panic("调用 NewUser 的参数不合法") 31 | } 32 | 33 | return &User{ 34 | Page: newZhihuPage(link), 35 | userID: userID, 36 | } 37 | } 38 | 39 | // GetUserID 返回用户的知乎 ID 40 | func (user *User) GetUserID() string { 41 | if user.userID != "" { 42 | return user.userID 43 | } 44 | 45 | doc := user.Doc() 46 | 47 | //
48 | // 黄继新, 49 | // 和知乎在一起 50 | //
51 | user.userID = strip(doc.Find("div.title-section.ellipsis").Find("span.name").Text()) 52 | return user.userID 53 | } 54 | 55 | // GetDataID 返回用户的 data-id 56 | func (user *User) GetDataID() string { 57 | if user.IsAnonymous() { 58 | return "" 59 | } 60 | 61 | if got, ok := user.getStringField("data-id"); ok { 62 | return got 63 | } 64 | 65 | doc := user.Doc() 66 | 67 | // 分两种情况:自己和其他用户 68 | // 1. 其他用户 69 | //
70 | // 71 | //
72 | // 73 | // 2. 自己 74 | // 75 | var dataID string 76 | btns := doc.Find("div.zm-profile-header-op-btns") 77 | if btns.Size() > 0 { 78 | // 1. 其他用户 79 | dataID, _ = btns.Find("button").Attr("data-id") 80 | } else { 81 | // 2. 自己 82 | script := doc.Find(`script[data-name="ga_vars"]`).Text() 83 | data := make(map[string]interface{}) 84 | json.Unmarshal([]byte(script), &data) 85 | dataID = data["user_hash"].(string) 86 | } 87 | user.setField("data-id", dataID) 88 | return dataID 89 | } 90 | 91 | // GetBio 返回用户的 BIO 92 | func (user *User) GetBio() string { 93 | if user.IsAnonymous() { 94 | return "" 95 | } 96 | 97 | if got, ok := user.getStringField("bio"); ok { 98 | return got 99 | } 100 | 101 | doc := user.Doc() 102 | 103 | // 程序员,用 Python 和 Go 做服务端开发。 104 | bio := strip(doc.Find("span.bio").Eq(0).Text()) 105 | user.setField("bio", bio) 106 | return bio 107 | } 108 | 109 | // GetLocation 返回用户所在地 110 | func (user *User) GetLocation() string { 111 | return user.getProfile("location") 112 | } 113 | 114 | // GetBusiness 返回用户的所在行业 115 | func (user *User) GetBusiness() string { 116 | return user.getProfile("business") 117 | } 118 | 119 | // GetEducation 返回用户的教育信息 120 | func (user *User) GetEducation() string { 121 | return user.getProfile("education") 122 | } 123 | 124 | // GetGender 返回用户的性别(male/female/unknown) 125 | func (user *User) GetGender() string { 126 | gender := "unknown" 127 | if user.IsAnonymous() { 128 | return gender 129 | } 130 | 131 | if got, ok := user.getStringField("gender"); ok { 132 | return got 133 | } 134 | 135 | doc := user.Doc() 136 | 137 | // 138 | sel := doc.Find("span.gender").Find("i") 139 | if sel.HasClass("icon-profile-male") { 140 | gender = "male" 141 | } else { 142 | gender = "female" 143 | } 144 | user.setField("gender", gender) 145 | return gender 146 | } 147 | 148 | // GetAvatar 返回用户的头像 URL,默认的尺寸 149 | func (user *User) GetAvatar() string { 150 | if user.IsAnonymous() { 151 | return "" 152 | } 153 | 154 | if got, ok := 
user.getStringField("avatar"); ok { 155 | return got 156 | } 157 | 158 | img := user.Doc().Find("div.body").Find("img.Avatar").First() 159 | avatar, _ := img.Attr("src") 160 | user.setField("avatar", avatar) 161 | return avatar 162 | } 163 | 164 | // GetAvatarWithSize 返回指定尺寸的的头像 URL,size 支持的值:s, xs, m, l, xl, hd, "" 165 | func (user *User) GetAvatarWithSize(size string) string { 166 | defaultAvatar := user.GetAvatar() 167 | if defaultAvatar == "" { 168 | return defaultAvatar 169 | } 170 | 171 | if !validateAvatarSize(size) { 172 | return defaultAvatar 173 | } 174 | 175 | return replaceAvatarSize(defaultAvatar, size) 176 | } 177 | 178 | // GetWeiboURL 返回用户的微博主页 URL 179 | func (user *User) GetWeiboURL() string { 180 | if user.IsAnonymous() { 181 | return "" 182 | } 183 | 184 | if got, ok := user.getStringField("weibo-url"); ok { 185 | return got 186 | } 187 | 188 | value := "" 189 | tag := user.Doc().Find("a.zm-profile-header-user-weibo") 190 | if tag.Size() > 0 { 191 | value, _ = tag.First().Attr("href") 192 | } 193 | user.setField("weibo-url", value) 194 | return value 195 | } 196 | 197 | // GetFollowersNum 返回用户的粉丝数量 198 | func (user *User) GetFollowersNum() int { 199 | return user.getFollowersNumOrFolloweesNum("followers-num") 200 | } 201 | 202 | // GetFolloweesNum 返回用户关注的数量 203 | func (user *User) GetFolloweesNum() int { 204 | return user.getFollowersNumOrFolloweesNum("followees-num") 205 | } 206 | 207 | // GetFollowedColumnsNum 返回用户关注的专栏数量 208 | func (user *User) GetFollowedColumnsNum() int { 209 | return user.getFollowedColumnsOrTopicsNum("followed-columns-num") 210 | } 211 | 212 | // GetFollowedTopicsNum 返回用户关注的话题数量 213 | func (user *User) GetFollowedTopicsNum() int { 214 | return user.getFollowedColumnsOrTopicsNum("followed-topics-num") 215 | } 216 | 217 | // GetAgreeNum 返回用户的点赞数 218 | func (user *User) GetAgreeNum() int { 219 | return user.getAgreeOrThanksNum("agree-num") 220 | } 221 | 222 | // GetThanksNum 返回用户的感谢数 223 | func (user *User) GetThanksNum() int 
{ 224 | return user.getAgreeOrThanksNum("thanks-num") 225 | } 226 | 227 | // GetAsksNum 返回用户的提问数 228 | func (user *User) GetAsksNum() int { 229 | return user.getProfileNum("asks-num") 230 | } 231 | 232 | // GetAnswersNum 返回用户的回答数 233 | func (user *User) GetAnswersNum() int { 234 | return user.getProfileNum("answers-num") 235 | } 236 | 237 | // GetPostsNum 返回用户的专栏文章数量 238 | func (user *User) GetPostsNum() int { 239 | return user.getProfileNum("posts-num") 240 | } 241 | 242 | // GetCollectionsNum 返回用户的收藏夹数量 243 | func (user *User) GetCollectionsNum() int { 244 | return user.getProfileNum("collections-num") 245 | } 246 | 247 | // GetLogsNum 返回用户公共编辑数量 248 | func (user *User) GetLogsNum() int { 249 | return user.getProfileNum("logs-num") 250 | } 251 | 252 | // GetFolloweesN 返回前 n 个用户关注的人,如果 n < 0,返回所有关注的人 253 | func (user *User) GetFolloweesN(n int) []*User { 254 | users, err := user.getFolloweesOrFollowers("followees", n) 255 | if err != nil { 256 | logger.Error("获取 %s 关注的人失败:%s", user.String(), err.Error()) 257 | return nil 258 | } 259 | return users 260 | } 261 | 262 | // GetFollowees 返回用户关注的人 263 | func (user *User) GetFollowees() []*User { 264 | return user.GetFolloweesN(-1) 265 | } 266 | 267 | // GetFollowersN 返回前 n 个粉丝,如果 n < 0,返回所有粉丝 268 | func (user *User) GetFollowersN(n int) []*User { 269 | users, err := user.getFolloweesOrFollowers("followers", n) 270 | if err != nil { 271 | logger.Error("获取 %s 的粉丝失败:%s", user.String(), err.Error()) 272 | return nil 273 | } 274 | return users 275 | 276 | } 277 | 278 | // GetFollowers 返回用户的粉丝列表 279 | func (user *User) GetFollowers() []*User { 280 | return user.GetFollowersN(-1) 281 | } 282 | 283 | // GetAsksN 返回用户前 n 个提问,如果 n < 0, 返回所有提问 284 | func (user *User) GetAsksN(n int) []*Question { 285 | if user.IsAnonymous() { 286 | return nil 287 | } 288 | 289 | total := user.GetAsksNum() 290 | if n < 0 || n > total { 291 | n = total 292 | } 293 | if n == 0 { 294 | return nil 295 | } 296 | 297 | page := 1 298 | questions := 
make([]*Question, 0, n) 299 | for page < ((n-1)/pageSize + 2) { 300 | link := urlJoin(user.Link, fmt.Sprintf("/asks?page=%d", page)) 301 | doc, err := newDocumentFromURL(link) 302 | if err != nil { 303 | return nil 304 | } 305 | 306 | doc.Find("div#zh-profile-ask-list").Children().Each(func(index int, sel *goquery.Selection) { 307 | a := sel.Find("a.question_link") 308 | title := strip(a.Text()) 309 | href, _ := a.Attr("href") 310 | questionLink := makeZhihuLink(href) 311 | thisQuestion := NewQuestion(questionLink, title) 312 | 313 | // 获取回答数 314 | answersNum := reMatchInt(strip(sel.Find("div.meta").Contents().Eq(4).Text())) 315 | thisQuestion.setAnswersNum(answersNum) 316 | 317 | // 获取关注数 318 | followersNum := reMatchInt(strip(sel.Find("div.meta").Contents().Eq(6).Text())) 319 | thisQuestion.setFollowersNum(followersNum) 320 | 321 | // 获取浏览量 322 | visitTimes, _ := strconv.Atoi(strip(sel.Find("div.zm-profile-vote-num").Text())) 323 | thisQuestion.setVisitTimes(visitTimes) 324 | 325 | questions = append(questions, thisQuestion) 326 | }) 327 | 328 | if n > 0 && len(questions) >= n { 329 | return questions[:n] 330 | } 331 | 332 | page++ 333 | } 334 | return questions 335 | } 336 | 337 | // GetAsks 返回用户所有的提问 338 | func (user *User) GetAsks() []*Question { 339 | return user.GetAsksN(-1) 340 | } 341 | 342 | // GetAnswersN 返回用户前 n 个回答,如果 n < 0,返回所有回答 343 | func (user *User) GetAnswersN(n int) []*Answer { 344 | if user.IsAnonymous() { 345 | return nil 346 | } 347 | 348 | total := user.GetAnswersNum() 349 | if n < 0 || n > total { 350 | n = total 351 | } 352 | if n == 0 { 353 | return nil 354 | } 355 | 356 | page := 1 357 | answers := make([]*Answer, 0, n) 358 | for page < ((n-1)/pageSize + 2) { 359 | link := urlJoin(user.Link, fmt.Sprintf("/answers?page=%d", page)) 360 | doc, err := newDocumentFromURL(link) 361 | if err != nil { 362 | return nil 363 | } 364 | 365 | doc.Find("div#zh-profile-answer-list").Children().Each(func(index int, sel *goquery.Selection) { 366 | a := 
sel.Find("a.question_link") 367 | qTitle := strip(a.Text()) 368 | answerHref, _ := a.Attr("href") 369 | qLink := makeZhihuLink(answerHref[0:strings.Index(answerHref, "/answer")]) 370 | question := NewQuestion(qLink, qTitle) 371 | thisAnswer := NewAnswer(makeZhihuLink(answerHref), question, user) 372 | 373 | voteText, _ := sel.Find("a.zm-item-vote-count").Attr("data-votecount") 374 | vote, _ := strconv.Atoi(voteText) 375 | thisAnswer.setUpvote(vote) 376 | 377 | answers = append(answers, thisAnswer) 378 | }) 379 | 380 | if n > 0 && len(answers) >= n { 381 | return answers[:n] 382 | } 383 | 384 | page++ 385 | } 386 | 387 | return answers 388 | } 389 | 390 | // GetAnswers 返回用户所有的回答 391 | func (user *User) GetAnswers() []*Answer { 392 | return user.GetAnswersN(-1) 393 | } 394 | 395 | // GetCollectionsN 返回用户前 n 个收藏夹,如果 n < 0,返回所有收藏夹 396 | func (user *User) GetCollectionsN(n int) []*Collection { 397 | if user.IsAnonymous() { 398 | return nil 399 | } 400 | 401 | total := user.GetCollectionsNum() 402 | if n < 0 || n > total { 403 | n = total 404 | } 405 | if n == 0 { 406 | return nil 407 | } 408 | 409 | page := 1 410 | collections := make([]*Collection, 0, n) 411 | for page < ((n-1)/pageSize + 2) { 412 | link := urlJoin(user.Link, fmt.Sprintf("/collections?page=%d", page)) 413 | doc, err := newDocumentFromURL(link) 414 | if err != nil { 415 | return nil 416 | } 417 | 418 | doc.Find("div.zm-profile-section-item").Each(func(index int, sel *goquery.Selection) { 419 | a := sel.Find("a.zm-profile-fav-item-title") 420 | cName := strip(a.Text()) 421 | href, _ := a.Attr("href") 422 | cLink := makeZhihuLink(href) 423 | thisCollection := NewCollection(cLink, cName, user) 424 | collections = append(collections, thisCollection) 425 | }) 426 | 427 | if n > 0 && len(collections) >= n { 428 | return collections[:n] 429 | } 430 | 431 | page++ 432 | } 433 | 434 | return collections 435 | } 436 | 437 | // GetCollections 返回用户的收藏夹 438 | func (user *User) GetCollections() []*Collection { 439 | 
return user.GetCollectionsN(-1) 440 | } 441 | 442 | // GetFollowedTopicsN 返回用户前 n 个关注的话题,如果 n < 0,返回所有话题 443 | func (user *User) GetFollowedTopicsN(n int) []*Topic { 444 | if user.IsAnonymous() { 445 | return nil 446 | } 447 | 448 | total := user.GetFollowedTopicsNum() 449 | if n < 0 || n > total { 450 | n = total 451 | } 452 | if n == 0 { 453 | return nil 454 | } 455 | 456 | var ( 457 | link = urlJoin(user.Link, "/topics") 458 | gotDataNum = pageSize 459 | offset = 0 460 | topics = make([]*Topic, 0, n) 461 | ) 462 | 463 | form := url.Values{} 464 | form.Set("_xsrf", user.GetXSRF()) 465 | form.Set("start", "0") 466 | 467 | for gotDataNum == pageSize { 468 | form.Set("offset", strconv.Itoa(offset)) 469 | doc, dataNum, err := newDocByNormalAjax(link, form) 470 | if err != nil { 471 | return nil 472 | } 473 | 474 | doc.Find("div.zm-profile-section-item").Each(func(index int, sel *goquery.Selection) { 475 | tName := strip(sel.Find("strong").Text()) 476 | tHref, _ := sel.Find("a.zm-list-avatar-link").Attr("href") 477 | thisTopic := NewTopic(makeZhihuLink(tHref), tName) 478 | topics = append(topics, thisTopic) 479 | }) 480 | 481 | if n > 0 && len(topics) >= n { 482 | return topics[:n] 483 | } 484 | 485 | gotDataNum = dataNum 486 | offset += gotDataNum 487 | } 488 | 489 | return topics 490 | } 491 | 492 | // GetFollowedTopics 返回用户关注的话题 493 | func (user *User) GetFollowedTopics() []*Topic { 494 | return user.GetFollowedTopicsN(-1) 495 | } 496 | 497 | // GetLikes 返回用户赞过的回答 498 | func (user *User) GetLikes() []*Answer { 499 | if user.IsAnonymous() { 500 | return nil 501 | } 502 | // TODO 503 | return nil 504 | } 505 | 506 | // GetVotedAnswers 是 GetLikes 的别名 507 | func (user *User) GetVotedAnswers() []*Answer { 508 | return user.GetLikes() 509 | } 510 | 511 | // IsAnonymous 表示该用户是否匿名用户 512 | func (user *User) IsAnonymous() bool { 513 | return isAnonymous(user.userID) 514 | } 515 | 516 | func (user *User) String() string { 517 | if user.IsAnonymous() { 518 | return 
fmt.Sprintf("", user.userID) 519 | } 520 | return fmt.Sprintf("", user.userID, user.Link) 521 | } 522 | 523 | func (user *User) getProfile(cacheKey string) string { 524 | if user.IsAnonymous() { 525 | return "" 526 | } 527 | 528 | if got, ok := user.getStringField(cacheKey); ok { 529 | return got 530 | } 531 | 532 | doc := user.Doc() 533 | 534 | // 深圳 535 | // ... 536 | // ... 537 | value, _ := doc.Find(fmt.Sprintf("span.%s", cacheKey)).Attr("title") 538 | user.setField(cacheKey, value) 539 | return value 540 | } 541 | 542 | func (user *User) getFollowersNumOrFolloweesNum(cacheKey string) int { 543 | if user.IsAnonymous() { 544 | return 0 545 | } 546 | 547 | if got, ok := user.getIntField(cacheKey); ok { 548 | return got 549 | } 550 | 551 | var index int 552 | switch cacheKey { 553 | case "followees-num": 554 | index = 0 555 | case "followers-num": 556 | index = 1 557 | default: 558 | return 0 559 | } 560 | 561 | doc := user.Doc() 562 | 563 | // 571 | value := doc.Find("div.zm-profile-side-following a strong").Eq(index).Text() 572 | num, _ := strconv.Atoi(value) 573 | user.setField(cacheKey, num) 574 | return num 575 | } 576 | 577 | func (user *User) getFollowedColumnsOrTopicsNum(cacheKey string) int { 578 | if user.IsAnonymous() { 579 | return 0 580 | } 581 | 582 | if got, ok := user.getIntField(cacheKey); ok { 583 | return got 584 | } 585 | 586 | var selector string 587 | switch cacheKey { 588 | case "followed-topics-num": 589 | selector = "div.zm-profile-side-topics" 590 | case "followed-columns-num": 591 | selector = "div.zm-profile-side-columns" 592 | default: 593 | return 0 594 | } 595 | 596 | doc := user.Doc() 597 | result := 0 598 | sel := doc.Find(selector) 599 | if sel.Size() > 0 { 600 | text := sel.Parent().Find("a.zg-link-litblue").Find("strong").Text() 601 | result = reMatchInt(strip(text)) 602 | } 603 | user.setField(cacheKey, result) 604 | return result 605 | } 606 | 607 | func (user *User) getAgreeOrThanksNum(cacheKey string) int { 608 | if 
user.IsAnonymous() { 609 | return 0 610 | } 611 | 612 | var selector string 613 | switch cacheKey { 614 | case "agree-num": 615 | selector = "span.zm-profile-header-user-agree > strong" 616 | case "thanks-num": 617 | selector = "span.zm-profile-header-user-thanks > strong" 618 | default: 619 | return 0 620 | } 621 | 622 | if got, ok := user.getIntField(cacheKey); ok { 623 | return got 624 | } 625 | 626 | doc := user.Doc() 627 | 628 | //
629 | //
630 | // 获得 631 | // 68200赞同 632 | // 17511感谢 633 | //
634 | //
635 | num, _ := strconv.Atoi(doc.Find(selector).Text()) 636 | user.setField(cacheKey, num) 637 | return num 638 | } 639 | 640 | func (user *User) getProfileNum(cacheKey string) int { 641 | if user.IsAnonymous() { 642 | return 0 643 | } 644 | 645 | if got, ok := user.getIntField(cacheKey); ok { 646 | return got 647 | } 648 | 649 | var index int 650 | switch cacheKey { 651 | case "asks-num": 652 | index = 0 653 | case "answers-num": 654 | index = 1 655 | case "posts-num": 656 | index = 2 657 | case "collections-num": 658 | index = 3 659 | case "logs-num": 660 | index = 4 661 | default: 662 | return 0 663 | } 664 | 665 | doc := user.Doc() 666 | 667 | //
668 | // 主页 669 | // 提问 1336 670 | // 回答 785 671 | // 专栏文章 91 672 | // 收藏 44 673 | // 公共编辑 51471 674 | //
675 | value := doc.Find("div.profile-navbar").Find("span.num").Eq(index).Text() 676 | num, _ := strconv.Atoi(value) 677 | user.setField(cacheKey, num) 678 | return num 679 | } 680 | 681 | func (user *User) getFolloweesOrFollowers(eeOrEr string, limit int) ([]*User, error) { 682 | if user.IsAnonymous() { 683 | return nil, nil 684 | } 685 | 686 | if limit == 0 { 687 | return nil, nil 688 | } 689 | 690 | var ( 691 | referer, ajaxURL string 692 | offset, totalNum int 693 | hashID = user.GetDataID() 694 | ) 695 | 696 | if eeOrEr == "followees" { 697 | referer = urlJoin(user.Link, "/followees") 698 | ajaxURL = makeZhihuLink("/node/ProfileFolloweesListV2") 699 | totalNum = user.GetFollowersNum() 700 | } else { 701 | referer = urlJoin(user.Link, "/followers") 702 | ajaxURL = makeZhihuLink("/node/ProfileFollowersListV2") 703 | totalNum = user.GetFolloweesNum() 704 | } 705 | 706 | if limit < 0 || limit > totalNum { 707 | limit = totalNum 708 | } 709 | 710 | form := url.Values{} 711 | form.Set("_xsrf", user.GetXSRF()) 712 | form.Set("method", "next") 713 | 714 | users := make([]*User, 0, limit) 715 | for { 716 | form.Set("params", fmt.Sprintf(`{"offset":%d,"order_by":"created","hash_id":"%s"}`, offset, hashID)) 717 | body := strings.NewReader(form.Encode()) 718 | resp, err := gSession.Ajax(ajaxURL, body, referer) 719 | if err != nil { 720 | return nil, err 721 | } 722 | 723 | defer resp.Body.Close() 724 | result := nodeListResult{} 725 | err = json.NewDecoder(resp.Body).Decode(&result) 726 | if err != nil { 727 | logger.Error("json decode failed: %s", err.Error()) 728 | return nil, err 729 | } 730 | 731 | for _, userHTML := range result.Msg { 732 | thisUser, err := newUserFromHTML(userHTML) 733 | if err != nil { 734 | return nil, err 735 | } 736 | users = append(users, thisUser) 737 | if len(users) == limit { 738 | break 739 | } 740 | } 741 | 742 | // 已经获取了需要的数量,或者数量不够,但是已经到了最后一页 743 | if len(users) == limit || len(result.Msg) < pageSize { 744 | break 745 | } else { 746 | 
offset += pageSize 747 | } 748 | } 749 | return users, nil 750 | } 751 | 752 | func (user *User) setFollowersNum(value int) { 753 | user.setField("followers-num", value) 754 | } 755 | 756 | func (user *User) setAsksNum(value int) { 757 | user.setField("asks-num", value) 758 | } 759 | 760 | func (user *User) setAnswersNum(value int) { 761 | user.setField("answers-num", value) 762 | } 763 | 764 | func (user *User) setAgreeNum(value int) { 765 | user.setField("agree-num", value) 766 | } 767 | 768 | func (user *User) setBio(value string) { 769 | user.setField("bio", value) 770 | } 771 | 772 | func isAnonymous(userID string) bool { 773 | return userID == "匿名用户" || userID == "知乎用户" 774 | } 775 | 776 | func newUserFromHTML(html string) (*User, error) { 777 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) 778 | if err != nil { 779 | logger.Error("NewDocumentFromReader failed: %s", err.Error()) 780 | return nil, err 781 | } 782 | 783 | return newUserFromSelector(doc.Selection), nil 784 | } 785 | 786 | func newUserFromSelector(sel *goquery.Selection) *User { 787 | a := sel.Find("h2.zm-list-content-title").Find("a.zg-link") 788 | if a.Size() == 0 { 789 | // 匿名用户,没有用户主页入口 790 | return ANONYMOUS 791 | } 792 | 793 | userId := strip(a.Text()) 794 | link, _ := a.Attr("href") 795 | 796 | user := NewUser(link, userId) 797 | 798 | // 获取 BIO 799 | bio := strip(sel.Find("div.zg-big-gray").Text()) 800 | user.setField("bio", bio) 801 | 802 | // 获取关注者数量 803 | followersNum := reMatchInt(strip(sel.Find("div.details").Find("a").Eq(0).Text())) 804 | user.setFollowersNum(followersNum) 805 | 806 | // 获取提问数 807 | asksNum := reMatchInt(strip(sel.Find("div.details").Find("a").Eq(1).Text())) 808 | user.setAsksNum(asksNum) 809 | 810 | // 获取回答数 811 | answersNum := reMatchInt(strip(sel.Find("div.details").Find("a").Eq(2).Text())) 812 | user.setAnswersNum(answersNum) 813 | 814 | // 获取赞同数 815 | agreeNum := reMatchInt(strip(sel.Find("div.details").Find("a").Eq(3).Text())) 816 | 
user.setAgreeNum(agreeNum) 817 | 818 | return user 819 | } 820 | -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | package zhihu 2 | 3 | import ( 4 | "fmt" 5 | "io/ioutil" 6 | "net/http" 7 | "os" 8 | "os/exec" 9 | "path/filepath" 10 | "regexp" 11 | "runtime" 12 | "strconv" 13 | "strings" 14 | 15 | "github.com/PuerkitoBio/goquery" 16 | "github.com/fatih/color" 17 | ) 18 | 19 | const ( 20 | userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36" 21 | baseZhihuURL = "https://www.zhihu.com" 22 | pageSize = 20 23 | ) 24 | 25 | var ( 26 | reQuestionURL = regexp.MustCompile("^(http|https)://www.zhihu.com/question/[0-9]{8}$") 27 | reCollectionURL = regexp.MustCompile("^(http|https)://www.zhihu.com/collection/[0-9]{8,9}$") // bugfix: for private collection 28 | reTopicURL = regexp.MustCompile("^(http|https)://www.zhihu.com/topic/[0-9]{8}$") 29 | reGetNumber = regexp.MustCompile(`([0-9])+`) 30 | reAvatarReplacer = regexp.MustCompile(`_(s|xs|m|l|xl|hd).(png|jpg)`) 31 | reIsEmail = regexp.MustCompile(`^[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}$`) 32 | logger = Logger{Enabled: true} 33 | ) 34 | 35 | func validQuestionURL(value string) bool { 36 | return reQuestionURL.MatchString(value) 37 | } 38 | 39 | func validCollectionURL(value string) bool { 40 | return reCollectionURL.MatchString(value) 41 | } 42 | 43 | func validTopicURL(value string) bool { 44 | return reTopicURL.MatchString(value) 45 | } 46 | 47 | func reMatchInt(raw string) int { 48 | matched := reGetNumber.FindStringSubmatch(raw) 49 | if len(matched) == 0 { 50 | return 0 51 | } 52 | rv, _ := strconv.Atoi(matched[0]) 53 | return rv 54 | } 55 | 56 | func validateAvatarSize(size string) bool { 57 | for _, x := range []string{"s", "xs", "m", "l", "xl", "hd"} { 58 | if size == x { 59 | return true 60 | } 61 | } 62 | 
return false 63 | } 64 | 65 | func replaceAvatarSize(origin string, size string) string { 66 | return reAvatarReplacer.ReplaceAllString(origin, fmt.Sprintf("_%s.$2", size)) 67 | } 68 | 69 | func isEmail(value string) bool { 70 | return reIsEmail.MatchString(value) 71 | } 72 | 73 | func newHTTPHeaders(isXhr bool) http.Header { 74 | headers := make(http.Header) 75 | headers.Set("Accept", "*/*") 76 | headers.Set("Connection", "keep-alive") 77 | headers.Set("Host", "www.zhihu.com") 78 | headers.Set("Origin", "http://www.zhihu.com") 79 | headers.Set("Pragma", "no-cache") 80 | headers.Set("User-Agent", userAgent) 81 | if isXhr { 82 | headers.Set("X-Requested-With", "XMLHttpRequest") 83 | } 84 | return headers 85 | } 86 | 87 | func strip(s string) string { 88 | return strings.TrimSpace(s) 89 | } 90 | 91 | func minInt(a, b int) int { 92 | if a > b { 93 | return b 94 | } 95 | return a 96 | } 97 | 98 | func getCwd() string { 99 | cwd, err := os.Getwd() 100 | if err != nil { 101 | panic("获取 CWD 失败:" + err.Error()) 102 | } 103 | return cwd 104 | } 105 | 106 | func save(filename string, content []byte) error { 107 | return ioutil.WriteFile(filename, content, 0666) 108 | } 109 | 110 | func saveString(filename string, content string) error { 111 | return ioutil.WriteFile(filename, []byte(content), 0666) 112 | } 113 | 114 | func openCaptchaFile(filename string) error { 115 | logger.Info("调用外部程序渲染验证码……") 116 | var args []string 117 | switch runtime.GOOS { 118 | case "linux": 119 | args = []string{"xdg-open", filename} 120 | case "darwin": 121 | args = []string{"open", filename} 122 | case "freebsd": 123 | args = []string{"open", filename} 124 | case "netbsd": 125 | args = []string{"open", filename} 126 | case "windows": 127 | var ( 128 | cmd = "url.dll,FileProtocolHandler" 129 | runDll32 = filepath.Join(os.Getenv("SYSTEMROOT"), "System32", "rundll32.exe") 130 | ) 131 | args = []string{runDll32, cmd, filename} 132 | default: 133 | fmt.Printf("无法确定操作系统,请自行打开验证码 %s 文件,并输入验证码。", 
filename) 134 | } 135 | 136 | logger.Info("Command: %s", strings.Join(args, " ")) 137 | 138 | err := exec.Command(args[0], args[1:]...).Run() 139 | if err != nil { 140 | return err 141 | } 142 | 143 | return nil 144 | } 145 | 146 | func readCaptchaInput() string { 147 | var captcha string 148 | fmt.Print(color.CyanString("请输入验证码:")) 149 | fmt.Scanf("%s", &captcha) 150 | return captcha 151 | } 152 | 153 | func makeZhihuLink(path string) string { 154 | return urlJoin(baseZhihuURL, path) 155 | } 156 | 157 | func urlJoin(base, path string) string { 158 | if strings.HasSuffix(base, "/") { 159 | base = strings.TrimRight(base, "/") 160 | } 161 | if strings.HasPrefix(path, "/") { 162 | path = strings.TrimLeft(path, "/") 163 | } 164 | return base + "/" + path 165 | } 166 | 167 | // newDocumentFromUrl 会请求给定的 url,并返回一个 goquery.Document 对象用于解析 168 | func newDocumentFromURL(url string) (*goquery.Document, error) { 169 | resp, err := gSession.Get(url) 170 | if err != nil { 171 | logger.Error("请求 %s 失败:%s", url, err.Error()) 172 | return nil, err 173 | } 174 | 175 | doc, err := goquery.NewDocumentFromResponse(resp) 176 | if err != nil { 177 | logger.Error("解析页面失败:%s", err.Error()) 178 | } 179 | 180 | return doc, err 181 | } 182 | 183 | // ZhihuPage 是一个知乎页面,User, Question, Answer, Collection 的公共部分 184 | type Page struct { 185 | // Link 是该页面的链接 186 | Link string 187 | 188 | // doc 是 HTML document 189 | doc *goquery.Document 190 | 191 | // fields 是字段缓存,避免重复解析页面 192 | fields map[string]interface{} 193 | } 194 | 195 | // newZhihuPage 是 private 的构造器 196 | func newZhihuPage(link string) *Page { 197 | return &Page{ 198 | Link: link, 199 | fields: make(map[string]interface{}), 200 | } 201 | } 202 | 203 | // Doc 用于获取当前问题页面的 HTML document,惰性求值 204 | func (page *Page) Doc() *goquery.Document { 205 | if page.doc != nil { 206 | return page.doc 207 | } 208 | 209 | err := page.Refresh() 210 | if err != nil { 211 | return nil 212 | } 213 | 214 | return page.doc 215 | } 216 | 217 | // Refresh 
会重新载入当前页面,获取最新的数据 218 | func (page *Page) Refresh() (err error) { 219 | page.fields = make(map[string]interface{}) // 清空缓存 220 | page.doc, err = newDocumentFromURL(page.Link) // 重载页面 221 | return err 222 | } 223 | 224 | // GetXsrf 从当前页面内容抓取 xsrf 的值 225 | func (page *Page) GetXSRF() string { 226 | doc := page.Doc() 227 | value, _ := doc.Find(`input[name="_xsrf"]`).Attr("value") 228 | return value 229 | } 230 | 231 | // totalPages 获取总页数 232 | func (page *Page) totalPages() int { 233 | return getTotalPages(page.Doc()) 234 | } 235 | 236 | func (page *Page) setField(field string, value interface{}) { 237 | page.fields[field] = value 238 | } 239 | 240 | func (page *Page) getIntField(field string) (value int, exists bool) { 241 | if got, ok := page.fields[field]; ok { 242 | return got.(int), true 243 | } 244 | return 0, false 245 | } 246 | 247 | func (page *Page) getStringField(field string) (value string, exists bool) { 248 | if got, ok := page.fields[field]; ok { 249 | return got.(string), true 250 | } 251 | return "", false 252 | } 253 | 254 | func getTotalPages(doc *goquery.Document) int { 255 | pager := doc.Find("div.zm-invite-pager") 256 | if pager.Size() == 0 { 257 | return 1 258 | } 259 | text := pager.Find("span").Eq(-2).Text() 260 | pages, _ := strconv.Atoi(text) 261 | return pages 262 | } 263 | 264 | // nodeListResult 是形如 /node/XXListV2 这样的 Ajax 请求的 JSON 返回值 265 | type nodeListResult struct { 266 | R int `json:"r"` // 状态码,正确的情况为 0 267 | Msg []string `json:"msg"` // 回答内容,每个元素都是一段 HTML 片段 268 | } 269 | 270 | // normalAjaxResult 是页面内,目标 URL 和当前页面 URL 相同的 Ajax 请求返回的 JSON 数据 271 | type normalAjaxResult struct { 272 | R int `json:"r"` 273 | Msg []interface{} `json:"msg"` // 两个元素,第一个为话题数量,第二个是 HTML 片段 274 | } 275 | -------------------------------------------------------------------------------- /util_test.go: -------------------------------------------------------------------------------- 1 | package zhihu 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func 
Test_validQuestionURL(t *testing.T) { 8 | ioMap := map[string]bool{ 9 | "https://www.zhihu.com/question/37284137": true, 10 | "http://www.zhihu.com/question/41114729": true, 11 | "https://www.zhihu.com/question/41114729x": false, 12 | "https://www.zhihu.com/question/4111472": false, 13 | "https://www.zhihu.com/": false, 14 | } 15 | 16 | for value, expectedResult := range ioMap { 17 | if validQuestionURL(value) != expectedResult { 18 | t.Error("validQuestionURL returns error result") 19 | } 20 | } 21 | } 22 | --------------------------------------------------------------------------------