├── .gitignore ├── LICENSE ├── README.md ├── http.go ├── main_test.go ├── read.go └── regex.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | *.prof 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # readability 2 | readability for golang 3 | 4 | Golang版本是根据[readabiliity for node.js](https://github.com/luin/readability)以及[readability for python](https://github.com/kingwkb/readability)所改写,并加入了些自己的,比如支持gzip等。 5 | 6 | #### 引用的第三方包 7 | > github.com/PuerkitoBio/goquery 8 | > github.com/axgle/mahonia 9 | 10 | #### 使用方法 11 | 12 | ```Go 13 | 14 | package main 15 | 16 | import ( 17 | "fmt" 18 | 19 | "github.com/ying32/readability" 20 | ) 21 | 22 | func main() { 23 | test, err := readability.NewReadability("http://wd.leiting.com/home/news/news_detail.php?id=599") 24 | if err != nil { 25 | fmt.Println("failed.", err) 26 | return 27 | } 28 | test.Parse() 29 | fmt.Println(test.Title) 30 | fmt.Println(test.Content) 31 | } 32 | 33 | ``` -------------------------------------------------------------------------------- /http.go: -------------------------------------------------------------------------------- 1 | // 改自 https://github.com/kingwkb/readability python版本 2 | // 于2016-11-10 3 | // by: ying32 4 | package readability 5 | 6 | import ( 7 | "compress/flate" 8 | "compress/gzip" 9 | 10 | "io/ioutil" 11 | "net/http" 12 | "strings" 13 | 14 | "github.com/axgle/mahonia" 15 | ) 16 | 17 | func httpGet(url string) (string, error) { 18 | req, err := http.NewRequest(http.MethodGet, url, nil) 19 | if err != nil { 20 | return "", err 21 | } 22 | req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36") 23 | req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") 24 | req.Header.Set("Accept-Encoding", "gzip, deflate") 25 | req.Header.Set("Accept-Language", "zh-CN,zh;q=0.8") 26 | req.Header.Set("Connection", "keep-alive") 27 | req.Header.Set("Cache-Control", "max-age=0") 28 | 29 | client := &http.Client{} 30 | resp, err := client.Do(req) 31 | if err != nil { 32 | return "", err 33 | } 34 | defer resp.Body.Close() 35 | contentEncoding := strings.Trim(strings.ToLower(resp.Header.Get("Content-Encoding")), " ") 36 | var bytes []byte 37 | if contentEncoding == "gzip" { 38 | x, err := gzip.NewReader(resp.Body) 39 | if err != nil { 40 | return "", err 41 | } 42 | bytes, err = ioutil.ReadAll(x) 43 | if err != nil { 44 | return "", err 45 | } 46 | } else if contentEncoding == "deflate" { 47 | x := flate.NewReader(resp.Body) 48 | bytes, err = ioutil.ReadAll(x) 49 | if err != nil { 50 | return "", err 51 | } 52 | } else { 53 | bytes, err = ioutil.ReadAll(resp.Body) 54 | if err != nil { 55 | return "", err 56 | } 57 | } 58 | 59 | srcStr := string(bytes) 60 | pageCodes := pageCodeReg.FindStringSubmatch(srcStr) 61 | if len(pageCodes) >= 2 { 62 | curCode := strings.ToLower(pageCodes[1]) 63 | if curCode == "gb2312" || curCode == "gbk" { 64 | decoder := mahonia.NewDecoder("gbk") 65 | srcStr = decoder.ConvertString(srcStr) 66 | } 67 | } 68 | return srcStr, nil 69 | } 70 | -------------------------------------------------------------------------------- /main_test.go: -------------------------------------------------------------------------------- 1 | package readability 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestAll(t *testing.T) { 9 | test, err := NewReadability("http://wd.leiting.com/home/news/news_detail.php?id=599") 10 | if err != nil { 11 | fmt.Println("failed.", err) 12 | return 13 | } 14 | test.Parse() 15 | fmt.Println(test.Title) 16 | fmt.Println(test.Content) 17 | } 18 | -------------------------------------------------------------------------------- /read.go: -------------------------------------------------------------------------------- 1 | // 改自 https://github.com/kingwkb/readability python版本 2 | // 于2016-11-10 3 | // by: ying32 4 | package readability 5 | 6 | import ( 7 | "fmt" 8 | 9 | "crypto/md5" 10 | "errors" 11 | "math" 12 | nurl "net/url" 13 | "strings" 14 | "unicode/utf8" 15 | 16 | ghtml "html" 17 | 18 | "golang.org/x/net/html" 19 | 20 | "github.com/PuerkitoBio/goquery" 21 | ) 22 | 23 | type TCandidateItem struct { 24 | score float64 25 | node *goquery.Selection 26 | } 27 | 28 | type TReadability struct { 29 | html string 30 | url *nurl.URL 31 | htmlDoc *goquery.Document 32 | candidates map[string]TCandidateItem 33 | 34 | Title string 35 | Content string 36 | } 37 | 38 | func HashStr(node *goquery.Selection) string { 39 | if node == nil { 40 | return "" 41 | } 42 | html, _ := node.Html() 43 | return fmt.Sprintf("%x", md5.Sum([]byte(html))) 44 | } 45 | 46 | func strLen(str string) int { 47 | return utf8.RuneCountInString(str) 48 | } 49 | 50 | func NewReadability(url string) (*TReadability, error) { 51 | 52 | v := &TReadability{} 53 | var err error 54 | v.html, err = httpGet(url) 55 | if err != nil { 56 | return nil, err 57 | } 58 | v.url, _ = nurl.Parse(url) 59 | v.candidates = make(map[string]TCandidateItem, 0) 60 | 61 | v.html = replaceBrs.ReplaceAllString(v.html, "
")
62 | //v.html = replaceFonts.ReplaceAllString(v.html, `<\g<1>span>`)
63 |
64 | if v.html == "" {
65 | return nil, errors.New("html为空!")
66 | }
67 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(v.html))
68 | if err != nil {
69 | return nil, err
70 | }
71 | v.htmlDoc = doc
72 | return v, nil
73 | }
74 |
75 | func (self *TReadability) removeScript() {
76 | self.htmlDoc.Find("script").Remove()
77 | }
78 |
79 | func (self *TReadability) removeStyle() {
80 | self.htmlDoc.Find("style").Remove()
81 | }
82 |
83 | func (self *TReadability) removeLink() {
84 | self.htmlDoc.Find("link").Remove()
85 | }
86 |
87 | func (self *TReadability) getTitle() string {
88 | return self.htmlDoc.Find("title").Text()
89 | }
90 |
91 | func (self *TReadability) getLinkDensity(node *goquery.Selection) float64 {
92 | if node == nil {
93 | return 0
94 | }
95 | textLength := float64(strLen(node.Text()))
96 | if textLength == 0 {
97 | return 0
98 | }
99 | linkLength := 0.0
100 | node.Find("a").Each(
101 | func(i int, link *goquery.Selection) {
102 | linkLength += float64(strLen(link.Text()))
103 | })
104 | return linkLength / textLength
105 | }
106 |
107 | func (self *TReadability) fixImagesPath(node *goquery.Selection) {
108 | if node == nil {
109 | return
110 | }
111 | node.Find("img").Each(
112 |
113 | func(i int, img *goquery.Selection) {
114 | src, _ := img.Attr("src")
115 | // dz论坛的有些img属性使用的是file字段
116 | if f, ok := img.Attr("file"); ok {
117 | src = f
118 | img.SetAttr("src", f)
119 | img.RemoveAttr("file")
120 | }
121 | if src == "" {
122 | img.Remove()
123 | return
124 | }
125 | if src != "" {
126 | if !strings.HasPrefix(src, "http://") && !strings.HasPrefix(src, "https://") {
127 | var newSrc string
128 | if strings.HasPrefix(src, "/") {
129 | newSrc = self.url.Scheme + "://" + self.url.Host + src
130 | } else {
131 | newSrc = self.url.Scheme + "://" + self.url.Host + self.url.Path + src
132 | }
133 | img.SetAttr("src", newSrc)
134 | }
135 | }
136 | })
137 | }
138 |
139 | func (self *TReadability) getClassWeight(node *goquery.Selection) float64 {
140 | weight := 0.0
141 | if str, b := node.Attr("class"); b {
142 | if negative.MatchString(str) {
143 | weight -= 25
144 | }
145 | if positive.MatchString(str) {
146 | weight += 25
147 | }
148 | }
149 | if str, b := node.Attr("id"); b {
150 | if negative.MatchString(str) {
151 | weight -= 25
152 | }
153 | if positive.MatchString(str) {
154 | weight += 25
155 | }
156 | }
157 | return weight
158 | }
159 |
160 | func (self *TReadability) initializeNode(node *goquery.Selection) TCandidateItem {
161 | contentScore := 0.0
162 | switch self.getTagName(node) {
163 | case "article":
164 | contentScore += 10
165 | case "section":
166 | contentScore += 8
167 | case "div":
168 | contentScore += 5
169 | case "pre", "blockquote", "td":
170 | contentScore += 3
171 | case "form", "ol", "dl", "dd", "dt", "li", "address":
172 | contentScore -= 3
173 | case "th", "h1", "h2", "h3", "h4", "h5", "h6":
174 | contentScore -= 5
175 | }
176 | contentScore += self.getClassWeight(node)
177 | return TCandidateItem{contentScore, node}
178 | }
179 |
180 | func (self *TReadability) cleanConditionally(e *goquery.Selection, tag string) {
181 | if e == nil {
182 | return
183 | }
184 | contentScore := 0.0
185 | e.Find(tag).Each(func(i int, node *goquery.Selection) {
186 | weight := self.getClassWeight(node)
187 | hashNode := HashStr(node)
188 | if v, ok := self.candidates[hashNode]; ok {
189 | contentScore = v.score
190 | } else {
191 | contentScore = 0
192 | }
193 |
194 | if weight+contentScore < 0 {
195 | node.Remove()
196 | } else {
197 | p := node.Find("p").Length()
198 | img := node.Find("img").Length()
199 | li := node.Find("li").Length() - 100
200 | input_html := node.Find("input_html").Length()
201 | embedCount := 0
202 | node.Find("embed").Each(func(i int, embed *goquery.Selection) {
203 | if !videos.MatchString(embed.AttrOr("src", "")) {
204 | embedCount += 1
205 | }
206 | })
207 | linkDensity := self.getLinkDensity(node)
208 | contentLength := strLen(node.Text())
209 | toRemove := false
210 | if img > p && img > 1 {
211 | toRemove = true
212 | } else if li > p && tag != "ul" && tag != "ol" {
213 | toRemove = true
214 | } else if input_html > int(math.Floor(float64(p/3))) {
215 | toRemove = true
216 | } else if contentLength < 25 && (img == 0 || img > 2) {
217 | toRemove = true
218 | } else if weight < 25 && linkDensity > 0.2 {
219 | toRemove = true
220 | } else if weight >= 25 && linkDensity > 0.5 {
221 | toRemove = true
222 | } else if (embedCount == 1 && contentLength < 35) || embedCount > 1 {
223 | toRemove = true
224 | }
225 | if toRemove {
226 | node.Remove()
227 | }
228 | }
229 | })
230 | }
231 |
232 | func (self *TReadability) cleanStyle(e *goquery.Selection) {
233 | if e == nil {
234 | return
235 | }
236 | e.Find("*").Each(func(i int, elem *goquery.Selection) {
237 | elem.RemoveAttr("class")
238 | elem.RemoveAttr("id")
239 | elem.RemoveAttr("style")
240 | elem.RemoveAttr("width")
241 | elem.RemoveAttr("height")
242 | elem.RemoveAttr("onclick")
243 | elem.RemoveAttr("onmouseover")
244 | elem.RemoveAttr("border")
245 | })
246 | }
247 |
248 | func (self *TReadability) clean(e *goquery.Selection, tag string) {
249 | if e == nil {
250 | return
251 | }
252 | isEmbed := false
253 | if tag == "object" || tag == "embed" {
254 | isEmbed = true
255 | }
256 | e.Find(tag).Each(func(i int, target *goquery.Selection) {
257 | attributeValues := ""
258 | //for _, attribute := range target.Nodes[0].Attr {
259 | // get_attr := target.
260 | // }
261 | if isEmbed && videos.MatchString(attributeValues) {
262 | return
263 | }
264 | if isEmbed && videos.MatchString(target.Text()) {
265 | return
266 | }
267 | target.Remove()
268 | })
269 | }
270 |
271 | func (self *TReadability) cleanArticle(content *goquery.Selection) string {
272 | if content == nil {
273 | return ""
274 | }
275 | self.cleanStyle(content)
276 | self.clean(content, "h1")
277 | self.clean(content, "object")
278 | self.cleanConditionally(content, "form")
279 | if content.Find("h2").Length() == 1 {
280 | self.clean(content, "h2")
281 | }
282 | if content.Find("h3").Length() == 1 {
283 | self.clean(content, "h3")
284 | }
285 | self.clean(content, "iframe")
286 | self.cleanConditionally(content, "table")
287 | self.cleanConditionally(content, "ul")
288 | self.cleanConditionally(content, "div")
289 | self.fixImagesPath(content)
290 |
291 | html, err := content.Html()
292 | if err != nil {
293 | return ""
294 | }
295 | html = ghtml.UnescapeString(html)
296 | return killBreaks.ReplaceAllString(html, "
")
297 | }
298 |
299 | func (self *TReadability) getTagName(node *goquery.Selection) string {
300 | if node == nil {
301 | return ""
302 | }
303 | return node.Nodes[0].Data
304 | }
305 |
306 | func (self *TReadability) isComment(node *goquery.Selection) bool {
307 | if node == nil {
308 | return false
309 | }
310 | return node.Nodes[0].Type == html.CommentNode
311 | }
312 |
313 | func (self *TReadability) grabArticle() string {
314 |
315 | self.htmlDoc.Find("*").Each(func(i int, elem *goquery.Selection) {
316 |
317 | if self.isComment(elem) {
318 | elem.Remove()
319 | return
320 | }
321 | unlikelyMatchString := elem.AttrOr("id", "") + " " + elem.AttrOr("class", "")
322 |
323 | if unlikelyCandidates.MatchString(unlikelyMatchString) &&
324 | !okMaybeItsACandidate.MatchString(unlikelyMatchString) &&
325 | self.getTagName(elem) != "body" {
326 | elem.Remove()
327 | return
328 | }
329 | if unlikelyElements.MatchString(self.getTagName(elem)) {
330 | elem.Remove()
331 | return
332 | }
333 | if self.getTagName(elem) == "div" {
334 | s, _ := elem.Html()
335 | if !divToPElements.MatchString(s) {
336 | elem.Nodes[0].Data = "p"
337 | }
338 | }
339 | })
340 |
341 | self.htmlDoc.Find("p").Each(func(i int, node *goquery.Selection) {
342 | parentNode := node.Parent()
343 | grandParentNode := parentNode.Parent()
344 | innerText := node.Text()
345 |
346 | if parentNode == nil || strLen(innerText) < 20 {
347 | return
348 | }
349 | parentHash := HashStr(parentNode)
350 | grandParentHash := HashStr(grandParentNode)
351 | if _, ok := self.candidates[parentHash]; !ok {
352 | self.candidates[parentHash] = self.initializeNode(parentNode)
353 | }
354 | if _, ok := self.candidates[grandParentHash]; !ok {
355 | self.candidates[grandParentHash] = self.initializeNode(grandParentNode)
356 | }
357 | contentScore := 1.0
358 | contentScore += float64(strings.Count(innerText, ","))
359 | contentScore += float64(strings.Count(innerText, ","))
360 | contentScore += math.Min(math.Floor(float64(strLen(innerText)/100)), 3)
361 |
362 | v, _ := self.candidates[parentHash]
363 | v.score += contentScore
364 | self.candidates[parentHash] = v
365 |
366 | if grandParentNode != nil {
367 | v, _ = self.candidates[grandParentHash]
368 | v.score += contentScore / 2.0
369 | self.candidates[grandParentHash] = v
370 | }
371 | })
372 |
373 | var topCandidate *TCandidateItem
374 | for k, v := range self.candidates {
375 | v.score = v.score * (1 - self.getLinkDensity(v.node))
376 | self.candidates[k] = v
377 |
378 | // fmt.Println(v.score)
379 | // fmt.Println(v.node.Text())
380 | // fmt.Println("---------------------------------------------------------------------------------------------------------")
381 | if topCandidate == nil || v.score > topCandidate.score {
382 | if topCandidate == nil {
383 | topCandidate = new(TCandidateItem)
384 | }
385 | topCandidate.score = v.score
386 | topCandidate.node = v.node
387 | }
388 | }
389 | if topCandidate != nil {
390 | // fmt.Println("topCandidate.score=", topCandidate.score)
391 | return self.cleanArticle(topCandidate.node)
392 | }
393 | return ""
394 | }
395 |
396 | func (self *TReadability) Parse() {
397 | self.removeScript()
398 | self.removeStyle()
399 | self.removeLink()
400 | self.Title = self.getTitle()
401 | self.Content = self.grabArticle()
402 | }
403 |
--------------------------------------------------------------------------------
/regex.go:
--------------------------------------------------------------------------------
1 | // 改自 https://github.com/kingwkb/readability python版本
2 | // 于2016-11-10
3 | // by: ying32
4 | package readability
5 |
6 | import (
7 | "regexp"
8 | )
9 |
10 | var (
11 | unlikelyCandidates, _ = regexp.Compile(`(?is)combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter|location`)
12 | okMaybeItsACandidate, _ = regexp.Compile(`(?is)and|article|body|column|main|shadow`)
13 | positive, _ = regexp.Compile(`(?is)article|body|content|entry|hentry|main|page|pagination|post|text|blog|story`)
14 | negative, _ = regexp.Compile(`(?is)combx|comment|com|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget`)
15 | extraneous, _ = regexp.Compile(`(?is)print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single`)
16 | divToPElements, _ = regexp.Compile(`(?is)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
17 | replaceBrs, _ = regexp.Compile(`(?is)(
]*>[ \n\r\t]*){2,}`)
18 | replaceFonts, _ = regexp.Compile(`(?is)<(/?)font[^>]*>`)
19 | trim, _ = regexp.Compile(`(?is)^\s+|\s+$`)
20 | normalize, _ = regexp.Compile(`(?is)\s{2,}`)
21 | killBreaks, _ = regexp.Compile(`(?is)(
(\s| ?)*)+`)
22 | videos, _ = regexp.Compile(`(?is)http://(www\.)?(youtube|vimeo)\.com`)
23 | skipFootnoteLink, _ = regexp.Compile(`(?is)^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$"`)
24 | nextLink, _ = regexp.Compile(`(?is)(next|weiter|continue|>([^\|]|$)|»([^\|]|$))`)
25 | prevLink, _ = regexp.Compile(`(?is)(prev|earl|old|new|<|«)`)
26 |
27 | unlikelyElements, _ = regexp.Compile(`(?is)(input|time|button)`)
28 |
29 | pageCodeReg, _ = regexp.Compile(`(?is)