├── .gitignore ├── LICENSE ├── README.md ├── http.go ├── main_test.go ├── read.go └── regex.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | *.prof 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # readability 2 | readability for golang 3 | 4 | Golang版本是根据[readabiliity for node.js](https://github.com/luin/readability)以及[readability for python](https://github.com/kingwkb/readability)所改写,并加入了些自己的,比如支持gzip等。 5 | 6 | #### 引用的第三方包 7 | > github.com/PuerkitoBio/goquery 8 | > github.com/axgle/mahonia 9 | 10 | #### 使用方法 11 | 12 | ```Go 13 | 14 | package main 15 | 16 | import ( 17 | "fmt" 18 | 19 | "github.com/ying32/readability" 20 | ) 21 | 22 | func main() { 23 | test, err := readability.NewReadability("http://wd.leiting.com/home/news/news_detail.php?id=599") 24 | if err != nil { 25 | fmt.Println("failed.", err) 26 | return 27 | } 28 | test.Parse() 29 | fmt.Println(test.Title) 30 | fmt.Println(test.Content) 31 | } 32 | 33 | ``` -------------------------------------------------------------------------------- /http.go: -------------------------------------------------------------------------------- 1 | // 改自 https://github.com/kingwkb/readability python版本 2 | // 于2016-11-10 3 | // by: ying32 4 | package readability 5 | 6 | import ( 7 | "compress/flate" 8 | "compress/gzip" 9 | 10 | "io/ioutil" 11 | "net/http" 12 | "strings" 13 | 14 | "github.com/axgle/mahonia" 15 | ) 16 | 17 | func httpGet(url string) (string, error) { 18 | req, err := http.NewRequest(http.MethodGet, url, nil) 19 | if err != nil { 20 | return "", err 21 | } 22 | req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36") 23 | req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") 24 | req.Header.Set("Accept-Encoding", "gzip, deflate") 25 | req.Header.Set("Accept-Language", "zh-CN,zh;q=0.8") 26 | req.Header.Set("Connection", "keep-alive") 27 | req.Header.Set("Cache-Control", "max-age=0") 28 | 29 | client := &http.Client{} 30 | resp, err := client.Do(req) 31 | if err != nil { 32 | return "", err 33 | } 34 | defer resp.Body.Close() 35 | contentEncoding := strings.Trim(strings.ToLower(resp.Header.Get("Content-Encoding")), " ") 36 | var bytes []byte 37 | if contentEncoding == "gzip" { 38 | x, err := gzip.NewReader(resp.Body) 39 | if err != nil { 40 | return "", err 41 | } 42 | bytes, err = ioutil.ReadAll(x) 43 | if err != nil { 44 | return "", err 45 | } 46 | } else if contentEncoding == "deflate" { 47 | x := flate.NewReader(resp.Body) 48 | bytes, err = ioutil.ReadAll(x) 49 | if err != nil { 50 | return "", err 51 | } 52 | } else { 53 | bytes, err = ioutil.ReadAll(resp.Body) 54 | if err != nil { 55 | return "", err 56 | } 57 | } 58 | 59 | srcStr := string(bytes) 60 | pageCodes := pageCodeReg.FindStringSubmatch(srcStr) 61 | if len(pageCodes) >= 2 { 62 | curCode := strings.ToLower(pageCodes[1]) 63 | if curCode == "gb2312" || curCode == "gbk" { 64 | decoder := mahonia.NewDecoder("gbk") 65 | srcStr = decoder.ConvertString(srcStr) 66 | } 67 | } 68 | return srcStr, nil 69 | } 70 | -------------------------------------------------------------------------------- /main_test.go: -------------------------------------------------------------------------------- 1 | package readability 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestAll(t *testing.T) { 9 | test, err := NewReadability("http://wd.leiting.com/home/news/news_detail.php?id=599") 10 | if err != nil { 11 | fmt.Println("failed.", err) 12 | return 13 | } 14 | test.Parse() 15 | fmt.Println(test.Title) 16 | fmt.Println(test.Content) 17 | } 18 | -------------------------------------------------------------------------------- /read.go: -------------------------------------------------------------------------------- 1 | // 改自 https://github.com/kingwkb/readability python版本 2 | // 于2016-11-10 3 | // by: ying32 4 | package readability 5 | 6 | import ( 7 | "fmt" 8 | 9 | "crypto/md5" 10 | "errors" 11 | "math" 12 | nurl "net/url" 13 | "strings" 14 | "unicode/utf8" 15 | 16 | ghtml "html" 17 | 18 | "golang.org/x/net/html" 19 | 20 | "github.com/PuerkitoBio/goquery" 21 | ) 22 | 23 | type TCandidateItem struct { 24 | score float64 25 | node *goquery.Selection 26 | } 27 | 28 | type TReadability struct { 29 | html string 30 | url *nurl.URL 31 | htmlDoc *goquery.Document 32 | candidates map[string]TCandidateItem 33 | 34 | Title string 35 | Content string 36 | } 37 | 38 | func HashStr(node *goquery.Selection) string { 39 | if node == nil { 40 | return "" 41 | } 42 | html, _ := node.Html() 43 | return fmt.Sprintf("%x", md5.Sum([]byte(html))) 44 | } 45 | 46 | func strLen(str string) int { 47 | return utf8.RuneCountInString(str) 48 | } 49 | 50 | func NewReadability(url string) (*TReadability, error) { 51 | 52 | v := &TReadability{} 53 | var err error 54 | v.html, err = httpGet(url) 55 | if err != nil { 56 | return nil, err 57 | } 58 | v.url, _ = nurl.Parse(url) 59 | v.candidates = make(map[string]TCandidateItem, 0) 60 | 61 | v.html = replaceBrs.ReplaceAllString(v.html, "

") 62 | //v.html = replaceFonts.ReplaceAllString(v.html, `<\g<1>span>`) 63 | 64 | if v.html == "" { 65 | return nil, errors.New("html为空!") 66 | } 67 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(v.html)) 68 | if err != nil { 69 | return nil, err 70 | } 71 | v.htmlDoc = doc 72 | return v, nil 73 | } 74 | 75 | func (self *TReadability) removeScript() { 76 | self.htmlDoc.Find("script").Remove() 77 | } 78 | 79 | func (self *TReadability) removeStyle() { 80 | self.htmlDoc.Find("style").Remove() 81 | } 82 | 83 | func (self *TReadability) removeLink() { 84 | self.htmlDoc.Find("link").Remove() 85 | } 86 | 87 | func (self *TReadability) getTitle() string { 88 | return self.htmlDoc.Find("title").Text() 89 | } 90 | 91 | func (self *TReadability) getLinkDensity(node *goquery.Selection) float64 { 92 | if node == nil { 93 | return 0 94 | } 95 | textLength := float64(strLen(node.Text())) 96 | if textLength == 0 { 97 | return 0 98 | } 99 | linkLength := 0.0 100 | node.Find("a").Each( 101 | func(i int, link *goquery.Selection) { 102 | linkLength += float64(strLen(link.Text())) 103 | }) 104 | return linkLength / textLength 105 | } 106 | 107 | func (self *TReadability) fixImagesPath(node *goquery.Selection) { 108 | if node == nil { 109 | return 110 | } 111 | node.Find("img").Each( 112 | 113 | func(i int, img *goquery.Selection) { 114 | src, _ := img.Attr("src") 115 | // dz论坛的有些img属性使用的是file字段 116 | if f, ok := img.Attr("file"); ok { 117 | src = f 118 | img.SetAttr("src", f) 119 | img.RemoveAttr("file") 120 | } 121 | if src == "" { 122 | img.Remove() 123 | return 124 | } 125 | if src != "" { 126 | if !strings.HasPrefix(src, "http://") && !strings.HasPrefix(src, "https://") { 127 | var newSrc string 128 | if strings.HasPrefix(src, "/") { 129 | newSrc = self.url.Scheme + "://" + self.url.Host + src 130 | } else { 131 | newSrc = self.url.Scheme + "://" + self.url.Host + self.url.Path + src 132 | } 133 | img.SetAttr("src", newSrc) 134 | } 135 | } 136 | }) 137 | } 138 | 139 | func (self *TReadability) getClassWeight(node *goquery.Selection) float64 { 140 | weight := 0.0 141 | if str, b := node.Attr("class"); b { 142 | if negative.MatchString(str) { 143 | weight -= 25 144 | } 145 | if positive.MatchString(str) { 146 | weight += 25 147 | } 148 | } 149 | if str, b := node.Attr("id"); b { 150 | if negative.MatchString(str) { 151 | weight -= 25 152 | } 153 | if positive.MatchString(str) { 154 | weight += 25 155 | } 156 | } 157 | return weight 158 | } 159 | 160 | func (self *TReadability) initializeNode(node *goquery.Selection) TCandidateItem { 161 | contentScore := 0.0 162 | switch self.getTagName(node) { 163 | case "article": 164 | contentScore += 10 165 | case "section": 166 | contentScore += 8 167 | case "div": 168 | contentScore += 5 169 | case "pre", "blockquote", "td": 170 | contentScore += 3 171 | case "form", "ol", "dl", "dd", "dt", "li", "address": 172 | contentScore -= 3 173 | case "th", "h1", "h2", "h3", "h4", "h5", "h6": 174 | contentScore -= 5 175 | } 176 | contentScore += self.getClassWeight(node) 177 | return TCandidateItem{contentScore, node} 178 | } 179 | 180 | func (self *TReadability) cleanConditionally(e *goquery.Selection, tag string) { 181 | if e == nil { 182 | return 183 | } 184 | contentScore := 0.0 185 | e.Find(tag).Each(func(i int, node *goquery.Selection) { 186 | weight := self.getClassWeight(node) 187 | hashNode := HashStr(node) 188 | if v, ok := self.candidates[hashNode]; ok { 189 | contentScore = v.score 190 | } else { 191 | contentScore = 0 192 | } 193 | 194 | if weight+contentScore < 0 { 195 | node.Remove() 196 | } else { 197 | p := node.Find("p").Length() 198 | img := node.Find("img").Length() 199 | li := node.Find("li").Length() - 100 200 | input_html := node.Find("input_html").Length() 201 | embedCount := 0 202 | node.Find("embed").Each(func(i int, embed *goquery.Selection) { 203 | if !videos.MatchString(embed.AttrOr("src", "")) { 204 | embedCount += 1 205 | } 206 | }) 207 | linkDensity := self.getLinkDensity(node) 208 | contentLength := strLen(node.Text()) 209 | toRemove := false 210 | if img > p && img > 1 { 211 | toRemove = true 212 | } else if li > p && tag != "ul" && tag != "ol" { 213 | toRemove = true 214 | } else if input_html > int(math.Floor(float64(p/3))) { 215 | toRemove = true 216 | } else if contentLength < 25 && (img == 0 || img > 2) { 217 | toRemove = true 218 | } else if weight < 25 && linkDensity > 0.2 { 219 | toRemove = true 220 | } else if weight >= 25 && linkDensity > 0.5 { 221 | toRemove = true 222 | } else if (embedCount == 1 && contentLength < 35) || embedCount > 1 { 223 | toRemove = true 224 | } 225 | if toRemove { 226 | node.Remove() 227 | } 228 | } 229 | }) 230 | } 231 | 232 | func (self *TReadability) cleanStyle(e *goquery.Selection) { 233 | if e == nil { 234 | return 235 | } 236 | e.Find("*").Each(func(i int, elem *goquery.Selection) { 237 | elem.RemoveAttr("class") 238 | elem.RemoveAttr("id") 239 | elem.RemoveAttr("style") 240 | elem.RemoveAttr("width") 241 | elem.RemoveAttr("height") 242 | elem.RemoveAttr("onclick") 243 | elem.RemoveAttr("onmouseover") 244 | elem.RemoveAttr("border") 245 | }) 246 | } 247 | 248 | func (self *TReadability) clean(e *goquery.Selection, tag string) { 249 | if e == nil { 250 | return 251 | } 252 | isEmbed := false 253 | if tag == "object" || tag == "embed" { 254 | isEmbed = true 255 | } 256 | e.Find(tag).Each(func(i int, target *goquery.Selection) { 257 | attributeValues := "" 258 | //for _, attribute := range target.Nodes[0].Attr { 259 | // get_attr := target. 260 | // } 261 | if isEmbed && videos.MatchString(attributeValues) { 262 | return 263 | } 264 | if isEmbed && videos.MatchString(target.Text()) { 265 | return 266 | } 267 | target.Remove() 268 | }) 269 | } 270 | 271 | func (self *TReadability) cleanArticle(content *goquery.Selection) string { 272 | if content == nil { 273 | return "" 274 | } 275 | self.cleanStyle(content) 276 | self.clean(content, "h1") 277 | self.clean(content, "object") 278 | self.cleanConditionally(content, "form") 279 | if content.Find("h2").Length() == 1 { 280 | self.clean(content, "h2") 281 | } 282 | if content.Find("h3").Length() == 1 { 283 | self.clean(content, "h3") 284 | } 285 | self.clean(content, "iframe") 286 | self.cleanConditionally(content, "table") 287 | self.cleanConditionally(content, "ul") 288 | self.cleanConditionally(content, "div") 289 | self.fixImagesPath(content) 290 | 291 | html, err := content.Html() 292 | if err != nil { 293 | return "" 294 | } 295 | html = ghtml.UnescapeString(html) 296 | return killBreaks.ReplaceAllString(html, "
") 297 | } 298 | 299 | func (self *TReadability) getTagName(node *goquery.Selection) string { 300 | if node == nil { 301 | return "" 302 | } 303 | return node.Nodes[0].Data 304 | } 305 | 306 | func (self *TReadability) isComment(node *goquery.Selection) bool { 307 | if node == nil { 308 | return false 309 | } 310 | return node.Nodes[0].Type == html.CommentNode 311 | } 312 | 313 | func (self *TReadability) grabArticle() string { 314 | 315 | self.htmlDoc.Find("*").Each(func(i int, elem *goquery.Selection) { 316 | 317 | if self.isComment(elem) { 318 | elem.Remove() 319 | return 320 | } 321 | unlikelyMatchString := elem.AttrOr("id", "") + " " + elem.AttrOr("class", "") 322 | 323 | if unlikelyCandidates.MatchString(unlikelyMatchString) && 324 | !okMaybeItsACandidate.MatchString(unlikelyMatchString) && 325 | self.getTagName(elem) != "body" { 326 | elem.Remove() 327 | return 328 | } 329 | if unlikelyElements.MatchString(self.getTagName(elem)) { 330 | elem.Remove() 331 | return 332 | } 333 | if self.getTagName(elem) == "div" { 334 | s, _ := elem.Html() 335 | if !divToPElements.MatchString(s) { 336 | elem.Nodes[0].Data = "p" 337 | } 338 | } 339 | }) 340 | 341 | self.htmlDoc.Find("p").Each(func(i int, node *goquery.Selection) { 342 | parentNode := node.Parent() 343 | grandParentNode := parentNode.Parent() 344 | innerText := node.Text() 345 | 346 | if parentNode == nil || strLen(innerText) < 20 { 347 | return 348 | } 349 | parentHash := HashStr(parentNode) 350 | grandParentHash := HashStr(grandParentNode) 351 | if _, ok := self.candidates[parentHash]; !ok { 352 | self.candidates[parentHash] = self.initializeNode(parentNode) 353 | } 354 | if _, ok := self.candidates[grandParentHash]; !ok { 355 | self.candidates[grandParentHash] = self.initializeNode(grandParentNode) 356 | } 357 | contentScore := 1.0 358 | contentScore += float64(strings.Count(innerText, ",")) 359 | contentScore += float64(strings.Count(innerText, ",")) 360 | contentScore += math.Min(math.Floor(float64(strLen(innerText)/100)), 3) 361 | 362 | v, _ := self.candidates[parentHash] 363 | v.score += contentScore 364 | self.candidates[parentHash] = v 365 | 366 | if grandParentNode != nil { 367 | v, _ = self.candidates[grandParentHash] 368 | v.score += contentScore / 2.0 369 | self.candidates[grandParentHash] = v 370 | } 371 | }) 372 | 373 | var topCandidate *TCandidateItem 374 | for k, v := range self.candidates { 375 | v.score = v.score * (1 - self.getLinkDensity(v.node)) 376 | self.candidates[k] = v 377 | 378 | // fmt.Println(v.score) 379 | // fmt.Println(v.node.Text()) 380 | // fmt.Println("---------------------------------------------------------------------------------------------------------") 381 | if topCandidate == nil || v.score > topCandidate.score { 382 | if topCandidate == nil { 383 | topCandidate = new(TCandidateItem) 384 | } 385 | topCandidate.score = v.score 386 | topCandidate.node = v.node 387 | } 388 | } 389 | if topCandidate != nil { 390 | // fmt.Println("topCandidate.score=", topCandidate.score) 391 | return self.cleanArticle(topCandidate.node) 392 | } 393 | return "" 394 | } 395 | 396 | func (self *TReadability) Parse() { 397 | self.removeScript() 398 | self.removeStyle() 399 | self.removeLink() 400 | self.Title = self.getTitle() 401 | self.Content = self.grabArticle() 402 | } 403 | -------------------------------------------------------------------------------- /regex.go: -------------------------------------------------------------------------------- 1 | // 改自 https://github.com/kingwkb/readability python版本 2 | // 于2016-11-10 3 | // by: ying32 4 | package readability 5 | 6 | import ( 7 | "regexp" 8 | ) 9 | 10 | var ( 11 | unlikelyCandidates, _ = regexp.Compile(`(?is)combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter|location`) 12 | okMaybeItsACandidate, _ = regexp.Compile(`(?is)and|article|body|column|main|shadow`) 13 | positive, _ = regexp.Compile(`(?is)article|body|content|entry|hentry|main|page|pagination|post|text|blog|story`) 14 | negative, _ = regexp.Compile(`(?is)combx|comment|com|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget`) 15 | extraneous, _ = regexp.Compile(`(?is)print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single`) 16 | divToPElements, _ = regexp.Compile(`(?is)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`) 17 | replaceBrs, _ = regexp.Compile(`(?is)(]*>[ \n\r\t]*){2,}`) 18 | replaceFonts, _ = regexp.Compile(`(?is)<(/?)font[^>]*>`) 19 | trim, _ = regexp.Compile(`(?is)^\s+|\s+$`) 20 | normalize, _ = regexp.Compile(`(?is)\s{2,}`) 21 | killBreaks, _ = regexp.Compile(`(?is)((\s| ?)*)+`) 22 | videos, _ = regexp.Compile(`(?is)http://(www\.)?(youtube|vimeo)\.com`) 23 | skipFootnoteLink, _ = regexp.Compile(`(?is)^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$"`) 24 | nextLink, _ = regexp.Compile(`(?is)(next|weiter|continue|>([^\|]|$)|»([^\|]|$))`) 25 | prevLink, _ = regexp.Compile(`(?is)(prev|earl|old|new|<|«)`) 26 | 27 | unlikelyElements, _ = regexp.Compile(`(?is)(input|time|button)`) 28 | 29 | pageCodeReg, _ = regexp.Compile(`(?is)