├── .gitignore ├── go.mod ├── LICENSE.md ├── README.md ├── go.sum ├── readability_test.go ├── helpers.go └── readability.go /.gitignore: -------------------------------------------------------------------------------- 1 | /.golangcilint-* 2 | /res 3 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/cixtor/readability 2 | 3 | go 1.14 4 | 5 | require golang.org/x/net v0.8.0 6 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010 Arc90 Inc 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Readability 2 | 3 | Readability is a library written in Go (golang) to parse, analyze and convert HTML pages into readable content. Originally an Arc90 Experiment, it is now incorporated into Safari’s Reader View. 4 | 5 | > Despite the ubiquity of reading on the web, readers remain a neglected audience. Much of our talk about web design revolves around a sense of movement: users are thought to be finding, searching, skimming, looking. 
We measure how frequently they click but not how long they stay on the page. We concern ourselves with their travel and participation–how they move from page to page, who they talk to when they get there–but forget the needs of those whose purpose is to be still. Readers flourish when they have space–some distance from the hubbub of the crowds–and as web designers, there is yet much we can do to help them carve out that space. 6 | 7 | > [In Defense Of Readers](http://alistapart.com/articles/indefenseofreaders), by [Mandy Brown](http://www.aworkinglibrary.com/) 8 | 9 | ## Evolution of Readability Web Engines 10 | 11 | | Product | Year | Shutdown | 12 | |---------|------|----------| 13 | | [Instapaper](https://www.instapaper.com/) | 2008 | N/A | 14 | | [Arc90 Readability](https://code.google.com/archive/p/arc90labs-readability/) | 2009 | [Sep 30, 2016](https://medium.com/@readability/the-readability-bookmarking-service-will-shut-down-on-september-30-2016-1641cc18e02b) | 15 | | [Apple Readability](https://developer.apple.com/documentation/safariextensions/safarireader) | 2010 | N/A | 16 | | [Microsoft Reading View](https://docs.microsoft.com/en-us/microsoft-edge/dev-guide/browser-features/reading-view) | 2014 | N/A | 17 | | [Mozilla Readability](https://github.com/mozilla/readability) | 2015 | N/A | 18 | | [Mercury Reader](https://mercury.postlight.com/) | 2016 | [Apr 15, 2019](https://www.reddit.com/r/mac/comments/apkhzs/a/) | 19 | 20 | ## Reader Mode Parser Diversity 21 | 22 | All modern web browsers, except for Google Chrome, include an option to parse, analyze, and extract the main content from web pages to provide what is commonly known as “Reading Mode”. Reading Mode is a separate web rendering mode that strips out repeated and irrelevant content; this allows the web browser to extract the main content and display it cleanly and consistently to the user. 
23 | 24 | | Vendor | Product | Parser | Environments | 25 | |--------|---------|--------|--------------| 26 | | Mozilla | Firefox | Mozilla Readability | Desktop and Android | 27 | | GNOME | Web | Mozilla Readability | Desktop | 28 | | Vivaldi | Vivaldi | Mozilla Readability | Desktop | 29 | | Yandex | Browser | Mozilla Readability | Desktop | 30 | | Samsung | Browser | Mozilla Readability | Android | 31 | | Apple | Safari | Safari Reader | macOS and iOS | 32 | | Maxthon | Maxthon | Maxthon Reader | Desktop | 33 | | Microsoft | Edge | EdgeHTML | Windows and Windows Mobile | 34 | | Microsoft | Edge Mobile | Chrome DOM Distiller | Android | 35 | | Google | Chrome | Chrome DOM Distiller | Android | 36 | | Postlight | Mercury Reader | Web Reader | Web / browser extension | 37 | | Instant Paper | Instapaper | Instaparser | Web / browser extension | 38 | | Mozilla | Pocket | Unknown | Web / browser extension | 39 | 40 | --- 41 | 42 | Ref: https://web.archive.org/web/20150817073201/http://lab.arc90.com/2009/03/02/readability/ 43 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 2 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 3 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 4 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 5 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 6 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 7 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 8 | golang.org/x/net 
v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 9 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 10 | golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ= 11 | golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= 12 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 13 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 14 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 15 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 16 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 17 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 18 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 19 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 20 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 21 | golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 22 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 23 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 24 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 25 | golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= 26 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 27 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 28 | golang.org/x/text v0.3.7/go.mod 
h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 29 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 30 | golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 31 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 32 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 33 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 34 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 35 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 36 | -------------------------------------------------------------------------------- /readability_test.go: -------------------------------------------------------------------------------- 1 | package readability 2 | 3 | import ( 4 | "fmt" 5 | "io/ioutil" 6 | "os" 7 | "path/filepath" 8 | "strings" 9 | "testing" 10 | 11 | "golang.org/x/net/html" 12 | ) 13 | 14 | func TestMaxElemsToParse(t *testing.T) { 15 | input := strings.NewReader(` 16 | 17 | hello world 18 | 19 | 20 |

lorem ipsum

21 | 22 | `) 23 | 24 | parser := New() 25 | parser.MaxElemsToParse = 3 26 | _, err := parser.Parse(input, "https://cixtor.com/blog") 27 | 28 | if err.Error() != "too many elements: 5" { 29 | t.Fatalf("expecting failure due to MaxElemsToParse: %s", err) 30 | } 31 | } 32 | 33 | func TestRemoveScripts(t *testing.T) { 34 | input := strings.NewReader(` 35 | 36 | hello world 37 | 38 | 39 | 40 |

lorem ipsum

41 | 42 | 45 | 46 | `) 47 | 48 | a, err := New().Parse(input, "https://cixtor.com/blog") 49 | 50 | if err != nil { 51 | t.Fatalf("parser failure: %s", err) 52 | } 53 | 54 | if a.TextContent != "lorem ipsum" { 55 | t.Fatalf("scripts were not removed: %s", a.TextContent) 56 | } 57 | } 58 | 59 | func getNodeExcerpt(node *html.Node) string { 60 | outer := outerHTML(node) 61 | outer = strings.Join(strings.Fields(outer), "\x20") 62 | if len(outer) < 500 { 63 | return outer 64 | } 65 | return outer[:500] 66 | } 67 | 68 | func errColorDiff(label string, a string, b string) error { 69 | coloredA := "" 70 | coloredB := "" 71 | for i := 0; i < len(a); i++ { 72 | if b[i] == a[i] { 73 | coloredA += a[i : i+1] 74 | coloredB += b[i : i+1] 75 | continue 76 | } 77 | coloredA += "\x1b[0;92m" + a[i:] + "\x1b[0m" 78 | coloredB += "\x1b[0;91m" + b[i:] + "\x1b[0m" 79 | break 80 | } 81 | return fmt.Errorf("%s\n- %s\n+ %s", label, coloredA, coloredB) 82 | } 83 | 84 | func compareArticleContent(result *html.Node, expected *html.Node) error { 85 | // Make sure number of nodes is same 86 | resultNodesCount := len(children(result)) 87 | expectedNodesCount := len(children(expected)) 88 | if resultNodesCount != expectedNodesCount { 89 | return fmt.Errorf( 90 | "number of nodes is different, want %d got %d", 91 | expectedNodesCount, 92 | resultNodesCount, 93 | ) 94 | } 95 | 96 | resultNode := result 97 | expectedNode := expected 98 | for resultNode != nil && expectedNode != nil { 99 | // Get node excerpt 100 | resultExcerpt := getNodeExcerpt(resultNode) 101 | expectedExcerpt := getNodeExcerpt(expectedNode) 102 | 103 | // Compare tag name 104 | resultTagName := tagName(resultNode) 105 | expectedTagName := tagName(expectedNode) 106 | if resultTagName != expectedTagName { 107 | return fmt.Errorf( 108 | "tag name is different\nwant: %s (%s)\ngot : %s (%s)", 109 | expectedTagName, 110 | expectedExcerpt, 111 | resultTagName, 112 | resultExcerpt, 113 | ) 114 | } 115 | 116 | // Compare attributes 117 | 
resultAttrCount := len(resultNode.Attr) 118 | expectedAttrCount := len(expectedNode.Attr) 119 | if resultAttrCount != expectedAttrCount { 120 | return fmt.Errorf( 121 | "number of attributes is different\nwant: %d (%s)\ngot : %d (%s)", 122 | expectedAttrCount, 123 | expectedExcerpt, 124 | resultAttrCount, 125 | resultExcerpt, 126 | ) 127 | } 128 | 129 | for _, resultAttr := range resultNode.Attr { 130 | expectedAttrVal := getAttribute(expectedNode, resultAttr.Key) 131 | switch resultAttr.Key { 132 | case "href", "src": 133 | resultAttr.Val = strings.TrimSuffix(resultAttr.Val, "/") 134 | expectedAttrVal = strings.TrimSuffix(expectedAttrVal, "/") 135 | } 136 | 137 | if resultAttr.Val != expectedAttrVal { 138 | return fmt.Errorf( 139 | "attribute %s is different\nwant: %s (%s)\ngot : %s (%s)", 140 | resultAttr.Key, 141 | expectedAttrVal, 142 | expectedExcerpt, 143 | resultAttr.Val, 144 | resultExcerpt, 145 | ) 146 | } 147 | } 148 | 149 | // Compare text content 150 | resultText := strings.TrimSpace(textContent(resultNode)) 151 | expectedText := strings.TrimSpace(textContent(expectedNode)) 152 | 153 | resultText = strings.Join(strings.Fields(resultText), "\x20") 154 | expectedText = strings.Join(strings.Fields(expectedText), "\x20") 155 | 156 | if resultText != expectedText { 157 | return errColorDiff( 158 | "text content is different", 159 | expectedExcerpt, 160 | resultExcerpt, 161 | ) 162 | } 163 | 164 | // Move to next node 165 | r := Readability{} 166 | resultNode = r.getNextNode(resultNode, false) 167 | expectedNode = r.getNextNode(expectedNode, false) 168 | } 169 | 170 | return nil 171 | } 172 | 173 | func TestParse(t *testing.T) { 174 | testDir := "scenarios" 175 | testItems, err := ioutil.ReadDir(testDir) 176 | if err != nil { 177 | t.Errorf("\nfailed to read test directory") 178 | } 179 | 180 | for _, item := range testItems { 181 | if !item.IsDir() { 182 | continue 183 | } 184 | 185 | t.Run(item.Name(), func(t1 *testing.T) { 186 | // Open test file 187 | 
testFilePath := filepath.Join(testDir, item.Name(), "source.html") 188 | testFile, err := os.Open(testFilePath) 189 | if err != nil { 190 | t1.Errorf("\nfailed to open test file") 191 | } 192 | defer testFile.Close() 193 | 194 | // Open expected result file 195 | expectedFilePath := filepath.Join(testDir, item.Name(), "expected.html") 196 | expectedFile, err := os.Open(expectedFilePath) 197 | if err != nil { 198 | t1.Errorf("\nfailed to open expected result file") 199 | } 200 | defer expectedFile.Close() 201 | 202 | // Parse expected result 203 | expectedHTML, err := html.Parse(expectedFile) 204 | if err != nil { 205 | t1.Errorf("\nfailed to parse expected result file") 206 | } 207 | 208 | // Get article from test file 209 | resultArticle, err := New().Parse(testFile, "http://fakehost/test/page.html") 210 | if err != nil { 211 | t1.Errorf("\nfailed to parse test file") 212 | } 213 | 214 | // Parse article into HTML 215 | resultHTML, err := html.Parse(strings.NewReader(resultArticle.Content)) 216 | if err != nil { 217 | t1.Errorf("\nfailed to parse test article into HTML") 218 | } 219 | 220 | // Compare article 221 | err = compareArticleContent(resultHTML, expectedHTML) 222 | if err != nil { 223 | t1.Errorf("\n%v", err) 224 | } 225 | }) 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /helpers.go: -------------------------------------------------------------------------------- 1 | package readability 2 | 3 | import ( 4 | "bytes" 5 | "net/url" 6 | "strings" 7 | 8 | "golang.org/x/net/html" 9 | ) 10 | 11 | // firstElementChild returns the object's first child Element, or nil if there 12 | // are no child elements. 
13 | func firstElementChild(node *html.Node) *html.Node { 14 | for child := node.FirstChild; child != nil; child = child.NextSibling { 15 | if child.Type == html.ElementNode { 16 | return child 17 | } 18 | } 19 | 20 | return nil 21 | } 22 | 23 | // nextElementSibling returns the Element immediately following the specified 24 | // one in its parent's children list, or nil if the specified Element is the 25 | // last one in the list. 26 | func nextElementSibling(node *html.Node) *html.Node { 27 | for sibling := node.NextSibling; sibling != nil; sibling = sibling.NextSibling { 28 | if sibling.Type == html.ElementNode { 29 | return sibling 30 | } 31 | } 32 | 33 | return nil 34 | } 35 | 36 | // appendChild adds a node to the end of the list of children of a specified 37 | // parent node. If the given child is a reference to an existing node in the 38 | // document, appendChild moves it from its current position to the new position 39 | // (there is no requirement to remove the node from its parent node before 40 | // appending it to some other node). 41 | // 42 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Node/appendChild 43 | func appendChild(node *html.Node, child *html.Node) { 44 | if child.Parent != nil { 45 | temp := cloneNode(child) 46 | node.AppendChild(temp) 47 | child.Parent.RemoveChild(child) 48 | return 49 | } 50 | 51 | node.AppendChild(child) 52 | } 53 | 54 | // childNodes returns list of a node's direct children. 55 | func childNodes(node *html.Node) []*html.Node { 56 | var list []*html.Node 57 | 58 | for c := node.FirstChild; c != nil; c = c.NextSibling { 59 | list = append(list, c) 60 | } 61 | 62 | return list 63 | } 64 | 65 | // includeNode determines if node is included inside nodeList. 
66 | func includeNode(nodeList []*html.Node, node *html.Node) bool { 67 | for i := 0; i < len(nodeList); i++ { 68 | if nodeList[i] == node { 69 | return true 70 | } 71 | } 72 | 73 | return false 74 | } 75 | 76 | // cloneNode returns a duplicate of the node on which this method was called. 77 | // 78 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Node/cloneNode 79 | func cloneNode(node *html.Node) *html.Node { 80 | clone := &html.Node{ 81 | Type: node.Type, 82 | DataAtom: node.DataAtom, 83 | Data: node.Data, 84 | Attr: make([]html.Attribute, len(node.Attr)), 85 | } 86 | 87 | copy(clone.Attr, node.Attr) 88 | 89 | for c := node.FirstChild; c != nil; c = c.NextSibling { 90 | clone.AppendChild(cloneNode(c)) 91 | } 92 | 93 | return clone 94 | } 95 | 96 | // createElement creates the HTML element specified by tagName. 97 | // 98 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Document/createElement 99 | func createElement(tagName string) *html.Node { 100 | return &html.Node{Type: html.ElementNode, Data: tagName} 101 | } 102 | 103 | // createTextNode creates a new Text node. 104 | func createTextNode(data string) *html.Node { 105 | return &html.Node{Type: html.TextNode, Data: data} 106 | } 107 | 108 | // getElementsByTagName returns a collection of HTML elements with the given 109 | // tag name. If tag name is an asterisk, a list of all the available HTML nodes 110 | // will be returned instead. 
111 | // 112 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Document/getElementsByTagName 113 | func getElementsByTagName(node *html.Node, tag string) []*html.Node { 114 | var lst []*html.Node 115 | var fun func(*html.Node) 116 | 117 | fun = func(n *html.Node) { 118 | if n.Type == html.ElementNode && (tag == "*" || n.Data == tag) { 119 | lst = append(lst, n) 120 | } 121 | 122 | for c := n.FirstChild; c != nil; c = c.NextSibling { 123 | fun(c) 124 | } 125 | } 126 | 127 | fun(node) 128 | 129 | return lst 130 | } 131 | 132 | // getAttribute returns the value of a specified attribute on the element. If 133 | // the given attribute does not exist, the function returns an empty string. 134 | func getAttribute(node *html.Node, attrName string) string { 135 | for i := 0; i < len(node.Attr); i++ { 136 | if node.Attr[i].Key == attrName { 137 | return node.Attr[i].Val 138 | } 139 | } 140 | 141 | return "" 142 | } 143 | 144 | // setAttribute sets attribute for node. If attribute already exists, it will 145 | // be replaced. 146 | func setAttribute(node *html.Node, attrName string, attrValue string) { 147 | attrIdx := -1 148 | 149 | for i := 0; i < len(node.Attr); i++ { 150 | if node.Attr[i].Key == attrName { 151 | attrIdx = i 152 | break 153 | } 154 | } 155 | 156 | if attrIdx >= 0 { 157 | node.Attr[attrIdx].Val = attrValue 158 | return 159 | } 160 | 161 | node.Attr = append(node.Attr, html.Attribute{ 162 | Key: attrName, 163 | Val: attrValue, 164 | }) 165 | } 166 | 167 | // removeAttribute removes attribute with given name. 168 | func removeAttribute(node *html.Node, attrName string) { 169 | attrIdx := -1 170 | 171 | for i := 0; i < len(node.Attr); i++ { 172 | if node.Attr[i].Key == attrName { 173 | attrIdx = i 174 | break 175 | } 176 | } 177 | 178 | if attrIdx >= 0 { 179 | a := node.Attr 180 | a = append(a[:attrIdx], a[attrIdx+1:]...) 
181 | node.Attr = a 182 | } 183 | } 184 | 185 | // hasAttribute returns a Boolean value indicating whether the specified node 186 | // has the specified attribute or not. 187 | func hasAttribute(node *html.Node, attrName string) bool { 188 | for i := 0; i < len(node.Attr); i++ { 189 | if node.Attr[i].Key == attrName { 190 | return true 191 | } 192 | } 193 | 194 | return false 195 | } 196 | 197 | // outerHTML returns an HTML serialization of the element and its descendants. 198 | func outerHTML(node *html.Node) string { 199 | var buffer bytes.Buffer 200 | 201 | if err := html.Render(&buffer, node); err != nil { 202 | return "" 203 | } 204 | 205 | return buffer.String() 206 | } 207 | 208 | // innerHTML returns the HTML content (inner HTML) of an element. 209 | func innerHTML(node *html.Node) string { 210 | var err error 211 | var buffer bytes.Buffer 212 | 213 | for child := node.FirstChild; child != nil; child = child.NextSibling { 214 | if err = html.Render(&buffer, child); err != nil { 215 | return "" 216 | } 217 | } 218 | 219 | return strings.TrimSpace(buffer.String()) 220 | } 221 | 222 | // documentElement returns the root element of the document. 223 | func documentElement(doc *html.Node) *html.Node { 224 | nodes := getElementsByTagName(doc, "html") 225 | 226 | if len(nodes) > 0 { 227 | return nodes[0] 228 | } 229 | 230 | return nil 231 | } 232 | 233 | // className returns the value of the class attribute of the element. 234 | func className(node *html.Node) string { 235 | className := getAttribute(node, "class") 236 | className = strings.TrimSpace(className) 237 | className = rxNormalize.ReplaceAllString(className, "\x20") 238 | return className 239 | } 240 | 241 | // id returns the value of the id attribute of the specified element. 242 | func id(node *html.Node) string { 243 | id := getAttribute(node, "id") 244 | id = strings.TrimSpace(id) 245 | return id 246 | } 247 | 248 | // children returns an HTMLCollection of the child elements of Node. 
249 | func children(node *html.Node) []*html.Node { 250 | var children []*html.Node 251 | 252 | if node == nil { 253 | return nil 254 | } 255 | 256 | for child := node.FirstChild; child != nil; child = child.NextSibling { 257 | if child.Type == html.ElementNode { 258 | children = append(children, child) 259 | } 260 | } 261 | 262 | return children 263 | } 264 | 265 | // wordCount returns number of word in str. 266 | func wordCount(str string) int { 267 | return len(strings.Fields(str)) 268 | } 269 | 270 | // indexOf returns the first index at which a given element can be found in the 271 | // array, or -1 if it is not present. 272 | func indexOf(array []string, key string) int { 273 | for idx, val := range array { 274 | if val == key { 275 | return idx 276 | } 277 | } 278 | 279 | return -1 280 | } 281 | 282 | // replaceNode replaces a child node within the given (parent) node. 283 | // 284 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Node/replaceChild 285 | func replaceNode(oldNode *html.Node, newNode *html.Node) { 286 | if oldNode.Parent == nil { 287 | return 288 | } 289 | 290 | newNode.Parent = nil 291 | newNode.PrevSibling = nil 292 | newNode.NextSibling = nil 293 | oldNode.Parent.InsertBefore(newNode, oldNode) 294 | oldNode.Parent.RemoveChild(oldNode) 295 | } 296 | 297 | // tagName returns the tag name of the element on which it’s called. 298 | // 299 | // For example, if the element is an , its tagName property is “IMG” (for 300 | // HTML documents; it may be cased differently for XML/XHTML documents). 301 | // 302 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Element/tagName 303 | func tagName(node *html.Node) string { 304 | if node.Type != html.ElementNode { 305 | return "" 306 | } 307 | 308 | return node.Data 309 | } 310 | 311 | // textContent returns text content of a Node and its descendants. 
312 | // 313 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Node/textContent 314 | func textContent(node *html.Node) string { 315 | var buffer bytes.Buffer 316 | var finder func(*html.Node) 317 | 318 | finder = func(n *html.Node) { 319 | if n.Type == html.TextNode { 320 | buffer.WriteString(n.Data) 321 | } 322 | 323 | for c := n.FirstChild; c != nil; c = c.NextSibling { 324 | finder(c) 325 | } 326 | } 327 | 328 | finder(node) 329 | 330 | return buffer.String() 331 | } 332 | 333 | // toAbsoluteURI convert uri to absolute path based on base. 334 | // However, if uri is prefixed with hash (#), the uri won't be changed. 335 | func toAbsoluteURI(uri string, base *url.URL) string { 336 | if uri == "" || base == nil { 337 | return "" 338 | } 339 | 340 | // If it is hash tag, return as it is 341 | if uri[:1] == "#" { 342 | return uri 343 | } 344 | 345 | // If it is already an absolute URL, return as it is 346 | tmp, err := url.ParseRequestURI(uri) 347 | if err == nil && tmp.Scheme != "" && tmp.Hostname() != "" { 348 | return uri 349 | } 350 | 351 | // Otherwise, resolve against base URI. 352 | tmp, err = url.Parse(uri) 353 | if err != nil { 354 | return uri 355 | } 356 | 357 | return base.ResolveReference(tmp).String() 358 | } 359 | -------------------------------------------------------------------------------- /readability.go: -------------------------------------------------------------------------------- 1 | package readability 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "math" 7 | "net/url" 8 | "regexp" 9 | "sort" 10 | "strconv" 11 | "strings" 12 | 13 | "golang.org/x/net/html" 14 | ) 15 | 16 | // All of the regular expressions in use within readability. 17 | // Defined up here so we don't instantiate them repeatedly in loops. 
18 | var rxUnlikelyCandidates = regexp.MustCompile(`(?i)-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`) 19 | var rxOkMaybeItsACandidate = regexp.MustCompile(`(?i)and|article|body|column|main|shadow`) 20 | var rxPositive = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`) 21 | var rxNegative = regexp.MustCompile(`(?i)hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget`) 22 | var rxByline = regexp.MustCompile(`(?i)byline|author|dateline|writtenby|p-author`) 23 | var rxNormalize = regexp.MustCompile(`(?i)\s{2,}`) 24 | var rxVideos = regexp.MustCompile(`(?i)//(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)`) 25 | var rxWhitespace = regexp.MustCompile(`(?i)^\s*$`) 26 | var rxHasContent = regexp.MustCompile(`(?i)\S$`) 27 | var rxPropertyPattern = regexp.MustCompile(`(?i)\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name|image\S*)\s*`) 28 | var rxNamePattern = regexp.MustCompile(`(?i)^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name|image)\s*$`) 29 | var rxTitleSeparator = regexp.MustCompile(`(?i) [\|\-\\/>»] `) 30 | var rxTitleHierarchySep = regexp.MustCompile(`(?i) [\\/>»] `) 31 | var rxTitleRemoveFinalPart = regexp.MustCompile(`(?i)(.*)[\|\-\\/>»] .*`) 32 | var rxTitleRemove1stPart = regexp.MustCompile(`(?i)[^\|\-\\/>»]*[\|\-\\/>»](.*)`) 33 | var rxTitleAnySeparator = regexp.MustCompile(`(?i)[\|\-\\/>»]+`) 34 | var rxDisplayNone = regexp.MustCompile(`(?i)display\s*:\s*none`) 35 | var rxSentencePeriod = 
regexp.MustCompile(`(?i)\.( |$)`) 36 | var rxShare = regexp.MustCompile(`(?i)share`) 37 | var rxFaviconSize = regexp.MustCompile(`(?i)(\d+)x(\d+)`) 38 | 39 | // divToPElems is a list of HTML tag names representing content dividers. 40 | var divToPElems = []string{ 41 | "a", "blockquote", "div", "dl", "img", 42 | "ol", "p", "pre", "select", "table", "ul", 43 | } 44 | 45 | // alterToDivExceptions is a list of HTML tags that we want to convert into 46 | // regular DIV elements to prevent unwanted removal when the parser is cleaning 47 | // out unnecessary Nodes. 48 | var alterToDivExceptions = []string{ 49 | "article", 50 | "div", 51 | "p", 52 | "section", 53 | } 54 | 55 | // presentationalAttributes is a list of HTML attributes used to style Nodes. 56 | var presentationalAttributes = []string{ 57 | "align", 58 | "background", 59 | "bgcolor", 60 | "border", 61 | "cellpadding", 62 | "cellspacing", 63 | "frame", 64 | "hspace", 65 | "rules", 66 | "style", 67 | "valign", 68 | "vspace", 69 | } 70 | 71 | // deprecatedSizeAttributeElems is a list of HTML tags that allow programmers 72 | // to set Width and Height attributes to define their own size but that have 73 | // already been deprecated in recent HTML specifications. 74 | var deprecatedSizeAttributeElems = []string{ 75 | "table", 76 | "th", 77 | "td", 78 | "hr", 79 | "pre", 80 | } 81 | 82 | // The commented out elements qualify as phrasing content but tend to be 83 | // removed by readability when put into paragraphs, so we ignore them here. 84 | var phrasingElems = []string{ 85 | // "canvas", "iframe", "svg", "video", 86 | "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", 87 | "datalist", "dfn", "em", "embed", "i", "img", "input", "kbd", "label", 88 | "mark", "math", "meter", "noscript", "object", "output", "progress", "q", 89 | "ruby", "samp", "script", "select", "small", "span", "strong", "sub", 90 | "sup", "textarea", "time", "var", "wbr", 91 | } 92 | 93 | // flags is flags that used by parser. 
94 | type flags struct { 95 | stripUnlikelys bool 96 | useWeightClasses bool 97 | cleanConditionally bool 98 | } 99 | 100 | // parseAttempt is container for the result of previous parse attempts. 101 | type parseAttempt struct { 102 | articleContent *html.Node 103 | textLength int 104 | } 105 | 106 | // Article represents the metadata and content of the article. 107 | type Article struct { 108 | // Title is the heading that precedes the article’s content, and the basis 109 | // for the article’s page name and URL. It indicates what the article is 110 | // about, and distinguishes it from other articles. The title may simply 111 | // be the name of the subject of the article, or it may be a description 112 | // of the topic. 113 | Title string 114 | 115 | // Byline is a printed line of text accompanying a news story, article, or 116 | // the like, giving the author’s name 117 | Byline string 118 | 119 | // Dir is the direction of the text in the article. 120 | // 121 | // Either Left-to-Right (LTR) or Right-to-Left (RTL). 122 | Dir string 123 | 124 | // Content is the relevant text in the article with HTML tags. 125 | Content string 126 | 127 | // TextContent is the relevant text in the article without HTML tags. 128 | TextContent string 129 | 130 | // Excerpt is the summary for the relevant text in the article. 131 | Excerpt string 132 | 133 | // SiteName is the name of the original publisher website. 134 | SiteName string 135 | 136 | // Favicon (short for favorite icon) is a file containing one or more small 137 | // icons, associated with a particular website or web page. A web designer 138 | // can create such an icon and upload it to a website (or web page) by 139 | // several means, and graphical web browsers will then make use of it. 140 | Favicon string 141 | 142 | // Image is an image URL which represents the article’s content. 143 | Image string 144 | 145 | // Length is the amount of characters in the article. 
146 | Length int 147 | 148 | // Node is the first element in the HTML document. 149 | Node *html.Node 150 | } 151 | 152 | // Readability is an HTML parser that reads and extract relevant content. 153 | type Readability struct { 154 | doc *html.Node 155 | documentURI *url.URL 156 | articleTitle string 157 | articleByline string 158 | attempts []parseAttempt 159 | flags flags 160 | 161 | // MaxElemsToParse is the optional maximum number of HTML nodes to parse 162 | // from the document. If the number of elements in the document is higher 163 | // than this number, the operation immediately errors. 164 | MaxElemsToParse int 165 | 166 | // NTopCandidates is the number of top candidates to consider when the 167 | // parser is analysing how tight the competition is among candidates. 168 | NTopCandidates int 169 | 170 | // CharThresholds is the default number of chars an article must have in 171 | // order to return a result. 172 | CharThresholds int 173 | 174 | // ClassesToPreserve are the classes that readability sets itself. 175 | ClassesToPreserve []string 176 | 177 | // TagsToScore is element tags to score by default. 178 | TagsToScore []string 179 | 180 | KeepClasses bool 181 | } 182 | 183 | // New returns new Readability with sane defaults to parse simple documents. 184 | func New() *Readability { 185 | return &Readability{ 186 | MaxElemsToParse: 0, 187 | NTopCandidates: 5, 188 | CharThresholds: 500, 189 | ClassesToPreserve: []string{"page"}, 190 | TagsToScore: []string{"section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"}, 191 | KeepClasses: false, 192 | } 193 | } 194 | 195 | // removeNodes iterates over a collection of HTML elements, calls the optional 196 | // filter function on each node, and removes the node if function returns True. 197 | // If function is not passed, removes all the nodes in the list. 
198 | func (r *Readability) removeNodes(list []*html.Node, filter func(*html.Node) bool) { 199 | var node *html.Node 200 | var parentNode *html.Node 201 | 202 | for i := len(list) - 1; i >= 0; i-- { 203 | node = list[i] 204 | parentNode = node.Parent 205 | 206 | if parentNode != nil && (filter == nil || filter(node)) { 207 | parentNode.RemoveChild(node) 208 | } 209 | } 210 | } 211 | 212 | // replaceNodeTags iterates over a list, and calls setNodeTag for each node. 213 | func (r *Readability) replaceNodeTags(list []*html.Node, newTagName string) { 214 | for i := len(list) - 1; i >= 0; i-- { 215 | r.setNodeTag(list[i], newTagName) 216 | } 217 | } 218 | 219 | // forEachNode iterates over a list of HTML nodes, which doesn’t natively fully 220 | // implement the Array interface. For convenience, the current object context 221 | // is applied to the provided iterate function. 222 | func (r *Readability) forEachNode(list []*html.Node, fn func(*html.Node, int)) { 223 | for idx, node := range list { 224 | fn(node, idx) 225 | } 226 | } 227 | 228 | // someNode iterates over a NodeList, return true if any of the 229 | // provided iterate function calls returns true, false otherwise. 230 | func (r *Readability) someNode(nodeList []*html.Node, fn func(*html.Node) bool) bool { 231 | for i := 0; i < len(nodeList); i++ { 232 | if fn(nodeList[i]) { 233 | return true 234 | } 235 | } 236 | 237 | return false 238 | } 239 | 240 | // everyNode iterates over a collection of nodes, returns true if all of the 241 | // provided iterator function calls return true, otherwise returns false. For 242 | // convenience, the current object context is applied to the provided iterator 243 | // function. 244 | func (r *Readability) everyNode(list []*html.Node, fn func(*html.Node) bool) bool { 245 | for _, node := range list { 246 | if !fn(node) { 247 | return false 248 | } 249 | } 250 | 251 | return true 252 | } 253 | 254 | // concatNodeLists concats all nodelists passed as arguments. 
func (r *Readability) concatNodeLists(nodeLists ...[]*html.Node) []*html.Node {
	var result []*html.Node

	for i := 0; i < len(nodeLists); i++ {
		result = append(result, nodeLists[i]...)
	}

	return result
}

// getAllNodesWithTag returns every descendant element of node whose tag
// matches any of tagNames, grouped in the order the tags are listed.
func (r *Readability) getAllNodesWithTag(node *html.Node, tagNames ...string) []*html.Node {
	var list []*html.Node

	for _, tag := range tagNames {
		list = append(list, getElementsByTagName(node, tag)...)
	}

	return list
}

// getArticleTitle attempts to get the article title.
func (r *Readability) getArticleTitle() string {
	doc := r.doc
	curTitle := ""
	origTitle := ""
	titleHadHierarchicalSeparators := false

	// If they had an element with tag "title" in their HTML
	if nodes := getElementsByTagName(doc, "title"); len(nodes) > 0 {
		origTitle = r.getInnerText(nodes[0], true)
		curTitle = origTitle
	}

	// If there's a separator in the title, first remove the final part
	if rxTitleSeparator.MatchString(curTitle) {
		titleHadHierarchicalSeparators = rxTitleHierarchySep.MatchString(curTitle)
		curTitle = rxTitleRemoveFinalPart.ReplaceAllString(origTitle, "$1")

		// If the resulting title is too short (3 words or fewer), remove
		// the first part instead:
		if wordCount(curTitle) < 3 {
			curTitle = rxTitleRemove1stPart.ReplaceAllString(origTitle, "$1")
		}
	} else if strings.Index(curTitle, ": ") != -1 {
		// Check if we have a heading containing this exact string, so
		// we could assume it's the full title.
		headings := r.concatNodeLists(
			getElementsByTagName(doc, "h1"),
			getElementsByTagName(doc, "h2"),
		)

		trimmedTitle := strings.TrimSpace(curTitle)
		match := r.someNode(headings, func(heading *html.Node) bool {
			return strings.TrimSpace(textContent(heading)) == trimmedTitle
		})

		// If we don't, let's extract the title out of the original
		// title string.
		if !match {
			curTitle = origTitle[strings.LastIndex(origTitle, ":")+1:]

			// If the title is now too short, try the first colon instead:
			if wordCount(curTitle) < 3 {
				curTitle = origTitle[strings.Index(origTitle, ":")+1:]
				// But if we have too many words before the colon there's
				// something weird with the titles and the H tags so let's
				// just use the original title instead
			} else if wordCount(origTitle[:strings.Index(origTitle, ":")]) > 5 {
				curTitle = origTitle
			}
		}
	} else if len(curTitle) > 150 || len(curTitle) < 15 {
		// NOTE(review): these thresholds count bytes, not runes, so
		// multibyte titles look "longer" than they appear — confirm this
		// matches the intended character counts from Readability.js.
		if hOnes := getElementsByTagName(doc, "h1"); len(hOnes) == 1 {
			curTitle = r.getInnerText(hOnes[0], true)
		}
	}

	curTitle = strings.TrimSpace(curTitle)
	curTitle = rxNormalize.ReplaceAllString(curTitle, "\x20")
	// If we now have 4 words or fewer as our title, and either no
	// 'hierarchical' separators (\, /, > or ») were found in the original
	// title or we decreased the number of words by more than 1 word, use
	// the original title.
	curTitleWordCount := wordCount(curTitle)
	tmpOrigTitle := rxTitleAnySeparator.ReplaceAllString(origTitle, "")

	if curTitleWordCount <= 4 &&
		(!titleHadHierarchicalSeparators ||
			curTitleWordCount != wordCount(tmpOrigTitle)-1) {
		curTitle = origTitle
	}

	return curTitle
}

// getArticleFavicon attempts to get high quality favicon
// that used in article. It will only pick favicon in PNG
// format, so small favicon that uses ico file won't be picked.
// Using algorithm by philippe_b.
func (r *Readability) getArticleFavicon() string {
	favicon := ""
	faviconSize := -1
	linkElements := getElementsByTagName(r.doc, "link")

	r.forEachNode(linkElements, func(link *html.Node, _ int) {
		linkRel := strings.TrimSpace(getAttribute(link, "rel"))
		linkType := strings.TrimSpace(getAttribute(link, "type"))
		linkHref := strings.TrimSpace(getAttribute(link, "href"))
		linkSizes := strings.TrimSpace(getAttribute(link, "sizes"))

		// Only consider icon links with a non-empty href.
		if linkHref == "" || !strings.Contains(linkRel, "icon") {
			return
		}

		// Only PNG icons qualify, by MIME type or file extension.
		if linkType != "image/png" && !strings.Contains(linkHref, ".png") {
			return
		}

		// Determine the icon size from the sizes attribute first, then
		// from the href itself; only square sizes count (the two captured
		// dimensions must be equal). An icon without a detectable size
		// gets size 0, which still beats the initial -1.
		size := 0
		for _, sizesLocation := range []string{linkSizes, linkHref} {
			sizeParts := rxFaviconSize.FindStringSubmatch(sizesLocation)
			if len(sizeParts) != 3 || sizeParts[1] != sizeParts[2] {
				continue
			}

			size, _ = strconv.Atoi(sizeParts[1])
			break
		}

		// Keep the largest square PNG found so far.
		if size > faviconSize {
			faviconSize = size
			favicon = linkHref
		}
	})

	return toAbsoluteURI(favicon, r.documentURI)
}

// prepDocument prepares the HTML document for readability to scrape it. This
// includes things like stripping JavaScript, CSS, and handling terrible markup
// among other things.
func (r *Readability) prepDocument() {
	doc := r.doc

	r.removeNodes(getElementsByTagName(doc, "style"), nil)

	if n := getElementsByTagName(doc, "body"); len(n) > 0 && n[0] != nil {
		r.replaceBrs(n[0])
	}

	// NOTE(review): setNodeTag stores the literal "SPAN" while tags parsed
	// by x/net/html are lowercase — confirm downstream comparisons expect
	// this casing.
	r.replaceNodeTags(getElementsByTagName(doc, "font"), "SPAN")
}

// nextElement finds the next element, starting from the given node, and
// ignoring whitespace in between.
If the given node is an element, the same 410 | // node is returned. 411 | func (r *Readability) nextElement(node *html.Node) *html.Node { 412 | next := node 413 | 414 | for next != nil && 415 | next.Type != html.ElementNode && 416 | rxWhitespace.MatchString(textContent(next)) { 417 | next = next.NextSibling 418 | } 419 | 420 | return next 421 | } 422 | 423 | // replaceBrs replaces two or more successive
elements with a single

. 424 | // Whitespace between
elements are ignored. For example: 425 | // 426 | //

foo
bar


abc
427 | // 428 | // will become: 429 | // 430 | //
foo
bar

abc

431 | func (r *Readability) replaceBrs(elem *html.Node) { 432 | r.forEachNode(r.getAllNodesWithTag(elem, "br"), func(br *html.Node, _ int) { 433 | next := br.NextSibling 434 | 435 | // Whether two or more
elements have been found and replaced with 436 | // a

block. 437 | replaced := false 438 | 439 | // If we find a
chain, remove the
nodes until we hit another 440 | // element or non-whitespace. This leaves behind the first
in the 441 | // chain (which will be replaced with a

later). 442 | for { 443 | next = r.nextElement(next) 444 | 445 | if next == nil || tagName(next) == "BR" { 446 | break 447 | } 448 | 449 | replaced = true 450 | brSibling := next.NextSibling 451 | next.Parent.RemoveChild(next) 452 | next = brSibling 453 | } 454 | 455 | // If we removed a
chain, replace the remaining
with a

. 456 | // Add all sibling nodes as children of the

until we hit another 457 | //
chain. 458 | if replaced { 459 | p := createElement("p") 460 | replaceNode(br, p) 461 | 462 | next = p.NextSibling 463 | for next != nil { 464 | // If we have hit another

, we are done adding children 465 | // to this

. 466 | if tagName(next) == "br" { 467 | nextElem := r.nextElement(next.NextSibling) 468 | if nextElem != nil && tagName(nextElem) == "br" { 469 | break 470 | } 471 | } 472 | 473 | if !r.isPhrasingContent(next) { 474 | break 475 | } 476 | 477 | // Otherwise, make this node a child of the new

. 478 | sibling := next.NextSibling 479 | appendChild(p, next) 480 | next = sibling 481 | } 482 | 483 | for p.LastChild != nil && r.isWhitespace(p.LastChild) { 484 | p.RemoveChild(p.LastChild) 485 | } 486 | 487 | if tagName(p.Parent) == "P" { 488 | r.setNodeTag(p.Parent, "div") 489 | } 490 | } 491 | }) 492 | } 493 | 494 | func (r *Readability) setNodeTag(node *html.Node, newTagName string) { 495 | if node.Type == html.ElementNode { 496 | node.Data = newTagName 497 | } 498 | 499 | // NOTES(cixtor): the original function in Readability.js is a bit longer 500 | // because it contains a fallback mechanism to set the node tag name just 501 | // in case JSDOMParser is not available, there is no need to implement this 502 | // here. 503 | } 504 | 505 | // getArticleMetadata attempts to get excerpt and byline metadata for the article. 506 | func (r *Readability) getArticleMetadata() Article { 507 | values := make(map[string]string) 508 | metaElements := getElementsByTagName(r.doc, "meta") 509 | 510 | // Find description tags. 511 | r.forEachNode(metaElements, func(element *html.Node, _ int) { 512 | elementName := getAttribute(element, "name") 513 | elementProperty := getAttribute(element, "property") 514 | content := getAttribute(element, "content") 515 | if content == "" { 516 | return 517 | } 518 | matches := []string{} 519 | name := "" 520 | 521 | if elementProperty != "" { 522 | matches = rxPropertyPattern.FindAllString(elementProperty, -1) 523 | for i := len(matches) - 1; i >= 0; i-- { 524 | // Convert to lowercase, and remove any whitespace 525 | // so we can match belops. 526 | name = strings.ToLower(matches[i]) 527 | name = strings.Join(strings.Fields(name), "") 528 | // multiple authors 529 | values[name] = strings.TrimSpace(content) 530 | } 531 | } 532 | 533 | if len(matches) == 0 && elementName != "" && rxNamePattern.MatchString(elementName) { 534 | // Convert to lowercase, remove any whitespace, and convert 535 | // dots to colons so we can match belops. 
536 | name = strings.ToLower(elementName) 537 | name = strings.Join(strings.Fields(name), "") 538 | name = strings.Replace(name, ".", ":", -1) 539 | values[name] = strings.TrimSpace(content) 540 | } 541 | }) 542 | 543 | // get title 544 | metadataTitle := "" 545 | for _, name := range []string{ 546 | "dc:title", 547 | "dcterm:title", 548 | "og:title", 549 | "weibo:article:title", 550 | "weibo:webpage:title", 551 | "title", 552 | "twitter:title", 553 | } { 554 | if value, ok := values[name]; ok { 555 | metadataTitle = value 556 | break 557 | } 558 | } 559 | 560 | if metadataTitle == "" { 561 | metadataTitle = r.getArticleTitle() 562 | } 563 | 564 | // get author 565 | metadataByline := "" 566 | for _, name := range []string{ 567 | "dc:creator", 568 | "dcterm:creator", 569 | "author", 570 | } { 571 | if value, ok := values[name]; ok { 572 | metadataByline = value 573 | break 574 | } 575 | } 576 | 577 | // get description 578 | metadataExcerpt := "" 579 | for _, name := range []string{ 580 | "dc:description", 581 | "dcterm:description", 582 | "og:description", 583 | "weibo:article:description", 584 | "weibo:webpage:description", 585 | "description", 586 | "twitter:description", 587 | } { 588 | if value, ok := values[name]; ok { 589 | metadataExcerpt = value 590 | break 591 | } 592 | } 593 | 594 | // get site name 595 | metadataSiteName := values["og:site_name"] 596 | 597 | // get image thumbnail 598 | metadataImage := "" 599 | for _, name := range []string{ 600 | "og:image", 601 | "image", 602 | "twitter:image", 603 | } { 604 | if value, ok := values[name]; ok { 605 | metadataImage = toAbsoluteURI(value, r.documentURI) 606 | break 607 | } 608 | } 609 | 610 | // get favicon 611 | metadataFavicon := r.getArticleFavicon() 612 | 613 | return Article{ 614 | Title: metadataTitle, 615 | Byline: metadataByline, 616 | Excerpt: metadataExcerpt, 617 | SiteName: metadataSiteName, 618 | Image: metadataImage, 619 | Favicon: metadataFavicon, 620 | } 621 | } 622 | 623 | // 
prepArticle prepares the article Node for display cleaning out any inline 624 | // CSS styles, iframes, forms and stripping extraneous paragraph tags

. 625 | func (r *Readability) prepArticle(articleContent *html.Node) { 626 | r.cleanStyles(articleContent) 627 | 628 | // Check for data tables before we continue, to avoid removing 629 | // items in those tables, which will often be isolated even 630 | // though they're visually linked to other content-ful elements 631 | // (text, images, etc.). 632 | r.markDataTables(articleContent) 633 | 634 | // Clean out junk from the article content 635 | r.cleanConditionally(articleContent, "form") 636 | r.cleanConditionally(articleContent, "fieldset") 637 | r.clean(articleContent, "object") 638 | r.clean(articleContent, "embed") 639 | r.clean(articleContent, "footer") 640 | r.clean(articleContent, "link") 641 | r.clean(articleContent, "aside") 642 | 643 | // Clean out elements have "share" in their id/class combinations 644 | // from final top candidates, which means we don't remove the top 645 | // candidates even they have "share". 646 | r.forEachNode(children(articleContent), func(topCandidate *html.Node, _ int) { 647 | r.cleanMatchedNodes(topCandidate, func(node *html.Node, nodeClassID string) bool { 648 | return rxShare.MatchString(nodeClassID) && len(textContent(node)) < r.CharThresholds 649 | }) 650 | }) 651 | 652 | // If there is only one h2 and its text content substantially 653 | // equals article title, they are probably using it as a header 654 | // and not a subheader, so remove it since we already extract 655 | // the title separately. 
656 | if h2s := getElementsByTagName(articleContent, "h2"); len(h2s) == 1 { 657 | h2 := h2s[0] 658 | h2Text := textContent(h2) 659 | lengthSimilarRate := float64(len(h2Text)-len(r.articleTitle)) / float64(len(r.articleTitle)) 660 | 661 | if math.Abs(lengthSimilarRate) < 0.5 { 662 | titlesMatch := false 663 | 664 | if lengthSimilarRate > 0 { 665 | titlesMatch = strings.Contains(h2Text, r.articleTitle) 666 | } else { 667 | titlesMatch = strings.Contains(r.articleTitle, h2Text) 668 | } 669 | 670 | if titlesMatch { 671 | r.clean(articleContent, "h2") 672 | } 673 | } 674 | } 675 | 676 | r.clean(articleContent, "iframe") 677 | r.clean(articleContent, "input") 678 | r.clean(articleContent, "textarea") 679 | r.clean(articleContent, "select") 680 | r.clean(articleContent, "button") 681 | r.cleanHeaders(articleContent) 682 | 683 | // Do these last as the previous stuff may have removed junk 684 | // that will affect these 685 | r.cleanConditionally(articleContent, "table") 686 | r.cleanConditionally(articleContent, "ul") 687 | r.cleanConditionally(articleContent, "div") 688 | 689 | // Remove extra paragraphs 690 | r.removeNodes(getElementsByTagName(articleContent, "p"), func(p *html.Node) bool { 691 | imgCount := len(getElementsByTagName(p, "img")) 692 | embedCount := len(getElementsByTagName(p, "embed")) 693 | objectCount := len(getElementsByTagName(p, "object")) 694 | 695 | // Nasty iframes have been removed, only remain embedded videos. 
696 | iframeCount := len(getElementsByTagName(p, "iframe")) 697 | totalCount := imgCount + embedCount + objectCount + iframeCount 698 | 699 | return totalCount == 0 && r.getInnerText(p, false) == "" 700 | }) 701 | 702 | r.forEachNode(getElementsByTagName(articleContent, "br"), func(br *html.Node, _ int) { 703 | next := r.nextElement(br.NextSibling) 704 | 705 | if next != nil && tagName(next) == "p" { 706 | br.Parent.RemoveChild(br) 707 | } 708 | }) 709 | 710 | // Remove single-cell tables 711 | r.forEachNode(getElementsByTagName(articleContent, "table"), func(table *html.Node, _ int) { 712 | tbody := table 713 | 714 | if r.hasSingleTagInsideElement(table, "tbody") { 715 | tbody = firstElementChild(table) 716 | } 717 | 718 | if r.hasSingleTagInsideElement(tbody, "tr") { 719 | row := firstElementChild(tbody) 720 | 721 | if r.hasSingleTagInsideElement(row, "td") { 722 | cell := firstElementChild(row) 723 | 724 | newTag := "div" 725 | 726 | if r.everyNode(childNodes(cell), r.isPhrasingContent) { 727 | newTag = "p" 728 | } 729 | 730 | r.setNodeTag(cell, newTag) 731 | 732 | replaceNode(table, cell) 733 | } 734 | } 735 | }) 736 | } 737 | 738 | // grabArticle uses a variety of metrics (content score, classname, element 739 | // types), find the content that is most likely to be the stuff a user wants to 740 | // read. Then return it wrapped up in a div. 741 | func (r *Readability) grabArticle() *html.Node { 742 | for { 743 | doc := cloneNode(r.doc) 744 | 745 | var page *html.Node 746 | if nodes := getElementsByTagName(doc, "body"); len(nodes) > 0 { 747 | page = nodes[0] 748 | } 749 | 750 | // We can not grab an article if we do not have a page. 751 | if page == nil { 752 | return nil 753 | } 754 | 755 | // First, node prepping. Trash nodes that look cruddy (like ones with 756 | // the class name "comment", etc), and turn divs into P tags where they 757 | // have been used inappropriately (as in, where they contain no other 758 | // block level elements). 
759 | var elementsToScore []*html.Node 760 | var node = documentElement(doc) 761 | 762 | for node != nil { 763 | matchString := className(node) + "\x20" + id(node) 764 | 765 | if !r.isProbablyVisible(node) { 766 | node = r.removeAndGetNext(node) 767 | continue 768 | } 769 | 770 | // Remove Node if it is a Byline. 771 | if r.checkByline(node, matchString) { 772 | node = r.removeAndGetNext(node) 773 | continue 774 | } 775 | 776 | // Remove unlikely candidates. 777 | nodeTagName := tagName(node) 778 | if r.flags.stripUnlikelys { 779 | if rxUnlikelyCandidates.MatchString(matchString) && 780 | !rxOkMaybeItsACandidate.MatchString(matchString) && 781 | !r.hasAncestorTag(node, "table", 3, nil) && 782 | nodeTagName != "body" && 783 | nodeTagName != "a" { 784 | node = r.removeAndGetNext(node) 785 | continue 786 | } 787 | } 788 | 789 | // Remove DIV, SECTION and HEADER nodes without any content. 790 | switch nodeTagName { 791 | case "div", 792 | "section", 793 | "header", 794 | "h1", 795 | "h2", 796 | "h3", 797 | "h4", 798 | "h5", 799 | "h6": 800 | if r.isElementWithoutContent(node) { 801 | node = r.removeAndGetNext(node) 802 | continue 803 | } 804 | } 805 | 806 | if indexOf(r.TagsToScore, nodeTagName) != -1 { 807 | elementsToScore = append(elementsToScore, node) 808 | } 809 | 810 | // Convert

without children block level elements into

. 811 | if nodeTagName == "div" { 812 | // Put phrasing content into paragraphs. 813 | var p *html.Node 814 | childNode := node.FirstChild 815 | 816 | for childNode != nil { 817 | nextSibling := childNode.NextSibling 818 | 819 | if r.isPhrasingContent(childNode) { 820 | if p != nil { 821 | appendChild(p, childNode) 822 | } else if !r.isWhitespace(childNode) { 823 | p = createElement("p") 824 | appendChild(p, cloneNode(childNode)) 825 | replaceNode(childNode, p) 826 | } 827 | } else if p != nil { 828 | for p.LastChild != nil && r.isWhitespace(p.LastChild) { 829 | p.RemoveChild(p.LastChild) 830 | } 831 | p = nil 832 | } 833 | 834 | childNode = nextSibling 835 | } 836 | 837 | // Sites like http://mobile.slate.com encloses each paragraph 838 | // with a DIV element. DIVs with only a P element inside and no 839 | // text content can be safely converted into plain P elements to 840 | // avoid confusing the scoring algorithm with DIVs with are, in 841 | // practice, paragraphs. 842 | if r.hasSingleTagInsideElement(node, "p") && r.getLinkDensity(node) < 0.25 { 843 | newNode := children(node)[0] 844 | replaceNode(node, newNode) 845 | node = newNode 846 | elementsToScore = append(elementsToScore, node) 847 | } else if !r.hasChildBlockElement(node) { 848 | r.setNodeTag(node, "p") 849 | elementsToScore = append(elementsToScore, node) 850 | } 851 | } 852 | 853 | node = r.getNextNode(node, false) 854 | } 855 | 856 | // Loop through all paragraphs and assign a score to them based on how 857 | // much relevant content they have. Then add their score to their parent 858 | // node. A score is determined by things like number of commas, class 859 | // names, etc. Maybe eventually link density. 860 | var candidates []*html.Node 861 | r.forEachNode(elementsToScore, func(elementToScore *html.Node, _ int) { 862 | if elementToScore.Parent == nil || tagName(elementToScore.Parent) == "" { 863 | return 864 | } 865 | 866 | // If this paragraph is less than 25 characters, don't even count it. 
867 | innerText := r.getInnerText(elementToScore, true) 868 | if len(innerText) < 25 { 869 | return 870 | } 871 | 872 | // Exclude nodes with no ancestor. 873 | ancestors := r.getNodeAncestors(elementToScore, 3) 874 | if len(ancestors) == 0 { 875 | return 876 | } 877 | 878 | // Add a point for the paragraph itself as a base. 879 | contentScore := 1 880 | 881 | // Add points for any commas within this paragraph. 882 | contentScore += strings.Count(innerText, ",") 883 | 884 | // For every 100 characters in this paragraph, add another point. Up to 3 points. 885 | contentScore += int(math.Min(math.Floor(float64(len(innerText))/100.0), 3.0)) 886 | 887 | // Initialize and score ancestors. 888 | r.forEachNode(ancestors, func(ancestor *html.Node, level int) { 889 | if tagName(ancestor) == "" || ancestor.Parent == nil || ancestor.Parent.Type != html.ElementNode { 890 | return 891 | } 892 | 893 | if !r.hasContentScore(ancestor) { 894 | r.initializeNode(ancestor) 895 | candidates = append(candidates, ancestor) 896 | } 897 | 898 | // Node score divider: 899 | // - parent: 1 (no division) 900 | // - grandparent: 2 901 | // - great grandparent+: ancestor level * 3 902 | scoreDivider := 1 903 | switch level { 904 | case 0: 905 | scoreDivider = 1 906 | case 1: 907 | scoreDivider = 2 908 | default: 909 | scoreDivider = level * 3 910 | } 911 | 912 | ancestorScore := r.getContentScore(ancestor) 913 | ancestorScore += float64(contentScore) / float64(scoreDivider) 914 | 915 | r.setContentScore(ancestor, ancestorScore) 916 | }) 917 | }) 918 | 919 | // These lines are a bit different compared to Readability.js. 920 | // 921 | // In Readability.js, they fetch NTopCandidates utilising array method 922 | // like `splice` and `pop`. In Go, array method like that is not as 923 | // simple, especially since we are working with pointer. So, here we 924 | // simply sort top candidates, and limit it to max NTopCandidates. 925 | 926 | // Scale the final candidates score based on link density. 
Good 927 | // content should have a relatively small link density (5% or 928 | // less) and be mostly unaffected by this operation. 929 | for i := 0; i < len(candidates); i++ { 930 | candidate := candidates[i] 931 | candidateScore := r.getContentScore(candidate) * (1 - r.getLinkDensity(candidate)) 932 | r.setContentScore(candidate, candidateScore) 933 | } 934 | 935 | // After we have calculated scores, sort through all of the possible 936 | // candidate nodes we found and find the one with the highest score. 937 | sort.Slice(candidates, func(i int, j int) bool { 938 | return r.getContentScore(candidates[i]) > r.getContentScore(candidates[j]) 939 | }) 940 | 941 | var topCandidates []*html.Node 942 | 943 | if len(candidates) > r.NTopCandidates { 944 | topCandidates = candidates[:r.NTopCandidates] 945 | } else { 946 | topCandidates = candidates 947 | } 948 | 949 | var topCandidate, parentOfTopCandidate *html.Node 950 | neededToCreateTopCandidate := false 951 | if len(topCandidates) > 0 { 952 | topCandidate = topCandidates[0] 953 | } 954 | 955 | // If we still have no top candidate, just use the body as a last 956 | // resort. We also have to copy the body node so it is something 957 | // we can modify. 958 | if topCandidate == nil || tagName(topCandidate) == "body" { 959 | // Move all of the page's children into topCandidate 960 | topCandidate = createElement("div") 961 | neededToCreateTopCandidate = true 962 | // Move everything (not just elements, also text nodes etc.) 
963 | // into the container so we even include text directly in the body: 964 | kids := childNodes(page) 965 | for i := 0; i < len(kids); i++ { 966 | appendChild(topCandidate, kids[i]) 967 | } 968 | 969 | appendChild(page, topCandidate) 970 | r.initializeNode(topCandidate) 971 | } else if topCandidate != nil { 972 | // Find a better top candidate node if it contains (at least three) 973 | // nodes which belong to `topCandidates` array and whose scores are 974 | // quite closed with current `topCandidate` node. 975 | topCandidateScore := r.getContentScore(topCandidate) 976 | var alternativeCandidateAncestors [][]*html.Node 977 | for i := 1; i < len(topCandidates); i++ { 978 | if r.getContentScore(topCandidates[i])/topCandidateScore >= 0.75 { 979 | topCandidateAncestors := r.getNodeAncestors(topCandidates[i], 0) 980 | alternativeCandidateAncestors = append(alternativeCandidateAncestors, topCandidateAncestors) 981 | } 982 | } 983 | 984 | minimumTopCandidates := 3 985 | if len(alternativeCandidateAncestors) >= minimumTopCandidates { 986 | parentOfTopCandidate = topCandidate.Parent 987 | for parentOfTopCandidate != nil && tagName(parentOfTopCandidate) != "body" { 988 | listContainingThisAncestor := 0 989 | for ancestorIndex := 0; ancestorIndex < len(alternativeCandidateAncestors) && listContainingThisAncestor < minimumTopCandidates; ancestorIndex++ { 990 | if includeNode(alternativeCandidateAncestors[ancestorIndex], parentOfTopCandidate) { 991 | listContainingThisAncestor++ 992 | } 993 | } 994 | 995 | if listContainingThisAncestor >= minimumTopCandidates { 996 | topCandidate = parentOfTopCandidate 997 | break 998 | } 999 | 1000 | parentOfTopCandidate = parentOfTopCandidate.Parent 1001 | } 1002 | } 1003 | 1004 | if !r.hasContentScore(topCandidate) { 1005 | r.initializeNode(topCandidate) 1006 | } 1007 | 1008 | // Because of our bonus system, parents of candidates might 1009 | // have scores themselves. They get half of the node. 
There 1010 | // won't be nodes with higher scores than our topCandidate, 1011 | // but if we see the score going *up* in the first few steps * 1012 | // up the tree, that's a decent sign that there might be more 1013 | // content lurking in other places that we want to unify in. 1014 | // The sibling stuff below does some of that - but only if 1015 | // we've looked high enough up the DOM tree. 1016 | parentOfTopCandidate = topCandidate.Parent 1017 | lastScore := r.getContentScore(topCandidate) 1018 | // The scores shouldn't get too lor. 1019 | scoreThreshold := lastScore / 3.0 1020 | for parentOfTopCandidate != nil && tagName(parentOfTopCandidate) != "body" { 1021 | if !r.hasContentScore(parentOfTopCandidate) { 1022 | parentOfTopCandidate = parentOfTopCandidate.Parent 1023 | continue 1024 | } 1025 | 1026 | parentScore := r.getContentScore(parentOfTopCandidate) 1027 | if parentScore < scoreThreshold { 1028 | break 1029 | } 1030 | 1031 | if parentScore > lastScore { 1032 | // Alright! We found a better parent to use. 1033 | topCandidate = parentOfTopCandidate 1034 | break 1035 | } 1036 | 1037 | lastScore = parentScore 1038 | parentOfTopCandidate = parentOfTopCandidate.Parent 1039 | } 1040 | 1041 | // If the top candidate is the only child, use parent 1042 | // instead. This will help sibling joining logic when 1043 | // adjacent content is actually located in parent's 1044 | // sibling node. 1045 | parentOfTopCandidate = topCandidate.Parent 1046 | for parentOfTopCandidate != nil && tagName(parentOfTopCandidate) != "body" && len(children(parentOfTopCandidate)) == 1 { 1047 | topCandidate = parentOfTopCandidate 1048 | parentOfTopCandidate = topCandidate.Parent 1049 | } 1050 | 1051 | if !r.hasContentScore(topCandidate) { 1052 | r.initializeNode(topCandidate) 1053 | } 1054 | } 1055 | 1056 | // Now that we have the top candidate, look through its siblings 1057 | // for content that might also be related. 
Things like preambles, 1058 | // content split by ads that we removed, etc. 1059 | articleContent := createElement("div") 1060 | siblingScoreThreshold := math.Max(10, r.getContentScore(topCandidate)*0.2) 1061 | 1062 | // Keep potential top candidate's parent node to try to get text direction of it later. 1063 | topCandidateScore := r.getContentScore(topCandidate) 1064 | topCandidateClassName := className(topCandidate) 1065 | 1066 | parentOfTopCandidate = topCandidate.Parent 1067 | siblings := children(parentOfTopCandidate) 1068 | for s := 0; s < len(siblings); s++ { 1069 | sibling := siblings[s] 1070 | appendNode := false 1071 | 1072 | if sibling == topCandidate { 1073 | appendNode = true 1074 | } else { 1075 | contentBonus := float64(0) 1076 | 1077 | // Give a bonus if sibling nodes and top candidates have the example same classname 1078 | if className(sibling) == topCandidateClassName && topCandidateClassName != "" { 1079 | contentBonus += topCandidateScore * 0.2 1080 | } 1081 | 1082 | if r.hasContentScore(sibling) && r.getContentScore(sibling)+contentBonus >= siblingScoreThreshold { 1083 | appendNode = true 1084 | } else if tagName(sibling) == "p" { 1085 | linkDensity := r.getLinkDensity(sibling) 1086 | nodeContent := r.getInnerText(sibling, true) 1087 | nodeLength := len(nodeContent) 1088 | 1089 | if nodeLength > 80 && linkDensity < 0.25 { 1090 | appendNode = true 1091 | } else if nodeLength < 80 && nodeLength > 0 && linkDensity == 0 && 1092 | rxSentencePeriod.MatchString(nodeContent) { 1093 | appendNode = true 1094 | } 1095 | } 1096 | } 1097 | 1098 | if appendNode { 1099 | // We have a node that is not a common block level element, 1100 | // like a FORM or TD tag. Turn it into a DIV so it does not get 1101 | // filtered out later by accident. 
1102 | if indexOf(alterToDivExceptions, tagName(sibling)) == -1 { 1103 | r.setNodeTag(sibling, "div") 1104 | } 1105 | 1106 | appendChild(articleContent, sibling) 1107 | } 1108 | } 1109 | 1110 | // So we have all of the content that we need. Now we clean 1111 | // it up for presentation. 1112 | r.prepArticle(articleContent) 1113 | 1114 | if neededToCreateTopCandidate { 1115 | // We already created a fake DIV thing, and there would not have 1116 | // been any siblings left for the previous loop, so there is no 1117 | // point trying to create a new DIV and then move all the children 1118 | // over. Just assign IDs and CSS class names here. No need to append 1119 | // because that already happened anyway. 1120 | // 1121 | // By the way, this line is different with Readability.js. 1122 | // 1123 | // In Readability.js, when using `appendChild`, the node is still 1124 | // referenced. Meanwhile here, our `appendChild` will clone the 1125 | // node, put it in the new place, then delete the original. 1126 | firstChild := firstElementChild(articleContent) 1127 | if firstChild != nil && tagName(firstChild) == "div" { 1128 | setAttribute(firstChild, "id", "readability-page-1") 1129 | setAttribute(firstChild, "class", "page") 1130 | } 1131 | } else { 1132 | div := createElement("div") 1133 | 1134 | setAttribute(div, "id", "readability-page-1") 1135 | setAttribute(div, "class", "page") 1136 | 1137 | childs := childNodes(articleContent) 1138 | 1139 | for i := 0; i < len(childs); i++ { 1140 | appendChild(div, childs[i]) 1141 | } 1142 | 1143 | appendChild(articleContent, div) 1144 | } 1145 | 1146 | parseSuccessful := true 1147 | 1148 | // Now that we've gone through the full algorithm, check to see if we 1149 | // got any meaningful content. If we did not, we may need to re-run 1150 | // grabArticle with different flags set. 
This gives us a higher 1151 | // likelihood of finding the content, and the sieve approach gives us a 1152 | // higher likelihood of finding the -right- content. 1153 | textLength := len(r.getInnerText(articleContent, true)) 1154 | if textLength < r.CharThresholds { 1155 | parseSuccessful = false 1156 | 1157 | if r.flags.stripUnlikelys { 1158 | r.flags.stripUnlikelys = false 1159 | r.attempts = append(r.attempts, parseAttempt{ 1160 | articleContent: articleContent, 1161 | textLength: textLength, 1162 | }) 1163 | } else if r.flags.useWeightClasses { 1164 | r.flags.useWeightClasses = false 1165 | r.attempts = append(r.attempts, parseAttempt{ 1166 | articleContent: articleContent, 1167 | textLength: textLength, 1168 | }) 1169 | } else if r.flags.cleanConditionally { 1170 | r.flags.cleanConditionally = false 1171 | r.attempts = append(r.attempts, parseAttempt{ 1172 | articleContent: articleContent, 1173 | textLength: textLength, 1174 | }) 1175 | } else { 1176 | r.attempts = append(r.attempts, parseAttempt{ 1177 | articleContent: articleContent, 1178 | textLength: textLength, 1179 | }) 1180 | 1181 | // No luck after removing flags, just return the 1182 | // longest text we found during the different loops * 1183 | sort.Slice(r.attempts, func(i, j int) bool { 1184 | return r.attempts[i].textLength > r.attempts[j].textLength 1185 | }) 1186 | 1187 | // But first check if we actually have something 1188 | if r.attempts[0].textLength == 0 { 1189 | return nil 1190 | } 1191 | 1192 | articleContent = r.attempts[0].articleContent 1193 | parseSuccessful = true 1194 | } 1195 | } 1196 | 1197 | if parseSuccessful { 1198 | return articleContent 1199 | } 1200 | } 1201 | } 1202 | 1203 | // initializeNode initializes a node with the readability score. Also checks 1204 | // the className/id for special names to add to its score. 
func (r *Readability) initializeNode(node *html.Node) {
	// Start from the class/ID weight so obviously good or bad class
	// names bias the score from the beginning.
	contentScore := float64(r.getClassWeight(node))

	switch tagName(node) {
	case "div":
		contentScore += 5
	case "pre", "td", "blockquote":
		contentScore += 3
	case "address", "ol", "ul", "dl", "dd", "dt", "li", "form":
		contentScore -= 3
	case "h1", "h2", "h3", "h4", "h5", "h6", "th":
		contentScore -= 5
	}

	r.setContentScore(node, contentScore)
}

// removeAndGetNext removes the node and returns its next node, following
// the same depth-first order used by getNextNode.
func (r *Readability) removeAndGetNext(node *html.Node) *html.Node {
	// Compute the successor before detaching; once removed the node no
	// longer has sibling/parent links to traverse from.
	nextNode := r.getNextNode(node, true)

	if node.Parent != nil {
		node.Parent.RemoveChild(node)
	}

	return nextNode
}

// getNextNode traverses the DOM from node to node, starting at the node passed
// in. Pass true for the second parameter to indicate this node itself (and its
// kids) are going away, and we want the next node over. Calling this in a loop
// will traverse the DOM depth-first.
//
// In Readability.js, ignoreSelfAndKids default to false.
func (r *Readability) getNextNode(node *html.Node, ignoreSelfAndKids bool) *html.Node {
	// First check for kids if those are not being ignored
	if firstChild := firstElementChild(node); !ignoreSelfAndKids && firstChild != nil {
		return firstChild
	}

	// Then for siblings...
	if sibling := nextElementSibling(node); sibling != nil {
		return sibling
	}

	// And finally, move up the parent chain *and* find a sibling
	// (because this is depth-first traversal, we will have already
	// seen the parent nodes themselves).
	for {
		node = node.Parent
		if node == nil || nextElementSibling(node) != nil {
			break
		}
	}

	if node != nil {
		return nextElementSibling(node)
	}

	return nil
}

// isValidByline checks whether the input string could be a byline: it must
// be non-empty after trimming and shorter than 100 characters.
func (r *Readability) isValidByline(byline string) bool {
	byline = strings.TrimSpace(byline)
	return len(byline) > 0 && len(byline) < 100
}

// checkByline determines if a node is used as byline. When it is, the
// whitespace-normalized node text is stored in r.articleByline and true
// is returned. Only the first byline found in the document is kept.
func (r *Readability) checkByline(node *html.Node, matchString string) bool {
	// A byline was already captured earlier; do not overwrite it.
	if r.articleByline != "" {
		return false
	}

	rel := getAttribute(node, "rel")
	itemprop := getAttribute(node, "itemprop")
	nodeText := textContent(node)
	if (rel == "author" || strings.Contains(itemprop, "author") || rxByline.MatchString(matchString)) && r.isValidByline(nodeText) {
		// Collapse all interior whitespace runs to single spaces.
		nodeText = strings.TrimSpace(nodeText)
		nodeText = strings.Join(strings.Fields(nodeText), "\x20")
		r.articleByline = nodeText
		return true
	}

	return false
}

// getNodeAncestors gets the node's direct parent and grandparents, closest
// ancestor first. A maxDepth <= 0 means no depth limit.
//
// In Readability.js, maxDepth default to 0.
func (r *Readability) getNodeAncestors(node *html.Node, maxDepth int) []*html.Node {
	level := 0
	ancestors := []*html.Node{}

	for node.Parent != nil {
		level++
		ancestors = append(ancestors, node.Parent)

		if maxDepth > 0 && level == maxDepth {
			break
		}

		node = node.Parent
	}

	return ancestors
}

// setContentScore sets the readability score for a node.
func (r *Readability) setContentScore(node *html.Node, score float64) {
	// The score is persisted as a data attribute so it survives node
	// cloning and can be read back later with getContentScore.
	setAttribute(node, "data-readability-score", fmt.Sprintf("%.4f", score))
}

// hasContentScore checks if node has readability score.
func (r *Readability) hasContentScore(node *html.Node) bool {
	return hasAttribute(node, "data-readability-score")
}

// getContentScore gets the readability score of a node. It returns 0 when
// the node has no score attribute or the stored value cannot be parsed.
func (r *Readability) getContentScore(node *html.Node) float64 {
	strScore := getAttribute(node, "data-readability-score")
	strScore = strings.TrimSpace(strScore)

	if strScore == "" {
		return 0
	}

	score, err := strconv.ParseFloat(strScore, 64)

	if err != nil {
		return 0
	}

	return score
}

// removeScripts removes script and noscript tags from the document.
func (r *Readability) removeScripts(doc *html.Node) {
	r.removeNodes(getElementsByTagName(doc, "script"), nil)
	r.removeNodes(getElementsByTagName(doc, "noscript"), nil)
}

// hasSingleTagInsideElement check if the node has only whitespace and a single
// element with given tag. Returns false if the DIV Node contains non-empty text
// nodes or if it contains no element with given tag or more than 1 element.
func (r *Readability) hasSingleTagInsideElement(element *html.Node, tag string) bool {
	// There should be exactly 1 element child with given tag
	if childs := children(element); len(childs) != 1 || tagName(childs[0]) != tag {
		return false
	}

	// And there should be no text nodes with real content
	return !r.someNode(childNodes(element), func(node *html.Node) bool {
		return node.Type == html.TextNode && rxHasContent.MatchString(textContent(node))
	})
}

// isElementWithoutContent determines if node is empty. A node is considered
// empty if there is nothing inside or if the only things inside are HTML
// break tags (<br>) and HTML horizontal rule tags (<hr>).
func (r *Readability) isElementWithoutContent(node *html.Node) bool {
	brs := getElementsByTagName(node, "br")
	hrs := getElementsByTagName(node, "hr")
	childs := children(node)

	return node.Type == html.ElementNode &&
		strings.TrimSpace(textContent(node)) == "" &&
		(len(childs) == 0 || len(childs) == len(brs)+len(hrs))
}

// hasChildBlockElement determines whether element has any children block level
// elements. The check recurses through the whole subtree.
func (r *Readability) hasChildBlockElement(element *html.Node) bool {
	return r.someNode(childNodes(element), func(node *html.Node) bool {
		return indexOf(divToPElems, tagName(node)) != -1 ||
			r.hasChildBlockElement(node)
	})
}

// isPhrasingContent determines if a node qualifies as phrasing content.
//
// See: https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
func (r *Readability) isPhrasingContent(node *html.Node) bool {
	if node.Type == html.TextNode {
		return true
	}

	tag := tagName(node)

	if indexOf(phrasingElems, tag) != -1 {
		return true
	}

	// a, del, and ins elements count as phrasing content only when all
	// of their children are phrasing content as well.
	return ((tag == "a" || tag == "del" || tag == "ins") &&
		r.everyNode(childNodes(node), r.isPhrasingContent))
}

// isWhitespace determines if a node only used as whitespace: either a text
// node containing nothing but whitespace, or a br element.
func (r *Readability) isWhitespace(node *html.Node) bool {
	return (node.Type == html.TextNode && strings.TrimSpace(textContent(node)) == "") ||
		(node.Type == html.ElementNode && tagName(node) == "br")
}

// getInnerText gets the inner text of a node.
// This also strips out any excess whitespace to be found.
// In Readability.js, normalizeSpaces default to true.
1411 | func (r *Readability) getInnerText(node *html.Node, normalizeSpaces bool) string { 1412 | textContent := strings.TrimSpace(textContent(node)) 1413 | 1414 | if normalizeSpaces { 1415 | textContent = rxNormalize.ReplaceAllString(textContent, "\x20") 1416 | } 1417 | 1418 | return textContent 1419 | } 1420 | 1421 | // getCharCount returns the number of times a string appears in the Node. 1422 | func (r *Readability) getCharCount(node *html.Node, s string) int { 1423 | innerText := r.getInnerText(node, true) 1424 | return strings.Count(innerText, s) 1425 | } 1426 | 1427 | // cleanStyles removes the style attribute on every node and under. 1428 | func (r *Readability) cleanStyles(node *html.Node) { 1429 | nodeTagName := tagName(node) 1430 | 1431 | if node == nil || nodeTagName == "svg" { 1432 | return 1433 | } 1434 | 1435 | // Remove `style` and deprecated presentational attributes 1436 | for i := 0; i < len(presentationalAttributes); i++ { 1437 | removeAttribute(node, presentationalAttributes[i]) 1438 | } 1439 | 1440 | if indexOf(deprecatedSizeAttributeElems, nodeTagName) != -1 { 1441 | removeAttribute(node, "width") 1442 | removeAttribute(node, "height") 1443 | } 1444 | 1445 | for child := firstElementChild(node); child != nil; child = nextElementSibling(child) { 1446 | r.cleanStyles(child) 1447 | } 1448 | } 1449 | 1450 | // getLinkDensity gets the density of links as a percentage of the content. 1451 | // This is the amount of text that is inside a link divided by the total text 1452 | // in the node. 
func (r *Readability) getLinkDensity(element *html.Node) float64 {
	textLength := len(r.getInnerText(element, true))

	// Avoid division by zero for nodes without any text.
	if textLength == 0 {
		return 0
	}

	linkLength := 0

	r.forEachNode(getElementsByTagName(element, "a"), func(linkNode *html.Node, _ int) {
		linkLength += len(r.getInnerText(linkNode, true))
	})

	return float64(linkLength) / float64(textLength)
}

// getClassWeight gets an elements class/id weight. Uses regular expressions to
// tell if this element looks good or bad. Returns 0 when weight classes are
// disabled via the parser flags.
func (r *Readability) getClassWeight(node *html.Node) int {
	if !r.flags.useWeightClasses {
		return 0
	}

	weight := 0

	// Look for a special classname
	if nodeClassName := className(node); nodeClassName != "" {
		if rxNegative.MatchString(nodeClassName) {
			weight -= 25
		}

		if rxPositive.MatchString(nodeClassName) {
			weight += 25
		}
	}

	// Look for a special ID
	if nodeID := id(node); nodeID != "" {
		if rxNegative.MatchString(nodeID) {
			weight -= 25
		}

		if rxPositive.MatchString(nodeID) {
			weight += 25
		}
	}

	return weight
}

// clean cleans a node of all elements of type "tag".
func (r *Readability) clean(node *html.Node, tag string) {
	isEmbed := indexOf([]string{"object", "embed", "iframe"}, tag) != -1

	r.removeNodes(getElementsByTagName(node, tag), func(element *html.Node) bool {
		// Allow YouTube and Vimeo videos through as people usually want to see those.
		if isEmbed {
			// Check the attributes to see if any of them contain YouTube or Vimeo.
			for _, attr := range element.Attr {
				if rxVideos.MatchString(attr.Val) {
					return false
				}
			}

			// For embed with <object> tag, check inner HTML as well.
			if tagName(element) == "object" && rxVideos.MatchString(innerHTML(element)) {
				return false
			}
		}

		return true
	})
}

// hasAncestorTag checks if a given node has one of its ancestor tag name
// matching the provided one. Pass maxDepth <= 0 for no depth limit, and an
// optional filterFn to require extra conditions on the matching ancestor.
//
// In Readability.js, default value for maxDepth is 3.
func (r *Readability) hasAncestorTag(node *html.Node, tag string, maxDepth int, filterFn func(*html.Node) bool) bool {
	depth := 0

	for node.Parent != nil {
		if maxDepth > 0 && depth > maxDepth {
			return false
		}

		if tagName(node.Parent) == tag && (filterFn == nil || filterFn(node.Parent)) {
			return true
		}

		node = node.Parent

		depth++
	}

	return false
}

// getRowAndColumnCount returns how many rows and columns this table has,
// honoring rowspan/colspan attributes (missing or zero spans count as 1).
func (r *Readability) getRowAndColumnCount(table *html.Node) (int, int) {
	rows := 0
	columns := 0
	trs := getElementsByTagName(table, "tr")

	for i := 0; i < len(trs); i++ {
		strRowSpan := getAttribute(trs[i], "rowspan")
		rowSpan, _ := strconv.Atoi(strRowSpan)

		if rowSpan == 0 {
			rowSpan = 1
		}

		rows += rowSpan

		// Now look for column-related info
		columnsInThisRow := 0
		cells := getElementsByTagName(trs[i], "td")

		for j := 0; j < len(cells); j++ {
			strColSpan := getAttribute(cells[j], "colspan")
			colSpan, _ := strconv.Atoi(strColSpan)

			if colSpan == 0 {
				colSpan = 1
			}

			columnsInThisRow += colSpan
		}

		// The column count of the table is the widest row seen.
		if columnsInThisRow > columns {
			columns = columnsInThisRow
		}
	}

	return rows, columns
}

// isReadabilityDataTable determines if a Node is a data table.
func (r *Readability) isReadabilityDataTable(node *html.Node) bool {
	return hasAttribute(node, "data-readability-table")
}

// setReadabilityDataTable marks whether a Node is data table or not.
func (r *Readability) setReadabilityDataTable(node *html.Node, isDataTable bool) {
	if isDataTable {
		setAttribute(node, "data-readability-table", "true")
		return
	}

	removeAttribute(node, "data-readability-table")
}

// markDataTables looks for "data" (as opposed to "layout") tables and mark it.
// The heuristics run in order: explicit hints (role, datatable, summary,
// caption, data-table descendants), then nesting, then sheer size.
func (r *Readability) markDataTables(root *html.Node) {
	tables := getElementsByTagName(root, "table")

	for i := 0; i < len(tables); i++ {
		table := tables[i]

		// role="presentation" explicitly marks a layout table.
		role := getAttribute(table, "role")
		if role == "presentation" {
			r.setReadabilityDataTable(table, false)
			continue
		}

		datatable := getAttribute(table, "datatable")
		if datatable == "0" {
			r.setReadabilityDataTable(table, false)
			continue
		}

		// A summary attribute is a strong signal of a data table.
		if hasAttribute(table, "summary") {
			r.setReadabilityDataTable(table, true)
			continue
		}

		// A non-empty caption also indicates a data table.
		if captions := getElementsByTagName(table, "caption"); len(captions) > 0 {
			if caption := captions[0]; caption != nil && len(childNodes(caption)) > 0 {
				r.setReadabilityDataTable(table, true)
				continue
			}
		}

		// If the table has a descendant with any of these tags, consider a data table:
		hasDataTableDescendantTags := false
		for _, descendantTag := range []string{"col", "colgroup", "tfoot", "thead", "th"} {
			descendants := getElementsByTagName(table, descendantTag)
			if len(descendants) > 0 && descendants[0] != nil {
				hasDataTableDescendantTags = true
				break
			}
		}

		if hasDataTableDescendantTags {
			r.setReadabilityDataTable(table, true)
			continue
		}

		// Nested tables indicates a layout table:
		if len(getElementsByTagName(table, "table")) > 0 {
			r.setReadabilityDataTable(table, false)
			continue
		}

		rows, columns := r.getRowAndColumnCount(table)

		if rows >= 10 || columns > 4 {
			r.setReadabilityDataTable(table, true)
			continue
		}

		// Now just go by size entirely:
		if rows*columns > 10 {
			r.setReadabilityDataTable(table, true)
		}
	}
}

// cleanConditionally cleans an element of all tags of type "tag" if they look
// fishy. "Fishy" is an algorithm based on content length, classnames, link
// density, number of images & embeds, etc. The removal callback returns true
// for nodes that should be deleted.
func (r *Readability) cleanConditionally(element *html.Node, tag string) {
	if !r.flags.cleanConditionally {
		return
	}

	isList := tag == "ul" || tag == "ol"

	// Gather counts for other typical elements embedded within. Traverse
	// backwards so we can remove nodes at the same time without effecting
	// the traversal.
	r.removeNodes(getElementsByTagName(element, tag), func(node *html.Node) bool {
		// Never remove data tables or content living inside one.
		if tag == "table" && r.isReadabilityDataTable(node) {
			return false
		}

		if r.hasAncestorTag(node, "table", -1, r.isReadabilityDataTable) {
			return false
		}

		weight := r.getClassWeight(node)
		if weight < 0 {
			return true
		}

		if r.getCharCount(node, ",") < 10 {
			// If there are not many commas and the number of non-paragraph
			// elements is more than paragraphs or other ominous signs, remove
			// the element.
			p := float64(len(getElementsByTagName(node, "p")))
			img := float64(len(getElementsByTagName(node, "img")))
			// NOTE(review): the -100 offset matches Readability.js, which
			// uses the same constant; it effectively neutralizes the
			// "li > p" condition for lists of fewer than ~100 items.
			li := float64(len(getElementsByTagName(node, "li")) - 100)
			input := float64(len(getElementsByTagName(node, "input")))

			embedCount := 0
			embeds := r.concatNodeLists(
				getElementsByTagName(node, "object"),
				getElementsByTagName(node, "embed"),
				getElementsByTagName(node, "iframe"),
			)

			for _, embed := range embeds {
				// Do not delete if Embed has attribute matching Video regex.
				for _, attr := range embed.Attr {
					if rxVideos.MatchString(attr.Val) {
						return false
					}
				}

				// For embed with <object> tag, check inner HTML as well.
				if tagName(embed) == "object" && rxVideos.MatchString(innerHTML(embed)) {
					return false
				}

				embedCount++
			}

			linkDensity := r.getLinkDensity(node)
			contentLength := len(r.getInnerText(node, true))

			// Remove when any of the "fishy" signals fire.
			return (img > 1 && p/img < 0.5 && !r.hasAncestorTag(node, "figure", 3, nil)) ||
				(!isList && li > p) ||
				(input > math.Floor(p/3)) ||
				(!isList && contentLength < 25 && (img == 0 || img > 2) && !r.hasAncestorTag(node, "figure", 3, nil)) ||
				(!isList && weight < 25 && linkDensity > 0.2) ||
				(weight >= 25 && linkDensity > 0.5) ||
				((embedCount == 1 && contentLength < 75) || embedCount > 1)
		}

		return false
	})
}

// cleanMatchedNodes cleans out elements whose ID and CSS class combinations
// match specific string.
func (r *Readability) cleanMatchedNodes(e *html.Node, filter func(*html.Node, string) bool) {
	// Stop once the depth-first walk leaves e's subtree.
	endOfSearchMarkerNode := r.getNextNode(e, true)
	next := r.getNextNode(e, false)

	for next != nil && next != endOfSearchMarkerNode {
		if filter != nil && filter(next, className(next)+"\x20"+id(next)) {
			next = r.removeAndGetNext(next)
		} else {
			next = r.getNextNode(next, false)
		}
	}
}

// cleanHeaders cleans out spurious headers from an Element. Checks things like
// classnames and link density. Only h1 and h2 are inspected, matching the
// range used by Readability.js.
func (r *Readability) cleanHeaders(e *html.Node) {
	for headerIndex := 1; headerIndex < 3; headerIndex++ {
		headerTag := fmt.Sprintf("h%d", headerIndex)

		r.removeNodes(getElementsByTagName(e, headerTag), func(header *html.Node) bool {
			return r.getClassWeight(header) < 0
		})
	}
}

// isProbablyVisible determines if a node is visible: no display:none in its
// inline style, no `hidden` attribute, and no aria-hidden="true" (unless the
// node carries the "fallback-image" class).
func (r *Readability) isProbablyVisible(node *html.Node) bool {
	nodeStyle := getAttribute(node, "style")
	nodeAriaHidden := getAttribute(node, "aria-hidden")
	className := getAttribute(node, "class")

	return (nodeStyle == "" || !rxDisplayNone.MatchString(nodeStyle)) &&
		!hasAttribute(node, "hidden") &&
		(nodeAriaHidden == "" ||
			nodeAriaHidden != "true" ||
			strings.Contains(className, "fallback-image"))
}

// fixRelativeURIs converts each <a> and <img> URI in the given element to an
// absolute URI, ignoring #ref URIs.
func (r *Readability) fixRelativeURIs(articleContent *html.Node) {
	links := r.getAllNodesWithTag(articleContent, "a")

	r.forEachNode(links, func(link *html.Node, _ int) {
		href := getAttribute(link, "href")

		if href == "" {
			return
		}

		// Replace links with javascript: URIs with text content, since they
		// will not work after scripts have been removed from the page.
		if strings.HasPrefix(href, "javascript:") {
			text := createTextNode(textContent(link))
			replaceNode(link, text)
			return
		}

		newHref := toAbsoluteURI(href, r.documentURI)

		// If the URI cannot be resolved, drop the attribute entirely.
		if newHref == "" {
			removeAttribute(link, "href")
			return
		}

		setAttribute(link, "href", newHref)
	})

	imgs := r.getAllNodesWithTag(articleContent, "img")

	r.forEachNode(imgs, func(img *html.Node, _ int) {
		src := getAttribute(img, "src")

		if src == "" {
			return
		}

		newSrc := toAbsoluteURI(src, r.documentURI)

		if newSrc == "" {
			removeAttribute(img, "src")
			return
		}

		setAttribute(img, "src", newSrc)
	})
}

// cleanClasses removes the class="" attribute from every element in the given
// subtree, except those that match CLASSES_TO_PRESERVE and classesToPreserve
// array from the options object.
func (r *Readability) cleanClasses(node *html.Node) {
	nodeClassName := className(node)
	preservedClassName := []string{}

	// Keep only classes explicitly whitelisted in ClassesToPreserve.
	for _, class := range strings.Fields(nodeClassName) {
		if indexOf(r.ClassesToPreserve, class) != -1 {
			preservedClassName = append(preservedClassName, class)
		}
	}

	if len(preservedClassName) > 0 {
		setAttribute(node, "class", strings.Join(preservedClassName, "\x20"))
	} else {
		removeAttribute(node, "class")
	}

	for child := firstElementChild(node); child != nil; child = nextElementSibling(child) {
		r.cleanClasses(child)
	}
}

// clearReadabilityAttr removes Readability attribute created by the parser
// (score and data-table markers) from the node and its whole subtree.
func (r *Readability) clearReadabilityAttr(node *html.Node) {
	removeAttribute(node, "data-readability-score")
	removeAttribute(node, "data-readability-table")

	for child := firstElementChild(node); child != nil; child = nextElementSibling(child) {
		r.clearReadabilityAttr(child)
	}
}

// isSingleImage reports whether node is an img element, or wraps exactly one
// child (recursively) that is, with no other non-whitespace text content.
func (r *Readability) isSingleImage(node *html.Node) bool {
	if tagName(node) == "img" {
		return true
	}

	children := children(node)
	textContent := textContent(node)
	if len(children) != 1 || strings.TrimSpace(textContent) != "" {
		return false
	}

	return r.isSingleImage(children[0])
}

// removeComments removes every HTML comment node found under doc.
func (r *Readability) removeComments(doc *html.Node) {
	var comments []*html.Node
	var finder func(*html.Node)

	// Collect all comment nodes first, then remove them, so removal does
	// not disturb the traversal.
	finder = func(node *html.Node) {
		if node.Type == html.CommentNode {
			comments = append(comments, node)
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			finder(child)
		}
	}

	for child := doc.FirstChild; child != nil; child = child.NextSibling {
		finder(child)
	}

	r.removeNodes(comments, nil)
}

// postProcessContent runs post-process modifications to the article content.
func (r *Readability) postProcessContent(articleContent *html.Node) {
	// Convert relative URIs to absolute URIs so we can open them.
	r.fixRelativeURIs(articleContent)

	// Remove CSS classes.
	r.cleanClasses(articleContent)

	// Remove readability attributes.
	r.clearReadabilityAttr(articleContent)
}

// Parse parses input and find the main readable content. pageURL is used to
// resolve relative URIs in the extracted article. It returns a populated
// Article, or an error when the URL or HTML cannot be parsed, or when the
// document exceeds MaxElemsToParse.
func (r *Readability) Parse(input io.Reader, pageURL string) (Article, error) {
	var err error

	// Reset parser data so the same Readability value can be reused.
	r.articleTitle = ""
	r.articleByline = ""
	r.attempts = []parseAttempt{}
	r.flags.stripUnlikelys = true
	r.flags.useWeightClasses = true
	r.flags.cleanConditionally = true

	// Parse page URL.
	if r.documentURI, err = url.ParseRequestURI(pageURL); err != nil {
		return Article{}, fmt.Errorf("failed to parse URL: %v", err)
	}

	// Parse input.
	if r.doc, err = html.Parse(input); err != nil {
		return Article{}, fmt.Errorf("failed to parse input: %v", err)
	}

	// Avoid parsing too large documents, as per configuration option.
	if r.MaxElemsToParse > 0 {
		numTags := len(getElementsByTagName(r.doc, "*"))

		if numTags > r.MaxElemsToParse {
			return Article{}, fmt.Errorf("too many elements: %d", numTags)
		}
	}

	// Remove script tags from the document.
	r.removeScripts(r.doc)

	// Prepares the HTML document.
	r.prepDocument()

	// Fetch metadata.
	metadata := r.getArticleMetadata()
	r.articleTitle = metadata.Title

	// Try to grab article content.
	finalHTMLContent := ""
	finalTextContent := ""
	readableNode := &html.Node{}
	articleContent := r.grabArticle()

	if articleContent != nil {
		r.postProcessContent(articleContent)

		// If we have not found an excerpt in the article's metadata, use the
		// article's first paragraph as the excerpt. This is used for displaying
		// a preview of the article's content.
		if metadata.Excerpt == "" {
			paragraphs := getElementsByTagName(articleContent, "p")

			if len(paragraphs) > 0 {
				metadata.Excerpt = strings.TrimSpace(textContent(paragraphs[0]))
			}
		}

		readableNode = firstElementChild(articleContent)
		finalHTMLContent = innerHTML(articleContent)
		finalTextContent = textContent(articleContent)
		finalTextContent = strings.TrimSpace(finalTextContent)
	}

	finalByline := metadata.Byline

	if finalByline == "" {
		finalByline = r.articleByline
	}

	return Article{
		Title:       r.articleTitle,
		Byline:      finalByline,
		Node:        readableNode,
		Content:     finalHTMLContent,
		TextContent: finalTextContent,
		Length:      len(finalTextContent),
		Excerpt:     metadata.Excerpt,
		SiteName:    metadata.SiteName,
		Image:       metadata.Image,
		Favicon:     metadata.Favicon,
	}, nil
}

// IsReadable decides whether the document is usable or not without parsing the
// whole thing. In the original `mozilla/readability` library, this method is
// located in `Readability-readable.js`.
func (r *Readability) IsReadable(input io.Reader) bool {
	doc, err := html.Parse(input)

	if err != nil {
		return false
	}

	// Get <p> and <pre> nodes. Also get DIV nodes which have BR node(s) and
	// append them into the `nodes` variable. Some articles' DOM structures
	// might look like:
	//
	// <div>
	//     Sentences<br>
	//     <br>
	//     Sentences<br>
	// </div>
	//
	// So we need to make sure only fetch the div once.
	// To do so, we will use map as dictionary.
	nodeList := make([]*html.Node, 0)
	nodeDict := make(map[*html.Node]struct{})
	var finder func(*html.Node)

	finder = func(node *html.Node) {
		if node.Type == html.ElementNode {
			tag := tagName(node)
			if tag == "p" || tag == "pre" {
				if _, exist := nodeDict[node]; !exist {
					nodeList = append(nodeList, node)
					nodeDict[node] = struct{}{}
				}
			} else if tag == "br" && node.Parent != nil && tagName(node.Parent) == "div" {
				// Deduplicate: a div with several <br> children must be
				// collected only once.
				if _, exist := nodeDict[node.Parent]; !exist {
					nodeList = append(nodeList, node.Parent)
					nodeDict[node.Parent] = struct{}{}
				}
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			finder(child)
		}
	}

	finder(doc)

	// This is a little cheeky, we use the accumulator 'score' to decide what
	// to return from this callback.
	score := float64(0)

	return r.someNode(nodeList, func(node *html.Node) bool {
		if !r.isProbablyVisible(node) {
			return false
		}

		// Skip nodes whose class/id look like boilerplate, unless they
		// also match the "maybe a candidate" pattern.
		matchString := className(node) + "\x20" + id(node)
		if rxUnlikelyCandidates.MatchString(matchString) &&
			!rxOkMaybeItsACandidate.MatchString(matchString) {
			return false
		}

		if tagName(node) == "p" && r.hasAncestorTag(node, "li", -1, nil) {
			return false
		}

		nodeText := strings.TrimSpace(textContent(node))
		nodeTextLength := len(nodeText)
		if nodeTextLength < 140 {
			return false
		}

		// Accumulate diminishing returns for text beyond the minimum;
		// the document is readable once the total crosses 20.
		score += math.Sqrt(float64(nodeTextLength - 140))
		if score > 20 {
			return true
		}

		return false
	})
}