├── .gitignore ├── go.mod ├── LICENSE.md ├── README.md ├── go.sum ├── readability_test.go ├── helpers.go └── readability.go /.gitignore: -------------------------------------------------------------------------------- 1 | /.golangcilint-* 2 | /res 3 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/cixtor/readability 2 | 3 | go 1.14 4 | 5 | require golang.org/x/net v0.8.0 6 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010 Arc90 Inc 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Readability 2 | 3 | Readability is a library written in Go (golang) to parse, analyze and convert HTML pages into readable content. Originally an Arc90 Experiment, it is now incorporated into Safari’s Reader View. 4 | 5 | > Despite the ubiquity of reading on the web, readers remain a neglected audience. Much of our talk about web design revolves around a sense of movement: users are thought to be finding, searching, skimming, looking. 
We measure how frequently they click but not how long they stay on the page. We concern ourselves with their travel and participation–how they move from page to page, who they talk to when they get there–but forget the needs of those whose purpose is to be still. Readers flourish when they have space–some distance from the hubbub of the crowds–and as web designers, there is yet much we can do to help them carve out that space. 6 | 7 | > [In Defense Of Readers](http://alistapart.com/articles/indefenseofreaders), by [Mandy Brown](http://www.aworkinglibrary.com/) 8 | 9 | ## Evolution of Readability Web Engines 10 | 11 | | Product | Year | Shutdown | 12 | |---------|------|----------| 13 | | [Instapaper](https://www.instapaper.com/) | 2008 | N/A | 14 | | [Arc90 Readability](https://code.google.com/archive/p/arc90labs-readability/) | 2009 | [Sep 30, 2016](https://medium.com/@readability/the-readability-bookmarking-service-will-shut-down-on-september-30-2016-1641cc18e02b) | 15 | | [Apple Readability](https://developer.apple.com/documentation/safariextensions/safarireader) | 2010 | N/A | 16 | | [Microsoft Reading View](https://docs.microsoft.com/en-us/microsoft-edge/dev-guide/browser-features/reading-view) | 2014 | N/A | 17 | | [Mozilla Readability](https://github.com/mozilla/readability) | 2015 | N/A | 18 | | [Mercury Reader](https://mercury.postlight.com/) | 2016 | [Apr 15, 2019](https://www.reddit.com/r/mac/comments/apkhzs/a/) | 19 | 20 | ## Reader Mode Parser Diversity 21 | 22 | All modern web browsers, except for Google Chrome, include an option to parse, analyze, and extract the main content from web pages to provide what is commonly known as “Reading Mode”. Reading Mode is a separate web rendering mode that strips out repeated and irrelevant content; this allows the web browser to extract the main content and display it cleanly and consistently to the user. 
23 | 24 | | Vendor | Product | Parser | Environments | 25 | |--------|---------|--------|--------------| 26 | | Mozilla | Firefox | Mozilla Readability | Desktop and Android | 27 | | GNOME | Web | Mozilla Readability | Desktop | 28 | | Vivaldi | Vivaldi | Mozilla Readability | Desktop | 29 | | Yandex | Browser | Mozilla Readability | Desktop | 30 | | Samsung | Browser | Mozilla Readability | Android | 31 | | Apple | Safari | Safari Reader | macOS and iOS | 32 | | Maxthon | Maxthon | Maxthon Reader | Desktop | 33 | | Microsoft | Edge | EdgeHTML | Windows and Windows Mobile | 34 | | Microsoft | Edge Mobile | Chrome DOM Distiller | Android | 35 | | Google | Chrome | Chrome DOM Distiller | Android | 36 | | Postlight | Mercury Reader | Web Reader | Web / browser extension | 37 | | Instant Paper | Instapaper | Instaparser | Web / browser extension | 38 | | Mozilla | Pocket | Unknown | Web / browser extension | 39 | 40 | --- 41 | 42 | Ref: https://web.archive.org/web/20150817073201/http://lab.arc90.com/2009/03/02/readability/ 43 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 2 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 3 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 4 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 5 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 6 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 7 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 8 | golang.org/x/net 
v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 9 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 10 | golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ= 11 | golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= 12 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 13 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 14 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 15 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 16 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 17 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 18 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 19 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 20 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 21 | golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 22 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 23 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 24 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 25 | golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= 26 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 27 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 28 | golang.org/x/text v0.3.7/go.mod 
h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 29 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 30 | golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 31 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 32 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 33 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 34 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 35 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 36 | -------------------------------------------------------------------------------- /readability_test.go: -------------------------------------------------------------------------------- 1 | package readability 2 | 3 | import ( 4 | "fmt" 5 | "io/ioutil" 6 | "os" 7 | "path/filepath" 8 | "strings" 9 | "testing" 10 | 11 | "golang.org/x/net/html" 12 | ) 13 | 14 | func TestMaxElemsToParse(t *testing.T) { 15 | input := strings.NewReader(` 16 | 17 | hello world 18 | 19 | 20 |

lorem ipsum

21 | 22 | `) 23 | 24 | parser := New() 25 | parser.MaxElemsToParse = 3 26 | _, err := parser.Parse(input, "https://cixtor.com/blog") 27 | 28 | if err.Error() != "too many elements: 5" { 29 | t.Fatalf("expecting failure due to MaxElemsToParse: %s", err) 30 | } 31 | } 32 | 33 | func TestRemoveScripts(t *testing.T) { 34 | input := strings.NewReader(` 35 | 36 | hello world 37 | 38 | 39 | 40 |

lorem ipsum

41 | 42 | 45 | 46 | `) 47 | 48 | a, err := New().Parse(input, "https://cixtor.com/blog") 49 | 50 | if err != nil { 51 | t.Fatalf("parser failure: %s", err) 52 | } 53 | 54 | if a.TextContent != "lorem ipsum" { 55 | t.Fatalf("scripts were not removed: %s", a.TextContent) 56 | } 57 | } 58 | 59 | func getNodeExcerpt(node *html.Node) string { 60 | outer := outerHTML(node) 61 | outer = strings.Join(strings.Fields(outer), "\x20") 62 | if len(outer) < 500 { 63 | return outer 64 | } 65 | return outer[:500] 66 | } 67 | 68 | func errColorDiff(label string, a string, b string) error { 69 | coloredA := "" 70 | coloredB := "" 71 | for i := 0; i < len(a); i++ { 72 | if b[i] == a[i] { 73 | coloredA += a[i : i+1] 74 | coloredB += b[i : i+1] 75 | continue 76 | } 77 | coloredA += "\x1b[0;92m" + a[i:] + "\x1b[0m" 78 | coloredB += "\x1b[0;91m" + b[i:] + "\x1b[0m" 79 | break 80 | } 81 | return fmt.Errorf("%s\n- %s\n+ %s", label, coloredA, coloredB) 82 | } 83 | 84 | func compareArticleContent(result *html.Node, expected *html.Node) error { 85 | // Make sure number of nodes is same 86 | resultNodesCount := len(children(result)) 87 | expectedNodesCount := len(children(expected)) 88 | if resultNodesCount != expectedNodesCount { 89 | return fmt.Errorf( 90 | "number of nodes is different, want %d got %d", 91 | expectedNodesCount, 92 | resultNodesCount, 93 | ) 94 | } 95 | 96 | resultNode := result 97 | expectedNode := expected 98 | for resultNode != nil && expectedNode != nil { 99 | // Get node excerpt 100 | resultExcerpt := getNodeExcerpt(resultNode) 101 | expectedExcerpt := getNodeExcerpt(expectedNode) 102 | 103 | // Compare tag name 104 | resultTagName := tagName(resultNode) 105 | expectedTagName := tagName(expectedNode) 106 | if resultTagName != expectedTagName { 107 | return fmt.Errorf( 108 | "tag name is different\nwant: %s (%s)\ngot : %s (%s)", 109 | expectedTagName, 110 | expectedExcerpt, 111 | resultTagName, 112 | resultExcerpt, 113 | ) 114 | } 115 | 116 | // Compare attributes 117 | 
resultAttrCount := len(resultNode.Attr) 118 | expectedAttrCount := len(expectedNode.Attr) 119 | if resultAttrCount != expectedAttrCount { 120 | return fmt.Errorf( 121 | "number of attributes is different\nwant: %d (%s)\ngot : %d (%s)", 122 | expectedAttrCount, 123 | expectedExcerpt, 124 | resultAttrCount, 125 | resultExcerpt, 126 | ) 127 | } 128 | 129 | for _, resultAttr := range resultNode.Attr { 130 | expectedAttrVal := getAttribute(expectedNode, resultAttr.Key) 131 | switch resultAttr.Key { 132 | case "href", "src": 133 | resultAttr.Val = strings.TrimSuffix(resultAttr.Val, "/") 134 | expectedAttrVal = strings.TrimSuffix(expectedAttrVal, "/") 135 | } 136 | 137 | if resultAttr.Val != expectedAttrVal { 138 | return fmt.Errorf( 139 | "attribute %s is different\nwant: %s (%s)\ngot : %s (%s)", 140 | resultAttr.Key, 141 | expectedAttrVal, 142 | expectedExcerpt, 143 | resultAttr.Val, 144 | resultExcerpt, 145 | ) 146 | } 147 | } 148 | 149 | // Compare text content 150 | resultText := strings.TrimSpace(textContent(resultNode)) 151 | expectedText := strings.TrimSpace(textContent(expectedNode)) 152 | 153 | resultText = strings.Join(strings.Fields(resultText), "\x20") 154 | expectedText = strings.Join(strings.Fields(expectedText), "\x20") 155 | 156 | if resultText != expectedText { 157 | return errColorDiff( 158 | "text content is different", 159 | expectedExcerpt, 160 | resultExcerpt, 161 | ) 162 | } 163 | 164 | // Move to next node 165 | r := Readability{} 166 | resultNode = r.getNextNode(resultNode, false) 167 | expectedNode = r.getNextNode(expectedNode, false) 168 | } 169 | 170 | return nil 171 | } 172 | 173 | func TestParse(t *testing.T) { 174 | testDir := "scenarios" 175 | testItems, err := ioutil.ReadDir(testDir) 176 | if err != nil { 177 | t.Errorf("\nfailed to read test directory") 178 | } 179 | 180 | for _, item := range testItems { 181 | if !item.IsDir() { 182 | continue 183 | } 184 | 185 | t.Run(item.Name(), func(t1 *testing.T) { 186 | // Open test file 187 | 
testFilePath := filepath.Join(testDir, item.Name(), "source.html") 188 | testFile, err := os.Open(testFilePath) 189 | if err != nil { 190 | t1.Errorf("\nfailed to open test file") 191 | } 192 | defer testFile.Close() 193 | 194 | // Open expected result file 195 | expectedFilePath := filepath.Join(testDir, item.Name(), "expected.html") 196 | expectedFile, err := os.Open(expectedFilePath) 197 | if err != nil { 198 | t1.Errorf("\nfailed to open expected result file") 199 | } 200 | defer expectedFile.Close() 201 | 202 | // Parse expected result 203 | expectedHTML, err := html.Parse(expectedFile) 204 | if err != nil { 205 | t1.Errorf("\nfailed to parse expected result file") 206 | } 207 | 208 | // Get article from test file 209 | resultArticle, err := New().Parse(testFile, "http://fakehost/test/page.html") 210 | if err != nil { 211 | t1.Errorf("\nfailed to parse test file") 212 | } 213 | 214 | // Parse article into HTML 215 | resultHTML, err := html.Parse(strings.NewReader(resultArticle.Content)) 216 | if err != nil { 217 | t1.Errorf("\nfailed to parse test article into HTML") 218 | } 219 | 220 | // Compare article 221 | err = compareArticleContent(resultHTML, expectedHTML) 222 | if err != nil { 223 | t1.Errorf("\n%v", err) 224 | } 225 | }) 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /helpers.go: -------------------------------------------------------------------------------- 1 | package readability 2 | 3 | import ( 4 | "bytes" 5 | "net/url" 6 | "strings" 7 | 8 | "golang.org/x/net/html" 9 | ) 10 | 11 | // firstElementChild returns the object's first child Element, or nil if there 12 | // are no child elements. 
13 | func firstElementChild(node *html.Node) *html.Node { 14 | for child := node.FirstChild; child != nil; child = child.NextSibling { 15 | if child.Type == html.ElementNode { 16 | return child 17 | } 18 | } 19 | 20 | return nil 21 | } 22 | 23 | // nextElementSibling returns the Element immediately following the specified 24 | // one in its parent's children list, or nil if the specified Element is the 25 | // last one in the list. 26 | func nextElementSibling(node *html.Node) *html.Node { 27 | for sibling := node.NextSibling; sibling != nil; sibling = sibling.NextSibling { 28 | if sibling.Type == html.ElementNode { 29 | return sibling 30 | } 31 | } 32 | 33 | return nil 34 | } 35 | 36 | // appendChild adds a node to the end of the list of children of a specified 37 | // parent node. If the given child is a reference to an existing node in the 38 | // document, appendChild moves it from its current position to the new position 39 | // (there is no requirement to remove the node from its parent node before 40 | // appending it to some other node). 41 | // 42 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Node/appendChild 43 | func appendChild(node *html.Node, child *html.Node) { 44 | if child.Parent != nil { 45 | temp := cloneNode(child) 46 | node.AppendChild(temp) 47 | child.Parent.RemoveChild(child) 48 | return 49 | } 50 | 51 | node.AppendChild(child) 52 | } 53 | 54 | // childNodes returns list of a node's direct children. 55 | func childNodes(node *html.Node) []*html.Node { 56 | var list []*html.Node 57 | 58 | for c := node.FirstChild; c != nil; c = c.NextSibling { 59 | list = append(list, c) 60 | } 61 | 62 | return list 63 | } 64 | 65 | // includeNode determines if node is included inside nodeList. 
66 | func includeNode(nodeList []*html.Node, node *html.Node) bool { 67 | for i := 0; i < len(nodeList); i++ { 68 | if nodeList[i] == node { 69 | return true 70 | } 71 | } 72 | 73 | return false 74 | } 75 | 76 | // cloneNode returns a duplicate of the node on which this method was called. 77 | // 78 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Node/cloneNode 79 | func cloneNode(node *html.Node) *html.Node { 80 | clone := &html.Node{ 81 | Type: node.Type, 82 | DataAtom: node.DataAtom, 83 | Data: node.Data, 84 | Attr: make([]html.Attribute, len(node.Attr)), 85 | } 86 | 87 | copy(clone.Attr, node.Attr) 88 | 89 | for c := node.FirstChild; c != nil; c = c.NextSibling { 90 | clone.AppendChild(cloneNode(c)) 91 | } 92 | 93 | return clone 94 | } 95 | 96 | // createElement creates the HTML element specified by tagName. 97 | // 98 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Document/createElement 99 | func createElement(tagName string) *html.Node { 100 | return &html.Node{Type: html.ElementNode, Data: tagName} 101 | } 102 | 103 | // createTextNode creates a new Text node. 104 | func createTextNode(data string) *html.Node { 105 | return &html.Node{Type: html.TextNode, Data: data} 106 | } 107 | 108 | // getElementsByTagName returns a collection of HTML elements with the given 109 | // tag name. If tag name is an asterisk, a list of all the available HTML nodes 110 | // will be returned instead. 
111 | // 112 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Document/getElementsByTagName 113 | func getElementsByTagName(node *html.Node, tag string) []*html.Node { 114 | var lst []*html.Node 115 | var fun func(*html.Node) 116 | 117 | fun = func(n *html.Node) { 118 | if n.Type == html.ElementNode && (tag == "*" || n.Data == tag) { 119 | lst = append(lst, n) 120 | } 121 | 122 | for c := n.FirstChild; c != nil; c = c.NextSibling { 123 | fun(c) 124 | } 125 | } 126 | 127 | fun(node) 128 | 129 | return lst 130 | } 131 | 132 | // getAttribute returns the value of a specified attribute on the element. If 133 | // the given attribute does not exist, the function returns an empty string. 134 | func getAttribute(node *html.Node, attrName string) string { 135 | for i := 0; i < len(node.Attr); i++ { 136 | if node.Attr[i].Key == attrName { 137 | return node.Attr[i].Val 138 | } 139 | } 140 | 141 | return "" 142 | } 143 | 144 | // setAttribute sets attribute for node. If attribute already exists, it will 145 | // be replaced. 146 | func setAttribute(node *html.Node, attrName string, attrValue string) { 147 | attrIdx := -1 148 | 149 | for i := 0; i < len(node.Attr); i++ { 150 | if node.Attr[i].Key == attrName { 151 | attrIdx = i 152 | break 153 | } 154 | } 155 | 156 | if attrIdx >= 0 { 157 | node.Attr[attrIdx].Val = attrValue 158 | return 159 | } 160 | 161 | node.Attr = append(node.Attr, html.Attribute{ 162 | Key: attrName, 163 | Val: attrValue, 164 | }) 165 | } 166 | 167 | // removeAttribute removes attribute with given name. 168 | func removeAttribute(node *html.Node, attrName string) { 169 | attrIdx := -1 170 | 171 | for i := 0; i < len(node.Attr); i++ { 172 | if node.Attr[i].Key == attrName { 173 | attrIdx = i 174 | break 175 | } 176 | } 177 | 178 | if attrIdx >= 0 { 179 | a := node.Attr 180 | a = append(a[:attrIdx], a[attrIdx+1:]...) 
181 | node.Attr = a 182 | } 183 | } 184 | 185 | // hasAttribute returns a Boolean value indicating whether the specified node 186 | // has the specified attribute or not. 187 | func hasAttribute(node *html.Node, attrName string) bool { 188 | for i := 0; i < len(node.Attr); i++ { 189 | if node.Attr[i].Key == attrName { 190 | return true 191 | } 192 | } 193 | 194 | return false 195 | } 196 | 197 | // outerHTML returns an HTML serialization of the element and its descendants. 198 | func outerHTML(node *html.Node) string { 199 | var buffer bytes.Buffer 200 | 201 | if err := html.Render(&buffer, node); err != nil { 202 | return "" 203 | } 204 | 205 | return buffer.String() 206 | } 207 | 208 | // innerHTML returns the HTML content (inner HTML) of an element. 209 | func innerHTML(node *html.Node) string { 210 | var err error 211 | var buffer bytes.Buffer 212 | 213 | for child := node.FirstChild; child != nil; child = child.NextSibling { 214 | if err = html.Render(&buffer, child); err != nil { 215 | return "" 216 | } 217 | } 218 | 219 | return strings.TrimSpace(buffer.String()) 220 | } 221 | 222 | // documentElement returns the root element of the document. 223 | func documentElement(doc *html.Node) *html.Node { 224 | nodes := getElementsByTagName(doc, "html") 225 | 226 | if len(nodes) > 0 { 227 | return nodes[0] 228 | } 229 | 230 | return nil 231 | } 232 | 233 | // className returns the value of the class attribute of the element. 234 | func className(node *html.Node) string { 235 | className := getAttribute(node, "class") 236 | className = strings.TrimSpace(className) 237 | className = rxNormalize.ReplaceAllString(className, "\x20") 238 | return className 239 | } 240 | 241 | // id returns the value of the id attribute of the specified element. 242 | func id(node *html.Node) string { 243 | id := getAttribute(node, "id") 244 | id = strings.TrimSpace(id) 245 | return id 246 | } 247 | 248 | // children returns an HTMLCollection of the child elements of Node. 
249 | func children(node *html.Node) []*html.Node { 250 | var children []*html.Node 251 | 252 | if node == nil { 253 | return nil 254 | } 255 | 256 | for child := node.FirstChild; child != nil; child = child.NextSibling { 257 | if child.Type == html.ElementNode { 258 | children = append(children, child) 259 | } 260 | } 261 | 262 | return children 263 | } 264 | 265 | // wordCount returns number of word in str. 266 | func wordCount(str string) int { 267 | return len(strings.Fields(str)) 268 | } 269 | 270 | // indexOf returns the first index at which a given element can be found in the 271 | // array, or -1 if it is not present. 272 | func indexOf(array []string, key string) int { 273 | for idx, val := range array { 274 | if val == key { 275 | return idx 276 | } 277 | } 278 | 279 | return -1 280 | } 281 | 282 | // replaceNode replaces a child node within the given (parent) node. 283 | // 284 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Node/replaceChild 285 | func replaceNode(oldNode *html.Node, newNode *html.Node) { 286 | if oldNode.Parent == nil { 287 | return 288 | } 289 | 290 | newNode.Parent = nil 291 | newNode.PrevSibling = nil 292 | newNode.NextSibling = nil 293 | oldNode.Parent.InsertBefore(newNode, oldNode) 294 | oldNode.Parent.RemoveChild(oldNode) 295 | } 296 | 297 | // tagName returns the tag name of the element on which it’s called. 298 | // 299 | // For example, if the element is an , its tagName property is “IMG” (for 300 | // HTML documents; it may be cased differently for XML/XHTML documents). 301 | // 302 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Element/tagName 303 | func tagName(node *html.Node) string { 304 | if node.Type != html.ElementNode { 305 | return "" 306 | } 307 | 308 | return node.Data 309 | } 310 | 311 | // textContent returns text content of a Node and its descendants. 
312 | // 313 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Node/textContent 314 | func textContent(node *html.Node) string { 315 | var buffer bytes.Buffer 316 | var finder func(*html.Node) 317 | 318 | finder = func(n *html.Node) { 319 | if n.Type == html.TextNode { 320 | buffer.WriteString(n.Data) 321 | } 322 | 323 | for c := n.FirstChild; c != nil; c = c.NextSibling { 324 | finder(c) 325 | } 326 | } 327 | 328 | finder(node) 329 | 330 | return buffer.String() 331 | } 332 | 333 | // toAbsoluteURI convert uri to absolute path based on base. 334 | // However, if uri is prefixed with hash (#), the uri won't be changed. 335 | func toAbsoluteURI(uri string, base *url.URL) string { 336 | if uri == "" || base == nil { 337 | return "" 338 | } 339 | 340 | // If it is hash tag, return as it is 341 | if uri[:1] == "#" { 342 | return uri 343 | } 344 | 345 | // If it is already an absolute URL, return as it is 346 | tmp, err := url.ParseRequestURI(uri) 347 | if err == nil && tmp.Scheme != "" && tmp.Hostname() != "" { 348 | return uri 349 | } 350 | 351 | // Otherwise, resolve against base URI. 352 | tmp, err = url.Parse(uri) 353 | if err != nil { 354 | return uri 355 | } 356 | 357 | return base.ResolveReference(tmp).String() 358 | } 359 | -------------------------------------------------------------------------------- /readability.go: -------------------------------------------------------------------------------- 1 | package readability 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "math" 7 | "net/url" 8 | "regexp" 9 | "sort" 10 | "strconv" 11 | "strings" 12 | 13 | "golang.org/x/net/html" 14 | ) 15 | 16 | // All of the regular expressions in use within readability. 17 | // Defined up here so we don't instantiate them repeatedly in loops. 
18 | var rxUnlikelyCandidates = regexp.MustCompile(`(?i)-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`) 19 | var rxOkMaybeItsACandidate = regexp.MustCompile(`(?i)and|article|body|column|main|shadow`) 20 | var rxPositive = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`) 21 | var rxNegative = regexp.MustCompile(`(?i)hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget`) 22 | var rxByline = regexp.MustCompile(`(?i)byline|author|dateline|writtenby|p-author`) 23 | var rxNormalize = regexp.MustCompile(`(?i)\s{2,}`) 24 | var rxVideos = regexp.MustCompile(`(?i)//(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)`) 25 | var rxWhitespace = regexp.MustCompile(`(?i)^\s*$`) 26 | var rxHasContent = regexp.MustCompile(`(?i)\S$`) 27 | var rxPropertyPattern = regexp.MustCompile(`(?i)\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name|image\S*)\s*`) 28 | var rxNamePattern = regexp.MustCompile(`(?i)^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name|image)\s*$`) 29 | var rxTitleSeparator = regexp.MustCompile(`(?i) [\|\-\\/>»] `) 30 | var rxTitleHierarchySep = regexp.MustCompile(`(?i) [\\/>»] `) 31 | var rxTitleRemoveFinalPart = regexp.MustCompile(`(?i)(.*)[\|\-\\/>»] .*`) 32 | var rxTitleRemove1stPart = regexp.MustCompile(`(?i)[^\|\-\\/>»]*[\|\-\\/>»](.*)`) 33 | var rxTitleAnySeparator = regexp.MustCompile(`(?i)[\|\-\\/>»]+`) 34 | var rxDisplayNone = regexp.MustCompile(`(?i)display\s*:\s*none`) 35 | var rxSentencePeriod = 
regexp.MustCompile(`(?i)\.( |$)`) 36 | var rxShare = regexp.MustCompile(`(?i)share`) 37 | var rxFaviconSize = regexp.MustCompile(`(?i)(\d+)x(\d+)`) 38 | 39 | // divToPElems is a list of HTML tag names representing content dividers. 40 | var divToPElems = []string{ 41 | "a", "blockquote", "div", "dl", "img", 42 | "ol", "p", "pre", "select", "table", "ul", 43 | } 44 | 45 | // alterToDivExceptions is a list of HTML tags that we want to convert into 46 | // regular DIV elements to prevent unwanted removal when the parser is cleaning 47 | // out unnecessary Nodes. 48 | var alterToDivExceptions = []string{ 49 | "article", 50 | "div", 51 | "p", 52 | "section", 53 | } 54 | 55 | // presentationalAttributes is a list of HTML attributes used to style Nodes. 56 | var presentationalAttributes = []string{ 57 | "align", 58 | "background", 59 | "bgcolor", 60 | "border", 61 | "cellpadding", 62 | "cellspacing", 63 | "frame", 64 | "hspace", 65 | "rules", 66 | "style", 67 | "valign", 68 | "vspace", 69 | } 70 | 71 | // deprecatedSizeAttributeElems is a list of HTML tags that allow programmers 72 | // to set Width and Height attributes to define their own size but that have 73 | // already been deprecated in recent HTML specifications. 74 | var deprecatedSizeAttributeElems = []string{ 75 | "table", 76 | "th", 77 | "td", 78 | "hr", 79 | "pre", 80 | } 81 | 82 | // The commented out elements qualify as phrasing content but tend to be 83 | // removed by readability when put into paragraphs, so we ignore them here. 84 | var phrasingElems = []string{ 85 | // "canvas", "iframe", "svg", "video", 86 | "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", 87 | "datalist", "dfn", "em", "embed", "i", "img", "input", "kbd", "label", 88 | "mark", "math", "meter", "noscript", "object", "output", "progress", "q", 89 | "ruby", "samp", "script", "select", "small", "span", "strong", "sub", 90 | "sup", "textarea", "time", "var", "wbr", 91 | } 92 | 93 | // flags is flags that used by parser. 
94 | type flags struct { 95 | stripUnlikelys bool 96 | useWeightClasses bool 97 | cleanConditionally bool 98 | } 99 | 100 | // parseAttempt is container for the result of previous parse attempts. 101 | type parseAttempt struct { 102 | articleContent *html.Node 103 | textLength int 104 | } 105 | 106 | // Article represents the metadata and content of the article. 107 | type Article struct { 108 | // Title is the heading that precedes the article’s content, and the basis 109 | // for the article’s page name and URL. It indicates what the article is 110 | // about, and distinguishes it from other articles. The title may simply 111 | // be the name of the subject of the article, or it may be a description 112 | // of the topic. 113 | Title string 114 | 115 | // Byline is a printed line of text accompanying a news story, article, or 116 | // the like, giving the author’s name 117 | Byline string 118 | 119 | // Dir is the direction of the text in the article. 120 | // 121 | // Either Left-to-Right (LTR) or Right-to-Left (RTL). 122 | Dir string 123 | 124 | // Content is the relevant text in the article with HTML tags. 125 | Content string 126 | 127 | // TextContent is the relevant text in the article without HTML tags. 128 | TextContent string 129 | 130 | // Excerpt is the summary for the relevant text in the article. 131 | Excerpt string 132 | 133 | // SiteName is the name of the original publisher website. 134 | SiteName string 135 | 136 | // Favicon (short for favorite icon) is a file containing one or more small 137 | // icons, associated with a particular website or web page. A web designer 138 | // can create such an icon and upload it to a website (or web page) by 139 | // several means, and graphical web browsers will then make use of it. 140 | Favicon string 141 | 142 | // Image is an image URL which represents the article’s content. 143 | Image string 144 | 145 | // Length is the amount of characters in the article. 
146 | Length int 147 | 148 | // Node is the first element in the HTML document. 149 | Node *html.Node 150 | } 151 | 152 | // Readability is an HTML parser that reads and extract relevant content. 153 | type Readability struct { 154 | doc *html.Node 155 | documentURI *url.URL 156 | articleTitle string 157 | articleByline string 158 | attempts []parseAttempt 159 | flags flags 160 | 161 | // MaxElemsToParse is the optional maximum number of HTML nodes to parse 162 | // from the document. If the number of elements in the document is higher 163 | // than this number, the operation immediately errors. 164 | MaxElemsToParse int 165 | 166 | // NTopCandidates is the number of top candidates to consider when the 167 | // parser is analysing how tight the competition is among candidates. 168 | NTopCandidates int 169 | 170 | // CharThresholds is the default number of chars an article must have in 171 | // order to return a result. 172 | CharThresholds int 173 | 174 | // ClassesToPreserve are the classes that readability sets itself. 175 | ClassesToPreserve []string 176 | 177 | // TagsToScore is element tags to score by default. 178 | TagsToScore []string 179 | 180 | KeepClasses bool 181 | } 182 | 183 | // New returns new Readability with sane defaults to parse simple documents. 184 | func New() *Readability { 185 | return &Readability{ 186 | MaxElemsToParse: 0, 187 | NTopCandidates: 5, 188 | CharThresholds: 500, 189 | ClassesToPreserve: []string{"page"}, 190 | TagsToScore: []string{"section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"}, 191 | KeepClasses: false, 192 | } 193 | } 194 | 195 | // removeNodes iterates over a collection of HTML elements, calls the optional 196 | // filter function on each node, and removes the node if function returns True. 197 | // If function is not passed, removes all the nodes in the list. 
198 | func (r *Readability) removeNodes(list []*html.Node, filter func(*html.Node) bool) { 199 | var node *html.Node 200 | var parentNode *html.Node 201 | 202 | for i := len(list) - 1; i >= 0; i-- { 203 | node = list[i] 204 | parentNode = node.Parent 205 | 206 | if parentNode != nil && (filter == nil || filter(node)) { 207 | parentNode.RemoveChild(node) 208 | } 209 | } 210 | } 211 | 212 | // replaceNodeTags iterates over a list, and calls setNodeTag for each node. 213 | func (r *Readability) replaceNodeTags(list []*html.Node, newTagName string) { 214 | for i := len(list) - 1; i >= 0; i-- { 215 | r.setNodeTag(list[i], newTagName) 216 | } 217 | } 218 | 219 | // forEachNode iterates over a list of HTML nodes, which doesn’t natively fully 220 | // implement the Array interface. For convenience, the current object context 221 | // is applied to the provided iterate function. 222 | func (r *Readability) forEachNode(list []*html.Node, fn func(*html.Node, int)) { 223 | for idx, node := range list { 224 | fn(node, idx) 225 | } 226 | } 227 | 228 | // someNode iterates over a NodeList, return true if any of the 229 | // provided iterate function calls returns true, false otherwise. 230 | func (r *Readability) someNode(nodeList []*html.Node, fn func(*html.Node) bool) bool { 231 | for i := 0; i < len(nodeList); i++ { 232 | if fn(nodeList[i]) { 233 | return true 234 | } 235 | } 236 | 237 | return false 238 | } 239 | 240 | // everyNode iterates over a collection of nodes, returns true if all of the 241 | // provided iterator function calls return true, otherwise returns false. For 242 | // convenience, the current object context is applied to the provided iterator 243 | // function. 244 | func (r *Readability) everyNode(list []*html.Node, fn func(*html.Node) bool) bool { 245 | for _, node := range list { 246 | if !fn(node) { 247 | return false 248 | } 249 | } 250 | 251 | return true 252 | } 253 | 254 | // concatNodeLists concats all nodelists passed as arguments. 
func (r *Readability) concatNodeLists(nodeLists ...[]*html.Node) []*html.Node {
	var result []*html.Node

	for i := 0; i < len(nodeLists); i++ {
		result = append(result, nodeLists[i]...)
	}

	return result
}

// getAllNodesWithTag returns every descendant element of node whose tag
// matches any of tagNames, grouped in the order the tags are listed.
func (r *Readability) getAllNodesWithTag(node *html.Node, tagNames ...string) []*html.Node {
	var list []*html.Node

	for _, tag := range tagNames {
		list = append(list, getElementsByTagName(node, tag)...)
	}

	return list
}

// getArticleTitle attempts to get the article title.
func (r *Readability) getArticleTitle() string {
	doc := r.doc
	curTitle := ""
	origTitle := ""
	titleHadHierarchicalSeparators := false

	// If they had an element with tag "title" in their HTML
	if nodes := getElementsByTagName(doc, "title"); len(nodes) > 0 {
		origTitle = r.getInnerText(nodes[0], true)
		curTitle = origTitle
	}

	// If there's a separator in the title, first remove the final part
	if rxTitleSeparator.MatchString(curTitle) {
		titleHadHierarchicalSeparators = rxTitleHierarchySep.MatchString(curTitle)
		curTitle = rxTitleRemoveFinalPart.ReplaceAllString(origTitle, "$1")

		// If the resulting title is too short (3 words or fewer), remove
		// the first part instead:
		if wordCount(curTitle) < 3 {
			curTitle = rxTitleRemove1stPart.ReplaceAllString(origTitle, "$1")
		}
	} else if strings.Index(curTitle, ": ") != -1 {
		// Check if we have a heading containing this exact string, so
		// we could assume it's the full title.
		headings := r.concatNodeLists(
			getElementsByTagName(doc, "h1"),
			getElementsByTagName(doc, "h2"),
		)

		trimmedTitle := strings.TrimSpace(curTitle)
		match := r.someNode(headings, func(heading *html.Node) bool {
			return strings.TrimSpace(textContent(heading)) == trimmedTitle
		})

		// If we don't, let's extract the title out of the original
		// title string.
		if !match {
			curTitle = origTitle[strings.LastIndex(origTitle, ":")+1:]

			// If the title is now too short, try the first colon instead:
			if wordCount(curTitle) < 3 {
				curTitle = origTitle[strings.Index(origTitle, ":")+1:]
				// But if we have too many words before the colon there's
				// something weird with the titles and the H tags so let's
				// just use the original title instead
			} else if wordCount(origTitle[:strings.Index(origTitle, ":")]) > 5 {
				curTitle = origTitle
			}
		}
	} else if len(curTitle) > 150 || len(curTitle) < 15 {
		// NOTE(review): these thresholds count bytes, not runes, so
		// multibyte titles look "longer" than they appear — confirm this
		// matches the intended character counts from Readability.js.
		if hOnes := getElementsByTagName(doc, "h1"); len(hOnes) == 1 {
			curTitle = r.getInnerText(hOnes[0], true)
		}
	}

	curTitle = strings.TrimSpace(curTitle)
	curTitle = rxNormalize.ReplaceAllString(curTitle, "\x20")
	// If we now have 4 words or fewer as our title, and either no
	// 'hierarchical' separators (\, /, > or ») were found in the original
	// title or we decreased the number of words by more than 1 word, use
	// the original title.
	curTitleWordCount := wordCount(curTitle)
	tmpOrigTitle := rxTitleAnySeparator.ReplaceAllString(origTitle, "")

	if curTitleWordCount <= 4 &&
		(!titleHadHierarchicalSeparators ||
			curTitleWordCount != wordCount(tmpOrigTitle)-1) {
		curTitle = origTitle
	}

	return curTitle
}

// getArticleFavicon attempts to get high quality favicon
// that used in article. It will only pick favicon in PNG
// format, so small favicon that uses ico file won't be picked.
// Using algorithm by philippe_b.
func (r *Readability) getArticleFavicon() string {
	favicon := ""
	faviconSize := -1
	linkElements := getElementsByTagName(r.doc, "link")

	r.forEachNode(linkElements, func(link *html.Node, _ int) {
		linkRel := strings.TrimSpace(getAttribute(link, "rel"))
		linkType := strings.TrimSpace(getAttribute(link, "type"))
		linkHref := strings.TrimSpace(getAttribute(link, "href"))
		linkSizes := strings.TrimSpace(getAttribute(link, "sizes"))

		// Only consider icon links with a non-empty href.
		if linkHref == "" || !strings.Contains(linkRel, "icon") {
			return
		}

		// Only PNG icons qualify, by MIME type or file extension.
		if linkType != "image/png" && !strings.Contains(linkHref, ".png") {
			return
		}

		// Determine the icon size from the sizes attribute first, then
		// from the href itself; only square sizes count (the two captured
		// dimensions must be equal). An icon without a detectable size
		// gets size 0, which still beats the initial -1.
		size := 0
		for _, sizesLocation := range []string{linkSizes, linkHref} {
			sizeParts := rxFaviconSize.FindStringSubmatch(sizesLocation)
			if len(sizeParts) != 3 || sizeParts[1] != sizeParts[2] {
				continue
			}

			size, _ = strconv.Atoi(sizeParts[1])
			break
		}

		// Keep the largest square PNG found so far.
		if size > faviconSize {
			faviconSize = size
			favicon = linkHref
		}
	})

	return toAbsoluteURI(favicon, r.documentURI)
}

// prepDocument prepares the HTML document for readability to scrape it. This
// includes things like stripping JavaScript, CSS, and handling terrible markup
// among other things.
func (r *Readability) prepDocument() {
	doc := r.doc

	r.removeNodes(getElementsByTagName(doc, "style"), nil)

	if n := getElementsByTagName(doc, "body"); len(n) > 0 && n[0] != nil {
		r.replaceBrs(n[0])
	}

	// NOTE(review): setNodeTag stores the literal "SPAN" while tags parsed
	// by x/net/html are lowercase — confirm downstream comparisons expect
	// this casing.
	r.replaceNodeTags(getElementsByTagName(doc, "font"), "SPAN")
}

// nextElement finds the next element, starting from the given node, and
// ignoring whitespace in between.
If the given node is an element, the same 410 | // node is returned. 411 | func (r *Readability) nextElement(node *html.Node) *html.Node { 412 | next := node 413 | 414 | for next != nil && 415 | next.Type != html.ElementNode && 416 | rxWhitespace.MatchString(textContent(next)) { 417 | next = next.NextSibling 418 | } 419 | 420 | return next 421 | } 422 | 423 | // replaceBrs replaces two or more successive
elements with a single

. 424 | // Whitespace between
elements are ignored. For example: 425 | // 426 | //

foo
bar


abc
427 | // 428 | // will become: 429 | // 430 | //
foo
bar

abc

431 | func (r *Readability) replaceBrs(elem *html.Node) { 432 | r.forEachNode(r.getAllNodesWithTag(elem, "br"), func(br *html.Node, _ int) { 433 | next := br.NextSibling 434 | 435 | // Whether two or more
elements have been found and replaced with 436 | // a

block. 437 | replaced := false 438 | 439 | // If we find a
chain, remove the
nodes until we hit another 440 | // element or non-whitespace. This leaves behind the first
in the 441 | // chain (which will be replaced with a

later). 442 | for { 443 | next = r.nextElement(next) 444 | 445 | if next == nil || tagName(next) == "BR" { 446 | break 447 | } 448 | 449 | replaced = true 450 | brSibling := next.NextSibling 451 | next.Parent.RemoveChild(next) 452 | next = brSibling 453 | } 454 | 455 | // If we removed a
chain, replace the remaining
with a

. 456 | // Add all sibling nodes as children of the

until we hit another 457 | //
chain. 458 | if replaced { 459 | p := createElement("p") 460 | replaceNode(br, p) 461 | 462 | next = p.NextSibling 463 | for next != nil { 464 | // If we have hit another

, we are done adding children 465 | // to this

. 466 | if tagName(next) == "br" { 467 | nextElem := r.nextElement(next.NextSibling) 468 | if nextElem != nil && tagName(nextElem) == "br" { 469 | break 470 | } 471 | } 472 | 473 | if !r.isPhrasingContent(next) { 474 | break 475 | } 476 | 477 | // Otherwise, make this node a child of the new

. 478 | sibling := next.NextSibling 479 | appendChild(p, next) 480 | next = sibling 481 | } 482 | 483 | for p.LastChild != nil && r.isWhitespace(p.LastChild) { 484 | p.RemoveChild(p.LastChild) 485 | } 486 | 487 | if tagName(p.Parent) == "P" { 488 | r.setNodeTag(p.Parent, "div") 489 | } 490 | } 491 | }) 492 | } 493 | 494 | func (r *Readability) setNodeTag(node *html.Node, newTagName string) { 495 | if node.Type == html.ElementNode { 496 | node.Data = newTagName 497 | } 498 | 499 | // NOTES(cixtor): the original function in Readability.js is a bit longer 500 | // because it contains a fallback mechanism to set the node tag name just 501 | // in case JSDOMParser is not available, there is no need to implement this 502 | // here. 503 | } 504 | 505 | // getArticleMetadata attempts to get excerpt and byline metadata for the article. 506 | func (r *Readability) getArticleMetadata() Article { 507 | values := make(map[string]string) 508 | metaElements := getElementsByTagName(r.doc, "meta") 509 | 510 | // Find description tags. 511 | r.forEachNode(metaElements, func(element *html.Node, _ int) { 512 | elementName := getAttribute(element, "name") 513 | elementProperty := getAttribute(element, "property") 514 | content := getAttribute(element, "content") 515 | if content == "" { 516 | return 517 | } 518 | matches := []string{} 519 | name := "" 520 | 521 | if elementProperty != "" { 522 | matches = rxPropertyPattern.FindAllString(elementProperty, -1) 523 | for i := len(matches) - 1; i >= 0; i-- { 524 | // Convert to lowercase, and remove any whitespace 525 | // so we can match belops. 526 | name = strings.ToLower(matches[i]) 527 | name = strings.Join(strings.Fields(name), "") 528 | // multiple authors 529 | values[name] = strings.TrimSpace(content) 530 | } 531 | } 532 | 533 | if len(matches) == 0 && elementName != "" && rxNamePattern.MatchString(elementName) { 534 | // Convert to lowercase, remove any whitespace, and convert 535 | // dots to colons so we can match belops. 
536 | name = strings.ToLower(elementName) 537 | name = strings.Join(strings.Fields(name), "") 538 | name = strings.Replace(name, ".", ":", -1) 539 | values[name] = strings.TrimSpace(content) 540 | } 541 | }) 542 | 543 | // get title 544 | metadataTitle := "" 545 | for _, name := range []string{ 546 | "dc:title", 547 | "dcterm:title", 548 | "og:title", 549 | "weibo:article:title", 550 | "weibo:webpage:title", 551 | "title", 552 | "twitter:title", 553 | } { 554 | if value, ok := values[name]; ok { 555 | metadataTitle = value 556 | break 557 | } 558 | } 559 | 560 | if metadataTitle == "" { 561 | metadataTitle = r.getArticleTitle() 562 | } 563 | 564 | // get author 565 | metadataByline := "" 566 | for _, name := range []string{ 567 | "dc:creator", 568 | "dcterm:creator", 569 | "author", 570 | } { 571 | if value, ok := values[name]; ok { 572 | metadataByline = value 573 | break 574 | } 575 | } 576 | 577 | // get description 578 | metadataExcerpt := "" 579 | for _, name := range []string{ 580 | "dc:description", 581 | "dcterm:description", 582 | "og:description", 583 | "weibo:article:description", 584 | "weibo:webpage:description", 585 | "description", 586 | "twitter:description", 587 | } { 588 | if value, ok := values[name]; ok { 589 | metadataExcerpt = value 590 | break 591 | } 592 | } 593 | 594 | // get site name 595 | metadataSiteName := values["og:site_name"] 596 | 597 | // get image thumbnail 598 | metadataImage := "" 599 | for _, name := range []string{ 600 | "og:image", 601 | "image", 602 | "twitter:image", 603 | } { 604 | if value, ok := values[name]; ok { 605 | metadataImage = toAbsoluteURI(value, r.documentURI) 606 | break 607 | } 608 | } 609 | 610 | // get favicon 611 | metadataFavicon := r.getArticleFavicon() 612 | 613 | return Article{ 614 | Title: metadataTitle, 615 | Byline: metadataByline, 616 | Excerpt: metadataExcerpt, 617 | SiteName: metadataSiteName, 618 | Image: metadataImage, 619 | Favicon: metadataFavicon, 620 | } 621 | } 622 | 623 | // 
prepArticle prepares the article Node for display cleaning out any inline 624 | // CSS styles, iframes, forms and stripping extraneous paragraph tags

. 625 | func (r *Readability) prepArticle(articleContent *html.Node) { 626 | r.cleanStyles(articleContent) 627 | 628 | // Check for data tables before we continue, to avoid removing 629 | // items in those tables, which will often be isolated even 630 | // though they're visually linked to other content-ful elements 631 | // (text, images, etc.). 632 | r.markDataTables(articleContent) 633 | 634 | // Clean out junk from the article content 635 | r.cleanConditionally(articleContent, "form") 636 | r.cleanConditionally(articleContent, "fieldset") 637 | r.clean(articleContent, "object") 638 | r.clean(articleContent, "embed") 639 | r.clean(articleContent, "footer") 640 | r.clean(articleContent, "link") 641 | r.clean(articleContent, "aside") 642 | 643 | // Clean out elements have "share" in their id/class combinations 644 | // from final top candidates, which means we don't remove the top 645 | // candidates even they have "share". 646 | r.forEachNode(children(articleContent), func(topCandidate *html.Node, _ int) { 647 | r.cleanMatchedNodes(topCandidate, func(node *html.Node, nodeClassID string) bool { 648 | return rxShare.MatchString(nodeClassID) && len(textContent(node)) < r.CharThresholds 649 | }) 650 | }) 651 | 652 | // If there is only one h2 and its text content substantially 653 | // equals article title, they are probably using it as a header 654 | // and not a subheader, so remove it since we already extract 655 | // the title separately. 
656 | if h2s := getElementsByTagName(articleContent, "h2"); len(h2s) == 1 { 657 | h2 := h2s[0] 658 | h2Text := textContent(h2) 659 | lengthSimilarRate := float64(len(h2Text)-len(r.articleTitle)) / float64(len(r.articleTitle)) 660 | 661 | if math.Abs(lengthSimilarRate) < 0.5 { 662 | titlesMatch := false 663 | 664 | if lengthSimilarRate > 0 { 665 | titlesMatch = strings.Contains(h2Text, r.articleTitle) 666 | } else { 667 | titlesMatch = strings.Contains(r.articleTitle, h2Text) 668 | } 669 | 670 | if titlesMatch { 671 | r.clean(articleContent, "h2") 672 | } 673 | } 674 | } 675 | 676 | r.clean(articleContent, "iframe") 677 | r.clean(articleContent, "input") 678 | r.clean(articleContent, "textarea") 679 | r.clean(articleContent, "select") 680 | r.clean(articleContent, "button") 681 | r.cleanHeaders(articleContent) 682 | 683 | // Do these last as the previous stuff may have removed junk 684 | // that will affect these 685 | r.cleanConditionally(articleContent, "table") 686 | r.cleanConditionally(articleContent, "ul") 687 | r.cleanConditionally(articleContent, "div") 688 | 689 | // Remove extra paragraphs 690 | r.removeNodes(getElementsByTagName(articleContent, "p"), func(p *html.Node) bool { 691 | imgCount := len(getElementsByTagName(p, "img")) 692 | embedCount := len(getElementsByTagName(p, "embed")) 693 | objectCount := len(getElementsByTagName(p, "object")) 694 | 695 | // Nasty iframes have been removed, only remain embedded videos. 
696 | iframeCount := len(getElementsByTagName(p, "iframe")) 697 | totalCount := imgCount + embedCount + objectCount + iframeCount 698 | 699 | return totalCount == 0 && r.getInnerText(p, false) == "" 700 | }) 701 | 702 | r.forEachNode(getElementsByTagName(articleContent, "br"), func(br *html.Node, _ int) { 703 | next := r.nextElement(br.NextSibling) 704 | 705 | if next != nil && tagName(next) == "p" { 706 | br.Parent.RemoveChild(br) 707 | } 708 | }) 709 | 710 | // Remove single-cell tables 711 | r.forEachNode(getElementsByTagName(articleContent, "table"), func(table *html.Node, _ int) { 712 | tbody := table 713 | 714 | if r.hasSingleTagInsideElement(table, "tbody") { 715 | tbody = firstElementChild(table) 716 | } 717 | 718 | if r.hasSingleTagInsideElement(tbody, "tr") { 719 | row := firstElementChild(tbody) 720 | 721 | if r.hasSingleTagInsideElement(row, "td") { 722 | cell := firstElementChild(row) 723 | 724 | newTag := "div" 725 | 726 | if r.everyNode(childNodes(cell), r.isPhrasingContent) { 727 | newTag = "p" 728 | } 729 | 730 | r.setNodeTag(cell, newTag) 731 | 732 | replaceNode(table, cell) 733 | } 734 | } 735 | }) 736 | } 737 | 738 | // grabArticle uses a variety of metrics (content score, classname, element 739 | // types), find the content that is most likely to be the stuff a user wants to 740 | // read. Then return it wrapped up in a div. 741 | func (r *Readability) grabArticle() *html.Node { 742 | for { 743 | doc := cloneNode(r.doc) 744 | 745 | var page *html.Node 746 | if nodes := getElementsByTagName(doc, "body"); len(nodes) > 0 { 747 | page = nodes[0] 748 | } 749 | 750 | // We can not grab an article if we do not have a page. 751 | if page == nil { 752 | return nil 753 | } 754 | 755 | // First, node prepping. Trash nodes that look cruddy (like ones with 756 | // the class name "comment", etc), and turn divs into P tags where they 757 | // have been used inappropriately (as in, where they contain no other 758 | // block level elements). 
759 | var elementsToScore []*html.Node 760 | var node = documentElement(doc) 761 | 762 | for node != nil { 763 | matchString := className(node) + "\x20" + id(node) 764 | 765 | if !r.isProbablyVisible(node) { 766 | node = r.removeAndGetNext(node) 767 | continue 768 | } 769 | 770 | // Remove Node if it is a Byline. 771 | if r.checkByline(node, matchString) { 772 | node = r.removeAndGetNext(node) 773 | continue 774 | } 775 | 776 | // Remove unlikely candidates. 777 | nodeTagName := tagName(node) 778 | if r.flags.stripUnlikelys { 779 | if rxUnlikelyCandidates.MatchString(matchString) && 780 | !rxOkMaybeItsACandidate.MatchString(matchString) && 781 | !r.hasAncestorTag(node, "table", 3, nil) && 782 | nodeTagName != "body" && 783 | nodeTagName != "a" { 784 | node = r.removeAndGetNext(node) 785 | continue 786 | } 787 | } 788 | 789 | // Remove DIV, SECTION and HEADER nodes without any content. 790 | switch nodeTagName { 791 | case "div", 792 | "section", 793 | "header", 794 | "h1", 795 | "h2", 796 | "h3", 797 | "h4", 798 | "h5", 799 | "h6": 800 | if r.isElementWithoutContent(node) { 801 | node = r.removeAndGetNext(node) 802 | continue 803 | } 804 | } 805 | 806 | if indexOf(r.TagsToScore, nodeTagName) != -1 { 807 | elementsToScore = append(elementsToScore, node) 808 | } 809 | 810 | // Convert

without children block level elements into

. 811 | if nodeTagName == "div" { 812 | // Put phrasing content into paragraphs. 813 | var p *html.Node 814 | childNode := node.FirstChild 815 | 816 | for childNode != nil { 817 | nextSibling := childNode.NextSibling 818 | 819 | if r.isPhrasingContent(childNode) { 820 | if p != nil { 821 | appendChild(p, childNode) 822 | } else if !r.isWhitespace(childNode) { 823 | p = createElement("p") 824 | appendChild(p, cloneNode(childNode)) 825 | replaceNode(childNode, p) 826 | } 827 | } else if p != nil { 828 | for p.LastChild != nil && r.isWhitespace(p.LastChild) { 829 | p.RemoveChild(p.LastChild) 830 | } 831 | p = nil 832 | } 833 | 834 | childNode = nextSibling 835 | } 836 | 837 | // Sites like http://mobile.slate.com encloses each paragraph 838 | // with a DIV element. DIVs with only a P element inside and no 839 | // text content can be safely converted into plain P elements to 840 | // avoid confusing the scoring algorithm with DIVs with are, in 841 | // practice, paragraphs. 842 | if r.hasSingleTagInsideElement(node, "p") && r.getLinkDensity(node) < 0.25 { 843 | newNode := children(node)[0] 844 | replaceNode(node, newNode) 845 | node = newNode 846 | elementsToScore = append(elementsToScore, node) 847 | } else if !r.hasChildBlockElement(node) { 848 | r.setNodeTag(node, "p") 849 | elementsToScore = append(elementsToScore, node) 850 | } 851 | } 852 | 853 | node = r.getNextNode(node, false) 854 | } 855 | 856 | // Loop through all paragraphs and assign a score to them based on how 857 | // much relevant content they have. Then add their score to their parent 858 | // node. A score is determined by things like number of commas, class 859 | // names, etc. Maybe eventually link density. 860 | var candidates []*html.Node 861 | r.forEachNode(elementsToScore, func(elementToScore *html.Node, _ int) { 862 | if elementToScore.Parent == nil || tagName(elementToScore.Parent) == "" { 863 | return 864 | } 865 | 866 | // If this paragraph is less than 25 characters, don't even count it. 
867 | innerText := r.getInnerText(elementToScore, true) 868 | if len(innerText) < 25 { 869 | return 870 | } 871 | 872 | // Exclude nodes with no ancestor. 873 | ancestors := r.getNodeAncestors(elementToScore, 3) 874 | if len(ancestors) == 0 { 875 | return 876 | } 877 | 878 | // Add a point for the paragraph itself as a base. 879 | contentScore := 1 880 | 881 | // Add points for any commas within this paragraph. 882 | contentScore += strings.Count(innerText, ",") 883 | 884 | // For every 100 characters in this paragraph, add another point. Up to 3 points. 885 | contentScore += int(math.Min(math.Floor(float64(len(innerText))/100.0), 3.0)) 886 | 887 | // Initialize and score ancestors. 888 | r.forEachNode(ancestors, func(ancestor *html.Node, level int) { 889 | if tagName(ancestor) == "" || ancestor.Parent == nil || ancestor.Parent.Type != html.ElementNode { 890 | return 891 | } 892 | 893 | if !r.hasContentScore(ancestor) { 894 | r.initializeNode(ancestor) 895 | candidates = append(candidates, ancestor) 896 | } 897 | 898 | // Node score divider: 899 | // - parent: 1 (no division) 900 | // - grandparent: 2 901 | // - great grandparent+: ancestor level * 3 902 | scoreDivider := 1 903 | switch level { 904 | case 0: 905 | scoreDivider = 1 906 | case 1: 907 | scoreDivider = 2 908 | default: 909 | scoreDivider = level * 3 910 | } 911 | 912 | ancestorScore := r.getContentScore(ancestor) 913 | ancestorScore += float64(contentScore) / float64(scoreDivider) 914 | 915 | r.setContentScore(ancestor, ancestorScore) 916 | }) 917 | }) 918 | 919 | // These lines are a bit different compared to Readability.js. 920 | // 921 | // In Readability.js, they fetch NTopCandidates utilising array method 922 | // like `splice` and `pop`. In Go, array method like that is not as 923 | // simple, especially since we are working with pointer. So, here we 924 | // simply sort top candidates, and limit it to max NTopCandidates. 925 | 926 | // Scale the final candidates score based on link density. 
Good 927 | // content should have a relatively small link density (5% or 928 | // less) and be mostly unaffected by this operation. 929 | for i := 0; i < len(candidates); i++ { 930 | candidate := candidates[i] 931 | candidateScore := r.getContentScore(candidate) * (1 - r.getLinkDensity(candidate)) 932 | r.setContentScore(candidate, candidateScore) 933 | } 934 | 935 | // After we have calculated scores, sort through all of the possible 936 | // candidate nodes we found and find the one with the highest score. 937 | sort.Slice(candidates, func(i int, j int) bool { 938 | return r.getContentScore(candidates[i]) > r.getContentScore(candidates[j]) 939 | }) 940 | 941 | var topCandidates []*html.Node 942 | 943 | if len(candidates) > r.NTopCandidates { 944 | topCandidates = candidates[:r.NTopCandidates] 945 | } else { 946 | topCandidates = candidates 947 | } 948 | 949 | var topCandidate, parentOfTopCandidate *html.Node 950 | neededToCreateTopCandidate := false 951 | if len(topCandidates) > 0 { 952 | topCandidate = topCandidates[0] 953 | } 954 | 955 | // If we still have no top candidate, just use the body as a last 956 | // resort. We also have to copy the body node so it is something 957 | // we can modify. 958 | if topCandidate == nil || tagName(topCandidate) == "body" { 959 | // Move all of the page's children into topCandidate 960 | topCandidate = createElement("div") 961 | neededToCreateTopCandidate = true 962 | // Move everything (not just elements, also text nodes etc.) 
963 | // into the container so we even include text directly in the body: 964 | kids := childNodes(page) 965 | for i := 0; i < len(kids); i++ { 966 | appendChild(topCandidate, kids[i]) 967 | } 968 | 969 | appendChild(page, topCandidate) 970 | r.initializeNode(topCandidate) 971 | } else if topCandidate != nil { 972 | // Find a better top candidate node if it contains (at least three) 973 | // nodes which belong to `topCandidates` array and whose scores are 974 | // quite closed with current `topCandidate` node. 975 | topCandidateScore := r.getContentScore(topCandidate) 976 | var alternativeCandidateAncestors [][]*html.Node 977 | for i := 1; i < len(topCandidates); i++ { 978 | if r.getContentScore(topCandidates[i])/topCandidateScore >= 0.75 { 979 | topCandidateAncestors := r.getNodeAncestors(topCandidates[i], 0) 980 | alternativeCandidateAncestors = append(alternativeCandidateAncestors, topCandidateAncestors) 981 | } 982 | } 983 | 984 | minimumTopCandidates := 3 985 | if len(alternativeCandidateAncestors) >= minimumTopCandidates { 986 | parentOfTopCandidate = topCandidate.Parent 987 | for parentOfTopCandidate != nil && tagName(parentOfTopCandidate) != "body" { 988 | listContainingThisAncestor := 0 989 | for ancestorIndex := 0; ancestorIndex < len(alternativeCandidateAncestors) && listContainingThisAncestor < minimumTopCandidates; ancestorIndex++ { 990 | if includeNode(alternativeCandidateAncestors[ancestorIndex], parentOfTopCandidate) { 991 | listContainingThisAncestor++ 992 | } 993 | } 994 | 995 | if listContainingThisAncestor >= minimumTopCandidates { 996 | topCandidate = parentOfTopCandidate 997 | break 998 | } 999 | 1000 | parentOfTopCandidate = parentOfTopCandidate.Parent 1001 | } 1002 | } 1003 | 1004 | if !r.hasContentScore(topCandidate) { 1005 | r.initializeNode(topCandidate) 1006 | } 1007 | 1008 | // Because of our bonus system, parents of candidates might 1009 | // have scores themselves. They get half of the node. 
There 1010 | // won't be nodes with higher scores than our topCandidate, 1011 | // but if we see the score going *up* in the first few steps * 1012 | // up the tree, that's a decent sign that there might be more 1013 | // content lurking in other places that we want to unify in. 1014 | // The sibling stuff below does some of that - but only if 1015 | // we've looked high enough up the DOM tree. 1016 | parentOfTopCandidate = topCandidate.Parent 1017 | lastScore := r.getContentScore(topCandidate) 1018 | // The scores shouldn't get too lor. 1019 | scoreThreshold := lastScore / 3.0 1020 | for parentOfTopCandidate != nil && tagName(parentOfTopCandidate) != "body" { 1021 | if !r.hasContentScore(parentOfTopCandidate) { 1022 | parentOfTopCandidate = parentOfTopCandidate.Parent 1023 | continue 1024 | } 1025 | 1026 | parentScore := r.getContentScore(parentOfTopCandidate) 1027 | if parentScore < scoreThreshold { 1028 | break 1029 | } 1030 | 1031 | if parentScore > lastScore { 1032 | // Alright! We found a better parent to use. 1033 | topCandidate = parentOfTopCandidate 1034 | break 1035 | } 1036 | 1037 | lastScore = parentScore 1038 | parentOfTopCandidate = parentOfTopCandidate.Parent 1039 | } 1040 | 1041 | // If the top candidate is the only child, use parent 1042 | // instead. This will help sibling joining logic when 1043 | // adjacent content is actually located in parent's 1044 | // sibling node. 1045 | parentOfTopCandidate = topCandidate.Parent 1046 | for parentOfTopCandidate != nil && tagName(parentOfTopCandidate) != "body" && len(children(parentOfTopCandidate)) == 1 { 1047 | topCandidate = parentOfTopCandidate 1048 | parentOfTopCandidate = topCandidate.Parent 1049 | } 1050 | 1051 | if !r.hasContentScore(topCandidate) { 1052 | r.initializeNode(topCandidate) 1053 | } 1054 | } 1055 | 1056 | // Now that we have the top candidate, look through its siblings 1057 | // for content that might also be related. 
Things like preambles, 1058 | // content split by ads that we removed, etc. 1059 | articleContent := createElement("div") 1060 | siblingScoreThreshold := math.Max(10, r.getContentScore(topCandidate)*0.2) 1061 | 1062 | // Keep potential top candidate's parent node to try to get text direction of it later. 1063 | topCandidateScore := r.getContentScore(topCandidate) 1064 | topCandidateClassName := className(topCandidate) 1065 | 1066 | parentOfTopCandidate = topCandidate.Parent 1067 | siblings := children(parentOfTopCandidate) 1068 | for s := 0; s < len(siblings); s++ { 1069 | sibling := siblings[s] 1070 | appendNode := false 1071 | 1072 | if sibling == topCandidate { 1073 | appendNode = true 1074 | } else { 1075 | contentBonus := float64(0) 1076 | 1077 | // Give a bonus if sibling nodes and top candidates have the example same classname 1078 | if className(sibling) == topCandidateClassName && topCandidateClassName != "" { 1079 | contentBonus += topCandidateScore * 0.2 1080 | } 1081 | 1082 | if r.hasContentScore(sibling) && r.getContentScore(sibling)+contentBonus >= siblingScoreThreshold { 1083 | appendNode = true 1084 | } else if tagName(sibling) == "p" { 1085 | linkDensity := r.getLinkDensity(sibling) 1086 | nodeContent := r.getInnerText(sibling, true) 1087 | nodeLength := len(nodeContent) 1088 | 1089 | if nodeLength > 80 && linkDensity < 0.25 { 1090 | appendNode = true 1091 | } else if nodeLength < 80 && nodeLength > 0 && linkDensity == 0 && 1092 | rxSentencePeriod.MatchString(nodeContent) { 1093 | appendNode = true 1094 | } 1095 | } 1096 | } 1097 | 1098 | if appendNode { 1099 | // We have a node that is not a common block level element, 1100 | // like a FORM or TD tag. Turn it into a DIV so it does not get 1101 | // filtered out later by accident. 
1102 | if indexOf(alterToDivExceptions, tagName(sibling)) == -1 { 1103 | r.setNodeTag(sibling, "div") 1104 | } 1105 | 1106 | appendChild(articleContent, sibling) 1107 | } 1108 | } 1109 | 1110 | // So we have all of the content that we need. Now we clean 1111 | // it up for presentation. 1112 | r.prepArticle(articleContent) 1113 | 1114 | if neededToCreateTopCandidate { 1115 | // We already created a fake DIV thing, and there would not have 1116 | // been any siblings left for the previous loop, so there is no 1117 | // point trying to create a new DIV and then move all the children 1118 | // over. Just assign IDs and CSS class names here. No need to append 1119 | // because that already happened anyway. 1120 | // 1121 | // By the way, this line is different with Readability.js. 1122 | // 1123 | // In Readability.js, when using `appendChild`, the node is still 1124 | // referenced. Meanwhile here, our `appendChild` will clone the 1125 | // node, put it in the new place, then delete the original. 1126 | firstChild := firstElementChild(articleContent) 1127 | if firstChild != nil && tagName(firstChild) == "div" { 1128 | setAttribute(firstChild, "id", "readability-page-1") 1129 | setAttribute(firstChild, "class", "page") 1130 | } 1131 | } else { 1132 | div := createElement("div") 1133 | 1134 | setAttribute(div, "id", "readability-page-1") 1135 | setAttribute(div, "class", "page") 1136 | 1137 | childs := childNodes(articleContent) 1138 | 1139 | for i := 0; i < len(childs); i++ { 1140 | appendChild(div, childs[i]) 1141 | } 1142 | 1143 | appendChild(articleContent, div) 1144 | } 1145 | 1146 | parseSuccessful := true 1147 | 1148 | // Now that we've gone through the full algorithm, check to see if we 1149 | // got any meaningful content. If we did not, we may need to re-run 1150 | // grabArticle with different flags set. 
This gives us a higher 1151 | // likelihood of finding the content, and the sieve approach gives us a 1152 | // higher likelihood of finding the -right- content. 1153 | textLength := len(r.getInnerText(articleContent, true)) 1154 | if textLength < r.CharThresholds { 1155 | parseSuccessful = false 1156 | 1157 | if r.flags.stripUnlikelys { 1158 | r.flags.stripUnlikelys = false 1159 | r.attempts = append(r.attempts, parseAttempt{ 1160 | articleContent: articleContent, 1161 | textLength: textLength, 1162 | }) 1163 | } else if r.flags.useWeightClasses { 1164 | r.flags.useWeightClasses = false 1165 | r.attempts = append(r.attempts, parseAttempt{ 1166 | articleContent: articleContent, 1167 | textLength: textLength, 1168 | }) 1169 | } else if r.flags.cleanConditionally { 1170 | r.flags.cleanConditionally = false 1171 | r.attempts = append(r.attempts, parseAttempt{ 1172 | articleContent: articleContent, 1173 | textLength: textLength, 1174 | }) 1175 | } else { 1176 | r.attempts = append(r.attempts, parseAttempt{ 1177 | articleContent: articleContent, 1178 | textLength: textLength, 1179 | }) 1180 | 1181 | // No luck after removing flags, just return the 1182 | // longest text we found during the different loops * 1183 | sort.Slice(r.attempts, func(i, j int) bool { 1184 | return r.attempts[i].textLength > r.attempts[j].textLength 1185 | }) 1186 | 1187 | // But first check if we actually have something 1188 | if r.attempts[0].textLength == 0 { 1189 | return nil 1190 | } 1191 | 1192 | articleContent = r.attempts[0].articleContent 1193 | parseSuccessful = true 1194 | } 1195 | } 1196 | 1197 | if parseSuccessful { 1198 | return articleContent 1199 | } 1200 | } 1201 | } 1202 | 1203 | // initializeNode initializes a node with the readability score. Also checks 1204 | // the className/id for special names to add to its score. 
func (r *Readability) initializeNode(node *html.Node) {
	// Start from the class/ID weight so obviously good or bad class
	// names bias the score from the beginning.
	contentScore := float64(r.getClassWeight(node))

	switch tagName(node) {
	case "div":
		contentScore += 5
	case "pre", "td", "blockquote":
		contentScore += 3
	case "address", "ol", "ul", "dl", "dd", "dt", "li", "form":
		contentScore -= 3
	case "h1", "h2", "h3", "h4", "h5", "h6", "th":
		contentScore -= 5
	}

	r.setContentScore(node, contentScore)
}

// removeAndGetNext removes the node and returns its next node, following
// the same depth-first order used by getNextNode.
func (r *Readability) removeAndGetNext(node *html.Node) *html.Node {
	// Compute the successor before detaching; once removed the node no
	// longer has sibling/parent links to traverse from.
	nextNode := r.getNextNode(node, true)

	if node.Parent != nil {
		node.Parent.RemoveChild(node)
	}

	return nextNode
}

// getNextNode traverses the DOM from node to node, starting at the node passed
// in. Pass true for the second parameter to indicate this node itself (and its
// kids) are going away, and we want the next node over. Calling this in a loop
// will traverse the DOM depth-first.
//
// In Readability.js, ignoreSelfAndKids default to false.
func (r *Readability) getNextNode(node *html.Node, ignoreSelfAndKids bool) *html.Node {
	// First check for kids if those are not being ignored
	if firstChild := firstElementChild(node); !ignoreSelfAndKids && firstChild != nil {
		return firstChild
	}

	// Then for siblings...
	if sibling := nextElementSibling(node); sibling != nil {
		return sibling
	}

	// And finally, move up the parent chain *and* find a sibling
	// (because this is depth-first traversal, we will have already
	// seen the parent nodes themselves).
	for {
		node = node.Parent
		if node == nil || nextElementSibling(node) != nil {
			break
		}
	}

	if node != nil {
		return nextElementSibling(node)
	}

	return nil
}

// isValidByline checks whether the input string could be a byline: it must
// be non-empty after trimming and shorter than 100 characters.
func (r *Readability) isValidByline(byline string) bool {
	byline = strings.TrimSpace(byline)
	return len(byline) > 0 && len(byline) < 100
}

// checkByline determines if a node is used as byline. When it is, the
// whitespace-normalized node text is stored in r.articleByline and true
// is returned. Only the first byline found in the document is kept.
func (r *Readability) checkByline(node *html.Node, matchString string) bool {
	// A byline was already captured earlier; do not overwrite it.
	if r.articleByline != "" {
		return false
	}

	rel := getAttribute(node, "rel")
	itemprop := getAttribute(node, "itemprop")
	nodeText := textContent(node)
	if (rel == "author" || strings.Contains(itemprop, "author") || rxByline.MatchString(matchString)) && r.isValidByline(nodeText) {
		// Collapse all interior whitespace runs to single spaces.
		nodeText = strings.TrimSpace(nodeText)
		nodeText = strings.Join(strings.Fields(nodeText), "\x20")
		r.articleByline = nodeText
		return true
	}

	return false
}

// getNodeAncestors gets the node's direct parent and grandparents, closest
// ancestor first. A maxDepth <= 0 means no depth limit.
//
// In Readability.js, maxDepth default to 0.
func (r *Readability) getNodeAncestors(node *html.Node, maxDepth int) []*html.Node {
	level := 0
	ancestors := []*html.Node{}

	for node.Parent != nil {
		level++
		ancestors = append(ancestors, node.Parent)

		if maxDepth > 0 && level == maxDepth {
			break
		}

		node = node.Parent
	}

	return ancestors
}

// setContentScore sets the readability score for a node.
func (r *Readability) setContentScore(node *html.Node, score float64) {
	// The score is persisted as a data attribute so it survives node
	// cloning and can be read back later with getContentScore.
	setAttribute(node, "data-readability-score", fmt.Sprintf("%.4f", score))
}

// hasContentScore checks if node has readability score.
func (r *Readability) hasContentScore(node *html.Node) bool {
	return hasAttribute(node, "data-readability-score")
}

// getContentScore gets the readability score of a node. It returns 0 when
// the node has no score attribute or the stored value cannot be parsed.
func (r *Readability) getContentScore(node *html.Node) float64 {
	strScore := getAttribute(node, "data-readability-score")
	strScore = strings.TrimSpace(strScore)

	if strScore == "" {
		return 0
	}

	score, err := strconv.ParseFloat(strScore, 64)

	if err != nil {
		return 0
	}

	return score
}

// removeScripts removes script and noscript tags from the document.
func (r *Readability) removeScripts(doc *html.Node) {
	r.removeNodes(getElementsByTagName(doc, "script"), nil)
	r.removeNodes(getElementsByTagName(doc, "noscript"), nil)
}

// hasSingleTagInsideElement check if the node has only whitespace and a single
// element with given tag. Returns false if the DIV Node contains non-empty text
// nodes or if it contains no element with given tag or more than 1 element.
func (r *Readability) hasSingleTagInsideElement(element *html.Node, tag string) bool {
	// There should be exactly 1 element child with given tag
	if childs := children(element); len(childs) != 1 || tagName(childs[0]) != tag {
		return false
	}

	// And there should be no text nodes with real content
	return !r.someNode(childNodes(element), func(node *html.Node) bool {
		return node.Type == html.TextNode && rxHasContent.MatchString(textContent(node))
	})
}

// isElementWithoutContent determines if node is empty. A node is considered
// empty if there is nothing inside or if the only things inside are HTML
// break tags (<br>) and HTML horizontal rule tags (<hr>).
func (r *Readability) isElementWithoutContent(node *html.Node) bool {
	brs := getElementsByTagName(node, "br")
	hrs := getElementsByTagName(node, "hr")
	childs := children(node)

	return node.Type == html.ElementNode &&
		strings.TrimSpace(textContent(node)) == "" &&
		(len(childs) == 0 || len(childs) == len(brs)+len(hrs))
}

// hasChildBlockElement determines whether element has any children block level
// elements. The check recurses through the whole subtree.
func (r *Readability) hasChildBlockElement(element *html.Node) bool {
	return r.someNode(childNodes(element), func(node *html.Node) bool {
		return indexOf(divToPElems, tagName(node)) != -1 ||
			r.hasChildBlockElement(node)
	})
}

// isPhrasingContent determines if a node qualifies as phrasing content.
//
// See: https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
func (r *Readability) isPhrasingContent(node *html.Node) bool {
	if node.Type == html.TextNode {
		return true
	}

	tag := tagName(node)

	if indexOf(phrasingElems, tag) != -1 {
		return true
	}

	// a, del, and ins elements count as phrasing content only when all
	// of their children are phrasing content as well.
	return ((tag == "a" || tag == "del" || tag == "ins") &&
		r.everyNode(childNodes(node), r.isPhrasingContent))
}

// isWhitespace determines if a node only used as whitespace: either a text
// node containing nothing but whitespace, or a br element.
func (r *Readability) isWhitespace(node *html.Node) bool {
	return (node.Type == html.TextNode && strings.TrimSpace(textContent(node)) == "") ||
		(node.Type == html.ElementNode && tagName(node) == "br")
}

// getInnerText gets the inner text of a node.
// This also strips out any excess whitespace to be found.
// In Readability.js, normalizeSpaces default to true.
1411 | func (r *Readability) getInnerText(node *html.Node, normalizeSpaces bool) string { 1412 | textContent := strings.TrimSpace(textContent(node)) 1413 | 1414 | if normalizeSpaces { 1415 | textContent = rxNormalize.ReplaceAllString(textContent, "\x20") 1416 | } 1417 | 1418 | return textContent 1419 | } 1420 | 1421 | // getCharCount returns the number of times a string appears in the Node. 1422 | func (r *Readability) getCharCount(node *html.Node, s string) int { 1423 | innerText := r.getInnerText(node, true) 1424 | return strings.Count(innerText, s) 1425 | } 1426 | 1427 | // cleanStyles removes the style attribute on every node and under. 1428 | func (r *Readability) cleanStyles(node *html.Node) { 1429 | nodeTagName := tagName(node) 1430 | 1431 | if node == nil || nodeTagName == "svg" { 1432 | return 1433 | } 1434 | 1435 | // Remove `style` and deprecated presentational attributes 1436 | for i := 0; i < len(presentationalAttributes); i++ { 1437 | removeAttribute(node, presentationalAttributes[i]) 1438 | } 1439 | 1440 | if indexOf(deprecatedSizeAttributeElems, nodeTagName) != -1 { 1441 | removeAttribute(node, "width") 1442 | removeAttribute(node, "height") 1443 | } 1444 | 1445 | for child := firstElementChild(node); child != nil; child = nextElementSibling(child) { 1446 | r.cleanStyles(child) 1447 | } 1448 | } 1449 | 1450 | // getLinkDensity gets the density of links as a percentage of the content. 1451 | // This is the amount of text that is inside a link divided by the total text 1452 | // in the node. 
func (r *Readability) getLinkDensity(element *html.Node) float64 {
	textLength := len(r.getInnerText(element, true))

	// Avoid division by zero for nodes without any text.
	if textLength == 0 {
		return 0
	}

	linkLength := 0

	r.forEachNode(getElementsByTagName(element, "a"), func(linkNode *html.Node, _ int) {
		linkLength += len(r.getInnerText(linkNode, true))
	})

	return float64(linkLength) / float64(textLength)
}

// getClassWeight gets an elements class/id weight. Uses regular expressions to
// tell if this element looks good or bad. Returns 0 when weight classes are
// disabled via the parser flags.
func (r *Readability) getClassWeight(node *html.Node) int {
	if !r.flags.useWeightClasses {
		return 0
	}

	weight := 0

	// Look for a special classname
	if nodeClassName := className(node); nodeClassName != "" {
		if rxNegative.MatchString(nodeClassName) {
			weight -= 25
		}

		if rxPositive.MatchString(nodeClassName) {
			weight += 25
		}
	}

	// Look for a special ID
	if nodeID := id(node); nodeID != "" {
		if rxNegative.MatchString(nodeID) {
			weight -= 25
		}

		if rxPositive.MatchString(nodeID) {
			weight += 25
		}
	}

	return weight
}

// clean cleans a node of all elements of type "tag".
func (r *Readability) clean(node *html.Node, tag string) {
	isEmbed := indexOf([]string{"object", "embed", "iframe"}, tag) != -1

	r.removeNodes(getElementsByTagName(node, tag), func(element *html.Node) bool {
		// Allow YouTube and Vimeo videos through as people usually want to see those.
		if isEmbed {
			// Check the attributes to see if any of them contain YouTube or Vimeo.
			for _, attr := range element.Attr {
				if rxVideos.MatchString(attr.Val) {
					return false
				}
			}

			// For embed with <object> tag, check inner HTML as well.
			if tagName(element) == "object" && rxVideos.MatchString(innerHTML(element)) {
				return false
			}
		}

		return true
	})
}

// hasAncestorTag checks if a given node has one of its ancestor tag name
// matching the provided one. Pass maxDepth <= 0 for no depth limit, and an
// optional filterFn to require extra conditions on the matching ancestor.
//
// In Readability.js, default value for maxDepth is 3.
func (r *Readability) hasAncestorTag(node *html.Node, tag string, maxDepth int, filterFn func(*html.Node) bool) bool {
	depth := 0

	for node.Parent != nil {
		if maxDepth > 0 && depth > maxDepth {
			return false
		}

		if tagName(node.Parent) == tag && (filterFn == nil || filterFn(node.Parent)) {
			return true
		}

		node = node.Parent

		depth++
	}

	return false
}

// getRowAndColumnCount returns how many rows and columns this table has,
// honoring rowspan/colspan attributes (missing or zero spans count as 1).
func (r *Readability) getRowAndColumnCount(table *html.Node) (int, int) {
	rows := 0
	columns := 0
	trs := getElementsByTagName(table, "tr")

	for i := 0; i < len(trs); i++ {
		strRowSpan := getAttribute(trs[i], "rowspan")
		rowSpan, _ := strconv.Atoi(strRowSpan)

		if rowSpan == 0 {
			rowSpan = 1
		}

		rows += rowSpan

		// Now look for column-related info
		columnsInThisRow := 0
		cells := getElementsByTagName(trs[i], "td")

		for j := 0; j < len(cells); j++ {
			strColSpan := getAttribute(cells[j], "colspan")
			colSpan, _ := strconv.Atoi(strColSpan)

			if colSpan == 0 {
				colSpan = 1
			}

			columnsInThisRow += colSpan
		}

		// The column count of the table is the widest row seen.
		if columnsInThisRow > columns {
			columns = columnsInThisRow
		}
	}

	return rows, columns
}

// isReadabilityDataTable determines if a Node is a data table.
func (r *Readability) isReadabilityDataTable(node *html.Node) bool {
	return hasAttribute(node, "data-readability-table")
}

// setReadabilityDataTable marks whether a Node is data table or not.
func (r *Readability) setReadabilityDataTable(node *html.Node, isDataTable bool) {
	if isDataTable {
		setAttribute(node, "data-readability-table", "true")
		return
	}

	removeAttribute(node, "data-readability-table")
}

// markDataTables looks for "data" (as opposed to "layout") tables and mark it.
// The heuristics run in order: explicit hints (role, datatable, summary,
// caption, data-table descendants), then nesting, then sheer size.
func (r *Readability) markDataTables(root *html.Node) {
	tables := getElementsByTagName(root, "table")

	for i := 0; i < len(tables); i++ {
		table := tables[i]

		// role="presentation" explicitly marks a layout table.
		role := getAttribute(table, "role")
		if role == "presentation" {
			r.setReadabilityDataTable(table, false)
			continue
		}

		datatable := getAttribute(table, "datatable")
		if datatable == "0" {
			r.setReadabilityDataTable(table, false)
			continue
		}

		// A summary attribute is a strong signal of a data table.
		if hasAttribute(table, "summary") {
			r.setReadabilityDataTable(table, true)
			continue
		}

		// A non-empty caption also indicates a data table.
		if captions := getElementsByTagName(table, "caption"); len(captions) > 0 {
			if caption := captions[0]; caption != nil && len(childNodes(caption)) > 0 {
				r.setReadabilityDataTable(table, true)
				continue
			}
		}

		// If the table has a descendant with any of these tags, consider a data table:
		hasDataTableDescendantTags := false
		for _, descendantTag := range []string{"col", "colgroup", "tfoot", "thead", "th"} {
			descendants := getElementsByTagName(table, descendantTag)
			if len(descendants) > 0 && descendants[0] != nil {
				hasDataTableDescendantTags = true
				break
			}
		}

		if hasDataTableDescendantTags {
			r.setReadabilityDataTable(table, true)
			continue
		}

		// Nested tables indicates a layout table:
		if len(getElementsByTagName(table, "table")) > 0 {
			r.setReadabilityDataTable(table, false)
			continue
		}

		rows, columns := r.getRowAndColumnCount(table)

		if rows >= 10 || columns > 4 {
			r.setReadabilityDataTable(table, true)
			continue
		}

		// Now just go by size entirely:
		if rows*columns > 10 {
			r.setReadabilityDataTable(table, true)
		}
	}
}

// cleanConditionally cleans an element of all tags of type "tag" if they look
// fishy. "Fishy" is an algorithm based on content length, classnames, link
// density, number of images & embeds, etc. The removal callback returns true
// for nodes that should be deleted.
func (r *Readability) cleanConditionally(element *html.Node, tag string) {
	if !r.flags.cleanConditionally {
		return
	}

	isList := tag == "ul" || tag == "ol"

	// Gather counts for other typical elements embedded within. Traverse
	// backwards so we can remove nodes at the same time without effecting
	// the traversal.
	r.removeNodes(getElementsByTagName(element, tag), func(node *html.Node) bool {
		// Never remove data tables or content living inside one.
		if tag == "table" && r.isReadabilityDataTable(node) {
			return false
		}

		if r.hasAncestorTag(node, "table", -1, r.isReadabilityDataTable) {
			return false
		}

		weight := r.getClassWeight(node)
		if weight < 0 {
			return true
		}

		if r.getCharCount(node, ",") < 10 {
			// If there are not many commas and the number of non-paragraph
			// elements is more than paragraphs or other ominous signs, remove
			// the element.
			p := float64(len(getElementsByTagName(node, "p")))
			img := float64(len(getElementsByTagName(node, "img")))
			// NOTE(review): the -100 offset matches Readability.js, which
			// uses the same constant; it effectively neutralizes the
			// "li > p" condition for lists of fewer than ~100 items.
			li := float64(len(getElementsByTagName(node, "li")) - 100)
			input := float64(len(getElementsByTagName(node, "input")))

			embedCount := 0
			embeds := r.concatNodeLists(
				getElementsByTagName(node, "object"),
				getElementsByTagName(node, "embed"),
				getElementsByTagName(node, "iframe"),
			)

			for _, embed := range embeds {
				// Do not delete if Embed has attribute matching Video regex.
				for _, attr := range embed.Attr {
					if rxVideos.MatchString(attr.Val) {
						return false
					}
				}

				// For embed with <object> tag, check inner HTML as well.
				if tagName(embed) == "object" && rxVideos.MatchString(innerHTML(embed)) {
					return false
				}

				embedCount++
			}

			linkDensity := r.getLinkDensity(node)
			contentLength := len(r.getInnerText(node, true))

			// Remove when any of the "fishy" signals fire.
			return (img > 1 && p/img < 0.5 && !r.hasAncestorTag(node, "figure", 3, nil)) ||
				(!isList && li > p) ||
				(input > math.Floor(p/3)) ||
				(!isList && contentLength < 25 && (img == 0 || img > 2) && !r.hasAncestorTag(node, "figure", 3, nil)) ||
				(!isList && weight < 25 && linkDensity > 0.2) ||
				(weight >= 25 && linkDensity > 0.5) ||
				((embedCount == 1 && contentLength < 75) || embedCount > 1)
		}

		return false
	})
}

// cleanMatchedNodes cleans out elements whose ID and CSS class combinations
// match specific string.
func (r *Readability) cleanMatchedNodes(e *html.Node, filter func(*html.Node, string) bool) {
	// Stop once the depth-first walk leaves e's subtree.
	endOfSearchMarkerNode := r.getNextNode(e, true)
	next := r.getNextNode(e, false)

	for next != nil && next != endOfSearchMarkerNode {
		if filter != nil && filter(next, className(next)+"\x20"+id(next)) {
			next = r.removeAndGetNext(next)
		} else {
			next = r.getNextNode(next, false)
		}
	}
}

// cleanHeaders cleans out spurious headers from an Element. Checks things like
// classnames and link density. Only h1 and h2 are inspected, matching the
// range used by Readability.js.
func (r *Readability) cleanHeaders(e *html.Node) {
	for headerIndex := 1; headerIndex < 3; headerIndex++ {
		headerTag := fmt.Sprintf("h%d", headerIndex)

		r.removeNodes(getElementsByTagName(e, headerTag), func(header *html.Node) bool {
			return r.getClassWeight(header) < 0
		})
	}
}

// isProbablyVisible determines if a node is visible: no display:none in its
// inline style, no `hidden` attribute, and no aria-hidden="true" (unless the
// node carries the "fallback-image" class).
func (r *Readability) isProbablyVisible(node *html.Node) bool {
	nodeStyle := getAttribute(node, "style")
	nodeAriaHidden := getAttribute(node, "aria-hidden")
	className := getAttribute(node, "class")

	return (nodeStyle == "" || !rxDisplayNone.MatchString(nodeStyle)) &&
		!hasAttribute(node, "hidden") &&
		(nodeAriaHidden == "" ||
			nodeAriaHidden != "true" ||
			strings.Contains(className, "fallback-image"))
}

// fixRelativeURIs converts each <a> and <img> URI in the given element to an
// absolute URI, ignoring #ref URIs.
func (r *Readability) fixRelativeURIs(articleContent *html.Node) {
	links := r.getAllNodesWithTag(articleContent, "a")

	r.forEachNode(links, func(link *html.Node, _ int) {
		href := getAttribute(link, "href")

		if href == "" {
			return
		}

		// Replace links with javascript: URIs with text content, since they
		// will not work after scripts have been removed from the page.
		if strings.HasPrefix(href, "javascript:") {
			text := createTextNode(textContent(link))
			replaceNode(link, text)
			return
		}

		newHref := toAbsoluteURI(href, r.documentURI)

		// If the URI cannot be resolved, drop the attribute entirely.
		if newHref == "" {
			removeAttribute(link, "href")
			return
		}

		setAttribute(link, "href", newHref)
	})

	imgs := r.getAllNodesWithTag(articleContent, "img")

	r.forEachNode(imgs, func(img *html.Node, _ int) {
		src := getAttribute(img, "src")

		if src == "" {
			return
		}

		newSrc := toAbsoluteURI(src, r.documentURI)

		if newSrc == "" {
			removeAttribute(img, "src")
			return
		}

		setAttribute(img, "src", newSrc)
	})
}

// cleanClasses removes the class="" attribute from every element in the given
// subtree, except those that match CLASSES_TO_PRESERVE and classesToPreserve
// array from the options object.
func (r *Readability) cleanClasses(node *html.Node) {
	nodeClassName := className(node)
	preservedClassName := []string{}

	// Keep only classes explicitly whitelisted in ClassesToPreserve.
	for _, class := range strings.Fields(nodeClassName) {
		if indexOf(r.ClassesToPreserve, class) != -1 {
			preservedClassName = append(preservedClassName, class)
		}
	}

	if len(preservedClassName) > 0 {
		setAttribute(node, "class", strings.Join(preservedClassName, "\x20"))
	} else {
		removeAttribute(node, "class")
	}

	for child := firstElementChild(node); child != nil; child = nextElementSibling(child) {
		r.cleanClasses(child)
	}
}

// clearReadabilityAttr removes Readability attribute created by the parser
// (score and data-table markers) from the node and its whole subtree.
func (r *Readability) clearReadabilityAttr(node *html.Node) {
	removeAttribute(node, "data-readability-score")
	removeAttribute(node, "data-readability-table")

	for child := firstElementChild(node); child != nil; child = nextElementSibling(child) {
		r.clearReadabilityAttr(child)
	}
}

// isSingleImage reports whether node is an img element, or wraps exactly one
// child (recursively) that is, with no other non-whitespace text content.
func (r *Readability) isSingleImage(node *html.Node) bool {
	if tagName(node) == "img" {
		return true
	}

	children := children(node)
	textContent := textContent(node)
	if len(children) != 1 || strings.TrimSpace(textContent) != "" {
		return false
	}

	return r.isSingleImage(children[0])
}

// removeComments removes every HTML comment node found under doc.
func (r *Readability) removeComments(doc *html.Node) {
	var comments []*html.Node
	var finder func(*html.Node)

	// Collect all comment nodes first, then remove them, so removal does
	// not disturb the traversal.
	finder = func(node *html.Node) {
		if node.Type == html.CommentNode {
			comments = append(comments, node)
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			finder(child)
		}
	}

	for child := doc.FirstChild; child != nil; child = child.NextSibling {
		finder(child)
	}

	r.removeNodes(comments, nil)
}

// postProcessContent runs post-process modifications to the article content.
func (r *Readability) postProcessContent(articleContent *html.Node) {
	// Convert relative URIs to absolute URIs so we can open them.
	r.fixRelativeURIs(articleContent)

	// Remove CSS classes.
	r.cleanClasses(articleContent)

	// Remove readability attributes.
	r.clearReadabilityAttr(articleContent)
}

// Parse parses input and find the main readable content. pageURL is used to
// resolve relative URIs in the extracted article. It returns a populated
// Article, or an error when the URL or HTML cannot be parsed, or when the
// document exceeds MaxElemsToParse.
func (r *Readability) Parse(input io.Reader, pageURL string) (Article, error) {
	var err error

	// Reset parser data so the same Readability value can be reused.
	r.articleTitle = ""
	r.articleByline = ""
	r.attempts = []parseAttempt{}
	r.flags.stripUnlikelys = true
	r.flags.useWeightClasses = true
	r.flags.cleanConditionally = true

	// Parse page URL.
	if r.documentURI, err = url.ParseRequestURI(pageURL); err != nil {
		return Article{}, fmt.Errorf("failed to parse URL: %v", err)
	}

	// Parse input.
	if r.doc, err = html.Parse(input); err != nil {
		return Article{}, fmt.Errorf("failed to parse input: %v", err)
	}

	// Avoid parsing too large documents, as per configuration option.
	if r.MaxElemsToParse > 0 {
		numTags := len(getElementsByTagName(r.doc, "*"))

		if numTags > r.MaxElemsToParse {
			return Article{}, fmt.Errorf("too many elements: %d", numTags)
		}
	}

	// Remove script tags from the document.
	r.removeScripts(r.doc)

	// Prepares the HTML document.
	r.prepDocument()

	// Fetch metadata.
	metadata := r.getArticleMetadata()
	r.articleTitle = metadata.Title

	// Try to grab article content.
	finalHTMLContent := ""
	finalTextContent := ""
	readableNode := &html.Node{}
	articleContent := r.grabArticle()

	if articleContent != nil {
		r.postProcessContent(articleContent)

		// If we have not found an excerpt in the article's metadata, use the
		// article's first paragraph as the excerpt. This is used for displaying
		// a preview of the article's content.
		if metadata.Excerpt == "" {
			paragraphs := getElementsByTagName(articleContent, "p")

			if len(paragraphs) > 0 {
				metadata.Excerpt = strings.TrimSpace(textContent(paragraphs[0]))
			}
		}

		readableNode = firstElementChild(articleContent)
		finalHTMLContent = innerHTML(articleContent)
		finalTextContent = textContent(articleContent)
		finalTextContent = strings.TrimSpace(finalTextContent)
	}

	finalByline := metadata.Byline

	if finalByline == "" {
		finalByline = r.articleByline
	}

	return Article{
		Title:       r.articleTitle,
		Byline:      finalByline,
		Node:        readableNode,
		Content:     finalHTMLContent,
		TextContent: finalTextContent,
		Length:      len(finalTextContent),
		Excerpt:     metadata.Excerpt,
		SiteName:    metadata.SiteName,
		Image:       metadata.Image,
		Favicon:     metadata.Favicon,
	}, nil
}

// IsReadable decides whether the document is usable or not without parsing the
// whole thing. In the original `mozilla/readability` library, this method is
// located in `Readability-readable.js`.
func (r *Readability) IsReadable(input io.Reader) bool {
	doc, err := html.Parse(input)

	if err != nil {
		return false
	}

	// Get <p> and <pre> nodes. Also get DIV nodes which have BR node(s) and
	// append them into the `nodes` variable. Some articles' DOM structures
	// might look like:
	//
	// <div>
	//     Sentences<br>
	//     <br>
	//     Sentences<br>
	// </div>
	//
	// So we need to make sure only fetch the div once.
	// To do so, we will use map as dictionary.
	nodeList := make([]*html.Node, 0)
	nodeDict := make(map[*html.Node]struct{})
	var finder func(*html.Node)

	finder = func(node *html.Node) {
		if node.Type == html.ElementNode {
			tag := tagName(node)
			if tag == "p" || tag == "pre" {
				if _, exist := nodeDict[node]; !exist {
					nodeList = append(nodeList, node)
					nodeDict[node] = struct{}{}
				}
			} else if tag == "br" && node.Parent != nil && tagName(node.Parent) == "div" {
				// Deduplicate: a div with several <br> children must be
				// collected only once.
				if _, exist := nodeDict[node.Parent]; !exist {
					nodeList = append(nodeList, node.Parent)
					nodeDict[node.Parent] = struct{}{}
				}
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			finder(child)
		}
	}

	finder(doc)

	// This is a little cheeky, we use the accumulator 'score' to decide what
	// to return from this callback.
	score := float64(0)

	return r.someNode(nodeList, func(node *html.Node) bool {
		if !r.isProbablyVisible(node) {
			return false
		}

		// Skip nodes whose class/id look like boilerplate, unless they
		// also match the "maybe a candidate" pattern.
		matchString := className(node) + "\x20" + id(node)
		if rxUnlikelyCandidates.MatchString(matchString) &&
			!rxOkMaybeItsACandidate.MatchString(matchString) {
			return false
		}

		if tagName(node) == "p" && r.hasAncestorTag(node, "li", -1, nil) {
			return false
		}

		nodeText := strings.TrimSpace(textContent(node))
		nodeTextLength := len(nodeText)
		if nodeTextLength < 140 {
			return false
		}

		// Accumulate diminishing returns for text beyond the minimum;
		// the document is readable once the total crosses 20.
		score += math.Sqrt(float64(nodeTextLength - 140))
		if score > 20 {
			return true
		}

		return false
	})
}