├── .gitignore ├── LICENSE ├── README.md ├── article.go ├── cleaner.go ├── configuration.go ├── crawler.go ├── doc.go ├── extractor.go ├── goose.go ├── goose.json ├── helpers.go ├── images.go ├── outputformatter.go ├── parser.go ├── stopwords.go ├── utils.go ├── videos.go └── wordstats.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | GoOse 2 | ===== 3 | 4 | HTML Content / Article Extractor in Golang 5 | 6 | This is a golang port of "Goose" originally licensed to Gravity.com 7 | under one or more contributor license agreements. See the NOTICE file 8 | distributed with this work for additional information 9 | regarding copyright ownership. 10 | 11 | Golang port was written by Antonio Linari 12 | 13 | Gravity.com licenses this file 14 | to you under the Apache License, Version 2.0 (the "License"); 15 | you may not use this file except in compliance 16 | with the License.
You may obtain a copy of the License at 17 | 18 | http://www.apache.org/licenses/LICENSE-2.0 19 | 20 | Unless required by applicable law or agreed to in writing, software 21 | distributed under the License is distributed on an "AS IS" BASIS, 22 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 23 | See the License for the specific language governing permissions and 24 | limitations under the License. 25 | 26 | INSTALL 27 | ======= 28 | go get github.com/advancedlogic/GoOse 29 | 30 | HOW TO USE IT 31 | ============= 32 | 33 | ```Go 34 | package main 35 | 36 | import ( 37 | "github.com/advancedlogic/GoOse" 38 | ) 39 | 40 | func main() { 41 | g := goose.New() 42 | article := g.ExtractFromUrl("http://edition.cnn.com/2012/07/08/opinion/banzi-ted-open-source/index.html") 43 | println("title", article.Title) 44 | println("description", article.MetaDescription) 45 | println("keywords", article.MetaKeywords) 46 | println("content", article.CleanedText) 47 | println("url", article.FinalUrl) 48 | println("top image", article.TopImage) 49 | } 50 | ``` 51 | 52 | TODO 53 | ==== 54 | 55 | - [ ] better organize code 56 | - [ ] add comments and godoc 57 | - [ ] improve "xpath" like queries 58 | - [ ] add other image extractions techniques (imagemagick) 59 | 60 | THANKS TO 61 | ========= 62 | ``` 63 | @Martin Angers for goquery 64 | @Fatih Arslan for set 65 | GoLang team for the amazing language and net/html 66 | ``` 67 | -------------------------------------------------------------------------------- /article.go: -------------------------------------------------------------------------------- 1 | package goose 2 | 3 | import ( 4 | "github.com/PuerkitoBio/goquery" 5 | "gopkg.in/fatih/set.v0" 6 | ) 7 | 8 | type Article struct { 9 | Title string 10 | CleanedText string 11 | MetaDescription string 12 | MetaLang string 13 | MetaFavicon string 14 | MetaKeywords string 15 | CanonicalLink string 16 | Domain string 17 | TopNode *goquery.Selection 18 | TopImage 
string 19 | Tags *set.Set 20 | Movies *set.Set 21 | FinalUrl string 22 | LinkHash string 23 | RawHtml string 24 | Doc *goquery.Document 25 | //raw_doc 26 | PublishDate string 27 | AdditionalData map[string]string 28 | Delta int64 29 | } 30 | 31 | //Simple ToString: it shows just the title 32 | //TODO: add more fields and pretty print 33 | func (article *Article) ToString() string { 34 | out := article.Title 35 | return out 36 | } 37 | -------------------------------------------------------------------------------- /cleaner.go: -------------------------------------------------------------------------------- 1 | package goose 2 | 3 | import ( 4 | "golang.org/x/net/html" 5 | "golang.org/x/net/html/atom" 6 | "container/list" 7 | "github.com/PuerkitoBio/goquery" 8 | "log" 9 | "regexp" 10 | "strings" 11 | ) 12 | 13 | type cleaner struct { 14 | config configuration 15 | } 16 | 17 | func NewCleaner(config configuration) cleaner { 18 | return cleaner{ 19 | config: config, 20 | } 21 | } 22 | 23 | var divToPElementsPattern = regexp.MustCompile("<(a|blockquote|dl|div|img|ol|p|pre|table|ul)") 24 | var tabsRegEx, _ = regexp.Compile("\\t|^\\s+$]") 25 | var REMOVENODES_RE = regexp.MustCompile("^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar|comment|PopularQuestions|contact|foot|footer|Footer|footnote|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt|links|meta$|scroll|shoutbox|sponsor|tags|socialnetworking|socialNetworking|cnnStryHghLght|cnn_stryspcvbx|^inset$|pagetools|post-attributes|welcome_form|contentTools2|the_answers|communitypromo|runaroundLeft|subscribe|vcard|articleheadings|date|^print$|popup|author-dropdown|tools|socialtools|byline|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text|legende|ajoutVideo|timestamp|js_replies") 26 | var CAPTIONS_RE = regexp.MustCompile("^caption$") 27 | var GOOGLE_RE = regexp.MustCompile(" google ") 28 | var MORE_RE = regexp.MustCompile("^[^entry-]more.*$") 29 | var FACEBOOK_RE = regexp.MustCompile("[^-]facebook") 30 | 
var FACEBOOK_BROADCASTING_RE = regexp.MustCompile("facebook-broadcasting") 31 | var TWITTER_RE = regexp.MustCompile("[^-]twitter") 32 | 33 | func (this *cleaner) clean(article *Article) *goquery.Document { 34 | if this.config.debug { 35 | log.Println("Starting cleaning phase with Cleaner") 36 | } 37 | docToClean := article.Doc 38 | docToClean = this.cleanArticleTags(docToClean) 39 | docToClean = this.cleanEMTags(docToClean) 40 | docToClean = this.dropCaps(docToClean) 41 | docToClean = this.removeScriptsStyle(docToClean) 42 | docToClean = this.cleanBadTags(docToClean) 43 | docToClean = this.removeNodesRegEx(docToClean, CAPTIONS_RE) 44 | docToClean = this.removeNodesRegEx(docToClean, GOOGLE_RE) 45 | docToClean = this.removeNodesRegEx(docToClean, MORE_RE) 46 | docToClean = this.removeNodesRegEx(docToClean, FACEBOOK_RE) 47 | docToClean = this.removeNodesRegEx(docToClean, FACEBOOK_BROADCASTING_RE) 48 | docToClean = this.removeNodesRegEx(docToClean, TWITTER_RE) 49 | 50 | docToClean = this.cleanParaSpans(docToClean) 51 | 52 | docToClean = this.convertDivsToParagraphs(docToClean, "div") 53 | docToClean = this.convertDivsToParagraphs(docToClean, "span") 54 | docToClean = this.convertDivsToParagraphs(docToClean, "article") 55 | docToClean = this.convertDivsToParagraphs(docToClean, "pre") 56 | 57 | return docToClean 58 | } 59 | 60 | func (this *cleaner) cleanArticleTags(doc *goquery.Document) *goquery.Document { 61 | tags := [3]string{"id", "name", "class"} 62 | articles := doc.Find("article") 63 | articles.Each(func(i int, s *goquery.Selection) { 64 | for _, tag := range tags { 65 | this.config.parser.delAttr(s, tag) 66 | } 67 | }) 68 | return doc 69 | } 70 | 71 | func (this *cleaner) cleanEMTags(doc *goquery.Document) *goquery.Document { 72 | ems := doc.Find("em") 73 | ems.Each(func(i int, s *goquery.Selection) { 74 | images := s.Find("img") 75 | if images.Length() == 0 { 76 | this.config.parser.dropTag(s) 77 | } 78 | }) 79 | if this.config.debug { 80 | log.Printf("Cleaning 
%d EM tags\n", ems.Size()) 81 | } 82 | return doc 83 | } 84 | 85 | func (this *cleaner) cleanCites(doc *goquery.Document) *goquery.Document { 86 | cites := doc.Find("cite") 87 | cites.Each(func(i int, s *goquery.Selection) { 88 | this.config.parser.removeNode(s) 89 | }) 90 | return doc 91 | } 92 | 93 | func (this *cleaner) cleanDivs(doc *goquery.Document) *goquery.Document { 94 | frames := make(map[string]int) 95 | framesNodes := make(map[string]*list.List) 96 | divs := doc.Find("div") 97 | divs.Each(func(i int, s *goquery.Selection) { 98 | children := s.Children() 99 | if children.Size() == 0 { 100 | text := s.Text() 101 | text = strings.Trim(text, " ") 102 | text = strings.Trim(text, "\t") 103 | text = strings.ToLower(text) 104 | frames[text]++ 105 | if framesNodes[text] == nil { 106 | framesNodes[text] = list.New() 107 | } 108 | framesNodes[text].PushBack(s) 109 | } 110 | }) 111 | for text, freq := range frames { 112 | if freq > 1 { 113 | selections := framesNodes[text] 114 | for s := selections.Front(); s != nil; s = s.Next() { 115 | selection := s.Value.(*goquery.Selection) 116 | this.config.parser.removeNode(selection) 117 | } 118 | } 119 | } 120 | return doc 121 | } 122 | 123 | func (this *cleaner) dropCaps(doc *goquery.Document) *goquery.Document { 124 | items := doc.Find("span") 125 | count := 0 //remove 126 | items.Each(func(i int, s *goquery.Selection) { 127 | attribute, exists := s.Attr("class") 128 | if exists && (strings.Contains(attribute, "dropcap") || strings.Contains(attribute, "drop_cap")) { 129 | count++ 130 | this.config.parser.dropTag(s) 131 | } 132 | }) 133 | if this.config.debug { 134 | log.Printf("Cleaning %d dropcap tags\n", count) 135 | } 136 | return doc 137 | } 138 | 139 | func (this *cleaner) removeScriptsStyle(doc *goquery.Document) *goquery.Document { 140 | if this.config.debug { 141 | log.Println("Starting to remove script tags") 142 | } 143 | scripts := doc.Find("script,style") 144 | scripts.Each(func(i int, s *goquery.Selection) { 
145 | this.config.parser.removeNode(s) 146 | }) 147 | if this.config.debug { 148 | log.Printf("Removed %d script and style tags\n", scripts.Size()) 149 | } 150 | 151 | //remove comments :) How???? 152 | return doc 153 | } 154 | 155 | func (this *cleaner) matchNodeRegEx(attribute string, pattern *regexp.Regexp) bool { 156 | return pattern.MatchString(attribute) 157 | } 158 | 159 | func (this *cleaner) removeNodesRegEx(doc *goquery.Document, pattern *regexp.Regexp) *goquery.Document { 160 | selectors := [3]string{"id", "class", "name"} 161 | for _, selector := range selectors { 162 | naughtyList := doc.Find("*[" + selector + "]") 163 | cont := 0 164 | naughtyList.Each(func(i int, s *goquery.Selection) { 165 | attribute, _ := s.Attr(selector) 166 | if this.matchNodeRegEx(attribute, pattern) { 167 | cont++ 168 | this.config.parser.removeNode(s) 169 | } 170 | }) 171 | 172 | if this.config.debug { 173 | log.Printf("regExRemoveNodes %d %s elements found against pattern %s\n", cont, selector, pattern.String()) 174 | } 175 | } 176 | return doc 177 | } 178 | 179 | func (this *cleaner) cleanBadTags(doc *goquery.Document) *goquery.Document { 180 | body := doc.Find("body") 181 | children := body.Children() 182 | selectors := [3]string{"id", "class", "name"} 183 | for _, selector := range selectors { 184 | children.Each(func(i int, s *goquery.Selection) { 185 | naughtyList := s.Find("*[" + selector + "]") 186 | cont := 0 187 | naughtyList.Each(func(j int, e *goquery.Selection) { 188 | attribute, _ := e.Attr(selector) 189 | if this.matchNodeRegEx(attribute, REMOVENODES_RE) { 190 | if this.config.debug { 191 | 192 | log.Printf("Cleaning: Removing node with %s: %s\n", selector, this.config.parser.name(selector, e)) 193 | } 194 | this.config.parser.removeNode(e) 195 | cont++ 196 | } 197 | }) 198 | if this.config.debug { 199 | log.Printf("%d naughty %s elements found", cont, selector) 200 | } 201 | }) 202 | } 203 | return doc 204 | } 205 | 206 | func (this *cleaner) 
cleanParaSpans(doc *goquery.Document) *goquery.Document { 207 | spans := doc.Find("span") 208 | spans.Each(func(i int, s *goquery.Selection) { 209 | parent := s.Parent() 210 | if parent != nil && parent.Length() > 0 && parent.Get(0).DataAtom == atom.P { 211 | node := s.Get(0) 212 | node.Data = s.Text() 213 | node.Type = html.TextNode 214 | } 215 | }) 216 | return doc 217 | } 218 | 219 | func (this *cleaner) getFlushedBuffer(fragment string) []*html.Node { 220 | output := make([]*html.Node, 0) 221 | reader := strings.NewReader(fragment) 222 | document, _ := html.Parse(reader) 223 | body := document.FirstChild.LastChild 224 | for c := body.FirstChild; c != nil; c = c.NextSibling { 225 | output = append(output, c) 226 | c.Parent = nil 227 | c.PrevSibling = nil 228 | } 229 | 230 | for _, o := range output { 231 | o.NextSibling = nil 232 | } 233 | return output 234 | } 235 | 236 | func (this *cleaner) replaceWithPara(div *goquery.Selection) { 237 | if div.Size() > 0 { 238 | node := div.Get(0) 239 | node.Data = atom.P.String() 240 | node.DataAtom = atom.P 241 | } 242 | } 243 | 244 | func (this *cleaner) tabsAndNewLinesReplacements(text string) string { 245 | text = strings.Replace(text, "\n", "\n\n", -1) 246 | text = tabsRegEx.ReplaceAllString(text, "") 247 | return text 248 | } 249 | 250 | func (this *cleaner) convertDivsToParagraphs(doc *goquery.Document, domType string) *goquery.Document { 251 | if this.config.debug { 252 | log.Println("Starting to replace bad divs...") 253 | } 254 | badDivs := 0 255 | convertedTextNodes := 0 256 | divs := doc.Find(domType) 257 | 258 | divs.Each(func(i int, div *goquery.Selection) { 259 | divHtml,_ := div.Html() 260 | if divToPElementsPattern.Match([]byte(divHtml)) { 261 | this.replaceWithPara(div) 262 | badDivs++ 263 | } else { 264 | replacementText := make([]string, 0) 265 | nodesToRemove := list.New() 266 | children := div.Contents() 267 | if this.config.debug { 268 | log.Printf("Found %d children of div\n", children.Size()) 269 | 
} 270 | children.EachWithBreak(func(i int, kid *goquery.Selection) bool { 271 | text := kid.Text() 272 | kidNode := kid.Get(0) 273 | tag := kidNode.Data 274 | if tag == text { 275 | tag = "#text" 276 | } 277 | if tag == "#text" { 278 | text = strings.Replace(text, "\n", "", -1) 279 | text = tabsRegEx.ReplaceAllString(text, "") 280 | if text == "" { 281 | return true 282 | } 283 | if len(text) > 1 { 284 | prev := kidNode.PrevSibling 285 | if this.config.debug { 286 | log.Printf("PARENT CLASS: %s NODENAME: %s\n", this.config.parser.name("class", div), tag) 287 | log.Printf("TEXTREPLACE: %s\n", strings.Replace(text, "\n", "", -1)) 288 | } 289 | if prev != nil && prev.DataAtom == atom.A { 290 | nodeSelection := kid.HasNodes(prev) 291 | html, _ := nodeSelection.Html() 292 | replacementText = append(replacementText, html) 293 | if this.config.debug { 294 | log.Printf("SIBLING NODENAME ADDITION: %s TEXT: %s\n", prev.Data, html) 295 | } 296 | } 297 | replacementText = append(replacementText, text) 298 | nodesToRemove.PushBack(kidNode) 299 | convertedTextNodes++ 300 | } 301 | 302 | } 303 | return true 304 | }) 305 | 306 | newNode := new(html.Node) 307 | newNode.Type = html.ElementNode 308 | newNode.Data = strings.Join(replacementText, "") 309 | newNode.DataAtom = atom.P 310 | div.First().AddNodes(newNode) 311 | 312 | for s := nodesToRemove.Front(); s != nil; s = s.Next() { 313 | node := s.Value.(*html.Node) 314 | if node != nil && node.Parent != nil { 315 | node.Parent.RemoveChild(node) 316 | } 317 | } 318 | } 319 | }) 320 | if this.config.debug { 321 | log.Printf("Found %d total divs with %d bad divs replaced and %d textnodes converted inside divs", divs.Size(), badDivs, convertedTextNodes) 322 | } 323 | return doc 324 | 325 | } 326 | -------------------------------------------------------------------------------- /configuration.go: -------------------------------------------------------------------------------- 1 | package goose 2 | 3 | import ( 4 | 
"github.com/advancedlogic/gojs-config" 5 | ) 6 | 7 | type configuration struct { 8 | localStoragePath string //not used in this version 9 | imagesMinBytes int //not used in this version 10 | enableImageFetching bool 11 | useMetaLanguage bool 12 | targetLanguage string 13 | imageMagickConvertPath string //not used in this version 14 | imageMagickIdentifyPath string //not used in this version 15 | browserUserAgent string 16 | debug bool 17 | extractPublishDate bool 18 | additionalDataExtractor bool 19 | 20 | //path to the stopwords folder 21 | stopWordsPath string 22 | stopWords StopWords 23 | parser *parser 24 | } 25 | 26 | func GetDefualtConfiguration(args ...string) configuration { 27 | if len(args) == 0 { 28 | return configuration{ 29 | localStoragePath: "", //not used in this version 30 | imagesMinBytes: 4500, //not used in this version 31 | enableImageFetching: true, 32 | useMetaLanguage: true, 33 | targetLanguage: "en", 34 | imageMagickConvertPath: "/usr/bin/convert", //not used in this version 35 | imageMagickIdentifyPath: "/usr/bin/identify", //not used in this version 36 | browserUserAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.52.7 (KHTML, like Gecko) Version/5.1.2 Safari/534.52.7", 37 | debug: false, 38 | extractPublishDate: false, 39 | additionalDataExtractor: false, 40 | stopWordsPath: "resources/stopwords", 41 | stopWords: NewStopwords(), //TODO with path 42 | parser: NewParser(), 43 | } 44 | } else { 45 | path := args[0] 46 | jsconfiguration, err := jsconfig.LoadConfig(path) 47 | if err != nil { 48 | panic(err.Error()) 49 | } 50 | stopWordsPath := jsconfiguration.String("stopWordsPath", "resources/stopwords") 51 | stopWords := NewStopwords() //TODO with path 52 | return configuration{ 53 | localStoragePath: jsconfiguration.String("localStoragePath", ""), //not used in this version 54 | imagesMinBytes: jsconfiguration.Int("imageMinBytes", 4500), //not used in this version 55 | enableImageFetching: 
jsconfiguration.Bool("enableImageFetching", true), 56 | useMetaLanguage: jsconfiguration.Bool("useMetaLanguage", true), 57 | targetLanguage: jsconfiguration.String("targetLanguage", "en"), 58 | imageMagickConvertPath: jsconfiguration.String("imageMagickConvertPath", "/usr/bin/convert"), //not used in this version 59 | imageMagickIdentifyPath: jsconfiguration.String("imageMagickIdentityPath", "/usr/bin/identify"), //not used in this version 60 | browserUserAgent: jsconfiguration.String("browserUserAgent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.52.7 (KHTML, like Gecko) Version/5.1.2 Safari/534.52.7"), 61 | debug: jsconfiguration.Bool("debug", false), 62 | extractPublishDate: jsconfiguration.Bool("extractPublishDate", false), //TODO 63 | additionalDataExtractor: jsconfiguration.Bool("additionalDataExtractor", false), //TODO 64 | stopWordsPath: stopWordsPath, 65 | stopWords: stopWords, 66 | parser: NewParser(), 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /crawler.go: -------------------------------------------------------------------------------- 1 | package goose 2 | 3 | import ( 4 | "code.google.com/p/go-charset/charset" 5 | _ "code.google.com/p/go-charset/data" 6 | "github.com/PuerkitoBio/goquery" 7 | "io/ioutil" 8 | "log" 9 | "net/http" 10 | "net/http/cookiejar" 11 | "strings" 12 | ) 13 | 14 | type Crawler struct { 15 | config configuration 16 | url string 17 | rawHtml string 18 | helper Helper 19 | } 20 | 21 | func NewCrawler(config configuration, url string, rawHtml string) Crawler { 22 | return Crawler{ 23 | config: config, 24 | url: url, 25 | rawHtml: rawHtml, 26 | } 27 | } 28 | 29 | func (this Crawler) Crawl() *Article { 30 | 31 | article := new(Article) 32 | this.assignParseCandidate() 33 | this.assignHtml() 34 | 35 | if this.rawHtml == "" { 36 | return article 37 | } 38 | 39 | reader := strings.NewReader(this.rawHtml) 40 | document, err := 
goquery.NewDocumentFromReader(reader) 41 | 42 | if err != nil { 43 | panic(err.Error()) 44 | } 45 | 46 | attr := "" 47 | selection := document.Find("meta").EachWithBreak(func(i int, s *goquery.Selection) bool { 48 | attr, exists := s.Attr("http-equiv") 49 | if exists && attr == "Content-Type" { 50 | return false 51 | } 52 | return true 53 | }) 54 | 55 | if selection != nil { 56 | attr, _ = selection.Attr("content") 57 | if strings.HasPrefix(attr, "text/html; charset=") { 58 | cs := strings.TrimPrefix(attr, "text/html; charset=") 59 | 60 | if cs != "utf-8" { 61 | r, _ := charset.NewReader(cs, strings.NewReader(this.rawHtml)) 62 | utf8, _ := ioutil.ReadAll(r) 63 | this.rawHtml = string(utf8) 64 | reader = strings.NewReader(this.rawHtml) 65 | document, err = goquery.NewDocumentFromReader(reader) 66 | } 67 | } 68 | } 69 | 70 | if err == nil { 71 | extractor := NewExtractor(this.config) 72 | html, _ := document.Html() 73 | start := TimeInNanoseconds() 74 | article.RawHtml = html 75 | article.FinalUrl = this.helper.url 76 | article.LinkHash = this.helper.linkHash 77 | article.Doc = document 78 | article.Title = extractor.getTitle(article) 79 | article.MetaLang = extractor.getMetaLanguage(article) 80 | article.MetaFavicon = extractor.getFavicon(article) 81 | 82 | article.MetaDescription = extractor.getMetaContentWithSelector(article, "meta[name=description]") 83 | article.MetaKeywords = extractor.getMetaContentWithSelector(article, "meta[name=keywords]") 84 | article.CanonicalLink = extractor.getCanonicalLink(article) 85 | article.Domain = extractor.getDomain(article) 86 | article.Tags = extractor.getTags(article) 87 | 88 | cleaner := NewCleaner(this.config) 89 | article.Doc = cleaner.clean(article) 90 | 91 | article.TopNode = extractor.calculateBestNode(article) 92 | if article.TopNode != nil { 93 | article.TopNode = extractor.postCleanup(article.TopNode) 94 | 95 | outputFormatter := new(outputFormatter) 96 | article.CleanedText = 
outputFormatter.getFormattedText(article) 97 | 98 | videoExtractor := NewVideoExtractor() 99 | article.Movies = videoExtractor.GetVideos(article) 100 | 101 | article.TopImage = OpenGraphResolver(article) 102 | if article.TopImage == "" { 103 | article.TopImage = WebPageResolver(article) 104 | } 105 | } 106 | 107 | stop := TimeInNanoseconds() 108 | delta := stop - start 109 | article.Delta = delta 110 | 111 | } else { 112 | panic(err.Error()) 113 | } 114 | return article 115 | } 116 | 117 | func (this *Crawler) assignParseCandidate() { 118 | if this.rawHtml != "" { 119 | this.helper = NewRawHelper(this.url, this.rawHtml) 120 | } else { 121 | this.helper = NewUrlHelper(this.url) 122 | } 123 | } 124 | 125 | func (this *Crawler) assignHtml() { 126 | if this.rawHtml == "" { 127 | cookieJar, _ := cookiejar.New(nil) 128 | client := &http.Client{ 129 | Jar: cookieJar, 130 | } 131 | req, err := http.NewRequest("GET", this.url, nil) 132 | if err == nil { 133 | req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.91 Safari/534.30") 134 | resp, err := client.Do(req) 135 | if err == nil { 136 | defer resp.Body.Close() 137 | contents, err := ioutil.ReadAll(resp.Body) 138 | if err == nil { 139 | this.rawHtml = string(contents) 140 | } else { 141 | log.Println(err.Error()) 142 | } 143 | } else { 144 | log.Println(err.Error()) 145 | } 146 | } else { 147 | log.Println(err.Error()) 148 | } 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | This is a golang port of "Goose" originaly licensed to Gravity.com 3 | under one or more contributor license agreements. See the NOTICE file 4 | distributed with this work for additional information 5 | regarding copyright ownership. 
6 | 7 | Golang port was written by Antonio Linari 8 | 9 | Gravity.com licenses this file 10 | to you under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | package goose 24 | -------------------------------------------------------------------------------- /extractor.go: -------------------------------------------------------------------------------- 1 | package goose 2 | 3 | import ( 4 | "golang.org/x/net/html" 5 | "golang.org/x/net/html/atom" 6 | "container/list" 7 | "github.com/PuerkitoBio/goquery" 8 | "gopkg.in/fatih/set.v0" 9 | "log" 10 | "math" 11 | "net/url" 12 | "regexp" 13 | "strconv" 14 | "strings" 15 | ) 16 | 17 | const DEFAULT_LANGUAGE = "en" 18 | 19 | var MOTLEY_REPLACEMENT = "�" 20 | var ESCAPED_FRAGMENT_REPLACEMENT = regexp.MustCompile("#!") 21 | var TITLE_REPLACEMENTS = regexp.MustCompile("»") 22 | var PIPE_SPLITTER = regexp.MustCompile("\\|") 23 | var DASH_SPLITTER = regexp.MustCompile(" - ") 24 | var ARROWS_SPLITTER = regexp.MustCompile("»") 25 | var COLON_SPLITTER = regexp.MustCompile(":") 26 | var SPACE_SPLITTER = regexp.MustCompile(" ") 27 | var A_REL_TAG_SELECTOR = "a[rel=tag]" 28 | var A_HREF_TAG_SELECTOR = [...]string{"/tag/", "/tags/", "/topic/", "?keyword"} 29 | var RE_LANG = "^[A-Za-z]{2}$" 30 | 31 | type contentExtractor struct { 32 | config configuration 33 | } 34 | 35 | func NewExtractor(config configuration) contentExtractor { 36 | return contentExtractor{ 37 | config: config, 38 | } 39 | } 40 | 41 | //if the article has a 
title set in the source, use that 42 | func (this *contentExtractor) getTitle(article *Article) string { 43 | title := "" 44 | doc := article.Doc 45 | 46 | titleElement := doc.Find("title") 47 | if titleElement == nil || titleElement.Size() == 0 { 48 | return title 49 | } 50 | 51 | titleText := titleElement.Text() 52 | usedDelimiter := false 53 | 54 | if strings.Contains(titleText, "|") { 55 | titleText = this.splitTitle(RegSplit(titleText, PIPE_SPLITTER)) 56 | usedDelimiter = true 57 | } 58 | 59 | if !usedDelimiter && strings.Contains(titleText, "-") { 60 | titleText = this.splitTitle(RegSplit(titleText, DASH_SPLITTER)) 61 | usedDelimiter = true 62 | } 63 | 64 | if !usedDelimiter && strings.Contains(titleText, "»") { 65 | titleText = this.splitTitle(RegSplit(titleText, ARROWS_SPLITTER)) 66 | usedDelimiter = true 67 | } 68 | 69 | if !usedDelimiter && strings.Contains(titleText, ":") { 70 | titleText = this.splitTitle(RegSplit(titleText, COLON_SPLITTER)) 71 | usedDelimiter = true 72 | } 73 | 74 | title = strings.Replace(titleText, MOTLEY_REPLACEMENT, "", -1) 75 | 76 | if this.config.debug { 77 | log.Printf("Page title is %s\n", title) 78 | } 79 | 80 | return title 81 | } 82 | 83 | func (this *contentExtractor) splitTitle(titles []string) string { 84 | largeTextLength := 0 85 | largeTextIndex := 0 86 | for i, current := range titles { 87 | if len(current) > largeTextLength { 88 | largeTextLength = len(current) 89 | largeTextIndex = i 90 | } 91 | } 92 | title := titles[largeTextIndex] 93 | title = strings.Replace(title, "»", "»", -1) 94 | return title 95 | } 96 | 97 | //if the article has meta language set in the source, use that 98 | func (this *contentExtractor) getMetaLanguage(article *Article) string { 99 | language := "" 100 | doc := article.Doc 101 | shtml := doc.Find("html") 102 | attr, _ := shtml.Attr("lang") 103 | if attr == "" { 104 | attr, _ = doc.Attr("lang") 105 | } 106 | if attr == "" { 107 | selection := doc.Find("meta").EachWithBreak(func(i int, s 
*goquery.Selection) bool { 108 | attr, exists := s.Attr("http-equiv") 109 | if exists && attr == "content-language" { 110 | return false 111 | } 112 | return true 113 | }) 114 | if selection != nil { 115 | attr, _ = selection.Attr("content") 116 | } 117 | } 118 | idx := strings.LastIndex(attr, "-") 119 | if idx == -1 { 120 | language = attr 121 | } else { 122 | language = attr[0 : idx] 123 | } 124 | 125 | _, ok := sw[language] 126 | 127 | if language == "" || !ok { 128 | language = this.config.stopWords.SimpleLanguageDetector(shtml.Text()) 129 | if language == "" { 130 | language = DEFAULT_LANGUAGE 131 | } 132 | } 133 | 134 | this.config.targetLanguage = language 135 | return language 136 | } 137 | 138 | //if the article has favicon set in the source, use that 139 | func (this *contentExtractor) getFavicon(article *Article) string { 140 | favicon := "" 141 | doc := article.Doc 142 | doc.Find("link").EachWithBreak(func(i int, s *goquery.Selection) bool { 143 | attr, exists := s.Attr("rel") 144 | if exists && strings.Contains(attr, "icon") { 145 | favicon, _ = s.Attr("href") 146 | return false 147 | } 148 | return true 149 | }) 150 | return favicon 151 | } 152 | 153 | func (this *contentExtractor) getMetaContentWithSelector(article *Article, selector string) string { 154 | content := "" 155 | doc := article.Doc 156 | selection := doc.Find(selector) 157 | content, _ = selection.Attr("content") 158 | return content 159 | } 160 | 161 | func (this *contentExtractor) getMetaContent(article *Article, metaName string) string { 162 | content := "" 163 | doc := article.Doc 164 | doc.Find("meta").EachWithBreak(func(i int, s *goquery.Selection) bool { 165 | attr, exists := s.Attr("name") 166 | if exists && attr == metaName { 167 | content, _ = s.Attr("content") 168 | return false 169 | } 170 | return true 171 | }) 172 | return content 173 | } 174 | 175 | func (this *contentExtractor) getMetaContents(article *Article, metaNames *set.Set) map[string]string { 176 | contents := 
make(map[string]string) 177 | doc := article.Doc 178 | counter := metaNames.Size() 179 | doc.Find("meta").EachWithBreak(func(i int, s *goquery.Selection) bool { 180 | attr, exists := s.Attr("name") 181 | if exists && metaNames.Has(attr) { 182 | content, _ := s.Attr("content") 183 | contents[attr] = content 184 | counter-- 185 | if counter < 0 { 186 | return false 187 | } 188 | } 189 | return true 190 | }) 191 | return contents 192 | } 193 | 194 | //if the article has meta description set in the source, use that 195 | func (this *contentExtractor) getMetaDescription(article *Article) string { 196 | return this.getMetaContent(article, "description") 197 | } 198 | 199 | //if the article has meta keywords set in the source, use that 200 | func (this *contentExtractor) getMetKeywords(article *Article) string { 201 | return this.getMetaContent(article, "keywords") 202 | } 203 | 204 | //if the article has meta canonical link set in the url 205 | func (this *contentExtractor) getCanonicalLink(article *Article) string { 206 | doc := article.Doc 207 | metas := doc.Find("link[rel=canonical]") 208 | if metas.Length() > 0 { 209 | meta := metas.First() 210 | href, _ := meta.Attr("href") 211 | href = strings.Trim(href, "\n") 212 | href = strings.Trim(href, " ") 213 | if href != "" { 214 | return href 215 | } 216 | } 217 | return article.FinalUrl 218 | } 219 | 220 | //extract domain and use that 221 | func (this *contentExtractor) getDomain(article *Article) string { 222 | canonicalLink := article.CanonicalLink 223 | u, err := url.Parse(canonicalLink) 224 | if err == nil { 225 | return u.Host 226 | } 227 | return "" 228 | } 229 | 230 | //if the article has tags set in the source, use that 231 | func (this *contentExtractor) getTags(article *Article) *set.Set { 232 | tags := set.New() 233 | doc := article.Doc 234 | selections := doc.Find(A_REL_TAG_SELECTOR) 235 | selections.Each(func(i int, s *goquery.Selection) { 236 | tags.Add(s.Text()) 237 | }) 238 | selections = doc.Find("a") 
//we're going to start looking for where the clusters of paragraphs are. We'll score a cluster based on the number of stopwords
//and the number of consecutive paragraphs together, which should form the cluster of text that this node is around
//also store on how high up the paragraphs are, comments are usually at the bottom and should get a lower score
//
//returns the scored parent (or grandparent) node with the highest gravityScore, or nil when no candidate node was found
func (this *contentExtractor) calculateBestNode(article *Article) *goquery.Selection {
	doc := article.Doc
	var topNode *goquery.Selection
	nodesToCheck := this.nodesToCheck(doc)
	if this.config.debug {
		log.Printf("Nodes to check %d\n", len(nodesToCheck))
	}
	startingBoost := 1.0
	cnt := 0
	i := 0
	parentNodes := set.New()
	nodesWithText := list.New()
	//first pass: keep only nodes with more than 2 stop words that are not dominated by links
	for _, node := range nodesToCheck {
		textNode := node.Text()
		ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, textNode)
		highLinkDensity := this.isHighLinkDensity(node)
		if ws.stopWordCount > 2 && !highLinkDensity {
			nodesWithText.PushBack(node)
		}
	}
	nodesNumber := nodesWithText.Len()
	negativeScoring := 0
	//size of the "bottom of the page" window: the last quarter of the kept nodes
	bottomNegativeScoring := float64(nodesNumber) * 0.25

	if this.config.debug {
		log.Printf("About to inspect num of nodes with text %d\n", nodesNumber)
	}

	//second pass: score every kept node and push that score up to its parent and grandparent
	for n := nodesWithText.Front(); n != nil; n = n.Next() {
		node := n.Value.(*goquery.Selection)
		boostScore := 0.0
		if this.isBoostable(node) {
			//cnt starts at 0 and only grows, so this condition always holds
			if cnt >= 0 {
				//the boost shrinks with every boosted node: 50, 25, 16.67, ...
				boostScore = float64((1.0 / startingBoost) * 50)
				startingBoost += 1
			}
		}

		//only pages with more than 15 kept nodes get a penalty for nodes in the bottom window
		if nodesNumber > 15 {
			if float64(nodesNumber-i) <= bottomNegativeScoring {
				booster := bottomNegativeScoring - float64(nodesNumber-i)
				//this overwrites any positive boost computed above
				boostScore = -math.Pow(booster, 2.0)
				negScore := math.Abs(boostScore) + float64(negativeScoring)
				//NOTE(review): a penalty larger than 40 is replaced by a positive 5.0 - confirm this is intended
				if negScore > 40 {
					boostScore = 5.0
				}
			}
		}

		if this.config.debug {
			log.Printf("Location Boost Score %1.5f on iteration %d id='%s' class='%s'\n", boostScore, i, this.config.parser.name("id", node), this.config.parser.name("class", node))
		}
		textNode := node.Text()
		ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, textNode)
		//the boost is truncated to an integer before it is added to the stop word count
		upScore := ws.stopWordCount + int(boostScore)
		parentNode := node.Parent()
		this.updateScore(parentNode, upScore)
		this.updateNodeCount(parentNode, 1)
		//NOTE(review): the set holds *goquery.Selection values; presumably every Parent() call yields a new pointer, so this check may never match - confirm
		if !parentNodes.Has(parentNode) {
			parentNodes.Add(parentNode)
		}
		parentParentNode := parentNode.Parent()
		//NOTE(review): Parent() appears to return a non-nil (possibly empty) selection, so this guard is presumably always true - confirm
		if parentParentNode != nil {
			this.updateNodeCount(parentParentNode, 1)
			//the grandparent receives half of the score (integer division)
			this.updateScore(parentParentNode, upScore/2.0)
			if !parentNodes.Has(parentParentNode) {
				parentNodes.Add(parentParentNode)
			}
		}
		cnt++
		i++
	}

	//pick the collected node with the highest gravityScore; with >= a later node wins a tie
	topNodeScore := 0
	parentNodesArray := parentNodes.List()
	for _, p := range parentNodesArray {
		e := p.(*goquery.Selection)
		if this.config.debug {
			log.Printf("ParentNode: score=%s nodeCount=%s id='%s' class='%s'\n", this.config.parser.name("gravityScore", e), this.config.parser.name("gravityNodes", e), this.config.parser.name("id", e), this.config.parser.name("class", e))
		}
		score := this.getScore(e)
		if score >= topNodeScore {
			topNode = e
			topNodeScore = score
		}
		//guarantees a result even when every score is negative
		if topNode == nil {
			topNode = e
		}
	}
	return topNode
}
this.getNodeGravityScore(node) 351 | } 352 | 353 | func (this *contentExtractor) getNodeGravityScore(node *goquery.Selection) int { 354 | grvScoreString, exists := node.Attr("gravityScore") 355 | if !exists { 356 | return 0 357 | } 358 | grvScore, err := strconv.Atoi(grvScoreString) 359 | if err != nil { 360 | return 0 361 | } 362 | return grvScore 363 | } 364 | 365 | //adds a score to the gravityScore Attribute we put on divs 366 | //we'll get the current score then add the score we're passing in to the current 367 | func (this *contentExtractor) updateScore(node *goquery.Selection, addToScore int) { 368 | currentScore := 0 369 | var err error 370 | scoreString, _ := node.Attr("gravityScore") 371 | if scoreString != "" { 372 | currentScore, err = strconv.Atoi(scoreString) 373 | if err != nil { 374 | currentScore = 0 375 | } 376 | } 377 | newScore := currentScore + addToScore 378 | this.config.parser.setAttr(node, "gravityScore", strconv.Itoa(newScore)) 379 | } 380 | 381 | //stores how many decent nodes are under a parent node 382 | func (this *contentExtractor) updateNodeCount(node *goquery.Selection, addToCount int) { 383 | currentScore := 0 384 | var err error 385 | scoreString, _ := node.Attr("gravityNodes") 386 | if scoreString != "" { 387 | currentScore, err = strconv.Atoi(scoreString) 388 | if err != nil { 389 | currentScore = 0 390 | } 391 | } 392 | newScore := currentScore + addToCount 393 | this.config.parser.setAttr(node, "gravityNodes", strconv.Itoa(newScore)) 394 | } 395 | 396 | //a lot of times the first paragraph might be the caption under an image so we'll want to make sure if we're going to 397 | //boost a parent node that it should be connected to other paragraphs, at least for the first n paragraphs 398 | //so we'll want to make sure that the next sibling is a paragraph and has at least some substatial weight to it 399 | func (this *contentExtractor) isBoostable(node *goquery.Selection) bool { 400 | stepsAway := 0 401 | next := node.Next() 402 | 
for next != nil && stepsAway < node.Siblings().Length() { 403 | currentNodeTag := node.Get(0).DataAtom.String() 404 | if currentNodeTag == "p" { 405 | if stepsAway >= 3 { 406 | if this.config.debug { 407 | log.Println("Next paragraph is too far away, not boosting") 408 | } 409 | return false 410 | } 411 | 412 | paraText := node.Text() 413 | ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, paraText) 414 | if ws.stopWordCount > 5 { 415 | if this.config.debug { 416 | log.Println("We're gonna boost this node, seems content") 417 | } 418 | return true 419 | } 420 | } 421 | 422 | stepsAway++ 423 | next = next.Next() 424 | } 425 | 426 | return false 427 | } 428 | 429 | //returns a list of nodes we want to search on like paragraphs and tables 430 | func (this *contentExtractor) nodesToCheck(doc *goquery.Document) []*goquery.Selection { 431 | output := make([]*goquery.Selection, 0) 432 | tags := []string{"p", "pre", "td"} 433 | for _, tag := range tags { 434 | selections := doc.Children().Find(tag) 435 | if selections != nil { 436 | selections.Each(func(i int, s *goquery.Selection) { 437 | output = append(output, s) 438 | }) 439 | } 440 | } 441 | return output 442 | } 443 | 444 | //checks the density of links within a node, is there not much text and most of it contains bad links? 
445 | //if so it's no good 446 | func (this *contentExtractor) isHighLinkDensity(node *goquery.Selection) bool { 447 | links := node.Find("a") 448 | if links == nil || links.Size() == 0 { 449 | return false 450 | } 451 | text := node.Text() 452 | words := strings.Split(text, " ") 453 | nwords := len(words) 454 | sb := make([]string, 0) 455 | links.Each(func(i int, s *goquery.Selection) { 456 | linkText := s.Text() 457 | sb = append(sb, linkText) 458 | }) 459 | linkText := strings.Join(sb, "") 460 | linkWords := strings.Split(linkText, " ") 461 | nlinkWords := len(linkWords) 462 | nlinks := links.Size() 463 | linkDivisor := float64(nlinkWords) / float64(nwords) 464 | score := linkDivisor * float64(nlinks) 465 | 466 | if this.config.debug { 467 | logText := "" 468 | if len(node.Text()) >= 51 { 469 | logText = node.Text()[0:50] 470 | } else { 471 | logText = node.Text() 472 | } 473 | log.Printf("Calculated link density score as %1.5f for node %s\n", score, logText) 474 | } 475 | if score > 1.0 { 476 | return true 477 | } 478 | return false 479 | } 480 | 481 | func (this *contentExtractor) isTableAndNoParaExist(selection *goquery.Selection) bool { 482 | subParagraph := selection.Find("p") 483 | subParagraph.Each(func(i int, s *goquery.Selection) { 484 | txt := s.Text() 485 | if len(txt) < 25 { 486 | node := s.Get(0) 487 | parent := node.Parent 488 | parent.RemoveChild(node) 489 | } 490 | }) 491 | 492 | subParagraph2 := selection.Find("p") 493 | if subParagraph2.Length() == 0 && selection.Get(0).DataAtom.String() != "td" { 494 | return true 495 | } 496 | return false 497 | } 498 | 499 | func (this *contentExtractor) isNodescoreThresholdMet(node *goquery.Selection, e *goquery.Selection) bool { 500 | topNodeScore := this.getNodeGravityScore(node) 501 | currentNodeScore := this.getNodeGravityScore(e) 502 | threasholdScore := float64(topNodeScore) * 0.08 503 | if (float64(currentNodeScore) < threasholdScore) && e.Get(0).DataAtom.String() != "td" { 504 | return false 505 | } 
506 | return true 507 | } 508 | 509 | //we could have long articles that have tons of paragraphs so if we tried to calculate the base score against 510 | //the total text score of those paragraphs it would be unfair. So we need to normalize the score based on the average scoring 511 | //of the paragraphs within the top node. For example if our total score of 10 paragraphs was 1000 but each had an average value of 512 | //100 then 100 should be our base. 513 | func (this *contentExtractor) getSiblingsScore(topNode *goquery.Selection) int { 514 | base := 100000 515 | paragraphNumber := 0 516 | paragraphScore := 0 517 | nodesToCheck := topNode.Find("p") 518 | nodesToCheck.Each(func(i int, s *goquery.Selection) { 519 | textNode := s.Text() 520 | ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, textNode) 521 | highLinkDensity := this.isHighLinkDensity(s) 522 | if ws.stopWordCount > 2 && !highLinkDensity { 523 | paragraphNumber++ 524 | paragraphScore += ws.stopWordCount 525 | } 526 | }) 527 | if paragraphNumber > 0 { 528 | base = paragraphScore / paragraphNumber 529 | } 530 | return base 531 | } 532 | 533 | func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection { 534 | ps := make([]*goquery.Selection, 0) 535 | if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 { 536 | ps = append(ps, currentSibling) 537 | return ps 538 | } else { 539 | potentialParagraphs := currentSibling.Find("p") 540 | potentialParagraphs.Each(func(i int, s *goquery.Selection) { 541 | text := s.Text() 542 | if len(text) > 0 { 543 | ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text) 544 | paragraphScore := ws.stopWordCount 545 | siblingBaselineScore := 0.30 546 | highLinkDensity := this.isHighLinkDensity(s) 547 | score := siblingBaselineScore * baselinescoreSiblingsPara 548 | if score < float64(paragraphScore) && !highLinkDensity { 549 | 
node := new(html.Node) 550 | node.Type = html.TextNode 551 | node.Data = text 552 | node.DataAtom = atom.P 553 | nodes := make([]*html.Node, 1) 554 | nodes[0] = node 555 | newSelection := new(goquery.Selection) 556 | newSelection.Nodes = nodes 557 | ps = append(ps, newSelection) 558 | } 559 | } 560 | 561 | }) 562 | } 563 | return ps 564 | } 565 | 566 | func (this *contentExtractor) walkSiblings(node *goquery.Selection) []*goquery.Selection { 567 | currentSibling := node.Prev() 568 | b := make([]*goquery.Selection, 0) 569 | for currentSibling.Length() != 0 { 570 | b = append(b, currentSibling) 571 | previousSibling := currentSibling.Prev() 572 | currentSibling = previousSibling 573 | } 574 | return b 575 | } 576 | 577 | //adds any siblings that may have a decent score to this node 578 | func (this *contentExtractor) addSiblings(topNode *goquery.Selection) *goquery.Selection { 579 | if this.config.debug { 580 | log.Println("Starting to add siblings") 581 | } 582 | baselinescoreSiblingsPara := this.getSiblingsScore(topNode) 583 | results := this.walkSiblings(topNode) 584 | for _, currentNode := range results { 585 | ps := this.getSiblingsContent(currentNode, float64(baselinescoreSiblingsPara)) 586 | for _, p := range ps { 587 | nodes := make([]*html.Node, len(topNode.Nodes)+1) 588 | nodes[0] = p.Get(0) 589 | for i, node := range topNode.Nodes { 590 | nodes[i+1] = node 591 | } 592 | topNode.Nodes = nodes 593 | } 594 | } 595 | return topNode 596 | } 597 | 598 | //remove any divs that looks like non-content, clusters of links, or paras with no gusto 599 | func (this *contentExtractor) postCleanup(targetNode *goquery.Selection) *goquery.Selection { 600 | if this.config.debug { 601 | log.Println("Starting cleanup Node") 602 | } 603 | node := this.addSiblings(targetNode) 604 | children := node.Children() 605 | children.Each(func(i int, s *goquery.Selection) { 606 | tag := s.Get(0).DataAtom.String() 607 | if tag != "p" { 608 | if this.config.debug { 609 | log.Printf("CLEANUP 
NODE: %s class: %s\n", this.config.parser.name("id", s), this.config.parser.name("class", s)) 610 | } 611 | //if this.isHighLinkDensity(s) || this.isTableAndNoParaExist(s) || !this.isNodescoreThresholdMet(node, s) { 612 | if this.isHighLinkDensity(s) { 613 | this.config.parser.removeNode(s) 614 | return 615 | } 616 | 617 | subParagraph := s.Find("p") 618 | subParagraph.Each(func(j int, e *goquery.Selection) { 619 | if len(e.Text()) < 25 { 620 | this.config.parser.removeNode(e) 621 | } 622 | }) 623 | 624 | subParagraph2 := s.Find("p") 625 | if subParagraph2.Length() == 0 && tag != "td" { 626 | if this.config.debug { 627 | log.Println("Removing node because it doesn't have any paragraphs") 628 | } 629 | this.config.parser.removeNode(s) 630 | } else { 631 | if this.config.debug { 632 | log.Println("Not removing TD node") 633 | } 634 | } 635 | return 636 | } 637 | }) 638 | return node 639 | } 640 | -------------------------------------------------------------------------------- /goose.go: -------------------------------------------------------------------------------- 1 | package goose 2 | 3 | type Goose struct { 4 | config configuration 5 | } 6 | 7 | func New(args ...string) Goose { 8 | 9 | return Goose{ 10 | config: GetDefualtConfiguration(args...), 11 | } 12 | } 13 | 14 | func (this Goose) ExtractFromUrl(url string) *Article { 15 | cc := NewCrawler(this.config, url, "") 16 | return cc.Crawl() 17 | } 18 | 19 | func (this Goose) ExtractFromRawHtml(url string, rawHtml string) *Article { 20 | cc := NewCrawler(this.config, url, rawHtml) 21 | return cc.Crawl() 22 | } 23 | -------------------------------------------------------------------------------- /goose.json: -------------------------------------------------------------------------------- 1 | /* 2 | This is a golang port of "Goose" originaly licensed to Gravity.com 3 | under one or more contributor license agreements. 
See the NOTICE file 4 | distributed with this work for additional information 5 | regarding copyright ownership. 6 | 7 | Golang port was written by Antonio Linari 8 | 9 | Gravity.com licenses this file 10 | to you under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance 12 | with the License. You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | */ 22 | 23 | //JSON Configuration file 24 | { 25 | "localStoragePath": "", 26 | "imagesMinBytes": 4500, 27 | "enableImageFetching": true, 28 | "useMetaLanguage": true, 29 | "targetLanguage": "en", 30 | "imageMagickConvertPath": "/usr/bin/convert", 31 | "imageMagickIdentifyPath": "/usr/bin/identify", 32 | "browserUserAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.52.7 (KHTML, like Gecko) Version/5.1.2 Safari/534.52.7", 33 | "debug": false, 34 | "extractPublishDate": false, 35 | "additionalDataExtractor": false, 36 | "stopWordsPath": "resources/stopwords" 37 | } 38 | -------------------------------------------------------------------------------- /helpers.go: -------------------------------------------------------------------------------- 1 | package goose 2 | 3 | import ( 4 | "crypto/md5" 5 | "fmt" 6 | "github.com/bjarneh/latinx" 7 | "io" 8 | "strings" 9 | "unicode/utf8" 10 | ) 11 | 12 | type Helper struct { 13 | urlString string 14 | url string 15 | linkHash string 16 | } 17 | 18 | func NewRawHelper(url string, rawHtml string) Helper { 19 | if utf8.ValidString(rawHtml) { 20 | converter := latinx.Get(latinx.ISO_8859_1) 21 | rawHtmlBytes, err := converter.Decode([]byte(rawHtml)) 
22 | if err != nil { 23 | fmt.Println(err.Error()) 24 | } 25 | rawHtml = string(rawHtmlBytes) 26 | } 27 | h := md5.New() 28 | io.WriteString(h, url) 29 | bytes := h.Sum(nil) 30 | helper := Helper{ 31 | urlString: url, 32 | url: url, 33 | linkHash: fmt.Sprintf("%s.%d", string(bytes), TimeInNanoseconds()), 34 | } 35 | return helper 36 | } 37 | 38 | func NewUrlHelper(url string) Helper { 39 | finalUrl := "" 40 | if strings.Contains(url, "#!") { 41 | finalUrl = strings.Replace(url, "#!", "?_escaped_fragment_=", -1) 42 | } else { 43 | finalUrl = url 44 | } 45 | h := md5.New() 46 | io.WriteString(h, finalUrl) 47 | bytes := h.Sum(nil) 48 | helper := Helper{ 49 | urlString: finalUrl, 50 | url: finalUrl, 51 | linkHash: fmt.Sprintf("%s.%d", string(bytes), TimeInNanoseconds()), 52 | } 53 | return helper 54 | } 55 | -------------------------------------------------------------------------------- /images.go: -------------------------------------------------------------------------------- 1 | package goose 2 | 3 | import ( 4 | "github.com/PuerkitoBio/goquery" 5 | "regexp" 6 | "strconv" 7 | "strings" 8 | ) 9 | 10 | type candidate struct { 11 | url string 12 | surface int 13 | score int 14 | } 15 | 16 | var largebig = regexp.MustCompile("(large|big)") 17 | 18 | var rules = map[*regexp.Regexp]int{ 19 | regexp.MustCompile("(large|big)"): 1, 20 | regexp.MustCompile("upload"): 1, 21 | regexp.MustCompile("media"): 1, 22 | regexp.MustCompile("gravatar.com"): -1, 23 | regexp.MustCompile("feeds.feedburner.com"): -1, 24 | regexp.MustCompile("(?i)icon"): -1, 25 | regexp.MustCompile("(?i)logo"): -1, 26 | regexp.MustCompile("(?i)spinner"): -1, 27 | regexp.MustCompile("(?i)loading"): -1, 28 | regexp.MustCompile("(?i)ads"): -1, 29 | regexp.MustCompile("badge"): -1, 30 | regexp.MustCompile("1x1"): -1, 31 | regexp.MustCompile("pixel"): -1, 32 | regexp.MustCompile("thumbnail[s]*"): -1, 33 | 
regexp.MustCompile(".html|.gif|.ico|button|twitter.jpg|facebook.jpg|ap_buy_photo|digg.jpg|digg.png|delicious.png|facebook.png|reddit.jpg|doubleclick|diggthis|diggThis|adserver|/ads/|ec.atdmt.com|mediaplex.com|adsatt|view.atdmt"): -1, 34 | } 35 | 36 | func score(tag *goquery.Selection) int { 37 | src, _ := tag.Attr("src") 38 | if src == "" { 39 | src, _ = tag.Attr("data-src") 40 | } 41 | if src == "" { 42 | src, _ = tag.Attr("data-lazy-src") 43 | } 44 | if src == "" { 45 | return -1 46 | } 47 | tagScore := 0 48 | for rule, score := range rules { 49 | if rule.MatchString(src) { 50 | tagScore += score 51 | } 52 | } 53 | 54 | alt, exists := tag.Attr("alt") 55 | if exists { 56 | if strings.Contains(alt, "thumbnail") { 57 | tagScore-- 58 | } 59 | } 60 | return tagScore 61 | } 62 | 63 | func WebPageResolver(article *Article) string { 64 | doc := article.Doc 65 | imgs := doc.Find("img") 66 | topImage := "" 67 | candidates := make([]candidate, 0) 68 | significantSurface := 320 * 200 69 | significantSurfaceCount := 0 70 | src := "" 71 | imgs.Each(func(i int, tag *goquery.Selection) { 72 | surface := 0 73 | src, _ = tag.Attr("src") 74 | if src == "" { 75 | src, _ = tag.Attr("data-src") 76 | } 77 | if src == "" { 78 | src, _ = tag.Attr("data-lazy-src") 79 | } 80 | if src == "" { 81 | return 82 | } 83 | 84 | width, _ := tag.Attr("width") 85 | height, _ := tag.Attr("height") 86 | if width != "" { 87 | w, _ := strconv.Atoi(width) 88 | if height != "" { 89 | h, _ := strconv.Atoi(height) 90 | surface = w * h 91 | } else { 92 | surface = w 93 | } 94 | } else { 95 | if height != "" { 96 | surface, _ = strconv.Atoi(height) 97 | } else { 98 | surface = 0 99 | } 100 | } 101 | 102 | if surface > significantSurface { 103 | significantSurfaceCount++ 104 | } 105 | 106 | tagscore := score(tag) 107 | if tagscore >= 0 { 108 | c := candidate{ 109 | url: src, 110 | surface: surface, 111 | score: score(tag), 112 | } 113 | candidates = append(candidates, c) 114 | } 115 | }) 116 | 117 | if 
len(candidates) == 0 { 118 | return "" 119 | } 120 | 121 | if significantSurfaceCount > 0 { 122 | bestCandidate := findBestCandidateFromSurface(candidates) 123 | topImage = bestCandidate.url 124 | } else { 125 | bestCandidate := findBestCandidateFromScore(candidates) 126 | topImage = bestCandidate.url 127 | } 128 | 129 | if topImage != "" && !strings.HasPrefix(topImage, "http") { 130 | topImage = "http://" + topImage 131 | } 132 | 133 | return topImage 134 | } 135 | 136 | func findBestCandidateFromSurface(candidates []candidate) candidate { 137 | max := 0 138 | var bestCandidate candidate 139 | for _, candidate := range candidates { 140 | surface := candidate.surface 141 | if surface >= max { 142 | max = surface 143 | bestCandidate = candidate 144 | } 145 | } 146 | 147 | return bestCandidate 148 | } 149 | 150 | func findBestCandidateFromScore(candidates []candidate) candidate { 151 | max := 0 152 | var bestCandidate candidate 153 | for _, candidate := range candidates { 154 | score := candidate.score 155 | if score >= max { 156 | max = score 157 | bestCandidate = candidate 158 | } 159 | } 160 | 161 | return bestCandidate 162 | } 163 | 164 | type ogTag struct { 165 | tpe string 166 | attribute string 167 | name string 168 | value string 169 | } 170 | 171 | var ogTags = [4]ogTag{ 172 | ogTag{ 173 | tpe: "facebook", 174 | attribute: "property", 175 | name: "og:image", 176 | value: "content", 177 | }, 178 | ogTag{ 179 | tpe: "facebook", 180 | attribute: "rel", 181 | name: "image_src", 182 | value: "href", 183 | }, 184 | ogTag{ 185 | tpe: "twitter", 186 | attribute: "name", 187 | name: "twitter:image", 188 | value: "value", 189 | }, 190 | ogTag{ 191 | tpe: "twitter", 192 | attribute: "name", 193 | name: "twitter:image", 194 | value: "content", 195 | }, 196 | } 197 | 198 | type ogImage struct { 199 | url string 200 | tpe string 201 | score int 202 | } 203 | 204 | func OpenGraphResolver(article *Article) string { 205 | doc := article.Doc 206 | meta := doc.Find("meta") 207 
| links := doc.Find("link") 208 | topImage := "" 209 | meta = meta.Union(links) 210 | ogImages := make([]ogImage, 0) 211 | meta.Each(func(i int, tag *goquery.Selection) { 212 | for _, ogTag := range ogTags { 213 | attr, exist := tag.Attr(ogTag.attribute) 214 | value, vexist := tag.Attr(ogTag.value) 215 | if exist && attr == ogTag.name && vexist { 216 | ogImage := ogImage{ 217 | url: value, 218 | tpe: ogTag.tpe, 219 | score: 0, 220 | } 221 | 222 | ogImages = append(ogImages, ogImage) 223 | } 224 | } 225 | }) 226 | 227 | if len(ogImages) == 1 { 228 | topImage = ogImages[0].url 229 | } else { 230 | for _, ogImage := range ogImages { 231 | if largebig.MatchString(ogImage.url) { 232 | ogImage.score++ 233 | } 234 | if ogImage.tpe == "twitter" { 235 | ogImage.score++ 236 | } 237 | } 238 | topImage = findBestImageFromScore(ogImages).url 239 | } 240 | 241 | if topImage != "" && !strings.HasPrefix(topImage, "http") { 242 | topImage = "http://" + topImage 243 | } 244 | 245 | return topImage 246 | } 247 | 248 | func findBestImageFromScore(ogImages []ogImage) ogImage { 249 | max := 0 250 | var bestOGImage ogImage 251 | for _, ogImage := range ogImages { 252 | score := ogImage.score 253 | //println("OG", ogImage.url, score) 254 | if score >= max { 255 | max = score 256 | bestOGImage = ogImage 257 | } 258 | } 259 | 260 | return bestOGImage 261 | } 262 | -------------------------------------------------------------------------------- /outputformatter.go: -------------------------------------------------------------------------------- 1 | package goose 2 | 3 | import ( 4 | "github.com/PuerkitoBio/goquery" 5 | "golang.org/x/net/html" 6 | "regexp" 7 | "strconv" 8 | "strings" 9 | ) 10 | 11 | var normalizeWhitespaceRegexp = regexp.MustCompile(`[ \r\f\v\t]+`) 12 | var normalizeNl = regexp.MustCompile(`\n\n[\n]+`) 13 | 14 | type outputFormatter struct { 15 | topNode *goquery.Selection 16 | config configuration 17 | language string 18 | } 19 | 20 | func (this *outputFormatter) 
getLanguage(article *Article) string { 21 | if this.config.useMetaLanguage { 22 | if article.MetaLang != "" { 23 | return article.MetaLang 24 | } 25 | } 26 | return this.config.targetLanguage 27 | } 28 | 29 | func (this *outputFormatter) getTopNode() *goquery.Selection { 30 | return this.topNode 31 | } 32 | 33 | func (this *outputFormatter) getFormattedText(article *Article) string { 34 | this.topNode = article.TopNode 35 | this.language = this.getLanguage(article) 36 | if this.language == "" { 37 | this.language = this.config.targetLanguage 38 | } 39 | this.removeNegativescoresNodes() 40 | this.linksToText() 41 | this.replaceTagsWithText() 42 | this.removeParagraphsWithFewWords() 43 | return this.getOutputText() 44 | } 45 | 46 | func (this *outputFormatter) convertToText() string { 47 | txts := make([]string, 0) 48 | selections := this.topNode 49 | selections.Each(func(i int, s *goquery.Selection) { 50 | txt := s.Text() 51 | if txt != "" { 52 | txt = txt //unescape 53 | txtLis := strings.Trim(txt, "\n") 54 | txts = append(txts, txtLis) 55 | } 56 | }) 57 | return strings.Join(txts, "\n\n") 58 | } 59 | 60 | func (this *outputFormatter) linksToText() { 61 | links := this.topNode.Find("a") 62 | links.Each(func(i int, a *goquery.Selection) { 63 | imgs := a.Find("img") 64 | if imgs.Length() == 0 { 65 | node := a.Get(0) 66 | node.Data = a.Text() 67 | node.Type = html.TextNode 68 | } 69 | }) 70 | } 71 | 72 | func (this *outputFormatter) getOutputText() string { 73 | 74 | out := this.topNode.Text() 75 | out = normalizeWhitespaceRegexp.ReplaceAllString(out, " ") 76 | 77 | strArr := strings.Split(out, "\n") 78 | resArr := []string{} 79 | 80 | for i, v := range strArr { 81 | v = strings.TrimSpace(v) 82 | if v != "" { 83 | resArr = append(resArr, v) 84 | } else if i > 2 && strArr[i-2] != "" { 85 | resArr = append(resArr, "") 86 | } 87 | } 88 | 89 | out = strings.Join(resArr, "\n") 90 | out = normalizeNl.ReplaceAllString(out, "\n\n") 91 | 92 | out = strings.TrimSpace(out) 93 | 
return out 94 | } 95 | 96 | func (this *outputFormatter) removeNegativescoresNodes() { 97 | gravityItems := this.topNode.Find("*[gravityScore]") 98 | gravityItems.Each(func(i int, s *goquery.Selection) { 99 | score := 0 100 | sscore, exists := s.Attr("gravityScore") 101 | if exists { 102 | score, _ = strconv.Atoi(sscore) 103 | if score < 1 { 104 | sNode := s.Get(0) 105 | sNode.Parent.RemoveChild(sNode) 106 | } 107 | } 108 | 109 | }) 110 | } 111 | 112 | func (this *outputFormatter) replaceTagsWithText() { 113 | strongs := this.topNode.Find("strong") 114 | strongs.Each(func(i int, strong *goquery.Selection) { 115 | text := strong.Text() 116 | node := strong.Get(0) 117 | node.Type = html.TextNode 118 | node.Data = text 119 | }) 120 | 121 | bolds := this.topNode.Find("b") 122 | bolds.Each(func(i int, bold *goquery.Selection) { 123 | text := bold.Text() 124 | node := bold.Get(0) 125 | node.Type = html.TextNode 126 | node.Data = text 127 | }) 128 | 129 | italics := this.topNode.Find("i") 130 | italics.Each(func(i int, italic *goquery.Selection) { 131 | text := italic.Text() 132 | node := italic.Get(0) 133 | node.Type = html.TextNode 134 | node.Data = text 135 | }) 136 | } 137 | 138 | func (this *outputFormatter) removeParagraphsWithFewWords() { 139 | language := this.language 140 | if language == "" { 141 | language = "en" 142 | } 143 | allNodes := this.topNode.Children() 144 | allNodes.Each(func(i int, s *goquery.Selection) { 145 | sw := this.config.stopWords.stopWordsCount(language, s.Text()) 146 | if sw.wordCount < 5 && s.Find("object").Length() == 0 && s.Find("em").Length() == 0 { 147 | node := s.Get(0) 148 | node.Parent.RemoveChild(node) 149 | } 150 | }) 151 | 152 | } 153 | -------------------------------------------------------------------------------- /parser.go: -------------------------------------------------------------------------------- 1 | package goose 2 | 3 | import ( 4 | "golang.org/x/net/html" 5 | "github.com/PuerkitoBio/goquery" 6 | ) 7 | 8 | type 
parser struct{} 9 | 10 | func NewParser() *parser { 11 | return &parser{} 12 | } 13 | 14 | func (this *parser) dropTag(selection *goquery.Selection) { 15 | selection.Each(func(i int, s *goquery.Selection) { 16 | node := s.Get(0) 17 | node.Data = s.Text() 18 | node.Type = html.TextNode 19 | }) 20 | } 21 | 22 | func (this *parser) indexOfAttribute(selection *goquery.Selection, attr string) int { 23 | node := selection.Get(0) 24 | for i, a := range node.Attr { 25 | if a.Key == attr { 26 | return i 27 | } 28 | } 29 | return -1 30 | } 31 | 32 | func (this *parser) delAttr(selection *goquery.Selection, attr string) { 33 | idx := this.indexOfAttribute(selection, attr) 34 | if idx > -1 { 35 | node := selection.Get(0) 36 | node.Attr = append(node.Attr[:idx], node.Attr[idx+1:]...) 37 | } 38 | } 39 | 40 | func (this *parser) getElementsByTags(div *goquery.Selection, tags []string) *goquery.Selection { 41 | selection := new(goquery.Selection) 42 | for _, tag := range tags { 43 | selections := div.Find(tag) 44 | if selections != nil { 45 | selection = selection.Union(selections) 46 | } 47 | } 48 | return selection 49 | } 50 | 51 | func (this *parser) clear(selection *goquery.Selection) { 52 | selection.Nodes = make([]*html.Node, 0) 53 | } 54 | 55 | func (this *parser) removeNode(selection *goquery.Selection) { 56 | if selection != nil { 57 | node := selection.Get(0) 58 | if node != nil && node.Parent != nil { 59 | node.Parent.RemoveChild(node) 60 | } 61 | } 62 | } 63 | 64 | func (this *parser) name(selector string, selection *goquery.Selection) string { 65 | value, exists := selection.Attr(selector) 66 | if exists { 67 | return value 68 | } 69 | return "" 70 | } 71 | 72 | func (this *parser) setAttr(selection *goquery.Selection, attr string, value string) { 73 | if selection.Size() > 0 { 74 | node := selection.Get(0) 75 | attrs := make([]html.Attribute, 0) 76 | for _, a := range node.Attr { 77 | if a.Key != attr { 78 | newAttr := new(html.Attribute) 79 | newAttr.Key = a.Key 80 
| newAttr.Val = a.Val 81 | attrs = append(attrs, *newAttr) 82 | } 83 | } 84 | newAttr := new(html.Attribute) 85 | newAttr.Key = attr 86 | newAttr.Val = value 87 | attrs = append(attrs, *newAttr) 88 | node.Attr = attrs 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /stopwords.go: -------------------------------------------------------------------------------- 1 | package goose 2 | 3 | import ( 4 | "gopkg.in/fatih/set.v0" 5 | //"io/ioutil" 6 | "regexp" 7 | "strings" 8 | ) 9 | 10 | var PUNCTUATION = regexp.MustCompile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]") 11 | 12 | type StopWords struct { 13 | cachedStopWords map[string]*set.Set 14 | } 15 | 16 | func NewStopwords() StopWords { 17 | cachedStopWords := make(map[string]*set.Set) 18 | for lang, stopwords := range sw { 19 | lines := strings.Split(stopwords, "\n") 20 | cachedStopWords[lang] = set.New() 21 | for _, line := range lines { 22 | line = strings.Trim(line, " ") 23 | line = strings.Trim(line, "\t") 24 | line = strings.Trim(line, "\r") 25 | cachedStopWords[lang].Add(line) 26 | } 27 | } 28 | return StopWords{ 29 | cachedStopWords: cachedStopWords, 30 | } 31 | } 32 | 33 | /* 34 | func NewStopwords(path string) StopWords { 35 | cachedStopWords := make(map[string]*set.Set) 36 | files, err := ioutil.ReadDir(path) 37 | if err != nil { 38 | panic(err.Error()) 39 | } 40 | for _, file := range files { 41 | name := strings.Replace(file.Name(), ".txt", "", -1) 42 | name = strings.Replace(name, "stopwords-", "", -1) 43 | name = strings.ToLower(name) 44 | 45 | stops := set.New() 46 | lines := ReadLinesOfFile(path + "/" + file.Name()) 47 | for _, line := range lines { 48 | line = strings.Trim(line, " ") 49 | stops.Add(line) 50 | } 51 | cachedStopWords[name] = stops 52 | } 53 | 54 | return StopWords{ 55 | cachedStopWords: cachedStopWords, 56 | } 57 | } 58 | */ 59 | 60 | func (this *StopWords) removePunctuation(text string) string { 61 | return 
PUNCTUATION.ReplaceAllString(text, "") 62 | } 63 | 64 | func (this *StopWords) stopWordsCount(lang string, text string) wordStats { 65 | if text == "" { 66 | return wordStats{} 67 | } 68 | ws := wordStats{} 69 | stopWords := set.New() 70 | text = strings.ToLower(text) 71 | items := strings.Split(text, " ") 72 | stops := this.cachedStopWords[lang] 73 | count := 0 74 | if stops != nil { 75 | for _, item := range items { 76 | if stops.Has(item) { 77 | stopWords.Add(item) 78 | count++ 79 | } 80 | } 81 | } 82 | 83 | ws.stopWordCount = stopWords.Size() 84 | ws.wordCount = len(items) 85 | ws.stopWords = stopWords 86 | 87 | return ws 88 | } 89 | 90 | func (this StopWords) SimpleLanguageDetector(text string) string { 91 | max := 0 92 | currentLang := "en" 93 | 94 | for k, _ := range sw { 95 | ws := this.stopWordsCount(k, text) 96 | if ws.stopWordCount > max { 97 | max = ws.stopWordCount 98 | currentLang = k 99 | } 100 | } 101 | 102 | return currentLang 103 | } 104 | 105 | var sw = map[string]string{ 106 | "ar": ` 107 | فى 108 | في 109 | كل 110 | لم 111 | لن 112 | له 113 | من 114 | هو 115 | هي 116 | قوة 117 | كما 118 | لها 119 | منذ 120 | وقد 121 | ولا 122 | نفسه 123 | لقاء 124 | مقابل 125 | هناك 126 | وقال 127 | وكان 128 | نهاية 129 | وقالت 130 | وكانت 131 | للامم 132 | فيه 133 | كلم 134 | لكن 135 | وفي 136 | وقف 137 | ولم 138 | ومن 139 | وهو 140 | وهي 141 | يوم 142 | فيها 143 | منها 144 | مليار 145 | لوكالة 146 | يكون 147 | يمكن 148 | مليون 149 | حيث 150 | اكد 151 | الا 152 | اما 153 | امس 154 | السابق 155 | التى 156 | التي 157 | اكثر 158 | ايار 159 | ايضا 160 | ثلاثة 161 | الذاتي 162 | الاخيرة 163 | الثاني 164 | الثانية 165 | الذى 166 | الذي 167 | الان 168 | امام 169 | ايام 170 | خلال 171 | حوالى 172 | الذين 173 | الاول 174 | الاولى 175 | بين 176 | ذلك 177 | دون 178 | حول 179 | حين 180 | الف 181 | الى 182 | انه 183 | اول 184 | ضمن 185 | انها 186 | جميع 187 | الماضي 188 | الوقت 189 | المقبل 190 | اليوم 191 | ـ 192 | ف 193 | و 194 | و6 195 | قد 196 | لا 197 | ما 198 | مع 
199 | مساء 200 | هذا 201 | واحد 202 | واضاف 203 | واضافت 204 | فان 205 | قبل 206 | قال 207 | كان 208 | لدى 209 | نحو 210 | هذه 211 | وان 212 | واكد 213 | كانت 214 | واوضح 215 | مايو 216 | ب 217 | ا 218 | أ 219 | ، 220 | عشر 221 | عدد 222 | عدة 223 | عشرة 224 | عدم 225 | عام 226 | عاما 227 | عن 228 | عند 229 | عندما 230 | على 231 | عليه 232 | عليها 233 | زيارة 234 | سنة 235 | سنوات 236 | تم 237 | ضد 238 | بعد 239 | بعض 240 | اعادة 241 | اعلنت 242 | بسبب 243 | حتى 244 | اذا 245 | احد 246 | اثر 247 | برس 248 | باسم 249 | غدا 250 | شخصا 251 | صباح 252 | اطار 253 | اربعة 254 | اخرى 255 | بان 256 | اجل 257 | غير 258 | بشكل 259 | حاليا 260 | بن 261 | به 262 | ثم 263 | اف 264 | ان 265 | او 266 | اي 267 | بها 268 | صفر 269 | `, 270 | "en": ` 271 | a's 272 | able 273 | about 274 | above 275 | according 276 | accordingly 277 | across 278 | actually 279 | after 280 | afterwards 281 | again 282 | against 283 | ain't 284 | all 285 | allow 286 | allows 287 | almost 288 | alone 289 | along 290 | already 291 | also 292 | although 293 | always 294 | am 295 | among 296 | amongst 297 | an 298 | and 299 | another 300 | any 301 | anybody 302 | anyhow 303 | anyone 304 | anything 305 | anyway 306 | anyways 307 | anywhere 308 | apart 309 | appear 310 | appreciate 311 | appropriate 312 | are 313 | aren't 314 | around 315 | as 316 | aside 317 | ask 318 | asking 319 | associated 320 | at 321 | available 322 | away 323 | awfully 324 | be 325 | became 326 | because 327 | become 328 | becomes 329 | becoming 330 | been 331 | before 332 | beforehand 333 | behind 334 | being 335 | believe 336 | below 337 | beside 338 | besides 339 | best 340 | better 341 | between 342 | beyond 343 | both 344 | brief 345 | but 346 | by 347 | c 348 | c'mon 349 | c's 350 | came 351 | campaign 352 | can 353 | can't 354 | cannot 355 | cant 356 | cause 357 | causes 358 | certain 359 | certainly 360 | changes 361 | clearly 362 | co 363 | com 364 | come 365 | comes 366 | concerning 367 | consequently 368 | consider 369 | 
considering 370 | contain 371 | containing 372 | contains 373 | corresponding 374 | could 375 | couldn't 376 | course 377 | currently 378 | definitely 379 | described 380 | despite 381 | did 382 | didn't 383 | different 384 | do 385 | does 386 | doesn't 387 | doing 388 | don't 389 | done 390 | down 391 | downwards 392 | during 393 | each 394 | edu 395 | eight 396 | either 397 | else 398 | elsewhere 399 | enough 400 | endorsed 401 | entirely 402 | especially 403 | et 404 | etc 405 | even 406 | ever 407 | every 408 | everybody 409 | everyone 410 | everything 411 | everywhere 412 | ex 413 | exactly 414 | example 415 | except 416 | far 417 | few 418 | fifth 419 | first 420 | financial 421 | five 422 | followed 423 | following 424 | follows 425 | for 426 | former 427 | formerly 428 | forth 429 | four 430 | from 431 | further 432 | furthermore 433 | get 434 | gets 435 | getting 436 | given 437 | gives 438 | go 439 | goes 440 | going 441 | gone 442 | got 443 | gotten 444 | greetings 445 | had 446 | hadn't 447 | happens 448 | hardly 449 | has 450 | hasn't 451 | have 452 | haven't 453 | having 454 | he 455 | he's 456 | hello 457 | help 458 | hence 459 | her 460 | here 461 | here's 462 | hereafter 463 | hereby 464 | herein 465 | hereupon 466 | hers 467 | herself 468 | hi 469 | him 470 | himself 471 | his 472 | hither 473 | hopefully 474 | how 475 | howbeit 476 | however 477 | i'd 478 | i'll 479 | i'm 480 | i've 481 | if 482 | ignored 483 | immediate 484 | in 485 | inasmuch 486 | inc 487 | indeed 488 | indicate 489 | indicated 490 | indicates 491 | inner 492 | insofar 493 | instead 494 | into 495 | inward 496 | is 497 | isn't 498 | it 499 | it'd 500 | it'll 501 | it's 502 | its 503 | itself 504 | just 505 | keep 506 | keeps 507 | kept 508 | know 509 | knows 510 | known 511 | last 512 | lately 513 | later 514 | latter 515 | latterly 516 | least 517 | less 518 | lest 519 | let 520 | let's 521 | like 522 | liked 523 | likely 524 | little 525 | look 526 | looking 527 | looks 528 
| ltd 529 | mainly 530 | many 531 | may 532 | maybe 533 | me 534 | mean 535 | meanwhile 536 | merely 537 | might 538 | more 539 | moreover 540 | most 541 | mostly 542 | much 543 | must 544 | my 545 | myself 546 | name 547 | namely 548 | nd 549 | near 550 | nearly 551 | necessary 552 | need 553 | needs 554 | neither 555 | never 556 | nevertheless 557 | new 558 | next 559 | nine 560 | no 561 | nobody 562 | non 563 | none 564 | noone 565 | nor 566 | normally 567 | not 568 | nothing 569 | novel 570 | now 571 | nowhere 572 | obviously 573 | of 574 | off 575 | often 576 | oh 577 | ok 578 | okay 579 | old 580 | on 581 | once 582 | one 583 | ones 584 | only 585 | onto 586 | or 587 | other 588 | others 589 | otherwise 590 | ought 591 | our 592 | ours 593 | ourselves 594 | out 595 | outside 596 | over 597 | overall 598 | own 599 | particular 600 | particularly 601 | per 602 | perhaps 603 | placed 604 | please 605 | plus 606 | possible 607 | presumably 608 | probably 609 | provides 610 | quite 611 | quote 612 | quarterly 613 | rather 614 | really 615 | reasonably 616 | regarding 617 | regardless 618 | regards 619 | relatively 620 | respectively 621 | right 622 | said 623 | same 624 | saw 625 | say 626 | saying 627 | says 628 | second 629 | secondly 630 | see 631 | seeing 632 | seem 633 | seemed 634 | seeming 635 | seems 636 | seen 637 | self 638 | selves 639 | sensible 640 | sent 641 | serious 642 | seriously 643 | seven 644 | several 645 | shall 646 | she 647 | should 648 | shouldn't 649 | since 650 | six 651 | so 652 | some 653 | somebody 654 | somehow 655 | someone 656 | something 657 | sometime 658 | sometimes 659 | somewhat 660 | somewhere 661 | soon 662 | sorry 663 | specified 664 | specify 665 | specifying 666 | still 667 | sub 668 | such 669 | sup 670 | sure 671 | t's 672 | take 673 | taken 674 | tell 675 | tends 676 | than 677 | thank 678 | thanks 679 | thanx 680 | that 681 | that's 682 | thats 683 | the 684 | their 685 | theirs 686 | them 687 | themselves 688 | then 
689 | thence 690 | there 691 | there's 692 | thereafter 693 | thereby 694 | therefore 695 | therein 696 | theres 697 | thereupon 698 | these 699 | they 700 | they'd 701 | they'll 702 | they're 703 | they've 704 | think 705 | third 706 | this 707 | thorough 708 | thoroughly 709 | those 710 | though 711 | three 712 | through 713 | throughout 714 | thru 715 | thus 716 | to 717 | together 718 | too 719 | took 720 | toward 721 | towards 722 | tried 723 | tries 724 | truly 725 | try 726 | trying 727 | twice 728 | two 729 | under 730 | unfortunately 731 | unless 732 | unlikely 733 | until 734 | unto 735 | up 736 | upon 737 | us 738 | use 739 | used 740 | useful 741 | uses 742 | using 743 | usually 744 | uucp 745 | value 746 | various 747 | very 748 | via 749 | viz 750 | vs 751 | want 752 | wants 753 | was 754 | wasn't 755 | way 756 | we 757 | we'd 758 | we'll 759 | we're 760 | we've 761 | welcome 762 | well 763 | went 764 | were 765 | weren't 766 | what 767 | what's 768 | whatever 769 | when 770 | whence 771 | whenever 772 | where 773 | where's 774 | whereafter 775 | whereas 776 | whereby 777 | wherein 778 | whereupon 779 | wherever 780 | whether 781 | which 782 | while 783 | whither 784 | who 785 | who's 786 | whoever 787 | whole 788 | whom 789 | whose 790 | why 791 | will 792 | willing 793 | wish 794 | with 795 | within 796 | without 797 | won't 798 | wonder 799 | would 800 | would 801 | wouldn't 802 | yes 803 | yet 804 | you 805 | you'd 806 | you'll 807 | you're 808 | you've 809 | your 810 | yours 811 | yourself 812 | yourselves 813 | zero 814 | official 815 | sharply 816 | criticized 817 | `, 818 | "es": ` 819 | de 820 | la 821 | que 822 | el 823 | en 824 | y 825 | a 826 | los 827 | del 828 | se 829 | las 830 | por 831 | un 832 | para 833 | con 834 | no 835 | una 836 | su 837 | al 838 | lo 839 | como 840 | más 841 | pero 842 | sus 843 | le 844 | ya 845 | o 846 | este 847 | sí 848 | porque 849 | esta 850 | entre 851 | cuando 852 | muy 853 | sin 854 | sobre 855 | 
también 856 | me 857 | hasta 858 | hay 859 | donde 860 | quien 861 | desde 862 | todo 863 | nos 864 | durante 865 | todos 866 | uno 867 | les 868 | ni 869 | contra 870 | otros 871 | ese 872 | eso 873 | ante 874 | ellos 875 | e 876 | esto 877 | mí 878 | antes 879 | algunos 880 | qué 881 | unos 882 | yo 883 | otro 884 | otras 885 | otra 886 | él 887 | tanto 888 | esa 889 | estos 890 | mucho 891 | quienes 892 | nada 893 | muchos 894 | cual 895 | poco 896 | ella 897 | estar 898 | estas 899 | algunas 900 | algo 901 | nosotros 902 | mi 903 | mis 904 | tú 905 | te 906 | ti 907 | tu 908 | tus 909 | ellas 910 | nosotras 911 | vosotros 912 | vosotras 913 | os 914 | mío 915 | mía 916 | míos 917 | mías 918 | tuyo 919 | tuya 920 | tuyos 921 | tuyas 922 | suyo 923 | suya 924 | suyos 925 | suyas 926 | nuestro 927 | nuestra 928 | nuestros 929 | nuestras 930 | vuestro 931 | vuestra 932 | vuestros 933 | vuestras 934 | esos 935 | esas 936 | estoy 937 | estás 938 | está 939 | estamos 940 | estáis 941 | están 942 | esté 943 | estés 944 | estemos 945 | estéis 946 | estén 947 | estaré 948 | estarás 949 | estará 950 | estaremos 951 | estaréis 952 | estarán 953 | estaría 954 | estarías 955 | estaríamos 956 | estaríais 957 | estarían 958 | estaba 959 | estabas 960 | estábamos 961 | estabais 962 | estaban 963 | estuve 964 | estuviste 965 | estuvo 966 | estuvimos 967 | estuvisteis 968 | estuvieron 969 | estuviera 970 | estuvieras 971 | estuviéramos 972 | estuvierais 973 | estuvieran 974 | estuviese 975 | estuvieses 976 | estuviésemos 977 | estuvieseis 978 | estuviesen 979 | estando 980 | estado 981 | estada 982 | estados 983 | estadas 984 | estad 985 | he 986 | has 987 | ha 988 | hemos 989 | habéis 990 | han 991 | haya 992 | hayas 993 | hayamos 994 | hayáis 995 | hayan 996 | habré 997 | habrás 998 | habrá 999 | habremos 1000 | habréis 1001 | habrán 1002 | habría 1003 | habrías 1004 | habríamos 1005 | habríais 1006 | habrían 1007 | había 1008 | habías 1009 | habíamos 1010 | habíais 1011 | 
habían 1012 | hube 1013 | hubiste 1014 | hubo 1015 | hubimos 1016 | hubisteis 1017 | hubieron 1018 | hubiera 1019 | hubieras 1020 | hubiéramos 1021 | hubierais 1022 | hubieran 1023 | hubiese 1024 | hubieses 1025 | hubiésemos 1026 | hubieseis 1027 | hubiesen 1028 | habiendo 1029 | habido 1030 | habida 1031 | habidos 1032 | habidas 1033 | 1034 | # forms of ser, to be (not including the infinitive): 1035 | soy 1036 | eres 1037 | es 1038 | somos 1039 | sois 1040 | son 1041 | sea 1042 | seas 1043 | seamos 1044 | seáis 1045 | sean 1046 | seré 1047 | serás 1048 | será 1049 | seremos 1050 | seréis 1051 | serán 1052 | sería 1053 | serías 1054 | seríamos 1055 | seríais 1056 | serían 1057 | era 1058 | eras 1059 | éramos 1060 | erais 1061 | eran 1062 | fui 1063 | fuiste 1064 | fue 1065 | fuimos 1066 | fuisteis 1067 | fueron 1068 | fuera 1069 | fueras 1070 | fuéramos 1071 | fuerais 1072 | fueran 1073 | fuese 1074 | fueses 1075 | fuésemos 1076 | fueseis 1077 | fuesen 1078 | siendo 1079 | sido 1080 | tengo 1081 | tienes 1082 | tiene 1083 | tenemos 1084 | tenéis 1085 | tienen 1086 | tenga 1087 | tengas 1088 | tengamos 1089 | tengáis 1090 | tengan 1091 | tendré 1092 | tendrás 1093 | tendrá 1094 | tendremos 1095 | tendréis 1096 | tendrán 1097 | tendría 1098 | tendrías 1099 | tendríamos 1100 | tendríais 1101 | tendrían 1102 | tenía 1103 | tenías 1104 | teníamos 1105 | teníais 1106 | tenían 1107 | tuve 1108 | tuviste 1109 | tuvo 1110 | tuvimos 1111 | tuvisteis 1112 | tuvieron 1113 | tuviera 1114 | tuvieras 1115 | tuviéramos 1116 | tuvierais 1117 | tuvieran 1118 | tuviese 1119 | tuvieses 1120 | tuviésemos 1121 | tuvieseis 1122 | tuviesen 1123 | teniendo 1124 | tenido 1125 | tenida 1126 | tenidos 1127 | tenidas 1128 | tened 1129 | `, 1130 | "fr": ` 1131 | # Licensed to the Apache Software Foundation (ASF) under one or more 1132 | # contributor license agreements. See the NOTICE file distributed with 1133 | # this work for additional information regarding copyright ownership. 
1134 | # The ASF licenses this file to You under the Apache License, Version 2.0 1135 | # (the "License"); you may not use this file except in compliance with 1136 | # the License. You may obtain a copy of the License at 1137 | # 1138 | # http://www.apache.org/licenses/LICENSE-2.0 1139 | # 1140 | # Unless required by applicable law or agreed to in writing, software 1141 | # distributed under the License is distributed on an "AS IS" BASIS, 1142 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1143 | # See the License for the specific language governing permissions and 1144 | # limitations under the License. 1145 | 1146 | #----------------------------------------------------------------------- 1147 | # a couple of test stopwords to test that the words are really being 1148 | # configured from this file: 1149 | stopworda 1150 | stopwordb 1151 | 1152 | #Standard english stop words taken from Lucene's StopAnalyzer 1153 | a 1154 | an 1155 | and 1156 | are 1157 | as 1158 | at 1159 | be 1160 | but 1161 | by 1162 | for 1163 | if 1164 | in 1165 | into 1166 | is 1167 | it 1168 | no 1169 | not 1170 | of 1171 | on 1172 | or 1173 | s 1174 | such 1175 | t 1176 | that 1177 | the 1178 | their 1179 | then 1180 | there 1181 | these 1182 | they 1183 | this 1184 | to 1185 | was 1186 | will 1187 | with 1188 | au 1189 | aux 1190 | avec 1191 | ce 1192 | ces 1193 | dans 1194 | de 1195 | des 1196 | du 1197 | elle 1198 | en 1199 | et 1200 | eux 1201 | il 1202 | je 1203 | la 1204 | le 1205 | leur 1206 | lui 1207 | ma 1208 | mais 1209 | me 1210 | même 1211 | mes 1212 | moi 1213 | mon 1214 | ne 1215 | nos 1216 | notre 1217 | nous 1218 | on 1219 | ou 1220 | par 1221 | pas 1222 | pour 1223 | qu 1224 | que 1225 | qui 1226 | sa 1227 | se 1228 | ses 1229 | son 1230 | sur 1231 | ta 1232 | te 1233 | tes 1234 | toi 1235 | ton 1236 | tu 1237 | un 1238 | une 1239 | vos 1240 | votre 1241 | vous 1242 | c 1243 | d 1244 | j 1245 | l 1246 | à 1247 | m 1248 | n 1249 | s 1250 | t 
1251 | y 1252 | été 1253 | étée 1254 | étées 1255 | étés 1256 | étant 1257 | suis 1258 | es 1259 | est 1260 | sommes 1261 | êtes 1262 | sont 1263 | serai 1264 | seras 1265 | sera 1266 | serons 1267 | serez 1268 | seront 1269 | serais 1270 | serait 1271 | serions 1272 | seriez 1273 | seraient 1274 | étais 1275 | était 1276 | étions 1277 | étiez 1278 | étaient 1279 | fus 1280 | fut 1281 | fûmes 1282 | fûtes 1283 | furent 1284 | sois 1285 | soit 1286 | soyons 1287 | soyez 1288 | soient 1289 | fusse 1290 | fusses 1291 | fût 1292 | fussions 1293 | fussiez 1294 | fussent 1295 | ayant 1296 | eu 1297 | eue 1298 | eues 1299 | eus 1300 | ai 1301 | as 1302 | avons 1303 | avez 1304 | ont 1305 | aurai 1306 | auras 1307 | aura 1308 | aurons 1309 | aurez 1310 | auront 1311 | aurais 1312 | aurait 1313 | aurions 1314 | auriez 1315 | auraient 1316 | avais 1317 | avait 1318 | avions 1319 | aviez 1320 | avaient 1321 | eut 1322 | eûmes 1323 | eûtes 1324 | eurent 1325 | aie 1326 | aies 1327 | ait 1328 | ayons 1329 | ayez 1330 | aient 1331 | eusse 1332 | eusses 1333 | eût 1334 | eussions 1335 | eussiez 1336 | eussent 1337 | ceci 1338 | celà 1339 | cet 1340 | cette 1341 | ici 1342 | ils 1343 | les 1344 | leurs 1345 | quel 1346 | quels 1347 | quelle 1348 | quelles 1349 | sans 1350 | soi 1351 | 1352 | `, 1353 | "nl": ` 1354 | aan 1355 | af 1356 | al 1357 | alleen 1358 | als 1359 | altijd 1360 | ben 1361 | bent 1362 | bij 1363 | daar 1364 | dag 1365 | dan 1366 | dat 1367 | de 1368 | der 1369 | deze 1370 | die 1371 | direct 1372 | dit 1373 | doch 1374 | doen 1375 | dus 1376 | een 1377 | eens 1378 | en 1379 | er 1380 | gaan 1381 | gaat 1382 | ge 1383 | geen 1384 | geweest 1385 | graag 1386 | haar 1387 | had 1388 | heb 1389 | hebben 1390 | heeft 1391 | hem 1392 | het 1393 | hij 1394 | hoe 1395 | hun 1396 | ik 1397 | in 1398 | is 1399 | je 1400 | kan 1401 | komt 1402 | kon 1403 | kunnen 1404 | kunt 1405 | laatste 1406 | maar 1407 | maken 1408 | me 1409 | mee 1410 | meer 1411 | men 1412 | met 
1413 | mij 1414 | mijn 1415 | na 1416 | naar 1417 | niet 1418 | nog 1419 | nu 1420 | of 1421 | om 1422 | omdat 1423 | onder 1424 | ons 1425 | onze 1426 | ook 1427 | op 1428 | reeds 1429 | te 1430 | toch 1431 | toen 1432 | tot 1433 | uit 1434 | uw 1435 | van 1436 | vanaf 1437 | veel 1438 | via 1439 | voor 1440 | waar 1441 | was 1442 | wat 1443 | we 1444 | weer 1445 | wel 1446 | werd 1447 | wie 1448 | wij 1449 | wilt 1450 | worden 1451 | wordt 1452 | zal 1453 | ze 1454 | zei 1455 | zelf 1456 | zich 1457 | zij 1458 | zijn 1459 | zo 1460 | zoals 1461 | zou 1462 | `, 1463 | "sv": ` 1464 | #----------------------------------------------------------------------- 1465 | # translated 1466 | #----------------------------------------------------------------------- 1467 | 1468 | kunna 1469 | om 1470 | ovan 1471 | enligt 1472 | i enlighet med detta 1473 | över 1474 | faktiskt 1475 | efter 1476 | efteråt 1477 | igen 1478 | mot 1479 | är inte 1480 | alla 1481 | tillåta 1482 | tillåter 1483 | nästan 1484 | ensam 1485 | längs 1486 | redan 1487 | också 1488 | även om 1489 | alltid 1490 | am 1491 | bland 1492 | bland 1493 | en 1494 | och 1495 | en annan 1496 | någon 1497 | någon 1498 | hur som helst 1499 | någon 1500 | något 1501 | ändå 1502 | ändå 1503 | var som helst 1504 | isär 1505 | visas 1506 | uppskatta 1507 | lämpligt 1508 | är 1509 | inte 1510 | runt 1511 | som 1512 | åt sidan 1513 | be 1514 | frågar 1515 | associerad 1516 | vid 1517 | tillgängliga 1518 | bort 1519 | väldigt 1520 | vara 1521 | blev 1522 | eftersom 1523 | bli 1524 | blir 1525 | blir 1526 | varit 1527 | innan 1528 | förhand 1529 | bakom 1530 | vara 1531 | tro 1532 | nedan 1533 | bredvid 1534 | förutom 1535 | bäst 1536 | bättre 1537 | mellan 1538 | bortom 1539 | både 1540 | kort 1541 | men 1542 | genom 1543 | c 1544 | c'mon 1545 | c: s 1546 | kom 1547 | kampanj 1548 | kan 1549 | kan inte 1550 | kan inte 1551 | cant 1552 | orsaka 1553 | orsaker 1554 | viss 1555 | säkerligen 1556 | förändringar 1557 | klart 1558 
| co 1559 | com 1560 | komma 1561 | kommer 1562 | om 1563 | följaktligen 1564 | överväga 1565 | överväger 1566 | innehålla 1567 | innehållande 1568 | innehåller 1569 | motsvarande 1570 | kunde 1571 | kunde inte 1572 | kurs 1573 | närvarande 1574 | definitivt 1575 | beskrivits 1576 | trots 1577 | gjorde 1578 | inte 1579 | olika 1580 | göra 1581 | gör 1582 | inte 1583 | gör 1584 | inte 1585 | gjort 1586 | ned 1587 | nedåt 1588 | under 1589 | varje 1590 | edu 1591 | åtta 1592 | antingen 1593 | annars 1594 | någon annanstans 1595 | tillräckligt 1596 | godkändes 1597 | helt 1598 | speciellt 1599 | et 1600 | etc 1601 | även 1602 | någonsin 1603 | varje 1604 | alla 1605 | alla 1606 | allt 1607 | överallt 1608 | ex 1609 | exakt 1610 | exempel 1611 | utom 1612 | långt 1613 | få 1614 | femte 1615 | först 1616 | finansiella 1617 | fem 1618 | följt 1619 | efter 1620 | följer 1621 | för 1622 | fd 1623 | tidigare 1624 | framåt 1625 | fyra 1626 | från 1627 | ytterligare 1628 | dessutom 1629 | få 1630 | blir 1631 | få 1632 | given 1633 | ger 1634 | gå 1635 | går 1636 | gå 1637 | borta 1638 | fick 1639 | fått 1640 | hälsningar 1641 | hade 1642 | hade inte 1643 | händer 1644 | knappast 1645 | har 1646 | har inte 1647 | ha 1648 | har inte 1649 | med 1650 | han 1651 | han är 1652 | hallå 1653 | hjälpa 1654 | hence 1655 | henne 1656 | här 1657 | här finns 1658 | härefter 1659 | härmed 1660 | häri 1661 | härpå 1662 | hennes 1663 | själv 1664 | hej 1665 | honom 1666 | själv 1667 | hans 1668 | hit 1669 | förhoppningsvis 1670 | hur 1671 | howbeit 1672 | dock 1673 | jag skulle 1674 | jag ska 1675 | jag är 1676 | jag har 1677 | om 1678 | ignoreras 1679 | omedelbar 1680 | i 1681 | eftersom 1682 | inc 1683 | indeed 1684 | indikera 1685 | indikerade 1686 | indikerar 1687 | inre 1688 | mån 1689 | istället 1690 | in 1691 | inåt 1692 | är 1693 | är inte 1694 | den 1695 | det skulle 1696 | det ska 1697 | det är 1698 | dess 1699 | själv 1700 | bara 1701 | hålla 1702 | håller 1703 | hålls 1704 | vet 
1705 | vet 1706 | känd 1707 | sista 1708 | nyligen 1709 | senare 1710 | senare 1711 | latterly 1712 | minst 1713 | mindre 1714 | lest 1715 | låt 1716 | låt oss 1717 | liknande 1718 | gillade 1719 | sannolikt 1720 | lite 1721 | ser 1722 | ser 1723 | ser 1724 | ltd 1725 | huvudsakligen 1726 | många 1727 | kan 1728 | kanske 1729 | mig 1730 | betyda 1731 | under tiden 1732 | endast 1733 | kanske 1734 | mer 1735 | dessutom 1736 | mest 1737 | mestadels 1738 | mycket 1739 | måste 1740 | min 1741 | själv 1742 | namn 1743 | nämligen 1744 | nd 1745 | nära 1746 | nästan 1747 | nödvändigt 1748 | behöver 1749 | behov 1750 | varken 1751 | aldrig 1752 | ändå 1753 | ny 1754 | nästa 1755 | nio 1756 | ingen 1757 | ingen 1758 | icke 1759 | ingen 1760 | ingen 1761 | eller 1762 | normalt 1763 | inte 1764 | ingenting 1765 | roman 1766 | nu 1767 | ingenstans 1768 | uppenbarligen 1769 | av 1770 | off 1771 | ofta 1772 | oh 1773 | ok 1774 | okay 1775 | gammal 1776 | på 1777 | en gång 1778 | ett 1779 | ettor 1780 | endast 1781 | på 1782 | eller 1783 | andra 1784 | andra 1785 | annars 1786 | borde 1787 | vår 1788 | vårt 1789 | oss 1790 | ut 1791 | utanför 1792 | över 1793 | övergripande 1794 | egen 1795 | särskilt 1796 | särskilt 1797 | per 1798 | kanske 1799 | placeras 1800 | vänligen 1801 | plus 1802 | möjligt 1803 | förmodligen 1804 | förmodligen 1805 | ger 1806 | ganska 1807 | citera 1808 | kvartalsvis 1809 | snarare 1810 | verkligen 1811 | rimligen 1812 | om 1813 | oavsett 1814 | gäller 1815 | relativt 1816 | respektive 1817 | höger 1818 | sa 1819 | samma 1820 | såg 1821 | säga 1822 | säger 1823 | säger 1824 | andra 1825 | det andra 1826 | se 1827 | ser 1828 | verkar 1829 | verkade 1830 | informationsproblem 1831 | verkar 1832 | sett 1833 | själv 1834 | själva 1835 | förnuftig 1836 | skickas 1837 | allvarlig 1838 | allvarligt 1839 | sju 1840 | flera 1841 | skall 1842 | hon 1843 | bör 1844 | bör inte 1845 | eftersom 1846 | sex 1847 | så 1848 | några 1849 | någon 1850 | på något sätt 1851 
| någon 1852 | något 1853 | sometime 1854 | ibland 1855 | något 1856 | någonstans 1857 | snart 1858 | sorry 1859 | specificerade 1860 | ange 1861 | ange 1862 | fortfarande 1863 | sub 1864 | sådan 1865 | sup 1866 | säker 1867 | t s 1868 | ta 1869 | tas 1870 | berätta 1871 | tenderar 1872 | än 1873 | tacka 1874 | tack 1875 | thanx 1876 | att 1877 | det är 1878 | brinner 1879 | den 1880 | deras 1881 | deras 1882 | dem 1883 | själva 1884 | sedan 1885 | därifrån 1886 | där 1887 | det finns 1888 | därefter 1889 | därigenom 1890 | därför 1891 | däri 1892 | theres 1893 | därpå 1894 | dessa 1895 | de 1896 | de hade 1897 | de kommer 1898 | de är 1899 | de har 1900 | tror 1901 | tredje 1902 | detta 1903 | grundlig 1904 | grundligt 1905 | de 1906 | though 1907 | tre 1908 | genom 1909 | hela 1910 | thru 1911 | sålunda 1912 | till 1913 | tillsammans 1914 | alltför 1915 | tog 1916 | mot 1917 | mot 1918 | försökte 1919 | försöker 1920 | verkligt 1921 | försök 1922 | försöker 1923 | två gånger 1924 | två 1925 | enligt 1926 | tyvärr 1927 | såvida inte 1928 | osannolikt 1929 | tills 1930 | åt 1931 | upp 1932 | på 1933 | oss 1934 | använda 1935 | används 1936 | användbar 1937 | använder 1938 | användning 1939 | vanligtvis 1940 | uucp 1941 | värde 1942 | olika 1943 | mycket 1944 | via 1945 | viz 1946 | vs 1947 | vill 1948 | vill 1949 | var 1950 | var inte 1951 | sätt 1952 | vi 1953 | vi skulle 1954 | vi kommer 1955 | vi är 1956 | vi har 1957 | välkommen 1958 | väl 1959 | gick 1960 | var 1961 | var inte 1962 | vad 1963 | vad är 1964 | oavsett 1965 | när 1966 | varifrån 1967 | närhelst 1968 | där 1969 | var är 1970 | varefter 1971 | medan 1972 | varigenom 1973 | vari 1974 | varpå 1975 | varhelst 1976 | huruvida 1977 | som 1978 | medan 1979 | dit 1980 | som 1981 | vem är 1982 | vem 1983 | hela 1984 | vem 1985 | vars 1986 | varför 1987 | kommer 1988 | villig 1989 | önskar 1990 | med 1991 | inom 1992 | utan 1993 | kommer inte 1994 | undrar 1995 | skulle 1996 | skulle inte 1997 | ja 1998 | 
ännu 1999 | ni 2000 | du skulle 2001 | kommer du 2002 | du är 2003 | du har 2004 | din 2005 | själv 2006 | er 2007 | noll 2008 | tjänsteman 2009 | skarpt 2010 | kritiserade 2011 | `, 2012 | "zh": ` 2013 | 的 2014 | 一 2015 | 不 2016 | 在 2017 | 人 2018 | 有 2019 | 是 2020 | 为 2021 | 以 2022 | 于 2023 | 上 2024 | 他 2025 | 而 2026 | 后 2027 | 之 2028 | 来 2029 | 及 2030 | 了 2031 | 因 2032 | 下 2033 | 可 2034 | 到 2035 | 由 2036 | 这 2037 | 与 2038 | 也 2039 | 此 2040 | 但 2041 | 并 2042 | 个 2043 | 其 2044 | 已 2045 | 无 2046 | 小 2047 | 我 2048 | 们 2049 | 起 2050 | 最 2051 | 再 2052 | 今 2053 | 去 2054 | 好 2055 | 只 2056 | 又 2057 | 或 2058 | 很 2059 | 亦 2060 | 某 2061 | 把 2062 | 那 2063 | 你 2064 | 乃 2065 | 它 2066 | 吧 2067 | 被 2068 | 比 2069 | 别 2070 | 趁 2071 | 当 2072 | 从 2073 | 到 2074 | 得 2075 | 打 2076 | 凡 2077 | 儿 2078 | 尔 2079 | 该 2080 | 各 2081 | 给 2082 | 跟 2083 | 和 2084 | 何 2085 | 还 2086 | 即 2087 | 几 2088 | 既 2089 | 看 2090 | 据 2091 | 距 2092 | 靠 2093 | 啦 2094 | 了 2095 | 另 2096 | 么 2097 | 每 2098 | 们 2099 | 嘛 2100 | 拿 2101 | 哪 2102 | 那 2103 | 您 2104 | 凭 2105 | 且 2106 | 却 2107 | 让 2108 | 仍 2109 | 啥 2110 | 如 2111 | 若 2112 | 使 2113 | 谁 2114 | 虽 2115 | 随 2116 | 同 2117 | 所 2118 | 她 2119 | 哇 2120 | 嗡 2121 | 往 2122 | 哪 2123 | 些 2124 | 向 2125 | 沿 2126 | 哟 2127 | 用 2128 | 于 2129 | 咱 2130 | 则 2131 | 怎 2132 | 曾 2133 | 至 2134 | 致 2135 | 着 2136 | 诸 2137 | 自 2138 | `, 2139 | "ru": ` 2140 | а 2141 | е 2142 | и 2143 | ж 2144 | м 2145 | о 2146 | на 2147 | не 2148 | ни 2149 | об 2150 | но 2151 | он 2152 | мне 2153 | мои 2154 | мож 2155 | она 2156 | они 2157 | оно 2158 | мной 2159 | много 2160 | многочисленное 2161 | многочисленная 2162 | многочисленные 2163 | многочисленный 2164 | мною 2165 | мой 2166 | мог 2167 | могут 2168 | можно 2169 | может 2170 | можхо 2171 | мор 2172 | моя 2173 | моё 2174 | мочь 2175 | над 2176 | нее 2177 | оба 2178 | нам 2179 | нем 2180 | нами 2181 | ними 2182 | мимо 2183 | немного 2184 | одной 2185 | одного 2186 | менее 2187 | однажды 2188 | однако 2189 | меня 2190 | нему 2191 | меньше 2192 | ней 
2193 | наверху 2194 | него 2195 | ниже 2196 | мало 2197 | надо 2198 | один 2199 | одиннадцать 2200 | одиннадцатый 2201 | назад 2202 | наиболее 2203 | недавно 2204 | миллионов 2205 | недалеко 2206 | между 2207 | низко 2208 | меля 2209 | нельзя 2210 | нибудь 2211 | непрерывно 2212 | наконец 2213 | никогда 2214 | никуда 2215 | нас 2216 | наш 2217 | нет 2218 | нею 2219 | неё 2220 | них 2221 | мира 2222 | наша 2223 | наше 2224 | наши 2225 | ничего 2226 | начала 2227 | нередко 2228 | несколько 2229 | обычно 2230 | опять 2231 | около 2232 | мы 2233 | ну 2234 | нх 2235 | от 2236 | отовсюду 2237 | особенно 2238 | нужно 2239 | очень 2240 | отсюда 2241 | в 2242 | во 2243 | вон 2244 | вниз 2245 | внизу 2246 | вокруг 2247 | вот 2248 | восемнадцать 2249 | восемнадцатый 2250 | восемь 2251 | восьмой 2252 | вверх 2253 | вам 2254 | вами 2255 | важное 2256 | важная 2257 | важные 2258 | важный 2259 | вдали 2260 | везде 2261 | ведь 2262 | вас 2263 | ваш 2264 | ваша 2265 | ваше 2266 | ваши 2267 | впрочем 2268 | весь 2269 | вдруг 2270 | вы 2271 | все 2272 | второй 2273 | всем 2274 | всеми 2275 | времени 2276 | время 2277 | всему 2278 | всего 2279 | всегда 2280 | всех 2281 | всею 2282 | всю 2283 | вся 2284 | всё 2285 | всюду 2286 | г 2287 | год 2288 | говорил 2289 | говорит 2290 | года 2291 | году 2292 | где 2293 | да 2294 | ее 2295 | за 2296 | из 2297 | ли 2298 | же 2299 | им 2300 | до 2301 | по 2302 | ими 2303 | под 2304 | иногда 2305 | довольно 2306 | именно 2307 | долго 2308 | позже 2309 | более 2310 | должно 2311 | пожалуйста 2312 | значит 2313 | иметь 2314 | больше 2315 | пока 2316 | ему 2317 | имя 2318 | пор 2319 | пора 2320 | потом 2321 | потому 2322 | после 2323 | почему 2324 | почти 2325 | посреди 2326 | ей 2327 | два 2328 | две 2329 | двенадцать 2330 | двенадцатый 2331 | двадцать 2332 | двадцатый 2333 | двух 2334 | его 2335 | дел 2336 | или 2337 | без 2338 | день 2339 | занят 2340 | занята 2341 | занято 2342 | заняты 2343 | действительно 2344 | давно 2345 | девятнадцать 2346 | 
девятнадцатый 2347 | девять 2348 | девятый 2349 | даже 2350 | алло 2351 | жизнь 2352 | далеко 2353 | близко 2354 | здесь 2355 | дальше 2356 | для 2357 | лет 2358 | зато 2359 | даром 2360 | первый 2361 | перед 2362 | затем 2363 | зачем 2364 | лишь 2365 | десять 2366 | десятый 2367 | ею 2368 | её 2369 | их 2370 | бы 2371 | еще 2372 | при 2373 | был 2374 | про 2375 | процентов 2376 | против 2377 | просто 2378 | бывает 2379 | бывь 2380 | если 2381 | люди 2382 | была 2383 | были 2384 | было 2385 | будем 2386 | будет 2387 | будете 2388 | будешь 2389 | прекрасно 2390 | буду 2391 | будь 2392 | будто 2393 | будут 2394 | ещё 2395 | пятнадцать 2396 | пятнадцатый 2397 | друго 2398 | другое 2399 | другой 2400 | другие 2401 | другая 2402 | других 2403 | есть 2404 | пять 2405 | быть 2406 | лучше 2407 | пятый 2408 | к 2409 | ком 2410 | конечно 2411 | кому 2412 | кого 2413 | когда 2414 | которой 2415 | которого 2416 | которая 2417 | которые 2418 | который 2419 | которых 2420 | кем 2421 | каждое 2422 | каждая 2423 | каждые 2424 | каждый 2425 | кажется 2426 | как 2427 | какой 2428 | какая 2429 | кто 2430 | кроме 2431 | куда 2432 | кругом 2433 | с 2434 | т 2435 | у 2436 | я 2437 | та 2438 | те 2439 | уж 2440 | со 2441 | то 2442 | том 2443 | снова 2444 | тому 2445 | совсем 2446 | того 2447 | тогда 2448 | тоже 2449 | собой 2450 | тобой 2451 | собою 2452 | тобою 2453 | сначала 2454 | только 2455 | уметь 2456 | тот 2457 | тою 2458 | хорошо 2459 | хотеть 2460 | хочешь 2461 | хоть 2462 | хотя 2463 | свое 2464 | свои 2465 | твой 2466 | своей 2467 | своего 2468 | своих 2469 | свою 2470 | твоя 2471 | твоё 2472 | раз 2473 | уже 2474 | сам 2475 | там 2476 | тем 2477 | чем 2478 | сама 2479 | сами 2480 | теми 2481 | само 2482 | рано 2483 | самом 2484 | самому 2485 | самой 2486 | самого 2487 | семнадцать 2488 | семнадцатый 2489 | самим 2490 | самими 2491 | самих 2492 | саму 2493 | семь 2494 | чему 2495 | раньше 2496 | сейчас 2497 | чего 2498 | сегодня 2499 | себе 2500 | тебе 2501 | сеаой 2502 | 
// ReadLinesOfFile reads the whole file named by filename and returns its
// content split on "\n".
//
// A read failure is reported by printing the error to standard output; the
// function then carries on with empty content, so the caller receives a
// slice holding a single empty string rather than nil. Lines are split on
// "\n" only, so a trailing newline yields a final empty element and any
// "\r" characters are left in place.
func ReadLinesOfFile(filename string) []string {
	content, err := ioutil.ReadFile(filename)
	if err != nil {
		// Best effort: report the problem and fall through with empty content.
		fmt.Println(err.Error())
	}
	return strings.Split(string(content), "\n")
}

// TimeInMilliseconds returns the current Unix time in milliseconds.
//
// The value is derived from UnixNano rather than Unix, because Unix reports
// whole seconds and would make the result a thousand times too small.
func TimeInMilliseconds() int64 {
	return time.Now().UnixNano() / int64(time.Millisecond)
}

// TimeInNanoseconds returns the current Unix time in nanoseconds.
func TimeInNanoseconds() int64 {
	return time.Now().UnixNano()
}

// RegSplit splits text around every match of reg and returns the pieces
// found between the matches.
//
// The result always has one more element than there are matches: text
// without any match comes back as a single-element slice, and a match at the
// very start or end of text produces an empty string at that position.
func RegSplit(text string, reg *regexp.Regexp) []string {
	indexes := reg.FindAllStringIndex(text, -1)
	result := make([]string, len(indexes)+1)
	laststart := 0
	for i, element := range indexes {
		// element is the [start, end) byte range of the i-th match.
		result[i] = text[laststart:element[0]]
		laststart = element[1]
	}
	// Whatever follows the last match (possibly nothing) is the final piece.
	result[len(indexes)] = text[laststart:]
	return result
}
-------------------------------------------------------------------------------- 1 | package goose 2 | 3 | import ( 4 | "github.com/PuerkitoBio/goquery" 5 | "gopkg.in/fatih/set.v0" 6 | "strconv" 7 | "strings" 8 | ) 9 | 10 | type VideoExtractor struct { 11 | article *Article 12 | config configuration 13 | candidates *set.Set 14 | movies *set.Set 15 | } 16 | 17 | type video struct { 18 | embedType string 19 | provider string 20 | width int 21 | height int 22 | embedCode string 23 | src string 24 | } 25 | 26 | func NewVideoExtractor() VideoExtractor { 27 | return VideoExtractor{ 28 | candidates: set.New(), 29 | movies: set.New(), 30 | } 31 | } 32 | 33 | var videoTags = [4]string{"iframe", "embed", "object", "video"} 34 | var videoProviders = [4]string{"youtube", "vimeo", "dailymotion", "kewego"} 35 | 36 | func (ve *VideoExtractor) getEmbedCode(node *goquery.Selection) string { 37 | return node.Text() 38 | } 39 | 40 | func (ve *VideoExtractor) getWidth(node *goquery.Selection) int { 41 | value, exists := node.Attr("width") 42 | if exists { 43 | nvalue, _ := strconv.Atoi(value) 44 | return nvalue 45 | } 46 | return 0 47 | } 48 | 49 | func (ve *VideoExtractor) getHeight(node *goquery.Selection) int { 50 | value, exists := node.Attr("height") 51 | if exists { 52 | nvalue, _ := strconv.Atoi(value) 53 | return nvalue 54 | } 55 | return 0 56 | } 57 | 58 | func (ve *VideoExtractor) getSrc(node *goquery.Selection) string { 59 | value, exists := node.Attr("src") 60 | if exists { 61 | return value 62 | } 63 | return "" 64 | } 65 | 66 | func (ve *VideoExtractor) getProvider(src string) string { 67 | if src != "" { 68 | for _, provider := range videoProviders { 69 | if strings.Contains(src, provider) { 70 | return provider 71 | } 72 | } 73 | } 74 | return "" 75 | } 76 | 77 | func (ve *VideoExtractor) getVideo(node *goquery.Selection) video { 78 | src := ve.getSrc(node) 79 | video := video{ 80 | embedCode: ve.getEmbedCode(node), 81 | embedType: node.Get(0).DataAtom.String(), 82 | 
width: ve.getWidth(node), 83 | height: ve.getHeight(node), 84 | src: src, 85 | provider: ve.getProvider(src), 86 | } 87 | return video 88 | } 89 | 90 | func (ve *VideoExtractor) getIFrame(node *goquery.Selection) video { 91 | return ve.getVideo(node) 92 | } 93 | 94 | func (ve *VideoExtractor) getVideoTag(node *goquery.Selection) video { 95 | return video{} 96 | } 97 | 98 | func (ve *VideoExtractor) getEmbedTag(node *goquery.Selection) video { 99 | parent := node.Parent() 100 | if parent != nil { 101 | parentTag := parent.Get(0).DataAtom.String() 102 | if parentTag == "object" { 103 | return ve.getObjectTag(node) 104 | } 105 | } 106 | return ve.getVideo(node) 107 | } 108 | 109 | func (ve *VideoExtractor) getObjectTag(node *goquery.Selection) video { 110 | childEmbedTag := node.Find("embed") 111 | if ve.candidates.Has(childEmbedTag) { 112 | ve.candidates.Remove(childEmbedTag) 113 | } 114 | srcNode := node.Find(`param[name="movie"]`) 115 | if srcNode == nil || srcNode.Length() == 0 { 116 | return video{} 117 | } 118 | 119 | src, _ := srcNode.Attr("value") 120 | provider := ve.getProvider(src) 121 | if provider == "" { 122 | return video{} 123 | } 124 | video := ve.getVideo(node) 125 | video.provider = provider 126 | video.src = src 127 | return video 128 | } 129 | 130 | func (ve *VideoExtractor) GetVideos(article *Article) *set.Set { 131 | doc := article.Doc 132 | var nodes *goquery.Selection 133 | for _, videoTag := range videoTags { 134 | tmpNodes := doc.Find(videoTag) 135 | if nodes == nil { 136 | nodes = tmpNodes 137 | } else { 138 | nodes.Union(tmpNodes) 139 | } 140 | } 141 | 142 | nodes.Each(func(i int, node *goquery.Selection) { 143 | tag := node.Get(0).DataAtom.String() 144 | var movie video 145 | switch tag { 146 | case "video": 147 | movie = ve.getVideoTag(node) 148 | break 149 | case "embed": 150 | movie = ve.getEmbedTag(node) 151 | break 152 | case "object": 153 | movie = ve.getObjectTag(node) 154 | break 155 | case "iframe": 156 | movie = 
ve.getIFrame(node) 157 | break 158 | default: 159 | { 160 | } 161 | } 162 | 163 | if movie.src != "" { 164 | ve.movies.Add(movie) 165 | } 166 | }) 167 | 168 | return ve.movies 169 | } 170 | -------------------------------------------------------------------------------- /wordstats.go: -------------------------------------------------------------------------------- 1 | package goose 2 | 3 | import ( 4 | "gopkg.in/fatih/set.v0" 5 | ) 6 | 7 | //some word statistics 8 | type wordStats struct { 9 | //total number of stopwords or good words that we can calculate 10 | stopWordCount int 11 | //total number of words on a node 12 | wordCount int 13 | //holds an actual list of the stop words we found 14 | stopWords *set.Set 15 | } 16 | 17 | func (this *wordStats) getStopWords() *set.Set { 18 | return this.stopWords 19 | } 20 | 21 | func (this *wordStats) setStopWords(stopWords *set.Set) { 22 | this.stopWords = stopWords 23 | } 24 | 25 | func (this *wordStats) getStopWordCount() int { 26 | return this.stopWordCount 27 | } 28 | 29 | func (this *wordStats) setStopWordCount(stopWordCount int) { 30 | this.stopWordCount = stopWordCount 31 | } 32 | 33 | func (this *wordStats) getWordCount() int { 34 | return this.wordCount 35 | } 36 | 37 | func (this *wordStats) setWordCount(wordCount int) { 38 | this.wordCount = wordCount 39 | } 40 | --------------------------------------------------------------------------------