├── .gitignore ├── .travis.yml ├── LICENSE ├── NOTICE ├── README.md ├── build.sbt ├── project ├── build.properties └── plugins.sbt ├── src ├── main │ ├── resources │ │ └── com │ │ │ └── intenthq │ │ │ └── gander │ │ │ └── text │ │ │ ├── stopwords-all.txt │ │ │ ├── stopwords-ar.txt │ │ │ ├── stopwords-ca.txt │ │ │ ├── stopwords-da.txt │ │ │ ├── stopwords-de.txt │ │ │ ├── stopwords-en.txt │ │ │ ├── stopwords-es.txt │ │ │ ├── stopwords-fi.txt │ │ │ ├── stopwords-fr.txt │ │ │ ├── stopwords-hu.txt │ │ │ ├── stopwords-id.txt │ │ │ ├── stopwords-it.txt │ │ │ ├── stopwords-ko.txt │ │ │ ├── stopwords-nb.txt │ │ │ ├── stopwords-nl.txt │ │ │ ├── stopwords-no.txt │ │ │ ├── stopwords-pl.txt │ │ │ ├── stopwords-pt.txt │ │ │ ├── stopwords-ro.txt │ │ │ ├── stopwords-ru.txt │ │ │ ├── stopwords-sv.txt │ │ │ └── stopwords-zh.txt │ └── scala │ │ └── com │ │ └── intenthq │ │ └── gander │ │ ├── DocumentCleaner.scala │ │ ├── Gander.scala │ │ ├── extractors │ │ └── ContentExtractor.scala │ │ ├── opengraph │ │ └── OpenGraphData.scala │ │ ├── text │ │ ├── StopWords.scala │ │ └── WordStats.scala │ │ └── utils │ │ ├── FileHelper.scala │ │ └── JSoup.scala └── test │ ├── resources │ ├── engineering.intenthq.com_2015_03_what-is-good-code-a-scientific-definition_.gz │ ├── globoesporte.globo.com_futebol_times_sao-paulo_noticia_2012_04_filho-do-gramado-leao-administra-o-sao-paulo-na-base-da-conversa.html.gz │ ├── internacional.elpais.com_internacional_2015_07_28_actualidad_1438076596_960360.html.gz │ ├── log4j.properties │ ├── www.apple.com_watch_.gz │ ├── www.bbc.co.uk_sport_0_football_34203622.gz │ ├── www.bbc.com_news_business-33697945.gz │ ├── www.businessinsider.com_goldman-on-the-fed-announcement-2011-9.gz │ ├── www.corriere.it_cronache_15_luglio_29_relazione-alfano-mafia-fatti-gravi-sindaco-ha-sottovalutato-25146a6c-35b0-11e5-b050-7dc71ce7db4c.shtml.gz │ ├── www.dailymail.co.uk_news_article-486484_A-spectacular-destruction-How-email-led-downfall-barrister-all.html.gz │ ├── 
www.fcbarcelona.com_club_detail_article_30-years-since-visit-of-pope-john-paul-ii.gz │ └── www.lancenet.com.br_sao-paulo_Leao-Arena-Barueri-casa-Tricolor_0_675532605.html.gz │ └── scala │ └── com │ └── intenthq │ └── gander │ ├── ContentExtractorSpec.scala │ ├── DocumentCleanerSpec.scala │ ├── GanderSpec.scala │ ├── extractors │ └── ContentExtractorSpec.scala │ ├── opengraph │ └── OpenGraphDataSpec.scala │ ├── text │ └── StopWordsTest.scala │ └── utils │ └── FileHelperTest.scala └── version.sbt /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .idea/ 3 | .idea_modules/ 4 | target/ 5 | /goose.iml 6 | /goose.ipr 7 | /goose.iws 8 | config/development.scala 9 | config/production.scala 10 | config/test.scala 11 | target/ 12 | log/ 13 | perf/run_ab 14 | dist/ 15 | project/boot/ 16 | project/plugins/project/ 17 | project/plugins/src_managed/ 18 | *.log 19 | *.tmproj 20 | lib_managed/ 21 | *.swp 22 | *.iml 23 | *~ 24 | *# 25 | .#* 26 | .idea 27 | .DS_Store 28 | pmip/ 29 | .history 30 | .cache 31 | .classpath 32 | .project 33 | .settings/ 34 | Capfile.* 35 | geoip-db/ 36 | config/consumer.properties 37 | release/ 38 | *.sw* 39 | .DS_Store 40 | bin/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.11.7 4 | script: "sbt clean coverage test it:test" 5 | after_success: "sbt coveralls" 6 | cache: 7 | directories: 8 | - ~/.ivy2 9 | - ~/.sbt 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 
9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | This product includes software developed by Intent HQ 2 | (http://www.intenthq.com/). 3 | 4 | This product includes software developed by Gravity.com 5 | (http://www.gravity.com/). 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gander [![Build Status](https://img.shields.io/travis/intenthq/gander/master.svg)](https://travis-ci.org/intenthq/gander) [![Coverage Status](https://img.shields.io/coveralls/intenthq/gander.svg)](https://coveralls.io/github/intenthq/gander?branch=master) [![Maven Central](https://img.shields.io/maven-central/v/com.intenthq/gander_2.11.svg)](http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22com.intenthq%22%20AND%20a%3A%22gander_2.11%22) [![Join the chat at https://gitter.im/intenthq/gander](https://img.shields.io/badge/gitter-join%20chat-green.svg)](https://gitter.im/intenthq/gander) 2 | 3 | **Gander is a Scala library that extracts metadata and content from web pages.** 4 | 5 | It is based on [Goose](https://github.com/GravityLabs/goose) with the idea to: 6 | - Simplify its codebase by removing some of its functionality (like crawling, there are plenty of projects that do it well) 7 | - Keep it alive (Goose has been inactive for several years now) 8 | - Make its codebase more functional and take advantage of some of the newer Scala features 9 | 10 | ## What data does it extract? 11 | 12 | Gander will try to extract three different kinds of data from a web page: 13 | - Metadata: (title, meta description, meta keywords, language, canonical link, open graph data, 14 | publish date) 15 | - Main text for the page 16 | - Links present in the main text of the page 17 | 18 | ## Using Gander 19 | 20 | ### Adding the dependency 21 | 22 | The artefact is published in Maven Central. 
If you are using sbt you just need to add 23 | the following line (remember to replace 1.0 with the latest version): 24 | ``` 25 | "com.intenthq" %% "gander" % "1.0" 26 | ``` 27 | ### In your code 28 | 29 | Gander provides a single object and a single method to access its functionality 30 | and it's pretty straightforward and intuitive to use. 31 | 32 | These three lines of code, for example, will download the specified URL (using 33 | Guava) and extract the page information from the raw HTML: 34 | ```scala 35 | val url = "http://engineering.intenthq.com" 36 | val rawHTML = Resources.toString(new URL(url), charset) 37 | println(Gander.extract(rawHTML)) 38 | 39 | ``` 40 | 41 | You can find more examples in our tests. 42 | 43 | ## Philosophy 44 | 45 | The idea behind Gander is to do one thing and do it well. That's why we've 46 | removed some of the features that were not related to its core functionality. 47 | 48 | This project will always try to be better at extracting data and information 49 | from webpages. But it won't deal with other (probably related but not core) 50 | functionalities (like downloading HTML from URLs). 51 | 52 | ## Collaborate 53 | 54 | Please, feel free to raise an issue, fork the repo, send pull requests... 55 | Any idea or improvement will be welcome. 56 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | organization := "com.intenthq" 4 | 5 | organizationName := "Intent HQ" 6 | 7 | organizationHomepage := Some(url("http://www.intenthq.com")) 8 | 9 | name := "gander" 10 | 11 | description := "Extracts text, metadata from web pages." 
12 | 13 | homepage := Some(url("https://github.com/intenthq/gander")) 14 | 15 | developers := List( 16 | Developer(id = "albertpastrana", name = "Albert Pastrana", email = "", url = new URL("https://github.com/albertpastrana")), 17 | Developer(id = "ArturSoler", name = "Artur Soler", email = "", url = new URL("https://github.com/ArturSoler")) 18 | ) 19 | 20 | scmInfo := Some( 21 | ScmInfo( 22 | browseUrl = new URL("https://github.com/intenthq/gander"), 23 | connection = "scm:git:git@github.com:intenthq/gander.git" 24 | ) 25 | ) 26 | 27 | licenses += "Apache2" -> url("http://www.apache.org/licenses/") 28 | 29 | scalaVersion := "2.11.11" 30 | 31 | crossScalaVersions := Seq("2.11.11", "2.12.4") 32 | 33 | resolvers += "scalaz-bintray" at "http://dl.bintray.com/scalaz/releases" 34 | 35 | Defaults.itSettings 36 | 37 | scalacOptions ++= Seq( 38 | "-Xlint", 39 | "-Xfatal-warnings", 40 | "-unchecked", 41 | "-deprecation", 42 | "-feature") 43 | 44 | testOptions in Test += Tests.Argument("-oF") 45 | 46 | credentials += Credentials(Path.userHome / ".ivy2" / ".maven-credentials") 47 | 48 | libraryDependencies ++= Seq( 49 | "com.google.guava" % "guava" % "19.0", 50 | "joda-time" % "joda-time" % "2.9.3", 51 | "org.joda" % "joda-convert" % "1.8.1", 52 | "org.jsoup" % "jsoup" % "1.9.1", 53 | "org.slf4j" % "slf4j-api" % "1.7.21", 54 | "org.specs2" %% "specs2-core" % "4.0.2" % "it,test" 55 | ) 56 | 57 | scalacOptions ++= Seq("-unchecked", "-deprecation") 58 | 59 | publishTo := Some("Sonatype Snapshots Nexus" at "https://oss.sonatype.org/service/local/staging/deploy/maven2") 60 | 61 | releasePublishArtifactsAction := PgpKeys.publishSigned.value 62 | 63 | lazy val root = project.in(file(".")).configs(IntegrationTest) 64 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.9 2 | 
-------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Classpaths.sbtPluginReleases 2 | 3 | addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.3.4") 4 | 5 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1") 6 | 7 | addSbtPlugin("org.scoverage" % "sbt-coveralls" % "1.2.2") 8 | 9 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.7") 10 | 11 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.0") 12 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-ar.txt: -------------------------------------------------------------------------------- 1 | فى 2 | في 3 | كل 4 | لم 5 | لن 6 | له 7 | من 8 | هو 9 | هي 10 | قوة 11 | كما 12 | لها 13 | منذ 14 | وقد 15 | ولا 16 | نفسه 17 | لقاء 18 | مقابل 19 | هناك 20 | وقال 21 | وكان 22 | نهاية 23 | وقالت 24 | وكانت 25 | للامم 26 | فيه 27 | كلم 28 | لكن 29 | وفي 30 | وقف 31 | ولم 32 | ومن 33 | وهو 34 | وهي 35 | يوم 36 | فيها 37 | منها 38 | مليار 39 | لوكالة 40 | يكون 41 | يمكن 42 | مليون 43 | حيث 44 | اكد 45 | الا 46 | اما 47 | امس 48 | السابق 49 | التى 50 | التي 51 | اكثر 52 | ايار 53 | ايضا 54 | ثلاثة 55 | الذاتي 56 | الاخيرة 57 | الثاني 58 | الثانية 59 | الذى 60 | الذي 61 | الان 62 | امام 63 | ايام 64 | خلال 65 | حوالى 66 | الذين 67 | الاول 68 | الاولى 69 | بين 70 | ذلك 71 | دون 72 | حول 73 | حين 74 | الف 75 | الى 76 | انه 77 | اول 78 | ضمن 79 | انها 80 | جميع 81 | الماضي 82 | الوقت 83 | المقبل 84 | اليوم 85 | ـ 86 | ف 87 | و 88 | و6 89 | قد 90 | لا 91 | ما 92 | مع 93 | مساء 94 | هذا 95 | واحد 96 | واضاف 97 | واضافت 98 | فان 99 | قبل 100 | قال 101 | كان 102 | لدى 103 | نحو 104 | هذه 105 | وان 106 | واكد 107 | كانت 108 | واوضح 109 | مايو 110 | ب 111 | ا 112 | أ 113 | ، 114 | عشر 115 | عدد 116 | عدة 117 | عشرة 118 | عدم 119 | عام 120 | عاما 121 | عن 122 | عند 123 | عندما 124 | على 125 | عليه 126 
| عليها 127 | زيارة 128 | سنة 129 | سنوات 130 | تم 131 | ضد 132 | بعد 133 | بعض 134 | اعادة 135 | اعلنت 136 | بسبب 137 | حتى 138 | اذا 139 | احد 140 | اثر 141 | برس 142 | باسم 143 | غدا 144 | شخصا 145 | صباح 146 | اطار 147 | اربعة 148 | اخرى 149 | بان 150 | اجل 151 | غير 152 | بشكل 153 | حاليا 154 | بن 155 | به 156 | ثم 157 | اف 158 | ان 159 | او 160 | اي 161 | بها 162 | صفر -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-ca.txt: -------------------------------------------------------------------------------- 1 | a 2 | abans 3 | ací 4 | ah 5 | així 6 | això 7 | al 8 | als 9 | aleshores 10 | algun 11 | alguna 12 | algunes 13 | alguns 14 | alhora 15 | allà 16 | allí 17 | allò 18 | altra 19 | altre 20 | altres 21 | amb 22 | ambdós 23 | ambdues 24 | apa 25 | aquell 26 | aquella 27 | aquelles 28 | aquells 29 | aquest 30 | aquesta 31 | aquestes 32 | aquests 33 | aquí 34 | baix 35 | cada 36 | cadascú 37 | cadascuna 38 | cadascunes 39 | cadascuns 40 | com 41 | contra 42 | d'un 43 | d'una 44 | d'unes 45 | d'uns 46 | dalt 47 | de 48 | del 49 | dels 50 | des 51 | després 52 | dins 53 | dintre 54 | donat 55 | doncs 56 | durant 57 | e 58 | eh 59 | el 60 | els 61 | em 62 | en 63 | encara 64 | ens 65 | entre 66 | érem 67 | eren 68 | éreu 69 | es 70 | és 71 | esta 72 | està 73 | estàvem 74 | estaven 75 | estàveu 76 | esteu 77 | et 78 | etc 79 | ets 80 | fins 81 | fora 82 | gairebé 83 | ha 84 | han 85 | has 86 | havia 87 | he 88 | hem 89 | heu 90 | hi 91 | ho 92 | i 93 | igual 94 | iguals 95 | ja 96 | l'hi 97 | la 98 | les 99 | li 100 | li'n 101 | llavors 102 | m'he 103 | ma 104 | mal 105 | malgrat 106 | mateix 107 | mateixa 108 | mateixes 109 | mateixos 110 | me 111 | mentre 112 | més 113 | meu 114 | meus 115 | meva 116 | meves 117 | molt 118 | molta 119 | moltes 120 | molts 121 | mon 122 | mons 123 | n'he 124 | n'hi 125 | ne 126 | ni 127 | no 128 | nogensmenys 129 | només 130 | nosaltres 131 | 
nostra 132 | nostre 133 | nostres 134 | o 135 | oh 136 | oi 137 | on 138 | pas 139 | pel 140 | pels 141 | per 142 | però 143 | perquè 144 | poc 145 | poca 146 | pocs 147 | poques 148 | potser 149 | propi 150 | qual 151 | quals 152 | quan 153 | quant 154 | que 155 | què 156 | quelcom 157 | qui 158 | quin 159 | quina 160 | quines 161 | quins 162 | s'ha 163 | s'han 164 | sa 165 | semblant 166 | semblants 167 | ses 168 | seu 169 | seus 170 | seva 171 | seva 172 | seves 173 | si 174 | sobre 175 | sobretot 176 | sóc 177 | solament 178 | sols 179 | son 180 | són 181 | sons 182 | sota 183 | sou 184 | t'ha 185 | t'han 186 | t'he 187 | ta 188 | tal 189 | també 190 | tampoc 191 | tan 192 | tant 193 | tanta 194 | tantes 195 | teu 196 | teus 197 | teva 198 | teves 199 | ton 200 | tons 201 | tot 202 | tota 203 | totes 204 | tots 205 | un 206 | una 207 | unes 208 | uns 209 | us 210 | va 211 | vaig 212 | vam 213 | van 214 | vas 215 | veu 216 | vosaltres 217 | vostra 218 | vostre 219 | vostres 220 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-da.txt: -------------------------------------------------------------------------------- 1 | af 2 | alle 3 | andet 4 | andre 5 | at 6 | begge 7 | da 8 | de 9 | den 10 | denne 11 | der 12 | deres 13 | det 14 | dette 15 | dig 16 | din 17 | dog 18 | du 19 | ej 20 | eller 21 | en 22 | end 23 | ene 24 | eneste 25 | enhver 26 | et 27 | fem 28 | fire 29 | flere 30 | fleste 31 | for 32 | fordi 33 | forrige 34 | fra 35 | få 36 | før 37 | god 38 | han 39 | hans 40 | har 41 | hendes 42 | her 43 | hun 44 | hvad 45 | hvem 46 | hver 47 | hvilken 48 | hvis 49 | hvor 50 | hvordan 51 | hvorfor 52 | hvornår 53 | i 54 | ikke 55 | ind 56 | ingen 57 | intet 58 | jeg 59 | jeres 60 | kan 61 | kom 62 | kommer 63 | lav 64 | lidt 65 | lille 66 | man 67 | mand 68 | mange 69 | med 70 | meget 71 | men 72 | mens 73 | mere 74 | mig 75 | ned 76 | ni 77 | nogen 78 | noget 79 | ny 80 | 
nyt 81 | nær 82 | næste 83 | næsten 84 | og 85 | op 86 | otte 87 | over 88 | på 89 | se 90 | seks 91 | ses 92 | som 93 | stor 94 | store 95 | syv 96 | ti 97 | til 98 | to 99 | tre 100 | ud 101 | var 102 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-de.txt: -------------------------------------------------------------------------------- 1 | 2 | /DIE 3 | Ab 4 | Aber 5 | Abgeordneten 6 | Alle 7 | Allerdings 8 | Als 9 | Alter 10 | Am 11 | Amt 12 | An 13 | Anfang 14 | Angaben 15 | Antrag 16 | April 17 | Arbeit 18 | Art 19 | Artikel 20 | Auch 21 | Auf 22 | Aufgabe 23 | Augen 24 | August 25 | Aus 26 | Außerdem 27 | Bad 28 | Band 29 | Bau 30 | Bayern 31 | Bedeutung 32 | Beginn 33 | Begriff 34 | Bei 35 | Beifall 36 | Beim 37 | Beispiel 38 | Bereich 39 | Bericht 40 | Berliner 41 | Bevölkerung 42 | Bild 43 | Bilder 44 | Bis 45 | Blick 46 | Buch 47 | Bundesregierung 48 | BÜNDNIS 49 | Bürger 50 | Bürgermeister 51 | CDU 52 | CDU/CSU 53 | China 54 | DM 55 | Da 56 | Dabei 57 | Damit 58 | Dann 59 | Das 60 | Daten 61 | Dazu 62 | Den 63 | Denn 64 | Der 65 | Deshalb 66 | Deutsche 67 | Deutschen 68 | Deutschland 69 | Dezember 70 | Die 71 | Dienstag 72 | Dies 73 | Diese 74 | Dieser 75 | Dieses 76 | Doch 77 | Donnerstag 78 | Dort 79 | Dr 80 | Dr. 
81 | Druck 82 | Du 83 | Durch 84 | Ein 85 | Eine 86 | Einsatz 87 | Einwohner 88 | Eltern 89 | Ende 90 | Entscheidung 91 | Entwicklung 92 | Er 93 | Erfolg 94 | Ergebnis 95 | Erst 96 | Es 97 | Euro 98 | Europa 99 | Europäischen 100 | FC 101 | Fall 102 | Familie 103 | Februar 104 | Fenster 105 | Film 106 | Firma 107 | Folge 108 | Form 109 | Frage 110 | Fragen 111 | Frankfurt 112 | Frankfurter 113 | Frankreich 114 | Franz 115 | Frau 116 | Frauen 117 | Freitag 118 | Friedrich 119 | Für 120 | GRÜNEN 121 | Gebiet 122 | Geld 123 | Gemeinde 124 | Gemeinden 125 | Geschichte 126 | Gesellschaft 127 | Gesetz 128 | GmbH 129 | Gott 130 | Grund 131 | Gruppe 132 | Grünen 133 | Hamburg 134 | Hand 135 | Hans 136 | Haus 137 | Hause 138 | Heinrich 139 | Herr 140 | Herren 141 | Herrn 142 | Heute 143 | Hier 144 | Hilfe 145 | Hälfte 146 | Höhe 147 | ISBN 148 | Ich 149 | Ihnen 150 | Ihr 151 | Ihre 152 | Im 153 | In 154 | Informationen 155 | Interesse 156 | Internet 157 | Ist 158 | Italien 159 | Ja 160 | Jahr 161 | Jahre 162 | Jahren 163 | Jahres 164 | Jahrhundert 165 | Jahrhunderts 166 | Januar 167 | Jetzt 168 | Johann 169 | John 170 | Juli 171 | Juni 172 | Kampf 173 | Karl 174 | Karriere 175 | Kilometer 176 | Kind 177 | Kinder 178 | Kirche 179 | Klaus 180 | Kollegen 181 | Kommission 182 | Kopf 183 | Kosten 184 | Krieg 185 | Kritik 186 | Kultur 187 | Kunst 188 | Köln 189 | König 190 | Lage 191 | Land 192 | Landes 193 | Leben 194 | Leute 195 | Liebe 196 | Liste 197 | Literatur 198 | London 199 | Länder 200 | Ländern 201 | Mai 202 | Mal 203 | Man 204 | Mann 205 | Mannheim 206 | Mannschaft 207 | Mark 208 | Markt 209 | Martin 210 | Maßnahmen 211 | Meine 212 | Meinung 213 | Menschen 214 | Meter 215 | Michael 216 | Milliarden 217 | Millionen 218 | Minuten 219 | Mit 220 | Mitarbeiter 221 | Mitglied 222 | Mitglieder 223 | Mitte 224 | Mittel 225 | Mittwoch 226 | Monate 227 | Monaten 228 | Montag 229 | Morgen 230 | Musik 231 | Mutter 232 | Männer 233 | März 234 | Möglichkeit 235 | München 236 | Nach 
237 | Nachdem 238 | Nacht 239 | Name 240 | Namen 241 | Neben 242 | Nein 243 | Neue 244 | New 245 | Nicht 246 | Noch 247 | Norden 248 | November 249 | Nr. 250 | Nun 251 | Nur 252 | Nähe 253 | Oktober 254 | Opfer 255 | Ort 256 | Osten 257 | PDS 258 | Paris 259 | Parlament 260 | Partei 261 | Paul 262 | Personen 263 | Peter 264 | Platz 265 | Politik 266 | Politiker 267 | Polizei 268 | Preis 269 | Problem 270 | Probleme 271 | Programm 272 | Prozent 273 | Präsident 274 | Punkt 275 | Quellen 276 | Rahmen 277 | Rat 278 | Raum 279 | Recht 280 | Regel 281 | Regie 282 | Regierung 283 | Region 284 | Reihe 285 | Richtung 286 | Rolle 287 | SPD 288 | Sache 289 | Saison 290 | Samstag 291 | Schon 292 | Schule 293 | Schweiz 294 | Schweizer 295 | Sein 296 | Seine 297 | Seit 298 | Seite 299 | Seiten 300 | September 301 | Sicherheit 302 | Sie 303 | Siehe 304 | Situation 305 | So 306 | Sohn 307 | Soldaten 308 | Sommer 309 | Sonntag 310 | Spiel 311 | Spiele 312 | Spieler 313 | Sprache 314 | St. 315 | Staat 316 | Staaten 317 | Stadt 318 | Stelle 319 | Straße 320 | Stunden 321 | Stuttgart 322 | System 323 | Tag 324 | Tage 325 | Tagen 326 | Team 327 | Teil 328 | Tel. 
329 | The 330 | Thema 331 | Thomas 332 | Titel 333 | Tochter 334 | Tod 335 | Trainer 336 | USA 337 | Uhr 338 | Um 339 | Und 340 | Union 341 | Universität 342 | Unter 343 | Unternehmen 344 | Unterstützung 345 | Vater 346 | Verein 347 | Verfahren 348 | Verfügung 349 | Verlag 350 | Viele 351 | Von 352 | Vor 353 | Wahl 354 | Was 355 | Wasser 356 | Weblinks 357 | Weg 358 | Weise 359 | Weitere 360 | Welt 361 | Wenn 362 | Wer 363 | Werk 364 | Werke 365 | Westen 366 | Wie 367 | Wien 368 | Wilhelm 369 | Wir 370 | Wirtschaft 371 | Woche 372 | Wochen 373 | Wolfgang 374 | Wort 375 | Während 376 | York 377 | Zahl 378 | Zeit 379 | Zeitung 380 | Ziel 381 | Zu 382 | Zukunft 383 | Zum 384 | Zur 385 | Zusammenarbeit 386 | Zusammenhang 387 | ab 388 | aber 389 | acht 390 | alle 391 | allein 392 | allem 393 | allen 394 | aller 395 | allerdings 396 | alles 397 | als 398 | also 399 | alte 400 | alten 401 | am 402 | an 403 | ander 404 | andere 405 | anderem 406 | anderen 407 | anderer 408 | anderes 409 | anderm 410 | andern 411 | anderr 412 | anders 413 | arbeiten 414 | auch 415 | auf 416 | aufgrund 417 | aus 418 | außerdem 419 | bald 420 | bedeutet 421 | befindet 422 | begann 423 | bei 424 | beide 425 | beiden 426 | beim 427 | beispielsweise 428 | bekannt 429 | bekommen 430 | bereit 431 | bereits 432 | besonders 433 | besser 434 | besteht 435 | besten 436 | bevor 437 | bezeichnet 438 | bietet 439 | bin 440 | bis 441 | bisher 442 | bist 443 | bleiben 444 | bleibt 445 | blieb 446 | bringen 447 | bzw. 448 | c 449 | ca. 
450 | da 451 | dabei 452 | dadurch 453 | dafür 454 | dagegen 455 | daher 456 | damals 457 | damit 458 | danach 459 | dann 460 | daran 461 | darauf 462 | darf 463 | darin 464 | darunter 465 | darüber 466 | das 467 | dass 468 | dasselbe 469 | davon 470 | dazu 471 | daß 472 | de 473 | dein 474 | deine 475 | deinem 476 | deinen 477 | deiner 478 | deines 479 | dem 480 | demselben 481 | den 482 | denen 483 | denn 484 | denselben 485 | der 486 | deren 487 | derer 488 | derselbe 489 | derselben 490 | derzeit 491 | des 492 | deshalb 493 | desselben 494 | dessen 495 | deutlich 496 | deutsche 497 | deutschen 498 | deutscher 499 | dich 500 | die 501 | dies 502 | diese 503 | dieselbe 504 | dieselben 505 | diesem 506 | diesen 507 | dieser 508 | dieses 509 | dir 510 | direkt 511 | doch 512 | dort 513 | dpa 514 | drei 515 | du 516 | durch 517 | dürfen 518 | eben 519 | ebenfalls 520 | ebenso 521 | ehemaligen 522 | eher 523 | eigene 524 | eigenen 525 | eigentlich 526 | ein 527 | eine 528 | einem 529 | einen 530 | einer 531 | eines 532 | einfach 533 | eingesetzt 534 | einig 535 | einige 536 | einigem 537 | einigen 538 | einiger 539 | einiges 540 | einmal 541 | einzelnen 542 | einzige 543 | electronic 544 | entwickelt 545 | er 546 | erhalten 547 | erhielt 548 | erklärt 549 | erklärte 550 | erneut 551 | erreichen 552 | erreicht 553 | erst 554 | erste 555 | ersten 556 | erster 557 | erstmals 558 | es 559 | etwa 560 | etwas 561 | euch 562 | euer 563 | eure 564 | eurem 565 | euren 566 | eurer 567 | eures 568 | europäischen 569 | fand 570 | fast 571 | fest 572 | finden 573 | findet 574 | folgenden 575 | for 576 | frei 577 | früher 578 | führen 579 | führt 580 | führte 581 | fünf 582 | für 583 | gab 584 | ganz 585 | ganze 586 | ganzen 587 | gar 588 | geben 589 | gebracht 590 | gefunden 591 | gegeben 592 | gegen 593 | gegenüber 594 | gegründet 595 | gehen 596 | geht 597 | gehören 598 | gehört 599 | gehörte 600 | gekommen 601 | gemacht 602 | gemeinsam 603 | genannt 604 | genau 605 | genommen 
606 | genug 607 | gerade 608 | gesagt 609 | gesehen 610 | gestellt 611 | gestern 612 | gewann 613 | gewesen 614 | geworden 615 | gibt 616 | gilt 617 | ging 618 | gleich 619 | gleichen 620 | gleichzeitig 621 | große 622 | großen 623 | großer 624 | größte 625 | größten 626 | gut 627 | gute 628 | guten 629 | hab 630 | habe 631 | haben 632 | halten 633 | handelt 634 | hat 635 | hatte 636 | hatten 637 | heißt 638 | her 639 | heute 640 | heutigen 641 | hier 642 | hin 643 | hinaus 644 | hinter 645 | hoch 646 | hohe 647 | hohen 648 | hält 649 | hätte 650 | hätten 651 | häufig 652 | ich 653 | ihm 654 | ihn 655 | ihnen 656 | ihr 657 | ihre 658 | ihrem 659 | ihren 660 | ihrer 661 | ihres 662 | im 663 | immer 664 | in 665 | indem 666 | innerhalb 667 | ins 668 | insbesondere 669 | insgesamt 670 | internationalen 671 | inzwischen 672 | ist 673 | ja 674 | je 675 | jede 676 | jedem 677 | jeden 678 | jeder 679 | jedes 680 | jedoch 681 | jene 682 | jenem 683 | jenen 684 | jener 685 | jenes 686 | jetzt 687 | jeweils 688 | kam 689 | kamen 690 | kann 691 | kaum 692 | kein 693 | keine 694 | keinem 695 | keinen 696 | keiner 697 | keines 698 | klar 699 | kleine 700 | kleinen 701 | km 702 | knapp 703 | kommen 704 | kommt 705 | konnte 706 | konnten 707 | kurz 708 | könne 709 | können 710 | könnte 711 | könnten 712 | lag 713 | lang 714 | lange 715 | lassen 716 | laut 717 | leben 718 | lediglich 719 | leicht 720 | letzte 721 | letzten 722 | liegen 723 | liegt 724 | ließ 725 | lässt 726 | läuft 727 | machen 728 | macht 729 | machte 730 | mal 731 | man 732 | manche 733 | manchem 734 | manchen 735 | mancher 736 | manches 737 | mehr 738 | mehrere 739 | mein 740 | meine 741 | meinem 742 | meinen 743 | meiner 744 | meines 745 | meist 746 | meisten 747 | mich 748 | mindestens 749 | mir 750 | mit 751 | muss 752 | musste 753 | muß 754 | möchte 755 | möglich 756 | müsse 757 | müssen 758 | nach 759 | nachdem 760 | nahm 761 | natürlich 762 | neben 763 | nehmen 764 | neu 765 | neue 766 | neuen 767 | nicht 
768 | nichts 769 | nie 770 | nimmt 771 | noch 772 | nun 773 | nur 774 | nächsten 775 | nämlich 776 | ob 777 | oben 778 | obwohl 779 | oder 780 | of 781 | oft 782 | ohne 783 | paar 784 | per 785 | politische 786 | politischen 787 | pro 788 | recht 789 | richtig 790 | rund 791 | s 792 | sagen 793 | sagt 794 | sagte 795 | sah 796 | scheint 797 | schließen 798 | schließlich 799 | schnell 800 | schon 801 | schwer 802 | sechs 803 | sehen 804 | sehr 805 | sei 806 | seien 807 | sein 808 | seine 809 | seinem 810 | seinen 811 | seiner 812 | seines 813 | seit 814 | selbst 815 | setzt 816 | setzte 817 | sich 818 | sicher 819 | sie 820 | sieben 821 | siehe 822 | sieht 823 | sind 824 | so 825 | sogar 826 | solche 827 | solchem 828 | solchen 829 | solcher 830 | solches 831 | soll 832 | sollen 833 | sollte 834 | sollten 835 | sondern 836 | sonst 837 | sowie 838 | sowohl 839 | spielen 840 | spielt 841 | spielte 842 | sprechen 843 | spricht 844 | später 845 | stand 846 | stark 847 | statt 848 | stehen 849 | steht 850 | stellen 851 | stellt 852 | stellte 853 | tatsächlich 854 | taz 855 | teilweise 856 | the 857 | trat 858 | trotz 859 | tun 860 | um 861 | und 862 | uns 863 | unse 864 | unsem 865 | unsen 866 | unser 867 | unsere 868 | unserer 869 | unses 870 | unter 871 | vergangenen 872 | verschiedene 873 | verschiedenen 874 | version 875 | versucht 876 | verwendet 877 | viel 878 | viele 879 | vielen 880 | vielleicht 881 | vier 882 | vom 883 | von 884 | vor 885 | völlig 886 | war 887 | waren 888 | warst 889 | was 890 | weg 891 | wegen 892 | weil 893 | weit 894 | weiter 895 | weitere 896 | weiteren 897 | weiterhin 898 | weiß 899 | welche 900 | welchem 901 | welchen 902 | welcher 903 | welches 904 | wenig 905 | weniger 906 | wenn 907 | wer 908 | werde 909 | werden 910 | wichtig 911 | wie 912 | wieder 913 | will 914 | wir 915 | wird 916 | wirklich 917 | wirst 918 | wissen 919 | wo 920 | wobei 921 | wohl 922 | wollen 923 | wollte 924 | worden 925 | wurde 926 | wurden 927 | während 928 | 
wäre 929 | würde 930 | würden 931 | z. 932 | z.B. 933 | zahlreiche 934 | zehn 935 | zeigen 936 | zeigt 937 | zu 938 | zudem 939 | zuletzt 940 | zum 941 | zumindest 942 | zunächst 943 | zur 944 | zurück 945 | zusammen 946 | zuvor 947 | zwar 948 | zwei 949 | zweite 950 | zweiten 951 | zwischen 952 | Österreich 953 | Über 954 | öffentlichen 955 | über 956 | überhaupt 957 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-en.txt: -------------------------------------------------------------------------------- 1 | a's 2 | able 3 | about 4 | above 5 | according 6 | accordingly 7 | across 8 | actually 9 | after 10 | afterwards 11 | again 12 | against 13 | ain't 14 | all 15 | allow 16 | allows 17 | almost 18 | alone 19 | along 20 | already 21 | also 22 | although 23 | always 24 | am 25 | among 26 | amongst 27 | an 28 | and 29 | another 30 | any 31 | anybody 32 | anyhow 33 | anyone 34 | anything 35 | anyway 36 | anyways 37 | anywhere 38 | apart 39 | appear 40 | appreciate 41 | appropriate 42 | are 43 | aren't 44 | around 45 | as 46 | aside 47 | ask 48 | asking 49 | associated 50 | at 51 | available 52 | away 53 | awfully 54 | be 55 | became 56 | because 57 | become 58 | becomes 59 | becoming 60 | been 61 | before 62 | beforehand 63 | behind 64 | being 65 | believe 66 | below 67 | beside 68 | besides 69 | best 70 | better 71 | between 72 | beyond 73 | both 74 | brief 75 | but 76 | by 77 | c 78 | c'mon 79 | c's 80 | came 81 | campaign 82 | can 83 | can't 84 | cannot 85 | cant 86 | cause 87 | causes 88 | certain 89 | certainly 90 | changes 91 | clearly 92 | co 93 | com 94 | come 95 | comes 96 | concerning 97 | consequently 98 | consider 99 | considering 100 | contain 101 | containing 102 | contains 103 | corresponding 104 | could 105 | couldn't 106 | course 107 | criticized 108 | currently 109 | definitely 110 | described 111 | despite 112 | did 113 | didn't 114 | different 115 | do 116 | 
does 117 | doesn't 118 | doing 119 | don't 120 | done 121 | down 122 | downwards 123 | during 124 | each 125 | edu 126 | eight 127 | either 128 | else 129 | elsewhere 130 | endorsed 131 | enough 132 | entirely 133 | especially 134 | et 135 | etc 136 | even 137 | ever 138 | every 139 | everybody 140 | everyone 141 | everything 142 | everywhere 143 | ex 144 | exactly 145 | example 146 | except 147 | far 148 | few 149 | fifth 150 | financial 151 | first 152 | five 153 | followed 154 | following 155 | follows 156 | for 157 | former 158 | formerly 159 | forth 160 | four 161 | from 162 | further 163 | furthermore 164 | get 165 | gets 166 | getting 167 | given 168 | gives 169 | go 170 | goes 171 | going 172 | gone 173 | got 174 | gotten 175 | greetings 176 | had 177 | hadn't 178 | happens 179 | hardly 180 | has 181 | hasn't 182 | have 183 | haven't 184 | having 185 | he 186 | he's 187 | hello 188 | help 189 | hence 190 | her 191 | here 192 | here's 193 | hereafter 194 | hereby 195 | herein 196 | hereupon 197 | hers 198 | herself 199 | hi 200 | him 201 | himself 202 | his 203 | hither 204 | hopefully 205 | how 206 | howbeit 207 | however 208 | i'd 209 | i'll 210 | i'm 211 | i've 212 | if 213 | ignored 214 | immediate 215 | in 216 | inasmuch 217 | inc 218 | indeed 219 | indicate 220 | indicated 221 | indicates 222 | inner 223 | insofar 224 | instead 225 | into 226 | inward 227 | is 228 | isn't 229 | it 230 | it'd 231 | it'll 232 | it's 233 | its 234 | itself 235 | just 236 | keep 237 | keeps 238 | kept 239 | know 240 | known 241 | knows 242 | last 243 | lately 244 | later 245 | latter 246 | latterly 247 | least 248 | less 249 | lest 250 | let 251 | let's 252 | like 253 | liked 254 | likely 255 | little 256 | look 257 | looking 258 | looks 259 | ltd 260 | mainly 261 | many 262 | may 263 | maybe 264 | me 265 | mean 266 | meanwhile 267 | merely 268 | might 269 | more 270 | moreover 271 | most 272 | mostly 273 | much 274 | must 275 | my 276 | myself 277 | name 278 | namely 279 
| nd 280 | near 281 | nearly 282 | necessary 283 | need 284 | needs 285 | neither 286 | never 287 | nevertheless 288 | new 289 | next 290 | nine 291 | no 292 | nobody 293 | non 294 | none 295 | noone 296 | nor 297 | normally 298 | not 299 | nothing 300 | novel 301 | now 302 | nowhere 303 | obviously 304 | of 305 | off 306 | official 307 | often 308 | oh 309 | ok 310 | okay 311 | old 312 | on 313 | once 314 | one 315 | ones 316 | only 317 | onto 318 | or 319 | other 320 | others 321 | otherwise 322 | ought 323 | our 324 | ours 325 | ourselves 326 | out 327 | outside 328 | over 329 | overall 330 | own 331 | particular 332 | particularly 333 | per 334 | perhaps 335 | placed 336 | please 337 | plus 338 | possible 339 | presumably 340 | probably 341 | provides 342 | quarterly 343 | quite 344 | quote 345 | rather 346 | really 347 | reasonably 348 | regarding 349 | regardless 350 | regards 351 | relatively 352 | respectively 353 | right 354 | said 355 | same 356 | saw 357 | say 358 | saying 359 | says 360 | second 361 | secondly 362 | see 363 | seeing 364 | seem 365 | seemed 366 | seeming 367 | seems 368 | seen 369 | self 370 | selves 371 | sensible 372 | sent 373 | serious 374 | seriously 375 | seven 376 | several 377 | shall 378 | sharply 379 | she 380 | should 381 | shouldn't 382 | since 383 | six 384 | so 385 | some 386 | somebody 387 | somehow 388 | someone 389 | something 390 | sometime 391 | sometimes 392 | somewhat 393 | somewhere 394 | soon 395 | sorry 396 | specified 397 | specify 398 | specifying 399 | still 400 | sub 401 | such 402 | sup 403 | sure 404 | t's 405 | take 406 | taken 407 | tell 408 | tends 409 | than 410 | thank 411 | thanks 412 | thanx 413 | that 414 | that's 415 | thats 416 | the 417 | their 418 | theirs 419 | them 420 | themselves 421 | then 422 | thence 423 | there 424 | there's 425 | thereafter 426 | thereby 427 | therefore 428 | therein 429 | theres 430 | thereupon 431 | these 432 | they 433 | they'd 434 | they'll 435 | they're 436 | 
they've 437 | think 438 | third 439 | this 440 | thorough 441 | thoroughly 442 | those 443 | though 444 | three 445 | through 446 | throughout 447 | thru 448 | thus 449 | to 450 | together 451 | too 452 | took 453 | toward 454 | towards 455 | tried 456 | tries 457 | truly 458 | try 459 | trying 460 | twice 461 | two 462 | under 463 | unfortunately 464 | unless 465 | unlikely 466 | until 467 | unto 468 | up 469 | upon 470 | us 471 | use 472 | used 473 | useful 474 | uses 475 | using 476 | usually 477 | uucp 478 | value 479 | various 480 | very 481 | via 482 | viz 483 | vs 484 | want 485 | wants 486 | was 487 | wasn't 488 | way 489 | we 490 | we'd 491 | we'll 492 | we're 493 | we've 494 | welcome 495 | well 496 | went 497 | were 498 | weren't 499 | what 500 | what's 501 | whatever 502 | when 503 | whence 504 | whenever 505 | where 506 | where's 507 | whereafter 508 | whereas 509 | whereby 510 | wherein 511 | whereupon 512 | wherever 513 | whether 514 | which 515 | while 516 | whither 517 | who 518 | who's 519 | whoever 520 | whole 521 | whom 522 | whose 523 | why 524 | will 525 | willing 526 | wish 527 | with 528 | within 529 | without 530 | won't 531 | wonder 532 | would 533 | wouldn't 534 | yes 535 | yet 536 | you 537 | you'd 538 | you'll 539 | you're 540 | you've 541 | your 542 | yours 543 | yourself 544 | yourselves 545 | zero 546 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-es.txt: -------------------------------------------------------------------------------- 1 | 2 | # forms of ser, to be (not including the infinitive): 3 | a 4 | al 5 | algo 6 | algunas 7 | algunos 8 | ante 9 | antes 10 | como 11 | con 12 | contra 13 | cual 14 | cuando 15 | de 16 | del 17 | desde 18 | donde 19 | durante 20 | e 21 | el 22 | ella 23 | ellas 24 | ellos 25 | en 26 | entre 27 | era 28 | erais 29 | eran 30 | eras 31 | eres 32 | es 33 | esa 34 | esas 35 | ese 36 | eso 37 | esos 38 | esta 39 | 
estaba 40 | estabais 41 | estaban 42 | estabas 43 | estad 44 | estada 45 | estadas 46 | estado 47 | estados 48 | estamos 49 | estando 50 | estar 51 | estaremos 52 | estará 53 | estarán 54 | estarás 55 | estaré 56 | estaréis 57 | estaría 58 | estaríais 59 | estaríamos 60 | estarían 61 | estarías 62 | estas 63 | este 64 | estemos 65 | esto 66 | estos 67 | estoy 68 | estuve 69 | estuviera 70 | estuvierais 71 | estuvieran 72 | estuvieras 73 | estuvieron 74 | estuviese 75 | estuvieseis 76 | estuviesen 77 | estuvieses 78 | estuvimos 79 | estuviste 80 | estuvisteis 81 | estuviéramos 82 | estuviésemos 83 | estuvo 84 | está 85 | estábamos 86 | estáis 87 | están 88 | estás 89 | esté 90 | estéis 91 | estén 92 | estés 93 | fue 94 | fuera 95 | fuerais 96 | fueran 97 | fueras 98 | fueron 99 | fuese 100 | fueseis 101 | fuesen 102 | fueses 103 | fui 104 | fuimos 105 | fuiste 106 | fuisteis 107 | fuéramos 108 | fuésemos 109 | ha 110 | habida 111 | habidas 112 | habido 113 | habidos 114 | habiendo 115 | habremos 116 | habrá 117 | habrán 118 | habrás 119 | habré 120 | habréis 121 | habría 122 | habríais 123 | habríamos 124 | habrían 125 | habrías 126 | habéis 127 | había 128 | habíais 129 | habíamos 130 | habían 131 | habías 132 | han 133 | has 134 | hasta 135 | hay 136 | haya 137 | hayamos 138 | hayan 139 | hayas 140 | hayáis 141 | he 142 | hemos 143 | hube 144 | hubiera 145 | hubierais 146 | hubieran 147 | hubieras 148 | hubieron 149 | hubiese 150 | hubieseis 151 | hubiesen 152 | hubieses 153 | hubimos 154 | hubiste 155 | hubisteis 156 | hubiéramos 157 | hubiésemos 158 | hubo 159 | la 160 | las 161 | le 162 | les 163 | lo 164 | los 165 | me 166 | mi 167 | mis 168 | mucho 169 | muchos 170 | muy 171 | más 172 | mí 173 | mía 174 | mías 175 | mío 176 | míos 177 | nada 178 | ni 179 | no 180 | nos 181 | nosotras 182 | nosotros 183 | nuestra 184 | nuestras 185 | nuestro 186 | nuestros 187 | o 188 | os 189 | otra 190 | otras 191 | otro 192 | otros 193 | para 194 | pero 195 | poco 196 | por 
197 | porque 198 | que 199 | quien 200 | quienes 201 | qué 202 | se 203 | sea 204 | seamos 205 | sean 206 | seas 207 | seremos 208 | será 209 | serán 210 | serás 211 | seré 212 | seréis 213 | sería 214 | seríais 215 | seríamos 216 | serían 217 | serías 218 | seáis 219 | sido 220 | siendo 221 | sin 222 | sobre 223 | sois 224 | somos 225 | son 226 | soy 227 | su 228 | sus 229 | suya 230 | suyas 231 | suyo 232 | suyos 233 | sí 234 | también 235 | tanto 236 | te 237 | tendremos 238 | tendrá 239 | tendrán 240 | tendrás 241 | tendré 242 | tendréis 243 | tendría 244 | tendríais 245 | tendríamos 246 | tendrían 247 | tendrías 248 | tened 249 | tenemos 250 | tenga 251 | tengamos 252 | tengan 253 | tengas 254 | tengo 255 | tengáis 256 | tenida 257 | tenidas 258 | tenido 259 | tenidos 260 | teniendo 261 | tenéis 262 | tenía 263 | teníais 264 | teníamos 265 | tenían 266 | tenías 267 | ti 268 | tiene 269 | tienen 270 | tienes 271 | todo 272 | todos 273 | tu 274 | tus 275 | tuve 276 | tuviera 277 | tuvierais 278 | tuvieran 279 | tuvieras 280 | tuvieron 281 | tuviese 282 | tuvieseis 283 | tuviesen 284 | tuvieses 285 | tuvimos 286 | tuviste 287 | tuvisteis 288 | tuviéramos 289 | tuviésemos 290 | tuvo 291 | tuya 292 | tuyas 293 | tuyo 294 | tuyos 295 | tú 296 | un 297 | una 298 | uno 299 | unos 300 | vosotras 301 | vosotros 302 | vuestra 303 | vuestras 304 | vuestro 305 | vuestros 306 | y 307 | ya 308 | yo 309 | él 310 | éramos 311 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-fi.txt: -------------------------------------------------------------------------------- 1 | alla 2 | ansiosta 3 | ehkä 4 | ei 5 | enemmän 6 | ennen 7 | etessa 8 | f 9 | haikki 10 | he 11 | hitaasti 12 | hoikein 13 | hyvin 14 | hän 15 | ilman 16 | ja 17 | jos 18 | jälkeen 19 | kanssa 20 | kaukana 21 | kenties 22 | keskellä 23 | kesken 24 | koskaan 25 | kuinkan 26 | kukka 27 | kylliksi 28 | kyllä 29 | liian 30 | lla 31 | 
lla 32 | luona 33 | lähellä 34 | läpi 35 | me 36 | miksi 37 | mikä 38 | milloin 39 | milloinkan 40 | minä 41 | missä 42 | miten 43 | nopeasti 44 | nyt 45 | oikea 46 | oikealla 47 | paljon 48 | siellä 49 | sinä 50 | ssa 51 | sta 52 | suoraan 53 | tai 54 | takana 55 | takia 56 | tarpeeksi 57 | te 58 | tässä 59 | ulkopuolella 60 | vahemmän 61 | vasen 62 | vasenmalla 63 | vastan 64 | vielä 65 | vieressä 66 | vähän 67 | yhdessä 68 | ylös 69 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-fr.txt: -------------------------------------------------------------------------------- 1 | ai 2 | aie 3 | aient 4 | aies 5 | ait 6 | as 7 | au 8 | aura 9 | aurai 10 | auraient 11 | aurais 12 | aurait 13 | auras 14 | aurez 15 | auriez 16 | aurions 17 | aurons 18 | auront 19 | aux 20 | avaient 21 | avais 22 | avait 23 | avec 24 | avez 25 | aviez 26 | avions 27 | avons 28 | ayant 29 | ayez 30 | ayons 31 | c 32 | ce 33 | ceci 34 | celà 35 | ces 36 | cet 37 | cette 38 | d 39 | dans 40 | de 41 | des 42 | du 43 | elle 44 | en 45 | es 46 | est 47 | et 48 | eu 49 | eue 50 | eues 51 | eurent 52 | eus 53 | eusse 54 | eussent 55 | eusses 56 | eussiez 57 | eussions 58 | eut 59 | eux 60 | eûmes 61 | eût 62 | eûtes 63 | furent 64 | fus 65 | fusse 66 | fussent 67 | fusses 68 | fussiez 69 | fussions 70 | fut 71 | fûmes 72 | fût 73 | fûtes 74 | ici 75 | il 76 | ils 77 | j 78 | je 79 | l 80 | la 81 | le 82 | les 83 | leur 84 | leurs 85 | lui 86 | m 87 | ma 88 | mais 89 | me 90 | mes 91 | moi 92 | mon 93 | même 94 | n 95 | ne 96 | nos 97 | notre 98 | nous 99 | on 100 | ont 101 | ou 102 | par 103 | pas 104 | pour 105 | qu 106 | que 107 | quel 108 | quelle 109 | quelles 110 | quels 111 | qui 112 | s 113 | sa 114 | sans 115 | se 116 | sera 117 | serai 118 | seraient 119 | serais 120 | serait 121 | seras 122 | serez 123 | seriez 124 | serions 125 | serons 126 | seront 127 | ses 128 | soi 129 | soient 130 | sois 131 | 
soit 132 | sommes 133 | son 134 | sont 135 | soyez 136 | soyons 137 | suis 138 | sur 139 | t 140 | ta 141 | te 142 | tes 143 | toi 144 | ton 145 | tu 146 | un 147 | une 148 | vos 149 | votre 150 | vous 151 | y 152 | à 153 | étaient 154 | étais 155 | était 156 | étant 157 | étiez 158 | étions 159 | été 160 | étée 161 | étées 162 | étés 163 | êtes 164 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-hu.txt: -------------------------------------------------------------------------------- 1 | a 2 | á 3 | ahogy 4 | ahol 5 | aki 6 | akik 7 | akkor 8 | alatt 9 | által 10 | általában 11 | amely 12 | amelyek 13 | amelyekben 14 | amelyeket 15 | amelyet 16 | amelynek 17 | ami 18 | amit 19 | amolyan 20 | amp 21 | amíg 22 | amikor 23 | át 24 | abban 25 | ahhoz 26 | annak 27 | arra 28 | arról 29 | az 30 | azok 31 | azon 32 | azt 33 | azzal 34 | azért 35 | aztán 36 | azután 37 | azonban 38 | b 39 | bár 40 | be 41 | belül 42 | benne 43 | c 44 | cikk 45 | cikkek 46 | cikkeket 47 | csak 48 | d 49 | de 50 | e 51 | é 52 | eddig 53 | egész 54 | egy 55 | egyes 56 | egyetlen 57 | egyéb 58 | egyik 59 | egyre 60 | ekkor 61 | el 62 | elég 63 | ellen 64 | elő 65 | először 66 | előtt 67 | első 68 | én 69 | éppen 70 | ebben 71 | ehhez 72 | emilyen 73 | ennek 74 | erre 75 | ez 76 | ezt 77 | ezek 78 | ezen 79 | ezzel 80 | ezért 81 | és 82 | f 83 | fel 84 | felé 85 | g 86 | h 87 | hanem 88 | hiszen 89 | hogy 90 | hogyan 91 | i 92 | í 93 | igen 94 | így 95 | illetve 96 | ill. 
97 | ill 98 | ilyen 99 | ilyenkor 100 | is 101 | ison 102 | ismét 103 | itt 104 | j 105 | jó 106 | jól 107 | jobban 108 | k 109 | kell 110 | kellett 111 | keresztül 112 | keressünk 113 | ki 114 | kívül 115 | között 116 | közül 117 | l 118 | legalább 119 | lehet 120 | lehetett 121 | legyen 122 | lenne 123 | lenni 124 | lesz 125 | lett 126 | m 127 | maga 128 | magát 129 | majd 130 | majd 131 | már 132 | más 133 | másik 134 | meg 135 | még 136 | mellett 137 | mert 138 | mely 139 | melyek 140 | mi 141 | mit 142 | míg 143 | miért 144 | milyen 145 | mikor 146 | minden 147 | mindent 148 | mindenki 149 | mindig 150 | mint 151 | mintha 152 | mivel 153 | most 154 | n 155 | nagy 156 | nagyobb 157 | nagyon 158 | ne 159 | néha 160 | nekem 161 | neki 162 | nem 163 | néhány 164 | nélkül 165 | nincs 166 | o 167 | ó 168 | olyan 169 | ott 170 | össze 171 | ö 172 | ő 173 | ők 174 | őket 175 | p 176 | pedig 177 | persze 178 | q 179 | r 180 | rá 181 | s 182 | saját 183 | sem 184 | semmi 185 | sok 186 | sokat 187 | sokkal 188 | sz 189 | számára 190 | szemben 191 | szerint 192 | szinte 193 | t 194 | talán 195 | tehát 196 | teljes 197 | tovább 198 | továbbá 199 | több 200 | u 201 | ú 202 | úgy 203 | ugyanis 204 | új 205 | újabb 206 | újra 207 | után 208 | utána 209 | utolsó 210 | ü 211 | ű 212 | v 213 | vagy 214 | vagyis 215 | valaki 216 | valamely 217 | valami 218 | valamint 219 | való 220 | vagyok 221 | van 222 | vannak 223 | volt 224 | voltam 225 | voltak 226 | voltunk 227 | vissza 228 | vele 229 | viszont 230 | volna 231 | számolnak 232 | szólnak 233 | szól 234 | w 235 | x 236 | y 237 | z 238 | zs 239 | a 240 | ahogy 241 | ahol 242 | aki 243 | akkor 244 | alatt 245 | általában 246 | által 247 | amely 248 | amíg 249 | amikor 250 | ami 251 | amolyan 252 | arra 253 | át 254 | az 255 | azért 256 | azonban 257 | azon 258 | aztán 259 | azt 260 | azután 261 | azzal 262 | bár 263 | be 264 | belül 265 | benne 266 | cikk 267 | csak 268 | de 269 | eddig 270 | egész 271 | egy 272 | egyéb 273 | 
egyes 274 | egyetlen 275 | egyik 276 | egyre 277 | ekkor 278 | el 279 | elég 280 | ellen 281 | elő 282 | először 283 | előtt 284 | első 285 | emilyen 286 | én 287 | éppen 288 | erre 289 | és 290 | e 291 | ez 292 | ezen 293 | ezért 294 | ezzel 295 | fel 296 | felé 297 | hanem 298 | hiszen 299 | hogy 300 | hogyan 301 | igen 302 | így 303 | ill. 304 | illetve 305 | ill 306 | ilyen 307 | ilyenkor 308 | ismét 309 | ison 310 | itt 311 | jó 312 | jobban 313 | jól 314 | kell 315 | keres 316 | keresztül 317 | ki 318 | kívül 319 | között 320 | közül 321 | legalább 322 | legyen 323 | lehet 324 | lenni 325 | lett 326 | maga 327 | maga 328 | majd 329 | már 330 | más 331 | másik 332 | még 333 | meg 334 | mellett 335 | mely 336 | mert 337 | miért 338 | míg 339 | mikor 340 | milyen 341 | minden 342 | mindenki 343 | mindig 344 | mi 345 | mint 346 | mintha 347 | mivel 348 | most 349 | nagy 350 | nagyobb 351 | nagyon 352 | ne 353 | néha 354 | néhány 355 | neki 356 | nélkül 357 | nem 358 | nincs 359 | ők 360 | olyan 361 | ő 362 | össze 363 | ott 364 | pedig 365 | persze 366 | rá 367 | saját 368 | s 369 | sem 370 | semmi 371 | sokkal 372 | sok 373 | számára 374 | számol 375 | szemben 376 | szerint 377 | szinte 378 | szól 379 | talán 380 | tehát 381 | teljes 382 | továbbá 383 | tovább 384 | úgy 385 | ugyanis 386 | új 387 | újabb 388 | újra 389 | utána 390 | után 391 | utolsó 392 | vagy 393 | vagyis 394 | valaki 395 | valamely 396 | valami 397 | valamint 398 | való 399 | van 400 | vissza 401 | viszont 402 | volt 403 | 404 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-id.txt: -------------------------------------------------------------------------------- 1 | a 2 | abad 3 | acara 4 | aceh 5 | ada 6 | adalah 7 | adanya 8 | adapun 9 | agak 10 | agaknya 11 | agama 12 | agar 13 | agustus 14 | air 15 | akan 16 | akankah 17 | akhir 18 | akhiri 19 | akhirnya 20 | akibat 21 | aku 22 | akulah 23 | alam 24 | 
album 25 | amat 26 | amatlah 27 | amerika 28 | anak 29 | and 30 | anda 31 | andalah 32 | anggota 33 | antar 34 | antara 35 | antarabangsa 36 | antaranya 37 | apa 38 | apaan 39 | apabila 40 | apakah 41 | apalagi 42 | apatah 43 | api 44 | april 45 | artikel 46 | artinya 47 | as 48 | asal 49 | asalkan 50 | asas 51 | asia 52 | asing 53 | atas 54 | atau 55 | ataukah 56 | ataupun 57 | australia 58 | awal 59 | awalnya 60 | awam 61 | b 62 | badan 63 | bagai 64 | bagaikan 65 | bagaimana 66 | bagaimanakah 67 | bagaimanapun 68 | bagainamakah 69 | bagi 70 | bagian 71 | bahagian 72 | bahan 73 | baharu 74 | bahasa 75 | bahawa 76 | bahkan 77 | bahwa 78 | bahwasannya 79 | bahwasanya 80 | baik 81 | baiknya 82 | bakal 83 | bakalan 84 | balik 85 | bandar 86 | bangsa 87 | bank 88 | banyak 89 | bapak 90 | barang 91 | barangan 92 | barat 93 | baru 94 | baru-baru 95 | bawah 96 | beberapa 97 | begini 98 | beginian 99 | beginikah 100 | beginilah 101 | begitu 102 | begitukah 103 | begitulah 104 | begitupun 105 | bekas 106 | bekerja 107 | belakang 108 | belakangan 109 | belanda 110 | beli 111 | beliau 112 | belum 113 | belumlah 114 | benar 115 | benarkah 116 | benarlah 117 | bentuk 118 | berada 119 | berakhir 120 | berakhirlah 121 | berakhirnya 122 | berapa 123 | berapakah 124 | berapalah 125 | berapapun 126 | berarti 127 | berasal 128 | berat 129 | berawal 130 | berbagai 131 | berbanding 132 | berbeda 133 | berdasarkan 134 | berdatangan 135 | berharap 136 | berhasil 137 | beri 138 | berikan 139 | berikut 140 | berikutan 141 | berikutnya 142 | berita 143 | berjalan 144 | berjaya 145 | berjumlah 146 | berkaitan 147 | berkali 148 | berkali-kali 149 | berkata 150 | berkehendak 151 | berkeinginan 152 | berkenaan 153 | berlainan 154 | berlaku 155 | berlalu 156 | berlangsung 157 | berlebihan 158 | bermacam 159 | bermacam-macam 160 | bermain 161 | bermaksud 162 | bermula 163 | bernama 164 | bernilai 165 | bersama 166 | bersama-sama 167 | bersiap 168 | bertanya 169 | bertemu 170 | berturut 171 | 
bertutur 172 | berubah 173 | berujar 174 | berupa 175 | besar 176 | besok 177 | betul 178 | betulkah 179 | bhd 180 | biasa 181 | biasanya 182 | bidang 183 | bila 184 | bilakah 185 | bilion 186 | bintang 187 | bisa 188 | bisakah 189 | blog 190 | bn 191 | bola 192 | boleh 193 | bolehkah 194 | bolehlah 195 | buat 196 | bukan 197 | bukankah 198 | bukanlah 199 | bukannya 200 | buku 201 | bulan 202 | bumi 203 | bung 204 | bursa 205 | cadangan 206 | cara 207 | caranya 208 | catch 209 | china 210 | click 211 | code 212 | copyright 213 | cukup 214 | cukupkah 215 | cukuplah 216 | cuma 217 | daerah 218 | dagangan 219 | dahulu 220 | dalam 221 | dan 222 | dana 223 | dapat 224 | dari 225 | daripada 226 | dasar 227 | data 228 | datang 229 | datuk 230 | dekat 231 | demi 232 | demikian 233 | demikianlah 234 | dengan 235 | depan 236 | derivatives 237 | desa 238 | desember 239 | detik 240 | dewan 241 | di 242 | dia 243 | diadakan 244 | diakhiri 245 | diakhirinya 246 | dialah 247 | dianggap 248 | diantara 249 | diantaranya 250 | diberi 251 | diberikan 252 | diberikannya 253 | dibuat 254 | dibuatnya 255 | dibuka 256 | dicatatkan 257 | didapat 258 | didatangkan 259 | didirikan 260 | diduga 261 | digunakan 262 | diibaratkan 263 | diibaratkannya 264 | diingat 265 | diingatkan 266 | diinginkan 267 | dijangka 268 | dijawab 269 | dijelaskan 270 | dijelaskannya 271 | dikarenakan 272 | dikatakan 273 | dikatakannya 274 | dikenal 275 | dikerjakan 276 | diketahui 277 | diketahuinya 278 | dikira 279 | dilakukan 280 | dilalui 281 | dilihat 282 | dimaksud 283 | dimaksudkan 284 | dimaksudkannya 285 | dimaksudnya 286 | dimana 287 | diminta 288 | dimintai 289 | dimisalkan 290 | dimulai 291 | dimulailah 292 | dimulainya 293 | dimungkinkan 294 | dini 295 | diniagakan 296 | dipastikan 297 | diperbuat 298 | diperbuatnya 299 | dipergunakan 300 | diperkirakan 301 | diperlihatkan 302 | diperlukan 303 | diperlukannya 304 | dipersoalkan 305 | dipertanyakan 306 | dipunyai 307 | diri 308 | dirilis 309 | dirinya 
310 | dis 311 | disampaikan 312 | disebut 313 | disebutkan 314 | disebutkannya 315 | disember 316 | disini 317 | disinilah 318 | distrik 319 | ditambahkan 320 | ditandaskan 321 | ditanya 322 | ditanyai 323 | ditanyakan 324 | ditegaskan 325 | ditemukan 326 | ditujukan 327 | ditunjuk 328 | ditunjuki 329 | ditunjukkan 330 | ditunjukkannya 331 | ditunjuknya 332 | ditutup 333 | dituturkan 334 | dituturkannya 335 | diucapkan 336 | diucapkannya 337 | diungkapkan 338 | document.write 339 | dolar 340 | dong 341 | dr 342 | dua 343 | dulu 344 | dunia 345 | effective 346 | ekonomi 347 | eksekutif 348 | eksport 349 | empat 350 | enam 351 | enggak 352 | enggaknya 353 | entah 354 | entahlah 355 | era 356 | eropa 357 | err 358 | faedah 359 | feb 360 | film 361 | gat 362 | gedung 363 | gelar 364 | gettracker 365 | global 366 | grup 367 | guna 368 | gunakan 369 | gunung 370 | hadap 371 | hadapan 372 | hal 373 | hampir 374 | hanya 375 | hanyalah 376 | harga 377 | hari 378 | harian 379 | harus 380 | haruslah 381 | harusnya 382 | hasil 383 | hendak 384 | hendaklah 385 | hendaknya 386 | hidup 387 | hingga 388 | https 389 | hubungan 390 | hukum 391 | hutan 392 | i 393 | ia 394 | iaitu 395 | ialah 396 | ibarat 397 | ibaratkan 398 | ibaratnya 399 | ibu 400 | ii 401 | iklan 402 | ikut 403 | ilmu 404 | indeks 405 | india 406 | indonesia 407 | industri 408 | informasi 409 | ingat 410 | inggris 411 | ingin 412 | inginkah 413 | inginkan 414 | ini 415 | inikah 416 | inilah 417 | internasional 418 | islam 419 | isnin 420 | isu 421 | italia 422 | itu 423 | itukah 424 | itulah 425 | jabatan 426 | jadi 427 | jadilah 428 | jadinya 429 | jakarta 430 | jalan 431 | jalur 432 | jaman 433 | jan 434 | jangan 435 | jangankan 436 | janganlah 437 | januari 438 | jauh 439 | jawa 440 | jawab 441 | jawaban 442 | jawabnya 443 | jawatan 444 | jawatankuasa 445 | jelas 446 | jelaskan 447 | jelaslah 448 | jelasnya 449 | jenis 450 | jepang 451 | jepun 452 | jerman 453 | jika 454 | jikalau 455 | jiwa 456 | jual 457 | 
jualan 458 | juga 459 | julai 460 | jumaat 461 | jumat 462 | jumlah 463 | jumlahnya 464 | jun 465 | juni 466 | justru 467 | juta 468 | kabar 469 | kabupaten 470 | kadar 471 | kala 472 | kalangan 473 | kalau 474 | kalaulah 475 | kalaupun 476 | kali 477 | kalian 478 | kalimantan 479 | kami 480 | kamilah 481 | kamis 482 | kamu 483 | kamulah 484 | kan 485 | kantor 486 | kapal 487 | kapan 488 | kapankah 489 | kapanpun 490 | karena 491 | karenanya 492 | karya 493 | kasus 494 | kata 495 | katakan 496 | katakanlah 497 | katanya 498 | kaunter 499 | kawasan 500 | ke 501 | keadaan 502 | kebetulan 503 | kebutuhan 504 | kecamatan 505 | kecil 506 | kedua 507 | kedua-dua 508 | keduanya 509 | kedudukan 510 | kegiatan 511 | kehidupan 512 | keinginan 513 | kejadian 514 | kekal 515 | kelamaan 516 | kelihatan 517 | kelihatannya 518 | kelima 519 | kelompok 520 | keluar 521 | keluarga 522 | kelurahan 523 | kembali 524 | kementerian 525 | kemudahan 526 | kemudian 527 | kemungkinan 528 | kemungkinannya 529 | kenaikan 530 | kenapa 531 | kenyataan 532 | kepada 533 | kepadanya 534 | kepala 535 | kepentingan 536 | keputusan 537 | kerajaan 538 | kerana 539 | kereta 540 | kerja 541 | kerjasama 542 | kes 543 | kesampaian 544 | keselamatan 545 | keseluruhan 546 | keseluruhannya 547 | kesempatan 548 | kesihatan 549 | keterangan 550 | keterlaluan 551 | ketiga 552 | ketika 553 | ketua 554 | keuntungan 555 | kewangan 556 | khamis 557 | khusus 558 | khususnya 559 | kini 560 | kinilah 561 | kira 562 | kira-kira 563 | kiranya 564 | kita 565 | kitalah 566 | klci 567 | klibor 568 | klik 569 | km 570 | kok 571 | komentar 572 | kompas 573 | komposit 574 | kondisi 575 | kontrak 576 | korban 577 | korea 578 | kos 579 | kota 580 | kuala 581 | kuasa 582 | kukuh 583 | kumpulan 584 | kurang 585 | kurangnya 586 | lagi 587 | lagian 588 | lagu 589 | lah 590 | lain 591 | lainnya 592 | laku 593 | lalu 594 | lama 595 | lamanya 596 | langkah 597 | langsung 598 | lanjut 599 | lanjutnya 600 | laporan 601 | laut 602 | 
lebih 603 | lembaga 604 | lepas 605 | lewat 606 | lima 607 | lingkungan 608 | login 609 | lokasi 610 | lot 611 | luar 612 | luas 613 | lumpur 614 | mac 615 | macam 616 | mahkamah 617 | mahu 618 | majlis 619 | maka 620 | makanan 621 | makanya 622 | makin 623 | maklumat 624 | malah 625 | malahan 626 | malam 627 | malaysia 628 | mampu 629 | mampukah 630 | mana 631 | manakala 632 | manalagi 633 | mantan 634 | manusia 635 | masa 636 | masalah 637 | masalahnya 638 | masih 639 | masihkah 640 | masing 641 | masing-masing 642 | masuk 643 | masyarakat 644 | mata 645 | mau 646 | maupun 647 | measure 648 | media 649 | mei 650 | melainkan 651 | melakukan 652 | melalui 653 | melawan 654 | melihat 655 | melihatnya 656 | memandangkan 657 | memang 658 | memastikan 659 | membantu 660 | membawa 661 | memberi 662 | memberikan 663 | membolehkan 664 | membuat 665 | memerlukan 666 | memihak 667 | memiliki 668 | meminta 669 | memintakan 670 | memisalkan 671 | memperbuat 672 | mempergunakan 673 | memperkirakan 674 | memperlihatkan 675 | mempersiapkan 676 | mempersoalkan 677 | mempertanyakan 678 | mempunyai 679 | memulai 680 | memungkinkan 681 | menaiki 682 | menambah 683 | menambahkan 684 | menandaskan 685 | menanti 686 | menantikan 687 | menanya 688 | menanyai 689 | menanyakan 690 | menarik 691 | menawarkan 692 | mencapai 693 | mencari 694 | mencatatkan 695 | mendapat 696 | mendapatkan 697 | mendatang 698 | mendatangi 699 | mendatangkan 700 | menegaskan 701 | menerima 702 | menerusi 703 | mengadakan 704 | mengakhiri 705 | mengaku 706 | mengalami 707 | mengambil 708 | mengapa 709 | mengatakan 710 | mengatakannya 711 | mengenai 712 | mengerjakan 713 | mengetahui 714 | menggalakkan 715 | menggunakan 716 | menghadapi 717 | menghendaki 718 | mengibaratkan 719 | mengibaratkannya 720 | mengikut 721 | mengingat 722 | mengingatkan 723 | menginginkan 724 | mengira 725 | mengucapkan 726 | mengucapkannya 727 | mengumumkan 728 | mengungkapkan 729 | mengurangkan 730 | meninggal 731 | meningkat 732 | 
meningkatkan 733 | menjadi 734 | menjalani 735 | menjawab 736 | menjelang 737 | menjelaskan 738 | menokok 739 | menteri 740 | menuju 741 | menunjuk 742 | menunjuki 743 | menunjukkan 744 | menunjuknya 745 | menurut 746 | menuturkan 747 | menyaksikan 748 | menyampaikan 749 | menyangkut 750 | menyatakan 751 | menyebabkan 752 | menyebutkan 753 | menyediakan 754 | menyeluruh 755 | menyiapkan 756 | merasa 757 | mereka 758 | merekalah 759 | merosot 760 | merupakan 761 | meski 762 | meskipun 763 | mesyuarat 764 | metrotv 765 | meyakini 766 | meyakinkan 767 | milik 768 | militer 769 | minat 770 | minggu 771 | minta 772 | minyak 773 | mirip 774 | misal 775 | misalkan 776 | misalnya 777 | mobil 778 | modal 779 | mohd 780 | mudah 781 | mula 782 | mulai 783 | mulailah 784 | mulanya 785 | muncul 786 | mungkin 787 | mungkinkah 788 | musik 789 | musim 790 | nah 791 | naik 792 | nama 793 | namun 794 | nanti 795 | nantinya 796 | nasional 797 | negara 798 | negara-negara 799 | negeri 800 | new 801 | niaga 802 | nilai 803 | nomor 804 | noun 805 | nov 806 | november 807 | numeral 808 | numeralia 809 | nya 810 | nyaris 811 | nyatanya 812 | of 813 | ogos 814 | okt 815 | oktober 816 | olah 817 | oleh 818 | olehnya 819 | operasi 820 | orang 821 | organisasi 822 | pada 823 | padahal 824 | padanya 825 | pagetracker 826 | pagi 827 | pak 828 | paling 829 | pameran 830 | panjang 831 | pantas 832 | papan 833 | para 834 | paras 835 | parlimen 836 | partai 837 | parti 838 | particle 839 | pasar 840 | pasaran 841 | password 842 | pasti 843 | pastilah 844 | pasukan 845 | paticle 846 | pegawai 847 | pejabat 848 | pekan 849 | pekerja 850 | pelabur 851 | pelaburan 852 | pelancongan 853 | pelanggan 854 | pelbagai 855 | peluang 856 | pemain 857 | pembangunan 858 | pemberita 859 | pembinaan 860 | pemerintah 861 | pemerintahan 862 | pemimpin 863 | pendapatan 864 | pendidikan 865 | penduduk 866 | penerbangan 867 | pengarah 868 | pengeluaran 869 | pengerusi 870 | pengguna 871 | penggunaan 872 | pengurusan 
873 | peniaga 874 | peningkatan 875 | penting 876 | pentingnya 877 | per 878 | perancis 879 | perang 880 | peratus 881 | percuma 882 | perdagangan 883 | perdana 884 | peringkat 885 | perjanjian 886 | perkara 887 | perkhidmatan 888 | perladangan 889 | perlu 890 | perlukah 891 | perlunya 892 | permintaan 893 | pernah 894 | perniagaan 895 | persekutuan 896 | persen 897 | persidangan 898 | persoalan 899 | pertama 900 | pertandingan 901 | pertanyaan 902 | pertanyakan 903 | pertubuhan 904 | pertumbuhan 905 | perubahan 906 | perusahaan 907 | pesawat 908 | peserta 909 | petang 910 | pihak 911 | pihaknya 912 | pilihan 913 | pinjaman 914 | polis 915 | polisi 916 | politik 917 | pos 918 | posisi 919 | presiden 920 | prestasi 921 | produk 922 | program 923 | projek 924 | pronomia 925 | pronoun 926 | proses 927 | proton 928 | provinsi 929 | pt 930 | pubdate 931 | pukul 932 | pula 933 | pulau 934 | pun 935 | punya 936 | pusat 937 | rabu 938 | radio 939 | raja 940 | rakan 941 | rakyat 942 | ramai 943 | rantau 944 | rasa 945 | rasanya 946 | rata 947 | raya 948 | rendah 949 | republik 950 | resmi 951 | ribu 952 | ringgit 953 | root 954 | ruang 955 | rumah 956 | rupa 957 | rupanya 958 | saat 959 | saatnya 960 | sabah 961 | sabtu 962 | sahaja 963 | saham 964 | saja 965 | sajalah 966 | sakit 967 | salah 968 | saling 969 | sama 970 | sama-sama 971 | sambil 972 | sampai 973 | sampaikan 974 | sana 975 | sangat 976 | sangatlah 977 | sarawak 978 | satu 979 | sawit 980 | saya 981 | sayalah 982 | sdn 983 | se 984 | sebab 985 | sebabnya 986 | sebagai 987 | sebagaimana 988 | sebagainya 989 | sebagian 990 | sebahagian 991 | sebaik 992 | sebaiknya 993 | sebaliknya 994 | sebanyak 995 | sebarang 996 | sebegini 997 | sebegitu 998 | sebelah 999 | sebelum 1000 | sebelumnya 1001 | sebenarnya 1002 | seberapa 1003 | sebesar 1004 | sebetulnya 1005 | sebisanya 1006 | sebuah 1007 | sebut 1008 | sebutlah 1009 | sebutnya 1010 | secara 1011 | secukupnya 1012 | sedang 1013 | sedangkan 1014 | sedemikian 1015 | 
sedikit 1016 | sedikitnya 1017 | seenaknya 1018 | segala 1019 | segalanya 1020 | segera 1021 | segi 1022 | seharusnya 1023 | sehingga 1024 | seingat 1025 | sejak 1026 | sejarah 1027 | sejauh 1028 | sejenak 1029 | sejumlah 1030 | sekadar 1031 | sekadarnya 1032 | sekali 1033 | sekali-kali 1034 | sekalian 1035 | sekaligus 1036 | sekalipun 1037 | sekarang 1038 | sekaranglah 1039 | sekecil 1040 | seketika 1041 | sekiranya 1042 | sekitar 1043 | sekitarnya 1044 | sekolah 1045 | sektor 1046 | sekurang 1047 | sekurangnya 1048 | sekuriti 1049 | sela 1050 | selagi 1051 | selain 1052 | selaku 1053 | selalu 1054 | selama 1055 | selama-lamanya 1056 | selamanya 1057 | selanjutnya 1058 | selasa 1059 | selatan 1060 | selepas 1061 | seluruh 1062 | seluruhnya 1063 | semacam 1064 | semakin 1065 | semalam 1066 | semampu 1067 | semampunya 1068 | semasa 1069 | semasih 1070 | semata 1071 | semaunya 1072 | sementara 1073 | semisal 1074 | semisalnya 1075 | sempat 1076 | semua 1077 | semuanya 1078 | semula 1079 | sen 1080 | sendiri 1081 | sendirian 1082 | sendirinya 1083 | senin 1084 | seolah 1085 | seolah-olah 1086 | seorang 1087 | sepak 1088 | sepanjang 1089 | sepantasnya 1090 | sepantasnyalah 1091 | seperlunya 1092 | seperti 1093 | sepertinya 1094 | sepihak 1095 | sept 1096 | september 1097 | serangan 1098 | serantau 1099 | seri 1100 | serikat 1101 | sering 1102 | seringnya 1103 | serta 1104 | serupa 1105 | sesaat 1106 | sesama 1107 | sesampai 1108 | sesegera 1109 | sesekali 1110 | seseorang 1111 | sesi 1112 | sesuai 1113 | sesuatu 1114 | sesuatunya 1115 | sesudah 1116 | sesudahnya 1117 | setelah 1118 | setempat 1119 | setengah 1120 | seterusnya 1121 | setiap 1122 | setiausaha 1123 | setiba 1124 | setibanya 1125 | setidak 1126 | setidaknya 1127 | setinggi 1128 | seusai 1129 | sewaktu 1130 | siap 1131 | siapa 1132 | siapakah 1133 | siapapun 1134 | siaran 1135 | sidang 1136 | singapura 1137 | sini 1138 | sinilah 1139 | sistem 1140 | soal 1141 | soalnya 1142 | sokongan 1143 | sri 1144 | 
stasiun 1145 | suara 1146 | suatu 1147 | sudah 1148 | sudahkah 1149 | sudahlah 1150 | sukan 1151 | suku 1152 | sumber 1153 | sungai 1154 | supaya 1155 | surat 1156 | susut 1157 | syarikat 1158 | syed 1159 | tadi 1160 | tadinya 1161 | tahap 1162 | tahu 1163 | tahun 1164 | tak 1165 | tama 1166 | tambah 1167 | tambahnya 1168 | tampak 1169 | tampaknya 1170 | tampil 1171 | tan 1172 | tanah 1173 | tandas 1174 | tandasnya 1175 | tanggal 1176 | tanpa 1177 | tanya 1178 | tanyakan 1179 | tanyanya 1180 | tapi 1181 | tawaran 1182 | tegas 1183 | tegasnya 1184 | teknologi 1185 | telah 1186 | televisi 1187 | teman 1188 | tempat 1189 | tempatan 1190 | tempo 1191 | tempoh 1192 | tenaga 1193 | tengah 1194 | tentang 1195 | tentara 1196 | tentu 1197 | tentulah 1198 | tentunya 1199 | tepat 1200 | terakhir 1201 | terasa 1202 | terbaik 1203 | terbang 1204 | terbanyak 1205 | terbesar 1206 | terbuka 1207 | terdahulu 1208 | terdapat 1209 | terdiri 1210 | terhadap 1211 | terhadapnya 1212 | teringat 1213 | terjadi 1214 | terjadilah 1215 | terjadinya 1216 | terkait 1217 | terkenal 1218 | terkira 1219 | terlalu 1220 | terlebih 1221 | terletak 1222 | terlihat 1223 | termasuk 1224 | ternyata 1225 | tersampaikan 1226 | tersebut 1227 | tersebutlah 1228 | tertentu 1229 | tertuju 1230 | terus 1231 | terutama 1232 | testimoni 1233 | testimony 1234 | tetap 1235 | tetapi 1236 | the 1237 | tiada 1238 | tiap 1239 | tiba 1240 | tidak 1241 | tidakkah 1242 | tidaklah 1243 | tidaknya 1244 | tiga 1245 | tim 1246 | timbalan 1247 | timur 1248 | tindakan 1249 | tinggal 1250 | tinggi 1251 | tingkat 1252 | toh 1253 | tokoh 1254 | try 1255 | tun 1256 | tunai 1257 | tunjuk 1258 | turun 1259 | turut 1260 | tutur 1261 | tuturnya 1262 | tv 1263 | uang 1264 | ucap 1265 | ucapnya 1266 | udara 1267 | ujar 1268 | ujarnya 1269 | umum 1270 | umumnya 1271 | unescape 1272 | ungkap 1273 | ungkapnya 1274 | unit 1275 | universitas 1276 | untuk 1277 | untung 1278 | upaya 1279 | urus 1280 | usah 1281 | usaha 1282 | usai 1283 | user 
1284 | utama 1285 | utara 1286 | var 1287 | versi 1288 | waduh 1289 | wah 1290 | wahai 1291 | wakil 1292 | waktu 1293 | waktunya 1294 | walau 1295 | walaupun 1296 | wang 1297 | wanita 1298 | warga 1299 | warta 1300 | wib 1301 | wilayah 1302 | wong 1303 | word 1304 | ya 1305 | yaitu 1306 | yakin 1307 | yakni 1308 | yang 1309 | zaman -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-it.txt: -------------------------------------------------------------------------------- 1 | a 2 | a 3 | abbia 4 | abbiamo 5 | abbiano 6 | abbiate 7 | ad 8 | agl 9 | agl 10 | agli 11 | agli 12 | ai 13 | ai 14 | al 15 | al 16 | all 17 | all 18 | alla 19 | alla 20 | alle 21 | alle 22 | allo 23 | allo 24 | anche 25 | anche 26 | avemmo 27 | avendo 28 | avesse 29 | avessero 30 | avessi 31 | avessimo 32 | aveste 33 | avesti 34 | avete 35 | aveva 36 | avevamo 37 | avevano 38 | avevate 39 | avevi 40 | avevo 41 | avra 42 | avrai 43 | avranno 44 | avrebbe 45 | avrebbero 46 | avrei 47 | avremmo 48 | avremo 49 | avreste 50 | avresti 51 | avrete 52 | avro 53 | avrà 54 | avrò 55 | avuta 56 | avute 57 | avuti 58 | avuto 59 | c 60 | c 61 | che 62 | che 63 | chi 64 | chi 65 | ci 66 | ci 67 | coi 68 | coi 69 | col 70 | col 71 | come 72 | come 73 | con 74 | con 75 | contro 76 | contro 77 | cui 78 | cui 79 | da 80 | da 81 | dagl 82 | dagl 83 | dagli 84 | dagli 85 | dai 86 | dai 87 | dal 88 | dal 89 | dall 90 | dall 91 | dalla 92 | dalla 93 | dalle 94 | dalle 95 | dallo 96 | dallo 97 | degl 98 | degl 99 | degli 100 | degli 101 | dei 102 | dei 103 | del 104 | del 105 | dell 106 | dell 107 | della 108 | della 109 | delle 110 | delle 111 | dello 112 | dello 113 | di 114 | di 115 | dov 116 | dov 117 | dove 118 | dove 119 | e 120 | e 121 | ebbe 122 | ebbero 123 | ebbi 124 | ed 125 | ed 126 | era 127 | erano 128 | eravamo 129 | eravate 130 | eri 131 | ero 132 | essendo 133 | faccia 134 | facciamo 135 | facciano 136 | facciate 137 
| faccio 138 | facemmo 139 | facendo 140 | facesse 141 | facessero 142 | facessi 143 | facessimo 144 | faceste 145 | facesti 146 | faceva 147 | facevamo 148 | facevano 149 | facevate 150 | facevi 151 | facevo 152 | fai 153 | fanno 154 | farai 155 | faranno 156 | farebbe 157 | farebbero 158 | farei 159 | faremmo 160 | faremo 161 | fareste 162 | faresti 163 | farete 164 | farà 165 | farò 166 | fece 167 | fecero 168 | feci 169 | fosse 170 | fossero 171 | fossi 172 | fossimo 173 | foste 174 | fosti 175 | fu 176 | fui 177 | fummo 178 | furono 179 | gli 180 | gli 181 | ha 182 | hai 183 | hanno 184 | ho 185 | i 186 | i 187 | il 188 | il 189 | in 190 | in 191 | io 192 | io 193 | l 194 | l 195 | la 196 | la 197 | le 198 | le 199 | lei 200 | lei 201 | li 202 | li 203 | lo 204 | lo 205 | loro 206 | loro 207 | lui 208 | lui 209 | ma 210 | ma 211 | mi 212 | mi 213 | mia 214 | mia 215 | mie 216 | mie 217 | miei 218 | miei 219 | mio 220 | mio 221 | ne 222 | ne 223 | negl 224 | negl 225 | negli 226 | negli 227 | nei 228 | nei 229 | nel 230 | nel 231 | nell 232 | nell 233 | nella 234 | nella 235 | nelle 236 | nelle 237 | nello 238 | nello 239 | noi 240 | noi 241 | non 242 | non 243 | nostra 244 | nostra 245 | nostre 246 | nostre 247 | nostri 248 | nostri 249 | nostro 250 | nostro 251 | o 252 | o 253 | per 254 | per 255 | perche 256 | perchè 257 | perché 258 | piu 259 | più 260 | più 261 | quale 262 | quale 263 | quanta 264 | quanta 265 | quante 266 | quante 267 | quanti 268 | quanti 269 | quanto 270 | quanto 271 | quella 272 | quella 273 | quelle 274 | quelle 275 | quelli 276 | quelli 277 | quello 278 | quello 279 | questa 280 | questa 281 | queste 282 | queste 283 | questi 284 | questi 285 | questo 286 | questo 287 | sarai 288 | saranno 289 | sarebbe 290 | sarebbero 291 | sarei 292 | saremmo 293 | saremo 294 | sareste 295 | saresti 296 | sarete 297 | saro 298 | sarà 299 | sarò 300 | se 301 | se 302 | sei 303 | si 304 | si 305 | sia 306 | siamo 307 | siano 308 | siate 309 | siete 
310 | sono 311 | sta 312 | stai 313 | stando 314 | stanno 315 | starai 316 | staranno 317 | starebbe 318 | starebbero 319 | starei 320 | staremmo 321 | staremo 322 | stareste 323 | staresti 324 | starete 325 | starà 326 | starò 327 | stava 328 | stavamo 329 | stavano 330 | stavate 331 | stavi 332 | stavo 333 | stemmo 334 | stesse 335 | stessero 336 | stessi 337 | stessimo 338 | steste 339 | stesti 340 | stette 341 | stettero 342 | stetti 343 | stia 344 | stiamo 345 | stiano 346 | stiate 347 | sto 348 | su 349 | su 350 | sua 351 | sua 352 | sue 353 | sue 354 | sugl 355 | sugl 356 | sugli 357 | sugli 358 | sui 359 | sui 360 | sul 361 | sul 362 | sull 363 | sull 364 | sulla 365 | sulla 366 | sulle 367 | sulle 368 | sullo 369 | sullo 370 | suo 371 | suo 372 | suoi 373 | suoi 374 | ti 375 | ti 376 | tra 377 | tra 378 | tu 379 | tu 380 | tua 381 | tua 382 | tue 383 | tue 384 | tuo 385 | tuo 386 | tuoi 387 | tuoi 388 | tutti 389 | tutti 390 | tutto 391 | tutto 392 | un 393 | un 394 | una 395 | una 396 | uno 397 | uno 398 | vi 399 | vi 400 | voi 401 | voi 402 | vostra 403 | vostra 404 | vostre 405 | vostre 406 | vostri 407 | vostri 408 | vostro 409 | vostro 410 | è 411 | é 412 | ad 413 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-ko.txt: -------------------------------------------------------------------------------- 1 | 가 2 | 같이 3 | 고 4 | 과 5 | 과는 6 | 과를 7 | 과의 8 | 까지 9 | 까지는 10 | 까지의 11 | 께 12 | 나 13 | 는 14 | 다 15 | 대로 16 | 도 17 | 든 18 | 라 19 | 라고 20 | 로 21 | 로는 22 | 로부터 23 | 로의 24 | 를 25 | 만 26 | 만에 27 | 만을 28 | 만의 29 | 만이 30 | 며 31 | 밖에 32 | 보다 33 | 보다는 34 | 부터 35 | 부터는 36 | 아 37 | 야 38 | 에 39 | 에게 40 | 에는 41 | 에도 42 | 에만 43 | 에서 44 | 에서는 45 | 에서도 46 | 에서의 47 | 엔 48 | 여 49 | 와 50 | 와의 51 | 요 52 | 으로 53 | 으로는 54 | 으로부터 55 | 으로써 56 | 으로의 57 | 은 58 | 을 59 | 의 60 | 이 61 | 이고 62 | 이나 63 | 이다 64 | 이라고 65 | 이라는 66 | 이며 67 | 처럼 68 | 치고 69 | 토록 70 | 하고 71 | 
-------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-nb.txt: -------------------------------------------------------------------------------- 1 | alle 2 | andre 3 | arbeid 4 | av 5 | begge 6 | bort 7 | bra 8 | bruke 9 | da 10 | denne 11 | der 12 | deres 13 | det 14 | din 15 | disse 16 | du 17 | eller 18 | en 19 | ene 20 | eneste 21 | enhver 22 | enn 23 | er 24 | et 25 | folk 26 | for 27 | fordi 28 | forsøke 29 | fra 30 | få 31 | før 32 | først 33 | gjorde 34 | gjøre 35 | god 36 | gå 37 | ha 38 | hadde 39 | han 40 | hans 41 | hennes 42 | her 43 | hva 44 | hvem 45 | hver 46 | hvilken 47 | hvis 48 | hvor 49 | hvordan 50 | hvorfor 51 | ikke 52 | inn 53 | innen 54 | kan 55 | kunne 56 | lage 57 | lang 58 | lik 59 | like 60 | makt 61 | mange 62 | med 63 | meg 64 | meget 65 | men 66 | mens 67 | mer 68 | mest 69 | min 70 | mye 71 | må 72 | måte 73 | navn 74 | nei 75 | ny 76 | nå 77 | når 78 | og 79 | også 80 | om 81 | opp 82 | oss 83 | over 84 | part 85 | punkt 86 | på 87 | rett 88 | riktig 89 | samme 90 | sant 91 | si 92 | siden 93 | sist 94 | skulle 95 | slik 96 | slutt 97 | som 98 | start 99 | stille 100 | tid 101 | til 102 | tilbake 103 | tilstand 104 | under 105 | ut 106 | uten 107 | var 108 | ved 109 | verdi 110 | vi 111 | vil 112 | ville 113 | vite 114 | vår 115 | være 116 | vært 117 | å 118 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-nl.txt: -------------------------------------------------------------------------------- 1 | aan 2 | af 3 | al 4 | als 5 | bij 6 | dan 7 | dat 8 | die 9 | dit 10 | een 11 | en 12 | er 13 | had 14 | heb 15 | hem 16 | het 17 | hij 18 | hoe 19 | hun 20 | ik 21 | in 22 | is 23 | je 24 | kan 25 | me 26 | men 27 | met 28 | mij 29 | nog 30 | nu 31 | of 32 | ons 33 | ook 34 | te 35 | tot 36 | uit 37 | van 38 | was 39 | wat 40 | we 41 | wel 42 | wij 43 | zal 44 | ze 45 | 
zei 46 | zij 47 | zo 48 | zou 49 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-no.txt: -------------------------------------------------------------------------------- 1 | at 2 | av 3 | de 4 | den 5 | der 6 | det 7 | du 8 | en 9 | er 10 | et 11 | for 12 | fra 13 | før 14 | med 15 | og 16 | om 17 | over 18 | på 19 | som 20 | til 21 | ved 22 | år 23 | alle 24 | bare 25 | ble 26 | bort 27 | bra 28 | da 29 | deg 30 | dem 31 | denne 32 | dere 33 | deres 34 | det 35 | dette 36 | din 37 | disse 38 | dit 39 | ditt 40 | eller 41 | ene 42 | enn 43 | er 44 | et 45 | ett 46 | etter 47 | for 48 | fram 49 | først 50 | få 51 | god 52 | gå 53 | ha 54 | han 55 | hans 56 | har 57 | her 58 | hit 59 | hun 60 | hva 61 | hvem 62 | hver 63 | ikke 64 | inn 65 | ja 66 | jeg 67 | kan 68 | kom 69 | kun 70 | kunne 71 | lage 72 | lang 73 | lik 74 | like 75 | man 76 | mer 77 | min 78 | mot 79 | mye 80 | må 81 | måte 82 | ned 83 | nei 84 | noe 85 | noen 86 | ny 87 | nå 88 | når 89 | også 90 | opp 91 | oss 92 | seg 93 | selv 94 | si 95 | siden 96 | sin 97 | sine 98 | sist 99 | skal 100 | skulle 101 | slik 102 | som 103 | så 104 | sånn 105 | tid 106 | til 107 | under 108 | ut 109 | uten 110 | var 111 | ved 112 | vi 113 | vil 114 | vite 115 | vår 116 | å 117 | dei 118 | di 119 | då 120 | eg -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-pl.txt: -------------------------------------------------------------------------------- 1 | a 2 | aby 3 | ach 4 | acz 5 | aczkolwiek 6 | aj 7 | albo 8 | ale 9 | alez 10 | ależ 11 | ani 12 | az 13 | aż 14 | bardziej 15 | bardzo 16 | beda 17 | bedzie 18 | bez 19 | bo 20 | bowiem 21 | by 22 | byc 23 | byl 24 | byla 25 | byli 26 | bylo 27 | byly 28 | bynajmniej 29 | być 30 | był 31 | była 32 | było 33 | były 34 | będzie 35 | będą 36 | cala 37 | cali 38 | caly 39 | cała 40 | cały 41 | ci 42 | 
cie 43 | ciebie 44 | cię 45 | co 46 | cokolwiek 47 | cos 48 | coś 49 | czasami 50 | czasem 51 | czemu 52 | czy 53 | czyli 54 | daleko 55 | dla 56 | dlaczego 57 | dlatego 58 | do 59 | dobrze 60 | dokad 61 | dokąd 62 | dosc 63 | dość 64 | duzo 65 | dużo 66 | dwa 67 | dwaj 68 | dwie 69 | dwoje 70 | dzis 71 | dzisiaj 72 | dziś 73 | gdy 74 | gdyby 75 | gdyz 76 | gdyż 77 | gdzie 78 | gdziekolwiek 79 | gdzies 80 | gdzieś 81 | go 82 | i 83 | ich 84 | ile 85 | im 86 | inna 87 | inne 88 | inny 89 | innych 90 | iz 91 | iż 92 | ja 93 | jak 94 | jakas 95 | jakaś 96 | jakby 97 | jaki 98 | jakichs 99 | jakichś 100 | jakie 101 | jakis 102 | jakiz 103 | jakiś 104 | jakiż 105 | jakkolwiek 106 | jako 107 | jakos 108 | jakoś 109 | je 110 | jeden 111 | jedna 112 | jednak 113 | jednakze 114 | jednakże 115 | jedno 116 | jego 117 | jej 118 | jemu 119 | jesli 120 | jest 121 | jestem 122 | jeszcze 123 | jezeli 124 | jeśli 125 | jeżeli 126 | juz 127 | już 128 | ją 129 | kazdy 130 | każdy 131 | kiedy 132 | kilka 133 | kims 134 | kimś 135 | kto 136 | ktokolwiek 137 | ktora 138 | ktore 139 | ktorego 140 | ktorej 141 | ktory 142 | ktorych 143 | ktorym 144 | ktorzy 145 | ktos 146 | ktoś 147 | która 148 | które 149 | którego 150 | której 151 | który 152 | których 153 | którym 154 | którzy 155 | ku 156 | lat 157 | lecz 158 | lub 159 | ma 160 | maja 161 | mają 162 | mam 163 | mało 164 | mi 165 | miedzy 166 | mimo 167 | między 168 | mna 169 | mnie 170 | mną 171 | moga 172 | mogą 173 | moi 174 | moim 175 | moj 176 | moja 177 | moje 178 | moze 179 | mozliwe 180 | mozna 181 | może 182 | możliwe 183 | można 184 | mu 185 | musi 186 | my 187 | mój 188 | na 189 | nad 190 | nam 191 | nami 192 | nas 193 | nasi 194 | nasz 195 | nasza 196 | nasze 197 | naszego 198 | naszych 199 | natomiast 200 | natychmiast 201 | nawet 202 | nia 203 | nic 204 | nich 205 | nie 206 | niech 207 | niego 208 | niej 209 | niemu 210 | nigdy 211 | nim 212 | nimi 213 | niz 214 | nią 215 | niż 216 | no 217 | o 218 | obok 219 | od 220 | 
okolo 221 | około 222 | on 223 | ona 224 | one 225 | oni 226 | ono 227 | oraz 228 | oto 229 | owszem 230 | pan 231 | pana 232 | pani 233 | po 234 | pod 235 | podczas 236 | pomimo 237 | ponad 238 | poniewaz 239 | ponieważ 240 | powinien 241 | powinna 242 | powinni 243 | powinno 244 | poza 245 | prawie 246 | przeciez 247 | przecież 248 | przed 249 | przede 250 | przedtem 251 | przez 252 | przy 253 | roku 254 | rowniez 255 | również 256 | sa 257 | sam 258 | sama 259 | sie 260 | się 261 | skad 262 | skąd 263 | soba 264 | sobie 265 | sobą 266 | sposob 267 | sposób 268 | swoje 269 | są 270 | ta 271 | tak 272 | taka 273 | taki 274 | takie 275 | takze 276 | także 277 | tam 278 | te 279 | tego 280 | tej 281 | temu 282 | ten 283 | teraz 284 | tez 285 | też 286 | to 287 | toba 288 | tobie 289 | tobą 290 | totez 291 | toteż 292 | trzeba 293 | tu 294 | tutaj 295 | twoi 296 | twoim 297 | twoj 298 | twoja 299 | twoje 300 | twym 301 | twój 302 | ty 303 | tych 304 | tylko 305 | tym 306 | u 307 | w 308 | wam 309 | wami 310 | was 311 | wasz 312 | wasza 313 | wasze 314 | we 315 | wedlug 316 | według 317 | wiec 318 | wiecej 319 | wiele 320 | wielu 321 | więc 322 | więcej 323 | wlasnie 324 | wszyscy 325 | wszystkich 326 | wszystkie 327 | wszystkim 328 | wszystko 329 | wtedy 330 | wy 331 | właśnie 332 | z 333 | za 334 | zaden 335 | zadna 336 | zadne 337 | zadnych 338 | zapewne 339 | zawsze 340 | ze 341 | zeby 342 | znow 343 | znowu 344 | znów 345 | zostal 346 | został 347 | zł 348 | żaden 349 | żadna 350 | żadne 351 | żadnych 352 | że 353 | żeby 354 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-pt.txt: -------------------------------------------------------------------------------- 1 | a 2 | acerca 3 | agora 4 | algumas 5 | alguns 6 | ali 7 | ambos 8 | antes 9 | ao 10 | aos 11 | apontar 12 | aquela 13 | aquelas 14 | aquele 15 | aqueles 16 | aqui 17 | aquilo 18 | as 19 | atrás 20 | até 21 | bem 22 
| bom 23 | cada 24 | caminho 25 | cima 26 | com 27 | como 28 | comprido 29 | conhecido 30 | corrente 31 | da 32 | das 33 | de 34 | debaixo 35 | dela 36 | delas 37 | dele 38 | deles 39 | dentro 40 | depois 41 | desde 42 | desligado 43 | deve 44 | devem 45 | deverá 46 | direita 47 | diz 48 | dizer 49 | do 50 | dois 51 | dos 52 | e 53 | ela 54 | elas 55 | ele 56 | eles 57 | em 58 | enquanto 59 | entre 60 | então 61 | era 62 | eram 63 | essa 64 | essas 65 | esse 66 | esses 67 | esta 68 | estado 69 | estamos 70 | estar 71 | estará 72 | estas 73 | estava 74 | estavam 75 | este 76 | esteja 77 | estejam 78 | estejamos 79 | estes 80 | esteve 81 | estive 82 | estivemos 83 | estiver 84 | estivera 85 | estiveram 86 | estiverem 87 | estivermos 88 | estivesse 89 | estivessem 90 | estivéramos 91 | estivéssemos 92 | estou 93 | está 94 | estávamos 95 | estão 96 | eu 97 | fará 98 | faz 99 | fazer 100 | fazia 101 | fez 102 | fim 103 | foi 104 | fomos 105 | for 106 | fora 107 | foram 108 | forem 109 | formos 110 | fosse 111 | fossem 112 | fui 113 | fôramos 114 | fôssemos 115 | haja 116 | hajam 117 | hajamos 118 | havemos 119 | hei 120 | horas 121 | houve 122 | houvemos 123 | houver 124 | houvera 125 | houveram 126 | houverei 127 | houverem 128 | houveremos 129 | houveria 130 | houveriam 131 | houvermos 132 | houverá 133 | houverão 134 | houveríamos 135 | houvesse 136 | houvessem 137 | houvéramos 138 | houvéssemos 139 | há 140 | hão 141 | iniciar 142 | inicio 143 | ir 144 | irá 145 | isso 146 | ista 147 | iste 148 | isto 149 | já 150 | lhe 151 | lhes 152 | ligado 153 | maioria 154 | maiorias 155 | mais 156 | mas 157 | me 158 | mesmo 159 | meu 160 | meus 161 | minha 162 | minhas 163 | muito 164 | muitos 165 | na 166 | nas 167 | nem 168 | no 169 | nome 170 | nos 171 | nossa 172 | nossas 173 | nosso 174 | nossos 175 | novo 176 | num 177 | numa 178 | não 179 | nós 180 | o 181 | onde 182 | os 183 | ou 184 | outro 185 | para 186 | parte 187 | pegar 188 | pela 189 | pelas 190 | pelo 191 | 
pelos 192 | pessoas 193 | pode 194 | poderá 195 | podia 196 | por 197 | porque 198 | povo 199 | primeiro 200 | qual 201 | qualquer 202 | quando 203 | que 204 | quem 205 | quieto 206 | quê 207 | saber 208 | se 209 | seja 210 | sejam 211 | sejamos 212 | sem 213 | ser 214 | serei 215 | seremos 216 | seria 217 | seriam 218 | será 219 | serão 220 | seríamos 221 | seu 222 | seus 223 | somente 224 | somos 225 | sou 226 | sua 227 | suas 228 | são 229 | só 230 | tal 231 | também 232 | te 233 | tem 234 | temos 235 | tempo 236 | tenha 237 | tenham 238 | tenhamos 239 | tenho 240 | tentar 241 | tentaram 242 | tente 243 | tentei 244 | terei 245 | teremos 246 | teria 247 | teriam 248 | terá 249 | terão 250 | teríamos 251 | teu 252 | teus 253 | teve 254 | tinha 255 | tinham 256 | tipo 257 | tive 258 | tivemos 259 | tiver 260 | tivera 261 | tiveram 262 | tiverem 263 | tivermos 264 | tivesse 265 | tivessem 266 | tivéramos 267 | tivéssemos 268 | todos 269 | trabalhar 270 | trabalho 271 | tu 272 | tua 273 | tuas 274 | tém 275 | têm 276 | tínhamos 277 | um 278 | uma 279 | umas 280 | uns 281 | usa 282 | usar 283 | valor 284 | veja 285 | ver 286 | verdade 287 | verdadeiro 288 | você 289 | vocês 290 | vos 291 | à 292 | às 293 | é 294 | éramos 295 | último 296 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-ro.txt: -------------------------------------------------------------------------------- 1 | a 2 | abia 3 | acea 4 | aceasta 5 | aceea 6 | aceeasi 7 | aceia 8 | acel 9 | acela 10 | acelasi 11 | acelea 12 | acest 13 | acesta 14 | aceste 15 | acestea 16 | acestei 17 | acestia 18 | acestui 19 | acolo 20 | acum 21 | adica 22 | ai 23 | aia 24 | aici 25 | aiurea 26 | al 27 | ala 28 | alaturi 29 | ale 30 | alt 31 | alta 32 | altceva 33 | alte 34 | altfel 35 | alti 36 | altii 37 | altul 38 | am 39 | anume 40 | apoi 41 | ar 42 | are 43 | as 44 | asa 45 | asemenea 46 | asta 47 | astazi 48 | astfel 49 | 
asupra 50 | atare 51 | ati 52 | atit 53 | atita 54 | atitea 55 | atitia 56 | atunci 57 | au 58 | avea 59 | avem 60 | avut 61 | azi 62 | b 63 | ba 64 | bine 65 | c 66 | ca 67 | cam 68 | capat 69 | care 70 | careia 71 | carora 72 | caruia 73 | catre 74 | ce 75 | cea 76 | ceea 77 | cei 78 | ceilalti 79 | cel 80 | cele 81 | celor 82 | ceva 83 | chiar 84 | ci 85 | cind 86 | cine 87 | cineva 88 | cit 89 | cita 90 | cite 91 | citeva 92 | citi 93 | citiva 94 | conform 95 | cu 96 | cui 97 | cum 98 | cumva 99 | d 100 | da 101 | daca 102 | dar 103 | dat 104 | de 105 | deasupra 106 | deci 107 | decit 108 | degraba 109 | deja 110 | desi 111 | despre 112 | din 113 | dintr 114 | dintre 115 | doar 116 | dupa 117 | e 118 | ea 119 | ei 120 | el 121 | ele 122 | era 123 | este 124 | eu 125 | exact 126 | f 127 | face 128 | fara 129 | fata 130 | fel 131 | fi 132 | fie 133 | foarte 134 | fost 135 | g 136 | geaba 137 | h 138 | i 139 | ia 140 | iar 141 | ii 142 | il 143 | imi 144 | in 145 | inainte 146 | inapoi 147 | inca 148 | incit 149 | insa 150 | intr 151 | intre 152 | isi 153 | iti 154 | j 155 | k 156 | l 157 | la 158 | le 159 | li 160 | lor 161 | lui 162 | m 163 | ma 164 | mai 165 | mare 166 | mi 167 | mod 168 | mult 169 | multa 170 | multe 171 | multi 172 | n 173 | ne 174 | ni 175 | nici 176 | niciodata 177 | nimeni 178 | nimic 179 | niste 180 | noi 181 | nostri 182 | nou 183 | noua 184 | nu 185 | numai 186 | o 187 | or 188 | ori 189 | orice 190 | oricum 191 | p 192 | pai 193 | parca 194 | pe 195 | pentru 196 | peste 197 | pina 198 | plus 199 | prea 200 | prin 201 | putini 202 | r 203 | s 204 | sa 205 | sai 206 | sale 207 | sau 208 | se 209 | si 210 | sint 211 | sintem 212 | spre 213 | sub 214 | sus 215 | t 216 | te 217 | ti 218 | toata 219 | toate 220 | tocmai 221 | tot 222 | toti 223 | totul 224 | totusi 225 | tu 226 | tuturor 227 | u 228 | un 229 | una 230 | unde 231 | unei 232 | unele 233 | uneori 234 | unii 235 | unor 236 | unui 237 | unul 238 | v 239 | va 240 | voi 241 | vom 
242 | vor 243 | vreo 244 | vreun 245 | x 246 | z 247 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-ru.txt: -------------------------------------------------------------------------------- 1 | а 2 | е 3 | и 4 | ж 5 | м 6 | о 7 | на 8 | не 9 | ни 10 | об 11 | но 12 | он 13 | мне 14 | мои 15 | мож 16 | она 17 | они 18 | оно 19 | мной 20 | много 21 | многочисленное 22 | многочисленная 23 | многочисленные 24 | многочисленный 25 | мною 26 | мой 27 | мог 28 | могут 29 | можно 30 | может 31 | можхо 32 | мор 33 | моя 34 | моё 35 | мочь 36 | над 37 | нее 38 | оба 39 | нам 40 | нем 41 | нами 42 | ними 43 | мимо 44 | немного 45 | одной 46 | одного 47 | менее 48 | однажды 49 | однако 50 | меня 51 | нему 52 | меньше 53 | ней 54 | наверху 55 | него 56 | ниже 57 | мало 58 | надо 59 | один 60 | одиннадцать 61 | одиннадцатый 62 | назад 63 | наиболее 64 | недавно 65 | миллионов 66 | недалеко 67 | между 68 | низко 69 | меля 70 | нельзя 71 | нибудь 72 | непрерывно 73 | наконец 74 | никогда 75 | никуда 76 | нас 77 | наш 78 | нет 79 | нею 80 | неё 81 | них 82 | мира 83 | наша 84 | наше 85 | наши 86 | ничего 87 | начала 88 | нередко 89 | несколько 90 | обычно 91 | опять 92 | около 93 | мы 94 | ну 95 | нх 96 | от 97 | отовсюду 98 | особенно 99 | нужно 100 | очень 101 | отсюда 102 | в 103 | во 104 | вон 105 | вниз 106 | внизу 107 | вокруг 108 | вот 109 | восемнадцать 110 | восемнадцатый 111 | восемь 112 | восьмой 113 | вверх 114 | вам 115 | вами 116 | важное 117 | важная 118 | важные 119 | важный 120 | вдали 121 | везде 122 | ведь 123 | вас 124 | ваш 125 | ваша 126 | ваше 127 | ваши 128 | впрочем 129 | весь 130 | вдруг 131 | вы 132 | все 133 | второй 134 | всем 135 | всеми 136 | времени 137 | время 138 | всему 139 | всего 140 | всегда 141 | всех 142 | всею 143 | всю 144 | вся 145 | всё 146 | всюду 147 | г 148 | год 149 | говорил 150 | говорит 151 | года 152 | году 153 | где 154 | да 155 | ее 156 | за 
157 | из 158 | ли 159 | же 160 | им 161 | до 162 | по 163 | ими 164 | под 165 | иногда 166 | довольно 167 | именно 168 | долго 169 | позже 170 | более 171 | должно 172 | пожалуйста 173 | значит 174 | иметь 175 | больше 176 | пока 177 | ему 178 | имя 179 | пор 180 | пора 181 | потом 182 | потому 183 | после 184 | почему 185 | почти 186 | посреди 187 | ей 188 | два 189 | две 190 | двенадцать 191 | двенадцатый 192 | двадцать 193 | двадцатый 194 | двух 195 | его 196 | дел 197 | или 198 | без 199 | день 200 | занят 201 | занята 202 | занято 203 | заняты 204 | действительно 205 | давно 206 | девятнадцать 207 | девятнадцатый 208 | девять 209 | девятый 210 | даже 211 | алло 212 | жизнь 213 | далеко 214 | близко 215 | здесь 216 | дальше 217 | для 218 | лет 219 | зато 220 | даром 221 | первый 222 | перед 223 | затем 224 | зачем 225 | лишь 226 | десять 227 | десятый 228 | ею 229 | её 230 | их 231 | бы 232 | еще 233 | при 234 | был 235 | про 236 | процентов 237 | против 238 | просто 239 | бывает 240 | бывь 241 | если 242 | люди 243 | была 244 | были 245 | было 246 | будем 247 | будет 248 | будете 249 | будешь 250 | прекрасно 251 | буду 252 | будь 253 | будто 254 | будут 255 | ещё 256 | пятнадцать 257 | пятнадцатый 258 | друго 259 | другое 260 | другой 261 | другие 262 | другая 263 | других 264 | есть 265 | пять 266 | быть 267 | лучше 268 | пятый 269 | к 270 | ком 271 | конечно 272 | кому 273 | кого 274 | когда 275 | которой 276 | которого 277 | которая 278 | которые 279 | который 280 | которых 281 | кем 282 | каждое 283 | каждая 284 | каждые 285 | каждый 286 | кажется 287 | как 288 | какой 289 | какая 290 | кто 291 | кроме 292 | куда 293 | кругом 294 | с 295 | т 296 | у 297 | я 298 | та 299 | те 300 | уж 301 | со 302 | то 303 | том 304 | снова 305 | тому 306 | совсем 307 | того 308 | тогда 309 | тоже 310 | собой 311 | тобой 312 | собою 313 | тобою 314 | сначала 315 | только 316 | уметь 317 | тот 318 | тою 319 | хорошо 320 | хотеть 321 | хочешь 322 | хоть 323 | хотя 324 | свое 
325 | свои 326 | твой 327 | своей 328 | своего 329 | своих 330 | свою 331 | твоя 332 | твоё 333 | раз 334 | уже 335 | сам 336 | там 337 | тем 338 | чем 339 | сама 340 | сами 341 | теми 342 | само 343 | рано 344 | самом 345 | самому 346 | самой 347 | самого 348 | семнадцать 349 | семнадцатый 350 | самим 351 | самими 352 | самих 353 | саму 354 | семь 355 | чему 356 | раньше 357 | сейчас 358 | чего 359 | сегодня 360 | себе 361 | тебе 362 | сеаой 363 | человек 364 | разве 365 | теперь 366 | себя 367 | тебя 368 | седьмой 369 | спасибо 370 | слишком 371 | так 372 | такое 373 | такой 374 | такие 375 | также 376 | такая 377 | сих 378 | тех 379 | чаще 380 | четвертый 381 | через 382 | часто 383 | шестой 384 | шестнадцать 385 | шестнадцатый 386 | шесть 387 | четыре 388 | четырнадцать 389 | четырнадцатый 390 | сколько 391 | сказал 392 | сказала 393 | сказать 394 | ту 395 | ты 396 | три 397 | эта 398 | эти 399 | что 400 | это 401 | чтоб 402 | этом 403 | этому 404 | этой 405 | этого 406 | чтобы 407 | этот 408 | стал 409 | туда 410 | этим 411 | этими 412 | рядом 413 | тринадцать 414 | тринадцатый 415 | этих 416 | третий 417 | тут 418 | эту 419 | суть 420 | чуть 421 | тысяч 422 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-sv.txt: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------------------- 2 | # translated 3 | #----------------------------------------------------------------------- 4 | 5 | kunna 6 | om 7 | ovan 8 | enligt 9 | i enlighet med detta 10 | över 11 | faktiskt 12 | efter 13 | efteråt 14 | igen 15 | mot 16 | är inte 17 | alla 18 | tillåta 19 | tillåter 20 | nästan 21 | ensam 22 | längs 23 | redan 24 | också 25 | även om 26 | alltid 27 | am 28 | bland 29 | bland 30 | en 31 | och 32 | en annan 33 | någon 34 | någon 35 | hur som helst 36 | någon 37 | något 38 | ändå 39 | ändå 40 | var 
som helst 41 | isär 42 | visas 43 | uppskatta 44 | lämpligt 45 | är 46 | inte 47 | runt 48 | som 49 | åt sidan 50 | be 51 | frågar 52 | associerad 53 | vid 54 | tillgängliga 55 | bort 56 | väldigt 57 | vara 58 | blev 59 | eftersom 60 | bli 61 | blir 62 | blir 63 | varit 64 | innan 65 | förhand 66 | bakom 67 | vara 68 | tro 69 | nedan 70 | bredvid 71 | förutom 72 | bäst 73 | bättre 74 | mellan 75 | bortom 76 | både 77 | kort 78 | men 79 | genom 80 | c 81 | c'mon 82 | c: s 83 | kom 84 | kampanj 85 | kan 86 | kan inte 87 | kan inte 88 | cant 89 | orsaka 90 | orsaker 91 | viss 92 | säkerligen 93 | förändringar 94 | klart 95 | co 96 | com 97 | komma 98 | kommer 99 | om 100 | följaktligen 101 | överväga 102 | överväger 103 | innehålla 104 | innehållande 105 | innehåller 106 | motsvarande 107 | kunde 108 | kunde inte 109 | kurs 110 | närvarande 111 | definitivt 112 | beskrivits 113 | trots 114 | gjorde 115 | inte 116 | olika 117 | göra 118 | gör 119 | inte 120 | gör 121 | inte 122 | gjort 123 | ned 124 | nedåt 125 | under 126 | varje 127 | edu 128 | åtta 129 | antingen 130 | annars 131 | någon annanstans 132 | tillräckligt 133 | godkändes 134 | helt 135 | speciellt 136 | et 137 | etc 138 | även 139 | någonsin 140 | varje 141 | alla 142 | alla 143 | allt 144 | överallt 145 | ex 146 | exakt 147 | exempel 148 | utom 149 | långt 150 | få 151 | femte 152 | först 153 | finansiella 154 | fem 155 | följt 156 | efter 157 | följer 158 | för 159 | fd 160 | tidigare 161 | framåt 162 | fyra 163 | från 164 | ytterligare 165 | dessutom 166 | få 167 | blir 168 | få 169 | given 170 | ger 171 | gå 172 | går 173 | gå 174 | borta 175 | fick 176 | fått 177 | hälsningar 178 | hade 179 | hade inte 180 | händer 181 | knappast 182 | har 183 | har inte 184 | ha 185 | har inte 186 | med 187 | han 188 | han är 189 | hallå 190 | hjälpa 191 | hence 192 | henne 193 | här 194 | här finns 195 | härefter 196 | härmed 197 | häri 198 | härpå 199 | hennes 200 | själv 201 | hej 202 | honom 203 | själv 204 | 
hans 205 | hit 206 | förhoppningsvis 207 | hur 208 | howbeit 209 | dock 210 | jag skulle 211 | jag ska 212 | jag är 213 | jag har 214 | om 215 | ignoreras 216 | omedelbar 217 | i 218 | eftersom 219 | inc 220 | indeed 221 | indikera 222 | indikerade 223 | indikerar 224 | inre 225 | mån 226 | istället 227 | in 228 | inåt 229 | är 230 | är inte 231 | den 232 | det skulle 233 | det ska 234 | det är 235 | dess 236 | själv 237 | bara 238 | hålla 239 | håller 240 | hålls 241 | vet 242 | vet 243 | känd 244 | sista 245 | nyligen 246 | senare 247 | senare 248 | latterly 249 | minst 250 | mindre 251 | lest 252 | låt 253 | låt oss 254 | liknande 255 | gillade 256 | sannolikt 257 | lite 258 | ser 259 | ser 260 | ser 261 | ltd 262 | huvudsakligen 263 | många 264 | kan 265 | kanske 266 | mig 267 | betyda 268 | under tiden 269 | endast 270 | kanske 271 | mer 272 | dessutom 273 | mest 274 | mestadels 275 | mycket 276 | måste 277 | min 278 | själv 279 | namn 280 | nämligen 281 | nd 282 | nära 283 | nästan 284 | nödvändigt 285 | behöver 286 | behov 287 | varken 288 | aldrig 289 | ändå 290 | ny 291 | nästa 292 | nio 293 | ingen 294 | ingen 295 | icke 296 | ingen 297 | ingen 298 | eller 299 | normalt 300 | inte 301 | ingenting 302 | roman 303 | nu 304 | ingenstans 305 | uppenbarligen 306 | av 307 | off 308 | ofta 309 | oh 310 | ok 311 | okay 312 | gammal 313 | på 314 | en gång 315 | ett 316 | ettor 317 | endast 318 | på 319 | eller 320 | andra 321 | andra 322 | annars 323 | borde 324 | vår 325 | vårt 326 | oss 327 | ut 328 | utanför 329 | över 330 | övergripande 331 | egen 332 | särskilt 333 | särskilt 334 | per 335 | kanske 336 | placeras 337 | vänligen 338 | plus 339 | möjligt 340 | förmodligen 341 | förmodligen 342 | ger 343 | ganska 344 | citera 345 | kvartalsvis 346 | snarare 347 | verkligen 348 | rimligen 349 | om 350 | oavsett 351 | gäller 352 | relativt 353 | respektive 354 | höger 355 | sa 356 | samma 357 | såg 358 | säga 359 | säger 360 | säger 361 | andra 362 | det andra 363 
| se 364 | ser 365 | verkar 366 | verkade 367 | informationsproblem 368 | verkar 369 | sett 370 | själv 371 | själva 372 | förnuftig 373 | skickas 374 | allvarlig 375 | allvarligt 376 | sju 377 | flera 378 | skall 379 | hon 380 | bör 381 | bör inte 382 | eftersom 383 | sex 384 | så 385 | några 386 | någon 387 | på något sätt 388 | någon 389 | något 390 | sometime 391 | ibland 392 | något 393 | någonstans 394 | snart 395 | sorry 396 | specificerade 397 | ange 398 | ange 399 | fortfarande 400 | sub 401 | sådan 402 | sup 403 | säker 404 | t s 405 | ta 406 | tas 407 | berätta 408 | tenderar 409 | än 410 | tacka 411 | tack 412 | thanx 413 | att 414 | det är 415 | brinner 416 | den 417 | deras 418 | deras 419 | dem 420 | själva 421 | sedan 422 | därifrån 423 | där 424 | det finns 425 | därefter 426 | därigenom 427 | därför 428 | däri 429 | theres 430 | därpå 431 | dessa 432 | de 433 | de hade 434 | de kommer 435 | de är 436 | de har 437 | tror 438 | tredje 439 | detta 440 | grundlig 441 | grundligt 442 | de 443 | though 444 | tre 445 | genom 446 | hela 447 | thru 448 | sålunda 449 | till 450 | tillsammans 451 | alltför 452 | tog 453 | mot 454 | mot 455 | försökte 456 | försöker 457 | verkligt 458 | försök 459 | försöker 460 | två gånger 461 | två 462 | enligt 463 | tyvärr 464 | såvida inte 465 | osannolikt 466 | tills 467 | åt 468 | upp 469 | på 470 | oss 471 | använda 472 | används 473 | användbar 474 | använder 475 | användning 476 | vanligtvis 477 | uucp 478 | värde 479 | olika 480 | mycket 481 | via 482 | viz 483 | vs 484 | vill 485 | vill 486 | var 487 | var inte 488 | sätt 489 | vi 490 | vi skulle 491 | vi kommer 492 | vi är 493 | vi har 494 | välkommen 495 | väl 496 | gick 497 | var 498 | var inte 499 | vad 500 | vad är 501 | oavsett 502 | när 503 | varifrån 504 | närhelst 505 | där 506 | var är 507 | varefter 508 | medan 509 | varigenom 510 | vari 511 | varpå 512 | varhelst 513 | huruvida 514 | som 515 | medan 516 | dit 517 | som 518 | vem är 519 | vem 520 | hela 
521 | vem 522 | vars 523 | varför 524 | kommer 525 | villig 526 | önskar 527 | med 528 | inom 529 | utan 530 | kommer inte 531 | undrar 532 | skulle 533 | skulle inte 534 | ja 535 | ännu 536 | ni 537 | du skulle 538 | kommer du 539 | du är 540 | du har 541 | din 542 | själv 543 | er 544 | noll 545 | tjänsteman 546 | skarpt 547 | kritiserade 548 | -------------------------------------------------------------------------------- /src/main/resources/com/intenthq/gander/text/stopwords-zh.txt: -------------------------------------------------------------------------------- 1 | 的 2 | 一 3 | 不 4 | 在 5 | 人 6 | 有 7 | 是 8 | 为 9 | 以 10 | 于 11 | 上 12 | 他 13 | 而 14 | 后 15 | 之 16 | 来 17 | 及 18 | 了 19 | 因 20 | 下 21 | 可 22 | 到 23 | 由 24 | 这 25 | 与 26 | 也 27 | 此 28 | 但 29 | 并 30 | 个 31 | 其 32 | 已 33 | 无 34 | 小 35 | 我 36 | 们 37 | 起 38 | 最 39 | 再 40 | 今 41 | 去 42 | 好 43 | 只 44 | 又 45 | 或 46 | 很 47 | 亦 48 | 某 49 | 把 50 | 那 51 | 你 52 | 乃 53 | 它 54 | 吧 55 | 被 56 | 比 57 | 别 58 | 趁 59 | 当 60 | 从 61 | 到 62 | 得 63 | 打 64 | 凡 65 | 儿 66 | 尔 67 | 该 68 | 各 69 | 给 70 | 跟 71 | 和 72 | 何 73 | 还 74 | 即 75 | 几 76 | 既 77 | 看 78 | 据 79 | 距 80 | 靠 81 | 啦 82 | 了 83 | 另 84 | 么 85 | 每 86 | 们 87 | 嘛 88 | 拿 89 | 哪 90 | 那 91 | 您 92 | 凭 93 | 且 94 | 却 95 | 让 96 | 仍 97 | 啥 98 | 如 99 | 若 100 | 使 101 | 谁 102 | 虽 103 | 随 104 | 同 105 | 所 106 | 她 107 | 哇 108 | 嗡 109 | 往 110 | 哪 111 | 些 112 | 向 113 | 沿 114 | 哟 115 | 用 116 | 于 117 | 咱 118 | 则 119 | 怎 120 | 曾 121 | 至 122 | 致 123 | 着 124 | 诸 125 | 自 -------------------------------------------------------------------------------- /src/main/scala/com/intenthq/gander/DocumentCleaner.scala: --------------------------------------------------------------------------------
package com.intenthq.gander

import java.util.regex.Pattern

import com.intenthq.gander.utils.JSoup._
import org.jsoup.nodes.{Document, TextNode}

/**
 * Strips markup that is unlikely to be article content (scripts, styles, social widgets,
 * navigation, comments, ...) from a parsed document before content extraction.
 */
object DocumentCleaner {

  // Patterns matched against the `id` and `class` attributes by removeNodesViaRegEx.
  private val captionPattern = Pattern.compile("^caption$")
  private val googlePattern = Pattern.compile("google")
  private val facebookPattern = Pattern.compile("facebook")
  private val twitterPattern = Pattern.compile("twitter")
  /**
   * this regex is used to remove undesirable nodes from our doc
   * indicate that something maybe isn't content but more of a comment, footer or some other undesirable node
   */
  private val regExRemoveNodes = "^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar|comment(?!ed)|PopularQuestions|contact|footer|Footer|footnote|cnn_strycaptiontxt|links|meta$|scroll(?!able)|shoutbox|sponsor" +
    "|tags|socialnetworking|socialNetworking|cnnStryHghLght|cnn_stryspcvbx|^inset$|pagetools|post-attributes|welcome_form|contentTools2|the_answers|remember-tool-tip" +
    "|communitypromo|promo_holder|runaroundLeft|subscribe|vcard|articleheadings|date|^print$|popup|author-dropdown|tools|socialtools|byline|konafilter|KonaFilter|breadcrumbs|^fn$" +
    "|wp-caption-text|overlay"
  // CSS attribute-regex selectors built from the expression above, one per attribute.
  private val queryNaughtyIDs = "[id~=(" + regExRemoveNodes + ")]"
  private val queryNaughtyClasses = "[class~=(" + regExRemoveNodes + ")]"
  private val queryNaughtyNames = "[name~=(" + regExRemoveNodes + ")]"

  /**
   * Returns a cleaned copy of `doc`; the document passed in is not modified because all
   * mutating steps run against a clone.
   *
   * @param doc the parsed document
   * @return the cleaned clone
   */
  def clean(doc: Document): Document = {
    //TODO right now this solution mutates this document
    // it would be very nice to implement this with an immutable solution
    implicit val docToClean: Document = doc.clone

    // The steps below mutate docToClean in place and run in this order.
    cleanTextTags
    removeScriptsAndStyles
    cleanBadTags
    removeNodesViaRegEx(captionPattern)
    removeNodesViaRegEx(googlePattern)
    removeNodesViaRegEx(facebookPattern)
    removeNodesViaRegEx(twitterPattern)
    cleanUpSpanTagsInParagraphs
    docToClean
  }

  /**
   * replaces various inline formatting tags (em, strong, b, i, strike, del, ins) with text nodes
   * holding their text
   */
  private def cleanTextTags(implicit doc: Document): Unit =
    (byTag("em") ++ byTag("strong") ++ byTag("b") ++ byTag("i") ++
      byTag("strike") ++ byTag("del") ++ byTag("ins")).foreach { node =>
      val tn = new TextNode(node.text, doc.baseUri)
      node.replaceWith(tn)
    }

  /** removes every script, style and noscript element */
  private def removeScriptsAndStyles(implicit doc: Document): Unit =
    (byTag("script") ++ byTag("style") ++ byTag("noscript")).foreach(remove)

  /** removes every element whose id, class or name attribute matches regExRemoveNodes */
  private def cleanBadTags(implicit doc: Document): Unit =
    (select(queryNaughtyIDs) ++ select(queryNaughtyClasses) ++ select(queryNaughtyNames)).foreach(remove)

  /**
   * removes nodes that may have a certain pattern that matches against a class or id tag
   */
  private def removeNodesViaRegEx(pattern: Pattern)(implicit doc: Document): Unit =
    (byAttrRe("id", pattern) ++ byAttrRe("class", pattern)).foreach(remove)

  /**
   * takes care of the situation where you have a span tag nested in a paragraph tag
   * e.g. businessweek2.txt
   * Only spans whose direct parent is a p element are replaced by their text.
   */
  private def cleanUpSpanTagsInParagraphs(implicit doc: Document): Unit =
    byTag("span").filter(_.parent.nodeName == "p").foreach { node =>
      val tn = new TextNode(node.text, doc.baseUri)
      node.replaceWith(tn)
    }

}
--------------------------------------------------------------------------------
/src/main/scala/com/intenthq/gander/Gander.scala:
--------------------------------------------------------------------------------
package com.intenthq.gander

import java.util.Date

import com.intenthq.gander.extractors.ContentExtractor._
import com.intenthq.gander.opengraph.OpenGraphData
import org.jsoup.Jsoup

import scala.util.Try


/** A hyperlink found in the extracted content: its visible text and its target. */
case class Link(text: String, target: String)

/**
 * Everything extracted from a page. `cleanedText` and `links` stay at their defaults when no
 * content node could be identified.
 */
case class PageInfo(title: String,
                    processedTitle: String,
                    metaDescription: String,
                    metaKeywords: String,
                    lang: Option[String],
                    canonicalLink: Option[String],
                    openGraphData: OpenGraphData,
                    cleanedText: Option[String] = None,
                    links: Seq[Link] = Seq.empty,
                    publishDate: Option[Date] = None)

object Gander {

  /**
   * Parses `html` and extracts metadata plus, when a content node is found, the main text
   * and its links.
   *
   * @param html raw HTML of the page
   * @param lang stop-word language passed on to the content scoring (defaults to "all")
   * @return the extracted information, or None if jsoup fails to parse the input
   */
  def extract(html: String, lang: String = "all"): Option[PageInfo] = {
    //This is replacing the non-breaking space with a regular space
    // The non-breaking space is written as an explicit escape: a raw, invisible character in the
    // literal is easily normalised to a plain space by tooling, which turns this call into a no-op.
    val sanitised = html.replace('\u00A0', ' ')
    Try(Jsoup.parse(sanitised)).toOption.map { doc =>
      val canonicalLink = extractCanonicalLink(doc)
      // The URL-based fallback is best effort: a failure while building the date must not
      // abort the whole extraction.
      val publishDate = extractDate(doc).map(_.toDate)
        .orElse(canonicalLink.flatMap(link => Try(extractDateFromURL(link)).toOption.flatten))

      val rawTitle = extractTitle(doc)
      val info = PageInfo(title = rawTitle,
                          processedTitle = processTitle(rawTitle, canonicalLink),
                          metaDescription = extractMetaDescription(doc),
                          metaKeywords = extractMetaKeywords(doc),
                          lang = extractLang(doc),
                          canonicalLink = canonicalLink,
                          publishDate = publishDate,
                          openGraphData = OpenGraphData(doc)
      )

      val cleanedDoc = DocumentCleaner.clean(doc)
      calculateBestNodeBasedOnClustering(cleanedDoc, lang).map { node =>
        //some mutability beauty
        // postExtractionCleanup mutates `node` in place, so it has to run before the text is read.
        postExtractionCleanup(node, lang)
        info.copy(cleanedText = Some(node.text()),
                  links = extractLinks(node))
      }.getOrElse(info)
    }
  }

}
--------------------------------------------------------------------------------
/src/main/scala/com/intenthq/gander/extractors/ContentExtractor.scala:
--------------------------------------------------------------------------------
package com.intenthq.gander.extractors

import java.net.URL
import java.text.Normalizer
import java.util.Date
import java.util.regex.Pattern

import com.intenthq.gander.Link
import com.intenthq.gander.text.{StopWords, WordStats}
import com.intenthq.gander.utils.JSoup._
import org.joda.time.DateTime
import org.jsoup.nodes.{Document, Element}
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.convert.Wrappers.JListWrapper
import scala.collection.mutable
import scala.math._
import scala.util.Try
import org.joda.time.format.ISODateTimeFormat.dateTimeParser


object ContentExtractor {

24 | val logger: Logger = LoggerFactory.getLogger(getClass) 25 | 26 | def extractTitle(doc: Document): String = 27 | byTag("title")(doc).headOption.map(_.text).getOrElse("").replace("�", "").trim 28 | 29 | def processTitle(rawTitle: String, canonical: Option[String]): String = { 30 | def normalize(str: String) = 31 | Normalizer.normalize(str, Normalizer.Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "") 32 | 33 | canonical.flatMap(c => Try(new URL(c)).toOption).flatMap { url => 34 | val names = url.getAuthority.split('.').init.filter(_.length > 2).filter(_ != "www") 35 | List(""" | """, " • ", " › ", " :: ", " » ", " - ", " : ", " — ", " · ").collectFirst { 36 | case separator if rawTitle.contains(separator) => 37 | val parts = rawTitle.split(Pattern.quote(separator)) 38 | val partsNot = parts.filterNot { part => 39 | names.exists(name => normalize(part).toLowerCase.replace(" ", "").contains(name)) 40 | } 41 | partsNot.mkString(separator).trim 42 | } 43 | }.getOrElse(rawTitle) 44 | } 45 | 46 | def extractLang(doc: Document): Option[String] = 47 | byTag("html")(doc).headOption.map(_.attr("lang")).filter(_.nonEmpty).orElse( 48 | metaContent("http-equiv=Content-Language")(doc).orElse( 49 | metaContent("property=og:locale")(doc) 50 | ) 51 | ) 52 | 53 | def extractDate(doc: Document): Option[DateTime] = { 54 | metaContent("property=article:published_time")(doc).orElse( 55 | metaContent("name=DCTERMS.created")(doc).orElse( 56 | select("time[class=dt-published published entry-date]")(doc).headOption.map(_.attr("datetime").trim).orElse( 57 | select("time[itemprop=datePublished]")(doc).headOption.map(_.attr("datetime").trim).orElse( 58 | metaContent("name=DisplayDate")(doc).orElse( 59 | metaContent("name=date")(doc) 60 | ) 61 | ) 62 | ) 63 | ) 64 | ).flatMap(x => 65 | // replaceAll("/","-") is needed as ISODateTimeFormat will block on / 66 | // e.g. 
http://www.bbc.co.uk/sport/0/football/34203622 67 | Try(dateTimeParser.parseDateTime(x.replaceAll("/","-"))).toOption 68 | ) 69 | } 70 | 71 | private def metaContent(metaName: String)(implicit doc: Document): Option[String] = 72 | select(s"meta[$metaName]").headOption.map(_.attr("content").trim) 73 | 74 | /** 75 | * if the article has meta description set in the source, use that 76 | */ 77 | def extractMetaDescription(implicit doc: Document): String = 78 | metaContent("name=description").orElse( 79 | metaContent("og:description").orElse( 80 | metaContent("name=twitter:description") 81 | ) 82 | ).getOrElse("").trim 83 | 84 | /** 85 | * if the article has meta keywords set in the source, use that 86 | */ 87 | def extractMetaKeywords(implicit doc: Document): String = metaContent("name=keywords").getOrElse("") 88 | 89 | /** 90 | * if the article has meta canonical link set in the url 91 | */ 92 | def extractCanonicalLink(implicit doc: Document): Option[String] = 93 | select("link[rel=canonical]").headOption.map(_.attr("abs:href")).orElse( 94 | select("meta[property=og:url]").headOption.map(_.attr("abs:content")) 95 | ).orElse( 96 | select("meta[name=twitter:url]").headOption.map(_.attr("abs:content")) 97 | ).map(_.trim) 98 | 99 | def extractDateFromURL(url: String): Option[Date] = { 100 | def findYearMonthAndDay(segments: Array[String]): (Option[Int], Option[Int], Option[Int]) = { 101 | def findMonthAndDay(segments: Array[String]): (Option[Int], Option[Int]) = { 102 | def findDay(segment: String): Option[Int] = Try(segment.toInt).filter(d => d >= 1 && d <= 31).toOption 103 | Try(segments.head.toInt).filter(m => m >= 1 && m <= 12).map { month => 104 | (Some(month), findDay(segments.tail.head)) 105 | }.getOrElse((None, None)) 106 | } 107 | 108 | if (segments.isEmpty) 109 | (None, None, None) 110 | else { 111 | Try(segments.head.toInt).filter(y => y > 1970 && y < 3000).map { year => 112 | val (month, day) = findMonthAndDay(segments.tail) 113 | (Some(year), month, day) 114 
| }.getOrElse(findYearMonthAndDay(segments.tail)) 115 | } 116 | } 117 | 118 | val (year, month, day) = findYearMonthAndDay(url.split("/")) 119 | year.map { y => 120 | val m = month.getOrElse(1) 121 | val d = day.getOrElse(1) 122 | new DateTime(y, m, d, 0, 0).toDate 123 | } 124 | } 125 | 126 | /** 127 | * we're going to start looking for where the clusters of paragraphs are. We'll score a cluster based on the number of stopwords 128 | * and the number of consecutive paragraphs together, which should form the cluster of text that this node is around 129 | * also store on how high up the paragraphs are, comments are usually at the bottom and should get a lower score 130 | */ 131 | def calculateBestNodeBasedOnClustering(document: Document, lang:String): Option[Element] = { 132 | implicit val doc = document.clone 133 | 134 | val nodesToCheck = byTag("p") ++ byTag("td") ++ byTag("pre") ++ byTag("strong") ++ byTag("li") ++ byTag("code") 135 | 136 | val nodesWithText = nodesToCheck.filter { node => 137 | val nodeText = node.text 138 | val wordStats = StopWords.stopWordCount(nodeText, lang) 139 | val highLinkDensity = isHighLinkDensity(node) 140 | logger.trace("Candidate: " + node.tagName() + " score: " + wordStats + " d:" + highLinkDensity + " text:" + nodeText) 141 | wordStats.stopWordCount > 2 && !highLinkDensity 142 | } 143 | 144 | val numberOfNodes = nodesWithText.size 145 | val bottomNodesForNegativeScore = numberOfNodes * 0.25 146 | 147 | logger.trace("About to inspect num of nodes with text: " + numberOfNodes) 148 | 149 | def boostScoreForNode(node: Element, startingBoost: Double, count: Int): (Double, Double) = { 150 | var newStartingBoost = startingBoost 151 | var result = 0.0 152 | if (isOkToBoost(node, lang)) { 153 | result = (1.0 / startingBoost) * 50 154 | newStartingBoost += 1 155 | } 156 | if (numberOfNodes > 15) { 157 | if ((numberOfNodes - count) <= bottomNodesForNegativeScore) { 158 | val booster = bottomNodesForNegativeScore - (numberOfNodes - count) 159 
| result = -pow(booster, 2) 160 | if (abs(result) > 40) 161 | result = 5 162 | } 163 | } 164 | (newStartingBoost, result) 165 | } 166 | 167 | var count = 0 168 | var startingBoost: Double = 1.0 169 | val parentNodes = mutable.Set.empty[Element] 170 | 171 | for (node <- nodesWithText) { 172 | val (newStartingBoost, boostScore) = boostScoreForNode(node, startingBoost, count) 173 | startingBoost = newStartingBoost 174 | 175 | logger.trace("Location Boost Score: " + boostScore + " on interation: " + count + " tag='"+ node.tagName +"' id='" + node.parent.id + "' class='" + node.parent.attr("class")) 176 | 177 | val wordStats: WordStats = StopWords.stopWordCount(node.text, lang) 178 | val upscore: Int = (wordStats.stopWordCount + boostScore).toInt 179 | updateScore(node.parent, upscore) 180 | updateScore(node.parent.parent, upscore / 2) 181 | updateNodeCount(node.parent, 1) 182 | updateNodeCount(node.parent.parent, 1) 183 | parentNodes.add(node.parent) 184 | parentNodes.add(node.parent.parent) 185 | count += 1 186 | } 187 | 188 | if (parentNodes.isEmpty) 189 | None 190 | else { 191 | Some(parentNodes.maxBy(getScore)).filter(getScore(_) >= 20) 192 | } 193 | } 194 | 195 | /** 196 | * alot of times the first paragraph might be the caption under an image so we'll want to make sure if we're going to 197 | * boost a parent node that it should be connected to other paragraphs, at least for the first n paragraphs 198 | * so we'll want to make sure that the next sibling is a paragraph and has at least some substatial weight to it 199 | */ 200 | private def isOkToBoost(node: Element, lang: String): Boolean = { 201 | var stepsAway: Int = 0 202 | val minimumStopWordCount = 5 203 | val maxStepsAwayFromNode = 3 204 | 205 | walkSiblings(node) { currentNode => 206 | if (currentNode.tagName == "p" || currentNode.tagName == "strong") { 207 | if (stepsAway >= maxStepsAwayFromNode) { 208 | return false 209 | } 210 | val wordStats = StopWords.stopWordCount(currentNode.text, lang) 211 | if 
(wordStats.stopWordCount > minimumStopWordCount) 212 | return true 213 | stepsAway += 1 214 | } 215 | } 216 | false 217 | } 218 | 219 | private def getShortText(e: String, max: Int): String = if (e.length > max) e.take(max) + "..." else e 220 | 221 | /** 222 | * Checks the density of links within a node. If there's not much text and what's there is mostly links, 223 | * we're not interested 224 | */ 225 | private def isHighLinkDensity(implicit e: Element): Boolean = { 226 | val limit = 1.0 227 | val links = byTag("a") ++ byAttr("onclick") 228 | 229 | if (links.isEmpty) 230 | false 231 | else { 232 | val words = e.text.trim.split("\\s+") 233 | val linkWords = links.mkString(" ").split("\\s+") 234 | val numberOfLinks = links.size 235 | val numberOfWords = words.length.toDouble 236 | val numberOfLinkWords = linkWords.length.toDouble 237 | val score = numberOfLinks * numberOfLinkWords / numberOfWords 238 | 239 | logger.trace("Calculated link density score as: {} for node: {}", score, getShortText(e.text, 50)) 240 | 241 | score >= limit 242 | } 243 | } 244 | 245 | private def getScore(node: Element): Int = getGravityScoreFromNode(node).getOrElse(0) 246 | 247 | private def getGravityScoreFromNode(node: Element): Option[Int] = Try(node.attr("gravityScore").toInt).toOption 248 | 249 | /** 250 | * adds a score to the gravityScore Attribute we put on divs 251 | * we'll get the current score then add the score we're passing in to the current 252 | * 253 | * @param addToScore - the score to add to the node 254 | */ 255 | private def updateScore(node: Element, addToScore: Int) { 256 | val currentScore = Try(node.attr("gravityScore").toInt).getOrElse(0) 257 | val newScore = currentScore + addToScore 258 | node.attr("gravityScore", newScore.toString) 259 | } 260 | 261 | /** 262 | * stores how many decent nodes are under a parent node 263 | */ 264 | private def updateNodeCount(node: Element, addToCount: Int) { 265 | val currentScore = 
Try(node.attr("gravityNodes").toInt).getOrElse(0) 266 | val newScore: Int = currentScore + addToCount 267 | node.attr("gravityNodes", newScore.toString) 268 | } 269 | 270 | /** 271 | * pulls out links we like 272 | */ 273 | def extractLinks(implicit node: Element): Seq[Link] = 274 | select("a[href]") 275 | .filter(el => el.attr("href") != "#" && !el.attr("abs:href").trim.isEmpty) 276 | .map(el => Link(el.text, el.attr("abs:href"))) 277 | 278 | private def isTableTagAndNoParagraphsExist(implicit e: Element): Boolean = { 279 | getChildParagraphs(e).filter(_.text.length < 25).foreach(remove) 280 | 281 | val subParagraphs2 = byTag("p") 282 | if (subParagraphs2.isEmpty && e.tagName != "td") { 283 | if (e.tagName == "ul" || e.tagName == "ol") { 284 | val linkTextLength = byTag("a").map(_.text.length).sum 285 | val elementTextLength = e.text.length 286 | elementTextLength <= 2 * linkTextLength 287 | } 288 | else true 289 | } else false 290 | } 291 | 292 | /** 293 | * remove any divs that looks like non-content, clusters of links, or paras with no gusto 294 | */ 295 | def postExtractionCleanup(targetNode: Element, lang: String): Element = { 296 | val node = addSiblings(targetNode, lang) 297 | JListWrapper(node.children) 298 | .filter(e => e.tagName != "p" || isHighLinkDensity(e)) 299 | .filter(e => isHighLinkDensity(e) || isTableTagAndNoParagraphsExist(e) || !isNodeScoreThresholdMet(node, e)) 300 | .foreach(remove) 301 | node 302 | } 303 | 304 | private def isNodeScoreThresholdMet(node: Element, e: Element): Boolean = { 305 | val topNodeScore = getScore(node) 306 | val currentNodeScore = getScore(e) 307 | val thresholdScore = topNodeScore * .08 308 | !(currentNodeScore < thresholdScore && e.tagName != "td") 309 | } 310 | 311 | private def getChildParagraphs(implicit e: Element): Seq[Element] = byTag("p") ++ byTag("strong") 312 | 313 | /** 314 | * adds any siblings that may have a decent score to this node 315 | */ 316 | private def getSiblingContent(currentSibling: 
Element, 317 | baselineScoreForSiblingParagraphs: Int, 318 | lang: String): Option[String] = { 319 | if ((currentSibling.tagName == "p" || currentSibling.tagName == "strong") && currentSibling.text.nonEmpty) 320 | Some(currentSibling.outerHtml) 321 | else { 322 | val siblingBaseLineScore = baselineScoreForSiblingParagraphs * 0.3 323 | val text = getChildParagraphs(currentSibling) 324 | .filter(p => StopWords.stopWordCount(p.text, lang).stopWordCount >= siblingBaseLineScore) 325 | .map(p => "

" + p.text + "

") 326 | .mkString(" ") 327 | if (text.isEmpty) None else Some(text) 328 | } 329 | } 330 | 331 | private def walkSiblings[T](node: Element)(work: (Element) => T): Seq[T] = { 332 | var currentSibling = node.previousElementSibling 333 | val b = mutable.Buffer[T]() 334 | 335 | while (currentSibling != null) { 336 | b += work(currentSibling) 337 | currentSibling = currentSibling.previousElementSibling 338 | } 339 | b 340 | } 341 | 342 | private def addSiblings(topNode: Element, lang: String): Element = { 343 | val baselineScoreForSiblingParagraphs = getBaselineScoreForSiblings(topNode, lang) 344 | val results = walkSiblings(topNode) { currentNode => 345 | getSiblingContent(currentNode, baselineScoreForSiblingParagraphs, lang) 346 | }.reverse.flatten 347 | topNode.child(0).before(results.mkString) 348 | topNode 349 | } 350 | 351 | /** 352 | * we could have long articles that have tons of paragraphs so if we tried to calculate the base score against 353 | * the total text score of those paragraphs it would be unfair. So we need to normalize the score based on the average scoring 354 | * of the paragraphs within the top node. For example if our total score of 10 paragraphs was 1000 but each had an average value of 355 | * 100 then 100 should be our base. 
356 | */ 357 | private def getBaselineScoreForSiblings(topNode: Element, lang: String): Int = { 358 | val nodesToCheck = getChildParagraphs(topNode) 359 | 360 | val scores = nodesToCheck.flatMap { node => 361 | val wordStats = StopWords.stopWordCount(node.text, lang) 362 | if (wordStats.stopWordCount > 2 && !isHighLinkDensity(node)) Some(wordStats.stopWordCount) 363 | else None 364 | } 365 | 366 | if (scores.nonEmpty) scores.sum / scores.length 367 | else Int.MaxValue 368 | } 369 | } 370 | -------------------------------------------------------------------------------- /src/main/scala/com/intenthq/gander/opengraph/OpenGraphData.scala: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright [2014] Robby Pond 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | package com.intenthq.gander.opengraph 17 | 18 | import java.net.URL 19 | 20 | import org.joda.time.DateTime 21 | import org.joda.time.format.ISODateTimeFormat 22 | import org.jsoup.nodes.Element 23 | 24 | import scala.util.Try 25 | 26 | case class OpenGraphData(title: Option[String] = None, 27 | siteName: Option[String] = None, 28 | url: Option[URL] = None, 29 | description: Option[String] = None, 30 | image: Option[URL] = None, 31 | `type`: Option[String] = None, 32 | locale: Option[String] = None, 33 | publishedTime: Option[DateTime] = None) 34 | object OpenGraphData { 35 | 36 | def apply(elem: Element): OpenGraphData = { 37 | def attr(property: String): Option[String] = 38 | Option(elem.select(s"meta[property=$property]").first()).map(_.attr("content")) 39 | def url(x: String) = Try(new URL(x)).toOption 40 | def date(x: String) = Try(ISODateTimeFormat.dateTimeParser.parseDateTime(x)).toOption 41 | 42 | OpenGraphData(title = attr("og:title"), 43 | siteName = attr("og:site_name"), 44 | url = attr("og:url").flatMap(url), 45 | description = attr("og:description"), 46 | image = attr("og:image").flatMap(url), 47 | `type` = attr("og:type"), 48 | locale = attr("og:locale"), 49 | publishedTime = attr("article:published_time").flatMap(date)) 50 | } 51 | 52 | } -------------------------------------------------------------------------------- /src/main/scala/com/intenthq/gander/text/StopWords.scala: -------------------------------------------------------------------------------- 1 | package com.intenthq.gander.text 2 | 3 | import java.util.regex.Pattern 4 | import com.intenthq.gander.utils.FileHelper 5 | import scala.collection.mutable 6 | 7 | object StopWords { 8 | // the confusing pattern below is basically just match any non-word character excluding white-space. 
package com.intenthq.gander.text

import java.util.regex.Pattern
import com.intenthq.gander.utils.FileHelper
import scala.collection.mutable

// ---- StopWords.scala ----

/**
 * Loads per-language stop-word lists from `stopwords-<lang>.txt` resources and counts
 * how many words of a text are stop words.
 */
object StopWords {
  // Matches any character that is not a lower/upper/title-case or "other" letter, a decimal
  // digit, connector punctuation, or whitespace. Those characters are stripped before counting.
  private val punctuationPattern = Pattern.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]")

  // Cache of already loaded stop-word sets, keyed by language name.
  // NOTE(review): plain mutable.Map with no synchronisation - confirm whether callers may
  // hit this object from several threads.
  private val stopWordsMap = mutable.Map.empty[String, Set[String]]

  private def removePunctuation(str: String): String = punctuationPattern.matcher(str).replaceAll("")

  /**
   * Returns the stop words for the given language, loading and caching them on first use.
   *
   * The resource is split on LF or CRLF regardless of the platform the JVM runs on, because
   * the line endings of a bundled resource do not follow the runtime `line.separator`.
   * Blank entries are dropped so that the empty string never counts as a stop word. This
   * also covers the case where the resource could not be read and `loadResourceFile`
   * returned "".
   *
   * @param lname language name used to build the resource name `stopwords-<lname>.txt`
   * @return the set of stop words; empty when the resource is missing or empty
   */
  def stopWords(lname: String): Set[String] =
    stopWordsMap.getOrElseUpdate(lname, {
      val stopWordsFile = "stopwords-%s.txt" format lname
      FileHelper.loadResourceFile(stopWordsFile, StopWords.getClass)
        .split("\\r?\\n")
        .map(_.trim)
        .filter(_.nonEmpty)
        .toSet
    })

  /**
   * Splits already stripped text into candidate words on single space characters.
   * The `language` parameter is currently not used.
   */
  def candidateWords(strippedInput: String, language: String): Array[String] = strippedInput.split(" ")

  /**
   * Counts the stop words contained in `content`.
   *
   * @param content raw text; punctuation is removed before splitting into words
   * @param lang    language whose stop-word list is used, "en" by default
   * @return the matched stop words (lower-cased, in order of appearance, duplicates kept)
   *         together with the number of candidate words
   */
  def stopWordCount(content: String, lang: String = "en"): WordStats = {
    val strippedInput = removePunctuation(content)
    val candidates = candidateWords(strippedInput, lang)
    val stop = stopWords(lang)
    val overlappingStopWords = candidates.map(_.toLowerCase).filter(stop.contains)
    WordStats(overlappingStopWords.toList, candidates.length)
  }
}

// ---- WordStats.scala ----

/**
 * Result of a stop-word count.
 *
 * @param stopWords the stop words that were found
 * @param wordCount number of candidate words that were examined
 */
case class WordStats(stopWords: List[String], wordCount: Int) {
  /**
   * total number of stopwords or good words that we can calculate
   */
  val stopWordCount: Int = stopWords.size

  override def toString: String =
    s"Word statistics: words = $wordCount, stop words = $stopWordCount (${stopWords.mkString(", ")})"
}
package com.intenthq.gander.utils

import java.util.regex.Pattern

import com.google.common.base.Charsets
import com.google.common.io.Resources
import org.jsoup.nodes.{Element, Document}
import org.slf4j.LoggerFactory

import scala.collection.convert.Wrappers.JListWrapper
import scala.util.{Failure, Success, Try}

// ---- FileHelper.scala ----

object FileHelper {

  private val logger = LoggerFactory.getLogger(getClass)

  /**
   * Reads a classpath resource as a UTF-8 string.
   *
   * This is best-effort: any failure is logged as a warning and the empty string is
   * returned, so callers never see an exception for a missing or unreadable resource.
   *
   * @param filename resource name, resolved with `cls.getResource`
   * @param cls      class used to locate the resource
   * @return the resource content, or "" when it is missing or cannot be read
   */
  def loadResourceFile[A](filename: String, cls: Class[A]): String =
    Option(cls.getResource(filename)) match {
      case None =>
        // getResource returns null for an unknown resource; report that explicitly.
        logger.warn(s"Error while reading $filename: resource not found")
        ""
      case Some(url) =>
        Try(Resources.toString(url, Charsets.UTF_8)) match {
          case Success(v) => v
          case Failure(tr) =>
            // Pass the throwable itself (not its toString) so the stack trace is logged.
            logger.warn(s"Error while reading $filename: $tr", tr)
            ""
        }
    }

}

// ---- JSoup.scala ----

/** Thin Scala wrappers around jsoup lookups that expose the results as a `Seq[Element]`. */
object JSoup {
  /** Elements under `elem` found by `getElementsByTag(tag)`. */
  def byTag(tag: String)(implicit elem: Element): Seq[Element] = JListWrapper(elem.getElementsByTag(tag))

  /** Elements of `doc` found by `getElementsByAttributeValueMatching(attr, pattern)`. */
  def byAttrRe(attr: String, pattern: Pattern)(implicit doc: Document): Seq[Element] =
    JListWrapper(doc.getElementsByAttributeValueMatching(attr, pattern))

  /** Elements under `elem` found by `getElementsByAttribute(value)`; `value` is passed as the attribute key. */
  def byAttr(value: String)(implicit elem: Element): Seq[Element] =
    JListWrapper(elem.getElementsByAttribute(value))

  /** Elements under `elem` matching the selector `query`. */
  def select(query: String)(implicit elem: Element): Seq[Element] = JListWrapper(elem.select(query))

  /** Removes `elem` from its parent; does nothing when `elem.parent()` is null. */
  def remove(elem: Element): Unit = Option(elem.parent()).foreach(_ => elem.remove())

}
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/intenthq/gander/3d4896f82694b16707cd07e6dd94c33b4aa022ae/src/test/resources/engineering.intenthq.com_2015_03_what-is-good-code-a-scientific-definition_.gz -------------------------------------------------------------------------------- /src/test/resources/globoesporte.globo.com_futebol_times_sao-paulo_noticia_2012_04_filho-do-gramado-leao-administra-o-sao-paulo-na-base-da-conversa.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intenthq/gander/3d4896f82694b16707cd07e6dd94c33b4aa022ae/src/test/resources/globoesporte.globo.com_futebol_times_sao-paulo_noticia_2012_04_filho-do-gramado-leao-administra-o-sao-paulo-na-base-da-conversa.html.gz -------------------------------------------------------------------------------- /src/test/resources/internacional.elpais.com_internacional_2015_07_28_actualidad_1438076596_960360.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intenthq/gander/3d4896f82694b16707cd07e6dd94c33b4aa022ae/src/test/resources/internacional.elpais.com_internacional_2015_07_28_actualidad_1438076596_960360.html.gz -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # ***** Set root logger level to TRACE and its only appender to stdout. 2 | log4j.rootLogger=TRACE, stdout 3 | 4 | # ***** stdout is set to be a ConsoleAppender. 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.trace.encoding=UTF-8 7 | # ***** stdout uses PatternLayout. 8 | #log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 9 | ## ***** Pattern to output the caller's file name and line number. 
10 | #log4j.appender.stdout.layout.ConversionPattern=%5p [%t] (%F:%L) - %m%n 11 | 12 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 13 | log4j.appender.stdout.layout.ConversionPattern=%p %c - %m%n 14 | 15 | # if you want to see the thread names uncomment this line out 16 | #log4j.appender.stdout.layout.ConversionPattern=%p %t %c - %m%n 17 | 18 | 19 | 20 | #set httpclient debug levels 21 | log4j.logger.org.apache.component=ERROR,stdout 22 | log4j.logger.org.apache.commons.httpclient=ERROR,stdout 23 | #log4j.logger.org.apache.http=ERROR,stdout 24 | #log4j.logger.org.apache.http.wire=ERROR,stdout 25 | #log4j.logger.org.apache.http.client.protocol=ERROR,stdout 26 | #log4j.logger.org.apache.http.impl.conn=ERROR,stdout 27 | 28 | log4j.logger.net.sf.jmimemagic=WARN 29 | -------------------------------------------------------------------------------- /src/test/resources/www.apple.com_watch_.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intenthq/gander/3d4896f82694b16707cd07e6dd94c33b4aa022ae/src/test/resources/www.apple.com_watch_.gz -------------------------------------------------------------------------------- /src/test/resources/www.bbc.co.uk_sport_0_football_34203622.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intenthq/gander/3d4896f82694b16707cd07e6dd94c33b4aa022ae/src/test/resources/www.bbc.co.uk_sport_0_football_34203622.gz -------------------------------------------------------------------------------- /src/test/resources/www.bbc.com_news_business-33697945.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intenthq/gander/3d4896f82694b16707cd07e6dd94c33b4aa022ae/src/test/resources/www.bbc.com_news_business-33697945.gz -------------------------------------------------------------------------------- 
/src/test/resources/www.businessinsider.com_goldman-on-the-fed-announcement-2011-9.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intenthq/gander/3d4896f82694b16707cd07e6dd94c33b4aa022ae/src/test/resources/www.businessinsider.com_goldman-on-the-fed-announcement-2011-9.gz -------------------------------------------------------------------------------- /src/test/resources/www.corriere.it_cronache_15_luglio_29_relazione-alfano-mafia-fatti-gravi-sindaco-ha-sottovalutato-25146a6c-35b0-11e5-b050-7dc71ce7db4c.shtml.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intenthq/gander/3d4896f82694b16707cd07e6dd94c33b4aa022ae/src/test/resources/www.corriere.it_cronache_15_luglio_29_relazione-alfano-mafia-fatti-gravi-sindaco-ha-sottovalutato-25146a6c-35b0-11e5-b050-7dc71ce7db4c.shtml.gz -------------------------------------------------------------------------------- /src/test/resources/www.dailymail.co.uk_news_article-486484_A-spectacular-destruction-How-email-led-downfall-barrister-all.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intenthq/gander/3d4896f82694b16707cd07e6dd94c33b4aa022ae/src/test/resources/www.dailymail.co.uk_news_article-486484_A-spectacular-destruction-How-email-led-downfall-barrister-all.html.gz -------------------------------------------------------------------------------- /src/test/resources/www.fcbarcelona.com_club_detail_article_30-years-since-visit-of-pope-john-paul-ii.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intenthq/gander/3d4896f82694b16707cd07e6dd94c33b4aa022ae/src/test/resources/www.fcbarcelona.com_club_detail_article_30-years-since-visit-of-pope-john-paul-ii.gz -------------------------------------------------------------------------------- 
/src/test/resources/www.lancenet.com.br_sao-paulo_Leao-Arena-Barueri-casa-Tricolor_0_675532605.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intenthq/gander/3d4896f82694b16707cd07e6dd94c33b4aa022ae/src/test/resources/www.lancenet.com.br_sao-paulo_Leao-Arena-Barueri-casa-Tricolor_0_675532605.html.gz -------------------------------------------------------------------------------- /src/test/scala/com/intenthq/gander/ContentExtractorSpec.scala: -------------------------------------------------------------------------------- 1 | package com.intenthq.gander 2 | 3 | import com.intenthq.gander.extractors.ContentExtractor._ 4 | import org.jsoup.Jsoup 5 | import org.specs2.mutable.Specification 6 | 7 | class ContentExtractorSpec extends Specification { 8 | "extractTitle" >> { 9 | def docFromTitle(title: String) = Jsoup.parse(s"$title") 10 | "should extract a title" >> { 11 | val title = "the title" 12 | extractTitle(docFromTitle(title)) must_== title 13 | } 14 | "should extract an empty title" >> { 15 | val title = "" 16 | extractTitle(docFromTitle(title)) must_== title 17 | } 18 | } 19 | 20 | "extractCanonicalLink" >> { 21 | "should return none if no link found" >> { 22 | val html = 23 | """ 24 | | 25 | | 26 | |""".stripMargin 27 | extractCanonicalLink(Jsoup.parse(html)) must beNone 28 | } 29 | 30 | "should extract the canonical link from the meta tag" >> { 31 | val html = 32 | """ 33 | | 34 | | 35 | | 36 | | 37 | | 38 | |""".stripMargin 39 | extractCanonicalLink(Jsoup.parse(html)) must beSome("http://example.com/canonical") 40 | } 41 | "should extract the facebook og:url meta tag" >> { 42 | val html = 43 | """ 44 | | 45 | | 46 | | 47 | | 48 | |""".stripMargin 49 | extractCanonicalLink(Jsoup.parse(html)) must beSome("http://example.com/og") 50 | } 51 | "should extract the twitter:url meta tag" >> { 52 | val html = 53 | """ 54 | | 55 | | 56 | | 57 | |""".stripMargin 58 | 
extractCanonicalLink(Jsoup.parse(html)) must beSome("http://example.com/twitter") 59 | } 60 | } 61 | 62 | "extractLang" >> { 63 | "should extract lang from html tag and give priority to it" >> { 64 | val html = 65 | """ 66 | | 67 | | 68 | | 69 | | 70 | |""".stripMargin 71 | 72 | extractLang(Jsoup.parse(html)) must beSome("ca") 73 | } 74 | "should extract language from meta tag with more priority than og:locale" >> { 75 | val html = 76 | """ 77 | | 78 | | 79 | | 80 | | 81 | |""".stripMargin 82 | 83 | extractLang(Jsoup.parse(html)) must beSome("ca") 84 | } 85 | "should extract language from og:locale" >> { 86 | val html = 87 | """ 88 | | 89 | | 90 | | 91 | |""".stripMargin 92 | 93 | extractLang(Jsoup.parse(html)) must beSome("ca") 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/test/scala/com/intenthq/gander/DocumentCleanerSpec.scala: -------------------------------------------------------------------------------- 1 | package com.intenthq.gander 2 | 3 | import org.jsoup.Jsoup 4 | import org.specs2.mutable.Specification 5 | 6 | class DocumentCleanerSpec extends Specification { 7 | 8 | def html(body: String) = s"$body" 9 | 10 | def check(actual: String, expected: String) = { 11 | val doc = Jsoup.parse(html(actual)) 12 | DocumentCleaner.clean(doc).toString.replaceAll(" +", " ") must_== Jsoup.parse(html(expected)).toString.replaceAll(" +", " ") 13 | } 14 | 15 | "clean" >> { 16 | "should clean em, strong, b, i, strike, del and ins tags" >> { 17 | val body = """

This is a paragraph with emphasis, strong, 18 | | bold, italics, strike, deleted 19 | | and insterted elements.

""".stripMargin 20 | val expected = """

This is a paragraph with emphasis, strong, 21 | | bold, italics, strike, deleted 22 | | and insterted elements.

""".stripMargin 23 | check(body, expected) 24 | } 25 | 26 | "should remove naughty tags" >> { 27 | val body = """

This is a paragraph.

28 | |

to remove

29 | |

hey

30 | |

to remove

31 | |

hey

32 | |

to remove

33 | |

hey

""".stripMargin 34 | val expected = """

This is a paragraph.

hey

hey

hey

""".stripMargin 35 | 36 | check(body, expected) 37 | } 38 | 39 | "should remove style and script tags" >> { 40 | val body = """

This is a paragraph with a 41 | |strong element.

""".stripMargin 42 | val expected = """

This is a paragraph with a strong element.

""".stripMargin 43 | 44 | check(body, expected) 45 | } 46 | 47 | "should remove style and script tags" >> { 48 | val body = """

This is a paragraph with a 49 | |strong element.

""".stripMargin 50 | val expected = """

This is a paragraph with a strong element.

""".stripMargin 51 | 52 | check(body, expected) 53 | } 54 | 55 | "should remove tags with caption as class or id" >> { 56 | val body = """

This is a paragraph

to remove

57 | |

to remove

hey

""".stripMargin 58 | val expected = """

This is a paragraph

hey

""".stripMargin 59 | 60 | check(body, expected) 61 | } 62 | 63 | "should remove tags with google as class or id" >> { 64 | val body = """

This is a paragraph

to remove

65 | |

to remove

hey

""".stripMargin 66 | val expected = """

This is a paragraph

hey

""".stripMargin 67 | 68 | check(body, expected) 69 | } 70 | 71 | "should remove tags with facebook as class or id" >> { 72 | val body = """

This is a paragraph

to remove

73 | |

to remove

hey

""".stripMargin 74 | val expected = """

This is a paragraph

hey

""".stripMargin 75 | 76 | check(body, expected) 77 | } 78 | 79 | "should remove tags with twitter as class or id" >> { 80 | val body = """

This is a paragraph

to remove

81 | |

to remove

hey

""".stripMargin 82 | val expected = """

This is a paragraph

hey

""".stripMargin 83 | 84 | check(body, expected) 85 | } 86 | 87 | "should clean span tags inside paragraphs" >> { 88 | val body = """

This is a paragraph in a span 89 | |and more s1s2

""".stripMargin 90 | val expected = """

This is a paragraph in a span and more s1s2

""".stripMargin 91 | 92 | check(body, expected) 93 | } 94 | } 95 | 96 | } 97 | -------------------------------------------------------------------------------- /src/test/scala/com/intenthq/gander/GanderSpec.scala: -------------------------------------------------------------------------------- 1 | package com.intenthq.gander 2 | 3 | import java.io.InputStreamReader 4 | import java.net.URL 5 | import java.nio.charset.Charset 6 | import java.util.zip.GZIPInputStream 7 | 8 | import com.google.common.base.Charsets 9 | import com.google.common.io.CharStreams 10 | import com.intenthq.gander.opengraph.OpenGraphData 11 | import org.joda.time.DateTime 12 | import org.specs2.mutable.Specification 13 | 14 | class GanderSpec extends Specification { 15 | 16 | def extract(url: String, charset: Charset = Charsets.UTF_8): PageInfo = { 17 | val resource = getClass.getResourceAsStream("/" + url.stripPrefix("http://").replace('/', '_') + ".gz") 18 | val rawHTML = CharStreams.toString(new InputStreamReader(new GZIPInputStream(resource), charset)) 19 | Gander.extract(rawHTML).get 20 | } 21 | 22 | def check(pageInfo: PageInfo, title: String, processedTitle: String, metaDescription: String, metaKeywords: String, 23 | lang: Option[String], date: Option[String], content: String, url: String, links: Seq[Link]) = { 24 | pageInfo.title must_== title 25 | pageInfo.processedTitle must_== processedTitle 26 | pageInfo.metaDescription must_== metaDescription 27 | pageInfo.metaKeywords must_== metaKeywords 28 | pageInfo.lang must_== lang 29 | pageInfo.publishDate must_== date.map(DateTime.parse(_).toDate) 30 | pageInfo.cleanedText.get must startWith(content) 31 | pageInfo.canonicalLink.map( _ must_== url).getOrElse(1 must_== 1) 32 | pageInfo.links must_== links 33 | } 34 | 35 | "UTF-8 encoding of unicode non breaking char must be sanitised as a space" >> { 36 | //Some pages (like the Apple Watch one) contain this char instead of a space 37 | //For more info check 
https://en.wikipedia.org/wiki/Non-breaking_space 38 | val url = "http://www.apple.com/watch/" 39 | extract(url).cleanedText.get must contain("Apple Watch") 40 | } 41 | 42 | "intenthq" >> { 43 | val url = "http://engineering.intenthq.com/2015/03/what-is-good-code-a-scientific-definition/" 44 | check(extract(url), 45 | url = url, 46 | content = "Here at Intent HQ we believe how important it is to write good code. Why? First, because writing good code is much cheaper and more fun than writing bad code. Second, because if you write good code chances are that the product you are building will be much better. Third, and more important, because writing good code is what we are supposed to do: after all, we are getting paid for doing our job well", 47 | title = "What is good code? A scientific definition. - Intent HQ Engineering blog", 48 | processedTitle = "What is good code? A scientific definition.", 49 | metaDescription = "How would you define good code? This article gives a pseudo-scientific answer to that question after asking a sample of 65 developers that same question.", 50 | metaKeywords = "", 51 | lang = Some("en-GB"), 52 | date = Some("2015-03-01"), 53 | links = List(Link("Uncle Bob", "http://en.wikipedia.org/wiki/Robert_Cecil_Martin"), 54 | Link("DRY", "http://en.wikipedia.org/wiki/Don%27t_repeat_yourself"))) 55 | } 56 | 57 | "bbc.com" >> { 58 | val url = "http://www.bbc.com/news/business-33697945" 59 | check(extract(url), 60 | url = url, 61 | content = "Disneyland Paris is facing a pricing probe following accusations that UK and German customers are being frozen out of certain price promotions.", 62 | title = "Disneyland Paris faces pricing probe - BBC News", 63 | processedTitle = "Disneyland Paris faces pricing probe", 64 | metaDescription = "Disneyland Paris is facing a pricing probe following accusations that UK and German customers are being frozen out of promotions available in other European member states.", 65 | metaKeywords = "", 66 | lang = 
Some("en"), 67 | date = None, 68 | links = List(Link("Financial Times said", "http://www.ft.com/cms/s/0/27e42c8e-351d-11e5-b05b-b01debd57852.html#axzz3hDFfsPCX"), 69 | Link("said in a report", "http://www.ft.com/cms/s/0/27e42c8e-351d-11e5-b05b-b01debd57852.html#axzz3hDFfsPCX"))) 70 | 71 | } 72 | 73 | "bbc.co.uk" >> { 74 | val url = "http://www.bbc.co.uk/sport/0/football/34203622" 75 | check(extract(url), 76 | url = url, 77 | content = "Manchester City's Champions League campaign got off to a disappointing start with a home defeat by last season's runners-up Juventus. City, who have struggled to make a serious impact in the Champions League and have never won their opening home game in the group stage, looked to be on course for victory when Juventus defender Giorgio Chiellini headed into his own net under pressure from Vincent Kompany.", 78 | title = "BBC Sport - Alvaro Morata & Mario Mandzukic score as Juventus shock Man City", 79 | processedTitle = "Alvaro Morata & Mario Mandzukic score as Juventus shock Man City", 80 | metaDescription = "Manchester City concede two goals in the last 20 minutes as Juventus fight back from a goal down to win at Etihad Stadium.", 81 | metaKeywords = "BBC, Sport, BBC Sport, bbc.co.uk, world, uk, international, foreign, british, online, service", 82 | lang = Some("en-GB"), 83 | date = Some("2015-09-12T18:06:48+00:00"), 84 | links = List(Link("- they reached last season's final -", "http://www.bbc.co.uk/sport/0/football/33010277"), 85 | Link("£49m move to Manchester City from Liverpool -", "http://www.bbc.co.uk/sport/0/football/33497488")) 86 | ) 87 | } 88 | 89 | "businessinsider" >> { 90 | val url = "http://www.businessinsider.com/goldman-on-the-fed-announcement-2011-9" 91 | check(extract(url), 92 | url = url, 93 | content = "From Goldman on the FOMC operation twist announcement: ------------- 1. 
As we had expected, the Federal Open Market Committee decided to \"do the twist\" and increase the duration of its securities holdings by selling shorter-maturity securities ($400bn of Treasuries with maturity of 3 years or less)", 94 | title = "GOLDMAN: 4 Key Points On The FOMC Announcement - Business Insider", 95 | processedTitle = "GOLDMAN: 4 Key Points On The FOMC Announcement", 96 | metaDescription = "Here it is.", 97 | metaKeywords = "", 98 | lang = Some("en"), 99 | date = Some("2011-09-21"), 100 | links = List(Link("announcement", "http://www.businessinsider.com/federal-reserve-announcement-fomc-operation-twist-2011-9"))) 101 | } 102 | 103 | "elpais" >> { 104 | val url = "http://internacional.elpais.com/internacional/2015/07/28/actualidad/1438076596_960360.html" 105 | check(extract(url), 106 | url = url, 107 | content = "Los aliados de la OTAN ofrecieron este martes respaldo político a Turquía en su ofensiva contra el Estado Islámico tras una reunión convocada de urgencia por el Gobierno de Ankara.", 108 | title = "La OTAN apoya con cautela la ofensiva turca contra el yihadismo | Internacional | EL PAÍS" , 109 | processedTitle = "La OTAN apoya con cautela la ofensiva turca contra el yihadismo", 110 | metaDescription = "La Alianza se ha reunido este martes con carácter de urgencia a pedición de Ankara para tratar el avance del Estado Islámico", 111 | metaKeywords = "otan, apoyar, cautela, ofensiva, turca, turco, yihadismo, alianza, haber, reunir, martes, urgencia, pedición, ankara, secretario, general, jens stoltenberg, resaltar, unidad, aliado", 112 | lang = Some("es"), 113 | date = Some("2015-07-29"), 114 | links = List(Link("en su ofensiva contra el Estado Islámico", "http://internacional.elpais.com/internacional/2015/07/24/actualidad/1437717227_199769.html"), 115 | Link("Jens Stoltenberg.", "http://elpais.com/tag/jens_stoltenberg/a/"), 116 | Link("que este martes hizo estallar un tramo de un gasoducto procedente de Irán", 
"http://internacional.elpais.com/internacional/2015/07/28/actualidad/1438079899_805996.html"), 117 | Link("onflicto entre Ankara y los simpatizantes del PKK", "http://internacional.elpais.com/internacional/2015/07/27/actualidad/1437986632_361510.html"), 118 | Link("crear una zona libre de combatientes del EI", "http://internacional.elpais.com/internacional/2015/07/27/actualidad/1438026945_461718.html"), 119 | Link("Ahmet Davutoglu", "http://elpais.com/tag/ahmet_davutoglu/a/"))) 120 | } 121 | 122 | "corriere" >> { 123 | val url = "http://www.corriere.it/cronache/15_luglio_29/relazione-alfano-mafia-fatti-gravi-sindaco-ha-sottovalutato-25146a6c-35b0-11e5-b050-7dc71ce7db4c.shtml" 124 | check(extract(url, Charsets.ISO_8859_1), 125 | url = url, 126 | content = "ROMA La strada è tracciata, la relazione potrebbe arrivare a Palazzo Chigi prima della pausa estiva. Il ministro dell’Interno Angelino Alfano non proporrà lo scioglimento per mafia del comune di Roma, ma nella relazione al governo", 127 | title = "La relazione di Alfano sulla mafia: fatti gravi, il sindaco ha sottovalutato - Corriere.it", 128 | processedTitle = "La relazione di Alfano sulla mafia: fatti gravi, il sindaco ha sottovalutato", 129 | metaDescription = "Non si propone lo scioglimento ma si lascia aperta la possibilità di una «diversa valutazione»", 130 | metaKeywords = "Ignazio Marino, Angelino Alfano", 131 | lang = Some("it"), 132 | date = None, 133 | links = List(Link("giunta guidata da Ignazio Marino", "http://roma.corriere.it/notizie/politica/15_luglio_28/giunta-marino-senatore-no-tav-esposito-assessore-trasporti-d0e76efa-34fe-11e5-984f-1e10ffe171ae.shtml"))) 134 | 135 | } 136 | 137 | "lemonde" >> { 138 | // val url = "http://www.lemonde.fr/football/article/2015/07/23/pep-guardiola-un-as-dans-la-manche-des-independantistes_4695701_1616938.html" 139 | // check(extract(url), 140 | // url = url, 141 | // content = "Dans la planète Barça, Pep Guardiola est un demi-dieu. 
Entraîneur du FC Barcelone entre 2008 et 2012, il a fait remporter aux Blaugrana 14 titres officiels. Dont six en une seule année : 2009", 142 | // title = "En Catalogne, Pep Guardiola, figure du Barça, se présente sur la liste indépendantiste", 143 | // processedTitle = "En Catalogne, Pep Guardiola, figure du Barça, se présente sur la liste indépendantiste", 144 | // metaDescription = "L’ancien entraîneur du FC Barcelone devrait clore la liste unitaire visant à exiger l’indépendance de la Catalogne lors des élections du 27 septembre.", 145 | // metaKeywords = "", 146 | // lang = Some("fr"), 147 | // date = Some("2015-07-23T15:57:46"), 148 | // links = List.empty) 149 | pending 150 | } 151 | 152 | "globoesporte" >> { 153 | val url = "http://globoesporte.globo.com/futebol/times/sao-paulo/noticia/2012/04/filho-do-gramado-leao-administra-o-sao-paulo-na-base-da-conversa.html" 154 | check(extract(url), 155 | url = url, 156 | content = "Emerson Leão não foi ao campo na manhã desta terça-feira no centro de treinamento do São Paulo", 157 | title = "'Filho do gramado', Leão administra o São Paulo na base da conversa | globoesporte.com", 158 | processedTitle = "'Filho do gramado', Leão administra o São Paulo na base da conversa", 159 | metaDescription = "Emerson Leão cobra liderança ao São Paulo (Foto: Mário Ângelo / Ag. Estado) Emerson Leão não foi ao campo na manhã desta terça-feira no centro de treinamento do São Paulo. 
Bem humorado e com roupa casual, preferiu acompanhar de longe ...", 160 | metaKeywords = "notícias, notícia, são paulo", 161 | lang = None, 162 | date = Some("2012-04-03T13:49"), 163 | links = List()) 164 | } 165 | 166 | "opengraph" >> { 167 | val url = "http://internacional.elpais.com/internacional/2015/07/28/actualidad/1438076596_960360.html" 168 | 169 | extract(url).openGraphData must_== 170 | OpenGraphData(title = Some("La OTAN apoya con cautela la ofensiva turca contra el yihadismo"), 171 | siteName = Some("EL PAÍS"), 172 | url = Some(new URL(url)), 173 | description = Some("La Alianza se ha reunido este martes con carácter de urgencia a pedición de Ankara para tratar el avance del Estado Islámico"), 174 | image = Some(new URL("http://ep00.epimg.net/internacional/imagenes/2015/07/28/actualidad/1438076596_960360_1438078067_noticia_normal.jpg")), 175 | `type` = Some("article"), 176 | locale = None, 177 | publishedTime = Some(new DateTime(2015, 7, 29, 0, 0))) 178 | 179 | } 180 | 181 | "fcbarcelona" >> { 182 | val url = "http://www.fcbarcelona.com/club/detail/article/30-years-since-visit-of-pope-john-paul-ii" 183 | 184 | check(extract(url), 185 | url = url, 186 | content = "On November 7, 1982, the Camp Nou enjoyed a historic moment.", 187 | title = "30 years since visit of Pope John Paul II | FC Barcelona", 188 | processedTitle = "30 years since visit of Pope John Paul II | FC Barcelona", 189 | metaDescription = "This Wednesday is the 30th anniversary of mass given by Pope John Paul at the Camp Nou", 190 | metaKeywords = "Josep Lluís Núñez, Camp Nou, Club, Season 2012-2013", 191 | lang = Some("en"), 192 | date = None, 193 | links = List() 194 | ) 195 | } 196 | 197 | "Daily Mail (date is malformed + publish_date misused)" >> { 198 | val url = "http://www.dailymail.co.uk/news/article-486484/A-spectacular-destruction-How-email-led-downfall-barrister-all.html" 199 | 200 | check(extract(url), 201 | url = url, 202 | content = "by PAUL BRACCHI Last updated at 01:01 09 
October 2007 An Oxford First, a brilliant radio career and newly qualified as a barrister, Bruce Hyman seemed to have all life's gifts", 203 | title = "A spectacular destruction: How one email led to the downfall of a barrister who had it all | Daily Mail Online", 204 | processedTitle = "A spectacular destruction: How one email led to the downfall of a barrister who had it all", 205 | metaDescription = "An Oxford First, a brilliant radio career and newly qualified as a barrister, Bruce Hyman seemed to have all life's gifts - until a moment of utter madness put him behind bars and left his life in ruins", 206 | metaKeywords = "A,spectacular,destruction,How,email,led,downfall,barrister,all", 207 | lang = Some("en"), 208 | date = None, 209 | links = List() 210 | ) 211 | } 212 | 213 | } 214 | -------------------------------------------------------------------------------- /src/test/scala/com/intenthq/gander/extractors/ContentExtractorSpec.scala: -------------------------------------------------------------------------------- 1 | package com.intenthq.gander.extractors 2 | 3 | import org.joda.time.DateTime 4 | import org.specs2.mutable.Specification 5 | 6 | class ContentExtractorSpec extends Specification { 7 | 8 | def date(year: Int, month: Int, day: Int) = Some(new DateTime(year, month, day, 0, 0).toDate) 9 | 10 | "extractDateFromURLUnsafe" >> { 11 | " should extract the date from the path, if present" >> { 12 | ContentExtractor.extractDateFromURL("http://a.com/no/date/in/this/path") must_== None 13 | ContentExtractor.extractDateFromURL("http://a.com/not/every/number/1900/is/a/date") must_== None 14 | ContentExtractor.extractDateFromURL("http://a.com/number/2000a/plus/letters") must_== None 15 | 16 | ContentExtractor.extractDateFromURL("http://a.com/a/year/2000/and/nothing/else") must_== date(2000, 1, 1) 17 | ContentExtractor.extractDateFromURL("http://a.com/a/year/2000/and/10/not/a/month") must_== date(2000, 1, 1) 18 | 
package com.intenthq.gander.extractors

import org.joda.time.DateTime
import org.specs2.mutable.Specification

/** Examples for `ContentExtractor.extractDateFromURL` and `ContentExtractor.processTitle`. */
class ContentExtractorSpec extends Specification {

  // Expected value: midnight of the given day converted with toDate, wrapped in Some.
  def date(year: Int, month: Int, day: Int) = Some(new DateTime(year, month, day, 0, 0).toDate)

  // Short aliases so each example reads as "input must_== expected".
  private def dateIn(url: String) = ContentExtractor.extractDateFromURL(url)
  private def processed(rawTitle: String, canonical: Option[String]) =
    ContentExtractor.processTitle(rawTitle, canonical)

  "extractDateFromURLUnsafe" >> {
    " should extract the date from the path, if present" >> {
      // paths without a usable year
      dateIn("http://a.com/no/date/in/this/path") must_== None
      dateIn("http://a.com/not/every/number/1900/is/a/date") must_== None
      dateIn("http://a.com/number/2000a/plus/letters") must_== None

      // year only
      dateIn("http://a.com/a/year/2000/and/nothing/else") must_== date(2000, 1, 1)
      dateIn("http://a.com/a/year/2000/and/10/not/a/month") must_== date(2000, 1, 1)
      dateIn("http://a.com/a/year/2000/13/not/a/month") must_== date(2000, 1, 1)

      // year and month
      dateIn("http://a.com/a/year/2000/10/and/a/month") must_== date(2000, 10, 1)
      dateIn("http://a.com/not/2000/10/a/20/day") must_== date(2000, 10, 1)
      dateIn("http://a.com/not/2000/10/32/a/day") must_== date(2000, 10, 1)

      // year, month and day
      dateIn("http://a.com/not/2000/10/31/a/day") must_== date(2000, 10, 31)
    }
  }

  "processTitle" >> {
    " should keep the raw title if there is no canonical" >> {
      processed("This is the title", None) must_== "This is the title"
    }

    " should keep the raw title if the domain name is not contained in the title" >> {
      val canonical = Some("http://something.com")
      processed("This is the title | Not related", canonical) must_== "This is the title | Not related"
    }

    " should remove the part of the title that contains the site name" >> {
      val canonical = Some("http://www.bbc.co.uk")
      processed("This is the title | BBC News", canonical) must_== "This is the title"
    }

    " should remove the part of the title that contains the site name, even if it's two words" >> {
      val canonical = Some("http://www.businessinsider.com")
      processed("Business Insider | This is the title", canonical) must_== "This is the title"
    }

    " should split the title by a dash" >> {
      val canonical = Some("http://www.bbc.co.uk")
      processed("This is the title - BBC News", canonical) must_== "This is the title"
    }

    " should match the title even if it uses character variations" >> {
      val canonical = Some("http://www.elpais.com")
      processed("This is the title - El País", canonical) must_== "This is the title"
    }
  }
}
/src/test/scala/com/intenthq/gander/opengraph/OpenGraphDataSpec.scala:
--------------------------------------------------------------------------------
package com.intenthq.gander.opengraph

import java.net.URL

import org.joda.time.DateTime
import org.jsoup.Jsoup
import org.specs2.mutable.Specification

/**
 * Examples for `OpenGraphData.apply`: each one parses a small page carrying a
 * single Open Graph `<meta>` tag and checks the corresponding extracted field.
 */
class OpenGraphDataSpec extends Specification {

  /**
   * Builds a minimal HTML page whose head contains exactly one Open Graph
   * meta tag.
   *
   * @param property value of the tag's `property` attribute (e.g. `og:title`)
   * @param value    value of the tag's `content` attribute
   * @return the page markup as a string
   */
  def html(property: String, value: String) =
    // The markup must actually embed both arguments: a blank page would leave
    // every extraction example below with nothing to find.
    s"""<html>
       |<head><meta property="$property" content="$value" /></head>
       |<body></body>
       |</html>""".stripMargin

  "apply" >> {
    "should extract no tags correctly" >> {
      // An empty document must yield the all-defaults instance.
      val elem = Jsoup.parse("")
      OpenGraphData(elem) must_== OpenGraphData()
    }
    "should extract an empty tag correctly" >> {
      // A tag that is present with empty content is Some(""), not None.
      val elem = Jsoup.parse(html("og:description", ""))
      OpenGraphData(elem).description must beSome("")
    }
    "should extract the og:title correctly" >> {
      val elem = Jsoup.parse(html("og:title", "the title"))
      OpenGraphData(elem).title must beSome("the title")
    }
    "should extract the og:site_name correctly" >> {
      val elem = Jsoup.parse(html("og:site_name", "the site name"))
      OpenGraphData(elem).siteName must beSome("the site name")
    }
    "should extract the og:url correctly" >> {
      val elem = Jsoup.parse(html("og:url", "http://example.com"))
      OpenGraphData(elem).url must beSome(new URL("http://example.com"))
    }
    "should return none if the og:url is not a valid url" >> {
      val elem = Jsoup.parse(html("og:url", "not a valid url"))
      OpenGraphData(elem).url must beNone
    }
    "should extract the og:description correctly" >> {
      val elem = Jsoup.parse(html("og:description", "the desc"))
      OpenGraphData(elem).description must beSome("the desc")
    }
    "should extract the og:image correctly" >> {
      val elem = Jsoup.parse(html("og:image", "http://example.com/image.png"))
      OpenGraphData(elem).image must beSome(new URL("http://example.com/image.png"))
    }
    "should return none if the og:image is not a valid url" >> {
      val elem = Jsoup.parse(html("og:image", "not a valid url"))
      OpenGraphData(elem).image must beNone
    }
    "should extract the og:type correctly" >> {
      val elem = Jsoup.parse(html("og:type", "the type"))
      OpenGraphData(elem).`type` must beSome("the type")
    }
    "should extract the og:locale correctly" >> {
      val elem = Jsoup.parse(html("og:locale", "the locale"))
      OpenGraphData(elem).locale must beSome("the locale")
    }
    "should extract the article:published_time correctly" >> {
      // A date-only value is expected to come back as that day at 00:00.
      val elem = Jsoup.parse(html("article:published_time", "2015-07-31"))
      OpenGraphData(elem).publishedTime must beSome(new DateTime(2015, 7, 31, 0, 0))
    }
    "should return none the article:published_time is not a valid date" >> {
      val elem = Jsoup.parse(html("article:published_time", "not a valid date"))
      OpenGraphData(elem).publishedTime must beNone
    }
  }
}

--------------------------------------------------------------------------------
/src/test/scala/com/intenthq/gander/text/StopWordsTest.scala:
--------------------------------------------------------------------------------
package com.intenthq.gander.text

import org.specs2.mutable.Specification

/**
 * Examples for `StopWords.stopWordCount`, checking both the number of stop
 * words found in a text and the list of words that were recognised.
 */
class StopWordsTest extends Specification {
  "StopWords" >> {
    " should find how many stopwords are there" >> {
      StopWords.stopWordCount("blah blah blah").stopWordCount must_== 0
      // Only "although" is expected to count here; "de" is not.
      StopWords.stopWordCount("although blah de blah").stopWordCount must_== 1
    }

    " should determine which words are stopwords" >> {
      StopWords.stopWordCount("although blah de blah").stopWords must_== List("although")
      StopWords.stopWordCount("blah de blah").stopWords must_== List.empty
    }
  }
}

--------------------------------------------------------------------------------
/src/test/scala/com/intenthq/gander/utils/FileHelperTest.scala:
--------------------------------------------------------------------------------
package com.intenthq.gander.utils

import com.intenthq.gander.text.StopWords
import org.specs2.mutable.Specification

/**
 * Examples for `FileHelper.loadResourceFile`, resolving resource names
 * against the class of the `StopWords` object.
 */
class FileHelperTest extends Specification {

  /** Loads the named resource through `FileHelper`, anchored at `StopWords`. */
  private def load(resourceName: String) =
    FileHelper.loadResourceFile(resourceName, StopWords.getClass)

  "FileHelper" >> {
    " should load file contents" >> {
      val contents = load("stopwords-en.txt")
      contents must startWith("a's")
    }
    " should return empty if the file doesn't exist" >> {
      // A missing resource is expected to give empty content, not an exception.
      val contents = load("stopwords-nonexistant.txt")
      contents must beEmpty
    }
  }
}

--------------------------------------------------------------------------------
/version.sbt:
--------------------------------------------------------------------------------
version in ThisBuild := "1.4"

--------------------------------------------------------------------------------