├── .gitignore ├── .mocha ├── .npmignore ├── .travis.yml ├── LICENSE ├── README.md ├── data └── stopwords.json ├── examples ├── ntv.txt ├── spiegel.txt ├── venturebeat.txt └── waitbutwhy.txt ├── package.json ├── src ├── lib │ ├── data_structures │ │ ├── phrase.ts │ │ ├── string_counter.ts │ │ ├── string_dictionary.ts │ │ └── word_matrix.ts │ ├── index.ts │ ├── rake.ts │ ├── stopwords │ │ ├── dutch.ts │ │ ├── english.ts │ │ ├── german.ts │ │ ├── italian.ts │ │ ├── portugese.ts │ │ ├── spanish.ts │ │ └── swedish.ts │ └── tools │ │ ├── guess_language.ts │ │ ├── parser.ts │ │ ├── preprocessor.ts │ │ ├── stemmer.ts │ │ ├── stoplist.ts │ │ └── strip.ts └── test │ ├── data_structures │ ├── string_counter.ts │ └── string_dictionary.ts │ ├── rake.ts │ └── tools │ ├── guess_language.ts │ ├── preprocessor.ts │ ├── stemmer.ts │ └── strip.ts ├── tsconfig.json └── tslint.json /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist -------------------------------------------------------------------------------- /.mocha: -------------------------------------------------------------------------------- 1 | dist/test/**/*.js 2 | --ui mocha-typescript 3 | --require source-map-support/register -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | data -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "node" 4 | - "7" 5 | - "6" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 
5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 
46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 
80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. 
If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RAKE.js 2 | 3 | A pure JS implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm. Put in any text corpus, get back a bunch of keyphrases and keywords. 
4 | 5 | [![TypeScript](https://badges.frapsoft.com/typescript/code/typescript.svg?v=101)](https://github.com/ellerbrock/typescript-badges/) 6 | [![Build Status](https://travis-ci.org/Anonyfox/rake-js.svg?branch=master)](https://travis-ci.org/Anonyfox/rake-js) 7 | [![styled with prettier](https://img.shields.io/badge/styled_with-prettier-ff69b4.svg)](https://github.com/prettier/prettier) 8 | [![License: LGPL v3](https://img.shields.io/badge/License-LGPL%20v3-blue.svg)](http://www.gnu.org/licenses/lgpl-3.0) 9 | 10 | ## Currently supported languages: 11 | 12 | - english 13 | - german 14 | - spanish 15 | - italian 16 | - dutch 17 | - portugese 18 | - swedish 19 | 20 | More languages are fairly easy to add, see the stoplist module for details. 21 | 22 | ## How to use 23 | 24 | Without any further options: 25 | 26 | ````javascript 27 | import rake from 'rake-js' 28 | 29 | const myKeywords = rake(someTextContent) // ['keyword1', ...] 30 | ```` 31 | 32 | When the language is known in advance (faster execution): 33 | 34 | ````javascript 35 | import rake from 'rake-js' 36 | 37 | const myKeywords = rake(someTextContent, { language: 'english' }) 38 | ```` 39 | 40 | When the corpus is divided by something other than whitespace (e.g. `;`): 41 | 42 | ````javascript 43 | import rake from 'rake-js' 44 | 45 | const myKeywords = rake(someTextContent, { delimiters: [';+'] }) 46 | ```` 47 | 48 | ## Implementation Details 49 | 50 | This algorithm is *fast*, compared with other approaches like TextRank. The results are surprisingly good for a cross-language algorithm, and the truly relevant keywords / phrases are included in the result in most cases. For more details about the RAKE algorithm, read [the original paper](https://www.researchgate.net/publication/227988510_Automatic_Keyword_Extraction_from_Individual_Documents). 51 | 52 | There are still rough edges in the code, but I tried to translate the abstract algorithm into a solid software package, tested and typesafe. 
Actually I wrote this thing because I was very disappointed with all the existing solutions on NPM, and I hope this repository is easier to contribute to in the future. 53 | 54 | ## Roadmap: 55 | 56 | - [ ] support more languages (only a handful are whitelisted for now) 57 | - [ ] duplicate keyword filtering 58 | - [ ] check browser compatibility 59 | 60 | ## LICENSE: 61 | 62 | LGPL-3.0. 63 | 64 | You can use this package in all your free or commercial products without any issues, but I want bugfixes and improvements to this algorithm to flow back into the public code repository. -------------------------------------------------------------------------------- /examples/ntv.txt: -------------------------------------------------------------------------------- 1 | Wissen Der Hauptspiegel des ELT wird fünf Mal größer sein als bei den heute stärksten Teleskopen.(Foto: L. Calçada/European Southern Observatory/dpa) In Chile wird auf einer Bergkuppe das größte optische Teleskop der Welt errichtet. Ab 2024 soll es neue Einblicke in ferne Galaxien ermöglichen, Dunkle Materie untersuchen und eine wichtige Frage beantworten: Wie steht es um Leben auf Exoplaneten? Ein abgelegener Berggipfel in Chiles Atacama-Wüste wird zum Zentrum eines weltweit einmaligen Projekts: Nach jahrelangen Vorbereitungen wird hier das größte optische Teleskop der Welt gebaut. Mit seinem Hauptspiegel von 39 Metern Durchmesser soll es als Riesenauge den Blick gen Himmel richten, um erdähnliche Planeten, Sterne und Galaxien zu beobachten. Mit dem Extremely Large Telescope (ELT) soll es auch neue Erkenntnisse über Dunkle Materie geben. Der 3048 Meter hohe Armazones-Berg befindet sich 130 Kilometer südlich von Antofagasta im Norden Chiles. Vor zwei Jahren wurde die Spitze gesprengt, um eine Plattform für das Teleskop zu errichten. Am 26. Mai will Chiles Staatschefin Michelle Bachelet den Grundstein legen. Ab 2024 soll das Teleskop hier sein erstes Sternenlicht einfangen. 
Das Projekt der Europäischen Südsternwarte (ESO) hat in der Wüste einen idealen Standort gefunden. Dank der sogenannten Humboldt-Strömung ist die Region fast ständig wolkenfrei. Die Wolken bleiben entweder über dem Pazifischen Ozean oder auf der argentinischen Seite der Anden. In rund 90 Prozent der Nächte ist der Sternenhimmel in der äußerst sauberen und trockenen Wüstenatmosphäre zur Beobachtung frei.\"Der Sprung von den gegenwärtigen Teleskopen zum ELT ist etwa so groß wie der Sprung von Galileos Auge zu seinem Teleskop\", erklärt Tim de Zeeuw, Generaldirektor der ESO. Der Hauptspiegel des ELT wird fünf Mal größer sein als bei den heute stärksten Teleskopen. Zudem wird er 13 Mal mehr Licht einfangen können, was viel schärfere Bilder ermöglicht.Astronomen-Rennen um bewohnbare ExoplanetenEines der Hauptziele des Projektes ist die Erkundung von Exoplaneten außerhalb unseres Sonnensystems, auf denen es Leben geben könnte. Erst kürzlich machten Entdeckungen bei dem Roten Zwergstern Trappist-1 und bei dem Stern Proxima Centauri Schlagzeilen. Es findet zurzeit ein Astronomen-Rennen statt, um den ersten bewohnbaren Exoplaneten zu finden. De Zeeuw ist der Ansicht, dass dieses Ziel im nächsten Jahrzehnt erreicht werden kann. \"Es ist schon kurios, dass dieses Teleskop in einem der unbelebtesten Ecken der Welt, der Atacama-Wüste, uns dabei helfen kann, Lebenszeichen woanders zu finden\", sagt der ESO-Generaldirektor.Die ESO, die von 15 europäischen Staaten und Brasilien gegründet wurde und den Hauptsitz in Garching bei München hat, verfügt bereits über drei weitere Beobachtungsstandorte in der Atacama-Wüste. Unter anderem betreibt sie hier das Very Large Telescope (VLT), das leistungsfähigste Observatorium für Beobachtungen im Bereich des sichtbaren Lichts.Hauptspiegel misst 39 MeterDas ELT wird mit fünf riesigen Spiegeln ausgestattet sein. Der größte, mit 39 Metern Durchmesser, wird aus rund 800 hexagonalen Teilstücken mit 1,4 Meter Durchmesser bestehen. 
Sie müssen perfekt zusammenpassen. Der niederländische Astronom De Zeeuw hat mit seinen Kollegen in jahrelanger Überzeugungsarbeit bei Politikern die 1,1 Milliarden Euro eingeworben, die zur Finanzierung notwendig sind. Das ELT-Projekt entstand Ende der 90er Jahre, als man sich in der Europäischen Südsternwarte fragte, ob es möglich sei, ein 100-Meter-Teleskop zu bauen. Das würde aber drei bis vier Milliarden Euro kosten. Nun begnügt man sich mit 39 Metern Durchmesser. Das ELT ist aber nicht das einzige Projekt, um den Himmel besser zu erkunden. In den USA werden zwei Initiativen vorangetrieben, um öffentliche Gelder zum Bau von einem Riesenteleskop zu bekommen. Eine ist die des 30-Meter-Teleskops vom California Institute of Technology (Caltech), für das Hawaii als Standort ausgewählt worden ist. Das zweite Projekt ist das des Magellan-Riesenteleskops (GMT) der Carnegie Institution for Science, das mit seinem Spiegel von 24,5 Metern Durchmesser auch in Nordchile eingerichtet werden soll.De Zeeuw glaubt, dass es für die Forschung durchaus nützlich sein kann, mehrere Riesen-Teleskope gleichzeitig zu haben. \"Es handelt sich um eine freundliche Konkurrenz. Man arbeitet schneller und besser, wenn man mit jemandem konkurrieren muss. Das ist vorteilhaft für alle\", betont er. Zwar wird es noch sieben Jahre bis zu den ersten Bildern des ELT dauern, aber nach Jahren des Bangens beginnt nun die entscheidende Etappe dieses Rekordprojekts. Quelle: n-tv.de -------------------------------------------------------------------------------- /examples/spiegel.txt: -------------------------------------------------------------------------------- 1 | Die neue Regelung verpuffte wirkungslos: Die Mieten in Deutschland sind seit Inkrafttreten des Gesetzes zur Mietpreisbremse noch stärker gestiegen als zuvor. Das geht aus einer Antwort des Bundesverbraucherschutzministeriums auf eine Anfrage der Grünen hervor. 
Im Jahr 2016 lag der durchschnittliche Quadratmeterpreis bei 7,65 Euro - und war damit 36 Cent höher als im Vorjahr. Das entspricht einem Anstieg von fünf Prozent. Zwischen 2012 und 2015 stiegen die Angebotsmieten pro Quadratmeter jährlich lediglich um 23 oder 24 Cent, das ist ein Plus von 10,6 Prozent - aber innerhalb von drei Jahren. Bei Erstvermietungen in Neubauten stieg der durchschnittliche Quadratmeterpreis von 7,73 Euro im Jahr 2012 auf 9,54 Euro im Jahr 2016. Berlin hatte am 1. Juni 2015 als erstes Bundesland die Mietpreisbremse eingeführt, bis Ende 2016 zogen 313 Städte und Gemeinden nach. Dort dürfen Mieten bei neuen Verträgen in der Regel nur noch zehn Prozent über der ortsüblichen Vergleichsmiete liegen. Immer wieder gibt es jedoch Zweifel an der Wirksamkeit der Regelung. Eine DIW-Studie kam im Juni 2016 zu dem Ergebnis, dass die Mietpreisbremse teilweise sogar zu Verteuerungen geführt hat - weil viele Vermieter im letzten Moment noch die Mieten erhöht haben. Eine umfangreiche Auswertung des Forschungsinstituts und Beratungsunternehmens F+B zeigt zudem, dass auch die Bestandsmieten in Deutschland immer teurer werden. \"Die Mietpreisbremse war von Anfang an eine Fehlkonstruktion\", sagt der Grünen-Abgeordnete Christian Kühn. Ihr sei durch \"unzählige Ausnahmen und Schlupflöcher\" die Bremswirkung abhandengekommen. Die SPD habe zwar erkannt, \"dass dringend Nachbesserungsbedarf\" besteht. Ein Referentenentwurf von Bundesverbraucherschutzminister Heiko Maas (SPD) vergammle aber seit eineinhalb Jahren in der Schublade. Maas setzt sich seit Langem dafür ein, Vermieter dazu zu verpflichten, die Vormiete von sich aus offenzulegen. Zudem will er den Anteil der Modernisierungskosten, den Vermieter auf ihre Mieter umlegen können, verringern. Seine Vorschläge stoßen allerdings auf Widerstand in der Union. 
-------------------------------------------------------------------------------- /examples/venturebeat.txt: -------------------------------------------------------------------------------- 1 | For decades, video games have been criticized for purportedly wasting time, stifling creativity, and even influencing violent behaviors. Now, it seems that video games have become an unlikely tool for AI researchers to improve their systems.\nSeeing stop signs\nTake, for example, Artur Filipowicz, an AI researcher at Princeton University who’s been trying to develop software for autonomous vehicles. This software needs to be able to properly identify a stop sign — which can vary in appearance due to surroundings, conditions, and individual differences — no matter what. Failure here could cost a human life, so it’s important for the algorithm to view lifelike, variant images of stop signs to “understand” what a stop sign is like.\nThe solution to this problem? Grand Theft Auto V. Seriously.\nIn a game criticized for its violent and adult themes, stop signs are depicted somewhat realistically, and Filipowicz has been able to make modifications to the game so that his autonomous vehicle software can navigate the graphically rendered streets and respond to stop signs as if it were in a live environment.\nMastering Atari\nA couple of years ago, DeepMind — the algorithm that recently bested a human Go champion in a feat once believed impossible — started training itself by playing a suite of Atari games. The catch is, developers didn’t tell the algorithm how to play the games. Instead, DeepMind had to learn on its own — and it did, by playing the games over and over, until it perfected the art. Now, DeepMind can beat just about any top score on any Atari video game.\nOf course, high scores weren’t the end goal. 
The goal was to measure and improve DeepMind’s ability to learn using only external inputs, with no central programming to tell it “how” to play.\nOpenAI universe\nPrivately funded organization OpenAI has taken the world of video game-based AI development to new levels, with a piece of software it calls Universe. With permission from the individual publishers, Universe has collected and modified a library of thousands of games, from basic games like those found on the Atari to major recent titles like Portal 2.\nEach game in the library has been tweaked to allow an appropriate AI algorithm to explore it immediately. Best of all, the software is completely free to use — making it available to any AI researcher who wants to make use of this new trend.\nThe goals\nSo why is it that video games are so good at helping AI researchers solve problems?\nUltimately, the benefits can be reduced to four main areas: Training for the real world. Like with Grand Theft Auto V, some video games can be used as simulations for a real-world environment. Because these systems aren’t tangible, they’re cost-effective and can test new AI programs safely with limited repercussions.\nProblem-solving reduction. The big problem with developing AI is figuring out how to solve complex problems. Video games often take big, complex problems and reduce them to smaller, more manageable chunks. This allows researchers to not only optimize their algorithms to master problems in chunks, but also helps them understand how machine learning can stitch those chunks together to solve a bigger, more pressing problem.\nRepeatable learning environments. Video games are also predictable to some degree, even when they’re specifically engineered to randomly develop environments. They can be played and simulated at speeds faster than a human would use, and can therefore serve as practically infinitely repeatable learning environments. 
For example, DeepMind was able to pick up the “tunneling” trick to beating Breakout after 600 repeated games. For a human being, that could take weeks of training. For a machine, it takes hours.\nTransferring lessons. Machine learning and human learning are very different, and AI researchers are trying to bridge that gap. Currently, machines are very good at executing series of repeatable tasks, over and over, at an astonishing rate that human brains simply can’t match. However, human brains are very good at grasping high-level concepts, coordinating learning areas to work together and transferring lessons learned in one area to another area. Training a machine to take lessons learned in a game like Breakout and apply them to a game like Portal 2 is a massive jump, but it’s helping researchers to understand how machines could transfer lessons and understand high-level concepts in the future. AI researchers relying on video games has a number of potential benefits and side effects. Video games are proving to be a safe, reliable, effective, and, most of all, cost-efficient way to test and develop new algorithms. On top of that, video game developers and publishers are recognizing the opportunities inherent in designing playground-like worlds for AI systems.\nThe future of video games in AI development is rich with potential, and we’re just starting to explore its full capabilities.\nLarry Alton is a freelance writer covering artificial intelligence.\nAbove: The Machine Intelligence Landscape. This article is part of our Artificial Intelligence series. You can download a high-resolution version of the landscape featuring 288 companies by clicking the image.\nVentureBeat's PC Gaming channel is presented by the Intel® Game Dev program. Stay informed about the latest game dev tools and tips. Get the news you can use. 
-------------------------------------------------------------------------------- /examples/waitbutwhy.txt: -------------------------------------------------------------------------------- 1 | PDF: We made a fancy PDF of this post for printing and offline viewing. Buy it here. (Or see a preview.)\nNote: The reason this post took three weeks to finish is that as I dug into research on Artificial Intelligence, I could not believe what I was reading. It hit me pretty quickly that what’s happening in the world of AI is not just an important topic, but by far THE most important topic for our future. So I wanted to learn as much as I could about it, and once I did that, I wanted to make sure I wrote a post that really explained this whole situation and why it matters so much. Not shockingly, that became outrageously long, so I broke it into two parts. This is Part 1—Part 2 is here. _______________\nWe are on the edge of change comparable to the rise of human life on Earth. — Vernor Vinge What does it feel like to stand here? It seems like a pretty intense place to be standing—but then you have to remember something about what it’s like to stand on a time graph: you can’t see what’s to your right. So here’s how it actually feels to stand there: Which probably feels pretty normal…\n_______________\nThe Far Future—Coming Soon\nImagine taking a time machine back to 1750—a time when the world was in a permanent power outage, long-distance communication meant either yelling loudly or firing a cannon in the air, and all transportation ran on hay. When you get there, you retrieve a dude, bring him to 2015, and then walk him around and watch him react to everything. 
It’s impossible for us to understand what it would be like for him to see shiny capsules racing by on a highway, talk to people who had been on the other side of the ocean earlier in the day, watch sports that were being played 1,000 miles away, hear a musical performance that happened 50 years ago, and play with my magical wizard rectangle that he could use to capture a real-life image or record a living moment, generate a map with a paranormal moving blue dot that shows him where he is, look at someone’s face and chat with them even though they’re on the other side of the country, and worlds of other inconceivable sorcery. This is all before you show him the internet or explain things like the International Space Station, the Large Hadron Collider, nuclear weapons, or general relativity.\nThis experience for him wouldn’t be surprising or shocking or even mind-blowing—those words aren’t big enough. He might actually die.\nBut here’s the interesting thing—if he then went back to 1750 and got jealous that we got to see his reaction and decided he wanted to try the same thing, he’d take the time machine and go back the same distance, get someone from around the year 1500, bring him to 1750, and show him everything. And the 1500 guy would be shocked by a lot of things—but he wouldn’t die. It would be far less of an insane experience for him, because while 1500 and 1750 were very different, they were much less different than 1750 to 2015. The 1500 guy would learn some mind-bending shit about space and physics, he’d be impressed with how committed Europe turned out to be with that new imperialism fad, and he’d have to do some major revisions of his world map conception. 
But watching everyday life go by in 1750—transportation, communication, etc.—definitely wouldn’t make him die.\nNo, in order for the 1750 guy to have as much fun as we had with him, he’d have to go much farther back—maybe all the way back to about 12,000 BC, before the First Agricultural Revolution gave rise to the first cities and to the concept of civilization. If someone from a purely hunter-gatherer world—from a time when humans were, more or less, just another animal species—saw the vast human empires of 1750 with their towering churches, their ocean-crossing ships, their concept of being “inside,” and their enormous mountain of collective, accumulated human knowledge and discovery—he’d likely die.\nAnd then what if, after dying, he got jealous and wanted to do the same thing. If he went back 12,000 years to 24,000 BC and got a guy and brought him to 12,000 BC, he’d show the guy everything and the guy would be like, “Okay what’s your point who cares.” For the 12,000 BC guy to have the same fun, he’d have to go back over 100,000 years and get someone he could show fire and language to for the first time.\nIn order for someone to be transported into the future and die from the level of shock they’d experience, they have to go enough years ahead that a “die level of progress,” or a Die Progress Unit (DPU) has been achieved. So a DPU took over 100,000 years in hunter-gatherer times, but at the post-Agricultural Revolution rate, it only took about 12,000 years. The post-Industrial Revolution world has moved so quickly that a 1750 person only needs to go forward a couple hundred years for a DPU to have happened.\nThis pattern—human progress moving quicker and quicker as time goes on—is what futurist Ray Kurzweil calls human history’s Law of Accelerating Returns. This happens because more advanced societies have the ability to progress at a faster rate than less advanced societies—because they’re more advanced. 
19th century humanity knew more and had better technology than 15th century humanity, so it’s no surprise that humanity made far more advances in the 19th century than in the 15th century—15th century humanity was no match for 19th century humanity.11← open these\nThis works on smaller scales too. The movie Back to the Future came out in 1985, and “the past” took place in 1955. In the movie, when Michael J. Fox went back to 1955, he was caught off-guard by the newness of TVs, the prices of soda, the lack of love for shrill electric guitar, and the variation in slang. It was a different world, yes—but if the movie were made today and the past took place in 1985, the movie could have had much more fun with much bigger differences. The character would be in a time before personal computers, internet, or cell phones—today’s Marty McFly, a teenager born in the late 90s, would be much more out of place in 1985 than the movie’s Marty McFly was in 1955.\nThis is for the same reason we just discussed—the Law of Accelerating Returns. The average rate of advancement between 1985 and 2015 was higher than the rate between 1955 and 1985—because the former was a more advanced world—so much more change happened in the most recent 30 years than in the prior 30.\nSo—advances are getting bigger and bigger and happening more and more quickly. This suggests some pretty intense things about our future, right?\nKurzweil suggests that the progress of the entire 20th century would have been achieved in only 20 years at the rate of advancement in the year 2000—in other words, by 2000, the rate of progress was five times faster than the average rate of progress during the 20th century. He believes another 20th century’s worth of progress happened between 2000 and 2014 and that another 20th century’s worth of progress will happen by 2021, in only seven years. 
A couple decades later, he believes a 20th century’s worth of progress will happen multiple times in the same year, and even later, in less than one month. All in all, because of the Law of Accelerating Returns, Kurzweil believes that the 21st century will achieve 1,000 times the progress of the 20th century.2\nIf Kurzweil and others who agree with him are correct, then we may be as blown away by 2030 as our 1750 guy was by 2015—i.e. the next DPU might only take a couple decades—and the world in 2050 might be so vastly different than today’s world that we would barely recognize it.\nThis isn’t science fiction. It’s what many scientists smarter and more knowledgeable than you or I firmly believe—and if you look at history, it’s what we should logically predict.\nSo then why, when you hear me say something like “the world 35 years from now might be totally unrecognizable,” are you thinking, “Cool….but nahhhhhhh”? Three reasons we’re skeptical of outlandish forecasts of the future:\n1) When it comes to history, we think in straight lines. When we imagine the progress of the next 30 years, we look back to the progress of the previous 30 as an indicator of how much will likely happen. When we think about the extent to which the world will change in the 21st century, we just take the 20th century progress and add it to the year 2000. This was the same mistake our 1750 guy made when he got someone from 1500 and expected to blow his mind as much as his own was blown going the same distance ahead. It’s most intuitive for us to think linearly, when we should be thinking exponentially. If someone is being more clever about it, they might predict the advances of the next 30 years not by looking at the previous 30 years, but by taking the current rate of progress and judging based on that. They’d be more accurate, but still way off. In order to think about the future correctly, you need to imagine things moving at a much faster rate than they’re moving now. 
2) The trajectory of very recent history often tells a distorted story. First, even a steep exponential curve seems linear when you only look at a tiny slice of it, the same way if you look at a little segment of a huge circle up close, it looks almost like a straight line. Second, exponential growth isn’t totally smooth and uniform. Kurzweil explains that progress happens in “S-curves”: An S is created by the wave of progress when a new paradigm sweeps the world. The curve goes through three phases:\n1. Slow growth (the early phase of exponential growth)\n2. Rapid growth (the late, explosive phase of exponential growth)\n3. A leveling off as the particular paradigm matures3\nIf you look only at very recent history, the part of the S-curve you’re on at the moment can obscure your perception of how fast things are advancing. The chunk of time between 1995 and 2007 saw the explosion of the internet, the introduction of Microsoft, Google, and Facebook into the public consciousness, the birth of social networking, and the introduction of cell phones and then smart phones. That was Phase 2: the growth spurt part of the S. But 2008 to 2015 has been less groundbreaking, at least on the technological front. Someone thinking about the future today might examine the last few years to gauge the current rate of advancement, but that’s missing the bigger picture. In fact, a new, huge Phase 2 growth spurt might be brewing right now.\n3) Our own experience makes us stubborn old men about the future. 
We base our ideas about the world on our personal experience, and that experience has ingrained the rate of growth of the recent past in our heads as “the way things happen.” We’re also limited by our imagination, which takes our experience and uses it to conjure future predictions—but often, what we know simply doesn’t give us the tools to think accurately about the future.2 When we hear a prediction about the future that contradicts our experience-based notion of how things work, our instinct is that the prediction must be naive. If I tell you, later in this post, that you may live to be 150, or 250, or not die at all, your instinct will be, “That’s stupid—if there’s one thing I know from history, it’s that everybody dies.” And yes, no one in the past has not died. But no one flew airplanes before airplanes were invented either.\nSo while nahhhhh might feel right as you read this post, it’s probably actually wrong. The fact is, if we’re being truly logical and expecting historical patterns to continue, we should conclude that much, much, much more should change in the coming decades than we intuitively expect. Logic also suggests that if the most advanced species on a planet keeps making larger and larger leaps forward at an ever-faster rate, at some point, they’ll make a leap so great that it completely alters life as they know it and the perception they have of what it means to be a human—kind of like how evolution kept making great leaps toward intelligence until finally it made such a large leap to the human being that it completely altered what it meant for any creature to live on planet Earth. 
And if you spend some time reading about what’s going on today in science and technology, you start to see a lot of signs quietly hinting that life as we currently know it cannot withstand the leap that’s coming next.\n_______________\nThe Road to Superintelligence\nWhat Is AI?\nIf you’re like me, you used to think Artificial Intelligence was a silly sci-fi concept, but lately you’ve been hearing it mentioned by serious people, and you don’t really quite get it.\nThere are three reasons a lot of people are confused about the term AI:\n1) We associate AI with movies. Star Wars. Terminator. 2001: A Space Odyssey. Even the Jetsons. And those are fiction, as are the robot characters. So it makes AI sound a little fictional to us.\n2) AI is a broad topic. It ranges from your phone’s calculator to self-driving cars to something in the future that might change the world dramatically. AI refers to all of these things, which is confusing.\n3) We use AI all the time in our daily lives, but we often don’t realize it’s AI. John McCarthy, who coined the term “Artificial Intelligence” in 1956, complained that “as soon as it works, no one calls it AI anymore.”4 Because of this phenomenon, AI often sounds like a mythical future prediction more than a reality. At the same time, it makes it sound like a pop concept from the past that never came to fruition. Ray Kurzweil says he hears people say that AI withered in the 1980s, which he compares to “insisting that the Internet died in the dot-com bust of the early 2000s.”5\nSo let’s clear things up. First, stop thinking of robots. A robot is a container for AI, sometimes mimicking the human form, sometimes not—but the AI itself is the computer inside the robot. AI is the brain, and the robot is its body—if it even has a body. 
For example, the software and data behind Siri is AI, the woman’s voice we hear is a personification of that AI, and there’s no robot involved at all.\nSecondly, you’ve probably heard the term “singularity” or “technological singularity.” This term has been used in math to describe an asymptote-like situation where normal rules no longer apply. It’s been used in physics to describe a phenomenon like an infinitely small, dense black hole or the point we were all squished into right before the Big Bang. Again, situations where the usual rules don’t apply. In 1993, Vernor Vinge wrote a famous essay in which he applied the term to the moment in the future when our technology’s intelligence exceeds our own—a moment for him when life as we know it will be forever changed and normal rules will no longer apply. Ray Kurzweil then muddled things a bit by defining the singularity as the time when the Law of Accelerating Returns has reached such an extreme pace that technological progress is happening at a seemingly-infinite pace, and after which we’ll be living in a whole new world. I found that many of today’s AI thinkers have stopped using the term, and it’s confusing anyway, so I won’t use it much here (even though we’ll be focusing on that idea throughout).\nFinally, while there are many different types or forms of AI since AI is a broad concept, the critical categories we need to think about are based on an AI’s caliber. There are three major AI caliber categories:\nAI Caliber 1) Artificial Narrow Intelligence (ANI): Sometimes referred to as Weak AI, Artificial Narrow Intelligence is AI that specializes in one area. There’s AI that can beat the world chess champion in chess, but that’s the only thing it does. 
Ask it to figure out a better way to store data on a hard drive, and it’ll look at you blankly.\nAI Caliber 2) Artificial General Intelligence (AGI): Sometimes referred to as Strong AI, or Human-Level AI, Artificial General Intelligence refers to a computer that is as smart as a human across the board—a machine that can perform any intellectual task that a human being can. Creating AGI is a much harder task than creating ANI, and we’re yet to do it. Professor Linda Gottfredson describes intelligence as “a very general mental capability that, among other things, involves the ability to reason, plan, solve problems, think abstractly, comprehend complex ideas, learn quickly, and learn from experience.” AGI would be able to do all of those things as easily as you can.\nAI Caliber 3) Artificial Superintelligence (ASI): Oxford philosopher and leading AI thinker Nick Bostrom defines superintelligence as “an intellect that is much smarter than the best human brains in practically every field, including scientific creativity, general wisdom and social skills.” Artificial Superintelligence ranges from a computer that’s just a little smarter than a human to one that’s trillions of times smarter—across the board. ASI is the reason the topic of AI is such a spicy meatball and why the words “immortality” and “extinction” will both appear in these posts multiple times.\nAs of now, humans have conquered the lowest caliber of AI—ANI—in many ways, and it’s everywhere. The AI Revolution is the road from ANI, through AGI, to ASI—a road we may or may not survive but that, either way, will change everything.\nLet’s take a close look at what the leading thinkers in the field believe this road looks like and why this revolution might happen way sooner than you might think:\nWhere We Are Currently—A World Running on ANI\nArtificial Narrow Intelligence is machine intelligence that equals or exceeds human intelligence or efficiency at a specific thing. 
A few examples: Cars are full of ANI systems, from the computer that figures out when the anti-lock brakes should kick in to the computer that tunes the parameters of the fuel injection systems. Google’s self-driving car, which is being tested now, will contain robust ANI systems that allow it to perceive and react to the world around it.\nYour phone is a little ANI factory. When you navigate using your map app, receive tailored music recommendations from Pandora, check tomorrow’s weather, talk to Siri, or dozens of other everyday activities, you’re using ANI.\nYour email spam filter is a classic type of ANI—it starts off loaded with intelligence about how to figure out what’s spam and what’s not, and then it learns and tailors its intelligence to you as it gets experience with your particular preferences. The Nest Thermostat does the same thing as it starts to figure out your typical routine and act accordingly.\nYou know the whole creepy thing that goes on when you search for a product on Amazon and then you see that as a “recommended for you” product on a different site, or when Facebook somehow knows who it makes sense for you to add as a friend? That’s a network of ANI systems, working together to inform each other about who you are and what you like and then using that information to decide what to show you. Same goes for Amazon’s “People who bought this also bought…” thing—that’s an ANI system whose job it is to gather info from the behavior of millions of customers and synthesize that info to cleverly upsell you so you’ll buy more things.\nGoogle Translate is another classic ANI system—impressively good at one narrow task. Voice recognition is another, and there are a bunch of apps that use those two ANIs as a tag team, allowing you to speak a sentence in one language and have the phone spit out the same sentence in another.\nWhen your plane lands, it’s not a human that decides which gate it should go to. 
Just like it’s not a human that determined the price of your ticket.\nThe world’s best Checkers, Chess, Scrabble, Backgammon, and Othello players are now all ANI systems.\nGoogle search is one large ANI brain with incredibly sophisticated methods for ranking pages and figuring out what to show you in particular. Same goes for Facebook’s Newsfeed.\nAnd those are just in the consumer world. Sophisticated ANI systems are widely used in sectors and industries like military, manufacturing, and finance (algorithmic high-frequency AI traders account for more than half of equity shares traded on US markets6), and in expert systems like those that help doctors make diagnoses and, most famously, IBM’s Watson, who contained enough facts and understood coy Trebek-speak well enough to soundly beat the most prolific Jeopardy champions. ANI systems as they are now aren’t especially scary. At worst, a glitchy or badly-programmed ANI can cause an isolated catastrophe like knocking out a power grid, causing a harmful nuclear power plant malfunction, or triggering a financial markets disaster (like the 2010 Flash Crash when an ANI program reacted the wrong way to an unexpected situation and caused the stock market to briefly plummet, taking $1 trillion of market value with it, only part of which was recovered when the mistake was corrected).\nBut while ANI doesn’t have the capability to cause an existential threat, we should see this increasingly large and complex ecosystem of relatively-harmless ANI as a precursor of the world-altering hurricane that’s on the way. Each new ANI innovation quietly adds another brick onto the road to AGI and ASI. 
Or as Aaron Saenz sees it, our world’s ANI systems “are like the amino acids in the early Earth’s primordial ooze”—the inanimate stuff of life that, one unexpected day, woke up.\nThe Road From ANI to AGI\nWhy It’s So Hard\nNothing will make you appreciate human intelligence like learning about how unbelievably challenging it is to try to create a computer as smart as we are. Building skyscrapers, putting humans in space, figuring out the details of how the Big Bang went down—all far easier than understanding our own brain or how to make something as cool as it. As of now, the human brain is the most complex object in the known universe.\nWhat’s interesting is that the hard parts of trying to build AGI (a computer as smart as humans in general, not just at one narrow specialty) are not intuitively what you’d think they are. Build a computer that can multiply two ten-digit numbers in a split second—incredibly easy. Build one that can look at a dog and answer whether it’s a dog or a cat—spectacularly difficult. Make AI that can beat any human in chess? Done. Make one that can read a paragraph from a six-year-old’s picture book and not just recognize the words but understand the meaning of them? Google is currently spending billions of dollars trying to do it. Hard things—like calculus, financial market strategy, and language translation—are mind-numbingly easy for a computer, while easy things—like vision, motion, movement, and perception—are insanely hard for it. Or, as computer scientist Donald Knuth puts it, “AI has by now succeeded in doing essentially everything that requires ‘thinking’ but has failed to do most of what people and animals do ‘without thinking.'”7\nWhat you quickly realize when you think about this is that those things that seem easy to us are actually unbelievably complicated, and they only seem easy because those skills have been optimized in us (and most animals) by hundreds of millions of years of animal evolution. 
When you reach your hand up toward an object, the muscles, tendons, and bones in your shoulder, elbow, and wrist instantly perform a long series of physics operations, in conjunction with your eyes, to allow you to move your hand in a straight line through three dimensions. It seems effortless to you because you have perfected software in your brain for doing it. Same idea goes for why it’s not that malware is dumb for not being able to figure out the slanty word recognition test when you sign up for a new account on a site—it’s that your brain is super impressive for being able to.\nOn the other hand, multiplying big numbers or playing chess are new activities for biological creatures and we haven’t had any time to evolve a proficiency at them, so a computer doesn’t need to work too hard to beat us. Think about it—which would you rather do, build a program that could multiply big numbers or one that could understand the essence of a B well enough that you could show it a B in any one of thousands of unpredictable fonts or handwriting and it could instantly know it was a B?\nOne fun example—when you look at this, you and a computer both can figure out that it’s a rectangle with two distinct shades, alternating: Tied so far. But if you pick up the black and reveal the whole image… …you have no problem giving a full description of the various opaque and translucent cylinders, slats, and 3-D corners, but the computer would fail miserably. It would describe what it sees—a variety of two-dimensional shapes in several different shades—which is actually what’s there. 
Your brain is doing a ton of fancy shit to interpret the implied depth, shade-mixing, and room lighting the picture is trying to portray.8 And looking at the picture below, a computer sees a two-dimensional white, black, and gray collage, while you easily see what it really is—a photo of an entirely-black, 3-D rock: And everything we just mentioned is still only taking in stagnant information and processing it. To be human-level intelligent, a computer would have to understand things like the difference between subtle facial expressions, the distinction between being pleased, relieved, content, satisfied, and glad, and why Braveheart was great but The Patriot was terrible.\nDaunting.\nSo how do we get there?\nFirst Key to Creating AGI: Increasing Computational Power\nOne thing that definitely needs to happen for AGI to be a possibility is an increase in the power of computer hardware. If an AI system is going to be as intelligent as the brain, it’ll need to equal the brain’s raw computing capacity.\nOne way to express this capacity is in the total calculations per second (cps) the brain could manage, and you could come to this number by figuring out the maximum cps of each structure in the brain and then adding them all together.\nRay Kurzweil came up with a shortcut by taking someone’s professional estimate for the cps of one structure and that structure’s weight compared to that of the whole brain and then multiplying proportionally to get an estimate for the total. Sounds a little iffy, but he did this a bunch of times with various professional estimates of different regions, and the total always arrived in the same ballpark—around 10^16, or 10 quadrillion cps.\nCurrently, the world’s fastest supercomputer, China’s Tianhe-2, has actually beaten that number, clocking in at about 34 quadrillion cps. But Tianhe-2 is also a dick, taking up 720 square meters of space, using 24 megawatts of power (the brain runs on just 20 watts), and costing $390 million to build. 
Not especially applicable to wide usage, or even most commercial or industrial usage yet.\nKurzweil suggests that we think about the state of computers by looking at how many cps you can buy for $1,000. When that number reaches human-level—10 quadrillion cps—then that’ll mean AGI could become a very real part of life.\nMoore’s Law is a historically-reliable rule that the world’s maximum computing power doubles approximately every two years, meaning computer hardware advancement, like general human advancement through history, grows exponentially. Looking at how this relates to Kurzweil’s cps/$1,000 metric, we’re currently at about 10 trillion cps/$1,000, right on pace with this graph’s predicted trajectory:9 So the world’s $1,000 computers are now beating the mouse brain and they’re at about a thousandth of human level. This doesn’t sound like much until you remember that we were at about a trillionth of human level in 1985, a billionth in 1995, and a millionth in 2005. Being at a thousandth in 2015 puts us right on pace to get to an affordable computer by 2025 that rivals the power of the brain.\nSo on the hardware side, the raw power needed for AGI is technically available now, in China, and we’ll be ready for affordable, widespread AGI-caliber hardware within 10 years. But raw computational power alone doesn’t make a computer generally intelligent—the next question is, how do we bring human-level intelligence to all that power?\nSecond Key to Creating AGI: Making It Smart\nThis is the icky part. The truth is, no one really knows how to make it smart—we’re still debating how to make a computer human-level intelligent and capable of knowing what a dog and a weird-written B and a mediocre movie is. But there are a bunch of far-fetched strategies out there and at some point, one of them will work. Here are the three most common strategies I came across:\n1) Plagiarize the brain. 
This is like scientists toiling over how that kid who sits next to them in class is so smart and keeps doing so well on the tests, and even though they keep studying diligently, they can’t do nearly as well as that kid, and then they finally decide “k fuck it I’m just gonna copy that kid’s answers.” It makes sense—we’re stumped trying to build a super-complex computer, and there happens to be a perfect prototype for one in each of our heads.\nThe science world is working hard on reverse engineering the brain to figure out how evolution made such a rad thing—optimistic estimates say we can do this by 2030. Once we do that, we’ll know all the secrets of how the brain runs so powerfully and efficiently and we can draw inspiration from it and steal its innovations. One example of computer architecture that mimics the brain is the artificial neural network. It starts out as a network of transistor “neurons,” connected to each other with inputs and outputs, and it knows nothing—like an infant brain. The way it “learns” is it tries to do a task, say handwriting recognition, and at first, its neural firings and subsequent guesses at deciphering each letter will be completely random. But when it’s told it got something right, the transistor connections in the firing pathways that happened to create that answer are strengthened; when it’s told it was wrong, those pathways’ connections are weakened. After a lot of this trial and feedback, the network has, by itself, formed smart neural pathways and the machine has become optimized for the task. 
The brain learns a bit like this but in a more sophisticated way, and as we continue to study the brain, we’re discovering ingenious new ways to take advantage of neural circuitry.\nMore extreme plagiarism involves a strategy called “whole brain emulation,” where the goal is to slice a real brain into thin layers, scan each one, use software to assemble an accurate reconstructed 3-D model, and then implement the model on a powerful computer. We’d then have a computer officially capable of everything the brain is capable of—it would just need to learn and gather information. If engineers get really good, they’d be able to emulate a real brain with such exact accuracy that the brain’s full personality and memory would be intact once the brain architecture has been uploaded to a computer. If the brain belonged to Jim right before he passed away, the computer would now wake up as Jim (?), which would be a robust human-level AGI, and we could now work on turning Jim into an unimaginably smart ASI, which he’d probably be really excited about.\nHow far are we from achieving whole brain emulation? Well so far, we’ve just recently been able to emulate a 1mm-long flatworm brain, which consists of just 302 total neurons. The human brain contains 100 billion. If that makes it seem like a hopeless project, remember the power of exponential progress—now that we’ve conquered the tiny worm brain, an ant might happen before too long, followed by a mouse, and suddenly this will seem much more plausible.\n2) Try to make evolution do what it did before but for us this time.\nSo if we decide the smart kid’s test is too hard to copy, we can try to copy the way he studies for the tests instead.\nHere’s something we know. Building a computer as powerful as the brain is possible—our own brain’s evolution is proof. And if the brain is just too complex for us to emulate, we could try to emulate evolution instead. 
The fact is, even if we can emulate a brain, that might be like trying to build an airplane by copying a bird’s wing-flapping motions—often, machines are best designed using a fresh, machine-oriented approach, not by mimicking biology exactly.\nSo how can we simulate evolution to build AGI? The method, called “genetic algorithms,” would work something like this: there would be a performance-and-evaluation process that would happen again and again (the same way biological creatures “perform” by living life and are “evaluated” by whether they manage to reproduce or not). A group of computers would try to do tasks, and the most successful ones would be bred with each other by having half of each of their programming merged together into a new computer. The less successful ones would be eliminated. Over many, many iterations, this natural selection process would produce better and better computers. The challenge would be creating an automated evaluation and breeding cycle so this evolution process could run on its own.\nThe downside of copying evolution is that evolution likes to take a billion years to do things and we want to do this in a few decades.\nBut we have a lot of advantages over evolution. First, evolution has no foresight and works randomly—it produces more unhelpful mutations than helpful ones, but we would control the process so it would only be driven by beneficial glitches and targeted tweaks. Secondly, evolution doesn’t aim for anything, including intelligence—sometimes an environment might even select against higher intelligence (since it uses a lot of energy). We, on the other hand, could specifically direct this evolutionary process toward increasing intelligence. Third, to select for intelligence, evolution has to innovate in a bunch of other ways to facilitate intelligence—like revamping the ways cells produce energy—when we can remove those extra burdens and use things like electricity. 
It’s no doubt we’d be much, much faster than evolution—but it’s still not clear whether we’ll be able to improve upon evolution enough to make this a viable strategy.\n3) Make this whole thing the computer’s problem, not ours.\nThis is when scientists get desperate and try to program the test to take itself. But it might be the most promising method we have.\nThe idea is that we’d build a computer whose two major skills would be doing research on AI and coding changes into itself—allowing it to not only learn but to improve its own architecture. We’d teach computers to be computer scientists so they could bootstrap their own development. And that would be their main job—figuring out how to make themselves smarter. More on this later.\nAll of This Could Happen Soon\nRapid advancements in hardware and innovative experimentation with software are happening simultaneously, and AGI could creep up on us quickly and unexpectedly for two main reasons:\n1) Exponential growth is intense and what seems like a snail’s pace of advancement can quickly race upwards—this GIF illustrates this concept nicely: 2) When it comes to software, progress can seem slow, but then one epiphany can instantly change the rate of advancement (kind of like the way science, during the time humans thought the universe was geocentric, was having difficulty calculating how the universe worked, but then the discovery that it was heliocentric suddenly made everything much easier). Or, when it comes to something like a computer that improves itself, we might seem far away but actually be just one tweak of the system away from having it become 1,000 times more effective and zooming upward to human-level intelligence.\nThe Road From AGI to ASI\nAt some point, we’ll have achieved AGI—computers with human-level general intelligence. 
Just a bunch of people and computers living together in equality.\nOh actually not at all.\nThe thing is, AGI with an identical level of intelligence and computational capacity as a human would still have significant advantages over humans. Like:\nHardware: Speed. The brain’s neurons max out at around 200 Hz, while today’s microprocessors (which are much slower than they will be when we reach AGI) run at 2 GHz, or 10 million times faster than our neurons. And the brain’s internal communications, which can move at about 120 m/s, are horribly outmatched by a computer’s ability to communicate optically at the speed of light.\nSize and storage. The brain is locked into its size by the shape of our skulls, and it couldn’t get much bigger anyway, or the 120 m/s internal communications would take too long to get from one brain structure to another. Computers can expand to any physical size, allowing far more hardware to be put to work, a much larger working memory (RAM), and a longterm memory (hard drive storage) that has both far greater capacity and precision than our own.\nReliability and durability. It’s not only the memories of a computer that would be more precise. Computer transistors are more accurate than biological neurons, and they’re less likely to deteriorate (and can be repaired or replaced if they do). Human brains also get fatigued easily, while computers can run nonstop, at peak performance, 24/7. Software: Editability, upgradability, and a wider breadth of possibility. Unlike the human brain, computer software can receive updates and fixes and can be easily experimented on. The upgrades could also span to areas where human brains are weak. Human vision software is superbly advanced, while its complex engineering capability is pretty low-grade. Computers could match the human on vision software but could also become equally optimized in engineering and any other area.\nCollective capability. 
Humans crush all other species at building a vast collective intelligence. Beginning with the development of language and the forming of large, dense communities, advancing through the inventions of writing and printing, and now intensified through tools like the internet, humanity’s collective intelligence is one of the major reasons we’ve been able to get so far ahead of all other species. And computers will be way better at it than we are. A worldwide network of AI running a particular program could regularly sync with itself so that anything any one computer learned would be instantly uploaded to all other computers. The group could also take on one goal as a unit, because there wouldn’t necessarily be dissenting opinions and motivations and self-interest, like we have within the human population.10 AI, which will likely get to AGI by being programmed to self-improve, wouldn’t see “human-level intelligence” as some important milestone—it’s only a relevant marker from our point of view—and wouldn’t have any reason to “stop” at our level. And given the advantages over us that even human intelligence-equivalent AGI would have, it’s pretty obvious that it would only hit human intelligence for a brief instant before racing onwards to the realm of superior-to-human intelligence.\nThis may shock the shit out of us when it happens. The reason is that from our perspective, A) while the intelligence of different kinds of animals varies, the main characteristic we’re aware of about any animal’s intelligence is that it’s far lower than ours, and B) we view the smartest humans as WAY smarter than the dumbest humans. Kind of like this: So as AI zooms upward in intelligence toward us, we’ll see it as simply becoming smarter, for an animal. Then, when it hits the lowest capacity of humanity—Nick Bostrom uses the term “the village idiot”—we’ll be like, “Oh wow, it’s like a dumb human. 
Cute!” The only thing is, in the grand spectrum of intelligence, all humans, from the village idiot to Einstein, are within a very small range—so just after hitting village idiot level and being declared to be AGI, it’ll suddenly be smarter than Einstein and we won’t know what hit us: And what happens…after that?\nAn Intelligence Explosion\nI hope you enjoyed normal time, because this is when this topic gets unnormal and scary, and it’s gonna stay that way from here forward. I want to pause here to remind you that every single thing I’m going to say is real—real science and real forecasts of the future from a large array of the most respected thinkers and scientists. Just keep remembering that.\nAnyway, as I said above, most of our current models for getting to AGI involve the AI getting there by self-improvement. And once it gets to AGI, even systems that formed and grew through methods that didn’t involve self-improvement would now be smart enough to begin self-improving if they wanted to.3\nAnd here’s where we get to an intense concept: recursive self-improvement. It works like this—\nAn AI system at a certain level—let’s say human village idiot—is programmed with the goal of improving its own intelligence. Once it does, it’s smarter—maybe at this point it’s at Einstein’s level—so now when it works to improve its intelligence, with an Einstein-level intellect, it has an easier time and it can make bigger leaps. These leaps make it much smarter than any human, allowing it to make even bigger leaps. As the leaps grow larger and happen more rapidly, the AGI soars upwards in intelligence and soon reaches the superintelligent level of an ASI system. This is called an Intelligence Explosion,11 and it’s the ultimate example of The Law of Accelerating Returns.\nThere is some debate about how soon AI will reach human-level general intelligence. 
The median year on a survey of hundreds of scientists about when they believed we’d be more likely than not to have reached AGI was 204012—that’s only 25 years from now, which doesn’t sound that huge until you consider that many of the thinkers in this field think it’s likely that the progression from AGI to ASI happens very quickly. Like—this could happen:\nIt takes decades for the first AI system to reach low-level general intelligence, but it finally happens. A computer is able to understand the world around it as well as a human four-year-old. Suddenly, within an hour of hitting that milestone, the system pumps out the grand theory of physics that unifies general relativity and quantum mechanics, something no human has been able to definitively do. 90 minutes after that, the AI has become an ASI, 170,000 times more intelligent than a human.\nSuperintelligence of that magnitude is not something we can remotely grasp, any more than a bumblebee can wrap its head around Keynesian Economics. In our world, smart means a 130 IQ and stupid means an 85 IQ—we don’t have a word for an IQ of 12,952.\nWhat we do know is that humans’ utter dominance on this Earth suggests a clear rule: with intelligence comes power. Which means an ASI, when we create it, will be the most powerful being in the history of life on Earth, and all living things, including humans, will be entirely at its whim—and this might happen in the next few decades.\nIf our meager brains were able to invent wifi, then something 100 or 1,000 or 1 billion times smarter than we are should have no problem controlling the positioning of each and every atom in the world in any way it likes, at any time—everything we consider magic, every power we imagine a supreme God to have will be as mundane an activity for the ASI as flipping on a light switch is for us. 
Creating the technology to reverse human aging, curing disease and hunger and even mortality, reprogramming the weather to protect the future of life on Earth—all suddenly possible. Also possible is the immediate end of all life on Earth. As far as we’re concerned, if an ASI comes to being, there is now an omnipotent God on Earth—and the all-important question for us is: Will it be a nice God? That’s the topic of Part 2 of this post.\n___________\nSources at the bottom of Part 2. Related Wait But Why Posts\nThe Fermi Paradox – Why don’t we see any signs of alien life?\nHow (and Why) SpaceX Will Colonize Mars – A post I got to work on with Elon Musk and one that reframed my mental picture of the future.\nOr for something totally different and yet somehow related, Why Procrastinators Procrastinate\nAnd here’s Year 1 of Wait But Why on an ebook -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "rake-js", 3 | "version": "0.1.1", 4 | "description": "A pure JS implementation of the Rapid Automated Keyword Extraction (RAKE) algorithm.", 5 | "main": "dist/lib/index.js", 6 | "scripts": { 7 | "build": "rm -rf dist && npm run pretty && tsc -p .", 8 | "lint": "tslint -c tslint.json 'src/**/*.ts' --fix", 9 | "pretty": "prettier --single-quote --no-semi --trailing-comma es5 --write 'src/**/*.ts'", 10 | "test": "npm run lint && npm run build && mocha --opts .mocha", 11 | "watch": "mocha-typescript-watch --opts .mocha", 12 | "prepublish": "npm run test" 13 | }, 14 | "keywords": [ 15 | "RAKE", 16 | "keyword", 17 | "keywords", 18 | "extraction", 19 | "keyword", 20 | "extraction", 21 | "tag", 22 | "tags", 23 | "auto-tagging" 24 | ], 25 | "author": "Maximilian Stroh (github/Anonyfox)", 26 | "license": "LGPL-3.0", 27 | "devDependencies": { 28 | "@types/mocha": "^2.2.41", 29 | "@types/node": "^7.0.21", 30 | "chai": "^3.5.0", 31 | "mocha": 
"^3.4.1", 32 | "mocha-typescript": "^1.1.2", 33 | "prettier": "^1.5.2", 34 | "source-map-support": "^0.4.15", 35 | "tslint": "^5.4.3", 36 | "tslint-config-prettier": "^1.1.0", 37 | "typescript": "^2.3.2" 38 | }, 39 | "dependencies": { 40 | "condense-whitespace": "^1.0.0", 41 | "franc": "^3.1.0", 42 | "lodash": "^4.17.4", 43 | "snowball": "^0.3.1" 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/lib/data_structures/phrase.ts: -------------------------------------------------------------------------------- 1 | import { map } from 'lodash' 2 | import Stemmer from '../tools/stemmer' 3 | 4 | /** 5 | * This is a container holding a bag of words and their stems. All words 6 | * together carry a combined meaning, and can be scored as a single unit. 7 | */ 8 | export default class Phrase { 9 | public text: string 10 | public words: string[] = [] 11 | public stems: string[] = [] 12 | public score: number = 0.0 13 | 14 | public isEmpty() { 15 | return this.words.length === 0 16 | } 17 | 18 | public pushWord(word: string) { 19 | if (word && word.length > 1) { /* empty and single-character words are dropped */ 20 | this.words.push(word) 21 | } 22 | } 23 | 24 | public createText() { 25 | this.text = this.words.join(' ') 26 | } 27 | 28 | public calculateStems(stemmer: Stemmer) { 29 | this.stems = map(this.words, word => stemmer.stem(word)) 30 | } 31 | 32 | public calculateScore(stemIndex: { [stem: string]: number }) { 33 | let sum = 0.0 34 | for (const stem of this.stems) { 35 | sum += stemIndex[stem] /* NOTE(review): a stem missing from stemIndex adds undefined and turns the sum into NaN */ 36 | } 37 | this.score = sum 38 | } 39 | 40 | public multiplyWith(amount: number) { 41 | this.score *= amount 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/lib/data_structures/string_counter.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Helper Class to keep track of strings (words, stems, ...) and their occurrences 3 | */ 4 | export default class StringCounter { 5 | // the actual data store
implemented with a plain object 6 | private counter: { [word: string]: number } = {} 7 | 8 | // increase the counter for a given string (empty strings are ignored) 9 | public count(str: string): void { 10 | if (!str) { 11 | return 12 | } 13 | if (this.counter[str]) { 14 | this.counter[str]++ 15 | } else { 16 | this.counter[str] = 1 17 | } 18 | } 19 | 20 | // return a list of all strings counted so far 21 | public strings(): string[] { 22 | return Object.keys(this.counter) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/lib/data_structures/string_dictionary.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Helper Class to keep references between strings 3 | */ 4 | export default class StringDictionary { 5 | // the actual data store implemented with a plain object 6 | private dict: { [word: string]: string } = {} 7 | 8 | // store `value` under `key`; the call is ignored when either one is empty 9 | public add(key: string, value: string): void { 10 | if (!key || !value) { 11 | return 12 | } 13 | this.dict[key] = value 14 | } 15 | 16 | // return the value stored under `key` 17 | public get(key: string): string { 18 | return this.dict[key] 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/lib/data_structures/word_matrix.ts: -------------------------------------------------------------------------------- 1 | /** Square counter matrix with one row and one column per key. */ export default class WordMatrix { 2 | public matrix: number[][] 3 | private scores: { [key: string]: number } = {} 4 | private size: number 5 | private index: { [key: string]: number } = {} 6 | 7 | constructor(private keys: string[]) { 8 | this.size = keys.length 9 | this.createIndex() 10 | this.createZeroMatrix() 11 | } 12 | 13 | public incField(row: number, col: number) { 14 | this.matrix[row][col] += 1 15 | } 16 | 17 | public getRow(row: number): number[] { 18 | return this.matrix[row] 19 | } 20 | 21 | public process(values: string[]) { 22 | const indexes =
values.map(key => this.index[key]) 23 | for (const row of indexes) { 24 | for (const col of indexes) { 25 | this.matrix[row][col] += 1 /* counts every ordered pair of the given values, including each value paired with itself */ 26 | } 27 | } 28 | } 29 | 30 | public calculateScores() { 31 | for (const key of this.keys) { 32 | const row = this.getRow(this.index[key]) 33 | let deg = 0.0 34 | let freq = 0.0 35 | for (const col of row) { 36 | if (col !== 0) { 37 | deg += col 38 | freq += 1 39 | } 40 | } 41 | this.scores[key] = deg / freq /* NOTE(review): 0 / 0 yields NaN when the row has no non-zero cell; confirm every key is processed at least once */ 42 | } 43 | return this.scores 44 | } 45 | 46 | private createIndex() { 47 | const index = {} 48 | this.keys.forEach((key, i) => (index[key] = i)) 49 | this.index = index 50 | } 51 | 52 | private createZeroMatrix() { 53 | const matrix = [] 54 | for (let i = 0; i < this.size; i++) { 55 | matrix.push(Array(this.size).fill(0)) 56 | } 57 | this.matrix = matrix 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/lib/index.ts: -------------------------------------------------------------------------------- 1 | import { merge } from 'lodash' 2 | import { IOptions, rake } from './rake' 3 | import guessLanguage from './tools/guess_language' 4 | 5 | const defaults: IOptions = { 6 | delimiters: ['\\s+'], 7 | language: 'english', 8 | } 9 | 10 | export default function process(text: string, opts?: IOptions): string[] { 11 | const options: IOptions = merge({}, defaults, opts) 12 | if (!opts || !opts.language) { 13 | options.language = guessLanguage(text) 14 | } 15 | const params = merge(options, { corpus: text }) 16 | const keywords = rake(params) 17 | return keywords 18 | } 19 | -------------------------------------------------------------------------------- /src/lib/rake.ts: -------------------------------------------------------------------------------- 1 | import { map } from 'lodash' 2 | import Matrix from './data_structures/word_matrix' 3 | import { languageName } from './tools/guess_language' 4 | import Parser from './tools/parser' 5 | import Preprocessor from './tools/preprocessor'
6 | import Stemmer from './tools/stemmer' 7 | import load from './tools/stoplist' 8 | 9 | // can be used to tweak the algorithm or to use it without the defaults 10 | export interface IOptions { 11 | delimiters: string[] 12 | language: languageName 13 | } 14 | 15 | // the actual parameters for the RAKE algorithm 16 | export interface IParameters extends IOptions { 17 | corpus: string 18 | } 19 | 20 | export function rake(params: IParameters): string[] { 21 | // step 1: split the corpus text into a word array on `delimiters` 22 | const preprocessor = new Preprocessor(params.delimiters) 23 | const wordArray = preprocessor.process(params.corpus) 24 | 25 | // step 2: loop through all words, generate ngrams/stems/phrases/metrics 26 | const stemmer = new Stemmer(params.language) 27 | const stopwords = load(params.language) 28 | const parser = new Parser(stemmer, stopwords).process(wordArray) 29 | 30 | // step 3: build a co-occurrence matrix for all words (-> stems) 31 | const stemList = stemmer.getStems() 32 | const matrix = new Matrix(stemList) 33 | for (const phrase of parser.phrases) { 34 | matrix.process(phrase.stems) 35 | } 36 | const stemScores = matrix.calculateScores() 37 | 38 | // step 4: examine the phrases with the best combined scores 39 | for (const phrase of parser.phrases) { 40 | phrase.calculateScore(stemScores) 41 | } 42 | parser.joinDuplicates() 43 | return parser.bestPhrases() 44 | } 45 | -------------------------------------------------------------------------------- /src/lib/stopwords/dutch.ts: -------------------------------------------------------------------------------- 1 | export default [ 2 | 'aan', 3 | 'achte', 4 | 'achter', 5 | 'af', 6 | 'al', 7 | 'alle', 8 | 'alleen', 9 | 'alles', 10 | 'als', 11 | 'ander', 12 | 'anders', 13 | 'beetje', 14 | 'behalve', 15 | 'beide', 16 | 'beiden', 17 | 'ben', 18 | 'beneden', 19 | 'bent', 20 | 'bij', 21 | 'bijna', 22 | 'bijv', 23 | 'blijkbaar', 24 | 'blijken', 25 | 'boven', 26 | 'bv', 27 | 'daar', 28 |
'daardoor', 29 | 'daarin', 30 | 'daarna', 31 | 'daarom', 32 | 'daaruit', 33 | 'dan', 34 | 'dat', 35 | 'de', 36 | 'deden', 37 | 'deed', 38 | 'derde', 39 | 'derhalve', 40 | 'dertig', 41 | 'deze', 42 | 'dhr', 43 | 'die', 44 | 'dit', 45 | 'doe', 46 | 'doen', 47 | 'doet', 48 | 'door', 49 | 'drie', 50 | 'duizend', 51 | 'echter', 52 | 'een', 53 | 'eens', 54 | 'eerst', 55 | 'eerste', 56 | 'eigen', 57 | 'eigenlijk', 58 | 'elk', 59 | 'elke', 60 | 'en', 61 | 'enige', 62 | 'er', 63 | 'erg', 64 | 'ergens', 65 | 'etc', 66 | 'etcetera', 67 | 'even', 68 | 'geen', 69 | 'genoeg', 70 | 'geweest', 71 | 'haar', 72 | 'haarzelf', 73 | 'had', 74 | 'hadden', 75 | 'heb', 76 | 'hebben', 77 | 'hebt', 78 | 'hedden', 79 | 'heeft', 80 | 'heel', 81 | 'hem', 82 | 'hemzelf', 83 | 'hen', 84 | 'het', 85 | 'hetzelfde', 86 | 'hier', 87 | 'hierin', 88 | 'hierna', 89 | 'hierom', 90 | 'hij', 91 | 'hijzelf', 92 | 'hoe', 93 | 'honderd', 94 | 'hun', 95 | 'ieder', 96 | 'iedere', 97 | 'iedereen', 98 | 'iemand', 99 | 'iets', 100 | 'ik', 101 | 'in', 102 | 'inderdaad', 103 | 'intussen', 104 | 'is', 105 | 'ja', 106 | 'je', 107 | 'jij', 108 | 'jijzelf', 109 | 'jou', 110 | 'jouw', 111 | 'jullie', 112 | 'kan', 113 | 'kon', 114 | 'konden', 115 | 'kun', 116 | 'kunnen', 117 | 'kunt', 118 | 'laatst', 119 | 'later', 120 | 'lijken', 121 | 'lijkt', 122 | 'maak', 123 | 'maakt', 124 | 'maakte', 125 | 'maakten', 126 | 'maar', 127 | 'mag', 128 | 'maken', 129 | 'me', 130 | 'meer', 131 | 'meest', 132 | 'meestal', 133 | 'men', 134 | 'met', 135 | 'mevr', 136 | 'mij', 137 | 'mijn', 138 | 'minder', 139 | 'miss', 140 | 'misschien', 141 | 'missen', 142 | 'mits', 143 | 'mocht', 144 | 'mochten', 145 | 'moest', 146 | 'moesten', 147 | 'moet', 148 | 'moeten', 149 | 'mogen', 150 | 'mr', 151 | 'mrs', 152 | 'mw', 153 | 'na', 154 | 'naar', 155 | 'nam', 156 | 'namelijk', 157 | 'nee', 158 | 'neem', 159 | 'negen', 160 | 'nemen', 161 | 'nergens', 162 | 'niemand', 163 | 'niet', 164 | 'niets', 165 | 'niks', 166 | 'noch', 167 | 'nochtans', 168 | 
'nog', 169 | 'nooit', 170 | 'nu', 171 | 'nv', 172 | 'of', 173 | 'om', 174 | 'omdat', 175 | 'ondanks', 176 | 'onder', 177 | 'ondertussen', 178 | 'ons', 179 | 'onze', 180 | 'onzeker', 181 | 'ooit', 182 | 'ook', 183 | 'op', 184 | 'over', 185 | 'overal', 186 | 'overige', 187 | 'paar', 188 | 'per', 189 | 'recent', 190 | 'redelijk', 191 | 'samen', 192 | 'sinds', 193 | 'steeds', 194 | 'te', 195 | 'tegen', 196 | 'tegenover', 197 | 'thans', 198 | 'tien', 199 | 'tiende', 200 | 'tijdens', 201 | 'tja', 202 | 'toch', 203 | 'toe', 204 | 'tot', 205 | 'totdat', 206 | 'tussen', 207 | 'twee', 208 | 'tweede', 209 | 'u', 210 | 'uit', 211 | 'uw', 212 | 'vaak', 213 | 'van', 214 | 'vanaf', 215 | 'veel', 216 | 'veertig', 217 | 'verder', 218 | 'verscheidene', 219 | 'verschillende', 220 | 'via', 221 | 'vier', 222 | 'vierde', 223 | 'vijf', 224 | 'vijfde', 225 | 'vijftig', 226 | 'volgend', 227 | 'volgens', 228 | 'voor', 229 | 'voordat', 230 | 'voorts', 231 | 'waar', 232 | 'waarom', 233 | 'waarschijnlijk', 234 | 'wanneer', 235 | 'waren', 236 | 'was', 237 | 'wat', 238 | 'we', 239 | 'wederom', 240 | 'weer', 241 | 'weinig', 242 | 'wel', 243 | 'welk', 244 | 'welke', 245 | 'werd', 246 | 'werden', 247 | 'werder', 248 | 'whatever', 249 | 'wie', 250 | 'wij', 251 | 'wijzelf', 252 | 'wil', 253 | 'wilden', 254 | 'willen', 255 | 'word', 256 | 'worden', 257 | 'wordt', 258 | 'zal', 259 | 'ze', 260 | 'zei', 261 | 'zeker', 262 | 'zelf', 263 | 'zelfde', 264 | 'zes', 265 | 'zeven', 266 | 'zich', 267 | 'zij', 268 | 'zijn', 269 | 'zijzelf', 270 | 'zo', 271 | 'zoals', 272 | 'zodat', 273 | 'zou', 274 | 'zouden', 275 | 'zulk', 276 | 'zullen', 277 | ] 278 | -------------------------------------------------------------------------------- /src/lib/stopwords/english.ts: -------------------------------------------------------------------------------- 1 | export default [ 2 | 'a', 3 | "a's", 4 | 'able', 5 | 'about', 6 | 'above', 7 | 'according', 8 | 'accordingly', 9 | 'across', 10 | 'actually', 11 | 'after', 12 | 
'afterwards', 13 | 'again', 14 | 'against', 15 | "ain't", 16 | 'all', 17 | 'allow', 18 | 'allows', 19 | 'almost', 20 | 'alone', 21 | 'along', 22 | 'already', 23 | 'also', 24 | 'although', 25 | 'always', 26 | 'am', 27 | 'among', 28 | 'amongst', 29 | 'an', 30 | 'and', 31 | 'another', 32 | 'any', 33 | 'anybody', 34 | 'anyhow', 35 | 'anyone', 36 | 'anything', 37 | 'anyway', 38 | 'anyways', 39 | 'anywhere', 40 | 'apart', 41 | 'appear', 42 | 'appreciate', 43 | 'appropriate', 44 | 'are', 45 | "aren't", 46 | 'around', 47 | 'as', 48 | 'aside', 49 | 'ask', 50 | 'asking', 51 | 'associated', 52 | 'at', 53 | 'available', 54 | 'away', 55 | 'awfully', 56 | 'b', 57 | 'be', 58 | 'became', 59 | 'because', 60 | 'become', 61 | 'becomes', 62 | 'becoming', 63 | 'been', 64 | 'before', 65 | 'beforehand', 66 | 'behind', 67 | 'being', 68 | 'believe', 69 | 'below', 70 | 'beside', 71 | 'besides', 72 | 'best', 73 | 'better', 74 | 'between', 75 | 'beyond', 76 | 'both', 77 | 'brief', 78 | 'but', 79 | 'by', 80 | 'c', 81 | "c'mon", 82 | "c's", 83 | 'came', 84 | 'can', 85 | "can't", 86 | 'cannot', 87 | 'cant', 88 | 'cause', 89 | 'causes', 90 | 'certain', 91 | 'certainly', 92 | 'changes', 93 | 'clearly', 94 | 'co', 95 | 'com', 96 | 'come', 97 | 'comes', 98 | 'concerning', 99 | 'consequently', 100 | 'consider', 101 | 'considering', 102 | 'contain', 103 | 'containing', 104 | 'contains', 105 | 'corresponding', 106 | 'could', 107 | "couldn't", 108 | 'course', 109 | 'currently', 110 | 'd', 111 | 'definitely', 112 | 'described', 113 | 'despite', 114 | 'did', 115 | "didn't", 116 | 'different', 117 | 'do', 118 | 'does', 119 | "doesn't", 120 | 'doing', 121 | "don't", 122 | 'done', 123 | 'down', 124 | 'downwards', 125 | 'during', 126 | 'e', 127 | 'each', 128 | 'edu', 129 | 'eg', 130 | 'eight', 131 | 'either', 132 | 'else', 133 | 'elsewhere', 134 | 'enough', 135 | 'entirely', 136 | 'especially', 137 | 'et', 138 | 'etc', 139 | 'even', 140 | 'ever', 141 | 'every', 142 | 'everybody', 143 | 'everyone', 144 | 
'everything', 145 | 'everywhere', 146 | 'ex', 147 | 'exactly', 148 | 'example', 149 | 'except', 150 | 'f', 151 | 'far', 152 | 'few', 153 | 'fifth', 154 | 'first', 155 | 'five', 156 | 'followed', 157 | 'following', 158 | 'follows', 159 | 'for', 160 | 'former', 161 | 'formerly', 162 | 'forth', 163 | 'four', 164 | 'from', 165 | 'further', 166 | 'furthermore', 167 | 'g', 168 | 'get', 169 | 'gets', 170 | 'getting', 171 | 'given', 172 | 'gives', 173 | 'go', 174 | 'goes', 175 | 'going', 176 | 'gone', 177 | 'got', 178 | 'gotten', 179 | 'greetings', 180 | 'h', 181 | 'had', 182 | "hadn't", 183 | 'happens', 184 | 'hardly', 185 | 'has', 186 | "hasn't", 187 | 'have', 188 | "haven't", 189 | 'having', 190 | 'he', 191 | "he's", 192 | 'hello', 193 | 'help', 194 | 'hence', 195 | 'her', 196 | 'here', 197 | "here's", 198 | 'hereafter', 199 | 'hereby', 200 | 'herein', 201 | 'hereupon', 202 | 'hers', 203 | 'herself', 204 | 'hi', 205 | 'him', 206 | 'himself', 207 | 'his', 208 | 'hither', 209 | 'hopefully', 210 | 'how', 211 | 'howbeit', 212 | 'however', 213 | 'i', 214 | "i'd", 215 | "i'll", 216 | "i'm", 217 | "i've", 218 | 'ie', 219 | 'if', 220 | 'ignored', 221 | 'immediate', 222 | 'in', 223 | 'inasmuch', 224 | 'inc', 225 | 'indeed', 226 | 'indicate', 227 | 'indicated', 228 | 'indicates', 229 | 'inner', 230 | 'insofar', 231 | 'instead', 232 | 'into', 233 | 'inward', 234 | 'is', 235 | "isn't", 236 | 'it', 237 | "it'd", 238 | "it'll", 239 | "it's", 240 | 'its', 241 | 'itself', 242 | 'j', 243 | 'just', 244 | 'k', 245 | 'keep', 246 | 'keeps', 247 | 'kept', 248 | 'know', 249 | 'known', 250 | 'knows', 251 | 'l', 252 | 'last', 253 | 'lately', 254 | 'later', 255 | 'latter', 256 | 'latterly', 257 | 'least', 258 | 'less', 259 | 'lest', 260 | 'let', 261 | "let's", 262 | 'like', 263 | 'liked', 264 | 'likely', 265 | 'little', 266 | 'look', 267 | 'looking', 268 | 'looks', 269 | 'ltd', 270 | 'm', 271 | 'mainly', 272 | 'many', 273 | 'may', 274 | 'maybe', 275 | 'me', 276 | 'mean', 277 | 'meanwhile', 278 | 
'merely', 279 | 'might', 280 | 'more', 281 | 'moreover', 282 | 'most', 283 | 'mostly', 284 | 'much', 285 | 'must', 286 | 'my', 287 | 'myself', 288 | 'n', 289 | 'name', 290 | 'namely', 291 | 'nd', 292 | 'near', 293 | 'nearly', 294 | 'necessary', 295 | 'need', 296 | 'needs', 297 | 'neither', 298 | 'never', 299 | 'nevertheless', 300 | 'new', 301 | 'next', 302 | 'nine', 303 | 'no', 304 | 'nobody', 305 | 'non', 306 | 'none', 307 | 'noone', 308 | 'nor', 309 | 'normally', 310 | 'not', 311 | 'nothing', 312 | 'novel', 313 | 'now', 314 | 'nowhere', 315 | 'o', 316 | 'obviously', 317 | 'of', 318 | 'off', 319 | 'often', 320 | 'oh', 321 | 'ok', 322 | 'okay', 323 | 'old', 324 | 'on', 325 | 'once', 326 | 'one', 327 | 'ones', 328 | 'only', 329 | 'onto', 330 | 'or', 331 | 'other', 332 | 'others', 333 | 'otherwise', 334 | 'ought', 335 | 'our', 336 | 'ours', 337 | 'ourselves', 338 | 'out', 339 | 'outside', 340 | 'over', 341 | 'overall', 342 | 'own', 343 | 'p', 344 | 'particular', 345 | 'particularly', 346 | 'per', 347 | 'perhaps', 348 | 'placed', 349 | 'please', 350 | 'plus', 351 | 'possible', 352 | 'presumably', 353 | 'probably', 354 | 'provides', 355 | 'q', 356 | 'que', 357 | 'quite', 358 | 'qv', 359 | 'r', 360 | 'rather', 361 | 'rd', 362 | 're', 363 | 'really', 364 | 'reasonably', 365 | 'regarding', 366 | 'regardless', 367 | 'regards', 368 | 'relatively', 369 | 'respectively', 370 | 'right', 371 | 's', 372 | 'said', 373 | 'same', 374 | 'saw', 375 | 'say', 376 | 'saying', 377 | 'says', 378 | 'second', 379 | 'secondly', 380 | 'see', 381 | 'seeing', 382 | 'seem', 383 | 'seemed', 384 | 'seeming', 385 | 'seems', 386 | 'seen', 387 | 'self', 388 | 'selves', 389 | 'sensible', 390 | 'sent', 391 | 'serious', 392 | 'seriously', 393 | 'seven', 394 | 'several', 395 | 'shall', 396 | 'she', 397 | 'should', 398 | "shouldn't", 399 | 'since', 400 | 'six', 401 | 'so', 402 | 'some', 403 | 'somebody', 404 | 'somehow', 405 | 'someone', 406 | 'something', 407 | 'sometime', 408 | 'sometimes', 409 | 
'somewhat', 410 | 'somewhere', 411 | 'soon', 412 | 'sorry', 413 | 'specified', 414 | 'specify', 415 | 'specifying', 416 | 'still', 417 | 'sub', 418 | 'such', 419 | 'sup', 420 | 'sure', 421 | 't', 422 | "t's", 423 | 'take', 424 | 'taken', 425 | 'tell', 426 | 'tends', 427 | 'th', 428 | 'than', 429 | 'thank', 430 | 'thanks', 431 | 'thanx', 432 | 'that', 433 | "that's", 434 | 'thats', 435 | 'the', 436 | 'their', 437 | 'theirs', 438 | 'them', 439 | 'themselves', 440 | 'then', 441 | 'thence', 442 | 'there', 443 | "there's", 444 | 'thereafter', 445 | 'thereby', 446 | 'therefore', 447 | 'therein', 448 | 'theres', 449 | 'thereupon', 450 | 'these', 451 | 'they', 452 | "they'd", 453 | "they'll", 454 | "they're", 455 | "they've", 456 | 'think', 457 | 'third', 458 | 'this', 459 | 'thorough', 460 | 'thoroughly', 461 | 'those', 462 | 'though', 463 | 'three', 464 | 'through', 465 | 'throughout', 466 | 'thru', 467 | 'thus', 468 | 'to', 469 | 'together', 470 | 'too', 471 | 'took', 472 | 'toward', 473 | 'towards', 474 | 'tried', 475 | 'tries', 476 | 'truly', 477 | 'try', 478 | 'trying', 479 | 'twice', 480 | 'two', 481 | 'u', 482 | 'un', 483 | 'under', 484 | 'unfortunately', 485 | 'unless', 486 | 'unlikely', 487 | 'until', 488 | 'unto', 489 | 'up', 490 | 'upon', 491 | 'us', 492 | 'use', 493 | 'used', 494 | 'useful', 495 | 'uses', 496 | 'using', 497 | 'usually', 498 | 'uucp', 499 | 'v', 500 | 'value', 501 | 'various', 502 | 'very', 503 | 'via', 504 | 'viz', 505 | 'vs', 506 | 'w', 507 | 'want', 508 | 'wants', 509 | 'was', 510 | "wasn't", 511 | 'way', 512 | 'we', 513 | "we'd", 514 | "we'll", 515 | "we're", 516 | "we've", 517 | 'welcome', 518 | 'well', 519 | 'went', 520 | 'were', 521 | "weren't", 522 | 'what', 523 | "what's", 524 | 'whatever', 525 | 'when', 526 | 'whence', 527 | 'whenever', 528 | 'where', 529 | "where's", 530 | 'whereafter', 531 | 'whereas', 532 | 'whereby', 533 | 'wherein', 534 | 'whereupon', 535 | 'wherever', 536 | 'whether', 537 | 'which', 538 | 'while', 539 | 
'whither', 540 | 'who', 541 | "who's", 542 | 'whoever', 543 | 'whole', 544 | 'whom', 545 | 'whose', 546 | 'why', 547 | 'will', 548 | 'willing', 549 | 'wish', 550 | 'with', 551 | 'within', 552 | 'without', 553 | "won't", 554 | 'wonder', 555 | 'would', 556 | "wouldn't", 557 | 'x', 558 | 'y', 559 | 'yes', 560 | 'yet', 561 | 'you', 562 | "you'd", 563 | "you'll", 564 | "you're", 565 | "you've", 566 | 'your', 567 | 'yours', 568 | 'yourself', 569 | 'yourselves', 570 | 'z', 571 | 'zero', 572 | ] 573 | -------------------------------------------------------------------------------- /src/lib/stopwords/german.ts: -------------------------------------------------------------------------------- 1 | export default [ 2 | 'ernst', 3 | 'ordnung', 4 | 'schluss', 5 | 'a', 6 | 'ab', 7 | 'aber', 8 | 'ach', 9 | 'acht', 10 | 'achte', 11 | 'achten', 12 | 'achter', 13 | 'achtes', 14 | 'ag', 15 | 'alle', 16 | 'allein', 17 | 'allem', 18 | 'allen', 19 | 'aller', 20 | 'allerdings', 21 | 'alles', 22 | 'allgemeinen', 23 | 'als', 24 | 'also', 25 | 'am', 26 | 'an', 27 | 'andere', 28 | 'anderen', 29 | 'andern', 30 | 'anders', 31 | 'au', 32 | 'auch', 33 | 'auf', 34 | 'aus', 35 | 'ausser', 36 | 'ausserdem', 37 | 'außer', 38 | 'außerdem', 39 | 'b', 40 | 'bald', 41 | 'bei', 42 | 'beide', 43 | 'beiden', 44 | 'beim', 45 | 'beispiel', 46 | 'bekannt', 47 | 'bereits', 48 | 'besonders', 49 | 'besser', 50 | 'besten', 51 | 'bin', 52 | 'bis', 53 | 'bisher', 54 | 'bist', 55 | 'c', 56 | 'd', 57 | 'd.h', 58 | 'da', 59 | 'dabei', 60 | 'dadurch', 61 | 'dafür', 62 | 'dagegen', 63 | 'daher', 64 | 'dahin', 65 | 'dahinter', 66 | 'damals', 67 | 'damit', 68 | 'danach', 69 | 'daneben', 70 | 'dank', 71 | 'dann', 72 | 'daran', 73 | 'darauf', 74 | 'daraus', 75 | 'darf', 76 | 'darfst', 77 | 'darin', 78 | 'darum', 79 | 'darunter', 80 | 'darüber', 81 | 'das', 82 | 'dasein', 83 | 'daselbst', 84 | 'dass', 85 | 'dasselbe', 86 | 'davon', 87 | 'davor', 88 | 'dazu', 89 | 'dazwischen', 90 | 'daß', 91 | 'dein', 92 | 'deine', 93 | 
'deinem', 94 | 'deiner', 95 | 'dem', 96 | 'dementsprechend', 97 | 'demgegenüber', 98 | 'demgemäss', 99 | 'demgemäß', 100 | 'demselben', 101 | 'demzufolge', 102 | 'den', 103 | 'denen', 104 | 'denn', 105 | 'denselben', 106 | 'der', 107 | 'deren', 108 | 'derjenige', 109 | 'derjenigen', 110 | 'dermassen', 111 | 'dermaßen', 112 | 'derselbe', 113 | 'derselben', 114 | 'des', 115 | 'deshalb', 116 | 'desselben', 117 | 'dessen', 118 | 'deswegen', 119 | 'dich', 120 | 'die', 121 | 'diejenige', 122 | 'diejenigen', 123 | 'dies', 124 | 'diese', 125 | 'dieselbe', 126 | 'dieselben', 127 | 'diesem', 128 | 'diesen', 129 | 'dieser', 130 | 'dieses', 131 | 'dir', 132 | 'doch', 133 | 'dort', 134 | 'drei', 135 | 'drin', 136 | 'dritte', 137 | 'dritten', 138 | 'dritter', 139 | 'drittes', 140 | 'du', 141 | 'durch', 142 | 'durchaus', 143 | 'durfte', 144 | 'durften', 145 | 'dürfen', 146 | 'dürft', 147 | 'e', 148 | 'eben', 149 | 'ebenso', 150 | 'ehrlich', 151 | 'ei', 152 | 'ei,', 153 | 'eigen', 154 | 'eigene', 155 | 'eigenen', 156 | 'eigener', 157 | 'eigenes', 158 | 'ein', 159 | 'einander', 160 | 'eine', 161 | 'einem', 162 | 'einen', 163 | 'einer', 164 | 'eines', 165 | 'einige', 166 | 'einigen', 167 | 'einiger', 168 | 'einiges', 169 | 'einmal', 170 | 'eins', 171 | 'elf', 172 | 'en', 173 | 'ende', 174 | 'endlich', 175 | 'entweder', 176 | 'er', 177 | 'erst', 178 | 'erste', 179 | 'ersten', 180 | 'erster', 181 | 'erstes', 182 | 'es', 183 | 'etwa', 184 | 'etwas', 185 | 'euch', 186 | 'euer', 187 | 'eure', 188 | 'f', 189 | 'folgende', 190 | 'früher', 191 | 'fünf', 192 | 'fünfte', 193 | 'fünften', 194 | 'fünfter', 195 | 'fünftes', 196 | 'für', 197 | 'g', 198 | 'gab', 199 | 'ganz', 200 | 'ganze', 201 | 'ganzen', 202 | 'ganzer', 203 | 'ganzes', 204 | 'gar', 205 | 'gedurft', 206 | 'gegen', 207 | 'gegenüber', 208 | 'gehabt', 209 | 'gehen', 210 | 'geht', 211 | 'gekannt', 212 | 'gekonnt', 213 | 'gemacht', 214 | 'gemocht', 215 | 'gemusst', 216 | 'genug', 217 | 'gerade', 218 | 'gern', 219 | 'gesagt', 220 | 
'geschweige', 221 | 'gewesen', 222 | 'gewollt', 223 | 'geworden', 224 | 'gibt', 225 | 'ging', 226 | 'gleich', 227 | 'gott', 228 | 'gross', 229 | 'grosse', 230 | 'grossen', 231 | 'grosser', 232 | 'grosses', 233 | 'groß', 234 | 'große', 235 | 'großen', 236 | 'großer', 237 | 'großes', 238 | 'gut', 239 | 'gute', 240 | 'guter', 241 | 'gutes', 242 | 'h', 243 | 'habe', 244 | 'haben', 245 | 'habt', 246 | 'hast', 247 | 'hat', 248 | 'hatte', 249 | 'hatten', 250 | 'hattest', 251 | 'hattet', 252 | 'heisst', 253 | 'her', 254 | 'heute', 255 | 'hier', 256 | 'hin', 257 | 'hinter', 258 | 'hoch', 259 | 'hätte', 260 | 'hätten', 261 | 'i', 262 | 'ich', 263 | 'ihm', 264 | 'ihn', 265 | 'ihnen', 266 | 'ihr', 267 | 'ihre', 268 | 'ihrem', 269 | 'ihren', 270 | 'ihrer', 271 | 'ihres', 272 | 'im', 273 | 'immer', 274 | 'in', 275 | 'indem', 276 | 'infolgedessen', 277 | 'ins', 278 | 'irgend', 279 | 'ist', 280 | 'j', 281 | 'ja', 282 | 'jahr', 283 | 'jahre', 284 | 'jahren', 285 | 'je', 286 | 'jede', 287 | 'jedem', 288 | 'jeden', 289 | 'jeder', 290 | 'jedermann', 291 | 'jedermanns', 292 | 'jedes', 293 | 'jedoch', 294 | 'jemand', 295 | 'jemandem', 296 | 'jemanden', 297 | 'jene', 298 | 'jenem', 299 | 'jenen', 300 | 'jener', 301 | 'jenes', 302 | 'jetzt', 303 | 'k', 304 | 'kam', 305 | 'kann', 306 | 'kannst', 307 | 'kaum', 308 | 'kein', 309 | 'keine', 310 | 'keinem', 311 | 'keinen', 312 | 'keiner', 313 | 'kleine', 314 | 'kleinen', 315 | 'kleiner', 316 | 'kleines', 317 | 'kommen', 318 | 'kommt', 319 | 'konnte', 320 | 'konnten', 321 | 'kurz', 322 | 'können', 323 | 'könnt', 324 | 'könnte', 325 | 'l', 326 | 'lang', 327 | 'lange', 328 | 'leicht', 329 | 'leide', 330 | 'lieber', 331 | 'los', 332 | 'm', 333 | 'machen', 334 | 'macht', 335 | 'machte', 336 | 'mag', 337 | 'magst', 338 | 'mahn', 339 | 'mal', 340 | 'man', 341 | 'manche', 342 | 'manchem', 343 | 'manchen', 344 | 'mancher', 345 | 'manches', 346 | 'mann', 347 | 'mehr', 348 | 'mein', 349 | 'meine', 350 | 'meinem', 351 | 'meinen', 352 | 'meiner', 353 | 
'meines', 354 | 'mensch', 355 | 'menschen', 356 | 'mich', 357 | 'mir', 358 | 'mit', 359 | 'mittel', 360 | 'mochte', 361 | 'mochten', 362 | 'morgen', 363 | 'muss', 364 | 'musst', 365 | 'musste', 366 | 'mussten', 367 | 'muß', 368 | 'mußt', 369 | 'möchte', 370 | 'mögen', 371 | 'möglich', 372 | 'mögt', 373 | 'müssen', 374 | 'müsst', 375 | 'müßt', 376 | 'n', 377 | 'na', 378 | 'nach', 379 | 'nachdem', 380 | 'nahm', 381 | 'natürlich', 382 | 'neben', 383 | 'nein', 384 | 'neue', 385 | 'neuen', 386 | 'neun', 387 | 'neunte', 388 | 'neunten', 389 | 'neunter', 390 | 'neuntes', 391 | 'nicht', 392 | 'nichts', 393 | 'nie', 394 | 'niemand', 395 | 'niemandem', 396 | 'niemanden', 397 | 'noch', 398 | 'nun', 399 | 'nur', 400 | 'o', 401 | 'ob', 402 | 'oben', 403 | 'oder', 404 | 'offen', 405 | 'oft', 406 | 'ohne', 407 | 'p', 408 | 'q', 409 | 'r', 410 | 'recht', 411 | 'rechte', 412 | 'rechten', 413 | 'rechter', 414 | 'rechtes', 415 | 'richtig', 416 | 'rund', 417 | 's', 418 | 'sa', 419 | 'sache', 420 | 'sagt', 421 | 'sagte', 422 | 'sah', 423 | 'satt', 424 | 'schlecht', 425 | 'schon', 426 | 'sechs', 427 | 'sechste', 428 | 'sechsten', 429 | 'sechster', 430 | 'sechstes', 431 | 'sehr', 432 | 'sei', 433 | 'seid', 434 | 'seien', 435 | 'sein', 436 | 'seine', 437 | 'seinem', 438 | 'seinen', 439 | 'seiner', 440 | 'seines', 441 | 'seit', 442 | 'seitdem', 443 | 'selbst', 444 | 'sich', 445 | 'sie', 446 | 'sieben', 447 | 'siebente', 448 | 'siebenten', 449 | 'siebenter', 450 | 'siebentes', 451 | 'sind', 452 | 'so', 453 | 'solang', 454 | 'solche', 455 | 'solchem', 456 | 'solchen', 457 | 'solcher', 458 | 'solches', 459 | 'soll', 460 | 'sollen', 461 | 'sollst', 462 | 'sollt', 463 | 'sollte', 464 | 'sollten', 465 | 'sondern', 466 | 'sonst', 467 | 'soweit', 468 | 'sowie', 469 | 'später', 470 | 'startseite', 471 | 'statt', 472 | 'steht', 473 | 'suche', 474 | 't', 475 | 'tag', 476 | 'tage', 477 | 'tagen', 478 | 'tat', 479 | 'teil', 480 | 'tel', 481 | 'tritt', 482 | 'trotzdem', 483 | 'tun', 484 | 'u', 485 | 
'uhr', 486 | 'um', 487 | 'und', 488 | 'und?', 489 | 'uns', 490 | 'unser', 491 | 'unsere', 492 | 'unserer', 493 | 'unter', 494 | 'v', 495 | 'vergangenen', 496 | 'viel', 497 | 'viele', 498 | 'vielem', 499 | 'vielen', 500 | 'vielleicht', 501 | 'vier', 502 | 'vierte', 503 | 'vierten', 504 | 'vierter', 505 | 'viertes', 506 | 'vom', 507 | 'von', 508 | 'vor', 509 | 'w', 510 | 'wahr?', 511 | 'wann', 512 | 'war', 513 | 'waren', 514 | 'wart', 515 | 'warum', 516 | 'was', 517 | 'wegen', 518 | 'weil', 519 | 'weit', 520 | 'weiter', 521 | 'weitere', 522 | 'weiteren', 523 | 'weiteres', 524 | 'welche', 525 | 'welchem', 526 | 'welchen', 527 | 'welcher', 528 | 'welches', 529 | 'wem', 530 | 'wen', 531 | 'wenig', 532 | 'wenige', 533 | 'weniger', 534 | 'weniges', 535 | 'wenigstens', 536 | 'wenn', 537 | 'wer', 538 | 'werde', 539 | 'werden', 540 | 'werdet', 541 | 'weshalb', 542 | 'wessen', 543 | 'wie', 544 | 'wieder', 545 | 'wieso', 546 | 'will', 547 | 'willst', 548 | 'wir', 549 | 'wird', 550 | 'wirklich', 551 | 'wirst', 552 | 'wissen', 553 | 'wo', 554 | 'wohl', 555 | 'wollen', 556 | 'wollt', 557 | 'wollte', 558 | 'wollten', 559 | 'worden', 560 | 'wurde', 561 | 'wurden', 562 | 'während', 563 | 'währenddem', 564 | 'währenddessen', 565 | 'wäre', 566 | 'würde', 567 | 'würden', 568 | 'x', 569 | 'y', 570 | 'z', 571 | 'z.b', 572 | 'zehn', 573 | 'zehnte', 574 | 'zehnten', 575 | 'zehnter', 576 | 'zehntes', 577 | 'zeit', 578 | 'zu', 579 | 'zuerst', 580 | 'zugleich', 581 | 'zum', 582 | 'zunächst', 583 | 'zur', 584 | 'zurück', 585 | 'zusammen', 586 | 'zwanzig', 587 | 'zwar', 588 | 'zwei', 589 | 'zweite', 590 | 'zweiten', 591 | 'zweiter', 592 | 'zweites', 593 | 'zwischen', 594 | 'zwölf', 595 | 'über', 596 | 'überhaupt', 597 | 'übrigens', 598 | ] 599 | -------------------------------------------------------------------------------- /src/lib/stopwords/italian.ts: -------------------------------------------------------------------------------- 1 | export default [ 2 | 'IE', 3 | 'a', 4 | 'abbastanza', 5 
| 'abbia', 6 | 'abbiamo', 7 | 'abbiano', 8 | 'abbiate', 9 | 'accidenti', 10 | 'ad', 11 | 'adesso', 12 | 'affinche', 13 | 'agl', 14 | 'agli', 15 | 'ahime', 16 | 'ahimè', 17 | 'ai', 18 | 'al', 19 | 'alcuna', 20 | 'alcuni', 21 | 'alcuno', 22 | 'all', 23 | 'alla', 24 | 'alle', 25 | 'allo', 26 | 'allora', 27 | 'altri', 28 | 'altrimenti', 29 | 'altro', 30 | 'altrove', 31 | 'altrui', 32 | 'anche', 33 | 'ancora', 34 | 'anni', 35 | 'anno', 36 | 'ansa', 37 | 'anticipo', 38 | 'assai', 39 | 'attesa', 40 | 'attraverso', 41 | 'avanti', 42 | 'avemmo', 43 | 'avendo', 44 | 'avente', 45 | 'aver', 46 | 'avere', 47 | 'averlo', 48 | 'avesse', 49 | 'avessero', 50 | 'avessi', 51 | 'avessimo', 52 | 'aveste', 53 | 'avesti', 54 | 'avete', 55 | 'aveva', 56 | 'avevamo', 57 | 'avevano', 58 | 'avevate', 59 | 'avevi', 60 | 'avevo', 61 | 'avrai', 62 | 'avranno', 63 | 'avrebbe', 64 | 'avrebbero', 65 | 'avrei', 66 | 'avremmo', 67 | 'avremo', 68 | 'avreste', 69 | 'avresti', 70 | 'avrete', 71 | 'avrà', 72 | 'avrò', 73 | 'avuta', 74 | 'avute', 75 | 'avuti', 76 | 'avuto', 77 | 'basta', 78 | 'bene', 79 | 'benissimo', 80 | 'berlusconi', 81 | 'brava', 82 | 'bravo', 83 | 'c', 84 | 'casa', 85 | 'caso', 86 | 'cento', 87 | 'certa', 88 | 'certe', 89 | 'certi', 90 | 'certo', 91 | 'che', 92 | 'chi', 93 | 'chicchessia', 94 | 'chiunque', 95 | 'ci', 96 | 'ciascuna', 97 | 'ciascuno', 98 | 'cima', 99 | 'cio', 100 | 'cioe', 101 | 'cioè', 102 | 'circa', 103 | 'citta', 104 | 'città', 105 | 'ciò', 106 | 'co', 107 | 'codesta', 108 | 'codesti', 109 | 'codesto', 110 | 'cogli', 111 | 'coi', 112 | 'col', 113 | 'colei', 114 | 'coll', 115 | 'coloro', 116 | 'colui', 117 | 'come', 118 | 'cominci', 119 | 'comunque', 120 | 'con', 121 | 'concernente', 122 | 'conciliarsi', 123 | 'conclusione', 124 | 'consiglio', 125 | 'contro', 126 | 'cortesia', 127 | 'cos', 128 | 'cosa', 129 | 'cosi', 130 | 'così', 131 | 'cui', 132 | 'd', 133 | 'da', 134 | 'dagl', 135 | 'dagli', 136 | 'dai', 137 | 'dal', 138 | 'dall', 139 | 'dalla', 140 | 'dalle', 
141 | 'dallo', 142 | 'dappertutto', 143 | 'davanti', 144 | 'degl', 145 | 'degli', 146 | 'dei', 147 | 'del', 148 | 'dell', 149 | 'della', 150 | 'delle', 151 | 'dello', 152 | 'dentro', 153 | 'detto', 154 | 'deve', 155 | 'di', 156 | 'dice', 157 | 'dietro', 158 | 'dire', 159 | 'dirimpetto', 160 | 'diventa', 161 | 'diventare', 162 | 'diventato', 163 | 'dopo', 164 | 'dov', 165 | 'dove', 166 | 'dovra', 167 | 'dovrà', 168 | 'dovunque', 169 | 'due', 170 | 'dunque', 171 | 'durante', 172 | 'e', 173 | 'ebbe', 174 | 'ebbero', 175 | 'ebbi', 176 | 'ecc', 177 | 'ecco', 178 | 'ed', 179 | 'effettivamente', 180 | 'egli', 181 | 'ella', 182 | 'entrambi', 183 | 'eppure', 184 | 'era', 185 | 'erano', 186 | 'eravamo', 187 | 'eravate', 188 | 'eri', 189 | 'ero', 190 | 'esempio', 191 | 'esse', 192 | 'essendo', 193 | 'esser', 194 | 'essere', 195 | 'essi', 196 | 'ex', 197 | 'fa', 198 | 'faccia', 199 | 'facciamo', 200 | 'facciano', 201 | 'facciate', 202 | 'faccio', 203 | 'facemmo', 204 | 'facendo', 205 | 'facesse', 206 | 'facessero', 207 | 'facessi', 208 | 'facessimo', 209 | 'faceste', 210 | 'facesti', 211 | 'faceva', 212 | 'facevamo', 213 | 'facevano', 214 | 'facevate', 215 | 'facevi', 216 | 'facevo', 217 | 'fai', 218 | 'fanno', 219 | 'farai', 220 | 'faranno', 221 | 'fare', 222 | 'farebbe', 223 | 'farebbero', 224 | 'farei', 225 | 'faremmo', 226 | 'faremo', 227 | 'fareste', 228 | 'faresti', 229 | 'farete', 230 | 'farà', 231 | 'farò', 232 | 'fatto', 233 | 'favore', 234 | 'fece', 235 | 'fecero', 236 | 'feci', 237 | 'fin', 238 | 'finalmente', 239 | 'finche', 240 | 'fine', 241 | 'fino', 242 | 'forse', 243 | 'forza', 244 | 'fosse', 245 | 'fossero', 246 | 'fossi', 247 | 'fossimo', 248 | 'foste', 249 | 'fosti', 250 | 'fra', 251 | 'frattempo', 252 | 'fu', 253 | 'fui', 254 | 'fummo', 255 | 'fuori', 256 | 'furono', 257 | 'futuro', 258 | 'generale', 259 | 'gia', 260 | 'giacche', 261 | 'giorni', 262 | 'giorno', 263 | 'già', 264 | 'gli', 265 | 'gliela', 266 | 'gliele', 267 | 'glieli', 268 | 'glielo', 269 | 
'gliene', 270 | 'governo', 271 | 'grande', 272 | 'grazie', 273 | 'gruppo', 274 | 'ha', 275 | 'haha', 276 | 'hai', 277 | 'hanno', 278 | 'ho', 279 | 'i', 280 | 'ieri', 281 | 'il', 282 | 'improvviso', 283 | 'in', 284 | 'inc', 285 | 'infatti', 286 | 'inoltre', 287 | 'insieme', 288 | 'intanto', 289 | 'intorno', 290 | 'invece', 291 | 'io', 292 | 'l', 293 | 'la', 294 | 'lasciato', 295 | 'lato', 296 | 'lavoro', 297 | 'le', 298 | 'lei', 299 | 'li', 300 | 'lo', 301 | 'lontano', 302 | 'loro', 303 | 'lui', 304 | 'lungo', 305 | 'luogo', 306 | 'là', 307 | 'ma', 308 | 'macche', 309 | 'magari', 310 | 'maggior', 311 | 'mai', 312 | 'male', 313 | 'malgrado', 314 | 'malissimo', 315 | 'mancanza', 316 | 'marche', 317 | 'me', 318 | 'medesimo', 319 | 'mediante', 320 | 'meglio', 321 | 'meno', 322 | 'mentre', 323 | 'mesi', 324 | 'mezzo', 325 | 'mi', 326 | 'mia', 327 | 'mie', 328 | 'miei', 329 | 'mila', 330 | 'miliardi', 331 | 'milioni', 332 | 'minimi', 333 | 'ministro', 334 | 'mio', 335 | 'modo', 336 | 'molti', 337 | 'moltissimo', 338 | 'molto', 339 | 'momento', 340 | 'mondo', 341 | 'mosto', 342 | 'nazionale', 343 | 'ne', 344 | 'negl', 345 | 'negli', 346 | 'nei', 347 | 'nel', 348 | 'nell', 349 | 'nella', 350 | 'nelle', 351 | 'nello', 352 | 'nemmeno', 353 | 'neppure', 354 | 'nessun', 355 | 'nessuna', 356 | 'nessuno', 357 | 'niente', 358 | 'no', 359 | 'noi', 360 | 'non', 361 | 'nondimeno', 362 | 'nonostante', 363 | 'nonsia', 364 | 'nostra', 365 | 'nostre', 366 | 'nostri', 367 | 'nostro', 368 | 'novanta', 369 | 'nove', 370 | 'nulla', 371 | 'nuovo', 372 | 'o', 373 | 'od', 374 | 'oggi', 375 | 'ogni', 376 | 'ognuna', 377 | 'ognuno', 378 | 'oltre', 379 | 'oppure', 380 | 'ora', 381 | 'ore', 382 | 'osi', 383 | 'ossia', 384 | 'ottanta', 385 | 'otto', 386 | 'paese', 387 | 'parecchi', 388 | 'parecchie', 389 | 'parecchio', 390 | 'parte', 391 | 'partendo', 392 | 'peccato', 393 | 'peggio', 394 | 'per', 395 | 'perche', 396 | 'perchè', 397 | 'perché', 398 | 'percio', 399 | 'perciò', 400 | 'perfino', 401 | 
'pero', 402 | 'persino', 403 | 'persone', 404 | 'però', 405 | 'piedi', 406 | 'pieno', 407 | 'piglia', 408 | 'piu', 409 | 'piuttosto', 410 | 'più', 411 | 'po', 412 | 'pochissimo', 413 | 'poco', 414 | 'poi', 415 | 'poiche', 416 | 'possa', 417 | 'possedere', 418 | 'posteriore', 419 | 'posto', 420 | 'potrebbe', 421 | 'preferibilmente', 422 | 'presa', 423 | 'press', 424 | 'prima', 425 | 'primo', 426 | 'principalmente', 427 | 'probabilmente', 428 | 'proprio', 429 | 'puo', 430 | 'pure', 431 | 'purtroppo', 432 | 'può', 433 | 'qualche', 434 | 'qualcosa', 435 | 'qualcuna', 436 | 'qualcuno', 437 | 'quale', 438 | 'quali', 439 | 'qualunque', 440 | 'quando', 441 | 'quanta', 442 | 'quante', 443 | 'quanti', 444 | 'quanto', 445 | 'quantunque', 446 | 'quasi', 447 | 'quattro', 448 | 'quel', 449 | 'quella', 450 | 'quelle', 451 | 'quelli', 452 | 'quello', 453 | 'quest', 454 | 'questa', 455 | 'queste', 456 | 'questi', 457 | 'questo', 458 | 'qui', 459 | 'quindi', 460 | 'realmente', 461 | 'recente', 462 | 'recentemente', 463 | 'registrazione', 464 | 'relativo', 465 | 'riecco', 466 | 'salvo', 467 | 'sara', 468 | 'sarai', 469 | 'saranno', 470 | 'sarebbe', 471 | 'sarebbero', 472 | 'sarei', 473 | 'saremmo', 474 | 'saremo', 475 | 'sareste', 476 | 'saresti', 477 | 'sarete', 478 | 'sarà', 479 | 'sarò', 480 | 'scola', 481 | 'scopo', 482 | 'scorso', 483 | 'se', 484 | 'secondo', 485 | 'seguente', 486 | 'seguito', 487 | 'sei', 488 | 'sembra', 489 | 'sembrare', 490 | 'sembrato', 491 | 'sembri', 492 | 'sempre', 493 | 'senza', 494 | 'sette', 495 | 'si', 496 | 'sia', 497 | 'siamo', 498 | 'siano', 499 | 'siate', 500 | 'siete', 501 | 'sig', 502 | 'solito', 503 | 'solo', 504 | 'soltanto', 505 | 'sono', 506 | 'sopra', 507 | 'sotto', 508 | 'spesso', 509 | 'srl', 510 | 'sta', 511 | 'stai', 512 | 'stando', 513 | 'stanno', 514 | 'starai', 515 | 'staranno', 516 | 'starebbe', 517 | 'starebbero', 518 | 'starei', 519 | 'staremmo', 520 | 'staremo', 521 | 'stareste', 522 | 'staresti', 523 | 'starete', 524 | 'starà', 
525 | 'starò', 526 | 'stata', 527 | 'state', 528 | 'stati', 529 | 'stato', 530 | 'stava', 531 | 'stavamo', 532 | 'stavano', 533 | 'stavate', 534 | 'stavi', 535 | 'stavo', 536 | 'stemmo', 537 | 'stessa', 538 | 'stesse', 539 | 'stessero', 540 | 'stessi', 541 | 'stessimo', 542 | 'stesso', 543 | 'steste', 544 | 'stesti', 545 | 'stette', 546 | 'stettero', 547 | 'stetti', 548 | 'stia', 549 | 'stiamo', 550 | 'stiano', 551 | 'stiate', 552 | 'sto', 553 | 'su', 554 | 'sua', 555 | 'subito', 556 | 'successivamente', 557 | 'successivo', 558 | 'sue', 559 | 'sugl', 560 | 'sugli', 561 | 'sui', 562 | 'sul', 563 | 'sull', 564 | 'sulla', 565 | 'sulle', 566 | 'sullo', 567 | 'suo', 568 | 'suoi', 569 | 'tale', 570 | 'tali', 571 | 'talvolta', 572 | 'tanto', 573 | 'te', 574 | 'tempo', 575 | 'ti', 576 | 'titolo', 577 | 'torino', 578 | 'tra', 579 | 'tranne', 580 | 'tre', 581 | 'trenta', 582 | 'troppo', 583 | 'trovato', 584 | 'tu', 585 | 'tua', 586 | 'tue', 587 | 'tuo', 588 | 'tuoi', 589 | 'tutta', 590 | 'tuttavia', 591 | 'tutte', 592 | 'tutti', 593 | 'tutto', 594 | 'uguali', 595 | 'ulteriore', 596 | 'ultimo', 597 | 'un', 598 | 'una', 599 | 'uno', 600 | 'uomo', 601 | 'va', 602 | 'vale', 603 | 'vari', 604 | 'varia', 605 | 'varie', 606 | 'vario', 607 | 'verso', 608 | 'vi', 609 | 'via', 610 | 'vicino', 611 | 'visto', 612 | 'vita', 613 | 'voi', 614 | 'volta', 615 | 'volte', 616 | 'vostra', 617 | 'vostre', 618 | 'vostri', 619 | 'vostro', 620 | 'è', 621 | ] 622 | -------------------------------------------------------------------------------- /src/lib/stopwords/portugese.ts: -------------------------------------------------------------------------------- 1 | export default [ 2 | 'a', 3 | 'acerca', 4 | 'adeus', 5 | 'agora', 6 | 'ainda', 7 | 'algmas', 8 | 'algo', 9 | 'algumas', 10 | 'alguns', 11 | 'ali', 12 | 'além', 13 | 'ambos', 14 | 'ano', 15 | 'anos', 16 | 'antes', 17 | 'ao', 18 | 'aos', 19 | 'apenas', 20 | 'apoio', 21 | 'apontar', 22 | 'após', 23 | 'aquela', 24 | 'aquelas', 25 | 'aquele', 26 | 
'aqueles', 27 | 'aqui', 28 | 'aquilo', 29 | 'as', 30 | 'assim', 31 | 'através', 32 | 'atrás', 33 | 'até', 34 | 'aí', 35 | 'baixo', 36 | 'bastante', 37 | 'bem', 38 | 'bom', 39 | 'breve', 40 | 'cada', 41 | 'caminho', 42 | 'catorze', 43 | 'cedo', 44 | 'cento', 45 | 'certamente', 46 | 'certeza', 47 | 'cima', 48 | 'cinco', 49 | 'coisa', 50 | 'com', 51 | 'como', 52 | 'comprido', 53 | 'conhecido', 54 | 'conselho', 55 | 'contra', 56 | 'corrente', 57 | 'custa', 58 | 'cá', 59 | 'da', 60 | 'daquela', 61 | 'daquele', 62 | 'dar', 63 | 'das', 64 | 'de', 65 | 'debaixo', 66 | 'demais', 67 | 'dentro', 68 | 'depois', 69 | 'desde', 70 | 'desligado', 71 | 'dessa', 72 | 'desse', 73 | 'desta', 74 | 'deste', 75 | 'deve', 76 | 'devem', 77 | 'deverá', 78 | 'dez', 79 | 'dezanove', 80 | 'dezasseis', 81 | 'dezassete', 82 | 'dezoito', 83 | 'dia', 84 | 'diante', 85 | 'direita', 86 | 'diz', 87 | 'dizem', 88 | 'dizer', 89 | 'do', 90 | 'dois', 91 | 'dos', 92 | 'doze', 93 | 'duas', 94 | 'dá', 95 | 'dão', 96 | 'dúvida', 97 | 'e', 98 | 'ela', 99 | 'elas', 100 | 'ele', 101 | 'eles', 102 | 'em', 103 | 'embora', 104 | 'enquanto', 105 | 'entre', 106 | 'então', 107 | 'era', 108 | 'essa', 109 | 'essas', 110 | 'esse', 111 | 'esses', 112 | 'esta', 113 | 'estado', 114 | 'estar', 115 | 'estará', 116 | 'estas', 117 | 'estava', 118 | 'este', 119 | 'estes', 120 | 'esteve', 121 | 'estive', 122 | 'estivemos', 123 | 'estiveram', 124 | 'estiveste', 125 | 'estivestes', 126 | 'estou', 127 | 'está', 128 | 'estás', 129 | 'estão', 130 | 'eu', 131 | 'exemplo', 132 | 'falta', 133 | 'fará', 134 | 'favor', 135 | 'faz', 136 | 'fazeis', 137 | 'fazem', 138 | 'fazemos', 139 | 'fazer', 140 | 'fazes', 141 | 'fazia', 142 | 'faço', 143 | 'fez', 144 | 'fim', 145 | 'final', 146 | 'foi', 147 | 'fomos', 148 | 'for', 149 | 'fora', 150 | 'foram', 151 | 'forma', 152 | 'foste', 153 | 'fostes', 154 | 'fui', 155 | 'geral', 156 | 'grande', 157 | 'grandes', 158 | 'grupo', 159 | 'hoje', 160 | 'horas', 161 | 'há', 162 | 'iniciar', 163 | 'inicio', 
164 | 'ir', 165 | 'irá', 166 | 'isso', 167 | 'ista', 168 | 'iste', 169 | 'isto', 170 | 'já', 171 | 'lado', 172 | 'ligado', 173 | 'local', 174 | 'logo', 175 | 'longe', 176 | 'lugar', 177 | 'lá', 178 | 'maior', 179 | 'maioria', 180 | 'maiorias', 181 | 'mais', 182 | 'mal', 183 | 'mas', 184 | 'me', 185 | 'meio', 186 | 'menor', 187 | 'menos', 188 | 'meses', 189 | 'mesmo', 190 | 'meu', 191 | 'meus', 192 | 'mil', 193 | 'minha', 194 | 'minhas', 195 | 'momento', 196 | 'muito', 197 | 'muitos', 198 | 'máximo', 199 | 'mês', 200 | 'na', 201 | 'nada', 202 | 'naquela', 203 | 'naquele', 204 | 'nas', 205 | 'nem', 206 | 'nenhuma', 207 | 'nessa', 208 | 'nesse', 209 | 'nesta', 210 | 'neste', 211 | 'no', 212 | 'noite', 213 | 'nome', 214 | 'nos', 215 | 'nossa', 216 | 'nossas', 217 | 'nosso', 218 | 'nossos', 219 | 'nova', 220 | 'nove', 221 | 'novo', 222 | 'novos', 223 | 'num', 224 | 'numa', 225 | 'nunca', 226 | 'não', 227 | 'nível', 228 | 'nós', 229 | 'número', 230 | 'o', 231 | 'obra', 232 | 'obrigada', 233 | 'obrigado', 234 | 'oitava', 235 | 'oitavo', 236 | 'oito', 237 | 'onde', 238 | 'ontem', 239 | 'onze', 240 | 'os', 241 | 'ou', 242 | 'outra', 243 | 'outras', 244 | 'outro', 245 | 'outros', 246 | 'para', 247 | 'parece', 248 | 'parte', 249 | 'partir', 250 | 'pegar', 251 | 'pela', 252 | 'pelas', 253 | 'pelo', 254 | 'pelos', 255 | 'perto', 256 | 'pessoas', 257 | 'pode', 258 | 'podem', 259 | 'poder', 260 | 'poderá', 261 | 'podia', 262 | 'ponto', 263 | 'pontos', 264 | 'por', 265 | 'porque', 266 | 'porquê', 267 | 'posição', 268 | 'possivelmente', 269 | 'posso', 270 | 'possível', 271 | 'pouca', 272 | 'pouco', 273 | 'povo', 274 | 'primeira', 275 | 'primeiro', 276 | 'promeiro', 277 | 'próprio', 278 | 'próximo', 279 | 'puderam', 280 | 'pôde', 281 | 'põe', 282 | 'põem', 283 | 'qual', 284 | 'qualquer', 285 | 'quando', 286 | 'quanto', 287 | 'quarta', 288 | 'quarto', 289 | 'quatro', 290 | 'que', 291 | 'quem', 292 | 'quer', 293 | 'quero', 294 | 'questão', 295 | 'quieto', 296 | 'quinta', 297 | 
'quinto', 298 | 'quinze', 299 | 'quê', 300 | 'relação', 301 | 'sabe', 302 | 'saber', 303 | 'se', 304 | 'segunda', 305 | 'segundo', 306 | 'sei', 307 | 'seis', 308 | 'sem', 309 | 'sempre', 310 | 'ser', 311 | 'seria', 312 | 'sete', 313 | 'seu', 314 | 'seus', 315 | 'sexta', 316 | 'sexto', 317 | 'sim', 318 | 'sistema', 319 | 'sob', 320 | 'sobre', 321 | 'sois', 322 | 'somente', 323 | 'somos', 324 | 'sou', 325 | 'sua', 326 | 'suas', 327 | 'são', 328 | 'sétima', 329 | 'sétimo', 330 | 'tal', 331 | 'talvez', 332 | 'também', 333 | 'tanto', 334 | 'tarde', 335 | 'te', 336 | 'tem', 337 | 'temos', 338 | 'tempo', 339 | 'tendes', 340 | 'tenho', 341 | 'tens', 342 | 'tentar', 343 | 'tentaram', 344 | 'tente', 345 | 'tentei', 346 | 'ter', 347 | 'terceira', 348 | 'terceiro', 349 | 'teu', 350 | 'teus', 351 | 'teve', 352 | 'tipo', 353 | 'tive', 354 | 'tivemos', 355 | 'tiveram', 356 | 'tiveste', 357 | 'tivestes', 358 | 'toda', 359 | 'todas', 360 | 'todo', 361 | 'todos', 362 | 'trabalhar', 363 | 'trabalho', 364 | 'treze', 365 | 'três', 366 | 'tu', 367 | 'tua', 368 | 'tuas', 369 | 'tudo', 370 | 'tão', 371 | 'têm', 372 | 'um', 373 | 'uma', 374 | 'umas', 375 | 'uns', 376 | 'usa', 377 | 'usar', 378 | 'vai', 379 | 'vais', 380 | 'valor', 381 | 'veja', 382 | 'vem', 383 | 'vens', 384 | 'ver', 385 | 'verdade', 386 | 'verdadeiro', 387 | 'vez', 388 | 'vezes', 389 | 'viagem', 390 | 'vindo', 391 | 'vinte', 392 | 'você', 393 | 'vocês', 394 | 'vos', 395 | 'vossa', 396 | 'vossas', 397 | 'vosso', 398 | 'vossos', 399 | 'vários', 400 | 'vão', 401 | 'vêm', 402 | 'vós', 403 | 'zero', 404 | 'à', 405 | 'às', 406 | 'área', 407 | 'é', 408 | 'és', 409 | 'último', 410 | ] 411 | -------------------------------------------------------------------------------- /src/lib/stopwords/spanish.ts: -------------------------------------------------------------------------------- 1 | export default [ 2 | 'a', 3 | 'actualmente', 4 | 'acuerdo', 5 | 'adelante', 6 | 'ademas', 7 | 'además', 8 | 'adrede', 9 | 'afirmó', 10 | 'agregó', 
11 | 'ahi', 12 | 'ahora', 13 | 'ahí', 14 | 'al', 15 | 'algo', 16 | 'alguna', 17 | 'algunas', 18 | 'alguno', 19 | 'algunos', 20 | 'algún', 21 | 'alli', 22 | 'allí', 23 | 'alrededor', 24 | 'ambos', 25 | 'ampleamos', 26 | 'antano', 27 | 'antaño', 28 | 'ante', 29 | 'anterior', 30 | 'antes', 31 | 'apenas', 32 | 'aproximadamente', 33 | 'aquel', 34 | 'aquella', 35 | 'aquellas', 36 | 'aquello', 37 | 'aquellos', 38 | 'aqui', 39 | 'aquél', 40 | 'aquélla', 41 | 'aquéllas', 42 | 'aquéllos', 43 | 'aquí', 44 | 'arriba', 45 | 'arribaabajo', 46 | 'aseguró', 47 | 'asi', 48 | 'así', 49 | 'atras', 50 | 'aun', 51 | 'aunque', 52 | 'ayer', 53 | 'añadió', 54 | 'aún', 55 | 'b', 56 | 'bajo', 57 | 'bastante', 58 | 'bien', 59 | 'breve', 60 | 'buen', 61 | 'buena', 62 | 'buenas', 63 | 'bueno', 64 | 'buenos', 65 | 'c', 66 | 'cada', 67 | 'casi', 68 | 'cerca', 69 | 'cierta', 70 | 'ciertas', 71 | 'cierto', 72 | 'ciertos', 73 | 'cinco', 74 | 'claro', 75 | 'comentó', 76 | 'como', 77 | 'con', 78 | 'conmigo', 79 | 'conocer', 80 | 'conseguimos', 81 | 'conseguir', 82 | 'considera', 83 | 'consideró', 84 | 'consigo', 85 | 'consigue', 86 | 'consiguen', 87 | 'consigues', 88 | 'contigo', 89 | 'contra', 90 | 'cosas', 91 | 'creo', 92 | 'cual', 93 | 'cuales', 94 | 'cualquier', 95 | 'cuando', 96 | 'cuanta', 97 | 'cuantas', 98 | 'cuanto', 99 | 'cuantos', 100 | 'cuatro', 101 | 'cuenta', 102 | 'cuál', 103 | 'cuáles', 104 | 'cuándo', 105 | 'cuánta', 106 | 'cuántas', 107 | 'cuánto', 108 | 'cuántos', 109 | 'cómo', 110 | 'd', 111 | 'da', 112 | 'dado', 113 | 'dan', 114 | 'dar', 115 | 'de', 116 | 'debajo', 117 | 'debe', 118 | 'deben', 119 | 'debido', 120 | 'decir', 121 | 'dejó', 122 | 'del', 123 | 'delante', 124 | 'demasiado', 125 | 'demás', 126 | 'dentro', 127 | 'deprisa', 128 | 'desde', 129 | 'despacio', 130 | 'despues', 131 | 'después', 132 | 'detras', 133 | 'detrás', 134 | 'dia', 135 | 'dias', 136 | 'dice', 137 | 'dicen', 138 | 'dicho', 139 | 'dieron', 140 | 'diferente', 141 | 'diferentes', 142 | 'dijeron', 143 | 
'dijo', 144 | 'dio', 145 | 'donde', 146 | 'dos', 147 | 'durante', 148 | 'día', 149 | 'días', 150 | 'dónde', 151 | 'e', 152 | 'ejemplo', 153 | 'el', 154 | 'ella', 155 | 'ellas', 156 | 'ello', 157 | 'ellos', 158 | 'embargo', 159 | 'empleais', 160 | 'emplean', 161 | 'emplear', 162 | 'empleas', 163 | 'empleo', 164 | 'en', 165 | 'encima', 166 | 'encuentra', 167 | 'enfrente', 168 | 'enseguida', 169 | 'entonces', 170 | 'entre', 171 | 'era', 172 | 'eramos', 173 | 'eran', 174 | 'eras', 175 | 'eres', 176 | 'es', 177 | 'esa', 178 | 'esas', 179 | 'ese', 180 | 'eso', 181 | 'esos', 182 | 'esta', 183 | 'estaba', 184 | 'estaban', 185 | 'estado', 186 | 'estados', 187 | 'estais', 188 | 'estamos', 189 | 'estan', 190 | 'estar', 191 | 'estará', 192 | 'estas', 193 | 'este', 194 | 'esto', 195 | 'estos', 196 | 'estoy', 197 | 'estuvo', 198 | 'está', 199 | 'están', 200 | 'ex', 201 | 'excepto', 202 | 'existe', 203 | 'existen', 204 | 'explicó', 205 | 'expresó', 206 | 'f', 207 | 'fin', 208 | 'final', 209 | 'fue', 210 | 'fuera', 211 | 'fueron', 212 | 'fui', 213 | 'fuimos', 214 | 'g', 215 | 'general', 216 | 'gran', 217 | 'grandes', 218 | 'gueno', 219 | 'h', 220 | 'ha', 221 | 'haber', 222 | 'habia', 223 | 'habla', 224 | 'hablan', 225 | 'habrá', 226 | 'había', 227 | 'habían', 228 | 'hace', 229 | 'haceis', 230 | 'hacemos', 231 | 'hacen', 232 | 'hacer', 233 | 'hacerlo', 234 | 'haces', 235 | 'hacia', 236 | 'haciendo', 237 | 'hago', 238 | 'han', 239 | 'hasta', 240 | 'hay', 241 | 'haya', 242 | 'he', 243 | 'hecho', 244 | 'hemos', 245 | 'hicieron', 246 | 'hizo', 247 | 'horas', 248 | 'hoy', 249 | 'hubo', 250 | 'i', 251 | 'igual', 252 | 'incluso', 253 | 'indicó', 254 | 'informo', 255 | 'informó', 256 | 'intenta', 257 | 'intentais', 258 | 'intentamos', 259 | 'intentan', 260 | 'intentar', 261 | 'intentas', 262 | 'intento', 263 | 'ir', 264 | 'j', 265 | 'junto', 266 | 'k', 267 | 'l', 268 | 'la', 269 | 'lado', 270 | 'largo', 271 | 'las', 272 | 'le', 273 | 'lejos', 274 | 'les', 275 | 'llegó', 276 | 'lleva', 277 
| 'llevar', 278 | 'lo', 279 | 'los', 280 | 'luego', 281 | 'lugar', 282 | 'm', 283 | 'mal', 284 | 'manera', 285 | 'manifestó', 286 | 'mas', 287 | 'mayor', 288 | 'me', 289 | 'mediante', 290 | 'medio', 291 | 'mejor', 292 | 'mencionó', 293 | 'menos', 294 | 'menudo', 295 | 'mi', 296 | 'mia', 297 | 'mias', 298 | 'mientras', 299 | 'mio', 300 | 'mios', 301 | 'mis', 302 | 'misma', 303 | 'mismas', 304 | 'mismo', 305 | 'mismos', 306 | 'modo', 307 | 'momento', 308 | 'mucha', 309 | 'muchas', 310 | 'mucho', 311 | 'muchos', 312 | 'muy', 313 | 'más', 314 | 'mí', 315 | 'mía', 316 | 'mías', 317 | 'mío', 318 | 'míos', 319 | 'n', 320 | 'nada', 321 | 'nadie', 322 | 'ni', 323 | 'ninguna', 324 | 'ningunas', 325 | 'ninguno', 326 | 'ningunos', 327 | 'ningún', 328 | 'no', 329 | 'nos', 330 | 'nosotras', 331 | 'nosotros', 332 | 'nuestra', 333 | 'nuestras', 334 | 'nuestro', 335 | 'nuestros', 336 | 'nueva', 337 | 'nuevas', 338 | 'nuevo', 339 | 'nuevos', 340 | 'nunca', 341 | 'o', 342 | 'ocho', 343 | 'os', 344 | 'otra', 345 | 'otras', 346 | 'otro', 347 | 'otros', 348 | 'p', 349 | 'pais', 350 | 'para', 351 | 'parece', 352 | 'parte', 353 | 'partir', 354 | 'pasada', 355 | 'pasado', 356 | 'paìs', 357 | 'peor', 358 | 'pero', 359 | 'pesar', 360 | 'poca', 361 | 'pocas', 362 | 'poco', 363 | 'pocos', 364 | 'podeis', 365 | 'podemos', 366 | 'poder', 367 | 'podria', 368 | 'podriais', 369 | 'podriamos', 370 | 'podrian', 371 | 'podrias', 372 | 'podrá', 373 | 'podrán', 374 | 'podría', 375 | 'podrían', 376 | 'poner', 377 | 'por', 378 | 'porque', 379 | 'posible', 380 | 'primer', 381 | 'primera', 382 | 'primero', 383 | 'primeros', 384 | 'principalmente', 385 | 'pronto', 386 | 'propia', 387 | 'propias', 388 | 'propio', 389 | 'propios', 390 | 'proximo', 391 | 'próximo', 392 | 'próximos', 393 | 'pudo', 394 | 'pueda', 395 | 'puede', 396 | 'pueden', 397 | 'puedo', 398 | 'pues', 399 | 'q', 400 | 'qeu', 401 | 'que', 402 | 'quedó', 403 | 'queremos', 404 | 'quien', 405 | 'quienes', 406 | 'quiere', 407 | 'quiza', 408 | 
'quizas', 409 | 'quizá', 410 | 'quizás', 411 | 'quién', 412 | 'quiénes', 413 | 'qué', 414 | 'r', 415 | 'raras', 416 | 'realizado', 417 | 'realizar', 418 | 'realizó', 419 | 'repente', 420 | 'respecto', 421 | 's', 422 | 'sabe', 423 | 'sabeis', 424 | 'sabemos', 425 | 'saben', 426 | 'saber', 427 | 'sabes', 428 | 'salvo', 429 | 'se', 430 | 'sea', 431 | 'sean', 432 | 'segun', 433 | 'segunda', 434 | 'segundo', 435 | 'según', 436 | 'seis', 437 | 'ser', 438 | 'sera', 439 | 'será', 440 | 'serán', 441 | 'sería', 442 | 'señaló', 443 | 'si', 444 | 'sido', 445 | 'siempre', 446 | 'siendo', 447 | 'siete', 448 | 'sigue', 449 | 'siguiente', 450 | 'sin', 451 | 'sino', 452 | 'sobre', 453 | 'sois', 454 | 'sola', 455 | 'solamente', 456 | 'solas', 457 | 'solo', 458 | 'solos', 459 | 'somos', 460 | 'son', 461 | 'soy', 462 | 'soyos', 463 | 'su', 464 | 'supuesto', 465 | 'sus', 466 | 'suya', 467 | 'suyas', 468 | 'suyo', 469 | 'sé', 470 | 'sí', 471 | 'sólo', 472 | 't', 473 | 'tal', 474 | 'tambien', 475 | 'también', 476 | 'tampoco', 477 | 'tan', 478 | 'tanto', 479 | 'tarde', 480 | 'te', 481 | 'temprano', 482 | 'tendrá', 483 | 'tendrán', 484 | 'teneis', 485 | 'tenemos', 486 | 'tener', 487 | 'tenga', 488 | 'tengo', 489 | 'tenido', 490 | 'tenía', 491 | 'tercera', 492 | 'ti', 493 | 'tiempo', 494 | 'tiene', 495 | 'tienen', 496 | 'toda', 497 | 'todas', 498 | 'todavia', 499 | 'todavía', 500 | 'todo', 501 | 'todos', 502 | 'total', 503 | 'trabaja', 504 | 'trabajais', 505 | 'trabajamos', 506 | 'trabajan', 507 | 'trabajar', 508 | 'trabajas', 509 | 'trabajo', 510 | 'tras', 511 | 'trata', 512 | 'través', 513 | 'tres', 514 | 'tu', 515 | 'tus', 516 | 'tuvo', 517 | 'tuya', 518 | 'tuyas', 519 | 'tuyo', 520 | 'tuyos', 521 | 'tú', 522 | 'u', 523 | 'ultimo', 524 | 'un', 525 | 'una', 526 | 'unas', 527 | 'uno', 528 | 'unos', 529 | 'usa', 530 | 'usais', 531 | 'usamos', 532 | 'usan', 533 | 'usar', 534 | 'usas', 535 | 'uso', 536 | 'usted', 537 | 'ustedes', 538 | 'v', 539 | 'va', 540 | 'vais', 541 | 'valor', 542 | 
'vamos', 543 | 'van', 544 | 'varias', 545 | 'varios', 546 | 'vaya', 547 | 'veces', 548 | 'ver', 549 | 'verdad', 550 | 'verdadera', 551 | 'verdadero', 552 | 'vez', 553 | 'vosotras', 554 | 'vosotros', 555 | 'voy', 556 | 'vuestra', 557 | 'vuestras', 558 | 'vuestro', 559 | 'vuestros', 560 | 'w', 561 | 'x', 562 | 'y', 563 | 'ya', 564 | 'yo', 565 | 'z', 566 | 'él', 567 | 'ésa', 568 | 'ésas', 569 | 'ése', 570 | 'ésos', 571 | 'ésta', 572 | 'éstas', 573 | 'éste', 574 | 'éstos', 575 | 'última', 576 | 'últimas', 577 | 'último', 578 | 'últimos', 579 | ] 580 | -------------------------------------------------------------------------------- /src/lib/stopwords/swedish.ts: -------------------------------------------------------------------------------- 1 | export default [ 2 | 'aderton', 3 | 'adertonde', 4 | 'adjö', 5 | 'aldrig', 6 | 'alla', 7 | 'allas', 8 | 'allt', 9 | 'alltid', 10 | 'alltså', 11 | 'andra', 12 | 'andras', 13 | 'annan', 14 | 'annat', 15 | 'artonde', 16 | 'artonn', 17 | 'att', 18 | 'av', 19 | 'bakom', 20 | 'bara', 21 | 'behöva', 22 | 'behövas', 23 | 'behövde', 24 | 'behövt', 25 | 'beslut', 26 | 'beslutat', 27 | 'beslutit', 28 | 'bland', 29 | 'blev', 30 | 'bli', 31 | 'blir', 32 | 'blivit', 33 | 'bort', 34 | 'borta', 35 | 'bra', 36 | 'bäst', 37 | 'bättre', 38 | 'båda', 39 | 'bådas', 40 | 'dag', 41 | 'dagar', 42 | 'dagarna', 43 | 'dagen', 44 | 'de', 45 | 'del', 46 | 'delen', 47 | 'dem', 48 | 'den', 49 | 'denna', 50 | 'deras', 51 | 'dess', 52 | 'dessa', 53 | 'det', 54 | 'detta', 55 | 'dig', 56 | 'din', 57 | 'dina', 58 | 'dit', 59 | 'ditt', 60 | 'dock', 61 | 'du', 62 | 'där', 63 | 'därför', 64 | 'då', 65 | 'efter', 66 | 'eftersom', 67 | 'ej', 68 | 'elfte', 69 | 'eller', 70 | 'elva', 71 | 'en', 72 | 'enkel', 73 | 'enkelt', 74 | 'enkla', 75 | 'enligt', 76 | 'er', 77 | 'era', 78 | 'ert', 79 | 'ett', 80 | 'ettusen', 81 | 'fanns', 82 | 'fem', 83 | 'femte', 84 | 'femtio', 85 | 'femtionde', 86 | 'femton', 87 | 'femtonde', 88 | 'fick', 89 | 'fin', 90 | 'finnas', 91 | 'finns', 
92 | 'fjorton', 93 | 'fjortonde', 94 | 'fjärde', 95 | 'fler', 96 | 'flera', 97 | 'flesta', 98 | 'fram', 99 | 'framför', 100 | 'från', 101 | 'fyra', 102 | 'fyrtio', 103 | 'fyrtionde', 104 | 'få', 105 | 'får', 106 | 'fått', 107 | 'följande', 108 | 'för', 109 | 'före', 110 | 'förlåt', 111 | 'förra', 112 | 'första', 113 | 'genast', 114 | 'genom', 115 | 'gick', 116 | 'gjorde', 117 | 'gjort', 118 | 'god', 119 | 'goda', 120 | 'godare', 121 | 'godast', 122 | 'gott', 123 | 'gälla', 124 | 'gäller', 125 | 'gällt', 126 | 'gärna', 127 | 'gå', 128 | 'går', 129 | 'gått', 130 | 'gör', 131 | 'göra', 132 | 'ha', 133 | 'hade', 134 | 'haft', 135 | 'han', 136 | 'hans', 137 | 'har', 138 | 'heller', 139 | 'hellre', 140 | 'helst', 141 | 'helt', 142 | 'henne', 143 | 'hennes', 144 | 'hit', 145 | 'hon', 146 | 'honom', 147 | 'hundra', 148 | 'hundraen', 149 | 'hundraett', 150 | 'hur', 151 | 'här', 152 | 'hög', 153 | 'höger', 154 | 'högre', 155 | 'högst', 156 | 'i', 157 | 'ibland', 158 | 'icke', 159 | 'idag', 160 | 'igen', 161 | 'igår', 162 | 'imorgon', 163 | 'in', 164 | 'inför', 165 | 'inga', 166 | 'ingen', 167 | 'ingenting', 168 | 'inget', 169 | 'innan', 170 | 'inne', 171 | 'inom', 172 | 'inte', 173 | 'inuti', 174 | 'ja', 175 | 'jag', 176 | 'ju', 177 | 'jämfört', 178 | 'kan', 179 | 'kanske', 180 | 'knappast', 181 | 'kom', 182 | 'komma', 183 | 'kommer', 184 | 'kommit', 185 | 'kr', 186 | 'kunde', 187 | 'kunna', 188 | 'kunnat', 189 | 'kvar', 190 | 'legat', 191 | 'ligga', 192 | 'ligger', 193 | 'lika', 194 | 'likställd', 195 | 'likställda', 196 | 'lilla', 197 | 'lite', 198 | 'liten', 199 | 'litet', 200 | 'länge', 201 | 'längre', 202 | 'längst', 203 | 'lätt', 204 | 'lättare', 205 | 'lättast', 206 | 'långsam', 207 | 'långsammare', 208 | 'långsammast', 209 | 'långsamt', 210 | 'långt', 211 | 'man', 212 | 'med', 213 | 'mellan', 214 | 'men', 215 | 'mer', 216 | 'mera', 217 | 'mest', 218 | 'mig', 219 | 'min', 220 | 'mina', 221 | 'mindre', 222 | 'minst', 223 | 'mitt', 224 | 'mittemot', 225 | 'mot', 226 | 
'mycket', 227 | 'många', 228 | 'måste', 229 | 'möjlig', 230 | 'möjligen', 231 | 'möjligt', 232 | 'möjligtvis', 233 | 'ned', 234 | 'nederst', 235 | 'nedersta', 236 | 'nedre', 237 | 'nej', 238 | 'ner', 239 | 'ni', 240 | 'nio', 241 | 'nionde', 242 | 'nittio', 243 | 'nittionde', 244 | 'nitton', 245 | 'nittonde', 246 | 'nog', 247 | 'noll', 248 | 'nr', 249 | 'nu', 250 | 'nummer', 251 | 'när', 252 | 'nästa', 253 | 'någon', 254 | 'någonting', 255 | 'något', 256 | 'några', 257 | 'nödvändig', 258 | 'nödvändiga', 259 | 'nödvändigt', 260 | 'nödvändigtvis', 261 | 'och', 262 | 'också', 263 | 'ofta', 264 | 'oftast', 265 | 'olika', 266 | 'olikt', 267 | 'om', 268 | 'oss', 269 | 'på', 270 | 'rakt', 271 | 'redan', 272 | 'rätt', 273 | 'sade', 274 | 'sagt', 275 | 'samma', 276 | 'sedan', 277 | 'senare', 278 | 'senast', 279 | 'sent', 280 | 'sex', 281 | 'sextio', 282 | 'sextionde', 283 | 'sexton', 284 | 'sextonde', 285 | 'sig', 286 | 'sin', 287 | 'sina', 288 | 'sist', 289 | 'sista', 290 | 'siste', 291 | 'sitt', 292 | 'sitta', 293 | 'sju', 294 | 'sjunde', 295 | 'sjuttio', 296 | 'sjuttionde', 297 | 'sjutton', 298 | 'sjuttonde', 299 | 'själv', 300 | 'sjätte', 301 | 'ska', 302 | 'skall', 303 | 'skulle', 304 | 'slutligen', 305 | 'små', 306 | 'smått', 307 | 'snart', 308 | 'som', 309 | 'stor', 310 | 'stora', 311 | 'stort', 312 | 'större', 313 | 'störst', 314 | 'säga', 315 | 'säger', 316 | 'sämre', 317 | 'sämst', 318 | 'så', 319 | 'sådan', 320 | 'sådana', 321 | 'sådant', 322 | 'tack', 323 | 'tidig', 324 | 'tidigare', 325 | 'tidigast', 326 | 'tidigt', 327 | 'till', 328 | 'tills', 329 | 'tillsammans', 330 | 'tio', 331 | 'tionde', 332 | 'tjugo', 333 | 'tjugoen', 334 | 'tjugoett', 335 | 'tjugonde', 336 | 'tjugotre', 337 | 'tjugotvå', 338 | 'tjungo', 339 | 'tolfte', 340 | 'tolv', 341 | 'tre', 342 | 'tredje', 343 | 'trettio', 344 | 'trettionde', 345 | 'tretton', 346 | 'trettonde', 347 | 'två', 348 | 'tvåhundra', 349 | 'under', 350 | 'upp', 351 | 'ur', 352 | 'ursäkt', 353 | 'ut', 354 | 'utan', 355 | 
'utanför', 356 | 'ute', 357 | 'vad', 358 | 'var', 359 | 'vara', 360 | 'varför', 361 | 'varifrån', 362 | 'varit', 363 | 'varje', 364 | 'varken', 365 | 'vars', 366 | 'varsågod', 367 | 'vart', 368 | 'vem', 369 | 'vems', 370 | 'verkligen', 371 | 'vi', 372 | 'vid', 373 | 'vidare', 374 | 'viktig', 375 | 'viktigare', 376 | 'viktigast', 377 | 'viktigt', 378 | 'vilka', 379 | 'vilkas', 380 | 'vilken', 381 | 'vilket', 382 | 'vill', 383 | 'vänster', 384 | 'vänstra', 385 | 'värre', 386 | 'vår', 387 | 'våra', 388 | 'vårt', 389 | 'än', 390 | 'ännu', 391 | 'är', 392 | 'även', 393 | 'åt', 394 | 'åtminstone', 395 | 'åtta', 396 | 'åttio', 397 | 'åttionde', 398 | 'åttonde', 399 | 'över', 400 | 'övermorgon', 401 | 'överst', 402 | 'övre', 403 | ] 404 | -------------------------------------------------------------------------------- /src/lib/tools/guess_language.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * This module guesses the natural language of a text with 'franc' and maps 3 | * the detected language code to one of the supported language names. 
// Natural languages for which a stopword list exists. Every name has to match
// a file name in the 'stopwords' folder exactly, because the name is used to
// build the module path when the list is loaded (hence the spelling
// 'portugese', which is how that file is named).
export type languageName =
  | 'german'
  | 'english'
  | 'italian'
  | 'dutch'
  | 'portugese'
  | 'spanish'
  | 'swedish'

// Language codes reported by 'franc' (ISO 639-3) mapped to the supported
// language names. Typing the values as `languageName` lets the compiler reject
// any name that has no matching stopword list.
const languageNameMapping: { [code: string]: languageName } = {
  deu: 'german',
  eng: 'english',
  ita: 'italian',
  nld: 'dutch',
  por: 'portugese',
  spa: 'spanish',
  // 'swe' is Swedish; 'swh' would be Swahili
  swe: 'swedish',
}

/**
 * Detect the natural language of a text.
 *
 * @param text the text whose language should be guessed
 * @returns the name of the detected language, or 'english' when the detected
 *          code is not one of the supported languages (this includes the
 *          case where 'franc' cannot determine a language at all)
 */
export default function guessLanguage(text: string): languageName {
  return languageNameMapping[franc(text)] || 'english'
}
11 | */ 12 | export default class Parser { 13 | // hold all results 14 | public phrases: Phrase[] = [] 15 | 16 | // cache the last words until a phrase is completed 17 | private cache: Phrase 18 | 19 | // initialize with external plugins for word processing 20 | constructor(private stemmer: Stemmer, private stopwords: Set) { 21 | this.setNewPhraseCache() 22 | } 23 | 24 | // execute a given word array and add the results to the internal corpora 25 | public process(wordArray: string[]): Parser { 26 | for (const phrase of wordArray) { 27 | this.push(phrase.toLowerCase()) 28 | } 29 | this.stemAll() 30 | return this 31 | } 32 | 33 | public joinDuplicates() { 34 | const groups = groupBy(this.phrases, 'text') 35 | const resultList = [] 36 | for (const text in groups) { 37 | if (text) { 38 | const group = groups[text] 39 | const amount = group.length 40 | group[0].multiplyWith(amount) 41 | resultList.push(group[0]) 42 | } 43 | } 44 | this.phrases = resultList 45 | } 46 | 47 | public bestPhrases(): string[] { 48 | const phrases = sortBy(this.phrases, ['score', 'text']).reverse() 49 | const optimalAmount = Math.ceil(this.phrases.length / 3.0) 50 | return map(take(phrases, optimalAmount), 'text') 51 | } 52 | 53 | // add words to the internal phrase cache, or move on with the next phrase 54 | private push(phrase: string) { 55 | for (const word of phrase.split(/\s+/)) { 56 | const strippedWord = strip(word) 57 | const hasPunctuation = strippedWord !== word 58 | const isStopWord = this.stopwords.has(word) 59 | if (isStopWord || word.length < 2) { 60 | this.finalizePhraseCache() 61 | } else if (hasPunctuation) { 62 | this.cache.pushWord(strippedWord) 63 | this.finalizePhraseCache() 64 | } else { 65 | this.cache.pushWord(strippedWord) 66 | } 67 | } 68 | } 69 | 70 | // reset the internal cache to a new blank object 71 | private setNewPhraseCache() { 72 | this.cache = new Phrase() 73 | } 74 | 75 | // move the internal cache into the result list, reset the cache 76 | private 
/**
 * Splits a raw text corpus into chunks at a configurable set of delimiters.
 */
export default class Preprocessor {
  // matches one occurrence of any configured delimiter
  private splitter: RegExp

  /**
   * @param delimiters regular expression sources (for example '\\s+'); the
   *                   corpus is split wherever any one of them matches
   */
  constructor(private delimiters: string[]) {
    this.buildDelimiterRegexp()
  }

  /**
   * Split a corpus at the configured delimiters.
   *
   * @param corpus the raw text
   * @returns all non-empty chunks in their original order
   */
  public process(corpus: string): string[] {
    return (
      corpus
        // literal escape sequences (a backslash followed by n, r or t) are
        // turned into a sentence break before splitting
        .replace(/\\[nrt]/g, '. ')
        .split(this.splitter)
        .filter(Boolean)
    )
  }

  // Compile all delimiters into one alternation. Each delimiter is wrapped in
  // a non-capturing group: a character class would treat '(', ')' and '+' as
  // literal split characters, and a capturing group would make `split` insert
  // the matched delimiters into its result.
  private buildDelimiterRegexp() {
    const patterns = this.delimiters.map(d => '(?:' + d + ')')
    // without any delimiter nothing may match, so the corpus stays in one piece
    const expression = patterns.length > 0 ? patterns.join('|') : '(?!)'
    this.splitter = new RegExp(expression, 'g')
  }
}
/**
 * Stateful wrapper around a Snowball stemmer that remembers the stem of every
 * word it has seen and counts every stem it has handed out.
 */
export default class Stemmer {
  // word -> stem, filled the first time a word is stemmed
  private wordStems = new StringDictionary()
  // receives every stem returned by `stem`
  private stemCounts = new StringCounter()
  // the underlying Snowball instance
  private stemmer: any

  // for a list of available languages, see
  // https://github.com/fortnightlabs/snowball-js/tree/master/stemmer/src/ext
  constructor(private language: string = 'english') {
    this.stemmer = new Snowball(language)
  }

  /**
   * Return the stem of a word and record it in the stem counter.
   *
   * @param word the word to reduce to its stem
   * @returns the remembered stem if there is one, otherwise a newly computed one
   */
  public stem(word: string): string {
    const result = this.wordStems.get(word) || this.stemAndRemember(word)
    this.stemCounts.count(result)
    return result
  }

  // the strings currently held by the stem counter
  public getStems(): string[] {
    return this.stemCounts.strings()
  }

  // run the Snowball stemmer on a word and store the outcome for later lookups
  private stemAndRemember(word: string): string {
    const engine = this.stemmer
    engine.setCurrent(word)
    engine.stem()
    const fresh = engine.getCurrent()
    this.wordStems.add(word, fresh)
    return fresh
  }
}
/**
 * Replace everything that is not part of a word with whitespace and condense
 * the result.
 *
 * Kept are ASCII letters, the apostrophe and the accented Latin letters of the
 * Latin-1 Supplement and Latin Extended-A blocks. Restricting the letters to
 * 'äöüß' would blank out characters such as 'å', 'é', 'ñ' or 'ç' and thereby
 * tear apart words of the non-German languages that have a stopword list.
 *
 * @param text a word or a longer piece of text
 * @returns the text without punctuation, digits and symbols, without
 *          isolated single word characters, and with condensed whitespace
 */
export default function strip(text: string): string {
  const txt = text
    // U+00D7 (multiplication sign) and U+00F7 (division sign) lie inside the
    // Latin-1 letter range but are symbols, so the ranges skip them
    .replace(/[^a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u017F']/gi, ' ')
    // drop single word characters that stand alone
    .replace(/(^|\s)+\w($|\s)+/g, ' ')
  return condenseWhitespace(txt)
}
// Unit tests for the StringDictionary data structure.
@suite(timeout(100), slow(10))
class StringDictionaryTest {
  // a value added under a key is returned when that key is looked up
  @test
  public addsNewWords() {
    const dictionary = new StringDictionary()
    dictionary.add('lorem', 'ipsum')
    expect(dictionary.get('lorem')).to.be.equal('ipsum')
  }
}
Now, it seems that video games 15 | have become an unlikely tool for AI researchers to improve 16 | their systems.`, 17 | delimiters: ['\\s+'], 18 | language: 'english', 19 | } 20 | const expected = [ 21 | 'video games', 22 | 'purportedly wasting time', 23 | 'influencing violent behaviors', 24 | 'stifling creativity', 25 | ] 26 | 27 | const result = rake(input) 28 | // tslint:disable-next-line 29 | // console.log(result) 30 | expect(result).to.have.same.members(expected) 31 | } 32 | 33 | @test 34 | public worksWithNewsContent() { 35 | const file = join(__dirname, '..', '..', 'examples', 'venturebeat.txt') 36 | const input: IParameters = { 37 | corpus: readFileSync(file, 'utf-8'), 38 | delimiters: ['\\s+'], 39 | language: 'english', 40 | } 41 | const result = rake(input) 42 | // tslint:disable-next-line 43 | // console.log(result) 44 | expect(result).to.include('latest game dev tools') 45 | expect(result).to.include('video games') 46 | expect(result).to.include('machine learning') 47 | } 48 | 49 | @test 50 | public worksWithGermanNewsContent() { 51 | const file = join(__dirname, '..', '..', 'examples', 'spiegel.txt') 52 | const input: IParameters = { 53 | corpus: readFileSync(file, 'utf-8'), 54 | delimiters: ['\\s+'], 55 | language: 'german', 56 | } 57 | const result = rake(input) 58 | // tslint:disable-next-line 59 | // console.log(result); 60 | expect(result).to.include('mietpreisbremse') 61 | expect(result).to.include('vermieter') 62 | expect(result).to.include('deutschland') 63 | } 64 | 65 | @test 66 | public worksWithGermanPressContent() { 67 | const file = join(__dirname, '..', '..', 'examples', 'ntv.txt') 68 | const input: IParameters = { 69 | corpus: readFileSync(file, 'utf-8'), 70 | delimiters: ['\\s+'], 71 | language: 'german', 72 | } 73 | const result = rake(input) 74 | // tslint:disable-next-line 75 | // console.log(result) 76 | expect(result).to.include('teleskop') 77 | expect(result).to.include('california institute of technology caltech') 78 | 
// Unit tests for the language detection helper.
@suite(timeout(3000), slow(1000))
class GuessLanguage {
  // a short English greeting is reported as 'english'
  @test
  public guessesEnglish() {
    const detected = guessLanguage('Hello World')
    expect(detected).to.be.equal('english')
  }
}
// Unit tests for the `strip` helper.
@suite(timeout(100), slow(10))
class StripTest {
  // symbols between two words vanish and leave a single space behind
  @test
  public removesNonAsciiCharacters() {
    expect(strip('Hello +*# World')).to.be.equal('Hello World')
  }

  // trailing punctuation is cut off
  @test
  public stripPunctuation() {
    const stripped = strip('test.')
    expect(stripped).to.be.equal('test')
  }

  // umlauts are letters and must survive
  @test
  public stripNoUmlauts() {
    const stripped = strip('täst')
    expect(stripped).to.be.equal('täst')
  }

  // any mix of blanks, tabs and line breaks collapses and is trimmed
  @test
  public stripBadWhitespace() {
    const stripped = strip(' aa \t \r \n \r\n bb ')
    expect(stripped).to.be.equal('aa bb')
  }
}
"noImplicitAny": false, 6 | "removeComments": true, 7 | "preserveConstEnums": true, 8 | "sourceMap": true, 9 | "outDir": "./dist", 10 | "experimentalDecorators": true 11 | }, 12 | "include": [ 13 | "src/**/*" 14 | ], 15 | "exclude": [ 16 | "node_modules", 17 | "**/*.spec.ts" 18 | ], 19 | "lib": [ "es5", "es6", "dom", "dom.iterable" ], 20 | "types": ["node"] 21 | } -------------------------------------------------------------------------------- /tslint.json: -------------------------------------------------------------------------------- 1 | { 2 | "defaultSeverity": "error", 3 | "extends": [ 4 | "tslint:recommended", 5 | "tslint-config-prettier" 6 | ], 7 | "jsRules": {}, 8 | "rules": { 9 | "quotemark": [true, "single", "avoid-escape"] 10 | }, 11 | "rulesDirectory": [] 12 | } --------------------------------------------------------------------------------