├── test ├── packages ├── tokenizer │ ├── packages │ ├── tokernizer_test.dart │ ├── tab_tokenizer_test.dart │ └── space_tokenizer_test.dart └── start_test.dart ├── tool ├── packages └── run_tests.sh ├── lib ├── language.dart └── src │ ├── tokenizer.dart │ └── tokenizer │ ├── abstract_tokenizer.dart │ ├── tab_tokenizer.dart │ └── space_tokenizer.dart ├── pubspec.yaml ├── .gitignore ├── LICENSE └── README.md /test/packages: -------------------------------------------------------------------------------- 1 | ../packages -------------------------------------------------------------------------------- /tool/packages: -------------------------------------------------------------------------------- 1 | ../packages -------------------------------------------------------------------------------- /test/tokenizer/packages: -------------------------------------------------------------------------------- 1 | ../../packages -------------------------------------------------------------------------------- /lib/language.dart: -------------------------------------------------------------------------------- 1 | library language; 2 | 3 | export 'src/tokenizer.dart'; 4 | -------------------------------------------------------------------------------- /test/start_test.dart: -------------------------------------------------------------------------------- 1 | library start_test; 2 | 3 | import 'tokenizer/tokernizer_test.dart' as tokernizer_test; 4 | 5 | void main() { 6 | tokernizer_test.main(); 7 | } 8 | -------------------------------------------------------------------------------- /lib/src/tokenizer.dart: -------------------------------------------------------------------------------- 1 | library language.tokenizer; 2 | 3 | part 'tokenizer/abstract_tokenizer.dart'; 4 | part 'tokenizer/space_tokenizer.dart'; 5 | part 'tokenizer/tab_tokenizer.dart'; 6 | -------------------------------------------------------------------------------- /tool/run_tests.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | DIR=$( cd $( dirname "${BASH_SOURCE[0]}" )/.. && pwd ) 6 | 7 | echo "Analyzing library for warnings or type errors" 8 | dart --checked $DIR/test/start_test.dart 9 | 10 | echo -e "\n✓ OK" 11 | -------------------------------------------------------------------------------- /pubspec.yaml: -------------------------------------------------------------------------------- 1 | name: language 2 | version: 0.0.1 3 | author: Karan Goel 4 | description: Nothing to see here right now. 5 | homepage: https://github.com/karan/language.dart 6 | documentation: https://github.com/karan/language.dart 7 | dev_dependencies: 8 | unittest: any 9 | -------------------------------------------------------------------------------- /lib/src/tokenizer/abstract_tokenizer.dart: -------------------------------------------------------------------------------- 1 | part of language.tokenizer; 2 | 3 | /// This abstract [Tokenizer] class declares a common interface 4 | /// for other classes. 5 | abstract class Tokenizer { 6 | 7 | /// Tokenizes the given [text] into a [List] of tokens. 8 | List tokenize(String text); 9 | 10 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Don’t commit the following directories created by pub. 2 | build/ 3 | packages/ 4 | .buildlog 5 | 6 | # Or the files created by dart2js. 7 | *.dart.js 8 | *.dart.precompiled.js 9 | *.js_ 10 | *.js.deps 11 | *.js.map 12 | 13 | # Include when developing application packages. 
14 | pubspec.lock 15 | 16 | .DS_Store 17 | -------------------------------------------------------------------------------- /test/tokenizer/tokernizer_test.dart: -------------------------------------------------------------------------------- 1 | library language.test.tokenizer; 2 | 3 | import 'package:unittest/unittest.dart'; 4 | 5 | import 'space_tokenizer_test.dart' as space_tokenizer_test; 6 | import 'tab_tokenizer_test.dart' as tab_tokenizer_test; 7 | 8 | void main() { 9 | group('SpaceTokenizer', space_tokenizer_test.main); 10 | group('TabTokenizer', tab_tokenizer_test.main); 11 | } 12 | -------------------------------------------------------------------------------- /lib/src/tokenizer/tab_tokenizer.dart: -------------------------------------------------------------------------------- 1 | part of language.tokenizer; 2 | 3 | /// A whitespace tokenizer that tokenizes text on tab. 4 | /// 5 | /// Example: 6 | /// 7 | /// TabTokenizer tokenizer = new TabTokenizer(); 8 | /// tokenizer.tokenize('brown fox jumps'); 9 | /// 10 | class TabTokenizer implements Tokenizer { 11 | 12 | List tokenize(String text) { 13 | return text.split('\t'); 14 | } 15 | 16 | } -------------------------------------------------------------------------------- /lib/src/tokenizer/space_tokenizer.dart: -------------------------------------------------------------------------------- 1 | part of language.tokenizer; 2 | 3 | /// A whitespace tokenizer that tokenizes text on space. 
4 | /// 5 | /// Example: 6 | /// 7 | /// SpaceTokenizer tokenizer = new SpaceTokenizer(); 8 | /// tokenizer.tokenize('brown fox jumps'); 9 | /// 10 | class SpaceTokenizer implements Tokenizer { 11 | 12 | List tokenize(String text) { 13 | return text.split(' '); 14 | } 15 | 16 | } -------------------------------------------------------------------------------- /test/tokenizer/tab_tokenizer_test.dart: -------------------------------------------------------------------------------- 1 | library language.test.tokenizer.tab_tokenizer_test; 2 | 3 | import 'package:language/src/tokenizer.dart'; 4 | import 'package:unittest/unittest.dart'; 5 | 6 | void main() { 7 | TabTokenizer tokenizer = new TabTokenizer(); 8 | test('test TabTokenizer', () { 9 | 10 | expect(tokenizer.tokenize('Elizabeth\tis hungry'), 11 | orderedEquals(['Elizabeth', 'is hungry'])); 12 | 13 | expect(tokenizer.tokenize('Elizabeth\tis\thungry.'), 14 | orderedEquals(['Elizabeth', 'is', 'hungry.'])); 15 | 16 | expect(tokenizer.tokenize('Elizabeth \tis \t\thungry'), 17 | orderedEquals(['Elizabeth ', 'is ', '', 'hungry'])); 18 | 19 | }); 20 | } 21 | -------------------------------------------------------------------------------- /test/tokenizer/space_tokenizer_test.dart: -------------------------------------------------------------------------------- 1 | library language.test.tokenizer.space_tokenizer_test; 2 | 3 | import 'package:language/src/tokenizer.dart'; 4 | import 'package:unittest/unittest.dart'; 5 | 6 | void main() { 7 | SpaceTokenizer tokenizer = new SpaceTokenizer(); 8 | test('test SpaceTokenizer', () { 9 | 10 | expect(tokenizer.tokenize('Elizabeth is hungry'), 11 | orderedEquals(['Elizabeth', 'is', 'hungry'])); 12 | 13 | expect(tokenizer.tokenize('He saw the frog with the telescope.'), 14 | orderedEquals(['He', 'saw', 'the', 'frog', 'with', 'the', 'telescope.'])); 15 | 16 | expect(tokenizer.tokenize('Stand on your head!'), 17 | orderedEquals(['Stand', '', '', 'on', '', '', 'your', '', '', 'head!'])); 
18 | 19 | }); 20 | } 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Karan Goel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This project is now deprecated. If you would like to complete it, feel free to send PR's. 2 | 3 | language 4 | === 5 | 6 | General natural language processing utilities for Dart. It provides a simple API for getting started with natural language processing (NLP), Artificial Intelligence (AI) and Natural Language Generation (NLG) tasks. 7 | 8 | This package will initially support English. 
In the future, it may support other major languages like Spanish, Russian, Chinese (maybe). 9 | 10 | 11 | ### Features Overview 12 | 13 | - [Tokenization](#tokenization) 14 | - [Space Tokenizer](#space-tokenizer) 15 | - [Tab Tokenizer](#tab-tokenizer) 16 | - String distance 17 | - n-grams 18 | - Markov chain 19 | - Classifiers 20 | - Phonetics 21 | - Language identification 22 | - Summarization 23 | - Part-of-speech tagging (POS) 24 | - Sentiment Analysis 25 | - TF-IDF 26 | - Words Inflection and Lemmatization 27 | 28 | 29 | ## Tokenization 30 | 31 | #### Space Tokenizer 32 | 33 | SpaceTokenizer tokenizer = new SpaceTokenizer(); 34 | tokenizer.tokenize('brown fox jumps'); 35 | ===> ['brown', 'fox', 'jumps'] 36 | 37 | tokenizer.tokenize('Stand on your head!'); 38 | ===> ['Stand', '', '', 'on', '', '', 'your', '', '', 'head!'] 39 | 40 | #### Tab Tokenizer 41 | 42 | TabTokenizer tokenizer = new TabTokenizer(); 43 | tokenizer.tokenize('brown\tfox\tjumps'); 44 | ===> ['brown', 'fox', 'jumps'] 45 | 46 | #### Regexp Tokenizer 47 | #### Word Tokenizer 48 | #### Word-Punctuation Tokenizer 49 | #### Treebank Tokenizer 50 | 51 | ## String distance 52 | 53 | #### Jaro–Winkler algorithm 54 | #### Levenshtein algorithm 55 | #### Dice's Coefficient 56 | 57 | ## n-grams 58 | 59 | ## Markov chain 60 | 61 | http://blog.codinghorror.com/markov-and-you/ 62 | 63 | ## Classifiers 64 | 65 | #### Naive Bayes 66 | #### Logistic regression 67 | 68 | ## Phonetics 69 | 70 | #### SoundEx 71 | #### Metaphone 72 | #### Double Metaphone 73 | 74 | ## Language identification 75 | 76 | ## Summarization 77 | 78 | ## Part-of-speech tagging (POS) 79 | 80 | #### TnT (?)
81 | 82 | ## Sentiment Analysis 83 | 84 | ## TF-IDF 85 | 86 | ## Words Inflection and Lemmatization 87 | 88 | #### Noun inflection 89 | #### Number inflection 90 | #### Present verb inflector 91 | 92 | ## Testing 93 | 94 | $ chmod u+x tool/run_tests.sh 95 | $ ./tool/run_tests.sh 96 | --------------------------------------------------------------------------------