├── .gitignore ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── Dockerfile74 ├── Dockerfile80 ├── Dockerfile81 ├── Dockerfile82 ├── Dockerfile83 ├── Dockerfile84 ├── LICENSE ├── README.md ├── composer.json ├── interactive ├── phpunit.xml ├── src ├── Adapters │ ├── ArrayDataReaderAdapter.php │ ├── EnchantAdapter.php │ ├── JsonDataAdapter.php │ ├── JsonWriterAdapter.php │ └── PspellAdapter.php ├── Analysis │ ├── DateAnalysis.php │ ├── FreqDist.php │ ├── Keywords │ │ └── Rake.php │ └── Summarize │ │ └── Simple.php ├── Classifiers │ └── NaiveBayes.php ├── Collections │ └── DocumentArrayCollection.php ├── Collocations │ └── CollocationFinder.php ├── Comparisons │ ├── CosineSimilarityComparison.php │ ├── HammingDistanceComparison.php │ ├── JaccardIndexComparison.php │ ├── JaroWinklerComparison.php │ ├── LevenshteinComparison.php │ ├── LongestCommonSubstringComparison.php │ ├── MostFreqCharComparison.php │ └── SimilarTextComparison.php ├── Console │ └── Commands │ │ ├── NltkPackageInstallAllCommand.php │ │ ├── NltkPackageInstallCommand.php │ │ ├── NltkPackageListCommand.php │ │ ├── StopWordsCommand.php │ │ └── VocabSizeCommand.php ├── Corpus │ ├── ImportCorpus.php │ ├── NameCorpus.php │ ├── ReadCorpusAbstract.php │ ├── TextCorpus.php │ └── WordnetCorpus.php ├── Documents │ ├── ContentDocument.php │ ├── DocumentAbstract.php │ └── TokensDocument.php ├── Downloaders │ ├── DownloadPackageFactory.php │ └── NltkCorporaIndexDownloader.php ├── Exceptions │ ├── InvalidExpression.php │ └── InvalidParameterSizeException.php ├── Extracts │ ├── DateExtract.php │ ├── EmailExtract.php │ ├── HashTag.php │ ├── LambdaExtract.php │ └── UrlExtract.php ├── Filters │ ├── CharFilter.php │ ├── DomainFilter.php │ ├── EmailFilter.php │ ├── LambdaFilter.php │ ├── LowerCaseFilter.php │ ├── NumbersFilter.php │ ├── PossessiveNounFilter.php │ ├── PunctuationFilter.php │ ├── QuotesFilter.php │ ├── SpacePunctuationFilter.php │ ├── StopWordsFilter.php │ ├── StripTagsFilter.php │ ├── TrimFilter.php │ ├── 
UpperCaseFilter.php │ ├── UrlFilter.php │ └── WhitespaceFilter.php ├── Generators │ └── StopwordGenerator.php ├── Indexes │ ├── TfIdf.php │ └── WordnetIndex.php ├── Interfaces │ ├── IClassifier.php │ ├── ICollection.php │ ├── IDataReader.php │ ├── IDataWriter.php │ ├── IDistance.php │ ├── IExtractStrategy.php │ ├── ILexicalDiversity.php │ ├── ISimilarity.php │ ├── ISpelling.php │ ├── IStemmer.php │ └── ITokenTransformation.php ├── LexicalDiversity │ ├── Naive.php │ ├── YuleI.php │ └── YuleK.php ├── Models │ ├── ScoreKeeper.php │ └── Wordnet │ │ ├── ExceptionMap.php │ │ ├── Lemma.php │ │ └── Synset.php ├── NGrams │ ├── NGramFactory.php │ ├── Statistic2D.php │ ├── Statistic3D.php │ └── StatisticFacade.php ├── Phonetics │ ├── MetaphonePhonetic.php │ └── SoundexPhonetic.php ├── Sentiment │ └── Vader.php ├── Stemmers │ ├── DictionaryStemmer.php │ ├── LambdaStemmer.php │ ├── LancasterStemmer.php │ ├── LookupStemmer.php │ ├── MorphStemmer.php │ ├── PorterStemmer.php │ ├── RegexStemmer.php │ └── SnowballStemmer.php ├── Taggers │ ├── StanfordAbstract.php │ ├── StanfordNerTagger.php │ └── StanfordPosTagger.php ├── Tokenizers │ ├── FixedLengthTokenizer.php │ ├── GeneralTokenizer.php │ ├── LambdaTokenizer.php │ ├── PennTreeBankTokenizer.php │ ├── RegexTokenizer.php │ ├── SentenceTokenizer.php │ ├── TokenizerAbstract.php │ ├── TwitterTokenizer.php │ ├── VanderleeTokenizer.php │ └── WhitespaceTokenizer.php ├── Traits │ └── WordnetPointerSymbolMap.php ├── Utilities │ ├── Nltk │ │ └── Download │ │ │ └── Package.php │ ├── Text.php │ └── Vowels │ │ ├── EnglishVowels.php │ │ └── VowelsAbstractFactory.php └── helpers │ ├── helpers.php │ ├── interactive_help.php │ ├── print.php │ ├── simplified.php │ └── storage.php ├── storage ├── .gitkeep ├── cache │ └── .gitkeep └── corpora │ └── .gitkeep ├── tests ├── TestBaseCase.php ├── TextAnalysis │ ├── Adapters │ │ └── PspellAdapterTest.php │ ├── Analysis │ │ ├── DateAnalysisTest.php │ │ ├── FreqDistTest.php │ │ └── Keywords │ │ │ └── 
RakeTest.php │ ├── Classifiers │ │ └── NaiveBayesTest.php │ ├── Collections │ │ └── DocumentArrayCollectionTest.php │ ├── Collocations │ │ └── CollocationFinderTest.php │ ├── Comparisons │ │ ├── CosineSimilarityComparisonTest.php │ │ ├── HammingDistanceComparisonTest.php │ │ ├── JaccardIndexComparisonTest.php │ │ ├── JaroWinklerComparisonTest.php │ │ ├── LevenshteinComparisonTest.php │ │ ├── LongestCommonSubstringComparisonTest.php │ │ └── MostFreqCharComparisonTest.php │ ├── Corpus │ │ ├── ImportCorpusTest.php │ │ ├── NameCorpusTest.php │ │ ├── TextCorpusTest.php │ │ └── WordnetCorpusTest.php │ ├── Downloaders │ │ └── NltkCorporalIndexDownloaderTest.php │ ├── Extracts │ │ ├── DateExtractTest.php │ │ └── HashTagTest.php │ ├── Filters │ │ ├── CharFilterTest.php │ │ ├── EmailFilterTest.php │ │ ├── LambdaFilterTest.php │ │ ├── LowerCaseFilterTest.php │ │ ├── NumbersFilterTest.php │ │ ├── PossessiveNounFilterTest.php │ │ ├── PunctuationFilterTest.php │ │ ├── QuotesFilterTest.php │ │ ├── SpacePunctuationFilterTest.php │ │ ├── StopWordsFilterTest.php │ │ └── UrlFilterTest.php │ ├── Indexes │ │ ├── TfIdfTest.php │ │ └── WordnetIndexTest.php │ ├── LexicalDiversity │ │ ├── NaiveTest.php │ │ ├── YuleITest.php │ │ └── YuleKTest.php │ ├── NGrams │ │ ├── NGramFactoryTest.php │ │ └── StatisticFacadeTest.php │ ├── Sentiment │ │ └── VaderTest.php │ ├── Stemmers │ │ ├── DictionaryStemmerTest.php │ │ ├── LambdaStemmerTest.php │ │ ├── LancasterStemmerTest.php │ │ ├── LookupStemmerTest.php │ │ ├── MorphStemmerTest.php │ │ ├── PorterStemmerTest.php │ │ ├── RegexStemmerTest.php │ │ └── SnowballStemmerTest.php │ ├── Taggers │ │ ├── StanfordNerTaggerTest.php │ │ └── StanfordPosTaggerTest.php │ ├── Tokenizers │ │ ├── FixedLengthTokenizerTest.php │ │ ├── GeneralTokenizerTest.php │ │ ├── PennTreeBankTokenizerTest.php │ │ ├── RegexTokenizerTest.php │ │ ├── SentenceTokenizerTest.php │ │ └── TwitterTokenizerTest.php │ └── Utilities │ │ ├── TextTest.php │ │ └── Vowels │ │ └── 
EnglishVowelsTest.php ├── bootstrap.php └── data │ ├── Text │ └── Analysis │ │ ├── text.txt │ │ └── text_ngrams.txt │ ├── Tokenizers │ └── PennTreeBankTokenizerTest │ │ ├── test.txt │ │ └── tokenized │ └── books │ ├── ptbr │ └── Dom_Casmurro.txt │ └── tom_sawyer.txt └── textconsole /.gitignore: -------------------------------------------------------------------------------- 1 | nbproject/ 2 | vendor/ 3 | storage/ 4 | composer.lock 5 | .idea 6 | .phpunit.result.cache 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: php 2 | env: 3 | - SKIP_TEST=1 4 | php: 5 | - 7.4 6 | 7 | before_script: 8 | - composer self-update 9 | - composer install --prefer-source --no-interaction --dev 10 | 11 | script: phpunit 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. 
Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at dcardin2007@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /Dockerfile74: -------------------------------------------------------------------------------- 1 | FROM php:7.4-cli 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y --no-install-recommends zip libzip-dev && \ 5 | docker-php-ext-install zip 6 | 7 | RUN curl --silent --show-error https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer 8 | 9 | RUN mkdir -p /app 10 | 11 | COPY ./ /app 12 | 13 | RUN composer --working-dir=/app install 14 | 15 | RUN cd /app && SKIP_TEST=1 ./vendor/bin/phpunit -d memory_limit=1G 16 | 17 | CMD ["/bin/sh"] 18 | -------------------------------------------------------------------------------- /Dockerfile80: 
-------------------------------------------------------------------------------- 1 | FROM php:8.0-cli 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y --no-install-recommends zip libzip-dev libpspell-dev && \ 5 | docker-php-ext-install zip pspell 6 | 7 | RUN curl --silent --show-error https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer 8 | 9 | RUN mkdir -p /app 10 | 11 | COPY ./ /app 12 | 13 | RUN composer --working-dir=/app install 14 | 15 | RUN cd /app && SKIP_TEST=1 ./vendor/bin/phpunit -d memory_limit=1G 16 | 17 | CMD ["/bin/sh"] 18 | -------------------------------------------------------------------------------- /Dockerfile81: -------------------------------------------------------------------------------- 1 | FROM php:8.1-cli 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y --no-install-recommends zip libzip-dev libpspell-dev && \ 5 | docker-php-ext-install zip pspell 6 | 7 | RUN curl --silent --show-error https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer 8 | 9 | RUN mkdir -p /app 10 | 11 | COPY ./ /app 12 | 13 | RUN composer --working-dir=/app install 14 | 15 | RUN cd /app && SKIP_TEST=1 ./vendor/bin/phpunit -d memory_limit=1G 16 | 17 | CMD ["/bin/sh"] 18 | -------------------------------------------------------------------------------- /Dockerfile82: -------------------------------------------------------------------------------- 1 | FROM php:8.2-cli 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y --no-install-recommends zip libzip-dev libpspell-dev && \ 5 | docker-php-ext-install zip pspell 6 | 7 | RUN curl --silent --show-error https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer 8 | 9 | RUN mkdir -p /app 10 | 11 | COPY ./ /app 12 | 13 | RUN composer --working-dir=/app install 14 | 15 | RUN cd /app && SKIP_TEST=1 ./vendor/bin/phpunit -d memory_limit=1G 16 | 17 | CMD ["/bin/sh"] 18 | 
-------------------------------------------------------------------------------- /Dockerfile83: -------------------------------------------------------------------------------- 1 | FROM php:8.3-cli 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y --no-install-recommends zip libzip-dev libpspell-dev && \ 5 | docker-php-ext-install zip pspell 6 | 7 | RUN curl --silent --show-error https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer 8 | 9 | RUN mkdir -p /app 10 | 11 | COPY ./ /app 12 | 13 | RUN composer --working-dir=/app install 14 | 15 | RUN cd /app && SKIP_TEST=1 ./vendor/bin/phpunit -d memory_limit=1G 16 | 17 | CMD ["/bin/sh"] 18 | -------------------------------------------------------------------------------- /Dockerfile84: -------------------------------------------------------------------------------- 1 | FROM php:8.4-cli 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y --no-install-recommends zip libzip-dev libpspell-dev && \ 5 | docker-php-ext-install zip 6 | 7 | RUN curl --silent --show-error https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer 8 | 9 | RUN mkdir -p /app 10 | 11 | COPY ./ /app 12 | 13 | RUN composer --working-dir=/app install 14 | 15 | RUN cd /app && SKIP_TEST=1 ./vendor/bin/phpunit -d memory_limit=1G 16 | 17 | CMD ["/bin/sh"] 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Dan Cardin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | 
furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "yooper/php-text-analysis", 3 | "description": "PHP Text Analysis is a library for performing Information Retrieval (IR) and Natural Language Processing (NLP) tasks using the PHP language", 4 | "keywords": ["nlp","ir","text analysis","natural language processing", "text classification"], 5 | "license": "MIT", 6 | "authors": [ 7 | { 8 | "name": "yooper", 9 | "email" : "dcardin2007@gmail.com" 10 | } 11 | ], 12 | "bin": ["textconsole","interactive"], 13 | "autoload": { 14 | "psr-4": { 15 | "TextAnalysis\\": "src/" 16 | }, 17 | "files": ["src/helpers/storage.php", "src/helpers/print.php", "src/helpers/simplified.php", "src/helpers/helpers.php", "src/helpers/interactive_help.php"] 18 | }, 19 | "autoload-dev": { 20 | "files": ["tests/TestBaseCase.php"] 21 | }, 22 | "require" : { 23 | "php": ">=7.4", 24 | "yooper/stop-words": "~1", 25 | "symfony/console": ">= 4.4", 26 | "wamania/php-stemmer": "^1.0 || ^2.0 || ^3.0", 27 | "yooper/nicknames": "~1" 28 | }, 29 | "require-dev": { 30 | "phpunit/phpunit": "^9", 31 | "mockery/mockery" : "^1" 32 | } 33 | } 34 | 
/**
 * Check a word against the enchant dictionary and offer spelling candidates.
 *
 * @param string $word the word to look up
 * @return array a single-element array holding the word when the dictionary
 *               accepts it, otherwise the result of enchant_dict_suggest()
 */
public function suggest($word)
{
    $isKnownWord = enchant_dict_check($this->enchantBroker, $word);

    // a word the dictionary accepts is its own (only) suggestion
    if ($isKnownWord) {
        return [$word];
    }

    return enchant_dict_suggest($this->enchantBroker, $word);
}
/**
 * Check a word with pspell and offer spelling candidates.
 *
 * @param string $word the word to look up
 * @return array a single-element array holding the word when pspell accepts
 *               it, otherwise the result of pspell_suggest()
 */
public function suggest($word)
{
    // an accepted word is returned as its own (only) suggestion
    return pspell_check($this->pSpell, $word)
        ? [$word]
        : pspell_suggest($this->pSpell, $word);
}
/**
 * Build one DateTime per sentence and cache the list on the instance.
 *
 * Every sentence is handed to Text::findDate(); when the result carries
 * year, month and day entries they are assembled into a DateTime. Sentences
 * without such a result are skipped instead of producing a malformed
 * "--" date string.
 *
 * @return DateTime[] zero-indexed list of the dates that were found
 */
public function getDates() : array
{
    // the cached copy is only reused once at least one date has been found
    if(empty($this->dates)) {
        $getDateFunc = function($sentence)
        {
            $date = Text::findDate($sentence);

            // NOTE(review): assumes findDate() signals "no date" with a value
            // lacking these keys (e.g. null or []) - confirm against Text::findDate
            if(!isset($date['year'], $date['month'], $date['day'])) {
                return null;
            }

            return new DateTime("{$date['year']}-{$date['month']}-{$date['day']}");
        };

        $this->dates = array_map($getDateFunc, $this->sentences);

        // drop the skipped (null) entries and re-index from zero
        $this->dates = array_values(array_filter($this->dates));
    }
    return $this->dates;
}
/**
 * Score every trained label against the given tokens.
 *
 * For each label the per-token probability is derived from how often the
 * token was counted for that label versus for all other labels, smoothed
 * towards 0.5, and the log-odds of all tokens are combined into one score.
 *
 * @param array $tokens the tokens of the document to classify
 * @return array label => score between 0 and 1, sorted with the highest
 *               score first; empty when nothing has been trained
 */
public function predict(array $tokens)
{
    $totalDocs = $this->getDocCount();
    $scores = [];

    foreach ($this->labelCount as $label => $docCount)
    {
        $sum = 0;
        $inversedDocCount = $totalDocs - $docCount;
        $docCountReciprocal = 1 / $docCount;
        // when only one label has been trained there are no "other" documents;
        // dividing by that zero count would raise DivisionByZeroError on PHP 8
        $inversedDocCountReciprocal = $inversedDocCount > 0 ? 1 / $inversedDocCount : 0;

        foreach ($tokens as $token)
        {
            // an unseen token is treated as counted once
            $totalTokenCount = $this->tokenCount[$token] ?? 1;
            $tokenCount = $this->labels[$label][$token] ?? 0;
            $inversedTokenCount = $totalTokenCount - $tokenCount;
            $tokenProbabilityPositive = $tokenCount * $docCountReciprocal;
            $tokenProbabilityNegative = $inversedTokenCount * $inversedDocCountReciprocal;

            // no evidence for or against the label: stay neutral instead of dividing 0 by 0
            $evidence = $tokenProbabilityPositive + $tokenProbabilityNegative;
            $probability = $evidence > 0 ? $tokenProbabilityPositive / $evidence : 0.5;

            // smooth towards 0.5, weighted by how often the token was seen
            $probability = (0.5 + ($totalTokenCount * $probability)) / (1 + $totalTokenCount);
            $sum += log(1 - $probability) - log($probability);
        }
        $scores[$label] = 1 / (1 + exp($sum));
    }
    arsort($scores, SORT_NUMERIC);
    return $scores;
}
/**
 * Compute the Pointwise Mutual Information on the collocations.
 *
 * For every n-gram the weight of the n-gram is divided by the product of
 * the weights of its space-separated unigrams, and the natural log of that
 * ratio is stored.
 *
 * @return array n-gram => PMI value, sorted with the highest value first
 */
public function getCollocationsByPmi()
{
    $nGramFreqDist = freq_dist(ngrams($this->tokens, $this->nGramSize));
    $unigramsFreqDist = freq_dist($this->tokens);

    // look both weight tables up once rather than once per n-gram / unigram
    // NOTE(review): assumes getKeyValuesByWeight() returns the same table on
    // every call - confirm against FreqDist
    $nGramWeights = $nGramFreqDist->getKeyValuesByWeight();
    $unigramWeights = $unigramsFreqDist->getKeyValuesByWeight();

    $dataSet = [];
    foreach($nGramFreqDist->getKeys() as $nGramToken)
    {
        // product of the weights of the unigrams that make up the n-gram
        $tally = 1;
        foreach(explode(" ", $nGramToken) as $unigramToken)
        {
            $tally *= $unigramWeights[$unigramToken];
        }

        $dataSet[$nGramToken] = log($nGramWeights[$nGramToken] / $tally);
    }
    arsort($dataSet);
    return $dataSet;
}
/**
 * Returns the Jaccard Index: the number of distinct values the two inputs
 * share divided by the number of distinct values found in either input.
 *
 * Two strings are compared byte by byte; arrays are compared element by
 * element. Duplicates inside one input do not influence the result.
 *
 * @param string|array $text1
 * @param string|array $text2
 * @return float value between 0 and 1; 1.0 when both inputs are empty
 */
public function similarity($text1, $text2)
{
    if(is_string($text1) && is_string($text2)) {
        // str_split('') yields [''] before PHP 8.2 and [] from 8.2 on; normalise to []
        $text1 = $text1 === '' ? [] : str_split($text1);
        $text2 = $text2 === '' ? [] : str_split($text2);
    }

    // reduce both inputs to sets so repeated values are counted once
    $set1 = array_values(array_unique($text1));
    $set2 = array_values(array_unique($text2));

    // the array "+" operator merges by key, not by value, so it cannot build the union
    $union = array_unique(array_merge($set1, $set2));
    if(count($union) === 0) {
        // two empty sets are treated as identical; also avoids a division by zero
        return 1.0;
    }

    $inter = array_intersect($set1, $set2);
    return count($inter) / count($union);
}
/**
 * Return the similarity using the JaroWinkler algorithm.
 *
 * Characters of the shorter string are searched in the longer string inside
 * a window of half the longer string's length; the resulting score is then
 * raised for a shared prefix of up to $this->minPrefixLength characters.
 * The comparison is byte-wise (strlen / string offsets).
 *
 * @param string $text1
 * @param string $text2
 * @return float 1.0 for identical strings, 0.0 when no common character is found
 */
public function similarity($text1, $text2)
{
    if($text1 === $text2) {
        return 1.0;
    }

    // ensure that $text1 is shorter than or same length as $text2
    if (strlen($text1) > strlen($text2)) {
        $tmp = $text1;
        $text1 = $text2;
        $text2 = $tmp;
    }

    $strLen1 = strlen($text1);
    $strLen2 = strlen($text2);

    // integer half of the longer length; "(int)$strLen2 / 2" cast first and
    // divided afterwards, which produced float string offsets for odd lengths
    $maxDistance = intdiv($strLen2, 2);
    $commonCounter = 0; // count of common characters
    $transpositionCounter = 0; // count of transpositions
    $prevPosition = -1;
    for ($index = 0; $index < $strLen1; $index++)
    {
        $char = $text1[$index];
        // scan the window around $index in the longer string
        $jindex = max(0, $index - $maxDistance);
        $windowEnd = min($strLen2, $index + $maxDistance);
        while($jindex < $windowEnd)
        {
            if ($char === $text2[$jindex]) {
                $commonCounter++; // common char found
                // a match located before the previous match counts as a transposition
                if ($prevPosition != -1 && $jindex < $prevPosition) {
                    $transpositionCounter++;
                }
                $prevPosition = $jindex;
                break;
            }

            $jindex++;
        }
    }
    // no common characters between strings
    if($commonCounter === 0) {
        return 0.0;
    }

    // first compute the score
    $score = (
        ($commonCounter / $strLen1) +
        ($commonCounter / $strLen2) +
        (($commonCounter - $transpositionCounter) / $commonCounter)) / 3.0;

    // then measure the shared prefix, capped by the configured prefix length
    $prefixLength = 0;
    $last = min($this->minPrefixLength, $strLen1);
    while($prefixLength < $last && $text1[$prefixLength] == $text2[$prefixLength])
    {
        $prefixLength++;
    }

    return $score + (($prefixLength * (1 - $score)) / 10);
}
/**
 * Find the longest substring that occurs in both inputs.
 * When caching is enabled the substring list of $text2 is kept in
 * $this->cache, keyed by $text2, and reused on later calls.
 * @param string $text1
 * @param string $text2
 * @return string the longest shared substring, '' when nothing is shared
 */
public function similarity($text1, $text2)
{
    if ($this->useCache) {
        if (!isset($this->cache[$text2])) {
            $this->cache[$text2] = Text::getAllSubStrings($text2);
        }
        $candidates = $this->cache[$text2];
    } else {
        $candidates = Text::getAllSubStrings($text2);
    }

    $shared = array_intersect(Text::getAllSubStrings($text1), $candidates);

    $longest = '';
    $longestLength = 0;
    foreach ($shared as $candidate) {
        $length = mb_strlen($candidate);
        // only a strictly longer candidate replaces the current best
        if ($length <= $longestLength) {
            continue;
        }
        $longestLength = $length;
        $longest = $candidate;
    }
    return $longest;
}
/**
 * Count how often each single-byte character occurs in the text.
 * Keys are the characters in order of first appearance, values are
 * their occurrence counts; the result is not sorted.
 * @param string $text
 * @return array
 */
public function hashString($text)
{
    return array_count_values(str_split($text));
}
/**
 * Print every package from the index, ordered by id using a
 * case-insensitive natural sort.
 * An optional 'url' argument selects a different package index.
 * @param InputInterface $input
 * @param OutputInterface $output
 * @return int always 0
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $url = $input->getArgument('url');
    $downloader = $url
        ? new NltkCorporaIndexDownloader($url)
        : new NltkCorporaIndexDownloader();

    $packages = $downloader->getPackages();

    $byId = function ($left, $right) {
        return strnatcasecmp($left->getId(), $right->getId());
    };
    usort($packages, $byId);

    $output->writeln("Packages available for installation:");
    /** @var \TextAnalysis\Utilities\Nltk\Download\Package $package */
    foreach ($packages as $package) {
        $output->writeln(" * {$package->getId()} - {$package->getName()}");
    }

    return 0;
}
/**
 * Compute stop-word frequencies for a file, or for every file directly
 * inside a directory, and print them as JSON (default) or CSV.
 * @param InputInterface $input provides the 'path' and 'type' arguments
 * @param OutputInterface $output
 * @return int 0 on success, 1 when the path does not exist or is neither a file nor a directory
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $path = $input->getArgument('path');

    if (!file_exists($path)) {
        $output->writeln("{$path} is not a file or a path");
        // nothing to process, report failure instead of carrying on
        return 1;
    }

    if (is_file($path)) {
        $filePaths = [realpath($path)];
    } elseif (is_dir($path)) {
        // scandir() yields bare entry names; prefix the directory so the
        // entries can be opened regardless of the working directory
        $directory = realpath($path) . DIRECTORY_SEPARATOR;
        $filePaths = [];
        foreach (array_diff(scandir($path), array('..', '.')) as $entry) {
            // sub-directories cannot be read as text, skip them
            if (is_file($directory . $entry)) {
                $filePaths[] = $directory . $entry;
            }
        }
    } else {
        $output->writeln("{$path} is not known.");
        return 1;
    }

    $stopwords = (new StopwordGenerator($filePaths))->getStopwords();

    if ($input->getArgument('type') === 'json') {
        // write through the console output; the leftover debug echo/die
        // that short-circuited this branch is removed
        $output->write(json_encode($this->toArray($stopwords), JSON_NUMERIC_CHECK | JSON_UNESCAPED_UNICODE));
    } else {
        $stdout = fopen('php://stdout', 'w');
        // header and rows go through the same handle so they stay in order
        fputcsv($stdout, ['token', 'freq']);
        foreach ($stopwords as $token => $freq) {
            fputcsv($stdout, [utf8_encode($token), $freq]);
        }
        fclose($stdout);
    }

    return 0;
}
/**
 * Return the token filters, creating the list on first use and
 * keeping it in $this->filters afterwards.
 * @return array
 */
protected function getFilters()
{
    if (!empty($this->filters)) {
        return $this->filters;
    }

    $this->filters = [
        new PossessiveNounFilter(),
        new QuotesFilter(['"','`']),
        new LowerCaseFilter(),
        new PunctuationFilter(),
        new CharFilter()
    ];

    return $this->filters;
}
/**
 * Resolve the package whose id was given to the constructor.
 * The package list is read through NltkCorporaIndexDownloader with
 * caching enabled; the resolved package is kept in $this->package.
 * @return Package
 * @throws \RuntimeException when no package in the index has the requested id
 */
public function getPackage()
{
    if (empty($this->package)) {
        // closures cannot see $this->packageId through a bare `use`,
        // so copy it into a local first
        $packageId = $this->packageId;

        // loads the package list from cache
        $packages = (new NltkCorporaIndexDownloader(null, true))->getPackages();

        $filteredPackages = array_filter($packages, function ($package) use ($packageId) {
            return ((string)$package->getId() === (string)$packageId);
        });

        if (empty($filteredPackages)) {
            throw new \RuntimeException("Package {$packageId} was not found in the package index");
        }

        // array_filter preserves keys, reindex before taking the first match
        $this->package = array_values($filteredPackages)[0];
    }
    return $this->package;
}
/**
 * Look the name up in the us_names_by_year table.
 * Results are kept in $this->firstNameCache, keyed by the name as given.
 * @todo make this more flexible
 * @param string $name
 * @return array matching rows as associative arrays (the query is limited to one row), empty when nothing matched
 */
public function getFirstName($name) : array
{
    if (!isset($this->firstNameCache[$name])) {
        $stmt = $this->getPdo()->prepare("SELECT * FROM us_names_by_year WHERE name = LOWER(:name) LIMIT 1");
        $stmt->bindParam(':name', $name);
        $stmt->execute();
        // fetchAll() never yields null, so `??` could not supply the
        // fallback; on PHP versions where it reports failure with false,
        // `?:` keeps the declared array return type intact
        $this->firstNameCache[$name] = $stmt->fetchAll(PDO::FETCH_ASSOC) ?: [];
    }
    return $this->firstNameCache[$name];
}
/**
 * Check whether the text looks like "first ... last": it must contain at
 * least two space separated parts, the first one a known first name and
 * the final one a known last name.
 * @param string $name
 * @return bool
 */
public function isFullName($name) : bool
{
    $parts = explode(" ", $name);
    $partCount = count($parts);

    if ($partCount >= 2) {
        return $this->isFirstName($parts[0])
            && $this->isLastName($parts[$partCount - 1]);
    }

    return false;
}
/**
 * Returns an array of packages available for download from the nltk project.
 * The index XML is parsed once; the resulting list is kept in
 * $this->packages and returned on later calls.
 * @return array
 */
public function getPackages()
{
    if (empty($this->packages)) {

        $xml = $this->getXmlContent();
        foreach ($xml->packages->package as $package) {
            $data = (array)$package;
            $attributes = $data['@attributes'] ?? [];

            // Read each attribute explicitly. extract() used to leave the
            // previous package's variables in scope, so a package without
            // a checksum silently inherited the one before it.
            $this->packages[] = new Package(
                $attributes['id'] ?? null,
                // checksums may not exist on some remote packages
                $attributes['checksum'] ?? null,
                $attributes['name'] ?? null,
                $attributes['subdir'] ?? null,
                $attributes['unzip'] ?? null,
                $attributes['url'] ?? null
            );
        }
    }
    return $this->packages;

}
/**
 * Check that the year, month and day entries are all present and non-empty
 * @param array $date
 * @return boolean
 */
protected function verify(array $date)
{
    foreach (['year', 'month', 'day'] as $field) {
        if (empty($date[$field])) {
            return false;
        }
    }
    return true;
}
/**
 * Keep the token only when it is a hash tag of sufficient length.
 * @param string $token
 * @return false|string the token itself, or false when it does not start
 * with '#' or has fewer than the minimum number of characters after it
 */
public function filter($token)
{
    // `??` avoids the uninitialized-offset warning on an empty token
    $firstChar = $token[0] ?? '';

    // don't count the hash tag sign -1
    if ($firstChar === '#' && strlen($token) - 1 >= $this->getMinLength()) {
        return $token;
    }
    return false;
}
/**
 * Build the character class used to pad punctuation with spaces.
 * Entries of $whiteList are removed from the default punctuation list,
 * entries of $blackList are appended to it.
 * @param array $whiteList characters that must not be treated as punctuation
 * @param array $blackList extra characters to treat as punctuation
 */
public function __construct(array $whiteList = [], array $blackList = [])
{
    $remaining = array_diff($this->searchFor, $whiteList);
    $this->searchFor = array_merge($remaining, $blackList);

    $characterClass = implode("", $this->searchFor);
    $this->regex = "/([" . $characterClass . "])/";
}
/**
 * Merge the token frequencies of one document into $this->stopWords,
 * summing the counts of tokens that were already recorded.
 * @param array $tokens
 */
protected function computeUsingFreqDist(array $tokens)
{
    $frequencies = (new FreqDist($tokens))->getKeyValuesByFrequency();

    foreach ($frequencies as $token => $count) {
        if (isset($this->stopWords[$token])) {
            $this->stopWords[$token] += $count;
            continue;
        }
        $this->stopWords[$token] = $count;
    }
}
/**
 * If a token is provided return just the idf for that token,
 * else return the entire idf
 * @param string|null $token omit (or pass null / '') to get the whole index
 * @return float|array the idf of the token, 0.0 when the token is not in
 * the index, or the full token => idf map
 */
public function getIdf($token = null)
{
    // compare explicitly: a loose `!$token` also swallowed the token '0'
    if ($token === null || $token === '') {
        return $this->idf;
    }

    // unknown tokens used to hit an undefined index and come back as null;
    // 0.0 keeps tf * idf at zero without the warning
    return $this->idf[$token] ?? 0.0;
}
/**
 * Get the tf-idf weight: the term frequency of the token in the document
 * multiplied by the token's idf from this index
 * @param DocumentAbstract $document - the document to evaluate
 * @param string $token The token to look for
 * @param int $mode The type of term frequency to use
 * @return float
 */
public function getTfIdf(DocumentAbstract $document, $token, $mode = 1)
{
    $termFrequency = $this->getTermFrequency($document, $token, $mode);
    $inverseDocumentFrequency = $this->getIdf($token);

    return $termFrequency * $inverseDocumentFrequency;
}
$this->target; 59 | } 60 | 61 | /** 62 | * @return string[] 63 | */ 64 | public function getExceptionList() 65 | { 66 | return $this->exceptionList; 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/Models/Wordnet/Lemma.php: -------------------------------------------------------------------------------- 1 | word = $word; 61 | $this->pos = $pos; 62 | $this->synsetCnt = $synsetCnt; 63 | $this->pCnt = $pCnt; 64 | $this->ptrSymbols = $ptrSymbols; 65 | $this->synsetOffsets = $synsetOffsets; 66 | } 67 | 68 | /** 69 | * 70 | * @return string 71 | */ 72 | public function getWord() 73 | { 74 | return $this->word; 75 | } 76 | 77 | 78 | 79 | /** 80 | * 81 | * @return int 82 | */ 83 | public function getSynsetCnt() 84 | { 85 | return $this->synsetCnt; 86 | } 87 | 88 | /** 89 | * 90 | * @return int 91 | */ 92 | public function getPCnt() 93 | { 94 | return $this->pCnt; 95 | } 96 | 97 | /** 98 | * 99 | * @return int[] 100 | */ 101 | public function getSynsetOffsets() 102 | { 103 | return $this->synsetOffsets; 104 | } 105 | 106 | /** 107 | * 108 | * @return string[] 109 | */ 110 | public function getPtrSymbols() 111 | { 112 | return $this->ptrSymbols; 113 | } 114 | 115 | /** 116 | * Get the synsets for this lemma 117 | * @return Synset[] 118 | */ 119 | public function getSynsets() 120 | { 121 | return $this->synsets; 122 | } 123 | 124 | /** 125 | * 126 | * @param Synset[] $synsets 127 | * @return \TextAnalysis\Models\Wordnet\Lemma 128 | */ 129 | public function setSynsets(array $synsets) 130 | { 131 | $this->synsets = $synsets; 132 | return $this; 133 | } 134 | 135 | } 136 | -------------------------------------------------------------------------------- /src/NGrams/StatisticFacade.php: -------------------------------------------------------------------------------- 1 | 12 | * @author Kaue Oliveira Almeida 13 | */ 14 | class StatisticFacade 15 | { 16 | protected function __construct(){} 17 | 18 | /** 19 | * Calculate the 
/**
 * Stem the token, then replace the stem with the speller's first
 * suggestion. White-listed tokens are returned untouched.
 * @param string $token
 * @return string the first spelling suggestion for the stem, or the stem
 * itself when the speller offers no suggestion
 */
public function stem($token)
{
    // strict comparison so numeric-looking tokens cannot match loosely
    if (in_array($token, $this->whiteList, true)) {
        return $token;
    }

    $stemmed = $this->stemmer->stem($token);
    $suggestions = $this->spell->suggest($stemmed);

    // an empty suggestion list used to trigger an undefined offset
    return $suggestions[0] ?? $stemmed;
}
stem($token) 29 | { 30 | if(array_key_exists($token, $this->dictionary)){ 31 | return $this->dictionary[$token]; 32 | } 33 | return $token; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/Stemmers/MorphStemmer.php: -------------------------------------------------------------------------------- 1 | wordnetIndex = new WordnetIndex(new WordnetCorpus(get_storage_path('corpora/wordnet'))); 32 | } 33 | 34 | /** 35 | * 36 | * @return WordnetIndex 37 | */ 38 | public function getWordnetIndex() 39 | { 40 | return $this->wordnetIndex; 41 | } 42 | 43 | /** 44 | * 45 | * @param string $token 46 | * @return string 47 | */ 48 | public function stem($token) 49 | { 50 | if(!isset($this->cache[$token])) { 51 | if(mb_strlen($token) < 3){ 52 | $this->cache[$token] = $token; 53 | } else { 54 | $this->cache[$token] = $this->getWordnetIndex()->getMorph($token); 55 | } 56 | } 57 | return $this->cache[$token]; 58 | } 59 | 60 | public function __destruct() 61 | { 62 | unset($this->cache); 63 | unset($this->wordnetIndex); 64 | } 65 | 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/Stemmers/RegexStemmer.php: -------------------------------------------------------------------------------- 1 | regexExpression = $regexExpression; 20 | $this->minimumTokenLength = $minimumTokenLength; 21 | } 22 | 23 | /** 24 | * Return a stemmed word 25 | * @param string $token 26 | * @return string 27 | */ 28 | public function stem($token) 29 | { 30 | if(strlen($token) < $this->minimumTokenLength) { 31 | return $token; 32 | } 33 | return preg_replace("/".$this->regexExpression."/i", '', $token); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/Stemmers/SnowballStemmer.php: -------------------------------------------------------------------------------- 1 | stemmer = new $className(); 36 | } 37 | // support version 2 and above 38 | else { 39 | 
$this->stemmer = StemmerFactory::create (strtolower($stemmerType)); 40 | } 41 | } 42 | 43 | public function stem($token) : string 44 | { 45 | return $this->stemmer->stem($token); 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /src/Taggers/StanfordNerTagger.php: -------------------------------------------------------------------------------- 1 | tmpFilePath = tempnam(sys_get_temp_dir(), "stanford_ner_"); 30 | } 31 | 32 | public function getCommand() 33 | { 34 | return escapeshellcmd( 35 | $this->getPathToJava() . 36 | " ".implode(" ", $this->getJavaOptions()) . 37 | " -cp " . $this->getJarPath() . $this->getPathSeparator() . 38 | dirname($this->getJarPath()).DIRECTORY_SEPARATOR."lib".DIRECTORY_SEPARATOR."*". 39 | " edu.stanford.nlp.ie.crf.CRFClassifier " . 40 | " -loadClassifier {$this->getClassifierPath()}" . 41 | " -textFile {$this->getTmpFilePath()}" 42 | ); 43 | } 44 | 45 | /** 46 | * 47 | * @return array 48 | */ 49 | protected function getParsedOutput() 50 | { 51 | $data = []; 52 | 53 | $filter = new PunctuationFilter(); 54 | $phrases = (new WhitespaceTokenizer())->tokenize($this->output ?? ''); 55 | foreach($phrases as $phrase) 56 | { 57 | $tokens = explode("{$this->getSeparator()}", $phrase); 58 | $type = array_pop($tokens); 59 | 60 | foreach($tokens as $token) 61 | { 62 | if(empty($token) || empty($filter->transform($token))) { 63 | continue; 64 | } 65 | $data[] = [$token,$type]; 66 | } 67 | } 68 | return $data; 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /src/Taggers/StanfordPosTagger.php: -------------------------------------------------------------------------------- 1 | tmpFilePath = tempnam(sys_get_temp_dir(), "stanford_pos_"); 30 | } 31 | 32 | /** 33 | * 34 | * @return string 35 | */ 36 | public function getCommand() 37 | { 38 | return escapeshellcmd( 39 | $this->getPathToJava() . 40 | " ".implode(" ", $this->getJavaOptions()) . 
41 | " -cp " . $this->getJarPath() . $this->getPathSeparator() . 42 | dirname($this->getJarPath()).DIRECTORY_SEPARATOR."lib".DIRECTORY_SEPARATOR."*". 43 | " edu.stanford.nlp.tagger.maxent.MaxentTagger " . 44 | " -model {$this->getClassifierPath()}" . 45 | " -textFile {$this->getTmpFilePath()}" . 46 | " -outputFormat tsv" 47 | ); 48 | } 49 | 50 | protected function getParsedOutput() 51 | { 52 | $data = []; 53 | 54 | $lines = explode(PHP_EOL, $this->output ?? ''); 55 | foreach($lines as $line) 56 | { 57 | $line = str_replace("\t", $this->getSeparator(), $line); 58 | $row = array_map('trim', explode($this->getSeparator(), $line)); 59 | 60 | if(empty($row[0]) || empty(end($row)) ) { 61 | continue; 62 | } 63 | $len = count($row); 64 | for($index = 0; $index < $len-1; $index++) 65 | { 66 | $data[] = [$row[$index],$row[$len-1]]; 67 | } 68 | } 69 | 70 | return $data; 71 | 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /src/Tokenizers/FixedLengthTokenizer.php: -------------------------------------------------------------------------------- 1 | startPosition = $startPosition; 24 | $this->length = $length; 25 | } 26 | 27 | /** 28 | * Return array with single element 29 | * @param string $string 30 | * @return array 31 | */ 32 | public function tokenize(string $string) 33 | { 34 | if(!$this->length) { 35 | return array(substr($string, $this->startPosition)); 36 | } else { 37 | return array(substr($string, $this->startPosition, $this->length)); 38 | } 39 | } 40 | 41 | } 42 | 43 | -------------------------------------------------------------------------------- /src/Tokenizers/GeneralTokenizer.php: -------------------------------------------------------------------------------- 1 | tokenExpression = $tokenExpression; 23 | } 24 | 25 | 26 | /** 27 | * Return tokenized array from string 28 | * @param string $string 29 | * @return array 30 | */ 31 | public function tokenize(string $string) 32 | { 33 | return 
$this->strTokenWrapper($string); 34 | } 35 | 36 | /** 37 | * Use the php function strtok to Tokenize simple string 38 | * @internal 39 | * @return array 40 | */ 41 | protected function strTokenWrapper($string) 42 | { 43 | $token = strtok($string, $this->tokenExpression); 44 | 45 | $tokens = array(); 46 | while ($token !== false) { 47 | // avoid tokenizing white spaces 48 | if(!empty(trim($token))) { 49 | $tokens[] = $token; 50 | } 51 | $token = strtok($this->tokenExpression); 52 | } 53 | return $tokens; 54 | } 55 | } 56 | 57 | -------------------------------------------------------------------------------- /src/Tokenizers/LambdaTokenizer.php: -------------------------------------------------------------------------------- 1 | lambdaFunc = $lambdaFunc; 28 | } 29 | 30 | 31 | public function tokenize(string $string) 32 | { 33 | return call_user_func($this->lambdaFunc, $string); 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/Tokenizers/RegexTokenizer.php: -------------------------------------------------------------------------------- 1 | pattern = $pattern; 20 | $this->flags = $flags; 21 | $this->offset = $offset; 22 | } 23 | 24 | /** 25 | * Wraps preg_match_all 26 | * @param string $string 27 | * @return array 28 | */ 29 | public function tokenize(string $string) 30 | { 31 | $matches = array(); 32 | $count = preg_match_all($this->pattern, $string, $matches, $this->flags, $this->offset); 33 | if($count === false) { 34 | return array(); 35 | } 36 | return $matches[0]; 37 | 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/Tokenizers/TokenizerAbstract.php: -------------------------------------------------------------------------------- 1 | sentence = new Sentence; 24 | } 25 | 26 | /** 27 | * Split the text into sentences 28 | * @param string $string 29 | * @return array 30 | */ 31 | public function tokenize(string $string): array 32 | { 33 | return 
filter_empty( $this->sentence->split($string)); 34 | } 35 | 36 | public function __destruct() 37 | { 38 | unset($this->sentence); 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/Tokenizers/WhitespaceTokenizer.php: -------------------------------------------------------------------------------- 1 | pos = $pos; 24 | } 25 | 26 | /** 27 | * Returns single character 28 | * @return string 29 | */ 30 | public function getPos() 31 | { 32 | return $this->pos; 33 | } 34 | 35 | /** 36 | * 37 | * @param string[] $ptrSymbols 38 | */ 39 | public function setPtrSymbols(array $ptrSymbols) 40 | { 41 | $this->ptrSymbols = $ptrSymbols; 42 | } 43 | 44 | /** 45 | * 46 | * @return string[] 47 | */ 48 | public function getPtrSymbols() 49 | { 50 | return $this->ptrSymbols; 51 | } 52 | 53 | public function isAntonym() 54 | { 55 | return $this->isA('!'); 56 | } 57 | 58 | public function isHypernym() 59 | { 60 | return $this->isA('@'); 61 | } 62 | 63 | public function isInstanceHypernym() 64 | { 65 | return $this->isA('@!'); 66 | } 67 | 68 | public function isHyponym() 69 | { 70 | return $this->isA('~'); 71 | } 72 | 73 | public function isInstanceHyponym() 74 | { 75 | return $this->isA('~i'); 76 | } 77 | 78 | public function isMemberHolonym() 79 | { 80 | return $this->isA('#m'); 81 | } 82 | 83 | public function isSubstanceHolonym() 84 | { 85 | return $this->isA('#s'); 86 | } 87 | 88 | public function isPartHolonym() 89 | { 90 | return $this->isA('#p'); 91 | } 92 | 93 | public function isMemberMeronym() 94 | { 95 | return $this->isA('%m'); 96 | } 97 | 98 | public function isSubstanceMeronym() 99 | { 100 | return $this->isA('%s'); 101 | } 102 | 103 | public function isPartMeronym() 104 | { 105 | return $this->isA('%p'); 106 | } 107 | 108 | public function isAttribute() 109 | { 110 | return $this->isA('='); 111 | } 112 | 113 | public function isDerivation() 114 | { 115 | return $this->isA('+'); 116 | } 117 | 118 | public function 
isEntailment() 119 | { 120 | return $this->isA('*'); 121 | } 122 | 123 | public function isCause() 124 | { 125 | return $this->isA('>'); 126 | } 127 | 128 | public function isSeeAlso() 129 | { 130 | return $this->isA('>'); 131 | } 132 | 133 | public function isVerbGroup() 134 | { 135 | return $this->isA('$'); 136 | } 137 | 138 | public function isSimilarTo() 139 | { 140 | return $this->isA('$'); 141 | } 142 | 143 | public function isParticipleOfVerb() 144 | { 145 | return $this->isA('<'); 146 | } 147 | 148 | public function isPertainym() 149 | { 150 | return $this->isA('\\'); 151 | } 152 | 153 | public function isDerivedFromAdjective() 154 | { 155 | return $this->isA('\\'); 156 | } 157 | 158 | /** 159 | * 160 | * @param string $symbol 161 | * @return boolean 162 | */ 163 | protected function isA($symbol) 164 | { 165 | return in_array($symbol, $this->getPtrSymbols()); 166 | } 167 | 168 | } 169 | 170 | -------------------------------------------------------------------------------- /src/Utilities/Nltk/Download/Package.php: -------------------------------------------------------------------------------- 1 | id = $id; 35 | $this->checksum = $checksum; 36 | $this->name = $name; 37 | $this->subdir = $subdir; 38 | $this->unzip = $unzip; 39 | $this->url = $url; 40 | } 41 | 42 | public function getName() 43 | { 44 | return $this->name; 45 | } 46 | 47 | public function getChecksum() 48 | { 49 | return $this->checksum; 50 | } 51 | 52 | public function getId() 53 | { 54 | return $this->id; 55 | } 56 | 57 | public function getSubdir() 58 | { 59 | return $this->subdir; 60 | } 61 | 62 | public function getUnzip() 63 | { 64 | return $this->unzip; 65 | } 66 | 67 | public function getUrl() 68 | { 69 | return $this->url; 70 | } 71 | 72 | /** 73 | * Returns the path the package should be installed into 74 | * @return string 75 | */ 76 | public function getInstallationPath() 77 | { 78 | return get_storage_path($this->getSubdir().DIRECTORY_SEPARATOR.$this->getId()); 79 | } 80 | } 81 | 
-------------------------------------------------------------------------------- /src/Utilities/Vowels/EnglishVowels.php: -------------------------------------------------------------------------------- 1 | Return a TextCorpus object', 11 | 'normalize(string $text) -> Normalize text to lower case', 12 | 'todo ....' 13 | ]; 14 | print_array($menu); 15 | } 16 | 17 | -------------------------------------------------------------------------------- /src/helpers/print.php: -------------------------------------------------------------------------------- 1 | $returnValue) 70 | { 71 | $mock->shouldReceive($methodName) 72 | ->andReturn($returnValue); 73 | } 74 | return $mock; 75 | } 76 | 77 | /** 78 | * 79 | * @return InvertedIndex 80 | */ 81 | public function getInvertedIndex() 82 | { 83 | if(!$this->invertedIndex) { 84 | $docs = [ 85 | new TokensDocument(["marquette", "michigan", "hiking", "hiking", "hiking" , "camping", "swimming"]), 86 | new TokensDocument(["ironwood", "michigan", "hiking", "biking", "camping", "swimming","marquette"]), 87 | new TokensDocument(["no","tokens","michigan"]) 88 | ]; 89 | $collection = new DocumentArrayCollection($docs); 90 | $builder = new CollectionInvertedIndexBuilder($collection); 91 | $dataReader = new ArrayDataReaderAdapter($builder->getIndex()); 92 | $this->invertedIndex = new InvertedIndex($dataReader); 93 | } 94 | return $this->invertedIndex; 95 | } 96 | } 97 | 98 | 99 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Adapters/PspellAdapterTest.php: -------------------------------------------------------------------------------- 1 | assertEquals('run', $adapter->suggest("runn")[0]); 22 | $this->assertEquals('Cooper', $adapter->suggest("yooper")[0]); 23 | $this->assertEquals('flute', $adapter->suggest("flute")[0]); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Analysis/DateAnalysisTest.php: 
-------------------------------------------------------------------------------- 1 | assertEquals("2015-09-01", $dateAnalysis->getDates()[0]->format('Y-m-d')); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Analysis/FreqDistTest.php: -------------------------------------------------------------------------------- 1 | assertTrue(count($freqDist->getHapaxes()) === 3); 17 | $this->assertEquals(9, $freqDist->getTotalTokens()); 18 | $this->assertEquals(6, $freqDist->getTotalUniqueTokens()); 19 | } 20 | 21 | public function testEmptyHapaxesFreqDist() 22 | { 23 | $freqDist = new FreqDist(array("time", "time", "what", "what")); 24 | $this->assertTrue(count($freqDist->getHapaxes()) === 0); 25 | $this->assertEquals(4, $freqDist->getTotalTokens()); 26 | $this->assertEquals(2, $freqDist->getTotalUniqueTokens()); 27 | } 28 | 29 | public function testSingleHapaxFreqDist() 30 | { 31 | $freqDist = new FreqDist(array("time")); 32 | $this->assertTrue(count($freqDist->getHapaxes()) === 1); 33 | $this->assertEquals(1, $freqDist->getTotalTokens()); 34 | $this->assertEquals(1, $freqDist->getTotalUniqueTokens()); 35 | } 36 | 37 | /** 38 | * 39 | */ 40 | public function testEmptyFreqDist() 41 | { 42 | $this->expectException(\TextAnalysis\Exceptions\InvalidParameterSizeException::class); 43 | $freqDist = new FreqDist([]); 44 | } 45 | } 46 | 47 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Analysis/Keywords/RakeTest.php: -------------------------------------------------------------------------------- 1 | transform($this->getTestData()); 26 | //rake MUST be split on whitespace and new lines only 27 | $tokens = (new GeneralTokenizer(" \n\t\r"))->tokenize($testData); 28 | $tokenDoc = new TokensDocument($tokens); 29 | $tokenDoc->applyTransformation(new LowerCaseFilter()) 30 | ->applyTransformation(new StopWordsFilter($stopwords), true) 31 | ->applyTransformation(new 
PunctuationFilter(['@',':','\/']), true) 32 | ->applyTransformation(new CharFilter(), true); 33 | 34 | $rake = new Rake($tokenDoc, 3); 35 | $results = $rake->getKeywordScores(); 36 | $this->assertArrayHasKey('minimal generating sets', $results); 37 | $this->assertArrayHasKey('8/8/2016 5:51 pm', $results); 38 | } 39 | 40 | public function testSimplifiedRake() 41 | { 42 | $stopwords = array_map('trim', file(VENDOR_DIR.'yooper/stop-words/data/stop-words_english_1_en.txt')); 43 | // all punctuation must be moved 1 over. Fixes issues with sentences 44 | $testData = (new SpacePunctuationFilter([':','\/']))->transform($this->getTestData()); 45 | //rake MUST be split on whitespace and new lines only 46 | $tokens = (new GeneralTokenizer(" \n\t\r"))->tokenize($testData); 47 | $tokenDoc = new TokensDocument($tokens); 48 | $tokenDoc->applyTransformation(new LowerCaseFilter()) 49 | ->applyTransformation(new StopWordsFilter($stopwords), true) 50 | ->applyTransformation(new PunctuationFilter(['@',':','\/']), true) 51 | ->applyTransformation(new CharFilter(), true); 52 | 53 | $rake = rake($tokenDoc->toArray(), 3); 54 | $results = $rake->getKeywordScores(); 55 | $this->assertArrayHasKey('minimal generating sets', $results); 56 | $this->assertArrayHasKey('8/8/2016 5:51 pm', $results); 57 | } 58 | 59 | /** 60 | * Sample test data 61 | * @return string 62 | */ 63 | public function getTestData() 64 | { 65 | return <<assertTrue($collection->count() === 3); 28 | 29 | $this->assertEquals($collection[2]->getDocumentData(), array("no","tokens")); 30 | } 31 | 32 | public function testFiltersOnCollection() 33 | { 34 | $docs = array( 35 | new TokensDocument(array("Marquette", "Michigan's", "hiking", "hiking", "hiking" , "camping", "swimming")), 36 | new TokensDocument(array("Ironwood", "michigan", "hiking", "biking", "camping", "swimming","marquette")), 37 | new TokensDocument(array("No","Tokens")) 38 | ); 39 | 40 | $collection = new DocumentArrayCollection($docs); 41 | 42 | $filters = array( 
43 | new LowerCaseFilter(), 44 | new QuotesFilter() 45 | ); 46 | 47 | $collection->applyTransformations($filters); 48 | 49 | 50 | $this->assertTrue($collection->count() === 3); 51 | 52 | $this->assertEquals(array("marquette", "michigans", "hiking", "hiking", "hiking" , "camping", "swimming"), $collection[0]->getDocumentData()); 53 | $this->assertEquals(array("ironwood", "michigan", "hiking", "biking", "camping", "swimming","marquette"),$collection[1]->getDocumentData()); 54 | $this->assertEquals(array("no","tokens"), $collection[2]->getDocumentData()); 55 | 56 | } 57 | } 58 | 59 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Collocations/CollocationFinderTest.php: -------------------------------------------------------------------------------- 1 | transform(self::$text); 28 | $tokens = (new GeneralTokenizer(" \n\t\r"))->tokenize($testData); 29 | $tokenDoc = new TokensDocument($tokens); 30 | $tokenDoc->applyTransformation(new LowerCaseFilter()) 31 | ->applyTransformation(new PunctuationFilter([]), false) 32 | ->applyTransformation(new StopWordsFilter($stopwords)) 33 | ->applyTransformation(new QuotesFilter()) 34 | ->applyTransformation(new CharFilter()); 35 | 36 | $finder = new CollocationFinder($tokenDoc->toArray()); 37 | $this->assertArrayHasKey('injun joe', $finder->getCollocations()); 38 | } 39 | 40 | public function testCollocationFinderTrigram() 41 | { 42 | $stopwords = array_map('trim', file(VENDOR_DIR.'yooper/stop-words/data/stop-words_english_1_en.txt')); 43 | $testData = (new SpacePunctuationFilter())->transform(self::$text); 44 | $tokens = (new GeneralTokenizer(" \n\t\r"))->tokenize($testData); 45 | $tokenDoc = new TokensDocument($tokens); 46 | $tokenDoc->applyTransformation(new LowerCaseFilter()) 47 | ->applyTransformation(new PunctuationFilter([]), false) 48 | ->applyTransformation(new StopWordsFilter($stopwords)) 49 | ->applyTransformation(new QuotesFilter()) 50 | ->applyTransformation(new 
CharFilter()); 51 | 52 | $finder = new CollocationFinder($tokenDoc->toArray(), 3); 53 | $this->assertArrayHasKey('finn red handed', $finder->getCollocations()); 54 | } 55 | 56 | public function testGetCollocationsByPmi() 57 | { 58 | $testData = (new SpacePunctuationFilter())->transform(self::$text); 59 | $tokens = (new GeneralTokenizer(" \n\t\r"))->tokenize($testData); 60 | $tokenDoc = new TokensDocument($tokens); 61 | $tokenDoc->applyTransformation(new LowerCaseFilter()) 62 | ->applyTransformation(new PunctuationFilter([]), false) 63 | ->applyTransformation(new StopWordsFilter([])) 64 | ->applyTransformation(new QuotesFilter()) 65 | ->applyTransformation(new CharFilter()); 66 | 67 | $finder = new CollocationFinder($tokenDoc->toArray(), 2); 68 | $this->assertArrayHasKey('outlying cottages', $finder->getCollocationsByPmi()); 69 | 70 | } 71 | 72 | 73 | } 74 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Comparisons/CosineSimilarityComparisonTest.php: -------------------------------------------------------------------------------- 1 | assertEquals(1.0, round($compare->similarity($text1, $text2), 1)); 20 | 21 | } 22 | 23 | public function testDifferent() 24 | { 25 | $text1 = ["hiking" , "hiking", "camping", "swimming"]; 26 | $text2 = ["hiking" , "biking", "camping", "swimming"]; 27 | $compare = new CosineSimilarityComparison(); 28 | $this->assertEquals(0.8, round($compare->similarity($text1, $text2), 1)); 29 | } 30 | 31 | public function testNothingInCommon() 32 | { 33 | $text1 = ["hiking", "camping", "swimming"]; 34 | $text2 = ["biking", "boating", "floating"]; 35 | $compare = new CosineSimilarityComparison(); 36 | $this->assertEquals(0, $compare->similarity($text1, $text2)); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Comparisons/HammingDistanceComparisonTest.php: 
-------------------------------------------------------------------------------- 1 | 12 | */ 13 | class HammingDistanceComparisonTest extends \PHPUnit\Framework\TestCase 14 | { 15 | public function testHammingDistance() 16 | { 17 | $c = new HammingDistanceComparison(); 18 | $this->assertEquals(3, $c->distance('karolin', 'kathrin')); 19 | $this->assertEquals(3, $c->distance('karolin', 'kerstin')); 20 | $this->assertEquals(2, $c->distance('1011101', '1001001')); 21 | $this->assertEquals(3, $c->distance('2173896', '2233796')); 22 | } 23 | } -------------------------------------------------------------------------------- /tests/TextAnalysis/Comparisons/JaccardIndexComparisonTest.php: -------------------------------------------------------------------------------- 1 | 12 | */ 13 | class JaccardIndexComparisonTest extends \PHPUnit\Framework\TestCase 14 | { 15 | public function testJaccardIndex() 16 | { 17 | $c = new JaccardIndexComparison(); 18 | $this->assertEquals(1, $c->similarity('a', 'a')); 19 | $this->assertEquals(1, $c->similarity(['a'], ['a'])); 20 | $this->assertEquals(1, $c->similarity(['a','b'], ['b','a'])); 21 | $this->assertEquals(.5, $c->similarity(['a','b'], ['a'])); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Comparisons/JaroWinklerComparisonTest.php: -------------------------------------------------------------------------------- 1 | 11 | */ 12 | class JaroWinklerComparisonTest extends \PHPUnit\Framework\TestCase 13 | { 14 | public function testJaroWinkler() 15 | { 16 | $jw = new JaroWinklerComparison(); 17 | $this->assertEquals('0.961', sprintf("%1.3f", $jw->similarity('MARTHA', 'MARHTA'))); 18 | $this->assertEquals('0.840', sprintf("%1.3f", $jw->similarity('DWAYNE', 'DUANE'))); 19 | $this->assertEquals('0.813', sprintf("%1.3f", $jw->similarity('DIXON', 'DICKSONX'))); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- 
/tests/TextAnalysis/Comparisons/LevenshteinComparisonTest.php: -------------------------------------------------------------------------------- 1 | assertEquals(1, $comparison->distance('hat', 'cat')); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Comparisons/LongestCommonSubstringComparisonTest.php: -------------------------------------------------------------------------------- 1 | 11 | */ 12 | class LongestSubstringComparisonTest extends \PHPUnit\Framework\TestCase 13 | { 14 | public function testLcs() 15 | { 16 | $lcs = new LongestCommonSubstringComparison(); 17 | 18 | $txt1 = "Michael"; 19 | $txt2 = "Michelle"; 20 | $this->assertEquals(4, $lcs->distance($txt2, $txt1)); 21 | $this->assertEquals("Mich", $lcs->similarity($txt2, $txt1)); 22 | 23 | $txt1 = "sunnyside"; 24 | $txt2 = "hide"; 25 | 26 | 27 | $this->assertEquals(6, $lcs->distance($txt2, $txt1)); 28 | $this->assertEquals("ide", $lcs->similarity($txt2, $txt1)); 29 | } 30 | 31 | public function testLcsWithCache() 32 | { 33 | $lcs = new LongestCommonSubstringComparison(true); 34 | $txt1 = "Michael"; 35 | $txt2 = "Michelle"; 36 | $this->assertEquals(4, $lcs->distance($txt2, $txt1)); 37 | $this->assertEquals("Mich", $lcs->similarity($txt2, $txt1)); 38 | 39 | $txt1 = "sunnyside"; 40 | $txt2 = "hide"; 41 | 42 | $this->assertEquals(6, $lcs->distance($txt2, $txt1)); 43 | $this->assertEquals("ide", $lcs->similarity($txt2, $txt1)); 44 | 45 | $this->assertCount(2, $lcs->getCache()); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Comparisons/MostFreqCharComparisonTest.php: -------------------------------------------------------------------------------- 1 | 11 | */ 12 | class MostFreqCharComparisonTest extends \PHPUnit\Framework\TestCase 13 | { 14 | public function testComparison() 15 | { 16 | $mf = new MostFreqCharComparison(); 17 | $this->assertEquals(4, 
$mf->similarity('research', 'research')); 18 | $this->assertEquals(2, $mf->similarity('research', 'seeking')); 19 | $this->assertEquals(3, $mf->similarity('significant', 'capabilities')); 20 | 21 | $this->assertEquals(4, $mf->distance('research', 'research')); 22 | $this->assertEquals(6, $mf->distance('research', 'seeking')); 23 | $this->assertEquals(9, $mf->distance('significant', 'capabilities')); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Corpus/ImportCorpusTest.php: -------------------------------------------------------------------------------- 1 | shouldReceive('getInstallationPath') 20 | ->andReturn(TEST_DATA_DIR.DS.'books'.DS); 21 | 22 | $mockImportCorpus = Mockery::mock('TextAnalysis\Corpus\ImportCorpus[getPackage,getFileIds]', [null, null, null, null, null, null]) 23 | ->shouldAllowMockingProtectedMethods(); 24 | 25 | $mockImportCorpus->shouldReceive('getPackage') 26 | ->andReturn($mockPackage); 27 | 28 | $mockImportCorpus->shouldReceive('getFileIds') 29 | ->andReturn(['tom_sawyer.txt']); 30 | 31 | $this->assertEquals(['tom_sawyer.txt'], $mockImportCorpus->getFileIds()); 32 | $this->assertCount(76057, $mockImportCorpus->getWords()); 33 | $this->assertCount(1, $mockImportCorpus->getRaw()); 34 | // sentence tokenizer is too slow 35 | ///var_dump($mockImportCorpus->getSentences()); 36 | //$this->assertCount(5227, $mockImportCorpus->getSentences()); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Corpus/NameCorpusTest.php: -------------------------------------------------------------------------------- 1 | assertTrue($corpus->isFirstName('Mike')); 25 | $this->assertFalse($corpus->isFirstName('very')); 26 | } 27 | 28 | public function testGetFirstName() 29 | { 30 | if( getenv('SKIP_TEST')) { 31 | return; 32 | } 33 | 34 | $corpus = new NameCorpus(); 35 | $firstName = $corpus->getFirstName('Mike'); 36 | 
$this->assertNotEmpty($firstName); 37 | 38 | $this->assertEmpty($corpus->getFirstName('very')); 39 | } 40 | 41 | 42 | public function testLastNames() 43 | { 44 | if( getenv('SKIP_TEST')) { 45 | return; 46 | } 47 | 48 | $corpus = new NameCorpus(); 49 | $this->assertTrue($corpus->isLastName('Williamson')); 50 | $this->assertFalse($corpus->isLastName('Baggins')); 51 | } 52 | 53 | public function testGetLastName() 54 | { 55 | if( getenv('SKIP_TEST')) { 56 | return; 57 | } 58 | 59 | $corpus = new NameCorpus(); 60 | $lastName = $corpus->getLastName('Williamson'); 61 | $this->assertEquals(245, $lastName['rank']); 62 | 63 | $lastName = $corpus->getLastName('Baggins'); 64 | $this->assertEmpty($lastName); 65 | } 66 | 67 | 68 | public function testFullNames() 69 | { 70 | if( getenv('SKIP_TEST')) { 71 | return; 72 | } 73 | 74 | $corpus = new NameCorpus(); 75 | $this->assertTrue($corpus->isFullName('Brad Von Williamson')); 76 | $this->assertFalse($corpus->isFullName('Jimbo')); 77 | $this->assertTrue($corpus->isFullName('Bradley Thomas')); 78 | } 79 | 80 | 81 | 82 | } 83 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Corpus/TextCorpusTest.php: -------------------------------------------------------------------------------- 1 | assertInstanceOf(TextCorpus::class, text($this->getText())); 19 | } 20 | 21 | public function testConcordance() 22 | { 23 | $corpus = new TextCorpus($this->getText()); 24 | $results = $corpus->concordance("tom sawyer"); 25 | $this->assertCount(34, $results); 26 | } 27 | 28 | public function testConcordancePtBr() 29 | { 30 | $corpus = new TextCorpus($this->getText('ptbr')); 31 | $results = $corpus->concordance("José",20, true, 'equal'); 32 | $this->assertCount(160, $results); 33 | } 34 | 35 | public function testTokenizer() 36 | { 37 | $corpus = new TextCorpus($this->getText()); 38 | $results = $corpus->getTokens(); 39 | $this->assertCount(76057, $results); 40 | } 41 | 42 | public function 
testFindAll() 43 | { 44 | $corpus = new TextCorpus($this->getText()); 45 | $results = $corpus->findAll("tom sawyer"); 46 | $this->assertCount(32, $results); 47 | } 48 | 49 | public function testDispersion() 50 | { 51 | $corpus = new TextCorpus($this->getText()); 52 | $results = $corpus->getDispersion(["tom sawyer", "huck finn"]); 53 | $this->assertCount(22, $results[0]); 54 | $this->assertCount(58, $results[1]); 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Corpus/WordnetCorpusTest.php: -------------------------------------------------------------------------------- 1 | getFileNames() as $fileName) 23 | { 24 | $this->assertFileExists($wn->getDir().$fileName); 25 | } 26 | } 27 | 28 | public function testGetLexNames() 29 | { 30 | if( getenv('SKIP_TEST') || !is_dir(get_storage_path('corpora/wordnet'))) { 31 | return; 32 | } 33 | $wn = new WordnetCorpus(get_storage_path('corpora/wordnet')); 34 | $this->assertCount(45, $wn->getLexNames()); 35 | } 36 | 37 | public function testGetLemmaFromString() 38 | { 39 | $testLines = [ 40 | 'zombie n 5 3 @ %s ; 5 1 10805638 10805932 10805783 09825519 07919165' 41 | ]; 42 | $wn = new WordnetCorpus("not_checked"); 43 | $lemma = $wn->getLemmaFromString($testLines[0]); 44 | $this->assertCount(5, $lemma->getSynsetOffsets()); 45 | $this->assertEquals('n', $lemma->getPos()); 46 | $this->assertTrue($lemma->isHypernym()); 47 | } 48 | 49 | 50 | public function testGetSynsetFromString() 51 | { 52 | $testLines = [ 53 | "825519 18 n 03 automaton 1 zombi 1 zombie 1 004 @ 09606527 n 0000 + 01499999 a 0101 + 00480221 v 0101 + 00480221 v 0102 | someone who acts or responds in a mechanical or apathetic way; \only an automaton wouldn't have noticed\"" ]; 54 | $wn = new WordnetCorpus("not_checked"); 55 | $synset = $wn->getSynsetFromString($testLines[0]); 56 | $this->assertCount(3, $synset->getWords()); 57 | $this->assertCount(4, $synset->getLinkedSynsets()); 58 | } 59 | 60 | 
61 | public function testGetLemmas() 62 | { 63 | if( getenv('SKIP_TEST') || !is_dir(get_storage_path('corpora/wordnet'))) { 64 | return; 65 | } 66 | $wnMock = $this->getPartialMock(WordnetCorpus::class, ['getIndexFileNames' => ['index.adj']], [get_storage_path('corpora/wordnet')]); 67 | $this->assertCount(21479, $wnMock->getLemmas()); 68 | $keys = array_keys($wnMock->getLemmas()); 69 | $lemma = $wnMock->getLemmas()[$keys[0]]; 70 | $this->assertEquals('.22-caliber', $lemma->getWord()); 71 | $this->assertTrue($lemma->isPertainym()); 72 | $this->assertFalse($lemma->isAttribute()); 73 | } 74 | 75 | 76 | public function testGetSynsets() 77 | { 78 | if( getenv('SKIP_TEST') || !is_dir(get_storage_path('corpora/wordnet'))) { 79 | return; 80 | } 81 | $wn = new WordnetCorpus(get_storage_path('corpora/wordnet')); 82 | $synset = $wn->getSynsetByOffsetAndPos(9825519, 'n'); 83 | $this->assertEquals(['automaton','zombi','zombie'], $synset->getWords()); 84 | $this->assertCount(4, $synset->getLinkedSynsets()); 85 | } 86 | 87 | public function testGetExceptionMapFromString() 88 | { 89 | $wn = new WordnetCorpus('not_used'); 90 | 91 | $e1 = $wn->getExceptionMapFromString('thieves thief', 'n'); 92 | $this->assertCount(1, $e1->getExceptionList()); 93 | $this->assertEquals('thief', $e1->getTarget()); 94 | $this->assertEquals('thieves', $e1->getExceptionList()[0]); 95 | 96 | $e2 = $wn->getExceptionMapFromString('ploughmen ploughman plowman', 'n'); 97 | $this->assertCount(2, $e2->getExceptionList()); 98 | $this->assertEquals('plowman', $e2->getTarget()); 99 | $this->assertEquals(['ploughmen', 'ploughman'], $e2->getExceptionList()); 100 | 101 | } 102 | 103 | } 104 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Downloaders/NltkCorporalIndexDownloaderTest.php: -------------------------------------------------------------------------------- 1 | shouldAllowMockingProtectedMethods(); 22 | 23 | $mock->shouldReceive('getXmlContent') 24 | 
->andReturn(simplexml_load_string($this->getXmlContent())); 25 | 26 | $packages = $mock->getPackages(); 27 | $this->assertCount(2, $packages); 28 | $this->assertEquals('maxent_ne_chunker', $packages[0]->getId()); 29 | $this->assertEquals('abc', $packages[1]->getId()); 30 | } 31 | 32 | /** 33 | * 34 | * @return string 35 | */ 36 | public function getXmlContent() 37 | { 38 | return << 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | XML; 63 | 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Extracts/DateExtractTest.php: -------------------------------------------------------------------------------- 1 | assertFalse($extract->filter("no date in jan. set")); 18 | $this->assertInstanceOf('DateTime', $extract->filter('jan. 12th 1999')); 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Extracts/HashTagTest.php: -------------------------------------------------------------------------------- 1 | assertFalse($extract->filter("testing")); 18 | $this->assertEquals('#holiday', $extract->filter('#holiday')); 19 | $this->assertFalse($extract->filter('#DA')); 20 | } 21 | 22 | public function testMinLengthHashTag() 23 | { 24 | $extract = new HashTag(2); 25 | $this->assertEquals('#DA', $extract->filter('#DA')); 26 | $this->assertFalse($extract->filter('#1')); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Filters/CharFilterTest.php: -------------------------------------------------------------------------------- 1 | assertEquals(' ', $transformer->transform(' A ')); 20 | $this->assertEquals(' ', $transformer->transform(' ! 
')); 21 | $this->assertEquals(' 9 ', $transformer->transform(' 9 ')); 22 | 23 | $this->assertEquals('A', $transformer->transform('A')); 24 | $this->assertEquals('!', $transformer->transform('!')); 25 | $this->assertEquals('9', $transformer->transform('9')); 26 | } 27 | 28 | 29 | 30 | 31 | } -------------------------------------------------------------------------------- /tests/TextAnalysis/Filters/EmailFilterTest.php: -------------------------------------------------------------------------------- 1 | assertEquals(null, $filter->transform("yooper@example.com")); 18 | $this->assertEquals(' , ' , $filter->transform("yooper.mqt@example.sub.dub.edu , yooper@example.com")); 19 | 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Filters/LambdaFilterTest.php: -------------------------------------------------------------------------------- 1 | assertEquals("tomtom", $transformer->transform("bobbob")); 19 | } 20 | 21 | public function testLambdaStrReplace() 22 | { 23 | $lambda = function($word){ 24 | return str_replace("bob", "tom", $word); 25 | }; 26 | $transformer = new LambdaFilter($lambda); 27 | $this->assertEquals("tomtom", $transformer->transform("bobbob")); 28 | } 29 | 30 | } 31 | 32 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Filters/LowerCaseFilterTest.php: -------------------------------------------------------------------------------- 1 | assertEquals("yooper's", $transformer->transform("Yooper's")); 16 | } 17 | 18 | } -------------------------------------------------------------------------------- /tests/TextAnalysis/Filters/NumbersFilterTest.php: -------------------------------------------------------------------------------- 1 | assertEquals('easy street', $filter->transform("123 easy street")); 16 | $this->assertEquals('easy street', $filter->transform("easy street")); 17 | $this->assertEquals('april th,', $filter->transform("april 
25th, 1992")); 18 | 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Filters/PossessiveNounFilterTest.php: -------------------------------------------------------------------------------- 1 | assertEquals("yooper lives in Marquette west side", $filter->transform("yooper's lives in Marquette's west side")); 19 | } 20 | 21 | public function testNonPossessive() 22 | { 23 | $filter = new PossessiveNounFilter(); 24 | $this->assertEquals("yooper", $filter->transform("yooper")); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Filters/PunctuationFilterTest.php: -------------------------------------------------------------------------------- 1 | assertEquals("Yoopers", $transformer->transform("Yooper's!?;,")); 16 | $this->assertEquals("Yoopers", $transformer->transform("Yooper's!?;,")); 17 | 18 | } 19 | 20 | public function testOnDate() 21 | { 22 | $transformer = new PunctuationFilter(['\/',':'], []); 23 | $this->assertEquals('8/8/2016 5:51 PM', $transformer->transform('8/8/2016 5:51 PM')); 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Filters/QuotesFilterTest.php: -------------------------------------------------------------------------------- 1 | assertEquals('Yoopers', $transformer->transform("Yooper's")); 16 | } 17 | 18 | public function testRemoveDoubleQuote() 19 | { 20 | $transformer = new QuotesFilter(); 21 | $this->assertEquals("Peninsula", $transformer->transform('"Peninsula"')); 22 | } 23 | } -------------------------------------------------------------------------------- /tests/TextAnalysis/Filters/SpacePunctuationFilterTest.php: -------------------------------------------------------------------------------- 1 | assertEquals('P . B . R . 
', $filter->transform('P.B.R.')); 18 | $this->assertEquals('8 / 8 / 2016 5 : 51 PM', $filter->transform('8/8/2016 5:51 PM')); 19 | } 20 | 21 | public function testWhiteList() 22 | { 23 | $filter = new SpacePunctuationFilter([],['O','E']); 24 | $this->assertEquals('H O M E R', $filter->transform('HOMER')); 25 | } 26 | 27 | public function testBlackList() 28 | { 29 | $filter = new SpacePunctuationFilter(['\/',':']); 30 | $this->assertEquals('8/8/2016 5:51 PM', $filter->transform('8/8/2016 5:51 PM')); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Filters/StopWordsFilterTest.php: -------------------------------------------------------------------------------- 1 | loadStopwords()); 27 | $this->assertNull($stopWord->transform("again")); 28 | } 29 | 30 | public function testIsNotStopWord() 31 | { 32 | $stopWord = new StopWordsFilter($this->loadStopwords()); 33 | $this->assertEquals("peninsula", $stopWord->transform("peninsula")); 34 | } 35 | 36 | public function testIsStopWord2() 37 | { 38 | $stopWord = new StopWordsFilter($this->loadStopwords()); 39 | $this->assertNull($stopWord->transform("as")); 40 | } 41 | } -------------------------------------------------------------------------------- /tests/TextAnalysis/Filters/UrlFilterTest.php: -------------------------------------------------------------------------------- 1 | assertEquals("google.com", $filter->transform("google.com")); 19 | $this->assertEquals(" , ", $filter->transform("https://github.com/yooper/php-text-analysis/wiki , https://www.facebook.com/?query=1&field=none")); 20 | $this->assertEquals('hello', $filter->transform("hello")); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Indexes/TfIdfTest.php: -------------------------------------------------------------------------------- 1 | wordnetIdx) { 30 | $this->wordnetIdx = new WordnetIndex(new 
WordnetCorpus(get_storage_path('corpora/wordnet'))); 31 | } 32 | return $this->wordnetIdx; 33 | } 34 | 35 | public function testGetLemma() 36 | { 37 | if( getenv('SKIP_TEST') || !is_dir(get_storage_path('corpora/wordnet'))) { 38 | return; 39 | } 40 | 41 | $lemmas = $this->getWordnetIndex()->getLemma('programmer'); 42 | $this->assertCount(8, $lemmas[0]->getSynsets()[0]->getLinkedSynsets()); 43 | } 44 | 45 | public function testGetMorph() 46 | { 47 | if( getenv('SKIP_TEST') || !is_dir(get_storage_path('corpora/wordnet'))) { 48 | return; 49 | } 50 | $this->assertEquals('play', $this->getWordnetIndex()->getMorph('playing')); 51 | $this->assertEquals('dog', $this->getWordnetIndex()->getMorph('dogs')); 52 | $this->assertEquals('church', $this->getWordnetIndex()->getMorph('churches')); 53 | $this->assertEquals('aardwolf', $this->getWordnetIndex()->getMorph('aardwolves')); 54 | $this->assertEquals('abacus', $this->getWordnetIndex()->getMorph('abaci')); 55 | $this->assertEquals('book', $this->getWordnetIndex()->getMorph('books')); 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /tests/TextAnalysis/LexicalDiversity/NaiveTest.php: -------------------------------------------------------------------------------- 1 | getText() )); 16 | $this->assertEqualsWithDelta(0.03461, $result, 0.0001); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /tests/TextAnalysis/LexicalDiversity/YuleITest.php: -------------------------------------------------------------------------------- 1 | getText() ), \TextAnalysis\LexicalDiversity\YuleI::class); 16 | $this->assertEqualsWithDelta(135.9226, $result, 0.0001); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /tests/TextAnalysis/LexicalDiversity/YuleKTest.php: -------------------------------------------------------------------------------- 1 | getText() ), 
\TextAnalysis\LexicalDiversity\YuleK::class); 16 | $this->assertEqualsWithDelta(73.5712, $result, 0.0001); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /tests/TextAnalysis/NGrams/NGramFactoryTest.php: -------------------------------------------------------------------------------- 1 | 12 | */ 13 | class NGramFactoryTest extends \PHPUnit\Framework\TestCase 14 | { 15 | public function testBiGram() 16 | { 17 | $tokens = ["one","two","three"]; 18 | $expected = ["one two","two three"]; 19 | $bigrams = NGramFactory::create($tokens); 20 | $this->assertEquals($expected, $bigrams); 21 | $this->assertEquals($expected, bigrams($tokens)); 22 | } 23 | 24 | public function testTriGram() 25 | { 26 | $tokens = ["one","two","three","four"]; 27 | $expected = ["one two three","two three four"]; 28 | $bigrams = NGramFactory::create($tokens, NGramFactory::TRIGRAM); 29 | $this->assertEquals($expected, $bigrams); 30 | $this->assertEquals($expected, trigrams($tokens)); 31 | 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /tests/TextAnalysis/NGrams/StatisticFacadeTest.php: -------------------------------------------------------------------------------- 1 | 14 | */ 15 | class StatisticFacadeTest extends \PHPUnit\Framework\TestCase 16 | { 17 | private $text; 18 | private $tokens; 19 | 20 | public function setUp() : void 21 | { 22 | parent::setUp(); 23 | $this->text = file_get_contents(TEST_DATA_DIR . DS . 
'Text'.DS.'Analysis'.DS.'text_ngrams.txt'); 24 | $tokenizer = new RegexTokenizer('/([\p{L}]+[\/\-_\']?[\p{L}]+)+|[\p{L}]+/iu'); 25 | $this->tokens = normalize_tokens($tokenizer->tokenize($this->text)); 26 | } 27 | public function testBigrams() 28 | { 29 | 30 | $ngrams = NGramFactory::create($this->tokens, 2, '<>'); 31 | 32 | $ngrams = NGramFactory::getFreq($ngrams, '<>'); 33 | 34 | //test frequency 35 | $this->assertEquals($ngrams['know<>something'], array( 0=>2, 1=> 3, 2 => 2)); 36 | 37 | //test tmi measure 38 | $ngramsStats = StatisticFacade::calculate($ngrams, 'tmi', 2); 39 | $this->assertEquals(round($ngramsStats['know<>something'], 4), 0.1612); 40 | 41 | //test ll measure 42 | $ngramsStats = StatisticFacade::calculate($ngrams, 'll', 2); 43 | $this->assertEquals(round($ngramsStats['know<>something'], 4), 13.8516); 44 | 45 | //test pmi measure 46 | $ngramsStats = StatisticFacade::calculate($ngrams, 'pmi', 2); 47 | $this->assertEquals(round($ngramsStats['know<>something'], 4), 4.3692); 48 | 49 | //test dice measure 50 | $ngramsStats = StatisticFacade::calculate($ngrams, 'dice', 2); 51 | $this->assertEquals(round($ngramsStats['know<>something'], 4), 0.8000); 52 | 53 | //test x2 measure 54 | $ngramsStats = StatisticFacade::calculate($ngrams, 'x2', 2); 55 | $this->assertEquals(round($ngramsStats['know<>something'], 4), 40.6444); 56 | 57 | //test tscore measure 58 | $ngramsStats = StatisticFacade::calculate($ngrams, 'tscore', 2); 59 | $this->assertEquals(round($ngramsStats['know<>something'], 4), 1.3458); 60 | 61 | //test phi measure 62 | $ngramsStats = StatisticFacade::calculate($ngrams, 'phi', 2); 63 | $this->assertEquals(round($ngramsStats['know<>something'], 4), 0.6556); 64 | 65 | //test odds measure 66 | $ngramsStats = StatisticFacade::calculate($ngrams, 'odds', 2); 67 | $this->assertEquals(round($ngramsStats['know<>something'], 4), 118.0000); 68 | 69 | //test leftFisher measure 70 | $ngramsStats = StatisticFacade::calculate($ngrams, 'leftFisher', 2); 71 | 
$this->assertEquals(round($ngramsStats['know<>something'], 4), 1.0000); 72 | 73 | //test rightFisher measure 74 | $ngramsStats = StatisticFacade::calculate($ngrams, 'rightFisher', 2); 75 | $this->assertEquals(round($ngramsStats['know<>something'], 4), 0.0016); 76 | } 77 | 78 | public function testTrigrams() 79 | { 80 | $ngrams = NGramFactory::create($this->tokens, 3, '<>'); 81 | $ngrams = NGramFactory::getFreq($ngrams, '<>'); 82 | 83 | //test frequency 84 | $this->assertEquals($ngrams['the<>know<>something'], array( 0 => 1, 1 => 4, 2 => 3, 3 => 2, 4 => 1, 5 => 1, 6 => 2)); 85 | 86 | //test tmi measure 87 | $ngramsStats = StatisticFacade::calculate($ngrams, 'tmi', 3); 88 | $this->assertEquals(round($ngramsStats['the<>know<>something'], 4), 0.2002); 89 | 90 | //test ll measure 91 | $ngramsStats = StatisticFacade::calculate($ngrams, 'll', 3); 92 | $this->assertEquals(round($ngramsStats['the<>know<>something'], 4), 16.9283); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Stemmers/DictionaryStemmerTest.php: -------------------------------------------------------------------------------- 1 | assertEquals("judge", $stemmer->stem("judges")); 24 | // some times approach does not work 25 | $this->assertEquals('university', $stemmer->stem("universities")); 26 | $this->assertEquals('hammock', $stemmer->stem("hammok")); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Stemmers/LambdaStemmerTest.php: -------------------------------------------------------------------------------- 1 | assertEquals("tom", $stemmer->stem("tommy")); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Stemmers/LancasterStemmerTest.php: -------------------------------------------------------------------------------- 1 | assertEquals('maxim', $stemmer->stem('maximum')); 17 | 
$this->assertEquals('presum', $stemmer->stem('presumably')); 18 | $this->assertEquals('multiply', $stemmer->stem('multiply')); 19 | $this->assertEquals('provid', $stemmer->stem('provision')); 20 | $this->assertEquals('ow', $stemmer->stem('owed')); 21 | $this->assertEquals('ear', $stemmer->stem('ear')); 22 | $this->assertEquals('say', $stemmer->stem('saying')); 23 | $this->assertEquals('cry', $stemmer->stem('crying')); 24 | $this->assertEquals('string', $stemmer->stem('string')); 25 | $this->assertEquals('meant', $stemmer->stem('meant')); 26 | $this->assertEquals('cem', $stemmer->stem('cement')); 27 | $this->assertEquals( null, $stemmer->stem(' ')); 28 | 29 | } 30 | } 31 | 32 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Stemmers/LookupStemmerTest.php: -------------------------------------------------------------------------------- 1 | assertEquals("end", $stemmer->stem("ending")); 21 | $this->assertEquals("end", $stemmer->stem("ended")); 22 | 23 | } 24 | 25 | 26 | } 27 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Stemmers/MorphStemmerTest.php: -------------------------------------------------------------------------------- 1 | assertEquals('university', $stemmer->stem('universities')); 22 | 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Stemmers/PorterStemmerTest.php: -------------------------------------------------------------------------------- 1 | assertEquals('univers', $stemmer->stem('universities')); 20 | $this->assertEquals('judg',$stemmer->stem('judges')); 21 | } 22 | 23 | public function testSimplifiedStemmer() 24 | { 25 | $this->assertEquals(['univers','judg'], stem(['universities', 'judges'])); 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Stemmers/RegexStemmerTest.php: 
-------------------------------------------------------------------------------- 1 | assertEquals("car", $stemmer->stem("car")); 19 | $this->assertEquals("mas", $stemmer->stem("mass")); 20 | $this->assertEquals("was", $stemmer->stem("was")); 21 | $this->assertEquals("bee", $stemmer->stem("bee")); 22 | $this->assertEquals("comput", $stemmer->stem("compute")); 23 | 24 | } 25 | 26 | 27 | } 28 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Stemmers/SnowballStemmerTest.php: -------------------------------------------------------------------------------- 1 | assertEquals("judg", $stemmer->stem("judges")); 18 | $this->assertEquals('ski', $stemmer->stem('skis')); 19 | $this->assertEquals('univers', $stemmer->stem('universities')); 20 | $this->assertEquals('news', $stemmer->stem('news')); 21 | } 22 | 23 | public function testSwedish() 24 | { 25 | $stemmer = new SnowballStemmer('Swedish'); 26 | $this->assertEquals("affärschef", $stemmer->stem("affärscheferna")); 27 | } 28 | 29 | public function testException() 30 | { 31 | $this->expectException('Exception'); 32 | $stemmer = new SnowballStemmer('Wookie'); 33 | } 34 | } -------------------------------------------------------------------------------- /tests/TextAnalysis/Taggers/StanfordNerTaggerTest.php: -------------------------------------------------------------------------------- 1 | expectException('RuntimeException', 'Jar not found not_available.jar'); 26 | $tagger->tag([]); 27 | } 28 | 29 | public function testClassiferNotFound() 30 | { 31 | if( getenv('SKIP_TEST') || !getenv('JAVA_HOME')) { 32 | return; 33 | } 34 | 35 | $tagger = new StanfordNerTagger(get_storage_path($this->nerPath).'stanford-ner.jar', "classifier.gz"); 36 | $this->expectException('RuntimeException', 'Classifier not found classifier.gz'); 37 | $tagger->tag([]); 38 | } 39 | 40 | public function testTempCreatedFile() 41 | { 42 | $mockTagger = 
Mockery::mock('TextAnalysis\Taggers\StanfordNerTagger[exec,verify]', ['bogus.jar', 'bogus.classifier']) 43 | ->shouldAllowMockingProtectedMethods(); 44 | 45 | $mockTagger->shouldReceive('exec') 46 | ->andReturnNull() 47 | ->shouldReceive('verify') 48 | ->andReturnNull(); 49 | 50 | $mockTagger->tag((new WhitespaceTokenizer())->tokenize($this->text)); 51 | $this->assertFileExists($mockTagger->getTmpFilePath()); 52 | $this->assertEquals(138, filesize($mockTagger->getTmpFilePath())); 53 | } 54 | 55 | public function testStanfordNer() 56 | { 57 | if( getenv('SKIP_TEST')) { 58 | return; 59 | } 60 | 61 | $document = new TokensDocument((new WhitespaceTokenizer())->tokenize($this->text)); 62 | $tagger = new StanfordNerTagger(); 63 | $output = $tagger->tag($document->getDocumentData()); 64 | 65 | $this->assertFileExists($tagger->getTmpFilePath()); 66 | $this->assertEquals(138, filesize($tagger->getTmpFilePath())); 67 | $this->assertEquals(['Michigan','LOCATION'], $output[15], "Did you set JAVA_HOME env variable?"); 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Taggers/StanfordPosTaggerTest.php: -------------------------------------------------------------------------------- 1 | expectException('RuntimeException', 'Jar not found not_available.jar'); 26 | $tagger->tag([]); 27 | } 28 | 29 | public function testClassiferNotFound() 30 | { 31 | if( getenv('SKIP_TEST') || !getenv('JAVA_HOME')) { 32 | return; 33 | } 34 | 35 | $tagger = new StanfordPosTagger(get_storage_path($this->posPath).'stanford-postagger-3.6.0.jar', "classifier.gz"); 36 | $this->expectException('RuntimeException', 'Classifier not found classifier.gz'); 37 | $tagger->tag([]); 38 | } 39 | 40 | public function testTempCreatedFile() 41 | { 42 | $mockTagger = Mockery::mock('TextAnalysis\Taggers\StanfordPosTagger[exec,verify]', ['bogus.jar', 'bogus.classifier']) 43 | ->shouldAllowMockingProtectedMethods(); 44 | 45 | 
$mockTagger->shouldReceive('exec') 46 | ->andReturnNull() 47 | ->shouldReceive('verify') 48 | ->andReturnNull(); 49 | 50 | $mockTagger->tag((new WhitespaceTokenizer())->tokenize($this->text)); 51 | $this->assertFileExists($mockTagger->getTmpFilePath()); 52 | $this->assertEquals(138, filesize($mockTagger->getTmpFilePath())); 53 | } 54 | 55 | public function testStanfordPos() 56 | { 57 | if( getenv('SKIP_TEST')) { 58 | return; 59 | } 60 | 61 | $document = new TokensDocument((new WhitespaceTokenizer())->tokenize($this->text)); 62 | $tagger = new StanfordPosTagger(); 63 | $output = $tagger->tag($document->getDocumentData()); 64 | 65 | $this->assertFileExists($tagger->getTmpFilePath()); 66 | $this->assertEquals(138, filesize($tagger->getTmpFilePath())); 67 | $this->assertEquals(['Michigan','NNP'], $output[15], "Did you set JAVA_HOME env variable?"); 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Tokenizers/FixedLengthTokenizerTest.php: -------------------------------------------------------------------------------- 1 | tokenize("Gabby Abby"); 19 | $this->assertCount(1, $tokens); 20 | $this->assertEquals("bby ", end($tokens)); 21 | 22 | } 23 | 24 | public function testFixedLengthNoLengthGiven() 25 | { 26 | $tokenizer = new FixedLengthTokenizer(0); 27 | $tokens = $tokenizer->tokenize("Gabby Abby"); 28 | $this->assertCount(1, $tokens); 29 | $this->assertEquals("Gabby Abby", end($tokens)); 30 | } 31 | 32 | } 33 | 34 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Tokenizers/GeneralTokenizerTest.php: -------------------------------------------------------------------------------- 1 | assertCount(4, $tokenizer->tokenize("This has some words")); 18 | } 19 | 20 | public function testLineTokenizer(){ 21 | 22 | $tokenizer = new GeneralTokenizer(PHP_EOL); 23 | $this->assertCount(4, $tokenizer->tokenize("This ".PHP_EOL." has".PHP_EOL." some".PHP_EOL." 
words")); 24 | } 25 | } 26 | 27 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Tokenizers/PennTreeBankTokenizerTest.php: -------------------------------------------------------------------------------- 1 | tokenize("Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."); 19 | $this->assertCount(16, $tokens); 20 | } 21 | 22 | public function testTokenizer2() 23 | { 24 | $tokenizer = new PennTreeBankTokenizer(); 25 | $this->assertCount(7, $tokenizer->tokenize("They'll save and invest more.")); 26 | } 27 | 28 | public function testTokenizer3() 29 | { 30 | $tokenizer = new PennTreeBankTokenizer(); 31 | $this->assertCount(4, $tokenizer->tokenize("I'm some text")); 32 | } 33 | 34 | public function testAgainstOriginalSedImplementation() 35 | { 36 | $tokenizer = new PennTreeBankTokenizer(); 37 | $tokenized = new \SplFileObject(TEST_DATA_DIR."/Tokenizers/PennTreeBankTokenizerTest/tokenized"); 38 | $tokenized->setFlags(\SplFileObject::DROP_NEW_LINE); 39 | $sentences = new \SplFileObject(TEST_DATA_DIR."/Tokenizers/PennTreeBankTokenizerTest/test.txt"); 40 | $sentences->setFlags(\SplFileObject::DROP_NEW_LINE); 41 | 42 | $tokenized->rewind(); 43 | foreach ($sentences as $sentence) { 44 | if ($sentence) // skip empty lines 45 | { 46 | $this->assertEquals( 47 | $tokenized->current(), 48 | implode(" ",$tokenizer->tokenize($sentence)), 49 | "Sentence: '$sentence' was not tokenized correctly" 50 | ); 51 | } 52 | $tokenized->next(); 53 | } 54 | 55 | } 56 | 57 | } 58 | 59 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Tokenizers/RegexTokenizerTest.php: -------------------------------------------------------------------------------- 1 | tokenize("Good muffins cost $3.88\nin New York. 
Please buy me\ntwo of them.\nThanks."); 18 | $this->assertCount(17, $tokens); 19 | } 20 | 21 | public function testMatchWordsOnly() 22 | { 23 | $tokenizer = new RegexTokenizer("/[A-Za-z]+/"); 24 | $tokens = $tokenizer->tokenize("Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."); 25 | $this->assertCount(13, $tokens); 26 | } 27 | 28 | } -------------------------------------------------------------------------------- /tests/TextAnalysis/Tokenizers/SentenceTokenizerTest.php: -------------------------------------------------------------------------------- 1 | assertCount(2, $tokenizer->tokenize("This has some words. Why only 4 words?")); 17 | $this->assertCount(2, $tokenizer->tokenize("My name is Yooper. I like programming!")); 18 | $this->assertCount(2, $tokenizer->tokenize("My name is Yooper!? I like programming!! !!")); 19 | $this->assertCount(3, $tokenizer->tokenize($this->getArticle())); 20 | $this->assertCount(1, $tokenizer->tokenize("The U.S.A. recently dropped out of the T.P.P.")); 21 | } 22 | 23 | private function getArticle() 24 | { 25 | return <<tokenize('This is a common Tweet #format where @mentions and.errors!!!!like this:-))))) might #appear❤ ❤☺❤#ThisIsAHashtag!?!'); 17 | $this->assertCount(33, $tokens); 18 | 19 | } 20 | 21 | public function testForUrlAndEmail() 22 | { 23 | $tokens = (new TwitterTokenizer)->tokenize('Custom Software Development http://redbeardtechnologies.com/ 906-555-5555 or contact support at support@redbeardtechnologies.com :-)'); 24 | $this->assertCount(11, $tokens); 25 | } 26 | 27 | public function testContraction() 28 | { 29 | $tokens = (new TwitterTokenizer)->tokenize("This shouldn't be broken up"); 30 | $this->assertCount(5, $tokens); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Utilities/TextTest.php: -------------------------------------------------------------------------------- 1 | assertCount(6, $substrings); 22 | 
$this->assertEquals($expected, $substrings); 23 | } 24 | 25 | public function testEndsWith() 26 | { 27 | $this->assertTrue(Text::endsWith('lunches', 's')); 28 | $this->assertTrue(Text::endsWith('lunches', 'es')); 29 | $this->assertTrue(Text::endsWith('lunches', 'hes')); 30 | $this->assertFalse(Text::endsWith('joe', 'is')); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /tests/TextAnalysis/Utilities/Vowels/EnglishVowelsTest.php: -------------------------------------------------------------------------------- 1 | assertTrue($vowelChecker->isVowel("man", 1)); 16 | } 17 | 18 | public function testYIsVowel() 19 | { 20 | $vowelChecker = VowelsAbstractFactory::factory("English"); 21 | $this->assertTrue($vowelChecker->isVowel("try", 2)); 22 | } 23 | } 24 | 25 | 26 | -------------------------------------------------------------------------------- /tests/bootstrap.php: -------------------------------------------------------------------------------- 1 | add(new NltkPackageListCommand()); 42 | $app->add(new NltkPackageInstallCommand()); 43 | $app->add(new StopWordsCommand()); 44 | $app->add(new VocabSizeCommand()); 45 | $app->add(new NltkPackageInstallAllCommand()); 46 | 47 | $app->run(); 48 | --------------------------------------------------------------------------------