├── .gitignore
├── .travis.yml
├── CODE_OF_CONDUCT.md
├── Dockerfile74
├── Dockerfile80
├── Dockerfile81
├── Dockerfile82
├── Dockerfile83
├── Dockerfile84
├── LICENSE
├── README.md
├── composer.json
├── interactive
├── phpunit.xml
├── src
├── Adapters
│ ├── ArrayDataReaderAdapter.php
│ ├── EnchantAdapter.php
│ ├── JsonDataAdapter.php
│ ├── JsonWriterAdapter.php
│ └── PspellAdapter.php
├── Analysis
│ ├── DateAnalysis.php
│ ├── FreqDist.php
│ ├── Keywords
│ │ └── Rake.php
│ └── Summarize
│ │ └── Simple.php
├── Classifiers
│ └── NaiveBayes.php
├── Collections
│ └── DocumentArrayCollection.php
├── Collocations
│ └── CollocationFinder.php
├── Comparisons
│ ├── CosineSimilarityComparison.php
│ ├── HammingDistanceComparison.php
│ ├── JaccardIndexComparison.php
│ ├── JaroWinklerComparison.php
│ ├── LevenshteinComparison.php
│ ├── LongestCommonSubstringComparison.php
│ ├── MostFreqCharComparison.php
│ └── SimilarTextComparison.php
├── Console
│ └── Commands
│ │ ├── NltkPackageInstallAllCommand.php
│ │ ├── NltkPackageInstallCommand.php
│ │ ├── NltkPackageListCommand.php
│ │ ├── StopWordsCommand.php
│ │ └── VocabSizeCommand.php
├── Corpus
│ ├── ImportCorpus.php
│ ├── NameCorpus.php
│ ├── ReadCorpusAbstract.php
│ ├── TextCorpus.php
│ └── WordnetCorpus.php
├── Documents
│ ├── ContentDocument.php
│ ├── DocumentAbstract.php
│ └── TokensDocument.php
├── Downloaders
│ ├── DownloadPackageFactory.php
│ └── NltkCorporaIndexDownloader.php
├── Exceptions
│ ├── InvalidExpression.php
│ └── InvalidParameterSizeException.php
├── Extracts
│ ├── DateExtract.php
│ ├── EmailExtract.php
│ ├── HashTag.php
│ ├── LambdaExtract.php
│ └── UrlExtract.php
├── Filters
│ ├── CharFilter.php
│ ├── DomainFilter.php
│ ├── EmailFilter.php
│ ├── LambdaFilter.php
│ ├── LowerCaseFilter.php
│ ├── NumbersFilter.php
│ ├── PossessiveNounFilter.php
│ ├── PunctuationFilter.php
│ ├── QuotesFilter.php
│ ├── SpacePunctuationFilter.php
│ ├── StopWordsFilter.php
│ ├── StripTagsFilter.php
│ ├── TrimFilter.php
│ ├── UpperCaseFilter.php
│ ├── UrlFilter.php
│ └── WhitespaceFilter.php
├── Generators
│ └── StopwordGenerator.php
├── Indexes
│ ├── TfIdf.php
│ └── WordnetIndex.php
├── Interfaces
│ ├── IClassifier.php
│ ├── ICollection.php
│ ├── IDataReader.php
│ ├── IDataWriter.php
│ ├── IDistance.php
│ ├── IExtractStrategy.php
│ ├── ILexicalDiversity.php
│ ├── ISimilarity.php
│ ├── ISpelling.php
│ ├── IStemmer.php
│ └── ITokenTransformation.php
├── LexicalDiversity
│ ├── Naive.php
│ ├── YuleI.php
│ └── YuleK.php
├── Models
│ ├── ScoreKeeper.php
│ └── Wordnet
│ │ ├── ExceptionMap.php
│ │ ├── Lemma.php
│ │ └── Synset.php
├── NGrams
│ ├── NGramFactory.php
│ ├── Statistic2D.php
│ ├── Statistic3D.php
│ └── StatisticFacade.php
├── Phonetics
│ ├── MetaphonePhonetic.php
│ └── SoundexPhonetic.php
├── Sentiment
│ └── Vader.php
├── Stemmers
│ ├── DictionaryStemmer.php
│ ├── LambdaStemmer.php
│ ├── LancasterStemmer.php
│ ├── LookupStemmer.php
│ ├── MorphStemmer.php
│ ├── PorterStemmer.php
│ ├── RegexStemmer.php
│ └── SnowballStemmer.php
├── Taggers
│ ├── StanfordAbstract.php
│ ├── StanfordNerTagger.php
│ └── StanfordPosTagger.php
├── Tokenizers
│ ├── FixedLengthTokenizer.php
│ ├── GeneralTokenizer.php
│ ├── LambdaTokenizer.php
│ ├── PennTreeBankTokenizer.php
│ ├── RegexTokenizer.php
│ ├── SentenceTokenizer.php
│ ├── TokenizerAbstract.php
│ ├── TwitterTokenizer.php
│ ├── VanderleeTokenizer.php
│ └── WhitespaceTokenizer.php
├── Traits
│ └── WordnetPointerSymbolMap.php
├── Utilities
│ ├── Nltk
│ │ └── Download
│ │ │ └── Package.php
│ ├── Text.php
│ └── Vowels
│ │ ├── EnglishVowels.php
│ │ └── VowelsAbstractFactory.php
└── helpers
│ ├── helpers.php
│ ├── interactive_help.php
│ ├── print.php
│ ├── simplified.php
│ └── storage.php
├── storage
├── .gitkeep
├── cache
│ └── .gitkeep
└── corpora
│ └── .gitkeep
├── tests
├── TestBaseCase.php
├── TextAnalysis
│ ├── Adapters
│ │ └── PspellAdapterTest.php
│ ├── Analysis
│ │ ├── DateAnalysisTest.php
│ │ ├── FreqDistTest.php
│ │ └── Keywords
│ │ │ └── RakeTest.php
│ ├── Classifiers
│ │ └── NaiveBayesTest.php
│ ├── Collections
│ │ └── DocumentArrayCollectionTest.php
│ ├── Collocations
│ │ └── CollocationFinderTest.php
│ ├── Comparisons
│ │ ├── CosineSimilarityComparisonTest.php
│ │ ├── HammingDistanceComparisonTest.php
│ │ ├── JaccardIndexComparisonTest.php
│ │ ├── JaroWinklerComparisonTest.php
│ │ ├── LevenshteinComparisonTest.php
│ │ ├── LongestCommonSubstringComparisonTest.php
│ │ └── MostFreqCharComparisonTest.php
│ ├── Corpus
│ │ ├── ImportCorpusTest.php
│ │ ├── NameCorpusTest.php
│ │ ├── TextCorpusTest.php
│ │ └── WordnetCorpusTest.php
│ ├── Downloaders
│ │ └── NltkCorporalIndexDownloaderTest.php
│ ├── Extracts
│ │ ├── DateExtractTest.php
│ │ └── HashTagTest.php
│ ├── Filters
│ │ ├── CharFilterTest.php
│ │ ├── EmailFilterTest.php
│ │ ├── LambdaFilterTest.php
│ │ ├── LowerCaseFilterTest.php
│ │ ├── NumbersFilterTest.php
│ │ ├── PossessiveNounFilterTest.php
│ │ ├── PunctuationFilterTest.php
│ │ ├── QuotesFilterTest.php
│ │ ├── SpacePunctuationFilterTest.php
│ │ ├── StopWordsFilterTest.php
│ │ └── UrlFilterTest.php
│ ├── Indexes
│ │ ├── TfIdfTest.php
│ │ └── WordnetIndexTest.php
│ ├── LexicalDiversity
│ │ ├── NaiveTest.php
│ │ ├── YuleITest.php
│ │ └── YuleKTest.php
│ ├── NGrams
│ │ ├── NGramFactoryTest.php
│ │ └── StatisticFacadeTest.php
│ ├── Sentiment
│ │ └── VaderTest.php
│ ├── Stemmers
│ │ ├── DictionaryStemmerTest.php
│ │ ├── LambdaStemmerTest.php
│ │ ├── LancasterStemmerTest.php
│ │ ├── LookupStemmerTest.php
│ │ ├── MorphStemmerTest.php
│ │ ├── PorterStemmerTest.php
│ │ ├── RegexStemmerTest.php
│ │ └── SnowballStemmerTest.php
│ ├── Taggers
│ │ ├── StanfordNerTaggerTest.php
│ │ └── StanfordPosTaggerTest.php
│ ├── Tokenizers
│ │ ├── FixedLengthTokenizerTest.php
│ │ ├── GeneralTokenizerTest.php
│ │ ├── PennTreeBankTokenizerTest.php
│ │ ├── RegexTokenizerTest.php
│ │ ├── SentenceTokenizerTest.php
│ │ └── TwitterTokenizerTest.php
│ └── Utilities
│ │ ├── TextTest.php
│ │ └── Vowels
│ │ └── EnglishVowelsTest.php
├── bootstrap.php
└── data
│ ├── Text
│ └── Analysis
│ │ ├── text.txt
│ │ └── text_ngrams.txt
│ ├── Tokenizers
│ └── PennTreeBankTokenizerTest
│ │ ├── test.txt
│ │ └── tokenized
│ └── books
│ ├── ptbr
│ └── Dom_Casmurro.txt
│ └── tom_sawyer.txt
└── textconsole
/.gitignore:
--------------------------------------------------------------------------------
1 | nbproject/
2 | vendor/
3 | storage/
4 | composer.lock
5 | .idea
6 | .phpunit.result.cache
7 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
# Travis CI build: install dependencies with Composer, then run PHPUnit on PHP 7.4.
language: php
env:
  - SKIP_TEST=1
php:
  - "7.4"

before_script:
  - composer self-update
  # require-dev packages are installed by default; the old --dev flag is deprecated.
  - composer install --prefer-source --no-interaction

# Run the PHPUnit installed by Composer instead of whichever phpunit is on PATH.
script: vendor/bin/phpunit
12 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at dcardin2007@gmail.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/Dockerfile74:
--------------------------------------------------------------------------------
# PHP 7.4 CLI image: installs the project with Composer and runs PHPUnit at build time.
FROM php:7.4-cli

# Build the zip extension; the apt package lists are deleted in the same layer
# so they are not baked into the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends zip libzip-dev && \
    docker-php-ext-install zip && \
    rm -rf /var/lib/apt/lists/*

# Install Composer as /usr/local/bin/composer.
RUN curl --silent --show-error https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer

RUN mkdir -p /app

COPY ./ /app

RUN composer --working-dir=/app install

# The suite runs during the build; a non-zero exit aborts the build.
RUN cd /app && SKIP_TEST=1 ./vendor/bin/phpunit -d memory_limit=1G

CMD ["/bin/sh"]
18 |
--------------------------------------------------------------------------------
/Dockerfile80:
--------------------------------------------------------------------------------
# PHP 8.0 CLI image: installs the project with Composer and runs PHPUnit at build time.
FROM php:8.0-cli

# Build the zip and pspell extensions; the apt package lists are deleted in
# the same layer so they are not baked into the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends zip libzip-dev libpspell-dev && \
    docker-php-ext-install zip pspell && \
    rm -rf /var/lib/apt/lists/*

# Install Composer as /usr/local/bin/composer.
RUN curl --silent --show-error https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer

RUN mkdir -p /app

COPY ./ /app

RUN composer --working-dir=/app install

# The suite runs during the build; a non-zero exit aborts the build.
RUN cd /app && SKIP_TEST=1 ./vendor/bin/phpunit -d memory_limit=1G

CMD ["/bin/sh"]
18 |
--------------------------------------------------------------------------------
/Dockerfile81:
--------------------------------------------------------------------------------
# PHP 8.1 CLI image: installs the project with Composer and runs PHPUnit at build time.
FROM php:8.1-cli

# Build the zip and pspell extensions; the apt package lists are deleted in
# the same layer so they are not baked into the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends zip libzip-dev libpspell-dev && \
    docker-php-ext-install zip pspell && \
    rm -rf /var/lib/apt/lists/*

# Install Composer as /usr/local/bin/composer.
RUN curl --silent --show-error https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer

RUN mkdir -p /app

COPY ./ /app

RUN composer --working-dir=/app install

# The suite runs during the build; a non-zero exit aborts the build.
RUN cd /app && SKIP_TEST=1 ./vendor/bin/phpunit -d memory_limit=1G

CMD ["/bin/sh"]
18 |
--------------------------------------------------------------------------------
/Dockerfile82:
--------------------------------------------------------------------------------
# PHP 8.2 CLI image: installs the project with Composer and runs PHPUnit at build time.
FROM php:8.2-cli

# Build the zip and pspell extensions; the apt package lists are deleted in
# the same layer so they are not baked into the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends zip libzip-dev libpspell-dev && \
    docker-php-ext-install zip pspell && \
    rm -rf /var/lib/apt/lists/*

# Install Composer as /usr/local/bin/composer.
RUN curl --silent --show-error https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer

RUN mkdir -p /app

COPY ./ /app

RUN composer --working-dir=/app install

# The suite runs during the build; a non-zero exit aborts the build.
RUN cd /app && SKIP_TEST=1 ./vendor/bin/phpunit -d memory_limit=1G

CMD ["/bin/sh"]
18 |
--------------------------------------------------------------------------------
/Dockerfile83:
--------------------------------------------------------------------------------
# PHP 8.3 CLI image: installs the project with Composer and runs PHPUnit at build time.
FROM php:8.3-cli

# Build the zip and pspell extensions; the apt package lists are deleted in
# the same layer so they are not baked into the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends zip libzip-dev libpspell-dev && \
    docker-php-ext-install zip pspell && \
    rm -rf /var/lib/apt/lists/*

# Install Composer as /usr/local/bin/composer.
RUN curl --silent --show-error https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer

RUN mkdir -p /app

COPY ./ /app

RUN composer --working-dir=/app install

# The suite runs during the build; a non-zero exit aborts the build.
RUN cd /app && SKIP_TEST=1 ./vendor/bin/phpunit -d memory_limit=1G

CMD ["/bin/sh"]
18 |
--------------------------------------------------------------------------------
/Dockerfile84:
--------------------------------------------------------------------------------
# PHP 8.4 CLI image: installs the project with Composer and runs PHPUnit at build time.
FROM php:8.4-cli

# Build the zip extension; the apt package lists are deleted in the same layer
# so they are not baked into the image.
# NOTE(review): libpspell-dev is installed but the pspell extension is not
# built in this image - confirm that this is intentional.
RUN apt-get update && \
    apt-get install -y --no-install-recommends zip libzip-dev libpspell-dev && \
    docker-php-ext-install zip && \
    rm -rf /var/lib/apt/lists/*

# Install Composer as /usr/local/bin/composer.
RUN curl --silent --show-error https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer

RUN mkdir -p /app

COPY ./ /app

RUN composer --working-dir=/app install

# The suite runs during the build; a non-zero exit aborts the build.
RUN cd /app && SKIP_TEST=1 ./vendor/bin/phpunit -d memory_limit=1G

CMD ["/bin/sh"]
18 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Dan Cardin
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "yooper/php-text-analysis",
3 | "description": "PHP Text Analysis is a library for performing Information Retrieval (IR) and Natural Language Processing (NLP) tasks using the PHP language",
4 | "keywords": ["nlp","ir","text analysis","natural language processing", "text classification"],
5 | "license": "MIT",
6 | "authors": [
7 | {
8 | "name": "yooper",
9 | "email" : "dcardin2007@gmail.com"
10 | }
11 | ],
12 | "bin": ["textconsole","interactive"],
13 | "autoload": {
14 | "psr-4": {
15 | "TextAnalysis\\": "src/"
16 | },
17 | "files": ["src/helpers/storage.php", "src/helpers/print.php", "src/helpers/simplified.php", "src/helpers/helpers.php", "src/helpers/interactive_help.php"]
18 | },
19 | "autoload-dev": {
20 | "files": ["tests/TestBaseCase.php"]
21 | },
22 | "require" : {
23 | "php": ">=7.4",
24 | "yooper/stop-words": "~1",
25 | "symfony/console": ">= 4.4",
26 | "wamania/php-stemmer": "^1.0 || ^2.0 || ^3.0",
27 | "yooper/nicknames": "~1"
28 | },
29 | "require-dev": {
30 | "phpunit/phpunit": "^9",
31 | "mockery/mockery" : "^1"
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/interactive:
--------------------------------------------------------------------------------
#!/bin/bash
# Print a short banner, then start PHP's interactive shell with
# ./vendor/autoload.php prepended so the library is loaded for ad-hoc analysis.
printf '%s\n' "Welcome to the PHP Text Analysis Interactive Console"
printf '%s\n' "Type help(); to get a list of available functions"
php -a -d auto_prepend_file=./vendor/autoload.php
6 |
7 |
--------------------------------------------------------------------------------
/phpunit.xml:
--------------------------------------------------------------------------------
1 |
2 |
12 |
13 |
14 | ./tests/
15 |
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/src/Adapters/ArrayDataReaderAdapter.php:
--------------------------------------------------------------------------------
1 | data = $data;
20 | }
21 |
/**
 * Hand back the data this adapter holds, exactly as stored.
 *
 * @return array
 */
public function read()
{
    return $this->data;
}
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/src/Adapters/EnchantAdapter.php:
--------------------------------------------------------------------------------
1 | enchantBroker = enchant_broker_request_dict($r, $language);
20 | }
21 |
/**
 * Look a word up in the Enchant dictionary held by this adapter.
 *
 * @param string $word
 * @return array the word itself when enchant_dict_check() accepts it,
 *               otherwise whatever enchant_dict_suggest() returns for it
 */
public function suggest($word)
{
    $dictionary = $this->enchantBroker;

    if (enchant_dict_check($dictionary, $word)) {
        return [$word];
    }

    return enchant_dict_suggest($dictionary, $word);
}
35 |
/**
 * Drop the Enchant handle when the adapter is destroyed.
 */
public function __destruct()
{
    unset($this->enchantBroker);
}
40 | }
--------------------------------------------------------------------------------
/src/Adapters/JsonDataAdapter.php:
--------------------------------------------------------------------------------
1 | jsonStr = $jsonStr;
31 | $this->assoc = $assoc;
32 | }
33 |
/**
 * Decode the stored JSON string with json_decode().
 *
 * The stored $assoc flag is passed through as json_decode()'s second
 * argument, so it decides whether JSON objects come back as arrays.
 *
 * @return mixed the decoded value; json_decode() yields null when the
 *               string cannot be decoded
 */
public function read()
{
    $decoded = json_decode($this->jsonStr, $this->assoc);

    return $decoded;
}
42 | }
43 |
--------------------------------------------------------------------------------
/src/Adapters/JsonWriterAdapter.php:
--------------------------------------------------------------------------------
1 | data = $data;
40 | $this->options = $options;
41 | $this->depth = $depth;
42 | }
43 |
/**
 * Serialize the stored data with json_encode(), using the options and
 * depth this adapter was configured with.
 *
 * @return string|false the JSON text; json_encode() yields false on failure
 */
public function write()
{
    $encoded = json_encode($this->data, $this->options, $this->depth);

    return $encoded;
}
52 | }
53 |
54 |
--------------------------------------------------------------------------------
/src/Adapters/PspellAdapter.php:
--------------------------------------------------------------------------------
1 | pSpell = pspell_new($language, $spelling, $jargon, $encoding, $mode);
19 | }
20 |
/**
 * Look a word up in the pspell dictionary held by this adapter.
 *
 * @param string $word
 * @return array the word itself when pspell_check() accepts it,
 *               otherwise whatever pspell_suggest() returns for it
 */
public function suggest($word)
{
    $dictionary = $this->pSpell;

    if (pspell_check($dictionary, $word)) {
        return [$word];
    }

    return pspell_suggest($dictionary, $word);
}
35 |
/**
 * Drop the pspell handle when the adapter is destroyed.
 */
public function __destruct()
{
    unset($this->pSpell);
}
40 | }
41 |
--------------------------------------------------------------------------------
/src/Analysis/DateAnalysis.php:
--------------------------------------------------------------------------------
1 | sentences = $tokenizer->tokenize( $this->normalize($text)) ;
36 | }
37 |
/**
 * Expand abbreviated month names that end in a period, e.g. "Mar." becomes "march".
 *
 * Matching is case-insensitive and the replacement is always lower case.
 * An abbreviation is expanded only when it is not preceded by another
 * letter, so the tail of a longer word ("Omar.", "codec.") is left alone.
 *
 * @param string $text
 * @return string
 */
protected function normalize(string $text) : string
{
    $months = [
        'jan' => 'january',
        'feb' => 'february',
        'mar' => 'march',
        'apr' => 'april',
        'may' => 'may',
        'jun' => 'june',
        'jul' => 'july',
        'aug' => 'august',
        'sep' => 'september',
        'oct' => 'october',
        'nov' => 'november',
        'dec' => 'december',
    ];

    $patterns = [];
    foreach (array_keys($months) as $abbreviation) {
        // Negative look-behind: skip matches that merely end a longer word.
        $patterns[] = '/(?<![a-z])' . $abbreviation . '\./i';
    }

    // preg_replace() returns null only on a PCRE failure; fall back to the input.
    return preg_replace($patterns, array_values($months), $text) ?? $text;
}
62 |
/**
 * Build one DateTime per stored sentence from the year/month/day entries
 * of Text::findDate()'s result.
 *
 * The list is stored on $this->dates; a stored list is reused only while
 * it is non-empty, otherwise it is rebuilt on every call.
 *
 * NOTE(review): the mapper always returns a DateTime, so array_filter()
 * has nothing to drop unless Text::findDate() behaves differently than it
 * appears from here - confirm how sentences without a date are handled.
 *
 * @return DateTime[]
 */
public function getDates() : array
{
    if (!empty($this->dates)) {
        return $this->dates;
    }

    $toDateTime = function($sentence)
    {
        $parts = Text::findDate($sentence);
        return new DateTime("{$parts['year']}-{$parts['month']}-{$parts['day']}");
    };

    $perSentence = array_map($toDateTime, $this->sentences);

    // Drop falsy entries, then re-index from zero.
    $this->dates = array_values(array_filter($perSentence));

    return $this->dates;
}
83 | }
84 |
--------------------------------------------------------------------------------
/src/Analysis/Summarize/Simple.php:
--------------------------------------------------------------------------------
1 | $freq)
31 | {
32 | foreach($scoreKeepers as $sentenceKeeper)
33 | {
34 | if(strpos($sentenceKeeper->getToken(), (string)$token) !== false) {
35 |
36 | $sentenceKeeper->addToScore($freq);
37 | }
38 | }
39 | }
40 |
41 | usort($scoreKeepers, 'score_keeper_sort');
42 | return $scoreKeepers;
43 | }
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/src/Classifiers/NaiveBayes.php:
--------------------------------------------------------------------------------
1 | labels[$label])) {
35 | $this->labels[$label] = [];
36 | $this->labelCount[$label] = 0;
37 | }
38 |
39 | $this->labelCount[$label]++;
40 | foreach($freqDist as $token => $count)
41 | {
42 | isset($this->tokenCount[$token]) ? $this->tokenCount[$token] += $count : $this->tokenCount[$token] = $count;
43 | isset($this->labels[$label][$token]) ? $this->labels[$label][$token] += $count : $this->labels[$label][$token] = $count;
44 | }
45 | }
46 |
/**
 * Score every trained label against the given tokens.
 *
 * For each label, every token contributes log(1 - p) - log(p) to a running
 * sum, where p is the smoothed probability that the token belongs to the
 * label; the label's score is 1 / (1 + exp(sum)).
 *
 * @param array $tokens tokens of the document to classify
 * @return array label => score, ordered from highest to lowest score;
 *               empty when nothing has been trained
 */
public function predict(array $tokens)
{
    $totalDocs = $this->getDocCount();
    $scores = [];

    foreach ($this->labelCount as $label => $docCount)
    {
        $sum = 0;
        $inversedDocCount = $totalDocs - $docCount;
        $docCountReciprocal = 1 / $docCount;
        // With a single trained label there are no documents outside it; a
        // zero reciprocal removes the negative evidence instead of dividing by zero.
        $inversedDocCountReciprocal = $inversedDocCount > 0 ? 1 / $inversedDocCount : 0;

        foreach ($tokens as $token)
        {
            $totalTokenCount = $this->tokenCount[$token] ?? 1; // prevent division by zero
            $tokenCount = $this->labels[$label][$token] ?? 0;
            $inversedTokenCount = $totalTokenCount - $tokenCount;
            $tokenProbabilityPositive = $tokenCount * $docCountReciprocal;
            $tokenProbabilityNegative = $inversedTokenCount * $inversedDocCountReciprocal;
            $evidence = $tokenProbabilityPositive + $tokenProbabilityNegative;
            // No evidence for or against the label: treat the token as neutral.
            $probability = $evidence > 0 ? $tokenProbabilityPositive / $evidence : 0.5;
            // Smooth towards 0.5 so the logarithms below never see 0 or 1.
            $probability = (0.5 + ($totalTokenCount * $probability)) / (1 + $totalTokenCount);
            $sum += log(1 - $probability) - log($probability);
        }
        $scores[$label] = 1 / (1 + exp($sum));
    }
    arsort($scores, SORT_NUMERIC);
    return $scores;
}
75 |
/**
 * Total of the per-label counters, i.e. how many times train() has
 * incremented a label count.
 *
 * @return int
 */
public function getDocCount() : int
{
    // array_sum() ignores keys and returns 0 for an empty array.
    return array_sum($this->labelCount);
}
80 |
/**
 * Drop the trained counters when the classifier is destroyed.
 */
public function __destruct()
{
    unset($this->labelCount, $this->labels, $this->tokenCount);
}
87 |
88 |
89 | }
90 |
--------------------------------------------------------------------------------
/src/Collocations/CollocationFinder.php:
--------------------------------------------------------------------------------
1 | tokens = $tokens;
27 | $this->nGramSize = $nGramSize;
28 | }
29 |
/**
 * Naive collocations: build n-grams of the configured size from the tokens
 * and return their frequency distribution via getKeyValuesByFrequency().
 *
 * @return array
 */
public function getCollocations()
{
    return freq_dist(ngrams($this->tokens, $this->nGramSize))->getKeyValuesByFrequency();
}
39 |
/**
 * Compute the Pointwise Mutual Information of each n-gram.
 *
 * For every n-gram the score is log(w(ngram) / product of w(token)), where
 * w() is the value reported by getKeyValuesByWeight() and the tokens are
 * obtained by splitting the n-gram on single spaces.
 *
 * @return array n-gram => score, ordered from highest to lowest score
 */
public function getCollocationsByPmi()
{
    $nGramDist = freq_dist(ngrams($this->tokens, $this->nGramSize));
    $unigramDist = freq_dist($this->tokens);

    $scores = [];
    foreach ($nGramDist->getKeys() as $nGram)
    {
        $denominator = 1;
        foreach (explode(" ", $nGram) as $word)
        {
            $unigramWeights = $unigramDist->getKeyValuesByWeight();
            $denominator *= $unigramWeights[$word];
        }

        $nGramWeights = $nGramDist->getKeyValuesByWeight();
        $scores[$nGram] = log($nGramWeights[$nGram] / $denominator);
    }

    arsort($scores);
    return $scores;
}
65 | }
66 |
--------------------------------------------------------------------------------
/src/Comparisons/CosineSimilarityComparison.php:
--------------------------------------------------------------------------------
1 | $freq)
35 | {
36 | if (isset($text1Freq[$term]) && isset($text2Freq[$term])) {
37 | $product += $text1Freq[$term] * $text2Freq[$term];
38 | }
39 | }
40 |
41 | $productFunc = function($carry, $freq)
42 | {
43 | $carry += pow($freq, 2);
44 | return $carry;
45 | };
46 |
47 | $text1VectorSum = sqrt(array_reduce(array_values($text1Freq), $productFunc, 0));
48 | $text2VectorSum = sqrt(array_reduce(array_values($text2Freq), $productFunc, 0));
49 | return $product / ($text1VectorSum * $text2VectorSum);
50 |
51 | }
52 |
/**
 * Cosine distance: the complement of similarity() for the same inputs.
 *
 * @param array $text1
 * @param array $text2
 * @return float
 */
public function distance($text1, $text2)
{
    $similarity = $this->similarity($text1, $text2);

    return 1 - $similarity;
}
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/src/Comparisons/HammingDistanceComparison.php:
--------------------------------------------------------------------------------
1 |
12 | */
class HammingDistanceComparison implements IDistance
{
    /**
     * Count the byte positions at which the two strings differ.
     *
     * Hamming distance is defined for equal-length strings. When the lengths
     * differ, positions that exist in only one string are counted as
     * mismatches, so the result does not depend on argument order and no
     * out-of-range string offset is read.
     *
     * @param string $text1
     * @param string $text2
     * @return int
     */
    public function distance($text1, $text2)
    {
        $length1 = strlen($text1);
        $length2 = strlen($text2);
        $shared = min($length1, $length2);

        // Every position beyond the shorter string is a mismatch.
        $distance = abs($length1 - $length2);
        for ($index = 0; $index < $shared; $index++)
        {
            if ($text1[$index] !== $text2[$index]) {
                $distance++;
            }
        }
        return $distance;
    }
}
34 |
--------------------------------------------------------------------------------
/src/Comparisons/JaccardIndexComparison.php:
--------------------------------------------------------------------------------
1 |
11 | */
class JaccardIndexComparison implements ISimilarity
{
    /**
     * Returns the Jaccard Index: |A intersect B| / |A union B|, where A and B
     * are the sets of distinct elements of the two inputs.
     *
     * A string input is split into single bytes with str_split(); an array
     * input is used as a list of elements. Duplicates are ignored, so the
     * result always lies between 0 and 1.
     *
     * @param string|array $text1
     * @param string|array $text2
     * @return float 1.0 when both inputs are empty
     */
    public function similarity($text1, $text2)
    {
        $set1 = $this->toSet($text1);
        $set2 = $this->toSet($text2);

        // array_merge() appends; the "+" operator would merge by key and
        // silently drop elements that share an index.
        $unionSize = count(array_unique(array_merge($set1, $set2)));
        if ($unionSize === 0) {
            // Two empty sets are identical; also avoids dividing by zero.
            return 1.0;
        }

        return count(array_intersect($set1, $set2)) / $unionSize;
    }

    /**
     * Reduce an input to a list of its distinct elements.
     *
     * @param string|array $text
     * @return array
     */
    private function toSet($text) : array
    {
        if (is_string($text)) {
            // str_split('') yields [''] before PHP 8.2; treat it as empty.
            $text = $text === '' ? [] : str_split($text);
        }
        return array_values(array_unique($text));
    }
}
31 |
--------------------------------------------------------------------------------
/src/Comparisons/JaroWinklerComparison.php:
--------------------------------------------------------------------------------
1 |
11 | */
class JaroWinklerComparison implements ISimilarity
{
    /**
     * Upper bound on how many leading characters count towards the prefix
     * bonus (the property name is historical)
     * @var int
     */
    protected $minPrefixLength;

    /**
     * @param int $minPrefixLength cap on the common-prefix length that is rewarded
     */
    public function __construct($minPrefixLength = 4)
    {
        $this->minPrefixLength = $minPrefixLength;
    }

    /**
     * Return the similarity using the JaroWinkler algorithm.
     *
     * Strings are compared byte by byte. Identical strings score 1.0 and
     * strings without any common character inside the search window score 0.0.
     *
     * @param string $text1
     * @param string $text2
     * @return float
     */
    public function similarity($text1, $text2)
    {
        if ($text1 === $text2) {
            return 1.0;
        }

        // ensure that $text1 is shorter than or the same length as $text2
        if (strlen($text1) > strlen($text2)) {
            [$text1, $text2] = [$text2, $text1];
        }

        $strLen1 = strlen($text1);
        $strLen2 = strlen($text2);

        // Integer half of the longer length. "(int)$strLen2 / 2" cast before
        // dividing and produced a float window and float string offsets.
        $maxDistance = intdiv($strLen2, 2);
        $commonCounter = 0; // count of common characters
        $transpositionCounter = 0; // count of transpositions
        $prevPosition = -1;
        for ($index = 0; $index < $strLen1; $index++)
        {
            $char = $text1[$index];
            // scan the window of $text2 around the current position
            $jindex = max(0, $index - $maxDistance);
            $windowEnd = min($strLen2, $index + $maxDistance);
            while ($jindex < $windowEnd)
            {
                if ($char === $text2[$jindex]) {
                    $commonCounter++; // common char found
                    // a match located before the previous match is a transposition
                    if ($prevPosition !== -1 && $jindex < $prevPosition) {
                        $transpositionCounter++;
                    }
                    $prevPosition = $jindex;
                    break;
                }

                $jindex++;
            }
        }
        // no common characters between strings
        if ($commonCounter === 0) {
            return 0.0;
        }

        // Jaro score
        $score = (
            ($commonCounter / $strLen1) +
            ($commonCounter / $strLen2) +
            (($commonCounter - $transpositionCounter) / $commonCounter)) / 3.0;

        // length of the common prefix, capped by the configured limit
        $prefixLength = 0;
        $last = min($this->minPrefixLength, $strLen1);
        while ($prefixLength < $last && $text1[$prefixLength] == $text2[$prefixLength])
        {
            $prefixLength++;
        }

        // Winkler bonus: 0.1 per matching prefix character
        return $score + (($prefixLength * (1 - $score)) / 10);
    }

}
93 |
--------------------------------------------------------------------------------
/src/Comparisons/LevenshteinComparison.php:
--------------------------------------------------------------------------------
1 | insertCost = $insertCost;
28 | $this->replaceCost = $replaceCost;
29 | $this->deleteCost = $deleteCost;
30 |
31 | }
32 |
/**
 * Levenshtein distance between the two strings, computed by PHP's
 * levenshtein() with the insert, replace and delete costs stored on
 * this object.
 *
 * @param string $text1
 * @param string $text2
 * @return int
 */
public function distance($text1, $text2)
{
    $costs = [$this->insertCost, $this->replaceCost, $this->deleteCost];

    return levenshtein($text1, $text2, ...$costs);
}
43 | }
44 |
--------------------------------------------------------------------------------
/src/Comparisons/LongestCommonSubstringComparison.php:
--------------------------------------------------------------------------------
1 |
13 | */
class LongestCommonSubstringComparison implements ISimilarity, IDistance
{
    /**
     * When true, the substring list built for each $text2 is remembered
     * @var boolean
     */
    protected $useCache = false;

    /**
     * Results of Text::getAllSubStrings(), keyed by the $text2 they were built from
     * @var array
     */
    protected $cache = [];

    /**
     *
     * @param boolean $useCache
     */
    public function __construct($useCache = false)
    {
        $this->useCache = $useCache;
    }

    /**
     * Returns the length of the longer input minus the length of the
     * longest common substring (LCS)
     * @param string $text1
     * @param string $text2
     * @return int
     */
    public function distance($text1, $text2)
    {
        $longerLength = max(mb_strlen($text1), mb_strlen($text2));

        return $longerLength - mb_strlen($this->similarity($text1, $text2));
    }

    /**
     * Returns the longest string that Text::getAllSubStrings() reports for
     * both inputs; on a tie the first one found wins, and an empty string
     * is returned when nothing is shared
     * @param string $text1
     * @param string $text2
     * @return string
     */
    public function similarity($text1, $text2)
    {
        if ($this->useCache && !isset($this->cache[$text2])) {
            $this->cache[$text2] = Text::getAllSubStrings($text2);
        }

        $left = Text::getAllSubStrings($text1);
        $right = $this->useCache ? $this->cache[$text2] : Text::getAllSubStrings($text2);

        $lcs = '';
        $longest = 0;
        foreach (array_intersect($left, $right) as $candidate)
        {
            $length = mb_strlen($candidate);
            if ($length > $longest) {
                $longest = $length;
                $lcs = $candidate;
            }
        }
        return $lcs;
    }

    /**
     * The substring lists remembered so far
     * @return array
     */
    public function getCache()
    {
        return $this->cache;
    }

    public function __destruct()
    {
        unset($this->cache);
        unset($this->useCache);
    }
}
89 |
--------------------------------------------------------------------------------
/src/Comparisons/MostFreqCharComparison.php:
--------------------------------------------------------------------------------
1 |
13 | */
class MostFreqCharComparison implements ISimilarity, IDistance
{
    /**
     * The minimum number of occurrences a character needs to be counted
     * @var int
     */
    protected $limit;

    /**
     * Set the minimum limit
     * @param int $limit
     */
    public function __construct($limit = 2)
    {
        $this->limit = $limit;
    }

    /**
     * Sums the occurrence counts of every character that appears in both
     * texts with exactly the same frequency, provided that frequency
     * reaches the configured limit.
     * @param string $text1
     * @param string $text2
     * @return int
     */
    public function similarity($text1, $text2)
    {
        $hash1 = $this->hashString($text1);
        $hash2 = $this->hashString($text2);

        $similarity = 0;
        foreach (array_intersect_key($hash1, $hash2) as $char => $count) {
            if ($count === $hash2[$char] && $count >= $this->limit) {
                $similarity += $count;
            }
        }
        return $similarity;
    }

    /**
     * Returns the occurrence count per character, keyed by character, in
     * order of first appearance (the result is not sorted).
     *
     * Characters are split with mb_str_split so a multibyte character is
     * counted once instead of once per byte, and an empty string yields an
     * empty array.
     * @param string $text
     * @return array
     */
    public function hashString($text)
    {
        return array_count_values(mb_str_split((string) $text));
    }

    /**
     * Returns the length (in characters) of the longer text minus the similarity
     * @param string $text1
     * @param string $text2
     * @return int
     */
    public function distance($text1, $text2)
    {
        // lengths are measured in characters to match hashString()
        $longest = max(mb_strlen((string) $text1), mb_strlen((string) $text2));

        return $longest - $this->similarity($text1, $text2);
    }
}
84 |
--------------------------------------------------------------------------------
/src/Comparisons/SimilarTextComparison.php:
--------------------------------------------------------------------------------
1 | similarity($text1, $text2);
24 | }
25 |
/**
 * Delegates to PHP's similar_text() and returns its result, the number
 * of matching characters in both strings
 * @param string $text1
 * @param string $text2
 * @return int
 */
public function similarity($text1, $text2)
{
    $matchingChars = similar_text($text1, $text2);

    return $matchingChars;
}
36 |
37 | }
38 |
--------------------------------------------------------------------------------
/src/Console/Commands/NltkPackageInstallAllCommand.php:
--------------------------------------------------------------------------------
1 | setName('pta:install:all')
22 | ->setDescription('Install all packages from pta data');
23 | }
24 |
/**
 * Runs the pta:install:package command once for every package returned
 * by the corpora index downloader.
 *
 * Every package is attempted even if an earlier one fails.
 *
 * @param InputInterface $input
 * @param OutputInterface $output
 * @return int 0 when every install succeeded, otherwise the exit code of
 * the last install that failed
 */
protected function execute(InputInterface $input, OutputInterface $output) : int
{
    $listPackages = (new NltkCorporaIndexDownloader())->getPackages();

    // the installer command does not change between packages, look it up once
    $installer = $this->getApplication()->find('pta:install:package');

    $exitCode = 0;
    foreach ($listPackages as $package) {
        $packageInstallerInput = new ArrayInput([
            'command' => 'pta:install:package',
            'package' => $package->getId(),
        ]);

        // remember failures instead of discarding the sub command's status
        $result = $installer->run($packageInstallerInput, $output);
        if ($result !== 0) {
            $exitCode = $result;
        }
    }

    return $exitCode;
}
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/src/Console/Commands/NltkPackageListCommand.php:
--------------------------------------------------------------------------------
1 | setName('pta:list')
22 | ->setDescription('List Corpora available in the pta data repo.')
23 | ->addArgument(
24 | 'url',
25 | InputArgument::OPTIONAL,
26 | 'Use a different url to download the pta/nltk package list.'
27 | );
28 | }
29 |
/**
 * Prints the id and name of every available package, sorted by id using
 * a case insensitive natural order comparison.
 *
 * @param InputInterface $input may carry an optional 'url' argument that
 * is passed to the index downloader
 * @param OutputInterface $output
 * @return int always 0
 */
protected function execute(InputInterface $input, OutputInterface $output) : int
{
    $url = $input->getArgument('url');
    // without a url the downloader is built with its own defaults
    $downloader = $url
        ? new NltkCorporaIndexDownloader($url)
        : new NltkCorporaIndexDownloader();

    $packages = $downloader->getPackages();

    usort($packages, function ($package1, $package2) {
        return strnatcasecmp($package1->getId(), $package2->getId());
    });

    $output->writeln("Packages available for installation:");
    /** @var \TextAnalysis\Utilities\Nltk\Download\Package $package */
    foreach ($packages as $package) {
        $output->writeln(" * {$package->getId()} - {$package->getName()}");
    }
    return 0;
}
52 | }
53 |
--------------------------------------------------------------------------------
/src/Console/Commands/StopWordsCommand.php:
--------------------------------------------------------------------------------
1 | setName('stopwords:generate ')
25 | ->setDescription('Process a document or corpus of stop words, echos to command line')
26 | ->addArgument(
27 | 'path',
28 | InputArgument::REQUIRED,
29 | 'Path to a file or directory to read in. MUST be text files'
30 | )
31 | ->addArgument(
32 | 'type',
33 | InputArgument::OPTIONAL,
34 | "type can be json or csv", 'json'
35 | );
36 | }
37 |
/**
 * Generates stop word frequencies for a file or for the files directly
 * inside a directory and prints them as json (default) or csv.
 *
 * @param InputInterface $input provides the 'path' and 'type' arguments
 * @param OutputInterface $output
 * @return int 0 on success, 1 when the path is unusable or the json
 * encoding fails
 */
protected function execute(InputInterface $input, OutputInterface $output) : int
{
    $path = $input->getArgument('path');

    if (!file_exists($path)) {
        $output->writeln("{$path} is not a file or a path");
        return 1;
    }

    $filePaths = [];
    if (is_file($path)) {
        $filePaths = [realpath($path)];
    } elseif (is_dir($path)) {
        // scandir only returns entry names, so rebuild full paths and
        // keep regular files only
        $directory = rtrim(realpath($path), DIRECTORY_SEPARATOR);
        foreach (array_diff(scandir($path), ['..', '.']) as $entry) {
            $candidate = $directory.DIRECTORY_SEPARATOR.$entry;
            if (is_file($candidate)) {
                $filePaths[] = $candidate;
            }
        }
    } else {
        $output->writeln("{$path} is not known.");
        return 1;
    }

    $generator = new StopwordGenerator($filePaths);
    if ($input->getArgument('type') === 'json') {
        $json = json_encode(
            $this->toArray($generator->getStopwords()),
            JSON_NUMERIC_CHECK | JSON_UNESCAPED_UNICODE
        );
        if ($json === false) {
            $output->writeln(json_last_error_msg());
            return 1;
        }
        $output->write($json);
    } else {
        $stopwords = $generator->getStopwords();
        $stdout = fopen('php://stdout', 'w');
        echo 'token,freq'.PHP_EOL;
        foreach ($stopwords as $token => $freq) {
            // same ISO-8859-1 to UTF-8 conversion utf8_encode performed
            $encodedToken = mb_convert_encoding((string) $token, 'UTF-8', 'ISO-8859-1');
            fputcsv($stdout, [$encodedToken, $freq]);
        }
        fclose($stdout);
    }
    return 0;
}
73 |
/**
 * Converts a token => frequency map into a list of
 * ['token' => ..., 'freq' => ...] rows so it serializes to a json array.
 *
 * Tokens are converted from ISO-8859-1 to UTF-8, which is what the
 * deprecated utf8_encode() call did.
 *
 * @param array $stopWords frequencies keyed by token
 * @return array
 */
protected function toArray(array $stopWords)
{
    $data = [];
    foreach ($stopWords as $key => $value) {
        // numeric tokens arrive as integer keys, hence the string cast
        $data[] = [
            'token' => mb_convert_encoding((string) $key, 'UTF-8', 'ISO-8859-1'),
            'freq' => $value,
        ];
    }
    return $data;
}
87 | }
88 |
--------------------------------------------------------------------------------
/src/Console/Commands/VocabSizeCommand.php:
--------------------------------------------------------------------------------
1 | setName('vocab:size')
33 | ->setDescription('Process stdin and return the vocab size');
34 | }
35 |
/**
 * Reads all of STDIN, tokenizes and filters it and prints the number of
 * unique tokens.
 *
 * @param InputInterface $input
 * @param OutputInterface $output
 * @return int always 0; the vocabulary size is printed, it is not the
 * exit code
 * @throws \RuntimeException when STDIN is not positioned at its start
 */
protected function execute(InputInterface $input, OutputInterface $output) : int
{
    if (ftell(STDIN) !== 0) {
        throw new \RuntimeException("Please pipe in STDIN");
    }

    $contents = (string) stream_get_contents(STDIN);

    // filtered tokens, tokens that a filter emptied are dropped
    $tokens = array_map([$this, 'filter'], (new GeneralTokenizer())->tokenize($contents));
    $tokens = array_values(array_filter($tokens));

    $ct = (new FreqDist($tokens))->getTotalUniqueTokens();
    echo $ct.PHP_EOL;

    return 0;
}
56 |
/**
 * Passes the token through every configured filter, in order, feeding
 * each filter the output of the previous one
 * @param mixed $token
 * @return mixed the result of the last filter
 */
protected function filter($token)
{
    return array_reduce(
        $this->getFilters(),
        function ($carry, $filter) {
            return $filter->transform($carry);
        },
        $token
    );
}
65 |
/**
 * Returns the filter chain, building it on first use
 * @return array
 */
protected function getFilters()
{
    if (!empty($this->filters)) {
        return $this->filters;
    }

    $chain = [];
    $chain[] = new PossessiveNounFilter();
    $chain[] = new QuotesFilter(['"','`']);
    $chain[] = new LowerCaseFilter();
    $chain[] = new PunctuationFilter();
    $chain[] = new CharFilter();
    $this->filters = $chain;

    return $this->filters;
}
83 | }
84 |
--------------------------------------------------------------------------------
/src/Corpus/ImportCorpus.php:
--------------------------------------------------------------------------------
1 | getPackage()->getInstallationPath();
31 | // use array values to start the indexing of the array @ zero
32 | return array_values(array_diff(scandir($installationPath), array('..', '.')));
33 | }
34 |
35 | /**
36 | * The id of the package to load
37 | * @var string
38 | */
39 | protected $packageId;
40 |
/**
 * Return an array of tokens read from one file or, when no file id is
 * given, from every file of the package
 * @param string|null $fileId file name relative to the package
 * installation path; null or '' selects all files
 * @param \TextAnalysis\Tokenizers\TokenizerAbstract|null $tokenizer
 * defaults to a GeneralTokenizer
 * @return array
 */
public function getWords($fileId = null, $tokenizer = null)
{
    if (!$tokenizer) {
        $tokenizer = new GeneralTokenizer();
    }

    // compare explicitly so a file literally named "0" is not mistaken
    // for "no file given"
    $fileIds = ($fileId === null || $fileId === '')
        ? $this->getFileIds()
        : [$fileId];

    $installationPath = $this->getPackage()->getInstallationPath();

    // collect the token lists and merge once, merging inside the loop
    // re-copies every token read so far on each iteration
    $tokenLists = [];
    foreach ($fileIds as $filename) {
        $content = file_get_contents($installationPath.$filename);
        $tokenLists[] = $tokenizer->tokenize($content);
        unset($content);
    }

    return empty($tokenLists) ? [] : array_merge(...$tokenLists);
}
68 |
/**
 * Same as getWords, but splits the text with a SentenceTokenizer
 * @param string|null $fileId
 * @return array
 */
public function getSentences($fileId = null)
{
    $sentenceTokenizer = new SentenceTokenizer();

    return $this->getWords($fileId, $sentenceTokenizer);
}
78 |
/**
 * Returns the unmodified text of each selected file, one array element
 * per file, see getWords for how files are selected
 * @param string|null $fileId
 * @return array of strings
 */
public function getRaw($fileId = null)
{
    // identity "tokenizer": the whole text becomes a single token
    $passThrough = new LambdaTokenizer(function ($text) {
        return [$text];
    });

    return $this->getWords($fileId, $passThrough);
}
92 |
93 |
/**
 * Remember which package this corpus reads from
 * @param string $packageId id of the package to load
 */
public function __construct($packageId)
{
    $this->packageId = $packageId;
}
102 |
/**
 * Returns the package whose id matches the id given to the constructor.
 * The lookup happens once, afterwards the stored package is returned.
 *
 * @return Package
 * @throws \RuntimeException when the package list has no such id
 */
public function getPackage()
{
    if (empty($this->package)) {
        // closures cannot see $this->packageId through use(), copy it
        // into a local first
        $packageId = $this->packageId;

        // loads the package list from cache
        $packages = (new NltkCorporaIndexDownloader(null, true))->getPackages();

        $filteredPackages = array_filter($packages, function ($package) use ($packageId) {
            return ($package->getId() == $packageId);
        });

        if (empty($filteredPackages)) {
            throw new \RuntimeException("Package {$packageId} was not found in the package list");
        }

        $this->package = array_values($filteredPackages)[0];
    }
    return $this->package;
}
121 | }
122 |
--------------------------------------------------------------------------------
/src/Corpus/NameCorpus.php:
--------------------------------------------------------------------------------
1 | nickNames = new Nicknames();
43 |
44 | if(!$dir) {
45 | $dir = get_storage_path('corpora');
46 | }
47 | parent::__construct($dir, $lang);
48 | }
49 |
/**
 * Returns the result of the nickname lookup's query() for the name
 * @param string $name
 * @return string
 */
public function getNickNameExact($name) : string
{
    $match = $this->nickNames->query($name);

    return $match;
}
54 |
/**
 * Returns the result of the nickname lookup's fuzzy() for the name
 * @param string $name
 * @return array
 */
public function getNickNameFuzzy($name) : array
{
    $matches = $this->nickNames->fuzzy($name);

    return $matches;
}
59 |
/**
 * File names this corpus needs; the first entry is the sqlite database
 * opened by getPdo
 * @return array
 */
public function getFileNames(): array
{
    $databaseFile = 'us_names.sqlite3';

    return [$databaseFile];
}
64 |
/**
 * True when getFirstName finds at least one row for the name
 * @param string $name
 * @return boolean
 */
public function isFirstName($name) : bool
{
    $rows = $this->getFirstName($name);

    return $rows !== [];
}
74 |
/**
 * Looks the name up in us_names_by_year and caches the result per name.
 * @todo make this more flexible
 * @param string $name
 * @return array list with at most one row, empty when nothing matched
 */
public function getFirstName($name) : array
{
    if (!isset($this->firstNameCache[$name])) {
        $stmt = $this->getPdo()->prepare("SELECT * FROM us_names_by_year WHERE name = LOWER(:name) LIMIT 1");
        $stmt->bindParam(':name', $name);
        $stmt->execute();
        // fetchAll signals failure with false rather than null, so ?: is
        // needed (not ??) to honour the array return type
        $this->firstNameCache[$name] = $stmt->fetchAll(PDO::FETCH_ASSOC) ?: [];
    }
    return $this->firstNameCache[$name];
}
90 |
/**
 * True when getLastName finds a row for the name
 * @param string $name
 * @return boolean
 */
public function isLastName($name) : bool
{
    $row = $this->getLastName($name);

    return $row !== [];
}
100 |
/**
 * Looks the name up in the surnames table and caches the result per name
 * @param string $name
 * @return array the matching row, or an empty array when there is none
 */
public function getLastName($name) : array
{
    if (isset($this->lastNameCache[$name])) {
        return $this->lastNameCache[$name];
    }

    $stmt = $this->getPdo()->prepare("SELECT * FROM surnames WHERE name = LOWER(:name)");
    $stmt->bindParam(':name', $name);
    $stmt->execute();

    // fetch yields false when no row matched
    $row = $stmt->fetch(PDO::FETCH_ASSOC);
    $this->lastNameCache[$name] = $row ? $row : [];

    return $this->lastNameCache[$name];
}
117 |
/**
 * Splits the name on single spaces and checks that the first part is a
 * known first name and the last part a known last name. Names with
 * fewer than two parts are never full names.
 * @param string $name
 * @return bool
 */
public function isFullName($name) : bool
{
    $parts = explode(" ", $name);
    $partCount = count($parts);
    if ($partCount < 2) {
        return false;
    }

    if (!$this->isFirstName($parts[0])) {
        return false;
    }
    return $this->isLastName($parts[$partCount - 1]);
}
131 |
132 |
133 |
/**
 * Returns the sqlite connection, opening it on first use. The database
 * is the first file name of the corpus inside the corpus directory.
 * @return PDO
 */
public function getPdo() : PDO
{
    if (!empty($this->pdo)) {
        return $this->pdo;
    }

    $databasePath = $this->getDir().$this->getFileNames()[0];
    $this->pdo = new PDO("sqlite:".$databasePath);

    return $this->pdo;
}
145 |
/**
 * Drop the connection, both name caches and the nickname lookup
 */
public function __destruct()
{
    unset(
        $this->pdo,
        $this->firstNameCache,
        $this->lastNameCache,
        $this->nickNames
    );
}
153 |
154 | }
155 |
156 |
157 |
--------------------------------------------------------------------------------
/src/Corpus/ReadCorpusAbstract.php:
--------------------------------------------------------------------------------
1 | dir = $dir;
32 | $this->lang = $lang;
33 | }
34 |
/**
 * Returns the language given to the constructor
 * @return string language to use
 */
public function getLanguage()
{
    return $this->lang;
}
43 |
/**
 * Returns the directory given to the constructor
 * @return string the directory the corpus files are located in
 */
public function getDir()
{
    return $this->dir;
}
51 |
52 |
53 |
/**
 * List of file names that must be loaded to use the corpus.
 * Implementations should return paths relative to the corpus directory.
 * @return string[]
 */
abstract public function getFileNames();
59 |
60 |
61 | }
62 |
63 |
64 |
--------------------------------------------------------------------------------
/src/Documents/ContentDocument.php:
--------------------------------------------------------------------------------
1 | id = ++self::$counter;
36 | }
37 | $this->content = $content;
38 | }
39 |
/**
 * Returns the identifier assigned to this document
 * @return int|string
 */
public function getId()
{
    return $this->id;
}
48 |
/**
 * Returns the content given to the constructor
 * @return string
 */
public function getContent()
{
    return $this->content;
}
57 | }
58 |
--------------------------------------------------------------------------------
/src/Documents/DocumentAbstract.php:
--------------------------------------------------------------------------------
1 | tokens = $tokens;
32 | if(!$zones) {
33 | $this->zones = new \stdClass();
34 | }
35 | }
36 |
/**
 * Returns the data held by this document, in most cases its set of tokens
 * @return mixed
 */
abstract public function getDocumentData();
42 |
/**
 * Applies the given token transformation to this document
 * @param ITokenTransformation $transformer
 */
abstract public function applyTransformation(ITokenTransformation $transformer);
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/src/Downloaders/NltkCorporaIndexDownloader.php:
--------------------------------------------------------------------------------
1 | url = $url;
40 | $this->useCache = $useCache;
41 | }
42 |
/**
 * Returns an array of packages available for download from the nltk
 * project. The list is built from the index xml on first use.
 *
 * Attributes are read per package by name. extract() is avoided because
 * its variables survive into the next loop iteration (a package without
 * a checksum would inherit the previous one) and because attribute
 * names coming from remote xml could overwrite local variables.
 *
 * @return array
 */
public function getPackages()
{
    if (empty($this->packages)) {

        $xml = $this->getXmlContent();
        foreach ($xml->packages->package as $package) {
            $data = (array) $package;
            $attributes = $data['@attributes'] ?? [];

            $this->packages[] = new Package(
                $attributes['id'] ?? null,
                // checksums may not exist on some remote packages
                $attributes['checksum'] ?? null,
                $attributes['name'] ?? null,
                $attributes['subdir'] ?? null,
                $attributes['unzip'] ?? null,
                $attributes['url'] ?? null
            );
        }
    }
    return $this->packages;
}
66 |
/**
 * Whether a previously stored copy of the index may be used
 * @return boolean
 */
protected function getUseCache()
{
    return $this->useCache;
}
75 |
/**
 * Loads the package index xml. The cached copy is used when caching is
 * enabled and the cache file exists; otherwise the index is pulled down
 * from the url with file_get_contents and stored in the cache file.
 *
 * @return SimpleXMLElement
 * @throws \RuntimeException when the index cannot be read or is not
 * valid xml
 */
public function getXmlContent()
{
    $cacheFile = get_storage_path('cache').$this->getCacheFileName();

    if ($this->getUseCache() && file_exists($cacheFile)) {
        $contents = file_get_contents($cacheFile);
        if ($contents === false) {
            throw new \RuntimeException("Unable to read the cached package list {$cacheFile}");
        }
    } else {
        $contents = file_get_contents($this->getUrl());
        // never store a failed download, it would poison the cache
        if ($contents === false) {
            throw new \RuntimeException("Unable to download the package list from {$this->getUrl()}");
        }
        file_put_contents($cacheFile, $contents);
    }

    $xml = simplexml_load_string($contents);
    if ($xml === false) {
        throw new \RuntimeException("The package list is not valid xml");
    }
    return $xml;
}
90 |
/**
 * Name of the file, inside the cache storage path, that holds the index
 * @return string
 */
protected function getCacheFileName()
{
    $fileName = 'pta-list.xml';

    return $fileName;
}
99 |
100 |
/**
 * Returns the url the index is downloaded from
 * @return string
 */
public function getUrl()
{
    return $this->url;
}
109 |
110 | }
111 |
--------------------------------------------------------------------------------
/src/Exceptions/InvalidExpression.php:
--------------------------------------------------------------------------------
1 |
10 | */
class InvalidParameterSizeException extends Exception
{
    // marker exception: adds nothing to Exception beyond its type
}
15 |
--------------------------------------------------------------------------------
/src/Extracts/DateExtract.php:
--------------------------------------------------------------------------------
1 | verify($date)) {
25 | return new DateTime("{$date['year']}-{$date['month']}-{$date['day']}");
26 | }
27 | return false;
28 | }
29 |
/**
 * Checks that year, month and day are all present and non empty
 * @param array $date
 * @return boolean
 */
protected function verify(array $date)
{
    foreach (['year', 'month', 'day'] as $field) {
        if (empty($date[$field])) {
            return false;
        }
    }
    return true;
}
39 |
40 | }
41 |
--------------------------------------------------------------------------------
/src/Extracts/EmailExtract.php:
--------------------------------------------------------------------------------
1 | minLength = $minLength;
22 | }
23 |
/**
 * Returns the token when it starts with '#' and the part after the hash
 * sign is at least the minimum length (measured in bytes), false
 * otherwise. An empty token is rejected.
 * @param string $token
 * @return false|string
 */
public function filter($token)
{
    // isset guards the offset read, an empty token has no character 0
    if (!isset($token[0]) || $token[0] !== '#') {
        return false;
    }

    // don't count the hash tag sign -1
    if (strlen($token) - 1 >= $this->getMinLength()) {
        return $token;
    }
    return false;
}
37 |
/**
 * Minimum length a hash tag needs, not counting the hash sign
 * @return int
 */
public function getMinLength() : int
{
    return $this->minLength;
}
42 |
43 | }
--------------------------------------------------------------------------------
/src/Extracts/LambdaExtract.php:
--------------------------------------------------------------------------------
1 | transform($token);
23 | }
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/src/Extracts/UrlExtract.php:
--------------------------------------------------------------------------------
1 | lambdaFunc = $lambdaFunc;
26 | }
27 |
/**
 * Invokes the stored lambda with the word and returns its result
 * @param string $word
 * @return string|null
 */
public function transform($word)
{
    $callback = $this->lambdaFunc;

    return call_user_func($callback, $word);
}
37 | }
38 |
--------------------------------------------------------------------------------
/src/Filters/LowerCaseFilter.php:
--------------------------------------------------------------------------------
1 | getRegex(), '', $word);
17 | }
18 |
19 | }
20 |
--------------------------------------------------------------------------------
/src/Filters/QuotesFilter.php:
--------------------------------------------------------------------------------
1 | search = $search;
35 | $this->regex = "/([".implode("", $this->search)."])/u";
36 | }
37 |
/**
 * Returns the pattern built by the constructor
 * @return string
 */
public function getRegex()
{
    return $this->regex;
}
46 |
/**
 * Removes every match of the filter's pattern from the word
 * @param string $word
 * @return string
 */
public function transform($word)
{
    $pattern = $this->getRegex();

    return preg_replace($pattern, '', $word);
}
56 |
/**
 * Release the pattern and the search list
 */
public function __destruct()
{
    unset($this->regex, $this->search);
}
62 | }
63 |
64 |
--------------------------------------------------------------------------------
/src/Filters/SpacePunctuationFilter.php:
--------------------------------------------------------------------------------
1 | ','?','@',
17 | '^','_','`','{','|','}','~','\[','\]'
18 | ];
19 |
20 | protected $regex = "";
21 |
/**
 * Builds the search list and its character class pattern
 * @param array $whiteList characters to take out of the search list
 * @param array $blackList characters to append to the search list
 */
public function __construct(array $whiteList = [], array $blackList = [])
{
    // white listed characters are removed first, black listed ones are
    // appended afterwards
    $remaining = array_diff($this->searchFor, $whiteList);
    $this->searchFor = array_merge($remaining, $blackList);

    $characterClass = implode("", $this->searchFor);
    $this->regex = "/([".$characterClass."])/";
}
34 |
/**
 * Returns the pattern built by the constructor
 * @return string
 */
public function getRegex()
{
    return $this->regex;
}
43 |
/**
 * Returns the characters this filter searches for
 * @return array returns an array of characters that are punctuation
 */
public function getSearchFor()
{
    return $this->searchFor;
}
52 |
53 |
/**
 * Surrounds every matched character with a space on each side
 * @param string $word
 * @return string
 */
public function transform($word)
{
    $pattern = $this->getRegex();

    return preg_replace($pattern, ' $1 ', $word);
}
63 |
/**
 * Release the pattern and the search list
 */
public function __destruct()
{
    unset($this->regex, $this->searchFor);
}
69 |
70 | }
71 |
--------------------------------------------------------------------------------
/src/Filters/StopWordsFilter.php:
--------------------------------------------------------------------------------
1 | stopWords = array_fill_keys($stopWords, true);
27 | }
28 |
/**
 * Returns null for a token found in the stop word list, otherwise the
 * token itself
 * @param string $token
 * @return string|null
 */
public function transform($token)
{
    return isset($this->stopWords[$token]) ? null : $token;
}
40 |
/**
 * Free the stop word lookup table
 */
public function __destruct()
{
    unset($this->stopWords);
}
48 | }
--------------------------------------------------------------------------------
/src/Filters/StripTagsFilter.php:
--------------------------------------------------------------------------------
1 | filePaths = $filePaths;
45 | $this->mode = (int)$mode;
46 | }
47 |
/**
 * Returns the file paths given to the constructor
 * @return string[]
 */
public function getFilePaths()
{
    return $this->filePaths;
}
56 |
/**
 * Returns the stop words with their frequencies, keyed by token and
 * sorted by frequency in descending order. The result is computed once
 * and reused while it is non empty.
 * @return array
 */
public function getStopwords()
{
    if (empty($this->stopWords)) {
        foreach ($this->getFilePaths() as $filePath) {
            $content = $this->getFileContent($filePath);
            $tokens = (new GeneralTokenizer())->tokenize($content);

            // normalise the tokens before counting them
            $doc = new TokensDocument($tokens);
            $doc->applyTransformation(new LowerCaseFilter())
                ->applyTransformation(new PossessiveNounFilter())
                ->applyTransformation(new PunctuationFilter())
                ->applyTransformation(new CharFilter());

            if ($this->mode === self::MODE_FREQ) {
                $this->computeUsingFreqDist($doc->getDocumentData());
            }
        }
        arsort($this->stopWords);
    }

    return $this->stopWords;
}
85 |
/**
 * Adds the frequency of every token to the running totals kept in the
 * stopWords property
 * @param array $tokens
 */
protected function computeUsingFreqDist(array $tokens)
{
    $frequencies = (new FreqDist($tokens))->getKeyValuesByFrequency();

    foreach ($frequencies as $token => $freqValue) {
        if (isset($this->stopWords[$token])) {
            $this->stopWords[$token] += $freqValue;
            continue;
        }
        $this->stopWords[$token] = $freqValue;
    }
}
104 |
/**
 * Reads the whole file with file_get_contents
 * @param string $filePath
 * @return string
 */
protected function getFileContent($filePath)
{
    $content = file_get_contents($filePath);

    return $content;
}
114 |
/**
 * Release the file paths, the mode and the collected stop words
 */
public function __destruct()
{
    unset($this->filePaths, $this->mode, $this->stopWords);
}
121 |
122 | }
123 |
--------------------------------------------------------------------------------
/src/Indexes/TfIdf.php:
--------------------------------------------------------------------------------
1 | buildIndex($collection);
36 | }
37 |
/**
 * Builds the idf table: counts in how many documents each token occurs,
 * then replaces each count with log(total documents / count)
 * @param ICollection $collection
 */
protected function buildIndex(ICollection $collection)
{
    // pass 1: document frequency per token
    foreach ($collection as $id => $document) {
        $frequencies = freq_dist($document->getDocumentData())->getKeyValuesByFrequency();
        foreach (array_keys($frequencies) as $token) {
            if (!isset($this->idf[$token])) {
                $this->idf[$token] = 0;
            }
            $this->idf[$token]++;
        }
    }

    // pass 2: turn the counts into inverse document frequencies
    $totalDocuments = count($collection);
    foreach ($this->idf as $token => $documentFrequency) {
        $this->idf[$token] = log(($totalDocuments) / ($documentFrequency));
    }
}
55 |
/**
 * If a token is provided return just the idf for that token,
 * else return the entire idf table.
 *
 * Only null and the empty string count as "no token", so the token "0"
 * is looked up like any other. A token missing from the index yields 0.0.
 * @param string|null $token
 * @return float|array
 */
public function getIdf($token = null)
{
    if ($token === null || $token === '') {
        return $this->idf;
    }
    return $this->idf[$token] ?? 0.0;
}
69 |
/**
 * Get the term frequency of a token inside a document
 * @param DocumentAbstract $document - the document to evaluate
 * @param string $token The token to look for
 * @param int $mode The type of term frequency to use, unknown modes
 * fall back to the raw frequency
 * @return int|float 0 when the token does not occur in the document
 */
public function getTermFrequency(DocumentAbstract $document, $token, $mode = 1)
{
    $freqDist = new FreqDist($document->getDocumentData());
    $keyValuesByWeight = $freqDist->getKeyValuesByFrequency();

    //The token does not exist in the document
    if (!isset($keyValuesByWeight[$token])) {
        return 0;
    }

    switch ($mode) {

        case self::BOOLEAN_MODE:
            //a test was already performed if the token exists in the document
            //just return true
            return 1;
        case self::LOGARITHMIC_MODE:
            return log($keyValuesByWeight[$token] + 1);
        case self::AUGMENTED_MODE:
            // take the maximum explicitly so the result does not depend
            // on the order FreqDist returns its values in
            $maxFrequency = max($keyValuesByWeight);
            return 0.5 + (0.5 * $keyValuesByWeight[$token]) / $maxFrequency;

        case self::FREQUENCY_MODE:
        default:
            return $keyValuesByWeight[$token];
    }
}
107 |
/**
 * Get the tf-idf weight: the term frequency multiplied by the idf
 * @param DocumentAbstract $document - the document to evaluate
 * @param string $token The token to look for
 * @param int $mode The type of term frequency to use
 * @return float
 */
public function getTfIdf(DocumentAbstract $document, $token, $mode = 1)
{
    $termFrequency = $this->getTermFrequency($document, $token, $mode);

    return $termFrequency * $this->getIdf($token);
}
119 |
120 |
121 | }
122 |
123 |
--------------------------------------------------------------------------------
/src/Interfaces/IClassifier.php:
--------------------------------------------------------------------------------
1 | token = $token;
33 | $this->index = $index;
34 | $this->score = $score;
35 | }
36 |
/**
 * Returns the token given to the constructor
 * @return string
 */
public function getToken() : string
{
    return $this->token;
}
41 |
/**
 * Returns the index given to the constructor
 * @return mixed
 */
public function getIndex()
{
    return $this->index;
}
46 |
/**
 * Returns the current score, including everything added via addToScore
 * @return mixed
 */
public function getScore()
{
    return $this->score;
}
51 |
/**
 * Increases the current score by the given amount
 * @param int|float $score amount to add
 */
public function addToScore($score)
{
    $this->score = $this->score + $score;
}
56 | }
57 |
--------------------------------------------------------------------------------
/src/Models/Wordnet/ExceptionMap.php:
--------------------------------------------------------------------------------
1 | pos = $pos;
39 | $this->target = $target;
40 | $this->exceptionList = $exceptionList;
41 | }
42 |
/**
 * Returns the pos value given to the constructor
 * @return string
 */
public function getPos()
{
    return $this->pos;
}
51 |
/**
 * Returns the target given to the constructor
 * @return string
 */
public function getTarget()
{
    return $this->target;
}
60 |
/**
 * Returns the exception list given to the constructor
 * @return string[]
 */
public function getExceptionList()
{
    return $this->exceptionList;
}
68 |
69 | }
70 |
--------------------------------------------------------------------------------
/src/Models/Wordnet/Lemma.php:
--------------------------------------------------------------------------------
1 | word = $word;
61 | $this->pos = $pos;
62 | $this->synsetCnt = $synsetCnt;
63 | $this->pCnt = $pCnt;
64 | $this->ptrSymbols = $ptrSymbols;
65 | $this->synsetOffsets = $synsetOffsets;
66 | }
67 |
/**
 * Returns the word given to the constructor
 * @return string
 */
public function getWord()
{
    return $this->word;
}
76 |
77 |
78 |
/**
 * Returns the synset count given to the constructor
 * @return int
 */
public function getSynsetCnt()
{
    return $this->synsetCnt;
}
87 |
/**
 * Returns the pCnt value given to the constructor
 * @return int
 */
public function getPCnt()
{
    return $this->pCnt;
}
96 |
/**
 * Returns the synset offsets given to the constructor
 * @return int[]
 */
public function getSynsetOffsets()
{
    return $this->synsetOffsets;
}
105 |
/**
 * Returns the pointer symbols given to the constructor
 * @return string[]
 */
public function getPtrSymbols()
{
    return $this->ptrSymbols;
}
114 |
/**
 * Returns the synsets stored on this lemma via setSynsets
 * @return Synset[]
 */
public function getSynsets()
{
    return $this->synsets;
}
123 |
/**
 * Stores the synsets on this lemma
 * @param Synset[] $synsets
 * @return \TextAnalysis\Models\Wordnet\Lemma this instance, for chaining
 */
public function setSynsets(array $synsets)
{
    $this->synsets = $synsets;

    return $this;
}
134 |
135 | }
136 |
--------------------------------------------------------------------------------
/src/NGrams/StatisticFacade.php:
--------------------------------------------------------------------------------
1 |
12 | * @author Kaue Oliveira Almeida
13 | */
class StatisticFacade
{
    /**
     * Static-only facade, instances are not meant to be created
     */
    protected function __construct(){}

    /**
     * Apply a statistic measure to every ngram of the array
     * @param array $ngrams Array of ngrams
     * @param string $measure Name of the static measure method to call
     * @param int $nGramSize Size of the ngrams, 2 or 3
     * @return array The ngram array with the statistic values
     * @throws \Exception when an ngram is processed with any other size
     */
    public static function calculate(array $ngrams, string $measure, int $nGramSize = 2) : array
    {
        // column 0 of each ngram entry is summed to get the total
        $totalNgrams = array_sum(array_column($ngrams, 0));

        $applyMeasure = function ($item) use ($measure, $totalNgrams, $nGramSize) {
            switch ($nGramSize) {
                case 2:
                    return Statistic2D::$measure($item, $totalNgrams);
                case 3:
                    return Statistic3D::$measure($item, $totalNgrams);
            }
            throw new \Exception("Size of the ngram informed invalid", 1);
        };

        return array_map($applyMeasure, $ngrams);
    }
}
39 |
--------------------------------------------------------------------------------
/src/Phonetics/MetaphonePhonetic.php:
--------------------------------------------------------------------------------
1 | stemmer = $stemmer;
44 | $this->spell = $spell;
45 | $this->whiteList = $whiteList;
46 | }
47 |
48 | /**
49 | * Stem and then look up the word
50 | * @param string $token
51 | */
52 | public function stem($token)
53 | {
54 | if(in_array($token, $this->whiteList)) {
55 | return $token;
56 | }
57 | return $this->spell->suggest( $this->stemmer->stem($token) )[0];
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/src/Stemmers/LambdaStemmer.php:
--------------------------------------------------------------------------------
1 | transform($token);
24 | }
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/src/Stemmers/LookupStemmer.php:
--------------------------------------------------------------------------------
1 | dictionary = $reader->read();
21 | }
22 |
23 | /**
24 | * Returns a token's stemmed root
25 | * @param string $token
26 | * @return string
27 | */
28 | public function stem($token)
29 | {
30 | if(array_key_exists($token, $this->dictionary)){
31 | return $this->dictionary[$token];
32 | }
33 | return $token;
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/Stemmers/MorphStemmer.php:
--------------------------------------------------------------------------------
1 | wordnetIndex = new WordnetIndex(new WordnetCorpus(get_storage_path('corpora/wordnet')));
32 | }
33 |
    /**
     * Get the WordNet index this stemmer queries for morphological roots.
     *
     * @return WordnetIndex
     */
    public function getWordnetIndex()
    {
        return $this->wordnetIndex;
    }
42 |
43 | /**
44 | *
45 | * @param string $token
46 | * @return string
47 | */
48 | public function stem($token)
49 | {
50 | if(!isset($this->cache[$token])) {
51 | if(mb_strlen($token) < 3){
52 | $this->cache[$token] = $token;
53 | } else {
54 | $this->cache[$token] = $this->getWordnetIndex()->getMorph($token);
55 | }
56 | }
57 | return $this->cache[$token];
58 | }
59 |
    /**
     * Drop the stem cache and the WordNet index reference when the stemmer is destroyed.
     */
    public function __destruct()
    {
        unset($this->cache);
        unset($this->wordnetIndex);
    }
65 |
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/src/Stemmers/RegexStemmer.php:
--------------------------------------------------------------------------------
1 | regexExpression = $regexExpression;
20 | $this->minimumTokenLength = $minimumTokenLength;
21 | }
22 |
23 | /**
24 | * Return a stemmed word
25 | * @param string $token
26 | * @return string
27 | */
28 | public function stem($token)
29 | {
30 | if(strlen($token) < $this->minimumTokenLength) {
31 | return $token;
32 | }
33 | return preg_replace("/".$this->regexExpression."/i", '', $token);
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/Stemmers/SnowballStemmer.php:
--------------------------------------------------------------------------------
1 | stemmer = new $className();
36 | }
37 | // support version 2 and above
38 | else {
39 | $this->stemmer = StemmerFactory::create (strtolower($stemmerType));
40 | }
41 | }
42 |
    /**
     * Stem a token by delegating to the wrapped stemmer instance.
     *
     * @param string $token
     * @return string
     */
    public function stem($token) : string
    {
        return $this->stemmer->stem($token);
    }
47 |
48 | }
--------------------------------------------------------------------------------
/src/Taggers/StanfordNerTagger.php:
--------------------------------------------------------------------------------
1 | tmpFilePath = tempnam(sys_get_temp_dir(), "stanford_ner_");
30 | }
31 |
32 | public function getCommand()
33 | {
34 | return escapeshellcmd(
35 | $this->getPathToJava() .
36 | " ".implode(" ", $this->getJavaOptions()) .
37 | " -cp " . $this->getJarPath() . $this->getPathSeparator() .
38 | dirname($this->getJarPath()).DIRECTORY_SEPARATOR."lib".DIRECTORY_SEPARATOR."*".
39 | " edu.stanford.nlp.ie.crf.CRFClassifier " .
40 | " -loadClassifier {$this->getClassifierPath()}" .
41 | " -textFile {$this->getTmpFilePath()}"
42 | );
43 | }
44 |
45 | /**
46 | *
47 | * @return array
48 | */
49 | protected function getParsedOutput()
50 | {
51 | $data = [];
52 |
53 | $filter = new PunctuationFilter();
54 | $phrases = (new WhitespaceTokenizer())->tokenize($this->output ?? '');
55 | foreach($phrases as $phrase)
56 | {
57 | $tokens = explode("{$this->getSeparator()}", $phrase);
58 | $type = array_pop($tokens);
59 |
60 | foreach($tokens as $token)
61 | {
62 | if(empty($token) || empty($filter->transform($token))) {
63 | continue;
64 | }
65 | $data[] = [$token,$type];
66 | }
67 | }
68 | return $data;
69 | }
70 |
71 | }
72 |
--------------------------------------------------------------------------------
/src/Taggers/StanfordPosTagger.php:
--------------------------------------------------------------------------------
1 | tmpFilePath = tempnam(sys_get_temp_dir(), "stanford_pos_");
30 | }
31 |
32 | /**
33 | *
34 | * @return string
35 | */
36 | public function getCommand()
37 | {
38 | return escapeshellcmd(
39 | $this->getPathToJava() .
40 | " ".implode(" ", $this->getJavaOptions()) .
41 | " -cp " . $this->getJarPath() . $this->getPathSeparator() .
42 | dirname($this->getJarPath()).DIRECTORY_SEPARATOR."lib".DIRECTORY_SEPARATOR."*".
43 | " edu.stanford.nlp.tagger.maxent.MaxentTagger " .
44 | " -model {$this->getClassifierPath()}" .
45 | " -textFile {$this->getTmpFilePath()}" .
46 | " -outputFormat tsv"
47 | );
48 | }
49 |
50 | protected function getParsedOutput()
51 | {
52 | $data = [];
53 |
54 | $lines = explode(PHP_EOL, $this->output ?? '');
55 | foreach($lines as $line)
56 | {
57 | $line = str_replace("\t", $this->getSeparator(), $line);
58 | $row = array_map('trim', explode($this->getSeparator(), $line));
59 |
60 | if(empty($row[0]) || empty(end($row)) ) {
61 | continue;
62 | }
63 | $len = count($row);
64 | for($index = 0; $index < $len-1; $index++)
65 | {
66 | $data[] = [$row[$index],$row[$len-1]];
67 | }
68 | }
69 |
70 | return $data;
71 |
72 | }
73 |
74 | }
75 |
--------------------------------------------------------------------------------
/src/Tokenizers/FixedLengthTokenizer.php:
--------------------------------------------------------------------------------
1 | startPosition = $startPosition;
24 | $this->length = $length;
25 | }
26 |
27 | /**
28 | * Return array with single element
29 | * @param string $string
30 | * @return array
31 | */
32 | public function tokenize(string $string)
33 | {
34 | if(!$this->length) {
35 | return array(substr($string, $this->startPosition));
36 | } else {
37 | return array(substr($string, $this->startPosition, $this->length));
38 | }
39 | }
40 |
41 | }
42 |
43 |
--------------------------------------------------------------------------------
/src/Tokenizers/GeneralTokenizer.php:
--------------------------------------------------------------------------------
1 | tokenExpression = $tokenExpression;
23 | }
24 |
25 |
    /**
     * Return tokenized array from string.
     * Delegates to strTokenWrapper(), which splits on the configured token expression.
     *
     * @param string $string
     * @return array
     */
    public function tokenize(string $string)
    {
        return $this->strTokenWrapper($string);
    }
35 |
36 | /**
37 | * Use the php function strtok to Tokenize simple string
38 | * @internal
39 | * @return array
40 | */
41 | protected function strTokenWrapper($string)
42 | {
43 | $token = strtok($string, $this->tokenExpression);
44 |
45 | $tokens = array();
46 | while ($token !== false) {
47 | // avoid tokenizing white spaces
48 | if(!empty(trim($token))) {
49 | $tokens[] = $token;
50 | }
51 | $token = strtok($this->tokenExpression);
52 | }
53 | return $tokens;
54 | }
55 | }
56 |
57 |
--------------------------------------------------------------------------------
/src/Tokenizers/LambdaTokenizer.php:
--------------------------------------------------------------------------------
1 | lambdaFunc = $lambdaFunc;
28 | }
29 |
30 |
31 | public function tokenize(string $string)
32 | {
33 | return call_user_func($this->lambdaFunc, $string);
34 | }
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/src/Tokenizers/RegexTokenizer.php:
--------------------------------------------------------------------------------
1 | pattern = $pattern;
20 | $this->flags = $flags;
21 | $this->offset = $offset;
22 | }
23 |
24 | /**
25 | * Wraps preg_match_all
26 | * @param string $string
27 | * @return array
28 | */
29 | public function tokenize(string $string)
30 | {
31 | $matches = array();
32 | $count = preg_match_all($this->pattern, $string, $matches, $this->flags, $this->offset);
33 | if($count === false) {
34 | return array();
35 | }
36 | return $matches[0];
37 |
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/Tokenizers/TokenizerAbstract.php:
--------------------------------------------------------------------------------
1 | sentence = new Sentence;
24 | }
25 |
26 | /**
27 | * Split the text into sentences
28 | * @param string $string
29 | * @return array
30 | */
31 | public function tokenize(string $string): array
32 | {
33 | return filter_empty( $this->sentence->split($string));
34 | }
35 |
    /**
     * Drop the sentence splitter reference when the tokenizer is destroyed.
     */
    public function __destruct()
    {
        unset($this->sentence);
    }
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/src/Tokenizers/WhitespaceTokenizer.php:
--------------------------------------------------------------------------------
1 | pos = $pos;
24 | }
25 |
    /**
     * Get the part-of-speech marker, a single character.
     *
     * @return string
     */
    public function getPos()
    {
        return $this->pos;
    }

    /**
     * Replace the pointer symbols that the is*() relation checks read.
     *
     * @param string[] $ptrSymbols
     */
    public function setPtrSymbols(array $ptrSymbols)
    {
        $this->ptrSymbols = $ptrSymbols;
    }

    /**
     * Get the pointer symbols currently stored on this object.
     *
     * @return string[]
     */
    public function getPtrSymbols()
    {
        return $this->ptrSymbols;
    }
52 |
53 | public function isAntonym()
54 | {
55 | return $this->isA('!');
56 | }
57 |
58 | public function isHypernym()
59 | {
60 | return $this->isA('@');
61 | }
62 |
63 | public function isInstanceHypernym()
64 | {
65 | return $this->isA('@!');
66 | }
67 |
68 | public function isHyponym()
69 | {
70 | return $this->isA('~');
71 | }
72 |
73 | public function isInstanceHyponym()
74 | {
75 | return $this->isA('~i');
76 | }
77 |
78 | public function isMemberHolonym()
79 | {
80 | return $this->isA('#m');
81 | }
82 |
83 | public function isSubstanceHolonym()
84 | {
85 | return $this->isA('#s');
86 | }
87 |
88 | public function isPartHolonym()
89 | {
90 | return $this->isA('#p');
91 | }
92 |
93 | public function isMemberMeronym()
94 | {
95 | return $this->isA('%m');
96 | }
97 |
98 | public function isSubstanceMeronym()
99 | {
100 | return $this->isA('%s');
101 | }
102 |
103 | public function isPartMeronym()
104 | {
105 | return $this->isA('%p');
106 | }
107 |
108 | public function isAttribute()
109 | {
110 | return $this->isA('=');
111 | }
112 |
113 | public function isDerivation()
114 | {
115 | return $this->isA('+');
116 | }
117 |
118 | public function isEntailment()
119 | {
120 | return $this->isA('*');
121 | }
122 |
123 | public function isCause()
124 | {
125 | return $this->isA('>');
126 | }
127 |
128 | public function isSeeAlso()
129 | {
130 | return $this->isA('>');
131 | }
132 |
133 | public function isVerbGroup()
134 | {
135 | return $this->isA('$');
136 | }
137 |
138 | public function isSimilarTo()
139 | {
140 | return $this->isA('$');
141 | }
142 |
143 | public function isParticipleOfVerb()
144 | {
145 | return $this->isA('<');
146 | }
147 |
148 | public function isPertainym()
149 | {
150 | return $this->isA('\\');
151 | }
152 |
153 | public function isDerivedFromAdjective()
154 | {
155 | return $this->isA('\\');
156 | }
157 |
158 | /**
159 | *
160 | * @param string $symbol
161 | * @return boolean
162 | */
163 | protected function isA($symbol)
164 | {
165 | return in_array($symbol, $this->getPtrSymbols());
166 | }
167 |
168 | }
169 |
170 |
--------------------------------------------------------------------------------
/src/Utilities/Nltk/Download/Package.php:
--------------------------------------------------------------------------------
1 | id = $id;
35 | $this->checksum = $checksum;
36 | $this->name = $name;
37 | $this->subdir = $subdir;
38 | $this->unzip = $unzip;
39 | $this->url = $url;
40 | }
41 |
    /**
     * Get the package name given to the constructor.
     */
    public function getName()
    {
        return $this->name;
    }

    /**
     * Get the package checksum given to the constructor.
     */
    public function getChecksum()
    {
        return $this->checksum;
    }

    /**
     * Get the package id; it also forms the last segment of the installation path.
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Get the sub-directory the package belongs to; it forms the first segment of the installation path.
     */
    public function getSubdir()
    {
        return $this->subdir;
    }

    /**
     * Get the unzip setting given to the constructor.
     * NOTE(review): presumably a flag saying whether the download must be unzipped - confirm.
     */
    public function getUnzip()
    {
        return $this->unzip;
    }

    /**
     * Get the package url given to the constructor.
     */
    public function getUrl()
    {
        return $this->url;
    }
71 |
72 | /**
73 | * Returns the path the package should be installed into
74 | * @return string
75 | */
76 | public function getInstallationPath()
77 | {
78 | return get_storage_path($this->getSubdir().DIRECTORY_SEPARATOR.$this->getId());
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/src/Utilities/Vowels/EnglishVowels.php:
--------------------------------------------------------------------------------
1 | Return a TextCorpus object',
11 | 'normalize(string $text) -> Normalize text to lower case',
12 | 'todo ....'
13 | ];
14 | print_array($menu);
15 | }
16 |
17 |
--------------------------------------------------------------------------------
/src/helpers/print.php:
--------------------------------------------------------------------------------
1 | $returnValue)
70 | {
71 | $mock->shouldReceive($methodName)
72 | ->andReturn($returnValue);
73 | }
74 | return $mock;
75 | }
76 |
77 | /**
78 | *
79 | * @return InvertedIndex
80 | */
81 | public function getInvertedIndex()
82 | {
83 | if(!$this->invertedIndex) {
84 | $docs = [
85 | new TokensDocument(["marquette", "michigan", "hiking", "hiking", "hiking" , "camping", "swimming"]),
86 | new TokensDocument(["ironwood", "michigan", "hiking", "biking", "camping", "swimming","marquette"]),
87 | new TokensDocument(["no","tokens","michigan"])
88 | ];
89 | $collection = new DocumentArrayCollection($docs);
90 | $builder = new CollectionInvertedIndexBuilder($collection);
91 | $dataReader = new ArrayDataReaderAdapter($builder->getIndex());
92 | $this->invertedIndex = new InvertedIndex($dataReader);
93 | }
94 | return $this->invertedIndex;
95 | }
96 | }
97 |
98 |
99 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Adapters/PspellAdapterTest.php:
--------------------------------------------------------------------------------
1 | assertEquals('run', $adapter->suggest("runn")[0]);
22 | $this->assertEquals('Cooper', $adapter->suggest("yooper")[0]);
23 | $this->assertEquals('flute', $adapter->suggest("flute")[0]);
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Analysis/DateAnalysisTest.php:
--------------------------------------------------------------------------------
1 | assertEquals("2015-09-01", $dateAnalysis->getDates()[0]->format('Y-m-d'));
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Analysis/FreqDistTest.php:
--------------------------------------------------------------------------------
1 | assertTrue(count($freqDist->getHapaxes()) === 3);
17 | $this->assertEquals(9, $freqDist->getTotalTokens());
18 | $this->assertEquals(6, $freqDist->getTotalUniqueTokens());
19 | }
20 |
21 | public function testEmptyHapaxesFreqDist()
22 | {
23 | $freqDist = new FreqDist(array("time", "time", "what", "what"));
24 | $this->assertTrue(count($freqDist->getHapaxes()) === 0);
25 | $this->assertEquals(4, $freqDist->getTotalTokens());
26 | $this->assertEquals(2, $freqDist->getTotalUniqueTokens());
27 | }
28 |
29 | public function testSingleHapaxFreqDist()
30 | {
31 | $freqDist = new FreqDist(array("time"));
32 | $this->assertTrue(count($freqDist->getHapaxes()) === 1);
33 | $this->assertEquals(1, $freqDist->getTotalTokens());
34 | $this->assertEquals(1, $freqDist->getTotalUniqueTokens());
35 | }
36 |
37 | /**
38 | *
39 | */
40 | public function testEmptyFreqDist()
41 | {
42 | $this->expectException(\TextAnalysis\Exceptions\InvalidParameterSizeException::class);
43 | $freqDist = new FreqDist([]);
44 | }
45 | }
46 |
47 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Analysis/Keywords/RakeTest.php:
--------------------------------------------------------------------------------
1 | transform($this->getTestData());
26 | //rake MUST be split on whitespace and new lines only
27 | $tokens = (new GeneralTokenizer(" \n\t\r"))->tokenize($testData);
28 | $tokenDoc = new TokensDocument($tokens);
29 | $tokenDoc->applyTransformation(new LowerCaseFilter())
30 | ->applyTransformation(new StopWordsFilter($stopwords), true)
31 | ->applyTransformation(new PunctuationFilter(['@',':','\/']), true)
32 | ->applyTransformation(new CharFilter(), true);
33 |
34 | $rake = new Rake($tokenDoc, 3);
35 | $results = $rake->getKeywordScores();
36 | $this->assertArrayHasKey('minimal generating sets', $results);
37 | $this->assertArrayHasKey('8/8/2016 5:51 pm', $results);
38 | }
39 |
40 | public function testSimplifiedRake()
41 | {
42 | $stopwords = array_map('trim', file(VENDOR_DIR.'yooper/stop-words/data/stop-words_english_1_en.txt'));
43 | // all punctuation must be moved 1 over. Fixes issues with sentences
44 | $testData = (new SpacePunctuationFilter([':','\/']))->transform($this->getTestData());
45 | //rake MUST be split on whitespace and new lines only
46 | $tokens = (new GeneralTokenizer(" \n\t\r"))->tokenize($testData);
47 | $tokenDoc = new TokensDocument($tokens);
48 | $tokenDoc->applyTransformation(new LowerCaseFilter())
49 | ->applyTransformation(new StopWordsFilter($stopwords), true)
50 | ->applyTransformation(new PunctuationFilter(['@',':','\/']), true)
51 | ->applyTransformation(new CharFilter(), true);
52 |
53 | $rake = rake($tokenDoc->toArray(), 3);
54 | $results = $rake->getKeywordScores();
55 | $this->assertArrayHasKey('minimal generating sets', $results);
56 | $this->assertArrayHasKey('8/8/2016 5:51 pm', $results);
57 | }
58 |
59 | /**
60 | * Sample test data
61 | * @return string
62 | */
63 | public function getTestData()
64 | {
65 | return <<assertTrue($collection->count() === 3);
28 |
29 | $this->assertEquals($collection[2]->getDocumentData(), array("no","tokens"));
30 | }
31 |
32 | public function testFiltersOnCollection()
33 | {
34 | $docs = array(
35 | new TokensDocument(array("Marquette", "Michigan's", "hiking", "hiking", "hiking" , "camping", "swimming")),
36 | new TokensDocument(array("Ironwood", "michigan", "hiking", "biking", "camping", "swimming","marquette")),
37 | new TokensDocument(array("No","Tokens"))
38 | );
39 |
40 | $collection = new DocumentArrayCollection($docs);
41 |
42 | $filters = array(
43 | new LowerCaseFilter(),
44 | new QuotesFilter()
45 | );
46 |
47 | $collection->applyTransformations($filters);
48 |
49 |
50 | $this->assertTrue($collection->count() === 3);
51 |
52 | $this->assertEquals(array("marquette", "michigans", "hiking", "hiking", "hiking" , "camping", "swimming"), $collection[0]->getDocumentData());
53 | $this->assertEquals(array("ironwood", "michigan", "hiking", "biking", "camping", "swimming","marquette"),$collection[1]->getDocumentData());
54 | $this->assertEquals(array("no","tokens"), $collection[2]->getDocumentData());
55 |
56 | }
57 | }
58 |
59 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Collocations/CollocationFinderTest.php:
--------------------------------------------------------------------------------
1 | transform(self::$text);
28 | $tokens = (new GeneralTokenizer(" \n\t\r"))->tokenize($testData);
29 | $tokenDoc = new TokensDocument($tokens);
30 | $tokenDoc->applyTransformation(new LowerCaseFilter())
31 | ->applyTransformation(new PunctuationFilter([]), false)
32 | ->applyTransformation(new StopWordsFilter($stopwords))
33 | ->applyTransformation(new QuotesFilter())
34 | ->applyTransformation(new CharFilter());
35 |
36 | $finder = new CollocationFinder($tokenDoc->toArray());
37 | $this->assertArrayHasKey('injun joe', $finder->getCollocations());
38 | }
39 |
40 | public function testCollocationFinderTrigram()
41 | {
42 | $stopwords = array_map('trim', file(VENDOR_DIR.'yooper/stop-words/data/stop-words_english_1_en.txt'));
43 | $testData = (new SpacePunctuationFilter())->transform(self::$text);
44 | $tokens = (new GeneralTokenizer(" \n\t\r"))->tokenize($testData);
45 | $tokenDoc = new TokensDocument($tokens);
46 | $tokenDoc->applyTransformation(new LowerCaseFilter())
47 | ->applyTransformation(new PunctuationFilter([]), false)
48 | ->applyTransformation(new StopWordsFilter($stopwords))
49 | ->applyTransformation(new QuotesFilter())
50 | ->applyTransformation(new CharFilter());
51 |
52 | $finder = new CollocationFinder($tokenDoc->toArray(), 3);
53 | $this->assertArrayHasKey('finn red handed', $finder->getCollocations());
54 | }
55 |
56 | public function testGetCollocationsByPmi()
57 | {
58 | $testData = (new SpacePunctuationFilter())->transform(self::$text);
59 | $tokens = (new GeneralTokenizer(" \n\t\r"))->tokenize($testData);
60 | $tokenDoc = new TokensDocument($tokens);
61 | $tokenDoc->applyTransformation(new LowerCaseFilter())
62 | ->applyTransformation(new PunctuationFilter([]), false)
63 | ->applyTransformation(new StopWordsFilter([]))
64 | ->applyTransformation(new QuotesFilter())
65 | ->applyTransformation(new CharFilter());
66 |
67 | $finder = new CollocationFinder($tokenDoc->toArray(), 2);
68 | $this->assertArrayHasKey('outlying cottages', $finder->getCollocationsByPmi());
69 |
70 | }
71 |
72 |
73 | }
74 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Comparisons/CosineSimilarityComparisonTest.php:
--------------------------------------------------------------------------------
1 | assertEquals(1.0, round($compare->similarity($text1, $text2), 1));
20 |
21 | }
22 |
23 | public function testDifferent()
24 | {
25 | $text1 = ["hiking" , "hiking", "camping", "swimming"];
26 | $text2 = ["hiking" , "biking", "camping", "swimming"];
27 | $compare = new CosineSimilarityComparison();
28 | $this->assertEquals(0.8, round($compare->similarity($text1, $text2), 1));
29 | }
30 |
31 | public function testNothingInCommon()
32 | {
33 | $text1 = ["hiking", "camping", "swimming"];
34 | $text2 = ["biking", "boating", "floating"];
35 | $compare = new CosineSimilarityComparison();
36 | $this->assertEquals(0, $compare->similarity($text1, $text2));
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Comparisons/HammingDistanceComparisonTest.php:
--------------------------------------------------------------------------------
1 |
12 | */
13 | class HammingDistanceComparisonTest extends \PHPUnit\Framework\TestCase
14 | {
15 | public function testHammingDistance()
16 | {
17 | $c = new HammingDistanceComparison();
18 | $this->assertEquals(3, $c->distance('karolin', 'kathrin'));
19 | $this->assertEquals(3, $c->distance('karolin', 'kerstin'));
20 | $this->assertEquals(2, $c->distance('1011101', '1001001'));
21 | $this->assertEquals(3, $c->distance('2173896', '2233796'));
22 | }
23 | }
--------------------------------------------------------------------------------
/tests/TextAnalysis/Comparisons/JaccardIndexComparisonTest.php:
--------------------------------------------------------------------------------
1 |
12 | */
13 | class JaccardIndexComparisonTest extends \PHPUnit\Framework\TestCase
14 | {
15 | public function testJaccardIndex()
16 | {
17 | $c = new JaccardIndexComparison();
18 | $this->assertEquals(1, $c->similarity('a', 'a'));
19 | $this->assertEquals(1, $c->similarity(['a'], ['a']));
20 | $this->assertEquals(1, $c->similarity(['a','b'], ['b','a']));
21 | $this->assertEquals(.5, $c->similarity(['a','b'], ['a']));
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Comparisons/JaroWinklerComparisonTest.php:
--------------------------------------------------------------------------------
1 |
11 | */
12 | class JaroWinklerComparisonTest extends \PHPUnit\Framework\TestCase
13 | {
14 | public function testJaroWinkler()
15 | {
16 | $jw = new JaroWinklerComparison();
17 | $this->assertEquals('0.961', sprintf("%1.3f", $jw->similarity('MARTHA', 'MARHTA')));
18 | $this->assertEquals('0.840', sprintf("%1.3f", $jw->similarity('DWAYNE', 'DUANE')));
19 | $this->assertEquals('0.813', sprintf("%1.3f", $jw->similarity('DIXON', 'DICKSONX')));
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Comparisons/LevenshteinComparisonTest.php:
--------------------------------------------------------------------------------
1 | assertEquals(1, $comparison->distance('hat', 'cat'));
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Comparisons/LongestCommonSubstringComparisonTest.php:
--------------------------------------------------------------------------------
1 |
11 | */
12 | class LongestSubstringComparisonTest extends \PHPUnit\Framework\TestCase
13 | {
14 | public function testLcs()
15 | {
16 | $lcs = new LongestCommonSubstringComparison();
17 |
18 | $txt1 = "Michael";
19 | $txt2 = "Michelle";
20 | $this->assertEquals(4, $lcs->distance($txt2, $txt1));
21 | $this->assertEquals("Mich", $lcs->similarity($txt2, $txt1));
22 |
23 | $txt1 = "sunnyside";
24 | $txt2 = "hide";
25 |
26 |
27 | $this->assertEquals(6, $lcs->distance($txt2, $txt1));
28 | $this->assertEquals("ide", $lcs->similarity($txt2, $txt1));
29 | }
30 |
31 | public function testLcsWithCache()
32 | {
33 | $lcs = new LongestCommonSubstringComparison(true);
34 | $txt1 = "Michael";
35 | $txt2 = "Michelle";
36 | $this->assertEquals(4, $lcs->distance($txt2, $txt1));
37 | $this->assertEquals("Mich", $lcs->similarity($txt2, $txt1));
38 |
39 | $txt1 = "sunnyside";
40 | $txt2 = "hide";
41 |
42 | $this->assertEquals(6, $lcs->distance($txt2, $txt1));
43 | $this->assertEquals("ide", $lcs->similarity($txt2, $txt1));
44 |
45 | $this->assertCount(2, $lcs->getCache());
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Comparisons/MostFreqCharComparisonTest.php:
--------------------------------------------------------------------------------
1 |
11 | */
12 | class MostFreqCharComparisonTest extends \PHPUnit\Framework\TestCase
13 | {
14 | public function testComparison()
15 | {
16 | $mf = new MostFreqCharComparison();
17 | $this->assertEquals(4, $mf->similarity('research', 'research'));
18 | $this->assertEquals(2, $mf->similarity('research', 'seeking'));
19 | $this->assertEquals(3, $mf->similarity('significant', 'capabilities'));
20 |
21 | $this->assertEquals(4, $mf->distance('research', 'research'));
22 | $this->assertEquals(6, $mf->distance('research', 'seeking'));
23 | $this->assertEquals(9, $mf->distance('significant', 'capabilities'));
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Corpus/ImportCorpusTest.php:
--------------------------------------------------------------------------------
1 | shouldReceive('getInstallationPath')
20 | ->andReturn(TEST_DATA_DIR.DS.'books'.DS);
21 |
22 | $mockImportCorpus = Mockery::mock('TextAnalysis\Corpus\ImportCorpus[getPackage,getFileIds]', [null, null, null, null, null, null])
23 | ->shouldAllowMockingProtectedMethods();
24 |
25 | $mockImportCorpus->shouldReceive('getPackage')
26 | ->andReturn($mockPackage);
27 |
28 | $mockImportCorpus->shouldReceive('getFileIds')
29 | ->andReturn(['tom_sawyer.txt']);
30 |
31 | $this->assertEquals(['tom_sawyer.txt'], $mockImportCorpus->getFileIds());
32 | $this->assertCount(76057, $mockImportCorpus->getWords());
33 | $this->assertCount(1, $mockImportCorpus->getRaw());
34 | // sentence tokenizer is too slow
35 | ///var_dump($mockImportCorpus->getSentences());
36 | //$this->assertCount(5227, $mockImportCorpus->getSentences());
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Corpus/NameCorpusTest.php:
--------------------------------------------------------------------------------
1 | assertTrue($corpus->isFirstName('Mike'));
25 | $this->assertFalse($corpus->isFirstName('very'));
26 | }
27 |
28 | public function testGetFirstName()
29 | {
30 | if( getenv('SKIP_TEST')) {
31 | return;
32 | }
33 |
34 | $corpus = new NameCorpus();
35 | $firstName = $corpus->getFirstName('Mike');
36 | $this->assertNotEmpty($firstName);
37 |
38 | $this->assertEmpty($corpus->getFirstName('very'));
39 | }
40 |
41 |
42 | public function testLastNames()
43 | {
44 | if( getenv('SKIP_TEST')) {
45 | return;
46 | }
47 |
48 | $corpus = new NameCorpus();
49 | $this->assertTrue($corpus->isLastName('Williamson'));
50 | $this->assertFalse($corpus->isLastName('Baggins'));
51 | }
52 |
53 | public function testGetLastName()
54 | {
55 | if( getenv('SKIP_TEST')) {
56 | return;
57 | }
58 |
59 | $corpus = new NameCorpus();
60 | $lastName = $corpus->getLastName('Williamson');
61 | $this->assertEquals(245, $lastName['rank']);
62 |
63 | $lastName = $corpus->getLastName('Baggins');
64 | $this->assertEmpty($lastName);
65 | }
66 |
67 |
68 | public function testFullNames()
69 | {
70 | if( getenv('SKIP_TEST')) {
71 | return;
72 | }
73 |
74 | $corpus = new NameCorpus();
75 | $this->assertTrue($corpus->isFullName('Brad Von Williamson'));
76 | $this->assertFalse($corpus->isFullName('Jimbo'));
77 | $this->assertTrue($corpus->isFullName('Bradley Thomas'));
78 | }
79 |
80 |
81 |
82 | }
83 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Corpus/TextCorpusTest.php:
--------------------------------------------------------------------------------
1 | assertInstanceOf(TextCorpus::class, text($this->getText()));
19 | }
20 |
21 | public function testConcordance()
22 | {
23 | $corpus = new TextCorpus($this->getText());
24 | $results = $corpus->concordance("tom sawyer");
25 | $this->assertCount(34, $results);
26 | }
27 |
28 | public function testConcordancePtBr()
29 | {
30 | $corpus = new TextCorpus($this->getText('ptbr'));
31 | $results = $corpus->concordance("José",20, true, 'equal');
32 | $this->assertCount(160, $results);
33 | }
34 |
35 | public function testTokenizer()
36 | {
37 | $corpus = new TextCorpus($this->getText());
38 | $results = $corpus->getTokens();
39 | $this->assertCount(76057, $results);
40 | }
41 |
42 | public function testFindAll()
43 | {
44 | $corpus = new TextCorpus($this->getText());
45 | $results = $corpus->findAll("tom sawyer");
46 | $this->assertCount(32, $results);
47 | }
48 |
49 | public function testDispersion()
50 | {
51 |     // Index 0 and index 1 of the result are checked separately (22 and 58 entries).
52 |     $dispersion = (new TextCorpus($this->getText()))->getDispersion(["tom sawyer", "huck finn"]);
53 |     $this->assertCount(22, $dispersion[0]);
54 |     $this->assertCount(58, $dispersion[1]);
55 | }
56 |
57 | }
58 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Corpus/WordnetCorpusTest.php:
--------------------------------------------------------------------------------
1 | getFileNames() as $fileName)
23 | {
24 | $this->assertFileExists($wn->getDir().$fileName);
25 | }
26 | }
27 |
28 | public function testGetLexNames()
29 | {
30 |     if (getenv('SKIP_TEST') || !is_dir(get_storage_path('corpora/wordnet'))) {
31 |         $this->markTestSkipped('SKIP_TEST is set or the corpora/wordnet directory is missing.');
32 |     }
33 |     $wn = new WordnetCorpus(get_storage_path('corpora/wordnet'));
34 |     $this->assertCount(45, $wn->getLexNames()); // the installed corpus is expected to list 45 lex names
35 | }
36 |
37 | public function testGetLemmaFromString()
38 | {
39 |     // One hard-coded index line is parsed; "not_checked" is presumably a placeholder path.
40 |     $indexLine = 'zombie n 5 3 @ %s ; 5 1 10805638 10805932 10805783 09825519 07919165';
41 |
42 |     $corpus = new WordnetCorpus("not_checked");
43 |     $parsed = $corpus->getLemmaFromString($indexLine);
44 |     $this->assertCount(5, $parsed->getSynsetOffsets());
45 |     $this->assertEquals('n', $parsed->getPos());
46 |     $this->assertTrue($parsed->isHypernym());
47 | }
48 |
49 |
50 | public function testGetSynsetFromString() // Parses one hard-coded synset line and checks its word and link counts.
51 | {
52 | $testLines = [
53 | "825519 18 n 03 automaton 1 zombi 1 zombie 1 004 @ 09606527 n 0000 + 01499999 a 0101 + 00480221 v 0101 + 00480221 v 0102 | someone who acts or responds in a mechanical or apathetic way; \only an automaton wouldn't have noticed\"" ];
54 | $wn = new WordnetCorpus("not_checked"); // NOTE(review): "not_checked" looks like a placeholder path — confirm it is never read here
55 | $synset = $wn->getSynsetFromString($testLines[0]);
56 | $this->assertCount(3, $synset->getWords()); // presumably the three words following the "03" field
57 | $this->assertCount(4, $synset->getLinkedSynsets()); // presumably the entries following the "004" field
58 | }
59 |
60 |
61 | public function testGetLemmas()
62 | {
63 |     if (getenv('SKIP_TEST') || !is_dir(get_storage_path('corpora/wordnet'))) {
64 |         $this->markTestSkipped('SKIP_TEST is set or the corpora/wordnet directory is missing.');
65 |     }
66 |     $wnMock = $this->getPartialMock(WordnetCorpus::class, ['getIndexFileNames' => ['index.adj']], [get_storage_path('corpora/wordnet')]); // getIndexFileNames() is stubbed to index.adj only
67 |     $this->assertCount(21479, $wnMock->getLemmas());
68 |     $keys = array_keys($wnMock->getLemmas());
69 |     $lemma = $wnMock->getLemmas()[$keys[0]]; // first lemma in returned order
70 |     $this->assertEquals('.22-caliber', $lemma->getWord());
71 |     $this->assertTrue($lemma->isPertainym());
72 |     $this->assertFalse($lemma->isAttribute());
73 | }
74 |
75 |
76 | public function testGetSynsets()
77 | {
78 |     if (getenv('SKIP_TEST') || !is_dir(get_storage_path('corpora/wordnet'))) {
79 |         $this->markTestSkipped('SKIP_TEST is set or the corpora/wordnet directory is missing.');
80 |     }
81 |     $wn = new WordnetCorpus(get_storage_path('corpora/wordnet'));
82 |     $synset = $wn->getSynsetByOffsetAndPos(9825519, 'n'); // looked up by numeric offset and part-of-speech tag
83 |     $this->assertEquals(['automaton','zombi','zombie'], $synset->getWords());
84 |     $this->assertCount(4, $synset->getLinkedSynsets());
85 | }
86 |
87 | public function testGetExceptionMapFromString()
88 | {
89 |     $corpus = new WordnetCorpus('not_used');
90 |     // Two-word line: the last word is the target, the first is the only exception.
91 |     $single = $corpus->getExceptionMapFromString('thieves thief', 'n');
92 |     $this->assertCount(1, $single->getExceptionList());
93 |     $this->assertEquals('thief', $single->getTarget());
94 |     $this->assertEquals('thieves', $single->getExceptionList()[0]);
95 |     // Three-word line: the last word is still the target, the first two are exceptions.
96 |     $multi = $corpus->getExceptionMapFromString('ploughmen ploughman plowman', 'n');
97 |     $this->assertCount(2, $multi->getExceptionList());
98 |     $this->assertEquals('plowman', $multi->getTarget());
99 |     $this->assertEquals(['ploughmen', 'ploughman'], $multi->getExceptionList());
100 |
101 | }
102 |
103 | }
104 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Downloaders/NltkCorporalIndexDownloaderTest.php:
--------------------------------------------------------------------------------
1 | shouldAllowMockingProtectedMethods();
22 |
23 | $mock->shouldReceive('getXmlContent')
24 | ->andReturn(simplexml_load_string($this->getXmlContent()));
25 |
26 | $packages = $mock->getPackages();
27 | $this->assertCount(2, $packages);
28 | $this->assertEquals('maxent_ne_chunker', $packages[0]->getId());
29 | $this->assertEquals('abc', $packages[1]->getId());
30 | }
31 |
32 | /**
33 | *
34 | * @return string
35 | */
36 | public function getXmlContent()
37 | {
38 | return <<
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 | XML;
63 |
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Extracts/DateExtractTest.php:
--------------------------------------------------------------------------------
1 | assertFalse($extract->filter("no date in jan. set"));
18 | $this->assertInstanceOf('DateTime', $extract->filter('jan. 12th 1999'));
19 | }
20 |
21 | }
22 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Extracts/HashTagTest.php:
--------------------------------------------------------------------------------
1 | assertFalse($extract->filter("testing"));
18 | $this->assertEquals('#holiday', $extract->filter('#holiday'));
19 | $this->assertFalse($extract->filter('#DA'));
20 | }
21 |
22 | public function testMinLengthHashTag()
23 | {
24 |     $hashTag = new HashTag(2); // constructed with 2, under which '#DA' is kept and '#1' is rejected
25 |     $this->assertEquals('#DA', $hashTag->filter('#DA'));
26 |     $this->assertFalse($hashTag->filter('#1'));
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Filters/CharFilterTest.php:
--------------------------------------------------------------------------------
1 | assertEquals(' ', $transformer->transform(' A '));
20 | $this->assertEquals(' ', $transformer->transform(' ! '));
21 | $this->assertEquals(' 9 ', $transformer->transform(' 9 '));
22 |
23 | $this->assertEquals('A', $transformer->transform('A'));
24 | $this->assertEquals('!', $transformer->transform('!'));
25 | $this->assertEquals('9', $transformer->transform('9'));
26 | }
27 |
28 |
29 |
30 |
31 | }
--------------------------------------------------------------------------------
/tests/TextAnalysis/Filters/EmailFilterTest.php:
--------------------------------------------------------------------------------
1 | assertEquals(null, $filter->transform("yooper@example.com"));
18 | $this->assertEquals(' , ' , $filter->transform("yooper.mqt@example.sub.dub.edu , yooper@example.com"));
19 |
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Filters/LambdaFilterTest.php:
--------------------------------------------------------------------------------
1 | assertEquals("tomtom", $transformer->transform("bobbob"));
19 | }
20 |
21 | public function testLambdaStrReplace()
22 | {
23 |     $swapBobForTom = function ($text) {
24 |         return str_replace("bob", "tom", $text);
25 |     };
26 |     $filter = new LambdaFilter($swapBobForTom); // the closure is applied by transform()
27 |     $this->assertEquals("tomtom", $filter->transform("bobbob"));
28 | }
29 |
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Filters/LowerCaseFilterTest.php:
--------------------------------------------------------------------------------
1 | assertEquals("yooper's", $transformer->transform("Yooper's"));
16 | }
17 |
18 | }
--------------------------------------------------------------------------------
/tests/TextAnalysis/Filters/NumbersFilterTest.php:
--------------------------------------------------------------------------------
1 | assertEquals('easy street', $filter->transform("123 easy street"));
16 | $this->assertEquals('easy street', $filter->transform("easy street"));
17 | $this->assertEquals('april th,', $filter->transform("april 25th, 1992"));
18 |
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Filters/PossessiveNounFilterTest.php:
--------------------------------------------------------------------------------
1 | assertEquals("yooper lives in Marquette west side", $filter->transform("yooper's lives in Marquette's west side"));
19 | }
20 |
21 | public function testNonPossessive()
22 | {
23 |     $possessiveFilter = new PossessiveNounFilter();
24 |     $this->assertEquals("yooper", $possessiveFilter->transform("yooper")); // input without 's comes back unchanged
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Filters/PunctuationFilterTest.php:
--------------------------------------------------------------------------------
1 | assertEquals("Yoopers", $transformer->transform("Yooper's!?;,"));
16 | $this->assertEquals("Yoopers", $transformer->transform("Yooper's!?;,"));
17 |
18 | }
19 |
20 | public function testOnDate()
21 | {
22 |     $dateSafeFilter = new PunctuationFilter(['\/',':'], []); // with these arguments '/' and ':' survive
23 |     $this->assertEquals('8/8/2016 5:51 PM', $dateSafeFilter->transform('8/8/2016 5:51 PM'));
24 | }
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Filters/QuotesFilterTest.php:
--------------------------------------------------------------------------------
1 | assertEquals('Yoopers', $transformer->transform("Yooper's"));
16 | }
17 |
18 | public function testRemoveDoubleQuote()
19 | {
20 |     $quotesFilter = new QuotesFilter();
21 |     $this->assertEquals("Peninsula", $quotesFilter->transform('"Peninsula"')); // surrounding double quotes are removed
22 | }
23 | }
--------------------------------------------------------------------------------
/tests/TextAnalysis/Filters/SpacePunctuationFilterTest.php:
--------------------------------------------------------------------------------
1 | assertEquals('P . B . R . ', $filter->transform('P.B.R.'));
18 | $this->assertEquals('8 / 8 / 2016 5 : 51 PM', $filter->transform('8/8/2016 5:51 PM'));
19 | }
20 |
21 | public function testWhiteList()
22 | {
23 |     $spacer = new SpacePunctuationFilter([],['O','E']); // second argument lists extra characters to pad with spaces
24 |     $this->assertEquals('H O M E R', $spacer->transform('HOMER'));
25 | }
26 |
27 | public function testBlackList()
28 | {
29 |     $spacer = new SpacePunctuationFilter(['\/',':']); // with this first argument '/' and ':' are left unpadded
30 |     $this->assertEquals('8/8/2016 5:51 PM', $spacer->transform('8/8/2016 5:51 PM'));
31 | }
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Filters/StopWordsFilterTest.php:
--------------------------------------------------------------------------------
1 | loadStopwords());
27 | $this->assertNull($stopWord->transform("again"));
28 | }
29 |
30 | public function testIsNotStopWord()
31 | {
32 |     $filter = new StopWordsFilter($this->loadStopwords());
33 |     $this->assertEquals("peninsula", $filter->transform("peninsula")); // a word outside the loaded list passes through
34 | }
35 |
36 | public function testIsStopWord2()
37 | {
38 |     $filter = new StopWordsFilter($this->loadStopwords());
39 |     $this->assertNull($filter->transform("as")); // "as" is expected to be in the loaded list, so transform() yields null
40 | }
41 | }
--------------------------------------------------------------------------------
/tests/TextAnalysis/Filters/UrlFilterTest.php:
--------------------------------------------------------------------------------
1 | assertEquals("google.com", $filter->transform("google.com"));
19 | $this->assertEquals(" , ", $filter->transform("https://github.com/yooper/php-text-analysis/wiki , https://www.facebook.com/?query=1&field=none"));
20 | $this->assertEquals('hello', $filter->transform("hello"));
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Indexes/TfIdfTest.php:
--------------------------------------------------------------------------------
1 | wordnetIdx) {
30 | $this->wordnetIdx = new WordnetIndex(new WordnetCorpus(get_storage_path('corpora/wordnet')));
31 | }
32 | return $this->wordnetIdx;
33 | }
34 |
35 | public function testGetLemma()
36 | {
37 |     if (getenv('SKIP_TEST') || !is_dir(get_storage_path('corpora/wordnet'))) {
38 |         $this->markTestSkipped('SKIP_TEST is set or the corpora/wordnet directory is missing.');
39 |     }
40 |     // The first synset of the first 'programmer' lemma is expected to link to 8 other synsets.
41 |     $lemmas = $this->getWordnetIndex()->getLemma('programmer');
42 |     $this->assertCount(8, $lemmas[0]->getSynsets()[0]->getLinkedSynsets());
43 | }
44 |
45 | public function testGetMorph()
46 | {
47 |     if (getenv('SKIP_TEST') || !is_dir(get_storage_path('corpora/wordnet'))) {
48 |         $this->markTestSkipped('SKIP_TEST is set or the corpora/wordnet directory is missing.');
49 |     }
50 |     // Inflected form => base form that getMorph() is expected to return.
51 |     $expected = ['playing' => 'play', 'dogs' => 'dog', 'churches' => 'church', 'aardwolves' => 'aardwolf', 'abaci' => 'abacus', 'books' => 'book'];
52 |     $index = $this->getWordnetIndex();
53 |     foreach ($expected as $inflected => $base) {
54 |         $this->assertEquals($base, $index->getMorph($inflected));
55 |     }
56 | }
57 |
58 | }
59 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/LexicalDiversity/NaiveTest.php:
--------------------------------------------------------------------------------
1 | getText() ));
16 | $this->assertEqualsWithDelta(0.03461, $result, 0.0001);
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/LexicalDiversity/YuleITest.php:
--------------------------------------------------------------------------------
1 | getText() ), \TextAnalysis\LexicalDiversity\YuleI::class);
16 | $this->assertEqualsWithDelta(135.9226, $result, 0.0001);
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/LexicalDiversity/YuleKTest.php:
--------------------------------------------------------------------------------
1 | getText() ), \TextAnalysis\LexicalDiversity\YuleK::class);
16 | $this->assertEqualsWithDelta(73.5712, $result, 0.0001);
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/NGrams/NGramFactoryTest.php:
--------------------------------------------------------------------------------
1 |
12 | */
13 | class NGramFactoryTest extends \PHPUnit\Framework\TestCase
14 | {
15 |     public function testBiGram()
16 |     {
17 |         $words = ["one","two","three"];
18 |         $pairs = ["one two","two three"];
19 |         // NGramFactory::create() with no size argument and the bigrams() helper must agree.
20 |         $this->assertEquals($pairs, NGramFactory::create($words));
21 |         $this->assertEquals($pairs, bigrams($words));
22 |     }
23 |
24 |     public function testTriGram()
25 |     {
26 |         $words = ["one","two","three","four"];
27 |         $triples = ["one two three","two three four"];
28 |         // Same check with an explicit NGramFactory::TRIGRAM size and the trigrams() helper.
29 |         $this->assertEquals($triples, NGramFactory::create($words, NGramFactory::TRIGRAM));
30 |         $this->assertEquals($triples, trigrams($words));
31 |
32 |     }
33 | }
34 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/NGrams/StatisticFacadeTest.php:
--------------------------------------------------------------------------------
1 |
14 | */
15 | class StatisticFacadeTest extends \PHPUnit\Framework\TestCase
16 | {
17 |     private $text;   // raw contents of the text_ngrams.txt fixture
18 |     private $tokens; // normalized tokens produced from $text in setUp()
19 |
20 |     public function setUp() : void
21 |     {
22 |         parent::setUp();
23 |         $this->text = file_get_contents(TEST_DATA_DIR . DS . 'Text'.DS.'Analysis'.DS.'text_ngrams.txt');
24 |         $tokenizer = new RegexTokenizer('/([\p{L}]+[\/\-_\']?[\p{L}]+)+|[\p{L}]+/iu');
25 |         $this->tokens = normalize_tokens($tokenizer->tokenize($this->text));
26 |     }
27 |     public function testBigrams()
28 |     {
29 |         // assertEquals() takes (expected, actual): literals go first so failure output reads correctly.
30 |         $ngrams = NGramFactory::create($this->tokens, 2, '<>');
31 |
32 |         $ngrams = NGramFactory::getFreq($ngrams, '<>');
33 |
34 |         //test frequency
35 |         $this->assertEquals([0 => 2, 1 => 3, 2 => 2], $ngrams['know<>something']);
36 |
37 |         //test tmi measure
38 |         $ngramsStats = StatisticFacade::calculate($ngrams, 'tmi', 2);
39 |         $this->assertEquals(0.1612, round($ngramsStats['know<>something'], 4));
40 |
41 |         //test ll measure
42 |         $ngramsStats = StatisticFacade::calculate($ngrams, 'll', 2);
43 |         $this->assertEquals(13.8516, round($ngramsStats['know<>something'], 4));
44 |
45 |         //test pmi measure
46 |         $ngramsStats = StatisticFacade::calculate($ngrams, 'pmi', 2);
47 |         $this->assertEquals(4.3692, round($ngramsStats['know<>something'], 4));
48 |
49 |         //test dice measure
50 |         $ngramsStats = StatisticFacade::calculate($ngrams, 'dice', 2);
51 |         $this->assertEquals(0.8000, round($ngramsStats['know<>something'], 4));
52 |
53 |         //test x2 measure
54 |         $ngramsStats = StatisticFacade::calculate($ngrams, 'x2', 2);
55 |         $this->assertEquals(40.6444, round($ngramsStats['know<>something'], 4));
56 |
57 |         //test tscore measure
58 |         $ngramsStats = StatisticFacade::calculate($ngrams, 'tscore', 2);
59 |         $this->assertEquals(1.3458, round($ngramsStats['know<>something'], 4));
60 |
61 |         //test phi measure
62 |         $ngramsStats = StatisticFacade::calculate($ngrams, 'phi', 2);
63 |         $this->assertEquals(0.6556, round($ngramsStats['know<>something'], 4));
64 |
65 |         //test odds measure
66 |         $ngramsStats = StatisticFacade::calculate($ngrams, 'odds', 2);
67 |         $this->assertEquals(118.0000, round($ngramsStats['know<>something'], 4));
68 |
69 |         //test leftFisher measure
70 |         $ngramsStats = StatisticFacade::calculate($ngrams, 'leftFisher', 2);
71 |         $this->assertEquals(1.0000, round($ngramsStats['know<>something'], 4));
72 |
73 |         //test rightFisher measure
74 |         $ngramsStats = StatisticFacade::calculate($ngrams, 'rightFisher', 2);
75 |         $this->assertEquals(0.0016, round($ngramsStats['know<>something'], 4));
76 |     }
77 |
78 |     public function testTrigrams()
79 |     {
80 |         $ngrams = NGramFactory::create($this->tokens, 3, '<>');
81 |         $ngrams = NGramFactory::getFreq($ngrams, '<>');
82 |
83 |         //test frequency
84 |         $this->assertEquals([0 => 1, 1 => 4, 2 => 3, 3 => 2, 4 => 1, 5 => 1, 6 => 2], $ngrams['the<>know<>something']);
85 |
86 |         //test tmi measure
87 |         $ngramsStats = StatisticFacade::calculate($ngrams, 'tmi', 3);
88 |         $this->assertEquals(0.2002, round($ngramsStats['the<>know<>something'], 4));
89 |
90 |         //test ll measure
91 |         $ngramsStats = StatisticFacade::calculate($ngrams, 'll', 3);
92 |         $this->assertEquals(16.9283, round($ngramsStats['the<>know<>something'], 4));
93 |     }
94 | }
95 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Stemmers/DictionaryStemmerTest.php:
--------------------------------------------------------------------------------
1 | assertEquals("judge", $stemmer->stem("judges"));
24 | // some times approach does not work
25 | $this->assertEquals('university', $stemmer->stem("universities"));
26 | $this->assertEquals('hammock', $stemmer->stem("hammok"));
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Stemmers/LambdaStemmerTest.php:
--------------------------------------------------------------------------------
1 | assertEquals("tom", $stemmer->stem("tommy"));
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Stemmers/LancasterStemmerTest.php:
--------------------------------------------------------------------------------
1 | assertEquals('maxim', $stemmer->stem('maximum'));
17 | $this->assertEquals('presum', $stemmer->stem('presumably'));
18 | $this->assertEquals('multiply', $stemmer->stem('multiply'));
19 | $this->assertEquals('provid', $stemmer->stem('provision'));
20 | $this->assertEquals('ow', $stemmer->stem('owed'));
21 | $this->assertEquals('ear', $stemmer->stem('ear'));
22 | $this->assertEquals('say', $stemmer->stem('saying'));
23 | $this->assertEquals('cry', $stemmer->stem('crying'));
24 | $this->assertEquals('string', $stemmer->stem('string'));
25 | $this->assertEquals('meant', $stemmer->stem('meant'));
26 | $this->assertEquals('cem', $stemmer->stem('cement'));
27 | $this->assertEquals( null, $stemmer->stem(' '));
28 |
29 | }
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Stemmers/LookupStemmerTest.php:
--------------------------------------------------------------------------------
1 | assertEquals("end", $stemmer->stem("ending"));
21 | $this->assertEquals("end", $stemmer->stem("ended"));
22 |
23 | }
24 |
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Stemmers/MorphStemmerTest.php:
--------------------------------------------------------------------------------
1 | assertEquals('university', $stemmer->stem('universities'));
22 |
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Stemmers/PorterStemmerTest.php:
--------------------------------------------------------------------------------
1 | assertEquals('univers', $stemmer->stem('universities'));
20 | $this->assertEquals('judg',$stemmer->stem('judges'));
21 | }
22 |
23 | public function testSimplifiedStemmer() // Checks the stem() helper on a list of words.
24 | {
25 | $this->assertEquals(['univers','judg'], stem(['universities', 'judges'])); // results are expected in input order
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Stemmers/RegexStemmerTest.php:
--------------------------------------------------------------------------------
1 | assertEquals("car", $stemmer->stem("car"));
19 | $this->assertEquals("mas", $stemmer->stem("mass"));
20 | $this->assertEquals("was", $stemmer->stem("was"));
21 | $this->assertEquals("bee", $stemmer->stem("bee"));
22 | $this->assertEquals("comput", $stemmer->stem("compute"));
23 |
24 | }
25 |
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Stemmers/SnowballStemmerTest.php:
--------------------------------------------------------------------------------
1 | assertEquals("judg", $stemmer->stem("judges"));
18 | $this->assertEquals('ski', $stemmer->stem('skis'));
19 | $this->assertEquals('univers', $stemmer->stem('universities'));
20 | $this->assertEquals('news', $stemmer->stem('news'));
21 | }
22 |
23 | public function testSwedish()
24 | {
25 |     $swedish = new SnowballStemmer('Swedish'); // language chosen by name
26 |     $this->assertEquals("affärschef", $swedish->stem("affärscheferna"));
27 | }
28 |
29 | public function testException()
30 | {
31 |     $this->expectException(\Exception::class);
32 |     new SnowballStemmer('Wookie'); // unsupported language name; the constructor is expected to throw
33 | }
34 | }
--------------------------------------------------------------------------------
/tests/TextAnalysis/Taggers/StanfordNerTaggerTest.php:
--------------------------------------------------------------------------------
1 | expectException('RuntimeException', 'Jar not found not_available.jar');
26 | $tagger->tag([]);
27 | }
28 |
29 | public function testClassiferNotFound()
30 | {
31 |     if (getenv('SKIP_TEST') || !getenv('JAVA_HOME')) {
32 |         $this->markTestSkipped('SKIP_TEST is set or JAVA_HOME is not set.');
33 |     }
34 |     $tagger = new StanfordNerTagger(get_storage_path($this->nerPath).'stanford-ner.jar', "classifier.gz");
35 |     $this->expectException(\RuntimeException::class);
36 |     $this->expectExceptionMessage('Classifier not found classifier.gz'); // expectException() only takes the class; the message needs its own expectation
37 |     $tagger->tag([]);
38 | }
39 |
40 | public function testTempCreatedFile()
41 | {
42 |     // Partial mock: exec() and verify() are stubbed to return null.
43 |     $partial = Mockery::mock('TextAnalysis\Taggers\StanfordNerTagger[exec,verify]', ['bogus.jar', 'bogus.classifier']);
44 |     $partial->shouldAllowMockingProtectedMethods();
45 |     $partial->shouldReceive('exec')->andReturnNull();
46 |     $partial->shouldReceive('verify')->andReturnNull();
47 |
48 |     $tokens = (new WhitespaceTokenizer())->tokenize($this->text);
49 |     $partial->tag($tokens);
50 |
51 |     $this->assertFileExists($partial->getTmpFilePath());
52 |     $this->assertEquals(138, filesize($partial->getTmpFilePath()));
53 | }
54 |
55 | public function testStanfordNer()
56 | {
57 |     if (getenv('SKIP_TEST')) {
58 |         $this->markTestSkipped('SKIP_TEST environment variable is set.');
59 |     }
60 |     // Tags the whitespace-tokenized fixture text with a default-constructed tagger.
61 |     $document = new TokensDocument((new WhitespaceTokenizer())->tokenize($this->text));
62 |     $tagger = new StanfordNerTagger();
63 |     $output = $tagger->tag($document->getDocumentData());
64 |
65 |     $this->assertFileExists($tagger->getTmpFilePath());
66 |     $this->assertEquals(138, filesize($tagger->getTmpFilePath()));
67 |     $this->assertEquals(['Michigan','LOCATION'], $output[15], "Did you set JAVA_HOME env variable?");
68 | }
69 |
70 | }
71 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Taggers/StanfordPosTaggerTest.php:
--------------------------------------------------------------------------------
1 | expectException('RuntimeException', 'Jar not found not_available.jar');
26 | $tagger->tag([]);
27 | }
28 |
29 | public function testClassiferNotFound()
30 | {
31 |     if (getenv('SKIP_TEST') || !getenv('JAVA_HOME')) {
32 |         $this->markTestSkipped('SKIP_TEST is set or JAVA_HOME is not set.');
33 |     }
34 |     $tagger = new StanfordPosTagger(get_storage_path($this->posPath).'stanford-postagger-3.6.0.jar', "classifier.gz");
35 |     $this->expectException(\RuntimeException::class);
36 |     $this->expectExceptionMessage('Classifier not found classifier.gz'); // expectException() only takes the class; the message needs its own expectation
37 |     $tagger->tag([]);
38 | }
39 |
40 | public function testTempCreatedFile()
41 | {
42 |     // Partial mock: exec() and verify() are stubbed to return null.
43 |     $partial = Mockery::mock('TextAnalysis\Taggers\StanfordPosTagger[exec,verify]', ['bogus.jar', 'bogus.classifier']);
44 |     $partial->shouldAllowMockingProtectedMethods();
45 |     $partial->shouldReceive('exec')->andReturnNull();
46 |     $partial->shouldReceive('verify')->andReturnNull();
47 |
48 |     $tokens = (new WhitespaceTokenizer())->tokenize($this->text);
49 |     $partial->tag($tokens);
50 |
51 |     $this->assertFileExists($partial->getTmpFilePath());
52 |     $this->assertEquals(138, filesize($partial->getTmpFilePath()));
53 | }
54 |
55 | public function testStanfordPos()
56 | {
57 |     if (getenv('SKIP_TEST')) {
58 |         $this->markTestSkipped('SKIP_TEST environment variable is set.');
59 |     }
60 |     // Tags the whitespace-tokenized fixture text with a default-constructed tagger.
61 |     $document = new TokensDocument((new WhitespaceTokenizer())->tokenize($this->text));
62 |     $tagger = new StanfordPosTagger();
63 |     $output = $tagger->tag($document->getDocumentData());
64 |
65 |     $this->assertFileExists($tagger->getTmpFilePath());
66 |     $this->assertEquals(138, filesize($tagger->getTmpFilePath()));
67 |     $this->assertEquals(['Michigan','NNP'], $output[15], "Did you set JAVA_HOME env variable?");
68 | }
69 |
70 | }
71 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Tokenizers/FixedLengthTokenizerTest.php:
--------------------------------------------------------------------------------
1 | tokenize("Gabby Abby");
19 | $this->assertCount(1, $tokens);
20 | $this->assertEquals("bby ", end($tokens));
21 |
22 | }
23 |
24 | public function testFixedLengthNoLengthGiven()
25 | {
26 |     // Constructed with 0, the tokenizer is expected to return the whole input as one token.
27 |     $pieces = (new FixedLengthTokenizer(0))->tokenize("Gabby Abby");
28 |     $this->assertCount(1, $pieces);
29 |     $this->assertEquals("Gabby Abby", end($pieces));
30 | }
31 |
32 | }
33 |
34 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Tokenizers/GeneralTokenizerTest.php:
--------------------------------------------------------------------------------
1 | assertCount(4, $tokenizer->tokenize("This has some words"));
18 | }
19 |
20 | public function testLineTokenizer(){
21 |     // Four PHP_EOL-separated segments are expected to give four tokens.
22 |     $lines = implode(PHP_EOL, ["This ", " has", " some", " words"]);
23 |     $this->assertCount(4, (new GeneralTokenizer(PHP_EOL))->tokenize($lines));
24 | }
25 | }
26 |
27 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Tokenizers/PennTreeBankTokenizerTest.php:
--------------------------------------------------------------------------------
1 | tokenize("Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.");
19 | $this->assertCount(16, $tokens);
20 | }
21 |
22 | public function testTokenizer2()
23 | {
24 |     $tokens = (new PennTreeBankTokenizer())->tokenize("They'll save and invest more.");
25 |     $this->assertCount(7, $tokens); // seven tokens are expected for this five-word sentence
26 | }
27 |
28 | public function testTokenizer3()
29 | {
30 |     $tokens = (new PennTreeBankTokenizer())->tokenize("I'm some text");
31 |     $this->assertCount(4, $tokens); // four tokens are expected for this three-word input
32 | }
33 |
34 | public function testAgainstOriginalSedImplementation() // Compares tokenizer output line by line against a reference file.
35 | {
36 | $tokenizer = new PennTreeBankTokenizer();
37 | $tokenized = new \SplFileObject(TEST_DATA_DIR."/Tokenizers/PennTreeBankTokenizerTest/tokenized"); // expected output, one line per sentence
38 | $tokenized->setFlags(\SplFileObject::DROP_NEW_LINE);
39 | $sentences = new \SplFileObject(TEST_DATA_DIR."/Tokenizers/PennTreeBankTokenizerTest/test.txt"); // input sentences
40 | $sentences->setFlags(\SplFileObject::DROP_NEW_LINE);
41 | // The two files are walked in lockstep: line N of test.txt is compared with line N of tokenized.
42 | $tokenized->rewind();
43 | foreach ($sentences as $sentence) {
44 | if ($sentence) // skip empty lines
45 | {
46 | $this->assertEquals(
47 | $tokenized->current(),
48 | implode(" ",$tokenizer->tokenize($sentence)),
49 | "Sentence: '$sentence' was not tokenized correctly"
50 | );
51 | }
52 | $tokenized->next(); // advance even for skipped lines so both files stay aligned
53 | }
54 |
55 | }
56 |
57 | }
58 |
59 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Tokenizers/RegexTokenizerTest.php:
--------------------------------------------------------------------------------
1 | tokenize("Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.");
18 | $this->assertCount(17, $tokens);
19 | }
20 |
21 | public function testMatchWordsOnly()
22 | {
23 |     // The pattern matches runs of ASCII letters only, so "$3.88" and punctuation add no tokens.
24 |     $words = (new RegexTokenizer("/[A-Za-z]+/"))->tokenize("Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.");
25 |     $this->assertCount(13, $words);
26 | }
27 |
28 | }
--------------------------------------------------------------------------------
/tests/TextAnalysis/Tokenizers/SentenceTokenizerTest.php:
--------------------------------------------------------------------------------
1 | assertCount(2, $tokenizer->tokenize("This has some words. Why only 4 words?"));
17 | $this->assertCount(2, $tokenizer->tokenize("My name is Yooper. I like programming!"));
18 | $this->assertCount(2, $tokenizer->tokenize("My name is Yooper!? I like programming!! !!"));
19 | $this->assertCount(3, $tokenizer->tokenize($this->getArticle()));
20 | $this->assertCount(1, $tokenizer->tokenize("The U.S.A. recently dropped out of the T.P.P."));
21 | }
22 |
23 | private function getArticle()
24 | {
25 | return <<tokenize('This is a common Tweet #format where @mentions and.errors!!!!like this:-))))) might #appear❤ ❤☺❤#ThisIsAHashtag!?!');
17 | $this->assertCount(33, $tokens);
18 |
19 | }
20 |
21 | public function testForUrlAndEmail()
22 | {
23 |     $tweet = 'Custom Software Development http://redbeardtechnologies.com/ 906-555-5555 or contact support at support@redbeardtechnologies.com :-)';
24 |     $this->assertCount(11, (new TwitterTokenizer)->tokenize($tweet)); // 11 tokens are expected for this input
25 | }
26 |
27 | public function testContraction()
28 | {
29 |     $sentence = "This shouldn't be broken up";
30 |     $this->assertCount(5, (new TwitterTokenizer)->tokenize($sentence)); // five tokens, so "shouldn't" must stay whole
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Utilities/TextTest.php:
--------------------------------------------------------------------------------
1 | assertCount(6, $substrings);
22 | $this->assertEquals($expected, $substrings);
23 | }
24 |
25 | public function testEndsWith()
26 | {
27 |     foreach (['s', 'es', 'hes'] as $suffix) {
28 |         $this->assertTrue(Text::endsWith('lunches', $suffix));
29 |     }
30 |     $this->assertFalse(Text::endsWith('joe', 'is')); // the suffix does not occur in the word at all
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/tests/TextAnalysis/Utilities/Vowels/EnglishVowelsTest.php:
--------------------------------------------------------------------------------
1 | assertTrue($vowelChecker->isVowel("man", 1));
16 | }
17 |
18 | public function testYIsVowel()
19 | {
20 |     $english = VowelsAbstractFactory::factory("English");
21 |     $this->assertTrue($english->isVowel("try", 2)); // index 2 is the trailing "y"
22 | }
23 | }
24 |
25 |
26 |
--------------------------------------------------------------------------------
/tests/bootstrap.php:
--------------------------------------------------------------------------------
1 | add(new NltkPackageListCommand());
42 | $app->add(new NltkPackageInstallCommand());
43 | $app->add(new StopWordsCommand());
44 | $app->add(new VocabSizeCommand());
45 | $app->add(new NltkPackageInstallAllCommand());
46 |
47 | $app->run();
48 |
--------------------------------------------------------------------------------