├── metanl ├── __init__.py ├── data │ └── freeling │ │ ├── generic_splitter.dat │ │ ├── ident.cfg │ │ ├── cy.cfg │ │ ├── it.cfg │ │ ├── ru.cfg │ │ ├── es.cfg │ │ ├── en.cfg │ │ └── pt.cfg ├── freeling.py ├── token_utils.py ├── nltk_morphy.py ├── extprocess.py └── mecab.py ├── .gitignore ├── MANIFEST.in ├── scripts ├── reformat-leeds-ja.py ├── merge_english.py └── reformat_using_rosette.py ├── LICENSE.txt ├── tests ├── test_nltk_morphy.py ├── test_tokens.py └── test_extprocesses.py ├── setup.py └── README.md /metanl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | build 4 | *.egg-info/ 5 | dist 6 | pip-log.txt 7 | .coverage 8 | *~ 9 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include metanl *.txt 2 | recursive-include metanl *.dat 3 | recursive-include metanl *.cfg 4 | include README.md 5 | -------------------------------------------------------------------------------- /metanl/data/freeling/generic_splitter.dat: -------------------------------------------------------------------------------- 1 | 2 | AllowBetweenMarkers 1 3 | MaxWords 1000 4 | 5 | 6 | " " 7 | ( ) 8 | { } 9 | /* */ 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /scripts/reformat-leeds-ja.py: -------------------------------------------------------------------------------- 1 | from metanl import japanese 2 | from metanl.leeds_corpus_reader import translate_leeds_corpus 3 | 4 | translate_leeds_corpus('../metanl/data/source-data/internet-ja-forms.num', 5 | '../metanl/data/leeds-internet-ja.txt', japanese.normalize) 6 | -------------------------------------------------------------------------------- /metanl/data/freeling/ident.cfg: -------------------------------------------------------------------------------- 1 | ## 2 | #### default configuration file for Spanish analyzer 3 | ## 4 | 5 | #### General options 6 | 7 | Locale=default 8 | 9 | InputFormat=plain 10 | OutputFormat=ident 11 | LangIdentFile=$FREELINGSHARE/common/lang_ident/ident.dat 12 | 13 | ## You can launch the analyzer as a server by default 14 | ## changing the options below 15 | ServerMode=off 16 | #ServerPort=12345 17 | #ServerMaxWorkers=5 18 | #ServerQueueSize=32 19 | -------------------------------------------------------------------------------- /scripts/merge_english.py: -------------------------------------------------------------------------------- 1 | from metanl.wordlist import get_wordlist, merge_lists 2 | 3 | def merge_english(): 4 | books = get_wordlist('en-books') 5 | twitter = get_wordlist('en-twitter') 6 | combined = merge_lists([(books, '', 1e9), (twitter, '', 1e9)]) 7 | combined.save('multi-en.txt') 8 | combined.save_logarithmic('multi-en-logarithmic.txt') 9 | total = sum(combined.worddict.values()) 10 | print "Average frequency:", total / len(combined.worddict) 11 | 12 | if __name__ == '__main__': 13 | merge_english() 14 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2012 Rob Speer (rspeer@mit.edu) 2 | 3 | Permission is hereby granted, free of 
charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /scripts/reformat_using_rosette.py: -------------------------------------------------------------------------------- 1 | from metanl.leeds_corpus_reader import translate_leeds_corpus 2 | import socket, time 3 | 4 | def make_rosette_normalizer(lcode): 5 | from lumi_pipeline.text_readers import get_reader 6 | reader = get_reader('rosette.%s' % lcode) 7 | def normalizer(text): 8 | try: 9 | triples = reader.text_to_token_triples(text) 10 | except socket.error: 11 | time.sleep(1) 12 | print 'backing off' 13 | return normalizer(text) 14 | normalized = u' '.join(lemma.rsplit('|', 1)[0] for lemma, pos, token in triples) 15 | return normalized 16 | return normalizer 17 | 18 | def main(): 19 | for language in ('pt', 'ru', 'es', 'fr', 'it', 'zh', 'de', 'ar'): 20 | print language 21 | translate_leeds_corpus( 22 | '../metanl/data/source-data/internet-%s-forms.num' % language, 23 | '../metanl/data/wordlists/leeds-internet-%s.txt' % language, 24 | make_rosette_normalizer(language) 25 | ) 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /tests/test_nltk_morphy.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from metanl.nltk_morphy import normalize_list, tag_and_stem 4 | from nose.tools import eq_ 5 | 6 | def test_normalize_list(): 7 | # Strip away articles, unless there's only an article 8 | eq_(normalize_list('the dog'), ['dog']) 9 | eq_(normalize_list('the'), ['the']) 10 | 11 | # strip out pluralization 12 | eq_(normalize_list('big dogs'), ['big', 'dog']) 13 | 14 | 15 | def test_tag_and_stem(): 16 | the_big_dogs = [(u'the', 'DT', u'the'), 17 | (u'big', 'JJ', u'big'), 18 | (u'dog', 'NNS', u'dogs')] 19 | eq_(tag_and_stem('the big dogs'), the_big_dogs) 20 | 21 | the_big_hashtag = [(u'the', 'DT', u'the'), 22 | (u'#', 'NN', u'#'), 23 | (u'big', 'JJ', u'big'), 24 | (u'dog', 'NN', u'dog')] 25 | eq_(tag_and_stem('the #big dog'), the_big_hashtag) 26 | 27 | two_sentences = [(u'i', 'PRP', u'I'), 28 | (u'can', 'MD', u'ca'), 29 | (u'not', 'RB', u"n't"), 30 | (u'.', '.', u'.'), 31 | (u'avoid', 'NNP', u'Avoid'), 32 | (u'fragment', 'NNS', u'fragments'), 33 | (u'.', '.', u'.')] 34 | eq_(tag_and_stem("I can't. 
Avoid fragments."), two_sentences) 35 | -------------------------------------------------------------------------------- /tests/test_tokens.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | from metanl.token_utils import (tokenize, untokenize, un_camel_case, 4 | string_pieces) 5 | from nose.tools import eq_ 6 | import nltk 7 | 8 | def test_tokenize(): 9 | # a snippet from Hitchhiker's Guide that just happens to have 10 | # most of the examples of punctuation we're looking for. 11 | # 12 | # TODO: test wacky behavior with "n't" and "cannot" and stuff. 13 | text1 = "Time is an illusion. Lunchtime, doubly so." 14 | text2 = ('"Very deep," said Arthur, "you should send that in to the ' 15 | 'Reader\'s Digest. They\'ve got a page for people like you."') 16 | eq_(tokenize(text1), 17 | ['Time', 'is', 'an', 'illusion', '.', 'Lunchtime', ',', 18 | 'doubly', 'so', '.'] 19 | ) 20 | eq_(untokenize(tokenize(text1)), text1) 21 | if nltk.__version__ >= '3': 22 | eq_(untokenize(tokenize(text2)), text2) 23 | 24 | def test_camel_case(): 25 | eq_(un_camel_case('1984ZXSpectrumGames'), '1984 ZX Spectrum Games') 26 | eq_(un_camel_case('aaAa aaAaA 0aA AAAa!AAA'), 27 | 'aa Aa aa Aa A 0a A AA Aa! AAA') 28 | eq_(un_camel_case('MotörHead'), 29 | 'Mot\xf6r Head') 30 | eq_(un_camel_case('MSWindows3.11ForWorkgroups'), 31 | 'MS Windows 3.11 For Workgroups') 32 | 33 | # This should not significantly affect text that is not camel-cased 34 | eq_(un_camel_case('ACM_Computing_Classification_System'), 35 | 'ACM Computing Classification System') 36 | eq_(un_camel_case('Anne_Blunt,_15th_Baroness_Wentworth'), 37 | 'Anne Blunt, 15th Baroness Wentworth') 38 | eq_(un_camel_case('Hindi-Urdu'), 39 | 'Hindi-Urdu') 40 | 41 | 42 | def test_string_pieces(): 43 | # Break as close to whitespace as possible 44 | text = "12 12 12345 123456 1234567-12345678" 45 | eq_(list(string_pieces(text, 6)), 46 | ['12 12 ', '12345 ', '123456', ' ', '123456', '7-', '123456', '78']) 47 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | version_str = '1.0c' 4 | from setuptools import setup 5 | import sys 6 | 7 | if sys.version_info.major == 2: 8 | nltk_version = 'nltk' 9 | else: 10 | nltk_version = 'nltk >= 3.0a' 11 | 12 | classifiers=[ 13 | 'Intended Audience :: Developers', 14 | 'Intended Audience :: Science/Research', 15 | 'License :: OSI Approved :: MIT License', 16 | 'Natural Language :: English', 17 | 'Operating System :: MacOS', 18 | 'Operating System :: Microsoft :: Windows', 19 | 'Operating System :: POSIX', 20 | 'Operating System :: Unix', 21 | 'Programming Language :: C', 22 | 'Programming Language :: Python :: 2.7', 23 | 'Programming Language :: Python :: 3.3', 24 | 'Topic :: Scientific/Engineering', 25 | 'Topic :: Software Development', 26 | 'Topic :: Text Processing :: Linguistic',] 27 | 28 | import os 29 | README_contents = open(os.path.join(os.path.dirname(__file__), 'README.md')).read() 30 | doclines = README_contents.split("\n") 31 | 32 | 33 | deprecation_warning = """ 34 | 35 | Note: metanl is no longer actively developed or supported. 36 | 37 | metanl was created to support the language-processing needs that ConceptNet 38 | 5 shared with code developed at Luminoso. Those needs have diverged, to the 39 | point where it made the most sense to split the functionality again. 
40 | 41 | A simplified version of metanl has been moved into the `conceptnet5` 42 | package, as `conceptnet5.language`. 43 | 44 | """ 45 | sys.stderr.write(deprecation_warning) 46 | 47 | setup( 48 | name="metanl", 49 | version=version_str, 50 | maintainer='Luminoso Technologies, Inc.', 51 | maintainer_email='dev@luminoso.com', 52 | url='http://github.com/commonsense/metanl/', 53 | license = "MIT", 54 | platforms = ["any"], 55 | description = doclines[0], 56 | classifiers = classifiers, 57 | long_description = "\n".join(doclines[2:]), 58 | packages=['metanl'], 59 | package_data = {'metanl': ['data/freeling/*.cfg', 'data/freeling/*.dat']}, 60 | install_requires=[nltk_version, 'ftfy >= 3'], 61 | ) 62 | -------------------------------------------------------------------------------- /tests/test_extprocesses.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from metanl.freeling import english, spanish 5 | from metanl.mecab import normalize, tag_and_stem 6 | from metanl.extprocess import unicode_is_punctuation 7 | from nose.tools import eq_ 8 | 9 | 10 | def test_english(): 11 | test_text = "This is a test.\n\nIt has two paragraphs, and that's okay." 12 | expected_result = [('this', 'DT', 'This'), ('be', 'VBZ', 'is'), 13 | ('a', 'DT', 'a'), ('test', 'NN', 'test'), 14 | ('.', '.', '.'), ('it', 'PRP', 'It'), 15 | ('have', 'VBZ', 'has'), ('two', 'DT', 'two'), 16 | ('paragraph', 'NNS', 'paragraphs'), (',', '.', ','), 17 | ('and', 'CC', 'and'), ('that', 'PRP', 'that'), 18 | ('be', 'VBZ', "'s"), ('okay', 'JJ', 'okay'), 19 | ('.', '.', '.')] 20 | eq_(english.tag_and_stem(test_text), expected_result) 21 | 22 | test_text = "this has\ntwo lines" 23 | expected_result = [('this', 'DT', 'this'), ('have', 'VBZ', 'has'), 24 | ('two', 'DT', 'two'), ('line', 'NNS', 'lines')] 25 | eq_(english.tag_and_stem(test_text), expected_result) 26 | 27 | 28 | def test_spanish(): 29 | # Spanish works, even with a lot of unicode characters 30 | test_text = '¿Dónde está mi búfalo?' 31 | expected_result = [('¿', '.', '¿'), 32 | ('dónde', 'P', 'Dónde'), 33 | ('estar', 'V', 'está'), 34 | ('mi', 'D', 'mi'), 35 | ('búfalo', 'N', 'búfalo'), 36 | ('?', '.', '?')] 37 | eq_(spanish.tag_and_stem(test_text), expected_result) 38 | 39 | 40 | def test_japanese(): 41 | eq_(normalize('これはテストです'), 'テスト') 42 | this_is_a_test = [('これ', '~名詞', 'これ'), 43 | ('は', '~助詞', 'は'), 44 | ('テスト', '名詞', 'テスト'), 45 | ('です', '~助動詞', 'です'), 46 | ('。', '.', '。')] 47 | eq_(tag_and_stem('これはテストです。'), this_is_a_test) 48 | 49 | 50 | def test_unicode_is_punctuation(): 51 | assert unicode_is_punctuation('word') is False 52 | assert unicode_is_punctuation('。') is True 53 | assert unicode_is_punctuation('-') is True 54 | assert unicode_is_punctuation('-3') is False 55 | assert unicode_is_punctuation('あ') is False 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Multilingual natural language tools, wrapping NLTK and other systems. 2 | 3 | ## Deprecated as of June 2014 4 | 5 | `metanl` is no longer actively developed or supported. 6 | 7 | This package was created to support the language-processing needs that 8 | [ConceptNet 5](http://conceptnet5.media.mit.edu) shared with code developed at 9 | Luminoso. Those needs have diverged, to the point where it made the most sense 10 | to split the functionality again. 
11 | 12 | A simplified version of metanl has been moved into the `conceptnet5` 13 | package, as `conceptnet5.language`. 14 | 15 | 16 | ## metanl.token_utils 17 | 18 | Utilities for working with tokens: 19 | 20 | - `tokenize` splits strings into tokens, using NLTK. 21 | - `untokenize` rejoins tokens into a correctly-spaced string, using ad-hoc 22 | rules that aim to invert what NLTK does. 23 | - `un_camel_case` splits a CamelCased string into tokens. 24 | 25 | These functions make assumptions that work best in English, and work reasonably 26 | in other Western languages, and fail utterly in languages that don't use 27 | spaces. 28 | 29 | 30 | ## metanl.nltk_morphy 31 | 32 | `nltk_morphy` is a lemmatizer (a stemmer with principles). It enables you to 33 | reduce words to their root form in English, using the Morphy algorithm that's 34 | built into WordNet, together with NLTK's part of speech tagger. 35 | 36 | Morphy works best with a known part of speech. In fact, the way it works in 37 | NLTK is pretty bad if you don't specify the part of speech. The `nltk_morphy` 38 | wrapper provides: 39 | 40 | - An alignment between the POS tags that `nltk.pos_tag` outputs, and the input 41 | that Morphy expects 42 | - A strategy for tagging words whose part of speech is unknown 43 | - A small list of exceptions, for cases where Morphy returns an unintuitive 44 | or wrong result 45 | 46 | ## metanl.extprocess 47 | 48 | Sometimes, the best available NLP tools are written in some other language 49 | besides Python. They may not provide a reasonable foreign function interface. 50 | What they do often provide is a command-line utility. 51 | 52 | `metanl.extprocess` provides abstractions over utilities that take in natural 53 | language, and output a token-by-token analysis. This is used by two other 54 | modules in `metanl`. 55 | 56 | ### metanl.freeling 57 | 58 | FreeLing is an NLP tool that can analyze many European languages, including 59 | English, Spanish, Italian, Portuguese, Welsh, and Russian. This module 60 | allows you to run FreeLing in a separate process, and use its analysis 61 | results in Python. 62 | 63 | ### metanl.mecab 64 | 65 | In Japanese, NLP analyzers are particularly important, because without one 66 | you don't even know where to split words. 67 | 68 | MeCab is the most commonly used analyzer for Japanese text. This module runs 69 | MeCab in an external process, allowing you to get its complete analysis 70 | results, or just use it to tokenize or lemmatize text. 71 | 72 | As part of MeCab's operation, it outputs the phonetic spellings of the words 73 | it finds, in kana. We use this to provide a wrapper function that can 74 | romanize any Japanese text. 75 | 76 | -------------------------------------------------------------------------------- /metanl/data/freeling/cy.cfg: -------------------------------------------------------------------------------- 1 | ## 2 | #### default configuration file for Welsch analyzer 3 | ## 4 | 5 | #### General options 6 | Lang=en 7 | Locale=default 8 | 9 | #### Trace options. 
Only effective if we have compiled with -DVERBOSE 10 | ## 11 | #### Possible values for TraceModules (may be OR'ed) 12 | #define SPLIT_TRACE 0x00000001 13 | #define TOKEN_TRACE 0x00000002 14 | #define MACO_TRACE 0x00000004 15 | #define OPTIONS_TRACE 0x00000008 16 | #define NUMBERS_TRACE 0x00000010 17 | #define DATES_TRACE 0x00000020 18 | #define PUNCT_TRACE 0x00000040 19 | #define DICT_TRACE 0x00000080 20 | #define SUFF_TRACE 0x00000100 21 | #define LOCUT_TRACE 0x00000200 22 | #define NP_TRACE 0x00000400 23 | #define PROB_TRACE 0x00000800 24 | #define QUANT_TRACE 0x00001000 25 | #define NEC_TRACE 0x00002000 26 | #define AUTOMAT_TRACE 0x00004000 27 | #define TAGGER_TRACE 0x00008000 28 | #define HMM_TRACE 0x00010000 29 | #define RELAX_TRACE 0x00020000 30 | #define RELAX_TAGGER_TRACE 0x00040000 31 | #define CONST_GRAMMAR_TRACE 0x00080000 32 | #define SENSES_TRACE 0x00100000 33 | #define CHART_TRACE 0x00200000 34 | #define GRAMMAR_TRACE 0x00400000 35 | #define DEP_TRACE 0x00800000 36 | #define UTIL_TRACE 0x01000000 37 | 38 | TraceLevel=3 39 | TraceModule=0x0000 40 | 41 | ## Options to control the applied modules. The input may be partially 42 | ## processed, or not a full analysis may me wanted. The specific 43 | ## formats are a choice of the main program using the library, as well 44 | ## as the responsability of calling only the required modules. 45 | ## Valid input/output formats are: plain, token, splitted, morfo, tagged, parsed 46 | InputFormat=plain 47 | OutputFormat=tagged 48 | 49 | # consider each newline as a sentence end 50 | AlwaysFlush=yes 51 | 52 | #### Tokenizer options 53 | TokenizerFile=$FREELINGSHARE/cy/tokenizer.dat 54 | 55 | #### Splitter options 56 | SplitterFile=$FREELINGSHARE/cy/splitter.dat 57 | 58 | #### Morfo options 59 | AffixAnalysis=yes 60 | MultiwordsDetection=no 61 | NumbersDetection=no 62 | PunctuationDetection=yes 63 | DatesDetection=no 64 | QuantitiesDetection=no 65 | DictionarySearch=yes 66 | ProbabilityAssignment=yes 67 | OrthographicCorrection=no 68 | DecimalPoint=. 69 | ThousandPoint=, 70 | LocutionsFile=$FREELINGSHARE/cy/locucions.dat 71 | QuantitiesFile=$FREELINGSHARE/cy/quantities.dat 72 | AffixFile=$FREELINGSHARE/cy/afixos.dat 73 | ProbabilityFile=$FREELINGSHARE/cy/probabilitats.dat 74 | DictionaryFile=$FREELINGSHARE/cy/dicc.src 75 | PunctuationFile=$FREELINGSHARE/common/punct.dat 76 | ProbabilityThreshold=0.001 77 | # NER options 78 | NERecognition=no 79 | NPDataFile=$FREELINGSHARE/cy/np.dat 80 | 81 | #Spelling Corrector config file 82 | CorrectorFile=$FREELINGSHARE/cy/corrector/corrector.dat 83 | 84 | ## Phonetic encoding of words. 
85 | Phonetics=no 86 | PhoneticsFile=$FREELINGSHARE/cy/phonetics.dat 87 | 88 | ## NEC options 89 | NEClassification=no 90 | NECFile=$FREELINGSHARE/cy/nec/nec-ab.dat 91 | 92 | ## Sense annotation options (none,all,mfs) 93 | SenseAnnotation=none 94 | SenseConfigFile=$FREELINGSHARE/cy/senses.dat 95 | UKBConfigFile=$FREELINGSHARE/cy/ukb.dat 96 | 97 | #### Tagger options 98 | Tagger=hmm 99 | TaggerHMMFile=$FREELINGSHARE/cy/tagger.dat 100 | TaggerRelaxFile=$FREELINGSHARE/cy/constr_gram.dat 101 | TaggerRelaxMaxIter=500 102 | TaggerRelaxScaleFactor=670.0 103 | TaggerRelaxEpsilon=0.001 104 | TaggerRetokenize=no 105 | TaggerForceSelect=none 106 | 107 | #### Parser options 108 | GrammarFile=$FREELINGSHARE/cy/chunker/grammar-chunk.dat 109 | 110 | #### Dependence Parser options 111 | DepTxalaFile=$FREELINGSHARE/cy/dep/dependences.dat 112 | 113 | #### Coreference Solver options 114 | CoreferenceResolution=no 115 | CorefFile=$FREELINGSHARE/cy/coref/coref.dat 116 | -------------------------------------------------------------------------------- /metanl/data/freeling/it.cfg: -------------------------------------------------------------------------------- 1 | ## 2 | #### default configuration file for Italian analyzer 3 | ## 4 | 5 | #### General options 6 | Lang=it 7 | Locale=default 8 | 9 | #### Trace options. Only effective if we have compiled with -DVERBOSE 10 | # 11 | ## Possible values for TraceModule (may be OR'ed) 12 | #define SPLIT_TRACE 0x00000001 13 | #define TOKEN_TRACE 0x00000002 14 | #define MACO_TRACE 0x00000004 15 | #define OPTIONS_TRACE 0x00000008 16 | #define NUMBERS_TRACE 0x00000010 17 | #define DATES_TRACE 0x00000020 18 | #define PUNCT_TRACE 0x00000040 19 | #define DICT_TRACE 0x00000080 20 | #define SUFF_TRACE 0x00000100 21 | #define LOCUT_TRACE 0x00000200 22 | #define NP_TRACE 0x00000400 23 | #define PROB_TRACE 0x00000800 24 | #define QUANT_TRACE 0x00001000 25 | #define NEC_TRACE 0x00002000 26 | #define AUTOMAT_TRACE 0x00004000 27 | #define TAGGER_TRACE 0x00008000 28 | #define HMM_TRACE 0x00010000 29 | #define RELAX_TRACE 0x00020000 30 | #define RELAX_TAGGER_TRACE 0x00040000 31 | #define CONST_GRAMMAR_TRACE 0x00080000 32 | #define SENSES_TRACE 0x00100000 33 | #define CHART_TRACE 0x00200000 34 | #define GRAMMAR_TRACE 0x00400000 35 | #define DEP_TRACE 0x00800000 36 | #define UTIL_TRACE 0x01000000 37 | 38 | TraceLevel=3 39 | TraceModule=0x0000 40 | 41 | ## Options to control the applied modules. The input may be partially 42 | ## processed, or not a full analysis may me wanted. The specific 43 | ## formats are a choice of the main program using the library, as well 44 | ## as the responsability of calling only the required modules. 45 | ## Valid input/output formats are: plain, token, splitted, morfo, tagged, parsed 46 | InputFormat=plain 47 | OutputFormat=tagged 48 | 49 | # consider each newline as a sentence end 50 | AlwaysFlush=yes 51 | 52 | #### Tokenizer options 53 | TokenizerFile=$FREELINGSHARE/it/tokenizer.dat 54 | 55 | #### Splitter options 56 | SplitterFile=$FREELINGSHARE/it/splitter.dat 57 | 58 | #### Morfo options 59 | AffixAnalysis=yes 60 | MultiwordsDetection=no 61 | NumbersDetection=no 62 | PunctuationDetection=yes 63 | DatesDetection=no 64 | QuantitiesDetection=no 65 | DictionarySearch=yes 66 | ProbabilityAssignment=yes 67 | OrthographicCorrection=no 68 | DecimalPoint=, 69 | ThousandPoint=. 
70 | LocutionsFile=$FREELINGSHARE/it/locucions.dat 71 | QuantitiesFile=$FREELINGSHARE/common/quantities_default.dat 72 | AffixFile=$FREELINGSHARE/it/afixos.dat 73 | ProbabilityFile=$FREELINGSHARE/it/probabilitats.dat 74 | NPDataFile=$FREELINGSHARE/it/np.dat 75 | PunctuationFile=$FREELINGSHARE/common/punct.dat 76 | ProbabilityThreshold=0.001 77 | # NER options 78 | NERecognition=no 79 | DictionaryFile=$FREELINGSHARE/it/dicc.src 80 | 81 | #Spelling Corrector config file 82 | CorrectorFile=$FREELINGSHARE/it/corrector/corrector.dat 83 | 84 | ## Phonetic encoding of words. 85 | Phonetics=no 86 | PhoneticsFile=$FREELINGSHARE/it/phonetics.dat 87 | 88 | ## NEC options 89 | NEClassification=no 90 | NECFile=$FREELINGSHARE/it/nec/nec-ab.dat 91 | 92 | ## Sense annotation options (none,all,mfs) 93 | SenseAnnotation=none 94 | SenseConfigFile=$FREELINGSHARE/it/senses.dat 95 | UKBConfigFile=$FREELINGSHARE/it/ukb.dat 96 | 97 | #### Tagger options 98 | Tagger=hmm 99 | TaggerHMMFile=$FREELINGSHARE/it/tagger.dat 100 | TaggerRelaxFile=$FREELINGSHARE/it/constr_gram.dat 101 | TaggerRelaxMaxIter=500 102 | TaggerRelaxScaleFactor=670.0 103 | TaggerRelaxEpsilon=0.001 104 | TaggerRetokenize=yes 105 | TaggerForceSelect=tagger 106 | 107 | #### Parser options 108 | GrammarFile=$FREELINGSHARE/it/chunker/grammar-chunk.dat 109 | 110 | #### Dependence Parser options 111 | DepTxalaFile=$FREELINGSHARE/it/dep/dependences.dat 112 | 113 | #### Coreference Solver options 114 | CoreferenceResolution=no 115 | CorefFile=$FREELINGSHARE/it/coref/coref.dat 116 | -------------------------------------------------------------------------------- /metanl/data/freeling/ru.cfg: -------------------------------------------------------------------------------- 1 | ## 2 | #### default configuration file for English analyzer 3 | ## 4 | 5 | #### General options 6 | Lang=ru 7 | Locale=default 8 | 9 | #### Trace options. Only effective if we have compiled with -DVERBOSE 10 | ## 11 | #### Possible values for TraceModules (may be OR'ed) 12 | #define SPLIT_TRACE 0x00000001 13 | #define TOKEN_TRACE 0x00000002 14 | #define MACO_TRACE 0x00000004 15 | #define OPTIONS_TRACE 0x00000008 16 | #define NUMBERS_TRACE 0x00000010 17 | #define DATES_TRACE 0x00000020 18 | #define PUNCT_TRACE 0x00000040 19 | #define DICT_TRACE 0x00000080 20 | #define SUFF_TRACE 0x00000100 21 | #define LOCUT_TRACE 0x00000200 22 | #define NP_TRACE 0x00000400 23 | #define PROB_TRACE 0x00000800 24 | #define QUANT_TRACE 0x00001000 25 | #define NEC_TRACE 0x00002000 26 | #define AUTOMAT_TRACE 0x00004000 27 | #define TAGGER_TRACE 0x00008000 28 | #define HMM_TRACE 0x00010000 29 | #define RELAX_TRACE 0x00020000 30 | #define RELAX_TAGGER_TRACE 0x00040000 31 | #define CONST_GRAMMAR_TRACE 0x00080000 32 | #define SENSES_TRACE 0x00100000 33 | #define CHART_TRACE 0x00200000 34 | #define GRAMMAR_TRACE 0x00400000 35 | #define DEP_TRACE 0x00800000 36 | #define UTIL_TRACE 0x01000000 37 | 38 | TraceLevel=3 39 | TraceModule=0x0000 40 | 41 | ## Options to control the applied modules. The input may be partially 42 | ## processed, or not a full analysis may me wanted. The specific 43 | ## formats are a choice of the main program using the library, as well 44 | ## as the responsability of calling only the required modules. 
45 | ## Valid input/output formats are: plain, token, splitted, morfo, tagged, parsed 46 | InputFormat=plain 47 | OutputFormat=tagged 48 | 49 | # consider each newline as a sentence end 50 | AlwaysFlush=yes 51 | 52 | #### Tokenizer options 53 | TokenizerFile=$FREELINGSHARE/ru/tokenizer.dat 54 | 55 | #### Splitter options 56 | SplitterFile=$FREELINGSHARE/ru/splitter.dat 57 | 58 | #### Morfo options 59 | AffixAnalysis=no 60 | MultiwordsDetection=no 61 | NumbersDetection=no 62 | PunctuationDetection=no 63 | DatesDetection=no 64 | QuantitiesDetection=no 65 | DictionarySearch=yes 66 | ProbabilityAssignment=yes 67 | OrthographicCorrection=no 68 | DecimalPoint=. 69 | ThousandPoint=, 70 | LocutionsFile=$FREELINGSHARE/ru/locucions.dat 71 | QuantitiesFile=$FREELINGSHARE/common/quantities_default.dat 72 | AffixFile=$FREELINGSHARE/ru/afixos.dat 73 | ProbabilityFile=$FREELINGSHARE/ru/probabilitats.dat 74 | DictionaryFile=$FREELINGSHARE/ru/dicc.src 75 | PunctuationFile=$FREELINGSHARE/common/punct.dat 76 | ProbabilityThreshold=0.001 77 | # NER options 78 | NERecognition=no 79 | NPDataFile=$FREELINGSHARE/ru/np.dat 80 | 81 | #Spelling Corrector config file 82 | CorrectorFile=$FREELINGSHARE/ru/corrector/corrector.dat 83 | 84 | ## Phonetic encoding of words. 85 | Phonetics=no 86 | PhoneticsFile=$FREELINGSHARE/ru/phonetics.dat 87 | 88 | ## NEC options 89 | NEClassification=no 90 | NECFile=$FREELINGSHARE/ru/nec/nec-ab.dat 91 | 92 | ## Sense annotation options (none,all,mfs,ukb) 93 | SenseAnnotation=none 94 | SenseConfigFile=$FREELINGSHARE/ru/senses.dat 95 | UKBConfigFile=$FREELINGSHARE/ru/ukb.dat 96 | 97 | #### Tagger options 98 | #Tagger=relax 99 | Tagger=hmm 100 | TaggerHMMFile=$FREELINGSHARE/ru/tagger.dat 101 | TaggerRelaxFile=$FREELINGSHARE/ru/constr_gram.dat 102 | TaggerRelaxMaxIter=500 103 | TaggerRelaxScaleFactor=670.0 104 | TaggerRelaxEpsilon=0.001 105 | TaggerRetokenize=yes 106 | TaggerForceSelect=tagger 107 | 108 | #### Parser options 109 | GrammarFile=$FREELINGSHARE/ru/chunker/grammar-chunk.dat 110 | 111 | #### Dependence Parser options 112 | DepTxalaFile=$FREELINGSHARE/ru/dep/dependences.dat 113 | 114 | #### Coreference Solver options 115 | CoreferenceResolution=no 116 | CorefFile=$FREELINGSHARE/ru/coref/coref.dat 117 | -------------------------------------------------------------------------------- /metanl/data/freeling/es.cfg: -------------------------------------------------------------------------------- 1 | ## 2 | #### default configuration file for Spanish analyzer 3 | ## 4 | 5 | #### General options 6 | Lang=es 7 | Locale=default 8 | 9 | #### Trace options. 
Only effective if we have compiled with -DVERBOSE 10 | # 11 | ## Possible values for TraceModule (may be OR'ed) 12 | #define SPLIT_TRACE 0x00000001 13 | #define TOKEN_TRACE 0x00000002 14 | #define MACO_TRACE 0x00000004 15 | #define OPTIONS_TRACE 0x00000008 16 | #define NUMBERS_TRACE 0x00000010 17 | #define DATES_TRACE 0x00000020 18 | #define PUNCT_TRACE 0x00000040 19 | #define DICT_TRACE 0x00000080 20 | #define SUFF_TRACE 0x00000100 21 | #define LOCUT_TRACE 0x00000200 22 | #define NP_TRACE 0x00000400 23 | #define PROB_TRACE 0x00000800 24 | #define QUANT_TRACE 0x00001000 25 | #define NEC_TRACE 0x00002000 26 | #define AUTOMAT_TRACE 0x00004000 27 | #define TAGGER_TRACE 0x00008000 28 | #define HMM_TRACE 0x00010000 29 | #define RELAX_TRACE 0x00020000 30 | #define RELAX_TAGGER_TRACE 0x00040000 31 | #define CONST_GRAMMAR_TRACE 0x00080000 32 | #define SENSES_TRACE 0x00100000 33 | #define CHART_TRACE 0x00200000 34 | #define GRAMMAR_TRACE 0x00400000 35 | #define DEP_TRACE 0x00800000 36 | #define UTIL_TRACE 0x01000000 37 | 38 | TraceLevel=3 39 | TraceModule=0x0000 40 | 41 | ## Options to control the applied modules. The input may be partially 42 | ## processed, or not a full analysis may me wanted. The specific 43 | ## formats are a choice of the main program using the library, as well 44 | ## as the responsability of calling only the required modules. 45 | ## Valid input/output formats are: plain, token, splitted, morfo, tagged, parsed 46 | InputFormat=plain 47 | OutputFormat=tagged 48 | 49 | # consider each newline as a sentence end 50 | AlwaysFlush=yes 51 | 52 | #### Tokenizer options 53 | TokenizerFile=$FREELINGSHARE/es/tokenizer.dat 54 | 55 | #### Splitter options 56 | SplitterFile=$FREELINGSHARE/es/splitter.dat 57 | 58 | #### Morfo options 59 | AffixAnalysis=yes 60 | MultiwordsDetection=no 61 | NumbersDetection=no 62 | PunctuationDetection=yes 63 | DatesDetection=no 64 | QuantitiesDetection=no 65 | DictionarySearch=yes 66 | ProbabilityAssignment=yes 67 | OrthographicCorrection=no 68 | DecimalPoint=, 69 | ThousandPoint=. 70 | LocutionsFile=$FREELINGSHARE/es/locucions.dat 71 | QuantitiesFile=$FREELINGSHARE/es/quantities.dat 72 | AffixFile=$FREELINGSHARE/es/afixos.dat 73 | ProbabilityFile=$FREELINGSHARE/es/probabilitats.dat 74 | DictionaryFile=$FREELINGSHARE/es/dicc.src 75 | PunctuationFile=$FREELINGSHARE/common/punct.dat 76 | ProbabilityThreshold=0.001 77 | 78 | # NER options 79 | NERecognition=no 80 | NPDataFile=$FREELINGSHARE/es/np.dat 81 | ## comment line above and uncomment that below, if you want 82 | ## a better NE recognizer (higer accuracy, lower speed) 83 | #NPDataFile=$FREELINGSHARE/es/ner/ner-ab.dat 84 | 85 | #Spelling Corrector config file 86 | CorrectorFile=$FREELINGSHARE/es/corrector/corrector.dat 87 | 88 | ## Phonetic encoding of words. 
89 | Phonetics=no 90 | PhoneticsFile=$FREELINGSHARE/es/phonetics.dat 91 | 92 | ## NEC options 93 | NEClassification=no 94 | NECFile=$FREELINGSHARE/es/nec/nec-svm.dat 95 | 96 | ## Sense annotation options (none,all,mfs,ukb) 97 | SenseAnnotation=none 98 | SenseConfigFile=$FREELINGSHARE/es/senses.dat 99 | UKBConfigFile=$FREELINGSHARE/es/ukb.dat 100 | 101 | #### Tagger options 102 | Tagger=hmm 103 | TaggerHMMFile=$FREELINGSHARE/es/tagger.dat 104 | TaggerRelaxFile=$FREELINGSHARE/es/constr_gram-B.dat 105 | TaggerRelaxMaxIter=500 106 | TaggerRelaxScaleFactor=670.0 107 | TaggerRelaxEpsilon=0.001 108 | TaggerRetokenize=yes 109 | TaggerForceSelect=tagger 110 | 111 | #### Parser options 112 | GrammarFile=$FREELINGSHARE/es/chunker/grammar-chunk.dat 113 | 114 | #### Dependence Parser options 115 | DepTxalaFile=$FREELINGSHARE/es/dep/dependences.dat 116 | 117 | #### Coreference Solver options 118 | CoreferenceResolution=no 119 | CorefFile=$FREELINGSHARE/es/coref/coref.dat 120 | -------------------------------------------------------------------------------- /metanl/data/freeling/en.cfg: -------------------------------------------------------------------------------- 1 | ## 2 | #### default configuration file for English analyzer 3 | ## 4 | 5 | #### General options 6 | Lang=en 7 | Locale=default 8 | 9 | #### Trace options. Only effective if we have compiled with -DVERBOSE 10 | ## 11 | #### Possible values for TraceModules (may be OR'ed) 12 | #define SPLIT_TRACE 0x00000001 13 | #define TOKEN_TRACE 0x00000002 14 | #define MACO_TRACE 0x00000004 15 | #define OPTIONS_TRACE 0x00000008 16 | #define NUMBERS_TRACE 0x00000010 17 | #define DATES_TRACE 0x00000020 18 | #define PUNCT_TRACE 0x00000040 19 | #define DICT_TRACE 0x00000080 20 | #define SUFF_TRACE 0x00000100 21 | #define LOCUT_TRACE 0x00000200 22 | #define NP_TRACE 0x00000400 23 | #define PROB_TRACE 0x00000800 24 | #define QUANT_TRACE 0x00001000 25 | #define NEC_TRACE 0x00002000 26 | #define AUTOMAT_TRACE 0x00004000 27 | #define TAGGER_TRACE 0x00008000 28 | #define HMM_TRACE 0x00010000 29 | #define RELAX_TRACE 0x00020000 30 | #define RELAX_TAGGER_TRACE 0x00040000 31 | #define CONST_GRAMMAR_TRACE 0x00080000 32 | #define SENSES_TRACE 0x00100000 33 | #define CHART_TRACE 0x00200000 34 | #define GRAMMAR_TRACE 0x00400000 35 | #define DEP_TRACE 0x00800000 36 | #define UTIL_TRACE 0x01000000 37 | 38 | TraceLevel=3 39 | TraceModule=0x0000 40 | 41 | ## Options to control the applied modules. The input may be partially 42 | ## processed, or not a full analysis may me wanted. The specific 43 | ## formats are a choice of the main program using the library, as well 44 | ## as the responsability of calling only the required modules. 45 | ## Valid input/output formats are: plain, token, splitted, morfo, tagged, parsed 46 | InputFormat=plain 47 | OutputFormat=tagged 48 | 49 | # consider each newline as a sentence end 50 | AlwaysFlush=yes 51 | 52 | #### Tokenizer options 53 | TokenizerFile=$FREELINGSHARE/en/tokenizer.dat 54 | 55 | #### Splitter options 56 | SplitterFile=$FREELINGSHARE/en/splitter.dat 57 | 58 | #### Morfo options 59 | AffixAnalysis=yes 60 | MultiwordsDetection=no 61 | NumbersDetection=no 62 | PunctuationDetection=yes 63 | DatesDetection=no 64 | QuantitiesDetection=no 65 | DictionarySearch=yes 66 | ProbabilityAssignment=yes 67 | OrthographicCorrection=no 68 | DecimalPoint=. 
69 | ThousandPoint=, 70 | LocutionsFile=$FREELINGSHARE/en/locucions.dat 71 | QuantitiesFile=$FREELINGSHARE/en/quantities.dat 72 | AffixFile=$FREELINGSHARE/en/afixos.dat 73 | ProbabilityFile=$FREELINGSHARE/en/probabilitats.dat 74 | DictionaryFile=$FREELINGSHARE/en/dicc.src 75 | PunctuationFile=$FREELINGSHARE/common/punct.dat 76 | ProbabilityThreshold=0.001 77 | 78 | # NER options 79 | NERecognition=no 80 | NPDataFile=$FREELINGSHARE/en/np.dat 81 | ## --- comment lines above and uncomment those below, if you want 82 | ## --- a better NE recognizer (higer accuracy, lower speed) 83 | #NPDataFile=$FREELINGSHARE/en/ner/ner-ab.dat 84 | 85 | #Spelling Corrector config file 86 | CorrectorFile=$FREELINGSHARE/en/corrector/corrector.dat 87 | 88 | ## Phonetic encoding of words. 89 | Phonetics=no 90 | PhoneticsFile=$FREELINGSHARE/en/phonetics.dat 91 | 92 | ## NEC options 93 | NEClassification=no 94 | NECFile=$FREELINGSHARE/en/nec/nec-svm.dat 95 | 96 | ## Sense annotation options (none,all,mfs,ukb) 97 | SenseAnnotation=none 98 | SenseConfigFile=$FREELINGSHARE/en/senses.dat 99 | UKBConfigFile=$FREELINGSHARE/en/ukb.dat 100 | 101 | #### Tagger options 102 | #Tagger=relax 103 | Tagger=hmm 104 | TaggerHMMFile=$FREELINGSHARE/en/tagger.dat 105 | TaggerRelaxFile=$FREELINGSHARE/en/constr_gram-B.dat 106 | TaggerRelaxMaxIter=500 107 | TaggerRelaxScaleFactor=670.0 108 | TaggerRelaxEpsilon=0.001 109 | TaggerRetokenize=yes 110 | TaggerForceSelect=tagger 111 | 112 | #### Parser options 113 | GrammarFile=$FREELINGSHARE/en/chunker/grammar-chunk.dat 114 | 115 | #### Dependence Parser options 116 | DepTxalaFile=$FREELINGSHARE/en/dep/dependences.dat 117 | 118 | #### Coreference Solver options 119 | CoreferenceResolution=no 120 | CorefFile=$FREELINGSHARE/en/coref/coref.dat 121 | -------------------------------------------------------------------------------- /metanl/data/freeling/pt.cfg: -------------------------------------------------------------------------------- 1 | ## 2 | #### default configuration file for Portuguese analyzer 3 | ## 4 | 5 | #### General options 6 | Lang=pt 7 | Locale=default 8 | 9 | #### Trace options. Only effective if we have compiled with -DVERBOSE 10 | # 11 | ## Possible values for TraceModule (may be OR'ed) 12 | #define SPLIT_TRACE 0x00000001 13 | #define TOKEN_TRACE 0x00000002 14 | #define MACO_TRACE 0x00000004 15 | #define OPTIONS_TRACE 0x00000008 16 | #define NUMBERS_TRACE 0x00000010 17 | #define DATES_TRACE 0x00000020 18 | #define PUNCT_TRACE 0x00000040 19 | #define DICT_TRACE 0x00000080 20 | #define SUFF_TRACE 0x00000100 21 | #define LOCUT_TRACE 0x00000200 22 | #define NP_TRACE 0x00000400 23 | #define PROB_TRACE 0x00000800 24 | #define QUANT_TRACE 0x00001000 25 | #define NEC_TRACE 0x00002000 26 | #define AUTOMAT_TRACE 0x00004000 27 | #define TAGGER_TRACE 0x00008000 28 | #define HMM_TRACE 0x00010000 29 | #define RELAX_TRACE 0x00020000 30 | #define RELAX_TAGGER_TRACE 0x00040000 31 | #define CONST_GRAMMAR_TRACE 0x00080000 32 | #define SENSES_TRACE 0x00100000 33 | #define CHART_TRACE 0x00200000 34 | #define GRAMMAR_TRACE 0x00400000 35 | #define DEP_TRACE 0x00800000 36 | #define UTIL_TRACE 0x01000000 37 | 38 | TraceLevel=3 39 | TraceModule=0x0000 40 | 41 | ## Options to control the applied modules. The input may be partially 42 | ## processed, or not a full analysis may me wanted. The specific 43 | ## formats are a choice of the main program using the library, as well 44 | ## as the responsability of calling only the required modules. 
45 | ## Valid input/output formats are: plain, token, splitted, morfo, tagged, parsed 46 | InputFormat=plain 47 | OutputFormat=tagged 48 | 49 | # consider each newline as a sentence end 50 | AlwaysFlush=yes 51 | 52 | #### Tokenizer options 53 | TokenizerFile=$FREELINGSHARE/pt/tokenizer.dat 54 | 55 | #### Splitter options 56 | SplitterFile=$FREELINGSHARE/pt/splitter.dat 57 | RetokContractions=false 58 | 59 | #### Morfo options 60 | AffixAnalysis=yes 61 | MultiwordsDetection=no 62 | NumbersDetection=no 63 | PunctuationDetection=yes 64 | DatesDetection=no 65 | QuantitiesDetection=no 66 | DictionarySearch=yes 67 | ProbabilityAssignment=yes 68 | OrthographicCorrection=no 69 | DecimalPoint=, 70 | ThousandPoint=. 71 | LocutionsFile=$FREELINGSHARE/pt/locucions.dat 72 | QuantitiesFile=$FREELINGSHARE/common/quantities_default.dat 73 | AffixFile=$FREELINGSHARE/pt/afixos.dat 74 | ProbabilityFile=$FREELINGSHARE/pt/probabilitats.dat 75 | DictionaryFile=$FREELINGSHARE/pt/dicc.src 76 | PunctuationFile=$FREELINGSHARE/common/punct.dat 77 | ProbabilityThreshold=0.001 78 | 79 | #NER options 80 | NERecognition=no 81 | NPDataFile=$FREELINGSHARE/pt/np.dat 82 | ## --- comment lines above and uncomment those below, if you want 83 | ## --- a better NE recognizer (higer accuracy, lower speed) 84 | #NPDataFile=$FREELINGSHARE/pt/ner/ner-ab.dat 85 | 86 | #Spelling Corrector config file 87 | CorrectorFile=$FREELINGSHARE/pt/corrector/corrector.dat 88 | 89 | ## Phonetic encoding of words. 90 | Phonetics=no 91 | PhoneticsFile=$FREELINGSHARE/pt/phonetics.dat 92 | 93 | ## NEC options 94 | NEClassification=no 95 | NECFile=$FREELINGSHARE/pt/nec/nec-ab.dat 96 | 97 | ## Sense annotation options (none,all,mfs) 98 | SenseAnnotation=none 99 | SenseConfigFile=$FREELINGSHARE/pt/senses.dat 100 | UKBConfigFile=$FREELINGSHARE/pt/ukb.dat 101 | 102 | #### Tagger options 103 | Tagger=hmm 104 | TaggerHMMFile=$FREELINGSHARE/pt/tagger.dat 105 | TaggerRelaxFile=$FREELINGSHARE/pt/constr_gram.dat 106 | TaggerRelaxMaxIter=500 107 | TaggerRelaxScaleFactor=670.0 108 | TaggerRelaxEpsilon=0.001 109 | TaggerRetokenize=yes 110 | TaggerForceSelect=tagger 111 | 112 | #### Parser options 113 | GrammarFile=$FREELINGSHARE/pt/chunker/grammar-chunk.dat 114 | 115 | #### Dependence Parser options 116 | DepTxalaFile=$FREELINGSHARE/pt/dep/dependences.dat 117 | 118 | #### Coreference Solver options 119 | CoreferenceResolution=no 120 | CorefFile=$FREELINGSHARE/pt/coref/coref.dat 121 | -------------------------------------------------------------------------------- /metanl/freeling.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | import pkg_resources 4 | from metanl.extprocess import ProcessWrapper, ProcessError, render_safe 5 | 6 | 7 | class FreelingWrapper(ProcessWrapper): 8 | r""" 9 | Handle English, Spanish, Italian, Portuguese, or Welsh text by calling an 10 | installed copy of FreeLing. 11 | 12 | The constructor takes one argument, which is the installed filename of the 13 | language-specific config file, such as 'en.cfg'. 
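Wrappers for the bundled languages are created at the bottom of this module and collected in the LANGUAGES dict, so the `english` used in the examples below is simply `FreelingWrapper('en')`.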
14 | 15 | >>> english.tag_and_stem("This is a test.\n\nIt has two paragraphs, and that's okay.") 16 | [('this', 'DT', 'This'), ('be', 'VBZ', 'is'), ('a', 'DT', 'a'), ('test', 'NN', 'test'), ('.', '.', '.'), ('it', 'PRP', 'It'), ('have', 'VBZ', 'has'), ('two', 'DT', 'two'), ('paragraph', 'NNS', 'paragraphs'), (',', '.', ','), ('and', 'CC', 'and'), ('that', 'PRP', 'that'), ('be', 'VBZ', "'s"), ('okay', 'JJ', 'okay'), ('.', '.', '.')] 17 | 18 | >>> english.tag_and_stem("this has\ntwo lines") 19 | [('this', 'DT', 'this'), ('have', 'VBZ', 'has'), ('two', 'DT', 'two'), ('line', 'NNS', 'lines')] 20 | 21 | """ 22 | def __init__(self, lang): 23 | self.lang = lang 24 | self.configfile = pkg_resources.resource_filename( 25 | __name__, 'data/freeling/%s.cfg' % lang) 26 | self.splitterfile = pkg_resources.resource_filename( 27 | __name__, 'data/freeling/generic_splitter.dat') 28 | 29 | def _get_command(self): 30 | """ 31 | Get the command for running the basic FreeLing pipeline in the 32 | specified language. 33 | 34 | The options we choose are: 35 | 36 | -f data/freeling/.cfg 37 | load our custom configuration for the language 38 | --fsplit data/freeling/generic_splitter.dat 39 | don't do any special handling of ends of sentences 40 | """ 41 | return ['analyze', '-f', self.configfile, '--fsplit', 42 | self.splitterfile] 43 | 44 | def get_record_root(self, record): 45 | """ 46 | Given a FreeLing record, return the root word. 47 | """ 48 | return record[1].lower() 49 | 50 | def get_record_token(self, record): 51 | """ 52 | The token of a FreeLing record is the first item on the line. 53 | """ 54 | return record[0] 55 | 56 | def get_record_pos(self, record): 57 | """ 58 | In English, return the third segment of the record. 59 | 60 | In other languages, this segment contains one letter for the part of 61 | speech, plus densely-encoded features that we really have no way to 62 | use. Return just the part-of-speech letter. 63 | """ 64 | if self.lang == 'en': 65 | return record[2] 66 | else: 67 | return record[2][0] 68 | 69 | def is_stopword_record(self, record, common_words=False): 70 | """ 71 | Determiners are stopwords. Detect this by checking whether their POS 72 | starts with 'D'. 73 | """ 74 | return (record[2][0] == 'D') 75 | 76 | def analyze(self, text): 77 | """ 78 | Run text through the external process, and get a list of lists 79 | ("records") that contain the analysis of each word. 
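Each record is one line of FreeLing output split on spaces: record[0] is the surface token, record[1] the lemma, and record[2] the tag. A blank output line marks the end of a chunk's analysis.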
80 | """ 81 | try: 82 | text = render_safe(text).strip() 83 | if not text: 84 | return [] 85 | chunks = text.split('\n') 86 | results = [] 87 | for chunk_text in chunks: 88 | if chunk_text.strip(): 89 | textbytes = (chunk_text + '\n').encode('utf-8') 90 | self.send_input(textbytes) 91 | out_line = '' 92 | while True: 93 | out_line = self.receive_output_line() 94 | out_line = out_line.decode('utf-8') 95 | 96 | if out_line == '\n': 97 | break 98 | 99 | record = out_line.strip('\n').split(' ') 100 | results.append(record) 101 | return results 102 | except ProcessError: 103 | self.restart_process() 104 | return self.analyze(text) 105 | 106 | 107 | LANGUAGES = {} 108 | english = LANGUAGES['en'] = FreelingWrapper('en') 109 | spanish = LANGUAGES['es'] = FreelingWrapper('es') 110 | italian = LANGUAGES['it'] = FreelingWrapper('it') 111 | portuguese = LANGUAGES['pt'] = FreelingWrapper('pt') 112 | russian = LANGUAGES['ru'] = FreelingWrapper('ru') 113 | welsh = LANGUAGES['cy'] = FreelingWrapper('cy') 114 | -------------------------------------------------------------------------------- /metanl/token_utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import unicode_literals 3 | """ 4 | This file contains some generally useful operations you would perform to 5 | separate and join tokens. The tools apply most to English, but should also 6 | be able to do their job in any Western language that uses spaces. 7 | """ 8 | 9 | import re 10 | import unicodedata 11 | 12 | 13 | def tokenize(text): 14 | """ 15 | Split a text into tokens (words, morphemes we can separate such as 16 | "n't", and punctuation). 17 | """ 18 | return list(_tokenize_gen(text)) 19 | 20 | 21 | def _tokenize_gen(text): 22 | import nltk 23 | for sent in nltk.sent_tokenize(text): 24 | for word in nltk.word_tokenize(sent): 25 | yield word 26 | 27 | 28 | def untokenize(words): 29 | """ 30 | Untokenizing a text undoes the tokenizing operation, restoring 31 | punctuation and spaces to the places that people expect them to be. 32 | 33 | Ideally, `untokenize(tokenize(text))` should be identical to `text`, 34 | except for line breaks. 35 | """ 36 | text = ' '.join(words) 37 | step1 = text.replace("`` ", '"').replace(" ''", '"').replace('. . .', '...') 38 | step2 = step1.replace(" ( ", " (").replace(" ) ", ") ") 39 | step3 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step2) 40 | step4 = re.sub(r' ([.,:;?!%]+)$', r"\1", step3) 41 | step5 = step4.replace(" '", "'").replace(" n't", "n't").replace( 42 | "can not", "cannot") 43 | step6 = step5.replace(" ` ", " '") 44 | return step6.strip() 45 | 46 | 47 | # This expression scans through a reversed string to find segments of 48 | # camel-cased text. Comments show what these mean, forwards, in preference 49 | # order: 50 | CAMEL_RE = re.compile(r""" 51 | ^( [A-Z]+ # A string of all caps, such as an acronym 52 | | [^A-Z0-9 _]+[A-Z _] # A single capital letter followed by lowercase 53 | # letters, or lowercase letters on their own 54 | # after a word break 55 | | [^A-Z0-9 _]*[0-9.]+ # A number, possibly followed by lowercase 56 | # letters 57 | | [ _]+ # Extra word breaks (spaces or underscores) 58 | | [^A-Z0-9]*[^A-Z0-9_ ]+ # Miscellaneous symbols, possibly with lowercase 59 | # letters after them 60 | ) 61 | """, re.VERBOSE) 62 | 63 | 64 | def un_camel_case(text): 65 | r""" 66 | Splits apart words that are written in CamelCase. 
67 | 68 | Bugs: 69 | 70 | - Non-ASCII characters are treated as lowercase letters, even if they are 71 | actually capital letters. 72 | 73 | Examples: 74 | 75 | >>> un_camel_case('1984ZXSpectrumGames') 76 | '1984 ZX Spectrum Games' 77 | 78 | >>> un_camel_case('aaAa aaAaA 0aA AAAa!AAA') 79 | 'aa Aa aa Aa A 0a A AA Aa! AAA' 80 | 81 | >>> un_camel_case('MotörHead') 82 | 'Mot\xf6r Head' 83 | 84 | >>> un_camel_case('MSWindows3.11ForWorkgroups') 85 | 'MS Windows 3.11 For Workgroups' 86 | 87 | This should not significantly affect text that is not camel-cased: 88 | 89 | >>> un_camel_case('ACM_Computing_Classification_System') 90 | 'ACM Computing Classification System' 91 | 92 | >>> un_camel_case('Anne_Blunt,_15th_Baroness_Wentworth') 93 | 'Anne Blunt, 15th Baroness Wentworth' 94 | 95 | >>> un_camel_case('Hindi-Urdu') 96 | 'Hindi-Urdu' 97 | """ 98 | revtext = text[::-1] 99 | pieces = [] 100 | while revtext: 101 | match = CAMEL_RE.match(revtext) 102 | if match: 103 | pieces.append(match.group(1)) 104 | revtext = revtext[match.end():] 105 | else: 106 | pieces.append(revtext) 107 | revtext = '' 108 | revstr = ' '.join(piece.strip(' _') for piece in pieces 109 | if piece.strip(' _')) 110 | return revstr[::-1].replace('- ', '-') 111 | 112 | 113 | # see http://www.fileformat.info/info/unicode/category/index.htm 114 | BOUNDARY_CATEGORIES = {'Cc', # control characters 115 | 'Cf', # format characters 116 | 'Cn', # "other, not assigned" 117 | 'Pc', # connector punctuation 118 | 'Pd', # dash 119 | 'Pe', # close-punctuation 120 | 'Pf', # final-quote 121 | 'Pi', # initial-quote 122 | 'Po', # other punctuation 123 | 'Zl', # line separator 124 | 'Zp', # paragraph separator 125 | 'Zs', # space separator 126 | } 127 | 128 | def string_pieces(s, maxlen=1024): 129 | """ 130 | Takes a (unicode) string and yields pieces of it that are at most `maxlen` 131 | characters, trying to break it at punctuation/whitespace. This is an 132 | important step before using a tokenizer with a maximum buffer size. 133 | """ 134 | if not s: 135 | return 136 | i = 0 137 | while True: 138 | j = i + maxlen 139 | if j >= len(s): 140 | yield s[i:] 141 | return 142 | # Using "j - 1" keeps boundary characters with the left chunk 143 | while unicodedata.category(s[j - 1]) not in BOUNDARY_CATEGORIES: 144 | j -= 1 145 | if j == i: 146 | # No boundary available; oh well. 147 | j = i + maxlen 148 | break 149 | yield s[i:j] 150 | i = j 151 | 152 | -------------------------------------------------------------------------------- /metanl/nltk_morphy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function, unicode_literals 3 | 4 | import nltk 5 | from nltk.corpus import wordnet 6 | from metanl.token_utils import untokenize, tokenize 7 | import re 8 | 9 | try: 10 | morphy = wordnet._morphy 11 | except LookupError: 12 | nltk.download('wordnet') 13 | morphy = wordnet._morphy 14 | 15 | STOPWORDS = ['the', 'a', 'an'] 16 | 17 | EXCEPTIONS = { 18 | # Avoid obsolete and obscure roots, the way lexicographers don't. 19 | 'wrought': 'wrought', # not 'work' 20 | 'media': 'media', # not 'medium' 21 | 'installed': 'install', # not 'instal' 22 | 'installing': 'install',# not 'instal' 23 | 'synapses': 'synapse', # not 'synapsis' 24 | 'soles': 'sole', # not 'sol' 25 | 'pubes': 'pube', # not 'pubis' 26 | 'dui': 'dui', # not 'duo' 27 | 'taxis': 'taxi', # not 'taxis' 28 | 29 | # Work around errors that Morphy makes. 
30 | 'alas': 'alas', 31 | 'corps': 'corps', 32 | 'cos': 'cos', 33 | 'enured': 'enure', 34 | 'fiver': 'fiver', 35 | 'hinder': 'hinder', 36 | 'lobed': 'lobe', 37 | 'offerer': 'offerer', 38 | 'outer': 'outer', 39 | 'sang': 'sing', 40 | 'singing': 'sing', 41 | 'solderer': 'solderer', 42 | 'tined': 'tine', 43 | 'twiner': 'twiner', 44 | 'us': 'us', 45 | 46 | # Stem common nouns whose plurals are apparently ambiguous 47 | 'teeth': 'tooth', 48 | 'things': 'thing', 49 | 'people': 'person', 50 | 51 | # Tokenization artifacts 52 | 'wo': 'will', 53 | 'ca': 'can', 54 | "n't": 'not', 55 | } 56 | 57 | AMBIGUOUS_EXCEPTIONS = { 58 | # Avoid nouns that shadow more common verbs. 59 | 'am': 'be', 60 | 'as': 'as', 61 | 'are': 'be', 62 | 'ate': 'eat', 63 | 'bent': 'bend', 64 | 'drove': 'drive', 65 | 'fell': 'fall', 66 | 'felt': 'feel', 67 | 'found': 'find', 68 | 'has': 'have', 69 | 'lit': 'light', 70 | 'lost': 'lose', 71 | 'sat': 'sit', 72 | 'saw': 'see', 73 | 'sent': 'send', 74 | 'shook': 'shake', 75 | 'shot': 'shoot', 76 | 'slain': 'slay', 77 | 'spoke': 'speak', 78 | 'stole': 'steal', 79 | 'sung': 'sing', 80 | 'thought': 'think', 81 | 'tore': 'tear', 82 | 'was': 'be', 83 | 'won': 'win', 84 | 'feed': 'feed', 85 | } 86 | 87 | 88 | def _word_badness(word): 89 | """ 90 | Assign a heuristic to possible outputs from Morphy. Minimizing this 91 | heuristic avoids incorrect stems. 92 | """ 93 | if word.endswith('e'): 94 | return len(word) - 2 95 | elif word.endswith('ess'): 96 | return len(word) - 10 97 | elif word.endswith('ss'): 98 | return len(word) - 4 99 | else: 100 | return len(word) 101 | 102 | 103 | def _morphy_best(word, pos=None): 104 | """ 105 | Get the most likely stem for a word using Morphy, once the input has been 106 | pre-processed by morphy_stem(). 107 | """ 108 | results = [] 109 | if pos is None: 110 | pos = 'nvar' 111 | for pos_item in pos: 112 | results.extend(morphy(word, pos_item)) 113 | if not results: 114 | return None 115 | results.sort(key=lambda x: _word_badness(x)) 116 | return results[0] 117 | 118 | 119 | def morphy_stem(word, pos=None): 120 | """ 121 | Get the most likely stem for a word. If a part of speech is supplied, 122 | the stem will be more accurate. 123 | 124 | Valid parts of speech are: 125 | 126 | - 'n' or 'NN' for nouns 127 | - 'v' or 'VB' for verbs 128 | - 'a' or 'JJ' for adjectives 129 | - 'r' or 'RB' for adverbs 130 | 131 | Any other part of speech will be treated as unknown. 
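For example, morphy_stem('dogs', 'NNS') returns 'dog', while morphy_stem('the', 'DT') leaves the word unchanged; both cases are exercised in tests/test_nltk_morphy.py.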
132 | """ 133 | word = word.lower() 134 | if pos is not None: 135 | if pos.startswith('NN'): 136 | pos = 'n' 137 | elif pos.startswith('VB'): 138 | pos = 'v' 139 | elif pos.startswith('JJ'): 140 | pos = 'a' 141 | elif pos.startswith('RB'): 142 | pos = 'r' 143 | if pos is None and word.endswith('ing') or word.endswith('ed'): 144 | pos = 'v' 145 | if pos is not None and pos not in 'nvar': 146 | pos = None 147 | if word in EXCEPTIONS: 148 | return EXCEPTIONS[word] 149 | if pos is None: 150 | if word in AMBIGUOUS_EXCEPTIONS: 151 | return AMBIGUOUS_EXCEPTIONS[word] 152 | return _morphy_best(word, pos) or word 153 | 154 | 155 | def tag_and_stem(text): 156 | """ 157 | Returns a list of (stem, tag, token) triples: 158 | 159 | - stem: the word's uninflected form 160 | - tag: the word's part of speech 161 | - token: the original word, so we can reconstruct it later 162 | """ 163 | tokens = tokenize(text) 164 | tagged = nltk.pos_tag(tokens) 165 | out = [] 166 | for token, tag in tagged: 167 | stem = morphy_stem(token, tag) 168 | out.append((stem, tag, token)) 169 | return out 170 | 171 | 172 | def good_lemma(lemma): 173 | return lemma and lemma not in STOPWORDS and lemma[0].isalnum() 174 | 175 | 176 | def normalize_list(text): 177 | """ 178 | Get a list of word stems that appear in the text. Stopwords and an initial 179 | 'to' will be stripped, unless this leaves nothing in the stem. 180 | 181 | >>> normalize_list('the dog') 182 | ['dog'] 183 | >>> normalize_list('big dogs') 184 | ['big', 'dog'] 185 | >>> normalize_list('the') 186 | ['the'] 187 | """ 188 | pieces = [morphy_stem(word) for word in tokenize(text)] 189 | pieces = [piece for piece in pieces if good_lemma(piece)] 190 | if not pieces: 191 | return [text] 192 | if pieces[0] == 'to': 193 | pieces = pieces[1:] 194 | return pieces 195 | 196 | 197 | def normalize(text): 198 | """ 199 | Get a string made from the non-stopword word stems in the text. See 200 | normalize_list(). 201 | """ 202 | return untokenize(normalize_list(text)) 203 | 204 | 205 | def normalize_topic(topic): 206 | """ 207 | Get a canonical representation of a Wikipedia topic, which may include 208 | a disambiguation string in parentheses. 209 | 210 | Returns (name, disambig), where "name" is the normalized topic name, 211 | and "disambig" is a string corresponding to the disambiguation text or 212 | None. 213 | """ 214 | # find titles of the form Foo (bar) 215 | topic = topic.replace('_', ' ') 216 | match = re.match(r'([^(]+) \(([^)]+)\)', topic) 217 | if not match: 218 | return normalize(topic), None 219 | else: 220 | return normalize(match.group(1)), 'n/' + match.group(2).strip(' _') 221 | 222 | 223 | def word_frequency(word, default_freq=0): 224 | raise NotImplementedError("Word frequency is now in the wordfreq package.") 225 | 226 | 227 | def get_wordlist(): 228 | raise NotImplementedError("Wordlists are now in the wordfreq package.") 229 | -------------------------------------------------------------------------------- /metanl/extprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | """ 4 | Tools for using an external program as an NLP pipe. See, for example, 5 | freeling.py. 
6 | """ 7 | 8 | import subprocess 9 | import unicodedata 10 | import sys 11 | from ftfy.fixes import remove_control_chars, remove_unsafe_private_use 12 | if sys.version_info.major == 2: 13 | range = xrange 14 | str_func = unicode 15 | else: 16 | str_func = str 17 | 18 | 19 | def render_safe(text): 20 | ''' 21 | Make sure the given text is safe to pass to an external process. 22 | ''' 23 | return remove_control_chars(remove_unsafe_private_use(text)) 24 | 25 | 26 | class ProcessError(IOError): 27 | """ 28 | A subclass of IOError raised when we can't start the external process. 29 | """ 30 | pass 31 | 32 | 33 | class ProcessWrapper(object): 34 | """ 35 | A ProcessWrapper uses the `subprocess` module to keep a process open that 36 | we can pipe stuff through to get NLP results. 37 | 38 | Instead of every instance immediately opening a process, however, it waits 39 | until the first time it is needed, then starts the process. 40 | 41 | Many methods are intended to be implemented by subclasses of ProcessWrapper 42 | that actually know what program they're talking to. 43 | """ 44 | def __del__(self): 45 | """ 46 | Clean up by closing the pipe. 47 | """ 48 | if hasattr(self, '_process'): 49 | self._process.stdin.close() 50 | 51 | @property 52 | def process(self): 53 | """ 54 | Store the actual process in _process. If it doesn't exist yet, create 55 | it. 56 | """ 57 | if hasattr(self, '_process'): 58 | return self._process 59 | else: 60 | self._process = self._get_process() 61 | return self._process 62 | 63 | def _get_command(self): 64 | """ 65 | This method should return the command to run, as a list 66 | of arguments that can be used by subprocess.Popen. 67 | """ 68 | raise NotImplementedError 69 | 70 | def _get_process(self): 71 | """ 72 | Create the process by running the specified command. 73 | """ 74 | command = self._get_command() 75 | return subprocess.Popen(command, bufsize=-1, close_fds=True, 76 | stdout=subprocess.PIPE, 77 | stdin=subprocess.PIPE) 78 | 79 | def get_record_root(self, record): 80 | """ 81 | Given a *record* (the data that the external process returns for a 82 | given single token), this specifies how to extract its root word 83 | (aka its lemma). 84 | """ 85 | raise NotImplementedError 86 | 87 | def get_record_token(self, record): 88 | """ 89 | Given a record, this specifies how to extract the exact word or token 90 | that was processed. 91 | """ 92 | raise NotImplementedError 93 | 94 | def analyze(self, text): 95 | """ 96 | Take text as input, run it through the external process, and return a 97 | list of *records* containing the results. 98 | """ 99 | raise NotImplementedError 100 | 101 | def send_input(self, data): 102 | self.process.stdin.write(data) 103 | self.process.stdin.flush() 104 | 105 | def receive_output_line(self): 106 | line = self.process.stdout.readline() 107 | if not line: 108 | raise ProcessError("reached end of output") 109 | return line 110 | 111 | def restart_process(self): 112 | if hasattr(self, '_process'): 113 | self._process.stdin.close() 114 | self._process = self._get_process() 115 | return self._process 116 | 117 | def tokenize_list(self, text): 118 | """ 119 | Split a text into separate words. 120 | """ 121 | return [self.get_record_token(record) for record in self.analyze(text)] 122 | 123 | def tokenize(self, text): 124 | """ 125 | Yell at people who are still using simplenlp's bad idea of 126 | tokenization. 127 | """ 128 | raise NotImplementedError("tokenize is deprecated. 
Use tokenize_list.") 129 | 130 | def is_stopword_record(self, record, common_words=False): 131 | """ 132 | Given a record, return whether it represents a stopword (a word that 133 | should be discarded in NLP results). 134 | 135 | Note that we want very few words to be stopwords. Words that are 136 | meaningful but simply common can be recognized by their very high word 137 | frequency, and handled appropriately. Often, we only want determiners 138 | (such as 'a', 'an', and 'the' in English) to be stopwords. 139 | 140 | Takes in a vestigial parameter, `common_words`, and ignores it. 141 | """ 142 | raise NotImplementedError 143 | 144 | def is_stopword(self, text): 145 | """ 146 | Determine whether a single word is a stopword, or whether a short 147 | phrase is made entirely of stopwords, disregarding context. 148 | 149 | Use of this function should be avoided; it's better to give the text 150 | in context and let the process determine which words are the stopwords. 151 | """ 152 | found_content_word = False 153 | for record in self.analyze(text): 154 | if not self.is_stopword_record(record): 155 | found_content_word = True 156 | break 157 | return not found_content_word 158 | 159 | def get_record_pos(self, record): 160 | """ 161 | Given a record, get the word's part of speech. 162 | 163 | This default implementation simply distinguishes stopwords from 164 | non-stopwords. 165 | """ 166 | if self.is_stopword_record(record): 167 | return 'STOP' 168 | else: 169 | return 'TERM' 170 | 171 | def normalize_list(self, text, cache=None): 172 | """ 173 | Get a canonical list representation of text, with words 174 | separated and reduced to their base forms. 175 | 176 | TODO: use the cache. 177 | """ 178 | words = [] 179 | analysis = self.analyze(text) 180 | for record in analysis: 181 | if not self.is_stopword_record(record): 182 | words.append(self.get_record_root(record)) 183 | if not words: 184 | # Don't discard stopwords if that's all you've got 185 | words = [self.get_record_token(record) for record in analysis] 186 | return words 187 | 188 | def normalize(self, text, cache=None): 189 | """ 190 | Get a canonical string representation of this text, like 191 | :meth:`normalize_list` but joined with spaces. 192 | 193 | TODO: use the cache. 194 | """ 195 | return ' '.join(self.normalize_list(text, cache)) 196 | 197 | def tag_and_stem(self, text, cache=None): 198 | """ 199 | Given some text, return a sequence of (stem, pos, text) triples as 200 | appropriate for the reader. `pos` can be as general or specific as 201 | necessary (for example, it might label all parts of speech, or it might 202 | only distinguish function words from others). 203 | 204 | Twitter-style hashtags and at-mentions have the stem and pos they would 205 | have without the leading # or @. For instance, if the reader's triple 206 | for "thing" is ('thing', 'NN', 'things'), then "#things" would come out 207 | as ('thing', 'NN', '#things'). 208 | """ 209 | analysis = self.analyze(text) 210 | triples = [] 211 | 212 | for record in analysis: 213 | root = self.get_record_root(record) 214 | token = self.get_record_token(record) 215 | 216 | if token: 217 | if unicode_is_punctuation(token): 218 | triples.append((token, '.', token)) 219 | else: 220 | pos = self.get_record_pos(record) 221 | triples.append((root, pos, token)) 222 | return triples 223 | 224 | def extract_phrases(self, text): 225 | """ 226 | Given some text, extract phrases of up to 2 content words, 227 | and map their normalized form to the complete phrase. 
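        Schematically, assuming each record is a tuple or namedtuple whose
        first field is the token's surface form (as with MeCabRecord): if the
        analysis of a text is [A, x, B, C], where x is a stopword and A, B, C
        are content words, this yields roughly

            (root(A), surface(A))
            (root(A) + ' ' + root(B), surface(A) + surface(x) + surface(B))
            (root(B), surface(B))
            (root(B) + ' ' + root(C), surface(B) + surface(C))
            (root(C), surface(C))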
228 | """ 229 | analysis = self.analyze(text) 230 | for pos1 in range(len(analysis)): 231 | rec1 = analysis[pos1] 232 | if not self.is_stopword_record(rec1): 233 | yield self.get_record_root(rec1), rec1[0] 234 | for pos2 in range(pos1 + 1, len(analysis)): 235 | rec2 = analysis[pos2] 236 | if not self.is_stopword_record(rec2): 237 | roots = [self.get_record_root(rec1), 238 | self.get_record_root(rec2)] 239 | pieces = [analysis[i][0] for i in range(pos1, pos2+1)] 240 | term = ' '.join(roots) 241 | phrase = ''.join(pieces) 242 | yield term, phrase 243 | break 244 | 245 | 246 | def unicode_is_punctuation(text): 247 | """ 248 | Test if a token is made entirely of Unicode characters of the following 249 | classes: 250 | 251 | - P: punctuation 252 | - S: symbols 253 | - Z: separators 254 | - M: combining marks 255 | - C: control characters 256 | 257 | >>> unicode_is_punctuation('word') 258 | False 259 | >>> unicode_is_punctuation('。') 260 | True 261 | >>> unicode_is_punctuation('-') 262 | True 263 | >>> unicode_is_punctuation('-3') 264 | False 265 | >>> unicode_is_punctuation('あ') 266 | False 267 | """ 268 | for char in str_func(text): 269 | category = unicodedata.category(char)[0] 270 | if category not in 'PSZMC': 271 | return False 272 | return True 273 | -------------------------------------------------------------------------------- /metanl/mecab.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function, unicode_literals 3 | """ 4 | This module provides some basic Japanese NLP by wrapping the output of MeCab. 5 | It can tokenize and normalize Japanese words, detect and remove stopwords, 6 | and it can even respell words in kana or romaji. 7 | 8 | This requires mecab to be installed separately. On Ubuntu: 9 | sudo apt-get install mecab mecab-ipadic-utf8 10 | 11 | >>> print(normalize('これはテストです')) 12 | テスト 13 | >>> tag_and_stem('これはテストです。') 14 | [('\u3053\u308c', '~\u540d\u8a5e', '\u3053\u308c'), ('\u306f', '~\u52a9\u8a5e', '\u306f'), ('\u30c6\u30b9\u30c8', '\u540d\u8a5e', '\u30c6\u30b9\u30c8'), ('\u3067\u3059', '~\u52a9\u52d5\u8a5e', '\u3067\u3059'), ('\u3002', '.', '\u3002')] 15 | """ 16 | 17 | from metanl.token_utils import string_pieces 18 | from metanl.extprocess import ProcessWrapper, ProcessError, render_safe 19 | from collections import namedtuple 20 | import unicodedata 21 | import re 22 | import sys 23 | if sys.version_info.major == 2: 24 | range = xrange 25 | str_func = unicode 26 | else: 27 | str_func = str 28 | 29 | 30 | class MeCabError(ProcessError): 31 | pass 32 | 33 | MeCabRecord = namedtuple('MeCabRecord', 34 | [ 35 | 'surface', 36 | 'pos', 37 | 'subclass1', 38 | 'subclass2', 39 | 'subclass3', 40 | 'conjugation', 41 | 'form', 42 | 'root', 43 | 'reading', 44 | 'pronunciation' 45 | ] 46 | ) 47 | 48 | 49 | # MeCab outputs the part of speech of its terms. We can simply identify 50 | # particular (coarse or fine) parts of speech as containing stopwords. 51 | 52 | STOPWORD_CATEGORIES = set([ 53 | '助詞', # coarse: particle 54 | '助動詞', # coarse: auxiliary verb 55 | '接続詞', # coarse: conjunction 56 | 'フィラー', # coarse: filler 57 | '記号', # coarse: symbol 58 | '非自立', # fine: 'not independent' 59 | ]) 60 | 61 | 62 | # Forms of particular words should also be considered stopwords sometimes. 63 | # 64 | # A thought: Should the rare kanji version of suru not be a stopword? 
65 | # I'll need to ask someone who knows more Japanese, but it may be 66 | # that if they're using the kanji it's for particular emphasis. 67 | STOPWORD_ROOTS = set([ 68 | 'する', # suru: "to do" 69 | '為る', # suru in kanji (very rare) 70 | 'くる', # kuru: "to come" 71 | '来る', # kuru in kanji 72 | 'いく', # iku: "to go" 73 | '行く', # iku in kanji 74 | 'いる', # iru: "to be" (animate) 75 | '居る', # iru in kanji 76 | 'ある', # aru: "to exist" or "to have" 77 | '有る', # aru in kanji 78 | 'もの', # mono: "thing" 79 | '物', # mono in kanji 80 | 'よう', # yō: "way" 81 | '様', # yō in kanji 82 | 'れる', # passive suffix 83 | 'これ', # kore: "this" 84 | 'それ', # sore: "that" 85 | 'あれ', # are: "that over there" 86 | 'この', # kono: "this" 87 | 'その', # sono: "that" 88 | 'あの', # ano: "that over there", "yon" 89 | ]) 90 | 91 | 92 | class MeCabWrapper(ProcessWrapper): 93 | """ 94 | Handle Japanese text using the command-line version of MeCab. 95 | (mecab-python is convenient, but its installer is too flaky to rely on.) 96 | 97 | ja_cabocha gives more sophisticated results, but requires a large number of 98 | additional dependencies. Using this tool for Japanese requires only 99 | MeCab to be installed and accepting UTF-8 text. 100 | """ 101 | def _get_command(self): 102 | return ['mecab'] 103 | 104 | def _get_process(self): 105 | try: 106 | proc = ProcessWrapper._get_process(self) 107 | except (OSError, ProcessError): 108 | raise MeCabError("MeCab didn't start. See README.txt for details " 109 | "about installing MeCab and other Japanese NLP " 110 | "tools.") 111 | return proc 112 | 113 | def get_record_root(self, record): 114 | """ 115 | Given a MeCab record, return the root word. 116 | """ 117 | if record.root == '*': 118 | return record.surface 119 | else: 120 | return record.root 121 | 122 | def get_record_token(self, record): 123 | return record.surface 124 | 125 | def analyze(self, text): 126 | """ 127 | Runs a line of text through MeCab, and returns the results as a 128 | list of lists ("records") that contain the MeCab analysis of each 129 | word. 130 | """ 131 | try: 132 | self.process # make sure things are loaded 133 | text = render_safe(text).replace('\n', ' ').lower() 134 | results = [] 135 | for chunk in string_pieces(text): 136 | self.send_input((chunk + '\n').encode('utf-8')) 137 | while True: 138 | out_line = self.receive_output_line().decode('utf-8') 139 | if out_line == 'EOS\n': 140 | break 141 | 142 | word, info = out_line.strip('\n').split('\t') 143 | record_parts = [word] + info.split(',') 144 | 145 | # Pad the record out to have 10 parts if it doesn't 146 | record_parts += [None] * (10 - len(record_parts)) 147 | record = MeCabRecord(*record_parts) 148 | 149 | # special case for detecting nai -> n 150 | if (record.surface == 'ん' and 151 | record.conjugation == '不変化型'): 152 | # rebuild the record so that record.root is 'nai' 153 | record_parts[MeCabRecord._fields.index('root')] = 'ない' 154 | record = MeCabRecord(*record_parts) 155 | 156 | results.append(record) 157 | return results 158 | except ProcessError: 159 | self.restart_process() 160 | return self.analyze(text) 161 | 162 | def is_stopword_record(self, record): 163 | """ 164 | Determine whether a single MeCab record represents a stopword. 165 | 166 | This mostly determines words to strip based on their parts of speech. 167 | If common_words is set to True (default), it will also strip common 168 | verbs and nouns such as くる and よう. If more_stopwords is True, it 169 | will look at the sub-part of speech to remove more categories. 
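        For example, with the segmentation shown in the module docstring,
        これ counts as a stopword by its root, は and です by their parts of
        speech, and テスト is kept:

            # rough illustration, assuming the usual mecab-ipadic segmentation
            [MECAB.is_stopword_record(r) for r in MECAB.analyze('これはテストです')]
            # -> [True, True, False, True]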
170 | """ 171 | # preserve negations 172 | if record.root == 'ない': 173 | return False 174 | return ( 175 | record.pos in STOPWORD_CATEGORIES or 176 | record.subclass1 in STOPWORD_CATEGORIES or 177 | record.root in STOPWORD_ROOTS 178 | ) 179 | 180 | def get_record_pos(self, record): 181 | """ 182 | Given a record, get the word's part of speech. 183 | 184 | Here we're going to return MeCab's part of speech (written in 185 | Japanese), though if it's a stopword we prefix the part of speech 186 | with '~'. 187 | """ 188 | if self.is_stopword_record(record): 189 | return '~' + record.pos 190 | else: 191 | return record.pos 192 | 193 | 194 | class NoStopwordMeCabWrapper(MeCabWrapper): 195 | """ 196 | This version of the MeCabWrapper doesn't label anything as a stopword. It's 197 | used in building ConceptNet because discarding stopwords based on MeCab 198 | categories loses too much information. 199 | """ 200 | def is_stopword_record(self, record, common_words=False): 201 | return False 202 | 203 | 204 | # Define the classes of characters we'll be trying to transliterate 205 | NOT_KANA, KANA, NN, SMALL, SMALL_Y, SMALL_TSU, PROLONG = range(7) 206 | 207 | 208 | def to_kana(text): 209 | """ 210 | Use MeCab to turn any text into its phonetic spelling, as katakana 211 | separated by spaces. 212 | """ 213 | records = MECAB.analyze(text) 214 | kana = [] 215 | for record in records: 216 | if record.pronunciation: 217 | kana.append(record.pronunciation) 218 | elif record.reading: 219 | kana.append(record.reading) 220 | else: 221 | kana.append(record.surface) 222 | return ' '.join(k for k in kana if k) 223 | 224 | 225 | def get_kana_info(char): 226 | """ 227 | Return two things about each character: 228 | 229 | - Its transliterated value (in Roman characters, if it's a kana) 230 | - A class of characters indicating how it affects the romanization 231 | """ 232 | try: 233 | name = unicodedata.name(char) 234 | except ValueError: 235 | return char, NOT_KANA 236 | 237 | # The names we're dealing with will probably look like 238 | # "KATAKANA CHARACTER ZI". 239 | if (name.startswith('HIRAGANA LETTER') or 240 | name.startswith('KATAKANA LETTER') or 241 | name.startswith('KATAKANA-HIRAGANA')): 242 | names = name.split() 243 | syllable = str_func(names[-1].lower()) 244 | 245 | if name.endswith('SMALL TU'): 246 | # The small tsu (っ) doubles the following consonant. 247 | # It'll show up as 't' on its own. 248 | return 't', SMALL_TSU 249 | elif names[-1] == 'N': 250 | return 'n', NN 251 | elif names[1] == 'PROLONGED': 252 | # The prolongation marker doubles the previous vowel. 253 | # It'll show up as '_' on its own. 254 | return '_', PROLONG 255 | elif names[-2] == 'SMALL': 256 | # Small characters tend to modify the sound of the previous 257 | # kana. If they can't modify anything, they're appended to 258 | # the letter 'x' instead. 
259 | if syllable.startswith('y'): 260 | return 'x' + syllable, SMALL_Y 261 | else: 262 | return 'x' + syllable, SMALL 263 | 264 | return syllable, KANA 265 | else: 266 | if char in ROMAN_PUNCTUATION_TABLE: 267 | char = ROMAN_PUNCTUATION_TABLE[char] 268 | return char, NOT_KANA 269 | 270 | 271 | def respell_hepburn(syllable): 272 | while syllable[:2] in HEPBURN_TABLE: 273 | syllable = HEPBURN_TABLE[syllable[:2]] + syllable[2:] 274 | return syllable 275 | 276 | 277 | def romanize(text, respell=respell_hepburn): 278 | if respell is None: 279 | respell = lambda x: x 280 | 281 | kana = to_kana(str_func(text)) 282 | pieces = [] 283 | prevgroup = NOT_KANA 284 | 285 | for char in kana: 286 | roman, group = get_kana_info(char) 287 | if prevgroup == NN: 288 | # When the previous syllable is 'n' and the next syllable would 289 | # make it ambiguous, add an apostrophe. 290 | if group != KANA or roman[0] in 'aeinouy': 291 | if unicodedata.category(roman[0])[0] == 'L': 292 | pieces[-1] += "'" 293 | 294 | # Determine how to spell the current character 295 | if group == NOT_KANA: 296 | pieces.append(roman) 297 | elif group == SMALL_TSU or group == NN: 298 | pieces.append(roman) 299 | elif group == SMALL_Y: 300 | if prevgroup == KANA: 301 | # Modify the previous syllable, if that makes sense. For 302 | # example, 'ni' + 'ya' becomes 'nya'. 303 | if not pieces[-1].endswith('i'): 304 | pieces.append(roman) 305 | else: 306 | modifier = roman[1:] 307 | modified = pieces[-1] 308 | pieces[-1] = modified[:-1] + modifier 309 | else: 310 | pieces.append(roman) 311 | elif group == SMALL: 312 | # Don't respell small vowels _yet_. We'll handle that at the end. 313 | # This may be a bit ambiguous, but nobody expects to see "tea" 314 | # spelled "texi". 315 | pieces.append(roman) 316 | elif group == PROLONG: 317 | if prevgroup in (KANA, SMALL_Y, SMALL): 318 | pieces[-1] = pieces[-1][:-1] + respell(pieces[-1][-1] + '_') 319 | else: 320 | pieces.append(roman) 321 | else: # this is a normal kana 322 | if prevgroup == SMALL_TSU: 323 | if roman[0] in 'aeiouy': 324 | # wait, there's no consonant there; cope by respelling the 325 | # previous kana as 't-' 326 | pieces[-1] = 't-' 327 | else: 328 | # Turn the previous 't' into a copy of the first consonant 329 | pieces[-1] = roman[0] 330 | elif prevgroup == NN: 331 | # Let Hepburn respell 'n' as 'm' in words such as 'shimbun'. 332 | try_respell = respell(pieces[-1] + roman[0]) 333 | if try_respell[:-1] != pieces[-1]: 334 | pieces[-1] = try_respell[:-1] 335 | pieces.append(roman) 336 | prevgroup = group 337 | 338 | romantext = ''.join(respell(piece) for piece in pieces) 339 | romantext = re.sub(r'[aeiou]x([aeiou])', r'\1', romantext) 340 | return romantext 341 | 342 | 343 | # Hepburn romanization is the most familiar to English speakers. It involves 344 | # respelling certain parts of romanized words to better match their 345 | # pronunciation. For example, the name for Mount Fuji is respelled from 346 | # "huzi-san" to "fuji-san". 
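# The respelling is applied to one romanized piece (roughly one kana syllable)
# at a time: for example respell_hepburn('si') -> 'shi', respell_hepburn('hu')
# -> 'fu', respell_hepburn('zi') -> 'ji', and respell_hepburn('sya') -> 'sha'.
# The 'a_'..'u_' entries spell out the prolongation mark (which get_kana_info
# renders as '_') as a long vowel.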
347 | HEPBURN_TABLE = { 348 | 'si': 'shi', 349 | 'sy': 'sh', 350 | 'ti': 'chi', 351 | 'ty': 'ch', 352 | 'tu': 'tsu', 353 | 'hu': 'fu', 354 | 'zi': 'ji', 355 | 'di': 'ji', 356 | 'zy': 'j', 357 | 'dy': 'j', 358 | 'nm': 'mm', 359 | 'nb': 'mb', 360 | 'np': 'mp', 361 | 'a_': 'aa', 362 | 'e_': 'ee', 363 | 'i_': 'ii', 364 | 'o_': 'ou', 365 | 'u_': 'uu' 366 | } 367 | ROMAN_PUNCTUATION_TABLE = { 368 | '・': '.', 369 | '。': '.', 370 | '、': ',', 371 | '!': '!', 372 | '「': '``', 373 | '」': "''", 374 | '?': '?', 375 | '〜': '~' 376 | } 377 | 378 | # Provide externally available functions. 379 | MECAB = MeCabWrapper() 380 | 381 | normalize = MECAB.normalize 382 | normalize_list = MECAB.normalize_list 383 | tokenize = MECAB.tokenize 384 | tokenize_list = MECAB.tokenize_list 385 | analyze = MECAB.analyze 386 | tag_and_stem = MECAB.tag_and_stem 387 | is_stopword = MECAB.is_stopword 388 | --------------------------------------------------------------------------------
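A rough usage sketch of the helpers exported at the bottom of metanl/mecab.py,
assuming MeCab and mecab-ipadic-utf8 are installed as described in its module
docstring (outputs are approximate and depend on the MeCab dictionary):

    from metanl import mecab

    mecab.normalize('これはテストです')        # -> 'テスト'
    mecab.tag_and_stem('これはテストです。')   # -> [(root, pos, token), ...],
                                               #    stopword POS prefixed with '~'
    mecab.romanize('テスト')                   # -> 'tesuto'
    mecab.is_stopword('です')                  # -> True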