├── metanl
│   ├── __init__.py
│   ├── data
│   │   └── freeling
│   │       ├── generic_splitter.dat
│   │       ├── ident.cfg
│   │       ├── cy.cfg
│   │       ├── it.cfg
│   │       ├── ru.cfg
│   │       ├── es.cfg
│   │       ├── en.cfg
│   │       └── pt.cfg
│   ├── freeling.py
│   ├── token_utils.py
│   ├── nltk_morphy.py
│   ├── extprocess.py
│   └── mecab.py
├── .gitignore
├── MANIFEST.in
├── scripts
│   ├── reformat-leeds-ja.py
│   ├── merge_english.py
│   └── reformat_using_rosette.py
├── LICENSE.txt
├── tests
│   ├── test_nltk_morphy.py
│   ├── test_tokens.py
│   └── test_extprocesses.py
├── setup.py
└── README.md
/metanl/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.swp
3 | build
4 | *.egg-info/
5 | dist
6 | pip-log.txt
7 | .coverage
8 | *~
9 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include metanl *.txt
2 | recursive-include metanl *.dat
3 | recursive-include metanl *.cfg
4 | include README.md
5 |
--------------------------------------------------------------------------------
/metanl/data/freeling/generic_splitter.dat:
--------------------------------------------------------------------------------
1 | <General>
2 | AllowBetweenMarkers 1
3 | MaxWords 1000
4 | </General>
5 | <Markers>
6 | " "
7 | ( )
8 | { }
9 | /* */
10 | </Markers>
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/scripts/reformat-leeds-ja.py:
--------------------------------------------------------------------------------
1 | from metanl import japanese
2 | from metanl.leeds_corpus_reader import translate_leeds_corpus
3 |
4 | translate_leeds_corpus('../metanl/data/source-data/internet-ja-forms.num',
5 | '../metanl/data/leeds-internet-ja.txt', japanese.normalize)
6 |
--------------------------------------------------------------------------------
/metanl/data/freeling/ident.cfg:
--------------------------------------------------------------------------------
1 | ##
2 | #### default configuration file for the language identifier
3 | ##
4 |
5 | #### General options
6 |
7 | Locale=default
8 |
9 | InputFormat=plain
10 | OutputFormat=ident
11 | LangIdentFile=$FREELINGSHARE/common/lang_ident/ident.dat
12 |
13 | ## You can launch the analyzer as a server by default
14 | ## changing the options below
15 | ServerMode=off
16 | #ServerPort=12345
17 | #ServerMaxWorkers=5
18 | #ServerQueueSize=32
19 |
--------------------------------------------------------------------------------
/scripts/merge_english.py:
--------------------------------------------------------------------------------
1 | from metanl.wordlist import get_wordlist, merge_lists
2 |
3 | def merge_english():
4 | books = get_wordlist('en-books')
5 | twitter = get_wordlist('en-twitter')
6 | combined = merge_lists([(books, '', 1e9), (twitter, '', 1e9)])
7 | combined.save('multi-en.txt')
8 | combined.save_logarithmic('multi-en-logarithmic.txt')
9 | total = sum(combined.worddict.values())
10 | print "Average frequency:", total / len(combined.worddict)
11 |
12 | if __name__ == '__main__':
13 | merge_english()
14 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (C) 2012 Rob Speer (rspeer@mit.edu)
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 |
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 |
--------------------------------------------------------------------------------
/scripts/reformat_using_rosette.py:
--------------------------------------------------------------------------------
1 | from metanl.leeds_corpus_reader import translate_leeds_corpus
2 | import socket, time
3 |
4 | def make_rosette_normalizer(lcode):
5 | from lumi_pipeline.text_readers import get_reader
6 | reader = get_reader('rosette.%s' % lcode)
7 | def normalizer(text):
8 | try:
9 | triples = reader.text_to_token_triples(text)
10 | except socket.error:
11 | time.sleep(1)
12 | print 'backing off'
13 | return normalizer(text)
14 | normalized = u' '.join(lemma.rsplit('|', 1)[0] for lemma, pos, token in triples)
15 | return normalized
16 | return normalizer
17 |
18 | def main():
19 | for language in ('pt', 'ru', 'es', 'fr', 'it', 'zh', 'de', 'ar'):
20 | print language
21 | translate_leeds_corpus(
22 | '../metanl/data/source-data/internet-%s-forms.num' % language,
23 | '../metanl/data/wordlists/leeds-internet-%s.txt' % language,
24 | make_rosette_normalizer(language)
25 | )
26 |
27 | if __name__ == '__main__':
28 | main()
29 |
--------------------------------------------------------------------------------
/tests/test_nltk_morphy.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 |
3 | from metanl.nltk_morphy import normalize_list, tag_and_stem
4 | from nose.tools import eq_
5 |
6 | def test_normalize_list():
7 | # Strip away articles, unless there's only an article
8 | eq_(normalize_list('the dog'), ['dog'])
9 | eq_(normalize_list('the'), ['the'])
10 |
11 | # strip out pluralization
12 | eq_(normalize_list('big dogs'), ['big', 'dog'])
13 |
14 |
15 | def test_tag_and_stem():
16 | the_big_dogs = [(u'the', 'DT', u'the'),
17 | (u'big', 'JJ', u'big'),
18 | (u'dog', 'NNS', u'dogs')]
19 | eq_(tag_and_stem('the big dogs'), the_big_dogs)
20 |
21 | the_big_hashtag = [(u'the', 'DT', u'the'),
22 | (u'#', 'NN', u'#'),
23 | (u'big', 'JJ', u'big'),
24 | (u'dog', 'NN', u'dog')]
25 | eq_(tag_and_stem('the #big dog'), the_big_hashtag)
26 |
27 | two_sentences = [(u'i', 'PRP', u'I'),
28 | (u'can', 'MD', u'ca'),
29 | (u'not', 'RB', u"n't"),
30 | (u'.', '.', u'.'),
31 | (u'avoid', 'NNP', u'Avoid'),
32 | (u'fragment', 'NNS', u'fragments'),
33 | (u'.', '.', u'.')]
34 | eq_(tag_and_stem("I can't. Avoid fragments."), two_sentences)
35 |
--------------------------------------------------------------------------------
/tests/test_tokens.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals
3 | from metanl.token_utils import (tokenize, untokenize, un_camel_case,
4 | string_pieces)
5 | from nose.tools import eq_
6 | import nltk
7 |
8 | def test_tokenize():
9 | # a snippet from Hitchhiker's Guide that just happens to have
10 | # most of the examples of punctuation we're looking for.
11 | #
12 | # TODO: test wacky behavior with "n't" and "cannot" and stuff.
13 | text1 = "Time is an illusion. Lunchtime, doubly so."
14 | text2 = ('"Very deep," said Arthur, "you should send that in to the '
15 | 'Reader\'s Digest. They\'ve got a page for people like you."')
16 | eq_(tokenize(text1),
17 | ['Time', 'is', 'an', 'illusion', '.', 'Lunchtime', ',',
18 | 'doubly', 'so', '.']
19 | )
20 | eq_(untokenize(tokenize(text1)), text1)
21 | if nltk.__version__ >= '3':
22 | eq_(untokenize(tokenize(text2)), text2)
23 |
24 | def test_camel_case():
25 | eq_(un_camel_case('1984ZXSpectrumGames'), '1984 ZX Spectrum Games')
26 | eq_(un_camel_case('aaAa aaAaA 0aA AAAa!AAA'),
27 | 'aa Aa aa Aa A 0a A AA Aa! AAA')
28 | eq_(un_camel_case('MotörHead'),
29 | 'Mot\xf6r Head')
30 | eq_(un_camel_case('MSWindows3.11ForWorkgroups'),
31 | 'MS Windows 3.11 For Workgroups')
32 |
33 | # This should not significantly affect text that is not camel-cased
34 | eq_(un_camel_case('ACM_Computing_Classification_System'),
35 | 'ACM Computing Classification System')
36 | eq_(un_camel_case('Anne_Blunt,_15th_Baroness_Wentworth'),
37 | 'Anne Blunt, 15th Baroness Wentworth')
38 | eq_(un_camel_case('Hindi-Urdu'),
39 | 'Hindi-Urdu')
40 |
41 |
42 | def test_string_pieces():
43 | # Break as close to whitespace as possible
44 | text = "12 12 12345 123456 1234567-12345678"
45 | eq_(list(string_pieces(text, 6)),
46 | ['12 12 ', '12345 ', '123456', ' ', '123456', '7-', '123456', '78'])
47 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | version_str = '1.0c'
4 | from setuptools import setup
5 | import sys
6 |
7 | if sys.version_info.major == 2:
8 | nltk_version = 'nltk'
9 | else:
10 | nltk_version = 'nltk >= 3.0a'
11 |
12 | classifiers=[
13 | 'Intended Audience :: Developers',
14 | 'Intended Audience :: Science/Research',
15 | 'License :: OSI Approved :: MIT License',
16 | 'Natural Language :: English',
17 | 'Operating System :: MacOS',
18 | 'Operating System :: Microsoft :: Windows',
19 | 'Operating System :: POSIX',
20 | 'Operating System :: Unix',
21 | 'Programming Language :: C',
22 | 'Programming Language :: Python :: 2.7',
23 | 'Programming Language :: Python :: 3.3',
24 | 'Topic :: Scientific/Engineering',
25 | 'Topic :: Software Development',
26 | 'Topic :: Text Processing :: Linguistic',]
27 |
28 | import os
29 | README_contents = open(os.path.join(os.path.dirname(__file__), 'README.md')).read()
30 | doclines = README_contents.split("\n")
31 |
32 |
33 | deprecation_warning = """
34 |
35 | Note: metanl is no longer actively developed or supported.
36 |
37 | metanl was created to support the language-processing needs that ConceptNet
38 | 5 shared with code developed at Luminoso. Those needs have diverged, to the
39 | point where it made the most sense to split the functionality again.
40 |
41 | A simplified version of metanl has been moved into the `conceptnet5`
42 | package, as `conceptnet5.language`.
43 |
44 | """
45 | sys.stderr.write(deprecation_warning)
46 |
47 | setup(
48 | name="metanl",
49 | version=version_str,
50 | maintainer='Luminoso Technologies, Inc.',
51 | maintainer_email='dev@luminoso.com',
52 | url='http://github.com/commonsense/metanl/',
53 | license = "MIT",
54 | platforms = ["any"],
55 | description = doclines[0],
56 | classifiers = classifiers,
57 | long_description = "\n".join(doclines[2:]),
58 | packages=['metanl'],
59 | package_data = {'metanl': ['data/freeling/*.cfg', 'data/freeling/*.dat']},
60 | install_requires=[nltk_version, 'ftfy >= 3'],
61 | )
62 |
--------------------------------------------------------------------------------
/tests/test_extprocesses.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals
3 |
4 | from metanl.freeling import english, spanish
5 | from metanl.mecab import normalize, tag_and_stem
6 | from metanl.extprocess import unicode_is_punctuation
7 | from nose.tools import eq_
8 |
9 |
10 | def test_english():
11 | test_text = "This is a test.\n\nIt has two paragraphs, and that's okay."
12 | expected_result = [('this', 'DT', 'This'), ('be', 'VBZ', 'is'),
13 | ('a', 'DT', 'a'), ('test', 'NN', 'test'),
14 | ('.', '.', '.'), ('it', 'PRP', 'It'),
15 | ('have', 'VBZ', 'has'), ('two', 'DT', 'two'),
16 | ('paragraph', 'NNS', 'paragraphs'), (',', '.', ','),
17 | ('and', 'CC', 'and'), ('that', 'PRP', 'that'),
18 | ('be', 'VBZ', "'s"), ('okay', 'JJ', 'okay'),
19 | ('.', '.', '.')]
20 | eq_(english.tag_and_stem(test_text), expected_result)
21 |
22 | test_text = "this has\ntwo lines"
23 | expected_result = [('this', 'DT', 'this'), ('have', 'VBZ', 'has'),
24 | ('two', 'DT', 'two'), ('line', 'NNS', 'lines')]
25 | eq_(english.tag_and_stem(test_text), expected_result)
26 |
27 |
28 | def test_spanish():
29 | # Spanish works, even with a lot of unicode characters
30 | test_text = '¿Dónde está mi búfalo?'
31 | expected_result = [('¿', '.', '¿'),
32 | ('dónde', 'P', 'Dónde'),
33 | ('estar', 'V', 'está'),
34 | ('mi', 'D', 'mi'),
35 | ('búfalo', 'N', 'búfalo'),
36 | ('?', '.', '?')]
37 | eq_(spanish.tag_and_stem(test_text), expected_result)
38 |
39 |
40 | def test_japanese():
41 | eq_(normalize('これはテストです'), 'テスト')
42 | this_is_a_test = [('これ', '~名詞', 'これ'),
43 | ('は', '~助詞', 'は'),
44 | ('テスト', '名詞', 'テスト'),
45 | ('です', '~助動詞', 'です'),
46 | ('。', '.', '。')]
47 | eq_(tag_and_stem('これはテストです。'), this_is_a_test)
48 |
49 |
50 | def test_unicode_is_punctuation():
51 | assert unicode_is_punctuation('word') is False
52 | assert unicode_is_punctuation('。') is True
53 | assert unicode_is_punctuation('-') is True
54 | assert unicode_is_punctuation('-3') is False
55 | assert unicode_is_punctuation('あ') is False
56 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Multilingual natural language tools, wrapping NLTK and other systems.
2 |
3 | ## Deprecated as of June 2014
4 |
5 | `metanl` is no longer actively developed or supported.
6 |
7 | This package was created to support the language-processing needs that
8 | [ConceptNet 5](http://conceptnet5.media.mit.edu) shared with code developed at
9 | Luminoso. Those needs have diverged, to the point where it made the most sense
10 | to split the functionality again.
11 |
12 | A simplified version of metanl has been moved into the `conceptnet5`
13 | package, as `conceptnet5.language`.
14 |
15 |
16 | ## metanl.token_utils
17 |
18 | Utilities for working with tokens:
19 |
20 | - `tokenize` splits strings into tokens, using NLTK.
21 | - `untokenize` rejoins tokens into a correctly-spaced string, using ad-hoc
22 | rules that aim to invert what NLTK does.
23 | - `un_camel_case` splits a CamelCased string into tokens.
24 |
25 | These functions make assumptions that work best in English, work reasonably
26 | well in other Western languages, and fail utterly in languages that don't use
27 | spaces.
28 |
29 |
30 | ## metanl.nltk_morphy
31 |
32 | `nltk_morphy` is a lemmatizer (a stemmer with principles). It enables you to
33 | reduce words to their root form in English, using the Morphy algorithm that's
34 | built into WordNet, together with NLTK's part-of-speech tagger.
35 |
36 | Morphy works best with a known part of speech. In fact, the way it works in
37 | NLTK is pretty bad if you don't specify the part of speech. The `nltk_morphy`
38 | wrapper provides:
39 |
40 | - An alignment between the POS tags that `nltk.pos_tag` outputs, and the input
41 | that Morphy expects
42 | - A strategy for tagging words whose part of speech is unknown
43 | - A small list of exceptions, for cases where Morphy returns an unintuitive
44 | or wrong result
45 |
46 | ## metanl.extprocess
47 |
48 | Sometimes, the best available NLP tools are written in some other language
49 | besides Python. They may not provide a reasonable foreign function interface.
50 | What they do often provide is a command-line utility.
51 |
52 | `metanl.extprocess` provides abstractions over utilities that take in natural
53 | language, and output a token-by-token analysis. This is used by two other
54 | modules in `metanl`.
55 |
56 | ### metanl.freeling
57 |
58 | FreeLing is an NLP tool that can analyze many European languages, including
59 | English, Spanish, Italian, Portuguese, Welsh, and Russian. This module
60 | allows you to run FreeLing in a separate process, and use its analysis
61 | results in Python.
62 |
63 | ### metanl.mecab
64 |
65 | In Japanese, NLP analyzers are particularly important, because without one
66 | you don't even know where to split words.
67 |
68 | MeCab is the most commonly used analyzer for Japanese text. This module runs
69 | MeCab in an external process, allowing you to get its complete analysis
70 | results, or just use it to tokenize or lemmatize text.
71 |
72 | As part of MeCab's operation, it outputs the phonetic spellings of the words
73 | it finds, in kana. We use this to provide a wrapper function that can
74 | romanize any Japanese text.
75 |
76 |
--------------------------------------------------------------------------------
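To make the README's `metanl.mecab` description concrete, here is a small usage sketch. It mirrors the calls exercised in tests/test_extprocesses.py and assumes MeCab is installed as an external command; the romanization helper mentioned in the README is not shown in this dump, so it is left out here.

# -*- coding: utf-8 -*-
# Usage sketch for metanl.mecab; expected values follow tests/test_extprocesses.py.
from __future__ import print_function, unicode_literals

from metanl.mecab import normalize, tag_and_stem

# Keep only the content words of a Japanese sentence; particles and the
# copula are treated as stopwords and dropped.
print(normalize('これはテストです'))       # -> 'テスト'

# (stem, tag, token) triples for every token, including the ones that
# normalize() drops as stopwords.
print(tag_and_stem('これはテストです。'))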
/metanl/data/freeling/cy.cfg:
--------------------------------------------------------------------------------
1 | ##
2 | #### default configuration file for Welsh analyzer
3 | ##
4 |
5 | #### General options
6 | Lang=cy
7 | Locale=default
8 |
9 | #### Trace options. Only effective if we have compiled with -DVERBOSE
10 | ##
11 | #### Possible values for TraceModules (may be OR'ed)
12 | #define SPLIT_TRACE 0x00000001
13 | #define TOKEN_TRACE 0x00000002
14 | #define MACO_TRACE 0x00000004
15 | #define OPTIONS_TRACE 0x00000008
16 | #define NUMBERS_TRACE 0x00000010
17 | #define DATES_TRACE 0x00000020
18 | #define PUNCT_TRACE 0x00000040
19 | #define DICT_TRACE 0x00000080
20 | #define SUFF_TRACE 0x00000100
21 | #define LOCUT_TRACE 0x00000200
22 | #define NP_TRACE 0x00000400
23 | #define PROB_TRACE 0x00000800
24 | #define QUANT_TRACE 0x00001000
25 | #define NEC_TRACE 0x00002000
26 | #define AUTOMAT_TRACE 0x00004000
27 | #define TAGGER_TRACE 0x00008000
28 | #define HMM_TRACE 0x00010000
29 | #define RELAX_TRACE 0x00020000
30 | #define RELAX_TAGGER_TRACE 0x00040000
31 | #define CONST_GRAMMAR_TRACE 0x00080000
32 | #define SENSES_TRACE 0x00100000
33 | #define CHART_TRACE 0x00200000
34 | #define GRAMMAR_TRACE 0x00400000
35 | #define DEP_TRACE 0x00800000
36 | #define UTIL_TRACE 0x01000000
37 |
38 | TraceLevel=3
39 | TraceModule=0x0000
40 |
41 | ## Options to control the applied modules. The input may be partially
42 | ## processed, or a full analysis may not be wanted. The specific
43 | ## formats are a choice of the main program using the library, as well
44 | ## as the responsibility of calling only the required modules.
45 | ## Valid input/output formats are: plain, token, splitted, morfo, tagged, parsed
46 | InputFormat=plain
47 | OutputFormat=tagged
48 |
49 | # consider each newline as a sentence end
50 | AlwaysFlush=yes
51 |
52 | #### Tokenizer options
53 | TokenizerFile=$FREELINGSHARE/cy/tokenizer.dat
54 |
55 | #### Splitter options
56 | SplitterFile=$FREELINGSHARE/cy/splitter.dat
57 |
58 | #### Morfo options
59 | AffixAnalysis=yes
60 | MultiwordsDetection=no
61 | NumbersDetection=no
62 | PunctuationDetection=yes
63 | DatesDetection=no
64 | QuantitiesDetection=no
65 | DictionarySearch=yes
66 | ProbabilityAssignment=yes
67 | OrthographicCorrection=no
68 | DecimalPoint=.
69 | ThousandPoint=,
70 | LocutionsFile=$FREELINGSHARE/cy/locucions.dat
71 | QuantitiesFile=$FREELINGSHARE/cy/quantities.dat
72 | AffixFile=$FREELINGSHARE/cy/afixos.dat
73 | ProbabilityFile=$FREELINGSHARE/cy/probabilitats.dat
74 | DictionaryFile=$FREELINGSHARE/cy/dicc.src
75 | PunctuationFile=$FREELINGSHARE/common/punct.dat
76 | ProbabilityThreshold=0.001
77 | # NER options
78 | NERecognition=no
79 | NPDataFile=$FREELINGSHARE/cy/np.dat
80 |
81 | #Spelling Corrector config file
82 | CorrectorFile=$FREELINGSHARE/cy/corrector/corrector.dat
83 |
84 | ## Phonetic encoding of words.
85 | Phonetics=no
86 | PhoneticsFile=$FREELINGSHARE/cy/phonetics.dat
87 |
88 | ## NEC options
89 | NEClassification=no
90 | NECFile=$FREELINGSHARE/cy/nec/nec-ab.dat
91 |
92 | ## Sense annotation options (none,all,mfs)
93 | SenseAnnotation=none
94 | SenseConfigFile=$FREELINGSHARE/cy/senses.dat
95 | UKBConfigFile=$FREELINGSHARE/cy/ukb.dat
96 |
97 | #### Tagger options
98 | Tagger=hmm
99 | TaggerHMMFile=$FREELINGSHARE/cy/tagger.dat
100 | TaggerRelaxFile=$FREELINGSHARE/cy/constr_gram.dat
101 | TaggerRelaxMaxIter=500
102 | TaggerRelaxScaleFactor=670.0
103 | TaggerRelaxEpsilon=0.001
104 | TaggerRetokenize=no
105 | TaggerForceSelect=none
106 |
107 | #### Parser options
108 | GrammarFile=$FREELINGSHARE/cy/chunker/grammar-chunk.dat
109 |
110 | #### Dependence Parser options
111 | DepTxalaFile=$FREELINGSHARE/cy/dep/dependences.dat
112 |
113 | #### Coreference Solver options
114 | CoreferenceResolution=no
115 | CorefFile=$FREELINGSHARE/cy/coref/coref.dat
116 |
--------------------------------------------------------------------------------
/metanl/data/freeling/it.cfg:
--------------------------------------------------------------------------------
1 | ##
2 | #### default configuration file for Italian analyzer
3 | ##
4 |
5 | #### General options
6 | Lang=it
7 | Locale=default
8 |
9 | #### Trace options. Only effective if we have compiled with -DVERBOSE
10 | #
11 | ## Possible values for TraceModule (may be OR'ed)
12 | #define SPLIT_TRACE 0x00000001
13 | #define TOKEN_TRACE 0x00000002
14 | #define MACO_TRACE 0x00000004
15 | #define OPTIONS_TRACE 0x00000008
16 | #define NUMBERS_TRACE 0x00000010
17 | #define DATES_TRACE 0x00000020
18 | #define PUNCT_TRACE 0x00000040
19 | #define DICT_TRACE 0x00000080
20 | #define SUFF_TRACE 0x00000100
21 | #define LOCUT_TRACE 0x00000200
22 | #define NP_TRACE 0x00000400
23 | #define PROB_TRACE 0x00000800
24 | #define QUANT_TRACE 0x00001000
25 | #define NEC_TRACE 0x00002000
26 | #define AUTOMAT_TRACE 0x00004000
27 | #define TAGGER_TRACE 0x00008000
28 | #define HMM_TRACE 0x00010000
29 | #define RELAX_TRACE 0x00020000
30 | #define RELAX_TAGGER_TRACE 0x00040000
31 | #define CONST_GRAMMAR_TRACE 0x00080000
32 | #define SENSES_TRACE 0x00100000
33 | #define CHART_TRACE 0x00200000
34 | #define GRAMMAR_TRACE 0x00400000
35 | #define DEP_TRACE 0x00800000
36 | #define UTIL_TRACE 0x01000000
37 |
38 | TraceLevel=3
39 | TraceModule=0x0000
40 |
41 | ## Options to control the applied modules. The input may be partially
42 | ## processed, or a full analysis may not be wanted. The specific
43 | ## formats are a choice of the main program using the library, as well
44 | ## as the responsibility of calling only the required modules.
45 | ## Valid input/output formats are: plain, token, splitted, morfo, tagged, parsed
46 | InputFormat=plain
47 | OutputFormat=tagged
48 |
49 | # consider each newline as a sentence end
50 | AlwaysFlush=yes
51 |
52 | #### Tokenizer options
53 | TokenizerFile=$FREELINGSHARE/it/tokenizer.dat
54 |
55 | #### Splitter options
56 | SplitterFile=$FREELINGSHARE/it/splitter.dat
57 |
58 | #### Morfo options
59 | AffixAnalysis=yes
60 | MultiwordsDetection=no
61 | NumbersDetection=no
62 | PunctuationDetection=yes
63 | DatesDetection=no
64 | QuantitiesDetection=no
65 | DictionarySearch=yes
66 | ProbabilityAssignment=yes
67 | OrthographicCorrection=no
68 | DecimalPoint=,
69 | ThousandPoint=.
70 | LocutionsFile=$FREELINGSHARE/it/locucions.dat
71 | QuantitiesFile=$FREELINGSHARE/common/quantities_default.dat
72 | AffixFile=$FREELINGSHARE/it/afixos.dat
73 | ProbabilityFile=$FREELINGSHARE/it/probabilitats.dat
74 | NPDataFile=$FREELINGSHARE/it/np.dat
75 | PunctuationFile=$FREELINGSHARE/common/punct.dat
76 | ProbabilityThreshold=0.001
77 | # NER options
78 | NERecognition=no
79 | DictionaryFile=$FREELINGSHARE/it/dicc.src
80 |
81 | #Spelling Corrector config file
82 | CorrectorFile=$FREELINGSHARE/it/corrector/corrector.dat
83 |
84 | ## Phonetic encoding of words.
85 | Phonetics=no
86 | PhoneticsFile=$FREELINGSHARE/it/phonetics.dat
87 |
88 | ## NEC options
89 | NEClassification=no
90 | NECFile=$FREELINGSHARE/it/nec/nec-ab.dat
91 |
92 | ## Sense annotation options (none,all,mfs)
93 | SenseAnnotation=none
94 | SenseConfigFile=$FREELINGSHARE/it/senses.dat
95 | UKBConfigFile=$FREELINGSHARE/it/ukb.dat
96 |
97 | #### Tagger options
98 | Tagger=hmm
99 | TaggerHMMFile=$FREELINGSHARE/it/tagger.dat
100 | TaggerRelaxFile=$FREELINGSHARE/it/constr_gram.dat
101 | TaggerRelaxMaxIter=500
102 | TaggerRelaxScaleFactor=670.0
103 | TaggerRelaxEpsilon=0.001
104 | TaggerRetokenize=yes
105 | TaggerForceSelect=tagger
106 |
107 | #### Parser options
108 | GrammarFile=$FREELINGSHARE/it/chunker/grammar-chunk.dat
109 |
110 | #### Dependence Parser options
111 | DepTxalaFile=$FREELINGSHARE/it/dep/dependences.dat
112 |
113 | #### Coreference Solver options
114 | CoreferenceResolution=no
115 | CorefFile=$FREELINGSHARE/it/coref/coref.dat
116 |
--------------------------------------------------------------------------------
/metanl/data/freeling/ru.cfg:
--------------------------------------------------------------------------------
1 | ##
2 | #### default configuration file for Russian analyzer
3 | ##
4 |
5 | #### General options
6 | Lang=ru
7 | Locale=default
8 |
9 | #### Trace options. Only effective if we have compiled with -DVERBOSE
10 | ##
11 | #### Possible values for TraceModules (may be OR'ed)
12 | #define SPLIT_TRACE 0x00000001
13 | #define TOKEN_TRACE 0x00000002
14 | #define MACO_TRACE 0x00000004
15 | #define OPTIONS_TRACE 0x00000008
16 | #define NUMBERS_TRACE 0x00000010
17 | #define DATES_TRACE 0x00000020
18 | #define PUNCT_TRACE 0x00000040
19 | #define DICT_TRACE 0x00000080
20 | #define SUFF_TRACE 0x00000100
21 | #define LOCUT_TRACE 0x00000200
22 | #define NP_TRACE 0x00000400
23 | #define PROB_TRACE 0x00000800
24 | #define QUANT_TRACE 0x00001000
25 | #define NEC_TRACE 0x00002000
26 | #define AUTOMAT_TRACE 0x00004000
27 | #define TAGGER_TRACE 0x00008000
28 | #define HMM_TRACE 0x00010000
29 | #define RELAX_TRACE 0x00020000
30 | #define RELAX_TAGGER_TRACE 0x00040000
31 | #define CONST_GRAMMAR_TRACE 0x00080000
32 | #define SENSES_TRACE 0x00100000
33 | #define CHART_TRACE 0x00200000
34 | #define GRAMMAR_TRACE 0x00400000
35 | #define DEP_TRACE 0x00800000
36 | #define UTIL_TRACE 0x01000000
37 |
38 | TraceLevel=3
39 | TraceModule=0x0000
40 |
41 | ## Options to control the applied modules. The input may be partially
42 | ## processed, or a full analysis may not be wanted. The specific
43 | ## formats are a choice of the main program using the library, as well
44 | ## as the responsibility of calling only the required modules.
45 | ## Valid input/output formats are: plain, token, splitted, morfo, tagged, parsed
46 | InputFormat=plain
47 | OutputFormat=tagged
48 |
49 | # consider each newline as a sentence end
50 | AlwaysFlush=yes
51 |
52 | #### Tokenizer options
53 | TokenizerFile=$FREELINGSHARE/ru/tokenizer.dat
54 |
55 | #### Splitter options
56 | SplitterFile=$FREELINGSHARE/ru/splitter.dat
57 |
58 | #### Morfo options
59 | AffixAnalysis=no
60 | MultiwordsDetection=no
61 | NumbersDetection=no
62 | PunctuationDetection=no
63 | DatesDetection=no
64 | QuantitiesDetection=no
65 | DictionarySearch=yes
66 | ProbabilityAssignment=yes
67 | OrthographicCorrection=no
68 | DecimalPoint=.
69 | ThousandPoint=,
70 | LocutionsFile=$FREELINGSHARE/ru/locucions.dat
71 | QuantitiesFile=$FREELINGSHARE/common/quantities_default.dat
72 | AffixFile=$FREELINGSHARE/ru/afixos.dat
73 | ProbabilityFile=$FREELINGSHARE/ru/probabilitats.dat
74 | DictionaryFile=$FREELINGSHARE/ru/dicc.src
75 | PunctuationFile=$FREELINGSHARE/common/punct.dat
76 | ProbabilityThreshold=0.001
77 | # NER options
78 | NERecognition=no
79 | NPDataFile=$FREELINGSHARE/ru/np.dat
80 |
81 | #Spelling Corrector config file
82 | CorrectorFile=$FREELINGSHARE/ru/corrector/corrector.dat
83 |
84 | ## Phonetic encoding of words.
85 | Phonetics=no
86 | PhoneticsFile=$FREELINGSHARE/ru/phonetics.dat
87 |
88 | ## NEC options
89 | NEClassification=no
90 | NECFile=$FREELINGSHARE/ru/nec/nec-ab.dat
91 |
92 | ## Sense annotation options (none,all,mfs,ukb)
93 | SenseAnnotation=none
94 | SenseConfigFile=$FREELINGSHARE/ru/senses.dat
95 | UKBConfigFile=$FREELINGSHARE/ru/ukb.dat
96 |
97 | #### Tagger options
98 | #Tagger=relax
99 | Tagger=hmm
100 | TaggerHMMFile=$FREELINGSHARE/ru/tagger.dat
101 | TaggerRelaxFile=$FREELINGSHARE/ru/constr_gram.dat
102 | TaggerRelaxMaxIter=500
103 | TaggerRelaxScaleFactor=670.0
104 | TaggerRelaxEpsilon=0.001
105 | TaggerRetokenize=yes
106 | TaggerForceSelect=tagger
107 |
108 | #### Parser options
109 | GrammarFile=$FREELINGSHARE/ru/chunker/grammar-chunk.dat
110 |
111 | #### Dependence Parser options
112 | DepTxalaFile=$FREELINGSHARE/ru/dep/dependences.dat
113 |
114 | #### Coreference Solver options
115 | CoreferenceResolution=no
116 | CorefFile=$FREELINGSHARE/ru/coref/coref.dat
117 |
--------------------------------------------------------------------------------
/metanl/data/freeling/es.cfg:
--------------------------------------------------------------------------------
1 | ##
2 | #### default configuration file for Spanish analyzer
3 | ##
4 |
5 | #### General options
6 | Lang=es
7 | Locale=default
8 |
9 | #### Trace options. Only effective if we have compiled with -DVERBOSE
10 | #
11 | ## Possible values for TraceModule (may be OR'ed)
12 | #define SPLIT_TRACE 0x00000001
13 | #define TOKEN_TRACE 0x00000002
14 | #define MACO_TRACE 0x00000004
15 | #define OPTIONS_TRACE 0x00000008
16 | #define NUMBERS_TRACE 0x00000010
17 | #define DATES_TRACE 0x00000020
18 | #define PUNCT_TRACE 0x00000040
19 | #define DICT_TRACE 0x00000080
20 | #define SUFF_TRACE 0x00000100
21 | #define LOCUT_TRACE 0x00000200
22 | #define NP_TRACE 0x00000400
23 | #define PROB_TRACE 0x00000800
24 | #define QUANT_TRACE 0x00001000
25 | #define NEC_TRACE 0x00002000
26 | #define AUTOMAT_TRACE 0x00004000
27 | #define TAGGER_TRACE 0x00008000
28 | #define HMM_TRACE 0x00010000
29 | #define RELAX_TRACE 0x00020000
30 | #define RELAX_TAGGER_TRACE 0x00040000
31 | #define CONST_GRAMMAR_TRACE 0x00080000
32 | #define SENSES_TRACE 0x00100000
33 | #define CHART_TRACE 0x00200000
34 | #define GRAMMAR_TRACE 0x00400000
35 | #define DEP_TRACE 0x00800000
36 | #define UTIL_TRACE 0x01000000
37 |
38 | TraceLevel=3
39 | TraceModule=0x0000
40 |
41 | ## Options to control the applied modules. The input may be partially
42 | ## processed, or a full analysis may not be wanted. The specific
43 | ## formats are a choice of the main program using the library, as well
44 | ## as the responsibility of calling only the required modules.
45 | ## Valid input/output formats are: plain, token, splitted, morfo, tagged, parsed
46 | InputFormat=plain
47 | OutputFormat=tagged
48 |
49 | # consider each newline as a sentence end
50 | AlwaysFlush=yes
51 |
52 | #### Tokenizer options
53 | TokenizerFile=$FREELINGSHARE/es/tokenizer.dat
54 |
55 | #### Splitter options
56 | SplitterFile=$FREELINGSHARE/es/splitter.dat
57 |
58 | #### Morfo options
59 | AffixAnalysis=yes
60 | MultiwordsDetection=no
61 | NumbersDetection=no
62 | PunctuationDetection=yes
63 | DatesDetection=no
64 | QuantitiesDetection=no
65 | DictionarySearch=yes
66 | ProbabilityAssignment=yes
67 | OrthographicCorrection=no
68 | DecimalPoint=,
69 | ThousandPoint=.
70 | LocutionsFile=$FREELINGSHARE/es/locucions.dat
71 | QuantitiesFile=$FREELINGSHARE/es/quantities.dat
72 | AffixFile=$FREELINGSHARE/es/afixos.dat
73 | ProbabilityFile=$FREELINGSHARE/es/probabilitats.dat
74 | DictionaryFile=$FREELINGSHARE/es/dicc.src
75 | PunctuationFile=$FREELINGSHARE/common/punct.dat
76 | ProbabilityThreshold=0.001
77 |
78 | # NER options
79 | NERecognition=no
80 | NPDataFile=$FREELINGSHARE/es/np.dat
81 | ## comment the line above and uncomment the one below, if you want
82 | ## a better NE recognizer (higher accuracy, lower speed)
83 | #NPDataFile=$FREELINGSHARE/es/ner/ner-ab.dat
84 |
85 | #Spelling Corrector config file
86 | CorrectorFile=$FREELINGSHARE/es/corrector/corrector.dat
87 |
88 | ## Phonetic encoding of words.
89 | Phonetics=no
90 | PhoneticsFile=$FREELINGSHARE/es/phonetics.dat
91 |
92 | ## NEC options
93 | NEClassification=no
94 | NECFile=$FREELINGSHARE/es/nec/nec-svm.dat
95 |
96 | ## Sense annotation options (none,all,mfs,ukb)
97 | SenseAnnotation=none
98 | SenseConfigFile=$FREELINGSHARE/es/senses.dat
99 | UKBConfigFile=$FREELINGSHARE/es/ukb.dat
100 |
101 | #### Tagger options
102 | Tagger=hmm
103 | TaggerHMMFile=$FREELINGSHARE/es/tagger.dat
104 | TaggerRelaxFile=$FREELINGSHARE/es/constr_gram-B.dat
105 | TaggerRelaxMaxIter=500
106 | TaggerRelaxScaleFactor=670.0
107 | TaggerRelaxEpsilon=0.001
108 | TaggerRetokenize=yes
109 | TaggerForceSelect=tagger
110 |
111 | #### Parser options
112 | GrammarFile=$FREELINGSHARE/es/chunker/grammar-chunk.dat
113 |
114 | #### Dependence Parser options
115 | DepTxalaFile=$FREELINGSHARE/es/dep/dependences.dat
116 |
117 | #### Coreference Solver options
118 | CoreferenceResolution=no
119 | CorefFile=$FREELINGSHARE/es/coref/coref.dat
120 |
--------------------------------------------------------------------------------
/metanl/data/freeling/en.cfg:
--------------------------------------------------------------------------------
1 | ##
2 | #### default configuration file for English analyzer
3 | ##
4 |
5 | #### General options
6 | Lang=en
7 | Locale=default
8 |
9 | #### Trace options. Only effective if we have compiled with -DVERBOSE
10 | ##
11 | #### Possible values for TraceModules (may be OR'ed)
12 | #define SPLIT_TRACE 0x00000001
13 | #define TOKEN_TRACE 0x00000002
14 | #define MACO_TRACE 0x00000004
15 | #define OPTIONS_TRACE 0x00000008
16 | #define NUMBERS_TRACE 0x00000010
17 | #define DATES_TRACE 0x00000020
18 | #define PUNCT_TRACE 0x00000040
19 | #define DICT_TRACE 0x00000080
20 | #define SUFF_TRACE 0x00000100
21 | #define LOCUT_TRACE 0x00000200
22 | #define NP_TRACE 0x00000400
23 | #define PROB_TRACE 0x00000800
24 | #define QUANT_TRACE 0x00001000
25 | #define NEC_TRACE 0x00002000
26 | #define AUTOMAT_TRACE 0x00004000
27 | #define TAGGER_TRACE 0x00008000
28 | #define HMM_TRACE 0x00010000
29 | #define RELAX_TRACE 0x00020000
30 | #define RELAX_TAGGER_TRACE 0x00040000
31 | #define CONST_GRAMMAR_TRACE 0x00080000
32 | #define SENSES_TRACE 0x00100000
33 | #define CHART_TRACE 0x00200000
34 | #define GRAMMAR_TRACE 0x00400000
35 | #define DEP_TRACE 0x00800000
36 | #define UTIL_TRACE 0x01000000
37 |
38 | TraceLevel=3
39 | TraceModule=0x0000
40 |
41 | ## Options to control the applied modules. The input may be partially
42 | ## processed, or a full analysis may not be wanted. The specific
43 | ## formats are a choice of the main program using the library, as well
44 | ## as the responsibility of calling only the required modules.
45 | ## Valid input/output formats are: plain, token, splitted, morfo, tagged, parsed
46 | InputFormat=plain
47 | OutputFormat=tagged
48 |
49 | # consider each newline as a sentence end
50 | AlwaysFlush=yes
51 |
52 | #### Tokenizer options
53 | TokenizerFile=$FREELINGSHARE/en/tokenizer.dat
54 |
55 | #### Splitter options
56 | SplitterFile=$FREELINGSHARE/en/splitter.dat
57 |
58 | #### Morfo options
59 | AffixAnalysis=yes
60 | MultiwordsDetection=no
61 | NumbersDetection=no
62 | PunctuationDetection=yes
63 | DatesDetection=no
64 | QuantitiesDetection=no
65 | DictionarySearch=yes
66 | ProbabilityAssignment=yes
67 | OrthographicCorrection=no
68 | DecimalPoint=.
69 | ThousandPoint=,
70 | LocutionsFile=$FREELINGSHARE/en/locucions.dat
71 | QuantitiesFile=$FREELINGSHARE/en/quantities.dat
72 | AffixFile=$FREELINGSHARE/en/afixos.dat
73 | ProbabilityFile=$FREELINGSHARE/en/probabilitats.dat
74 | DictionaryFile=$FREELINGSHARE/en/dicc.src
75 | PunctuationFile=$FREELINGSHARE/common/punct.dat
76 | ProbabilityThreshold=0.001
77 |
78 | # NER options
79 | NERecognition=no
80 | NPDataFile=$FREELINGSHARE/en/np.dat
81 | ## --- comment lines above and uncomment those below, if you want
82 | ## --- a better NE recognizer (higher accuracy, lower speed)
83 | #NPDataFile=$FREELINGSHARE/en/ner/ner-ab.dat
84 |
85 | #Spelling Corrector config file
86 | CorrectorFile=$FREELINGSHARE/en/corrector/corrector.dat
87 |
88 | ## Phonetic encoding of words.
89 | Phonetics=no
90 | PhoneticsFile=$FREELINGSHARE/en/phonetics.dat
91 |
92 | ## NEC options
93 | NEClassification=no
94 | NECFile=$FREELINGSHARE/en/nec/nec-svm.dat
95 |
96 | ## Sense annotation options (none,all,mfs,ukb)
97 | SenseAnnotation=none
98 | SenseConfigFile=$FREELINGSHARE/en/senses.dat
99 | UKBConfigFile=$FREELINGSHARE/en/ukb.dat
100 |
101 | #### Tagger options
102 | #Tagger=relax
103 | Tagger=hmm
104 | TaggerHMMFile=$FREELINGSHARE/en/tagger.dat
105 | TaggerRelaxFile=$FREELINGSHARE/en/constr_gram-B.dat
106 | TaggerRelaxMaxIter=500
107 | TaggerRelaxScaleFactor=670.0
108 | TaggerRelaxEpsilon=0.001
109 | TaggerRetokenize=yes
110 | TaggerForceSelect=tagger
111 |
112 | #### Parser options
113 | GrammarFile=$FREELINGSHARE/en/chunker/grammar-chunk.dat
114 |
115 | #### Dependence Parser options
116 | DepTxalaFile=$FREELINGSHARE/en/dep/dependences.dat
117 |
118 | #### Coreference Solver options
119 | CoreferenceResolution=no
120 | CorefFile=$FREELINGSHARE/en/coref/coref.dat
121 |
--------------------------------------------------------------------------------
/metanl/data/freeling/pt.cfg:
--------------------------------------------------------------------------------
1 | ##
2 | #### default configuration file for Portuguese analyzer
3 | ##
4 |
5 | #### General options
6 | Lang=pt
7 | Locale=default
8 |
9 | #### Trace options. Only effective if we have compiled with -DVERBOSE
10 | #
11 | ## Possible values for TraceModule (may be OR'ed)
12 | #define SPLIT_TRACE 0x00000001
13 | #define TOKEN_TRACE 0x00000002
14 | #define MACO_TRACE 0x00000004
15 | #define OPTIONS_TRACE 0x00000008
16 | #define NUMBERS_TRACE 0x00000010
17 | #define DATES_TRACE 0x00000020
18 | #define PUNCT_TRACE 0x00000040
19 | #define DICT_TRACE 0x00000080
20 | #define SUFF_TRACE 0x00000100
21 | #define LOCUT_TRACE 0x00000200
22 | #define NP_TRACE 0x00000400
23 | #define PROB_TRACE 0x00000800
24 | #define QUANT_TRACE 0x00001000
25 | #define NEC_TRACE 0x00002000
26 | #define AUTOMAT_TRACE 0x00004000
27 | #define TAGGER_TRACE 0x00008000
28 | #define HMM_TRACE 0x00010000
29 | #define RELAX_TRACE 0x00020000
30 | #define RELAX_TAGGER_TRACE 0x00040000
31 | #define CONST_GRAMMAR_TRACE 0x00080000
32 | #define SENSES_TRACE 0x00100000
33 | #define CHART_TRACE 0x00200000
34 | #define GRAMMAR_TRACE 0x00400000
35 | #define DEP_TRACE 0x00800000
36 | #define UTIL_TRACE 0x01000000
37 |
38 | TraceLevel=3
39 | TraceModule=0x0000
40 |
41 | ## Options to control the applied modules. The input may be partially
42 | ## processed, or a full analysis may not be wanted. The specific
43 | ## formats are a choice of the main program using the library, as well
44 | ## as the responsibility of calling only the required modules.
45 | ## Valid input/output formats are: plain, token, splitted, morfo, tagged, parsed
46 | InputFormat=plain
47 | OutputFormat=tagged
48 |
49 | # consider each newline as a sentence end
50 | AlwaysFlush=yes
51 |
52 | #### Tokenizer options
53 | TokenizerFile=$FREELINGSHARE/pt/tokenizer.dat
54 |
55 | #### Splitter options
56 | SplitterFile=$FREELINGSHARE/pt/splitter.dat
57 | RetokContractions=false
58 |
59 | #### Morfo options
60 | AffixAnalysis=yes
61 | MultiwordsDetection=no
62 | NumbersDetection=no
63 | PunctuationDetection=yes
64 | DatesDetection=no
65 | QuantitiesDetection=no
66 | DictionarySearch=yes
67 | ProbabilityAssignment=yes
68 | OrthographicCorrection=no
69 | DecimalPoint=,
70 | ThousandPoint=.
71 | LocutionsFile=$FREELINGSHARE/pt/locucions.dat
72 | QuantitiesFile=$FREELINGSHARE/common/quantities_default.dat
73 | AffixFile=$FREELINGSHARE/pt/afixos.dat
74 | ProbabilityFile=$FREELINGSHARE/pt/probabilitats.dat
75 | DictionaryFile=$FREELINGSHARE/pt/dicc.src
76 | PunctuationFile=$FREELINGSHARE/common/punct.dat
77 | ProbabilityThreshold=0.001
78 |
79 | #NER options
80 | NERecognition=no
81 | NPDataFile=$FREELINGSHARE/pt/np.dat
82 | ## --- comment lines above and uncomment those below, if you want
83 | ## --- a better NE recognizer (higher accuracy, lower speed)
84 | #NPDataFile=$FREELINGSHARE/pt/ner/ner-ab.dat
85 |
86 | #Spelling Corrector config file
87 | CorrectorFile=$FREELINGSHARE/pt/corrector/corrector.dat
88 |
89 | ## Phonetic encoding of words.
90 | Phonetics=no
91 | PhoneticsFile=$FREELINGSHARE/pt/phonetics.dat
92 |
93 | ## NEC options
94 | NEClassification=no
95 | NECFile=$FREELINGSHARE/pt/nec/nec-ab.dat
96 |
97 | ## Sense annotation options (none,all,mfs)
98 | SenseAnnotation=none
99 | SenseConfigFile=$FREELINGSHARE/pt/senses.dat
100 | UKBConfigFile=$FREELINGSHARE/pt/ukb.dat
101 |
102 | #### Tagger options
103 | Tagger=hmm
104 | TaggerHMMFile=$FREELINGSHARE/pt/tagger.dat
105 | TaggerRelaxFile=$FREELINGSHARE/pt/constr_gram.dat
106 | TaggerRelaxMaxIter=500
107 | TaggerRelaxScaleFactor=670.0
108 | TaggerRelaxEpsilon=0.001
109 | TaggerRetokenize=yes
110 | TaggerForceSelect=tagger
111 |
112 | #### Parser options
113 | GrammarFile=$FREELINGSHARE/pt/chunker/grammar-chunk.dat
114 |
115 | #### Dependence Parser options
116 | DepTxalaFile=$FREELINGSHARE/pt/dep/dependences.dat
117 |
118 | #### Coreference Solver options
119 | CoreferenceResolution=no
120 | CorefFile=$FREELINGSHARE/pt/coref/coref.dat
121 |
--------------------------------------------------------------------------------
/metanl/freeling.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 |
3 | import pkg_resources
4 | from metanl.extprocess import ProcessWrapper, ProcessError, render_safe
5 |
6 |
7 | class FreelingWrapper(ProcessWrapper):
8 | r"""
9 |     Handle English, Spanish, Italian, Portuguese, Russian, or Welsh text by
10 |     calling an installed copy of FreeLing.
11 |
12 | The constructor takes one argument, which is the installed filename of the
13 | language-specific config file, such as 'en.cfg'.
14 |
15 | >>> english.tag_and_stem("This is a test.\n\nIt has two paragraphs, and that's okay.")
16 | [('this', 'DT', 'This'), ('be', 'VBZ', 'is'), ('a', 'DT', 'a'), ('test', 'NN', 'test'), ('.', '.', '.'), ('it', 'PRP', 'It'), ('have', 'VBZ', 'has'), ('two', 'DT', 'two'), ('paragraph', 'NNS', 'paragraphs'), (',', '.', ','), ('and', 'CC', 'and'), ('that', 'PRP', 'that'), ('be', 'VBZ', "'s"), ('okay', 'JJ', 'okay'), ('.', '.', '.')]
17 |
18 | >>> english.tag_and_stem("this has\ntwo lines")
19 | [('this', 'DT', 'this'), ('have', 'VBZ', 'has'), ('two', 'DT', 'two'), ('line', 'NNS', 'lines')]
20 |
21 | """
22 | def __init__(self, lang):
23 | self.lang = lang
24 | self.configfile = pkg_resources.resource_filename(
25 | __name__, 'data/freeling/%s.cfg' % lang)
26 | self.splitterfile = pkg_resources.resource_filename(
27 | __name__, 'data/freeling/generic_splitter.dat')
28 |
29 | def _get_command(self):
30 | """
31 | Get the command for running the basic FreeLing pipeline in the
32 | specified language.
33 |
34 | The options we choose are:
35 |
36 |         -f data/freeling/<lang>.cfg
37 | load our custom configuration for the language
38 | --fsplit data/freeling/generic_splitter.dat
39 | don't do any special handling of ends of sentences
40 | """
41 | return ['analyze', '-f', self.configfile, '--fsplit',
42 | self.splitterfile]
43 |
44 | def get_record_root(self, record):
45 | """
46 | Given a FreeLing record, return the root word.
47 | """
48 | return record[1].lower()
49 |
50 | def get_record_token(self, record):
51 | """
52 | The token of a FreeLing record is the first item on the line.
53 | """
54 | return record[0]
55 |
56 | def get_record_pos(self, record):
57 | """
58 | In English, return the third segment of the record.
59 |
60 | In other languages, this segment contains one letter for the part of
61 | speech, plus densely-encoded features that we really have no way to
62 | use. Return just the part-of-speech letter.
63 | """
64 | if self.lang == 'en':
65 | return record[2]
66 | else:
67 | return record[2][0]
68 |
69 | def is_stopword_record(self, record, common_words=False):
70 | """
71 | Determiners are stopwords. Detect this by checking whether their POS
72 | starts with 'D'.
73 | """
74 | return (record[2][0] == 'D')
75 |
76 | def analyze(self, text):
77 | """
78 | Run text through the external process, and get a list of lists
79 | ("records") that contain the analysis of each word.
80 | """
81 | try:
82 | text = render_safe(text).strip()
83 | if not text:
84 | return []
85 | chunks = text.split('\n')
86 | results = []
87 | for chunk_text in chunks:
88 | if chunk_text.strip():
89 | textbytes = (chunk_text + '\n').encode('utf-8')
90 | self.send_input(textbytes)
91 | out_line = ''
92 | while True:
93 | out_line = self.receive_output_line()
94 | out_line = out_line.decode('utf-8')
95 |
96 | if out_line == '\n':
97 | break
98 |
99 | record = out_line.strip('\n').split(' ')
100 | results.append(record)
101 | return results
102 | except ProcessError:
103 | self.restart_process()
104 | return self.analyze(text)
105 |
106 |
107 | LANGUAGES = {}
108 | english = LANGUAGES['en'] = FreelingWrapper('en')
109 | spanish = LANGUAGES['es'] = FreelingWrapper('es')
110 | italian = LANGUAGES['it'] = FreelingWrapper('it')
111 | portuguese = LANGUAGES['pt'] = FreelingWrapper('pt')
112 | russian = LANGUAGES['ru'] = FreelingWrapper('ru')
113 | welsh = LANGUAGES['cy'] = FreelingWrapper('cy')
114 |
--------------------------------------------------------------------------------
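FreelingWrapper above is one concrete subclass of the ProcessWrapper base class that the README's `metanl.extprocess` section describes. As a sketch of the same pattern, here is a minimal wrapper for a hypothetical line-oriented tagger called `mytagger` (the command and its tab-separated output format are invented for this example); it uses only methods that appear in extprocess.py.

# A minimal ProcessWrapper subclass, following the pattern of FreelingWrapper.
# The external command `mytagger` is hypothetical: it is assumed to read one
# line of text on stdin and write one "token<TAB>lemma<TAB>POS" line per
# token, followed by a blank line.
from metanl.extprocess import ProcessWrapper, ProcessError, render_safe


class MyTaggerWrapper(ProcessWrapper):
    def _get_command(self):
        # Command used to start the long-running external process.
        return ['mytagger', '--tab-separated']

    def get_record_token(self, record):
        # The surface form is the first field of each output line.
        return record[0]

    def get_record_root(self, record):
        # The lemma is the second field.
        return record[1].lower()

    def analyze(self, text):
        # Send one chunk of text, then read records until the blank line
        # that marks the end of the analysis.
        try:
            text = render_safe(text).strip()
            if not text:
                return []
            self.send_input((text + '\n').encode('utf-8'))
            records = []
            while True:
                line = self.receive_output_line().decode('utf-8')
                if line == '\n':
                    break
                records.append(line.rstrip('\n').split('\t'))
            return records
        except ProcessError:
            # If the process died, restart it and try again, as
            # FreelingWrapper.analyze does above.
            self.restart_process()
            return self.analyze(text)

With these methods defined, the inherited `tokenize_list` from ProcessWrapper works unchanged.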
/metanl/token_utils.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from __future__ import unicode_literals
3 | """
4 | This file contains some generally useful operations you would perform to
5 | separate and join tokens. The tools apply most to English, but should also
6 | be able to do their job in any Western language that uses spaces.
7 | """
8 |
9 | import re
10 | import unicodedata
11 |
12 |
13 | def tokenize(text):
14 | """
15 | Split a text into tokens (words, morphemes we can separate such as
16 | "n't", and punctuation).
17 | """
18 | return list(_tokenize_gen(text))
19 |
20 |
21 | def _tokenize_gen(text):
22 | import nltk
23 | for sent in nltk.sent_tokenize(text):
24 | for word in nltk.word_tokenize(sent):
25 | yield word
26 |
27 |
28 | def untokenize(words):
29 | """
30 | Untokenizing a text undoes the tokenizing operation, restoring
31 | punctuation and spaces to the places that people expect them to be.
32 |
33 | Ideally, `untokenize(tokenize(text))` should be identical to `text`,
34 | except for line breaks.
35 | """
36 | text = ' '.join(words)
37 | step1 = text.replace("`` ", '"').replace(" ''", '"').replace('. . .', '...')
38 | step2 = step1.replace(" ( ", " (").replace(" ) ", ") ")
39 | step3 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step2)
40 | step4 = re.sub(r' ([.,:;?!%]+)$', r"\1", step3)
41 | step5 = step4.replace(" '", "'").replace(" n't", "n't").replace(
42 | "can not", "cannot")
43 | step6 = step5.replace(" ` ", " '")
44 | return step6.strip()
45 |
46 |
47 | # This expression scans through a reversed string to find segments of
48 | # camel-cased text. Comments show what these mean, forwards, in preference
49 | # order:
50 | CAMEL_RE = re.compile(r"""
51 | ^( [A-Z]+ # A string of all caps, such as an acronym
52 | | [^A-Z0-9 _]+[A-Z _] # A single capital letter followed by lowercase
53 | # letters, or lowercase letters on their own
54 | # after a word break
55 | | [^A-Z0-9 _]*[0-9.]+ # A number, possibly followed by lowercase
56 | # letters
57 | | [ _]+ # Extra word breaks (spaces or underscores)
58 | | [^A-Z0-9]*[^A-Z0-9_ ]+ # Miscellaneous symbols, possibly with lowercase
59 | # letters after them
60 | )
61 | """, re.VERBOSE)
62 |
63 |
64 | def un_camel_case(text):
65 | r"""
66 | Splits apart words that are written in CamelCase.
67 |
68 | Bugs:
69 |
70 | - Non-ASCII characters are treated as lowercase letters, even if they are
71 | actually capital letters.
72 |
73 | Examples:
74 |
75 | >>> un_camel_case('1984ZXSpectrumGames')
76 | '1984 ZX Spectrum Games'
77 |
78 | >>> un_camel_case('aaAa aaAaA 0aA AAAa!AAA')
79 | 'aa Aa aa Aa A 0a A AA Aa! AAA'
80 |
81 | >>> un_camel_case('MotörHead')
82 | 'Mot\xf6r Head'
83 |
84 | >>> un_camel_case('MSWindows3.11ForWorkgroups')
85 | 'MS Windows 3.11 For Workgroups'
86 |
87 | This should not significantly affect text that is not camel-cased:
88 |
89 | >>> un_camel_case('ACM_Computing_Classification_System')
90 | 'ACM Computing Classification System'
91 |
92 | >>> un_camel_case('Anne_Blunt,_15th_Baroness_Wentworth')
93 | 'Anne Blunt, 15th Baroness Wentworth'
94 |
95 | >>> un_camel_case('Hindi-Urdu')
96 | 'Hindi-Urdu'
97 | """
98 | revtext = text[::-1]
99 | pieces = []
100 | while revtext:
101 | match = CAMEL_RE.match(revtext)
102 | if match:
103 | pieces.append(match.group(1))
104 | revtext = revtext[match.end():]
105 | else:
106 | pieces.append(revtext)
107 | revtext = ''
108 | revstr = ' '.join(piece.strip(' _') for piece in pieces
109 | if piece.strip(' _'))
110 | return revstr[::-1].replace('- ', '-')
111 |
112 |
113 | # see http://www.fileformat.info/info/unicode/category/index.htm
114 | BOUNDARY_CATEGORIES = {'Cc', # control characters
115 | 'Cf', # format characters
116 | 'Cn', # "other, not assigned"
117 | 'Pc', # connector punctuation
118 | 'Pd', # dash
119 | 'Pe', # close-punctuation
120 | 'Pf', # final-quote
121 | 'Pi', # initial-quote
122 | 'Po', # other punctuation
123 | 'Zl', # line separator
124 | 'Zp', # paragraph separator
125 | 'Zs', # space separator
126 | }
127 |
128 | def string_pieces(s, maxlen=1024):
129 | """
130 | Takes a (unicode) string and yields pieces of it that are at most `maxlen`
131 | characters, trying to break it at punctuation/whitespace. This is an
132 | important step before using a tokenizer with a maximum buffer size.
133 | """
134 | if not s:
135 | return
136 | i = 0
137 | while True:
138 | j = i + maxlen
139 | if j >= len(s):
140 | yield s[i:]
141 | return
142 | # Using "j - 1" keeps boundary characters with the left chunk
143 | while unicodedata.category(s[j - 1]) not in BOUNDARY_CATEGORIES:
144 | j -= 1
145 | if j == i:
146 | # No boundary available; oh well.
147 | j = i + maxlen
148 | break
149 | yield s[i:j]
150 | i = j
151 |
152 |
--------------------------------------------------------------------------------
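A short usage sketch for metanl.token_utils, assembled from the doctests above and tests/test_tokens.py; it assumes NLTK and its 'punkt' sentence-tokenizer data are installed.

# Usage sketch for metanl.token_utils; expected values are taken from the
# module's doctests and from tests/test_tokens.py.
from metanl.token_utils import (tokenize, untokenize, un_camel_case,
                                string_pieces)

text = "Time is an illusion. Lunchtime, doubly so."
tokens = tokenize(text)
# ['Time', 'is', 'an', 'illusion', '.', 'Lunchtime', ',', 'doubly', 'so', '.']

# untokenize aims to invert tokenize, except for line breaks.
assert untokenize(tokens) == text

# Split CamelCased titles into words.
print(un_camel_case('MSWindows3.11ForWorkgroups'))
# 'MS Windows 3.11 For Workgroups'

# Break a long string near whitespace or punctuation, so it can be fed to a
# tokenizer with a limited buffer size.
print(list(string_pieces("12 12 12345 123456 1234567-12345678", maxlen=6)))
# ['12 12 ', '12345 ', '123456', ' ', '123456', '7-', '123456', '78']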
/metanl/nltk_morphy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import print_function, unicode_literals
3 |
4 | import nltk
5 | from nltk.corpus import wordnet
6 | from metanl.token_utils import untokenize, tokenize
7 | import re
8 |
9 | try:
10 | morphy = wordnet._morphy
11 | except LookupError:
12 | nltk.download('wordnet')
13 | morphy = wordnet._morphy
14 |
15 | STOPWORDS = ['the', 'a', 'an']
16 |
17 | EXCEPTIONS = {
18 | # Avoid obsolete and obscure roots, the way lexicographers don't.
19 | 'wrought': 'wrought', # not 'work'
20 | 'media': 'media', # not 'medium'
21 | 'installed': 'install', # not 'instal'
22 | 'installing': 'install',# not 'instal'
23 | 'synapses': 'synapse', # not 'synapsis'
24 | 'soles': 'sole', # not 'sol'
25 | 'pubes': 'pube', # not 'pubis'
26 | 'dui': 'dui', # not 'duo'
27 | 'taxis': 'taxi', # not 'taxis'
28 |
29 | # Work around errors that Morphy makes.
30 | 'alas': 'alas',
31 | 'corps': 'corps',
32 | 'cos': 'cos',
33 | 'enured': 'enure',
34 | 'fiver': 'fiver',
35 | 'hinder': 'hinder',
36 | 'lobed': 'lobe',
37 | 'offerer': 'offerer',
38 | 'outer': 'outer',
39 | 'sang': 'sing',
40 | 'singing': 'sing',
41 | 'solderer': 'solderer',
42 | 'tined': 'tine',
43 | 'twiner': 'twiner',
44 | 'us': 'us',
45 |
46 | # Stem common nouns whose plurals are apparently ambiguous
47 | 'teeth': 'tooth',
48 | 'things': 'thing',
49 | 'people': 'person',
50 |
51 | # Tokenization artifacts
52 | 'wo': 'will',
53 | 'ca': 'can',
54 | "n't": 'not',
55 | }
56 |
57 | AMBIGUOUS_EXCEPTIONS = {
58 | # Avoid nouns that shadow more common verbs.
59 | 'am': 'be',
60 | 'as': 'as',
61 | 'are': 'be',
62 | 'ate': 'eat',
63 | 'bent': 'bend',
64 | 'drove': 'drive',
65 | 'fell': 'fall',
66 | 'felt': 'feel',
67 | 'found': 'find',
68 | 'has': 'have',
69 | 'lit': 'light',
70 | 'lost': 'lose',
71 | 'sat': 'sit',
72 | 'saw': 'see',
73 | 'sent': 'send',
74 | 'shook': 'shake',
75 | 'shot': 'shoot',
76 | 'slain': 'slay',
77 | 'spoke': 'speak',
78 | 'stole': 'steal',
79 | 'sung': 'sing',
80 | 'thought': 'think',
81 | 'tore': 'tear',
82 | 'was': 'be',
83 | 'won': 'win',
84 | 'feed': 'feed',
85 | }
86 |
87 |
88 | def _word_badness(word):
89 | """
90 | Assign a heuristic to possible outputs from Morphy. Minimizing this
91 | heuristic avoids incorrect stems.
92 | """
93 | if word.endswith('e'):
94 | return len(word) - 2
95 | elif word.endswith('ess'):
96 | return len(word) - 10
97 | elif word.endswith('ss'):
98 | return len(word) - 4
99 | else:
100 | return len(word)
101 |
102 |
103 | def _morphy_best(word, pos=None):
104 | """
105 | Get the most likely stem for a word using Morphy, once the input has been
106 | pre-processed by morphy_stem().
107 | """
108 | results = []
109 | if pos is None:
110 | pos = 'nvar'
111 | for pos_item in pos:
112 | results.extend(morphy(word, pos_item))
113 | if not results:
114 | return None
115 | results.sort(key=lambda x: _word_badness(x))
116 | return results[0]
117 |
118 |
119 | def morphy_stem(word, pos=None):
120 | """
121 | Get the most likely stem for a word. If a part of speech is supplied,
122 | the stem will be more accurate.
123 |
124 | Valid parts of speech are:
125 |
126 | - 'n' or 'NN' for nouns
127 | - 'v' or 'VB' for verbs
128 | - 'a' or 'JJ' for adjectives
129 | - 'r' or 'RB' for adverbs
130 |
131 | Any other part of speech will be treated as unknown.
132 | """
133 | word = word.lower()
134 | if pos is not None:
135 | if pos.startswith('NN'):
136 | pos = 'n'
137 | elif pos.startswith('VB'):
138 | pos = 'v'
139 | elif pos.startswith('JJ'):
140 | pos = 'a'
141 | elif pos.startswith('RB'):
142 | pos = 'r'
143 |     if pos is None and (word.endswith('ing') or word.endswith('ed')):
144 | pos = 'v'
145 | if pos is not None and pos not in 'nvar':
146 | pos = None
147 | if word in EXCEPTIONS:
148 | return EXCEPTIONS[word]
149 | if pos is None:
150 | if word in AMBIGUOUS_EXCEPTIONS:
151 | return AMBIGUOUS_EXCEPTIONS[word]
152 | return _morphy_best(word, pos) or word
153 |
154 |
155 | def tag_and_stem(text):
156 | """
157 | Returns a list of (stem, tag, token) triples:
158 |
159 | - stem: the word's uninflected form
160 | - tag: the word's part of speech
161 | - token: the original word, so we can reconstruct it later
162 | """
163 | tokens = tokenize(text)
164 | tagged = nltk.pos_tag(tokens)
165 | out = []
166 | for token, tag in tagged:
167 | stem = morphy_stem(token, tag)
168 | out.append((stem, tag, token))
169 | return out
170 |
171 |
172 | def good_lemma(lemma):
173 | return lemma and lemma not in STOPWORDS and lemma[0].isalnum()
174 |
175 |
176 | def normalize_list(text):
177 | """
178 | Get a list of word stems that appear in the text. Stopwords and an initial
179 | 'to' will be stripped, unless this leaves nothing in the stem.
180 |
181 | >>> normalize_list('the dog')
182 | ['dog']
183 | >>> normalize_list('big dogs')
184 | ['big', 'dog']
185 | >>> normalize_list('the')
186 | ['the']
187 | """
188 | pieces = [morphy_stem(word) for word in tokenize(text)]
189 | pieces = [piece for piece in pieces if good_lemma(piece)]
190 | if not pieces:
191 | return [text]
192 | if pieces[0] == 'to':
193 | pieces = pieces[1:]
194 | return pieces
195 |
196 |
197 | def normalize(text):
198 | """
199 | Get a string made from the non-stopword word stems in the text. See
200 | normalize_list().
201 | """
202 | return untokenize(normalize_list(text))
203 |
204 |
205 | def normalize_topic(topic):
206 | """
207 | Get a canonical representation of a Wikipedia topic, which may include
208 | a disambiguation string in parentheses.
209 |
210 | Returns (name, disambig), where "name" is the normalized topic name,
211 | and "disambig" is a string corresponding to the disambiguation text or
212 | None.
213 | """
214 | # find titles of the form Foo (bar)
215 | topic = topic.replace('_', ' ')
216 | match = re.match(r'([^(]+) \(([^)]+)\)', topic)
217 | if not match:
218 | return normalize(topic), None
219 | else:
220 | return normalize(match.group(1)), 'n/' + match.group(2).strip(' _')
221 |
222 |
223 | def word_frequency(word, default_freq=0):
224 | raise NotImplementedError("Word frequency is now in the wordfreq package.")
225 |
226 |
227 | def get_wordlist():
228 | raise NotImplementedError("Wordlists are now in the wordfreq package.")
229 |
--------------------------------------------------------------------------------
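A short usage sketch for metanl.nltk_morphy, based on the doctests above and tests/test_nltk_morphy.py. The module downloads NLTK's WordNet data on first import if it is missing; the part-of-speech tagger data that nltk.pos_tag needs must also be available.

# Usage sketch for metanl.nltk_morphy; expected values follow the doctests
# above and tests/test_nltk_morphy.py.
from metanl.nltk_morphy import (morphy_stem, tag_and_stem, normalize,
                                normalize_list, normalize_topic)

# Stem a single word; passing a POS tag ('v'/'VB', 'n'/'NN', ...) makes the
# result more accurate.
print(morphy_stem('dogs'))           # 'dog'
print(morphy_stem('saw', 'VBD'))     # 'see'

# (stem, tag, token) triples for a whole sentence.
print(tag_and_stem('the big dogs'))
# [('the', 'DT', 'the'), ('big', 'JJ', 'big'), ('dog', 'NNS', 'dogs')]

# Stopword-stripped stems, as a list or as a re-joined string.
print(normalize_list('the dog'))     # ['dog']
print(normalize('big dogs'))         # 'big dog'

# Wikipedia-style topics keep their disambiguation text separate.
print(normalize_topic('Dog_(zodiac)'))   # ('dog', 'n/zodiac')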
/metanl/extprocess.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals
3 | """
4 | Tools for using an external program as an NLP pipe. See, for example,
5 | freeling.py.
6 | """
7 |
8 | import subprocess
9 | import unicodedata
10 | import sys
11 | from ftfy.fixes import remove_control_chars, remove_unsafe_private_use
12 | if sys.version_info.major == 2:
13 | range = xrange
14 | str_func = unicode
15 | else:
16 | str_func = str
17 |
18 |
19 | def render_safe(text):
20 | """
21 | Make sure the given text is safe to pass to an external process.
22 | """
23 | return remove_control_chars(remove_unsafe_private_use(text))
24 |
25 |
26 | class ProcessError(IOError):
27 | """
28 | A subclass of IOError raised when we can't start the external process.
29 | """
30 | pass
31 |
32 |
33 | class ProcessWrapper(object):
34 | """
35 | A ProcessWrapper uses the `subprocess` module to keep a process open that
36 | we can pipe stuff through to get NLP results.
37 |
38 | Instead of every instance immediately opening a process, however, it waits
39 | until the first time it is needed, then starts the process.
40 |
41 | Many methods are intended to be implemented by subclasses of ProcessWrapper
42 | that actually know what program they're talking to.
43 | """
44 | def __del__(self):
45 | """
46 | Clean up by closing the pipe.
47 | """
48 | if hasattr(self, '_process'):
49 | self._process.stdin.close()
50 |
51 | @property
52 | def process(self):
53 | """
54 | Store the actual process in _process. If it doesn't exist yet, create
55 | it.
56 | """
57 | if hasattr(self, '_process'):
58 | return self._process
59 | else:
60 | self._process = self._get_process()
61 | return self._process
62 |
63 | def _get_command(self):
64 | """
65 | This method should return the command to run, as a list
66 | of arguments that can be used by subprocess.Popen.
67 | """
68 | raise NotImplementedError
69 |
70 | def _get_process(self):
71 | """
72 | Create the process by running the specified command.
73 | """
74 | command = self._get_command()
75 | return subprocess.Popen(command, bufsize=-1, close_fds=True,
76 | stdout=subprocess.PIPE,
77 | stdin=subprocess.PIPE)
78 |
79 | def get_record_root(self, record):
80 | """
81 | Given a *record* (the data that the external process returns for a
82 | given single token), this specifies how to extract its root word
83 | (aka its lemma).
84 | """
85 | raise NotImplementedError
86 |
87 | def get_record_token(self, record):
88 | """
89 | Given a record, this specifies how to extract the exact word or token
90 | that was processed.
91 | """
92 | raise NotImplementedError
93 |
94 | def analyze(self, text):
95 | """
96 | Take text as input, run it through the external process, and return a
97 | list of *records* containing the results.
98 | """
99 | raise NotImplementedError
100 |
101 | def send_input(self, data):
102 | self.process.stdin.write(data)
103 | self.process.stdin.flush()
104 |
105 | def receive_output_line(self):
106 | line = self.process.stdout.readline()
107 | if not line:
108 | raise ProcessError("reached end of output")
109 | return line
110 |
111 | def restart_process(self):
112 | if hasattr(self, '_process'):
113 | self._process.stdin.close()
114 | self._process = self._get_process()
115 | return self._process
116 |
117 | def tokenize_list(self, text):
118 | """
119 | Split a text into separate words.
120 | """
121 | return [self.get_record_token(record) for record in self.analyze(text)]
122 |
123 | def tokenize(self, text):
124 | """
125 | Deprecated: this only exists to raise an error for callers still using
126 | simplenlp's old idea of tokenization. Use tokenize_list() instead.
127 | """
128 | raise NotImplementedError("tokenize is deprecated. Use tokenize_list.")
129 |
130 | def is_stopword_record(self, record, common_words=False):
131 | """
132 | Given a record, return whether it represents a stopword (a word that
133 | should be discarded in NLP results).
134 |
135 | Note that we want very few words to be stopwords. Words that are
136 | meaningful but simply common can be recognized by their very high word
137 | frequency, and handled appropriately. Often, we only want determiners
138 | (such as 'a', 'an', and 'the' in English) to be stopwords.
139 |
140 | Takes in a vestigial parameter, `common_words`, and ignores it.
141 | """
142 | raise NotImplementedError
143 |
144 | def is_stopword(self, text):
145 | """
146 | Determine whether a single word is a stopword, or whether a short
147 | phrase is made entirely of stopwords, disregarding context.
148 |
149 | Use of this function should be avoided; it's better to give the text
150 | in context and let the process determine which words are the stopwords.
151 | """
152 | found_content_word = False
153 | for record in self.analyze(text):
154 | if not self.is_stopword_record(record):
155 | found_content_word = True
156 | break
157 | return not found_content_word
158 |
159 | def get_record_pos(self, record):
160 | """
161 | Given a record, get the word's part of speech.
162 |
163 | This default implementation simply distinguishes stopwords from
164 | non-stopwords.
165 | """
166 | if self.is_stopword_record(record):
167 | return 'STOP'
168 | else:
169 | return 'TERM'
170 |
171 | def normalize_list(self, text, cache=None):
172 | """
173 | Get a canonical list representation of text, with words
174 | separated and reduced to their base forms.
175 |
176 | TODO: use the cache.
177 | """
178 | words = []
179 | analysis = self.analyze(text)
180 | for record in analysis:
181 | if not self.is_stopword_record(record):
182 | words.append(self.get_record_root(record))
183 | if not words:
184 | # Don't discard stopwords if that's all you've got
185 | words = [self.get_record_token(record) for record in analysis]
186 | return words
187 |
188 | def normalize(self, text, cache=None):
189 | """
190 | Get a canonical string representation of this text, like
191 | :meth:`normalize_list` but joined with spaces.
192 |
193 | TODO: use the cache.
194 | """
195 | return ' '.join(self.normalize_list(text, cache))
196 |
197 | def tag_and_stem(self, text, cache=None):
198 | """
199 | Given some text, return a sequence of (stem, pos, text) triples as
200 | appropriate for the reader. `pos` can be as general or specific as
201 | necessary (for example, it might label all parts of speech, or it might
202 | only distinguish function words from others).
203 |
204 | Twitter-style hashtags and at-mentions have the stem and pos they would
205 | have without the leading # or @. For instance, if the reader's triple
206 | for "thing" is ('thing', 'NN', 'things'), then "#things" would come out
207 | as ('thing', 'NN', '#things').
208 | """
209 | analysis = self.analyze(text)
210 | triples = []
211 |
212 | for record in analysis:
213 | root = self.get_record_root(record)
214 | token = self.get_record_token(record)
215 |
216 | if token:
217 | if unicode_is_punctuation(token):
218 | triples.append((token, '.', token))
219 | else:
220 | pos = self.get_record_pos(record)
221 | triples.append((root, pos, token))
222 | return triples
223 |
224 | def extract_phrases(self, text):
225 | """
226 | Given some text, extract phrases of up to 2 content words,
227 | and map their normalized form to the complete phrase.
228 | """
229 | analysis = self.analyze(text)
230 | for pos1 in range(len(analysis)):
231 | rec1 = analysis[pos1]
232 | if not self.is_stopword_record(rec1):
233 | yield self.get_record_root(rec1), rec1[0]
234 | for pos2 in range(pos1 + 1, len(analysis)):
235 | rec2 = analysis[pos2]
236 | if not self.is_stopword_record(rec2):
237 | roots = [self.get_record_root(rec1),
238 | self.get_record_root(rec2)]
239 | pieces = [analysis[i][0] for i in range(pos1, pos2+1)]
240 | term = ' '.join(roots)
241 | phrase = ''.join(pieces)
242 | yield term, phrase
243 | break
244 |
245 |
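# A minimal sketch of how a subclass might look, assuming a hypothetical
# command-line tool called `mytagger` that reads one line of text on stdin
# and writes one "token<TAB>lemma" line per token, followed by a blank line.
# The tool, its flag, and the two-field record layout are made up for
# illustration; real subclasses (see freeling.py and mecab.py) define
# richer records.

class _ExampleTaggerWrapper(ProcessWrapper):
    def _get_command(self):
        # Hypothetical command; replace with a real tagger invocation.
        return ['mytagger', '--tab-separated']

    def get_record_token(self, record):
        return record[0]

    def get_record_root(self, record):
        return record[1]

    def is_stopword_record(self, record, common_words=False):
        # Keep the stopword list tiny, as the docstring above recommends.
        return record[1] in ('a', 'an', 'the')

    def analyze(self, text):
        self.send_input((render_safe(text).replace('\n', ' ') + '\n')
                        .encode('utf-8'))
        records = []
        while True:
            line = self.receive_output_line().decode('utf-8').rstrip('\n')
            if not line:
                break
            records.append(line.split('\t'))
        return records

# With such a subclass, the base-class methods normalize(), normalize_list(),
# tag_and_stem() and extract_phrases() work unchanged.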
246 | def unicode_is_punctuation(text):
247 | """
248 | Test if a token is made entirely of Unicode characters of the following
249 | classes:
250 |
251 | - P: punctuation
252 | - S: symbols
253 | - Z: separators
254 | - M: combining marks
255 | - C: control characters
256 |
257 | >>> unicode_is_punctuation('word')
258 | False
259 | >>> unicode_is_punctuation('。')
260 | True
261 | >>> unicode_is_punctuation('-')
262 | True
263 | >>> unicode_is_punctuation('-3')
264 | False
265 | >>> unicode_is_punctuation('あ')
266 | False
267 | """
268 | for char in str_func(text):
269 | category = unicodedata.category(char)[0]
270 | if category not in 'PSZMC':
271 | return False
272 | return True
273 |
--------------------------------------------------------------------------------
/metanl/mecab.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | This module provides some basic Japanese NLP by wrapping the output of MeCab.
4 | It can tokenize and normalize Japanese words, detect and remove stopwords,
5 | and it can even respell words in kana or romaji.
6 |
7 | This requires mecab to be installed separately. On Ubuntu:
8 | sudo apt-get install mecab mecab-ipadic-utf8
9 |
10 | >>> print(normalize('これはテストです'))
11 | テスト
12 | >>> tag_and_stem('これはテストです。')
13 | [('\u3053\u308c', '~\u540d\u8a5e', '\u3053\u308c'), ('\u306f', '~\u52a9\u8a5e', '\u306f'), ('\u30c6\u30b9\u30c8', '\u540d\u8a5e', '\u30c6\u30b9\u30c8'), ('\u3067\u3059', '~\u52a9\u52d5\u8a5e', '\u3067\u3059'), ('\u3002', '.', '\u3002')]
14 | """
15 | from __future__ import print_function, unicode_literals
16 |
17 | from metanl.token_utils import string_pieces
18 | from metanl.extprocess import ProcessWrapper, ProcessError, render_safe
19 | from collections import namedtuple
20 | import unicodedata
21 | import re
22 | import sys
23 | if sys.version_info.major == 2:
24 | range = xrange
25 | str_func = unicode
26 | else:
27 | str_func = str
28 |
29 |
30 | class MeCabError(ProcessError):
31 | pass
32 |
33 | MeCabRecord = namedtuple('MeCabRecord',
34 | [
35 | 'surface',
36 | 'pos',
37 | 'subclass1',
38 | 'subclass2',
39 | 'subclass3',
40 | 'conjugation',
41 | 'form',
42 | 'root',
43 | 'reading',
44 | 'pronunciation'
45 | ]
46 | )
47 |
48 |
49 | # MeCab outputs the part of speech of its terms. We can simply identify
50 | # particular (coarse or fine) parts of speech as containing stopwords.
51 |
52 | STOPWORD_CATEGORIES = set([
53 | '助詞', # coarse: particle
54 | '助動詞', # coarse: auxiliary verb
55 | '接続詞', # coarse: conjunction
56 | 'フィラー', # coarse: filler
57 | '記号', # coarse: symbol
58 | '非自立', # fine: 'not independent'
59 | ])
60 |
61 |
62 | # Forms of particular words should also be considered stopwords sometimes.
63 | #
64 | # A thought: Should the rare kanji version of suru not be a stopword?
65 | # I'll need to ask someone who knows more Japanese, but it may be
66 | # that if they're using the kanji it's for particular emphasis.
67 | STOPWORD_ROOTS = set([
68 | 'する', # suru: "to do"
69 | '為る', # suru in kanji (very rare)
70 | 'くる', # kuru: "to come"
71 | '来る', # kuru in kanji
72 | 'いく', # iku: "to go"
73 | '行く', # iku in kanji
74 | 'いる', # iru: "to be" (animate)
75 | '居る', # iru in kanji
76 | 'ある', # aru: "to exist" or "to have"
77 | '有る', # aru in kanji
78 | 'もの', # mono: "thing"
79 | '物', # mono in kanji
80 | 'よう', # yō: "way"
81 | '様', # yō in kanji
82 | 'れる', # passive suffix
83 | 'これ', # kore: "this"
84 | 'それ', # sore: "that"
85 | 'あれ', # are: "that over there"
86 | 'この', # kono: "this"
87 | 'その', # sono: "that"
88 | 'あの', # ano: "that over there", "yon"
89 | ])
90 |
91 |
92 | class MeCabWrapper(ProcessWrapper):
93 | """
94 | Handle Japanese text using the command-line version of MeCab.
95 | (mecab-python is convenient, but its installer is too flaky to rely on.)
96 |
97 | ja_cabocha gives more sophisticated results, but requires a large number of
98 | additional dependencies. Using this wrapper for Japanese requires only
99 | that MeCab be installed and accept UTF-8 text.
100 | """
101 | def _get_command(self):
102 | return ['mecab']
103 |
104 | def _get_process(self):
105 | try:
106 | proc = ProcessWrapper._get_process(self)
107 | except (OSError, ProcessError):
108 | raise MeCabError("MeCab didn't start. See README.md for details "
109 | "about installing MeCab and other Japanese NLP "
110 | "tools.")
111 | return proc
112 |
113 | def get_record_root(self, record):
114 | """
115 | Given a MeCab record, return the root word.
116 | """
117 | if record.root == '*':
118 | return record.surface
119 | else:
120 | return record.root
121 |
122 | def get_record_token(self, record):
123 | return record.surface
124 |
125 | def analyze(self, text):
126 | """
127 | Runs a line of text through MeCab, and returns the results as a
128 | list of lists ("records") that contain the MeCab analysis of each
129 | word.
130 | """
131 | try:
132 | self.process # make sure things are loaded
133 | text = render_safe(text).replace('\n', ' ').lower()
134 | results = []
135 | for chunk in string_pieces(text):
136 | self.send_input((chunk + '\n').encode('utf-8'))
137 | while True:
138 | out_line = self.receive_output_line().decode('utf-8')
139 | if out_line == 'EOS\n':
140 | break
141 |
142 | word, info = out_line.strip('\n').split('\t')
143 | record_parts = [word] + info.split(',')
144 |
145 | # Pad the record out to have 10 parts if it doesn't
146 | record_parts += [None] * (10 - len(record_parts))
147 | record = MeCabRecord(*record_parts)
148 |
149 | # special case for detecting nai -> n
150 | if (record.surface == 'ん' and
151 | record.conjugation == '不変化型'):
152 | # rebuild the record so that record.root is 'nai'
153 | record_parts[MeCabRecord._fields.index('root')] = 'ない'
154 | record = MeCabRecord(*record_parts)
155 |
156 | results.append(record)
157 | return results
158 | except ProcessError:
159 | self.restart_process()
160 | return self.analyze(text)
161 |
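# For reference, a raw MeCab line with the IPADIC dictionary looks roughly
# like this (surface form, a tab, then nine comma-separated feature fields
# that fill the remaining MeCabRecord slots):
#
#   テスト<TAB>名詞,サ変接続,*,*,*,*,テスト,テスト,テスト
#
# Unknown words may emit fewer fields, which is why the record is padded
# to 10 parts above.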
162 | def is_stopword_record(self, record):
163 | """
164 | Determine whether a single MeCab record represents a stopword.
165 |
166 | This decides which words to strip based mostly on their coarse or fine
167 | part of speech, and it also strips a small set of very common words
168 | such as くる and よう, based on their root form. The negation ない is
169 | always kept.
170 | """
171 | # preserve negations
172 | if record.root == 'ない':
173 | return False
174 | return (
175 | record.pos in STOPWORD_CATEGORIES or
176 | record.subclass1 in STOPWORD_CATEGORIES or
177 | record.root in STOPWORD_ROOTS
178 | )
179 |
180 | def get_record_pos(self, record):
181 | """
182 | Given a record, get the word's part of speech.
183 |
184 | Here we're going to return MeCab's part of speech (written in
185 | Japanese), though if it's a stopword we prefix the part of speech
186 | with '~'.
187 | """
188 | if self.is_stopword_record(record):
189 | return '~' + record.pos
190 | else:
191 | return record.pos
192 |
193 |
194 | class NoStopwordMeCabWrapper(MeCabWrapper):
195 | """
196 | This version of the MeCabWrapper doesn't label anything as a stopword. It's
197 | used in building ConceptNet because discarding stopwords based on MeCab
198 | categories loses too much information.
199 | """
200 | def is_stopword_record(self, record, common_words=False):
201 | return False
202 |
203 |
204 | # Define the classes of characters we'll be trying to transliterate
205 | NOT_KANA, KANA, NN, SMALL, SMALL_Y, SMALL_TSU, PROLONG = range(7)
206 |
207 |
208 | def to_kana(text):
209 | """
210 | Use MeCab to turn any text into its phonetic spelling, as katakana
211 | separated by spaces.
212 | """
213 | records = MECAB.analyze(text)
214 | kana = []
215 | for record in records:
216 | if record.pronunciation:
217 | kana.append(record.pronunciation)
218 | elif record.reading:
219 | kana.append(record.reading)
220 | else:
221 | kana.append(record.surface)
222 | return ' '.join(k for k in kana if k)
223 |
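# For example, with mecab-ipadic installed, to_kana('日本語のテスト') would
# typically return 'ニホンゴ ノ テスト' (one space-separated katakana chunk
# per MeCab record).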
224 |
225 | def get_kana_info(char):
226 | """
227 | Return two things about each character:
228 |
229 | - Its transliterated value (in Roman characters, if it's a kana)
230 | - A class of characters indicating how it affects the romanization
231 | """
232 | try:
233 | name = unicodedata.name(char)
234 | except ValueError:
235 | return char, NOT_KANA
236 |
237 | # The names we're dealing with will probably look like
238 | # "KATAKANA CHARACTER ZI".
239 | if (name.startswith('HIRAGANA LETTER') or
240 | name.startswith('KATAKANA LETTER') or
241 | name.startswith('KATAKANA-HIRAGANA')):
242 | names = name.split()
243 | syllable = str_func(names[-1].lower())
244 |
245 | if name.endswith('SMALL TU'):
246 | # The small tsu (っ) doubles the following consonant.
247 | # It'll show up as 't' on its own.
248 | return 't', SMALL_TSU
249 | elif names[-1] == 'N':
250 | return 'n', NN
251 | elif names[1] == 'PROLONGED':
252 | # The prolongation marker doubles the previous vowel.
253 | # It'll show up as '_' on its own.
254 | return '_', PROLONG
255 | elif names[-2] == 'SMALL':
256 | # Small characters tend to modify the sound of the previous
257 | # kana. If they can't modify anything, they're appended to
258 | # the letter 'x' instead.
259 | if syllable.startswith('y'):
260 | return 'x' + syllable, SMALL_Y
261 | else:
262 | return 'x' + syllable, SMALL
263 |
264 | return syllable, KANA
265 | else:
266 | if char in ROMAN_PUNCTUATION_TABLE:
267 | char = ROMAN_PUNCTUATION_TABLE[char]
268 | return char, NOT_KANA
269 |
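# A few concrete cases, which follow directly from the Unicode character
# names involved:
#
#     get_kana_info('し')  # -> ('si', KANA)
#     get_kana_info('っ')  # -> ('t', SMALL_TSU)
#     get_kana_info('ん')  # -> ('n', NN)
#     get_kana_info('ー')  # -> ('_', PROLONG)
#     get_kana_info('!')   # -> ('!', NOT_KANA)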
270 |
271 | def respell_hepburn(syllable):
272 | while syllable[:2] in HEPBURN_TABLE:
273 | syllable = HEPBURN_TABLE[syllable[:2]] + syllable[2:]
274 | return syllable
275 |
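# The table below is applied repeatedly to the first two characters, so:
#
#     respell_hepburn('si')   # -> 'shi'
#     respell_hepburn('tu')   # -> 'tsu'
#     respell_hepburn('sya')  # -> 'sha'  ('sy' -> 'sh', then nothing more matches)
#     respell_hepburn('o_')   # -> 'ou'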
276 |
277 | def romanize(text, respell=respell_hepburn):
278 | if respell is None:
279 | respell = lambda x: x
280 |
281 | kana = to_kana(str_func(text))
282 | pieces = []
283 | prevgroup = NOT_KANA
284 |
285 | for char in kana:
286 | roman, group = get_kana_info(char)
287 | if prevgroup == NN:
288 | # When the previous syllable is 'n' and the next syllable would
289 | # make it ambiguous, add an apostrophe.
290 | if group != KANA or roman[0] in 'aeinouy':
291 | if unicodedata.category(roman[0])[0] == 'L':
292 | pieces[-1] += "'"
293 |
294 | # Determine how to spell the current character
295 | if group == NOT_KANA:
296 | pieces.append(roman)
297 | elif group == SMALL_TSU or group == NN:
298 | pieces.append(roman)
299 | elif group == SMALL_Y:
300 | if prevgroup == KANA:
301 | # Modify the previous syllable, if that makes sense. For
302 | # example, 'ni' + 'ya' becomes 'nya'.
303 | if not pieces[-1].endswith('i'):
304 | pieces.append(roman)
305 | else:
306 | modifier = roman[1:]
307 | modified = pieces[-1]
308 | pieces[-1] = modified[:-1] + modifier
309 | else:
310 | pieces.append(roman)
311 | elif group == SMALL:
312 | # Don't respell small vowels _yet_. We'll handle that at the end.
313 | # This may be a bit ambiguous, but nobody expects to see "tea"
314 | # spelled "texi".
315 | pieces.append(roman)
316 | elif group == PROLONG:
317 | if prevgroup in (KANA, SMALL_Y, SMALL):
318 | pieces[-1] = pieces[-1][:-1] + respell(pieces[-1][-1] + '_')
319 | else:
320 | pieces.append(roman)
321 | else: # this is a normal kana
322 | if prevgroup == SMALL_TSU:
323 | if roman[0] in 'aeiouy':
324 | # wait, there's no consonant there; cope by respelling the
325 | # previous kana as 't-'
326 | pieces[-1] = 't-'
327 | else:
328 | # Turn the previous 't' into a copy of the first consonant
329 | pieces[-1] = roman[0]
330 | elif prevgroup == NN:
331 | # Let Hepburn respell 'n' as 'm' in words such as 'shimbun'.
332 | try_respell = respell(pieces[-1] + roman[0])
333 | if try_respell[:-1] != pieces[-1]:
334 | pieces[-1] = try_respell[:-1]
335 | pieces.append(roman)
336 | prevgroup = group
337 |
338 | romantext = ''.join(respell(piece) for piece in pieces)
339 | romantext = re.sub(r'[aeiou]x([aeiou])', r'\1', romantext)
340 | return romantext
341 |
342 |
343 | # Hepburn romanization is the most familiar to English speakers. It involves
344 | # respelling certain parts of romanized words to better match their
345 | # pronunciation. For example, the name for Mount Fuji is respelled from
346 | # "huzi-san" to "fuji-san".
347 | HEPBURN_TABLE = {
348 | 'si': 'shi',
349 | 'sy': 'sh',
350 | 'ti': 'chi',
351 | 'ty': 'ch',
352 | 'tu': 'tsu',
353 | 'hu': 'fu',
354 | 'zi': 'ji',
355 | 'di': 'ji',
356 | 'zy': 'j',
357 | 'dy': 'j',
358 | 'nm': 'mm',
359 | 'nb': 'mb',
360 | 'np': 'mp',
361 | 'a_': 'aa',
362 | 'e_': 'ee',
363 | 'i_': 'ii',
364 | 'o_': 'ou',
365 | 'u_': 'uu'
366 | }
367 | ROMAN_PUNCTUATION_TABLE = {
368 | '・': '.',
369 | '。': '.',
370 | '、': ',',
371 | '!': '!',
372 | '「': '``',
373 | '」': "''",
374 | '?': '?',
375 | '〜': '~'
376 | }
377 |
378 | # A shared MeCabWrapper instance, with its methods exposed as module-level functions.
379 | MECAB = MeCabWrapper()
380 |
381 | normalize = MECAB.normalize
382 | normalize_list = MECAB.normalize_list
383 | tokenize = MECAB.tokenize
384 | tokenize_list = MECAB.tokenize_list
385 | analyze = MECAB.analyze
386 | tag_and_stem = MECAB.tag_and_stem
387 | is_stopword = MECAB.is_stopword
388 |
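# Typical usage, assuming MeCab and an IPADIC dictionary are installed
# (outputs depend on the dictionary, so treat these as illustrative):
#
#     from metanl import mecab
#     mecab.normalize('これはテストです')   # -> 'テスト'
#     mecab.is_stopword('これ')             # -> True
#     mecab.romanize('テスト')              # -> 'tesuto'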
--------------------------------------------------------------------------------