├── lib ├── engtagger │ ├── pos_tags.hash │ ├── pos_words.hash │ ├── unknown.yml │ ├── porter.rb │ └── tags.yml └── engtagger.rb ├── engtagger.gemspec ├── History.txt ├── README.txt ├── test └── test_engtagger.rb └── LICENSE.txt /lib/engtagger/pos_tags.hash: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diasks2/engtagger/master/lib/engtagger/pos_tags.hash -------------------------------------------------------------------------------- /lib/engtagger/pos_words.hash: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diasks2/engtagger/master/lib/engtagger/pos_words.hash -------------------------------------------------------------------------------- /engtagger.gemspec: -------------------------------------------------------------------------------- 1 | spec = Gem::Specification.new do |p| 2 | p.version = '0.1.1' 3 | p.summary = 'Example gem specification' 4 | p.name = "engtagger" 5 | p.author = "Yoichiro Hasebe" 6 | p.email = 'yohasebe@gmail.com' 7 | p.add_runtime_dependency 'hpricot' 8 | end -------------------------------------------------------------------------------- /History.txt: -------------------------------------------------------------------------------- 1 | === 0.1.1 / 2008-05-14 2 | 3 | * Modified Synopsis section of Readme.txt 4 | * Created a description of tag set in Readme.txt 5 | * Fixed a few minor bugs 6 | 7 | === 0.1.0 / 2008-05-06 8 | 9 | * Initial release 10 | * Functionalities are basically the same as those of Perl Lingua::EN::Tagger. 
11 | -------------------------------------------------------------------------------- /lib/engtagger/unknown.yml: -------------------------------------------------------------------------------- 1 | --- #YAML:1.0 2 | "-abr-": { nnp: 1000 } 3 | "-cap-": { nnp: 900, nn: 48, nns: 48, vbg: 2, vbz: 2 } 4 | "-ed-": { vbn: 300, nn: 300, jj: 200, vbd: 200 } 5 | "-hyp-": { jj: 530, nn: 470 } 6 | "-hyp-adj-": { jj: 850, nn: 150 } 7 | "-ing-": { vbg: 800, jj: 180, nnp: 10, nn: 10 } 8 | "-ly-": { rb: 900, jj: 100 } 9 | "-s-": { nnp: 48, nn: 48, nns: 900, vbg: 2, vbz: 2 } 10 | "-sym-": { sym: 1000 } 11 | "-tion-": { nn: 950, nnp: 50 } 12 | "-unknown-": { nn: 875, jj: 195, nns: 20, vbz: 10 } 13 | -------------------------------------------------------------------------------- /lib/engtagger/porter.rb: -------------------------------------------------------------------------------- 1 | #! /local/ruby/bin/ruby 2 | # 3 | # $Id: stemmable.rb,v 1.2 2003/02/01 02:07:30 condit Exp $ 4 | # 5 | # See example usage at the end of this file. 
6 | # 7 | 8 | module Stemmable 9 | 10 | STEP_2_LIST = { 11 | 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance', 12 | 'izer'=>'ize', 'bli'=>'ble', 13 | 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous', 14 | 'ization'=>'ize', 'ation'=>'ate', 15 | 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful', 16 | 'ousness'=>'ous', 'aliti'=>'al', 17 | 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log' 18 | } 19 | 20 | STEP_3_LIST = { 21 | 'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic', 22 | 'ical'=>'ic', 'ful'=>'', 'ness'=>'' 23 | } 24 | 25 | 26 | SUFFIX_1_REGEXP = /( 27 | ational | 28 | tional | 29 | enci | 30 | anci | 31 | izer | 32 | bli | 33 | alli | 34 | entli | 35 | eli | 36 | ousli | 37 | ization | 38 | ation | 39 | ator | 40 | alism | 41 | iveness | 42 | fulness | 43 | ousness | 44 | aliti | 45 | iviti | 46 | biliti | 47 | logi)$/x 48 | 49 | 50 | SUFFIX_2_REGEXP = /( 51 | al | 52 | ance | 53 | ence | 54 | er | 55 | ic | 56 | able | 57 | ible | 58 | ant | 59 | ement | 60 | ment | 61 | ent | 62 | ou | 63 | ism | 64 | ate | 65 | iti | 66 | ous | 67 | ive | 68 | ize)$/x 69 | 70 | 71 | C = "[^aeiou]" # consonant 72 | V = "[aeiouy]" # vowel 73 | CC = "#{C}(?>[^aeiouy]*)" # consonant sequence 74 | VV = "#{V}(?>[aeiou]*)" # vowel sequence 75 | 76 | MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0 77 | MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1 78 | MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1 79 | VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem 80 | 81 | # 82 | # Porter stemmer in Ruby. 83 | # 84 | # This is the Porter stemming algorithm, ported to Ruby from the 85 | # version coded up in Perl. It's easy to follow against the rules 86 | # in the original paper in: 87 | # 88 | # Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, 89 | # no. 
3, pp 130-137, 90 | # 91 | # See also http://www.tartarus.org/~martin/PorterStemmer 92 | # 93 | # Send comments to raypereda@hotmail.com 94 | # 95 | 96 | def stem_porter 97 | 98 | # make a copy of the given object and convert it to a string. 99 | w = self.dup.to_str 100 | 101 | return w if w.length < 3 102 | 103 | # now map initial y to Y so that the patterns never treat it as vowel 104 | w[0] = 'Y' if w[0] == ?y 105 | 106 | # Step 1a 107 | if w =~ /(ss|i)es$/ 108 | w = $` + $1 109 | elsif w =~ /([^s])s$/ 110 | w = $` + $1 111 | end 112 | 113 | # Step 1b 114 | if w =~ /eed$/ 115 | w.chop! if $` =~ MGR0 116 | elsif w =~ /(ed|ing)$/ 117 | stem = $` 118 | if stem =~ VOWEL_IN_STEM 119 | w = stem 120 | case w 121 | when /(at|bl|iz)$/ then w << "e" 122 | when /([^aeiouylsz])\1$/ then w.chop! 123 | when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e" 124 | end 125 | end 126 | end 127 | 128 | if w =~ /y$/ 129 | stem = $` 130 | w = stem + "i" if stem =~ VOWEL_IN_STEM 131 | end 132 | 133 | # Step 2 134 | if w =~ SUFFIX_1_REGEXP 135 | stem = $` 136 | suffix = $1 137 | # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n" 138 | if stem =~ MGR0 139 | w = stem + STEP_2_LIST[suffix] 140 | end 141 | end 142 | 143 | # Step 3 144 | if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/ 145 | stem = $` 146 | suffix = $1 147 | if stem =~ MGR0 148 | w = stem + STEP_3_LIST[suffix] 149 | end 150 | end 151 | 152 | # Step 4 153 | if w =~ SUFFIX_2_REGEXP 154 | stem = $` 155 | if stem =~ MGR1 156 | w = stem 157 | end 158 | elsif w =~ /(s|t)(ion)$/ 159 | stem = $` + $1 160 | if stem =~ MGR1 161 | w = stem 162 | end 163 | end 164 | 165 | # Step 5 166 | if w =~ /e$/ 167 | stem = $` 168 | if (stem =~ MGR1) || 169 | (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o) 170 | w = stem 171 | end 172 | end 173 | 174 | if w =~ /ll$/ && w =~ MGR1 175 | w.chop! 
176 | end 177 | 178 | # and turn initial Y back to y 179 | w[0] = 'y' if w[0] == ?Y 180 | 181 | w 182 | end 183 | 184 | 185 | # 186 | # make the stem_porter the default stem method, just in case we 187 | # feel like having multiple stemmers available later. 188 | # 189 | alias stem stem_porter 190 | 191 | end 192 | 193 | # Add stem method to all Strings 194 | class String 195 | include Stemmable 196 | end 197 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | = EngTagger 2 | 3 | English Part-of-Speech Tagger Library; a Ruby port of Lingua::EN::Tagger 4 | 5 | === Description 6 | 7 | A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained 8 | tagger that assigns POS tags to English text based on a lookup dictionary and 9 | a set of probability values. The tagger assigns appropriate tags based on 10 | conditional probabilities--it examines the preceding tag to determine the 11 | appropriate tag for the current word. Unknown words are classified according to 12 | word morphology or can be set to be treated as nouns or other parts of speech. 13 | The tagger also extracts as many nouns and noun phrases as it can, using a set 14 | of regular expressions. 15 | 16 | === Features 17 | 18 | * Assigns POS tags to English text 19 | * Extract noun phrases from tagged text 20 | * etc. 21 | 22 | === Synopsis: 23 | 24 | require 'rubygems' 25 | require 'engtagger' 26 | 27 | # Create a parser object 28 | tgr = EngTagger.new 29 | 30 | # Sample text 31 | text = "Alice chased the big fat cat." 32 | 33 | # Add part-of-speech tags to text 34 | tagged = tgr.add_tags(text) 35 | 36 | #=> "<nnp>Alice</nnp> <vbd>chased</vbd> <det>the</det> <jj>big</jj> <jj>fat</jj> <nn>cat</nn> <pp>.</pp>" 
37 | 38 | # Get a list of all nouns and noun phrases with occurrence counts 39 | word_list = tgr.get_words(text) 40 | 41 | #=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1} 42 | 43 | # Get a readable version of the tagged text 44 | readable = tgr.get_readable(text) 45 | 46 | #=> "Alice/NNP chased/VBD the/DET big/JJ fat/JJ cat/NN ./PP" 47 | 48 | # Get all nouns from a tagged output 49 | nouns = tgr.get_nouns(tagged) 50 | 51 | #=> {"cat"=>1, "Alice"=>1} 52 | 53 | # Get all proper nouns 54 | proper = tgr.get_proper_nouns(tagged) 55 | 56 | #=> {"Alice"=>1} 57 | 58 | 59 | # Get all noun phrases of any syntactic level 60 | # (same as word_list but take a tagged input) 61 | nps = tgr.get_noun_phrases(tagged) 62 | 63 | #=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1} 64 | 65 | === Tag Set 66 | 67 | The set of POS tags used here is a modified version of the Penn Treebank tagset. Tags with non-letter characters have been redefined to work better in our data structures. Also, the "Determiner" tag (DET) has been changed from 'DT', in order to avoid confusion with the HTML tag, <DT>
. 68 | 69 | CC Conjunction, coordinating and, or 70 | CD Adjective, cardinal number 3, fifteen 71 | DET Determiner this, each, some 72 | EX Pronoun, existential there there 73 | FW Foreign words 74 | IN Preposition / Conjunction for, of, although, that 75 | JJ Adjective happy, bad 76 | JJR Adjective, comparative happier, worse 77 | JJS Adjective, superlative happiest, worst 78 | LS Symbol, list item A, A. 79 | MD Verb, modal can, could, 'll 80 | NN Noun aircraft, data 81 | NNP Noun, proper London, Michael 82 | NNPS Noun, proper, plural Australians, Methodists 83 | NNS Noun, plural women, books 84 | PDT Determiner, prequalifier quite, all, half 85 | POS Possessive 's, ' 86 | PRP Determiner, possessive second mine, yours 87 | PRPS Determiner, possessive their, your 88 | RB Adverb often, not, very, here 89 | RBR Adverb, comparative faster 90 | RBS Adverb, superlative fastest 91 | RP Adverb, particle up, off, out 92 | SYM Symbol * 93 | TO Preposition to 94 | UH Interjection oh, yes, mmm 95 | VB Verb, infinitive take, live 96 | VBD Verb, past tense took, lived 97 | VBG Verb, gerund taking, living 98 | VBN Verb, past/passive participle taken, lived 99 | VBP Verb, base present form take, live 100 | VBZ Verb, present 3SG -s form takes, lives 101 | WDT Determiner, question which, whatever 102 | WP Pronoun, question who, whoever 103 | WPS Determiner, possessive & question whose 104 | WRB Adverb, question when, how, however 105 | 106 | PP Punctuation, sentence ender ., !, ? 
107 | PPC Punctuation, comma , 108 | PPD Punctuation, dollar sign $ 109 | PPL Punctuation, quotation mark left `` 110 | PPR Punctuation, quotation mark right '' 111 | PPS Punctuation, colon, semicolon, ellipsis :, ..., - 112 | LRB Punctuation, left bracket (, {, [ 113 | RRB Punctuation, right bracket ), }, ] 114 | 115 | === Requirements 116 | 117 | * Ruby 1.8.6 118 | * Hpricot[http://code.whytheluckystiff.net/hpricot/] (optional) 119 | 120 | === Install 121 | 122 | (sudo) gem install engtagger 123 | 124 | === Author 125 | 126 | of this Ruby library 127 | * Yoichiro Hasebe (yohasebe [at] gmail.com) 128 | 129 | of the original Perl module 130 | * Aaron Coburn (acoburn [at] middlebury.edu) 131 | 132 | === Acknowledgement 133 | 134 | This Ruby library is a direct port of Lingua::EN::Tagger available at CPAN. 135 | The credit for the crucial part of its algorithm/design therefore goes to 136 | Aaron Coburn, the author of the original Perl version. 137 | 138 | === License 139 | 140 | This library is distributed under the GPL. Please see the LICENSE file. 141 | -------------------------------------------------------------------------------- /test/test_engtagger.rb: -------------------------------------------------------------------------------- 1 | # Code Generated by ZenTest v. 3.9.2 2 | # classname: asrt / meth = ratio% 3 | # EngTagger: 0 / 24 = 0.00% 4 | 5 | $ENGTAGGER_LIB = File.join(File.dirname(__FILE__), '..', 'lib') 6 | $LOAD_PATH << $ENGTAGGER_LIB 7 | require 'test/unit' unless defined? $ZENTEST and $ZENTEST 8 | require 'engtagger' 9 | 10 | class TestEngTagger < Test::Unit::TestCase 11 | 12 | @@untagged =<Lisa Raines , a lawyer and director of government relations for the Industrial Biotechnical Association , contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise . 
18 | EOD 19 | 20 | def setup 21 | @tagger = EngTagger.new 22 | tagpath = File.join($ENGTAGGER_LIB, @tagger.conf[:tag_path]) 23 | wordpath = File.join($ENGTAGGER_LIB, @tagger.conf[:word_path]) 24 | if !File.exists?(tagpath) or !File.exists?(wordpath) 25 | @tagger.install 26 | end 27 | end 28 | 29 | def text_get_ext 30 | model = '[^<]+\s*' 31 | assert_equal(model, EngTagger.get_ext(model, "cd")) 32 | end 33 | 34 | def test_explain_tag 35 | assert_equal("noun", EngTagger.explain_tag("nn")) 36 | assert_equal("verb_infinitive", EngTagger.explain_tag("vb")) 37 | end 38 | 39 | def test_add_tags 40 | assert_instance_of(String, @tagger.add_tags(@@untagged)) 41 | end 42 | 43 | def test_assign_tag 44 | models = []; tests = [] 45 | models += [@tagger.conf[:unknown_word_tag], "sym"] 46 | tests += [["pp","-unknown-"], ["pp", "-sym-"]] 47 | models.length.times do |i| 48 | assert_equal(models[i],@tagger.assign_tag(*tests[i])) 49 | end 50 | tests = [] 51 | tests += [["vb","water"], ["nn", "runs"]] 52 | models.length.times do |i| 53 | result = @tagger.assign_tag(*tests[i]) 54 | assert(EngTagger.hmm.keys.index(result)) 55 | end 56 | end 57 | 58 | def test_classify_unknown_word 59 | assert_equal("*LRB*", @tagger.classify_unknown_word("{")) 60 | assert_equal("*NUM*", @tagger.classify_unknown_word("123.4567")) 61 | assert_equal("*ORD*", @tagger.classify_unknown_word("40th")) 62 | assert_equal("-abr-", @tagger.classify_unknown_word("GT-R")) 63 | assert_equal("-hyp-adj-", @tagger.classify_unknown_word("extremely-high")) 64 | assert_equal("-sym-", @tagger.classify_unknown_word("&&")) 65 | assert_equal("-ing-", @tagger.classify_unknown_word("wikiing")) 66 | assert_equal("-unknown-", @tagger.classify_unknown_word("asefasdf")) 67 | end 68 | 69 | def test_clean_text 70 | test = "I am 100% sure that Dr. Watson is too naive. I'm sorry." 
71 | model = ["I","am","100","%","sure","that","Dr.","Watson","is","too","naive",".","I","'m","sorry","."] 72 | assert_equal(model, @tagger.clean_text(test)) 73 | end 74 | 75 | def test_clean_word 76 | models = []; tests = [] 77 | models += ["*NUM*"] 78 | models += ["Plays"] 79 | models += ["pleadingly"] 80 | tests += ["1973.0820", "Plays", "Pleadingly"] 81 | models.length.times do |i| 82 | assert_equal(models[i], @tagger.clean_word(tests[i])) 83 | end 84 | end 85 | 86 | def test_get_max_noun_phrases 87 | result = @tagger.get_max_noun_phrases(@@tagged) 88 | assert_instance_of(Hash, result) 89 | end 90 | 91 | def test_get_max_noun_regex 92 | assert_instance_of(Regexp, @tagger.get_max_noun_regex) 93 | end 94 | 95 | def test_get_noun_phrases 96 | result = @tagger.get_noun_phrases(@@tagged) 97 | assert_instance_of(Hash, result) 98 | end 99 | 100 | def test_get_nouns 101 | result = @tagger.get_nouns(@@tagged) 102 | assert_instance_of(Hash, result) 103 | end 104 | 105 | def test_get_proper_nouns 106 | test = "BBC means British Broadcasting Corporation ." 107 | result = @tagger.get_proper_nouns(test) 108 | assert_instance_of(Hash, result) 109 | end 110 | 111 | def test_get_readable 112 | test = "I woke up to the sound of pouring rain." 
113 | result = @tagger.get_readable(test) 114 | assert(String, result) 115 | end 116 | 117 | def test_get_sentences 118 | result = @tagger.get_sentences(@@untagged) 119 | assert_equal(4, result.length) 120 | end 121 | 122 | def test_get_words 123 | @tagger.conf[:longest_noun_phrase] = 1 124 | result1 = @tagger.get_words(@@tagged) 125 | @tagger.conf[:longest_noun_phrase] = 10 126 | result2 = @tagger.get_words(@@tagged) 127 | assert_instance_of(Hash, result1) 128 | assert_instance_of(Hash, result2) 129 | end 130 | 131 | def test_reset 132 | @tagger.conf[:current_tag] = 'nn' 133 | @tagger.reset 134 | assert_equal('pp', @tagger.conf[:current_tag]) 135 | end 136 | 137 | def test_split_punct 138 | models = []; texts = [] 139 | models << ["`", "test"]; texts << "`test" 140 | models << ["``", "test"]; texts << "\"test" 141 | models << ["`", "test"]; texts << "'test" 142 | models << ["''"]; texts << '"' 143 | models << ["test", "'"]; texts << "test' " 144 | models << ["-", "test", "-"]; texts << "---test-----" 145 | models << ["test", ",", "test"]; texts << "test,test" 146 | models << ["123,456"]; texts << "123,456" 147 | models << ["test", ":"]; texts << "test:" 148 | models << ["test1", "...", "test2"]; texts << "test1...test2" 149 | models << ["{", "ab","[","(","c",")","[","d","]","]","}"]; texts << "{ab[(c)[d]]}" 150 | models << ["test", "#", "test"]; texts << "test#test" 151 | models << ["I", "'d", "like"]; texts << "I'd like" 152 | models << ["is", "n't", "so"]; texts << "isn't so" 153 | models << ["we", "'re", "all"]; texts << "we're all" 154 | 155 | texts.each_with_index do |text, index| 156 | assert_equal(models[index], @tagger.split_punct(text)) 157 | end 158 | end 159 | 160 | def test_split_sentences 161 | models = []; tests = [] 162 | models << ["He", "is", "a", "u.s.", "army", "officer", "."] 163 | tests << ["He", "is", "a", "u.s.", "army", "officer."] 164 | models << ["He", "is", "Mr.", "Johnson", ".", "He", "'s", "my", "friend", "."] 165 | tests << ["He", 
"is", "Mr.", "Johnson.", "He", "'s", "my", "friend."] 166 | models.length.times do |i| 167 | assert_equal(models[i], @tagger.split_sentences(tests[i])) 168 | end 169 | end 170 | 171 | def test_stem 172 | word = "gets" 173 | old = @tagger.conf[:stem] 174 | @tagger.conf[:stem] = true 175 | assert_equal("get", @tagger.stem(word)) 176 | # the following should not work since we memoize stem method 177 | # @tagger.conf[:stem] = false 178 | # assert_equal("gets", @tagger.stem(word)) 179 | @tagger.conf[:stem] = old 180 | end 181 | 182 | def test_strip_tags 183 | assert_instance_of(String, @tagger.strip_tags(@@tagged)) 184 | end 185 | 186 | def test_valid_text 187 | text = nil 188 | assert(!@tagger.valid_text(text)) 189 | text = "this is test text" 190 | assert(@tagger.valid_text(text)) 191 | text = "" 192 | assert(!@tagger.valid_text(text)) 193 | end 194 | end 195 | 196 | # Number of errors detected: 24 197 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc. 5 | 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Library General Public License instead.) 
You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. 
To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 
86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. 
(This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. 
These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. 
If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 
305 | 306 | You should have received a copy of the GNU General Public License 307 | along with this program; if not, write to the Free Software 308 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 309 | 310 | 311 | Also add information on how to contact you by electronic and paper mail. 312 | 313 | If the program is interactive, make it output a short notice like this 314 | when it starts in an interactive mode: 315 | 316 | Gnomovision version 69, Copyright (C) year name of author 317 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 318 | This is free software, and you are welcome to redistribute it 319 | under certain conditions; type `show c' for details. 320 | 321 | The hypothetical commands `show w' and `show c' should show the appropriate 322 | parts of the General Public License. Of course, the commands you use may 323 | be called something other than `show w' and `show c'; they could even be 324 | mouse-clicks or menu items--whatever suits your program. 325 | 326 | You should also get your employer (if you work as a programmer) or your 327 | school, if any, to sign a "copyright disclaimer" for the program, if 328 | necessary. Here is a sample; alter the names: 329 | 330 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 331 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 332 | 333 | , 1 April 1989 334 | Ty Coon, President of Vice 335 | 336 | This General Public License does not permit incorporating your program into 337 | proprietary programs. If your program is a subroutine library, you may 338 | consider it more useful to permit linking proprietary applications with the 339 | library. If this is what you want to do, use the GNU Library General 340 | Public License instead of this License. 
341 | -------------------------------------------------------------------------------- /lib/engtagger/tags.yml: -------------------------------------------------------------------------------- 1 | --- #YAML:1.0 2 | cc: { cc: 0.000237618, cd: 0.0439594, det: 0.113785, ex: 0.00349638, fw: 0.000101836, in: 0.0513256, jj: 0.108829, jjr: 0.0102855, jjs: 0.00207067, lrb: 0.000339455, ls: 6.7891e-05, md: 0.0106928, nn: 0.118639, nnp: 0.161682, nnps: 0.00332666, nns: 0.0710139, pdt: 0.000577073, ppc: 0.00763773, ppd: 0.0195865, ppl: 0.00590651, pps: 0.000305509, prp: 0.0376116, prps: 0.0172104, rb: 0.0518687, rbr: 0.00264775, rbs: 0.000882583, rp: 0.000101836, rrb: 6.7891e-05, to: 0.00740012, uh: 6.7891e-05, vb: 0.0327574, vbd: 0.0380189, vbg: 0.0210462, vbn: 0.0144268, vbp: 0.011813, vbz: 0.0227435, wdt: 0.00105231, wp: 0.00213857, wps: 0.000237618, wrb: 0.00403951 } 3 | cd: { cc: 0.0167197, cd: 0.198722, det: 0.0292094, fw: 4.45266e-05, in: 0.0892758, jj: 0.0370906, jjr: 0.00211501, jjs: 0.000779215, lrb: 0.00189238, md: 0.00211501, nn: 0.205067, nnp: 0.0134693, nnps: 0.000868268, nns: 0.156511, pos: 0.000734688, pp: 0.0724002, ppc: 0.0968453, ppd: 8.90531e-05, ppl: 0.000690162, ppr: 0.000489792, pps: 0.00761404, prp: 0.000734688, prps: 8.90531e-05, rb: 0.00429681, rbr: 0.00020037, rbs: 0.00020037, rrb: 0.00866042, sym: 0.00013358, to: 0.0362892, vb: 4.45266e-05, vbd: 0.00616693, vbg: 0.00169201, vbn: 0.00311686, vbp: 0.00129127, vbz: 0.00262707, wdt: 0.000756952, wp: 0.000244896, wps: 2.22633e-05, wrb: 0.000690162 } 4 | det: { cc: 0.000711631, cd: 0.0235332, det: 0.00166047, fw: 0.000227327, in: 0.00962679, jj: 0.216346, jjr: 0.00550526, jjs: 0.00935004, lrb: 0.000474421, md: 0.00213489, nn: 0.473709, nnp: 0.113584, nnps: 0.00454653, nns: 0.0734265, pdt: 9.88377e-06, pos: 2.96513e-05, pp: 0.00157152, ppc: 0.0022535, ppd: 0.0090733, ppl: 0.00576224, ppr: 4.94188e-05, pps: 0.000355816, prp: 0.000484305, prps: 0.000642445, rb: 0.0102, rbr: 0.00179885, rbs: 0.00280699, rp: 
6.91864e-05, rrb: 3.95351e-05, to: 0.000286629, uh: 1.97675e-05, vb: 0.00023721, vbd: 0.00230292, vbg: 0.00810469, vbn: 0.00839132, vbp: 0.00171978, vbz: 0.00792678, wdt: 0.000217443, wp: 0.000800585, wrb: 9.88377e-06 } 5 | ex: { det: 0.00185701, md: 0.0807799, pos: 0.0120706, ppc: 0.00371402, prp: 0.000928505, rb: 0.0222841, to: 0.000928505, vb: 0.00185701, vbd: 0.182916, vbp: 0.224698, vbz: 0.46611, wp: 0.00185701 } 6 | fw: { cc: 0.0149254, det: 0.00746269, fw: 0.246269, in: 0.0261194, jj: 0.0261194, lrb: 0.0149254, md: 0.00746269, nn: 0.0708955, nnp: 0.201493, nns: 0.0298507, pos: 0.00746269, pp: 0.108209, ppc: 0.0970149, ppd: 0.00746269, ppl: 0.00373134, ppr: 0.0559702, pps: 0.011194, rrb: 0.00746269, to: 0.0149254, vbn: 0.00373134, vbp: 0.00373134, vbz: 0.0335821 } 7 | in: { cc: 0.00128828, cd: 0.0603768, det: 0.328199, ex: 0.00150162, fw: 0.000180523, in: 0.0200791, jj: 0.0909427, jjr: 0.00493977, jjs: 0.0046936, lrb: 0.000295402, ls: 8.2056e-06, md: 0.000123084, nn: 0.108191, nnp: 0.150409, nnps: 0.00177241, nns: 0.0591706, pdt: 0.00137854, pos: 3.28224e-05, pp: 0.00195293, ppc: 0.00242886, ppd: 0.0276693, ppl: 0.00563725, ppr: 0.000155906, pps: 0.000229757, prp: 0.0301884, prps: 0.03576, rb: 0.0142449, rbr: 0.000722093, rbs: 0.000180523, rp: 1.64112e-05, rrb: 7.38504e-05, sym: 1.64112e-05, to: 0.00224833, uh: 8.2056e-06, vb: 0.000459514, vbd: 0.000672859, vbg: 0.0307956, vbn: 0.00466078, vbp: 0.000246168, vbz: 0.000582598, wdt: 0.00357764, wp: 0.00224833, wps: 3.28224e-05, wrb: 0.0016083 } 8 | jj: { cc: 0.0172469, cd: 0.0161175, det: 0.00358756, ex: 5.31491e-05, fw: 0.000132873, in: 0.0560723, jj: 0.0733723, jjr: 0.000704225, jjs: 0.000345469, lrb: 0.000651076, md: 0.000398618, nn: 0.44992, nnp: 0.0368057, nnps: 0.00159447, nns: 0.23282, pdt: 2.65745e-05, pos: 0.000172735, pp: 0.0243689, ppc: 0.0293649, ppd: 0.00265745, ppl: 0.0020861, ppr: 0.00478342, pps: 0.00309593, prp: 0.00139516, prps: 0.000265745, rb: 0.00356099, rbr: 0.000225884, rbs: 9.30109e-05, 
rp: 0.000106298, rrb: 0.000558065, to: 0.0274382, uh: 1.32873e-05, vb: 0.000106298, vbd: 0.00132873, vbg: 0.00324209, vbn: 0.00191337, vbp: 0.000983258, vbz: 0.00139516, wdt: 0.000119585, wp: 0.000239171, wrb: 0.000637789 } 9 | jjr: { cc: 0.0207869, cd: 0.0027221, det: 0.00668151, in: 0.320713, jj: 0.0487503, lrb: 0.000247463, md: 0.000989854, nn: 0.268003, nnp: 0.0173224, nnps: 0.000494927, nns: 0.169018, pdt: 0.000247463, pp: 0.0514724, ppc: 0.0400891, ppd: 0.000989854, ppl: 0.00123732, ppr: 0.00173224, pps: 0.00445434, prp: 0.00173224, prps: 0.000742391, rb: 0.0054442, rp: 0.00148478, rrb: 0.000494927, to: 0.023509, vb: 0.000247463, vbd: 0.0027221, vbg: 0.00173224, vbn: 0.00247463, vbp: 0.00123732, vbz: 0.00148478, wrb: 0.000742391 } 10 | jjs: { cc: 0.00835422, cd: 0.06934, det: 0.0183793, ex: 0.00125313, in: 0.161654, jj: 0.103592, lrb: 0.000417711, md: 0.000835422, nn: 0.351713, nnp: 0.0108605, nnps: 0.00167084, nns: 0.155388, pdt: 0.00125313, pp: 0.0121136, ppc: 0.0263158, ppd: 0.0108605, ppl: 0.00125313, ppr: 0.00167084, pps: 0.00459482, prp: 0.00459482, prps: 0.000417711, rb: 0.015873, to: 0.0037594, vb: 0.00292398, vbd: 0.00334169, vbg: 0.00584795, vbn: 0.00501253, vbp: 0.0121136, vbz: 0.00292398, wrb: 0.00167084 } 11 | lrb: { cc: 0.0269139, cd: 0.034689, det: 0.0717703, ex: 0.00119617, fw: 0.00299043, in: 0.0843301, jj: 0.0466507, jjr: 0.00179426, ls: 0.00119617, md: 0.00119617, nn: 0.0633971, nnp: 0.330742, nnps: 0.00179426, nns: 0.0161483, pdt: 0.000598086, ppd: 0.166866, ppl: 0.0263158, prp: 0.0197368, prps: 0.0041866, rb: 0.0305024, to: 0.0041866, uh: 0.00358852, vb: 0.0131579, vbd: 0.00119617, vbg: 0.00777512, vbn: 0.020933, vbp: 0.000598086, vbz: 0.000598086, wdt: 0.00717703, wp: 0.00538278, wrb: 0.00239234 } 12 | ls: { jj: 0.03125, nn: 0.015625, pp: 0.40625, ppc: 0.125, pps: 0.109375, rrb: 0.3125 } 13 | md: { cc: 0.00100025, cd: 0.000166708, det: 0.00408435, in: 0.00175044, jj: 0.000500125, jjr: 8.33542e-05, jjs: 8.33542e-05, lrb: 0.000166708, md: 
0.000166708, nn: 0.000833542, nnp: 0.000916896, nns: 0.000666833, pp: 0.0020005, ppc: 0.00325081, ppl: 0.00483454, ppr: 0.000166708, pps: 0.000750188, prp: 0.00500125, prps: 0.000250063, rb: 0.169459, rbr: 0.00125031, rbs: 0.000166708, rrb: 0.000166708, to: 0.0030841, vb: 0.797699, vbd: 0.000500125, vbg: 8.33542e-05, vbn: 0.000583479, vbp: 0.000250063, vbz: 8.33542e-05 } 14 | nn: { cc: 0.0397962, cd: 0.00599719, det: 0.00682082, ex: 0.000109816, fw: 6.10091e-05, in: 0.247752, jj: 0.0087182, jjr: 0.00111037, jjs: 5.49082e-05, lrb: 0.00157403, ls: 6.10091e-06, md: 0.0175218, nn: 0.122201, nnp: 0.00971265, nnps: 7.93118e-05, nns: 0.0785187, pdt: 1.22018e-05, pos: 0.0216765, pp: 0.10859, ppc: 0.115027, ppd: 0.000256238, ppl: 0.00240986, ppr: 0.00516137, pps: 0.0116832, prp: 0.00430724, prps: 0.000274541, rb: 0.0177536, rbr: 0.00253188, rbs: 6.711e-05, rp: 0.000585687, rrb: 0.00173876, sym: 4.88073e-05, to: 0.0394485, vb: 0.00140321, vbd: 0.0485022, vbg: 0.00755293, vbn: 0.0103593, vbp: 0.0040022, vbz: 0.0437618, wdt: 0.00787017, wp: 0.00240376, wps: 0.000170825, wrb: 0.00236715 } 15 | nnp: { cc: 0.0419473, cd: 0.01911, det: 0.0026398, ex: 1.75402e-05, fw: 0.000429734, in: 0.0407721, jj: 0.00841928, jjr: 8.77008e-05, jjs: 8.77008e-06, lrb: 0.0034291, md: 0.0110152, nn: 0.0586719, nnp: 0.377991, nnps: 0.0156283, nns: 0.0241967, pdt: 8.77008e-06, pos: 0.0558742, pp: 0.0544184, ppc: 0.140391, ppd: 0.000236792, ppl: 0.00104364, ppr: 0.00256086, pps: 0.00705115, prp: 0.000868238, prps: 9.64709e-05, rb: 0.00898934, rbr: 0.000315723, rbs: 8.77008e-06, rp: 5.26205e-05, rrb: 0.00352557, sym: 3.50803e-05, to: 0.00755981, vb: 0.000964709, vbd: 0.0653722, vbg: 0.00169263, vbn: 0.000815618, vbp: 0.00399916, vbz: 0.0376938, wdt: 0.000938399, wp: 0.000578826, wps: 8.77008e-06, wrb: 0.000534975 } 16 | nnps: { cc: 0.0787172, cd: 0.000971817, det: 0.00323939, ex: 0.000647878, in: 0.0686751, jj: 0.00615484, jjr: 0.000323939, lrb: 0.00291545, md: 0.0233236, nn: 0.0379009, nnp: 0.284742, 
nnps: 0.0145773, nns: 0.0119857, pos: 0.0255912, pp: 0.0767736, ppc: 0.129252, ppl: 0.00129576, ppr: 0.00323939, pps: 0.0207321, prp: 0.00129576, prps: 0.000323939, rb: 0.0136054, rbr: 0.000323939, rrb: 0.00226757, sym: 0.000323939, to: 0.0132815, vb: 0.000971817, vbd: 0.080013, vbg: 0.00356333, vbn: 0.00259151, vbp: 0.0553936, vbz: 0.0259151, wdt: 0.000323939, wp: 0.00842242, wps: 0.000323939 } 17 | nns: { cc: 0.0593685, cd: 0.00163635, det: 0.0170803, ex: 8.11414e-05, in: 0.23504, jj: 0.0166746, jjr: 0.00119007, jjs: 6.76178e-05, lrb: 0.00425992, md: 0.0277233, nn: 0.0211373, nnp: 0.00301575, nnps: 2.70471e-05, nns: 0.0107783, pdt: 5.40943e-05, pos: 0.00922307, pp: 0.134952, ppc: 0.124079, ppd: 0.000283995, ppl: 0.00232605, ppr: 0.00405707, pps: 0.0189465, prp: 0.00462506, prps: 0.000229901, rb: 0.0311177, rbr: 0.00192035, rbs: 0.000121712, rp: 0.000689702, rrb: 0.00167692, sym: 5.40943e-05, to: 0.0396105, vb: 0.00397593, vbd: 0.0745284, vbg: 0.0141051, vbn: 0.0207451, vbp: 0.0845223, vbz: 0.00806004, wdt: 0.0124011, wp: 0.00695111, wps: 0.000446278, wrb: 0.00221786 } 18 | pdt: { det: 0.913832, jj: 0.00226757, nnp: 0.00226757, prps: 0.0816327 } 19 | pos: { cc: 0.00648268, cd: 0.0250046, det: 0.000370439, fw: 0.000185219, in: 0.00277829, jj: 0.207261, jjr: 0.00231524, jjs: 0.0253751, lrb: 0.000277829, md: 0.000926097, nn: 0.417114, nnp: 0.107242, nnps: 0.00361178, nns: 0.128264, pp: 0.00509354, ppc: 0.00592702, ppd: 0.0095388, ppl: 0.0114836, ppr: 0.000370439, pps: 0.000185219, prp: 9.26097e-05, prps: 0.000277829, rb: 0.00555658, rbr: 0.000370439, rbs: 0.00240785, rrb: 0.000926097, to: 0.000185219, vb: 0.000370439, vbd: 0.00601963, vbg: 0.0115762, vbn: 0.00768661, vbp: 0.000463049, vbz: 0.00388961, wp: 0.000185219, wrb: 0.000185219 } 20 | pp: { cc: 0.0525182, cd: 0.00978944, det: 0.205127, ex: 0.00404302, fw: 0.000164183, in: 0.119587, jj: 0.0381932, jjr: 0.00164183, jjs: 0.00217543, lrb: 0.00441243, ls: 0.000759348, md: 0.000677256, nn: 0.0371465, nnp: 0.185178, 
nnps: 0.00188811, nns: 0.0392193, pdt: 0.000656734, pp: 0.00014366, ppc: 6.15688e-05, ppd: 0.00014366, ppl: 0.0753602, ppr: 0.0597628, pps: 0.00285269, prp: 0.0552888, prps: 0.00722407, rb: 0.0521693, rbr: 0.00186759, rbs: 0.000513073, rrb: 0.00527439, sym: 0.000779871, to: 0.00316053, uh: 0.000595165, vb: 0.00303739, vbd: 0.000718302, vbg: 0.0111439, vbn: 0.00547962, vbp: 0.00034889, vbz: 0.00135451, wdt: 0.000636211, wp: 0.00289373, wps: 2.05229e-05, wrb: 0.00599269 } 21 | ppc: { cc: 0.0919462, cd: 0.0208826, det: 0.13353, ex: 0.00257932, fw: 0.000314148, in: 0.0867545, jj: 0.0418148, jjr: 0.00165341, jjs: 0.000975513, lrb: 0.000314148, ls: 4.96024e-05, md: 0.0101519, nn: 0.0487426, nnp: 0.127924, nnps: 0.00105818, nns: 0.0260578, pdt: 0.000214944, ppd: 0.00221557, ppl: 0.0133265, ppr: 0.0579356, pps: 8.26706e-05, prp: 0.0412692, prps: 0.00410046, rb: 0.0546122, rbr: 0.000777104, rbs: 0.000363751, rp: 8.26706e-05, to: 0.00945752, uh: 0.000396819, vb: 0.00376978, vbd: 0.0529588, vbg: 0.044113, vbn: 0.0206842, vbp: 0.00866388, vbz: 0.031481, wdt: 0.0360278, wp: 0.0122848, wps: 0.00219904, wrb: 0.00823399 } 22 | ppd: { cd: 0.990264, jj: 0.00973559 } 23 | ppl: { cc: 0.0184726, cd: 0.00777202, det: 0.147894, ex: 0.0210633, fw: 0.0032665, in: 0.0637531, jj: 0.113877, jjr: 0.0032665, jjs: 0.0032665, lrb: 0.000225276, md: 0.0117144, nn: 0.0839153, nnp: 0.0762559, nnps: 0.00168957, nns: 0.0355936, pdt: 0.000675828, ppd: 0.000225276, ppl: 0.000337914, pps: 0.000675828, prp: 0.215026, prps: 0.010588, rb: 0.0527146, rbr: 0.00168957, to: 0.00518135, uh: 0.00337914, vb: 0.0259067, vbd: 0.0114891, vbg: 0.0149809, vbn: 0.0111512, vbp: 0.0117144, vbz: 0.0198243, wdt: 0.00146429, wp: 0.00923631, wrb: 0.0117144 } 24 | ppr: { cc: 0.0600601, cd: 0.00462, det: 0.0990991, ex: 0.0017325, fw: 0.000231, in: 0.131786, jj: 0.019635, jjr: 0.000462, jjs: 0.000693001, lrb: 0.00820051, md: 0.0047355, nn: 0.0446985, nnp: 0.135251, nnps: 0.0003465, nns: 0.0294525, pdt: 0.0003465, pp: 0.0021945, 
ppc: 0.000808501, ppd: 0.0001155, ppl: 0.011319, ppr: 0.00646801, pps: 0.00993301, prp: 0.10857, prps: 0.004389, rb: 0.033495, rbr: 0.0003465, rbs: 0.000231, rrb: 0.0042735, sym: 0.0001155, to: 0.0167475, uh: 0.000231, vb: 0.0026565, vbd: 0.0937861, vbg: 0.0125895, vbn: 0.00704551, vbp: 0.0033495, vbz: 0.122661, wdt: 0.00924001, wp: 0.0026565, wps: 0.000577501, wrb: 0.00485101 } 25 | pps: { cc: 0.0653852, cd: 0.123542, det: 0.117299, ex: 0.00180713, fw: 0.000328569, in: 0.0791852, jj: 0.0395926, jjr: 0.00312141, jjs: 0.000985707, lrb: 0.000492854, ls: 0.00279284, md: 0.00903565, nn: 0.042714, nnp: 0.120092, nnps: 0.00114999, nns: 0.034664, pdt: 0.000492854, pp: 0.0180713, ppc: 0.000821423, ppd: 0.0300641, ppl: 0.0668638, ppr: 0.00131428, pps: 0.00443568, prp: 0.0423854, prps: 0.00706424, rb: 0.0542139, rbr: 0.000657138, rbs: 0.000657138, sym: 0.000492854, to: 0.0110071, uh: 0.000657138, vb: 0.0139642, vbd: 0.0185642, vbg: 0.021357, vbn: 0.0137999, vbp: 0.0119928, vbz: 0.0159356, wdt: 0.0108428, wp: 0.00591424, wps: 0.000164285, wrb: 0.00607853 } 26 | prp: { cc: 0.00781945, cd: 0.000936461, det: 0.0118931, ex: 0.000140469, fw: 4.68231e-05, in: 0.0351641, jj: 0.0077258, jjr: 0.00149834, jjs: 9.36461e-05, lrb: 0.000936461, md: 0.123566, nn: 0.00280938, nnp: 0.000936461, nnps: 4.68231e-05, nns: 0.00103011, pdt: 9.36461e-05, pos: 0.00112375, pp: 0.0305755, ppc: 0.021164, ppd: 0.000234115, ppl: 0.00355855, ppr: 0.000421407, pps: 0.00323079, prp: 0.00163881, prps: 0.000140469, rb: 0.0536592, rbr: 0.00145151, rbs: 0.000234115, rp: 0.00421407, rrb: 0.000468231, to: 0.0169968, vb: 0.0125018, vbd: 0.252002, vbg: 0.00257527, vbn: 0.0024348, vbp: 0.176242, vbz: 0.21843, wdt: 0.000280938, wp: 0.000655523, wps: 4.68231e-05, wrb: 0.000983284 } 27 | prps: { cc: 0.000488234, cd: 0.0210917, fw: 0.00029294, in: 9.76467e-05, jj: 0.240992, jjr: 0.00273411, jjs: 0.0106435, lrb: 0.00087882, nn: 0.441656, nnp: 0.048921, nnps: 0.00058588, nns: 0.195293, pp: 9.76467e-05, ppc: 0.00029294, 
ppd: 0.00790938, ppl: 0.00478469, pps: 9.76467e-05, rb: 0.00546822, rbr: 0.000195293, rbs: 0.00205058, rrb: 0.00029294, vbd: 0.00029294, vbg: 0.0075188, vbn: 0.00712821, vbz: 0.000195293 } 28 | rb: { cc: 0.00916326, cd: 0.0408682, det: 0.0473348, ex: 0.000811603, fw: 5.23615e-05, in: 0.127893, jj: 0.102079, jjr: 0.0129071, jjs: 0.000628338, lrb: 0.000575977, md: 0.0102367, nn: 0.0117552, nnp: 0.00685936, nnps: 0.000261808, nns: 0.00463399, pdt: 0.000654519, pos: 0.000183265, pp: 0.0496125, ppc: 0.0972091, ppd: 0.0123311, ppl: 0.00170175, ppr: 0.0014923, pps: 0.004346, prp: 0.00877055, prps: 0.00235627, rb: 0.0726254, rbr: 0.0077495, rbs: 0.000209446, rp: 0.00034035, rrb: 0.000445073, sym: 2.61808e-05, to: 0.02694, vb: 0.102, vbd: 0.0548749, vbg: 0.0303435, vbn: 0.081108, vbp: 0.0255262, vbz: 0.0383024, wdt: 0.000261808, wp: 0.00120431, wrb: 0.00332496 } 29 | rbr: { cc: 0.0206897, cd: 0.00045977, det: 0.0845977, in: 0.234483, jj: 0.312644, jjr: 0.00045977, lrb: 0.00045977, md: 0.00229885, nn: 0.00505747, nns: 0.00045977, pp: 0.121379, ppc: 0.0616092, ppl: 0.00229885, pps: 0.00689655, prp: 0.00321839, rb: 0.0731035, rbr: 0.00045977, rp: 0.00045977, rrb: 0.00045977, to: 0.0174713, vb: 0.0124138, vbd: 0.00735632, vbg: 0.00551724, vbn: 0.0206897, vbp: 0.00091954, vbz: 0.00321839, wrb: 0.00091954 } 30 | rbs: { det: 0.0036036, in: 0.045045, jj: 0.72973, lrb: 0.0018018, md: 0.0018018, nn: 0.0018018, nns: 0.0036036, pp: 0.00720721, ppc: 0.00720721, ppl: 0.0018018, pps: 0.0036036, prp: 0.0018018, rb: 0.115315, to: 0.00900901, vb: 0.0036036, vbd: 0.0018018, vbg: 0.0018018, vbn: 0.0540541, vbp: 0.0036036, vbz: 0.0018018 } 31 | rp: { cc: 0.0116031, cd: 0.020458, det: 0.211298, in: 0.246107, jj: 0.050687, jjr: 0.00580153, jjs: 0.00244275, lrb: 0.000610687, md: 0.000305344, nn: 0.0476336, nnp: 0.0256489, nns: 0.0500763, pdt: 0.000916031, pp: 0.0622901, ppc: 0.0445802, ppd: 0.00885496, ppl: 0.00580153, ppr: 0.00610687, pps: 0.00366412, prp: 0.000916031, prps: 0.0525191, rb: 
0.0522137, rbr: 0.00396947, rp: 0.000610687, rrb: 0.000305344, to: 0.0531298, vbd: 0.00122137, vbg: 0.0180153, vbn: 0.00183206, vbp: 0.000916031, vbz: 0.000610687, wp: 0.00458015, wrb: 0.00427481 } 32 | rrb: { cc: 0.062759, cd: 0.00947306, det: 0.0396684, ex: 0.000592066, fw: 0.000592066, in: 0.123742, jj: 0.0201303, jjr: 0.00118413, jjs: 0.000592066, lrb: 0.000592066, md: 0.0213144, nn: 0.0568384, nnp: 0.0550622, nnps: 0.000592066, nns: 0.0219065, pp: 0.137951, ppc: 0.168147, ppd: 0.000592066, ppl: 0.0053286, pps: 0.0781528, prp: 0.00947306, prps: 0.00118413, rb: 0.0242747, rbs: 0.000592066, sym: 0.000592066, to: 0.0219065, vb: 0.00828893, vbd: 0.0367081, vbg: 0.0053286, vbn: 0.00651273, vbp: 0.0219065, vbz: 0.0479574, wdt: 0.0053286, wp: 0.0035524, wrb: 0.00118413 } 33 | sym: { cd: 0.0614286, fw: 0.0185714, in: 0.0185714, jj: 0.0471429, nn: 0.0471429, nnp: 0.0471429, pps: 0.604286, rb: 0.0185714, sym: 0.1, vbn: 0.0042857, vbz: 0.0328571 } 34 | to: { cc: 0.000473692, cd: 0.0787422, det: 0.112593, fw: 7.28757e-05, in: 0.00393529, jj: 0.0312272, jjr: 0.0029879, jjs: 0.000255065, lrb: 0.000255065, nn: 0.0318102, nnp: 0.0450736, nnps: 0.000364378, nns: 0.0237939, pdt: 0.000291503, pp: 0.000692319, ppc: 0.000728757, ppd: 0.0441991, ppl: 0.00389885, ppr: 0.000109314, pps: 7.28757e-05, prp: 0.00513774, prps: 0.0133727, rb: 0.00932809, rbr: 0.000947384, rbs: 3.64378e-05, rrb: 3.64378e-05, to: 7.28757e-05, uh: 3.64378e-05, vb: 0.578706, vbd: 0.000109314, vbg: 0.00688675, vbn: 0.00142108, vbp: 3.64378e-05, vbz: 7.28757e-05, wdt: 0.000728757, wp: 0.00102026, wrb: 0.000473692 } 35 | uh: { in: 0.017094, nn: 0.017094, nns: 0.00854701, pp: 0.196581, ppc: 0.529915, ppr: 0.0512821, pps: 0.034188, prp: 0.00854701, prps: 0.00854701, rrb: 0.025641, to: 0.0512821, uh: 0.034188, vb: 0.017094 } 36 | vb: { cc: 0.00924334, cd: 0.0202371, det: 0.223099, ex: 0.000552758, fw: 6.14175e-05, in: 0.112548, jj: 0.0841727, jjr: 0.0105024, jjs: 0.000675593, lrb: 0.00113622, md: 0.000460631, nn: 
0.0619703, nnp: 0.0317221, nnps: 0.00049134, nns: 0.0495025, pdt: 0.00156615, pos: 0.000184253, pp: 0.0251198, ppc: 0.0173812, ppd: 0.00896696, ppl: 0.00568112, ppr: 0.00242599, pps: 0.00251812, prp: 0.0368198, prps: 0.043545, rb: 0.0489498, rbr: 0.00500553, rbs: 0.000276379, rp: 0.0320292, rrb: 0.000460631, to: 0.0424395, uh: 9.21263e-05, vb: 0.00518978, vbd: 0.00138189, vbg: 0.017504, vbn: 0.084664, vbp: 0.000368505, vbz: 0.00153544, wdt: 0.000644884, wp: 0.00365434, wps: 6.14175e-05, wrb: 0.00515907 } 37 | vbd: { cc: 0.00304146, cd: 0.0627501, det: 0.16768, ex: 0.00128061, fw: 2.66795e-05, in: 0.113841, jj: 0.0564004, jjr: 0.00749693, jjs: 0.000800384, lrb: 0.000320154, md: 0.000373513, nn: 0.0329491, nnp: 0.0568806, nnps: 0.000133397, nns: 0.0232912, pdt: 0.000693666, pp: 0.0460488, ppc: 0.0231845, ppd: 0.0156342, ppl: 0.00658983, ppr: 0.000346833, pps: 0.00317486, prp: 0.0606424, prps: 0.0258791, rb: 0.0855877, rbr: 0.00381516, rbs: 0.000373513, rp: 0.0159543, rrb: 0.000106718, to: 0.0569874, uh: 0.000106718, vb: 0.00293474, vbd: 0.00200096, vbg: 0.0234779, vbn: 0.0963663, vbp: 0.000106718, vbz: 0.000320154, wdt: 0.000160077, wp: 0.000586948, wrb: 0.00165413 } 38 | vbg: { cc: 0.00981521, cd: 0.0179306, det: 0.184789, ex: 0.000219334, in: 0.139826, jj: 0.0721062, jjr: 0.00937654, jjs: 0.000658003, lrb: 0.000383835, md: 0.000493502, nn: 0.12694, nnp: 0.0395898, nnps: 0.000658003, nns: 0.0894336, pdt: 0.00109667, pos: 5.48336e-05, pp: 0.0169984, ppc: 0.0129407, ppd: 0.00690903, ppl: 0.00509952, ppr: 0.00208368, pps: 0.00197401, prp: 0.0244558, prps: 0.0302681, rb: 0.0384932, rbr: 0.00438669, rbs: 0.000109667, rp: 0.0269781, rrb: 0.000438669, sym: 5.48336e-05, to: 0.0953008, uh: 0.000164501, vb: 0.000877337, vbd: 0.00307068, vbg: 0.00433185, vbn: 0.0253879, vbp: 0.000548336, vbz: 0.00202884, wdt: 0.000164501, wp: 0.00202884, wrb: 0.00153534 } 39 | vbn: { cc: 0.0132743, cd: 0.0116251, det: 0.0675382, ex: 8.04505e-05, in: 0.363596, jj: 0.0445696, jjr: 0.0033387, 
jjs: 0.000201126, lrb: 0.000402253, md: 0.00116653, nn: 0.0660097, nnp: 0.0261866, nnps: 0.000201126, nns: 0.0381738, pdt: 0.000643604, pos: 0.000120676, pp: 0.0502011, ppc: 0.0331054, ppd: 0.00659694, ppl: 0.00514883, ppr: 0.00217216, pps: 0.00418343, prp: 0.00897023, prps: 0.00993564, rb: 0.0495173, rbr: 0.00337892, rbs: 0.000241352, rp: 0.0212389, rrb: 0.000724055, to: 0.106315, vb: 0.000643604, vbd: 0.00213194, vbg: 0.0203138, vbn: 0.0319791, vbp: 0.00116653, vbz: 0.00168946, wdt: 0.000241352, wp: 0.00076428, wrb: 0.00221239 } 40 | vbp: { cc: 0.00429212, cd: 0.00897444, det: 0.119594, ex: 0.0018209, fw: 6.50322e-05, in: 0.0926058, jj: 0.0856474, jjr: 0.00851922, jjs: 0.00058529, lrb: 0.000650322, md: 0.00169084, nn: 0.0299798, nnp: 0.0186642, nnps: 0.000325161, nns: 0.0325161, pdt: 0.000780386, pos: 0.000130064, pp: 0.0194446, ppc: 0.0195747, ppd: 0.00266632, ppl: 0.00487741, ppr: 0.000455225, pps: 0.00273135, prp: 0.0357027, prps: 0.0108604, rb: 0.164076, rbr: 0.00526761, rbs: 0.000845418, rp: 0.00903947, rrb: 0.000325161, to: 0.0516356, vb: 0.00279638, vbd: 0.0036418, vbg: 0.084867, vbn: 0.165702, vbp: 0.00117058, vbz: 0.00260129, wdt: 0.000260129, wp: 0.00234116, wps: 6.50322e-05, wrb: 0.00221109 } 41 | vbz: { cc: 0.00276149, cd: 0.0202761, det: 0.162928, ex: 0.000718744, fw: 3.78286e-05, in: 0.0886325, jj: 0.073501, jjr: 0.00809533, jjs: 0.000643087, lrb: 0.000302629, md: 0.000794401, nn: 0.0347267, nnp: 0.0600719, nnps: 0.0002648, nns: 0.0160393, pdt: 0.000529601, pp: 0.0305277, ppc: 0.0251182, ppd: 0.00446378, ppl: 0.0099111, ppr: 0.000226972, pps: 0.00419898, prp: 0.0231511, prps: 0.0117269, rb: 0.135616, rbr: 0.00397201, rbs: 0.000870059, rp: 0.00696047, rrb: 0.000189143, to: 0.05122, uh: 3.78286e-05, vb: 0.00321543, vbd: 0.00287498, vbg: 0.0592775, vbn: 0.148515, vbp: 0.000189143, vbz: 0.000945716, wdt: 0.000302629, wp: 0.00223189, wrb: 0.00393418 } 42 | wdt: { cc: 0.000375728, cd: 0.0046966, det: 0.0272403, ex: 0.00169078, in: 0.00901747, jj: 
0.00920534, jjr: 0.000751456, jjs: 0.000751456, lrb: 0.000187864, md: 0.134135, nn: 0.0146534, nnp: 0.0212286, nnps: 0.000187864, nns: 0.0174714, pos: 0.000187864, ppc: 0.00544806, ppd: 0.00093932, ppl: 0.000751456, pps: 0.000375728, prp: 0.0338155, prps: 0.00169078, rb: 0.0479053, rbr: 0.000187864, to: 0.00150291, vb: 0.00112718, vbd: 0.229946, vbn: 0.00093932, vbp: 0.145595, vbz: 0.287995 } 43 | wp: { cd: 0.0017319, det: 0.0439903, in: 0.017319, jj: 0.00623485, jjr: 0.000692761, jjs: 0.000692761, lrb: 0.00034638, ls: 0.00034638, md: 0.0651195, nn: 0.0128161, nnp: 0.0193973, nns: 0.00969865, pdt: 0.00034638, pos: 0.00034638, pp: 0.00138552, ppc: 0.00692761, ppl: 0.000692761, prp: 0.0841704, prps: 0.0051957, rb: 0.0450294, rbr: 0.000692761, rbs: 0.00034638, rrb: 0.00103914, to: 0.00692761, vb: 0.00381018, vbd: 0.271562, vbn: 0.000692761, vbp: 0.164184, vbz: 0.227918, wdt: 0.00034638 } 44 | wps: { cd: 0.0182648, jj: 0.182648, jjr: 0.00456621, jjs: 0.00913242, nn: 0.351598, nnp: 0.0593607, nns: 0.328767, ppd: 0.0182648, ppl: 0.00456621, rb: 0.00456621, vbg: 0.0136986, vbn: 0.00456621 } 45 | wrb: { cc: 0.00342857, cd: 0.00647619, det: 0.252952, ex: 0.00952381, in: 0.00952381, jj: 0.0963809, jjr: 0.00114286, jjs: 0.00152381, lrb: 0.000380952, md: 0.0118095, nn: 0.0609524, nnp: 0.0998095, nnps: 0.00152381, nns: 0.0788571, pdt: 0.00228571, pp: 0.00304762, ppc: 0.0110476, ppd: 0.000380952, ppl: 0.00266667, prp: 0.212571, prps: 0.023619, rb: 0.0388571, rbs: 0.000380952, rp: 0.000380952, rrb: 0.000380952, to: 0.023619, vb: 0.00228571, vbd: 0.00685714, vbg: 0.00990476, vbn: 0.0102857, vbp: 0.00609524, vbz: 0.00990476, wdt: 0.000380952, wp: 0.000380952, wrb: 0.000380952 } 46 | -------------------------------------------------------------------------------- /lib/engtagger.rb: -------------------------------------------------------------------------------- 1 | #! 
/local/ruby/bin/ruby 2 | 3 | $LOAD_PATH << File.join(File.dirname(__FILE__), 'engtagger') 4 | require 'rubygems' 5 | require 'kconv' 6 | require 'porter' 7 | # use hpricot for extracting English text from docs with XML like tags 8 | begin 9 | require 'hpricot' 10 | rescue LoadError 11 | $no_hpricot = true 12 | end 13 | 14 | # File paths 15 | $lexpath = File.join(File.dirname(__FILE__), 'engtagger') 16 | $word_path = File.join($lexpath, "pos_words.hash") 17 | $tag_path = File.join($lexpath, "pos_tags.hash") 18 | 19 | # for memoization (code snipet from http://eigenclass.org/hiki/bounded-space-memoization) 20 | class Module 21 | def memoize(method) 22 | # alias_method is faster than define_method + old.bind(self).call 23 | alias_method "__memoized__#{method}", method 24 | module_eval <<-EOF 25 | def #{method}(*a, &b) 26 | # assumes the block won't change the result if the args are the same 27 | (@__memoized_#{method}_cache ||= {})[a] ||= __memoized__#{method}(*a, &b) 28 | end 29 | EOF 30 | end 31 | end 32 | 33 | # English part-of-speech tagger class 34 | class EngTagger 35 | VERSION = '0.1.1' 36 | 37 | ################# 38 | # Class methods # 39 | ################# 40 | 41 | # Return a class variable that holds probability data 42 | def self.hmm 43 | return @@hmm 44 | end 45 | 46 | # Return a class variable that holds lexical data 47 | def self.lexicon 48 | return @@lexicon 49 | end 50 | 51 | # Return a regexp from a string argument that matches an XML-style pos tag 52 | def self.get_ext(tag = nil) 53 | return nil unless tag 54 | return Regexp.new("<#{tag}>[^<]+\s*") 55 | end 56 | 57 | # Regexps to match XML-style part-of-speech tags 58 | NUM = get_ext('cd') 59 | GER = get_ext('vbg') 60 | ADJ = get_ext('jj[rs]*') 61 | PART = get_ext('vbn') 62 | NN = get_ext('nn[sp]*') 63 | NNP = get_ext('nnp') 64 | PREP = get_ext('in') 65 | DET = get_ext('det') 66 | PAREN = get_ext('[lr]rb') 67 | QUOT = get_ext('ppr') 68 | SEN = get_ext('pp') 69 | WORD = get_ext('\w+') 70 | 71 | # 
# Convert a Treebank-style, abbreviated tag into its verbose definition.
#
# TAGS is keyed by lower-cased tag names, so the tag is tried verbatim first
# and then in lower-cased string form; this lets upper-case Treebank tags
# ("NN") and symbols (:nn) resolve as well.
#
# Returns the definition String, or the argument itself (unchanged) when the
# tag is not in the table.
def self.explain_tag(tag)
  TAGS[tag] || TAGS[tag.to_s.downcase] || tag
end

# The following builds the hash that maps a pos tag to its definition;
# it is used by the explain_tag method. The flat array alternates
# tag, definition, tag, definition, ...
tags = [
  "CC", "Conjunction, coordinating",
  "CD", "Adjective, cardinal number",
  "DET", "Determiner",
  "EX", "Pronoun, existential there",
  "FW", "Foreign words",
  "IN", "Preposition / Conjunction",
  "JJ", "Adjective",
  "JJR", "Adjective, comparative",
  "JJS", "Adjective, superlative",
  "LS", "Symbol, list item",
  "MD", "Verb, modal",
  "NN", "Noun",
  "NNP", "Noun, proper",
  "NNPS", "Noun, proper, plural",
  "NNS", "Noun, plural",
  "PDT", "Determiner, prequalifier",
  "POS", "Possessive",
  "PRP", "Determiner, possessive second",
  "PRPS", "Determiner, possessive",
  "RB", "Adverb",
  "RBR", "Adverb, comparative",
  "RBS", "Adverb, superlative",
  "RP", "Adverb, particle",
  "SYM", "Symbol",
  "TO", "Preposition",
  "UH", "Interjection",
  "VB", "Verb, infinitive",
  "VBD", "Verb, past tense",
  "VBG", "Verb, gerund",
  "VBN", "Verb, past/passive participle",
  "VBP", "Verb, base present form",
  "VBZ", "Verb, present 3SG -s form",
  "WDT", "Determiner, question",
  "WP", "Pronoun, question",
  "WPS", "Determiner, possessive & question",
  "WRB", "Adverb, question",
  "PP", "Punctuation, sentence ender",
  "PPC", "Punctuation, comma",
  "PPD", "Punctuation, dollar sign",
  "PPL", "Punctuation, quotation mark left",
  "PPR", "Punctuation, quotation mark right",
  "PPS", "Punctuation, colon, semicolon, elipsis",
  "LRB", "Punctuation, left bracket",
  "RRB", "Punctuation, right bracket"
]
# Normalise keys and definitions alike: lower-case, collapse runs of
# punctuation/whitespace into "_", then spell out "&" and "/".
# NOTE: a "/" without surrounding spaces gets no separators, so
# "past/passive" becomes "pastorpassive".
tags = tags.map do |t|
  t.downcase.gsub(/[\.\,\'\-\s]+/, '_').gsub(/\&/, "and").gsub(/\//, "or")
end
TAGS = Hash[*tags]

# Hash storing config values:
#
# * :unknown_word_tag
#  => (String) Tag to assign to unknown words
# * :stem
#  => (Boolean) Stem single words using Porter module
# * :weight_noun_phrases
#  => (Boolean) When returning occurrence counts for a noun phrase, multiply
#               the value by the number of words in the NP.
# * :longest_noun_phrase
#  => (Integer) Will ignore noun phrases longer than this threshold. This
#               affects only the get_words() and get_nouns() methods.
# * :relax
#  => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
#               uncommon words, particularly words used polysemously
# * :tag_lex
#  => (String) Name of the YAML file containing a hash of adjacent part of
#              speech tags and the probability of each
# * :word_lex
#  => (String) Name of the YAML file containing a hash of words and corresponding
#              parts of speech
# * :unknown_lex
#  => (String) Name of the YAML file containing a hash of tags for unknown
#              words and corresponding parts of speech
# * :tag_path
#  => (String) Path of the marshalled tag-probability hash file
#              (pos_tags.hash by default)
# * :word_path
#  => (String) Path of the marshalled word lexicon hash file
#              (pos_words.hash by default)
# * :debug
#  => (Boolean) Print debug messages
attr_accessor :conf

###############
# Constructor #
###############

# Take a hash of parameters that override default values.
# See above for details.
def initialize(params = {})
  @conf = {
    :unknown_word_tag    => '',
    :stem                => false,
    :weight_noun_phrases => false,
    :longest_noun_phrase => 5,
    :relax               => false,
    :tag_lex             => 'tags.yml',
    :word_lex            => 'words.yml',
    :unknown_lex         => 'unknown.yml',
    :word_path           => $word_path,
    :tag_path            => $tag_path,
    :debug               => false,
    # assuming that we start analyzing from the beginning of a new sentence...
    :current_tag         => 'pp'
  }
  # merge! (not merge): the overrides have to be written into @conf itself
  @conf.merge!(params) if params
  if File.exist?(@conf[:word_path]) && File.exist?(@conf[:tag_path])
    # Marshal data is binary, so read it in binary mode; the block form closes
    # the file even when loading fails.
    # NOTE: Marshal.load must only ever be pointed at trusted lexicon files.
    @@lexicon = File.open(@conf[:word_path], 'rb') { |f| Marshal.load(f) }
    @@hmm = File.open(@conf[:tag_path], 'rb') { |f| Marshal.load(f) }
  else
    # Start with empty tables; call install to build them from the YAML data
    print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
    @@hmm = Hash.new
    @@lexicon = Hash.new
  end
  @@mnp = get_max_noun_regex
end

##################
# Public methods #
##################

# Examine the string provided and return it fully tagged in XML style,
# i.e. "<tag>word</tag>" items joined by single spaces. With +verbose+ set,
# the long tag names from EngTagger.explain_tag are used instead of the
# abbreviations. Returns nil for nil or blank text.
def add_tags(text, verbose = false)
  return nil unless valid_text(text)
  tagged = tag_words(text, verbose).map do |word, tag|
    # The closing tag is required by get_readable, get_sentences and the
    # tag-matching regexps
    "<#{tag}>#{word}</#{tag}>"
  end
  tagged.join(' ')
end

# Like add_tags, but return a hash that maps each word to its tag. A word
# that occurs more than once keeps the tag of its last occurrence.
# Returns nil for nil or blank text.
def add_tags_hash(text, verbose = false)
  return nil unless valid_text(text)
  tagged = Hash.new
  tag_words(text, verbose).each { |word, tag| tagged[word] = tag }
  tagged
end

# Tokenize +text+ and return an array of [word, tag] pairs. Every word is
# tagged using the tag of the word before it; the running tag is reset to the
# sentence-ender state once the whole text has been processed.
def tag_words(text, verbose)
  pairs = clean_text(text).map do |word|
    tag = assign_tag(@conf[:current_tag], clean_word(word))
    # fall back to a plain noun when no tag could be assigned
    tag = 'nn' unless tag && tag != ""
    @conf[:current_tag] = tag
    [word, verbose ? EngTagger.explain_tag(tag) : tag]
  end
  reset
  pairs
end
private :tag_words

# Given a text string, return as many nouns and noun phrases as possible.
# Applies add_tags and involves three stages:
#
# * Tag the text
# * Extract all the maximal noun phrases
# * Recursively extract all noun phrases from the MNPs
#
# Returns false (not nil) for nil or blank text.
def get_words(text)
  return false unless valid_text(text)
  tagged = add_tags(text)
  return get_nouns(tagged) if @conf[:longest_noun_phrase] <= 1
  get_noun_phrases(tagged)
end

# Return an easy-on-the-eyes tagged version of a text string.
# Applies add_tags and reformats every "<tag>word</tag>" as "word/TAG".
def get_readable(text, verbose = false)
  return nil unless valid_text(text)
  add_tags(text, verbose).gsub(/<\w+>([^<]+)<\/(\w+)>/) do
    $1 + '/' + $2.upcase
  end
end

# Return an array of sentences (without POS tags) from a text.
# The tagged text is cut after every sentence-ender tag, the tags are removed
# and the punctuation that the tokenizer split off is re-attached.
def get_sentences(text)
  return nil unless valid_text(text)
  sentences = add_tags(text).split(/<\/pp>/).map do |chunk|
    sentence = strip_tags(chunk)
    next if sentence.nil? # nothing but tags/whitespace in this chunk
    # Each step works on the result of the previous one
    sentence = sentence.gsub(/ ('s?) /) { $1 + ' ' }  # re-attach 's and '
    sentence = sentence.gsub(/ (\W+) /) { $1 + ' ' }  # re-attach inner punctuation
    sentence = sentence.gsub(/ (`+) /) { ' ' + $1 }   # re-attach opening quotes
    sentence = sentence.gsub(/ (\W+)\z/) { $1 }       # re-attach final punctuation
    sentence.gsub(/\A(`+) /) { $1 }                   # leading opening quote
  end
  sentences.compact
end

# Given a POS-tagged text, this method returns a hash of all proper nouns
# and their occurrence frequencies. The method is greedy and will
# return multi-word phrases, if possible, so it would find ``Linguistic
# Data Consortium'' as a single unit, rather than as three individual
# proper nouns. This method does not stem the found words.
def get_proper_nouns(tagged)
  return nil unless valid_text(tagged)
  nnp = Hash.new(0)
  # (?:NNP)+ takes a run of adjacent proper nouns as one phrase
  tagged.scan(/(?:#{NNP})+/).each do |match|
    phrase = strip_tags(match)
    next if phrase.nil? || phrase =~ /\A\s*\z/
    next unless phrase.length < 100 # sanity check on word length
    nnp[phrase] += 1
  end
  # Now for some fancy resolution stuff...
  nnp.keys.each do |key|
    words = key.split(/\s/)
    # Let's say this is an organization's name --
    # (and it's got at least three words)
    # is there a corresponding acronym in this hash?
    next unless words.length > 2
    # Make a (naive) acronym out of this name: the first letter of every
    # purely alphabetic word
    acronym = words.map { |word| word =~ /\A([a-z])[a-z]*\z/i ? $1 : '' }.join
    # If that acronym has been seen, remove it and add its count to the full
    # name. key? is needed here: nnp[...] returns the default 0 for a missing
    # entry, and 0 is truthy in Ruby.
    next if acronym.empty? || !nnp.key?(acronym)
    nnp[key] += nnp[acronym]
    nnp.delete(acronym)
  end
  nnp
end

# Given a POS-tagged text, this method returns all nouns and their
# occurrence frequencies.
def get_nouns(tagged)
  return nil unless valid_text(tagged)
  ret = Hash.new(0)
  tagged.scan(NN).each do |match|
    noun = strip_tags(match)
    next if noun.nil? # strip_tags returns nil when nothing but tags is left
    noun = stem(noun)
    next unless noun.length < 100 # sanity check on word length
    ret[noun] += 1 unless noun =~ /\A\s*\z/
  end
  ret
end

# Given a POS-tagged text, this method returns only the maximal noun phrases
# as a hash of phrase => occurrence count (nil for nil or blank text).
# May be called directly, but is also used by get_noun_phrases
def get_max_noun_phrases(tagged)
  return unless valid_text(tagged)
  ret = Hash.new(0)
  tagged.scan(@@mnp).each do |match|
    phrase = strip_tags(match)
    next if phrase.nil?
    phrase = stem(phrase) unless phrase =~ /\s/ # stem single words
    ret[phrase] += 1 unless phrase =~ /\A\s*\z/
  end
  ret
end

# Similar to get_words, but requires a POS-tagged text as an argument.
# Returns a hash of noun phrase (or single noun) => weighted occurrence count.
def get_noun_phrases(tagged)
  return nil unless valid_text(tagged)
  found = Hash.new(0)
  phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
  # Find MNPs in the text, one sentence at a time
  # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
  mn_phrases = []
  tagged.scan(@@mnp).each do |m|
    found[m] += 1 if phrase_ext =~ m
    mn_phrases += m.split(phrase_ext)
  end
  mn_phrases.each do |mnp|
    # Split the phrase into an array of words, and loop once per word,
    # shortening the phrase by removing the word in the first position.
    # Record the phrase and any single nouns that are found
    words = mnp.split
    until words.empty?
      found[words.join(' ')] += 1 if words.length > 1
      w = words.shift
      found[w] += 1 if w =~ NN
    end
  end
  ret = Hash.new(0)
  found.each do |phrase, count|
    k = strip_tags(phrase)
    next if k.nil?
    # We weight by the word count to favor long noun phrases
    word_count = k.scan(/\s+/).length + 1
    # Throttle MNPs if necessary
    next if word_count > @conf[:longest_noun_phrase]
    k = stem(k) unless word_count > 1 # stem single words
    multiplier = @conf[:weight_noun_phrases] ? word_count : 1
    ret[k] += multiplier * count
  end
  ret
end

# Reads the included corpus data (the YAML files named by :tag_lex, :word_lex
# and :unknown_lex) and saves the resulting hashes in marshalled form at
# :word_path and :tag_path on the local file system.
def install
  puts "Creating part-of-speech lexicon" if @conf[:debug]
  load_tags(@conf[:tag_lex])
  load_words(@conf[:word_lex])
  load_words(@conf[:unknown_lex])
  # Marshal output is binary, so write it in binary mode
  File.open(@conf[:word_path], 'wb') do |f|
    Marshal.dump(@@lexicon, f)
  end
  File.open(@conf[:tag_path], 'wb') do |f|
    Marshal.dump(@@hmm, f)
  end
end

##################
# Helper methods #
##################

# NOTE: this section is marked with the bare symbol :private, which is an
# expression without any effect (it is not a call to `private`). The methods
# below are therefore public, and they are left that way so that existing
# callers keep working.
:private

# Downcase the first letter of word. An empty string is returned as is.
def lcfirst(word)
  word.sub(/\A./m) { |first| first.downcase }
end

# Upcase the first letter of word. An empty string is returned as is.
def ucfirst(word)
  word.sub(/\A./m) { |first| first.upcase }
end

# Return the word stem as given by Stemmable module. This can be
# turned off with the class parameter @conf[:stem] => false.
def stem(word)
  return word unless @conf[:stem]
  word.stem
end

# This method will reset the preceding tag to a sentence ender (PP).
# This prepares the first word of a new sentence to be tagged correctly.
def reset
  @conf[:current_tag] = 'pp'
end

# Check whether the text is a valid string: false for nil/false and for
# strings that are empty or consist of whitespace only, true otherwise.
def valid_text(text)
  unless text
    # there's nothing to parse
    puts "method call on uninitialized variable" if @conf[:debug]
    return false
  end
  # text is an empty string, nothing to parse
  return false if /\A\s*\z/ =~ text
  true
end

# Return a text string with the part-of-speech tags removed. Runs of
# whitespace are collapsed to one space and the ends are trimmed; with
# +downcase+ set the result is lower-cased. Returns nil for invalid text.
def strip_tags(tagged, downcase = false)
  return nil unless valid_text(tagged)
  text = tagged.gsub(/<[^>]+>/m, "").gsub(/\s+/m, " ")
  text = text.sub(/\A\s+/, "").sub(/\s+\z/, "")
  downcase ? text.downcase : text
end

# Strip the provided text of HTML-style tags and separate off any punctuation
# in preparation for tagging. Returns the array of tokens, or false for
# invalid text.
def clean_text(text)
  return false unless valid_text(text)
  text = text.toutf8
  # Strip out any markup and convert entities to their proper form
  # (only possible when hpricot could be loaded)
  cleaned_text = $no_hpricot ? text : Hpricot(text).inner_text
  # Tokenize the text (splitting on punctuation as you go)
  tokenized = []
  cleaned_text.split(/\s+/).each do |chunk|
    tokenized.concat(split_punct(chunk))
  end
  split_sentences(tokenized)
end

# Lower-cased abbreviations (without the final period) that keep their period
# when split_sentences looks for sentence boundaries. Built once instead of
# on every call.
ABBREVIATIONS = [
  # people
  %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
     supt det mssrs rev),
  # army
  %w(col gen lt cmdr adm capt sgt cpl maj brig),
  # institutions
  %w(dept univ assn bros ph.d),
  # places
  %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
     hwy hway la pde pd plz pl rd st tce),
  # companies
  %w(mfg inc ltd co corp),
  # states
  %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
     ind ia kans kan ken ky la me md is mass mich minn miss mo mont
     neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
     va wash wis wisc wy wyo usafa alta man ont que sask yuk),
  # months
  %w(jan feb mar apr may jun jul aug sep sept oct nov dec),
  # miscellaneous
  %w(vs etc no esp)
].flatten.inject({}) { |seen, abbr| seen[abbr] = true; seen }.freeze

# This handles all of the trailing periods, keeping those that
# belong on abbreviations and removing those that seem to be
# at the end of sentences. This method makes some assumptions
# about the use of capitalization in the incoming text
def split_sentences(array)
  tokenized = array
  words = Array.new
  tokenized.each_with_index do |token, i|
    following = tokenized[i + 1]
    # A period may end a sentence only when the next token contains a capital
    # letter or a non-word character
    if following && following =~ /[A-Z\W]/ && token =~ /\A(.+)\.\z/
      w = $1
      # Don't separate the period off words that
      # meet any of the following conditions:
      #
      # 1. It is defined in one of the lists above
      # 2. It is only one letter long: Alfred E. Sloan
      # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
      unless ABBREVIATIONS[w.downcase] || w =~ /\A[a-z]\z/i || w =~ /[a-z](?:\.[a-z])+\z/i
        words << w
        words << '.'
        next
      end
    end
    words << token
  end
  # If the final word ends in a period..
  if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
    words[-1] = $1
    words.push '.'
  end
  words
end

# Separate punctuation from words, where appropriate. This leaves trailing
# periods in place to be dealt with later. Called by the clean_text method.
def split_punct(text)
  # If there's no punctuation, return immediately
  return [text] if /\A\w+\z/ =~ text
  # Sanity checks
  text = text.gsub(/\W{10,}/o, " ")

  # Put quotes into a standard format
  text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
  text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
  text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
  text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
  text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes

  # Handle all other punctuation
  text = text.gsub(/--+/o, " - ") # Convert and separate dashes
  text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
  text = text.gsub(/:/o, " :") # Shift colons off the preceding text
  text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
  text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
  text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation

  # English-specific contractions
  text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
  text = text.gsub(/n't\b/o, " n't") # Separate off n't
  text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
  text.split(' ')
end

# Given a preceding tag, assign a tag word. Called by the add_tags method.
# This method is a modified version of the Viterbi algorithm for part-of-speech tagging.
# Returns the most probable tag for +word+ given +prev_tag+, or "" when no tag
# qualifies (for example when the word or the previous tag is not in the data).
def assign_tag(prev_tag, word)
  # classify unknown words accordingly
  return @conf[:unknown_word_tag] if word == "-unknown-"
  # If this is a symbol, tag it as a symbol
  return "sym" if word == "-sym-"

  # Missing entries are treated as empty tables instead of raising on nil;
  # this matters when the tagger runs with an empty (not yet installed) lexicon
  word_tags = EngTagger.lexicon[word] || {}
  transitions = EngTagger.hmm[prev_tag] || {}

  # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
  # which is used in most POS taggers
  best_tag = ""
  best_so_far = 0
  transitions.each do |tag, transition|
    # With @conf[:relax] set, this method
    # will also include any `open classes' of POS tags
    if word_tags[tag]
      pw = word_tags[tag]
    elsif @conf[:relax] && tag =~ /\A(?:jj|nn|rb|vb)/
      pw = 0
    else
      next
    end

    # Bayesian logic:
    # P =  P( tag | prev_tag ) * P( tag | word )
    probability = transition * (pw + 1)
    # Set the tag with maximal probability
    if probability > best_so_far
      best_so_far = probability
      best_tag = tag
    end
  end
  best_tag
end

# This method determines whether a word should be considered in its
# lower or upper case form. This is useful in considering proper nouns
# and words that begin sentences. Called by add_tags.
def clean_word(word)
  lexicon = EngTagger.lexicon
  # seen this word as it appears (lower or upper case)
  return word if lexicon[word]
  # seen this word only as lower case
  lcf = lcfirst(word)
  return lcf if lexicon[lcf]
  # never seen this word. guess.
  classify_unknown_word(word)
end

# This changes any word not appearing in the lexicon to identifiable
# classes of words handled by a simple unknown word classification
# metric. Called by the clean_word method.
629 | def classify_unknown_word(word) 630 | if /[\(\{\[]/ =~ word # Left brackets 631 | classified = "*LRB*" 632 | elsif 633 | /[\)\}\]]/ =~ word # Right brackets 634 | classified = "*RRB*" 635 | elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating point number 636 | classified = "*NUM*" 637 | elsif /\A\d+[\d\/:-]+\d\z/ =~ word # Other number constructs 638 | classified = "*NUM*" 639 | elsif /\A-?\d+\w+\z/o =~ word # Ordinal number 640 | classified = "*ORD*" 641 | elsif /\A[A-Z][A-Z\.-]*\z/o =~ word # Abbreviation (all caps) 642 | classified = "-abr-" 643 | elsif /\w-\w/o =~ word # Hyphenated word 644 | /-([^-]+)\z/ =~ word 645 | h_suffix = $1 646 | if h_suffix and (@@lexicon[h_suffix] and @@lexicon[h_suffix]['jj']) 647 | # last part of this is defined as an adjective 648 | classified = "-hyp-adj-" 649 | else 650 | # last part of this is not defined as an adjective 651 | classified = "-hyp-" 652 | end 653 | elsif /\A\W+\z/o =~ word 654 | classified = "-sym-" # Symbol 655 | elsif word == ucfirst(word) 656 | classified = "-cap-" # Capitalized word 657 | elsif /ing\z/o =~ word 658 | classified = "-ing-" # Ends in 'ing' 659 | elsif /s\z/o =~ word 660 | classified = "-s-" # Ends in 's' 661 | elsif /tion\z/o =~ word 662 | classified = "-tion-" # Ends in 'tion' 663 | elsif /ly\z/o =~ word 664 | classified = "-ly-" # Ends in 'ly' 665 | elsif /ed\z/o =~ word 666 | classified = "-ed-" # Ends in 'ed 667 | else 668 | classified = "-unknown-" # Completely unknown 669 | end 670 | return classified 671 | end 672 | 673 | # This returns a compiled regexp for extracting maximal noun phrases 674 | # from a POS-tagged text. 675 | def get_max_noun_regex 676 | regex = / 677 | # optional number, gerund - adjective -participle 678 | (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})* 679 | # Followed by one or more nouns 680 | (?:#{NN})+ 681 | (?: 682 | # Optional preposition, determinant, cardinal 683 | (?:#{PREP})*(?:#{DET})?(?:#{NUM})? 
684 | # Optional gerund-adjective -participle 685 | (?:#{GER}|#{ADJ}|#{PART})* 686 | # one or more nouns 687 | (?:#{NN})+ 688 | )* 689 | /xo #/ 690 | return regex 691 | end 692 | 693 | # Load the 2-grams into a hash from YAML data: This is a naive (but fast) 694 | # YAML data parser. It will load a YAML document with a collection of key: 695 | # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ). 696 | # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 } 697 | def load_tags(lexicon) 698 | path = File.join($lexpath, lexicon) 699 | fh = File.open(path, 'r') 700 | while line = fh.gets 701 | /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line 702 | next unless $1 and $2 703 | key, data = $1, $2 704 | tags = Hash.new 705 | items = data.split(/,\s+/) 706 | pairs = {} 707 | items.each do |i| 708 | /([^:]+):\s*(.+)/ =~ i 709 | pairs[$1] = $2.to_f 710 | end 711 | @@hmm[key] = pairs 712 | end 713 | fh.close 714 | end 715 | 716 | # Load the 2-grams into a hash from YAML data: This is a naive (but fast) 717 | # YAML data parser. It will load a YAML document with a collection of key: 718 | # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ). 719 | # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 } 720 | def load_words(lexicon) 721 | path = File.join($lexpath, lexicon) 722 | fh = File.open(path, 'r') 723 | while line = fh.gets 724 | /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line 725 | next unless $1 and $2 726 | key, data = $1, $2 727 | tags = Hash.new 728 | items = data.split(/,\s+/) 729 | pairs = {} 730 | items.each do |i| 731 | /([^:]+):\s*(.+)/ =~ i 732 | pairs[$1] = $2.to_f 733 | end 734 | @@lexicon[key] = pairs 735 | end 736 | fh.close 737 | end 738 | 739 | #memoize the stem and assign_tag methods 740 | memoize("stem") 741 | memoize("assign_tag") 742 | end 743 | 744 | --------------------------------------------------------------------------------