├── lib ├── language │ ├── current.rb │ ├── dsl.rb │ ├── mixin.rb │ ├── codes.rb │ ├── censor.rb │ ├── class.rb │ ├── words.rb │ ├── matcher.rb │ └── codes_iso639.txt └── language.rb ├── work ├── deprecated │ └── meta │ │ ├── name │ │ ├── version │ │ ├── created │ │ ├── authors │ │ ├── collection │ │ ├── released │ │ ├── summary │ │ ├── contact │ │ ├── homepage │ │ ├── repository │ │ └── description └── test_word_filter.rb ├── REQUIRE ├── .gitignore ├── VERSION ├── HISTORY.rdoc ├── PROFILE ├── README.rdoc ├── LICENSE └── Syckfile /lib/language/current.rb: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lib/language/dsl.rb: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /work/deprecated/meta/name: -------------------------------------------------------------------------------- 1 | language 2 | -------------------------------------------------------------------------------- /work/deprecated/meta/version: -------------------------------------------------------------------------------- 1 | 0.6.0 2 | -------------------------------------------------------------------------------- /REQUIRE: -------------------------------------------------------------------------------- 1 | development: 2 | - syckle 3 | -------------------------------------------------------------------------------- /work/deprecated/meta/created: -------------------------------------------------------------------------------- 1 | 2007-08-01 2 | -------------------------------------------------------------------------------- /work/deprecated/meta/authors: -------------------------------------------------------------------------------- 1 | Thomas Sawyer 2 | -------------------------------------------------------------------------------- /work/deprecated/meta/collection: -------------------------------------------------------------------------------- 1 | rubyworks 2 | -------------------------------------------------------------------------------- /work/deprecated/meta/released: -------------------------------------------------------------------------------- 1 | 2010-04-14 2 | -------------------------------------------------------------------------------- /work/deprecated/meta/summary: -------------------------------------------------------------------------------- 1 | Language Support Library 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | MANIFEST 2 | .cache 3 | log 4 | doc/rdoc 5 | doc/ri 6 | -------------------------------------------------------------------------------- /work/deprecated/meta/contact: -------------------------------------------------------------------------------- 1 | rubyworks-mailinglist@googlegroups.com 2 | -------------------------------------------------------------------------------- /work/deprecated/meta/homepage: -------------------------------------------------------------------------------- 1 | http://rubyworks.github.com/language 2 | -------------------------------------------------------------------------------- /work/deprecated/meta/repository: -------------------------------------------------------------------------------- 1 | git://github.com/rubyworks/language.git 2 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | name : language 2 | major: 0 3 | minor: 6 4 | patch: 0 5 | date : 2010-05-29 6 | -------------------------------------------------------------------------------- /lib/language.rb: -------------------------------------------------------------------------------- 1 | require 'language/class' 2 | require 'language/censor' 3 | require 'language/words' 4 | require 'language/mixin' 5 | 6 | -------------------------------------------------------------------------------- /work/deprecated/meta/description: -------------------------------------------------------------------------------- 1 | Language is a support library for other langauge libraries. 2 | While some of it's contents are prefectly usable on there own, 3 | most are generally intended to be subclassed and extended by 4 | specific language modules, such as English. 5 | -------------------------------------------------------------------------------- /HISTORY.rdoc: -------------------------------------------------------------------------------- 1 | = Release History 2 | 3 | == 0.6.0 / 2010-05-29 4 | 5 | This is the first release of Language, however the code was previously 6 | released with the English project. Hence the current version of 7 | Language matches the present version of English. Language combines 8 | all the features previously part of English that are language 9 | netural or multi-lingual. It threfore provide a dependency for 10 | the English library. 11 | -------------------------------------------------------------------------------- /work/test_word_filter.rb: -------------------------------------------------------------------------------- 1 | require 'language/censor' 2 | require 'test/unit' 3 | 4 | class TestCensor < Test::Unit::TestCase 5 | 6 | def test_word_filter 7 | s = "this is a test" 8 | n = s.word_filter{ |w| "#{w}1" } 9 | assert_equal( 'this1 is1 a1 test1', n ) 10 | end 11 | 12 | def test_word_filter! 13 | s = "this is a test" 14 | s.word_filter!{ |w| "#{w}1" } 15 | assert_equal( 'this1 is1 a1 test1', s ) 16 | end 17 | 18 | end 19 | -------------------------------------------------------------------------------- /lib/language/mixin.rb: -------------------------------------------------------------------------------- 1 | require 'language/class' 2 | 3 | class Language 4 | 5 | module Mixin 6 | # 7 | def method_missing(s,*a,&b) 8 | return super(s,*a,&b) if s == Language.current.to_sym 9 | 10 | lang = __send__(Language.current) 11 | if lang && lang.respond_to?(s) 12 | lang.__send__(s,*a,&b) 13 | else 14 | super(s,*a,&b) 15 | end 16 | end 17 | end 18 | 19 | end 20 | 21 | class String 22 | include Language::Mixin 23 | end 24 | 25 | class Numeric 26 | include Language::Mixin 27 | end 28 | 29 | class Array 30 | include Language::Mixin 31 | end 32 | 33 | -------------------------------------------------------------------------------- /PROFILE: -------------------------------------------------------------------------------- 1 | --- 2 | title : Language 3 | suite : rubyworks 4 | summary: Language Support Library 5 | license: MIT 6 | authors: Thomas Sawyer 7 | created: 2007-08-01 8 | 9 | description: 10 | Language is a support library for other langauge libraries. 11 | While some of it's contents are prefectly usable on there own, 12 | most are generally intended to be subclassed and extended by 13 | specific language modules, such as English. 14 | 15 | resources: 16 | homepage : http://rubyworks.github.com/language 17 | development : http://github.com/rubyworks/language 18 | respository : git://github.com/rubyworks/language.git 19 | subscribe : rubyworks-mailinglist+subscribe@googlegroups.com 20 | 21 | copyright: 22 | COpyright (c) 2007 Thomas Sawyer 23 | 24 | -------------------------------------------------------------------------------- /README.rdoc: -------------------------------------------------------------------------------- 1 | = Language 2 | 3 | * home: http://rubyworks.github.com/language 4 | * work: http://github.com/rubyworks/language 5 | 6 | Language is a support library for other langauge libraries. 7 | While some of it's contents are prefectly usable on there own, 8 | most are generally intended to be subclassed and extended by 9 | specific language modules, such as English. 10 | 11 | 12 | == SYNOPSIS 13 | 14 | require 'language' 15 | 16 | "How many words?".words #=> ['How', 'many', 'words'] 17 | 18 | 19 | == INSTALLATION 20 | 21 | The usual Rubygems way: 22 | 23 | $ gem install language 24 | 25 | 26 | == COPYING 27 | 28 | (MIT License) 29 | 30 | Copyright (c) 2010 Thomas Sawyer 31 | 32 | English is distributed under the terms of the MIT license. 33 | 34 | See LICENCE for details. 35 | 36 | Some libraries are subtantial derivatives of other persons 37 | work. Fully copyright and licensing information is given 38 | for those in the corresponding source files. 39 | -------------------------------------------------------------------------------- /lib/language/codes.rb: -------------------------------------------------------------------------------- 1 | require 'language/class' 2 | 3 | class Language 4 | 5 | # A hash of International 2- and 3-letter ISO639-1 and ISO639-2 language codes. 6 | module Codes 7 | 8 | # Hash of ISO639 2--letter language codes 9 | ISO639_1 = {} 10 | 11 | # Hash of ISO639 3-letter language codes 12 | ISO639_2 = {} 13 | 14 | file = File.join(File.dirname(__FILE__), 'codes_iso639.txt') 15 | 16 | File.readlines(file).each do |line| 17 | next if /^#/ =~ line 18 | 19 | codes3, codes2, desc = line[0,7].strip, line[9,6].strip, line[15...-1].strip 20 | 21 | codes3 = codes3.split('/') 22 | codes2 = codes2.split('/') 23 | 24 | codes2.each do |code| 25 | if ISO639_1.key?(code) 26 | raise "Duplicate language code #{code}" 27 | end 28 | ISO639_1[code] = desc 29 | end 30 | 31 | codes3.each do |code| 32 | if ISO639_2.key?(code) 33 | raise "Duplicate language code #{code}" 34 | end 35 | ISO639_2[code] = desc 36 | end 37 | 38 | end 39 | 40 | end 41 | 42 | end 43 | 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2009 Thomas Sawyer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | 23 | 24 | -------------------------------------------------------------------------------- /Syckfile: -------------------------------------------------------------------------------- 1 | --- 2 | email: 3 | service : Email 4 | file : ~ 5 | subject : ~ 6 | mailto : ruby-talk@ruby-lang.org 7 | active : true 8 | 9 | gemcutter: 10 | active: true 11 | 12 | grancher: 13 | active: true 14 | 15 | box: 16 | service: Box 17 | types : [gem] 18 | active : true 19 | 20 | testrb: 21 | service : Testrb 22 | tests : ~ 23 | exclude : ~ 24 | loadpath : ~ 25 | requires : ~ 26 | live : false 27 | active : false 28 | 29 | syntax: 30 | service : Syntax 31 | loadpath : ~ 32 | exclude : ~ 33 | active : false 34 | 35 | dnote: 36 | service : DNote 37 | loadpath : ~ 38 | labels : ~ 39 | output : ~ 40 | format : ~ 41 | active : false 42 | 43 | rdoc: 44 | service : RDoc 45 | template : redfish 46 | include : ~ 47 | exclude : [Syckfile] 48 | main : ~ 49 | extra : ~ 50 | active : true 51 | 52 | ridoc: 53 | service: RIDoc 54 | include: ~ 55 | exclude: ~ 56 | active : true 57 | 58 | stats: 59 | service : Stats 60 | title : ~ 61 | loadpath : ~ 62 | exclude : ~ 63 | output : ~ 64 | active : true 65 | 66 | vclog: 67 | service : VClog 68 | format : html, atom 69 | layout : history 70 | typed : false 71 | output : ~ 72 | active : false 73 | 74 | #rubyforge: 75 | # service : Rubyforge 76 | # unixname: <%= project %> 77 | # groupid : ~ 78 | # package : <%= package %> 79 | # sitemap: 80 | # doc/rdoc: <%= package %> 81 | # active : false 82 | 83 | -------------------------------------------------------------------------------- /lib/language/censor.rb: -------------------------------------------------------------------------------- 1 | require 'language/class' 2 | 3 | class Language 4 | 5 | # = Censor 6 | # 7 | # This class allows one to define a resuable text filter. 8 | # This is useful for removing or replacing curse words or 9 | # senstive information from user input. 10 | 11 | class Censor 12 | 13 | # Default censor list. 14 | def self.default_words 15 | [] 16 | end 17 | 18 | # Abritraty rules. 19 | attr :rules 20 | 21 | # Word-oriented rules. 22 | attr :word_rules 23 | 24 | # New Censor object. 25 | # 26 | def initialize() 27 | @rules = [] 28 | @word_rules = [] 29 | 30 | self.class.default_words.each do |word| 31 | word_rule(word) 32 | end 33 | end 34 | 35 | # Create new rule. A rule consists of a string or regexp 36 | # to match against. 37 | # 38 | # NOTE: The rules must be applied in order! So we cannot 39 | # use a hash because the ordering is not guaranteed. So 40 | # an array is used instead. 41 | # 42 | def rule(match, &edit) 43 | edit = lambda{''} unless edit 44 | @rules << [match, edit] 45 | end 46 | 47 | # Rules that apply only to words. This takes the regular 48 | # expression and add word boundry matches to either side. 49 | # 50 | # filter.word_rule(/damn/){ |w| 'darn' } 51 | # 52 | # Is equivalent to teh regular rule: 53 | # 54 | # filter.rule(/\bdamn\b/){ |w| 'darn' } 55 | # 56 | def word_rule(match, &edit) 57 | edit = lambda{''} unless edit 58 | @word_rules << [/\b#{match}\b/, edit] 59 | end 60 | 61 | # Apply the set of rules (regular expression matches) to 62 | # a string. 63 | # 64 | def filter(string) 65 | rewritten_string = string.dup 66 | rules.each do |match,edit| 67 | rewritten_string.gsub!(match,edit) 68 | end 69 | return (rewritten_string or string) 70 | end 71 | 72 | alias_method :apply, :filter 73 | 74 | # Is the string clear of any matching rules? 75 | # 76 | # Note that running a filter does not necessarily clear a 77 | # a string of all matches, since the filter could apply 78 | # edits that would also match the filter expressions. 79 | # 80 | def censored?(string) 81 | case string 82 | when *matches 83 | false 84 | else 85 | true 86 | end 87 | end 88 | 89 | # 90 | # 91 | def matches 92 | rules.collect{ |match, modify| match } 93 | end 94 | 95 | end 96 | 97 | end 98 | -------------------------------------------------------------------------------- /lib/language/class.rb: -------------------------------------------------------------------------------- 1 | class Language 2 | 3 | # 4 | def self.abbreviation 5 | 'lang' 6 | end 7 | 8 | # 9 | def self.default 10 | @default || abbreviation 11 | end 12 | 13 | # 14 | def self.default=(lang) 15 | @default = lang 16 | end 17 | 18 | # 19 | def self.current 20 | @current || default 21 | end 22 | 23 | # 24 | def self.current=(lang) 25 | @current = lang 26 | end 27 | 28 | # 29 | def self.instance(string) 30 | @cache ||= {} 31 | @cache[string.object_id] = new(string) 32 | end 33 | 34 | # 35 | def initialize(subject) 36 | @self = subject 37 | end 38 | 39 | end 40 | 41 | class String 42 | # Higher-order function to invoke Language functions. 43 | def lang 44 | Language.instance(self) 45 | end 46 | end 47 | 48 | class Array 49 | # Higher-order function to invoke Language functions. 50 | def lang 51 | Language.instance(self) 52 | end 53 | end 54 | 55 | class Integer 56 | # Higher-order function to invoke Language functions. 57 | def lang 58 | Language.instance(self) 59 | end 60 | end 61 | 62 | 63 | 64 | 65 | =begin 66 | module Language 67 | extend self 68 | 69 | # Subclass this in your specific language modules. 70 | # 71 | # class English::String < Language::String 72 | # 73 | class String < ::String 74 | 75 | # 76 | def self.language 77 | Language 78 | end 79 | 80 | # 81 | def self.instance(string) 82 | @cache ||= {} 83 | @cache[string.object_id] = new(string) 84 | end 85 | 86 | # 87 | def initialize(string) 88 | super() 89 | replace(string) 90 | end 91 | 92 | def language 93 | @_language ||= self.class.language 94 | end 95 | 96 | end 97 | 98 | # TODO: We can't actually subclass Integer. 99 | # But we can fake it. However we need to sublass 100 | # it just so #is_a? works. However subclassing it causes 101 | # the .new method not to exist, how to fix? 102 | # 103 | class Integer #< ::Integer 104 | instance_methods{ |m| private m unless /^__/ =~ m.to_s } 105 | 106 | # 107 | def self.language 108 | Language 109 | end 110 | 111 | # 112 | def self.instance(integer) 113 | @cache ||= {} 114 | @cache[integer] = new(integer) 115 | end 116 | 117 | # 118 | def initialize(integer) 119 | @integer = integer 120 | end 121 | 122 | # 123 | def to_i 124 | @integer 125 | end 126 | 127 | # 128 | def method_missing(s,*a,&b) 129 | @integer.__send__(s,*a,&b) 130 | end 131 | 132 | # 133 | def language 134 | @_language ||= self.class.language 135 | end 136 | end 137 | 138 | # 139 | class Array < ::Array 140 | 141 | # 142 | def self.language 143 | Language 144 | end 145 | 146 | # 147 | def self.instance(array) 148 | @cache ||= {} 149 | @cache[array.object_id] = new(array) 150 | end 151 | 152 | def language 153 | @_language ||= self.class.language 154 | end 155 | end 156 | 157 | end 158 | =end 159 | 160 | -------------------------------------------------------------------------------- /lib/language/words.rb: -------------------------------------------------------------------------------- 1 | # This module charaterizes the most common forms of Orthography 2 | # in computer systems --words divided by spaces, used paragraphs 3 | # by blank lines, and so on. 4 | 5 | require 'language/class' 6 | 7 | class Language 8 | 9 | # If block given, iterate through each word. 10 | # 11 | # "a string".each_word { |word, range| ... } 12 | # 13 | # Returns an array of words. 14 | # 15 | # "abc 123".words #=> ["abc","123"] 16 | # 17 | def self.words(string, &yld) 18 | if block_given? 19 | string.scan(/([-'\w]+)/).each do |word| 20 | range = $~.begin(0)...$~.end(0) 21 | if yld.arity == 1 22 | yld.call(word) 23 | else 24 | yld.call(word, range) 25 | end 26 | end 27 | else 28 | string.scan(/([-'\w]+)/).flatten 29 | end 30 | end 31 | 32 | # 33 | def self.sentences(string, &yld) 34 | if block_given? 35 | string.scan(/(.*?\.\ )/).each do |sentence| 36 | range = $~.begin(0)...$~.end(0) 37 | if yld.arity == 1 38 | yld.call(sentence) 39 | else 40 | yld.call(sentence, range) 41 | end 42 | end 43 | else 44 | string.scan(/(.*?\.\ )/) 45 | end 46 | end 47 | 48 | # 49 | def self.paragraphs(string, &yld) 50 | if block_given? 51 | string.scan(/(.*?\n\s{2,})/).each do |paragraph| 52 | range = $~.begin(0)...$~.end(0) 53 | if yld.arity == 1 54 | yld.call(paragraph) 55 | else 56 | yld.call(paragraph, range) 57 | end 58 | end 59 | else 60 | string.scan(/(.*?\n\s{2,})/) 61 | end 62 | end 63 | 64 | # Word wrap a string not exceeding max width. 65 | # 66 | # puts "this is a test".word_wrap(4) 67 | # 68 | # _produces_ 69 | # 70 | # this 71 | # is a 72 | # test 73 | # 74 | # CREDIT: Gavin Kistner 75 | # CREDIT: Dayne Broderson 76 | 77 | def self.word_wrap(string, col_width=79) 78 | string = string.gsub( /(\S{#{col_width}})(?=\S)/, '\1 ' ) 79 | string = string.gsub( /(.{1,#{col_width}})(?:\s+|$)/, "\\1\n" ) 80 | string 81 | end 82 | 83 | =begin 84 | # TODO: This is alternateive from glue: worth providing? 85 | # 86 | # Enforces a maximum width of a string inside an 87 | # html container. If the string exceeds this maximum width 88 | # the string gets wraped. 89 | # 90 | # Not really useful, better use the CSS overflow: hidden 91 | # functionality. 92 | # 93 | # === Input: 94 | # the string to be wrapped 95 | # the enforced width 96 | # the separator used for wrapping 97 | # 98 | # === Output: 99 | # the wrapped string 100 | # 101 | # === Example: 102 | # text = "1111111111111111111111111111111111111111111" 103 | # text = wrap(text, 10, " ") 104 | # p text # => "1111111111 1111111111 1111111111" 105 | # 106 | # See the test cases to better understand the behaviour! 107 | 108 | # def wrap(width = 20, separator = " ") 109 | # re = /([^#{separator}]{1,#{width}})/ 110 | # scan(re).join(separator) 111 | # end 112 | =end 113 | 114 | def words(&blk) 115 | self.class.words(@self, &blk) 116 | end 117 | 118 | # 119 | def each_word(&blk) 120 | words(&blk) 121 | end 122 | 123 | def sentences(&yld) 124 | self.class.sentences(@self, &blk) 125 | end 126 | 127 | # 128 | def each_sentence(&blk) 129 | sentences(&blk) 130 | end 131 | 132 | def paragrpahs(&yld) 133 | self.class.paragraphs(@self, &blk) 134 | end 135 | 136 | # 137 | def each_paragraph(&blk) 138 | paragraphs(&blk) 139 | end 140 | 141 | # 142 | def word_wrap(col_width=79) 143 | self.class.word_wrap(@self, col_width) 144 | end 145 | 146 | # As with #word_wrap, but modifies the string in place. 147 | def word_wrap!(col_width=79) 148 | @self.replace(word_wrap(col_width=79)) 149 | end 150 | 151 | end 152 | 153 | -------------------------------------------------------------------------------- /lib/language/matcher.rb: -------------------------------------------------------------------------------- 1 | require 'language/class' 2 | 3 | class Language 4 | 5 | #= Matcher 6 | # 7 | # Matcher derives from Ruby Quiz #103, the DictionaryMatcher quiz. 8 | 9 | class Matcher 10 | 11 | attr_reader :word_count 12 | 13 | #Contains the index matched, and the word matched 14 | class MatchData < Struct.new(:index,:match) 15 | def inspect 16 | "#{match.inspect}@#{index}" 17 | end 18 | end 19 | 20 | def inspect 21 | to_s 22 | end 23 | 24 | #Create a DictionaryMatcher with no words in it 25 | def initialize 26 | @trie = {} 27 | @word_count = 0 28 | end 29 | 30 | #Add a word to the DictionaryMatcher 31 | def add(word) 32 | @word_count += 1 33 | container = @trie 34 | containers=[] 35 | 36 | i=0 37 | word.each_byte do |b| 38 | container[b] = {} unless container.has_key? b 39 | container[:depth]=i 40 | containers << container 41 | container = container[b] 42 | i+=1 43 | end 44 | containers << container 45 | 46 | container[0] = true # Mark end of word 47 | container[:depth]=i 48 | 49 | ff=compute_failure_function word 50 | ff.zip(containers).each do |pointto,container| 51 | container[:failure]=containers[pointto] if pointto 52 | end 53 | 54 | self 55 | 56 | end 57 | 58 | alias << add 59 | 60 | def compute_failure_function p 61 | m=p.size 62 | pi=[nil,0] 63 | k=0 64 | 2.upto m do |q| 65 | k=pi[k] while k>0 and p[k] != p[q-1] 66 | k=k+1 if p[k]==p[q-1] 67 | pi[q]=k 68 | end 69 | pi 70 | end 71 | private :compute_failure_function 72 | 73 | #Determine whether +string+ was previously added to the 74 | #Trie. 75 | def include?(word) 76 | container = @trie 77 | word.each_byte do |b| 78 | break unless container.has_key? b 79 | container = container[b] 80 | end 81 | container[0] 82 | end 83 | 84 | #Determines whether one of the words in the DictionaryMatcher is a 85 | #substring of 86 | #+string+. Returns the index of the match if found, +nil+ if not 87 | #found. 88 | def =~ text 89 | internal_match(text){|md| return md.index} 90 | nil 91 | end 92 | 93 | #Determine whether one of the words in the DictionaryMatcher is a 94 | #substring of 95 | #+string+. Returns a DictionaryMatcher::MatchData object if found, 96 | #+nil+ if not #found. 97 | def match text 98 | internal_match(text){|md| return md} 99 | nil 100 | end 101 | 102 | def internal_match string 103 | node=@trie 104 | pos=0 105 | string.each_byte do |b| 106 | advance=false 107 | until advance 108 | nextnode=node[b] 109 | if not nextnode 110 | if node[:failure] 111 | node=node[:failure] 112 | else 113 | advance=true 114 | end 115 | elsif nextnode[0] 116 | yield MatchData.new(pos, string[pos+1-nextnode[:depth],nextnode[:depth]]) 117 | advance=true 118 | node=@trie 119 | else 120 | advance=true 121 | node=nextnode 122 | end 123 | pos+=1 124 | end 125 | end 126 | end 127 | private :internal_match 128 | 129 | #Scans +string+ for all occurrances of strings in the 130 | #DictionaryMatcher. 131 | #Overlapping matches are skipped (only the first one is yielded), and 132 | #when some strings in the 133 | #DictionaryMatcher are substrings of others, only the shortest match 134 | #at a given position is found. 135 | def scan(text, &block) 136 | matches=[] 137 | block= lambda{ |md| matches << md } unless block 138 | internal_match(text,&block) 139 | matches 140 | end 141 | 142 | #Case equality. Similar to =~. 143 | alias_method :===, :=~ 144 | end 145 | 146 | end 147 | 148 | -------------------------------------------------------------------------------- /lib/language/codes_iso639.txt: -------------------------------------------------------------------------------- 1 | abk ab Abkhazian 2 | ace Achinese 3 | ach Acoli 4 | ada Adangme 5 | aar aa Afar 6 | afh Afrihili 7 | afr af Afrikaans 8 | afa Afro-Asiatic (Other) 9 | aka Akan 10 | akk Akkadian 11 | alb/sqi sq Albanian 12 | ale Aleut 13 | alg Algonquian languages 14 | tut Altaic (Other) 15 | amh am Amharic 16 | apa Apache languages 17 | ara ar Arabic 18 | arc Aramaic 19 | arp Arapaho 20 | arn Araucanian 21 | arw Arawak 22 | arm/hye hy Armenian 23 | art Artificial (Other) 24 | asm as Assamese 25 | ath Athapascan languages 26 | map Austronesian (Other) 27 | ava Avaric 28 | ave Avestan 29 | awa Awadhi 30 | aym ay Aymara 31 | aze az Azerbaijani 32 | nah Aztec 33 | ban Balinese 34 | bat Baltic (Other) 35 | bal Baluchi 36 | bam Bambara 37 | bai Bamileke languages 38 | bad Banda 39 | bnt Bantu (Other) 40 | bas Basa 41 | bak ba Bashkir 42 | baq/eus eu Basque 43 | bej Beja 44 | bem Bemba 45 | ben bn Bengali 46 | ber Berber (Other) 47 | bho Bhojpuri 48 | bih bh Bihari 49 | bik Bikol 50 | bin Bini 51 | bis bi Bislama 52 | bra Braj 53 | bre br Breton 54 | bug Buginese 55 | bul bg Bulgarian 56 | bua Buriat 57 | bur/mya my Burmese 58 | bel be Byelorussian 59 | cad Caddo 60 | car Carib 61 | cat ca Catalan 62 | cau Caucasian (Other) 63 | ceb Cebuano 64 | cel Celtic (Other) 65 | cai Central American Indian (Other) 66 | chg Chagatai 67 | cha Chamorro 68 | che Chechen 69 | chr Cherokee 70 | chy Cheyenne 71 | chb Chibcha 72 | chi/zho zh Chinese 73 | chn Chinook jargon 74 | cho Choctaw 75 | chu Church Slavic 76 | chv Chuvash 77 | cop Coptic 78 | cor Cornish 79 | cos co Corsican 80 | cre Cree 81 | mus Creek 82 | crp Creoles and Pidgins (Other) 83 | cpe Creoles and Pidgins, English-based (Other) 84 | cpf Creoles and Pidgins, French-based (Other) 85 | cpp Creoles and Pidgins, Portuguese-based (Other) 86 | cus Cushitic (Other) 87 | hrv hr Croatian 88 | ces/cze cs Czech 89 | dak Dakota 90 | dan da Danish 91 | del Delaware 92 | din Dinka 93 | div Divehi 94 | doi Dogri 95 | dra Dravidian (Other) 96 | dua Duala 97 | dut/nla nl Dutch 98 | dum Dutch, Middle (ca. 1050-1350) 99 | dyu Dyula 100 | dzo dz Dzongkha 101 | efi Efik 102 | egy Egyptian (Ancient) 103 | eka Ekajuk 104 | elx Elamite 105 | eng en English 106 | enm English, Middle (ca. 1100-1500) 107 | ang English, Old (ca. 450-1100) 108 | esk Eskimo (Other) 109 | epo eo Esperanto 110 | est et Estonian 111 | ewe Ewe 112 | ewo Ewondo 113 | fan Fang 114 | fat Fanti 115 | fao fo Faroese 116 | fij fj Fijian 117 | fin fi Finnish 118 | fiu Finno-Ugrian (Other) 119 | fon Fon 120 | fra/fre fr French 121 | frm French, Middle (ca. 1400-1600) 122 | fro French, Old (842- ca. 1400) 123 | fry fy Frisian 124 | ful Fulah 125 | gaa Ga 126 | gae/gdh Gaelic (Scots) 127 | glg gl Gallegan 128 | lug Ganda 129 | gay Gayo 130 | gez Geez 131 | geo/kat ka Georgian 132 | deu/ger de German 133 | gmh German, Middle High (ca. 1050-1500) 134 | goh German, Old High (ca. 750-1050) 135 | gem Germanic (Other) 136 | gil Gilbertese 137 | gon Gondi 138 | got Gothic 139 | grb Grebo 140 | grc Greek, Ancient (to 1453) 141 | ell/gre el Greek, Modern (1453-) 142 | kal kl Greenlandic 143 | grn gn Guarani 144 | guj gu Gujarati 145 | hai Haida 146 | hau ha Hausa 147 | haw Hawaiian 148 | heb he Hebrew 149 | her Herero 150 | hil Hiligaynon 151 | him Himachali 152 | hin hi Hindi 153 | hmo Hiri Motu 154 | hun hu Hungarian 155 | hup Hupa 156 | iba Iban 157 | ice/isl is Icelandic 158 | ibo Igbo 159 | ijo Ijo 160 | ilo Iloko 161 | inc Indic (Other) 162 | ine Indo-European (Other) 163 | ind id Indonesian 164 | ina ia Interlingua (International Auxiliary language Association) 165 | ile Interlingue 166 | iku iu Inuktitut 167 | ipk ik Inupiak 168 | ira Iranian (Other) 169 | gai/iri ga Irish 170 | sga Irish, Old (to 900) 171 | mga Irish, Middle (900 - 1200) 172 | iro Iroquoian languages 173 | ita it Italian 174 | jpn ja Japanese 175 | jav jv Javanese 176 | jrb Judeo-Arabic 177 | jpr Judeo-Persian 178 | kab Kabyle 179 | kac Kachin 180 | kam Kamba 181 | kan kn Kannada 182 | kau Kanuri 183 | kaa Kara-Kalpak 184 | kar Karen 185 | kas ks Kashmiri 186 | kaw Kawi 187 | kaz kk Kazakh 188 | kha Khasi 189 | khm km Khmer 190 | khi Khoisan (Other) 191 | kho Khotanese 192 | kik Kikuyu 193 | kin rw Kinyarwanda 194 | kir ky Kirghiz 195 | kom Komi 196 | kon Kongo 197 | kok Konkani 198 | kor ko Korean 199 | kpe Kpelle 200 | kro Kru 201 | kua Kuanyama 202 | kum Kumyk 203 | kur ku Kurdish 204 | kru Kurukh 205 | kus Kusaie 206 | kut Kutenai 207 | lad Ladino 208 | lah Lahnda 209 | lam Lamba 210 | oci oc Langue d'Oc (post 1500) 211 | lao lo Lao 212 | lat la Latin 213 | lav lv Latvian 214 | ltz Letzeburgesch 215 | lez Lezghian 216 | lin ln Lingala 217 | lit lt Lithuanian 218 | loz Lozi 219 | lub Luba-Katanga 220 | lui Luiseno 221 | lun Lunda 222 | luo Luo (Kenya and Tanzania) 223 | mac/mke mk Macedonian 224 | mad Madurese 225 | mag Magahi 226 | mai Maithili 227 | mak Makasar 228 | mlg mg Malagasy 229 | may/msa ms Malay 230 | mal Malayalam 231 | mlt ml Maltese 232 | man Mandingo 233 | mni Manipuri 234 | mno Manobo languages 235 | max Manx 236 | mao/mri mi Maori 237 | mar mr Marathi 238 | chm Mari 239 | mah Marshall 240 | mwr Marwari 241 | mas Masai 242 | myn Mayan languages 243 | men Mende 244 | mic Micmac 245 | min Minangkabau 246 | mis Miscellaneous (Other) 247 | moh Mohawk 248 | mol mo Moldavian 249 | mkh Mon-Kmer (Other) 250 | lol Mongo 251 | mon mn Mongolian 252 | mos Mossi 253 | mul Multiple languages 254 | mun Munda languages 255 | nau na Nauru 256 | nav Navajo 257 | nde Ndebele, North 258 | nbl Ndebele, South 259 | ndo Ndongo 260 | nep ne Nepali 261 | new Newari 262 | nic Niger-Kordofanian (Other) 263 | ssa Nilo-Saharan (Other) 264 | niu Niuean 265 | non Norse, Old 266 | nai North American Indian (Other) 267 | nor no Norwegian 268 | nno Norwegian (Nynorsk) 269 | nub Nubian languages 270 | nym Nyamwezi 271 | nya Nyanja 272 | nyn Nyankole 273 | nyo Nyoro 274 | nzi Nzima 275 | oji Ojibwa 276 | ori or Oriya 277 | orm om Oromo 278 | osa Osage 279 | oss Ossetic 280 | oto Otomian languages 281 | pal Pahlavi 282 | pau Palauan 283 | pli Pali 284 | pam Pampanga 285 | pag Pangasinan 286 | pan pa Panjabi 287 | pap Papiamento 288 | paa Papuan-Australian (Other) 289 | fas/per fa Persian 290 | peo Persian, Old (ca 600 - 400 B.C.) 291 | phn Phoenician 292 | pol pl Polish 293 | pon Ponape 294 | por pt Portuguese 295 | pra Prakrit languages 296 | pro Provencal, Old (to 1500) 297 | pus ps Pushto 298 | que qu Quechua 299 | roh rm Rhaeto-Romance 300 | raj Rajasthani 301 | rar Rarotongan 302 | roa Romance (Other) 303 | ron/rum ro Romanian 304 | rom Romany 305 | run rn Rundi 306 | rus ru Russian 307 | sal Salishan languages 308 | sam Samaritan Aramaic 309 | smi Sami languages 310 | smo sm Samoan 311 | sad Sandawe 312 | sag sg Sango 313 | san sa Sanskrit 314 | srd Sardinian 315 | sco Scots 316 | sel Selkup 317 | sem Semitic (Other) 318 | sr Serbian 319 | scr sh Serbo-Croatian 320 | srr Serer 321 | shn Shan 322 | sna sn Shona 323 | sid Sidamo 324 | bla Siksika 325 | snd sd Sindhi 326 | sin si Singhalese 327 | sit Sino-Tibetan (Other) 328 | sio Siouan languages 329 | sla Slavic (Other) 330 | ss Siswati 331 | slk/slo sk Slovak 332 | slv sl Slovenian 333 | sog Sogdian 334 | som so Somali 335 | son Songhai 336 | wen Sorbian languages 337 | nso Sotho, Northern 338 | sot st Sotho, Southern 339 | sai South American Indian (Other) 340 | esl/spa es Spanish 341 | suk Sukuma 342 | sux Sumerian 343 | sun su Sudanese 344 | sus Susu 345 | swa sw Swahili 346 | ssw Swazi 347 | sve/swe sv Swedish 348 | syr Syriac 349 | tgl tl Tagalog 350 | tah Tahitian 351 | tgk tg Tajik 352 | tmh Tamashek 353 | tam ta Tamil 354 | tat tt Tatar 355 | tel te Telugu 356 | ter Tereno 357 | tha th Thai 358 | bod/tib bo Tibetan 359 | tig Tigre 360 | tir ti Tigrinya 361 | tem Timne 362 | tiv Tivi 363 | tli Tlingit 364 | tog to Tonga (Nyasa) 365 | ton Tonga (Tonga Islands) 366 | tru Truk 367 | tsi Tsimshian 368 | tso ts Tsonga 369 | tsn tn Tswana 370 | tum Tumbuka 371 | tur tr Turkish 372 | ota Ottoman 373 | tuk tk Turkmen 374 | tyv Tuvinian 375 | twi tw Twi 376 | uga Ugaritic 377 | uig ug Uighur 378 | ukr uk Ukrainian 379 | umb Umbundu 380 | und Undetermined 381 | urd ur Urdu 382 | uzb uz Uzbek 383 | vai Vai 384 | ven Venda 385 | vie vi Vietnamese 386 | vol vo Volap�k 387 | vot Votic 388 | wak Wakashan languages 389 | wal Walamo 390 | war Waray 391 | was Washo 392 | cym/wel cy Welsh 393 | wol wo Wolof 394 | xho xh Xhosa 395 | sah Yakut 396 | yao Yao 397 | yap Yap 398 | yid yi Yiddish 399 | yor yo Yoruba 400 | zap Zapotec 401 | zen Zenaga 402 | zha za Zhuang 403 | zul zu Zulu 404 | zun Zuni 405 | --------------------------------------------------------------------------------