├── ext └── ots │ ├── version.h │ ├── ots.h │ ├── extconf.rb │ ├── libots │ ├── grader-tc.h │ ├── grader.c │ ├── text.c │ ├── grader-tf.c │ ├── article.c │ ├── highlighter.c │ ├── html.c │ ├── parser.c │ ├── relations.c │ ├── grader-tc.c │ ├── libots.h │ ├── wordlist.c │ └── stemmer.c │ └── ots.c ├── test ├── helper.rb ├── test_grader.rb ├── test_ots.rb └── test_article.rb ├── .gitignore ├── lib ├── ots.rb └── ots │ └── grader.rb ├── CHANGELOG ├── dictionaries ├── tr.xml ├── tl.xml ├── mt.xml ├── id.xml ├── mi.xml ├── lv.xml ├── eu.xml ├── el.xml ├── pl.xml ├── ia.xml ├── uk.xml ├── bg.xml ├── fi.xml ├── cy.xml ├── ga.xml ├── da.xml ├── ca.xml ├── ru.xml ├── cs.xml ├── ro.xml ├── ms.xml ├── eo.xml ├── et.xml ├── fr.xml ├── is.xml ├── it.xml ├── nl.xml ├── sv.xml ├── nn.xml ├── hu.xml ├── gl.xml ├── yi.xml ├── he.xml ├── de.xml ├── es.xml └── pt.xml ├── Rakefile ├── README.md └── ots.gemspec /ext/ots/version.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #define RUBY_OTS_VERSION "0.5.4" 3 | -------------------------------------------------------------------------------- /test/helper.rb: -------------------------------------------------------------------------------- 1 | require 'ots' 2 | require 'minitest/spec' 3 | require 'minitest/autorun' 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ext/* 2 | !ext/extconf.rb 3 | !ext/*.c 4 | !ext/*.h 5 | ext/*.so 6 | pkg/ 7 | tmp/ 8 | *.gem 9 | -------------------------------------------------------------------------------- /test/test_grader.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | require 'helper' 3 | require 'ots/grader' 4 | 5 | describe 'OTS::Grader' do 6 | it 'should load the dictionary & return stop words' do 7 | assert OTS::Grader.new(language: 'en').stop_words 8 
| end 9 | end 10 | -------------------------------------------------------------------------------- /lib/ots.rb: -------------------------------------------------------------------------------- 1 | require 'ots/ots' 2 | 3 | module OTS 4 | DICTIONARY_PATH = File.absolute_path(File.dirname(__FILE__) + '/../dictionaries') 5 | # set the dictionary path, so the c extension can read files. 6 | # we can set this at compile time but bundler sometimes compiles the extension inside a temp directory. 7 | set_dictionary_path DICTIONARY_PATH 8 | end 9 | -------------------------------------------------------------------------------- /lib/ots/grader.rb: -------------------------------------------------------------------------------- 1 | require 'nokogiri' 2 | 3 | module OTS 4 | class Grader 5 | def initialize options = {} 6 | path = options[:path] || File.join(DICTIONARY_PATH, options.fetch(:language, 'en').to_s + '.xml') 7 | @xml = Nokogiri::XML(File.read(path)) 8 | end 9 | 10 | def stop_words 11 | @xml.xpath('//grader-tc/word').map {|word| word.text.downcase} 12 | end 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /ext/ots/ots.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include "version.h" 12 | 13 | #define TO_S(v) rb_funcall(v, rb_intern("to_s"), 0) 14 | #define CSTRING(v) RSTRING_PTR(TO_S(v)) 15 | #define rb_enc_str_new2(text, enc) rb_enc_str_new(text, strlen(text), enc) 16 | -------------------------------------------------------------------------------- /test/test_ots.rb: -------------------------------------------------------------------------------- 1 | require 'helper' 2 | 3 | describe 'OTS' do 4 | it 'parse() should return an article instance' do 5 | OTS.parse("hello world").must_be_kind_of OTS::Article 6 | end 7 | 8 | it 'parse() should raise ArgumentError on 
invalid text' do 9 | assert_raises(ArgumentError) do 10 | OTS.parse(1) 11 | end 12 | end 13 | 14 | it 'should return a list of dictonaries' do 15 | languages = OTS.languages 16 | 17 | %w(en fr it es de ru).each do |name| 18 | assert languages.include?(name), "has #{name} language dictionary" 19 | end 20 | 21 | assert_empty languages.reject {|name| name.size == 2}, "dictionaries path should not have other junk" 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | === 0.5.4 (2012-02-03) 2 | 3 | * move extension into subdirectory. 4 | * added OTS#set_dictionary_path to avoid compile time path resolution. 5 | 6 | === 0.5.3 (2012-01-27) 7 | 8 | * cleanup dependencies. 9 | 10 | === 0.5.2 (2012-01-25) 11 | 12 | * added OTS::Grader 13 | 14 | === 0.5.1 (2012-01-11) 15 | 16 | * GC bugfix: hang on to encoding index rather than rb_encoding pointer. 17 | 18 | === 0.5.0 (2012-01-10) 19 | 20 | api rewrite and some dictionary parser fixes 21 | 22 | renamed: 23 | 24 | * OTS.dictionaries => OTS.languages 25 | 26 | api changes: 27 | 28 | * OTS.parse, takes an options hash now with language or dictionary options 29 | * removed OTS::Article#title 30 | * added OTS::Article#topics, returns the most important keywords 31 | * renamed the lines option in OTS::Article#summarize to sentences 32 | 33 | xml parser fixes: 34 | 35 | * uses xmlReadFile instead of xmlParseFile 36 | -------------------------------------------------------------------------------- /ext/ots/extconf.rb: -------------------------------------------------------------------------------- 1 | require 'mkmf' 2 | 3 | glib_cflags = %x{pkg-config --cflags glib-2.0}.strip 4 | glib_ldflags = %x{pkg-config --libs glib-2.0}.strip 5 | 6 | if glib_cflags.empty? 7 | warn %q{WARNING: No pkg-config found for glib-2.0, using defaults. 
Set GLIB_INCLUDE_DIR env to override.} 8 | dirs = ENV.fetch('GLIB_INCLUDE_DIR', '/usr/include/glib-2.0 /usr/lib/glib-2.0/include') 9 | glib_cflags = dirs.split(/\s+/).map {|dir| "-I#{dir}"}.join(' ') 10 | end 11 | 12 | if glib_ldflags.empty? 13 | warn %q{WARNING: No pkg-config found for glib-2.0, using defaults. Set GLIB_LIB env to override.} 14 | libs = ENV.fetch('GLIB_LIB', 'glib-2.0') 15 | glib_ldflags = libs.split(/\s+/).map {|lib| "-l#{lib}"}.join(' ') 16 | end 17 | 18 | $CFLAGS = glib_cflags + %Q{ -Ilibots -I/usr/include/libxml2} 19 | $LDFLAGS = glib_ldflags + %Q{ -Llibots} 20 | 21 | find_library('glib-2.0', 'main') or raise "unable to find glib-2.0" 22 | find_library('xml2', 'main') or raise "unable to find libxml2" 23 | 24 | # ugly mkmf hack: manually assign source and object directories. 25 | $srcs = Dir["{libots/*.c,*.c}"] 26 | $objs = $srcs.map {|name| File.join(File.dirname(name), File.basename(name, ".c") + ".o")} 27 | 28 | class File 29 | def self.basename name 30 | name 31 | end 32 | end 33 | 34 | create_makefile 'ots' 35 | -------------------------------------------------------------------------------- /dictionaries/tr.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
34 |        before1|1after
35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | bir 62 | bu 63 | o 64 | 65 |
66 | -------------------------------------------------------------------------------- /dictionaries/tl.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
34 |        before1|1after
35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | ako1 62 | amin 63 | atin 64 | mo 65 | nila 66 | 67 |
68 | -------------------------------------------------------------------------------- /dictionaries/mt.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
34 |        before1|1after
35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | jiena 62 | aħna 63 | inti 64 | iva 65 | iwa 66 | le 67 | mingħajr 68 | taħt 69 | wieħed 70 | tnejn 71 | għandi 72 | 73 |
74 | -------------------------------------------------------------------------------- /dictionaries/id.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
34 |        before1|1after
35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | aku 62 | anda 63 | bapak 64 | dia 65 | engkau 66 | ibu 67 | kalian 68 | kami 69 | kamu 70 | kita 71 | mereka 72 | saudara 73 | saya 74 | 75 |
76 | -------------------------------------------------------------------------------- /dictionaries/mi.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
34 |        before1|1after
35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | ahau 62 | au 63 | ia 64 | koe 65 | koorua 66 | koutou 67 | maatou 68 | maaua 69 | ngā 70 | raatou 71 | raaua 72 | taatou 73 | taaua 74 | te 75 | 76 |
77 | -------------------------------------------------------------------------------- /dictionaries/lv.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | "| 7 | (| 8 | 9 | 10 | 11 | 12 | ."| 13 | ,"| 14 | .| 15 | ,| 16 | "| 17 | )| 18 | ?| 19 | :| 20 | ;| 21 | !| 22 | 23 | 24 | 25 | 26 | wrote|write 27 | came|come 28 | went|go 29 | 30 | 31 | 32 | before1|1after 33 | 34 |
35 |        before1|1after
36 |    
37 |
38 | 39 | 40 | 41 | ." 42 | ?" 43 | !" 44 | ," 45 | . 46 | ? 47 | ; 48 | | 49 | ! 50 | 51 | 52 | 53 | Dr. 54 | Mr. 55 | Mrs. 56 | U.S. 57 | Rep. 58 | Sen. 59 | 60 | 61 | 62 | pa 63 | par 64 | pat 65 | pats 66 | pār 67 | pārāk 68 | pārējais 69 | pāri 70 | pēc 71 | pie 72 | pirms 73 | pret 74 | priekšu 75 | projām 76 | 77 |
78 | -------------------------------------------------------------------------------- /dictionaries/eu.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
34 |        before1|1after
35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | bai 62 | baita 63 | bere 64 | edo 65 | egon 66 | ere 67 | eta 68 | ez 69 | gabe 70 | hau 71 | hori 72 | hura 73 | inor 74 | izan 75 | kaixo 76 | 77 |
78 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'date' 2 | require 'pathname' 3 | require 'rake' 4 | require 'rake/clean' 5 | require 'rake/testtask' 6 | require 'rake/extensiontask' 7 | 8 | $rootdir = Pathname.new(__FILE__).dirname 9 | $gemspec = Gem::Specification.new do |s| 10 | s.name = 'ots' 11 | s.version = '0' # placeholder only: the :gemspec task below replaces it with the version read from ext/ots/version.h 12 | s.date = Date.today 13 | s.authors = ['Bharanee Rathna'] 14 | s.email = ['deepfryed@gmail.com'] 15 | s.summary = 'Open Text Summarizer interface for Ruby.' 16 | s.description = 'Ruby interface to libots libraries for unix.' 17 | s.homepage = 'http://github.com/deepfryed/ots' 18 | s.files = Dir['ext/**/*.{c,h}'] + Dir['{ext,test,lib}/**/*.rb'] + %w(README.md CHANGELOG) + Dir['*/*.xml'] 19 | s.extensions = %w(ext/ots/extconf.rb) 20 | s.require_paths = %w(lib ext) 21 | 22 | s.add_development_dependency('rake') 23 | s.add_development_dependency('rake-compiler') # NOTE(review): lib/ots/grader.rb requires 'nokogiri', which is not declared as a dependency here - confirm whether it should be added 24 | end 25 | 26 | desc 'Generate ots gemspec' 27 | task :gemspec do 28 | $gemspec.date = Date.today 29 | $gemspec.version = File.read($rootdir + 'ext/ots/version.h').scan(/[\d.]+/).first 30 | File.open('ots.gemspec', 'w') {|fh| fh.write($gemspec.to_ruby)} 31 | end 32 | 33 | desc 'compile extension' 34 | task :compile do 35 | Dir.chdir('ext/ots') do 36 | system('ruby extconf.rb && make clean && make -j2') or raise 'unable to compile ots' 37 | end 38 | end 39 | 40 | Rake::TestTask.new(:test) do |test| 41 | test.libs << 'ext' << 'lib' << 'test' 42 | test.pattern = 'test/**/test_*.rb' 43 | test.verbose = true 44 | end 45 | 46 | task default: :test 47 | task :test => [:compile] 48 | -------------------------------------------------------------------------------- /dictionaries/el.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 
14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
34 |        before1|1after
35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | από 62 | για 63 | δεν 64 | επειδή 65 | η 66 | ή 67 | κάθε 68 | καθένας 69 | και 70 | κανείς 71 | κατά 72 | με 73 | να 74 | πρέπει 75 | σε 76 | τα 77 | το 78 | ως 79 | 80 |
81 | -------------------------------------------------------------------------------- /dictionaries/pl.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | "| 7 | (| 8 | 9 | 10 | 11 | 12 | ."| 13 | ,"| 14 | .| 15 | ,| 16 | "| 17 | )| 18 | ?| 19 | :| 20 | ;| 21 | !| 22 | 23 | 24 | 25 | 26 | wrote|write 27 | came|come 28 | went|go 29 | 30 | 31 | 32 | before1|1after 33 | 34 |
35 |        before1|1after
36 |    
37 |
38 | 39 | 40 | 41 | ." 42 | ?" 43 | !" 44 | ," 45 | . 46 | ? 47 | ; 48 | | 49 | ! 50 | 51 | 52 | 53 | Dr. 54 | Mr. 55 | Mrs. 56 | U.S. 57 | Rep. 58 | Sen. 59 | 60 | 61 | 62 | bez 63 | dla 64 | do 65 | ja 66 | ku 67 | my 68 | na 69 | nad 70 | nie 71 | o 72 | obok 73 | od 74 | on 75 | oni 76 | po 77 | pod 78 | przeciw 79 | przeciwko 80 | przed 81 | przez 82 | robić 83 | ty 84 | u 85 | w 86 | we 87 | wy 88 | z 89 | za 90 | ze 91 | 92 |
93 | -------------------------------------------------------------------------------- /dictionaries/ia.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
34 |        before1|1after
35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | duo 62 | e 63 | es 64 | esser 65 | ha 66 | haber 67 | illa 68 | illas 69 | ille 70 | illes 71 | illo 72 | illos 73 | in 74 | io 75 | la 76 | las 77 | le 78 | les 79 | lo 80 | los 81 | me 82 | minus 83 | non 84 | nos 85 | ora 86 | plus 87 | quando 88 | se 89 | sed 90 | te 91 | tu 92 | un 93 | va 94 | vader 95 | vos 96 | 97 |
98 | -------------------------------------------------------------------------------- /ext/ots/libots/grader-tc.h: -------------------------------------------------------------------------------- 1 | /* 2 | * grader-tc.h 3 | * 4 | * Copyright (C) 2003 Nadav Rotem 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Library General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 19 | */ 20 | 21 | #ifndef HAVE_GRADERTC_H 22 | #define HAVE_GRADERTC_H 23 | 24 | 25 | #include 26 | #include "libots.h" 27 | 28 | G_BEGIN_DECLS 29 | 30 | 31 | typedef struct 32 | { 33 | gchar *word; /* the word */ 34 | gchar *stem; /*stem of the word*/ 35 | gint occ; /* how many times have we seen this word in the text? 
*/ 36 | } OtsWordEntery; 37 | 38 | /*Word list manipulations*/ 39 | void ots_free_wordlist (GList *aList); 40 | 41 | 42 | 43 | OtsWordEntery *ots_copy_wordEntery (OtsWordEntery * obj); 44 | OtsWordEntery *ots_new_wordEntery (unsigned const char *wordString); 45 | OtsWordEntery *ots_new_wordEntery_strip (unsigned const char *wordString,const OtsStemRule *rule); 46 | void ots_free_wordEntery (OtsWordEntery * WC); 47 | 48 | GList *ots_sort_list (GList* aList); 49 | GList *ots_union_list (const GList *aLst, const GList * bLst); 50 | 51 | char *ots_word_in_list (const GList *aList,const int index); 52 | char *ots_stem_in_list (const GList *aList,const int index); 53 | void ots_add_wordstat (OtsArticle * Doc,unsigned const char *wordString); 54 | 55 | 56 | /*grader*/ 57 | 58 | void ots_grade_doc_tc (OtsArticle * Doc); 59 | 60 | G_END_DECLS 61 | 62 | 63 | 64 | #endif /* HAVE_GRADERTC_H */ 65 | -------------------------------------------------------------------------------- /dictionaries/uk.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
34 |        before1|1after
35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | іноді 62 | відкіля 63 | вітаю 64 | два 65 | де 66 | з 67 | завжди 68 | зараз 69 | ким 70 | коли 71 | котрий 72 | куди 73 | ні 74 | ніколи 75 | нікуди 76 | навіщо 77 | нагорі 78 | незабаром 79 | нуль 80 | один 81 | позаду 82 | скільки 83 | сюди 84 | так 85 | там 86 | тоді 87 | туди 88 | тут 89 | унизу 90 | усе 91 | хто 92 | часто 93 | чому 94 | що 95 | як 96 | який 97 | 98 |
99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OTS 2 | 3 | ots is an interface to libots - The [Open Text Summarizer](http://libots.sourceforge.net/). 4 | 5 | ## Dependencies 6 | 7 | * ruby 1.9.1 or later 8 | * libxml2 9 | * glib2.0 10 | * homebrew (on Mac OS X) 11 | 12 | ## Installation 13 | 14 | ### Debian flavors of Linux 15 | 16 | ``` 17 | 18 | # ruby & ruby development libraries (not needed if you use rvm) 19 | sudo apt-get install ruby1.9.1-dev ruby1.9.1 20 | 21 | # libxml2 and glib development libraries 22 | sudo apt-get install libxml2-dev libglib2.0-dev 23 | 24 | # install ots 25 | gem install ots 26 | 27 | ``` 28 | 29 | ### Mac OS X 30 | 31 | 32 | ``` 33 | 34 | # update homebrew to the latest version 35 | GIT_SSL_NO_VERIFY=1 brew update 36 | 37 | # optional: Mac OS X normally has libxml2 installed; if it does not, run 38 | brew install libxml2 39 | 40 | # install glib 41 | brew install glib 42 | 43 | # set up the environment variables needed to install ots (replace 2.30.2 with the glib version homebrew installed) 44 | export CPPFLAGS=-I/usr/local/Cellar/glib/2.30.2/include/glib-2.0/ 45 | export LDFLAGS=-L/usr/local/Cellar/glib/2.30.2/lib/ 46 | export PKG_CONFIG_PATH=/usr/local/Cellar/glib/2.30.2/lib/pkgconfig/ 47 | 48 | # install ots 49 | gem install ots 50 | 51 | ``` 52 | 53 | ## API 54 | 55 | ``` 56 | OTS 57 | .parse #=> OTS::Article 58 | .languages #=> Array 59 | 60 | OTS::Article 61 | .new 62 | #topics #=> Array 63 | #keywords #=> Array 64 | #summarize #=> Array 65 | 66 | ``` 67 | 68 | ## Usage 69 | 70 | ```ruby 71 | require 'ots' 72 | article = OTS.parse("I think I need some ice cream to cool me off. It is too hot down under") 73 | article = OTS.parse("j'ai besoin de la crème glacée. il fait trop chaud en australie.", language: "fr") 74 | article = OTS.parse("j'ai besoin de la crème glacée. 
il fait trop chaud en australie.", dictionary: "custom.xml") 75 | 76 | article.topics 77 | article.keywords 78 | article.summarize(percent: 50) 79 | article.summarize(sentences: 1) 80 | 81 | OTS.languages #=> list of the language dictionaries bundled with the gem 82 | ``` 83 | 84 | ## See Also 85 | 86 | [https://github.com/ssoper/summarize](https://github.com/ssoper/summarize) 87 | 88 | ## License 89 | 90 | MIT 91 | -------------------------------------------------------------------------------- /dictionaries/bg.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
 34 |        before1|1after
 35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | август 62 | април 63 | в 64 | всеки 65 | всичко 66 | вторник 67 | да 68 | декември 69 | за 70 | и 71 | или 72 | има 73 | което 74 | към 75 | май 76 | март 77 | на 78 | не 79 | неделя 80 | ноември 81 | октомври 82 | от 83 | петък 84 | по 85 | понеделник 86 | при 87 | с 88 | септември 89 | сряда 90 | сто 91 | събота 92 | трябва 93 | февруари 94 | хиляда 95 | че 96 | четвъртък 97 | юли 98 | юни 99 | януари 100 | 101 |
102 | -------------------------------------------------------------------------------- /dictionaries/fi.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
 34 |        before1|1after
 35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | ehkä 62 | enemmän 63 | että 64 | he 65 | hei 66 | hän 67 | ja 68 | jahka 69 | joo 70 | joskus 71 | jotta 72 | kaikki 73 | kuinka 74 | kun 75 | me 76 | mikä 77 | minä 78 | miten 79 | mutta 80 | myös 81 | ne 82 | no 83 | nyt 84 | olen 85 | paitsi 86 | sekä 87 | siis 88 | sillä 89 | sinä 90 | tahi 91 | tahikka 92 | tai 93 | taikka 94 | te 95 | tällä 96 | tämä 97 | tässä 98 | vaan 99 | vai 100 | vain 101 | vasta 102 | vielä 103 | yli 104 | 105 |
106 | -------------------------------------------------------------------------------- /ext/ots/libots/grader.c: -------------------------------------------------------------------------------- 1 | /* 2 | * grader.c 3 | * 4 | * Copyright (C) 2003 Nadav Rotem 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Library General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include "libots.h" 25 | 26 | extern void ots_grade_doc_tc (OtsArticle * Doc); 27 | 28 | /*Grader driver - will call one of the grading algorithm*/ 29 | 30 | 31 | 32 | void 33 | ots_grade_structure (OtsArticle * Doc) /*must be called after the first grader*/ 34 | { 35 | GList *li; 36 | GList *first; 37 | GList *second; 38 | OtsSentence *first_line=NULL; 39 | 40 | first = NULL; 41 | second = NULL; 42 | 43 | if (Doc==NULL) return; 44 | 45 | if (Doc->lines!=NULL) 46 | first_line= ((OtsSentence *) (Doc->lines->data)); 47 | if (NULL!=first_line) first_line->score *= 2; /*first line/title is very important so we increase its score */ 48 | 49 | /*This loop will *1.6 the score of each line that 50 | starts with \n \n , in other words a new paragraph*/ 51 | 52 | for (li = (GList *) Doc->lines; li != NULL; li = li->next) 53 | { 54 | OtsSentence *aLine = (li->data); 55 | if (NULL != aLine) /*line is there */ 56 | { 57 | first = aLine->words; /*first word? */ 58 | if (NULL != first) 59 | second = first->next; /*second word? */ 60 | if ((NULL != first) && (NULL != second)) /*have content? */ 61 | if (strcmp (first->data, "\n") && strcmp (second->data, "\n")) /*new paragraph? NOTE(review): strcmp() returns 0 when the strings are equal, so this condition is true when NEITHER word is "\n", not when both are - the opposite of what the loop comment above describes. Changing it alters sentence scores, so confirm the intent (and the expected scores in test/test_article.rb) before fixing. 
*/ 62 | aLine->score *= 1.6; 63 | } 64 | 65 | } 66 | 67 | } 68 | 69 | /** 70 | Each grader needs to do: 71 | 1.give a ->score to each line 72 | 2.Set the ->title of the document 73 | **/ 74 | 75 | void 76 | ots_grade_doc (OtsArticle * Doc) 77 | { 78 | 79 | if (Doc==NULL) return; 80 | ots_grade_doc_tc(Doc); /*Term count*/ 81 | 82 | /* or ots_grade_doc_fc (Doc); Term Frequency */ 83 | 84 | ots_grade_structure (Doc); 85 | } 86 | -------------------------------------------------------------------------------- /dictionaries/cy.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
 34 |        before1|1after
 35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | a 62 | â 63 | ac 64 | achos 65 | am 66 | ar 67 | at 68 | chi 69 | dau 70 | dim 71 | diolch 72 | dwy 73 | e 74 | ei 75 | eto 76 | fe 77 | fi 78 | gan 79 | ger 80 | gyda 81 | heb 82 | heblaw 83 | hefyd 84 | hi 85 | hon 86 | hwn 87 | i 88 | iawn 89 | mewn 90 | na 91 | neb 92 | nes 93 | nhw 94 | ni 95 | o 96 | ond 97 | os 98 | paham 99 | pam 100 | pe 101 | popeth 102 | pwy 103 | rhag 104 | ti 105 | trwy 106 | un 107 | unwaith 108 | wedi 109 | wedyn 110 | weithiau 111 | wrth 112 | ychydig 113 | ymhlith 114 | ymlaen 115 | yn 116 | yrŵan 117 | 118 |
119 | -------------------------------------------------------------------------------- /ots.gemspec: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | Gem::Specification.new do |s| 4 | s.name = %q{ots} 5 | s.version = "0.5.4" 6 | 7 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version= 8 | s.authors = ["Bharanee Rathna"] 9 | s.date = %q{2012-02-03} 10 | s.description = %q{Ruby interface to libots libraries for unix.} 11 | s.email = ["deepfryed@gmail.com"] 12 | s.extensions = ["ext/ots/extconf.rb"] 13 | s.files = ["ext/ots/ots.c", "ext/ots/libots/text.c", "ext/ots/libots/grader-tf.c", "ext/ots/libots/stemmer.c", "ext/ots/libots/article.c", "ext/ots/libots/grader-tc.c", "ext/ots/libots/html.c", "ext/ots/libots/grader.c", "ext/ots/libots/relations.c", "ext/ots/libots/parser.c", "ext/ots/libots/dictionary.c", "ext/ots/libots/highlighter.c", "ext/ots/libots/wordlist.c", "ext/ots/ots.h", "ext/ots/version.h", "ext/ots/libots/grader-tc.h", "ext/ots/libots/libots.h", "ext/ots/extconf.rb", "test/test_article.rb", "test/test_ots.rb", "test/helper.rb", "test/test_grader.rb", "lib/ots.rb", "lib/ots/grader.rb", "README.md", "CHANGELOG", "dictionaries/cy.xml", "dictionaries/tr.xml", "dictionaries/fr.xml", "dictionaries/yi.xml", "dictionaries/ms.xml", "dictionaries/ia.xml", "dictionaries/lv.xml", "dictionaries/gl.xml", "dictionaries/cs.xml", "dictionaries/sv.xml", "dictionaries/is.xml", "dictionaries/fi.xml", "dictionaries/bg.xml", "dictionaries/uk.xml", "dictionaries/et.xml", "dictionaries/tl.xml", "dictionaries/da.xml", "dictionaries/it.xml", "dictionaries/ru.xml", "dictionaries/nl.xml", "dictionaries/eo.xml", "dictionaries/mi.xml", "dictionaries/ro.xml", "dictionaries/pl.xml", "dictionaries/ga.xml", "dictionaries/he.xml", "dictionaries/mt.xml", "dictionaries/eu.xml", "dictionaries/hu.xml", "dictionaries/en.xml", "dictionaries/de.xml", "dictionaries/el.xml", 
"dictionaries/pt.xml", "dictionaries/ca.xml", "dictionaries/es.xml", "dictionaries/nn.xml", "dictionaries/id.xml"] 14 | s.homepage = %q{http://github.com/deepfryed/ots} 15 | s.require_paths = ["lib", "ext"] 16 | s.rubygems_version = %q{1.3.7} 17 | s.summary = %q{Open Text Summarizer interface for Ruby.} 18 | 19 | if s.respond_to? :specification_version then 20 | current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION 21 | s.specification_version = 3 22 | 23 | if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then 24 | s.add_development_dependency(%q, [">= 0"]) 25 | s.add_development_dependency(%q, [">= 0"]) 26 | else 27 | s.add_dependency(%q, [">= 0"]) 28 | s.add_dependency(%q, [">= 0"]) 29 | end 30 | else 31 | s.add_dependency(%q, [">= 0"]) 32 | s.add_dependency(%q, [">= 0"]) 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /test/test_article.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | require 'helper' 3 | 4 | describe 'OTS::Article' do 5 | before do 6 | @sample = <<-TEXT 7 | The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae. 8 | It is the only species in its genus. The species has a worldwide distribution, with Atlantic and 9 | Pacific subspecies. 
10 | TEXT 11 | 12 | @article = OTS::Article.new(@sample) 13 | end 14 | 15 | it 'should extract topic keywords from given document' do 16 | assert_equal %w(species turtle subspecies pacific atlantic), @article.topics 17 | end 18 | 19 | it 'should extract keywords from given document' do 20 | expect = %w{ 21 | species turtle subspecies pacific atlantic distribution worldwide genus cheloniidae family 22 | belonging sea endangered critically hawksbill 23 | } 24 | 25 | assert_equal expect, @article.keywords 26 | end 27 | 28 | 29 | it 'should summarize sentences from given document' do 30 | lines = @article.summarize(sentences: 2).map {|line| [line[:sentence].gsub(/\s+/, ' ').strip, line[:score]]} 31 | expect = [ 32 | ["The hawksbill turtle is a critically endangered sea turtle belonging to the family Cheloniidae.", 48], 33 | ["The species has a worldwide distribution, with Atlantic and Pacific subspecies.", 20], 34 | ] 35 | 36 | assert_equal expect, lines 37 | end 38 | 39 | it 'should utf8 encode strings properly' do 40 | text = "The hawksbill turtle\xE2\x80\x93is critically endangered.".force_encoding('utf-8') 41 | article = OTS.parse(text) 42 | summary = article.summarize(sentences: 1).first[:sentence] 43 | assert_equal text, summary 44 | end 45 | 46 | describe 'dictionaries' do 47 | before do 48 | @text = "j'ai besoin de la crème glacée. il fait trop chaud en australie." 
49 | end 50 | 51 | it 'should load the french dictionary' do 52 | article = OTS.parse(@text, language: "fr") 53 | assert_equal "j'ai besoin de la crème glacée.", article.summarize(sentences: 1).first[:sentence] 54 | end 55 | 56 | it 'should load the french dictionary given path' do 57 | article = OTS.parse(@text, dictionary: File.join(File.dirname(__FILE__), '..', 'dictionaries', 'fr.xml')) 58 | assert_equal "j'ai besoin de la crème glacée.", article.summarize(sentences: 1).first[:sentence] 59 | end 60 | 61 | it 'should raise LoadError on invalid language or dictionaries' do 62 | assert_raises(LoadError) do 63 | OTS.parse('hello world', language: "xxx") 64 | end 65 | 66 | assert_raises(LoadError) do 67 | OTS.parse('hello world', dictionary: "xxx") 68 | end 69 | 70 | assert_raises(LoadError) do 71 | OTS.parse('hello world', dictionary: __FILE__) 72 | end 73 | end 74 | end 75 | end 76 | -------------------------------------------------------------------------------- /dictionaries/ga.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | wrote|write 12 | came|come 13 | went|go 14 | 15 | 16 | 17 | ."| 18 | ,"| 19 | .| 20 | ,| 21 | "| 22 | )| 23 | ?| 24 | :| 25 | ;| 26 | !| 27 | 28 | 29 | 30 | before1|1after 31 | 32 |
 33 |        before1|1after
 34 |    
35 |
36 | 37 | 38 | 39 | ." 40 | ?" 41 | !" 42 | ," 43 | . 44 | ? 45 | ; 46 | | 47 | ! 48 | 49 | 50 | 51 | Dr. 52 | Mr. 53 | Mrs. 54 | U.S. 55 | Rep. 56 | Sen. 57 | 58 | 59 | 60 | a 61 | ach 62 | acu 63 | agaibh 64 | againn 65 | agam 66 | agat 67 | agus 68 | aici 69 | an 70 | anois 71 | anseo 72 | aois 73 | aon 74 | cad 75 | conas 76 | de 77 | dhá 78 | do 79 | 80 | dol 81 | é 82 | faic 83 | féad 84 | fós 85 | freisin 86 | gach 87 | gairid 88 | gan 89 | i 90 | í 91 | iad 92 | iadsan 93 | iomarca 94 | istigh 95 | le 96 | 97 | mise 98 | mo 99 | muid 100 | 101 | naid 102 | náid 103 | ó 104 | óir 105 | seisean 106 | seo 107 | siadsan 108 | sibh 109 | sibhse 110 | sinne 111 | 112 | tagann 113 | tagtha 114 | téann 115 | téigh 116 | thall 117 | thíos 118 | thuas 119 | timpeall 120 | tusa 121 | uiareanta 122 | uile 123 | 124 |
125 | -------------------------------------------------------------------------------- /dictionaries/da.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
 34 |        before1|1after
 35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | aldrig 62 | anden 63 | at 64 | burde 65 | de 66 | den 67 | der 68 | det 69 | dig 70 | du 71 | eller 72 | en 73 | er 74 | et 75 | fordi 76 | fra 77 | før 78 | første 79 | gide 80 | ham 81 | han 82 | har 83 | hej 84 | hende 85 | hun 86 | hvad 87 | hvem 88 | hvilke 89 | hvilken 90 | hvilket 91 | hvis 92 | hvor 93 | hvordan 94 | hvorfor 95 | hvornår 96 | i 97 | ikke 98 | ingen 99 | ingenting 100 | ja 101 | jeg 102 | kan 103 | kunne 104 | kunne 105 | med 106 | men 107 | mens 108 | mere 109 | mest 110 | mig 111 | min 112 | måtte 113 | nej 114 | nogen 115 | noget 116 | når 117 | og 118 | om 119 | sig 120 | skulle 121 | som 122 | 123 | tit 124 | to 125 | turde 126 | vi 127 | ville 128 | 129 |
130 | -------------------------------------------------------------------------------- /ext/ots/libots/text.c: -------------------------------------------------------------------------------- 1 | /* 2 | * text.c 3 | * 4 | * Copyright (C) 2003 Nadav Rotem 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Library General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | #include "libots.h" 26 | 27 | unsigned char * 28 | ots_get_line_text (const OtsSentence * aLine, gboolean only_if_selected, size_t * out_size) 29 | { 30 | GList *li; 31 | GString *text; 32 | unsigned char *utf8_data; 33 | 34 | if (!(aLine)) 35 | return NULL; 36 | 37 | text = g_string_new (NULL); 38 | 39 | if (!only_if_selected || aLine->selected) 40 | { 41 | for (li = (GList *) aLine->words; li != NULL; li = li->next) /* for each word in the sentence Do: */ 42 | if (li->data && strlen (li->data)) /*if word exists*/ 43 | g_string_append (text, (char *) li->data); 44 | 45 | } 46 | 47 | if (out_size) 48 | *out_size = text->len; 49 | 50 | utf8_data = text->str; 51 | g_string_free (text, FALSE); 52 | 53 | return utf8_data; 54 | } 55 | 56 | static void 57 | ots_print_line (FILE * stream, const OtsSentence * aLine) 58 | { 59 | unsigned char *utf8_txt; 60 | size_t len; 61 | utf8_txt = ots_get_line_text (aLine, TRUE, &len); 62 | fwrite (utf8_txt, 1, len, stream); 63 | g_free (utf8_txt); 64 | } 65 | 66 | unsigned char * 67 | ots_get_doc_text (const OtsArticle * Doc, size_t * out_len) 68 | { 69 | GList *li; 70 | GString *text; 71 | unsigned char *utf8_data; 72 | size_t line_len; 73 | 74 | text = g_string_new (NULL); 75 | 76 | for (li = (GList *) Doc->lines; li != NULL; li = li->next) 77 | { 78 | utf8_data = ots_get_line_text ((OtsSentence *) li->data, TRUE, &line_len); 79 | g_string_append_len (text, utf8_data, line_len); 80 | g_free (utf8_data); 81 | } 82 | 83 | if (out_len) 84 | *out_len = text->len; 85 | utf8_data = text->str; 86 | 87 | g_string_free (text, FALSE); 88 | return utf8_data; 89 | } 90 | 91 | void 92 | ots_print_doc (FILE * stream, const OtsArticle * Doc) 93 | { 94 | GList *li; 95 | for (li = (GList *) Doc->lines; li != NULL; li = li->next) /* for each line in Article Do: */ 96 | ots_print_line (stream, (OtsSentence *) li->data); 97 | fputc ('\n', stream); 98 | } 99 | 
-------------------------------------------------------------------------------- /ext/ots/libots/grader-tf.c: -------------------------------------------------------------------------------- 1 | /* 2 | * grader-tf.c 3 | * 4 | * Copyright (C) 2003 Nadav Rotem 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Library General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include "libots.h" 25 | 26 | /*Grader - using the Term frequency algorithm. 
Will give each line a score*/ 27 | 28 | 29 | 30 | OtsWordTF* 31 | ots_new_OtsWordTF(const char* word,const double tf) 32 | { 33 | OtsWordTF* obj=g_new0(OtsWordTF,1); 34 | if (word!=NULL) obj->word=g_strdup(word); 35 | obj->tf=tf; 36 | return obj; 37 | } 38 | 39 | void 40 | ots_free_OtsWordTF(OtsWordTF *obj) 41 | { 42 | if (obj!=NULL) 43 | { 44 | if (obj->word!=NULL) g_free(obj->word); 45 | g_free(obj); 46 | } 47 | } 48 | 49 | void 50 | ots_free_TF_wordlist (GList * aList) 51 | { 52 | if (aList != NULL) 53 | { 54 | g_list_foreach(aList,(GFunc)ots_free_OtsWordTF, NULL); 55 | g_list_free(aList); 56 | } 57 | } 58 | 59 | 60 | void 61 | ots_grade_line_tf (OtsSentence * aLine) 62 | { 63 | 64 | return; 65 | } 66 | 67 | 68 | 69 | void 70 | ots_grade_doc_tf (OtsArticle * Doc) 71 | { 72 | 73 | GList *li; 74 | 75 | /*Load tf list*/ 76 | /*Load idf list*/ 77 | 78 | if (0 == Doc->lineCount) return; 79 | 80 | for (li = (GList *) Doc->lines; li != NULL; li = li->next) 81 | { 82 | ots_grade_line_tf ((OtsSentence *) li->data /* , tf list , idf list*/); 83 | } 84 | 85 | return; 86 | } 87 | 88 | 89 | double 90 | ots_tf_word_score (const double tf,const double idf) 91 | /*IDF: how rare is word across the collection 92 | TF: how often is word in doc */ 93 | { 94 | 95 | return tf*idf; 96 | } 97 | 98 | /* 99 | Determine frequency of query words 100 | n = (num-of-sentences words appears in) 101 | N = (total-number-of-sentences) 102 | f = n/N 103 | */ 104 | 105 | double 106 | ots_calc_idf (const int term_count,const int doc_word_count) 107 | { 108 | return -log(doc_word_count/term_count); 109 | } 110 | 111 | double 112 | ots_calc_tf (const int term_count,const int doc_word_count) 113 | { 114 | if (term_count==0) return 0; else 115 | return 0.5+0.5*(doc_word_count/term_count); 116 | } 117 | -------------------------------------------------------------------------------- /dictionaries/ca.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 
"| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
 34 |        before1|1after
 35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | a 62 | abans 63 | al 64 | amb 65 | ambdós 66 | anar 67 | ara 68 | baix 69 | cap 70 | cert 71 | com 72 | cuál 73 | damunt 74 | de 75 | dins 76 | doble 77 | dos 78 | dues 79 | el 80 | ell 81 | ella 82 | elles 83 | ells 84 | els 85 | en 86 | ésser 87 | estar 88 | excepte 89 | jo 90 | la 91 | les 92 | lluny 93 | lo 94 | los 95 | mai 96 | me 97 | meu 98 | meus 99 | meva 100 | meves 101 | 102 | na 103 | nos 104 | nosaltres 105 | nostra 106 | nostre 107 | nostres 108 | qual 109 | quals 110 | quan 111 | quelcom 112 | quin 113 | quina 114 | quines 115 | quins 116 | se 117 | ser 118 | seu 119 | seus 120 | seva 121 | seves 122 | 123 | tenir 124 | teu 125 | teus 126 | teva 127 | teves 128 | tu 129 | u 130 | un 131 | una 132 | unes 133 | uns 134 | vosaltres 135 | vostè 136 | vostès 137 | vostra 138 | vostre 139 | vostres 140 | 141 |
142 | -------------------------------------------------------------------------------- /dictionaries/ru.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | "| 7 | (| 8 | 9 | 10 | 11 | 12 | ."| 13 | ,"| 14 | .| 15 | ,| 16 | "| 17 | )| 18 | ?| 19 | :| 20 | ;| 21 | !| 22 | 23 | 24 | 25 | 26 | wrote|write 27 | came|come 28 | went|go 29 | 30 | 31 | 32 | before1|1after 33 | 34 |
 35 |        before1|1after
 36 |    
37 |
38 | 39 | 40 | 41 | ." 42 | ?" 43 | !" 44 | ," 45 | . 46 | ? 47 | ; 48 | | 49 | ! 50 | 51 | 52 | 53 | Dr. 54 | Mr. 55 | Mrs. 56 | U.S. 57 | Rep. 58 | Sen. 59 | 60 | 61 | 62 | а 63 | без 64 | бытовать 65 | быть 66 | в 67 | вещь 68 | вниз 69 | внизу 70 | во 71 | все 72 | всегда 73 | всё 74 | где 75 | да 76 | даже 77 | два 78 | две 79 | для 80 | должен 81 | друго 82 | его 83 | её 84 | ей 85 | ему 86 | если 87 | же 88 | за 89 | и 90 | из 91 | из-за 92 | или 93 | им 94 | к 95 | каждый 96 | как 97 | меня 98 | мне 99 | мной 100 | может 101 | на 102 | наверх 103 | наверху 104 | над 105 | не 106 | ней 107 | нет 108 | нём 109 | нигде 110 | никто 111 | ноль 112 | о 113 | оба 114 | обе 115 | одна 116 | одно 117 | около 118 | он 119 | она 120 | оно 121 | от 122 | по 123 | пока 124 | поперёк 125 | после 126 | потом 127 | почему 128 | при 129 | с 130 | скоро 131 | сначала 132 | так 133 | также 134 | тебе 135 | тебя 136 | теперь 137 | тобой 138 | тогда 139 | тоже 140 | только 141 | ты 142 | у 143 | уже 144 | что 145 | чтобы 146 | это 147 | я 148 | 149 |
150 | 151 | -------------------------------------------------------------------------------- /ext/ots/libots/article.c: -------------------------------------------------------------------------------- 1 | /* 2 | * article.c 3 | * 4 | * Copyright (C) 2003 Nadav Rotem 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Library General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | #include "libots.h" 26 | #include "grader-tc.h" 27 | 28 | extern void ots_free_TF_wordlist (GList * aList); 29 | 30 | #define MAX_WORD_LENGTH 35 31 | 32 | /*Data structure related functions*/ 33 | 34 | OtsSentence * 35 | ots_new_sentence (void) 36 | { 37 | OtsSentence *aLine = g_new0 (OtsSentence, 1); 38 | aLine->words = NULL; 39 | aLine->wc = 0; 40 | aLine->selected = 0; 41 | aLine->score = 0; 42 | return aLine; 43 | } 44 | 45 | void 46 | ots_free_sentence (OtsSentence * sen) 47 | { 48 | if (sen != NULL) 49 | { 50 | g_list_foreach (sen->words, (GFunc) g_free, NULL); 51 | g_list_free (sen->words); 52 | g_free (sen); 53 | } 54 | sen=NULL; 55 | } 56 | 57 | OtsArticle * 58 | ots_new_article (void) 59 | { 60 | OtsArticle *Doc; 61 | Doc = g_new0 (OtsArticle, 1); 62 | Doc->lineCount = 0; 63 | Doc->title = NULL; 64 | Doc->stem=new_stem_rule (); 65 | Doc->lines=NULL; 66 | Doc->dict = NULL; 67 | Doc->ImpWords = NULL; 68 | Doc->wordStat = NULL; 69 | 70 | Doc->tf_terms=NULL; 71 | return Doc; 72 | } 73 | 74 | void 75 | ots_free_article (OtsArticle * art) 76 | { 77 | if (NULL != art) 78 | { 79 | free_stem_rule (art->stem); 80 | ots_free_wordlist (art->dict); 81 | ots_free_wordlist (art->ImpWords); 82 | ots_free_wordlist (art->wordStat); 83 | 84 | ots_free_TF_wordlist(art->tf_terms); 85 | 86 | g_list_foreach (art->lines, (GFunc) ots_free_sentence, NULL); 87 | g_list_free (art->lines); 88 | 89 | if (art->title != NULL) g_free (art->title); 90 | g_free (art); 91 | } 92 | art=NULL; 93 | } 94 | 95 | OtsSentence * 96 | ots_append_line (OtsArticle * Doc) 97 | { 98 | OtsSentence *aLine = ots_new_sentence (); 99 | Doc->lineCount++; 100 | Doc->lines = g_list_append (Doc->lines, aLine); 101 | return aLine; 102 | } 103 | 104 | void 105 | ots_append_word (OtsSentence * aLine,unsigned const char *aWord) 106 | { 107 | if ((aWord == NULL) || (0==strlen(aWord)) ||(NULL==aLine)) return; 108 | aLine->wc++; 109 | aLine->words = 
g_list_append (aLine->words, (gpointer) g_strdup (aWord)); 110 | return; 111 | } 112 | 113 | 114 | gboolean 115 | ots_is_line_selected(const OtsSentence *aLine) 116 | { 117 | if (aLine==NULL) {printf("Warning:Line=NULL\n"); return FALSE;} 118 | return (aLine->selected); 119 | } 120 | -------------------------------------------------------------------------------- /ext/ots/libots/highlighter.c: -------------------------------------------------------------------------------- 1 | /* 2 | * highlighter 3 | * 4 | * Copyright (C) 2003 Nadav Rotem 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Library General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include "libots.h" 25 | 26 | /*After the grader has graded the article and each 27 | sentence has a score the highlighter will select 28 | some of the sentences*/ 29 | 30 | static int 31 | ots_highlight_max_line (OtsArticle * Doc) 32 | { 33 | GList *li; 34 | int max = 0; 35 | for (li = (GList *) Doc->lines; li != NULL; li = li->next) 36 | { 37 | if (0 == (((OtsSentence *) li->data)->selected)) /* if not selected , count me in */ 38 | max = MAX (((OtsSentence *) li->data)->score, max); 39 | 40 | } 41 | 42 | for (li = (GList *) Doc->lines; li != NULL; li = li->next) 43 | { 44 | 45 | if ((((OtsSentence *) li->data)->score == max) && (((OtsSentence *) li->data)->selected == 0)) /* if score==max && not selected before ,select me; */ 46 | { 47 | ((OtsSentence *) li->data)->selected = 1; 48 | return ((OtsSentence *) li->data)->wc; 49 | } 50 | } 51 | 52 | return 0; 53 | } 54 | 55 | 56 | /* todo: impement this 57 | 58 | void 59 | ots_highlight_doc_wordcount (OtsArticle * Doc, int wordCount) 60 | 61 | void 62 | ots_highlight_doc_linecount (OtsArticle * Doc, int wordCount) 63 | 64 | 65 | 66 | void 67 | ots_highlight_doc_soft (OtsArticle * Doc, int percent) //blur selection by avrage of near sentences , will mark blocks 68 | */ 69 | 70 | void 71 | ots_highlight_doc (OtsArticle * Doc, int percent) 72 | { 73 | int i; 74 | double ratio; 75 | int wordCount; 76 | 77 | if (0 == Doc->lineCount) 78 | return; 79 | 80 | if (percent > 100) 81 | percent = 100; 82 | else if (percent < 0) 83 | percent = 0; 84 | 85 | ratio = ((double) (percent)) / (100.0); 86 | 87 | wordCount = ots_get_article_word_count (Doc); 88 | 89 | for (i = 0; i < (ratio * (double) wordCount);) 90 | { 91 | i += ots_highlight_max_line (Doc); 92 | } 93 | } 94 | 95 | void 96 | ots_highlight_doc_lines (OtsArticle * Doc, int lines) 97 | { 98 | int i; 99 | int lineCount; 100 | int tmp; 101 | 102 | if (0 == Doc->lineCount) return; 103 | 104 | lineCount = 
Doc->lineCount; 105 | i=0; 106 | while ((ilineCount) return; 120 | 121 | docWordCount = ots_get_article_word_count (Doc); 122 | 123 | i=0; 124 | while ((i < docWordCount) && (i <= words)) 125 | { 126 | i += ots_highlight_max_line (Doc); 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /dictionaries/cs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
 34 |        before1|1after
 35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | a 62 | aby 63 | ale 64 | ani 65 | ano 66 | 67 | být 68 | co 69 | dělat 70 | dnes 71 | do 72 | doma 73 | domů 74 | i 75 | 76 | jak 77 | jako 78 | je 79 | jen 80 | jenom 81 | ještě 82 | ještěže 83 | ji 84 | jinak 85 | jít 86 | jsem 87 | jsi 88 | jsme 89 | jsou 90 | jste 91 | k 92 | každý 93 | kde 94 | kdo 95 | když 96 | konečně 97 | který 98 | mají 99 | 100 | mimochodem 101 | mít 102 | moc 103 | moci 104 | moct 105 | mohou 106 | mohu 107 | moje 108 | moji 109 | můj 110 | může 111 | my 112 | na 113 | naproti 114 | náš 115 | naše 116 | ne 117 | nebo 118 | něco 119 | někdy 120 | není 121 | nic 122 | o 123 | od 124 | on 125 | ona 126 | oni 127 | ono 128 | ony 129 | ovšem 130 | po 131 | protože 132 | samozřejmě 133 | se 134 | slečna 135 | tady 136 | tak 137 | také 138 | taky 139 | tam 140 | ten 141 | to 142 | totiž 143 | tu 144 | ty 145 | u 146 | v 147 | váš 148 | vaše 149 | ve 150 | velmi 151 | vlastní 152 | vy 153 | z 154 | za 155 | zase 156 | zde 157 | zítra 158 | znova 159 | že 160 | 161 |
162 | -------------------------------------------------------------------------------- /dictionaries/ro.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | "| 7 | (| 8 | 9 | 10 | 11 | 12 | ."| 13 | ,"| 14 | .| 15 | ,| 16 | "| 17 | )| 18 | ?| 19 | :| 20 | ;| 21 | !| 22 | 23 | 24 | 25 | 26 | wrote|write 27 | came|come 28 | went|go 29 | 30 | 31 | 32 | before1|1after 33 | 34 |
 35 |        before1|1after
 36 |    
37 |
38 | 39 | 40 | 41 | ." 42 | ?" 43 | !" 44 | ," 45 | . 46 | ? 47 | ; 48 | | 49 | ! 50 | 51 | 52 | 53 | Dr. 54 | Mr. 55 | Mrs. 56 | U.S. 57 | Rep. 58 | Sen. 59 | 60 | 61 | 62 | acasă 63 | acest 64 | acolo 65 | acum 66 | acuma 67 | ai 68 | aicea 69 | aici 70 | alt 71 | am 72 | apoi 73 | aproape 74 | apropro 75 | are 76 | aşa 77 | au 78 | avea 79 | avem 80 | aveţi 81 | ca 82 | 83 | când 84 | ce 85 | cine 86 | cît 87 | cîtă 88 | cîte 89 | cîţi 90 | cu 91 | da 92 | deci 93 | decît 94 | deja 95 | doamna 96 | doi 97 | domnişoara 98 | domnul 99 | două 100 | dumneaei 101 | dumnealor 102 | dumnealui 103 | dumneata 104 | dumneavoastră 105 | după 106 | ea 107 | ei 108 | el 109 | ele 110 | este 111 | eşti 112 | eu 113 | face 114 | fi 115 | fiindcă 116 | iar 117 | ieri 118 | în 119 | încă 120 | într 121 | între 122 | la 123 | lîngă 124 | lor 125 | lui 126 | mai 127 | merge 128 | meu 129 | mîine 130 | mult 131 | nicăieri 132 | nici 133 | niciodată 134 | nimeni 135 | nimic 136 | nişte 137 | noi 138 | nostru 139 | nu 140 | o 141 | pe 142 | pentru 143 | puţin 144 | sînt 145 | sînt 146 | sîntem 147 | sînteţi 148 | spre 149 | sub 150 | şi 151 | tot 152 | tu 153 | un 154 | una 155 | unde 156 | unei 157 | unor 158 | unu 159 | unui 160 | unul 161 | voi 162 | 163 |
164 | -------------------------------------------------------------------------------- /ext/ots/libots/html.c: -------------------------------------------------------------------------------- 1 | /* 2 | * html.c 3 | * 4 | * Copyright (C) 2003 Nadav Rotem 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Library General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include "libots.h" 25 | 26 | static unsigned char * 27 | ots_get_line_HTML (const OtsSentence * aLine, size_t * out_size) 28 | { 29 | GList *li; 30 | GString *text; 31 | unsigned char *utf8_data; 32 | char *score_str; 33 | text = g_string_new (NULL); 34 | 35 | score_str=g_new0(char,32); 36 | sprintf(score_str,"",aLine->score); 37 | g_string_append (text,score_str); 38 | g_free(score_str); 39 | 40 | if ((aLine->selected)) 41 | { 42 | g_string_append (text, 43 | ""); 44 | } 45 | else 46 | { 47 | g_string_append (text, ""); 48 | } 49 | 50 | for (li = (GList *) aLine->words; li != NULL; li = li->next) 51 | { 52 | if (0 == strcmp ((char *) li->data, "\n")) 53 | g_string_append (text, "
"); 54 | else 55 | g_string_append (text, (char *) li->data); 56 | } 57 | g_string_append (text,"
\n"); 58 | 59 | if (out_size) 60 | *out_size = text->len; 61 | 62 | utf8_data = text->str; 63 | g_string_free (text, FALSE); 64 | 65 | return utf8_data; 66 | } 67 | 68 | 69 | #if 0 70 | static void 71 | ots_print_line_HTML (FILE * stream, const OtsSentence * aLine) 72 | { 73 | unsigned char *utf8_txt; 74 | size_t len; 75 | 76 | utf8_txt = ots_get_line_HTML (aLine, &len); 77 | fwrite (utf8_txt, 1, len, stream); 78 | g_free (utf8_txt); 79 | } 80 | #endif 81 | 82 | 83 | unsigned char * 84 | ots_get_doc_HTML (const OtsArticle * Doc, size_t * out_len) 85 | { 86 | GList *li; 87 | GString *text; 88 | unsigned char *utf8_data; 89 | size_t line_len; 90 | 91 | text = g_string_new (NULL); 92 | 93 | 94 | g_string_append (text, 95 | "\n\nOTS\n\n\n\n"); 96 | g_string_append (text, "\n"); 97 | g_string_append (text, "\n"); 100 | 101 | 102 | for (li = (GList *) Doc->lines; li != NULL; li = li->next) 103 | { 104 | utf8_data = ots_get_line_HTML ((OtsSentence *) li->data, &line_len); 105 | g_string_append_len (text, utf8_data, line_len); 106 | g_free (utf8_data); 107 | } 108 | g_string_append (text, "\n"); 109 | 110 | if (out_len) 111 | *out_len = text->len; 112 | utf8_data = text->str; 113 | 114 | g_string_free (text, FALSE); 115 | return utf8_data; 116 | 117 | } 118 | 119 | 120 | 121 | void 122 | ots_print_HTML (FILE * stream, const OtsArticle * Doc) 123 | { 124 | unsigned char *utf8_txt; 125 | size_t len; 126 | 127 | utf8_txt = ots_get_doc_HTML (Doc, &len); 128 | fwrite (utf8_txt, 1, len, stream); 129 | g_free (utf8_txt); 130 | 131 | } 132 | -------------------------------------------------------------------------------- /dictionaries/ms.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | "| 7 | (| 8 | 9 | 10 | 11 | 12 | ."| 13 | ,"| 14 | .| 15 | ,| 16 | "| 17 | )| 18 | ?| 19 | :| 20 | ;| 21 | !| 22 | 23 | 24 | 25 | 26 | wrote|write 27 | came|come 28 | went|go 29 | 30 | 31 | 32 | before1|1after 33 | 34 |
 35 |        before1|1after
 36 |    
37 |
38 | 39 | 40 | 41 | ." 42 | ?" 43 | !" 44 | ," 45 | . 46 | ? 47 | ; 48 | | 49 | ! 50 | 51 | 52 | 53 | Dr. 54 | Mr. 55 | Mrs. 56 | U.S. 57 | Rep. 58 | Sen. 59 | 60 | 61 | 62 | ada 63 | adakalanya 64 | adalah 65 | adapun 66 | agar 67 | amin 68 | anda 69 | anti 70 | anu 71 | apa 72 | bagaimana 73 | banyak 74 | begini 75 | begitu 76 | belaka 77 | beliau 78 | berapa 79 | betapa 80 | bila 81 | demikian 82 | dengan 83 | di 84 | dia 85 | dikau 86 | hingga 87 | iaitu 88 | ialah 89 | jika 90 | jikalau 91 | justeru 92 | kami 93 | kamu 94 | kara 95 | kenapa 96 | kendati 97 | kita 98 | laku 99 | lepas 100 | mahupun 101 | maka 102 | mana 103 | mana-mana 104 | masing-masing 105 | mereka 106 | meskipun 107 | nya 108 | pada 109 | paling 110 | patik 111 | pergi 112 | puan 113 | pula 114 | saban 115 | sampai 116 | sangat 117 | saudara 118 | saya 119 | sebanyak 120 | sebanyak-banyaknya 121 | sedang 122 | segala 123 | segala-galanya 124 | sejak 125 | sekaligus 126 | sekalipun 127 | selalu 128 | semayam 129 | semesta 130 | sempena 131 | semua 132 | semuanya 133 | sendiri 134 | sendiri-sendiri 135 | sentiasa 136 | seraya 137 | serba 138 | serba aneka 139 | serba-serbi 140 | sering 141 | seringkali 142 | serta 143 | siapa 144 | sini 145 | supaya 146 | syahdan 147 | telah 148 | terbanyak 149 | tetapi 150 | tiada 151 | tiap 152 | tidak 153 | tuanku 154 | walaupun 155 | walhal 156 | walhasil 157 | wujud 158 | yang 159 | 160 |
161 | -------------------------------------------------------------------------------- /dictionaries/eo.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
 34 |        before1|1after
 35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | ajn 62 | al 63 | almenaŭ 64 | ambaŭ 65 | ankaŭ 66 | ankoraŭ 67 | anstataŭ 68 | antaŭ 69 | antaŭen 70 | apud 71 | 72 | baldaŭ 73 | ĉar 74 | ĉe 75 | ĉi 76 | ĉio 77 | ĉirkaŭ 78 | ĉiuj 79 | ĉu 80 | da 81 | dankon 82 | de 83 | do 84 | du 85 | dum 86 | 87 | el 88 | en 89 | esti 90 | ĝi 91 | ha 92 | havi 93 | hieraŭ 94 | ili 95 | inter 96 | iri 97 | jam 98 | je 99 | jen 100 | jes 101 | ĵus 102 | kaj 103 | ke 104 | kelkaj 105 | kia 106 | kial 107 | kiam 108 | kie 109 | kiel 110 | kio 111 | kioj 112 | kiu 113 | kiuj 114 | kontraŭ 115 | kun 116 | la 117 | li 118 | malantaŭ 119 | malantaŭen 120 | malsupren 121 | mi 122 | morgaŭ 123 | multaj 124 | ne 125 | ni 126 | nu 127 | nun 128 | nur 129 | ofte 130 | per 131 | plej 132 | pli 133 | plu 134 | por 135 | post 136 | poste 137 | povi 138 | preni 139 | preskaŭ 140 | preter 141 | pri 142 | pro 143 | propra 144 | saluton 145 | se 146 | sed 147 | sen 148 | si 149 | ŝi 150 | sub 151 | super 152 | supre 153 | supren 154 | sur 155 | tia 156 | tial 157 | tie 158 | tiel 159 | tio 160 | tioj 161 | tiu 162 | tiuj 163 | tra 164 | tre 165 | tuj 166 | unu 167 | uzi 168 | vi 169 | voli 170 | 171 |
172 | -------------------------------------------------------------------------------- /dictionaries/et.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
 34 |        before1|1after
 35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | aga 62 | ainult 63 | alates 64 | alati 65 | all 66 | ees 67 | ei 68 | esimene 69 | et 70 | hoolimata 71 | iga 72 | ilma 73 | ja 74 | jah 75 | jaoks 76 | jooksul 77 | juures 78 | ka 79 | kaheksa 80 | kaheksakümmend 81 | kaks 82 | kakskümmend 83 | kelle 84 | kes 85 | kolm 86 | kolmkümmend 87 | koos 88 | kui 89 | kümme 90 | kuni 91 | kus 92 | kuus 93 | kuuskümmend 94 | läbi 95 | lähed 96 | lähema 97 | lähen 98 | läks 99 | läksid 100 | läksime 101 | läksite 102 | ma 103 | me 104 | meie 105 | miks 106 | miljon 107 | mina 108 | mind 109 | minema 110 | mis 111 | missugune 112 | mõnikord 113 | mulle 114 | neli 115 | nelikümmend 116 | nende 117 | ning 118 | null 119 | nüüd 120 | oled 121 | olema 122 | oleme 123 | olen 124 | oli 125 | olnud 126 | omama 127 | on 128 | palju 129 | peal 130 | piki 131 | sa 132 | sada 133 | seal 134 | see 135 | sees 136 | seest 137 | seitse 138 | seitsekümmend 139 | sest 140 | siin 141 | siis 142 | sina 143 | sind 144 | sulle 145 | ta 146 | tagasi 147 | talle 148 | te 149 | teeb 150 | teed 151 | teen 152 | tegema 153 | tegi 154 | tehtud 155 | teie 156 | tema 157 | tuhat 158 | üheksa 159 | üheksakümmend 160 | ükskord 161 | üle 162 | ümber 163 | umbes 164 | vahel 165 | vastas 166 | vastu 167 | veel 168 | viis 169 | viiskümmend 170 | või 171 | 172 |
173 | -------------------------------------------------------------------------------- /dictionaries/fr.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | "| 8 | (| 9 | 10 | 11 | 12 | 13 | ."| 14 | ,"| 15 | .| 16 | ,| 17 | "| 18 | )| 19 | ?| 20 | :| 21 | ;| 22 | !| 23 | 24 | 25 | 26 | 27 | wrote|write 28 | came|come 29 | went|go 30 | 31 | 32 | 33 | before1|1after 34 | 35 |
 36 |        before1|1after
 37 |    
38 | 39 | 40 | 41 | wrote|write 42 | 43 | 44 | 45 | assist|help 46 | 47 | 48 |
49 | 50 | 51 | ." 52 | ?" 53 | !" 54 | ," 55 | . 56 | ? 57 | ; 58 | | 59 | ! 60 | 61 | 62 | 63 | Dr. 64 | Mr. 65 | Mrs. 66 | U.S. 67 | Rep. 68 | Sen. 69 | 70 | 71 | 72 | le 73 | la 74 | les 75 | un 76 | une 77 | je 78 | me 79 | moi 80 | mon 81 | ma 82 | mes 83 | nous 84 | notre 85 | nos 86 | tu 87 | te 88 | ton 89 | ta 90 | tes 91 | vous 92 | votre 93 | vos 94 | il 95 | lui 96 | son 97 | sa 98 | ses 99 | ils 100 | leur 101 | leurs 102 | elle 103 | elles 104 | on 105 | 106 | quand 107 | 108 | à 109 | aussi 110 | autre 111 | avec 112 | ça 113 | ce 114 | cet 115 | cette 116 | ces 117 | ceci 118 | cela 119 | chaque 120 | ci 121 | dans 122 | de 123 | en 124 | et 125 | entre 126 | ici 127 | jamais 128 | 129 | mais 130 | même 131 | moins 132 | ne 133 | ou 134 | par 135 | parfois 136 | pas 137 | pendant 138 | plus 139 | pour 140 | pourtant 141 | que 142 | quelque 143 | qui 144 | quois 145 | rien 146 | sans 147 | si 148 | sur 149 | tellement 150 | très 151 | trop 152 | y 153 | des 154 | du 155 | n'est 156 | être 157 | suis 158 | es 159 | est 160 | sommes 161 | êtes 162 | sont 163 | étais 164 | était 165 | étions 166 | étiez 167 | étaient 168 | été 169 | avoir 170 | ai 171 | as 172 | a 173 | avons 174 | avez 175 | ont 176 | avais 177 | avait 178 | avions 179 | aviez 180 | avaient 181 | aie 182 | aies 183 | ait 184 | ayons 185 | ayez 186 | aient 187 | eu 188 | devoir 189 | 190 | faire 191 | fait 192 | pouvoir 193 | pu 194 | vouloir 195 | voulu 196 | aller 197 | 198 | 199 |
200 | -------------------------------------------------------------------------------- /dictionaries/is.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | "| 7 | (| 8 | 9 | 10 | 11 | 12 | ."| 13 | ,"| 14 | .| 15 | ,| 16 | "| 17 | )| 18 | ?| 19 | :| 20 | ;| 21 | !| 22 | 23 | 24 | 25 | 26 | 27 | wrote|write 28 | came|come 29 | went|go 30 | 31 | 32 | 33 | before1|1after 34 | 35 |
 36 |        before1|1after
 37 |    
38 |
39 | 40 | 41 | 42 | ." 43 | ?" 44 | !" 45 | ," 46 | . 47 | ? 48 | ; 49 | | 50 | ! 51 | 52 | 53 | 54 | Dr. 55 | Mr. 56 | Mrs. 57 | U.S. 58 | Rep. 59 | Sen. 60 | 61 | 62 | 63 | 64 | af 65 | andspænis 66 | annaðhvort 67 | auk 68 | austan 69 | á 70 | án 71 | ásamt 72 | bæði 73 | eða 74 | ef 75 | eftir 76 | eiga 77 | en 78 | er 79 | ert 80 | eru 81 | eruð 82 | erum 83 | ég 84 | fara 85 | 86 | frá 87 | fyrir 88 | fyrst 89 | gagn 90 | gagnvart 91 | gegnt 92 | gegnum 93 | geta 94 | hafa 95 | hana 96 | handa 97 | hann 98 | hans 99 | hennar 100 | henni 101 | hið 102 | hin 103 | hina 104 | hinar 105 | hinir 106 | hinn 107 | hinna 108 | hinnar 109 | hinni 110 | hins 111 | hinu 112 | hinum 113 | hjá 114 | honum 115 | hún 116 | hver 117 | hverjum 118 | hvorki 119 | hvort 120 | innan 121 | í 122 | kringum 123 | með 124 | meðal 125 | meðfram 126 | mega 127 | megin 128 | mér 129 | mig 130 | milli 131 | millum 132 | mín 133 | mót 134 | móti 135 | munu 136 | nálægt 137 | neðan 138 | nema 139 | 140 | norðan 141 | ofan 142 | og 143 | okkur 144 | pro 145 | sakir 146 | sem 147 | 148 | sért 149 | séu 150 | séuð 151 | séum 152 | síðan 153 | skulu 154 | sunnan 155 | sökum 156 | til 157 | um 158 | umfram 159 | umhverfis 160 | undan 161 | undir 162 | utan 163 | úr 164 | var 165 | varst 166 | vegna 167 | vera 168 | verandi 169 | vestan 170 | við 171 | voru 172 | voruð 173 | vorum 174 | væri 175 | værir 176 | væru 177 | væruð 178 | værum 179 | yðar 180 | yður 181 | yfir 182 | ykkar 183 | ykkur 184 | það 185 | þau 186 | þá 187 | þegar 188 | þeim 189 | þeir 190 | þeirra 191 | þess 192 | þér 193 | þið 194 | þig 195 | þín 196 | þótt 197 | þú 198 | því 199 | þær 200 | 201 |
202 | -------------------------------------------------------------------------------- /dictionaries/it.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | "| 7 | (| 8 | 9 | 10 | 11 | 12 | ."| 13 | ,"| 14 | .| 15 | ,| 16 | "| 17 | )| 18 | ?| 19 | :| 20 | ;| 21 | !| 22 | 23 | 24 | 25 | 26 | wrote|write 27 | came|come 28 | went|go 29 | 30 | 31 | 32 | before1|1after 33 | 34 |
 35 |        before1|1after
 36 |    
37 |
38 | 39 | 40 | 41 | ." 42 | ?" 43 | !" 44 | ," 45 | . 46 | ? 47 | ; 48 | | 49 | ! 50 | 51 | 52 | 53 | Dr. 54 | Mr. 55 | Mrs. 56 | U.S. 57 | Rep. 58 | Sen. 59 | 60 | 61 | 62 | il 63 | lo 64 | l' 65 | i 66 | gli 67 | gl' 68 | la 69 | le 70 | un 71 | uno 72 | una 73 | un' 74 | io 75 | noi 76 | mio 77 | tu 78 | voi 79 | vostro 80 | lui 81 | lei 82 | egli 83 | ella 84 | esso 85 | essa 86 | loro 87 | essi 88 | esse 89 | suo 90 | 91 | si 92 | c'è 93 | a 94 | ad 95 | alcuno 96 | che 97 | come 98 | con 99 | così 100 | da 101 | di 102 | domani 103 | e 104 | ed 105 | in 106 | infine 107 | ma 108 | mai 109 | mentre 110 | molto 111 | 112 | nessuno 113 | nessun 114 | nessuna 115 | nessun' 116 | niente 117 | no 118 | non 119 | nulla 120 | o 121 | oggi 122 | ora 123 | per 124 | poi 125 | poiché 126 | qualche 127 | qualcuno 128 | quando 129 | questo 130 | qui 131 | se 132 | su 133 | troppo 134 | tutto 135 | al 136 | ai 137 | allo 138 | agli 139 | alla 140 | alle 141 | all' 142 | col 143 | coi 144 | collo 145 | cogli 146 | colla 147 | colle 148 | coll' 149 | dal 150 | dai 151 | dallo 152 | dagli 153 | dalla 154 | dalle 155 | dall' 156 | del 157 | dei 158 | dello 159 | degli 160 | della 161 | delle 162 | dell' 163 | nel 164 | nei 165 | nello 166 | negli 167 | nella 168 | nelle 169 | nell' 170 | pel 171 | pei 172 | sul 173 | sui 174 | sullo 175 | sugli 176 | sulla 177 | sulle 178 | sull' 179 | primo 180 | essere 181 | sono 182 | sei 183 | è 184 | siamo 185 | siete 186 | stare 187 | sto 188 | stai 189 | sta 190 | stiamo 191 | stano 192 | 193 | avere 194 | ho 195 | hai 196 | ha 197 | abbiamo 198 | avete 199 | hanno 200 | dovere 201 | potere 202 | andare 203 | va 204 | 205 | 206 |
207 | -------------------------------------------------------------------------------- /ext/ots/libots/parser.c: -------------------------------------------------------------------------------- 1 | /* 2 | * parser.c 3 | * 4 | * Copyright (C) 2003 Nadav Rotem 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Library General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include "libots.h" 26 | 27 | #define BUFFER_SIZE (1024*8) 28 | 29 | int 30 | ots_match_post (const char *aWord,const char *post) 31 | { 32 | int i, wlen, plen; 33 | 34 | 35 | wlen = strlen (aWord); 36 | plen = strlen (post); 37 | 38 | if (plen > wlen) return 0; 39 | 40 | for (i = 0; i < plen; i++) 41 | if (aWord[wlen - plen + i] != post[i]) 42 | return 0; /* no match */ 43 | 44 | return 1; /*word match */ 45 | } 46 | 47 | void 48 | ots_parse_file (FILE * stream, OtsArticle * Doc ) 49 | { 50 | unsigned char fread_buffer[BUFFER_SIZE]; 51 | unsigned char *buffer; 52 | size_t nread, total_read, avail_size; 53 | 54 | buffer = g_new0 (unsigned char, BUFFER_SIZE); 55 | 56 | avail_size = BUFFER_SIZE; 57 | total_read = nread = 0; 58 | while ((nread = 59 | fread (fread_buffer, sizeof (unsigned char), sizeof (fread_buffer), 60 | stream)) > 0) 61 | { 62 | if (nread + total_read > avail_size) 63 | { 64 | avail_size *= 2; 65 | buffer = g_renew (unsigned char, buffer, avail_size); 66 | } 67 | 68 | strncpy (buffer + total_read, fread_buffer, nread); 69 | total_read += nread; 70 | } 71 | 72 | ots_parse_stream (buffer, total_read, Doc); 73 | g_free (buffer); 74 | } 75 | 76 | 77 | 78 | 79 | 80 | int 81 | ots_parser_should_break(const char *aWord,const OtsStemRule * rule) 82 | { 83 | GList *li; 84 | char *postfix; 85 | int toBreak=0; 86 | 87 | for (li = (GList *) rule->ParserBreak; li != NULL; li = li->next) 88 | { 89 | postfix=li->data; 90 | if (ots_match_post (aWord, postfix) ) 91 | { 92 | toBreak=1; 93 | break; 94 | } 95 | 96 | } 97 | 98 | // TODO: single character with a '.' is probably an initial but this needs to be expressed as a language rule. 
99 | if (strlen(aWord) == 2 && aWord[strlen(aWord) - 1] == '.') 100 | return 0; 101 | 102 | for (li = (GList *) rule->ParserDontBreak; li != NULL; li = li->next) 103 | { 104 | postfix=li->data; 105 | if (ots_match_post (aWord, postfix) ) 106 | { 107 | toBreak=0; 108 | break; 109 | } 110 | 111 | } 112 | return toBreak; 113 | } 114 | 115 | 116 | 117 | void 118 | ots_parse_stream(const unsigned char *utf8, size_t len, OtsArticle * Doc) /*parse the unicode stream */ 119 | { 120 | 121 | OtsSentence *tmpLine = ots_append_line (Doc); 122 | OtsStemRule * rule=Doc->stem; 123 | gunichar uc; 124 | int index = 0; 125 | char *s = (char *) utf8; 126 | GString *word_buffer = g_string_new (NULL); 127 | 128 | 129 | while ((*s) && (index < len)) 130 | { 131 | uc = g_utf8_get_char (s); 132 | 133 | if (!g_unichar_isspace (uc)) /* space is the end of a word */ 134 | { 135 | 136 | g_string_append_unichar(word_buffer,uc); 137 | 138 | } 139 | else 140 | { 141 | 142 | if (0len) 143 | { 144 | ots_append_word (tmpLine, word_buffer->str); 145 | 146 | if (ots_parser_should_break(word_buffer->str,rule)) { 147 | tmpLine = ots_append_line (Doc); /* Add a new Line */ 148 | } 149 | 150 | g_string_assign (word_buffer, ""); 151 | 152 | } 153 | 154 | if (uc=='\n') {ots_append_word (tmpLine,"\n");} 155 | else 156 | {ots_append_word (tmpLine," ");} 157 | 158 | g_string_assign (word_buffer,""); 159 | } 160 | 161 | s = g_utf8_next_char (s); 162 | 163 | index++; 164 | } 165 | 166 | 167 | if (0len) /*final flush*/ 168 | { 169 | ots_append_word (tmpLine, word_buffer->str); 170 | g_string_assign (word_buffer, ""); 171 | } 172 | 173 | 174 | 175 | g_string_free (word_buffer, TRUE); 176 | } 177 | -------------------------------------------------------------------------------- /ext/ots/libots/relations.c: -------------------------------------------------------------------------------- 1 | /* 2 | * relations.c 3 | * 4 | * Copyright (C) 2003 Nadav Rotem 5 | * 6 | * This program is free software; you can 
redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Library General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include "grader-tc.h" 25 | 26 | #include "libots.h" 27 | /* 28 | The Inner product of two texts is defined as the number of topics they 29 | share. This set of functions implements this relations using the ots 30 | api. 
31 | 32 | Application: a relation between a slashdot article and a comment made 33 | usage: ots_text_relations(story,"en",comment,"en",n); 34 | where n is the max number of most important topics to consider; safe to give a high number (ex: 20); 35 | 36 | returns: 37 | 0 - off topic 38 | n - number of topics they share 39 | 40 | */ 41 | 42 | #define OTS_MAX_TOPIC_WORD_SIZE 256 43 | 44 | /*Returns the number of topics that two blocks of text share*/ 45 | int ots_text_relations( 46 | const unsigned char *text1,const unsigned char *lang_code1, 47 | const unsigned char *text2,const unsigned char *lang_code2,const int topic_num) 48 | { 49 | GList* top1; 50 | GList* top2; 51 | int score; 52 | 53 | top1=ots_text_stem_list(text1,lang_code1,topic_num); 54 | top2=ots_text_stem_list(text2,lang_code2,topic_num); 55 | 56 | score=ots_topic_list_score(top1,top2); 57 | 58 | if (top1){g_list_foreach (top1, (GFunc) g_free, NULL);g_list_free (top1);} 59 | if (top2){g_list_foreach (top2, (GFunc) g_free, NULL);g_list_free (top2);} 60 | 61 | return score; 62 | } 63 | 64 | 65 | 66 | 67 | /*For a given text, return the list of the topics*/ 68 | char* ots_text_topics( 69 | const unsigned char *text,const unsigned char *lang_code,int topic_num) 70 | { 71 | int i; 72 | GString *word; 73 | unsigned char *str; 74 | unsigned char *tmp; 75 | OtsArticle *Art; 76 | 77 | if (NULL==text) return NULL; 78 | word = g_string_new (NULL); 79 | 80 | Art = ots_new_article (); 81 | 82 | ots_load_xml_dictionary(Art,lang_code); /*Load the dictionary*/ 83 | if (text!=NULL) ots_parse_stream (text,strlen(text), Art); /* read text , put it in struct Article */ 84 | ots_grade_doc (Art); 85 | 86 | 87 | for (i=0;i<=topic_num;i++) 88 | { 89 | tmp=ots_word_in_list(Art->ImpWords,i); 90 | if ((tmp!=NULL)&&(strlen(tmp)>0)) {g_string_append(word,tmp); 91 | g_string_append(word," "); } 92 | } 93 | 94 | 95 | str=word->str; 96 | g_string_free (word, FALSE); 97 | ots_free_article (Art); 98 | 99 | return str; 100 | } 101 | 102 | 
103 | 104 | /*For a given text, return the list of the stemmed topics*/ 105 | GList* ots_text_stem_list(const unsigned char *text, const unsigned char *lang_code, int topic_num) 106 | { 107 | int i; 108 | GList *topics=NULL; 109 | unsigned char *tmp; 110 | OtsArticle *Art; 111 | 112 | if (NULL==text) return NULL; 113 | 114 | Art = ots_new_article (); 115 | 116 | ots_load_xml_dictionary(Art,lang_code); 117 | if (text!=NULL) ots_parse_stream (text,strlen(text), Art); 118 | ots_grade_doc (Art); 119 | 120 | 121 | for (i=0;i<=topic_num;i++) 122 | { 123 | tmp=ots_stem_in_list(Art->ImpWords,i); 124 | if ((tmp)&&(strlen(tmp)>0)) 125 | topics=g_list_append(topics,g_strdup(tmp)); 126 | } 127 | 128 | 129 | ots_free_article (Art); 130 | return topics; 131 | } 132 | 133 | /*Gives a score on the relations between two lists of topics; simmilar to the inner product*/ 134 | int ots_topic_list_score( 135 | const GList *topic_list1, 136 | const GList *topic_list2) 137 | { 138 | int count=0; 139 | GList *tmplist1; 140 | GList *tmplist2; 141 | 142 | if (!(topic_list1)) return 0; 143 | if (!(topic_list2)) return 0; 144 | 145 | tmplist1 = g_list_first(topic_list1); 146 | while(tmplist1) 147 | { 148 | tmplist2 = g_list_first(topic_list2); 149 | while(tmplist2) 150 | { 151 | 152 | if ((tmplist1->data)&&(tmplist2->data)&&(strlen(tmplist2->data)>1)) 153 | if (0==strncmp(tmplist1->data,tmplist2->data,OTS_MAX_TOPIC_WORD_SIZE)) 154 | {count++;} 155 | 156 | tmplist2 = g_list_next(tmplist2); 157 | } 158 | tmplist1 = g_list_next(tmplist1); 159 | } 160 | 161 | return count; 162 | } 163 | 164 | -------------------------------------------------------------------------------- /ext/ots/libots/grader-tc.c: -------------------------------------------------------------------------------- 1 | /* 2 | * grader-tc.c 3 | * 4 | * Copyright (C) 2003 Nadav Rotem 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published 
by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Library General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include "libots.h" 25 | 26 | 27 | #include "grader-tc.h" 28 | 29 | 30 | /*Grader - Term count algorithm*/ 31 | /*This is non-normelized term frequency algorithm without using inverse document frequency database */ 32 | 33 | #define NUM_KEY_WORDS 100 /* use first n key words only */ 34 | 35 | int 36 | ots_get_article_word_count (const OtsArticle * Doc) 37 | { 38 | GList *li; 39 | int articleWC; 40 | articleWC = 0; 41 | 42 | if (Doc==NULL) return 0; 43 | 44 | for (li = (GList *) Doc->lines; li != NULL; li = li->next) 45 | { 46 | articleWC += ((OtsSentence *) li->data)->wc; 47 | } 48 | 49 | return articleWC; 50 | } 51 | 52 | 53 | /*take this line and add each word to the "wordStat" list 54 | * this list will hold all of the words in the article and the number 55 | * of times they appeared in the article. 
56 | */ 57 | 58 | static void 59 | ots_line_add_wordlist(OtsArticle * Doc,const OtsSentence * aLine) 60 | { 61 | GList *li; 62 | if ((aLine==NULL) ||(NULL==Doc)) { return;} 63 | 64 | for (li = (GList *) aLine->words; li != NULL; li = li->next) /* for each word in the sentence Do: */ 65 | if (li->data && strlen (li->data)) ots_add_wordstat (Doc, (char *)li->data); 66 | 67 | return; 68 | } 69 | 70 | static void 71 | ots_create_wordlist(OtsArticle * Doc) 72 | { 73 | GList *line; 74 | if (Doc==NULL) return; 75 | 76 | for (line = (GList *) Doc->lines; line != NULL; line = line->next) 77 | { 78 | OtsSentence * aLine=line->data; 79 | if (aLine) 80 | ots_line_add_wordlist(Doc,aLine); 81 | } 82 | } 83 | 84 | 85 | 86 | 87 | static int 88 | keyVal (const int n) /* Ugly , I know */ 89 | { 90 | if (n == 1) return 3; 91 | if (n == 2) return 2; 92 | if (n == 3) return 2; 93 | if (n == 4) return 2; 94 | return 1; 95 | } 96 | 97 | 98 | static void 99 | ots_grade_line (GList *impList, OtsSentence * aLine, 100 | OtsStemRule * rule) 101 | { 102 | GList *li; 103 | GList *di; 104 | int n; 105 | char *tmp_stem; 106 | 107 | if ((aLine==NULL)||(rule==NULL)||(impList==NULL)) return; 108 | 109 | for (li = (GList *) aLine->words; li != NULL; li = li->next) /* for each word */ 110 | { 111 | n = 0; 112 | tmp_stem = ots_stem_strip ((unsigned char *) li->data, rule); 113 | 114 | for (di = (GList *) impList; 115 | ((di != NULL) && (n < NUM_KEY_WORDS)); di = di->next) 116 | { 117 | n++; 118 | if ((NULL!=((OtsWordEntery *) di->data)->stem) && (NULL!=tmp_stem)) 119 | if (0 == strcmp ((((OtsWordEntery *) di->data)->stem), tmp_stem)) 120 | { 121 | /* debug: 122 | if (0!=strcmp((((OtsWordEntery *) di->data)->word),li->data)) 123 | printf("[%s][%s] stem[%s]\n",(((OtsWordEntery *) di->data)->word),li->data,tmp);*/ 124 | 125 | aLine->score += (((OtsWordEntery *) di->data)->occ) * keyVal (n); 126 | } 127 | 128 | } 129 | 130 | g_free (tmp_stem); 131 | } 132 | 133 | } 134 | 135 | 136 | void 137 | 
ots_create_title_tc(OtsArticle * Doc) 138 | { 139 | 140 | char *tmp; 141 | char *word; 142 | int i; 143 | GString *title; 144 | if (NULL==Doc) return; 145 | 146 | title=g_string_new(NULL); 147 | 148 | for (i=0;i<5;i++) 149 | { 150 | word = ots_word_in_list(Doc->ImpWords,i); 151 | if (word) g_string_append(title,word); else break; 152 | if (i<4) g_string_append(title,","); 153 | } 154 | 155 | tmp=title->str; 156 | if (NULL!=title) g_string_free(title,FALSE); 157 | Doc->title=tmp; 158 | } 159 | 160 | 161 | void 162 | ots_grade_doc_tc (OtsArticle * Doc) 163 | { 164 | 165 | GList *li; 166 | if (NULL==Doc) return; 167 | ots_create_wordlist(Doc); 168 | 169 | 170 | Doc->ImpWords=ots_union_list (Doc->wordStat, Doc->dict); /* subtract from the Article wordlist all the words in the dic file (on , the , is...) */ 171 | Doc->ImpWords=ots_sort_list (Doc->ImpWords); /* sort the list , top 3 is what the article talks about (SARS , virus , cure ... ) */ 172 | 173 | /*to print wordlist: ots_print_wordlist (stdout, Doc->ImpWords);*/ 174 | 175 | if (0 == Doc->lineCount) return; 176 | 177 | for (li = (GList *) Doc->lines; li != NULL; li = li->next) 178 | { 179 | if (li->data) 180 | ots_grade_line (Doc->ImpWords, (OtsSentence *) li->data, Doc->stem); 181 | } 182 | 183 | 184 | ots_create_title_tc(Doc); 185 | } 186 | -------------------------------------------------------------------------------- /dictionaries/nl.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | "| 7 | (| 8 | 9 | 10 | 11 | 12 | ."| 13 | ,"| 14 | .| 15 | ,| 16 | "| 17 | )| 18 | ?| 19 | :| 20 | ;| 21 | !| 22 | 23 | 24 | 25 | 26 | 27 | wrote|write 28 | came|come 29 | went|go 30 | 31 | 32 | 33 | before1|1after 34 | 35 |
 36 |        before1|1after
 37 |    
38 | 39 | 40 | 41 | wrote|write 42 | 43 | 44 | 45 | assist|help 46 | 47 | 48 |
49 | 50 | 51 | ." 52 | ?" 53 | !" 54 | ," 55 | . 56 | ? 57 | ; 58 | | 59 | ! 60 | 61 | 62 | 63 | Dr. 64 | Mr. 65 | Mrs. 66 | U.S. 67 | Rep. 68 | Sen. 69 | 70 | 71 | 72 | 000 73 | aan 74 | achter 75 | af 76 | al 77 | als 78 | altijd 79 | andere 80 | anders 81 | ben 82 | bent 83 | bij 84 | boven 85 | bovendien 86 | daar 87 | daarom 88 | dan 89 | dat 90 | de 91 | deed 92 | desondanks 93 | deze 94 | die 95 | dit 96 | doe 97 | doen 98 | door 99 | dus 100 | echter 101 | een 102 | één 103 | en 104 | er 105 | ga 106 | gaan 107 | gaat 108 | geen 109 | gekund 110 | genoeg 111 | gij 112 | ging 113 | gisteren 114 | haar 115 | had 116 | hadden 117 | hare 118 | heb 119 | hebben 120 | heeft 121 | hem 122 | hen 123 | het 124 | hier 125 | hij 126 | hoe 127 | hoeveel 128 | hoewel 129 | hun 130 | hunne 131 | ik 132 | in 133 | is 134 | ja 135 | je 136 | jij 137 | jou 138 | jouw 139 | jouwe 140 | jullie 141 | kan 142 | kom 143 | komen 144 | komt 145 | kon 146 | konden 147 | kun 148 | kunnen 149 | kunt 150 | kwam 151 | langs 152 | maak 153 | maakt 154 | maakte 155 | maar 156 | maken 157 | me 158 | meer 159 | meest 160 | meestal 161 | met 162 | mij 163 | mijn 164 | mijne 165 | minstens 166 | moeten 167 | mogen 168 | morgen 169 | na 170 | naar 171 | naast 172 | natuurlijk 173 | neen 174 | nergens 175 | niet 176 | nog 177 | nogal 178 | nooit 179 | noord 180 | nu 181 | of 182 | om 183 | omdat 184 | onder 185 | ons 186 | onze 187 | ooit 188 | ook 189 | op 190 | over 191 | overal 192 | soms 193 | steeds 194 | straks 195 | te 196 | tegen 197 | terug 198 | toe 199 | tot 200 | tussen 201 | u 202 | uit 203 | uw 204 | uwe 205 | van 206 | veel 207 | vind 208 | vinden 209 | vindt 210 | voor 211 | vooral 212 | vrijwel 213 | waar 214 | waarom 215 | waarop 216 | want 217 | waren 218 | was 219 | wat 220 | we 221 | weer 222 | wees 223 | weest 224 | wel 225 | wie 226 | wij 227 | word 228 | worden 229 | wordt 230 | zal 231 | ze 232 | zelf 233 | zich 234 | zij 235 | zijn 236 | zijne 237 | zo 238 | zonder 
239 | zou 240 | zouden 241 | zoveel 242 | zullen 243 | zult 244 | 245 |
246 | -------------------------------------------------------------------------------- /ext/ots/libots/libots.h: -------------------------------------------------------------------------------- 1 | /* 2 | * libots.h 3 | * 4 | * Copyright (C) 2003 Nadav Rotem 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Library General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
19 | */ 20 | 21 | #ifndef HAVE_LIBOTS_H 22 | #define HAVE_LIBOTS_H 23 | 24 | #include 25 | 26 | G_BEGIN_DECLS 27 | 28 | typedef struct 29 | { /* the Term Frequency data structure */ 30 | char* word; 31 | double tf; /*term frequency; presumably also holds the idf value passed to ots_new_OtsWordTF - TODO confirm*/ 32 | } OtsWordTF; 33 | 34 | 35 | typedef struct 36 | { 37 | /*each member is a GList of char* */ 38 | GList *RemovePre; /* (a|b) replace string a with b */ 39 | GList *RemovePost; 40 | GList *step1_pre; 41 | GList *step1_post; 42 | 43 | GList *synonyms; 44 | GList *manual; 45 | 46 | GList *ParserBreak; 47 | GList *ParserDontBreak; 48 | 49 | 50 | /*to be implemented*/ 51 | GList *ReplaceChars; 52 | 53 | } OtsStemRule; 54 | 55 | 56 | typedef struct 57 | { 58 | GList *words; /* a GList of words (char*) */ 59 | glong score; /*score set by the grader*/ 60 | gboolean selected; /*is selected?*/ 61 | gint wc; /*word count*/ 62 | void *user_data; /*pointer to the original sentence, or maybe a serial number*/ 63 | } OtsSentence; 64 | 65 | 66 | typedef struct 67 | { 68 | GList *lines; /* a GList of sentences (OtsSentence) */ 69 | gint lineCount; /*lines in the text*/ 70 | char *title; /*title, auto generated*/ 71 | 72 | OtsStemRule *stem; /*stemming & parsing rules*/ 73 | 74 | /*Term Frequency grader*/ 75 | GList *tf_terms; 76 | GList *idf_terms; 77 | 78 | 79 | /*Term Count grader*/ 80 | GList *dict; /* dictionary from xml*/ 81 | GList *wordStat; /* a wordlist of all words in the article and their occurrence counts */ 82 | GList *ImpWords; /*important words - for term count grader*/ 83 | 84 | 85 | } OtsArticle; 86 | 87 | 88 | OtsArticle *ots_new_article (void); 89 | void ots_free_article (OtsArticle *art); 90 | 91 | /*parser*/ 92 | void ots_parse_file (FILE * stream, OtsArticle * Doc); /*file input */ 93 | void ots_parse_stream(const unsigned char *utf8 , size_t len ,OtsArticle *Doc); /*parse UTF-8 text held in memory*/ 94 | 95 | OtsSentence *ots_append_line (OtsArticle * Doc); 96 | void ots_append_word (OtsSentence * aLine,unsigned const char *aWord); 97 | void ots_add_wordstat
(OtsArticle * Doc,unsigned const char *wordString); 98 | 99 | 100 | /*dictionary*/ 101 | gboolean ots_load_xml_dictionary (OtsArticle * Doc, const char *name); 102 | 103 | int ots_get_article_word_count (const OtsArticle * Doc); 104 | 105 | 106 | /*grader*/ 107 | void ots_highlight_doc (OtsArticle * Doc, int percent); /*example: 20%*/ 108 | void ots_highlight_doc_lines (OtsArticle * Doc, int lines); /*example: 10 lines*/ 109 | void ots_highlight_doc_words (OtsArticle * Doc, int words); /*example: 50 words*/ 110 | 111 | void ots_grade_doc (OtsArticle * Doc); 112 | 113 | void ots_free_OtsWordTF(OtsWordTF *obj); /*todo: put in .h file*/ 114 | OtsWordTF* ots_new_OtsWordTF(const char* word,const double idf); 115 | 116 | 117 | /*HTML output*/ 118 | void ots_print_HTML (FILE * stream, const OtsArticle * Doc); 119 | unsigned char *ots_get_doc_HTML (const OtsArticle * Doc, size_t * out_len); 120 | 121 | /*TEXT output*/ 122 | void ots_print_doc (FILE * stream, const OtsArticle * Doc); 123 | unsigned char *ots_get_doc_text (const OtsArticle * Doc, size_t * out_len); 124 | 125 | 126 | /*Plugin writing*/ 127 | unsigned char* ots_get_line_text (const OtsSentence *aLine, gboolean only_if_selected, size_t *out_size); 128 | gboolean ots_is_line_selected(const OtsSentence *aLine); 129 | 130 | /*Stemming support*/ 131 | OtsStemRule *new_stem_rule(void); 132 | void free_stem_rule (OtsStemRule *rule); 133 | unsigned char * ots_stem_strip (unsigned const char * aWord, const OtsStemRule *rule); /*returns newly allocated string with the root of the word*/ 134 | unsigned char *ots_stem_format (unsigned const char *aWord, const OtsStemRule * rule); /*Remove leading spaces, commas, colons, etc.
*/ 135 | 136 | /*Relations between texts*/ 137 | 138 | /*Returns the number of topics that two blocks of text share*/ 139 | int ots_text_relations( 140 | const unsigned char *text1,const unsigned char *lang_code1, 141 | const unsigned char *text2,const unsigned char *lang_code2,const int topic_num); 142 | 143 | /*For a given text, return the list of the topics*/ 144 | char* ots_text_topics(const unsigned char *text,const unsigned char *lang_code,int topic_num); 145 | 146 | 147 | /*For a given text, return the list of the stemmed topics*/ 148 | GList* ots_text_stem_list(const unsigned char *text,const unsigned char *lang_code,int topic_num); 149 | 150 | 151 | /*Gives a score on the relations between two lists of topics; similar to the inner product*/ 152 | int ots_topic_list_score(const GList *topic_list1,const GList *topic_list2); 153 | 154 | G_END_DECLS 155 | 156 | 157 | 158 | #endif /* HAVE_LIBOTS_H */ 159 | -------------------------------------------------------------------------------- /dictionaries/sv.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
 34 |        before1|1after
 35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | all 62 | alla 63 | allestädes 64 | allra 65 | alls 66 | allt 67 | alltför 68 | alltid 69 | allting 70 | alltjämt 71 | alltmer 72 | alltnog 73 | alltsammans 74 | alltså 75 | annorlunda 76 | ar 77 | att 78 | av 79 | bara 80 | bland 81 | blev 82 | bli 83 | blir 84 | blivit 85 | de 86 | dem 87 | den 88 | denna 89 | densamme 90 | dess 91 | dessa 92 | dessförinnan 93 | det 94 | detta 95 | dig 96 | dit 97 | dittills 98 | dock 99 | du 100 | 101 | där 102 | däremot 103 | därför 104 | eftersom 105 | ej 106 | eller 107 | emedan 108 | emellan 109 | emellanåt 110 | emellertid 111 | en 112 | endast 113 | endera 114 | envar 115 | enär 116 | er 117 | ett 118 | fast 119 | fastän 120 | fick 121 | finnas 122 | flera 123 | flesta 124 | från 125 | 126 | får 127 | fåt 128 | förrän 129 | ha 130 | han 131 | heller 132 | henne 133 | hit 134 | hittills 135 | hitåt 136 | hon 137 | honom 138 | hur 139 | här 140 | i 141 | icke 142 | ifall 143 | ifrån 144 | igen 145 | igenom 146 | in 147 | ingen 148 | ingendera 149 | inget 150 | innan 151 | innanför 152 | inne 153 | ja 154 | jag 155 | jo 156 | kan 157 | kunde 158 | kunna 159 | kunnat 160 | man 161 | med 162 | medan 163 | mellan 164 | men 165 | mer 166 | mest 167 | mig 168 | mycket 169 | många 170 | måst 171 | måsta 172 | ned 173 | nedanför 174 | nedåt 175 | nej 176 | ni 177 | nu 178 | nyss 179 | någon 180 | någondera 181 | någonsin 182 | någonstans 183 | någonting 184 | några 185 | när 186 | och 187 | också 188 | om 189 | oss 190 | ovan 191 | ovanför 192 | ovanpå 193 | 194 | sedan 195 | senare 196 | sin 197 | själv 198 | ska 199 | skall 200 | skulle 201 | slags 202 | snart 203 | som 204 | somliga 205 | stundom 206 | 207 | sådan 208 | således 209 | sålunda 210 | såsom 211 | såvida 212 | såvitt 213 | sällan 214 | tack 215 | tillbaka 216 | tills 217 | upp 218 | ur 219 | ut 
220 | utan 221 | va 222 | vad 223 | var 224 | vara 225 | varandra 226 | varav 227 | vardera 228 | varenda 229 | varför 230 | varifrån 231 | varje 232 | vart 233 | vem 234 | vi 235 | vid 236 | vilja 237 | vilka 238 | vilken 239 | vilket 240 | vill 241 | åt 242 | åtskillig 243 | åtskilligt 244 | än 245 | ändå 246 | ännu 247 | äntligen 248 | är 249 | även 250 | ävensom 251 | ömsom 252 | över 253 | överallt 254 | 255 |
256 | -------------------------------------------------------------------------------- /dictionaries/nn.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
 34 |        before1|1after
 35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | er 62 | og 63 | det 64 | i 65 | som 66 | 67 | å 68 | til 69 | ein 70 | at 71 | med 72 | for 73 | har 74 | av 75 | han 76 | dei 77 | om 78 | ikkje 79 | eg 80 | eit 81 | du 82 | dette 83 | kan 84 | den 85 | men 86 | ei 87 | ho 88 | vil 89 | seg 90 | var 91 | eller 92 | denne 93 | òg 94 | ut 95 | noko 96 | når 97 | frå 98 | berre 99 | andre 100 | skal 101 | her 102 | 103 | me 104 | alle 105 | to 106 | sjølv 107 | 108 | ser 109 | desse 110 | kjem 111 | 112 | etter 113 | opp 114 | vere 115 | der 116 | over 117 | kva 118 | blir 119 | slik 120 | no 121 | 122 | so 123 | ha 124 | får 125 | hadde 126 | fram 127 | inn 128 | går 129 | litt 130 | mellom 131 | tek 132 | fleire 133 | heilt 134 | veldig 135 | meir 136 | vart 137 | enn 138 | vi 139 | ved 140 | mot 141 | meg 142 | nok 143 | nokre 144 | sjå 145 | sin 146 | både 147 | same 148 | sett 149 | alt 150 | første 151 | gjev 152 | heile 153 | mest 154 | mykje 155 | hans 156 | før 157 | gjer 158 | korleis 159 | seie 160 | deg 161 | finst 162 | gjennom 163 | siste 164 | vore 165 | kunne 166 | ulike 167 | viss 168 | 169 | bra 170 | en 171 | kor 172 | ta 173 | vera 174 | ned 175 | si 176 | sidan 177 | sitt 178 | henne 179 | kanskje 180 | altså 181 | ville 182 | likevel 183 | tid 184 | vel 185 | anna 186 | sine 187 | under 188 | bruk 189 | fekk 190 | kvar 191 | laga 192 | like 193 | nytt 194 | ting 195 | de 196 | heller 197 | rett 198 | utan 199 | faktisk 200 | hennar 201 | saman 202 | skulle 203 | slike 204 | tre 205 | medan 206 | annan 207 | difor 208 | igjen 209 | også 210 | oss 211 | føre 212 | kom 213 | måte 214 | sagt 215 | står 216 | bli 217 | rundt 218 | tida 219 | veit 220 | finn 221 | meiner 222 | ofte 223 | gjere 224 | alltid 225 | ganske 226 | held 227 | lett 228 | elles 229 | sjølvsagt 230 | synest 231 | gjera 232 | ligg 233 | seinare 234 | 
styrer 235 | begge 236 | kvart 237 | bruke 238 | kome 239 | lite 240 | bruka 241 | jo 242 | din 243 | neste 244 | store 245 | fire 246 | fått 247 | kort 248 | la 249 | lenge 250 | hos 251 | nokon 252 | bør 253 | beste 254 | kven 255 | derfor 256 | nemleg 257 | slags 258 | treng 259 | grunn 260 | klart 261 | min 262 | blant 263 | 264 |
265 | -------------------------------------------------------------------------------- /dictionaries/hu.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
 34 |        before1|1after
 35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | a 62 | addig 63 | ám 64 | annak 65 | annyi 66 | arra 67 | át 68 | attól 69 | az 70 | azért 71 | azok 72 | be 73 | bele 74 | belé 75 | beléd 76 | beléjük 77 | belém 78 | belénk 79 | belétek 80 | belőle 81 | belőled 82 | belőlem 83 | belőletek 84 | belőlük 85 | belőlünk 86 | benne 87 | benned 88 | bennem 89 | bennetek 90 | bennük 91 | bennünk 92 | csak 93 | de 94 | e 95 | eddig 96 | egy 97 | egyik 98 | el 99 | én 100 | engem 101 | ennek 102 | ennyi 103 | erre 104 | érte 105 | érted 106 | értem 107 | értetek 108 | értük 109 | értünk 110 | és 111 | év 112 | ez 113 | ezek 114 | ezért 115 | ezt 116 | fel 117 | fog 118 | föl 119 | ha 120 | hanem 121 | három 122 | hogy 123 | hol 124 | honnan 125 | hozzá 126 | hozzád 127 | hozzájuk 128 | hozzám 129 | hozzánk 130 | hozzátok 131 | ide 132 | igen 133 | ilyen 134 | is 135 | ismét 136 | itt 137 | 138 | kell 139 | két 140 | kettő 141 | ki 142 | kicsi 143 | kicsit 144 | kis 145 | le 146 | lehet 147 | lesz 148 | lett 149 | ma 150 | majdnem 151 | már 152 | más 153 | másik 154 | meddig 155 | meg 156 | még 157 | megint 158 | mellett 159 | mennyi 160 | merre 161 | mert 162 | mettől 163 | mi 164 | miért 165 | mikor 166 | milyen 167 | minden 168 | mindenki 169 | mindig 170 | minket 171 | most 172 | nagy 173 | nagyon 174 | nála 175 | nálad 176 | nálam 177 | nálatok 178 | náluk 179 | nálunk 180 | ne 181 | négy 182 | neked 183 | nekem 184 | neki 185 | nekik 186 | nektek 187 | nekünk 188 | nem 189 | ő 190 | oda 191 | ők 192 | őket 193 | olyan 194 | ön 195 | önbe 196 | önben 197 | önbol 198 | önért 199 | önhöz 200 | önnek 201 | önnel 202 | önnél 203 | önök 204 | önökbe 205 | önökben 206 | önökből 207 | önökért 208 | önöket 209 | önökhöz 210 | önökkel 211 | önöknek 212 | önöknél 213 | önökön 214 | önökre 215 | önökről 216 | önöktől 217 | önön 218 | önre 219 | önről 220 | 
önt 221 | öntől 222 | össze 223 | őt 224 | ott 225 | 226 | rád 227 | rajta 228 | rajtad 229 | rajtam 230 | rajtatok 231 | rajtuk 232 | rajtunk 233 | rájuk 234 | rám 235 | ránk 236 | rátok 237 | róla 238 | rólad 239 | rólam 240 | rólatok 241 | róluk 242 | rólunk 243 | rossz 244 | s 245 | se 246 | sem 247 | semmi 248 | senki 249 | soha 250 | sok 251 | stb 252 | szét 253 | talán 254 | te 255 | téged 256 | ti 257 | titeket 258 | tőle 259 | tőled 260 | tőlem 261 | tőletek 262 | tőlük 263 | tőlünk 264 | új 265 | újra 266 | vagy 267 | van 268 | vannak 269 | vele 270 | veled 271 | velem 272 | veletek 273 | velük 274 | velünk 275 | vissza 276 | volt 277 | voltak 278 | 279 | 280 |
281 | -------------------------------------------------------------------------------- /ext/ots/libots/wordlist.c: -------------------------------------------------------------------------------- 1 | /* 2 | * wordlist.c 3 | * 4 | * Copyright (C) 2003 Nadav Rotem 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Library General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | #include "libots.h" 26 | #include "grader-tc.h" 27 | 28 | /*word lists manipulations , mainly for grader-tc */ 29 | 30 | OtsWordEntery * 31 | ots_new_wordEntery_strip(unsigned const char *wordString,const OtsStemRule *rule) /*for real text use*/ 32 | { 33 | OtsWordEntery *aWord = g_new0 (OtsWordEntery, 1); 34 | aWord->occ = 1; 35 | aWord->word = ots_stem_format(wordString,rule); 36 | aWord->stem = ots_stem_strip(wordString,rule); 37 | return aWord; 38 | } 39 | 40 | OtsWordEntery * 41 | ots_new_wordEntery (unsigned const char *wordString) /*for dictionary use only, no formating here*/ 42 | { 43 | OtsWordEntery *aWord = g_new0 (OtsWordEntery, 1); 44 | aWord->occ = 1; 45 | aWord->word = g_strdup (wordString); 46 | aWord->stem = g_strdup (wordString); 47 | return aWord; 48 | } 49 | 50 | 51 | void 52 | ots_free_wordEntery (OtsWordEntery * WC) 53 | { 54 | if (WC != NULL) 55 | { 56 | if (NULL!=WC->word) g_free (WC->word); 57 | if (NULL!=WC->stem) g_free (WC->stem); 58 | g_free (WC); 59 | } 60 | } 61 | 62 | void 63 | ots_free_wordlist (GList * aList) 64 | { 65 | if (aList != NULL) 66 | { 67 | g_list_foreach(aList,(GFunc)ots_free_wordEntery , NULL); 68 | g_list_free(aList); 69 | } 70 | } 71 | 72 | OtsWordEntery * 73 | ots_copy_wordEntery (OtsWordEntery * obj) 74 | { 75 | OtsWordEntery *aWord; 76 | if (obj == NULL) { return NULL;} 77 | aWord = g_new (OtsWordEntery, 1); 78 | aWord->occ = obj->occ; 79 | aWord->word = g_strdup (obj->word); 80 | if (NULL!=obj->stem) 81 | {aWord->stem = g_strdup (obj->stem);} else {aWord->stem=NULL;} 82 | return aWord; 83 | } 84 | 85 | static int 86 | ots_sort_handler (OtsWordEntery * node1, OtsWordEntery * node2) 87 | { 88 | if (node1->occ > node2->occ) 89 | return -1; 90 | if (node1->occ < node2->occ) 91 | return 1; 92 | return 0; 93 | } 94 | 95 | GList * 96 | ots_sort_list (GList* aList) 97 | { 98 | GList *newList; 99 | newList = g_list_sort (aList, (GCompareFunc) 
ots_sort_handler); /* sort article */ 100 | return newList; 101 | } 102 | 103 | GList * 104 | ots_union_list (const GList *aLst, const GList * bLst) 105 | { 106 | GList *li; 107 | GList *di; 108 | int insert; 109 | GList *newLst=NULL; 110 | 111 | for (li = (GList *) aLst; li != NULL; li = li->next) 112 | { 113 | insert = 1; 114 | for (di = (GList *) bLst; di != NULL; di = di->next) 115 | { 116 | if(( li->data) && (di->data) && (((OtsWordEntery *) li->data)->word) && (((OtsWordEntery *) di->data)->word)) /*all defined?*/ 117 | if (0 == g_strncasecmp ((((OtsWordEntery *) li->data)->word), /*fix me: unicode issue?*/ 118 | (((OtsWordEntery *) di->data)->word), 10)) 119 | insert = 0; /* if word in B */ 120 | 121 | } 122 | if (insert == 1) 123 | if ((li->data)) 124 | newLst = g_list_append (newLst,ots_copy_wordEntery ((OtsWordEntery *) li->data)); 125 | } 126 | 127 | return newLst; 128 | } 129 | 130 | 131 | char * 132 | ots_word_in_list (const GList *aList,const int index) /* return the String value of the n'th word */ 133 | { 134 | OtsWordEntery *obj = NULL; 135 | 136 | GList *item =(GList *)g_list_nth ((GList *)aList, index); 137 | if (item != NULL) obj = item->data; 138 | if (obj == NULL) 139 | { 140 | return NULL; 141 | } 142 | else 143 | return obj->word; 144 | } 145 | 146 | char * 147 | ots_stem_in_list (const GList *aList,const int index) /* return the String value of stem of the n'th word */ 148 | { 149 | OtsWordEntery *obj = NULL; 150 | 151 | GList *item =(GList *)g_list_nth ((GList *)aList, index); 152 | if (item != NULL) obj = item->data; 153 | if (obj == NULL) 154 | { 155 | return NULL; 156 | } 157 | else 158 | return obj->stem; 159 | } 160 | 161 | /*Adds a word to the word count of the article*/ 162 | void 163 | ots_add_wordstat (OtsArticle * Doc, 164 | unsigned const char *wordString) 165 | { 166 | GList *li; 167 | OtsWordEntery *stat; 168 | OtsStemRule * rule=Doc->stem; 169 | char *tmp = NULL; 170 | 171 | if (NULL==wordString) return; 172 | if (NULL==Doc) 
return; 173 | 174 | if (0==strlen(wordString)) return; 175 | if (0==strcmp(wordString," ")) return; 176 | if (0==strcmp(wordString,"\n")) return; 177 | if (0==strcmp(wordString,"\t")) return; 178 | 179 | if (wordString) 180 | tmp = ots_stem_strip (wordString, rule); 181 | 182 | for (li = (GList *) Doc->wordStat; li != NULL; li = li->next) /* search the word in current wordlist */ 183 | { 184 | if (li->data) 185 | if (0 == strcmp (tmp, ((OtsWordEntery *) li->data)->stem)) 186 | { 187 | ((OtsWordEntery *) li->data)->occ++; /* occurred in another place in the text now; */ 188 | g_free (tmp); 189 | 190 | /*printf for debug*/ 191 | /* 192 | if (0!=strcmp(((OtsWordEntery *) li->data)->word,wordString) ) 193 | printf("[%s]==[%s]\n",((OtsWordEntery *) li->data)->word,wordString); 194 | */ 195 | 196 | return; 197 | } 198 | } 199 | 200 | stat = ots_new_wordEntery_strip (wordString, rule); /* if not in list , Add stem it to the list */ 201 | if ((stat)) 202 | Doc->wordStat = g_list_prepend (Doc->wordStat, stat); 203 | g_free (tmp); 204 | return; 205 | } 206 | 207 | 208 | 209 | 210 | void 211 | ots_print_wordlist (FILE * stream, const GList * aList) 212 | { 213 | GList *li; 214 | for (li = (GList *) aList; li != NULL; li = li->next) 215 | fprintf (stream, "Word[%d][%s]\n", ((OtsWordEntery *) li->data)->occ, 216 | ((OtsWordEntery *) li->data)->word); 217 | } 218 | 219 | 220 | 221 | -------------------------------------------------------------------------------- /dictionaries/gl.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | 12 | ."| 13 | ,"| 14 | .| 15 | ,| 16 | "| 17 | )| 18 | ?| 19 | :| 20 | ;| 21 | !| 22 | 23 | 24 | 25 | 26 | 27 | wrote|write 28 | came|come 29 | went|go 30 | 31 | 32 | 33 | before1|1after 34 | 35 |
 36 |        before1|1after
 37 |    
38 |
39 | 40 | 41 | 42 | ." 43 | ?" 44 | !" 45 | ," 46 | . 47 | ? 48 | ; 49 | | 50 | ! 51 | 52 | 53 | 54 | Dr. 55 | Mr. 56 | Mrs. 57 | U.S. 58 | Rep. 59 | Sen. 60 | 61 | 62 | 63 | a 64 | acá 65 | acó 66 | actual 67 | actualmente 68 | adiante 69 | agás 70 | agora 71 | 72 | ainda 73 | algo 74 | alguén 75 | algun 76 | alguns 77 | algures 78 | ali 79 | aló 80 | ambos 81 | anterior 82 | anteriormente 83 | antes 84 | aparte 85 | apenas 86 | aquel 87 | aquela 88 | aquelas 89 | aqueles 90 | aqui 91 | aquilo 92 | as 93 | asi 94 | através 95 | baixo 96 | ben 97 | bon 98 | ca 99 | cada 100 | cal 101 | cando 102 | canto 103 | cedo 104 | co 105 | coa 106 | coas 107 | comigo 108 | como 109 | con 110 | connosco 111 | consigo 112 | contodo 113 | convosco 114 | cos 115 | cuxa 116 | cuxas 117 | cuxo 118 | cuxos 119 | de 120 | debe 121 | deber 122 | deberia 123 | deberíamos 124 | debes 125 | del 126 | dela 127 | delas 128 | deles 129 | demais 130 | desde 131 | despois 132 | di 133 | dicer 134 | diso 135 | dixemos 136 | dixo 137 | dous 138 | duas 139 | e 140 | é 141 | eis 142 | el 143 | ela 144 | elas 145 | eles 146 | en 147 | entón 148 | entre 149 | era 150 | érades 151 | éramos 152 | eran 153 | eras 154 | es 155 | esta 156 | está 157 | estaba 158 | estábades 159 | estábamos 160 | estaban 161 | estabas 162 | estades 163 | estamos 164 | están 165 | estar 166 | estás 167 | estive 168 | estiveche 169 | estivemos 170 | estiveron 171 | estivo 172 | estou 173 | etc 174 | eu 175 | excepto 176 | final 177 | foche 178 | foi 179 | fomos 180 | foran 181 | fostes 182 | fun 183 | ha 184 | ides 185 | iso 186 | isto 187 | logo 188 | máis 189 | máximo 190 | meu 191 | min 192 | miña 193 | moitas 194 | moito 195 | moitos 196 | nada 197 | nen 198 | nengun 199 | nengunha 200 | ninguén 201 | non 202 | nós 203 | nosa 204 | nosas 205 | noso 206 | nosos 207 | o 208 | obtén 209 | obter 210 | obtido 211 | obtivemos 212 | obtivo 213 | onde 214 | os 215 | outra 216 | outras 217 | outro 218 | outros 219 | par 220 | 
para 221 | parado 222 | parece 223 | parecer 224 | pensa 225 | pode 226 | poden 227 | poderia 228 | por 229 | primeira 230 | primeiro 231 | própria 232 | próprias 233 | próprio 234 | próprios 235 | que 236 | quen 237 | riba 238 | se 239 | ser 240 | sob 241 | sodes 242 | somos 243 | son 244 | sua 245 | tamén 246 | tan 247 | temos 248 | ten 249 | tendes 250 | teñen 251 | teño 252 | ter 253 | tes 254 | teu 255 | ti 256 | tiña 257 | tíñades 258 | tíñamos 259 | tiñan 260 | tiñas 261 | tiveche 262 | tivemos 263 | tiven 264 | tiveron 265 | tivestes 266 | tivo 267 | todo 268 | tu 269 | tua 270 | último 271 | un 272 | unha 273 | unhas 274 | unicamente 275 | uns 276 | vai 277 | vamos 278 | van 279 | várias 280 | vários 281 | vos 282 | vós 283 | vosa 284 | vosas 285 | voso 286 | vosos 287 | vou 288 | xamais 289 | 290 |
291 | -------------------------------------------------------------------------------- /dictionaries/yi.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | כ'| 8 | מ'| 9 | ס'| 10 | כ׳| 11 | מ׳| 12 | ס׳| 13 | 14 | 15 | 16 | 17 | ."| 18 | ,"| 19 | .| 20 | ,| 21 | "| 22 | )| 23 | ?| 24 | :| 25 | ;| 26 | !| 27 | 28 | 29 | 30 | 31 | געקומען|קום 32 | געװען|זײַן 33 | 34 | 35 | 36 | ן| 37 | ער| 38 | ע| 39 | ט| 40 | 41 |
 42 |        before1|1after
 43 |    
44 |
45 | 46 | 47 | 48 | ." 49 | ?" 50 | !" 51 | ," 52 | . 53 | ? 54 | ; 55 | | 56 | ! 57 | 58 | 59 | 60 | Dr. 61 | Mr. 62 | Mrs. 63 | U.S. 64 | Rep. 65 | Sen. 66 | 67 | 68 | 69 | ! 70 | ' 71 | , 72 | * 73 | - 74 | -- 75 | . 76 | 000 77 | ? 78 | | 79 | אַ 80 | אַװעק 81 | אַז 82 | אַזױ 83 | אַלע 84 | אַלעמאָל 85 | אַן 86 | אַנדער 87 | אַנדערע 88 | אַפֿילו 89 | אַצינד 90 | אַראָפּ 91 | אַרױס 92 | אַרױף 93 | אַרײַן 94 | אָבער 95 | אָדער 96 | אָט 97 | אָן 98 | אָפֿט 99 | און 100 | אונדזער 101 | אונדזערע 102 | אונטער 103 | איבער 104 | איז 105 | איך 106 | אים 107 | אין 108 | איצט 109 | איר 110 | אירע 111 | אפֿשר 112 | אױב 113 | אױך 114 | אױס 115 | אױף 116 | אױפֿן 117 | אײַער 118 | אײגן 119 | אײגענע 120 | אײגענער 121 | אײדער 122 | אײן 123 | אײנמאָל 124 | אײנס 125 | אַלײן 126 | באַקום 127 | באַקומט 128 | באַקומען 129 | ביז 130 | בין 131 | בלױז 132 | בעסער 133 | בשעת 134 | בײַ 135 | בײדע 136 | גוט 137 | גוטע 138 | גוטער 139 | גלײַך 140 | געדאַרפֿט 141 | געהאַט 142 | געזאָגט 143 | געזאָלט 144 | געטאָן 145 | געמאַכט 146 | געמוזט 147 | געמעגט 148 | געמײנט 149 | גענוג 150 | גענוצט 151 | געשטעלט 152 | געװאָלט 153 | מאָל 154 | מען 155 | נאָר 156 | אַז 157 | געװעזן 158 | נאָך 159 | געװען 160 | גײט 161 | גײען 162 | דאָ 163 | דאַרף 164 | דאָזיקע 165 | דאָזיקער 166 | דאָס 167 | דאָך 168 | דאָרט 169 | דו 170 | די 171 | דיר 172 | דיך 173 | דײַן 174 | דעם 175 | דעמאָלט 176 | דער 177 | דערפֿאַר 178 | דרײַ 179 | האָב 180 | האָבן 181 | האָט 182 | הער 183 | הײסט 184 | זאַך 185 | זאַכן 186 | זאָג 187 | זאָגן 188 | זאָל 189 | זי 190 | זיך 191 | זעט 192 | זעלביקע 193 | זעלביקער 194 | זען 195 | זײ 196 | זײַן 197 | זײַנען 198 | זענען 199 | זײער 200 | טאַקע 201 | טוט 202 | טאָן 203 | יאָ 204 | יעדער 205 | יעצט 206 | כּמעתּ 207 | לאָז 208 | לאָזט 209 | לאָמיך 210 | לאָמיר 211 | לעצט 212 | לעצטע 213 | לעצטער 214 | מאַכט 215 | מוז 216 | מיט 217 | מיך 218 | מיר 219 | מילא 220 | מעג 221 | מײַן 222 | מײנט 223 | נאָך 224 | נאָענט 225 | נאָענטע 226 | נאָר 227 | נו 228 | נוצט 229 | נוצן 230 | ניט 231 | ניצט 
232 | ניצן 233 | נישט 234 | נײן 235 | סך 236 | סײַ 237 | סײַדן 238 | עטלעכע 239 | עס 240 | עפּעס 241 | ער 242 | ערשט 243 | ערשטע 244 | ערשטער 245 | פֿאַר 246 | פֿאַרשידן 247 | פֿאַרשידענע 248 | פֿאַרװאָס 249 | פֿון 250 | פֿיר 251 | פֿרי 252 | פֿריִערדיק 253 | פֿריִערדיקע 254 | פֿריִערדיקער 255 | צו 256 | צום 257 | צי 258 | צװישן 259 | צװײ 260 | קומט 261 | קומעדיק 262 | קומעדיקע 263 | קומעדיקער 264 | קען 265 | קענען 266 | קײן 267 | רבֿ 268 | שטעל 269 | שטעלט 270 | שױן 271 | װאָלט 272 | װאָס 273 | װאָסער 274 | װוּ 275 | װי 276 | װידער 277 | װיל 278 | װילט 279 | װינציק 280 | װינציקער 281 | װײַטער 282 | װעג 283 | װעגן 284 | װעט 285 | װעלכער 286 | װעלן 287 | װעמען 288 | װעמענס 289 | װען 290 | װער 291 | װײַל 292 | 293 |
294 | -------------------------------------------------------------------------------- /ext/ots/ots.c: -------------------------------------------------------------------------------- 1 | #include "ots.h" 2 | #include 3 | #include 4 | #include 5 | 6 | static VALUE mOTS, cArticle; 7 | char *DICTIONARY_DIR; 8 | 9 | static void article_free(OtsArticle *article) { 10 | if (article) 11 | ots_free_article(article); 12 | } 13 | 14 | rb_encoding* article_encoding(VALUE self) { 15 | return rb_enc_from_index((int)rb_iv_get(self, "@encoding")); 16 | } 17 | 18 | VALUE article_allocate(VALUE klass) { 19 | OtsArticle *article = ots_new_article(); 20 | return Data_Wrap_Struct(klass, 0, article_free, article); 21 | } 22 | 23 | OtsArticle* article_handle(VALUE self) { 24 | OtsArticle *article = 0; 25 | Data_Get_Struct(self, OtsArticle, article); 26 | if (!article) 27 | rb_raise(rb_eArgError, "invalid OTS::Article instance"); 28 | return article; 29 | } 30 | 31 | void article_load_dictionary(OtsArticle *article, char *name) { 32 | if (!ots_load_xml_dictionary(article, name)) { 33 | rb_raise(rb_eLoadError, "Could not find dictionary file: %s", name); 34 | } 35 | } 36 | 37 | VALUE article_initialize(int argc, VALUE *argv, VALUE self) { 38 | VALUE text, options, language, dictionary = Qnil; 39 | OtsArticle *article = article_handle(self); 40 | 41 | rb_scan_args(argc, argv, "11", &text, &options); 42 | 43 | language = rb_str_new2("en"); 44 | 45 | if (TYPE(text) != T_STRING) 46 | rb_raise(rb_eArgError, "invalid +text+"); 47 | 48 | if (!NIL_P(options)) { 49 | if (TYPE(options) != T_HASH) 50 | rb_raise(rb_eArgError, "invalid +options+ hash"); 51 | 52 | dictionary = rb_hash_aref(options, ID2SYM(rb_intern("dictionary"))); 53 | language = rb_hash_aref(options, ID2SYM(rb_intern("language"))); 54 | } 55 | 56 | if (!NIL_P(dictionary)) 57 | article_load_dictionary(article, CSTRING(dictionary)); 58 | else 59 | article_load_dictionary(article, CSTRING(language)); 60 | 61 | 
ots_parse_stream(RSTRING_PTR(text), RSTRING_LEN(text), article); 62 | ots_grade_doc(article); 63 | 64 | rb_iv_set(self, "@encoding", (VALUE)rb_enc_get_index(text)); 65 | 66 | return self; 67 | } 68 | 69 | 70 | VALUE article_summary(OtsArticle *article, rb_encoding *encoding) { 71 | OtsSentence *sentence; 72 | 73 | GList *line_ptr = article->lines; 74 | VALUE summary = rb_ary_new(); 75 | 76 | while (line_ptr != NULL) { 77 | sentence = (OtsSentence *)line_ptr->data; 78 | 79 | if (sentence->selected) { 80 | size_t size; 81 | unsigned char* content = ots_get_line_text(sentence, TRUE, &size); 82 | 83 | VALUE line = rb_hash_new(); 84 | rb_hash_aset(line, ID2SYM(rb_intern("sentence")), rb_enc_str_new((char *)content, size, encoding)); 85 | rb_hash_aset(line, ID2SYM(rb_intern("score")), LONG2FIX(sentence->score)); 86 | rb_ary_push(summary, line); 87 | 88 | // reset this so subsequent calls work right. 89 | sentence->selected = FALSE; 90 | } 91 | 92 | line_ptr = g_list_next(line_ptr); 93 | } 94 | 95 | return summary; 96 | } 97 | 98 | VALUE article_summarize(VALUE self, VALUE options) { 99 | VALUE lines, percent; 100 | OtsArticle *article = article_handle(self); 101 | 102 | if (TYPE(options) != T_HASH) 103 | rb_raise(rb_eArgError, "expect an options hash"); 104 | 105 | lines = rb_hash_aref(options, ID2SYM(rb_intern("sentences"))); 106 | percent = rb_hash_aref(options, ID2SYM(rb_intern("percent"))); 107 | 108 | if (NIL_P(lines) && NIL_P(percent)) 109 | rb_raise(rb_eArgError, "expect +sentences+ or +percent+"); 110 | 111 | if (lines != Qnil) 112 | ots_highlight_doc_lines(article, NUM2INT(lines)); 113 | else 114 | ots_highlight_doc(article, NUM2INT(percent)); 115 | 116 | return article_summary(article, article_encoding(self)); 117 | } 118 | 119 | VALUE article_topics(VALUE self) { 120 | OtsArticle *article = article_handle(self); 121 | 122 | return 123 | article->title ? 
124 | rb_str_split(rb_enc_str_new2(article->title, article_encoding(self)), ",") : 125 | Qnil; 126 | } 127 | 128 | typedef struct { 129 | gchar *word; /* the word */ 130 | gchar *stem; /*stem of the word*/ 131 | gint occ; /* how many times have we seen this word in the text? */ 132 | } OtsWordEntry; 133 | 134 | 135 | VALUE article_keywords(VALUE self) { 136 | OtsArticle *article = article_handle(self); 137 | rb_encoding *encoding = article_encoding(self); 138 | 139 | VALUE words = rb_ary_new(); 140 | GList* word_ptr = article->ImpWords; 141 | 142 | while (word_ptr) { 143 | OtsWordEntry *data = (OtsWordEntry *)word_ptr->data; 144 | if (data && strlen(data->word) > 0) 145 | rb_ary_push(words, rb_enc_str_new2(data->word, encoding)); 146 | word_ptr = word_ptr->next; 147 | } 148 | 149 | return words; 150 | } 151 | 152 | VALUE ots_parse(int argc, VALUE *argv, VALUE self) { 153 | VALUE article = article_allocate(cArticle); 154 | article_initialize(argc, argv, article); 155 | return article; 156 | } 157 | 158 | VALUE ots_languages(VALUE self) { 159 | DIR *dir; 160 | struct dirent *entry; 161 | VALUE languages = rb_ary_new(); 162 | 163 | if ((dir = opendir(DICTIONARY_DIR))) { 164 | while ((entry = readdir(dir))) { 165 | // entry->d_type is not portable. 
166 | if (strstr(entry->d_name, ".xml")) 167 | rb_ary_push(languages, rb_str_new(entry->d_name, strlen(entry->d_name) - 4)); 168 | } 169 | } 170 | else { 171 | rb_raise(rb_eIOError, "unable to open dictionary directory: %s", strerror(errno)); 172 | } 173 | 174 | closedir(dir); 175 | return languages; 176 | } 177 | 178 | VALUE ots_set_dictionary_path(VALUE self, VALUE path) { 179 | char *string = CSTRING(path); 180 | if (DICTIONARY_DIR) 181 | free(DICTIONARY_DIR); 182 | 183 | DICTIONARY_DIR = (char *)malloc(strlen(string) + 2); 184 | sprintf(DICTIONARY_DIR, "%s/", string); 185 | return Qnil; 186 | } 187 | 188 | /* init */ 189 | 190 | void Init_ots(void) { 191 | mOTS = rb_define_module("OTS"); 192 | cArticle = rb_define_class_under(mOTS, "Article", rb_cObject); 193 | 194 | rb_define_method(cArticle, "initialize", RUBY_METHOD_FUNC(article_initialize), -1); 195 | rb_define_method(cArticle, "summarize", RUBY_METHOD_FUNC(article_summarize), 1); 196 | rb_define_method(cArticle, "topics", RUBY_METHOD_FUNC(article_topics), 0); 197 | rb_define_method(cArticle, "keywords", RUBY_METHOD_FUNC(article_keywords), 0); 198 | 199 | rb_define_module_function(mOTS, "parse", RUBY_METHOD_FUNC(ots_parse), -1); 200 | rb_define_module_function(mOTS, "languages", RUBY_METHOD_FUNC(ots_languages), 0); 201 | rb_define_module_function(mOTS, "set_dictionary_path", RUBY_METHOD_FUNC(ots_set_dictionary_path), 1); 202 | 203 | rb_define_alloc_func(cArticle, article_allocate); 204 | 205 | rb_define_const(mOTS, "VERSION", rb_str_new2(RUBY_OTS_VERSION)); 206 | DICTIONARY_DIR = 0; 207 | } 208 | -------------------------------------------------------------------------------- /dictionaries/he.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | "| 7 | (| 8 | 9 | 10 | 11 | 12 | ."| 13 | ,"| 14 | .| 15 | ,| 16 | "| 17 | )| 18 | ?| 19 | :| 20 | ;| 21 | !| 22 | 23 | 24 | 25 | 26 | | 27 | 28 | 29 | 30 | תי| 31 | ה| 32 | ו| 33 | ות| 34 | ים| 35 | 36 |
 37 |        לכש|
 38 |        ב|
 39 |        כש|
 40 |        ש|
 41 |        כ|
 42 |        ל|
 43 |        ה|
 44 |    
45 | 46 |
47 | 48 | 49 | 50 | ." 51 | ?" 52 | !" 53 | ," 54 | . 55 | ? 56 | ; 57 | | 58 | ! 59 | 60 | 61 | 62 | מר. 63 | דר. 64 | 65 | 66 | 67 | 68 | אותי 69 | פ 70 | מ 71 | ליד 72 | שלא 73 | שאני 74 | אחרי 75 | ואני 76 | אך 77 | להיות 78 | בה 79 | לאחר 80 | בין 81 | עוד 82 | האלה 83 | כאלה 84 | דברים 85 | בערך 86 | עליו 87 | בגלל 88 | מן 89 | ג 90 | ם 91 | ואף 92 | בו 93 | קודם 94 | מייד 95 | מיד 96 | במשך 97 | בה 98 | והם 99 | מכל 100 | ללא 101 | אחרי 102 | לפני 103 | שאחרי 104 | מעל 105 | מ 106 | ושם 107 | באותו 108 | באו 109 | לזה 110 | לכן 111 | ו 112 | ואת 113 | כלל 114 | למען 115 | והן 116 | והם 117 | שני 118 | הייתה 119 | ועל 120 | עוד 121 | לאחר 122 | בל 123 | אלא 124 | ניכר 125 | של 126 | כן 127 | לא 128 | כאן 129 | היינו 130 | ובלי 131 | היו 132 | ולא 133 | וכן 134 | כזה 135 | כמעט 136 | וגם 137 | גם 138 | בנוסף 139 | כול 140 | זה 141 | כמו 142 | בגלל 143 | היה 144 | יהי 145 | שניהם 146 | שניהן 147 | אבל 148 | יכול 149 | עלול 150 | עשה 151 | יעשה 152 | אפילו 153 | מעט 154 | הרבה 155 | כמה 156 | אז 157 | יש 158 | אין 159 | ראשון 160 | כזה 161 | לנו 162 | להם 163 | להן 164 | לי 165 | לו 166 | לה 167 | הוא 168 | היא 169 | שהוא 170 | שהיא 171 | שהם 172 | שהן 173 | הם 174 | הן 175 | שהיה 176 | אנחנו 177 | שלה 178 | שלו 179 | איך 180 | ככה 181 | אבל 182 | אני 183 | אם 184 | עם 185 | זה 186 | זו 187 | רק 188 | ועל 189 | ולא 190 | וכן 191 | אכן 192 | כמו 193 | בערך 194 | יותר 195 | פחות 196 | הכי 197 | שלי 198 | שלה 199 | שלהם 200 | שלנו 201 | לא 202 | כן 203 | על 204 | או 205 | וגם 206 | גם 207 | אחר 208 | אולי 209 | אבל 210 | נראה 211 | צריך 212 | אז 213 | כמה 214 | כמו 215 | משהו 216 | עדיין 217 | עד 218 | זה 219 | עד 220 | אנחנו 221 | הם 222 | הן 223 | מאוד 224 | היה 225 | יהיה 226 | דרך 227 | מה 228 | מי 229 | עם 230 | כן 231 | לא 232 | אמר 233 | אמרה 234 | שוב 235 | אף 236 | אחד 237 | בגלל 238 | כי 239 | גם 240 | לפני 241 | הכי 242 | מספיק 243 | כול 244 | למה 245 | מתחת 246 | את 247 | של 248 | על-ידי 249 | עם 250 | כדי 251 | בכך 252 | הן 253 | לפי 254 | על-פי 255 | 
לו 256 | לה 257 | מה 258 | אין 259 | כשזה 260 | כך 261 | כיוון 262 | זו 263 | וגם 264 | הכי 265 | ואלה 266 | ואלו 267 | הם 268 | לבין 269 | למה 270 | לכאורה 271 | כך 272 | משום 273 | זוהי 274 | כי 275 | וכי 276 | אם 277 | אכן 278 | לכך 279 | את 280 | כל 281 | וכל 282 | השני 283 | הראשון 284 | השלישי 285 | הוא 286 | ומי 287 | מאז 288 | אל 289 | על 290 | הזו 291 | הזה 292 | ל 293 | נ 294 | פעם 295 | אחת 296 | אותו 297 | ב 298 | ר 299 | שום 300 | ממש 301 | היכן 302 | בכל 303 | בכל 304 | היתה 305 | אשר 306 | הכל 307 | זאת 308 | מהם 309 | כזו 310 | כבר 311 | מנת 312 | שהיו 313 | אפשר 314 | יהיו 315 | נ 316 | אחד 317 | שלך 318 | שאתה 319 | אינו 320 | איננו 321 | בעיקר 322 | ואם 323 | ועם 324 | אזי 325 | בקשר 326 | איך 327 | באיזו 328 | באיזה 329 | שזה 330 | אליו 331 | אליהם 332 | וכמו 333 | 334 |
335 | -------------------------------------------------------------------------------- /dictionaries/de.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | "| 6 | (| 7 | 8 | 9 | 10 | 11 | ."| 12 | ,"| 13 | .| 14 | ,| 15 | "| 16 | )| 17 | ?| 18 | :| 19 | ;| 20 | !| 21 | 22 | 23 | 24 | 25 | wrote|write 26 | came|come 27 | went|go 28 | 29 | 30 | 31 | before1|1after 32 | 33 |
 34 |        before1|1after
 35 |    
36 |
37 | 38 | 39 | 40 | ." 41 | ?" 42 | !" 43 | ," 44 | . 45 | ? 46 | ; 47 | | 48 | ! 49 | 50 | 51 | 52 | Dr. 53 | Mr. 54 | Mrs. 55 | U.S. 56 | Rep. 57 | Sen. 58 | 59 | 60 | 61 | ab 62 | aber 63 | ähnlich 64 | aehnlich 65 | all 66 | alle 67 | allein 68 | alles 69 | als 70 | also 71 | am 72 | an 73 | andere 74 | anderes 75 | anstatt 76 | auch 77 | auf 78 | aus 79 | ausser 80 | ausserhalb 81 | bald 82 | bei 83 | beide 84 | beim 85 | bin 86 | bis 87 | bist 88 | bitte 89 | brauche 90 | brauchen 91 | braucht 92 | co 93 | da 94 | damit 95 | dann 96 | darf 97 | darüber 98 | darueber 99 | das 100 | daß 101 | dass 102 | dein 103 | deine 104 | dem 105 | den 106 | denen 107 | denke 108 | denken 109 | denkst 110 | der 111 | des 112 | dich 113 | die 114 | diese 115 | dieser 116 | dir 117 | doch 118 | dort 119 | drei 120 | du 121 | durch 122 | dürfen 123 | duerfen 124 | ehemalig 125 | eher 126 | ein 127 | eine 128 | einem 129 | einen 130 | einer 131 | eines 132 | einmal 133 | entlang 134 | er 135 | erhalt 136 | erhalten 137 | erste 138 | es 139 | etliche 140 | etwa 141 | etwas 142 | fahre 143 | fahren 144 | fahrt 145 | fast 146 | frau 147 | fuer 148 | für 149 | fuer 150 | geben 151 | gegen 152 | gegenüber 153 | gegenueber 154 | geh 155 | gehabt 156 | gehen 157 | geht 158 | gekonnt 159 | gelegen 160 | gelasse 161 | gelassen 162 | gelasst 163 | genug 164 | gerade 165 | gesagt 166 | gesetzt 167 | getan 168 | gewesen 169 | gibt 170 | gmbh 171 | gut 172 | guten 173 | gutes 174 | hab 175 | habe 176 | haben 177 | habt 178 | hast 179 | hat 180 | hatte 181 | häufig 182 | haeufig 183 | herr 184 | heute 185 | hier 186 | ich 187 | ihn 188 | ihr 189 | im 190 | immer 191 | in 192 | initiale 193 | irgend 194 | irgendein 195 | ist 196 | ja 197 | jede 198 | jeden 199 | jeder 200 | jedes 201 | jedoch 202 | jemand 203 | jetzt 204 | kann 205 | kein 206 | keine 207 | keinen 208 | kenne 209 | kennen 210 | kennst 211 | kennt 212 | klein 213 | kleiner 214 | komm 215 | kommen 216 | kommt 217 | können 218 | 
koennen 219 | konnte 220 | lag 221 | letztes 222 | liegen 223 | los 224 | mache 225 | machen 226 | machst 227 | macht 228 | mag 229 | man 230 | manchmal 231 | mann 232 | mehr 233 | mein 234 | meisten 235 | mich 236 | mir 237 | mit 238 | möglicherweise 239 | moeglicherweise 240 | muss 241 | müssen 242 | muessen 243 | musste 244 | nach 245 | nächst 246 | naechst 247 | nahe 248 | nein 249 | nicht 250 | nichts 251 | nie 252 | niemand 253 | noch 254 | nur 255 | oberhalb 256 | oder 257 | oft 258 | ohne 259 | ok 260 | okay 261 | per 262 | sache 263 | sachen 264 | sagen 265 | sagt 266 | satz 267 | schon 268 | sehe 269 | sehen 270 | sehr 271 | seid 272 | seiht 273 | sein 274 | seine 275 | seiner 276 | seit 277 | selbar 278 | selben 279 | selbst 280 | selten 281 | sich 282 | sie 283 | sind 284 | sitzen 285 | so 286 | sobald 287 | sollt 288 | sollte 289 | sollten 290 | sowie 291 | tat 292 | trotz 293 | tue 294 | tun 295 | tust 296 | tut 297 | über 298 | ueber 299 | um 300 | und 301 | uns 302 | unser 303 | unten 304 | unter 305 | unterhalb 306 | unterschiedlich 307 | viel 308 | viele 309 | vier 310 | von 311 | vor 312 | vorher 313 | während 314 | waehrend 315 | wann 316 | war 317 | warum 318 | was 319 | wegen 320 | weil 321 | weise 322 | welche 323 | welchem 324 | wem 325 | wen 326 | wenige 327 | wenn 328 | wer 329 | werde 330 | werden 331 | wessen 332 | wie 333 | wieder 334 | will 335 | willst 336 | wir 337 | wird 338 | wirklich 339 | wirst 340 | wissen 341 | wo 342 | wollen 343 | wurde 344 | z.b. 345 | zu 346 | zuerst 347 | zum 348 | zur 349 | zurück 350 | zurueck 351 | zwei 352 | zwischen 353 | 354 |
355 | -------------------------------------------------------------------------------- /dictionaries/es.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | "| 8 | (| 9 | 10 | 11 | 12 | 13 | ."| 14 | ,"| 15 | .| 16 | ,| 17 | "| 18 | )| 19 | ?| 20 | :| 21 | ;| 22 | !| 23 | 24 | 25 | 26 | 27 | wrote|write 28 | came|come 29 | went|go 30 | 31 | 32 | 33 | 34 | before1|1after 35 | 36 |
 37 |        before1|1after
 38 |    
39 | 40 | 41 | 42 | wrote|write 43 | 44 | 45 | 46 | assist|help 47 | 48 | 49 |
50 | 51 | 52 | ." 53 | ?" 54 | !" 55 | ," 56 | . 57 | ? 58 | ; 59 | | 60 | ! 61 | 62 | 63 | 64 | Dr. 65 | Mr. 66 | Mrs. 67 | U.S. 68 | Rep. 69 | Sen. 70 | 71 | 72 | 73 | a 74 | acá 75 | además 76 | adiós 77 | afuera 78 | ahí 79 | ahora 80 | al 81 | algo 82 | alguien 83 | algún 84 | alguno 85 | algunos 86 | alguna 87 | algunas 88 | allá 89 | allí 90 | alrededor 91 | ambos 92 | antes 93 | apenas 94 | aquel 95 | aquél 96 | aquello 97 | aquellos 98 | aquella 99 | aquellas 100 | aquí 101 | arriba 102 | así 103 | aun 104 | aún 105 | aunque 106 | ayer 107 | bajo 108 | bajos 109 | baja 110 | bajas 111 | bien 112 | cada 113 | casi 114 | cerca 115 | cero 116 | como 117 | cómo 118 | con 119 | conmigo 120 | contigo 121 | contra 122 | cual 123 | cuál 124 | cuales 125 | cualquier 126 | cualquiera 127 | cuando 128 | cuándo 129 | cuanta 130 | cuantas 131 | cuanto 132 | cuantos 133 | cuánta 134 | cuántas 135 | cuánto 136 | cuántos 137 | cuya 138 | cuyas 139 | cuyo 140 | cuyos 141 | de 142 | deber 143 | decena 144 | del 145 | delante 146 | demás 147 | demasiada 148 | demasiadas 149 | demasiado 150 | demasiados 151 | dentro 152 | desde 153 | después 154 | detrás 155 | docena 156 | donde 157 | dónde 158 | dos 159 | durante 160 | e 161 | el 162 | él 163 | ella 164 | ellas 165 | ello 166 | ellos 167 | en 168 | encima 169 | entonces 170 | entre 171 | era 172 | erais 173 | éramos 174 | eran 175 | eras 176 | eres 177 | es 178 | esa 179 | esas 180 | ese 181 | ése 182 | eso 183 | esos 184 | esta 185 | está 186 | ésta 187 | estado 188 | están 189 | estar 190 | estas 191 | estás 192 | este 193 | éste 194 | estes 195 | esto 196 | estoy 197 | estuve 198 | estuvieron 199 | estuvo 200 | fue 201 | fuera 202 | fueron 203 | fui 204 | gusta 205 | gustan 206 | gustar 207 | gustas 208 | ha 209 | haber 210 | hacer 211 | hacia 212 | haga 213 | hagamos 214 | hagan 215 | hagas 216 | hago 217 | han 218 | has 219 | hasta 220 | hay 221 | he 222 | hecho 223 | hemos 224 | hizo 225 | hoy 226 | hube 227 | hubiera 
228 | hubo 229 | iba 230 | ibais 231 | ibamos 232 | iban 233 | ibas 234 | incluso 235 | ir 236 | jamás 237 | juntos 238 | la 239 | las 240 | le 241 | les 242 | lo 243 | los 244 | luego 245 | más 246 | me 247 | menos 248 | mi 249 | 250 | mía 251 | mientras 252 | mío 253 | muy 254 | nada 255 | nadie 256 | ni 257 | ningún 258 | ninguna 259 | ningunas 260 | ninguno 261 | ningunos 262 | no 263 | nos 264 | nosotros 265 | nuestra 266 | nuestras 267 | nuestro 268 | nuestros 269 | nunca 270 | o 271 | obstante 272 | otra 273 | otras 274 | otro 275 | otros 276 | para 277 | pero 278 | poder 279 | por 280 | porque 281 | primer 282 | primera 283 | primeras 284 | primero 285 | primeros 286 | pronto 287 | propia 288 | propias 289 | propio 290 | propios 291 | pude 292 | pues 293 | que 294 | qué 295 | quien 296 | quién 297 | quienes 298 | quiénes 299 | quizá 300 | quizás 301 | reciente 302 | se 303 | según 304 | segunda 305 | segundo 306 | ser 307 | si 308 | 309 | siempre 310 | sino 311 | siquiera 312 | sobre 313 | sois 314 | somos 315 | son 316 | sos 317 | soy 318 | su 319 | sus 320 | suya 321 | suyas 322 | suyo 323 | suyos 324 | tal 325 | también 326 | tampoco 327 | tan 328 | tanta 329 | tantas 330 | tanto 331 | tantos 332 | te 333 | ten 334 | tener 335 | ti 336 | todavía 337 | toda 338 | todas 339 | todo 340 | todos 341 | tras 342 | través 343 | tu 344 | 345 | tuve 346 | tuvo 347 | tuya 348 | tuyas 349 | tuyo 350 | tuyos 351 | u 352 | un 353 | una 354 | unas 355 | única 356 | único 357 | uno 358 | unos 359 | usted 360 | ustedes 361 | vais 362 | vos 363 | vosotros 364 | voy 365 | y 366 | ya 367 | yo 368 | 369 |
370 | -------------------------------------------------------------------------------- /dictionaries/pt.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | "| 7 | (| 8 | 9 | 10 | 11 | 12 | ."| 13 | ,"| 14 | .| 15 | ,| 16 | "| 17 | )| 18 | ?| 19 | :| 20 | ;| 21 | !| 22 | 23 | 24 | 25 | 26 | wrote|write 27 | came|come 28 | went|go 29 | 30 | 31 | 32 | before1|1after 33 | 34 |
 35 |        before1|1after
 36 |    
37 |
38 | 39 | 40 | 41 | ." 42 | ?" 43 | !" 44 | ," 45 | . 46 | ? 47 | ; 48 | | 49 | ! 50 | 51 | 52 | 53 | Dr. 54 | Mr. 55 | Mrs. 56 | U.S. 57 | Rep. 58 | Sen. 59 | 60 | 61 | 62 | 000 63 | a 64 | abaixo 65 | acerca 66 | acima 67 | adiante 68 | agora 69 | ah 70 | ah-ah 71 | ai 72 | ainda 73 | além 74 | algo 75 | alguém 76 | algum 77 | alguns 78 | algures 79 | alhures 80 | ali 81 | alô 82 | ambos 83 | anterior 84 | anteriormente 85 | antes 86 | apart 87 | apenas 88 | aqueles 89 | aqui 90 | aquilo 91 | as 92 | assim 93 | através 94 | atual 95 | atualmente 96 | been 97 | bem 98 | bom 99 | 100 | cada 101 | caminho 102 | causa 103 | cedo 104 | chamada 105 | chamado 106 | co 107 | coisas 108 | colocado 109 | colocar 110 | com 111 | como 112 | contudo 113 | couldn 114 | cujo 115 | d 116 | de 117 | define 118 | deixar 119 | dela 120 | dele 121 | deles 122 | demais 123 | demasiadamente 124 | depois 125 | depressa 126 | desde 127 | desligado 128 | deve 129 | deveria 130 | didn 131 | diferente 132 | directamente 133 | disse 134 | disso 135 | dito 136 | diz 137 | doesn 138 | dois 139 | don 140 | e 141 | é 142 | eis 143 | ela 144 | elas 145 | ele 146 | eles 147 | Eles 148 | eles 149 | em 150 | enquanto 151 | Enquanto 152 | então 153 | entre 154 | era 155 | eram 156 | éramos 157 | eras 158 | éreis 159 | és 160 | esta 161 | está 162 | estais 163 | estamos 164 | estão 165 | estar 166 | estás 167 | estava 168 | estavam 169 | estávamos 170 | estavas 171 | estáveis 172 | este 173 | estes 174 | esteve 175 | estive 176 | estivemos 177 | estiveram 178 | estiveste 179 | estou 180 | etc 181 | eu 182 | excepcionalmente 183 | excepto 184 | exceto 185 | faz 186 | fazer 187 | feito 188 | fez 189 | final 190 | finalizado 191 | foi 192 | fomos 193 | fora 194 | foram 195 | foste 196 | fostes 197 | frequente 198 | fui 199 | 200 | i.e 201 | ides 202 | inicial 203 | ir 204 | isn 205 | isso 206 | it's 207 | itself 208 | 209 | jamais 210 | ligado 211 | ll 212 | logo 213 | m 214 | mais 215 | mas 216 | 
máximo 217 | menor 218 | menos 219 | mesmo 220 | meu 221 | minha 222 | muito 223 | muitos 224 | must 225 | nada 226 | não 227 | nele 228 | nem 229 | nisso 230 | nither 231 | nos 232 | Nos 233 | nós 234 | nossa 235 | nosso 236 | novamente 237 | nt 238 | o 239 | obter 240 | obtido 241 | Oh 242 | ok 243 | okay 244 | onde 245 | ora 246 | os 247 | ou 248 | outra 249 | outrem 250 | outro 251 | outrora 252 | outrossim 253 | par 254 | para 255 | parada 256 | parado 257 | parece 258 | pensa 259 | pensar 260 | pode 261 | podem 262 | podia 263 | por 264 | porém 265 | porque 266 | porquê 267 | pot 268 | pouco 269 | poucos 270 | precisa 271 | prefer 272 | preferia 273 | preferir 274 | primeiro 275 | própria 276 | próprio 277 | próximo 278 | qualquer 279 | Quando 280 | quando 281 | quase 282 | quatro 283 | que 284 | quem 285 | Quem 286 | quer 287 | re 288 | realmente 289 | repetir 290 | s 291 | sabe 292 | são 293 | se 294 | seguinte 295 | sem 296 | sempre 297 | sendo 298 | ser 299 | seu 300 | sim 301 | sob 302 | sobre 303 | sois 304 | somos 305 | sou 306 | Sr 307 | sua 308 | suficiente 309 | t 310 | tal 311 | Talvês 312 | talvez 313 | também 314 | tanto 315 | tão 316 | tem 317 | têm 318 | temos 319 | tendes 320 | tenho 321 | tens 322 | ter 323 | teu 324 | teve 325 | tinha 326 | tinham 327 | tínhamos 328 | tinhas 329 | tínheis 330 | tive 331 | tivemos 332 | tiveram 333 | tiveste 334 | tivestes 335 | to 336 | todo 337 | topo 338 | três 339 | tu 340 | tua 341 | tudo 342 | último 343 | um 344 | uma 345 | unicamente 346 | up 347 | us 348 | use 349 | vai 350 | vais 351 | vamos 352 | vão 353 | várias 354 | vários 355 | ve 356 | 357 | vem 358 | vistas 359 | você 360 | vocês 361 | vós 362 | vosso 363 | vou 364 | 365 |
366 | -------------------------------------------------------------------------------- /ext/ots/libots/stemmer.c: -------------------------------------------------------------------------------- 1 | /* 2 | * stemmer.c 3 | * 4 | * Copyright (C) 2003 Nadav Rotem 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Library General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
19 | */ 20 | 21 | 22 | #include 23 | #include 24 | #include 25 | #include "libots.h" 26 | 27 | #define MAX_PREFIX_SIZE 256 28 | 29 | OtsStemRule * 30 | new_stem_rule () 31 | { 32 | OtsStemRule *rule = g_new0 (OtsStemRule, 1); 33 | return rule; 34 | } 35 | 36 | void 37 | free_stem_rule (OtsStemRule *rule) 38 | { 39 | 40 | if (rule != NULL) 41 | { 42 | g_list_foreach (rule->RemovePre, (GFunc) g_free, NULL); 43 | g_list_free (rule->RemovePre); 44 | g_list_foreach (rule->RemovePost, (GFunc) g_free, NULL); 45 | g_list_free (rule->RemovePost); 46 | 47 | g_list_foreach (rule->step1_pre, (GFunc) g_free, NULL); 48 | g_list_free (rule->step1_pre); 49 | g_list_foreach (rule->step1_post, (GFunc) g_free, NULL); 50 | g_list_free (rule->step1_post); 51 | 52 | g_list_foreach (rule->synonyms, (GFunc) g_free, NULL); 53 | g_list_free (rule->synonyms); 54 | g_list_foreach (rule->manual, (GFunc) g_free, NULL); 55 | g_list_free (rule->manual); 56 | 57 | g_list_foreach (rule->ParserBreak, (GFunc) g_free, NULL); 58 | g_list_free (rule->ParserBreak); 59 | g_list_foreach (rule->ParserDontBreak, (GFunc) g_free, NULL); 60 | g_list_free (rule->ParserDontBreak); 61 | 62 | g_list_foreach (rule->ReplaceChars, (GFunc) g_free, NULL); 63 | g_list_free (rule->ReplaceChars); 64 | 65 | g_free (rule); 66 | } 67 | return; 68 | } 69 | 70 | static void 71 | ots_stem_break (unsigned const char *comp,unsigned char *part_a,unsigned char *part_b) /*given already alocated part_a and b */ 72 | { /*example "red|blue" */ 73 | int i, j, clen; 74 | i = 0; 75 | j = 0; 76 | 77 | if (comp==NULL) return; 78 | if (part_a==NULL) return; 79 | if (part_b==NULL) return; 80 | 81 | clen = strlen (comp); 82 | 83 | 84 | part_a[0] = 0; 85 | part_b[0] = 0; 86 | 87 | while ((i < clen) && (i < MAX_PREFIX_SIZE) && (comp[i] != '|')) 88 | { 89 | part_a[i] = comp[i]; 90 | i++; 91 | } 92 | part_a[i] = 0; 93 | 94 | i++; /*skip the | mark */ 95 | while (i < clen && (j < MAX_PREFIX_SIZE)) 96 | { 97 | part_b[j] = comp[i]; 98 | i++; 99 | j++; 
100 | } 101 | part_b[j] = 0; 102 | return; 103 | } 104 | 105 | 106 | static unsigned char * 107 | ots_stem_remove_pre (unsigned const char *aWord,unsigned const char *pre,unsigned const char *new) 108 | { 109 | int i, plen, wlen, nlen; 110 | unsigned char *new_str = NULL; 111 | 112 | if (aWord==NULL) return NULL; 113 | 114 | plen = strlen (pre); 115 | wlen = strlen (aWord); 116 | nlen = strlen (new); 117 | 118 | for (i = 0; i < plen; i++) 119 | if (aWord[i] != pre[i]) 120 | return NULL; /*no match */ 121 | 122 | new_str = g_new0 (char, wlen + nlen +5); 123 | for (i = 0; i <= nlen; i++) 124 | new_str[i] = new[i]; 125 | 126 | for (i = nlen; i <= nlen + wlen - plen; i++) 127 | new_str[i] = aWord[i + plen - nlen]; 128 | 129 | new_str[i + 1] = 0; 130 | return new_str; 131 | } 132 | 133 | 134 | 135 | static unsigned char * 136 | ots_stem_remove_post (unsigned const char *aWord,unsigned const char *post,unsigned const char *new) 137 | { 138 | unsigned int i, wlen, plen, nlen; 139 | unsigned char *new_str = NULL; 140 | 141 | if ((NULL==aWord)||(NULL==post)||(NULL==new)) return NULL; 142 | 143 | wlen = strlen (aWord); 144 | plen = strlen (post); 145 | nlen = strlen (new); 146 | 147 | if (plen>wlen) return NULL; 148 | 149 | 150 | for (i = 0; i < plen; i++) 151 | if (aWord[wlen - plen + i]!= post[i]) 152 | return NULL; /* no match */ 153 | 154 | new_str = g_new0 (char, wlen + nlen +5); 155 | 156 | for (i = 0; i <= wlen - plen; i++) /*place word */ 157 | new_str[i] = aWord[i]; 158 | 159 | for (i = 0; i <= nlen; i++) /*place newfix */ 160 | new_str[wlen - plen + i] = new[i]; 161 | 162 | return new_str; /*word replaced */ 163 | } 164 | 165 | 166 | 167 | static unsigned char * 168 | ots_stem_replace_word (unsigned const char *aWord,unsigned const char *old,unsigned const char *new) 169 | { 170 | 171 | if (aWord==NULL) return NULL; 172 | 173 | if ((aWord)&&(0 == strcmp (aWord, old))) 174 | { 175 | return g_strdup (new); 176 | } 177 | else 178 | { 179 | return NULL; 180 | } 181 | 
182 | } 183 | 184 | 185 | 186 | 187 | 188 | unsigned char * 189 | ots_stem_format (unsigned const char *aWord, const OtsStemRule * rule) 190 | { 191 | GList *li; 192 | unsigned char *rep = NULL; 193 | unsigned char *normWord = NULL; 194 | 195 | if (aWord==NULL) return NULL; 196 | 197 | normWord = g_utf8_strdown (aWord, -1); /*lowercase the word */ 198 | 199 | char *prefix; 200 | char *newfix; 201 | 202 | prefix = g_new0 (char, MAX_PREFIX_SIZE); 203 | newfix = g_new0 (char, MAX_PREFIX_SIZE); 204 | 205 | for (li = (GList *) rule->step1_pre; li != NULL; li = li->next) 206 | { 207 | ots_stem_break (li->data, prefix, newfix); 208 | rep = ots_stem_remove_pre (normWord, prefix, newfix); 209 | if (NULL != rep) 210 | { 211 | g_free (normWord); 212 | normWord = rep; 213 | rep = NULL; 214 | } 215 | } 216 | 217 | 218 | for (li = (GList *) rule->step1_post; li != NULL; li = li->next) 219 | { 220 | ots_stem_break (li->data, prefix, newfix); 221 | rep = ots_stem_remove_post(normWord, prefix, newfix); 222 | if (NULL != rep) 223 | { 224 | g_free (normWord); 225 | normWord = rep; 226 | rep = NULL; 227 | } 228 | } 229 | 230 | g_free (prefix); 231 | g_free (newfix); 232 | 233 | return normWord; 234 | } 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | unsigned char * 244 | ots_stem_strip (unsigned const char *aWord,const OtsStemRule * rule) 245 | { 246 | GList *li; 247 | unsigned char *rep = NULL; 248 | 249 | unsigned char *prefix; 250 | unsigned char *newfix; 251 | unsigned char *normWord=NULL; 252 | 253 | prefix = g_new0 (char, MAX_PREFIX_SIZE); 254 | newfix = g_new0 (char, MAX_PREFIX_SIZE); 255 | 256 | if (aWord==NULL) return NULL; 257 | 258 | normWord = ots_stem_format (aWord,rule); 259 | 260 | 261 | for (li = (GList *) rule->manual; li != NULL; li = li->next) 262 | { 263 | ots_stem_break (li->data, prefix, newfix); 264 | rep = ots_stem_replace_word (normWord, prefix, newfix); 265 | if (NULL != rep) 266 | { 267 | g_free (normWord); 268 | normWord = rep; 269 | rep = NULL; 270 | 
break; 271 | } 272 | } 273 | 274 | 275 | 276 | 277 | for (li = (GList *) rule->RemovePre; li != NULL; li = li->next) 278 | { 279 | ots_stem_break (li->data, prefix, newfix); 280 | rep = ots_stem_remove_pre (normWord, prefix, newfix); 281 | if (NULL != rep) 282 | { 283 | g_free (normWord); 284 | normWord = rep; 285 | rep = NULL; 286 | break; 287 | } 288 | } 289 | 290 | 291 | for (li = (GList *) rule->RemovePost; li != NULL; li = li->next) 292 | { 293 | ots_stem_break (li->data, prefix, newfix); 294 | rep = ots_stem_remove_post (normWord, prefix, newfix); 295 | if (NULL != rep) 296 | { 297 | g_free (normWord); 298 | normWord = rep; 299 | rep = NULL; 300 | break; 301 | } 302 | 303 | } 304 | 305 | 306 | for (li = (GList *) rule->synonyms; li != NULL; li = li->next) 307 | { 308 | ots_stem_break (li->data, prefix, newfix); 309 | rep = ots_stem_replace_word (normWord, prefix, newfix); 310 | if (NULL != rep) 311 | { 312 | g_free (normWord); 313 | normWord = rep; 314 | rep = NULL; 315 | break; 316 | } 317 | } 318 | 319 | 320 | g_free (prefix); 321 | g_free (newfix); 322 | 323 | 324 | if (strlen(normWord)<3) /*stem is two letter long. thats not right. N(eed)==N(ation) ?*/ 325 | { 326 | g_free(normWord); 327 | normWord = ots_stem_format (aWord,rule); /*lowercase the word */ 328 | } 329 | 330 | 331 | return normWord; 332 | } 333 | --------------------------------------------------------------------------------