├── models
│   ├── nob
│   │   ├── nob-nowac-tok.bin
│   │   └── nob-nowac-sent.bin
│   └── es
│       ├── opennlp-es-maxent-pos-es.bin
│       ├── opennlp-es-perceptron-pos-es.bin
│       ├── opennlp-es-maxent-pos-universal.bin
│       └── opennlp-es-perceptron-pos-universal.bin
├── lang
│   ├── es
│   │   └── pos
│   │       ├── download-conll.sh
│   │       ├── convert-data.sh
│   │       ├── README.txt
│   │       ├── train.sh
│   │       └── ner2pos.py
│   └── nob
│       └── nowac
│           └── nowacTrainReformat.pl
├── README
└── LICENSE

/models/nob/nob-nowac-tok.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/OpenNLP-Models/HEAD/models/nob/nob-nowac-tok.bin
--------------------------------------------------------------------------------
/models/nob/nob-nowac-sent.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/OpenNLP-Models/HEAD/models/nob/nob-nowac-sent.bin
--------------------------------------------------------------------------------
/models/es/opennlp-es-maxent-pos-es.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/OpenNLP-Models/HEAD/models/es/opennlp-es-maxent-pos-es.bin
--------------------------------------------------------------------------------
/models/es/opennlp-es-perceptron-pos-es.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/OpenNLP-Models/HEAD/models/es/opennlp-es-perceptron-pos-es.bin
--------------------------------------------------------------------------------
/models/es/opennlp-es-maxent-pos-universal.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/OpenNLP-Models/HEAD/models/es/opennlp-es-maxent-pos-universal.bin
--------------------------------------------------------------------------------
/models/es/opennlp-es-perceptron-pos-universal.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/utcompling/OpenNLP-Models/HEAD/models/es/opennlp-es-perceptron-pos-universal.bin
--------------------------------------------------------------------------------
/lang/es/pos/download-conll.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Downloads the Spanish data of the CoNLL 2002 shared task:
# http://www.lsi.upc.edu/~nlp/tools/nerc/nerc.html
#

destdir="es-conll"

# Refuse to overwrite an existing download.
if [ -d "$destdir" ]; then
    echo "Destination directory already exists: $destdir"
    exit 1
fi

mkdir "$destdir"
cd "$destdir" || exit 1
wget http://www.lsi.upc.edu/~nlp/tools/nerc/esp.train.gz
wget http://www.lsi.upc.edu/~nlp/tools/nerc/esp.testa.gz
wget http://www.lsi.upc.edu/~nlp/tools/nerc/esp.testb.gz
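
# For reference, a successful run leaves the three gzipped CoNLL 2002 files
# in place (an illustrative check; the exact wget output will vary):
#
#   $ ls es-conll/
#   esp.train.gz  esp.testa.gz  esp.testb.gz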
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
This project collects code and documentation for creating models for natural
language processing with the Apache OpenNLP Toolkit. The OpenNLP project
website is here:

http://incubator.apache.org/opennlp/

The primary reason for this project is that models based on restricted data
cannot be distributed through the Apache sites, and many of the corpora
available for training models must be obtained through some licensing
agreement. The other purpose is to make clear in which contexts (e.g.
academic or industrial) a given model may be used, so that the wishes of the
copyright holders of a given corpus are respected.

Ultimately, this project should replace the old model download site on
SourceForge, especially in ensuring that models are compatible with newer
versions of the OpenNLP code:

http://opennlp.sourceforge.net/models-1.5/
--------------------------------------------------------------------------------
/lang/es/pos/convert-data.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Converts the datasets from the CoNLL Spanish NER task [1] to the format used
# by OpenNLP's POS tagger trainer.
#
# It creates two versions of the datasets, one using the original tags and the
# other using the universal POS tags, as described at:
#
# http://code.google.com/p/universal-pos-tags/
#
# [1] http://www.lsi.upc.edu/~nlp/tools/nerc/nerc.html

mkdir -p data/pos-universal
mkdir -p data/pos-es

# Use the files esp.train and esp.testa as the training set and esp.testb as
# the test set.
zcat es-conll/esp.train.gz es-conll/esp.testa.gz | python ner2pos.py - > data/pos-es/es-train.pos
zcat es-conll/esp.testb.gz | python ner2pos.py - > data/pos-es/es-test.pos

zcat es-conll/esp.train.gz es-conll/esp.testa.gz | python ner2pos.py --universal - > data/pos-universal/es-train.pos
zcat es-conll/esp.testb.gz | python ner2pos.py --universal - > data/pos-universal/es-test.pos

echo "Created files:"
ls -1 data/pos-es/
ls -1 data/pos-universal/

echo "Done"
echo
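
# Each output file contains one sentence per line, written as word_TAG pairs.
# A constructed example (not an actual corpus line):
#
#   La_DA casa_NC blanca_AQ ._Fp
#
# and the same sentence converted with --universal:
#
#   La_DET casa_NOUN blanca_ADJ ._.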
--------------------------------------------------------------------------------
/lang/es/pos/README.txt:
--------------------------------------------------------------------------------

=================================
Spanish POS Tagger OpenNLP Models
=================================

These POS tagging models for Spanish were trained using the CoNLL 2002
data [1] and OpenNLP 1.5.2 [2].

There are two versions of each model, one per model type (perceptron and
maxent), and each also comes in a variant trained on the universal
part-of-speech tags [3].

The models are distributed under the same terms as the CoNLL data, so they
cannot be used for commercial purposes.


Creating the models
===================

1. Download the Spanish data of the CoNLL 2002 shared task [1]:
   ./download-conll.sh

2. Convert the data to the format required by OpenNLP:
   ./convert-data.sh

3. Train the models:
   ./train.sh

Evaluation results
==================

The following are the accuracies of the models on the esp.testb dataset:

Original tags
-------------

models/opennlp-es-perceptron-pos-es.bin
0.9606339070165875

models/opennlp-es-maxent-pos-es.bin
0.9629507047737715

Universal tags
--------------

models/opennlp-es-perceptron-pos-universal.bin
0.9611985047893467

models/opennlp-es-maxent-pos-universal.bin
0.9629117669963398


Contact information
===================

Please send your comments and questions to Juan Manuel Caicedo Carvajal:

http://cavorite.com/labs/nlp/opennlp-models-es/


[1] http://www.lsi.upc.edu/~nlp/tools/nerc/nerc.html
[2] http://incubator.apache.org/opennlp/
[3] http://code.google.com/p/universal-pos-tags/

--------------------------------------------------------------------------------
/lang/es/pos/train.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Trains the OpenNLP POS models for Spanish.
#
# This script requires that the training and testing data already exist in
# the following directories:
#
#   data/pos-es         (the original tagset included in the CoNLL data)
#   data/pos-universal  (the universal POS tags)
#

if [ -z "$OPENNLP_HOME" ]; then
    echo "OPENNLP_HOME environment variable is not defined."
    exit 1
fi

data_dirs="pos-es pos-universal"

for subd in $data_dirs; do
    d="data/$subd"
    if [ ! -f "$d/es-train.pos" ] || [ ! -f "$d/es-test.pos" ]; then
        echo "Directory does not contain training and testing data: $d"
        exit 1
    fi
done

outdir="models"
if [ ! -d "$outdir" ]; then
    mkdir "$outdir"
fi

# Number of training iterations
iters=200

opennlp=$OPENNLP_HOME/bin/opennlp

# Temporary log file for the evaluation output
log_eval=`mktemp --tmpdir opennlp_es_train.XXXXXX`

model_types="perceptron maxent"

for data_dir in $data_dirs; do
    train_path=data/$data_dir/es-train.pos
    test_path=data/$data_dir/es-test.pos

    for model_type in $model_types; do
        out_model="$outdir/opennlp-es-${model_type}-${data_dir}.bin"

        echo "Training model: $out_model"
        $opennlp POSTaggerTrainer -type $model_type -iterations $iters \
            -lang es -encoding utf-8 -data $train_path -model $out_model

        echo "Evaluating model"
        eval_results=`$opennlp POSTaggerEvaluator -encoding utf-8 \
            -data $test_path -model $out_model`

        eval_accuracy=`echo -e $eval_results | grep 'Accuracy:' | cut -d ':' -f 2`

        echo -e $eval_results
        echo
        echo "------------"
        echo

        echo -e "$out_model\n$eval_accuracy\n" >> $log_eval
    done
done

# Print the collected results and delete the temporary file
echo
echo "Evaluation results"
echo
cat $log_eval
rm $log_eval
echo
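
# A typical end-to-end invocation (a sketch; the OpenNLP install path is an
# assumption, not part of this repository):
#
#   export OPENNLP_HOME=/path/to/apache-opennlp-1.5.2
#   ./download-conll.sh && ./convert-data.sh && ./train.sh
#
# The trained models can then be tried with OpenNLP's command-line tagger,
# which reads one whitespace-tokenized sentence per line from stdin:
#
#   $OPENNLP_HOME/bin/opennlp POSTagger models/opennlp-es-maxent-pos-es.bin < sentences.txt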
--------------------------------------------------------------------------------
/lang/es/pos/ner2pos.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
'''
Converts a file in the CoNLL task format [1] to the format commonly used for
training POS taggers.

The script can also map the tags to the universal POS tags, as described at:

http://code.google.com/p/universal-pos-tags/

[1] http://www.lsi.upc.edu/~nlp/tools/nerc/nerc.html

'''
import sys
import argparse
import codecs

# Universal tags mapping table:
# http://code.google.com/p/universal-pos-tags/
# http://code.google.com/p/universal-pos-tags/source/browse/trunk/es-cast3lb.map
universal_tags = {
    'Fa': '.',
    'Fc': '.',
    'Fd': '.',
    'Fe': '.',
    'Fg': '.',
    'Fh': '.',
    'Fi': '.',
    'Fp': '.',
    'Fs': '.',
    'Fx': '.',
    'Fz': '.',
    'X': 'X',
    'Y': 'X',
    'Zm': 'NUM',
    'Zp': 'NUM',
    'AO': 'ADJ',
    'AQ': 'ADJ',
    'CC': 'CONJ',
    'CS': 'CONJ',
    'DA': 'DET',
    'DD': 'DET',
    'DE': 'DET',
    'DI': 'DET',
    'DN': 'DET',
    'DP': 'DET',
    'DT': 'DET',
    'I': 'X',
    'NC': 'NOUN',
    'NP': 'NOUN',
    'P0': 'PRON',
    'PD': 'PRON',
    'PE': 'PRON',
    'PI': 'PRON',
    'PN': 'PRON',
    'PP': 'PRON',
    'PR': 'PRON',
    'PT': 'PRON',
    'PX': 'PRON',
    'RG': 'ADV',
    'RN': 'ADV',
    'SN': 'ADP',
    'SP': 'ADP',
    'VA': 'VERB',
    'VM': 'VERB',
    'VS': 'VERB',
    'W': 'NUM',
    'Z': 'NUM'
}


def parse_lines(in_file, universal=False):
    '''
    Converts CoNLL lines (word, POS tag, NER tag) into one sentence per line
    of word_TAG pairs.
    '''
    sentence = []
    for line in in_file:
        parts = line.strip().split()

        if not parts and sentence:
            # Skip "sentences" that consist only of a delimiter, such as
            # '-' or '='
            if len(sentence) > 2:
                print u' '.join(sentence)
            sentence = []

        if len(parts) != 3:
            continue

        word, tag, _ = parts
        if word.startswith('==='):
            continue

        if universal:
            tag = universal_tags.get(tag, tag)

        sentence.append('%s_%s' % (word, tag))

    # Flush the last sentence if the input does not end with a blank line.
    if len(sentence) > 2:
        print u' '.join(sentence)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--universal', '-u', action='store_true',
        help='Use Universal Part-of-Speech Tags', default=False)
    parser.add_argument('--input_encoding', default='latin1',
        help='Input file encoding (latin1 by default)')
    parser.add_argument('input_file', type=argparse.FileType('r'), nargs='+',
        help='Input files.', default=sys.stdin)

    args = parser.parse_args()

    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

    for in_file in args.input_file:
        in_file = codecs.getreader(args.input_encoding)(in_file)
        parse_lines(in_file, universal=args.universal)


if __name__ == '__main__':
    main()
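
# A constructed example (not corpus data): given the CoNLL-format input lines
#
#   La DA O
#   casa NC O
#   blanca AQ O
#   . Fp O
#
# followed by a blank line, the script prints "La_DA casa_NC blanca_AQ ._Fp",
# or, with --universal, "La_DET casa_NOUN blanca_ADJ ._.".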
--------------------------------------------------------------------------------
/lang/nob/nowac/nowacTrainReformat.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
##############################################################################
#
# This script takes the NoWaC corpus of Norwegian Bokmål and reformats a
# configurable number of sentences from the corpus into training data for
# sentence detection and tokenization with OpenNLP.
#
# Input:
#   The NoWaC corpus file (specified with the --in command line option)
#   The number of sentences to process and output in the training data
#   (specified with the --smax command line option)
# Output:
#   Two files: "nowacSent.train" -- a training file formatted for the
#                                   SentenceDetectorTrainer of OpenNLP
#              "nowacTok.train"  -- a training file formatted for the
#                                   TokenizerTrainer of OpenNLP
#
# The files are written to the current directory.
#
# The above two files serve as the training data input files to OpenNLP's
# training process, invocable using OpenNLP's command line tools, e.g.:
#
#   bin/opennlp TokenizerTrainer -encoding UTF-8 -lang nb -data nowacTok.train -model nb-tok.bin
#   bin/opennlp SentenceDetectorTrainer -encoding UTF-8 -lang nb -data nowacSent.train -model nb-sent.bin
#
# Author: Stephan Greene
##############################################################################

use strict;
# Specify UTF-8 input and output
use utf8;
binmode STDOUT, ":utf8";

# Libraries
use Getopt::Long;    # Command line handling

# Handle command line
my $infilet = "";
my $nSentMax = 100;
my $help = 0;
GetOptions( "in=s"   => \$infilet,
            "smax:i" => \$nSentMax,
            "H|help" => \$help)
    or usage();

usage() if (($help) || ($infilet eq ""));

sub usage {
    print STDERR "Usage: $0\n";
    print STDERR "  --in [input file] NoWaC corpus file\n";
    print STDERR "  --smax [maximum number of sentences to process -- default 100]\n";
    die "Exiting\n";
}

open( fh, "<:utf8", $infilet ) or die("can't open $infilet");
open( sentOutput, ">:utf8", "nowacSent.train");
open( tokOutput, ">:utf8", "nowacTok.train");

my @tokens = ();
my $nSent = 0;

# Spacing conventions used when reassembling sentences:
# 1. No space after token: « ( [
# 2. No space before token: » ) ] : , . ; ? !
# 3. Space unknown (no spaces): | - ' /
# 4. Alternating spacing: "
# 5. ?: ¶

sub writeSentence
{
    my (@toks) = @_;
    my $nTok = @toks;
    my $nDoubleQ = 0;
    my @spacings = ();
    for( my $i = 0; $i < $nTok; ++$i )
    {
        if ( $toks[$i] =~ /^[«([]$/ ) { push(@spacings, 0); }
        elsif ( $toks[$i] =~ /^[»)\]:,.;?!]$/ ) { pop(@spacings); push(@spacings, 0); push(@spacings, 1); }
        elsif ( $toks[$i] eq "|" ) { pop(@spacings); push(@spacings, 0); push(@spacings, 0); }
        elsif ( $toks[$i] eq "'" ) { pop(@spacings); push(@spacings, 0); push(@spacings, 0); }
        elsif ( $toks[$i] eq "-" ) { pop(@spacings); push(@spacings, 0); push(@spacings, 0); }
        elsif ( $toks[$i] eq "/" ) { pop(@spacings); push(@spacings, 0); push(@spacings, 0); }
        elsif ( $toks[$i] eq "\"" )
        {
            ++$nDoubleQ;
            if( $nDoubleQ % 2 == 0 ) { pop(@spacings); push(@spacings, 0); push(@spacings, 1); }
            else { push(@spacings, 0); }
        }
        elsif ( $i == $nTok - 1 ) { push(@spacings, 0); }
        else { push(@spacings, 1); }
    }

    for( my $i = 0; $i < $nTok; ++$i )
    {
        print( sentOutput $toks[$i]);
        print( tokOutput $toks[$i]);
        if( $spacings[$i] == 1 ) { print( sentOutput " "); print( tokOutput " "); }
        # OpenNLP's marker for a token boundary without whitespace
        else { print( tokOutput "<SPLIT>"); }
    }
    print(sentOutput "\n");
    print(tokOutput "\n");
}

while ( my $line = <fh> )
{
    if( $line =~ /^<\/s>/)
    {
        my $nToks = @tokens;
        if( $nToks > 0 )
        {
            writeSentence(@tokens);
            ++$nSent;
        }
        if( $nSent != $nSentMax && $nSent % 100 == 0 ) { print("$nSent sentences processed...\n"); }
        if( $nSent >= $nSentMax) { last; }
    }
    elsif( $line =~ /^
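
# A constructed example of the two output formats (illustrative only, and
# assuming OpenNLP's <SPLIT> marker as reconstructed in writeSentence above):
# for the token sequence
#
#   Han sa : « Hei » .
#
# the script writes
#
#   nowacSent.train:  Han sa: «Hei».
#   nowacTok.train:   Han sa<SPLIT>: «<SPLIT>Hei<SPLIT>»<SPLIT>.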